1
0
mirror of https://github.com/systemd/systemd synced 2026-03-18 19:14:46 +01:00

Compare commits

..

No commits in common. "49eb2d50b4328956b0d8fdf173425e2029b7d5ee" and "b6585c811a82633c082182145ca5d5132a8228ff" have entirely different histories.

20 changed files with 465 additions and 159 deletions

22
README
View File

@ -47,25 +47,25 @@ REQUIREMENTS:
≥ 5.3 for bounded loops in BPF program, keyring namespacing,
and nexthop support
≥ 5.4 for pidfd and signed Verity images
≥ 5.6 for getrandom() GRND_INSECURE
≥ 5.7 for CLONE_INTO_CGROUP, cgroup2fs memory_recursiveprot option,
BPF links and the BPF LSM hook
≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
≥ 5.9 for close_range()
≥ 5.10 for STATX_MNT_ID
⛔ Kernel versions below 5.10 ("minimum baseline") are not supported at all,
⛔ Kernel versions below 5.4 ("minimum baseline") are not supported at all,
and are missing required functionality as listed above.
Linux kernel ≥ 5.12 for idmapped mount
≥ 5.14 for cgroup.kill and quotactl_fd()
Linux kernel ≥ 5.6 for getrandom() GRND_INSECURE
≥ 5.7 for CLONE_INTO_CGROUP, cgroup2fs memory_recursiveprot option,
BPF links and the BPF LSM hook
⚠️ Kernel versions below 5.14 ("recommended baseline") have significant gaps
⚠️ Kernel versions below 5.7 ("recommended baseline") have significant gaps
in functionality and are not recommended for use with this version
of systemd. Taint flag 'old-kernel' will be set. systemd will most likely
still function, but upstream support and testing are limited.
Linux kernel ≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
Linux kernel ≥ 5.8 for LOOP_CONFIGURE and STATX_ATTR_MOUNT_ROOT
≥ 5.9 for close_range()
≥ 5.12 for idmapped mount
≥ 5.14 for cgroup.kill
≥ 5.14 for quotactl_fd()
≥ 6.3 for MFD_EXEC/MFD_NOEXEC_SEAL and tmpfs noswap option
≥ 6.5 for name_to_handle_at() AT_HANDLE_FID, SO_PEERPIDFD/SO_PASSPIDFD,
and MOVE_MOUNT_BENEATH
≥ 6.6 for quota support on tmpfs

View File

@ -68,4 +68,4 @@
#define VARLINK_PATH_MACHINED_RESOLVE_HOOK "/run/systemd/resolve.hook/io.systemd.Machine"
/* Recommended baseline - see README for details */
#define KERNEL_BASELINE_VERSION "5.14"
#define KERNEL_BASELINE_VERSION "5.7"

View File

@ -16,6 +16,7 @@
#include "format-util.h"
#include "fs-util.h"
#include "log.h"
#include "mountpoint-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
@ -251,9 +252,10 @@ int close_all_fds_frugal(const int except[], size_t n_except) {
assert(except || n_except == 0);
/* This is the inner fallback core of close_all_fds(). This never calls malloc() or so and hence is
* safe to be called in signal handler context. Most users should call close_all_fds(), but when we
* assume we are called from signal handler context, then use this simpler call instead. */
/* This is the inner fallback core of close_all_fds(). This never calls malloc() or opendir() or so
* and hence is safe to be called in signal handler context. Most users should call close_all_fds(),
* but when we assume we are called from signal handler context, then use this simpler call
* instead. */
max_fd = get_max_fd();
if (max_fd < 0)
@ -279,6 +281,44 @@ int close_all_fds_frugal(const int except[], size_t n_except) {
return r;
}
int close_all_fds_by_proc(const int except[], size_t n_except) {
_cleanup_closedir_ DIR *d = NULL;
int r = 0;
d = opendir("/proc/self/fd");
if (!d)
return close_all_fds_frugal(except, n_except); /* ultimate fallback if /proc/ is not available */
FOREACH_DIRENT(de, d, return -errno) {
int fd = -EBADF, q;
if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN))
continue;
fd = parse_fd(de->d_name);
if (fd < 0)
/* Let's better ignore this, just in case */
continue;
if (fd < 3)
continue;
if (fd == dirfd(d))
continue;
if (fd_in_set(fd, except, n_except))
continue;
q = close_nointr(fd);
if (q != -EBADF) /* Valgrind has its own FD and doesn't want to have it closed */
RET_GATHER(r, q);
}
return r;
}
static bool have_close_range = true; /* Assume we live in the future */
static int close_all_fds_special_case(const int except[], size_t n_except) {
assert(n_except == 0 || except);
@ -286,6 +326,9 @@ static int close_all_fds_special_case(const int except[], size_t n_except) {
* nicely, since we won't need sorting for them. Returns > 0 if the special casing worked, 0
* otherwise. */
if (!have_close_range)
return 0;
if (n_except == 1 && except[0] < 0) /* Minor optimization: if we only got one fd, and it's invalid,
* we got none */
n_except = 0;
@ -294,22 +337,31 @@ static int close_all_fds_special_case(const int except[], size_t n_except) {
case 0:
/* Close everything. Yay! */
if (close_range(3, INT_MAX, 0) < 0)
return -errno;
return 1;
if (close_range(3, INT_MAX, 0) >= 0)
return 1;
if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
have_close_range = false;
return 0;
}
return -errno;
case 1:
/* Close all but exactly one, then we don't need no sorting. This is a pretty common
* case, hence let's handle it specially. */
if (except[0] > 3 && close_range(3, except[0] - 1, 0) < 0)
return -errno;
if ((except[0] <= 3 || close_range(3, except[0]-1, 0) >= 0) &&
(except[0] >= INT_MAX || close_range(MAX(3, except[0]+1), -1, 0) >= 0))
return 1;
if (except[0] < INT_MAX && close_range(MAX(3, except[0] + 1), -1, 0) < 0)
return -errno;
if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
have_close_range = false;
return 0;
}
return 1;
return -errno;
default:
return 0;
@ -341,6 +393,9 @@ int close_all_fds(const int except[], size_t n_except) {
if (r > 0) /* special case worked! */
return 0;
if (!have_close_range)
return close_all_fds_by_proc(except, n_except);
_cleanup_free_ int *sorted_malloc = NULL;
size_t n_sorted;
int *sorted;
@ -360,7 +415,7 @@ int close_all_fds(const int except[], size_t n_except) {
sorted = newa(int, n_sorted);
if (!sorted) /* Fallback on OOM. */
return close_all_fds_frugal(except, n_except);
return close_all_fds_by_proc(except, n_except);
memcpy(sorted, except, n_except * sizeof(int));
@ -382,8 +437,13 @@ int close_all_fds(const int except[], size_t n_except) {
continue;
/* Close everything between the start and end fds (both of which shall stay open) */
if (close_range(start + 1, end - 1, 0) < 0)
return -errno;
if (close_range(start + 1, end - 1, 0) < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
have_close_range = false;
return close_all_fds_by_proc(except, n_except);
}
}
/* The loop succeeded. Let's now close everything beyond the end */
@ -391,8 +451,13 @@ int close_all_fds(const int except[], size_t n_except) {
if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
return 0;
if (close_range(sorted[n_sorted-1] + 1, INT_MAX, 0) < 0)
return -errno;
if (close_range(sorted[n_sorted-1] + 1, INT_MAX, 0) < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
have_close_range = false;
return close_all_fds_by_proc(except, n_except);
}
return 0;
}
@ -1054,6 +1119,7 @@ int path_is_root_at(int dir_fd, const char *path) {
int fds_are_same_mount(int fd1, int fd2) {
struct statx sx1 = {}, sx2 = {}; /* explicitly initialize the struct to make msan silent. */
int r;
assert(fd1 >= 0);
assert(fd2 >= 0);
@ -1064,7 +1130,39 @@ int fds_are_same_mount(int fd1, int fd2) {
if (statx(fd2, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sx2) < 0)
return -errno;
return statx_inode_same(&sx1, &sx2) && statx_mount_same(&sx1, &sx2);
/* First, compare inode. If these are different, the fd does not point to the root directory "/". */
if (!statx_inode_same(&sx1, &sx2))
return false;
/* Note, statx() does not provide the mount ID and path_get_mnt_id_at() does not work when an old
* kernel is used. In that case, let's assume that we do not have such spurious mount points in an
* early boot stage, and silently skip the following check. */
if (!FLAGS_SET(sx1.stx_mask, STATX_MNT_ID)) {
int mntid;
r = path_get_mnt_id_at_fallback(fd1, "", &mntid);
if (r < 0)
return r;
assert(mntid >= 0);
sx1.stx_mnt_id = mntid;
sx1.stx_mask |= STATX_MNT_ID;
}
if (!FLAGS_SET(sx2.stx_mask, STATX_MNT_ID)) {
int mntid;
r = path_get_mnt_id_at_fallback(fd2, "", &mntid);
if (r < 0)
return r;
assert(mntid >= 0);
sx2.stx_mnt_id = mntid;
sx2.stx_mask |= STATX_MNT_ID;
}
return statx_mount_same(&sx1, &sx2);
}
char* format_proc_fd_path(char buf[static PROC_FD_PATH_MAX], int fd) {

View File

@ -112,6 +112,7 @@ int get_max_fd(void);
int close_all_fds(const int except[], size_t n_except);
int close_all_fds_without_malloc(const int except[], size_t n_except);
int close_all_fds_by_proc(const int except[], size_t n_except);
int close_all_fds_frugal(const int except[], size_t n_except);
int pack_fds(int fds[], size_t n);

View File

@ -13,8 +13,10 @@
#include "log.h"
#include "mountpoint-util.h"
#include "nulstr-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
@ -133,6 +135,57 @@ int name_to_handle_at_try_fid(
return name_to_handle_at_loop(fd, path, ret_handle, ret_mnt_id, flags & ~AT_HANDLE_FID);
}
static int fd_fdinfo_mnt_id(int fd, const char *filename, int flags, int *ret_mnt_id) {
char path[STRLEN("/proc/self/fdinfo/") + DECIMAL_STR_MAX(int)];
_cleanup_close_ int subfd = -EBADF;
int r;
assert((flags & ~(AT_SYMLINK_FOLLOW|AT_EMPTY_PATH)) == 0);
assert(ret_mnt_id);
if ((flags & AT_EMPTY_PATH) && isempty(filename))
xsprintf(path, "/proc/self/fdinfo/%i", fd);
else {
subfd = openat(fd, filename, O_CLOEXEC|O_PATH|(flags & AT_SYMLINK_FOLLOW ? 0 : O_NOFOLLOW));
if (subfd < 0)
return -errno;
xsprintf(path, "/proc/self/fdinfo/%i", subfd);
}
_cleanup_free_ char *p = NULL;
r = get_proc_field(path, "mnt_id", &p);
if (r == -ENOENT)
return -EBADF;
if (r < 0)
return r;
return safe_atoi(p, ret_mnt_id);
}
/* Tells whether 's' is a plain file name, optionally followed by one or more slashes, and nothing
 * else.
 *
 *   accepted: foo, bar, foo/, bar/, foo//, bar///
 *   rejected: "", "/", "/foo", "foo/bar", ".", ".."
 *
 * The part before the first slash is validated with filename_is_valid(). */
static bool filename_possibly_with_slash_suffix(const char *s) {
        const char *first_slash, *tail, *name;

        first_slash = strchr(s, '/');
        if (!first_slash)
                return filename_is_valid(s); /* no slash at all: must be a valid filename as is */

        /* The prefix gets duplicated on the stack below, hence refuse overly long ones up front */
        if (first_slash - s > PATH_MAX)
                return false;

        /* Skip over the run of slashes; anything left after it means this is a path, not a filename */
        for (tail = first_slash; *tail == '/'; tail++)
                ;
        if (*tail != 0)
                return false;

        name = strndupa_safe(s, first_slash - s);
        return filename_is_valid(name);
}
bool file_handle_equal(const struct file_handle *a, const struct file_handle *b) {
if (a == b)
return true;
@ -144,73 +197,191 @@ bool file_handle_equal(const struct file_handle *a, const struct file_handle *b)
return memcmp_nn(a->f_handle, a->handle_bytes, b->f_handle, b->handle_bytes) == 0;
}
int is_mount_point_at(int dir_fd, const char *path, int flags) {
int is_mount_point_at(int fd, const char *filename, int flags) {
bool fd_is_self;
int r;
assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
assert(fd >= 0 || fd == AT_FDCWD);
assert((flags & ~AT_SYMLINK_FOLLOW) == 0);
if (path_equal(path, "/"))
return true;
if (isempty(path)) {
if (dir_fd == AT_FDCWD)
path = ".";
if (isempty(filename)) {
if (fd == AT_FDCWD)
filename = ".";
else {
/* If the file name is empty we'll see if the specified 'fd' is a mount point.
* That's only supported by statx(), or if the inode specified via 'fd' refers to a
* directory. Otherwise, we'll have to fail (ENOTDIR), because we have no kernel API
* to query the information we need. */
flags |= AT_EMPTY_PATH;
path = "";
filename = "";
}
fd_is_self = true;
} else if (STR_IN_SET(filename, ".", "./"))
fd_is_self = true;
else {
/* Insist that the specified filename is actually a filename, and not a path, i.e. some inode
* further up or down the tree than immediately below the specified directory fd. */
if (!filename_possibly_with_slash_suffix(filename))
return -EINVAL;
fd_is_self = false;
}
/* First we will try statx()' STATX_ATTR_MOUNT_ROOT attribute, which is our ideal API, available
* since kernel 5.8.
*
* If that fails, our second try is the name_to_handle_at() syscall, which tells us the mount id and
* an opaque file "handle". It is not supported everywhere though (kernel compile-time option, not
* all file systems are hooked up). If it works the mount id is usually good enough to tell us
* whether something is a mount point.
*
* If that didn't work we will try to read the mount id from /proc/self/fdinfo/<fd>. This is almost
* as good as name_to_handle_at(), however, does not return the opaque file handle. The opaque file
* handle is pretty useful to detect the root directory, which we should always consider a mount
* point. Hence we use this only as fallback.
*
* Note that traditionally the check is done via fstat()-based st_dev comparisons. However, various
* file systems don't guarantee same st_dev across single fs anymore, e.g. unionfs exposes file systems
* with a variety of st_dev reported. Also, btrfs subvolumes have different st_dev, even though
* they aren't real mounts of their own. */
struct statx sx = {}; /* explicitly initialize the struct to make msan silent. */
if (statx(dir_fd, path,
if (statx(fd, filename,
at_flags_normalize_nofollow(flags) |
AT_NO_AUTOMOUNT | /* don't trigger automounts mounts are a local concept, hence no need to trigger automounts to determine STATX_ATTR_MOUNT_ROOT */
AT_STATX_DONT_SYNC, /* don't go to the network for this for similar reasons */
STATX_TYPE|STATX_INO,
STATX_TYPE,
&sx) < 0)
return -errno;
r = statx_warn_mount_root(&sx, LOG_DEBUG);
if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) /* yay! */
return FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT);
_cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL;
int mount_id = -1, mount_id_parent = -1;
bool nosupp = false;
r = name_to_handle_at_try_fid(fd, filename, &h, &mount_id, flags);
if (r < 0) {
if (is_name_to_handle_at_fatal_error(r))
return r;
if (!ERRNO_IS_NOT_SUPPORTED(r))
goto fallback_fdinfo;
/* This file system does not support name_to_handle_at(), hence let's see if the upper fs
* supports it (in which case it is a mount point), otherwise fall back to the fdinfo logic. */
nosupp = true;
}
if (fd_is_self)
r = name_to_handle_at_try_fid(fd, "..", &h_parent, &mount_id_parent, 0); /* can't work for non-directories 😢 */
else
r = name_to_handle_at_try_fid(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
if (r < 0) {
if (is_name_to_handle_at_fatal_error(r))
return r;
if (!ERRNO_IS_NOT_SUPPORTED(r))
goto fallback_fdinfo;
if (nosupp)
/* Both the parent and the directory can't do name_to_handle_at() */
goto fallback_fdinfo;
/* The parent can't do name_to_handle_at() but the directory we are
* interested in can? If so, it must be a mount point. */
return 1;
}
/* The parent can do name_to_handle_at() but the directory we are interested in can't? If
* so, it must be a mount point. */
if (nosupp)
return 1;
/* If the file handle for the directory we are interested in and its parent are identical,
* we assume this is the root directory, which is a mount point. */
if (file_handle_equal(h_parent, h))
return 1;
return mount_id != mount_id_parent;
fallback_fdinfo:
r = fd_fdinfo_mnt_id(fd, filename, flags, &mount_id);
if (r < 0)
return r;
if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT))
return true;
if (fd_is_self)
r = fd_fdinfo_mnt_id(fd, "..", 0, &mount_id_parent); /* can't work for non-directories 😢 */
else
r = fd_fdinfo_mnt_id(fd, "", AT_EMPTY_PATH, &mount_id_parent);
if (r < 0)
return r;
/* When running on chroot environment, the root may not be a mount point, but we unconditionally
* return true when the input is "/" in the above, but the shortcut may not work e.g. when the path
* is relative. */
struct statx sx2 = {}; /* explicitly initialize the struct to make msan silent. */
if (statx(AT_FDCWD, "/", AT_STATX_DONT_SYNC, STATX_TYPE|STATX_INO, &sx2) < 0)
if (mount_id != mount_id_parent)
return 1;
/* Hmm, so, the mount ids are the same. This leaves one special case though for the root file
* system. For that, let's see if the parent directory has the same inode as we are interested
* in. */
struct stat a, b;
/* yay for fstatat() taking a different set of flags than the other _at() above */
if (fstatat(fd, filename, &a, at_flags_normalize_nofollow(flags)) < 0)
return -errno;
return statx_inode_same(&sx, &sx2);
if (fd_is_self)
r = fstatat(fd, "..", &b, 0);
else
r = fstatat(fd, "", &b, AT_EMPTY_PATH);
if (r < 0)
return -errno;
/* A directory with same device and inode as its parent must be the root directory. Otherwise
* not a mount point.
*
* NB: we avoid inode_same_at() here because it internally attempts name_to_handle_at_try_fid() first,
* which is redundant. */
return stat_inode_same(&a, &b);
}
/* flags can be AT_SYMLINK_FOLLOW or 0 */
int path_is_mount_point_full(const char *path, const char *root, int flags) {
_cleanup_close_ int dir_fd = -EBADF;
int r;
_cleanup_close_ int dfd = -EBADF;
_cleanup_free_ char *fn = NULL;
assert(path);
assert((flags & ~AT_SYMLINK_FOLLOW) == 0);
if (empty_or_root(root))
return is_mount_point_at(AT_FDCWD, path, flags);
if (path_equal(path, "/"))
return 1;
r = chase(path, root,
FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : CHASE_NOFOLLOW,
/* ret_path= */ NULL, &dir_fd);
if (r < 0)
/* we need to resolve symlinks manually, we can't just rely on is_mount_point_at() to do that for us;
* if we have a structure like /bin -> /usr/bin/ and /usr is a mount point, then the parent that we
* look at needs to be /usr, not /. */
dfd = chase_and_open_parent(path, root,
CHASE_TRAIL_SLASH|(FLAGS_SET(flags, AT_SYMLINK_FOLLOW) ? 0 : CHASE_NOFOLLOW),
&fn);
if (dfd < 0)
return dfd;
return is_mount_point_at(dfd, fn, flags);
}
int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret) {
int r;
assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
assert(ret);
r = name_to_handle_at_loop(dir_fd, path, NULL, ret, isempty(path) ? AT_EMPTY_PATH : 0);
if (r >= 0 || is_name_to_handle_at_fatal_error(r))
return r;
return is_mount_point_at(dir_fd, /* path= */ NULL, flags);
return fd_fdinfo_mnt_id(dir_fd, path, isempty(path) ? AT_EMPTY_PATH : 0, ret);
}
int path_get_mnt_id_at(int dir_fd, const char *path, int *ret) {
struct statx sx;
int r;
assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
assert(ret);
@ -224,12 +395,12 @@ int path_get_mnt_id_at(int dir_fd, const char *path, int *ret) {
&sx) < 0)
return -errno;
r = statx_warn_mount_id(&sx, LOG_DEBUG);
if (r < 0)
return r;
if (FLAGS_SET(sx.stx_mask, STATX_MNT_ID)) {
*ret = sx.stx_mnt_id;
return 0;
}
*ret = sx.stx_mnt_id;
return 0;
return path_get_mnt_id_at_fallback(dir_fd, path, ret);
}
bool fstype_is_network(const char *fstype) {

View File

@ -39,12 +39,13 @@ int name_to_handle_at_try_fid(int fd, const char *path, struct file_handle **ret
bool file_handle_equal(const struct file_handle *a, const struct file_handle *b);
int path_get_mnt_id_at_fallback(int dir_fd, const char *path, int *ret);
int path_get_mnt_id_at(int dir_fd, const char *path, int *ret);
/* Convenience wrapper around path_get_mnt_id_at(): passes AT_FDCWD as the directory fd and forwards
 * 'path' and 'ret' unchanged, returning whatever path_get_mnt_id_at() returns. */
static inline int path_get_mnt_id(const char *path, int *ret) {
return path_get_mnt_id_at(AT_FDCWD, path, ret);
}
int is_mount_point_at(int dir_fd, const char *path, int flags);
int is_mount_point_at(int fd, const char *filename, int flags);
int path_is_mount_point_full(const char *path, const char *root, int flags);
static inline int path_is_mount_point(const char *path) {
return path_is_mount_point_full(path, NULL, 0);

View File

@ -2119,14 +2119,15 @@ int posix_spawn_wrapper(
if (ERRNO_IS_NOT_SUPPORTED(r) && FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP) && cg_is_threaded(cgroup) > 0)
return -EUCLEAN; /* clone3() could also return EOPNOTSUPP if the target cgroup is in threaded mode,
turn that into something recognizable */
if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r)) &&
if ((ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || r == E2BIG) &&
FLAGS_SET(flags, POSIX_SPAWN_SETCGROUP)) {
/* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but
* need to disable POSIX_SPAWN_SETCGROUP, which is what redirects to clone3().
* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
* retry every time.
* Note, CLONE_INTO_CGROUP is supported since kernel v5.7, but some architectures still
* do not support clone3(). Hence, we need to keep the fallback logic for a while. */
* Note that we might get E2BIG here since some kernels (e.g. 5.4) support clone3()
* but not CLONE_INTO_CGROUP. */
/* CLONE_INTO_CGROUP definitely won't work, hence remember the fact so that we don't
* retry every time. */
have_clone_into_cgroup = false;
flags &= ~POSIX_SPAWN_SETCGROUP;

View File

@ -68,6 +68,8 @@ static void fallback_random_bytes(void *p, size_t n) {
}
void random_bytes(void *p, size_t n) {
static bool have_grndinsecure = true;
assert(p || n == 0);
if (n == 0)
@ -76,9 +78,15 @@ void random_bytes(void *p, size_t n) {
for (;;) {
ssize_t l;
l = getrandom(p, n, GRND_INSECURE);
l = getrandom(p, n, have_grndinsecure ? GRND_INSECURE : GRND_NONBLOCK);
if (l < 0 && errno == EINVAL && have_grndinsecure) {
/* No GRND_INSECURE; fallback to GRND_NONBLOCK. */
have_grndinsecure = false;
continue;
}
if (l <= 0)
break; /* Unexpected error. Give up and fallback to /dev/urandom. */
break; /* Will block (with GRND_NONBLOCK), or unexpected error. Give up and fallback
to /dev/urandom. */
if ((size_t) l == n)
return; /* Done reading, success. */

View File

@ -463,13 +463,8 @@ bool statx_inode_same(const struct statx *a, const struct statx *b) {
/* Same as stat_inode_same() but for struct statx */
if (!statx_is_set(a) || !statx_is_set(b))
return false;
assert(FLAGS_SET(a->stx_mask, STATX_TYPE|STATX_INO));
assert(FLAGS_SET(b->stx_mask, STATX_TYPE|STATX_INO));
return
return statx_is_set(a) && statx_is_set(b) &&
FLAGS_SET(a->stx_mask, STATX_TYPE|STATX_INO) && FLAGS_SET(b->stx_mask, STATX_TYPE|STATX_INO) &&
((a->stx_mode ^ b->stx_mode) & S_IFMT) == 0 &&
a->stx_dev_major == b->stx_dev_major &&
a->stx_dev_minor == b->stx_dev_minor &&
@ -480,10 +475,13 @@ bool statx_mount_same(const struct statx *a, const struct statx *b) {
if (!statx_is_set(a) || !statx_is_set(b))
return false;
assert(FLAGS_SET(a->stx_mask, STATX_MNT_ID));
assert(FLAGS_SET(b->stx_mask, STATX_MNT_ID));
/* if we have the mount ID, that's all we need */
if (FLAGS_SET(a->stx_mask, STATX_MNT_ID) && FLAGS_SET(b->stx_mask, STATX_MNT_ID))
return a->stx_mnt_id == b->stx_mnt_id;
return a->stx_mnt_id == b->stx_mnt_id;
/* Otherwise, major/minor of backing device must match */
return a->stx_dev_major == b->stx_dev_major &&
a->stx_dev_minor == b->stx_dev_minor;
}
int xstatfsat(int dir_fd, const char *path, struct statfs *ret) {
@ -574,25 +572,3 @@ mode_t inode_type_from_string(const char *s) {
return MODE_INVALID;
}
/* Checks whether a statx() result reports STATX_ATTR_MOUNT_ROOT in its stx_attributes_mask.
 *
 * Returns 0 if the bit is present. Otherwise logs a message at 'log_level' with a synthetic ENOSYS
 * and returns the result of log_full_errno() (presumably a negative errno; callers test for < 0). */
int statx_warn_mount_root(const struct statx *sx, int log_level) {
        assert(sx);

        /* The STATX_ATTR_MOUNT_ROOT flag is supported since kernel v5.8. */
        if (FLAGS_SET(sx->stx_attributes_mask, STATX_ATTR_MOUNT_ROOT))
                return 0;

        return log_full_errno(log_level, SYNTHETIC_ERRNO(ENOSYS),
                              "statx() did not set STATX_ATTR_MOUNT_ROOT, running on an old kernel?");
}
/* Checks whether a statx() result has STATX_MNT_ID set in its stx_mask.
 *
 * Returns 0 if the bit is present. Otherwise logs a message at 'log_level' with a synthetic ENOSYS
 * and returns the result of log_full_errno() (presumably a negative errno; callers test for < 0). */
int statx_warn_mount_id(const struct statx *sx, int log_level) {
        assert(sx);

        /* The STATX_MNT_ID flag is supported since kernel v5.10. */
        if (FLAGS_SET(sx->stx_mask, STATX_MNT_ID))
                return 0;

        return log_full_errno(log_level, SYNTHETIC_ERRNO(ENOSYS),
                              "statx() does not support STATX_MNT_ID, running on an old kernel?");
}

View File

@ -117,6 +117,3 @@ static inline bool inode_type_can_hardlink(mode_t m) {
* type). */
return IN_SET(m & S_IFMT, S_IFSOCK, S_IFLNK, S_IFREG, S_IFBLK, S_IFCHR, S_IFIFO);
}
int statx_warn_mount_root(const struct statx *sx, int log_level);
int statx_warn_mount_id(const struct statx *sx, int log_level);

View File

@ -3467,7 +3467,7 @@ static int is_extension_overlay(const char *path, int fd) {
fd = dfd;
}
r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
if (r < 0)
return log_debug_errno(r, "Unable to determine whether '%s' is a mount point: %m", path);
if (r == 0)

View File

@ -1,7 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include_next <linux/bpf_insn.h> /* IWYU pragma: export */
#include_next <linux/bpf.h> /* IWYU pragma: export */
/* defined in linux/filter.h */
/* Unconditional jumps, goto pc + off16 */

View File

@ -1113,7 +1113,7 @@ static int action_umount(sd_bus *bus, int argc, char **argv) {
if (fstat(fd, &st) < 0)
return log_error_errno(errno, "Can't stat '%s' (from %s): %m", p, argv[i]);
r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
fd = safe_close(fd); /* before continuing make sure the dir is not keeping anything busy */
if (r > 0)
RET_GATHER(ret, stop_mounts(bus, p));

View File

@ -147,7 +147,7 @@ int mount_cgroups(const char *dest, bool accept_existing) {
if (r < 0)
return log_error_errno(r, "Failed to chase %s/sys/fs/cgroup: %m", strempty(dest));
r = is_mount_point_at(fd, /* path= */ NULL, /* flags= */ 0);
r = is_mount_point_at(fd, /* filename= */ NULL, /* flags= */ 0);
if (r < 0)
return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
if (r > 0) {

View File

@ -268,7 +268,7 @@ static int verify_fsroot_dir(
bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING),
unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE);
_cleanup_free_ char *f = NULL;
struct statx sx;
struct statx sxa, sxb;
int r;
/* Checks if the specified directory is at the root of its file system, and returns device
@ -287,30 +287,49 @@ static int verify_fsroot_dir(
if (statx(dir_fd, strempty(f),
AT_SYMLINK_NOFOLLOW|(isempty(f) ? AT_EMPTY_PATH : 0),
STATX_TYPE|STATX_INO|STATX_MNT_ID, &sx) < 0)
STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxa) < 0)
return log_full_errno((searching && errno == ENOENT) ||
(unprivileged_mode && ERRNO_IS_PRIVILEGE(errno)) ? LOG_DEBUG : LOG_ERR, errno,
"Failed to determine block device node of \"%s\": %m", path);
if (!S_ISDIR(sx.stx_mode))
if (!S_ISDIR(sxa.stx_mode))
return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "Path \"%s\" is not a directory", path);
r = statx_warn_mount_root(&sx, LOG_ERR);
if (r < 0)
return r;
if (FLAGS_SET(sxa.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) {
if (!FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT))
/* If we have STATX_ATTR_MOUNT_ROOT, we are happy, that's all we need. We operate under the
* assumption that a top of a mount point is also the top of the file system. (Which of
* course is strictly speaking not always true...) */
if (!FLAGS_SET(sxa.stx_attributes, STATX_ATTR_MOUNT_ROOT))
return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
"Directory \"%s\" is not the root of the file system.", path);
goto success;
}
/* Now let's look at the parent */
if (statx(dir_fd, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxb) < 0)
return log_full_errno(unprivileged_mode && ERRNO_IS_PRIVILEGE(errno) ? LOG_DEBUG : LOG_ERR, errno,
"Failed to determine block device node of parent of \"%s\": %m", path);
if (statx_inode_same(&sxa, &sxb)) /* for the root dir inode nr for both inodes will be the same */
goto success;
if (statx_mount_same(&sxa, &sxb))
return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
"Directory \"%s\" is not the root of the file system.", path);
success:
if (!ret_dev)
return 0;
if (sx.stx_dev_major == 0) /* Hmm, maybe a btrfs device, and the caller asked for the backing device? Then let's try to get it. */
if (sxa.stx_dev_major == 0) /* Hmm, maybe a btrfs device, and the caller asked for the backing device? Then let's try to get it. */
return btrfs_get_block_device_at(dir_fd, strempty(f), ret_dev);
*ret_dev = makedev(sx.stx_dev_major, sx.stx_dev_minor);
*ret_dev = makedev(sxa.stx_dev_major, sxa.stx_dev_minor);
return 0;
}

View File

@ -49,20 +49,19 @@ static int cgroupfs_mount_options(int priority, const char *type, char **ret) {
assert(streq(type, "cgroup2"));
assert(ret);
/* memory_hugetlb_accounting mount option is since kernel v6.7 (8cba9576df601c384abd334a503c3f6e1e29eefb). */
r = mount_option_supported("cgroup2", "memory_hugetlb_accounting", /* value= */ NULL);
if (r <= 0) {
_cleanup_free_ char *opts = NULL;
FOREACH_STRING(o, "memory_recursiveprot", "memory_hugetlb_accounting") {
r = mount_option_supported("cgroup2", o, /* value= */ NULL);
if (r < 0)
log_full_errno(priority, r, "Failed to determine whether cgroupfs supports 'memory_hugetlb_accounting' mount option, assuming not: %m");
else
log_debug("'memory_hugetlb_accounting' not supported by cgroupfs, not using mount option.");
*ret = NULL;
return 0;
log_full_errno(priority, r, "Failed to determine whether cgroupfs supports '%s' mount option, assuming not: %m", o);
else if (r == 0)
log_debug("'%s' not supported by cgroupfs, not using mount option.", o);
else if (!strextend_with_separator(&opts, ",", o))
return log_oom_full(priority);
}
return strdup_to(ret, "memory_hugetlb_accounting");
*ret = TAKE_PTR(opts);
return 0;
}
int mount_cgroupfs(const char *path) {
@ -82,7 +81,7 @@ int mount_cgroupfs(const char *path) {
return r;
/* These options shall be kept in sync with those in mount_table below. */
if (!strprepend_with_separator(&opts, ",", "nsdelegate,memory_recursiveprot"))
if (!strprepend_with_separator(&opts, ",", "nsdelegate"))
return log_oom();
return mount_nofollow_verbose(LOG_ERR, "cgroup2", path, "cgroup2", MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
@ -200,7 +199,7 @@ static const MountPoint mount_table[] = {
.what = "cgroup2",
.where = "/sys/fs/cgroup",
.type = "cgroup2",
.options = "nsdelegate,memory_recursiveprot",
.options = "nsdelegate",
.options_fn = cgroupfs_mount_options,
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
.mode = MNT_FATAL|MNT_IN_CONTAINER|MNT_CHECK_WRITABLE,

View File

@ -343,13 +343,19 @@ TEST(close_all_fds) {
_exit(EXIT_SUCCESS);
}
r = ASSERT_OK(pidref_safe_fork("(caf-nomalloc)", flags, NULL));
ASSERT_OK(r = pidref_safe_fork("(caf-nomalloc)", flags, NULL));
if (r == 0) {
test_close_all_fds_inner(close_all_fds_without_malloc);
_exit(EXIT_SUCCESS);
}
r = ASSERT_OK(pidref_safe_fork("(caf-frugal)", flags, NULL));
ASSERT_OK(r = pidref_safe_fork("(caf-proc)", flags, NULL));
if (r == 0) {
test_close_all_fds_inner(close_all_fds_by_proc);
_exit(EXIT_SUCCESS);
}
ASSERT_OK(r = pidref_safe_fork("(caf-frugal)", flags, NULL));
if (r == 0) {
test_close_all_fds_inner(close_all_fds_frugal);
_exit(EXIT_SUCCESS);

View File

@ -283,16 +283,18 @@ TEST(is_mount_point_at) {
fd = open("/", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY);
assert_se(fd >= 0);
ASSERT_OK_POSITIVE(is_mount_point_at(fd, "/", /* flags= */ 0));
ASSERT_OK_POSITIVE(is_mount_point_at(fd, "..", /* flags= */ 0));
ASSERT_OK_POSITIVE(is_mount_point_at(fd, "../", /* flags= */ 0));
r = ASSERT_OK(proc_mounted());
ASSERT_OK_EQ(is_mount_point_at(fd, "/proc", /* flags= */ 0), r);
ASSERT_OK_EQ(is_mount_point_at(fd, "/proc/", /* flags= */ 0), r);
ASSERT_OK_EQ(is_mount_point_at(fd, "proc", /* flags= */ 0), r);
ASSERT_OK_EQ(is_mount_point_at(fd, "proc/", /* flags= */ 0), r);
ASSERT_OK_ZERO(is_mount_point_at(fd, "usr/lib", /* flags= */ 0));
ASSERT_OK_ZERO(is_mount_point_at(fd, "usr/lib", /* flags= */ 0));
/* Not allowed, since "/" is a path, not a plain filename */
assert_se(is_mount_point_at(fd, "/", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "..", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "../", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "/proc", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "/proc/", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "proc/sys", 0) == -EINVAL);
assert_se(is_mount_point_at(fd, "proc/sys/", 0) == -EINVAL);
/* This one definitely is a mount point */
assert_se(is_mount_point_at(fd, "proc", 0) > 0);
assert_se(is_mount_point_at(fd, "proc/", 0) > 0);
safe_close(fd);
fd = open("/tmp", O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY);

View File

@ -559,7 +559,7 @@ static int opendir_and_stat(
bool *ret_mountpoint) {
_cleanup_closedir_ DIR *d = NULL;
struct statx sx;
struct statx sx1;
int r;
assert(path);
@ -586,16 +586,21 @@ static int opendir_and_stat(
return 0;
}
if (statx(dirfd(d), "", AT_EMPTY_PATH, STATX_MODE|STATX_INO|STATX_ATIME|STATX_MTIME, &sx) < 0)
if (statx(dirfd(d), "", AT_EMPTY_PATH, STATX_MODE|STATX_INO|STATX_ATIME|STATX_MTIME, &sx1) < 0)
return log_error_errno(errno, "statx(%s) failed: %m", path);
r = statx_warn_mount_root(&sx, LOG_ERR);
if (r < 0)
return r;
if (FLAGS_SET(sx1.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT))
*ret_mountpoint = FLAGS_SET(sx1.stx_attributes, STATX_ATTR_MOUNT_ROOT);
else {
struct statx sx2;
if (statx(dirfd(d), "..", 0, STATX_INO, &sx2) < 0)
return log_error_errno(errno, "statx(%s/..) failed: %m", path);
*ret_mountpoint = !statx_mount_same(&sx1, &sx2);
}
*ret_mountpoint = FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT);
*ret = TAKE_PTR(d);
*ret_sx = sx;
*ret_sx = sx1;
return 1;
}
@ -708,13 +713,35 @@ static int dir_cleanup(
continue;
}
r = statx_warn_mount_root(&sx, LOG_ERR);
if (r < 0)
return r;
if (FLAGS_SET(sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) {
/* Yay, we have the mount point API, use it */
if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT)) {
log_debug("Ignoring \"%s/%s\": different mount points.", p, de->d_name);
continue;
}
} else {
/* So we might have statx() but the STATX_ATTR_MOUNT_ROOT flag is not supported, fall
* back to traditional stx_dev checking. */
if (sx.stx_dev_major != rootdev_major ||
sx.stx_dev_minor != rootdev_minor) {
log_debug("Ignoring \"%s/%s\": different filesystem.", p, de->d_name);
continue;
}
if (FLAGS_SET(sx.stx_attributes, STATX_ATTR_MOUNT_ROOT)) {
log_debug("Ignoring \"%s/%s\": different mount points.", p, de->d_name);
continue;
/* Try to detect bind mounts of the same filesystem instance; they do not differ in
* device major/minors. This type of query is not supported on all kernels or
* filesystem types though. */
if (S_ISDIR(sx.stx_mode)) {
int q;
q = is_mount_point_at(dirfd(d), de->d_name, 0);
if (q < 0)
log_debug_errno(q, "Failed to determine whether \"%s/%s\" is a mount point, ignoring: %m", p, de->d_name);
else if (q > 0) {
log_debug("Ignoring \"%s/%s\": different mount of the same filesystem.", p, de->d_name);
continue;
}
}
}
atime_nsec = FLAGS_SET(sx.stx_mask, STATX_ATIME) ? statx_timestamp_load_nsec(&sx.stx_atime) : 0;

View File

@ -426,7 +426,7 @@ static int run(int argc, char *argv[]) {
if (target_fd < 0)
return log_error_errno(target_fd, "Failed to open directory '%s': %m", arg_target);
r = is_mount_point_at(target_fd, /* path= */ NULL, /* flags= */ 0);
r = is_mount_point_at(target_fd, /* filename= */ NULL, /* flags= */ 0);
if (r < 0)
return log_error_errno(r, "Failed to determine whether '%s' is a mount point: %m", resolved);
if (!r)