1
0
mirror of https://github.com/systemd/systemd synced 2026-03-26 00:34:53 +01:00

Compare commits

..

4 Commits

Author SHA1 Message Date
Daan De Meyer
b96dccaba3
namespace: Clone root dir descriptor before use (#39939)
Before doing anything with the root directory file descriptor, let's
make sure we clone it first so that the caller can't mess with mount fd
attributes via mount_setattr() anymore.

We clone during parsing instead of in executor so that the caller can't
mess with the mount fd between invocations.
2025-12-14 13:34:17 +01:00
Daan De Meyer
2cee5c22b6 TEST-50-DISSECT: Make sure RootDirectoryFileDescriptor= can be reused 2025-12-14 11:32:08 +01:00
Daan De Meyer
5eb0639faa namespace: Clone root dir descriptor before use
Before doing anything with the root directory file descriptor, let's
make sure we clone it first so that the caller can't mess with mount fd
attributes via mount_setattr() anymore.

We clone during parsing instead of in executor so that the caller can't
mess with the mount fd between invocations.
2025-12-14 11:32:06 +01:00
Daan De Meyer
273c6bc045 mount-util: Add mount_fd_clone() helper
The kernel prevents you from open_tree()'ing an open_tree() fd unless it
was created from the caller's mount namespace. For various use cases, we
want to be able to open_tree() arbitrary mount file descriptors. Turns
out there's a way to get around it by mounting the mount file descriptor
in a throw-away mount namespace and then open_tree()'ing the mount file
descriptor. Let's implement this as a new helper mount_fd_clone() and
add a test for it.

Because move_mount()'ing the original fd makes it pretty useless as it
can't be move_mount()'ed again, we optionally make a second clone which
can replace the original fd so it can be cloned again later.
2025-12-14 11:31:59 +01:00
7 changed files with 206 additions and 8 deletions

View File

@ -502,15 +502,12 @@ static int bus_set_transient_exec_context_fd(
assert(name);
assert(p);
assert(b);
assert(verify_mode == O_DIRECTORY || (verify_mode & ~O_ACCMODE_STRICT) == 0);
assert((verify_mode & ~O_ACCMODE_STRICT) == 0);
r = sd_bus_message_read(message, "h", &fd);
if (r < 0)
return r;
if (verify_mode == O_DIRECTORY)
r = fd_verify_directory(fd);
else
r = fd_vet_accmode(fd, verify_mode);
if (r < 0)
return sd_bus_error_set_errnof(reterr_error, r, "%s passed is of incompatible type: %m", name);
@ -813,8 +810,34 @@ static int bus_service_set_transient_property(
return 1;
}
if (streq(name, "RootDirectoryFileDescriptor"))
return bus_set_transient_exec_context_fd(u, name, &s->root_directory_fd, &s->exec_context.root_directory_as_fd, O_DIRECTORY, message, flags, reterr_error);
if (streq(name, "RootDirectoryFileDescriptor")) {
int fd;
r = sd_bus_message_read(message, "h", &fd);
if (r < 0)
return r;
r = fd_verify_directory(fd);
if (r < 0)
return sd_bus_error_set_errnof(reterr_error, r, "RootDirectoryFileDescriptor= is not a directory: %m");
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
int fd_clone;
/* Note that this invalidates the fd we got from the client. They won't be able to
* move_mount() it themselves. If they already move_mount()'ed it themselves, this
* will fail to clone the fd. */
fd_clone = mount_fd_clone(fd, /* recursive= */ true, /* replacement_fd= */ NULL);
if (fd_clone < 0)
return fd_clone;
/* We're closing our own clone here, which shouldn't need an asynchronous_close(). */
close_and_replace(s->root_directory_fd, fd_clone);
s->exec_context.root_directory_as_fd = true;
}
return 1;
}
return 0;
}

View File

@ -2877,6 +2877,7 @@ void exec_params_shallow_clear(ExecParameters *p) {
p->fd_names = strv_free(p->fd_names);
p->files_env = strv_free(p->files_env);
p->fds = mfree(p->fds);
p->root_directory_fd = safe_close(p->root_directory_fd);
p->exec_fd = safe_close(p->exec_fd);
p->user_lookup_fd = -EBADF;
p->bpf_restrict_fs_map_fd = -EBADF;

View File

@ -1951,7 +1951,14 @@ static int service_spawn_internal(
exec_params.stdin_fd = s->stdin_fd;
exec_params.stdout_fd = s->stdout_fd;
exec_params.stderr_fd = s->stderr_fd;
exec_params.root_directory_fd = s->root_directory_fd;
if (s->root_directory_fd >= 0) {
r = mount_fd_clone(s->root_directory_fd, /* recursive= */ true, &s->root_directory_fd);
if (r < 0)
return r;
exec_params.root_directory_fd = r;
}
r = exec_spawn(UNIT(s),
c,

View File

@ -1,7 +1,9 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <sched.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>
@ -30,6 +32,7 @@
#include "process-util.h"
#include "runtime-scope.h"
#include "set.h"
#include "socket-util.h"
#include "sort-util.h"
#include "stat-util.h"
#include "string-util.h"
@ -1421,6 +1424,103 @@ int fd_make_mount_point(int fd) {
return 1;
}
int mount_fd_clone(int mount_fd, bool recursive, int *replacement_fd) {
const int flags = OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_EMPTY_PATH|(recursive ? AT_RECURSIVE : 0);
int r;
assert(mount_fd >= 0);
/* If the input mount fd is supposed to remain clonable after calling this function, call it as
* follows: mount_fd_clone(mount_fd, recursive, &mount_fd). */
/* Clone a detached mount (that may be owned by a foreign mountns, e.g. mountfsd's). For this to
* work on older kernels, we have to jump through some hoops, because the kernel currently doesn't
* allow us to just call open_tree(OPEN_TREE_CLONE) directly to get a clone of a mount that is
* detached and owned by another mountns. Hence here's what we do: we clone short-lived child in a
* new mount namespace owned by our userns. There, we attach the mount (invisible to anyone else).
* This is sufficient to pass the kernel check, so next we use open_tree(OPEN_TREE_CLONE) to get our
* own detached mount. This we send back to the parent, which then can use it. */
r = RET_NERRNO(open_tree(mount_fd, "", flags));
if (r != -EINVAL)
/* The straightforward path just works? Yay! Don't bother with the complex logic below. No
* need to put a replacement fd in replacement_fd as the original fd is still usable. */
return r;
_cleanup_close_pair_ int transfer_fds[2] = EBADF_PAIR;
r = socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, transfer_fds);
if (r < 0)
return log_debug_errno(errno, "Failed to create socket pair: %m");
_cleanup_close_pair_ int errno_pipe_fds[2] = EBADF_PAIR;
if (pipe2(errno_pipe_fds, O_CLOEXEC|O_NONBLOCK) < 0)
return log_debug_errno(errno, "Failed to open pipe: %m");
/* Fork a child. Note that we set FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE here, i.e. get a new mount namespace */
r = safe_fork_full(
"(sd-clonemnt)",
/* stdio_fds= */ NULL,
(int[]) { mount_fd, transfer_fds[1], errno_pipe_fds[1] }, 3,
FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_REOPEN_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE,
/* ret_pid= */ NULL);
if (r < 0) {
errno_pipe_fds[1] = safe_close(errno_pipe_fds[1]);
int q = read_errno(errno_pipe_fds[0]);
if (q < 0 && q != -EIO)
return q;
return r;
}
if (r == 0) { /* Child */
/* Attach mount */
if (move_mount(mount_fd, "", -EBADF, "/", MOVE_MOUNT_F_EMPTY_PATH) < 0) {
log_debug_errno(errno, "Failed to move mount file descriptor to '/': %m");
report_errno_and_exit(errno_pipe_fds[1], -errno);
}
/* If requested by the caller, we clone the fd twice. Why? After move_mount(), the input file
* descriptor can't be move_mount()'ed again, which means we can't clone it again if it comes
* from a different mount namespace. To ensure they can clone the same fd multiple times,
* callers can pass a pointer to the input fd which will be replaced with a second clone,
* which can be move_mount()'ed and thus can be cloned again. */
for (int i = 0; i < 1 + !!replacement_fd; i++) {
/* And now clone the attached mount that is now ours. */
_cleanup_close_ int cloned_fd = open_tree(mount_fd, "", flags);
if (cloned_fd < 0) {
log_debug_errno(errno, "Failed to clone mount file descriptor: %m");
report_errno_and_exit(errno_pipe_fds[1], -errno);
}
/* And send it to the parent. */
r = send_one_fd(transfer_fds[1], cloned_fd, /* flags= */ 0);
if (r < 0)
report_errno_and_exit(errno_pipe_fds[1], r);
}
_exit(EXIT_SUCCESS);
}
transfer_fds[1] = safe_close(transfer_fds[1]);
/* Accept the new cloned mount */
_cleanup_close_ int fd1 = receive_one_fd(transfer_fds[0], 0);
if (fd1 < 0)
return fd1;
if (replacement_fd) {
int fd2 = receive_one_fd(transfer_fds[0], 0);
if (fd2 < 0)
return fd2;
close_and_replace(*replacement_fd, fd2);
}
return TAKE_FD(fd1);
}
int make_userns(uid_t uid_shift,
uid_t uid_range,
uid_t source_owner,

View File

@ -121,6 +121,8 @@ int mount_image_in_namespace(
int make_mount_point(const char *path);
int fd_make_mount_point(int fd);
int mount_fd_clone(int mount_fd, bool recursive, int *replacement_fd);
typedef enum RemountIdmapping {
REMOUNT_IDMAPPING_NONE,
/* Include a mapping from UID_MAPPED_ROOT (i.e. UID 2^31-2) on the backing fs to UID 0 on the

View File

@ -19,6 +19,7 @@
#include "process-util.h"
#include "random-util.h"
#include "rm-rf.h"
#include "socket-util.h"
#include "string-util.h"
#include "strv.h"
#include "tests.h"
@ -541,4 +542,66 @@ TEST(umountat) {
ASSERT_ERROR(umountat_detach_verbose(LOG_ERR, dfd, "foo"), EINVAL);
}
TEST(mount_fd_clone) {
_cleanup_(rm_rf_physical_and_freep) char *t = NULL;
_cleanup_close_pair_ int fds[2] = EBADF_PAIR;
int r;
CHECK_PRIV;
ASSERT_OK(mkdtemp_malloc(NULL, &t));
/* Set up a socket pair to transfer the mount fd from the child (in a different mountns) to us. */
ASSERT_OK_ERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, fds));
r = ASSERT_OK(safe_fork_full(
"(mount-fd-clone-setup)",
/* stdio_fds= */ NULL,
&fds[1], 1,
FORK_COMMON_FLAGS,
NULL));
if (r == 0) {
/* Create a tmpfs mount in this child's mountns. */
ASSERT_OK(mount_nofollow_verbose(LOG_ERR, "tmpfs", t, "tmpfs", 0, NULL));
/* Create a file in it to verify the mount later. */
_cleanup_free_ char *marker = ASSERT_NOT_NULL(path_join(t, "marker"));
ASSERT_OK(touch(marker));
/* Clone the mount as a detached mount fd. */
_cleanup_close_ int mount_fd = ASSERT_OK_ERRNO(open_tree(AT_FDCWD, t, OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC));
/* Send the mount fd to the parent. */
ASSERT_OK(send_one_fd(fds[1], mount_fd, 0));
_exit(EXIT_SUCCESS);
}
fds[1] = safe_close(fds[1]);
/* Parent: Receive the mount fd, clone it with mount_fd_clone(), and verify we can attach it. */
_cleanup_close_ int foreign_mount_fd = ASSERT_OK(receive_one_fd(fds[0], 0));
_cleanup_close_ int first_clone = ASSERT_OK(
mount_fd_clone(foreign_mount_fd, /* recursive= */ true, &foreign_mount_fd));
_cleanup_close_ _unused_ int second_clone = ASSERT_OK(
mount_fd_clone(foreign_mount_fd, /* recursive= */ true, /* replacement_fd= */ NULL));
_cleanup_free_ char *target = ASSERT_NOT_NULL(path_join(t, "target"));
ASSERT_OK_ERRNO(mkdir(target, 0755));
r = ASSERT_OK(safe_fork_full(
"(mount-fd-clone-verify)",
/* stdio_fds= */ NULL,
&first_clone, 1,
FORK_COMMON_FLAGS,
NULL));
if (r == 0) {
ASSERT_OK_ERRNO(move_mount(first_clone, "", AT_FDCWD, target, MOVE_MOUNT_F_EMPTY_PATH));
_cleanup_free_ char *marker = ASSERT_NOT_NULL(path_join(target, "marker"));
ASSERT_OK_ERRNO(access(marker, F_OK));
_exit(EXIT_SUCCESS);
}
}
DEFINE_TEST_MAIN(LOG_DEBUG);

View File

@ -923,6 +923,8 @@ test ! -f /tmp/img/abc
# Test RootDirectoryFileDescriptor=
systemd-run --wait --pipe --root-directory=/tmp/img -- grep -q 'MARKER=1' /usr/lib/os-release
# Make sure the same root file descriptor can be reused multiple times.
systemd-run --wait --pipe --same-root-dir -p ExecStartPre=true true
systemd-dissect --mtree /tmp/img >/dev/null
systemd-dissect --list /tmp/img >/dev/null