1
0
mirror of https://github.com/systemd/systemd synced 2025-10-05 19:54:46 +02:00

Compare commits

...

7 Commits

Author SHA1 Message Date
Luca Boccassi
467b1c4e4b
core: gracefully ignore PrivateBPF=yes if the kernel does not support it (#38238)
Fixes #38225.
2025-07-18 23:36:24 +01:00
Yu Watanabe
a673826003 TEST-07-PID1: check if PrivateBPF=yes is gracefully ignored 2025-07-18 20:25:42 +09:00
Yu Watanabe
8abdceac77 TEST-07-PID1: fix negative check 2025-07-18 20:25:42 +09:00
Yu Watanabe
f1eed4e592 test-bpf-token: use test macros and functions
No functional change, just refactoring.
2025-07-18 20:25:42 +09:00
Yu Watanabe
0e8e655c52 core/namespace: gracefully handle errors in mounting new bpffs instance
Then, fall back to remounting /sys/fs/bpf read-only when ProtectKernelTunables=yes.
2025-07-18 20:25:35 +09:00
Yu Watanabe
9eabf82750 core: it is not necessary to send message after fsconfig() for bpffs
Instead, let's wait for the helper process to finish.
2025-07-18 20:15:25 +09:00
Yu Watanabe
8509ceea10 core/exec-invoke: negative errno needs to be passed to report_errno_and_exit()
Hence, we cannot pass errno as is to report_errno_and_exit().

This splits out bpffs_helper(), which returns negative errno on failure
and 0 on success, and passes the returned value to report_errno_and_exit().

Follow-up for #36134.
2025-07-18 20:15:20 +09:00
5 changed files with 114 additions and 98 deletions

View File

@ -2270,20 +2270,51 @@ static int setup_private_users_child(int unshare_ready_fd, const char *uid_map,
return 0;
}
/* Body of the bpffs helper process: receives a file descriptor from the parent over socket_fd,
 * configures the four delegate_* options on it from the masks stored in the ExecContext, and then
 * asks the kernel to create the bpffs superblock.
 *
 * Returns 0 on success, or a negative errno on failure (the failure is logged at debug level). */
static int bpffs_helper(const ExecContext *c, int socket_fd) {
        assert(c);
        assert(socket_fd >= 0);

        _cleanup_close_ int fs_fd = receive_one_fd(socket_fd, /* flags = */ 0);
        if (fs_fd < 0)
                return log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m");

        /* Option name / mask pairs, applied in exactly this order. */
        const struct {
                const char *key;
                uint64_t mask;
        } delegates[] = {
                { "delegate_cmds",    c->bpf_delegate_commands    },
                { "delegate_maps",    c->bpf_delegate_maps        },
                { "delegate_progs",   c->bpf_delegate_programs    },
                { "delegate_attachs", c->bpf_delegate_attachments },
        };

        /* Large enough for "0x", two hex digits per byte of a mask, and the trailing NUL. */
        char number[STRLEN("0x") + sizeof(c->bpf_delegate_commands) * 2 + 1];

        for (size_t i = 0; i < sizeof(delegates) / sizeof(delegates[0]); i++) {
                /* Each mask is handed over as a hexadecimal string value. */
                xsprintf(number, "0x%"PRIx64, delegates[i].mask);

                if (fsconfig(fs_fd, FSCONFIG_SET_STRING, delegates[i].key, number, /* aux = */ 0) < 0)
                        return log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m");
        }

        /* All options are in place, now instantiate the superblock. */
        if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0) < 0)
                return log_debug_errno(errno, "Failed to create bpffs superblock: %m");

        return 0;
}
static int bpffs_prepare(
const ExecContext *c,
PidRef *ret_pid,
int *ret_sock_fd,
int *ret_errno_pipe) {
_cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, bpffs_errno_pipe[2] = EBADF_PAIR;
_cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, errno_pipe[2] = EBADF_PAIR;
int r;
assert(ret_sock_fd);
assert(ret_pid);
assert(ret_errno_pipe);
r = pipe2(bpffs_errno_pipe, O_CLOEXEC|O_NONBLOCK);
r = pipe2(errno_pipe, O_CLOEXEC|O_NONBLOCK);
if (r < 0)
return log_debug_errno(errno, "Failed to create pipe: %m");
@ -2295,67 +2326,13 @@ static int bpffs_prepare(
if (r < 0)
return log_debug_errno(r, "Failed to fork bpffs privileged helper: %m");
if (r == 0) {
_cleanup_close_ int fs_fd = -EBADF;
char number[STRLEN("0x") + sizeof(c->bpf_delegate_commands) * 2 + 1];
bpffs_errno_pipe[0] = safe_close(bpffs_errno_pipe[0]);
errno_pipe[0] = safe_close(errno_pipe[0]);
socket_fds[0] = safe_close(socket_fds[0]);
fs_fd = receive_one_fd(socket_fds[1], /* flags = */ 0);
if (fs_fd < 0) {
log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m");
report_errno_and_exit(bpffs_errno_pipe[1], fs_fd);
}
xsprintf(number, "0x%"PRIx64, c->bpf_delegate_commands);
r = fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_cmds", number, /* aux = */ 0);
if (r < 0) {
log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m");
report_errno_and_exit(bpffs_errno_pipe[1], errno);
}
xsprintf(number, "0x%"PRIx64, c->bpf_delegate_maps);
r = fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_maps", number, /* aux = */ 0);
if (r < 0) {
log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m");
report_errno_and_exit(bpffs_errno_pipe[1], errno);
}
xsprintf(number, "0x%"PRIx64, c->bpf_delegate_programs);
r = fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_progs", number, /* aux = */ 0);
if (r < 0) {
log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m");
report_errno_and_exit(bpffs_errno_pipe[1], errno);
}
xsprintf(number, "0x%"PRIx64, c->bpf_delegate_attachments);
r = fsconfig(fs_fd, FSCONFIG_SET_STRING, "delegate_attachs", number, /* aux = */ 0);
if (r < 0) {
log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m");
report_errno_and_exit(bpffs_errno_pipe[1], errno);
}
r = fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0);
if (r < 0) {
log_debug_errno(errno, "Failed to create bpffs superblock: %m");
report_errno_and_exit(bpffs_errno_pipe[1], errno);
}
if (write(socket_fds[1], (uint8_t[1]) {}, 1) < 0) {
log_debug_errno(errno, "Failed to send data to child: %m");
report_errno_and_exit(bpffs_errno_pipe[1], errno);
}
_exit(EXIT_SUCCESS);
report_errno_and_exit(errno_pipe[1], bpffs_helper(c, socket_fds[1]));
}
*ret_sock_fd = TAKE_FD(socket_fds[0]);
*ret_errno_pipe = TAKE_FD(bpffs_errno_pipe[0]);
*ret_errno_pipe = TAKE_FD(errno_pipe[0]);
return 0;
}
@ -3691,7 +3668,9 @@ static int apply_mount_namespace(
bool needs_sandboxing,
uid_t exec_directory_uid,
gid_t exec_directory_gid,
PidRef *bpffs_pidref,
int bpffs_socket_fd,
int bpffs_errno_pipe,
char **reterr_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
@ -3906,7 +3885,9 @@ static int apply_mount_namespace(
.proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
.private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO,
.bpffs_pidref = bpffs_pidref,
.bpffs_socket_fd = bpffs_socket_fd,
.bpffs_errno_pipe = bpffs_errno_pipe,
};
r = setup_namespace(&parameters, reterr_path);
@ -4547,7 +4528,9 @@ static int setup_delegated_namespaces(
const ExecCommand *command,
bool needs_sandboxing,
bool have_cap_sys_admin,
PidRef *bpffs_pidref,
int bpffs_socket_fd,
int bpffs_errno_pipe,
int *reterr_exit_status) {
int r;
@ -4670,7 +4653,9 @@ static int setup_delegated_namespaces(
needs_sandboxing,
uid,
gid,
bpffs_pidref,
bpffs_socket_fd,
bpffs_errno_pipe,
&error_path);
if (r < 0) {
*reterr_exit_status = EXIT_NAMESPACE;
@ -5782,7 +5767,9 @@ int exec_invoke(
command,
needs_sandboxing,
have_cap_sys_admin,
&bpffs_pidref,
bpffs_socket_fd,
bpffs_errno_pipe,
exit_status);
if (r < 0)
return r;
@ -5842,29 +5829,15 @@ int exec_invoke(
command,
needs_sandboxing,
have_cap_sys_admin,
&bpffs_pidref,
bpffs_socket_fd,
bpffs_errno_pipe,
exit_status);
if (r < 0)
return r;
if (context->private_bpf != PRIVATE_BPF_NO) {
r = pidref_wait_for_terminate_and_check("(sd-bpffs)", &bpffs_pidref, /* flags = */ 0);
if (r < 0) {
*exit_status = EXIT_BPF;
return r;
}
/* If something strange happened with the child, let's consider this fatal, too */
if (r != EXIT_SUCCESS) {
*exit_status = EXIT_BPF;
ssize_t ss = read(bpffs_errno_pipe, &r, sizeof(r));
if (ss == sizeof(r))
return log_debug_errno(r, "bpffs helper exited with error: %m");
if (ss < 0)
return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m");
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe.");
}
pidref_done(&bpffs_pidref);
}
/* Kill the helper process if it is no longer necessary, e.g. when the bpffs mount point is hidden. */
pidref_done_sigkill_wait(&bpffs_pidref);
if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
/* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which

View File

@ -957,6 +957,7 @@ static int append_private_bpf(
*me = (MountEntry) {
.path_const = "/sys/fs/bpf",
.mode = MOUNT_BPFFS,
.ignore = !protect_kernel_tunables, /* indicate whether we should fall back to MOUNT_READ_ONLY on failure. */
};
return 0;
}
@ -1735,11 +1736,13 @@ static int mount_overlay(const MountEntry *m) {
return 1;
}
static int mount_bpffs(const MountEntry *m, int socket_fd) {
static int mount_bpffs(const MountEntry *m, PidRef *pidref, int socket_fd, int errno_pipe) {
int r;
assert(m);
assert(pidref_is_set(pidref));
assert(socket_fd >= 0);
assert(errno_pipe >= 0);
_cleanup_close_ int fs_fd = fsopen("bpf", FSOPEN_CLOEXEC);
if (fs_fd < 0)
@ -1749,8 +1752,21 @@ static int mount_bpffs(const MountEntry *m, int socket_fd) {
if (r < 0)
return log_debug_errno(r, "Failed to send bpffs fd to child: %m");
if (read(socket_fd, (uint8_t[1]) {}, 1) < 0)
return log_debug_errno(errno, "Failed to receive data from child: %m");
r = pidref_wait_for_terminate_and_check("(sd-bpffs)", pidref, /* flags = */ 0);
if (r < 0)
return r;
/* If something strange happened with the child, let's consider this fatal, too */
if (r != EXIT_SUCCESS) {
ssize_t ss = read(errno_pipe, &r, sizeof(r));
if (ss < 0)
return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m");
if (ss != sizeof(r))
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe.");
return log_debug_errno(r, "bpffs helper exited with error: %m");
}
pidref_done(pidref);
_cleanup_close_ int mnt_fd = fsmount(fs_fd, /* flags = */ 0, /* mount_attrs = */ 0);
if (mnt_fd < 0)
@ -1817,6 +1833,23 @@ static int apply_one_mount(
log_debug("Applying namespace mount on %s", mount_entry_path(m));
if (m->mode == MOUNT_BPFFS) {
r = mount_bpffs(m, p->bpffs_pidref, p->bpffs_socket_fd, p->bpffs_errno_pipe);
if (r >= 0 ||
(!ERRNO_IS_NEG_NOT_SUPPORTED(r) && /* old kernel? */
!ERRNO_IS_NEG_PRIVILEGE(r))) /* ubuntu kernel bug? See issue #38225 */
return r;
if (m->ignore) {
log_debug_errno(r, "Failed to mount new bpffs instance, ignoring: %m");
return 0;
}
log_debug_errno(r, "Failed to mount new bpffs instance, fallback to making %s read-only, ignoring: %m", mount_entry_path(m));
m->mode = MOUNT_READ_ONLY;
m->ignore = true;
}
switch (m->mode) {
case MOUNT_INACCESSIBLE: {
@ -2019,9 +2052,6 @@ static int apply_one_mount(
case MOUNT_OVERLAY:
return mount_overlay(m);
case MOUNT_BPFFS:
return mount_bpffs(m, p->bpffs_socket_fd);
default:
assert_not_reached();
}

View File

@ -200,7 +200,9 @@ typedef struct NamespaceParameters {
PrivateTmp private_var_tmp;
PrivatePIDs private_pids;
PidRef *bpffs_pidref;
int bpffs_socket_fd;
int bpffs_errno_pipe;
} NamespaceParameters;
int setup_namespace(const NamespaceParameters *p, char **reterr_path);

View File

@ -4,25 +4,22 @@
#include <fcntl.h>
#include "fd-util.h"
#include "main-func.h"
#include "tests.h"
static int run(int argc, char *argv[]) {
static int intro(void) {
#if __LIBBPF_CURRENT_VERSION_GEQ(1, 5)
_cleanup_close_ int bpffs_fd = -EBADF, token_fd = -EBADF;
bpffs_fd = open("/sys/fs/bpf", O_RDONLY);
_cleanup_close_ int bpffs_fd = open("/sys/fs/bpf", O_RDONLY);
if (bpffs_fd < 0)
return -errno;
return log_error_errno(errno, "Failed to open '/sys/fs/bpf': %m");
token_fd = bpf_token_create(bpffs_fd, /* opts = */ NULL);
_cleanup_close_ int token_fd = bpf_token_create(bpffs_fd, /* opts = */ NULL);
if (token_fd < 0)
return -errno;
return log_error_errno(errno, "Failed to create bpf token: %m");
return 0;
return EXIT_SUCCESS;
#else
exit(77);
return log_tests_skipped("libbpf is older than v1.5");
#endif
}
DEFINE_MAIN_FUNCTION(run);
DEFINE_TEST_MAIN_WITH_INTRO(LOG_DEBUG, intro);

View File

@ -13,12 +13,26 @@ systemd-run --wait \
grep -q '/sys/fs/bpf .* ro,' /proc/mounts
# Check that with PrivateBPF=yes, a new bpffs instance is mounted
systemd-run --wait \
if ! systemd-run --wait \
-p PrivateUsers=yes \
-p PrivateMounts=yes \
-p DelegateNamespaces=mnt \
-p PrivateBPF=yes \
grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts
grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts; then
# If it does not work, maybe the kernel is old or the system has a buggy Ubuntu kernel.
# Let's check if PrivateBPF=yes is ignored gracefully in that case.
systemd-run --wait \
-p PrivateUsers=yes \
-p PrivateMounts=yes \
-p DelegateNamespaces=mnt \
-p ProtectKernelTunables=yes \
-p PrivateBPF=yes \
grep -q '/sys/fs/bpf .* ro,' /proc/mounts
# Skip all remaining tests.
exit 0
fi
# Check that when specifying the delegate arguments, the mount options are set properly
check_mount_opts() {
@ -63,9 +77,9 @@ systemd-run --wait \
/usr/lib/systemd/tests/unit-tests/manual/test-bpf-token
# Check that without the delegates, the helper aborts trying to get a token
! systemd-run --wait \
(! systemd-run --wait \
-p PrivateUsers=yes \
-p PrivateMounts=yes \
-p DelegateNamespaces=mnt \
-p PrivateBPF=yes \
/usr/lib/systemd/tests/unit-tests/manual/test-bpf-token
/usr/lib/systemd/tests/unit-tests/manual/test-bpf-token)