Compare commits

...

16 Commits

Author SHA1 Message Date
Michal Sekletar 9ce6f5fa08
Merge 15a932e452 into e0258ac886 2024-09-15 23:05:06 +02:00
Michal Sekletar 15a932e452 test: add test coverage for EnterNamespace= 2024-09-11 17:56:22 +02:00
Michal Sekletar dbd9d5c2bf coredump: rename AccessContainer= to EnterNamespace= 2024-09-11 17:56:22 +02:00
Michal Sekletar 9a2c1187df coredump: rework gather_pid_mount_tree_fd() 2024-09-11 17:56:17 +02:00
Michal Sekletar c650603902 coredump: use FORK_WAIT 2024-09-10 19:03:19 +02:00
Michal Sekletar ed81a358e1 coredump: store actual fd in appropriate variable 2024-09-10 19:03:19 +02:00
Michal Sekletar 8b22bf9ac3 coredump: use FORK_LOG to get more precise logging 2024-09-10 19:03:19 +02:00
Michal Sekletar 5b9bff7114 coredump: fix coding style 2024-09-10 19:03:19 +02:00
Michal Sekletar 84e3e4f6ec coredump: get rid of redundant double space 2024-09-10 19:03:19 +02:00
Michal Sekletar 4b8f07937a coredump: use more appropriate return code 2024-09-10 19:03:19 +02:00
Michal Sekletar e35768d4ef coredump: check for and close unexpected FDs 2024-09-10 19:03:15 +02:00
Michal Sekletar ddf6298935 coredump: fix line spacing 2024-09-10 18:30:20 +02:00
Michal Sekletar 5348e7d0c8 coredump: merge variable definitions 2024-09-10 18:30:20 +02:00
Michal Sekletar 75c5dfed33 coredump: rework attaching container mount trees 2024-09-10 18:30:15 +02:00
Michal Sekletar 0e95abfb39 analyze: don't use Yoda conditions 2024-09-10 18:16:11 +02:00
Michal Sekletar be528e9678 analyze: modernize opening ELF binary a bit 2024-09-10 18:16:07 +02:00
6 changed files with 150 additions and 80 deletions

View File

@ -110,14 +110,14 @@
</varlistentry>
<varlistentry>
<term><varname>AccessContainer=</varname></term>
<term><varname>EnterNamespace=</varname></term>
<listitem><para>Controls whether <command>systemd-coredump</command> will attempt to use the mount tree of
a process that crashed within a container. Access to the container's filesystem might be necessary to generate
a process that crashed in a PID namespace. Access to the namespace's mount tree might be necessary to generate
a fully symbolized backtrace. If set to <literal>yes</literal>, then <command>systemd-coredump</command> will
obtain the mount tree from the corresponding mount namespace and will try to generate the stack trace using the
binary and libraries from the mount namespace. Note that the coredump of the containerized process might
still be saved in <filename>/var/lib/systemd/coredump/</filename> even if <varname>AccessContainer=</varname>
binary and libraries from the mount namespace. Note that the coredump of the namespaced process might
still be saved in <filename>/var/lib/systemd/coredump/</filename> even if <varname>EnterNamespace=</varname>
is set to <literal>no</literal>. Defaults to <literal>no</literal>.</para>
<xi:include href="version-info.xml" xpointer="v257"/>

View File

@ -4,6 +4,7 @@
#include "analyze.h"
#include "analyze-inspect-elf.h"
#include "chase.h"
#include "elf-util.h"
#include "errno-util.h"
#include "fd-util.h"
@ -19,23 +20,13 @@ static int analyze_elf(char **filenames, sd_json_format_flags_t json_flags) {
STRV_FOREACH(filename, filenames) {
_cleanup_(sd_json_variant_unrefp) sd_json_variant *package_metadata = NULL;
_cleanup_(table_unrefp) Table *t = NULL;
_cleanup_free_ char *abspath = NULL, *path = NULL, *stacktrace = NULL;
_cleanup_free_ char *abspath = NULL, *stacktrace = NULL;
_cleanup_close_ int fd = -EBADF;
bool coredump = false;
r = path_make_absolute_cwd(*filename, &abspath);
if (r < 0)
return log_error_errno(r, "Could not make an absolute path out of \"%s\": %m", *filename);
path = path_join(empty_to_root(arg_root), abspath);
if (!path)
return log_oom();
path_simplify(path);
fd = RET_NERRNO(open(path, O_RDONLY|O_CLOEXEC));
fd = chase_and_open(*filename, arg_root, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC, &abspath);
if (fd < 0)
return log_error_errno(fd, "Could not open \"%s\": %m", path);
return log_error_errno(fd, "Could not open \"%s\": %m", *filename);
r = parse_elf_object(fd, abspath, arg_root, /* fork_disable_dump= */false, &stacktrace, &package_metadata);
if (r < 0)
@ -65,7 +56,7 @@ static int analyze_elf(char **filenames, sd_json_format_flags_t json_flags) {
* metadata is parsed recursively in core files, so there might be
* multiple modules. */
if (STR_IN_SET(module_name, "elfType", "elfArchitecture")) {
if (streq(module_name, "elfType") && streq("coredump", sd_json_variant_string(module_json)))
if (streq(module_name, "elfType") && streq(sd_json_variant_string(module_json), "coredump"))
coredump = true;
r = table_add_many(

View File

@ -2,11 +2,15 @@
#include <errno.h>
#include <stdio.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/statvfs.h>
#include <sys/auxv.h>
#include <sys/xattr.h>
#include <unistd.h>
#if WANT_LINUX_FS_H
#include <linux/fs.h>
#endif
#include "sd-daemon.h"
#include "sd-journal.h"
@ -86,6 +90,8 @@
* size. See DATA_SIZE_MAX in journal-importer.h. */
assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
#define MOUNT_TREE_ROOT "/run/systemd/mount-rootfs"
enum {
/* We use these as array indexes for our process metadata cache.
*
@ -167,7 +173,7 @@ static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;
static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
static uint64_t arg_keep_free = UINT64_MAX;
static uint64_t arg_max_use = UINT64_MAX;
static bool arg_access_container = false;
static bool arg_enter_namespace = false;
static int parse_config(void) {
static const ConfigTableItem items[] = {
@ -179,9 +185,9 @@ static int parse_config(void) {
{ "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free },
{ "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use },
#if HAVE_DWFL_SET_SYSROOT
{ "Coredump", "AccessContainer", config_parse_bool, 0, &arg_access_container },
{ "Coredump", "EnterNamespace", config_parse_bool, 0, &arg_enter_namespace },
#else
{ "Coredump", "AccessContainer", config_parse_warn_compat, DISABLED_CONFIGURATION, 0 },
{ "Coredump", "EnterNamespace", config_parse_warn_compat, DISABLED_CONFIGURATION, 0 },
#endif
{}
};
@ -782,30 +788,32 @@ static int change_uid_gid(const Context *context) {
return drop_privileges(uid, gid, 0);
}
static int setup_container_mount_tree(int mount_tree_fd, char **container_root) {
static int attach_mount_tree(int mount_tree_fd) {
_cleanup_free_ char *root = NULL;
int r;
assert(mount_tree_fd >= 0);
assert(container_root);
r = unshare(CLONE_NEWNS);
r = detach_mount_namespace();
if (r < 0)
return log_warning_errno(errno, "Failed to unshare mount namespace: %m");
return log_warning_errno(r, "Failed to detach mount namespace: %m");
r = mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL);
r = mkdir_label(MOUNT_TREE_ROOT, 0555);
if (r < 0 && r != -EEXIST)
return log_warning_errno(r, "Failed to create directory: %m");
r = mount_setattr(mount_tree_fd, "", AT_EMPTY_PATH,
&(struct mount_attr) {
.attr_set = MOUNT_ATTR_RDONLY|MOUNT_ATTR_NOSUID|MOUNT_ATTR_NODEV|MOUNT_ATTR_NOEXEC,
.propagation = MS_SLAVE,
}, sizeof(struct mount_attr));
if (r < 0)
return log_warning_errno(errno, "Failed to disable mount propagation: %m");
return log_warning_errno(r, "Failed to change properties mount tree: %m");
r = mkdtemp_malloc("/tmp/systemd-coredump-root-XXXXXX", &root);
if (r < 0)
return log_warning_errno(r, "Failed to create temporary directory: %m");
r = move_mount(mount_tree_fd, "", -EBADF, root, MOVE_MOUNT_F_EMPTY_PATH);
r = move_mount(mount_tree_fd, "", -EBADF, MOUNT_TREE_ROOT, MOVE_MOUNT_F_EMPTY_PATH);
if (r < 0)
return log_warning_errno(errno, "Failed to move mount tree: %m");
*container_root = TAKE_PTR(root);
return 0;
}
@ -817,10 +825,8 @@ static int submit_coredump(
_cleanup_(sd_json_variant_unrefp) sd_json_variant *json_metadata = NULL;
_cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
_cleanup_free_ char *filename = NULL, *coredump_data = NULL;
_cleanup_free_ char *stacktrace = NULL;
_cleanup_free_ char *root = NULL;
const char *module_name;
_cleanup_free_ char *filename = NULL, *coredump_data = NULL, *stacktrace = NULL;
const char *module_name, *root = MOUNT_TREE_ROOT;
uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
bool truncated = false, written = false;
sd_json_variant *module_json;
@ -856,10 +862,10 @@ static int submit_coredump(
(void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
}
if (mount_tree_fd >= 0 && arg_access_container) {
r = setup_container_mount_tree(mount_tree_fd, &root);
if (mount_tree_fd >= 0) {
r = attach_mount_tree(mount_tree_fd);
if (r < 0)
log_warning_errno(r, "Failed to setup container mount tree, ignoring: %m");
root = "/";
}
/* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the
@ -869,6 +875,7 @@ static int submit_coredump(
r = change_uid_gid(context);
if (r < 0)
return log_error_errno(r, "Failed to drop privileges: %m");
if (written) {
/* Try to get a stack trace if we can */
if (coredump_size > arg_process_size_max)
@ -1112,18 +1119,43 @@ static int process_socket(int fd) {
/* We have all FDs we need let's take a shortcut here. */
break;
} else {
struct cmsghdr *cmsg;
unsigned n_fds = 0;
found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
if (found)
if (first && found) {
/* This is the first message that carries file descriptors, maybe there will be
* one more that actually contains an array of two descriptors. */
assert(input_fd < 0);
input_fd = *CMSG_TYPED_DATA(found, int);
}
first = false;
/* This is the first message that carries file descriptors, maybe there will be one more that actually contains array of descriptors. */
if (first) {
first = false;
continue;
}
continue;
} else if (first && !found) {
/* This is the first message of zero length and it has no file descriptor,
* this is a protocol violation so let's bail out. */
cmsg_close_all(&mh);
r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
"Received zero length message with no file descriptor.");
goto finish;
}
break;
/* This is the second iteration and we didn't find an array of two FDs, hence we either
* have no FDs, which is OK and we can break, or we have some other number of FDs
* and somebody is playing games with us. So let's check for that. */
CMSG_FOREACH(cmsg, &mh)
if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS)
n_fds++;
if (n_fds == 0)
break;
cmsg_close_all(&mh);
r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
"Received '%u' unexpected file descriptors.", n_fds);
goto finish;
}
} else
cmsg_close_all(&mh);
@ -1628,42 +1660,59 @@ static int forward_coredump_to_container(Context *context) {
return 0;
}
static int gather_pid_mount_tree_fd(const Context *context) {
static int gather_pid_mount_tree_fd(const Context *context, int *ret_fd) {
_cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF;
_cleanup_close_pair_ int pair[2] = EBADF_PAIR;
int fd = -EBADF, r;
pid_t child;
assert(context);
assert(ret_fd);
/* Don't bother preparing environment if we can't pass it to libdwfl. */
#if !HAVE_DWFL_SET_SYSROOT
return -EBADF;
r = 0;
goto finish;
#endif
if (!arg_access_container)
return -EBADF;
if (!arg_enter_namespace) {
r = 0;
goto finish;
}
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair) < 0)
return log_error_errno(errno, "Failed to create socket pair: %m");
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair) < 0) {
r = log_error_errno(errno, "Failed to create socket pair: %m");
goto finish;
}
r = namespace_open(context->pid, NULL, &mntns_fd, NULL, NULL, &root_fd);
r = namespace_open(context->pid, NULL, &mntns_fd, NULL, NULL, &root_fd);
if (r < 0) {
log_error_errno(r, "Failed to open mount namespace of crashing process: %m");
goto finish;
}
r = namespace_fork("(sd-mount-tree-ns)",
"(sd-mount-tree)",
/* except_fds= */ NULL,
/* n_except_fds= */ 0,
FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT,
/* pidns_fd= */ -EBADF,
mntns_fd,
/* netns_fd= */ -EBADF,
/* userns_fd= */ -EBADF,
root_fd,
NULL);
if (r < 0)
return log_error_errno(r, "Failed to open mount namespace of crashing process: %m");
r = namespace_fork("(sd-mount-tree-ns)", "(sd-mount-tree)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, -1, mntns_fd, -1, -1, root_fd, &child);
if (r < 0)
return log_error_errno(r, "Failed to fork(): %m");
goto finish;
if (r == 0) {
pair[0] = safe_close(pair[0]);
r = open_tree(-EBADF, "/", AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE);
if (r < 0) {
fd = open_tree(-EBADF, "/", AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE);
if (fd < 0) {
log_error_errno(errno, "Failed to clone mount tree: %m");
_exit(EXIT_FAILURE);
}
r = send_one_fd(pair[1], r, 0);
r = send_one_fd(pair[1], fd, 0);
if (r < 0) {
log_error_errno(r, "Failed to send mount tree to parent: %m");
_exit(EXIT_FAILURE);
@ -1674,17 +1723,17 @@ static int gather_pid_mount_tree_fd(const Context *context) {
pair[1] = safe_close(pair[1]);
r = wait_for_terminate_and_check("(sd-mount-tree-ns)", child, 0);
if (r < 0)
return log_error_errno(r, "Failed to wait for child: %m");
if (r != EXIT_SUCCESS)
return log_error_errno(SYNTHETIC_ERRNO(ECHILD), "Child died abnormally.");
r = receive_one_fd(pair[0], MSG_DONTWAIT);
if (r < 0) {
log_error_errno(r, "Failed to receive mount tree: %m");
goto finish;
}
fd = receive_one_fd(pair[0], MSG_DONTWAIT);
if (fd < 0)
return log_error_errno(fd, "Failed to receive mount tree: %m");
return fd;
fd = r;
r = 0;
finish:
*ret_fd = TAKE_FD(fd);
return r;
}
static int process_kernel(int argc, char* argv[]) {
@ -1736,11 +1785,9 @@ static int process_kernel(int argc, char* argv[]) {
if (r >= 0)
return 0;
r = gather_pid_mount_tree_fd(&context);
if (r < 0 && r != -EBADF)
r = gather_pid_mount_tree_fd(&context, &mount_tree_fd);
if (r < 0)
log_warning_errno(r, "Failed to access the mount tree of a container, ignoring: %m");
else
mount_tree_fd = r;
}
/* If this is PID 1 disable coredump collection, we'll unlikely be able to process

View File

@ -25,4 +25,4 @@
#JournalSizeMax=767M
#MaxUse=
#KeepFree=
#AccessContainer=no
#EnterNamespace=no

View File

@ -36,6 +36,7 @@ test_append_files() {
instmods vmw_vsock_virtio_transport
instmods vsock_loopback
instmods vmw_vsock_vmci_transport
inst_binary gcc
generate_module_dependencies
}

View File

@ -8,13 +8,15 @@ set -o pipefail
# Make sure the binary name fits into 15 characters
CORE_TEST_BIN="/tmp/test-dump"
CORE_STACKTRACE_TEST_BIN="/tmp/test-stacktrace-dump"
MAKE_STACKTRACE_DUMP="/tmp/make-stacktrace-dump"
CORE_TEST_UNPRIV_BIN="/tmp/test-usr-dump"
MAKE_DUMP_SCRIPT="/tmp/make-dump"
# Unset $PAGER so we don't have to use --no-pager everywhere
export PAGER=
at_exit() {
rm -fv -- "$CORE_TEST_BIN" "$CORE_TEST_UNPRIV_BIN" "$MAKE_DUMP_SCRIPT"
rm -fv -- "$CORE_TEST_BIN" "$CORE_TEST_UNPRIV_BIN" "$MAKE_DUMP_SCRIPT" "$MAKE_STACKTRACE_DUMP"
}
trap at_exit EXIT
@ -225,3 +227,32 @@ systemd-run -t --property CoredumpFilter=default ls /tmp
(! coredumpctl dump --output=/dev/null --output=/dev/null "$CORE_TEST_BIN")
(! coredumpctl debug --debugger=/bin/false)
(! coredumpctl debug --debugger=/bin/true --debugger-arguments='"')
# Test for EnterNamespace= feature
#
# A helper script crashes a freshly compiled binary that lives on a tmpfs mounted
# inside a private mount namespace. The stack trace is then expected to contain
# the baz/bar/foo frames only when EnterNamespace=yes is configured.
if pkgconf --atleast-version 0.192 libdw ; then
    # dwfl_set_sysroot() is supported only in libdw-0.192 or newer.

    # The outer heredoc delimiter (END) is unquoted on purpose, so that
    # $CORE_STACKTRACE_TEST_BIN is expanded now, while the helper script is written.
    cat > "$MAKE_STACKTRACE_DUMP" <<END
#!/bin/bash
mount -t tmpfs tmpfs /tmp
gcc -xc -O0 -g -o $CORE_STACKTRACE_TEST_BIN - <<EOF
void baz(void) { int *x = 0; *x = 42; }
void bar(void) { baz(); }
void foo(void) { bar(); }
int main(void) { foo(); return 0;}
EOF
$CORE_STACKTRACE_TEST_BIN
END
    chmod +x "$MAKE_STACKTRACE_DUMP"

    mkdir -p /run/systemd/coredump.conf.d/

    # With EnterNamespace=no the symbolized frames must NOT show up.
    # The helper is expected to fail (the test binary crashes), hence "|| :".
    # grep -z treats the whole output as a single record so the pattern can span lines.
    printf '[Coredump]\nEnterNamespace=no' >/run/systemd/coredump.conf.d/99-enter-namespace.conf
    unshare --pid --fork --mount-proc --mount --uts --ipc --net /bin/bash -c "$MAKE_STACKTRACE_DUMP" || :
    coredumpctl -1 info "$CORE_STACKTRACE_TEST_BIN" | grep -zvqE 'baz.*bar.*foo'

    # With EnterNamespace=yes the frames must be present, innermost first.
    printf '[Coredump]\nEnterNamespace=yes' >/run/systemd/coredump.conf.d/99-enter-namespace.conf
    unshare --pid --fork --mount-proc --mount --uts --ipc --net /bin/bash -c "$MAKE_STACKTRACE_DUMP" || :
    coredumpctl -1 info "$CORE_STACKTRACE_TEST_BIN" | grep -zqE 'baz.*bar.*foo'

    # Don't leave the drop-in behind, so that EnterNamespace=yes does not stay
    # in effect once this test section is done.
    rm -f /run/systemd/coredump.conf.d/99-enter-namespace.conf
else
    echo "libdw doesn't support setting sysroot, skipping EnterNamespace= test"
fi