1
0
mirror of https://github.com/systemd/systemd synced 2026-03-29 11:14:50 +02:00

Compare commits

..

11 Commits

Author SHA1 Message Date
Franck Bui
964ccab828 mount-util: fix fd_is_mount_point() when both the parent and directory are network fs
The second call to name_to_handle_at_loop() didn't check for the specific
errors that can happen when the parent dir is mounted via NFS. Instead of
falling back, as is done for the child dir, fd_is_mount_point() failed in
this case.
2021-10-01 11:11:45 +02:00
Zbigniew Jędrzejewski-Szmek
8e9b3bcf12
Merge pull request #20894 from andir/editorconfig
Set maximum line length in editorconfig for C and XML files
2021-10-01 10:51:48 +02:00
Lukas Senionis
6ca3d087e2 reduce the fuzz values in evdev hwdb for Asus UX362FA 2021-10-01 10:49:01 +02:00
Yu Watanabe
1924f26d2a
Merge pull request #20777 from benzea/benzea/fix-seccomp-filter
seccomp: Always install filters for native architecture
2021-10-01 15:12:55 +09:00
Michael Biebl
528dd6a423 networkd-test: fix resolved_domain_restricted_dns
megasearch.net was meant to be a non-existing bogus domain, and had been
for a long time. But it seems some domain grabber recently registered
it, and it's an actual thing now:

  $ host megasearch.net
  megasearch.net has address 207.148.248.143

This causes the test to fail randomly.

Use search.example.com instead which yields

  $ host search.example.com
  Host search.example.com not found: 3(NXDOMAIN)

Fixes: #18357
2021-10-01 14:34:00 +09:00
Frantisek Sumsal
e72be068b1 test: use a less restrictive portable profile when running w/ sanitizers
Since f833df3 we now actually use the seccomp rules defined in portable
profiles. However, the default one is too restrictive for sanitizers, as
it blocks certain syscalls required by LSan. Mitigate this by using the
'trusted' profile when running TEST-29-PORTABLE under sanitizers.
2021-09-30 14:23:27 +02:00
Andreas Rammhold
c5f26a0250
editorconfig: set maximum line length to 109 for man/*.xml files 2021-09-30 13:45:34 +02:00
Andreas Rammhold
83f0ec7978
editorconfig: enforce maximum line length in .c and .h files 2021-09-30 13:45:34 +02:00
Benjamin Berg
e975a94559 test: Add failing/non-failing syscall filter test setting architecture
This adds a high level test verifying that syscall filtering in
combination with a simple architecture filter for the "native"
architecture works fine.
2021-09-30 08:06:25 +09:00
Benjamin Berg
08bf703cc1 test: Check that "native" architecture is always filtered 2021-09-30 08:06:19 +09:00
Benjamin Berg
f833df3848 seccomp: Always install filters for native architecture
The commit 6597686865ff ("seccomp: don't install filters for archs that
can't use syscalls") introduced a regression where filters may not be
installed for the "native" architecture. This means that setting
SystemCallArchitectures=native for a unit effectively disables the
SystemCallFilter= and SystemCallLog= options.

Conceptually, we have two filter stages:
 1. architecture used for syscall (SystemCallArchitectures=)
 2. syscall + architecture combination (SystemCallFilter=)

The above commit tried to optimize the filter generation by skipping the
second level filtering when it is not required.

However, systemd will never fully block the "native" architecture using
the first level filter. This makes the code a lot simpler, as systemd
can execve() the target binary using its own architecture. And, it
should be perfectly fine as the "native" architecture will always be the
one with the most restrictive seccomp filtering.

Said differently, the bug arises because (on x86_64):
 1. x86_64 is permitted by libseccomp already
 2. native != x86_64
 3. the loop wants to block x86_64 because the permitted set only
    contains "native" (i.e. "native" != "x86_64")
 4. x86_64 is marked as blocked in seccomp_local_archs

This leaves us with an inconsistency: the architecture is marked as blocked in
the seccomp_local_archs array, but it is still allowed by libseccomp, i.e. we
will skip generating filter stage 2 without having stage 1 in place.

The fix is simple: we just skip the native architecture when looping over
seccomp_local_archs. This way the inconsistency cannot happen.
2021-09-30 08:04:59 +09:00
10 changed files with 153 additions and 46 deletions

View File

@ -18,6 +18,7 @@ charset = utf-8
[*.{c,h}] [*.{c,h}]
indent_style = space indent_style = space
indent_size = 8 indent_size = 8
max_line_length = 109
[*.sh] [*.sh]
indent_style = space indent_style = space
@ -30,3 +31,4 @@ indent_size = 8
[man/*.xml] [man/*.xml]
indent_size = 2 indent_size = 2
indent_style = space indent_style = space
max_line_length = 109

View File

@ -191,10 +191,10 @@ evdev:name:Elan Touchpad:dmi:*:svnASUSTeKCOMPUTERINC.:pnUX305UA:*
# Asus UX362FA # Asus UX362FA
evdev:name:ELAN1401:00 04F3:30DC Touchpad:dmi:*:svnASUSTeKCOMPUTERINC.:pnZenBookUX362FA_UX362FA:* evdev:name:ELAN1401:00 04F3:30DC Touchpad:dmi:*:svnASUSTeKCOMPUTERINC.:pnZenBookUX362FA_UX362FA:*
EVDEV_ABS_00=:::16 EVDEV_ABS_00=:::8
EVDEV_ABS_01=:::16 EVDEV_ABS_01=:::8
EVDEV_ABS_35=:::16 EVDEV_ABS_35=:::8
EVDEV_ABS_36=:::16 EVDEV_ABS_36=:::8
######################################### #########################################
# Bangho # Bangho

View File

@ -157,6 +157,19 @@ static bool filename_possibly_with_slash_suffix(const char *s) {
return filename_is_valid(copied); return filename_is_valid(copied);
} }
static bool is_name_to_handle_at_fatal_error(int err) {
        /* Classifies a failure of name_to_handle_at(). Takes a negative errno-style value and
         * returns false for the errors that are merely due to the context we run in, and true
         * for everything else, i.e. for errors that are not supposed to happen. */

        assert(err < 0);

        switch (err) {

        case -EOPNOTSUPP: /* the kernel or file system does not support name_to_handle_at() */
        case -ENOSYS:     /* the kernel does not support name_to_handle_at() at all */
        case -EACCES:     /* the syscall was blocked, maybe through seccomp because we are */
        case -EPERM:      /* running inside of a container */
        case -EOVERFLOW:  /* the mount point is not triggered yet (think nfs4) */
        case -EINVAL:     /* general name_to_handle_at() flakiness */
                return false;

        default:
                return true;
        }
}
int fd_is_mount_point(int fd, const char *filename, int flags) { int fd_is_mount_point(int fd, const char *filename, int flags) {
_cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL; _cleanup_free_ struct file_handle *h = NULL, *h_parent = NULL;
int mount_id = -1, mount_id_parent = -1; int mount_id = -1, mount_id_parent = -1;
@ -206,39 +219,40 @@ int fd_is_mount_point(int fd, const char *filename, int flags) {
return false; /* symlinks are never mount points */ return false; /* symlinks are never mount points */
r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags); r = name_to_handle_at_loop(fd, filename, &h, &mount_id, flags);
if (IN_SET(r, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL)) if (r < 0) {
/* This kernel does not support name_to_handle_at() at all (ENOSYS), or the syscall was blocked if (is_name_to_handle_at_fatal_error(r))
* (EACCES/EPERM; maybe through seccomp, because we are running inside of a container?), or the mount
* point is not triggered yet (EOVERFLOW, think nfs4), or some general name_to_handle_at() flakiness
* (EINVAL): fall back to simpler logic. */
goto fallback_fdinfo;
else if (r == -EOPNOTSUPP)
/* This kernel or file system does not support name_to_handle_at(), hence let's see if the upper fs
* supports it (in which case it is a mount point), otherwise fall back to the traditional stat()
* logic */
nosupp = true;
else if (r < 0)
return r; return r;
if (r != -EOPNOTSUPP)
goto fallback_fdinfo;
/* This kernel or file system does not support name_to_handle_at(), hence let's see
* if the upper fs supports it (in which case it is a mount point), otherwise fall
* back to the traditional stat() logic */
nosupp = true;
}
r = name_to_handle_at_loop(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH); r = name_to_handle_at_loop(fd, "", &h_parent, &mount_id_parent, AT_EMPTY_PATH);
if (r == -EOPNOTSUPP) { if (r < 0) {
if (nosupp) if (is_name_to_handle_at_fatal_error(r))
/* Neither parent nor child do name_to_handle_at()? We have no choice but to fall back. */
goto fallback_fdinfo;
else
/* The parent can't do name_to_handle_at() but the directory we are interested in can? If so,
* it must be a mount point. */
return 1;
} else if (r < 0)
return r; return r;
if (r != -EOPNOTSUPP)
goto fallback_fdinfo;
if (nosupp)
/* Both the parent and the directory can't do name_to_handle_at() */
goto fallback_fdinfo;
/* The parent can do name_to_handle_at() but the directory we are interested in can't? If so, it must /* The parent can't do name_to_handle_at() but the directory we are
* be a mount point. */ * interested in can? If so, it must be a mount point. */
return 1;
}
/* The parent can do name_to_handle_at() but the directory we are interested in can't? If
* so, it must be a mount point. */
if (nosupp) if (nosupp)
return 1; return 1;
/* If the file handle for the directory we are interested in and its parent are identical, we assume /* If the file handle for the directory we are interested in and its parent are identical,
* this is the root directory, which is a mount point. */ * we assume this is the root directory, which is a mount point. */
if (h->handle_bytes == h_parent->handle_bytes && if (h->handle_bytes == h_parent->handle_bytes &&
h->handle_type == h_parent->handle_type && h->handle_type == h_parent->handle_type &&
@ -338,10 +352,10 @@ int path_get_mnt_id(const char *path, int *ret) {
} }
r = name_to_handle_at_loop(AT_FDCWD, path, NULL, ret, 0); r = name_to_handle_at_loop(AT_FDCWD, path, NULL, ret, 0);
if (IN_SET(r, -EOPNOTSUPP, -ENOSYS, -EACCES, -EPERM, -EOVERFLOW, -EINVAL)) /* kernel/fs don't support this, or seccomp blocks access, or untriggered mount, or name_to_handle_at() is flaky */ if (r == 0 || is_name_to_handle_at_fatal_error(r))
return fd_fdinfo_mnt_id(AT_FDCWD, path, 0, ret);
return r; return r;
return fd_fdinfo_mnt_id(AT_FDCWD, path, 0, ret);
} }
bool fstype_is_network(const char *fstype) { bool fstype_is_network(const char *fstype) {

View File

@ -1789,6 +1789,10 @@ int seccomp_restrict_archs(Set *archs) {
for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) { for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
uint32_t arch = seccomp_local_archs[i]; uint32_t arch = seccomp_local_archs[i];
/* See above comment, our "native" architecture is never blocked. */
if (arch == seccomp_arch_native())
continue;
/* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */ /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
if (arch == SECCOMP_LOCAL_ARCH_BLOCKED) if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
continue; continue;

View File

@ -713,8 +713,10 @@ static void test_exec_systemcallfilter(Manager *m) {
test(m, "exec-systemcallfilter-not-failing.service", 0, CLD_EXITED); test(m, "exec-systemcallfilter-not-failing.service", 0, CLD_EXITED);
test(m, "exec-systemcallfilter-not-failing2.service", 0, CLD_EXITED); test(m, "exec-systemcallfilter-not-failing2.service", 0, CLD_EXITED);
test(m, "exec-systemcallfilter-not-failing3.service", 0, CLD_EXITED);
test(m, "exec-systemcallfilter-failing.service", SIGSYS, CLD_KILLED); test(m, "exec-systemcallfilter-failing.service", SIGSYS, CLD_KILLED);
test(m, "exec-systemcallfilter-failing2.service", SIGSYS, CLD_KILLED); test(m, "exec-systemcallfilter-failing2.service", SIGSYS, CLD_KILLED);
test(m, "exec-systemcallfilter-failing3.service", SIGSYS, CLD_KILLED);
r = find_executable("python3", NULL); r = find_executable("python3", NULL);
if (r < 0) { if (r < 0) {

View File

@ -890,6 +890,66 @@ static void test_load_syscall_filter_set_raw(void) {
assert_se(wait_for_terminate_and_check("syscallrawseccomp", pid, WAIT_LOG) == EXIT_SUCCESS); assert_se(wait_for_terminate_and_check("syscallrawseccomp", pid, WAIT_LOG) == EXIT_SUCCESS);
} }
/* Checks that syscall filters are still enforced after seccomp_restrict_archs() has been called
 * with an empty set and with a set that only contains SCMP_ARCH_NATIVE. All checks run in a
 * forked child; the parent only asserts that the child exits with EXIT_SUCCESS. The test is
 * skipped if seccomp is not available or if we lack the privileges to use it. */
static void test_native_syscalls_filtered(void) {
pid_t pid;
log_info("/* %s */", __func__);
if (!is_seccomp_available()) {
log_notice("Seccomp not available, skipping %s", __func__);
return;
}
if (!have_seccomp_privs()) {
log_notice("Not privileged, skipping %s", __func__);
return;
}
pid = fork();
assert_se(pid >= 0);
if (pid == 0) {
_cleanup_set_free_ Set *arch_s = NULL;
_cleanup_hashmap_free_ Hashmap *s = NULL;
/* Passing "native" or an empty set is equivalent, just do both here. */
assert_se(arch_s = set_new(NULL));
assert_se(seccomp_restrict_archs(arch_s) >= 0);
assert_se(set_put(arch_s, SCMP_ARCH_NATIVE) >= 0);
assert_se(seccomp_restrict_archs(arch_s) >= 0);
/* Only the architecture restriction is in place so far: access() and poll() are
 * expected to work. */
assert_se(access("/", F_OK) >= 0);
assert_se(poll(NULL, 0, 0) == 0);
/* Load a filter without any syscall table (NULL); access() and poll() are expected
 * to keep working afterwards. */
assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, NULL, scmp_act_kill_process(), true) >= 0);
assert_se(access("/", F_OK) >= 0);
assert_se(poll(NULL, 0, 0) == 0);
/* Collect whichever syscalls of the access() family are defined for this build. Keys
 * are the syscall number plus one and values are -1.
 * NOTE(review): the "+ 1" presumably keeps syscall number 0 from turning into a NULL
 * key, and -1 presumably means "no per-syscall errno" -- confirm against
 * seccomp_load_syscall_filter_set_raw(). */
assert_se(s = hashmap_new(NULL));
#if defined __NR_access && __NR_access >= 0
assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_access + 1), INT_TO_PTR(-1)) >= 0);
log_debug("has access()");
#endif
#if defined __NR_faccessat && __NR_faccessat >= 0
assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_faccessat + 1), INT_TO_PTR(-1)) >= 0);
log_debug("has faccessat()");
#endif
#if defined __NR_faccessat2 && __NR_faccessat2 >= 0
assert_se(hashmap_put(s, UINT32_TO_PTR(__NR_faccessat2 + 1), INT_TO_PTR(-1)) >= 0);
log_debug("has faccessat2()");
#endif
/* At least one of the syscalls above must have been added to the table. */
assert_se(!hashmap_isempty(s));
/* Load the table with SCMP_ACT_ERRNO(EUCLEAN) as action: from now on access() is
 * expected to fail with EUCLEAN, which shows that the filter is enforced. */
assert_se(seccomp_load_syscall_filter_set_raw(SCMP_ACT_ALLOW, s, SCMP_ACT_ERRNO(EUCLEAN), true) >= 0);
assert_se(access("/", F_OK) < 0);
assert_se(errno == EUCLEAN);
_exit(EXIT_SUCCESS);
}
/* Parent: the child must have passed every assertion and exited cleanly. */
assert_se(wait_for_terminate_and_check("nativeseccomp", pid, WAIT_LOG) == EXIT_SUCCESS);
}
static void test_lock_personality(void) { static void test_lock_personality(void) {
unsigned long current; unsigned long current;
pid_t pid; pid_t pid;
@ -1171,6 +1231,7 @@ int main(int argc, char *argv[]) {
test_memory_deny_write_execute_shmat(); test_memory_deny_write_execute_shmat();
test_restrict_archs(); test_restrict_archs();
test_load_syscall_filter_set_raw(); test_load_syscall_filter_set_raw();
test_native_syscalls_filtered();
test_lock_personality(); test_lock_personality();
test_restrict_suid_sgid(); test_restrict_suid_sgid();

View File

@ -635,7 +635,7 @@ Name={}
[Network] [Network]
DHCP=ipv4 DHCP=ipv4
IPv6AcceptRA=False IPv6AcceptRA=False
DNSSECNegativeTrustAnchors=megasearch.net DNSSECNegativeTrustAnchors=search.example.com
'''.format(self.iface)) '''.format(self.iface))
# create second device/dnsmasq for a .company/.lab VPN interface # create second device/dnsmasq for a .company/.lab VPN interface
@ -681,8 +681,8 @@ DNSSECNegativeTrustAnchors=company lab
self.assertIn(b'kettle.cantina.company: 10.241.4.4', out) self.assertIn(b'kettle.cantina.company: 10.241.4.4', out)
# test general domains # test general domains
out = subprocess.check_output(['resolvectl', 'query', 'megasearch.net']) out = subprocess.check_output(['resolvectl', 'query', 'search.example.com'])
self.assertIn(b'megasearch.net: 192.168.42.1', out) self.assertIn(b'search.example.com: 192.168.42.1', out)
with open(self.dnsmasq_log) as f: with open(self.dnsmasq_log) as f:
general_log = f.read() general_log = f.read()
@ -696,8 +696,8 @@ DNSSECNegativeTrustAnchors=company lab
self.assertNotIn('.company', general_log) self.assertNotIn('.company', general_log)
# general domains should not be sent to the VPN DNS # general domains should not be sent to the VPN DNS
self.assertRegex(general_log, 'query.*megasearch.net') self.assertRegex(general_log, 'query.*search.example.com')
self.assertNotIn('megasearch.net', vpn_log) self.assertNotIn('search.example.com', vpn_log)
def test_resolved_etc_hosts(self): def test_resolved_etc_hosts(self):
'''resolved queries to /etc/hosts''' '''resolved queries to /etc/hosts'''

View File

@ -0,0 +1,9 @@
[Unit]
Description=Test for SystemCallFilter
[Service]
ExecStart=/bin/sh -c '/bin/echo "This should not be seen"'
Type=oneshot
LimitCORE=0
SystemCallArchitectures=native
SystemCallFilter=~write open execve fexecve execveat exit_group close mmap munmap fstat DONOTEXIST

View File

@ -0,0 +1,8 @@
[Unit]
Description=Test for SystemCallFilter
[Service]
ExecStart=/bin/sh -c 'echo "Foo bar"'
Type=oneshot
SystemCallArchitectures=native
SystemCallFilter=

View File

@ -4,6 +4,13 @@
set -eux set -eux
set -o pipefail set -o pipefail
ARGS=()
if [[ -v ASAN_OPTIONS || -v UBSAN_OPTIONS ]]; then
# If we're running under sanitizers, we need to use a less restrictive
# profile, otherwise the syscalls LSan needs would get blocked by seccomp
ARGS+=(--profile=trusted)
fi
export SYSTEMD_LOG_LEVEL=debug export SYSTEMD_LOG_LEVEL=debug
mkdir -p /run/systemd/system/systemd-portabled.service.d/ mkdir -p /run/systemd/system/systemd-portabled.service.d/
cat <<EOF >/run/systemd/system/systemd-portabled.service.d/override.conf cat <<EOF >/run/systemd/system/systemd-portabled.service.d/override.conf
@ -11,7 +18,7 @@ cat <<EOF >/run/systemd/system/systemd-portabled.service.d/override.conf
Environment=SYSTEMD_LOG_LEVEL=debug Environment=SYSTEMD_LOG_LEVEL=debug
EOF EOF
portablectl attach --now --runtime /usr/share/minimal_0.raw app0 portablectl "${ARGS[@]}" attach --now --runtime /usr/share/minimal_0.raw app0
systemctl is-active app0.service systemctl is-active app0.service
systemctl is-active app0-foo.service systemctl is-active app0-foo.service
@ -21,7 +28,7 @@ systemctl is-active app0-bar.service && exit 1
set -e set -e
set -o pipefail set -o pipefail
portablectl reattach --now --runtime /usr/share/minimal_1.raw app0 portablectl "${ARGS[@]}" reattach --now --runtime /usr/share/minimal_1.raw app0
systemctl is-active app0.service systemctl is-active app0.service
systemctl is-active app0-bar.service systemctl is-active app0-bar.service
@ -42,7 +49,7 @@ portablectl list | grep -q -F "No images."
unsquashfs -dest /tmp/minimal_0 /usr/share/minimal_0.raw unsquashfs -dest /tmp/minimal_0 /usr/share/minimal_0.raw
unsquashfs -dest /tmp/minimal_1 /usr/share/minimal_1.raw unsquashfs -dest /tmp/minimal_1 /usr/share/minimal_1.raw
portablectl attach --copy=symlink --now --runtime /tmp/minimal_0 app0 portablectl "${ARGS[@]}" attach --copy=symlink --now --runtime /tmp/minimal_0 app0
systemctl is-active app0.service systemctl is-active app0.service
systemctl is-active app0-foo.service systemctl is-active app0-foo.service
@ -52,7 +59,7 @@ systemctl is-active app0-bar.service && exit 1
set -e set -e
set -o pipefail set -o pipefail
portablectl reattach --now --enable --runtime /tmp/minimal_1 app0 portablectl "${ARGS[@]}" reattach --now --enable --runtime /tmp/minimal_1 app0
systemctl is-active app0.service systemctl is-active app0.service
systemctl is-active app0-bar.service systemctl is-active app0-bar.service
@ -68,21 +75,21 @@ portablectl detach --now --enable --runtime /tmp/minimal_1 app0
portablectl list | grep -q -F "No images." portablectl list | grep -q -F "No images."
portablectl attach --now --runtime --extension /usr/share/app0.raw /usr/share/minimal_0.raw app0 portablectl "${ARGS[@]}" attach --now --runtime --extension /usr/share/app0.raw /usr/share/minimal_0.raw app0
systemctl is-active app0.service systemctl is-active app0.service
portablectl reattach --now --runtime --extension /usr/share/app0.raw /usr/share/minimal_1.raw app0 portablectl "${ARGS[@]}" reattach --now --runtime --extension /usr/share/app0.raw /usr/share/minimal_1.raw app0
systemctl is-active app0.service systemctl is-active app0.service
portablectl detach --now --runtime --extension /usr/share/app0.raw /usr/share/minimal_1.raw app0 portablectl detach --now --runtime --extension /usr/share/app0.raw /usr/share/minimal_1.raw app0
portablectl attach --now --runtime --extension /usr/share/app1.raw /usr/share/minimal_0.raw app1 portablectl "${ARGS[@]}" attach --now --runtime --extension /usr/share/app1.raw /usr/share/minimal_0.raw app1
systemctl is-active app1.service systemctl is-active app1.service
portablectl reattach --now --runtime --extension /usr/share/app1.raw /usr/share/minimal_1.raw app1 portablectl "${ARGS[@]}" reattach --now --runtime --extension /usr/share/app1.raw /usr/share/minimal_1.raw app1
systemctl is-active app1.service systemctl is-active app1.service
@ -95,7 +102,7 @@ mount /usr/share/app1.raw /tmp/app1
mount /usr/share/minimal_0.raw /tmp/rootdir mount /usr/share/minimal_0.raw /tmp/rootdir
mount -t overlay overlay -o lowerdir=/tmp/app1:/tmp/rootdir /tmp/overlay mount -t overlay overlay -o lowerdir=/tmp/app1:/tmp/rootdir /tmp/overlay
portablectl attach --copy=symlink --now --runtime /tmp/overlay app1 portablectl "${ARGS[@]}" attach --copy=symlink --now --runtime /tmp/overlay app1
systemctl is-active app1.service systemctl is-active app1.service