1
0
mirror of https://github.com/systemd/systemd synced 2025-10-08 21:24:45 +02:00

Compare commits

..

6 Commits

Author SHA1 Message Date
Yu Watanabe
f17c49c569
Add support for BPF tokens (#36134)
Add a new option `PrivateBPF=` to mount a private instance of bpffs.
Also add four configuration options
`BPFDelegate{Commands,Maps,Programs,Attachments}=` which set the
corresponding bpffs mount options in order to create BPF tokens:
https://lwn.net/Articles/947173/

Closes #35108.
2025-07-09 15:12:22 +09:00
Matteo Croce
ea9826eb94 core: add options to delegate BPFFS token creation
Add four new options BPFDelegate{Commands,Maps,Programs,Attachments}=
in order to delegate to a BPFFS instance the permission to create tokens.

The value is a list of options taken from:
https://github.com/torvalds/linux/blob/v6.14/include/uapi/linux/bpf.h#L922-L1121
The special value "any" allows every possible value.

More information about BPF tokens is available here:
https://lwn.net/Articles/947173/
2025-07-08 22:35:29 +02:00
Matteo Croce
3a47437fc9 core: Introduce PrivateBPF= to mount a private BPFFS
Add a new option PrivateBPF= to mount a new instance of bpffs within a
namespace.
PrivateBPF= can be set to "no" to use the host bpffs in readonly mode
and "yes" to do a new mount.
The mount is done with the new fsopen()/fsmount() API because in future
we'll hook some commands between the two calls.
2025-07-08 22:33:28 +02:00
Matteo Croce
2c7dabff50 core: split out setup_private_users_child()
Drop support for kernels older than 3.19, as this is where
/proc/<pid>/setgroups was added.

9cc46516dd
2025-07-08 18:23:46 +02:00
Matteo Croce
9554ac3052 tests: run test with CAP_BPF
Add CAP_BPF to tests run with nspawn, so we don't have to use a VM
to test BPF calls.
2025-07-08 18:23:46 +02:00
Matteo Croce
a80c06cf02 nspawn: create mountpoint for bpffs
When we mount a tmpfs as /sys, create a mountpoint for bpf, as we
already do for cgroup
2025-07-08 18:23:46 +02:00
23 changed files with 1007 additions and 80 deletions

View File

@ -41,7 +41,7 @@ jobs:
- uses: systemd/mkosi@0d1143150835b21c1bfe64428df5f45b558280b1 - uses: systemd/mkosi@0d1143150835b21c1bfe64428df5f45b558280b1
- name: Check that tabs are not used in Python code - name: Check that tabs are not used in Python code
run: sh -c '! git grep -P "\\t" -- src/boot/generate-hwids-section.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py' run: sh -c '! git grep -P "\\t" -- src/basic/generate-bpf-delegate-configs.py src/boot/generate-hwids-section.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py'
- name: Build tools tree - name: Build tools tree
run: | run: |
@ -56,20 +56,20 @@ jobs:
- name: Run mypy - name: Run mypy
run: | run: |
mkosi sandbox -- mypy --version mkosi sandbox -- mypy --version
mkosi sandbox -- mypy src/boot/generate-hwids-section.py src/test/generate-sym-test.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py mkosi sandbox -- mypy src/basic/generate-bpf-delegate-configs.py src/boot/generate-hwids-section.py src/test/generate-sym-test.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py
- name: Run ruff check - name: Run ruff check
run: | run: |
mkosi sandbox -- ruff --version mkosi sandbox -- ruff --version
mkosi sandbox -- ruff check src/boot/generate-hwids-section.py src/test/generate-sym-test.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py mkosi sandbox -- ruff check src/basic/generate-bpf-delegate-configs.py src/boot/generate-hwids-section.py src/test/generate-sym-test.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py
- name: Run ruff format - name: Run ruff format
run: | run: |
mkosi sandbox -- ruff --version mkosi sandbox -- ruff --version
if ! mkosi sandbox -- ruff format --check src/boot/generate-hwids-section.py src/test/generate-sym-test.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py if ! mkosi sandbox -- ruff format --check src/basic/generate-bpf-delegate-configs.py src/boot/generate-hwids-section.py src/test/generate-sym-test.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py
then then
echo "Please run 'ruff format' on the above files or apply the diffs below manually" echo "Please run 'ruff format' on the above files or apply the diffs below manually"
mkosi sandbox -- ruff format --check --quiet --diff src/boot/generate-hwids-section.py src/test/generate-sym-test.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py mkosi sandbox -- ruff format --check --quiet --diff src/basic/generate-bpf-delegate-configs.py src/boot/generate-hwids-section.py src/test/generate-sym-test.py src/ukify/ukify.py test/integration-tests/integration-test-wrapper.py
fi fi
- name: Configure meson - name: Configure meson

View File

@ -35,6 +35,17 @@ custom_entities_ent = custom_target(
man_page_depends += custom_entities_ent man_page_depends += custom_entities_ent
# Generator script that turns the bpf_{cmd,map_type,prog_type,attach_type}
# enums of the given header into a list of option names.
generate_bpf_delegate_configs = find_program('../src/basic/generate-bpf-delegate-configs.py')
# Run the generator in 'doc' mode on the bundled bpf.h; its standard output is
# captured into bpf-delegate.xml.
bpf_delegate_xml = custom_target(
input : files('../src/basic/include/linux/bpf.h'),
output : 'bpf-delegate.xml',
command : [generate_bpf_delegate_configs,
'doc',
'@INPUT@'],
capture : true)
# Make the man pages depend on the generated fragment so it exists before they
# are built.
man_page_depends += bpf_delegate_xml
man_pages = [] man_pages = []
html_pages = [] html_pages = []
source_xml_files = [] source_xml_files = []

View File

@ -3374,6 +3374,16 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...; readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivateBPF = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateCommands = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateMaps = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegatePrograms = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateAttachments = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...; readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...'; readonly s NetworkNamespacePath = '...';
@ -3975,6 +3985,16 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property ProcSubset is not documented!--> <!--property ProcSubset is not documented!-->
<!--property PrivateBPF is not documented!-->
<!--property BPFDelegateCommands is not documented!-->
<!--property BPFDelegateMaps is not documented!-->
<!--property BPFDelegatePrograms is not documented!-->
<!--property BPFDelegateAttachments is not documented!-->
<!--property MemoryKSM is not documented!--> <!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!--> <!--property NetworkNamespacePath is not documented!-->
@ -4701,6 +4721,16 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/> <variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateCommands"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateMaps"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegatePrograms"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateAttachments"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/> <variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/> <variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@ -5583,6 +5613,16 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...; readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivateBPF = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateCommands = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateMaps = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegatePrograms = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateAttachments = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...; readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...'; readonly s NetworkNamespacePath = '...';
@ -6204,6 +6244,16 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property ProcSubset is not documented!--> <!--property ProcSubset is not documented!-->
<!--property PrivateBPF is not documented!-->
<!--property BPFDelegateCommands is not documented!-->
<!--property BPFDelegateMaps is not documented!-->
<!--property BPFDelegatePrograms is not documented!-->
<!--property BPFDelegateAttachments is not documented!-->
<!--property MemoryKSM is not documented!--> <!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!--> <!--property NetworkNamespacePath is not documented!-->
@ -6910,6 +6960,16 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/> <variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateCommands"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateMaps"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegatePrograms"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateAttachments"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/> <variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/> <variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@ -7616,6 +7676,16 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...; readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivateBPF = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateCommands = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateMaps = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegatePrograms = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateAttachments = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...; readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...'; readonly s NetworkNamespacePath = '...';
@ -8159,6 +8229,16 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property ProcSubset is not documented!--> <!--property ProcSubset is not documented!-->
<!--property PrivateBPF is not documented!-->
<!--property BPFDelegateCommands is not documented!-->
<!--property BPFDelegateMaps is not documented!-->
<!--property BPFDelegatePrograms is not documented!-->
<!--property BPFDelegateAttachments is not documented!-->
<!--property MemoryKSM is not documented!--> <!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!--> <!--property NetworkNamespacePath is not documented!-->
@ -8773,6 +8853,16 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/> <variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateCommands"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateMaps"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegatePrograms"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateAttachments"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/> <variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/> <variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@ -9612,6 +9702,16 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (ss) ProtectHostnameEx = ...; readonly (ss) ProtectHostnameEx = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s PrivateBPF = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateCommands = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateMaps = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegatePrograms = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BPFDelegateAttachments = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly b MemoryKSM = ...; readonly b MemoryKSM = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") @org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s NetworkNamespacePath = '...'; readonly s NetworkNamespacePath = '...';
@ -10137,6 +10237,16 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property ProcSubset is not documented!--> <!--property ProcSubset is not documented!-->
<!--property PrivateBPF is not documented!-->
<!--property BPFDelegateCommands is not documented!-->
<!--property BPFDelegateMaps is not documented!-->
<!--property BPFDelegatePrograms is not documented!-->
<!--property BPFDelegateAttachments is not documented!-->
<!--property MemoryKSM is not documented!--> <!--property MemoryKSM is not documented!-->
<!--property NetworkNamespacePath is not documented!--> <!--property NetworkNamespacePath is not documented!-->
@ -10733,6 +10843,16 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/> <variablelist class="dbus-property" generated="True" extra-ref="ProtectHostnameEx"/>
<variablelist class="dbus-property" generated="True" extra-ref="PrivateBPF"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateCommands"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateMaps"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegatePrograms"/>
<variablelist class="dbus-property" generated="True" extra-ref="BPFDelegateAttachments"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/> <variablelist class="dbus-property" generated="True" extra-ref="MemoryKSM"/>
<variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/> <variablelist class="dbus-property" generated="True" extra-ref="NetworkNamespacePath"/>
@ -12316,6 +12436,11 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PrivatePIDs</varname> were added in version 257.</para> <varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>, <para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>, <varname>DelegateNamespaces</varname>,
<varname>PrivateBPF</varname>,
<varname>BPFDelegateCommands</varname>,
<varname>BPFDelegateMaps</varname>,
<varname>BPFDelegatePrograms</varname>,
<varname>BPFDelegateAttachments</varname>,
<function>RemoveSubGroup()</function>, <function>RemoveSubGroup()</function>,
<varname>StateDirectoryQuota</varname>, <varname>StateDirectoryQuota</varname>,
<varname>StateDirectoryQuotaUsage</varname>, <varname>StateDirectoryQuotaUsage</varname>,
@ -12374,6 +12499,11 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PassPIDFD</varname>, <varname>PassPIDFD</varname>,
<varname>AcceptFileDescriptors</varname>, <varname>AcceptFileDescriptors</varname>,
<varname>DelegateNamespaces</varname>, <varname>DelegateNamespaces</varname>,
<varname>PrivateBPF</varname>,
<varname>BPFDelegateCommands</varname>,
<varname>BPFDelegateMaps</varname>,
<varname>BPFDelegatePrograms</varname>,
<varname>BPFDelegateAttachments</varname>,
<function>RemoveSubgroup()</function>, <function>RemoveSubgroup()</function>,
<varname>DeferTrigger</varname>, <varname>DeferTrigger</varname>,
<varname>DeferTriggerMaxUSec</varname>, <varname>DeferTriggerMaxUSec</varname>,
@ -12429,6 +12559,11 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PrivatePIDs</varname> were added in version 257.</para> <varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>, <para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>, <varname>DelegateNamespaces</varname>,
<varname>PrivateBPF</varname>,
<varname>BPFDelegateCommands</varname>,
<varname>BPFDelegateMaps</varname>,
<varname>BPFDelegatePrograms</varname>,
<varname>BPFDelegateAttachments</varname>,
<function>RemoveSubgroup()</function>, <function>RemoveSubgroup()</function>,
<varname>ReloadResult</varname>, <varname>ReloadResult</varname>,
<varname>CleanResult</varname>, <varname>CleanResult</varname>,
@ -12484,6 +12619,11 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>PrivatePIDs</varname> were added in version 257.</para> <varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname>, <para><varname>ProtectHostnameEx</varname>,
<varname>DelegateNamespaces</varname>, <varname>DelegateNamespaces</varname>,
<varname>PrivateBPF</varname>,
<varname>BPFDelegateCommands</varname>,
<varname>BPFDelegateMaps</varname>,
<varname>BPFDelegatePrograms</varname>,
<varname>BPFDelegateAttachments</varname>,
<function>RemoveSubgroup()</function>, <function>RemoveSubgroup()</function>,
<varname>StateDirectoryQuota</varname>, <varname>StateDirectoryQuota</varname>,
<varname>StateDirectoryQuotaUsage</varname>, <varname>StateDirectoryQuotaUsage</varname>,

View File

@ -2555,6 +2555,66 @@ RestrictNamespaces=~cgroup net</programlisting>
<xi:include href="version-info.xml" xpointer="v258"/></listitem> <xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry> </varlistentry>
<varlistentry>
<term><varname>PrivateBPF=</varname></term>
<listitem><para>Takes a boolean argument. If set, mount a private instance of the BPF filesystem
on <filename>/sys/fs/bpf/</filename>. Otherwise, if <varname>ProtectKernelTunables=</varname> is set,
the instance from the host is inherited but mounted read-only. Defaults to false.</para>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>BPFDelegateCommands=</varname></term>
<listitem><para>Accepts a list of BPF commands to allow or <literal>any</literal> to allow everything.
Defaults to none. The accepted values are:
<xi:include href="bpf-delegate.xml" xpointer="bpf_delegate_cmd"/>
This setting is only effective if <varname>PrivateBPF=</varname> is enabled.
This will set the <constant>delegate_cmds</constant> bpffs mount option.
A more detailed explanation of the feature can be found in this
<ulink url="https://lwn.net/Articles/947173/">LWN post</ulink>.</para>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>BPFDelegateMaps=</varname></term>
<listitem><para>Accepts a list of BPF maps to allow or <literal>any</literal> to allow everything.
Defaults to none. The accepted values are:
<xi:include href="bpf-delegate.xml" xpointer="bpf_delegate_map_type"/>
This will set the <constant>delegate_maps</constant> bpffs mount option.
See <varname>BPFDelegateCommands=</varname> for dependencies and more details.</para>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>BPFDelegatePrograms=</varname></term>
<listitem><para>Accepts a list of BPF programs to allow or <literal>any</literal> to allow everything.
Defaults to none. The accepted values are:
<xi:include href="bpf-delegate.xml" xpointer="bpf_delegate_prog_type"/>
This will set the <constant>delegate_progs</constant> bpffs mount option.
See <varname>BPFDelegateCommands=</varname> for dependencies and more details.</para>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
<varlistentry>
<term><varname>BPFDelegateAttachments=</varname></term>
<listitem><para>Accepts a list of BPF attach points to allow or <literal>any</literal> to allow everything.
Defaults to none. The accepted values are:
<xi:include href="bpf-delegate.xml" xpointer="bpf_delegate_attach_type"/>
This will set the <constant>delegate_attachs</constant> bpffs mount option.
See <varname>BPFDelegateCommands=</varname> for dependencies and more details.</para>
<xi:include href="version-info.xml" xpointer="v258"/></listitem>
</varlistentry>
<varlistentry> <varlistentry>
<term><varname>LockPersonality=</varname></term> <term><varname>LockPersonality=</varname></term>

View File

@ -0,0 +1,76 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: LGPL-2.1-or-later
# Convert the bpf_{cmd,map_type,prog_type,attach_type} enums into a string
# array to be used as configuration options
import re
import sys
def print_usage_and_exit() -> None:
    """Print a one-line usage summary to stdout and terminate with exit status 1."""
    program = sys.argv[0]
    print(f'Usage: {program} <code|doc> <header>')
    raise SystemExit(1)
# Exactly two arguments are expected: the output mode and the header to parse.
if len(sys.argv) != 3:
    print_usage_and_exit()

output = sys.argv[1]
header = sys.argv[2]

# Only two output modes exist: 'code' emits C string tables, 'doc' emits a
# DocBook fragment.
if output not in ['code', 'doc']:
    # Report the rejected argument itself. Interpolating the builtin `format`
    # here would print "<built-in function format>" instead.
    print(f'Invalid format: {output}')
    print_usage_and_exit()
# Matches the opening line of one of the four enums we care about. The captured
# suffix (cmd, map_type, prog_type or attach_type) becomes part of the table name.
enum_start_re = re.compile(r'^\s*enum\s+bpf_(cmd|map_type|prog_type|attach_type)\s*{')
# Matches a plain enumerator line of the form "NAME,". Enumerators with an
# explicit "= value" and a final enumerator without a trailing comma do not
# match and are therefore skipped.
enum_value_re = re.compile(r'(\w+),')

with open(header) as file:
    in_enum = False
    enum_values: list[str] = []
    enum_name = ''

    if output == 'doc':
        # Open the wrapper element; each enum is emitted as a nested <para id="...">.
        print("""\
<?xml version="1.0"?>
<!DOCTYPE bpf-delegates PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
"http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
<para>
""")

    for line in file:
        line = line.strip()

        if not in_enum:
            # Outside an enum: look for the start of one of the tracked enums
            start = enum_start_re.match(line)
            if start:
                in_enum = True
                enum_name = 'bpf_delegate_' + start[1]
            continue

        if not line.startswith('}'):
            # Inside an enum definition: collect enumerators, ignoring the
            # internal ones prefixed with a double underscore
            value = enum_value_re.fullmatch(line)
            if value and not value[1].startswith('__'):
                enum_values.append(value[1])
            continue

        # End of an enum definition: emit everything collected for it
        in_enum = False

        if output == 'code':
            # Print the enum values as a static const char* array
            print(f'static const char* const {enum_name}_table[] = {{')
        else:
            print(f'<para id="{enum_name}">')

        for enum_value in enum_values:
            # BPF_MAP_CREATE -> BPFMapCreate: keep the first word as is,
            # capitalize the remaining ones and drop the underscores
            words = enum_value.split('_')
            option = words[0] + ''.join(word.capitalize() for word in words[1:])

            if output == 'code':
                print(f'\t"{option}",')
            else:
                print(f'<literal>{option}</literal>')

        if output == 'code':
            print('};')
        else:
            print('</para>')

        enum_values = []

    if output == 'doc':
        # Close the wrapper element opened before the loop
        print('</para>')

View File

@ -273,8 +273,17 @@ filesystem_switch_case_inc = custom_target(
'@INPUT@'], '@INPUT@'],
capture : true) capture : true)
generated_sources += [filesystem_list_inc, filesystem_switch_case_inc, filesystems_gperf_h] generate_bpf_delegate_configs = find_program('generate-bpf-delegate-configs.py')
basic_sources += [filesystem_list_inc, filesystem_switch_case_inc, filesystems_gperf_h] bpf_delegate_configs_inc = custom_target(
input : files('include/linux/bpf.h'),
output : 'bpf-delegate-configs.inc',
command : [generate_bpf_delegate_configs,
'code',
'@INPUT@'],
capture : true)
generated_sources += [filesystem_list_inc, filesystem_switch_case_inc, filesystems_gperf_h, bpf_delegate_configs_inc]
basic_sources += [filesystem_list_inc, filesystem_switch_case_inc, filesystems_gperf_h, bpf_delegate_configs_inc]
libbasic_static = static_library( libbasic_static = static_library(
'basic', 'basic',

View File

@ -54,6 +54,7 @@ BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_exec_preserve_mode, exec_preserve_
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_proc, protect_proc, ProtectProc);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_proc_subset, proc_subset, ProcSubset);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_private_bpf, private_bpf, PrivateBPF);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_home, protect_home, ProtectHome);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_protect_system, protect_system, ProtectSystem);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_personality, personality, unsigned long);
@ -1133,6 +1134,90 @@ static int property_get_unsigned_as_uint16(
return sd_bus_message_append_basic(reply, 'q', &q); return sd_bus_message_append_basic(reply, 'q', &q);
} }
/* D-Bus getter for the "BPFDelegateCommands" property: converts the uint64_t that userdata points to
 * into a string with bpf_delegate_commands_to_string() and appends it to the reply as type "s".
 * Returns -ENOMEM if the conversion yields NULL, otherwise the result of sd_bus_message_append(). */
static int property_get_bpf_delegate_commands(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        uint64_t *mask = ASSERT_PTR(userdata);

        assert(reply);

        /* The string is heap-allocated and released automatically when we leave the function */
        _cleanup_free_ char *str = bpf_delegate_commands_to_string(*mask);
        if (!str)
                return -ENOMEM;

        return sd_bus_message_append(reply, "s", str);
}
/* D-Bus getter for the "BPFDelegateMaps" property: converts the uint64_t that userdata points to
 * into a string with bpf_delegate_maps_to_string() and appends it to the reply as type "s".
 * Returns -ENOMEM if the conversion yields NULL, otherwise the result of sd_bus_message_append(). */
static int property_get_bpf_delegate_maps(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        uint64_t *mask = ASSERT_PTR(userdata);

        assert(reply);

        /* The string is heap-allocated and released automatically when we leave the function */
        _cleanup_free_ char *str = bpf_delegate_maps_to_string(*mask);
        if (!str)
                return -ENOMEM;

        return sd_bus_message_append(reply, "s", str);
}
/* D-Bus getter for the "BPFDelegatePrograms" property: converts the uint64_t that userdata points to
 * into a string with bpf_delegate_programs_to_string() and appends it to the reply as type "s".
 * Returns -ENOMEM if the conversion yields NULL, otherwise the result of sd_bus_message_append(). */
static int property_get_bpf_delegate_programs(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        uint64_t *mask = ASSERT_PTR(userdata);

        assert(reply);

        /* The string is heap-allocated and released automatically when we leave the function */
        _cleanup_free_ char *str = bpf_delegate_programs_to_string(*mask);
        if (!str)
                return -ENOMEM;

        return sd_bus_message_append(reply, "s", str);
}
/* D-Bus getter for the "BPFDelegateAttachments" property: converts the uint64_t that userdata points
 * to into a string with bpf_delegate_attachments_to_string() and appends it to the reply as type "s".
 * Returns -ENOMEM if the conversion yields NULL, otherwise the result of sd_bus_message_append(). */
static int property_get_bpf_delegate_attachments(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        uint64_t *mask = ASSERT_PTR(userdata);

        assert(reply);

        /* The string is heap-allocated and released automatically when we leave the function */
        _cleanup_free_ char *str = bpf_delegate_attachments_to_string(*mask);
        if (!str)
                return -ENOMEM;

        return sd_bus_message_append(reply, "s", str);
}
const sd_bus_vtable bus_exec_vtable[] = { const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_VTABLE_START(0), SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("Environment", "as", NULL, offsetof(ExecContext, environment), SD_BUS_VTABLE_PROPERTY_CONST),
@ -1316,6 +1401,11 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostname", "b", property_get_protect_hostname, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHostname", "b", property_get_protect_hostname, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("ProtectHostnameEx", "(ss)", property_get_protect_hostname_ex, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHostnameEx", "(ss)", property_get_protect_hostname_ex, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("PrivateBPF", "s", property_get_private_bpf, offsetof(ExecContext, private_bpf), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BPFDelegateCommands", "s", property_get_bpf_delegate_commands, offsetof(ExecContext, bpf_delegate_commands), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BPFDelegateMaps", "s", property_get_bpf_delegate_maps, offsetof(ExecContext, bpf_delegate_maps), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BPFDelegatePrograms", "s", property_get_bpf_delegate_programs, offsetof(ExecContext, bpf_delegate_programs), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BPFDelegateAttachments", "s", property_get_bpf_delegate_attachments, offsetof(ExecContext, bpf_delegate_attachments), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST),
@ -1753,6 +1843,11 @@ static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_home, ProtectHome, protect_home_fr
static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(keyring_mode, ExecKeyringMode, exec_keyring_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(protect_proc, ProtectProc, protect_proc_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string); static BUS_DEFINE_SET_TRANSIENT_PARSE(proc_subset, ProcSubset, proc_subset_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE(private_bpf, PrivateBPF, private_bpf_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(bpf_delegate_commands, uint64_t, bpf_delegate_commands_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(bpf_delegate_maps, uint64_t, bpf_delegate_maps_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(bpf_delegate_programs, uint64_t, bpf_delegate_programs_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(bpf_delegate_attachments, uint64_t, bpf_delegate_attachments_from_string);
BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string); BUS_DEFINE_SET_TRANSIENT_PARSE(exec_preserve_mode, ExecPreserveMode, exec_preserve_mode_from_string);
static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality); static BUS_DEFINE_SET_TRANSIENT_PARSE_PTR(personality, unsigned long, parse_personality);
static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check); static BUS_DEFINE_SET_TRANSIENT_TO_STRING_ALLOC(secure_bits, "i", int32_t, int, "%" PRIi32, secure_bits_to_string_alloc_with_check);
@ -2279,6 +2374,21 @@ int bus_exec_context_set_transient_property(
if (streq(name, "ProcSubset")) if (streq(name, "ProcSubset"))
return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error); return bus_set_transient_proc_subset(u, name, &c->proc_subset, message, flags, error);
if (streq(name, "PrivateBPF"))
return bus_set_transient_private_bpf(u, name, &c->private_bpf, message, flags, error);
if (streq(name, "BPFDelegateCommands"))
return bus_set_transient_bpf_delegate_commands(u, name, &c->bpf_delegate_commands, message, flags, error);
if (streq(name, "BPFDelegateMaps"))
return bus_set_transient_bpf_delegate_maps(u, name, &c->bpf_delegate_maps, message, flags, error);
if (streq(name, "BPFDelegatePrograms"))
return bus_set_transient_bpf_delegate_programs(u, name, &c->bpf_delegate_programs, message, flags, error);
if (streq(name, "BPFDelegateAttachments"))
return bus_set_transient_bpf_delegate_attachments(u, name, &c->bpf_delegate_attachments, message, flags, error);
if (streq(name, "RuntimeDirectoryPreserve")) if (streq(name, "RuntimeDirectoryPreserve"))
return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error); return bus_set_transient_exec_preserve_mode(u, name, &c->runtime_directory_preserve_mode, message, flags, error);

View File

@ -2234,6 +2234,131 @@ static int build_pass_environment(const ExecContext *c, char ***ret) {
return 0; return 0;
} }
/* Body of the helper child forked by setup_private_users(). Blocks on unshare_ready_fd until the
 * parent signals that it unshared its user namespace, then writes the parent's setgroups policy, GID
 * map and UID map (in that order) via the parent's /proc entries.
 *
 * Returns 0 on success, or the value of log_debug_errno() for the step that failed. */
static int setup_private_users_child(int unshare_ready_fd, const char *uid_map, const char *gid_map, bool allow_setgroups) {
        uint64_t counter;
        int r;

        /* We are the child and still live in the original user namespace. The maps written below
         * belong to the parent, which by then has opened its own user namespace. */
        pid_t parent = getppid();

        /* Block until the parent has unshared the user namespace */
        if (read(unshare_ready_fd, &counter, sizeof(counter)) < 0)
                return log_debug_errno(errno, "Failed to read from signaling eventfd: %m");

        /* Turn off the setgroups() system call in the child user namespace, for good, unless
         * PrivateUsers=full and using the system service manager. */
        const char *setgroups = allow_setgroups ? "allow" : "deny";
        const char *setgroups_path = procfs_file_alloca(parent, "setgroups");
        r = write_string_file(setgroups_path, setgroups, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                return log_debug_errno(r, "Failed to write '%s' to %s: %m", setgroups, setgroups_path);

        /* The GID map goes first ... */
        const char *gid_map_path = procfs_file_alloca(parent, "gid_map");
        r = write_string_file(gid_map_path, gid_map, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                return log_debug_errno(r, "Failed to write GID map to %s: %m", gid_map_path);

        /* ... and the UID map second */
        const char *uid_map_path = procfs_file_alloca(parent, "uid_map");
        r = write_string_file(uid_map_path, uid_map, WRITE_STRING_FILE_DISABLE_BUFFER);
        if (r < 0)
                return log_debug_errno(r, "Failed to write UID map to %s: %m", uid_map_path);

        return 0;
}
/* Forks the "(sd-bpffs)" helper which applies the BPFDelegate*= settings of 'c' to a bpffs filesystem
 * context on behalf of the caller.
 *
 * The helper waits for a filesystem context fd on its end of a socket pair, sets the delegate_cmds,
 * delegate_maps, delegate_progs and delegate_attachs options on it, creates the superblock and then
 * acknowledges by sending a single byte back over the socket.
 *
 * On success (in the parent) returns 0 and stores:
 *   ret_pid        - reference to the helper process
 *   ret_sock_fd    - the parent's end of the socket pair
 *   ret_errno_pipe - the (non-blocking) read end of the pipe the helper reports its error code on
 * On failure returns a negative errno-style value. The helper itself never returns from this function. */
static int bpffs_prepare(
                const ExecContext *c,
                PidRef *ret_pid,
                int *ret_sock_fd,
                int *ret_errno_pipe) {

        _cleanup_close_pair_ int socket_fds[2] = EBADF_PAIR, bpffs_errno_pipe[2] = EBADF_PAIR;
        int r;

        assert(c);
        assert(ret_sock_fd);
        assert(ret_pid);
        assert(ret_errno_pipe);

        r = pipe2(bpffs_errno_pipe, O_CLOEXEC|O_NONBLOCK);
        if (r < 0)
                return log_debug_errno(errno, "Failed to create pipe: %m");

        r = socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, socket_fds);
        if (r < 0)
                return log_debug_errno(errno, "Failed to create socket pair: %m");

        r = pidref_safe_fork("(sd-bpffs)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, ret_pid);
        if (r < 0)
                return log_debug_errno(r, "Failed to fork bpffs privileged helper: %m");
        if (r == 0) {
                /* Helper process. Every exit path below goes through _exit() or report_errno_and_exit(). */
                _cleanup_close_ int fs_fd = -EBADF;
                char number[STRLEN("0x") + sizeof(c->bpf_delegate_commands) * 2 + 1];

                /* bpffs mount option name -> bitmask to delegate, applied in this order */
                const struct {
                        const char *key;
                        uint64_t value;
                } delegations[] = {
                        { "delegate_cmds",    c->bpf_delegate_commands    },
                        { "delegate_maps",    c->bpf_delegate_maps        },
                        { "delegate_progs",   c->bpf_delegate_programs    },
                        { "delegate_attachs", c->bpf_delegate_attachments },
                };

                /* Drop the ends that belong to the parent */
                bpffs_errno_pipe[0] = safe_close(bpffs_errno_pipe[0]);
                socket_fds[0] = safe_close(socket_fds[0]);

                fs_fd = receive_one_fd(socket_fds[1], /* flags = */ 0);
                if (fs_fd < 0) {
                        log_debug_errno(fs_fd, "Failed to receive file descriptor from parent: %m");
                        report_errno_and_exit(bpffs_errno_pipe[1], fs_fd);
                }

                for (size_t i = 0; i < ELEMENTSOF(delegations); i++) {
                        xsprintf(number, "0x%"PRIx64, delegations[i].value);

                        if (fsconfig(fs_fd, FSCONFIG_SET_STRING, delegations[i].key, number, /* aux = */ 0) < 0) {
                                /* Report a negative errno-style value, like the other callers of
                                 * report_errno_and_exit() in this file do. */
                                r = log_debug_errno(errno, "Failed to FSCONFIG_SET_STRING: %m");
                                report_errno_and_exit(bpffs_errno_pipe[1], r);
                        }
                }

                if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, /* key = */ NULL, /* value = */ NULL, /* aux = */ 0) < 0) {
                        r = log_debug_errno(errno, "Failed to create bpffs superblock: %m");
                        report_errno_and_exit(bpffs_errno_pipe[1], r);
                }

                /* Tell the parent that the superblock is ready. The payload itself carries no meaning. */
                if (write(socket_fds[1], (uint8_t[1]) {}, 1) < 0) {
                        r = log_debug_errno(errno, "Failed to send data to parent: %m");
                        report_errno_and_exit(bpffs_errno_pipe[1], r);
                }

                _exit(EXIT_SUCCESS);
        }

        /* Parent: hand out our ends. The remaining fds are closed by _cleanup_close_pair_. */
        *ret_sock_fd = TAKE_FD(socket_fds[0]);
        *ret_errno_pipe = TAKE_FD(bpffs_errno_pipe[0]);
        return 0;
}
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) { static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL; _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
@ -2339,69 +2464,10 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
if (r < 0) if (r < 0)
return r; return r;
if (r == 0) { if (r == 0) {
_cleanup_close_ int fd = -EBADF;
const char *a;
pid_t ppid;
/* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
* here, after the parent opened its own user namespace. */
ppid = getppid();
errno_pipe[0] = safe_close(errno_pipe[0]); errno_pipe[0] = safe_close(errno_pipe[0]);
r = setup_private_users_child(unshare_ready_fd, uid_map, gid_map, allow_setgroups);
/* Wait until the parent unshared the user namespace */ if (r < 0)
if (read(unshare_ready_fd, &c, sizeof(c)) < 0)
report_errno_and_exit(errno_pipe[1], -errno);
/* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
* and using the system service manager. */
a = procfs_file_alloca(ppid, "setgroups");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
if (errno != ENOENT) {
r = log_debug_errno(errno, "Failed to open %s: %m", a);
report_errno_and_exit(errno_pipe[1], r);
}
/* If the file is missing the kernel is too old, let's continue anyway. */
} else {
const char *setgroups = allow_setgroups ? "allow\n" : "deny\n";
if (write(fd, setgroups, strlen(setgroups)) < 0) {
r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a);
report_errno_and_exit(errno_pipe[1], r);
}
fd = safe_close(fd);
}
/* First write the GID map */
a = procfs_file_alloca(ppid, "gid_map");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
r = log_debug_errno(errno, "Failed to open %s: %m", a);
report_errno_and_exit(errno_pipe[1], r); report_errno_and_exit(errno_pipe[1], r);
}
if (write(fd, gid_map, strlen(gid_map)) < 0) {
r = log_debug_errno(errno, "Failed to write GID map to %s: %m", a);
report_errno_and_exit(errno_pipe[1], r);
}
fd = safe_close(fd);
/* The write the UID map */
a = procfs_file_alloca(ppid, "uid_map");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
r = log_debug_errno(errno, "Failed to open %s: %m", a);
report_errno_and_exit(errno_pipe[1], r);
}
if (write(fd, uid_map, strlen(uid_map)) < 0) {
r = log_debug_errno(errno, "Failed to write UID map to %s: %m", a);
report_errno_and_exit(errno_pipe[1], r);
}
_exit(EXIT_SUCCESS); _exit(EXIT_SUCCESS);
} }
@ -3623,9 +3689,10 @@ static int apply_mount_namespace(
ExecRuntime *runtime, ExecRuntime *runtime,
const char *memory_pressure_path, const char *memory_pressure_path,
bool needs_sandboxing, bool needs_sandboxing,
char **reterr_path,
uid_t exec_directory_uid, uid_t exec_directory_uid,
gid_t exec_directory_gid) { gid_t exec_directory_gid,
int bpffs_socket_fd,
char **reterr_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT; _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL, _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
@ -3837,6 +3904,9 @@ static int apply_mount_namespace(
.protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO, .protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
.protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT, .protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
.proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL, .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
.private_bpf = needs_sandboxing ? context->private_bpf : PRIVATE_BPF_NO,
.bpffs_socket_fd = bpffs_socket_fd,
}; };
r = setup_namespace(&parameters, reterr_path); r = setup_namespace(&parameters, reterr_path);
@ -4477,6 +4547,7 @@ static int setup_delegated_namespaces(
const ExecCommand *command, const ExecCommand *command,
bool needs_sandboxing, bool needs_sandboxing,
bool have_cap_sys_admin, bool have_cap_sys_admin,
int bpffs_socket_fd,
int *reterr_exit_status) { int *reterr_exit_status) {
int r; int r;
@ -4597,9 +4668,10 @@ static int setup_delegated_namespaces(
runtime, runtime,
memory_pressure_path, memory_pressure_path,
needs_sandboxing, needs_sandboxing,
&error_path,
uid, uid,
gid); gid,
bpffs_socket_fd,
&error_path);
if (r < 0) { if (r < 0) {
*reterr_exit_status = EXIT_NAMESPACE; *reterr_exit_status = EXIT_NAMESPACE;
return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m", return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
@ -4934,7 +5006,9 @@ int exec_invoke(
_cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL; _cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
int ngids = 0, ngids_after_pam = 0; int ngids = 0, ngids_after_pam = 0;
int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET; int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
_cleanup_close_ int bpffs_socket_fd = -EBADF, bpffs_errno_pipe = -EBADF;
size_t n_storage_fds, n_socket_fds, n_extra_fds; size_t n_storage_fds, n_socket_fds, n_extra_fds;
_cleanup_(pidref_done_sigkill_wait) PidRef bpffs_pidref = PIDREF_NULL;
assert(command); assert(command);
assert(context); assert(context);
@ -5650,6 +5724,26 @@ int exec_invoke(
} }
} }
if (context->private_bpf != PRIVATE_BPF_NO) {
/* To create a BPF token, the bpffs has to be mounted with the fsopen()/fsmount() API.
* More specifically, fsopen() must be called within the user namespace, then all the
* fsconfig() calls as privileged user, and finally fsmount() and move_mount() in
* the user namespace.
* To do this, we split the code into the bpffs_prepare() and mount_bpffs() functions:
* the first runs as privileged user, the second as unprivileged one, and they coordinate
* by sending messages and file descriptors via a socket pair.
* The user and mount namespaces need to be unshared in this exact order and before
* the fsopen() call for the fsopen() API to work as unprivileged.
* This is the kernel sample doing this:
* https://github.com/torvalds/linux/blob/master/tools/testing/selftests/bpf/prog_tests/token.c
*/
r = bpffs_prepare(context, &bpffs_pidref, &bpffs_socket_fd, &bpffs_errno_pipe);
if (r < 0) {
*exit_status = EXIT_BPF;
return log_error_errno(r, "Failed to mount bpffs in bpffs_prepare(): %m");
}
}
if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) { if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces. /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
@ -5688,6 +5782,7 @@ int exec_invoke(
command, command,
needs_sandboxing, needs_sandboxing,
have_cap_sys_admin, have_cap_sys_admin,
bpffs_socket_fd,
exit_status); exit_status);
if (r < 0) if (r < 0)
return r; return r;
@ -5747,10 +5842,30 @@ int exec_invoke(
command, command,
needs_sandboxing, needs_sandboxing,
have_cap_sys_admin, have_cap_sys_admin,
bpffs_socket_fd,
exit_status); exit_status);
if (r < 0) if (r < 0)
return r; return r;
if (context->private_bpf != PRIVATE_BPF_NO) {
r = pidref_wait_for_terminate_and_check("(sd-bpffs)", &bpffs_pidref, /* flags = */ 0);
if (r < 0) {
*exit_status = EXIT_BPF;
return r;
}
/* If something strange happened with the child, let's consider this fatal, too */
if (r != EXIT_SUCCESS) {
*exit_status = EXIT_BPF;
ssize_t ss = read(bpffs_errno_pipe, &r, sizeof(r));
if (ss == sizeof(r))
return log_debug_errno(r, "bpffs helper exited with error: %m");
if (ss < 0)
return log_debug_errno(errno, "Failed to read from the bpffs helper errno pipe: %m");
return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Short read from the bpffs helper errno pipe.");
}
pidref_done(&bpffs_pidref);
}
if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) { if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
/* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
* ensures the root of the cgroup namespace is the top level service cgroup and not the * ensures the root of the cgroup namespace is the top level service cgroup and not the

View File

@ -1803,6 +1803,34 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) {
if (r < 0) if (r < 0)
return r; return r;
r = serialize_item(f, "exec-context-private-bpf", private_bpf_to_string(c->private_bpf));
if (r < 0)
return r;
if (c->bpf_delegate_commands != 0) {
r = serialize_item_format(f, "exec-context-bpf-delegate-commands", "0x%"PRIx64, c->bpf_delegate_commands);
if (r < 0)
return r;
}
if (c->bpf_delegate_maps != 0) {
r = serialize_item_format(f, "exec-context-bpf-delegate-maps", "0x%"PRIx64, c->bpf_delegate_maps);
if (r < 0)
return r;
}
if (c->bpf_delegate_programs != 0) {
r = serialize_item_format(f, "exec-context-bpf-delegate-programs", "0x%"PRIx64, c->bpf_delegate_programs);
if (r < 0)
return r;
}
if (c->bpf_delegate_attachments != 0) {
r = serialize_item_format(f, "exec-context-bpf-delegate-attachments", "0x%"PRIx64, c->bpf_delegate_attachments);
if (r < 0)
return r;
}
r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode)); r = serialize_item(f, "exec-context-runtime-directory-preserve-mode", exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
if (r < 0) if (r < 0)
return r; return r;
@ -2741,6 +2769,26 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) {
c->proc_subset = proc_subset_from_string(val); c->proc_subset = proc_subset_from_string(val);
if (c->proc_subset < 0) if (c->proc_subset < 0)
return -EINVAL; return -EINVAL;
} else if ((val = startswith(l, "exec-context-private-bpf="))) {
c->private_bpf = private_bpf_from_string(val);
if (c->private_bpf < 0)
return -EINVAL;
} else if ((val = startswith(l, "exec-context-bpf-delegate-commands="))) {
r = safe_atoux64(val, &c->bpf_delegate_commands);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-bpf-delegate-maps="))) {
r = safe_atoux64(val, &c->bpf_delegate_maps);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-bpf-delegate-programs="))) {
r = safe_atoux64(val, &c->bpf_delegate_programs);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-bpf-delegate-attachments="))) {
r = safe_atoux64(val, &c->bpf_delegate_attachments);
if (r < 0)
return r;
} else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) { } else if ((val = startswith(l, "exec-context-runtime-directory-preserve-mode="))) {
c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val); c->runtime_directory_preserve_mode = exec_preserve_mode_from_string(val);
if (c->runtime_directory_preserve_mode < 0) if (c->runtime_directory_preserve_mode < 0)

View File

@ -324,6 +324,7 @@ bool exec_needs_mount_namespace(
exec_needs_cgroup_mount(context) || exec_needs_cgroup_mount(context) ||
context->protect_proc != PROTECT_PROC_DEFAULT || context->protect_proc != PROTECT_PROC_DEFAULT ||
context->proc_subset != PROC_SUBSET_ALL || context->proc_subset != PROC_SUBSET_ALL ||
context->private_bpf != PRIVATE_BPF_NO ||
exec_needs_ipc_namespace(context) || exec_needs_ipc_namespace(context) ||
exec_needs_pid_namespace(context, params)) exec_needs_pid_namespace(context, params))
return true; return true;
@ -1124,7 +1125,8 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
"%sKeyringMode: %s\n" "%sKeyringMode: %s\n"
"%sProtectHostname: %s%s%s\n" "%sProtectHostname: %s%s%s\n"
"%sProtectProc: %s\n" "%sProtectProc: %s\n"
"%sProcSubset: %s\n", "%sProcSubset: %s\n"
"%sPrivateBPF: %s\n",
prefix, c->umask, prefix, c->umask,
prefix, empty_to_root(c->working_directory), prefix, empty_to_root(c->working_directory),
prefix, empty_to_root(c->root_directory), prefix, empty_to_root(c->root_directory),
@ -1151,7 +1153,21 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, exec_keyring_mode_to_string(c->keyring_mode), prefix, exec_keyring_mode_to_string(c->keyring_mode),
prefix, protect_hostname_to_string(c->protect_hostname), c->private_hostname ? ":" : "", strempty(c->private_hostname), prefix, protect_hostname_to_string(c->protect_hostname), c->private_hostname ? ":" : "", strempty(c->private_hostname),
prefix, protect_proc_to_string(c->protect_proc), prefix, protect_proc_to_string(c->protect_proc),
prefix, proc_subset_to_string(c->proc_subset)); prefix, proc_subset_to_string(c->proc_subset),
prefix, private_bpf_to_string(c->private_bpf));
if (c->private_bpf == PRIVATE_BPF_YES) {
_cleanup_free_ char
*commands = bpf_delegate_commands_to_string(c->bpf_delegate_commands),
*maps = bpf_delegate_maps_to_string(c->bpf_delegate_maps),
*programs = bpf_delegate_programs_to_string(c->bpf_delegate_programs),
*attachments = bpf_delegate_attachments_to_string(c->bpf_delegate_attachments);
fprintf(f, "%sBPFDelegateCommands: %s\n", prefix, strna(commands));
fprintf(f, "%sBPFDelegateMaps: %s\n", prefix, strna(maps));
fprintf(f, "%sBPFDelegatePrograms: %s\n", prefix, strna(programs));
fprintf(f, "%sBPFDelegateAttachments: %s\n", prefix, strna(attachments));
}
if (c->set_login_environment >= 0) if (c->set_login_environment >= 0)
fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0)); fprintf(f, "%sSetLoginEnvironment: %s\n", prefix, yes_no(c->set_login_environment > 0));

View File

@ -300,6 +300,9 @@ typedef struct ExecContext {
ProtectProc protect_proc; /* hidepid= */ ProtectProc protect_proc; /* hidepid= */
ProcSubset proc_subset; /* subset= */ ProcSubset proc_subset; /* subset= */
PrivateBPF private_bpf;
uint64_t bpf_delegate_commands, bpf_delegate_maps, bpf_delegate_programs, bpf_delegate_attachments;
int private_mounts; int private_mounts;
int mount_apivfs; int mount_apivfs;
int bind_log_sockets; int bind_log_sockets;

View File

@ -67,6 +67,11 @@
{{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode) {{type}}.KeyringMode, config_parse_exec_keyring_mode, 0, offsetof({{type}}, exec_context.keyring_mode)
{{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc) {{type}}.ProtectProc, config_parse_protect_proc, 0, offsetof({{type}}, exec_context.protect_proc)
{{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset) {{type}}.ProcSubset, config_parse_proc_subset, 0, offsetof({{type}}, exec_context.proc_subset)
{{type}}.PrivateBPF, config_parse_private_bpf, 0, offsetof({{type}}, exec_context.private_bpf)
{{type}}.BPFDelegateCommands, config_parse_bpf_delegate_commands, 0, offsetof({{type}}, exec_context.bpf_delegate_commands)
{{type}}.BPFDelegateMaps, config_parse_bpf_delegate_maps, 0, offsetof({{type}}, exec_context.bpf_delegate_maps)
{{type}}.BPFDelegatePrograms, config_parse_bpf_delegate_programs, 0, offsetof({{type}}, exec_context.bpf_delegate_programs)
{{type}}.BPFDelegateAttachments, config_parse_bpf_delegate_attachments, 0, offsetof({{type}}, exec_context.bpf_delegate_attachments)
{% if HAVE_SECCOMP %} {% if HAVE_SECCOMP %}
{{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context) {{type}}.SystemCallFilter, config_parse_syscall_filter, 0, offsetof({{type}}, exec_context)
{{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs) {{type}}.SystemCallArchitectures, config_parse_syscall_archs, 0, offsetof({{type}}, exec_context.syscall_archs)

View File

@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_device_policy, cgroup_device_policy, CGrou
DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode); DEFINE_CONFIG_PARSE_ENUM(config_parse_exec_keyring_mode, exec_keyring_mode, ExecKeyringMode);
DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc); DEFINE_CONFIG_PARSE_ENUM(config_parse_protect_proc, protect_proc, ProtectProc);
DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset); DEFINE_CONFIG_PARSE_ENUM(config_parse_proc_subset, proc_subset, ProcSubset);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_bpf, private_bpf, PrivateBPF);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_tmp, private_tmp, PrivateTmp);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_users, private_users, PrivateUsers);
DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs); DEFINE_CONFIG_PARSE_ENUM(config_parse_private_pids, private_pids, PrivatePIDs);
@ -160,6 +161,10 @@ DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_numa_policy, mpol, int, -1);
DEFINE_CONFIG_PARSE_ENUM(config_parse_status_unit_format, status_unit_format, StatusUnitFormat); DEFINE_CONFIG_PARSE_ENUM(config_parse_status_unit_format, status_unit_format, StatusUnitFormat);
DEFINE_CONFIG_PARSE_ENUM_FULL(config_parse_socket_timestamping, socket_timestamping_from_string_harder, SocketTimestamping); DEFINE_CONFIG_PARSE_ENUM_FULL(config_parse_socket_timestamping, socket_timestamping_from_string_harder, SocketTimestamping);
DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_defer_trigger, socket_defer_trigger, SocketDeferTrigger); DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_defer_trigger, socket_defer_trigger, SocketDeferTrigger);
DEFINE_CONFIG_PARSE_PTR(config_parse_bpf_delegate_commands, bpf_delegate_commands_from_string, uint64_t);
DEFINE_CONFIG_PARSE_PTR(config_parse_bpf_delegate_maps, bpf_delegate_maps_from_string, uint64_t);
DEFINE_CONFIG_PARSE_PTR(config_parse_bpf_delegate_programs, bpf_delegate_programs_from_string, uint64_t);
DEFINE_CONFIG_PARSE_PTR(config_parse_bpf_delegate_attachments, bpf_delegate_attachments_from_string, uint64_t);
bool contains_instance_specifier_superset(const char *s) { bool contains_instance_specifier_superset(const char *s) {
const char *p, *q; const char *p, *q;
@ -6270,6 +6275,10 @@ void unit_dump_config_items(FILE *f) {
{ config_parse_personality, "PERSONALITY" }, { config_parse_personality, "PERSONALITY" },
{ config_parse_log_filter_patterns, "REGEX" }, { config_parse_log_filter_patterns, "REGEX" },
{ config_parse_mount_node, "NODE" }, { config_parse_mount_node, "NODE" },
{ config_parse_bpf_delegate_commands, "BPF_DELEGATE_COMMANDS" },
{ config_parse_bpf_delegate_maps, "BPF_DELEGATE_MAPS" },
{ config_parse_bpf_delegate_programs, "BPF_DELEGATE_PROGRAMS" },
{ config_parse_bpf_delegate_attachments, "BPF_DELEGATE_ATTACHMENTS" },
}; };
const char *prev = NULL; const char *prev = NULL;

View File

@ -129,6 +129,11 @@ CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode); CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc); CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);
CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset); CONFIG_PARSER_PROTOTYPE(config_parse_proc_subset);
CONFIG_PARSER_PROTOTYPE(config_parse_private_bpf);
CONFIG_PARSER_PROTOTYPE(config_parse_bpf_delegate_commands);
CONFIG_PARSER_PROTOTYPE(config_parse_bpf_delegate_maps);
CONFIG_PARSER_PROTOTYPE(config_parse_bpf_delegate_programs);
CONFIG_PARSER_PROTOTYPE(config_parse_bpf_delegate_attachments);
CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec); CONFIG_PARSER_PROTOTYPE(config_parse_job_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec); CONFIG_PARSER_PROTOTYPE(config_parse_job_running_timeout_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields); CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);

View File

@ -10,6 +10,7 @@
#include "alloc-util.h" #include "alloc-util.h"
#include "base-filesystem.h" #include "base-filesystem.h"
#include "bitfield.h"
#include "chase.h" #include "chase.h"
#include "dev-setup.h" #include "dev-setup.h"
#include "devnum-util.h" #include "devnum-util.h"
@ -17,6 +18,7 @@
#include "errno-util.h" #include "errno-util.h"
#include "escape.h" #include "escape.h"
#include "extension-util.h" #include "extension-util.h"
#include "extract-word.h"
#include "fd-util.h" #include "fd-util.h"
#include "format-util.h" #include "format-util.h"
#include "fs-util.h" #include "fs-util.h"
@ -36,6 +38,7 @@
#include "nsflags.h" #include "nsflags.h"
#include "nulstr-util.h" #include "nulstr-util.h"
#include "os-util.h" #include "os-util.h"
#include "parse-util.h"
#include "path-util.h" #include "path-util.h"
#include "pidref.h" #include "pidref.h"
#include "process-util.h" #include "process-util.h"
@ -79,6 +82,7 @@ typedef enum MountMode {
MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */ MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */
MOUNT_MQUEUEFS, MOUNT_MQUEUEFS,
MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */ MOUNT_READ_WRITE_IMPLICIT, /* Should have the lowest priority. */
MOUNT_BPFFS, /* Special mount for bpffs, which is mounted with fsmount() and move_mount() */
_MOUNT_MODE_MAX, _MOUNT_MODE_MAX,
_MOUNT_MODE_INVALID = -EINVAL, _MOUNT_MODE_INVALID = -EINVAL,
} MountMode; } MountMode;
@ -161,13 +165,17 @@ static const MountEntry protect_kernel_tunables_proc_table[] = {
static const MountEntry protect_kernel_tunables_sys_table[] = { static const MountEntry protect_kernel_tunables_sys_table[] = {
{ "/sys", MOUNT_READ_ONLY, false }, { "/sys", MOUNT_READ_ONLY, false },
{ "/sys/fs/bpf", MOUNT_READ_ONLY, true },
{ "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */ { "/sys/fs/cgroup", MOUNT_READ_WRITE_IMPLICIT, false }, /* READ_ONLY is set by ProtectControlGroups= option */
{ "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true }, { "/sys/fs/selinux", MOUNT_READ_WRITE_IMPLICIT, true },
{ "/sys/kernel/debug", MOUNT_READ_ONLY, true }, { "/sys/kernel/debug", MOUNT_READ_ONLY, true },
{ "/sys/kernel/tracing", MOUNT_READ_ONLY, true }, { "/sys/kernel/tracing", MOUNT_READ_ONLY, true },
}; };
/* PrivateBPF= option */
static const MountEntry private_bpf_no_table[] = {
{ "/sys/fs/bpf", MOUNT_READ_ONLY, true },
};
/* ProtectKernelModules= option */ /* ProtectKernelModules= option */
static const MountEntry protect_kernel_modules_table[] = { static const MountEntry protect_kernel_modules_table[] = {
{ "/usr/lib/modules", MOUNT_INACCESSIBLE, true }, { "/usr/lib/modules", MOUNT_INACCESSIBLE, true },
@ -927,6 +935,36 @@ static int append_protect_system(MountList *ml, ProtectSystem protect_system, bo
} }
} }
/* Appends the /sys/fs/bpf entry implied by the PrivateBPF= setting to the mount list 'ml'.
 *
 * PRIVATE_BPF_NO:  adds private_bpf_no_table (a read-only /sys/fs/bpf), but only if
 *                  protect_kernel_tunables is set; otherwise nothing is added.
 * PRIVATE_BPF_YES: adds a MOUNT_BPFFS entry for /sys/fs/bpf.
 *
 * 'p' is accepted but not used by this function.
 *
 * Returns 0 on success, the result of append_static_mounts() in the PRIVATE_BPF_NO case, or the result
 * of log_oom_debug() if the list cannot be extended. */
static int append_private_bpf(
                MountList *ml,
                PrivateBPF private_bpf,
                bool protect_kernel_tunables,
                bool ignore_protect,
                const NamespaceParameters *p) {

        assert(ml);

        if (private_bpf == PRIVATE_BPF_NO) {
                if (!protect_kernel_tunables)
                        return 0;

                return append_static_mounts(ml, private_bpf_no_table, ELEMENTSOF(private_bpf_no_table), ignore_protect);
        }

        if (private_bpf == PRIVATE_BPF_YES) {
                MountEntry *entry = mount_list_extend(ml);
                if (!entry)
                        return log_oom_debug();

                *entry = (MountEntry) {
                        .path_const = "/sys/fs/bpf",
                        .mode = MOUNT_BPFFS,
                };
                return 0;
        }

        assert_not_reached();
}
static int mount_path_compare(const MountEntry *a, const MountEntry *b) { static int mount_path_compare(const MountEntry *a, const MountEntry *b) {
int d; int d;
@ -1697,6 +1735,34 @@ static int mount_overlay(const MountEntry *m) {
return 1; return 1;
} }
/* Mounts a new bpffs instance on the path of 'm', cooperating with a helper process reachable via
 * 'socket_fd':
 *   1. open a "bpf" filesystem context with fsopen(),
 *   2. send the context fd over the socket,
 *   3. wait for a one-byte acknowledgement on the socket,
 *   4. fsmount() the context and move_mount() the result onto the target path.
 *
 * Returns 1 on success and a negative errno-style value on failure. */
static int mount_bpffs(const MountEntry *m, int socket_fd) {
        int r;

        assert(m);
        assert(socket_fd >= 0);

        _cleanup_close_ int fs_fd = fsopen("bpf", FSOPEN_CLOEXEC);
        if (fs_fd < 0)
                return log_debug_errno(errno, "Failed to fsopen: %m");

        r = send_one_fd(socket_fd, fs_fd, /* flags = */ 0);
        if (r < 0)
                return log_debug_errno(r, "Failed to send bpffs fd to child: %m");

        /* Wait for the acknowledgement byte; its content is irrelevant. */
        ssize_t n = read(socket_fd, (uint8_t[1]) {}, 1);
        if (n < 0)
                return log_debug_errno(errno, "Failed to receive data from child: %m");
        if (n == 0)
                /* EOF: the peer closed its end without acknowledging, so the filesystem context was
                 * not set up. Fail here instead of letting fsmount() fail with an unrelated error. */
                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Child closed the socket before acknowledging the bpffs setup.");

        _cleanup_close_ int mnt_fd = fsmount(fs_fd, /* flags = */ 0, /* mount_attrs = */ 0);
        if (mnt_fd < 0)
                return log_debug_errno(errno, "Failed to fsmount bpffs: %m");

        r = move_mount(mnt_fd, "", AT_FDCWD, mount_entry_path(m), MOVE_MOUNT_F_EMPTY_PATH);
        if (r < 0)
                return log_debug_errno(errno, "Failed to move bpffs mount to %s: %m", mount_entry_path(m));

        return 1;
}
static int follow_symlink( static int follow_symlink(
const char *root_directory, const char *root_directory,
MountEntry *m) { MountEntry *m) {
@ -1953,6 +2019,9 @@ static int apply_one_mount(
case MOUNT_OVERLAY: case MOUNT_OVERLAY:
return mount_overlay(m); return mount_overlay(m);
case MOUNT_BPFFS:
return mount_bpffs(m, p->bpffs_socket_fd);
default: default:
assert_not_reached(); assert_not_reached();
} }
@ -2151,6 +2220,7 @@ static bool namespace_parameters_mount_apivfs(const NamespaceParameters *p) {
p->protect_kernel_tunables || p->protect_kernel_tunables ||
p->protect_proc != PROTECT_PROC_DEFAULT || p->protect_proc != PROTECT_PROC_DEFAULT ||
p->proc_subset != PROC_SUBSET_ALL || p->proc_subset != PROC_SUBSET_ALL ||
p->private_bpf != PRIVATE_BPF_NO ||
p->private_pids != PRIVATE_PIDS_NO; p->private_pids != PRIVATE_PIDS_NO;
} }
@ -2653,6 +2723,10 @@ int setup_namespace(const NamespaceParameters *p, char **reterr_path) {
if (r < 0) if (r < 0)
return r; return r;
r = append_private_bpf(&ml, p->private_bpf, p->protect_kernel_tunables, /* ignore_protect = */ false, p);
if (r < 0)
return r;
if (namespace_parameters_mount_apivfs(p)) { if (namespace_parameters_mount_apivfs(p)) {
r = append_static_mounts(&ml, r = append_static_mounts(&ml,
apivfs_table, apivfs_table,
@ -3888,6 +3962,76 @@ static const char* const proc_subset_table[_PROC_SUBSET_MAX] = {
DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset); DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset);
/* String spellings of the PrivateBPF values, indexed by the enum value. */
static const char* const private_bpf_table[_PRIVATE_BPF_MAX] = {
[PRIVATE_BPF_NO] = "no",
[PRIVATE_BPF_YES] = "yes",
};
/* Instantiates the private_bpf string lookup helpers from the table above.
 * NOTE(review): the _WITH_BOOLEAN variant presumably also accepts boolean spellings and maps a true
 * value to PRIVATE_BPF_YES — confirm against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_bpf, PrivateBPF, PRIVATE_BPF_YES);
#include "bpf-delegate-configs.inc"
/* Instantiate the string lookup helpers for the four BPF delegation categories (commands, map types,
 * program types, attach types), all using uint64_t as the value type.
 * NOTE(review): the corresponding string tables are presumably supplied by the
 * "bpf-delegate-configs.inc" include — confirm. */
DEFINE_STRING_TABLE_LOOKUP(bpf_delegate_cmd, uint64_t);
DEFINE_STRING_TABLE_LOOKUP(bpf_delegate_map_type, uint64_t);
DEFINE_STRING_TABLE_LOOKUP(bpf_delegate_prog_type, uint64_t);
DEFINE_STRING_TABLE_LOOKUP(bpf_delegate_attach_type, uint64_t);
/* Renders the bit mask 'u' as a newly allocated string.
 *
 * A mask with all bits set is rendered as "any". Otherwise every set bit is passed to 'parser' and
 * the resulting names are joined with ","; bits for which 'parser' returns NULL are rendered as
 * their decimal bit index. An empty mask yields an empty string.
 *
 * Returns NULL only if memory allocation fails. */
char* bpf_delegate_to_string(uint64_t u, const char * (*parser)(uint64_t) _const_ ) {
        _cleanup_free_ char *joined = NULL;

        assert(parser);

        if (u == UINT64_MAX)
                return strdup("any");

        BIT_FOREACH(bit, u) {
                const char *name = parser(bit);

                if (!name) {
                        /* No symbolic name known for this bit: fall back to its number. */
                        if (strextendf_with_separator(&joined, ",", "%d", bit) < 0)
                                return NULL;
                } else if (!strextend_with_separator(&joined, ",", name))
                        return NULL;
        }

        /* Nothing was appended: hand out an empty string, since NULL signals allocation failure. */
        if (!joined)
                return strdup("");

        return TAKE_PTR(joined);
}
/* Parses a BPF delegation list into a bit mask.
 *
 * 's' is either the literal "any", which sets every bit of '*ret', or a comma-separated list of
 * option names. Each name is translated by 'parser' into a bit index, and the corresponding bit is
 * set in the mask. Names the parser does not recognize are logged and skipped; they do not make the
 * call fail.
 *
 * Returns 0 on success (with '*ret' set), or a negative errno-style value if splitting the list
 * fails (in which case '*ret' is left untouched). */
int bpf_delegate_from_string(const char *s, uint64_t *ret, uint64_t (*parser)(const char *)) {
        int r;

        assert(s);
        assert(ret);
        assert(parser);

        if (streq(s, "any")) {
                *ret = UINT64_MAX;
                return 0;
        }

        uint64_t mask = 0;
        for (;;) {
                _cleanup_free_ char *word = NULL;

                r = extract_first_word(&s, &word, ",", /* flags = */ 0);
                if (r < 0)
                        return log_warning_errno(r, "Failed to parse delegate options \"%s\": %m", s);
                if (r == 0)
                        break;

                /* Keep the parser's result in its native 64-bit type instead of narrowing it to int.
                 * A lookup failure that would show up as a negative int is a very large value when
                 * seen as uint64_t, so one range check rejects both failures and any index that
                 * would make the shift below undefined (shift count >= width of the mask). */
                uint64_t bit = parser(word);
                if (bit >= sizeof(mask) * 8) {
                        log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown BPF delegate option, ignoring: %s", word);
                        continue;
                }

                mask |= UINT64_C(1) << bit;
        }

        *ret = mask;
        return 0;
}
static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = { static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = {
[PRIVATE_TMP_NO] = "no", [PRIVATE_TMP_NO] = "no",
[PRIVATE_TMP_CONNECTED] = "connected", [PRIVATE_TMP_CONNECTED] = "connected",

View File

@ -51,6 +51,13 @@ typedef enum ProcSubset {
_PROC_SUBSET_INVALID = -EINVAL, _PROC_SUBSET_INVALID = -EINVAL,
} ProcSubset; } ProcSubset;
/* Values of the PrivateBPF= setting, which controls whether a private bpffs instance is mounted. */
typedef enum PrivateBPF {
/* "no": no private bpffs instance is mounted. */
PRIVATE_BPF_NO,
/* "yes": a new bpffs instance is mounted on /sys/fs/bpf. */
PRIVATE_BPF_YES,
/* Number of valid values; used to size the string table. */
_PRIVATE_BPF_MAX,
/* Negative sentinel that is not a valid setting. */
_PRIVATE_BPF_INVALID = -EINVAL,
} PrivateBPF;
typedef enum PrivateTmp { typedef enum PrivateTmp {
PRIVATE_TMP_NO, PRIVATE_TMP_NO,
PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */ PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */
@ -188,9 +195,12 @@ typedef struct NamespaceParameters {
ProtectSystem protect_system; ProtectSystem protect_system;
ProtectProc protect_proc; ProtectProc protect_proc;
ProcSubset proc_subset; ProcSubset proc_subset;
PrivateBPF private_bpf;
PrivateTmp private_tmp; PrivateTmp private_tmp;
PrivateTmp private_var_tmp; PrivateTmp private_var_tmp;
PrivatePIDs private_pids; PrivatePIDs private_pids;
int bpffs_socket_fd;
} NamespaceParameters; } NamespaceParameters;
int setup_namespace(const NamespaceParameters *p, char **reterr_path); int setup_namespace(const NamespaceParameters *p, char **reterr_path);
@ -223,6 +233,56 @@ ProtectProc protect_proc_from_string(const char *s) _pure_;
const char* proc_subset_to_string(ProcSubset i) _const_; const char* proc_subset_to_string(ProcSubset i) _const_;
ProcSubset proc_subset_from_string(const char *s) _pure_; ProcSubset proc_subset_from_string(const char *s) _pure_;
/* Conversion between PrivateBPF values and their string spellings. */
const char* private_bpf_to_string(PrivateBPF i) _const_;
PrivateBPF private_bpf_from_string(const char *s) _pure_;
/* Per-category lookups between a single numeric value and its name: BPF commands, map types,
 * program types and attach types. */
const char* bpf_delegate_cmd_to_string(uint64_t u) _const_;
uint64_t bpf_delegate_cmd_from_string(const char *s) _pure_;
const char* bpf_delegate_map_type_to_string(uint64_t u) _const_;
uint64_t bpf_delegate_map_type_from_string(const char *s) _pure_;
const char* bpf_delegate_prog_type_to_string(uint64_t u) _const_;
uint64_t bpf_delegate_prog_type_from_string(const char *s) _pure_;
const char* bpf_delegate_attach_type_to_string(uint64_t u) _const_;
uint64_t bpf_delegate_attach_type_from_string(const char *s) _pure_;
/* Formats a 64-bit mask as a newly allocated string: "any" if all bits are set, otherwise a
 * comma-separated list with one item per set bit, named via 'parser' (or printed as the decimal bit
 * index when 'parser' returns NULL). Returns NULL on allocation failure. */
char* bpf_delegate_to_string(uint64_t u, const char * (*parser)(uint64_t) _const_);
/* Parses "any" or a comma-separated list of names into a bit mask stored in '*ret', translating each
 * name to a bit index via 'parser'. Unrecognized names are logged and skipped. Returns 0 on success,
 * a negative errno-style value if the list cannot be split. */
int bpf_delegate_from_string(const char *s, uint64_t *ret, uint64_t (*parser)(const char *));
/* Convenience wrappers that bind bpf_delegate_from_string()/bpf_delegate_to_string() to the lookup
 * functions of one delegation category each. The *_from_string() variants return what
 * bpf_delegate_from_string() returns; the *_to_string() variants return the newly allocated string
 * produced by bpf_delegate_to_string(). */

/* Commands: parse a list of command names into a mask. */
static inline int bpf_delegate_commands_from_string(const char *s, uint64_t *ret) {
return bpf_delegate_from_string(s, ret, bpf_delegate_cmd_from_string);
}
/* Commands: format a mask as a list of command names. */
static inline char * bpf_delegate_commands_to_string(uint64_t u) {
return bpf_delegate_to_string(u, bpf_delegate_cmd_to_string);
}
/* Maps: parse a list of map type names into a mask. */
static inline int bpf_delegate_maps_from_string(const char *s, uint64_t *ret) {
return bpf_delegate_from_string(s, ret, bpf_delegate_map_type_from_string);
}
/* Maps: format a mask as a list of map type names. */
static inline char * bpf_delegate_maps_to_string(uint64_t u) {
return bpf_delegate_to_string(u, bpf_delegate_map_type_to_string);
}
/* Programs: parse a list of program type names into a mask. */
static inline int bpf_delegate_programs_from_string(const char *s, uint64_t *ret) {
return bpf_delegate_from_string(s, ret, bpf_delegate_prog_type_from_string);
}
/* Programs: format a mask as a list of program type names. */
static inline char * bpf_delegate_programs_to_string(uint64_t u) {
return bpf_delegate_to_string(u, bpf_delegate_prog_type_to_string);
}
/* Attachments: parse a list of attach type names into a mask. */
static inline int bpf_delegate_attachments_from_string(const char *s, uint64_t *ret) {
return bpf_delegate_from_string(s, ret, bpf_delegate_attach_type_from_string);
}
/* Attachments: format a mask as a list of attach type names. */
static inline char * bpf_delegate_attachments_to_string(uint64_t u) {
return bpf_delegate_to_string(u, bpf_delegate_attach_type_to_string);
}
const char* private_tmp_to_string(PrivateTmp i) _const_; const char* private_tmp_to_string(PrivateTmp i) _const_;
PrivateTmp private_tmp_from_string(const char *s) _pure_; PrivateTmp private_tmp_from_string(const char *s) _pure_;

View File

@ -509,12 +509,14 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
if (rmdir(full) < 0) if (rmdir(full) < 0)
return log_error_errno(errno, "Failed to remove %s: %m", full); return log_error_errno(errno, "Failed to remove %s: %m", full);
/* Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys/ read-only. */ /* Create mountpoints. Otherwise we are not allowed since we remount /sys/ read-only. */
_cleanup_free_ char *x = path_join(top, "/fs/cgroup"); FOREACH_STRING(p, "/fs/cgroup", "/fs/bpf") {
if (!x) _cleanup_free_ char *x = path_join(top, p);
return log_oom(); if (!x)
return log_oom();
(void) mkdir_p(x, 0755); (void) mkdir_p(x, 0755);
}
return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL, return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL,
MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL); MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);

View File

@ -2425,6 +2425,11 @@ static const BusProperty execute_properties[] = {
{ "MountImagePolicy", bus_append_string }, { "MountImagePolicy", bus_append_string },
{ "ExtensionImagePolicy", bus_append_string }, { "ExtensionImagePolicy", bus_append_string },
{ "PrivatePIDs", bus_append_string }, { "PrivatePIDs", bus_append_string },
{ "PrivateBPF", bus_append_string },
{ "BPFDelegateCommands", bus_append_string },
{ "BPFDelegateMaps", bus_append_string },
{ "BPFDelegatePrograms", bus_append_string },
{ "BPFDelegateAttachments", bus_append_string },
{ "IgnoreSIGPIPE", bus_append_parse_boolean }, { "IgnoreSIGPIPE", bus_append_parse_boolean },
{ "TTYVHangup", bus_append_parse_boolean }, { "TTYVHangup", bus_append_parse_boolean },
{ "TTYReset", bus_append_parse_boolean }, { "TTYReset", bus_append_parse_boolean },

View File

@ -486,6 +486,12 @@ executables += [
'sources' : files('test-bpf-restrict-fs.c'), 'sources' : files('test-bpf-restrict-fs.c'),
'dependencies' : common_test_dependencies, 'dependencies' : common_test_dependencies,
}, },
core_test_template + {
'sources' : files('test-bpf-token.c'),
'dependencies' : common_test_dependencies + libbpf,
'conditions' : ['BPF_FRAMEWORK'],
'type' : 'manual',
},
core_test_template + { core_test_template + {
'sources' : files('test-cgroup-cpu.c'), 'sources' : files('test-cgroup-cpu.c'),
}, },

28
src/test/test-bpf-token.c Normal file
View File

@ -0,0 +1,28 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <bpf/bpf.h>
#include <fcntl.h>
#include "fd-util.h"
#include "main-func.h"
#include "tests.h"
/* Tries to create a BPF token from the bpffs at /sys/fs/bpf.
 *
 * Returns 0 if a token could be created, or a negative errno if opening the directory or creating
 * the token fails. When built against libbpf older than 1.5 the check cannot be performed and the
 * process exits with status 77 instead. */
static int run(int argc, char *argv[]) {
#if __LIBBPF_CURRENT_VERSION_GEQ(1, 5)
        /* The token is derived from an open file descriptor of the bpffs mount point. */
        _cleanup_close_ int mount_dir_fd = open("/sys/fs/bpf", O_RDONLY);
        if (mount_dir_fd < 0)
                return -errno;

        _cleanup_close_ int tok_fd = bpf_token_create(mount_dir_fd, /* opts = */ NULL);
        if (tok_fd < 0)
                return -errno;

        return 0;
#else
        exit(77);
#endif
}
DEFINE_MAIN_FUNCTION(run);

View File

@ -617,6 +617,8 @@ def main() -> None:
'--credential', f"journal.storage={'persistent' if sys.stdin.isatty() else args.storage}", '--credential', f"journal.storage={'persistent' if sys.stdin.isatty() else args.storage}",
*(['--runtime-build-sources=no', '--register=no'] if not sys.stdin.isatty() else []), *(['--runtime-build-sources=no', '--register=no'] if not sys.stdin.isatty() else []),
'vm' if args.vm or os.getuid() != 0 or os.getenv('TEST_PREFER_QEMU', '0') == '1' else 'boot', 'vm' if args.vm or os.getuid() != 0 or os.getenv('TEST_PREFER_QEMU', '0') == '1' else 'boot',
*(['--', '--capability=CAP_BPF'] \
if not args.vm and os.getenv('TEST_PREFER_QEMU', '0') == '0' else []),
] # fmt: skip ] # fmt: skip
try: try:

View File

@ -0,0 +1,71 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
set -eux
set -o pipefail

TEST_BPF_TOKEN=/usr/lib/systemd/tests/unit-tests/manual/test-bpf-token

# Run a command through systemd-run with the sandbox settings shared by every check
# in this script; extra -p properties and the command itself are passed as arguments.
run_sandboxed() {
    systemd-run --wait \
        -p PrivateUsers=yes \
        -p PrivateMounts=yes \
        -p DelegateNamespaces=mnt \
        "$@"
}

# With ProtectKernelTunables=yes and PrivateBPF=no the host bpffs must show up read-only.
run_sandboxed \
    -p ProtectKernelTunables=yes \
    -p PrivateBPF=no \
    grep -q '/sys/fs/bpf .* ro,' /proc/mounts

# With PrivateBPF=yes a fresh, writable bpffs instance must be mounted.
run_sandboxed \
    -p PrivateBPF=yes \
    grep -q '^none /sys/fs/bpf bpf rw' /proc/mounts

# Each BPFDelegate*= property must be reflected in the mount options of the private bpffs.
check_mount_opts() {
    local delegate=$1 mnt_opts=$2

    run_sandboxed \
        -p PrivateBPF=yes \
        -p "$delegate" \
        grep -q "$mnt_opts" /proc/mounts
}

check_mount_opts 'BPFDelegateCommands=BPFObjPin,BPFBtfLoad,BPFMapFreeze,BPFLinkDetach' 'delegate_cmds=obj_pin:btf_load:map_freeze:link_detach'
check_mount_opts 'BPFDelegateMaps=BPFMapTypeArray,BPFMapTypeCpumap,BPFMapTypeRingbuf' 'delegate_maps=array:cpumap:ringbuf'
check_mount_opts 'BPFDelegatePrograms=BPFProgTypeTracepoint,BPFProgTypeXdp,BPFProgTypeTracing' 'delegate_progs=tracepoint:xdp:tracing'
check_mount_opts 'BPFDelegateAttachments=BPFFlowDissector,BPFCgroupSysctl,BPFNetfilter' 'delegate_attachs=flow_dissector:cgroup_sysctl:netfilter'

# The test-bpf-token helper is only built when BPF support is enabled; stop here otherwise.
if systemctl --version | grep -q -- -BPF_FRAMEWORK; then
    exit 0
fi

# The helper exits with 77 when it was compiled against libbpf older than 1.5.0.
# In that case skip the remaining checks instead of failing the whole test.
# Any other exit status of this probe run is deliberately ignored.
rc=0
"$TEST_BPF_TOKEN" || rc=$?
if [ "$rc" -eq 77 ]; then
    exit 0
fi

# With a delegation configured, the helper must be able to obtain a BPF token.
run_sandboxed \
    -p PrivateBPF=yes \
    -p BPFDelegateCommands=BPFProgLoad \
    "$TEST_BPF_TOKEN"

# Without any delegation, the helper must fail to obtain a token.
! run_sandboxed \
    -p PrivateBPF=yes \
    "$TEST_BPF_TOKEN"

View File

@ -10,6 +10,8 @@ class CustomResolver(tree.Resolver):
return self.resolve_filename('man/custom-entities.ent', context) return self.resolve_filename('man/custom-entities.ent', context)
if 'ethtool-link-mode' in url: if 'ethtool-link-mode' in url:
return self.resolve_filename('src/shared/ethtool-link-mode.xml', context) return self.resolve_filename('src/shared/ethtool-link-mode.xml', context)
if 'bpf-delegate' in url:
return self.resolve_filename('man/bpf-delegate.xml', context)
return None return None