1
0
mirror of https://github.com/systemd/systemd synced 2026-03-16 10:04:47 +01:00

Compare commits

..

15 Commits

Author SHA1 Message Date
Yu Watanabe
354dadb30f nspawn: fix build failure
This fixes a conflict between #19555 and #19653.
2021-05-20 10:04:14 +02:00
Anita Zhang
1539124b39 man: document default rlimits
Fixes #19645
2021-05-20 09:58:48 +02:00
Zbigniew Jędrzejewski-Szmek
f78ad5f046 test: enable fuzz regression tests by default
This ensures that the fuzz test code is also built by default.
It also increases the test coverage a bit. Compiling the tests
*with* sanitizers is painfully slow, so this is not enabled. But
just compiling them plain (without sanitizers) is hardly noticeable. Running the tests
increases the test count and runtime:
  622 tests, 26 s
to
  922 tests, 35 s
I think this is acceptable.
2021-05-20 09:30:43 +02:00
Lennart Poettering
d99c2df2df
Merge pull request #19555 from poettering/nspawn-bind-user
nspawn: add --bind-user= feature for binding a host user+homedir into a container
2021-05-20 07:33:51 +02:00
Lennart Poettering
48b4a760c9
Merge pull request #19591 from poettering/terminal-fixes
five terminal handling fixes
2021-05-20 07:33:21 +02:00
Lennart Poettering
11f3c130aa terminal: don't hardcode major number of PTYs
Hardcoding major numbers sucks. And we generally don't do it, except
when determining whether something is a PTY. The thing is, though, that we
don't actually need to do that here either, hence don't.
2021-05-19 17:58:01 +02:00
Lennart Poettering
a06c9ac277 man: document new nspawn --bind-user= feature 2021-05-19 17:46:59 +02:00
Lennart Poettering
2f89304490 nspawn: add new --bind-user= option for binding a host user into the container
This new option does three things for a host user specified via
--bind-user=:

1. Bind mount the home directory from the host directory into
   /run/host/home/<username>

2. Install an additional user namespace UID/GID mapping, mapping the host
   UID/GID of the host user to an unused one from the container in the range
   60514…60577.

3. Synthesize a user/group record for the user/group under the same name
   as on the host, with minimized information, and the UID/GID set to
   the mapped UID/GID. This data is written to /run/host/userdb/ where
   nss-systemd will pick it up.

This should make sharing users and home directories from host into the
container pretty seamless, under some conditions:

1. User namespacing must be used.

2. The host UID/GID of the user/group cannot be in the range assigned to
   the container (the kernel already refuses this, as this would mean two
   host UIDs/GIDs might end up being mapped to the same container
   UID/GID).

3. There's a free UID/GID in the aforementioned range in the container,
   and the name of the user/group is not used in the container.

4. Container payload is new enough to include an nss-systemd version
   that picks up records from /run/host/userdb/
2021-05-19 17:46:59 +02:00
Lennart Poettering
91181e075b nspawn: export userns_mkdir() + userns_lchown() so that it can be used elsewhere in nspawn 2021-05-19 17:33:25 +02:00
Lennart Poettering
1a298a206c user-record: optionally, allow parsing empty user record JSON objects 2021-05-19 17:33:25 +02:00
Lennart Poettering
0ba976e8da execute: don't chown/chmod non-TTY inodes thinking they were TTYs
Fixes: #19213

This is a safety net for invalid configurations, see the original bug
report.
2021-05-19 17:12:01 +02:00
Lennart Poettering
f2df231fed core: use GID_INVALID instead of -1 where appropriate 2021-05-19 17:12:01 +02:00
Lennart Poettering
4768529ff1 terminal-util: use _cleanup_close_ where appropriate 2021-05-19 17:12:01 +02:00
Lennart Poettering
e60a4a3c46 terminal-util: add extra validity checks that we operate on a TTY before doing so
Prompted by #19213, but not fixing it.

This is mostly paranoia that we don't do stuff on inodes that aren't
actually ttys.
2021-05-19 16:53:50 +02:00
Lennart Poettering
7eaee90286 terminal-util: add debug logging for when TTY ioctls fail 2021-05-19 16:53:50 +02:00
20 changed files with 964 additions and 73 deletions

View File

@ -241,8 +241,9 @@ the artifacts the container manager persistently leaves in the system.
| 5 | `tty` group | `systemd` | `/etc/passwd` |
| 6…999 | System users | Distributions | `/etc/passwd` |
| 1000…60000 | Regular users | Distributions | `/etc/passwd` + LDAP/NIS/… |
| 60001…60513 | Human Users (homed) | `systemd` | `nss-systemd` |
| 60514…61183 | Unused | | |
| 60001…60513 | Human users (homed) | `systemd` | `nss-systemd` |
| 60514…60577 | Host users mapped into containers | `systemd` | `systemd-nspawn` |
| 60578…61183 | Unused | | |
| 61184…65519 | Dynamic service users | `systemd` | `nss-systemd` |
| 65520…65533 | Unused | | |
| 65534 | `nobody` user | Linux | `/etc/passwd` + `nss-systemd` |

View File

@ -11,5 +11,6 @@
<!ENTITY KILL_USER_PROCESSES "{{ 'yes' if KILL_USER_PROCESSES else 'no' }}">
<!ENTITY DEBUGTTY "{{DEBUGTTY}}">
<!ENTITY RC_LOCAL_PATH "{{RC_LOCAL_PATH}}">
<!ENTITY HIGH_RLIMIT_NOFILE "{{HIGH_RLIMIT_NOFILE}}">
<!ENTITY fedora_latest_version "34">
<!ENTITY fedora_cloud_release "1.2">

View File

@ -1352,6 +1352,58 @@ After=sys-subsystem-net-devices-ens1.device</programlisting>
make them read-only, using <option>--bind-ro=</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--bind-user=</option></term>
<listitem><para>Binds the home directory of the specified user on the host into the container. Takes
the name of an existing user on the host as argument. May be used multiple times to bind multiple
users into the container. This does three things:</para>
<orderedlist>
<listitem><para>The user's home directory is bind mounted from the host into
<filename>/run/host/home/</filename>.</para></listitem>
<listitem><para>An additional UID/GID mapping is added that maps the host user's UID/GID to a
container UID/GID, allocated from the 60514…60577 range.</para></listitem>
<listitem><para>A JSON user and group record is generated in <filename>/run/host/userdb/</filename> that
describes the mapped user. It contains a minimized representation of the host's user record,
adjusted to the UID/GID and home directory path assigned to the user in the container. The
<citerefentry><refentrytitle>nss-systemd</refentrytitle><manvolnum>8</manvolnum></citerefentry>
glibc NSS module will pick up these records from there and make them available in the container's
user/group databases.</para></listitem>
</orderedlist>
<para>The combination of the three operations above ensures that it is possible to log into the
host's user account inside the container as if it was local to the container. The user is only mapped
transiently, while the container is running and the mapping itself does not result in persistent
changes to the container (except maybe for generated log messages at login time, and similar). Note
that in particular the UID/GID assignment in the container is not made persistently. If the user is
mapped transiently, it is best to not allow the user to make persistent changes to the container. If
the user leaves files or directories owned by the user, and those UIDs/GIDs are recycled during later
container invocations (possibly with a different <option>--bind-user=</option> mapping), those files
and directories will be accessible to the "new" user.</para>
<para>The user/group record mapping only works if the container contains systemd 249 or newer, with
<command>nss-systemd</command> properly configured in <filename>nsswitch.conf</filename>. See
<citerefentry><refentrytitle>nss-systemd</refentrytitle><manvolnum>8</manvolnum></citerefentry> for
details.</para>
<para>Note that the user record propagated from the host into the container will contain the UNIX
password hash of the user, so that seamless logins in the container are possible. If the container is
less trusted than the host it's hence important to use a strong UNIX password hash function
(e.g. yescrypt or similar, with the <literal>$y$</literal> hash prefix).</para>
<para>When binding a user from the host into the container, checks are executed to ensure that the
username is not yet known in the container. Moreover, it is checked that the UID/GID allocated for it
is not currently defined in the user/group databases of the container. Both checks directly access
the container's <filename>/etc/passwd</filename> and <filename>/etc/group</filename>, and thus might
not detect existing accounts in other databases.</para>
<para>This operation is only supported in combination with
<option>--private-users=</option>/<option>-U</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--inaccessible=</option></term>

View File

@ -408,7 +408,23 @@
<varname>LimitXXX=</varname> directives and they accept the same parameter syntax,
see <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry>
for details. Note that these resource limits are only defaults
for units, they are not applied to the service manager process (i.e. PID 1) itself.</para></listitem>
for units, they are not applied to the service manager process (i.e. PID 1) itself.</para>
<para>Most of these settings are unset, which means the resource limits are inherited from the kernel or, if
invoked in a container, from the container manager. However, the following have defaults:</para>
<itemizedlist>
<listitem><para><varname>DefaultLimitNOFILE=</varname> defaults to <literal>1024:&HIGH_RLIMIT_NOFILE;</literal>.
</para></listitem>
<listitem><para><varname>DefaultLimitCORE=</varname> does not have a default but it is worth mentioning that
<varname>RLIMIT_CORE</varname> is set to <literal>infinity</literal> by PID 1 which is inherited by its
children.</para></listitem>
<listitem><para>Note that the service manager internally increases <varname>RLIMIT_MEMLOCK</varname> for
itself, however the limit is reverted to the original value for child processes forked off.</para></listitem>
</itemizedlist>
</listitem>
</varlistentry>
<varlistentry>

View File

@ -415,6 +415,16 @@
is privileged (see above).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>BindUser=</varname></term>
<listitem><para>Binds a user from the host into the container. This option is equivalent to the
command line switch <option>--bind-user=</option>, see
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
for details about the specific options supported. This setting is privileged (see
above).</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>TemporaryFileSystem=</varname></term>

View File

@ -3440,7 +3440,7 @@ foreach tuple : fuzzers
name = sources[0].split('/')[-1].split('.')[0]
fuzzer_exes += executable(
exe = executable(
name,
sources,
include_directories : [incs, include_directories('src/fuzz')],
@ -3449,7 +3449,23 @@ foreach tuple : fuzzers
c_args : defs + test_cflags,
link_args: link_args,
install : false,
build_by_default : fuzz_tests or fuzzer_build)
build_by_default : fuzzer_build)
fuzzer_exes += exe
if want_tests != 'false'
# Run the fuzz regression tests without any sanitizers enabled.
# Additional invocations with sanitizers may be added below.
foreach p : fuzz_regression_tests
b = p.split('/')[-2]
c = p.split('/')[-1]
if b == name
test('@0@_@1@'.format(b, c),
exe,
args : [join_paths(project_source_root, p)])
endif
endforeach
endif
endforeach
run_target(

View File

@ -393,7 +393,7 @@ option('tests', type : 'combo', choices : ['true', 'unsafe', 'false'],
option('slow-tests', type : 'boolean', value : 'false',
description : 'run the slow tests by default')
option('fuzz-tests', type : 'boolean', value : 'false',
description : 'run the fuzzer regression tests by default')
description : 'run the fuzzer regression tests by default (with sanitizers)')
option('install-tests', type : 'boolean', value : 'false',
description : 'install test executables')

View File

@ -37,6 +37,7 @@
#include "process-util.h"
#include "socket-util.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
#include "terminal-util.h"
@ -240,22 +241,27 @@ int reset_terminal_fd(int fd, bool switch_to_text) {
assert(fd >= 0);
/* We leave locked terminal attributes untouched, so that
* Plymouth may set whatever it wants to set, and we don't
* interfere with that. */
if (isatty(fd) < 1)
return log_debug_errno(errno, "Asked to reset a terminal that actually isn't a terminal: %m");
/* We leave locked terminal attributes untouched, so that Plymouth may set whatever it wants to set,
* and we don't interfere with that. */
/* Disable exclusive mode, just in case */
(void) ioctl(fd, TIOCNXCL);
if (ioctl(fd, TIOCNXCL) < 0)
log_debug_errno(errno, "TIOCNXCL ioctl failed on TTY, ignoring: %m");
/* Switch to text mode */
if (switch_to_text)
(void) ioctl(fd, KDSETMODE, KD_TEXT);
if (ioctl(fd, KDSETMODE, KD_TEXT) < 0)
log_debug_errno(errno, "KDSETMODE ioctl for switching to text mode failed on TTY, ignoring: %m");
/* Set default keyboard mode */
(void) vt_reset_keyboard(fd);
if (tcgetattr(fd, &termios) < 0) {
r = -errno;
r = log_debug_errno(errno, "Failed to get terminal parameters: %m");
goto finish;
}
@ -311,14 +317,13 @@ int reset_terminal(const char *name) {
}
int open_terminal(const char *name, int mode) {
_cleanup_close_ int fd = -1;
unsigned c = 0;
int fd;
/*
* If a TTY is in the process of being closed opening it might
* cause EIO. This is horribly awful, but unlikely to be
* changed in the kernel. Hence we work around this problem by
* retrying a couple of times.
* If a TTY is in the process of being closed opening it might cause EIO. This is horribly awful, but
* unlikely to be changed in the kernel. Hence we work around this problem by retrying a couple of
* times.
*
* https://bugs.launchpad.net/ubuntu/+source/linux/+bug/554172/comments/245
*/
@ -338,16 +343,14 @@ int open_terminal(const char *name, int mode) {
if (c >= 20)
return -errno;
usleep(50 * USEC_PER_MSEC);
(void) usleep(50 * USEC_PER_MSEC);
c++;
}
if (isatty(fd) <= 0) {
safe_close(fd);
return -ENOTTY;
}
if (isatty(fd) < 1)
return negative_errno();
return fd;
return TAKE_FD(fd);
}
int acquire_terminal(
@ -960,7 +963,9 @@ int get_ctty_devnr(pid_t pid, dev_t *d) {
}
int get_ctty(pid_t pid, dev_t *ret_devnr, char **ret) {
_cleanup_free_ char *fn = NULL, *b = NULL;
char pty[STRLEN("/dev/pts/") + DECIMAL_STR_MAX(dev_t) + 1];
_cleanup_free_ char *buf = NULL;
const char *fn = NULL, *w;
dev_t devnr;
int r;
@ -968,44 +973,53 @@ int get_ctty(pid_t pid, dev_t *ret_devnr, char **ret) {
if (r < 0)
return r;
r = device_path_make_canonical(S_IFCHR, devnr, &fn);
r = device_path_make_canonical(S_IFCHR, devnr, &buf);
if (r < 0) {
struct stat st;
if (r != -ENOENT) /* No symlink for this in /dev/char/? */
return r;
if (major(devnr) == 136) {
/* This is an ugly hack: PTY devices are not listed in /dev/char/, as they don't follow the
* Linux device model. This means we have no nice way to match them up against their actual
* device node. Let's hence do the check by the fixed, assigned major number. Normally we try
* to avoid such fixed major/minor matches, but there appears to nother nice way to handle
* this. */
/* Maybe this is PTY? PTY devices are not listed in /dev/char/, as they don't follow the
* Linux device model and hence device_path_make_canonical() doesn't work for them. Let's
* assume this is a PTY for a moment, and check if the device node this would then map to in
* /dev/pts/ matches the one we are looking for. This way we don't have to hardcode the major
* number (which is 136 btw), but we still rely on the fact that PTY numbers map directly to
* the minor number of the pty. */
xsprintf(pty, "/dev/pts/%u", minor(devnr));
if (asprintf(&b, "pts/%u", minor(devnr)) < 0)
return -ENOMEM;
} else {
/* Probably something similar to the ptys which have no symlink in /dev/char/. Let's return
* something vaguely useful. */
if (stat(pty, &st) < 0) {
if (errno != ENOENT)
return -errno;
r = device_path_make_major_minor(S_IFCHR, devnr, &fn);
} else if (S_ISCHR(st.st_mode) && devnr == st.st_rdev) /* Bingo! */
fn = pty;
if (!fn) {
/* Doesn't exist, or not a PTY? Probably something similar to the PTYs which have no
* symlink in /dev/char/. Let's return something vaguely useful. */
r = device_path_make_major_minor(S_IFCHR, devnr, &buf);
if (r < 0)
return r;
fn = buf;
}
}
} else
fn = buf;
if (!b) {
const char *w;
w = path_startswith(fn, "/dev/");
if (!w)
return -EINVAL;
w = path_startswith(fn, "/dev/");
if (w) {
b = strdup(w);
if (!b)
return -ENOMEM;
} else
b = TAKE_PTR(fn);
}
if (ret) {
_cleanup_free_ char *b = NULL;
b = strdup(w);
if (!b)
return -ENOMEM;
if (ret)
*ret = TAKE_PTR(b);
}
if (ret_devnr)
*ret_devnr = devnr;
@ -1326,6 +1340,9 @@ int vt_restore(int fd) {
};
int r, q = 0;
if (isatty(fd) < 1)
return log_debug_errno(errno, "Asked to restore the VT for an fd that does not refer to a terminal: %m");
if (ioctl(fd, KDSETMODE, KD_TEXT) < 0)
q = log_debug_errno(errno, "Failed to set VT in text mode, ignoring: %m");
@ -1359,6 +1376,9 @@ int vt_release(int fd, bool restore) {
* sent by the kernel and optionally reset the VT in text and auto
* VT-switching modes. */
if (isatty(fd) < 1)
return log_debug_errno(errno, "Asked to release the VT for an fd that does not refer to a terminal: %m");
if (ioctl(fd, VT_RELDISP, 1) < 0)
return -errno;

View File

@ -744,7 +744,7 @@ static int chown_terminal(int fd, uid_t uid) {
}
/* This might fail. What matters are the results. */
r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
if (r < 0)
return r;
@ -5775,6 +5775,9 @@ void exec_context_free_log_extra_fields(ExecContext *c) {
}
void exec_context_revert_tty(ExecContext *c) {
_cleanup_close_ int fd = -1;
const char *path;
struct stat st;
int r;
assert(c);
@ -5785,17 +5788,33 @@ void exec_context_revert_tty(ExecContext *c) {
/* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
* configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
* by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
if (!exec_context_may_touch_tty(c))
return;
if (exec_context_may_touch_tty(c)) {
const char *path;
path = exec_context_tty_path(c);
if (!path)
return;
path = exec_context_tty_path(c);
if (path) {
r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
if (r < 0 && r != -ENOENT)
log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
}
}
fd = open(path, O_PATH|O_CLOEXEC);
if (fd < 0)
return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
"Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
path);
if (fstat(fd, &st) < 0)
return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
/* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
* if things are a character device, since a proper check either means we'd have to open the TTY and
* use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
* and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
* with this at all? https://github.com/systemd/systemd/issues/19213 */
if (!S_ISCHR(st.st_mode))
return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
if (r < 0)
log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
}
int exec_context_get_clean_directories(

View File

@ -1,6 +1,8 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
libnspawn_core_sources = files('''
nspawn-bind-user.c
nspawn-bind-user.h
nspawn-cgroup.c
nspawn-cgroup.h
nspawn-creds.c
@ -26,6 +28,7 @@ libnspawn_core_sources = files('''
nspawn-setuid.h
nspawn-stub-pid1.c
nspawn-stub-pid1.h
nspawn.h
'''.split())
nspawn_gperf_c = custom_target(

View File

@ -0,0 +1,478 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "fs-util.h"
#include "nspawn-bind-user.h"
#include "nspawn.h"
#include "path-util.h"
#include "user-util.h"
#include "userdb.h"
#define MAP_UID_START 60514
#define MAP_UID_END 60577
static int check_etc_passwd_collisions(
const char *directory,
const char *name,
uid_t uid) {
_cleanup_fclose_ FILE *f = NULL;
int r;
assert(directory);
assert(name || uid_is_valid(uid));
r = chase_symlinks_and_fopen_unlocked("/etc/passwd", directory, CHASE_PREFIX_ROOT, "re", &f, NULL);
if (r == -ENOENT)
return 0; /* no user database? then no user, hence no collision */
if (r < 0)
return log_error_errno(r, "Failed to open /etc/passwd of container: %m");
for (;;) {
struct passwd *pw;
r = fgetpwent_sane(f, &pw);
if (r < 0)
return log_error_errno(r, "Failed to iterate through /etc/passwd of container: %m");
if (r == 0) /* EOF */
return 0; /* no collision */
if (name && streq_ptr(pw->pw_name, name))
return 1; /* name collision */
if (uid_is_valid(uid) && pw->pw_uid == uid)
return 1; /* UID collision */
}
}
static int check_etc_group_collisions(
const char *directory,
const char *name,
gid_t gid) {
_cleanup_fclose_ FILE *f = NULL;
int r;
assert(directory);
assert(name || gid_is_valid(gid));
r = chase_symlinks_and_fopen_unlocked("/etc/group", directory, CHASE_PREFIX_ROOT, "re", &f, NULL);
if (r == -ENOENT)
return 0; /* no group database? then no group, hence no collision */
if (r < 0)
return log_error_errno(r, "Failed to open /etc/group of container: %m");
for (;;) {
struct group *gr;
r = fgetgrent_sane(f, &gr);
if (r < 0)
return log_error_errno(r, "Failed to iterate through /etc/group of container: %m");
if (r == 0)
return 0; /* no collision */
if (name && streq_ptr(gr->gr_name, name))
return 1; /* name collision */
if (gid_is_valid(gid) && gr->gr_gid == gid)
return 1; /* gid collision */
}
}
/* Derives the user and group records that shall be placed into the container for host user 'u' and
 * its group 'g'.
 *
 * The function first refuses (EBUSY) if the user name or the group name already shows up in the
 * /etc/passwd or /etc/group below 'directory'. It then builds fresh, reduced records: name,
 * disposition (if set) and hashed password (if any) are taken over from the host records, while UID
 * and GID are both set to 'allocate_uid' and the home directory is placed below /run/host/home/.
 *
 * On success 0 is returned and the new records are handed to the caller via 'ret_converted_user' and
 * 'ret_converted_group'. On failure the (already logged) negative error is returned. */
static int convert_user(
                const char *directory,
                UserRecord *u,
                GroupRecord *g,
                uid_t allocate_uid,
                UserRecord **ret_converted_user,
                GroupRecord **ret_converted_group) {

        _cleanup_(group_record_unrefp) GroupRecord *payload_group = NULL;
        _cleanup_(user_record_unrefp) UserRecord *payload_user = NULL;
        _cleanup_free_ char *container_home = NULL;
        JsonVariant *privileged_section, *hashed_passwords;
        int r;

        assert(u);
        assert(g);
        assert(u->gid == g->gid);

        /* Only the name is checked here; the numeric ID passed in is the invalid one. */
        r = check_etc_passwd_collisions(directory, u->user_name, UID_INVALID);
        if (r < 0)
                return r;
        if (r != 0)
                return log_error_errno(SYNTHETIC_ERRNO(EBUSY),
                                       "Sorry, the user '%s' already exists in the container.", u->user_name);

        r = check_etc_group_collisions(directory, g->group_name, GID_INVALID);
        if (r < 0)
                return r;
        if (r != 0)
                return log_error_errno(SYNTHETIC_ERRNO(EBUSY),
                                       "Sorry, the group '%s' already exists in the container.", g->group_name);

        container_home = path_join("/run/host/home/", u->user_name);
        if (!container_home)
                return log_oom();

        /* Reference the hashed password array of the source record directly instead of rebuilding it, so
         * that it retains the JSON_VARIANT_SENSITIVE flag */
        privileged_section = json_variant_by_key(u->json, "privileged");
        hashed_passwords = privileged_section ? json_variant_by_key(privileged_section, "hashedPassword") : NULL;

        r = user_record_build(
                        &payload_user,
                        JSON_BUILD_OBJECT(
                                        JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(u->user_name)),
                                        JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(allocate_uid)),
                                        JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(allocate_uid)),
                                        JSON_BUILD_PAIR_CONDITION(u->disposition >= 0, "disposition", JSON_BUILD_STRING(user_disposition_to_string(u->disposition))),
                                        JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_STRING(container_home)),
                                        JSON_BUILD_PAIR("service", JSON_BUILD_STRING("io.systemd.NSpawn")),
                                        JSON_BUILD_PAIR_CONDITION(!strv_isempty(u->hashed_password), "privileged", JSON_BUILD_OBJECT(
                                                                                  JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_VARIANT(hashed_passwords))))));
        if (r < 0)
                return log_error_errno(r, "Failed to build container user record: %m");

        r = group_record_build(
                        &payload_group,
                        JSON_BUILD_OBJECT(
                                        JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(g->group_name)),
                                        JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(allocate_uid)),
                                        JSON_BUILD_PAIR_CONDITION(g->disposition >= 0, "disposition", JSON_BUILD_STRING(user_disposition_to_string(g->disposition))),
                                        JSON_BUILD_PAIR("service", JSON_BUILD_STRING("io.systemd.NSpawn"))));
        if (r < 0)
                return log_error_errno(r, "Failed to build container group record: %m");

        /* Everything succeeded, pass ownership of both records to the caller. */
        *ret_converted_user = TAKE_PTR(payload_user);
        *ret_converted_group = TAKE_PTR(payload_group);

        return 0;
}
/* Advances *current_uid until it names an ID that is used neither as UID in the container's
 * /etc/passwd nor as GID in the container's /etc/group (the ID is used for both purposes).
 *
 * The search starts at the value *current_uid has on entry and fails with EBUSY as soon as the
 * candidate exceeds MAP_UID_END or 'max_uid'.
 *
 * Returns 0 with *current_uid set to the free ID, or the (already logged) negative error. */
static int find_free_uid(const char *directory, uid_t max_uid, uid_t *current_uid) {
        int r;

        assert(directory);
        assert(current_uid);

        for (;; (*current_uid)++) {
                if (*current_uid > MAP_UID_END || *current_uid > max_uid)
                        /* Separate the two range bounds in the message, and cast the plain integer
                         * constants so that they match the UID_FMT conversion. */
                        return log_error_errno(
                                        SYNTHETIC_ERRNO(EBUSY),
                                        "No suitable available UID in range " UID_FMT "…" UID_FMT " in container detected, can't map user.",
                                        (uid_t) MAP_UID_START, (uid_t) MAP_UID_END);

                r = check_etc_passwd_collisions(directory, NULL, *current_uid);
                if (r < 0)
                        return r;
                if (r > 0) /* already used */
                        continue;

                /* We want to use the UID also as GID, hence check for it in /etc/group too */
                r = check_etc_group_collisions(directory, NULL, (gid_t) *current_uid);
                if (r < 0)
                        return r;
                if (r == 0) /* free! yay! */
                        return 0;
        }
}
/* Destroys a BindUserContext: drops the references to the host and payload user/group records of
 * every entry, releases the entry array and finally the context object itself.
 *
 * Accepts NULL, in which case NULL is returned. Otherwise the result of mfree() is returned. */
BindUserContext* bind_user_context_free(BindUserContext *c) {
        if (!c)
                return NULL;

        assert(c->n_data == 0 || c->data);

        for (size_t i = 0; i < c->n_data; i++) {
                user_record_unref(c->data[i].host_user);
                group_record_unref(c->data[i].host_group);
                user_record_unref(c->data[i].payload_user);
                group_record_unref(c->data[i].payload_group);
        }

        /* The entry array is a separate allocation (grown via GREEDY_REALLOC() in
         * bind_user_prepare()), hence it has to be released separately too. */
        c->data = mfree(c->data);

        return mfree(c);
}
/* Prepares binding the host users listed in 'bind_user' into the container rooted at 'directory'.
 *
 * For each listed user this resolves the host user and group records, picks a free container UID/GID
 * starting at MAP_UID_START, builds the container-side records, and appends a bind mount entry for
 * the home directory to *custom_mounts (incrementing *n_custom_mounts).
 *
 * Returns 0 with *ret set to NULL if 'bind_user' is empty, 1 with *ret set to a newly allocated
 * BindUserContext if at least one user was processed, and a negative error (already logged) on
 * failure. Mount entries appended before a failure are not removed again. */
int bind_user_prepare(
const char *directory,
char **bind_user,
uid_t uid_shift,
uid_t uid_range,
CustomMount **custom_mounts,
size_t *n_custom_mounts,
BindUserContext **ret) {
_cleanup_(bind_user_context_freep) BindUserContext *c = NULL;
uid_t current_uid = MAP_UID_START;
char **n;
int r;
assert(custom_mounts);
assert(n_custom_mounts);
assert(ret);
/* This resolves the users specified in 'bind_user', generates a minimalized JSON user + group record
* for it to stick in the container, allocates a UID/GID for it, and updates the custom mount table,
* to include an appropriate bind mount mapping.
*
* This extends the passed custom_mounts/n_custom_mounts with the home directories, and allocates a
* new BindUserContext for the user records */
if (strv_isempty(bind_user)) {
*ret = NULL;
return 0;
}
c = new0(BindUserContext, 1);
if (!c)
return log_oom();
STRV_FOREACH(n, bind_user) {
_cleanup_(user_record_unrefp) UserRecord *u = NULL, *cu = NULL;
_cleanup_(group_record_unrefp) GroupRecord *g = NULL, *cg = NULL;
_cleanup_free_ char *sm = NULL, *sd = NULL;
CustomMount *cm;
r = userdb_by_name(*n, USERDB_DONT_SYNTHESIZE, &u);
if (r < 0)
return log_error_errno(r, "Failed to resolve user '%s': %m", *n);
/* For now, let's refuse mapping the root/nobody users explicitly. The records we generate
* are strictly additive, nss-systemd is typically placed last in /etc/nsswitch.conf. Thus
* even if we wanted, we couldn't override the root or nobody user records. Note we also
* check for name conflicts in /etc/passwd + /etc/group later on, which would usually filter
* out root/nobody too, hence these checks might appear redundant but they actually are
* not, as we want to support environments where /etc/passwd and /etc/group are non-existent,
* and the user/group databases fully synthesized at runtime. Moreover, the name of the
* user/group name of the "nobody" account differs between distros, hence a check by numeric
* UID is safer. */
if (u->uid == 0 || streq(u->user_name, "root"))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'root' user not supported, sorry.");
if (u->uid == UID_NOBODY || STR_IN_SET(u->user_name, NOBODY_USER_NAME, "nobody"))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Mapping 'nobody' user not supported, sorry.");
/* The host UID must lie outside of [uid_shift, uid_shift + uid_range).
* NOTE(review): uid_shift + uid_range is computed in uid_t and could wrap for very large
* ranges — confirm callers never pass such values. */
if (u->uid >= uid_shift && u->uid < uid_shift + uid_range)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID of user '%s' to map is already in container UID range, refusing.", u->user_name);
r = groupdb_by_gid(u->gid, USERDB_DONT_SYNTHESIZE, &g);
if (r < 0)
return log_error_errno(r, "Failed to resolve group of user '%s': %m", u->user_name);
/* Same range check as above, this time for the GID of the user's group. */
if (g->gid >= uid_shift && g->gid < uid_shift + uid_range)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "GID of group '%s' to map is already in container GID range, refusing.", g->group_name);
/* We want to synthesize exactly one user + group from the host into the container. This only
* makes sense if the user on the host has its own private group. We can't reasonably check
* this, so we just check if the names of user and group match.
*
* One of these days we might want to support users in a shared/common group too, but it's
* not clear to me how this would have to be mapped, precisely given that the common group
* probably already exists in the container. */
if (!streq(u->user_name, g->group_name))
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
"Sorry, mapping users without private groups is currently not supported.");
/* Continue the search where the previous iteration left off; uid_range is passed as the upper
* bound for the candidate. */
r = find_free_uid(directory, uid_range, &current_uid);
if (r < 0)
return r;
r = convert_user(directory, u, g, current_uid, &cu, &cg);
if (r < 0)
return r;
/* Make room for one more entry up front, so that the assignment to c->data[] further down
* needs no further allocation. */
if (!GREEDY_REALLOC(c->data, c->n_data + 1))
return log_oom();
/* Bind mount source: the home directory as recorded on the host.
* NOTE(review): assumes u->home_directory is non-NULL for every resolved record — confirm. */
sm = strdup(u->home_directory);
if (!sm)
return log_oom();
/* Bind mount destination: the home directory stored in the container-side record. */
sd = strdup(cu->home_directory);
if (!sd)
return log_oom();
cm = reallocarray(*custom_mounts, sizeof(CustomMount), *n_custom_mounts + 1);
if (!cm)
return log_oom();
*custom_mounts = cm;
/* From here on the mount table owns the two path strings. */
(*custom_mounts)[(*n_custom_mounts)++] = (CustomMount) {
.type = CUSTOM_MOUNT_BIND,
.source = TAKE_PTR(sm),
.destination = TAKE_PTR(sd),
};
/* ... and the context owns the four records. */
c->data[c->n_data++] = (BindUserData) {
.host_user = TAKE_PTR(u),
.host_group = TAKE_PTR(g),
.payload_user = TAKE_PTR(cu),
.payload_group = TAKE_PTR(cg),
};
/* The ID just handed out is taken now, start the next search right after it. */
current_uid++;
}
*ret = TAKE_PTR(c);
return 1;
}
/* Serializes the JSON record 'v' into "<root>/run/host/userdb/<name><suffix>" and creates a symlink
 * "<root>/run/host/userdb/<uid><suffix>" pointing to "<name><suffix>", so that the record can be
 * found both by name and by numeric ID. The symlink is created first, then the file is written.
 * Both inodes are passed through userns_lchown(…, 0, 0).
 *
 * 'extra_flags' is OR-ed into the flags handed to write_string_file().
 *
 * Returns 0 on success, or the (already logged) negative error. */
static int write_and_symlink(
                const char *root,
                JsonVariant *v,
                const char *name,
                uid_t uid,
                const char *suffix,
                WriteStringFileFlags extra_flags) {

        _cleanup_free_ char *j = NULL, *f = NULL, *p = NULL, *q = NULL;
        int r;

        assert(root);
        assert(v);
        assert(name);
        assert(uid_is_valid(uid));
        assert(suffix);

        r = json_variant_format(v, JSON_FORMAT_NEWLINE, &j);
        if (r < 0)
                return log_error_errno(r, "Failed to format user record JSON: %m");

        /* File name of the record, relative to the userdb directory; also used as symlink target */
        f = strjoin(name, suffix);
        if (!f)
                return log_oom();

        p = path_join(root, "/run/host/userdb/", f);
        if (!p)
                return log_oom();

        if (asprintf(&q, "%s/run/host/userdb/" UID_FMT "%s", root, uid, suffix) < 0)
                return log_oom();

        if (symlink(f, q) < 0)
                return log_error_errno(errno, "Failed to create symlink '%s': %m", q);

        /* userns_lchown() changes the owner, not the access mode, hence say so in the message */
        r = userns_lchown(q, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to adjust ownership of '%s': %m", q);

        r = write_string_file(p, j, WRITE_STRING_FILE_CREATE|extra_flags);
        if (r < 0)
                return log_error_errno(r, "Failed to write %s: %m", p);

        r = userns_lchown(p, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to adjust ownership of '%s': %m", p);

        return 0;
}
/* Populates the container tree at 'root' with what is needed for the users collected in 'c':
 * creates /run/host, /run/host/home and /run/host/userdb, and writes for every entry the group and
 * user records into /run/host/userdb/. Each record is split in two: a clone made with 'shadow_flags'
 * (written with WRITE_STRING_FILE_MODE_0600, and only if it is not a blank JSON object) and a clone
 * made with 'strip_flags'.
 *
 * Returns 0 if there is nothing to do (c is NULL or has no entries), 1 once all records have been
 * written, or the (already logged) negative error. */
int bind_user_setup(
                const BindUserContext *c,
                const char *root) {

        static const UserRecordLoadFlags strip_flags = /* Removes privileged info */
                USER_RECORD_REQUIRE_REGULAR|
                USER_RECORD_STRIP_PRIVILEGED|
                USER_RECORD_ALLOW_PER_MACHINE|
                USER_RECORD_ALLOW_BINDING|
                USER_RECORD_ALLOW_SIGNATURE;

        static const UserRecordLoadFlags shadow_flags = /* Extracts privileged info */
                USER_RECORD_STRIP_REGULAR|
                USER_RECORD_ALLOW_PRIVILEGED|
                USER_RECORD_STRIP_PER_MACHINE|
                USER_RECORD_STRIP_BINDING|
                USER_RECORD_STRIP_SIGNATURE|
                USER_RECORD_EMPTY_OK;

        int r;

        assert(root);

        if (!c || c->n_data == 0)
                return 0;

        r = userns_mkdir(root, "/run/host", 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create /run/host: %m");

        r = userns_mkdir(root, "/run/host/home", 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create /run/host/home: %m");

        r = userns_mkdir(root, "/run/host/userdb", 0755, 0, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to create /run/host/userdb: %m");

        for (size_t i = 0; i < c->n_data; i++) {
                _cleanup_(group_record_unrefp) GroupRecord *stripped_group = NULL, *shadow_group = NULL;
                _cleanup_(user_record_unrefp) UserRecord *stripped_user = NULL, *shadow_user = NULL;
                const BindUserData *d = c->data + i;

                /* First, write shadow (i.e. privileged) data for group record */
                r = group_record_clone(d->payload_group, shadow_flags, &shadow_group);
                if (r < 0)
                        return log_error_errno(r, "Failed to extract privileged information from group record: %m");

                /* Skip the privileged file entirely if there is nothing to put into it */
                if (!json_variant_is_blank_object(shadow_group->json)) {
                        r = write_and_symlink(
                                        root,
                                        shadow_group->json,
                                        d->payload_group->group_name,
                                        d->payload_group->gid,
                                        ".group-privileged",
                                        WRITE_STRING_FILE_MODE_0600);
                        if (r < 0)
                                return r;
                }

                /* Second, write main part of group record. */
                r = group_record_clone(d->payload_group, strip_flags, &stripped_group);
                if (r < 0)
                        return log_error_errno(r, "Failed to strip privileged information from group record: %m");

                r = write_and_symlink(
                                root,
                                stripped_group->json,
                                d->payload_group->group_name,
                                d->payload_group->gid,
                                ".group",
                                0);
                if (r < 0)
                        return r;

                /* Third, write out user shadow data. i.e. extract privileged info from user record */
                r = user_record_clone(d->payload_user, shadow_flags, &shadow_user);
                if (r < 0)
                        return log_error_errno(r, "Failed to extract privileged information from user record: %m");

                if (!json_variant_is_blank_object(shadow_user->json)) {
                        r = write_and_symlink(
                                        root,
                                        shadow_user->json,
                                        d->payload_user->user_name,
                                        d->payload_user->uid,
                                        ".user-privileged",
                                        WRITE_STRING_FILE_MODE_0600);
                        if (r < 0)
                                return r;
                }

                /* Finally write out the main part of the user record */
                r = user_record_clone(d->payload_user, strip_flags, &stripped_user);
                if (r < 0)
                        return log_error_errno(r, "Failed to strip privileged information from user record: %m");

                r = write_and_symlink(
                                root,
                                stripped_user->json,
                                d->payload_user->user_name,
                                d->payload_user->uid,
                                ".user",
                                0);
                if (r < 0)
                        return r;
        }

        return 1;
}

View File

@ -0,0 +1,29 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include "user-record.h"
#include "group-record.h"
#include "nspawn-mount.h"
/* Pairs the records of one bound user as seen on the host with the records that are
 * placed into the container for that user. */
typedef struct BindUserData {
/* The host's user/group records */
UserRecord *host_user;
GroupRecord *host_group;
/* The mapped records to place into the container */
UserRecord *payload_user;
GroupRecord *payload_group;
} BindUserData;
/* Collection of all bound users: 'data' is an array holding 'n_data' BindUserData entries. */
typedef struct BindUserContext {
BindUserData *data;
size_t n_data;
} BindUserContext;
BindUserContext* bind_user_context_free(BindUserContext *c);
DEFINE_TRIVIAL_CLEANUP_FUNC(BindUserContext*, bind_user_context_free);
int bind_user_prepare(const char *directory, char **bind_user, uid_t uid_shift, uid_t uid_range, CustomMount **custom_mounts, size_t *n_custom_mounts, BindUserContext **ret);
int bind_user_setup(const BindUserContext *c, const char *root);

View File

@ -69,6 +69,7 @@ Files.Overlay, config_parse_overlay, 0, 0
Files.OverlayReadOnly, config_parse_overlay, 1, 0
Files.PrivateUsersChown, config_parse_userns_chown, 0, offsetof(Settings, userns_ownership)
Files.PrivateUsersOwnership, config_parse_userns_ownership, 0, offsetof(Settings, userns_ownership)
Files.BindUser, config_parse_bind_user, 0, offsetof(Settings, bind_user)
Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network)
Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces)
Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan)

View File

@ -132,6 +132,7 @@ Settings* settings_free(Settings *s) {
rlimit_free_all(s->rlimit);
free(s->hostname);
cpu_set_reset(&s->cpu_set);
strv_free(s->bind_user);
strv_free(s->network_interfaces);
strv_free(s->network_macvlan);
@ -907,3 +908,51 @@ int config_parse_userns_chown(
*ownership = r ? USER_NAMESPACE_OWNERSHIP_CHOWN : USER_NAMESPACE_OWNERSHIP_OFF;
return 0;
}
int config_parse_bind_user(
const char *unit,
const char *filename,
unsigned line,
const char *section,
unsigned section_line,
const char *lvalue,
int ltype,
const char *rvalue,
void *data,
void *userdata) {
char ***bind_user = data;
int r;
assert(rvalue);
assert(bind_user);
if (isempty(rvalue)) {
*bind_user = strv_free(*bind_user);
return 0;
}
for (const char* p = rvalue;;) {
_cleanup_free_ char *word = NULL;
r = extract_first_word(&p, &word, NULL, 0);
if (r == -ENOMEM)
return log_oom();
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse BindUser= list, ignoring: %s", rvalue);
return 0;
}
if (r == 0)
break;
if (!valid_user_group_name(word, 0)) {
log_syntax(unit, LOG_WARNING, filename, line, 0, "User name '%s' not valid, ignoring.", word);
return 0;
}
if (strv_consume(bind_user, TAKE_PTR(word)) < 0)
return log_oom();
}
return 0;
}

View File

@ -126,9 +126,10 @@ typedef enum SettingsMask {
SETTING_CLONE_NS_FLAGS = UINT64_C(1) << 28,
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
SETTING_CREDENTIALS = UINT64_C(1) << 30,
SETTING_RLIMIT_FIRST = UINT64_C(1) << 31, /* we define one bit per resource limit here */
SETTING_RLIMIT_LAST = UINT64_C(1) << (31 + _RLIMIT_MAX - 1),
_SETTINGS_MASK_ALL = (UINT64_C(1) << (31 + _RLIMIT_MAX)) -1,
SETTING_BIND_USER = UINT64_C(1) << 31,
SETTING_RLIMIT_FIRST = UINT64_C(1) << 32, /* we define one bit per resource limit here */
SETTING_RLIMIT_LAST = UINT64_C(1) << (32 + _RLIMIT_MAX - 1),
_SETTINGS_MASK_ALL = (UINT64_C(1) << (32 + _RLIMIT_MAX)) -1,
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
} SettingsMask;
@ -195,6 +196,7 @@ typedef struct Settings {
CustomMount *custom_mounts;
size_t n_custom_mounts;
UserNamespaceOwnership userns_ownership;
char **bind_user;
/* [Network] */
int private_network;
@ -266,6 +268,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_link_journal);
CONFIG_PARSER_PROTOTYPE(config_parse_timezone);
CONFIG_PARSER_PROTOTYPE(config_parse_userns_chown);
CONFIG_PARSER_PROTOTYPE(config_parse_userns_ownership);
CONFIG_PARSER_PROTOTYPE(config_parse_bind_user);
const char *resolv_conf_mode_to_string(ResolvConfMode a) _const_;
ResolvConfMode resolv_conf_mode_from_string(const char *s) _pure_;

View File

@ -63,6 +63,7 @@
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "netlink-util.h"
#include "nspawn-bind-user.h"
#include "nspawn-cgroup.h"
#include "nspawn-creds.h"
#include "nspawn-def.h"
@ -76,6 +77,7 @@
#include "nspawn-settings.h"
#include "nspawn-setuid.h"
#include "nspawn-stub-pid1.h"
#include "nspawn.h"
#include "nulstr-util.h"
#include "os-util.h"
#include "pager.h"
@ -225,6 +227,7 @@ static char **arg_sysctl = NULL;
static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
@ -257,6 +260,7 @@ STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
#endif
STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
static int handle_arg_console(const char *arg) {
if (streq(arg, "help")) {
@ -422,7 +426,8 @@ static int help(void) {
" Create an overlay mount from the host to \n"
" the container\n"
" --overlay-ro=PATH[:PATH...]:PATH\n"
" Similar, but creates a read-only overlay mount\n\n"
" Similar, but creates a read-only overlay mount\n"
" --bind-user=NAME Bind user from host to container\n\n"
"%3$sInput/Output:%4$s\n"
" --console=MODE Select how stdin/stdout/stderr and /dev/console are\n"
" set up for the container.\n"
@ -706,6 +711,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_NO_PAGER,
ARG_SET_CREDENTIAL,
ARG_LOAD_CREDENTIAL,
ARG_BIND_USER,
};
static const struct option options[] = {
@ -777,6 +783,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "no-pager", no_argument, NULL, ARG_NO_PAGER },
{ "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
{ "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
{ "bind-user", required_argument, NULL, ARG_BIND_USER },
{}
};
@ -1655,6 +1662,16 @@ static int parse_argv(int argc, char *argv[]) {
break;
}
case ARG_BIND_USER:
if (!valid_user_group_name(optarg, 0))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
if (strv_extend(&arg_bind_user, optarg) < 0)
return log_oom();
arg_settings_mask |= SETTING_BIND_USER;
break;
case '?':
return -EINVAL;
@ -1811,6 +1828,12 @@ static int verify_arguments(void) {
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
}
if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
/* Drop duplicate --bind-user= entries */
strv_uniq(arg_bind_user);
r = custom_mount_check_all();
if (r < 0)
return r;
@ -1818,7 +1841,7 @@ static int verify_arguments(void) {
return 0;
}
static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
int userns_lchown(const char *p, uid_t uid, gid_t gid) {
assert(p);
if (arg_userns_mode == USER_NAMESPACE_NO)
@ -1847,7 +1870,7 @@ static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
return 0;
}
static int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
const char *q;
int r;
@ -3568,6 +3591,7 @@ static int outer_child(
FDSet *fds,
int netns_fd) {
_cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
_cleanup_strv_free_ char **os_release_pairs = NULL;
_cleanup_close_ int fd = -1;
bool idmap = false;
@ -3715,6 +3739,36 @@ static int outer_child(
if (r < 0)
return r;
r = bind_user_prepare(
directory,
arg_bind_user,
arg_uid_shift,
arg_uid_range,
&arg_custom_mounts, &arg_n_custom_mounts,
&bind_user_context);
if (r < 0)
return r;
if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
/* Send the user maps we determined to the parent, so that it installs it in our user namespace UID map table */
for (size_t i = 0; i < bind_user_context->n_data; i++) {
uid_t map[] = {
bind_user_context->data[i].payload_user->uid,
bind_user_context->data[i].host_user->uid,
(uid_t) bind_user_context->data[i].payload_group->gid,
(uid_t) bind_user_context->data[i].host_group->gid,
};
l = send(uid_shift_socket, map, sizeof(map), MSG_NOSIGNAL);
if (l < 0)
return log_error_errno(errno, "Failed to send user UID map: %m");
if (l != sizeof(map))
return log_error_errno(SYNTHETIC_ERRNO(EIO),
"Short write while sending user UID map.");
}
}
r = mount_custom(
directory,
arg_custom_mounts,
@ -3831,6 +3885,10 @@ static int outer_child(
if (r < 0)
return r;
r = bind_user_setup(bind_user_context, directory);
if (r < 0)
return r;
r = mount_custom(
directory,
arg_custom_mounts,
@ -4011,21 +4069,96 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
}
}
static int setup_uid_map(pid_t pid) {
char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
/* Appends a single "<container-id> <host-id> <count>\n" line to the map string *p, and passes
 * through whatever strextendf() returns. */
static int add_one_uid_map(
                char **p,
                uid_t container_uid,
                uid_t host_uid,
                uid_t range) {

        int r;

        r = strextendf(p,
                       UID_FMT " " UID_FMT " " UID_FMT "\n",
                       container_uid, host_uid, range);
        return r;
}
/* Builds the text to write into a process' uid_map (offset == 0) or gid_map (offset == 2).
 *
 * The result maps the container range 0…arg_uid_range-1 onto the host range starting at
 * arg_uid_shift, except for the payload IDs of bound users, each of which is mapped 1:1 onto the
 * matching host ID.
 *
 * Parameters:
 *   bind_user_uid   - array of n_bind_user_uid quadruplets (payload UID, host UID, payload GID,
 *                     host GID); may be NULL if n_bind_user_uid is 0. The payload IDs must be in
 *                     ascending order and below arg_uid_range (both are asserted).
 *   n_bind_user_uid - number of quadruplets (not number of uid_t values).
 *   offset          - 0 to pick the UID pair of each quadruplet, 2 to pick the GID pair.
 *   ret             - receives the newly allocated map string.
 *
 * Returns 0 on success, or the negative error returned by add_one_uid_map(). */
static int make_uid_map_string(
                const uid_t bind_user_uid[],
                size_t n_bind_user_uid,
                size_t offset,
                char **ret) {

        _cleanup_free_ char *s = NULL;
        uid_t previous_uid = 0;
        int r;

        assert(n_bind_user_uid == 0 || bind_user_uid);
        assert(offset == 0 || offset == 2); /* used to switch between UID and GID map */
        assert(ret);

        /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
         * quadruplet, consisting of host and container UID + GID. */

        for (size_t i = 0; i < n_bind_user_uid; i++) {
                /* Each entry occupies four slots, hence step through the array with a stride of 4 */
                uid_t payload_uid = bind_user_uid[i*4+offset],
                        host_uid = bind_user_uid[i*4+offset+1];

                assert(previous_uid <= payload_uid);
                assert(payload_uid < arg_uid_range);

                /* Add a range to close the gap to previous entry */
                if (payload_uid > previous_uid) {
                        r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
                        if (r < 0)
                                return r;
                }

                /* Map this specific user */
                r = add_one_uid_map(&s, payload_uid, host_uid, 1);
                if (r < 0)
                        return r;

                previous_uid = payload_uid + 1;
        }

        /* And add a range to close the gap to finish the range */
        if (arg_uid_range > previous_uid) {
                r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
                if (r < 0)
                        return r;
        }

        assert(s);

        *ret = TAKE_PTR(s);
        return 0;
}
static int setup_uid_map(
pid_t pid,
const uid_t bind_user_uid[],
size_t n_bind_user_uid) {
char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
_cleanup_free_ char *s = NULL;
int r;
assert(pid > 1);
/* Build the UID map string */
if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
return log_oom();
xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, arg_uid_shift, arg_uid_range);
r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return log_error_errno(r, "Failed to write UID map: %m");
/* We always assign the same UID and GID ranges */
/* And now build the GID map string */
s = mfree(s);
if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
return log_oom();
xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
r = write_string_file(uid_map, line, WRITE_STRING_FILE_DISABLE_BUFFER);
r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return log_error_errno(r, "Failed to write GID map: %m");
@ -4301,6 +4434,9 @@ static int merge_settings(Settings *settings, const char *path) {
}
}
if ((arg_settings_mask & SETTING_BIND_USER) == 0)
strv_free_and_replace(arg_bind_user, settings->bind_user);
if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
arg_notify_ready = settings->notify_ready;
@ -4567,6 +4703,8 @@ static int run_container(
_cleanup_(pty_forward_freep) PTYForward *forward = NULL;
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
_cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
_cleanup_free_ uid_t *bind_user_uid = NULL;
size_t n_bind_user_uid = 0;
ContainerStatus container_status = 0;
int ifi = 0, r;
ssize_t l;
@ -4722,6 +4860,26 @@ static int run_container(
if (l != sizeof arg_uid_shift)
return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
}
n_bind_user_uid = strv_length(arg_bind_user);
if (n_bind_user_uid > 0) {
/* Right after the UID shift, we'll receive the list of UID mappings for the
* --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
bind_user_uid = new(uid_t, n_bind_user_uid*4);
if (!bind_user_uid)
return log_oom();
for (size_t i = 0; i < n_bind_user_uid; i++) {
l = recv(uid_shift_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
if (l < 0)
return log_error_errno(errno, "Failed to read user UID map pair: %m");
if (l != sizeof(uid_t)*4)
return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
SYNTHETIC_ERRNO(EIO),
"Short read while reading bind user UID pairs.");
}
}
}
if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
@ -4767,7 +4925,7 @@ static int run_container(
if (!barrier_place_and_sync(&barrier)) /* #1 */
return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
r = setup_uid_map(*pid);
r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
if (r < 0)
return r;

7
src/nspawn/nspawn.h Normal file
View File

@ -0,0 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include <sys/types.h>
int userns_lchown(const char *p, uid_t uid, gid_t gid);
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid);

View File

@ -1552,7 +1552,7 @@ int user_group_record_mangle(
if (FLAGS_SET(load_flags, USER_RECORD_REQUIRE_REGULAR) && !FLAGS_SET(m, USER_RECORD_REGULAR))
return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record lacks basic identity fields, which are required.");
if (m == 0)
if (!FLAGS_SET(load_flags, USER_RECORD_EMPTY_OK) && m == 0)
return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record is empty.");
if (w)

View File

@ -169,6 +169,9 @@ typedef enum UserRecordLoadFlags {
/* Whether to ignore errors and load what we can */
USER_RECORD_PERMISSIVE = 1U << 29,
/* Whether an empty record is OK */
USER_RECORD_EMPTY_OK = 1U << 30,
} UserRecordLoadFlags;
static inline UserRecordLoadFlags USER_RECORD_REQUIRE(UserRecordMask m) {

View File

@ -3,6 +3,7 @@
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>
#include "alloc-util.h"
@ -153,6 +154,29 @@ static void test_text(void) {
}
}
/* Checks get_ctty() against stdin: if stdin is a character device whose device number equals the
 * one reported for the controlling TTY, the name obtained for stdin must be path-equal to the one
 * get_ctty() returned. In every other situation the test is cut short with a notice. */
static void test_get_ctty(void) {
        _cleanup_free_ char *ctty = NULL, *stdin_name = NULL;
        struct stat st;
        dev_t ctty_devnr;
        int k;

        k = get_ctty(0, &ctty_devnr, &ctty);
        if (k < 0) {
                log_notice_errno(k, "Apparently called without a controlling TTY, cutting get_ctty() test short: %m");
                return;
        }

        /* In almost all cases STDIN will match our controlling TTY. Let's verify that and then compare paths */
        assert_se(fstat(STDIN_FILENO, &st) >= 0);
        if (!S_ISCHR(st.st_mode) || st.st_rdev != ctty_devnr) {
                log_notice("Not invoked with stdin == ctty, cutting get_ctty() test short");
                return;
        }

        assert_se(getttyname_malloc(STDIN_FILENO, &stdin_name) >= 0);
        assert_se(path_equal(stdin_name, ctty));
}
int main(int argc, char *argv[]) {
test_setup_logging(LOG_INFO);
@ -161,6 +185,7 @@ int main(int argc, char *argv[]) {
test_getttyname_malloc();
test_colors();
test_text();
test_get_ctty();
return 0;
}