mirror of
https://github.com/systemd/systemd
synced 2026-04-23 07:24:51 +02:00
Compare commits
11 Commits
19c510bec7
...
ad337e55a3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ad337e55a3 | ||
|
|
3c45ad24c2 | ||
|
|
3e6f89e013 | ||
|
|
0456118807 | ||
|
|
50ae2966d2 | ||
|
|
264caae299 | ||
|
|
aff7ae0d67 | ||
|
|
1eb874b978 | ||
|
|
d1d0b895dc | ||
|
|
a1f4fd3876 | ||
|
|
4267084642 |
2
NEWS
2
NEWS
@ -38,7 +38,7 @@ CHANGES WITH 251:
|
||||
image" environments, where the machine ID shall be initialized on
|
||||
first boot (as opposed to at installation time before first boot) the
|
||||
machine ID is not be available at build time to name the entry
|
||||
after. In this case the the --entry-token= switch to bootctl (or the
|
||||
after. In this case the --entry-token= switch to bootctl (or the
|
||||
/etc/kernel/entry-token file) may be used to override the "token" to
|
||||
identify the entry by, and use another ID, for example the IMAGE_ID=
|
||||
or ID= fields from /etc/os-release. This will make the OS images
|
||||
|
||||
@ -67,6 +67,19 @@ int take_etc_passwd_lock(const char *root);
|
||||
#define UID_NOBODY ((uid_t) 65534U)
|
||||
#define GID_NOBODY ((gid_t) 65534U)
|
||||
|
||||
/* If REMOUNT_IDMAP_HOST_ROOT is set for remount_idmap() we'll include a mapping here that maps the host root
|
||||
* user accessing the idmapped mount to the this user ID on the backing fs. This is the last valid UID in the
|
||||
* *signed* 32bit range. You might wonder why precisely use this specific UID for this purpose? Well, we
|
||||
* definitely cannot use the first 0…65536 UIDs for that, since in most cases that's precisely the file range
|
||||
* we intend to map to some high UID range, and since UID mappings have to be bijective we thus cannot use
|
||||
* them at all. Furthermore the UID range beyond INT32_MAX (i.e. the range above the signed 32bit range) is
|
||||
* icky, since many APIs cannot use it (example: setfsuid() returns the old UID as signed integer). Following
|
||||
* our usual logic of assigning a 16bit UID range to each container, so that the upper 16bit of a 32bit UID
|
||||
* value indicate kind of a "container ID" and the lower 16bit map directly to the intended user you can read
|
||||
* this specific UID as the "nobody" user of the container with ID 0x7FFF, which is kinda nice. */
|
||||
#define UID_MAPPED_ROOT ((uid_t) (INT32_MAX-1))
|
||||
#define GID_MAPPED_ROOT ((gid_t) (INT32_MAX-1))
|
||||
|
||||
#define ETC_PASSWD_LOCK_PATH "/etc/.pwd.lock"
|
||||
|
||||
/* The following macros add 1 when converting things, since UID 0 is a valid UID, while the pointer
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
/* This TPM PCR is where we extend the kernel command line and any passed credentials here. */
|
||||
#define TPM_PCR_INDEX_KERNEL_PARAMETERS 12U
|
||||
|
||||
/* We used to write the the kernel command line/credentials into PCR 8, in systemd <= 250. Let's provide for
|
||||
/* We used to write the kernel command line/credentials into PCR 8, in systemd <= 250. Let's provide for
|
||||
* some compatibility. (Remove in 2023!) */
|
||||
#if EFI_TPM_PCR_COMPAT
|
||||
#define TPM_PCR_INDEX_KERNEL_PARAMETERS_COMPAT 8U
|
||||
|
||||
@ -1186,7 +1186,7 @@ static int attach_luks_or_plain_or_bitlk_by_pkcs11(
|
||||
/* Before using this key as passphrase we base64 encode it. Why? For compatibility
|
||||
* with homed's PKCS#11 hookup: there we want to use the key we acquired through
|
||||
* PKCS#11 for other authentication/decryption mechanisms too, and some of them do
|
||||
* not not take arbitrary binary blobs, but require NUL-terminated strings — most
|
||||
* not take arbitrary binary blobs, but require NUL-terminated strings — most
|
||||
* importantly UNIX password hashes. Hence, for compatibility we want to use a string
|
||||
* without embedded NUL here too, and that's easiest to generate from a binary blob
|
||||
* via base64 encoding. */
|
||||
|
||||
@ -1452,7 +1452,7 @@ static int home_deactivate_internal(Home *h, bool force, sd_bus_error *error) {
|
||||
}
|
||||
|
||||
/* Let's start a timer to retry deactivation in 15. We'll stop the timer once we manage to deactivate
|
||||
* the home directory again, or we we start any other operation. */
|
||||
* the home directory again, or we start any other operation. */
|
||||
home_start_retry_deactivate(h);
|
||||
|
||||
return r;
|
||||
|
||||
@ -216,6 +216,12 @@ static int make_userns(uid_t stored_uid, uid_t exposed_uid) {
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
|
||||
/* Map nspawn's mapped root UID as identity mapping so that people can run nspawn uidmap mounted
|
||||
* containers off $HOME, if they want. */
|
||||
r = strextendf(&text, UID_FMT " " UID_FMT " " UID_FMT "\n", UID_MAPPED_ROOT, UID_MAPPED_ROOT, 1);
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
|
||||
/* Leave everything else unmapped, starting from UID_NOBODY itself. Specifically, this means the
|
||||
* whole space outside of 16bit remains unmapped */
|
||||
|
||||
|
||||
@ -154,7 +154,7 @@ int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested)
|
||||
* the unified hierarchy and the container does the same, and we did not create a scope unit for the container
|
||||
* move us and the container into two separate subcgroups.
|
||||
*
|
||||
* Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
|
||||
* Moreover, container payloads such as systemd try to manage the cgroup they run in full (i.e. including
|
||||
* its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
|
||||
* delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
|
||||
* host systemd directly to the payload, the host and payload systemd might fight for the cgroup
|
||||
|
||||
@ -780,7 +780,7 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
|
||||
}
|
||||
|
||||
if (idmapped) {
|
||||
r = remount_idmap(where, uid_shift, uid_range);
|
||||
r = remount_idmap(where, uid_shift, uid_range, REMOUNT_IDMAP_HOST_ROOT);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to map ids for bind mount %s: %m", where);
|
||||
}
|
||||
|
||||
@ -2006,7 +2006,7 @@ static int oci_masked_paths(const char *name, JsonVariant *v, JsonDispatchFlags
|
||||
|
||||
if (!path_is_absolute(p))
|
||||
return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
|
||||
"Path is not not absolute, refusing: %s", p);
|
||||
"Path is not absolute, refusing: %s", p);
|
||||
|
||||
if (oci_exclude_mount(p))
|
||||
continue;
|
||||
@ -2048,7 +2048,7 @@ static int oci_readonly_paths(const char *name, JsonVariant *v, JsonDispatchFlag
|
||||
|
||||
if (!path_is_absolute(p))
|
||||
return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL),
|
||||
"Path is not not absolute, refusing: %s", p);
|
||||
"Path is not absolute, refusing: %s", p);
|
||||
|
||||
if (oci_exclude_mount(p))
|
||||
continue;
|
||||
|
||||
@ -3563,7 +3563,7 @@ static int inner_child(
|
||||
|
||||
static int setup_notify_child(void) {
|
||||
_cleanup_close_ int fd = -1;
|
||||
union sockaddr_union sa = {
|
||||
static const union sockaddr_union sa = {
|
||||
.un.sun_family = AF_UNIX,
|
||||
.un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
|
||||
};
|
||||
@ -3616,10 +3616,11 @@ static int outer_child(
|
||||
ssize_t l;
|
||||
int r;
|
||||
|
||||
/* This is the "outer" child process, i.e the one forked off by the container manager itself. It already has
|
||||
* its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in the host's CLONE_NEWPID,
|
||||
* CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET namespaces. After it completed a number of
|
||||
* initializations a second child (the "inner" one) is forked off it, and it exits. */
|
||||
/* This is the "outer" child process, i.e the one forked off by the container manager itself. It
|
||||
* already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
|
||||
* the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
|
||||
* namespaces. After it completed a number of initializations a second child (the "inner" one) is
|
||||
* forked off it, and it exits. */
|
||||
|
||||
assert(barrier);
|
||||
assert(directory);
|
||||
@ -3649,10 +3650,10 @@ static int outer_child(
|
||||
return r;
|
||||
|
||||
if (dissected_image) {
|
||||
/* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We
|
||||
* can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the
|
||||
* uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This
|
||||
* makes sure ESP partitions and userns are compatible. */
|
||||
/* If we are operating on a disk image, then mount its root directory now, but leave out the
|
||||
* rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
|
||||
* but then with the uid shift known. That way we can mount VFAT file systems shifted to the
|
||||
* right place right away. This makes sure ESP partitions and userns are compatible. */
|
||||
|
||||
r = dissected_image_mount_and_warn(
|
||||
dissected_image,
|
||||
@ -3682,9 +3683,9 @@ static int outer_child(
|
||||
"Short write while sending UID shift.");
|
||||
|
||||
if (arg_userns_mode == USER_NAMESPACE_PICK) {
|
||||
/* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
|
||||
* we just read from the image is available. If yes, it will send the UID shift back to us, if
|
||||
* not it will pick a different one, and send it back to us. */
|
||||
/* When we are supposed to pick the UID shift, the parent will check now whether the
|
||||
* UID shift we just read from the image is available. If yes, it will send the UID
|
||||
* shift back to us, if not it will pick a different one, and send it back to us. */
|
||||
|
||||
l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
|
||||
if (l < 0)
|
||||
@ -3740,7 +3741,8 @@ static int outer_child(
|
||||
return r;
|
||||
|
||||
if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
|
||||
/* Send the user maps we determined to the parent, so that it installs it in our user namespace UID map table */
|
||||
/* Send the user maps we determined to the parent, so that it installs it in our user
|
||||
* namespace UID map table */
|
||||
|
||||
for (size_t i = 0; i < bind_user_context->n_data; i++) {
|
||||
uid_t map[] = {
|
||||
@ -3779,7 +3781,7 @@ static int outer_child(
|
||||
IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
|
||||
arg_uid_shift != 0) {
|
||||
|
||||
r = remount_idmap(directory, arg_uid_shift, arg_uid_range);
|
||||
r = remount_idmap(directory, arg_uid_shift, arg_uid_range, REMOUNT_IDMAP_HOST_ROOT);
|
||||
if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
|
||||
/* This might fail because the kernel or file system doesn't support idmapping. We
|
||||
* can't really distinguish this nicely, nor do we have any guarantees about the
|
||||
@ -3833,15 +3835,13 @@ static int outer_child(
|
||||
unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
|
||||
}
|
||||
|
||||
/* Mark everything as shared so our mounts get propagated down. This is
|
||||
* required to make new bind mounts available in systemd services
|
||||
* inside the container that create a new mount namespace.
|
||||
* See https://github.com/systemd/systemd/issues/3860
|
||||
* Further submounts (such as /dev) done after this will inherit the
|
||||
* shared propagation mode.
|
||||
/* Mark everything as shared so our mounts get propagated down. This is required to make new bind
|
||||
* mounts available in systemd services inside the container that create a new mount namespace. See
|
||||
* https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
|
||||
* will inherit the shared propagation mode.
|
||||
*
|
||||
* IMPORTANT: Do not overmount the root directory anymore from now on to
|
||||
* enable moving the root directory mount to root later on.
|
||||
* IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
|
||||
* directory mount to root later on.
|
||||
* https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
|
||||
*/
|
||||
r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
|
||||
@ -5612,31 +5612,36 @@ static int run(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
if (arg_start_mode == START_BOOT) {
|
||||
_cleanup_free_ char *b = NULL;
|
||||
const char *p;
|
||||
|
||||
if (arg_pivot_root_new)
|
||||
p = prefix_roota(arg_directory, arg_pivot_root_new);
|
||||
else
|
||||
if (arg_pivot_root_new) {
|
||||
b = path_join(arg_directory, arg_pivot_root_new);
|
||||
if (!b)
|
||||
return log_oom();
|
||||
|
||||
p = b;
|
||||
} else
|
||||
p = arg_directory;
|
||||
|
||||
if (path_is_os_tree(p) <= 0) {
|
||||
log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
|
||||
r = -EINVAL;
|
||||
r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
|
||||
"Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
|
||||
goto finish;
|
||||
}
|
||||
} else {
|
||||
const char *p, *q;
|
||||
_cleanup_free_ char *p = NULL;
|
||||
|
||||
if (arg_pivot_root_new)
|
||||
p = prefix_roota(arg_directory, arg_pivot_root_new);
|
||||
p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
|
||||
else
|
||||
p = arg_directory;
|
||||
p = path_join(arg_directory, "/usr/");
|
||||
if (!p)
|
||||
return log_oom();
|
||||
|
||||
q = strjoina(p, "/usr/");
|
||||
|
||||
if (laccess(q, F_OK) < 0) {
|
||||
log_error("Directory %s doesn't look like it has an OS tree. Refusing.", p);
|
||||
r = -EINVAL;
|
||||
if (laccess(p, F_OK) < 0) {
|
||||
r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
|
||||
"Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
|
||||
goto finish;
|
||||
}
|
||||
}
|
||||
|
||||
@ -176,7 +176,7 @@ int base_filesystem_create(const char *root, uid_t uid, gid_t gid) {
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (uid != UID_INVALID || gid != UID_INVALID)
|
||||
if (uid_is_valid(uid) || gid_is_valid(gid))
|
||||
if (fchownat(fd, table[i].dir, uid, gid, AT_SYMLINK_NOFOLLOW) < 0)
|
||||
return log_error_errno(errno, "Failed to chown directory at %s/%s: %m", root, table[i].dir);
|
||||
}
|
||||
|
||||
@ -1807,7 +1807,7 @@ static int mount_partition(
|
||||
(void) fs_grow(node, p);
|
||||
|
||||
if (remap_uid_gid) {
|
||||
r = remount_idmap(p, uid_shift, uid_range);
|
||||
r = remount_idmap(p, uid_shift, uid_range, REMOUNT_IDMAP_HOST_ROOT);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
@ -1049,14 +1049,31 @@ int make_mount_point(const char *path) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int make_userns(uid_t uid_shift, uid_t uid_range) {
|
||||
char line[DECIMAL_STR_MAX(uid_t)*3+3+1];
|
||||
static int make_userns(uid_t uid_shift, uid_t uid_range, RemountIdmapFlags flags) {
|
||||
_cleanup_close_ int userns_fd = -1;
|
||||
_cleanup_free_ char *line = NULL;
|
||||
|
||||
/* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
|
||||
* process whose only purpose is to give us a new user namespace. It's killed when we got it. */
|
||||
|
||||
xsprintf(line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, uid_shift, uid_range);
|
||||
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, uid_shift, uid_range) < 0)
|
||||
return log_oom_debug();
|
||||
|
||||
/* If requested we'll include an entry in the mapping so that the host root user can make changes to
|
||||
* the uidmapped mount like it normally would. Specifically, we'll map the user with UID_HOST_ROOT on
|
||||
* the backing fs to UID 0. This is useful, since nspawn code wants to create various missing inodes
|
||||
* in the OS tree before booting into it, and this becomes very easy and straightforward to do if it
|
||||
* can just do it under its own regular UID. Note that in that case the container's runtime uidmap
|
||||
* (i.e. the one the container payload processes run in) will leave this UID unmapped, i.e. if we
|
||||
* accidentally leave files owned by host root in the already uidmapped tree around they'll show up
|
||||
* as owned by 'nobody', which is safe. (Of course, we shouldn't leave such inodes around, but always
|
||||
* chown() them to the container's own UID range, but it's good to have a safety net, in case we
|
||||
* forget it.) */
|
||||
if (flags & REMOUNT_IDMAP_HOST_ROOT)
|
||||
if (strextendf(&line,
|
||||
UID_FMT " " UID_FMT " " UID_FMT "\n",
|
||||
UID_MAPPED_ROOT, 0, 1) < 0)
|
||||
return log_oom_debug();
|
||||
|
||||
/* We always assign the same UID and GID ranges */
|
||||
userns_fd = userns_acquire(line, line);
|
||||
@ -1069,7 +1086,8 @@ static int make_userns(uid_t uid_shift, uid_t uid_range) {
|
||||
int remount_idmap(
|
||||
const char *p,
|
||||
uid_t uid_shift,
|
||||
uid_t uid_range) {
|
||||
uid_t uid_range,
|
||||
RemountIdmapFlags flags) {
|
||||
|
||||
_cleanup_close_ int mount_fd = -1, userns_fd = -1;
|
||||
int r;
|
||||
@ -1085,7 +1103,7 @@ int remount_idmap(
|
||||
return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", p);
|
||||
|
||||
/* Create a user namespace mapping */
|
||||
userns_fd = make_userns(uid_shift, uid_range);
|
||||
userns_fd = make_userns(uid_shift, uid_range, flags);
|
||||
if (userns_fd < 0)
|
||||
return userns_fd;
|
||||
|
||||
|
||||
@ -112,7 +112,18 @@ int mount_image_in_namespace(pid_t target, const char *propagate_path, const cha
|
||||
|
||||
int make_mount_point(const char *path);
|
||||
|
||||
int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range);
|
||||
typedef enum RemountIdmapFlags {
|
||||
/* Include a mapping from UID_MAPPED_ROOT (i.e. UID 2^31-2) on the backing fs to UID 0 on the
|
||||
* uidmapped fs. This is useful to ensure that the host root user can safely add inodes to the
|
||||
* uidmapped fs (which otherwise wouldn't work as the host root user is not defined on the uidmapped
|
||||
* mount and any attempts to create inodes will then be refused with EOVERFLOW). The idea is that
|
||||
* these inodes are quickly re-chown()ed to more suitable UIDs/GIDs. Any code that intends to be able
|
||||
* to add inodes to file systems mapped this way should set this flag, but given it comes with
|
||||
* certain security implications defaults to off, and requires explicit opt-in. */
|
||||
REMOUNT_IDMAP_HOST_ROOT = 1 << 0,
|
||||
} RemountIdmapFlags;
|
||||
|
||||
int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range, RemountIdmapFlags flags);
|
||||
|
||||
/* Creates a mount point (not parents) based on the source path or stat - ie, a file or a directory */
|
||||
int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode);
|
||||
|
||||
@ -29,7 +29,7 @@ static const NamingScheme naming_schemes[] = {
|
||||
};
|
||||
|
||||
const NamingScheme* naming_scheme_from_name(const char *name) {
|
||||
/* "latest" may either be defined explicitly by the extra map, in which case we we will find it in
|
||||
/* "latest" may either be defined explicitly by the extra map, in which case we will find it in
|
||||
* the table like any other name. After iterating through the table, we check for "latest" again,
|
||||
* which means that if not mapped explicitly, it maps to the last defined entry, whatever that is. */
|
||||
|
||||
|
||||
@ -12,7 +12,6 @@
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "errno-util.h"
|
||||
#include "event-util.h"
|
||||
#include "fd-util.h"
|
||||
#include "format-util.h"
|
||||
#include "io-util.h"
|
||||
@ -106,13 +105,6 @@ static void udev_ctrl_disconnect(UdevCtrl *uctrl) {
|
||||
uctrl->sock_connect = safe_close(uctrl->sock_connect);
|
||||
}
|
||||
|
||||
int udev_ctrl_is_connected(UdevCtrl *uctrl) {
|
||||
if (!uctrl)
|
||||
return false;
|
||||
|
||||
return event_source_is_enabled(uctrl->event_source_connect);
|
||||
}
|
||||
|
||||
static UdevCtrl *udev_ctrl_free(UdevCtrl *uctrl) {
|
||||
assert(uctrl);
|
||||
|
||||
@ -314,8 +306,6 @@ int udev_ctrl_send(UdevCtrl *uctrl, UdevCtrlMessageType type, const void *data)
|
||||
strscpy(ctrl_msg_wire.value.buf, sizeof(ctrl_msg_wire.value.buf), data);
|
||||
} else if (IN_SET(type, UDEV_CTRL_SET_LOG_LEVEL, UDEV_CTRL_SET_CHILDREN_MAX))
|
||||
ctrl_msg_wire.value.intval = PTR_TO_INT(data);
|
||||
else if (type == UDEV_CTRL_SENDER_PID)
|
||||
ctrl_msg_wire.value.pid = PTR_TO_PID(data);
|
||||
|
||||
if (!uctrl->connected) {
|
||||
if (connect(uctrl->sock, &uctrl->saddr.sa, uctrl->addrlen) < 0)
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
#include "sd-event.h"
|
||||
|
||||
#include "macro.h"
|
||||
#include "process-util.h"
|
||||
#include "time-util.h"
|
||||
|
||||
typedef struct UdevCtrl UdevCtrl;
|
||||
@ -19,12 +18,10 @@ typedef enum UdevCtrlMessageType {
|
||||
UDEV_CTRL_SET_CHILDREN_MAX,
|
||||
UDEV_CTRL_PING,
|
||||
UDEV_CTRL_EXIT,
|
||||
UDEV_CTRL_SENDER_PID,
|
||||
} UdevCtrlMessageType;
|
||||
|
||||
typedef union UdevCtrlMessageValue {
|
||||
int intval;
|
||||
pid_t pid;
|
||||
char buf[256];
|
||||
} UdevCtrlMessageValue;
|
||||
|
||||
@ -42,7 +39,6 @@ UdevCtrl *udev_ctrl_unref(UdevCtrl *uctrl);
|
||||
int udev_ctrl_attach_event(UdevCtrl *uctrl, sd_event *event);
|
||||
int udev_ctrl_start(UdevCtrl *uctrl, udev_ctrl_handler_t callback, void *userdata);
|
||||
sd_event_source *udev_ctrl_get_event_source(UdevCtrl *uctrl);
|
||||
int udev_ctrl_is_connected(UdevCtrl *uctrl);
|
||||
|
||||
int udev_ctrl_wait(UdevCtrl *uctrl, usec_t timeout);
|
||||
|
||||
@ -79,8 +75,4 @@ static inline int udev_ctrl_send_exit(UdevCtrl *uctrl) {
|
||||
return udev_ctrl_send(uctrl, UDEV_CTRL_EXIT, NULL);
|
||||
}
|
||||
|
||||
static inline int udev_ctrl_send_pid(UdevCtrl *uctrl) {
|
||||
return udev_ctrl_send(uctrl, UDEV_CTRL_SENDER_PID, PID_TO_PTR(getpid_cached()));
|
||||
}
|
||||
|
||||
DEFINE_TRIVIAL_CLEANUP_FUNC(UdevCtrl*, udev_ctrl_unref);
|
||||
|
||||
@ -87,11 +87,6 @@ int control_main(int argc, char *argv[], void *userdata) {
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to initialize udev control: %m");
|
||||
|
||||
/* See comments in on_post() in udevd.c. */
|
||||
r = udev_ctrl_send_pid(uctrl);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to send pid of this process: %m");
|
||||
|
||||
while ((c = getopt_long(argc, argv, "el:sSRp:m:t:Vh", options, NULL)) >= 0)
|
||||
switch (c) {
|
||||
case 'e':
|
||||
|
||||
116
src/udev/udevd.c
116
src/udev/udevd.c
@ -28,6 +28,7 @@
|
||||
#include "sd-event.h"
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "cgroup-setup.h"
|
||||
#include "cgroup-util.h"
|
||||
#include "cpu-set-util.h"
|
||||
#include "dev-setup.h"
|
||||
@ -48,6 +49,7 @@
|
||||
#include "mkdir.h"
|
||||
#include "netlink-util.h"
|
||||
#include "parse-util.h"
|
||||
#include "path-util.h"
|
||||
#include "pretty-print.h"
|
||||
#include "proc-cmdline.h"
|
||||
#include "process-util.h"
|
||||
@ -85,7 +87,7 @@ typedef struct Manager {
|
||||
sd_event *event;
|
||||
Hashmap *workers;
|
||||
LIST_HEAD(Event, events);
|
||||
const char *cgroup;
|
||||
char *cgroup;
|
||||
pid_t pid; /* the process that originally allocated the manager object */
|
||||
int log_level;
|
||||
|
||||
@ -108,8 +110,6 @@ typedef struct Manager {
|
||||
|
||||
bool stop_exec_queue;
|
||||
bool exit;
|
||||
|
||||
pid_t pid_udevadm; /* pid of 'udevadm control' */
|
||||
} Manager;
|
||||
|
||||
typedef enum EventState {
|
||||
@ -240,6 +240,7 @@ static Manager* manager_free(Manager *manager) {
|
||||
safe_close(manager->inotify_fd);
|
||||
safe_close_pair(manager->worker_watch);
|
||||
|
||||
free(manager->cgroup);
|
||||
return mfree(manager);
|
||||
}
|
||||
|
||||
@ -496,7 +497,7 @@ static int worker_process_device(Manager *manager, sd_device *dev) {
|
||||
* supposed to be enabled via an option set via udev rules (OPTIONS+="watch"). If we skip the
|
||||
* udev rules here however (as we just said we do), we would thus never see that specific
|
||||
* udev rule, and thus never turn on inotify watching. But in order to catch up eventually
|
||||
* and run them we we need the inotify watching: hence a classic chicken and egg problem.
|
||||
* and run them we need the inotify watching: hence a classic chicken and egg problem.
|
||||
*
|
||||
* Our way out here: if we see the block device locked, unconditionally watch the device via
|
||||
* inotify, regardless of any explicit request via OPTIONS+="watch". Thus, a device that is
|
||||
@ -1217,10 +1218,6 @@ static int on_ctrl_msg(UdevCtrl *uctrl, UdevCtrlMessageType type, const UdevCtrl
|
||||
log_debug("Received udev control message (EXIT)");
|
||||
manager_exit(manager);
|
||||
break;
|
||||
case UDEV_CTRL_SENDER_PID:
|
||||
log_debug("Received udev control message (SENDER_PID)");
|
||||
manager->pid_udevadm = value->pid;
|
||||
break;
|
||||
default:
|
||||
log_debug("Received unknown udev control message, ignoring");
|
||||
}
|
||||
@ -1496,35 +1493,9 @@ static int on_post(sd_event_source *s, void *userdata) {
|
||||
if (manager->exit)
|
||||
return sd_event_exit(manager->event, 0);
|
||||
|
||||
if (!manager->cgroup)
|
||||
return 1;
|
||||
|
||||
/* Cleanup possible left-over processes in our cgroup. But do not kill udevadm called by
|
||||
* ExecReload= in systemd-udevd.service. See #16867. To protect it, the following two ways are
|
||||
* introduced:
|
||||
*
|
||||
* 1. After the connection is accepted, but the PID of udevadm is not received, do not call
|
||||
* cg_kill(). So, in this period, unwanted process or threads may exist in our cgroup.
|
||||
* But, it is expected that the PID of the udevadm will be received soon. So, this time
|
||||
* period should be short enough.
|
||||
* 2. After the PID of udevadm is received, check the process is active or not, and if it is
|
||||
* still active, set the PID to the deny list for cg_kill(). Why udev_ctrl_is_connected() is
|
||||
* not enough? Because the udevadm may be still active after the control socket is
|
||||
* disconnected. If the process is not active, then clear the PID for later connections.
|
||||
*/
|
||||
|
||||
if (udev_ctrl_is_connected(manager->ctrl) >= 0 && !pid_is_valid(manager->pid_udevadm))
|
||||
return 1;
|
||||
|
||||
_cleanup_set_free_ Set *pids = NULL;
|
||||
if (pid_is_valid(manager->pid_udevadm)) {
|
||||
if (pid_is_alive(manager->pid_udevadm))
|
||||
(void) set_ensure_put(&pids, NULL, PID_TO_PTR(manager->pid_udevadm));
|
||||
else
|
||||
manager->pid_udevadm = 0;
|
||||
}
|
||||
|
||||
(void) cg_kill(SYSTEMD_CGROUP_CONTROLLER, manager->cgroup, SIGKILL, CGROUP_IGNORE_SELF, pids, NULL, NULL);
|
||||
if (manager->cgroup)
|
||||
/* cleanup possible left-over processes in our cgroup */
|
||||
(void) cg_kill(SYSTEMD_CGROUP_CONTROLLER, manager->cgroup, SIGKILL, CGROUP_IGNORE_SELF, NULL, NULL, NULL);
|
||||
|
||||
return 1;
|
||||
}
|
||||
@ -1754,12 +1725,63 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent, const char *cgroup) {
|
||||
static int create_subcgroup(char **ret) {
|
||||
_cleanup_free_ char *cgroup = NULL, *subcgroup = NULL;
|
||||
int r;
|
||||
|
||||
if (getppid() != 1)
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Not invoked by PID1.");
|
||||
|
||||
r = sd_booted();
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to check if systemd is running: %m");
|
||||
if (r == 0)
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "systemd is not running.");
|
||||
|
||||
/* Get our own cgroup, we regularly kill everything udev has left behind.
|
||||
* We only do this on systemd systems, and only if we are directly spawned
|
||||
* by PID1. Otherwise we are not guaranteed to have a dedicated cgroup. */
|
||||
|
||||
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
|
||||
if (r < 0) {
|
||||
if (IN_SET(r, -ENOENT, -ENOMEDIUM))
|
||||
return log_debug_errno(r, "Dedicated cgroup not found: %m");
|
||||
return log_debug_errno(r, "Failed to get cgroup: %m");
|
||||
}
|
||||
|
||||
r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, cgroup, "trusted.delegate");
|
||||
if (IN_SET(r, 0, -ENODATA))
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "The cgroup %s is not delegated to us.", cgroup);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to read trusted.delegate attribute: %m");
|
||||
|
||||
/* We are invoked with our own delegated cgroup tree, let's move us one level down, so that we
|
||||
* don't collide with the "no processes in inner nodes" rule of cgroups, when the service
|
||||
* manager invokes the ExecReload= job in the .control/ subcgroup. */
|
||||
|
||||
subcgroup = path_join(cgroup, "/udev");
|
||||
if (!subcgroup)
|
||||
return log_oom_debug();
|
||||
|
||||
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup, 0);
|
||||
if (r < 0)
|
||||
return log_debug_errno(r, "Failed to create %s subcgroup: %m", subcgroup);
|
||||
|
||||
log_debug("Created %s subcgroup.", subcgroup);
|
||||
if (ret)
|
||||
*ret = TAKE_PTR(subcgroup);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent) {
|
||||
_cleanup_(manager_freep) Manager *manager = NULL;
|
||||
_cleanup_free_ char *cgroup = NULL;
|
||||
int r;
|
||||
|
||||
assert(ret);
|
||||
|
||||
(void) create_subcgroup(&cgroup);
|
||||
|
||||
manager = new(Manager, 1);
|
||||
if (!manager)
|
||||
return log_oom();
|
||||
@ -1767,7 +1789,7 @@ static int manager_new(Manager **ret, int fd_ctrl, int fd_uevent, const char *cg
|
||||
*manager = (Manager) {
|
||||
.inotify_fd = -1,
|
||||
.worker_watch = { -1, -1 },
|
||||
.cgroup = cgroup,
|
||||
.cgroup = TAKE_PTR(cgroup),
|
||||
};
|
||||
|
||||
r = udev_ctrl_new_from_fd(&manager->ctrl, fd_ctrl);
|
||||
@ -1912,7 +1934,6 @@ static int main_loop(Manager *manager) {
|
||||
}
|
||||
|
||||
int run_udevd(int argc, char *argv[]) {
|
||||
_cleanup_free_ char *cgroup = NULL;
|
||||
_cleanup_(manager_freep) Manager *manager = NULL;
|
||||
int fd_ctrl = -1, fd_uevent = -1;
|
||||
int r;
|
||||
@ -1969,24 +1990,11 @@ int run_udevd(int argc, char *argv[]) {
|
||||
if (r < 0 && r != -EEXIST)
|
||||
return log_error_errno(r, "Failed to create /run/udev: %m");
|
||||
|
||||
if (getppid() == 1 && sd_booted() > 0) {
|
||||
/* Get our own cgroup, we regularly kill everything udev has left behind.
|
||||
* We only do this on systemd systems, and only if we are directly spawned
|
||||
* by PID1. Otherwise we are not guaranteed to have a dedicated cgroup. */
|
||||
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
|
||||
if (r < 0) {
|
||||
if (IN_SET(r, -ENOENT, -ENOMEDIUM))
|
||||
log_debug_errno(r, "Dedicated cgroup not found: %m");
|
||||
else
|
||||
log_warning_errno(r, "Failed to get cgroup: %m");
|
||||
}
|
||||
}
|
||||
|
||||
r = listen_fds(&fd_ctrl, &fd_uevent);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to listen on fds: %m");
|
||||
|
||||
r = manager_new(&manager, fd_ctrl, fd_uevent, cgroup);
|
||||
r = manager_new(&manager, fd_ctrl, fd_uevent);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create manager: %m");
|
||||
|
||||
|
||||
@ -138,7 +138,7 @@ class NetworkdTestingUtilities:
|
||||
|
||||
def read_attr(self, link, attribute):
|
||||
"""Read a link attributed from the sysfs."""
|
||||
# Note we we don't want to check if interface `link' is managed, we
|
||||
# Note we don't want to check if interface `link' is managed, we
|
||||
# want to evaluate link variable and pass the value of the link to
|
||||
# assert_link_states e.g. eth0=managed.
|
||||
self.assert_link_states(**{link:'managed'})
|
||||
|
||||
@ -16,6 +16,7 @@ Before=sysinit.target
|
||||
ConditionPathIsReadWrite=/sys
|
||||
|
||||
[Service]
|
||||
Delegate=pids
|
||||
DeviceAllow=block-* rwm
|
||||
DeviceAllow=char-* rwm
|
||||
Type=notify
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user