1
0
mirror of https://github.com/systemd/systemd synced 2025-09-30 09:14:46 +02:00

Compare commits

...

8 Commits

Author SHA1 Message Date
Zbigniew Jędrzejewski-Szmek
aaf73b2ecf
Merge pull request #17902 from bugaevc/fix-container-detection
improve container detection
2021-02-11 12:56:01 +01:00
Zbigniew Jędrzejewski-Szmek
372a5002dc
Merge pull request #18545 from poettering/netlink-seqno-fix
sd-netlink seqnum fixes
2021-02-11 08:24:39 +01:00
Lennart Poettering
ac3bc1b819 sd-netlink: spread out sequence numbers a bit
An (imperfect) fix for #14760.

This makes collisions unlikely, but still theoretically possible.

Fixes: #14760
2021-02-10 23:07:46 +01:00
Lennart Poettering
baf78f1a51 sd-netlink: reduce indentation levels a bit 2021-02-10 22:01:24 +01:00
Lennart Poettering
13ec9f103b sd-netlink: use getsockopt_int() where appropriate 2021-02-10 22:01:24 +01:00
Lennart Poettering
b522c4b92a sd-netlink: revamp message serial handling
Let's use uint32_t everywhere to maintain the seqno, since that's what
the kernel does. Previously in the reply_callback logic we used 64bit,
for no apparent reason.

Using 32bit also provides us with the benefit that we can avoid using
uint64_hash_ops, and can use trivial_hash_ops instead for the reply
hashmap, so that we can store the seqno in the key pointer directly.

While we are at it, let's make sure we never run into serial collisions
internally (32bit is a lot, but not that much), and let's put a limit on
outstanding serials, to catch programming errors.
2021-02-10 22:01:24 +01:00
Sergey Bugaev
0e13779d37 virt: detect cgroups namespaces
detect_container() is now able to detect if we're running in a cgroup namespace.
2021-02-10 22:25:04 +03:00
Sergey Bugaev
a4a9a6f7c6 virt: detect Docker and Podman containers
Docker doesn't set $container, so it cannot be detected that way. Instead, we
check for presence of /.dockerenv, which it creates. Podman does set
$container, but some Red Hat images (in particular, Fedora images) override
$container to equal "oci". So to correctly detect Podman containers, we check
for presence of /run/.containerenv, which is created by Podman and is now the
official way to get information about the container from within the container.

Fixes https://github.com/systemd/systemd/issues/15393
2021-02-10 22:25:04 +03:00
3 changed files with 184 additions and 41 deletions

View File

@ -9,6 +9,7 @@
#include <unistd.h>
#include "alloc-util.h"
#include "cgroup-util.h"
#include "dirent-util.h"
#include "env-util.h"
#include "fd-util.h"
@ -453,6 +454,100 @@ static const char *const container_table[_VIRTUALIZATION_MAX] = {
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(container, int);
/* Probes the cgroup file system mounted at /sys/fs/cgroup to decide whether this process runs
 * inside a cgroup namespace, i.e. whether the visible cgroup root is not the real root cgroup.
 *
 * Returns > 0 if a cgroup namespace was detected, 0 if not (or if cg_ns_supported() says no),
 * and a negative errno-style value if one of the probes failed with anything but ENOENT, or if
 * cg_all_unified() failed. */
static int running_in_cgroupns(void) {
        int unified;

        if (!cg_ns_supported())
                return false;

        unified = cg_all_unified();
        if (unified < 0)
                return unified;

        if (!unified) {
                /* cgroup v1 */

                /* Without a mounted systemd controller there is nothing to look at. */
                if (access("/sys/fs/cgroup/systemd", F_OK) < 0)
                        return errno == ENOENT ? false : -errno;

                /* release_agent is exposed by the root cgroup only, hence if it is missing
                 * we are looking at a nested cgroup. */
                if (access("/sys/fs/cgroup/systemd/release_agent", F_OK) < 0)
                        return errno == ENOENT ? true : -errno;

                return false;
        }

        /* cgroup v2 */

        /* Every kernel version provides cgroup.events in nested cgroups, so its absence
         * means we see the root cgroup. */
        if (access("/sys/fs/cgroup/cgroup.events", F_OK) < 0)
                return errno == ENOENT ? false : -errno;

        /* The root cgroup carries no cgroup.type, and newer kernels are unlikely to add one
         * there, since the attribute is meaningless for the root cgroup. */
        if (access("/sys/fs/cgroup/cgroup.type", F_OK) == 0)
                return true;
        if (errno != ENOENT)
                return -errno;

        /* cgroup.type is missing. Older kernels do not have it at all, so tell old and recent
         * kernels apart via /sys/kernel/cgroup/features. */
        if (access("/sys/kernel/cgroup/features", F_OK) < 0)
                /* An old kernel, which is known to expose cgroup.events in nested cgroups
                 * only. */
                return errno == ENOENT ? true : -errno;

        /* A recent kernel without cgroup.type: this must be the root cgroup. */
        return false;
}
/* Looks for well-known marker files that container managers leave in the file system.
 *
 * Returns the VIRTUALIZATION_* id of the first table entry whose file is accessible, or
 * VIRTUALIZATION_NONE if none is. Probe failures other than ENOENT are logged at debug level
 * and otherwise ignored. */
static int detect_container_files(void) {
        static const struct {
                const char *file_path;
                int id;
        } container_file_table[] = {
                /* https://github.com/containers/podman/issues/6192 */
                /* https://github.com/containers/podman/issues/3586#issuecomment-661918679 */
                { "/run/.containerenv", VIRTUALIZATION_PODMAN },
                /* https://github.com/moby/moby/issues/18355 */
                /* Keep Docker as the final entry: entries are tried in order, so every other
                 * marker file gets checked before /.dockerenv. */
                { "/.dockerenv",        VIRTUALIZATION_DOCKER },
        };

        for (unsigned idx = 0; idx < ELEMENTSOF(container_file_table); idx++) {
                const char *path = container_file_table[idx].file_path;

                if (access(path, F_OK) >= 0)
                        return container_file_table[idx].id;

                /* A missing file simply means "not this manager"; move on silently. */
                if (errno == ENOENT)
                        continue;

                log_debug_errno(errno,
                                "Checking if %s exists failed, ignoring: %m",
                                path);
        }

        return VIRTUALIZATION_NONE;
}
int detect_container(void) {
static thread_local int cached_found = _VIRTUALIZATION_INVALID;
_cleanup_free_ char *m = NULL, *o = NULL, *p = NULL;
@ -530,7 +625,7 @@ int detect_container(void) {
*/
e = getenv("container");
if (!e)
goto none;
goto check_files;
if (isempty(e)) {
r = VIRTUALIZATION_NONE;
goto finish;
@ -558,12 +653,36 @@ int detect_container(void) {
if (r < 0) /* This only works if we have CAP_SYS_PTRACE, hence let's better ignore failures here */
log_debug_errno(r, "Failed to read $container of PID 1, ignoring: %m");
none:
/* If that didn't work, give up, assume no container manager. */
check_files:
/* Check for existence of some well-known files. We only do this after checking
* for other specific container managers, otherwise we risk mistaking another
* container manager for Docker: the /.dockerenv file could inadvertently end up
* in a file system image. */
r = detect_container_files();
if (r)
goto finish;
r = running_in_cgroupns();
if (r > 0) {
r = VIRTUALIZATION_CONTAINER_OTHER;
goto finish;
}
if (r < 0)
log_debug_errno(r, "Failed to detect cgroup namespace: %m");
/* If none of that worked, give up, assume no container manager. */
r = VIRTUALIZATION_NONE;
goto finish;
translate_name:
if (streq(e, "oci")) {
/* Some images hardcode container=oci, but OCI is not a specific container manager.
* Try to detect one based on well-known files. */
r = detect_container_files();
if (!r)
r = VIRTUALIZATION_CONTAINER_OTHER;
goto finish;
}
r = container_from_string(e);
if (r < 0)
r = VIRTUALIZATION_CONTAINER_OTHER;

View File

@ -19,7 +19,7 @@
struct reply_callback {
sd_netlink_message_handler_t callback;
usec_t timeout;
uint64_t serial;
uint32_t serial;
unsigned prioq_idx;
};

View File

@ -17,6 +17,9 @@
#include "string-util.h"
#include "util.h"
/* Some really high limit, to catch programming errors */
#define REPLY_CALLBACKS_MAX UINT16_MAX
static int sd_netlink_new(sd_netlink **ret) {
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
@ -33,11 +36,29 @@ static int sd_netlink_new(sd_netlink **ret) {
.original_pid = getpid_cached(),
.protocol = -1,
/* Change notification responses have sequence 0, so we must
* start our request sequence numbers at 1, or we may confuse our
* responses with notifications from the kernel */
.serial = 1,
/* Kernel change notification messages have sequence number 0. We want to avoid that with our
* own serials, in order not to get confused when matching up kernel replies to our earlier
* requests.
*
* Moreover, when using netlink socket activation (i.e. where PID 1 binds an AF_NETLINK
* socket for us and passes it to us across execve()) and we get restarted multiple times
* while the socket sticks around we might get confused by replies from earlier runs coming
* in late which is pretty likely if we'd start our sequence numbers always from 1. Hence,
* let's start with a value based on the system clock. This should make collisions much less
* likely (though still theoretically possible). We use a 32 bit µs counter starting at boot
* for this (and explicitly exclude the zero, see above). This counter will wrap around after
* a bit more than 1h, but that's hopefully OK as the kernel shouldn't take that long to
* reply to our requests.
*
* We only pick the initial start value this way. For each message we simply increase the
* sequence number by 1. This means we could enqueue 1 netlink message per µs without risking
* collisions, which should be OK.
*
* Note this means the serials will be in the range 1…UINT32_MAX here.
*
* (In an ideal world we'd attach the current serial counter to the netlink socket itself
* somehow, to avoid all this, but I couldn't come up with a nice way to do this) */
.serial = (uint32_t) (now(CLOCK_MONOTONIC) % UINT32_MAX) + 1,
};
/* We guarantee that the read buffer has at least space for
@ -89,9 +110,7 @@ static bool rtnl_pid_changed(const sd_netlink *rtnl) {
int sd_netlink_open_fd(sd_netlink **ret, int fd) {
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
int r;
int protocol;
socklen_t l;
int r, protocol;
assert_return(ret, -EINVAL);
assert_return(fd >= 0, -EBADF);
@ -100,8 +119,7 @@ int sd_netlink_open_fd(sd_netlink **ret, int fd) {
if (r < 0)
return r;
l = sizeof(protocol);
r = getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &l);
r = getsockopt_int(fd, SOL_SOCKET, SO_PROTOCOL, &protocol);
if (r < 0)
return r;
@ -190,18 +208,25 @@ static sd_netlink *netlink_free(sd_netlink *rtnl) {
DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink, sd_netlink, netlink_free);
static void rtnl_seal_message(sd_netlink *rtnl, sd_netlink_message *m) {
uint32_t picked;
assert(rtnl);
assert(!rtnl_pid_changed(rtnl));
assert(m);
assert(m->hdr);
/* don't use seq == 0, as that is used for broadcasts, so we
would get confused by replies to such messages */
m->hdr->nlmsg_seq = rtnl->serial++ ? : rtnl->serial++;
/* Avoid collisions with outstanding requests */
do {
picked = rtnl->serial;
/* Don't use seq == 0, as that is used for broadcasts, so we would get confused by replies to
such messages */
rtnl->serial = rtnl->serial == UINT32_MAX ? 1 : rtnl->serial + 1;
} while (hashmap_contains(rtnl->reply_callbacks, UINT32_TO_PTR(picked)));
m->hdr->nlmsg_seq = picked;
rtnl_message_seal(m);
return;
}
int sd_netlink_send(sd_netlink *nl,
@ -339,7 +364,7 @@ static int process_timeout(sd_netlink *rtnl) {
assert_se(prioq_pop(rtnl->reply_callbacks_prioq) == c);
c->timeout = 0;
hashmap_remove(rtnl->reply_callbacks, &c->serial);
hashmap_remove(rtnl->reply_callbacks, UINT32_TO_PTR(c->serial));
slot = container_of(c, sd_netlink_slot, reply_callback);
@ -359,7 +384,7 @@ static int process_timeout(sd_netlink *rtnl) {
static int process_reply(sd_netlink *rtnl, sd_netlink_message *m) {
struct reply_callback *c;
sd_netlink_slot *slot;
uint64_t serial;
uint32_t serial;
uint16_t type;
int r;
@ -367,7 +392,7 @@ static int process_reply(sd_netlink *rtnl, sd_netlink_message *m) {
assert(m);
serial = rtnl_message_get_serial(m);
c = hashmap_remove(rtnl->reply_callbacks, &serial);
c = hashmap_remove(rtnl->reply_callbacks, UINT32_TO_PTR(serial));
if (!c)
return 0;
@ -412,20 +437,19 @@ static int process_match(sd_netlink *rtnl, sd_netlink_message *m) {
return r;
LIST_FOREACH(match_callbacks, c, rtnl->match_callbacks) {
if (type == c->type) {
slot = container_of(c, sd_netlink_slot, match_callback);
if (type != c->type)
continue;
r = c->callback(rtnl, m, slot->userdata);
if (r != 0) {
if (r < 0)
log_debug_errno(r, "sd-netlink: match callback %s%s%sfailed: %m",
slot->description ? "'" : "",
strempty(slot->description),
slot->description ? "' " : "");
slot = container_of(c, sd_netlink_slot, match_callback);
break;
}
}
r = c->callback(rtnl, m, slot->userdata);
if (r < 0)
log_debug_errno(r, "sd-netlink: match callback %s%s%sfailed: %m",
slot->description ? "'" : "",
strempty(slot->description),
slot->description ? "' " : "");
if (r != 0)
break;
}
return 1;
@ -568,7 +592,6 @@ int sd_netlink_call_async(
uint64_t usec,
const char *description) {
_cleanup_free_ sd_netlink_slot *slot = NULL;
uint32_t s;
int r, k;
assert_return(nl, -EINVAL);
@ -576,7 +599,10 @@ int sd_netlink_call_async(
assert_return(callback, -EINVAL);
assert_return(!rtnl_pid_changed(nl), -ECHILD);
r = hashmap_ensure_allocated(&nl->reply_callbacks, &uint64_hash_ops);
if (hashmap_size(nl->reply_callbacks) >= REPLY_CALLBACKS_MAX)
return -ERANGE;
r = hashmap_ensure_allocated(&nl->reply_callbacks, &trivial_hash_ops);
if (r < 0)
return r;
@ -593,20 +619,18 @@ int sd_netlink_call_async(
slot->reply_callback.callback = callback;
slot->reply_callback.timeout = calc_elapse(usec);
k = sd_netlink_send(nl, m, &s);
k = sd_netlink_send(nl, m, &slot->reply_callback.serial);
if (k < 0)
return k;
slot->reply_callback.serial = s;
r = hashmap_put(nl->reply_callbacks, &slot->reply_callback.serial, &slot->reply_callback);
r = hashmap_put(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial), &slot->reply_callback);
if (r < 0)
return r;
if (slot->reply_callback.timeout != 0) {
r = prioq_put(nl->reply_callbacks_prioq, &slot->reply_callback, &slot->reply_callback.prioq_idx);
if (r < 0) {
(void) hashmap_remove(nl->reply_callbacks, &slot->reply_callback.serial);
(void) hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial));
return r;
}
}