Merge pull request #17902 from bugaevc/fix-container-detection

improve container detection
Merge pull request #18545 from poettering/netlink-seqno-fix
2025-09-30 09:14:46 +02:00 · 2021-02-11 12:56:01 +01:00 · 2021-02-11 08:24:39 +01:00 · 2021-02-10 23:07:46 +01:00 · 2021-02-10 22:01:24 +01:00 · 2021-02-10 22:01:24 +01:00
3 changed files with 184 additions and 41 deletions
--- a/src/basic/virt.c
+++ b/src/basic/virt.c
@ -9,6 +9,7 @@
 #include <unistd.h>

 #include "alloc-util.h"
+#include "cgroup-util.h"
 #include "dirent-util.h"
 #include "env-util.h"
 #include "fd-util.h"
@ -453,6 +454,100 @@ static const char *const container_table[_VIRTUALIZATION_MAX] = {

 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(container, int);

+static int running_in_cgroupns(void) {
+        int r;
+
+        if (!cg_ns_supported())
+                return false;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+
+        if (r) {
+                /* cgroup v2 */
+
+                r = access("/sys/fs/cgroup/cgroup.events", F_OK);
+                if (r < 0) {
+                        if (errno != ENOENT)
+                                return -errno;
+                        /* All kernel versions have cgroup.events in nested cgroups. */
+                        return false;
+                }
+
+                /* There's no cgroup.type in the root cgroup, and future kernel versions
+                 * are unlikely to add it since cgroup.type is something that makes no sense
+                 * whatsoever in the root cgroup. */
+                r = access("/sys/fs/cgroup/cgroup.type", F_OK);
+                if (r == 0)
+                        return true;
+                if (r < 0 && errno != ENOENT)
+                        return -errno;
+
+                /* On older kernel versions, there's no cgroup.type */
+                r = access("/sys/kernel/cgroup/features", F_OK);
+                if (r < 0) {
+                        if (errno != ENOENT)
+                                return -errno;
+                        /* This is an old kernel that we know for sure has cgroup.events
+                         * only in nested cgroups. */
+                        return true;
+                }
+
+                /* This is a recent kernel, and cgroup.type doesn't exist, so we must be
+                 * in the root cgroup. */
+                return false;
+        } else {
+                /* cgroup v1 */
+
+                /* If systemd controller is not mounted, do not even bother. */
+                r = access("/sys/fs/cgroup/systemd", F_OK);
+                if (r < 0) {
+                        if (errno != ENOENT)
+                                return -errno;
+                        return false;
+                }
+
+                /* release_agent only exists in the root cgroup. */
+                r = access("/sys/fs/cgroup/systemd/release_agent", F_OK);
+                if (r < 0) {
+                        if (errno != ENOENT)
+                                return -errno;
+                        return true;
+                }
+
+                return false;
+        }
+}
+
+static int detect_container_files(void) {
+        unsigned i;
+
+        static const struct {
+                const char *file_path;
+                int id;
+        } container_file_table[] = {
+                /* https://github.com/containers/podman/issues/6192 */
+                /* https://github.com/containers/podman/issues/3586#issuecomment-661918679 */
+                { "/run/.containerenv", VIRTUALIZATION_PODMAN },
+                /* https://github.com/moby/moby/issues/18355 */
+                /* Docker must be the last in this table, see below. */
+                { "/.dockerenv",        VIRTUALIZATION_DOCKER },
+        };
+
+        for (i = 0; i < ELEMENTSOF(container_file_table); i++) {
+                if (access(container_file_table[i].file_path, F_OK) >= 0)
+                        return container_file_table[i].id;
+
+                if (errno != ENOENT)
+                        log_debug_errno(errno,
+                                        "Checking if %s exists failed, ignoring: %m",
+                                        container_file_table[i].file_path);
+        }
+
+        return VIRTUALIZATION_NONE;
+}
+
 int detect_container(void) {
        static thread_local int cached_found = _VIRTUALIZATION_INVALID;
        _cleanup_free_ char *m = NULL, *o = NULL, *p = NULL;
@ -530,7 +625,7 @@ int detect_container(void) {
                 */
                e = getenv("container");
                if (!e)
-                        goto none;
+                        goto check_files;
                if (isempty(e)) {
                        r = VIRTUALIZATION_NONE;
                        goto finish;
@ -558,12 +653,36 @@ int detect_container(void) {
        if (r < 0) /* This only works if we have CAP_SYS_PTRACE, hence let's better ignore failures here */
                log_debug_errno(r, "Failed to read $container of PID 1, ignoring: %m");

-none:
-        /* If that didn't work, give up, assume no container manager. */
+check_files:
+        /* Check for existence of some well-known files. We only do this after checking
+         * for other specific container managers, otherwise we risk mistaking another
+         * container manager for Docker: the /.dockerenv file could inadvertently end up
+         * in a file system image. */
+        r = detect_container_files();
+        if (r)
+                goto finish;
+
+        r = running_in_cgroupns();
+        if (r > 0) {
+                r = VIRTUALIZATION_CONTAINER_OTHER;
+                goto finish;
+        }
+        if (r < 0)
+                log_debug_errno(r, "Failed to detect cgroup namespace: %m");
+
+        /* If none of that worked, give up, assume no container manager. */
        r = VIRTUALIZATION_NONE;
        goto finish;

 translate_name:
+        if (streq(e, "oci")) {
+                /* Some images hardcode container=oci, but OCI is not a specific container manager.
+                 * Try to detect one based on well-known files. */
+                r = detect_container_files();
+                if (!r)
+                        r = VIRTUALIZATION_CONTAINER_OTHER;
+                goto finish;
+        }
        r = container_from_string(e);
        if (r < 0)
                r = VIRTUALIZATION_CONTAINER_OTHER;
--- a/src/libsystemd/sd-netlink/netlink-internal.h
+++ b/src/libsystemd/sd-netlink/netlink-internal.h
@ -19,7 +19,7 @@
 struct reply_callback {
        sd_netlink_message_handler_t callback;
        usec_t timeout;
-        uint64_t serial;
+        uint32_t serial;
        unsigned prioq_idx;
 };

--- a/src/libsystemd/sd-netlink/sd-netlink.c
+++ b/src/libsystemd/sd-netlink/sd-netlink.c
@ -17,6 +17,9 @@
 #include "string-util.h"
 #include "util.h"

+/* Some really high limit, to catch programming errors */
+#define REPLY_CALLBACKS_MAX UINT16_MAX
+
 static int sd_netlink_new(sd_netlink **ret) {
        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;

@ -33,11 +36,29 @@ static int sd_netlink_new(sd_netlink **ret) {
                .original_pid = getpid_cached(),
                .protocol = -1,

-                /* Change notification responses have sequence 0, so we must
-                 * start our request sequence numbers at 1, or we may confuse our
-                 * responses with notifications from the kernel */
-                .serial = 1,
-
+                /* Kernel change notification messages have sequence number 0. We want to avoid that with our
+                 * own serials, in order not to get confused when matching up kernel replies to our earlier
+                 * requests.
+                 *
+                 * Moreover, when using netlink socket activation (i.e. where PID 1 binds an AF_NETLINK
+                 * socket for us and passes it to us across execve()) and we get restarted multiple times
+                 * while the socket sticks around we might get confused by replies from earlier runs coming
+                 * in late — which is pretty likely if we'd start our sequence numbers always from 1. Hence,
+                 * let's start with a value based on the system clock. This should make collisions much less
+                 * likely (though still theoretically possible). We use a 32 bit µs counter starting at boot
+                 * for this (and explicitly exclude the zero, see above). This counter will wrap around after
+                 * a bit more than 1h, but that's hopefully OK as the kernel shouldn't take that long to
+                 * reply to our requests.
+                 *
+                 * We only pick the initial start value this way. For each message we simply increase the
+                 * sequence number by 1. This means we could enqueue 1 netlink message per µs without risking
+                 * collisions, which should be OK.
+                 *
+                 * Note this means the serials will be in the range 1…UINT32_MAX here.
+                 *
+                 * (In an ideal world we'd attach the current serial counter to the netlink socket itself
+                 * somehow, to avoid all this, but I couldn't come up with a nice way to do this) */
+                .serial = (uint32_t) (now(CLOCK_MONOTONIC) % UINT32_MAX) + 1,
        };

        /* We guarantee that the read buffer has at least space for
@ -89,9 +110,7 @@ static bool rtnl_pid_changed(const sd_netlink *rtnl) {

 int sd_netlink_open_fd(sd_netlink **ret, int fd) {
        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
-        int r;
-        int protocol;
-        socklen_t l;
+        int r, protocol;

        assert_return(ret, -EINVAL);
        assert_return(fd >= 0, -EBADF);
@ -100,8 +119,7 @@ int sd_netlink_open_fd(sd_netlink **ret, int fd) {
        if (r < 0)
                return r;

-        l = sizeof(protocol);
-        r = getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &l);
+        r = getsockopt_int(fd, SOL_SOCKET, SO_PROTOCOL, &protocol);
        if (r < 0)
                return r;

@ -190,18 +208,25 @@ static sd_netlink *netlink_free(sd_netlink *rtnl) {
 DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink, sd_netlink, netlink_free);

 static void rtnl_seal_message(sd_netlink *rtnl, sd_netlink_message *m) {
+        uint32_t picked;
+
        assert(rtnl);
        assert(!rtnl_pid_changed(rtnl));
        assert(m);
        assert(m->hdr);

-        /* don't use seq == 0, as that is used for broadcasts, so we
-           would get confused by replies to such messages */
-        m->hdr->nlmsg_seq = rtnl->serial++ ? : rtnl->serial++;
+        /* Avoid collisions with outstanding requests */
+        do {
+                picked = rtnl->serial;

+                /* Don't use seq == 0, as that is used for broadcasts, so we would get confused by replies to
+                   such messages */
+                rtnl->serial = rtnl->serial == UINT32_MAX ? 1 : rtnl->serial + 1;
+
+        } while (hashmap_contains(rtnl->reply_callbacks, UINT32_TO_PTR(picked)));
+
+        m->hdr->nlmsg_seq = picked;
        rtnl_message_seal(m);
-
-        return;
 }

 int sd_netlink_send(sd_netlink *nl,
@ -339,7 +364,7 @@ static int process_timeout(sd_netlink *rtnl) {

        assert_se(prioq_pop(rtnl->reply_callbacks_prioq) == c);
        c->timeout = 0;
-        hashmap_remove(rtnl->reply_callbacks, &c->serial);
+        hashmap_remove(rtnl->reply_callbacks, UINT32_TO_PTR(c->serial));

        slot = container_of(c, sd_netlink_slot, reply_callback);

@ -359,7 +384,7 @@ static int process_timeout(sd_netlink *rtnl) {
 static int process_reply(sd_netlink *rtnl, sd_netlink_message *m) {
        struct reply_callback *c;
        sd_netlink_slot *slot;
-        uint64_t serial;
+        uint32_t serial;
        uint16_t type;
        int r;

@ -367,7 +392,7 @@ static int process_reply(sd_netlink *rtnl, sd_netlink_message *m) {
        assert(m);

        serial = rtnl_message_get_serial(m);
-        c = hashmap_remove(rtnl->reply_callbacks, &serial);
+        c = hashmap_remove(rtnl->reply_callbacks, UINT32_TO_PTR(serial));
        if (!c)
                return 0;

@ -412,20 +437,19 @@ static int process_match(sd_netlink *rtnl, sd_netlink_message *m) {
                return r;

        LIST_FOREACH(match_callbacks, c, rtnl->match_callbacks) {
-                if (type == c->type) {
-                        slot = container_of(c, sd_netlink_slot, match_callback);
+                if (type != c->type)
+                        continue;

-                        r = c->callback(rtnl, m, slot->userdata);
-                        if (r != 0) {
-                                if (r < 0)
-                                        log_debug_errno(r, "sd-netlink: match callback %s%s%sfailed: %m",
-                                                        slot->description ? "'" : "",
-                                                        strempty(slot->description),
-                                                        slot->description ? "' " : "");
+                slot = container_of(c, sd_netlink_slot, match_callback);

-                                break;
-                        }
-                }
+                r = c->callback(rtnl, m, slot->userdata);
+                if (r < 0)
+                        log_debug_errno(r, "sd-netlink: match callback %s%s%sfailed: %m",
+                                        slot->description ? "'" : "",
+                                        strempty(slot->description),
+                                        slot->description ? "' " : "");
+                if (r != 0)
+                        break;
        }

        return 1;
@ -568,7 +592,6 @@ int sd_netlink_call_async(
                uint64_t usec,
                const char *description) {
        _cleanup_free_ sd_netlink_slot *slot = NULL;
-        uint32_t s;
        int r, k;

        assert_return(nl, -EINVAL);
@ -576,7 +599,10 @@ int sd_netlink_call_async(
        assert_return(callback, -EINVAL);
        assert_return(!rtnl_pid_changed(nl), -ECHILD);

-        r = hashmap_ensure_allocated(&nl->reply_callbacks, &uint64_hash_ops);
+        if (hashmap_size(nl->reply_callbacks) >= REPLY_CALLBACKS_MAX)
+                return -ERANGE;
+
+        r = hashmap_ensure_allocated(&nl->reply_callbacks, &trivial_hash_ops);
        if (r < 0)
                return r;

@ -593,20 +619,18 @@ int sd_netlink_call_async(
        slot->reply_callback.callback = callback;
        slot->reply_callback.timeout = calc_elapse(usec);

-        k = sd_netlink_send(nl, m, &s);
+        k = sd_netlink_send(nl, m, &slot->reply_callback.serial);
        if (k < 0)
                return k;

-        slot->reply_callback.serial = s;
-
-        r = hashmap_put(nl->reply_callbacks, &slot->reply_callback.serial, &slot->reply_callback);
+        r = hashmap_put(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial), &slot->reply_callback);
        if (r < 0)
                return r;

        if (slot->reply_callback.timeout != 0) {
                r = prioq_put(nl->reply_callbacks_prioq, &slot->reply_callback, &slot->reply_callback.prioq_idx);
                if (r < 0) {
-                        (void) hashmap_remove(nl->reply_callbacks, &slot->reply_callback.serial);
+                        (void) hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial));
                        return r;
                }
        }
Author	SHA1	Message	Date
Zbigniew Jędrzejewski-Szmek	aaf73b2ecf	Merge pull request #17902 from bugaevc/fix-container-detection improve container detection	2021-02-11 12:56:01 +01:00
Zbigniew Jędrzejewski-Szmek	372a5002dc	Merge pull request #18545 from poettering/netlink-seqno-fix sd-netlink seqnum fixes	2021-02-11 08:24:39 +01:00
Lennart Poettering	ac3bc1b819	sd-netlink: spread out sequence numbers a bit An (imperfect) fix for #14760. This makes collisions unlikely, but still theoretically possible. Fixes: #14760	2021-02-10 23:07:46 +01:00
Lennart Poettering	baf78f1a51	sd-netlink: reduce indentation levels a bit	2021-02-10 22:01:24 +01:00
Lennart Poettering	13ec9f103b	sd-netlink: use getsockopt_int() where appropriate	2021-02-10 22:01:24 +01:00
Lennart Poettering	b522c4b92a	sd-netlink: revamp message serial handling Let's use uint32_t everywhere to maintain the seqno, since that's what the kernel does. Previously in the reply_callback logic we used 64bit, for no apparent reason. Using 32bit also provides us with the benefit that we can avoid using uint64_hash_ops, and can use trivial_hash_ops instead for the reply hashmap, so that we can store the seqno in the key pointer directly. While we are at it, let's make sure we never run into serial collisions internally (32bit is a lot, but not that much), and let's put a limit on outstanding serials, to catch programming errors.	2021-02-10 22:01:24 +01:00
Sergey Bugaev	0e13779d37	virt: detect cgroups namespaces detect_container() is now able to detect if we're running in a cgroup namespace.	2021-02-10 22:25:04 +03:00
Sergey Bugaev	a4a9a6f7c6	virt: detect Docker and Podman containers Docker doesn't set $container, so it cannot be detected that way. Instead, we check for presence of /.dockerenv, which it creates. Podman does set $container, but some Red Hat images (in particular, Fedora images) override $container to equal "oci". So to correctly detect Podman containers, we check for presence of /run/.containerenv, which is created by Podman and is now the official way to get information about the container from within the container. Fixes https://github.com/systemd/systemd/issues/15393	2021-02-10 22:25:04 +03:00