1
0
mirror of https://github.com/systemd/systemd synced 2025-09-30 17:24:46 +02:00

Compare commits

..

No commits in common. "aaf73b2ecf44cf447afe848e36640a003699ceb7" and "f963f8953daeab03b892616ce0c65f7572932187" have entirely different histories.

3 changed files with 41 additions and 184 deletions

View File

@@ -9,7 +9,6 @@
#include <unistd.h>
#include "alloc-util.h"
#include "cgroup-util.h"
#include "dirent-util.h"
#include "env-util.h"
#include "fd-util.h"
@@ -454,100 +453,6 @@ static const char *const container_table[_VIRTUALIZATION_MAX] = {
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(container, int);
/* Heuristically determine whether we are running inside a cgroup namespace,
 * i.e. whether our view of the cgroup tree is nested rather than the host's
 * root. Returns a boolean on success, or a negative errno-style error. */
static int running_in_cgroupns(void) {
        int unified;

        if (!cg_ns_supported())
                return false;

        unified = cg_all_unified();
        if (unified < 0)
                return unified;

        if (!unified) {
                /* cgroup v1 */

                /* If the systemd controller hierarchy is not mounted, do not even bother. */
                if (access("/sys/fs/cgroup/systemd", F_OK) < 0) {
                        if (errno != ENOENT)
                                return -errno;
                        return false;
                }

                /* release_agent only exists in the root cgroup, hence its absence
                 * indicates we are inside a nested (namespaced) cgroup. */
                if (access("/sys/fs/cgroup/systemd/release_agent", F_OK) < 0) {
                        if (errno != ENOENT)
                                return -errno;
                        return true;
                }

                return false;
        }

        /* cgroup v2 */

        if (access("/sys/fs/cgroup/cgroup.events", F_OK) < 0) {
                if (errno != ENOENT)
                        return -errno;

                /* All kernel versions have cgroup.events in nested cgroups, so if it
                 * is missing we cannot be in one. */
                return false;
        }

        /* There's no cgroup.type in the root cgroup, and future kernel versions are
         * unlikely to add it since cgroup.type is something that makes no sense
         * whatsoever in the root cgroup. Its presence proves we are nested. */
        if (access("/sys/fs/cgroup/cgroup.type", F_OK) == 0)
                return true;
        if (errno != ENOENT)
                return -errno;

        /* On older kernel versions, there's no cgroup.type at all. Such kernels are
         * known for sure to expose cgroup.events only in nested cgroups, hence we
         * must be in one. */
        if (access("/sys/kernel/cgroup/features", F_OK) < 0) {
                if (errno != ENOENT)
                        return -errno;
                return true;
        }

        /* This is a recent kernel, and cgroup.type doesn't exist, so we must be in
         * the root cgroup. */
        return false;
}
/* Probe for well-known marker files dropped into the root file system by
 * specific container managers. Returns the matching VIRTUALIZATION_* id, or
 * VIRTUALIZATION_NONE if no marker file is found. */
static int detect_container_files(void) {
        static const struct {
                const char *file_path;
                int id;
        } container_file_table[] = {
                /* https://github.com/containers/podman/issues/6192 */
                /* https://github.com/containers/podman/issues/3586#issuecomment-661918679 */
                { "/run/.containerenv", VIRTUALIZATION_PODMAN },
                /* https://github.com/moby/moby/issues/18355 */
                /* Docker must be the last in this table, see below. */
                { "/.dockerenv", VIRTUALIZATION_DOCKER },
        };

        for (size_t j = 0; j < ELEMENTSOF(container_file_table); j++) {
                if (access(container_file_table[j].file_path, F_OK) >= 0)
                        return container_file_table[j].id;

                /* Any failure other than "file doesn't exist" is logged but otherwise
                 * ignored, so one unreadable path cannot mask the remaining probes. */
                if (errno != ENOENT)
                        log_debug_errno(errno,
                                        "Checking if %s exists failed, ignoring: %m",
                                        container_file_table[j].file_path);
        }

        return VIRTUALIZATION_NONE;
}
int detect_container(void) {
static thread_local int cached_found = _VIRTUALIZATION_INVALID;
_cleanup_free_ char *m = NULL, *o = NULL, *p = NULL;
@@ -625,7 +530,7 @@ int detect_container(void) {
*/
e = getenv("container");
if (!e)
goto check_files;
goto none;
if (isempty(e)) {
r = VIRTUALIZATION_NONE;
goto finish;
@ -653,36 +558,12 @@ int detect_container(void) {
if (r < 0) /* This only works if we have CAP_SYS_PTRACE, hence let's better ignore failures here */
log_debug_errno(r, "Failed to read $container of PID 1, ignoring: %m");
check_files:
/* Check for existence of some well-known files. We only do this after checking
* for other specific container managers, otherwise we risk mistaking another
* container manager for Docker: the /.dockerenv file could inadvertently end up
* in a file system image. */
r = detect_container_files();
if (r)
goto finish;
r = running_in_cgroupns();
if (r > 0) {
r = VIRTUALIZATION_CONTAINER_OTHER;
goto finish;
}
if (r < 0)
log_debug_errno(r, "Failed to detect cgroup namespace: %m");
/* If none of that worked, give up, assume no container manager. */
none:
/* If that didn't work, give up, assume no container manager. */
r = VIRTUALIZATION_NONE;
goto finish;
translate_name:
if (streq(e, "oci")) {
/* Some images hardcode container=oci, but OCI is not a specific container manager.
* Try to detect one based on well-known files. */
r = detect_container_files();
if (!r)
r = VIRTUALIZATION_CONTAINER_OTHER;
goto finish;
}
r = container_from_string(e);
if (r < 0)
r = VIRTUALIZATION_CONTAINER_OTHER;

View File

@@ -19,7 +19,7 @@
struct reply_callback {
sd_netlink_message_handler_t callback;
usec_t timeout;
uint32_t serial;
uint64_t serial;
unsigned prioq_idx;
};

View File

@ -17,9 +17,6 @@
#include "string-util.h"
#include "util.h"
/* Some really high limit, to catch programming errors */
#define REPLY_CALLBACKS_MAX UINT16_MAX
static int sd_netlink_new(sd_netlink **ret) {
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
@ -36,29 +33,11 @@ static int sd_netlink_new(sd_netlink **ret) {
.original_pid = getpid_cached(),
.protocol = -1,
/* Kernel change notification messages have sequence number 0. We want to avoid that with our
* own serials, in order not to get confused when matching up kernel replies to our earlier
* requests.
*
* Moreover, when using netlink socket activation (i.e. where PID 1 binds an AF_NETLINK
* socket for us and passes it to us across execve()) and we get restarted multiple times
* while the socket sticks around we might get confused by replies from earlier runs coming
* in late which is pretty likely if we'd start our sequence numbers always from 1. Hence,
* let's start with a value based on the system clock. This should make collisions much less
* likely (though still theoretically possible). We use a 32 bit µs counter starting at boot
* for this (and explicitly exclude the zero, see above). This counter will wrap around after
* a bit more than 1h, but that's hopefully OK as the kernel shouldn't take that long to
* reply to our requests.
*
* We only pick the initial start value this way. For each message we simply increase the
* sequence number by 1. This means we could enqueue 1 netlink message per µs without risking
* collisions, which should be OK.
*
* Note this means the serials will be in the range 1…UINT32_MAX here.
*
* (In an ideal world we'd attach the current serial counter to the netlink socket itself
* somehow, to avoid all this, but I couldn't come up with a nice way to do this) */
.serial = (uint32_t) (now(CLOCK_MONOTONIC) % UINT32_MAX) + 1,
/* Change notification responses have sequence 0, so we must
* start our request sequence numbers at 1, or we may confuse our
* responses with notifications from the kernel */
.serial = 1,
};
/* We guarantee that the read buffer has at least space for
@ -110,7 +89,9 @@ static bool rtnl_pid_changed(const sd_netlink *rtnl) {
int sd_netlink_open_fd(sd_netlink **ret, int fd) {
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
int r, protocol;
int r;
int protocol;
socklen_t l;
assert_return(ret, -EINVAL);
assert_return(fd >= 0, -EBADF);
@ -119,7 +100,8 @@ int sd_netlink_open_fd(sd_netlink **ret, int fd) {
if (r < 0)
return r;
r = getsockopt_int(fd, SOL_SOCKET, SO_PROTOCOL, &protocol);
l = sizeof(protocol);
r = getsockopt(fd, SOL_SOCKET, SO_PROTOCOL, &protocol, &l);
if (r < 0)
return r;
@ -208,25 +190,18 @@ static sd_netlink *netlink_free(sd_netlink *rtnl) {
DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink, sd_netlink, netlink_free);
static void rtnl_seal_message(sd_netlink *rtnl, sd_netlink_message *m) {
uint32_t picked;
assert(rtnl);
assert(!rtnl_pid_changed(rtnl));
assert(m);
assert(m->hdr);
/* Avoid collisions with outstanding requests */
do {
picked = rtnl->serial;
/* don't use seq == 0, as that is used for broadcasts, so we
would get confused by replies to such messages */
m->hdr->nlmsg_seq = rtnl->serial++ ? : rtnl->serial++;
/* Don't use seq == 0, as that is used for broadcasts, so we would get confused by replies to
such messages */
rtnl->serial = rtnl->serial == UINT32_MAX ? 1 : rtnl->serial + 1;
} while (hashmap_contains(rtnl->reply_callbacks, UINT32_TO_PTR(picked)));
m->hdr->nlmsg_seq = picked;
rtnl_message_seal(m);
return;
}
int sd_netlink_send(sd_netlink *nl,
@ -364,7 +339,7 @@ static int process_timeout(sd_netlink *rtnl) {
assert_se(prioq_pop(rtnl->reply_callbacks_prioq) == c);
c->timeout = 0;
hashmap_remove(rtnl->reply_callbacks, UINT32_TO_PTR(c->serial));
hashmap_remove(rtnl->reply_callbacks, &c->serial);
slot = container_of(c, sd_netlink_slot, reply_callback);
@ -384,7 +359,7 @@ static int process_timeout(sd_netlink *rtnl) {
static int process_reply(sd_netlink *rtnl, sd_netlink_message *m) {
struct reply_callback *c;
sd_netlink_slot *slot;
uint32_t serial;
uint64_t serial;
uint16_t type;
int r;
@ -392,7 +367,7 @@ static int process_reply(sd_netlink *rtnl, sd_netlink_message *m) {
assert(m);
serial = rtnl_message_get_serial(m);
c = hashmap_remove(rtnl->reply_callbacks, UINT32_TO_PTR(serial));
c = hashmap_remove(rtnl->reply_callbacks, &serial);
if (!c)
return 0;
@ -437,19 +412,20 @@ static int process_match(sd_netlink *rtnl, sd_netlink_message *m) {
return r;
LIST_FOREACH(match_callbacks, c, rtnl->match_callbacks) {
if (type != c->type)
continue;
if (type == c->type) {
slot = container_of(c, sd_netlink_slot, match_callback);
slot = container_of(c, sd_netlink_slot, match_callback);
r = c->callback(rtnl, m, slot->userdata);
if (r != 0) {
if (r < 0)
log_debug_errno(r, "sd-netlink: match callback %s%s%sfailed: %m",
slot->description ? "'" : "",
strempty(slot->description),
slot->description ? "' " : "");
r = c->callback(rtnl, m, slot->userdata);
if (r < 0)
log_debug_errno(r, "sd-netlink: match callback %s%s%sfailed: %m",
slot->description ? "'" : "",
strempty(slot->description),
slot->description ? "' " : "");
if (r != 0)
break;
break;
}
}
}
return 1;
@ -592,6 +568,7 @@ int sd_netlink_call_async(
uint64_t usec,
const char *description) {
_cleanup_free_ sd_netlink_slot *slot = NULL;
uint32_t s;
int r, k;
assert_return(nl, -EINVAL);
@ -599,10 +576,7 @@ int sd_netlink_call_async(
assert_return(callback, -EINVAL);
assert_return(!rtnl_pid_changed(nl), -ECHILD);
if (hashmap_size(nl->reply_callbacks) >= REPLY_CALLBACKS_MAX)
return -ERANGE;
r = hashmap_ensure_allocated(&nl->reply_callbacks, &trivial_hash_ops);
r = hashmap_ensure_allocated(&nl->reply_callbacks, &uint64_hash_ops);
if (r < 0)
return r;
@ -619,18 +593,20 @@ int sd_netlink_call_async(
slot->reply_callback.callback = callback;
slot->reply_callback.timeout = calc_elapse(usec);
k = sd_netlink_send(nl, m, &slot->reply_callback.serial);
k = sd_netlink_send(nl, m, &s);
if (k < 0)
return k;
r = hashmap_put(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial), &slot->reply_callback);
slot->reply_callback.serial = s;
r = hashmap_put(nl->reply_callbacks, &slot->reply_callback.serial, &slot->reply_callback);
if (r < 0)
return r;
if (slot->reply_callback.timeout != 0) {
r = prioq_put(nl->reply_callbacks_prioq, &slot->reply_callback, &slot->reply_callback.prioq_idx);
if (r < 0) {
(void) hashmap_remove(nl->reply_callbacks, UINT32_TO_PTR(slot->reply_callback.serial));
(void) hashmap_remove(nl->reply_callbacks, &slot->reply_callback.serial);
return r;
}
}