Merge pull request #23002 from yuwata/udev-use-child-event

udev: use child event source
udev: use child event source to manage workers
2026-04-25 16:34:50 +02:00 · 2022-04-13 08:32:16 +02:00 · 2022-04-13 01:08:42 +09:00 · 2022-04-13 01:00:08 +09:00 · 2022-04-13 01:00:08 +09:00 · 2022-04-13 01:00:08 +09:00
2 changed files with 89 additions and 97 deletions
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@ -706,6 +706,9 @@ static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig
                return;
        }

+        if (event_pid_changed(e))
+                return;
+
        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
@ -851,6 +854,9 @@ static void source_disconnect(sd_event_source *s) {
                break;

        case SOURCE_CHILD:
+                if (event_pid_changed(s->event))
+                        s->child.process_owned = false;
+
                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
@ -1426,7 +1432,6 @@ _public_ int sd_event_add_child(
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
-        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
@ -1436,7 +1441,7 @@ _public_ int sd_event_add_child(
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
-                s->child.pidfd = pidfd_open(s->child.pid, 0);
+                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
@ -1446,10 +1451,6 @@ _public_ int sd_event_add_child(
        } else
                s->child.pidfd = -1;

-        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
-        if (r < 0)
-                return r;
-
        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
@ -1465,6 +1466,12 @@ _public_ int sd_event_add_child(
                e->need_process_child = true;
        }

+        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
+        if (r < 0)
+                return r;
+
+        /* These must be done after everything succeeds. */
+        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
@ -1691,7 +1698,8 @@ static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
        assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);

        if (d->fd >= 0) {
-                if (epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
+                if (!event_pid_changed(e) &&
+                    epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");

                safe_close(d->fd);
@ -1801,7 +1809,7 @@ static void event_free_inode_data(
        if (d->inotify_data) {

                if (d->wd >= 0) {
-                        if (d->inotify_data->fd >= 0) {
+                        if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
                                /* So here's a problem. At the time this runs the watch descriptor might already be
                                 * invalidated, because an IN_IGNORED event might be queued right the moment we enter
                                 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
@ -3218,23 +3226,16 @@ static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priori

        e->need_process_child = false;

-        /*
-           So, this is ugly. We iteratively invoke waitid() with P_PID
-           + WNOHANG for each PID we wait for, instead of using
-           P_ALL. This is because we only want to get child
-           information of very specific child processes, and not all
-           of them. We might not have processed the SIGCHLD even of a
-           previous invocation and we don't want to maintain a
-           unbounded *per-child* event queue, hence we really don't
-           want anything flushed out of the kernel's queue that we
-           don't care about. Since this is O(n) this means that if you
-           have a lot of processes you probably want to handle SIGCHLD
-           yourself.
-
-           We do not reap the children here (by using WNOWAIT), this
-           is only done after the event source is dispatched so that
-           the callback still sees the process as a zombie.
-        */
+        /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
+         * for, instead of using P_ALL. This is because we only want to get child information of very
+         * specific child processes, and not all of them. We might not have processed the SIGCHLD event
+         * of a previous invocation and we don't want to maintain a unbounded *per-child* event queue,
+         * hence we really don't want anything flushed out of the kernel's queue that we don't care
+         * about. Since this is O(n) this means that if you have a lot of processes you probably want
+         * to handle SIGCHLD yourself.
+         *
+         * We do not reap the children here (by using WNOWAIT), this is only done after the event
+         * source is dispatched so that the callback still sees the process as a zombie. */

        HASHMAP_FOREACH(s, e->child_sources) {
                assert(s->type == SOURCE_CHILD);
@ -3251,7 +3252,9 @@ static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priori
                if (s->child.exited)
                        continue;

-                if (EVENT_SOURCE_WATCH_PIDFD(s)) /* There's a usable pidfd known for this event source? then don't waitid() for it here */
+                if (EVENT_SOURCE_WATCH_PIDFD(s))
+                        /* There's a usable pidfd known for this event source? Then don't waitid() for
+                         * it here */
                        continue;

                zero(s->child.siginfo);
@ -3266,10 +3269,9 @@ static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priori
                                s->child.exited = true;

                        if (!zombie && (s->child.options & WEXITED)) {
-                                /* If the child isn't dead then let's
-                                 * immediately remove the state change
-                                 * from the queue, since there's no
-                                 * benefit in leaving it queued */
+                                /* If the child isn't dead then let's immediately remove the state
+                                 * change from the queue, since there's no benefit in leaving it
+                                 * queued. */

                                assert(s->child.options & (WSTOPPED|WCONTINUED));
                                (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
@ -3324,19 +3326,16 @@ static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, i
        assert_return(events == EPOLLIN, -EIO);
        assert(min_priority);

-        /* If there's a signal queued on this priority and SIGCHLD is
-           on this priority too, then make sure to recheck the
-           children we watch. This is because we only ever dequeue
-           the first signal per priority, and if we dequeue one, and
-           SIGCHLD might be enqueued later we wouldn't know, but we
-           might have higher priority children we care about hence we
-           need to check that explicitly. */
+        /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
+         * sure to recheck the children we watch. This is because we only ever dequeue the first signal
+         * per priority, and if we dequeue one, and SIGCHLD might be enqueued later we wouldn't know,
+         * but we might have higher priority children we care about hence we need to check that
+         * explicitly. */

        if (sigismember(&d->sigset, SIGCHLD))
                e->need_process_child = true;

-        /* If there's already an event source pending for this
-         * priority we don't read another */
+        /* If there's already an event source pending for this priority we don't read another */
        if (d->current)
                return 0;

--- a/src/udev/udevd.c
+++ b/src/udev/udevd.c
@ -151,6 +151,7 @@ typedef enum WorkerState {
 typedef struct Worker {
        Manager *manager;
        pid_t pid;
+        sd_event_source *child_event_source;
        sd_device_monitor *monitor;
        WorkerState state;
        Event *event;
@ -160,6 +161,7 @@ typedef struct Worker {
 typedef enum EventResult {
        EVENT_RESULT_NERRNO_MIN       = -ERRNO_MAX,
        EVENT_RESULT_NERRNO_MAX       = -1,
+        EVENT_RESULT_SUCCESS          = 0,
        EVENT_RESULT_EXIT_STATUS_BASE = 0,
        EVENT_RESULT_EXIT_STATUS_MAX  = 255,
        EVENT_RESULT_TRY_AGAIN        = 256, /* when the block device is locked by another process. */
@ -202,9 +204,10 @@ static Worker *worker_free(Worker *worker) {
        if (!worker)
                return NULL;

-        assert(worker->manager);
+        if (worker->manager)
+                hashmap_remove(worker->manager->workers, PID_TO_PTR(worker->pid));

-        hashmap_remove(worker->manager->workers, PID_TO_PTR(worker->pid));
+        sd_event_source_unref(worker->child_event_source);
        sd_device_monitor_unref(worker->monitor);
        event_free(worker->event);

@ -255,6 +258,8 @@ static Manager* manager_free(Manager *manager) {

 DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);

+static int on_sigchld(sd_event_source *s, const siginfo_t *si, void *userdata);
+
 static int worker_new(Worker **ret, Manager *manager, sd_device_monitor *worker_monitor, pid_t pid) {
        _cleanup_(worker_freep) Worker *worker = NULL;
        int r;
@ -272,17 +277,21 @@ static int worker_new(Worker **ret, Manager *manager, sd_device_monitor *worker_
                return -ENOMEM;

        *worker = (Worker) {
-                .manager = manager,
                .monitor = sd_device_monitor_ref(worker_monitor),
                .pid = pid,
        };

+        r = sd_event_add_child(manager->event, &worker->child_event_source, pid, WEXITED, on_sigchld, worker);
+        if (r < 0)
+                return r;
+
        r = hashmap_ensure_put(&manager->workers, &worker_hash_op, PID_TO_PTR(pid), worker);
        if (r < 0)
                return r;

-        *ret = TAKE_PTR(worker);
+        worker->manager = manager;

+        *ret = TAKE_PTR(worker);
        return 0;
 }

@ -363,7 +372,7 @@ static int on_kill_workers_event(sd_event_source *s, uint64_t usec, void *userda
        return 1;
 }

-static void device_broadcast(sd_device_monitor *monitor, sd_device *dev, int result) {
+static void device_broadcast(sd_device_monitor *monitor, sd_device *dev, EventResult result) {
        int r;

        assert(dev);
@ -372,7 +381,7 @@ static void device_broadcast(sd_device_monitor *monitor, sd_device *dev, int res
        if (!monitor)
                return;

-        if (result != 0) {
+        if (result != EVENT_RESULT_SUCCESS) {
                (void) device_add_property(dev, "UDEV_WORKER_FAILED", "1");

                switch (result) {
@ -415,7 +424,7 @@ static void device_broadcast(sd_device_monitor *monitor, sd_device *dev, int res
                                         "Failed to broadcast event to libudev listeners, ignoring: %m");
 }

-static int worker_send_result(Manager *manager, int result) {
+static int worker_send_result(Manager *manager, EventResult result) {
        assert(manager);
        assert(manager->worker_watch[WRITE_END] >= 0);

@ -1200,7 +1209,7 @@ static int on_worker(sd_event_source *s, int fd, uint32_t revents, void *userdat
        assert(manager);

        for (;;) {
-                int result;
+                EventResult result;
                struct iovec iovec = IOVEC_MAKE(&result, sizeof(result));
                CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control;
                struct msghdr msghdr = {
@ -1543,59 +1552,47 @@ static int on_sighup(sd_event_source *s, const struct signalfd_siginfo *si, void
        return 1;
 }

-static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
-        Manager *manager = userdata;
+static int on_sigchld(sd_event_source *s, const siginfo_t *si, void *userdata) {
+        Worker *worker = ASSERT_PTR(userdata);
+        Manager *manager = ASSERT_PTR(worker->manager);
+        sd_device *dev = worker->event ? ASSERT_PTR(worker->event->dev) : NULL;
+        EventResult result;
        int r;

-        assert(manager);
+        assert(si);

-        for (;;) {
-                pid_t pid;
-                int status;
-                Worker *worker;
+        switch (si->si_code) {
+        case CLD_EXITED:
+                if (si->si_status == 0)
+                        log_device_debug(dev, "Worker ["PID_FMT"] exited.", si->si_pid);
+                else
+                        log_device_warning(dev, "Worker ["PID_FMT"] exited with return code %i.",
+                                           si->si_pid, si->si_status);
+                result = EVENT_RESULT_EXIT_STATUS_BASE + si->si_status;
+                break;

-                pid = waitpid(-1, &status, WNOHANG);
-                if (pid <= 0)
-                        break;
+        case CLD_KILLED:
+        case CLD_DUMPED:
+                log_device_warning(dev, "Worker ["PID_FMT"] terminated by signal %i (%s).",
+                                   si->si_pid, si->si_status, signal_to_string(si->si_status));
+                result = EVENT_RESULT_SIGNAL_BASE + si->si_status;
+                break;

-                worker = hashmap_get(manager->workers, PID_TO_PTR(pid));
-                if (!worker) {
-                        log_warning("Worker ["PID_FMT"] is unknown, ignoring", pid);
-                        continue;
-                }
-
-                if (WIFEXITED(status)) {
-                        if (WEXITSTATUS(status) == 0)
-                                log_debug("Worker ["PID_FMT"] exited", pid);
-                        else
-                                log_warning("Worker ["PID_FMT"] exited with return code %i", pid, WEXITSTATUS(status));
-                } else if (WIFSIGNALED(status))
-                        log_warning("Worker ["PID_FMT"] terminated by signal %i (%s)", pid, WTERMSIG(status), signal_to_string(WTERMSIG(status)));
-                else if (WIFSTOPPED(status)) {
-                        log_info("Worker ["PID_FMT"] stopped", pid);
-                        continue;
-                } else if (WIFCONTINUED(status)) {
-                        log_info("Worker ["PID_FMT"] continued", pid);
-                        continue;
-                } else
-                        log_warning("Worker ["PID_FMT"] exit with status 0x%04x", pid, status);
-
-                if ((!WIFEXITED(status) || WEXITSTATUS(status) != 0) && worker->event) {
-                        log_device_error(worker->event->dev, "Worker ["PID_FMT"] failed", pid);
-
-                        /* delete state from disk */
-                        device_delete_db(worker->event->dev);
-                        device_tag_index(worker->event->dev, NULL, false);
-
-                        /* Forward kernel event to libudev listeners */
-                        device_broadcast(manager->monitor, worker->event->dev,
-                                         WIFEXITED(status) ? EVENT_RESULT_EXIT_STATUS_BASE + WEXITSTATUS(status):
-                                         WIFSIGNALED(status) ? EVENT_RESULT_SIGNAL_BASE + WTERMSIG(status) : 0);
-                }
-
-                worker_free(worker);
+        default:
+                assert_not_reached();
        }

+        if (result != EVENT_RESULT_SUCCESS && dev) {
+                /* delete state from disk */
+                device_delete_db(dev);
+                device_tag_index(dev, NULL, false);
+
+                /* Forward kernel event to libudev listeners */
+                device_broadcast(manager->monitor, dev, result);
+        }
+
+        worker_free(worker);
+
        /* we can start new workers, try to schedule events */
        event_queue_start(manager);

@ -2017,10 +2014,6 @@ static int main_loop(Manager *manager) {
        if (r < 0)
                return log_error_errno(r, "Failed to create SIGHUP event source: %m");

-        r = sd_event_add_signal(manager->event, NULL, SIGCHLD, on_sigchld, manager);
-        if (r < 0)
-                return log_error_errno(r, "Failed to create SIGCHLD event source: %m");
-
        r = sd_event_set_watchdog(manager->event, true);
        if (r < 0)
                return log_error_errno(r, "Failed to create watchdog event source: %m");
Author	SHA1	Message	Date
Zbigniew Jędrzejewski-Szmek	87d44b6e1a	Merge pull request #23002 from yuwata/udev-use-child-event udev: use child event source	2022-04-13 08:32:16 +02:00
Yu Watanabe	82e0b63183	udev: use child event source to manage workers	2022-04-13 01:08:42 +09:00
Yu Watanabe	a79cba3326	udev: use EventResult type This also adds EVENT_RESULT_SUCCESS for readability.	2022-04-13 01:00:08 +09:00
Yu Watanabe	fbae50904f	sd-event: make inotify event work after the process is forked	2022-04-13 01:00:08 +09:00
Yu Watanabe	86587c93b0	sd-event: do not kill a child process from another child	2022-04-13 01:00:08 +09:00
Yu Watanabe	01e6af7374	sd-event: do not update signal fd after PID is changed Otherwise, child event source will not work after the process is forked and the event source is unref()ed on the child process.	2022-04-13 01:00:08 +09:00
Yu Watanabe	54988a27b9	sd-event: set pid to event source after all setup processes finished Otherwise, the assertion in source_disconnect() may be triggered,	2022-04-13 01:00:08 +09:00
Yu Watanabe	91c700713f	sd-event: rebreak comments	2022-04-13 01:00:08 +09:00