On Tue, Jul 15, 2025 at 09:42:52AM -0700, Linus Torvalds wrote:
> On Tue, 15 Jul 2025 at 05:47, Nam Cao <namcao@xxxxxxxxxxxxx> wrote:
> >
> >  fs/eventpoll.c | 139 +++++++++----------------------------------
> >  1 file changed, 26 insertions(+), 113 deletions(-)
>
> Yeah, this is more like the kind of diffstat I like to see for
> eventpoll. Plus it makes things fundamentally simpler.
>
> It might be worth looking at ep_poll_callback() - the only case that
> had read_lock_irqsave() - and seeing if perhaps some of the tests
> inside the lock might be done optimistically, or delayed to after the
> lock.
>
> For example, the whole wakeup sequence looks like it should be done
> *after* the ep->lock has been released, because it uses its own lock
> (ie the
>
>                 if (sync)
>                         wake_up_sync(&ep->wq);
>                 else
>                         wake_up(&ep->wq);
>
> thing uses the wq lock, and there is nothing that ep->lock protects
> there as far as I can tell.
>
> So I think this has some potential for _simple_ optimizations, but I'm
> not sure how worth it it is.

Actually, ep->lock does protect this part, because ep_poll() touches
ep->wq without holding the waitqueue's lock.
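To make that concrete, the two sides look roughly like this in the tree
the diff below applies to (abridged excerpt for illustration only,
pieced together from the lines the diff touches; "..." marks elided
code, and none of this is part of the change itself):

	/* ep_poll(), before the change (abridged) */
	spin_lock_irq(&ep->lock);
	__set_current_state(TASK_INTERRUPTIBLE);
	...
	eavail = ep_events_available(ep);
	if (!eavail)
		__add_wait_queue_exclusive(&ep->wq, &wait); /* ep->wq modified under ep->lock, not ep->wq.lock */
	spin_unlock_irq(&ep->lock);

	/* ep_poll_callback(), before the change (abridged) */
	spin_lock_irqsave(&ep->lock, flags);
	...
	if (waitqueue_active(&ep->wq)) {	/* barrierless check, serialized by ep->lock against ep_poll() above */
		...
		if (sync)
			wake_up_sync(&ep->wq);
		else
			wake_up(&ep->wq);
	}
	...
	spin_unlock_irqrestore(&ep->lock, flags);

The barrierless waitqueue_active() check is only safe because both
sides run under ep->lock; once the callback drops the lock before
waking, that has to be replaced by an explicit barrier (here, the one
implied by spin_unlock_irqrestore()).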
We could do something like the diff below. But things are not so simple
anymore.

Best regards,
Nam

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a171f7e7dacc..5e6c7da606e7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1252,8 +1252,6 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	unsigned long flags;
 	int ewake = 0;
 
-	spin_lock_irqsave(&ep->lock, flags);
-
 	ep_set_busy_poll_napi_id(epi);
 
 	/*
@@ -1263,7 +1261,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * until the next EPOLL_CTL_MOD will be issued.
 	 */
 	if (!(epi->event.events & ~EP_PRIVATE_BITS))
-		goto out_unlock;
+		goto out;
 
 	/*
 	 * Check the events coming with the callback. At this stage, not
@@ -1272,7 +1270,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * test for "key" != NULL before the event match test.
 	 */
 	if (pollflags && !(pollflags & epi->event.events))
-		goto out_unlock;
+		goto out;
 
 	/*
 	 * If we are transferring events to userspace, we can hold no locks
@@ -1280,6 +1278,8 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
+	spin_lock_irqsave(&ep->lock, flags);
+
 	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
 		if (epi->next == EP_UNACTIVE_PTR) {
 			epi->next = READ_ONCE(ep->ovflist);
@@ -1292,9 +1292,14 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 		ep_pm_stay_awake_rcu(epi);
 	}
 
+	spin_unlock_irqrestore(&ep->lock, flags);
+
+
 	/*
 	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 	 * wait list.
+	 *
+	 * Memory barrier for waitqueue_active() from spin_unlock_irqrestore().
 	 */
 	if (waitqueue_active(&ep->wq)) {
 		if ((epi->event.events & EPOLLEXCLUSIVE) &&
@@ -1321,9 +1326,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
-out_unlock:
-	spin_unlock_irqrestore(&ep->lock, flags);
-
+out:
 	/* We have to call this outside the lock */
 	if (pwake)
 		ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
@@ -2001,13 +2004,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		init_wait(&wait);
 		wait.func = ep_autoremove_wake_function;
 
-		spin_lock_irq(&ep->lock);
-		/*
-		 * Barrierless variant, waitqueue_active() is called under
-		 * the same lock on wakeup ep_poll_callback() side, so it
-		 * is safe to avoid an explicit barrier.
-		 */
-		__set_current_state(TASK_INTERRUPTIBLE);
+		prepare_to_wait_exclusive(&ep->wq, &wait, TASK_INTERRUPTIBLE);
 
 		/*
 		 * Do the final check under the lock. ep_start/done_scan()
@@ -2016,10 +2013,8 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * period of time although events are pending, so lock is
 		 * important.
 		 */
+		spin_lock_irq(&ep->lock);
 		eavail = ep_events_available(ep);
-		if (!eavail)
-			__add_wait_queue_exclusive(&ep->wq, &wait);
-
 		spin_unlock_irq(&ep->lock);
 
 		if (!eavail)
@@ -2036,7 +2031,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		eavail = 1;
 
 		if (!list_empty_careful(&wait.entry)) {
-			spin_lock_irq(&ep->lock);
+			spin_lock_irq(&ep->wq.lock);
 			/*
 			 * If the thread timed out and is not on the wait queue,
 			 * it means that the thread was woken up after its
@@ -2047,7 +2042,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 			if (timed_out)
 				eavail = list_empty(&wait.entry);
 			__remove_wait_queue(&ep->wq, &wait);
-			spin_unlock_irq(&ep->lock);
+			spin_unlock_irq(&ep->wq.lock);
 		}
 	}
 }