Fix latch event policy that hid socket events.
authorThomas Munro <[email protected]>
Mon, 20 Jan 2025 02:17:47 +0000 (15:17 +1300)
committerThomas Munro <[email protected]>
Mon, 20 Jan 2025 03:43:29 +0000 (16:43 +1300)
If a WaitEventSetWait() caller asks for multiple events, an already set
latch would previously prevent other events from being reported at the
same time.  Now, we'll also poll the kernel for other events that would
fit in the caller's output buffer with a zero wait time.  This policy
change doesn't affect callers that ask for only one event.

The main caller affected is the postmaster.  If its latch is set
extremely frequently by backends launching workers and workers exiting,
we don't want it to handle only those jobs and ignore incoming client
connections.

Back- to 16 where the postmaster began using the API.  The
fast-return policy changed here is older than that, but doesn't cause
any known problems in earlier releases.

Reported-by: Nathan Bossart <[email protected]>
Reviewed-by: Nathan Bossart <[email protected]>
Discussion: https://postgr.es/m/Z1n5UpAiGDmFcMmd%40nathan

src/backend/storage/ipc/latch.c

index 8db6630517ae93988235344f9041183bc2a07931..4572684f2242e8a142bd9640adfb00b412a85bcb 100644 (file)
@@ -1452,9 +1452,9 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
        int         rc;
 
        /*
-        * Check if the latch is set already. If so, leave the loop
-        * immediately, avoid blocking again. We don't attempt to report any
-        * other events that might also be satisfied.
+        * Check if the latch is set already first.  If so, we either exit
+        * immediately or ask the kernel for further events available right
+        * now without waiting, depending on how many events the caller wants.
         *
         * If someone sets the latch between this and the
         * WaitEventSetWaitBlock() below, the setter will write a byte to the
@@ -1499,7 +1499,16 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
            /* could have been set above */
            set->latch->maybe_sleeping = false;
 
-           break;
+           if (returned_events == nevents)
+               break;          /* output buffer full already */
+
+           /*
+            * Even though we already have an event, we'll poll just once with
+            * zero timeout to see what non-latch events we can fit into the
+            * output buffer at the same time.
+            */
+           cur_timeout = 0;
+           timeout = 0;
        }
 
        /*
@@ -1508,18 +1517,16 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
         * to retry, everything >= 1 is the number of returned events.
         */
        rc = WaitEventSetWaitBlock(set, cur_timeout,
-                                  occurred_events, nevents);
+                                  occurred_events, nevents - returned_events);
 
-       if (set->latch)
-       {
-           Assert(set->latch->maybe_sleeping);
+       if (set->latch &&
+           set->latch->maybe_sleeping)
            set->latch->maybe_sleeping = false;
-       }
 
        if (rc == -1)
            break;              /* timeout occurred */
        else
-           returned_events = rc;
+           returned_events += rc;
 
        /* If we're not done, update cur_timeout for next iteration */
        if (returned_events == 0 && timeout >= 0)
@@ -1607,7 +1614,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
            /* Drain the signalfd. */
            drain();
 
-           if (set->latch && set->latch->is_set)
+           if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;
@@ -1766,7 +1773,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
        if (cur_event->events == WL_LATCH_SET &&
            cur_kqueue_event->filter == EVFILT_SIGNAL)
        {
-           if (set->latch && set->latch->is_set)
+           if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;
@@ -1891,7 +1898,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
            /* There's data in the self-pipe, clear it. */
            drain();
 
-           if (set->latch && set->latch->is_set)
+           if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;
@@ -2107,7 +2114,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
            if (!ResetEvent(set->handles[cur_event->pos + 1]))
                elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
 
-           if (set->latch && set->latch->is_set)
+           if (set->latch && set->latch->maybe_sleeping && set->latch->is_set)
            {
                occurred_events->fd = PGINVALID_SOCKET;
                occurred_events->events = WL_LATCH_SET;