1 /*-------------------------------------------------------------------------
2 *
3 * latch.c
4 * Routines for inter-process latches
5 *
6 * The Unix implementation uses the so-called self-pipe trick to overcome the
7 * race condition involved with poll() (or epoll_wait() on linux) and setting
8 * a global flag in the signal handler. When a latch is set and the current
9 * process is waiting for it, the signal handler wakes up the poll() in
10 * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
11 * poll() on all platforms, and even on platforms where it does, a signal that
12 * arrives just before the poll() call does not prevent poll() from entering
13 * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
14 * and causes poll() to return immediately even if the signal arrives before
15 * poll() begins.
16 *
17 * When SetLatch is called from the same process that owns the latch,
18 * SetLatch writes the byte directly to the pipe. If it's owned by another
19 * process, SIGUSR1 is sent and the signal handler in the waiting process
20 * writes the byte to the pipe on behalf of the signaling process.
21 *
22 * The Windows implementation uses Windows events that are inherited by all
23 * postmaster child processes. There's no need for the self-pipe trick there.
24 *
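* To illustrate the idea (a minimal sketch only; the real plumbing lives in
* sendSelfPipeByte() and drainSelfPipe() below), the trick reduces to:
*
*     static void
*     wakeup_handler(SIGNAL_ARGS)
*     {
*         char dummy = 0;
*
*         write(selfpipe_writefd, &dummy, 1);
*     }
*
*     struct pollfd pfd;
*
*     pfd.fd = selfpipe_readfd;
*     pfd.events = POLLIN;
*     poll(&pfd, 1, timeout);
*
* Even if the signal arrives between the latch check and the poll() call,
* the byte is already sitting in the pipe, so poll() returns immediately.
*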
25 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
26 * Portions Copyright (c) 1994, Regents of the University of California
27 *
28 * IDENTIFICATION
29 * src/backend/storage/ipc/latch.c
30 *
31 *-------------------------------------------------------------------------
32 */
33 #include "postgres.h"
34
35 #include <fcntl.h>
36 #include <limits.h>
37 #include <signal.h>
38 #include <unistd.h>
39 #ifdef HAVE_SYS_EPOLL_H
40 #include <sys/epoll.h>
41 #endif
42 #ifdef HAVE_SYS_EVENT_H
43 #include <sys/event.h>
44 #endif
45 #ifdef HAVE_POLL_H
46 #include <poll.h>
47 #endif
48
49 #include "miscadmin.h"
50 #include "pgstat.h"
51 #include "port/atomics.h"
52 #include "portability/instr_time.h"
53 #include "postmaster/postmaster.h"
54 #include "storage/fd.h"
55 #include "storage/ipc.h"
56 #include "storage/latch.h"
57 #include "storage/pmsignal.h"
58 #include "storage/shmem.h"
59
60 /*
61 * Select the fd readiness primitive to use. Normally the "most modern"
62 * primitive supported by the OS will be used, but for testing it can be
63 * useful to manually specify the used primitive. If desired, just add a
64 * define somewhere before this block.
65 */
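/*
 * For example, to force the poll() implementation for testing, one could
 * insert "#define WAIT_USE_POLL" just above this block; any of the four
 * WAIT_USE_* symbols works the same way.
 */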
66 #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
67 defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
68 /* don't overwrite manual choice */
69 #elif defined(HAVE_SYS_EPOLL_H)
70 #define WAIT_USE_EPOLL
71 #elif defined(HAVE_KQUEUE)
72 #define WAIT_USE_KQUEUE
73 #elif defined(HAVE_POLL)
74 #define WAIT_USE_POLL
75 #elif WIN32
76 #define WAIT_USE_WIN32
77 #else
78 #error "no wait set implementation available"
79 #endif
80
81 /* typedef in latch.h */
82 struct WaitEventSet
83 {
84 int nevents; /* number of registered events */
85 int nevents_space; /* maximum number of events in this set */
86
87 /*
88 * Array, of nevents_space length, storing the definition of events this
89 * set is waiting for.
90 */
91 WaitEvent *events;
92
93 /*
94 * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
95 * said latch, and latch_pos the offset in the ->events array. This is
96 * useful because we check the state of the latch before performing
97 * syscalls related to waiting.
98 */
99 Latch *latch;
100 int latch_pos;
101
102 /*
103 * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
104 * is set so that we'll exit immediately if postmaster death is detected,
105 * instead of returning.
106 */
107 bool exit_on_postmaster_death;
108
109 #if defined(WAIT_USE_EPOLL)
110 int epoll_fd;
111 /* epoll_wait returns events in a user-provided array; allocate it once */
112 struct epoll_event *epoll_ret_events;
113 #elif defined(WAIT_USE_KQUEUE)
114 int kqueue_fd;
115 /* kevent returns events in a user-provided array; allocate it once */
116 struct kevent *kqueue_ret_events;
117 bool report_postmaster_not_running;
118 #elif defined(WAIT_USE_POLL)
119 /* poll expects the list of events to wait for on every call; prepare it once */
120 struct pollfd *pollfds;
121 #elif defined(WAIT_USE_WIN32)
122
123 /*
124 * Array of windows events. The first element always contains
125 * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
126 * event->pos + 1).
127 */
128 HANDLE *handles;
129 #endif
130 };
131
132 #ifndef WIN32
133 /* Are we currently in WaitLatch? The signal handler would like to know. */
134 static volatile sig_atomic_t waiting = false;
135
136 /* Read and write ends of the self-pipe */
137 static int selfpipe_readfd = -1;
138 static int selfpipe_writefd = -1;
139
140 /* Process owning the self-pipe --- needed for checking purposes */
141 static int selfpipe_owner_pid = 0;
142
143 /* Private function prototypes */
144 static void sendSelfPipeByte(void);
145 static void drainSelfPipe(void);
146 #endif /* WIN32 */
147
148 #if defined(WAIT_USE_EPOLL)
149 static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
150 #elif defined(WAIT_USE_KQUEUE)
151 static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
152 #elif defined(WAIT_USE_POLL)
153 static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
154 #elif defined(WAIT_USE_WIN32)
155 static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
156 #endif
157
158 static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
159 WaitEvent *occurred_events, int nevents);
160
161 /*
162 * Initialize the process-local latch infrastructure.
163 *
164 * This must be called once during startup of any process that can wait on
165 * latches, before it issues any InitLatch() or OwnLatch() calls.
166 */
167 void
168 InitializeLatchSupport(void)
169 {
170 #ifndef WIN32
171 int pipefd[2];
172
173 if (IsUnderPostmaster)
174 {
175 /*
176 * We might have inherited connections to a self-pipe created by the
177 * postmaster. It's critical that child processes create their own
178 * self-pipes, of course, and we really want them to close the
179 * inherited FDs for safety's sake.
180 */
181 if (selfpipe_owner_pid != 0)
182 {
183 /* Assert we go through here but once in a child process */
184 Assert(selfpipe_owner_pid != MyProcPid);
185 /* Release postmaster's pipe FDs; ignore any error */
186 (void) close(selfpipe_readfd);
187 (void) close(selfpipe_writefd);
188 /* Clean up, just for safety's sake; we'll set these below */
189 selfpipe_readfd = selfpipe_writefd = -1;
190 selfpipe_owner_pid = 0;
191 /* Keep fd.c's accounting straight */
192 ReleaseExternalFD();
193 ReleaseExternalFD();
194 }
195 else
196 {
197 /*
198 * Postmaster didn't create a self-pipe ... or else we're in an
199 * EXEC_BACKEND build, in which case it doesn't matter since the
200 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
201 * fd.c won't have state to clean up, either.
202 */
203 Assert(selfpipe_readfd == -1);
204 }
205 }
206 else
207 {
208 /* In postmaster or standalone backend, assert we do this but once */
209 Assert(selfpipe_readfd == -1);
210 Assert(selfpipe_owner_pid == 0);
211 }
212
213 /*
214 * Set up the self-pipe that allows a signal handler to wake up the
215 * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
216 * that SetLatch won't block if the event has already been set many times,
217 * filling the kernel buffer. Make the read-end non-blocking too, so that
218 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
219 * Also, make both FDs close-on-exec, since we surely do not want any
220 * child processes messing with them.
221 */
222 if (pipe(pipefd) < 0)
223 elog(FATAL, "pipe() failed: %m");
224 if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
225 elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
226 if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
227 elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
228 if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
229 elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
230 if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
231 elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
232
233 selfpipe_readfd = pipefd[0];
234 selfpipe_writefd = pipefd[1];
235 selfpipe_owner_pid = MyProcPid;
236
237 /* Tell fd.c about these two long-lived FDs */
238 ReserveExternalFD();
239 ReserveExternalFD();
240 #else
241 /* currently, nothing to do here for Windows */
242 #endif
243 }
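/*
 * As an illustration of the required ordering (a sketch only; actual backend
 * startup does this in InitPostmasterChild() in miscinit.c):
 *
 *     InitializeLatchSupport();
 *     InitLatch(&my_private_latch);       -- or OwnLatch() on a shared latch
 */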
244
245 /*
246 * Initialize a process-local latch.
247 */
248 void
249 InitLatch(Latch *latch)
250 {
251 latch->is_set = false;
252 latch->owner_pid = MyProcPid;
253 latch->is_shared = false;
254
255 #ifndef WIN32
256 /* Assert InitializeLatchSupport has been called in this process */
257 Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
258 #else
259 latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
260 if (latch->event == NULL)
261 elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
262 #endif /* WIN32 */
263 }
264
265 /*
266 * Initialize a shared latch that can be set from other processes. The latch
267 * is initially owned by no-one; use OwnLatch to associate it with the
268 * current process.
269 *
270 * InitSharedLatch needs to be called in postmaster before forking child
271 * processes, usually right after allocating the shared memory block
272 * containing the latch with ShmemInitStruct. (The Unix implementation
273 * doesn't actually require that, but the Windows one does.) Because of
274 * this restriction, we have no concurrency issues to worry about here.
275 *
276 * Note that other handles created in this module are never marked as
277 * inheritable. Thus we do not need to worry about cleaning up child
278 * process references to postmaster-private latches or WaitEventSets.
279 */
280 void
281 InitSharedLatch(Latch *latch)
282 {
283 #ifdef WIN32
284 SECURITY_ATTRIBUTES sa;
285
286 /*
287 * Set up security attributes to specify that the events are inherited.
288 */
289 ZeroMemory(&sa, sizeof(sa));
290 sa.nLength = sizeof(sa);
291 sa.bInheritHandle = TRUE;
292
293 latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
294 if (latch->event == NULL)
295 elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
296 #endif
297
298 latch->is_set = false;
299 latch->owner_pid = 0;
300 latch->is_shared = true;
301 }
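/*
 * A sketch of the shared-latch lifecycle, assuming a hypothetical
 * MySharedState struct placed in shared memory:
 *
 * In the postmaster, before forking children:
 *
 *     MySharedState *st = ShmemInitStruct("my area", sizeof(*st), &found);
 *
 *     InitSharedLatch(&st->latch);
 *
 * In the child process that will wait on it:
 *
 *     InitializeLatchSupport();
 *     OwnLatch(&st->latch);
 *
 * Any other process can then wake the owner with SetLatch(&st->latch).
 */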
302
303 /*
304 * Associate a shared latch with the current process, allowing it to
305 * wait on the latch.
306 *
307 * Although there is a sanity check for latch-already-owned, we don't do
308 * any sort of locking here, meaning that we could fail to detect the error
309 * if two processes try to own the same latch at about the same time. If
310 * there is any risk of that, caller must provide an interlock to prevent it.
311 *
312 * In any process that calls OwnLatch(), make sure that
313 * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
314 * as shared latches use SIGUSR1 for inter-process communication.
315 */
316 void
317 OwnLatch(Latch *latch)
318 {
319 /* Sanity checks */
320 Assert(latch->is_shared);
321
322 #ifndef WIN32
323 /* Assert InitializeLatchSupport has been called in this process */
324 Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
325 #endif
326
327 if (latch->owner_pid != 0)
328 elog(ERROR, "latch already owned");
329
330 latch->owner_pid = MyProcPid;
331 }
332
333 /*
334 * Disown a shared latch currently owned by the current process.
335 */
336 void
337 DisownLatch(Latch *latch)
338 {
339 Assert(latch->is_shared);
340 Assert(latch->owner_pid == MyProcPid);
341
342 latch->owner_pid = 0;
343 }
344
345 /*
346 * Wait for a given latch to be set, or for postmaster death, or until timeout
347 * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
348 * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
349 * function returns immediately.
350 *
351 * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
352 * is given. Although it is declared as "long", we don't actually support
353 * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
354 * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
355 *
356 * The latch must be owned by the current process, ie. it must be a
357 * process-local latch initialized with InitLatch, or a shared latch
358 * associated with the current process by calling OwnLatch.
359 *
360 * Returns bit mask indicating which condition(s) caused the wake-up. Note
361 * that if multiple wake-up conditions are true, there is no guarantee that
362 * we return all of them in one call, but we will return at least one.
363 */
364 int
365 WaitLatch(Latch *latch, int wakeEvents, long timeout,
366 uint32 wait_event_info)
367 {
368 return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
369 wait_event_info);
370 }
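/*
 * A minimal sketch of the conventional wait loop, waiting at the bottom as
 * the comments in SetLatch() require. WorkAvailable()/DoWork() are
 * hypothetical, and PG_WAIT_EXTENSION stands in for a suitable
 * wait_event_info:
 *
 *     for (;;)
 *     {
 *         ResetLatch(MyLatch);
 *
 *         while (WorkAvailable())
 *             DoWork();
 *
 *         (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
 *                          -1, PG_WAIT_EXTENSION);
 *         CHECK_FOR_INTERRUPTS();
 *     }
 */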
371
372 /*
373 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
374 * conditions.
375 *
376 * When waiting on a socket, EOF and error conditions always cause the socket
377 * to be reported as readable/writable/connected, so that the caller can deal
378 * with the condition.
379 *
380 * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
381 * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
382 * return value if the postmaster dies. The latter is useful for rare cases
383 * where some behavior other than immediate exit is needed.
384 *
385 * NB: These days this is just a wrapper around the WaitEventSet API. When
386 * using a latch very frequently, consider creating a longer living
387 * WaitEventSet instead; that's more efficient.
388 */
389 int
390 WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
391 long timeout, uint32 wait_event_info)
392 {
393 int ret = 0;
394 int rc;
395 WaitEvent event;
396 WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
397
398 if (wakeEvents & WL_TIMEOUT)
399 Assert(timeout >= 0);
400 else
401 timeout = -1;
402
403 if (wakeEvents & WL_LATCH_SET)
404 AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
405 latch, NULL);
406
407 /* Postmaster-managed callers must handle postmaster death somehow. */
408 Assert(!IsUnderPostmaster ||
409 (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
410 (wakeEvents & WL_POSTMASTER_DEATH));
411
412 if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
413 AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
414 NULL, NULL);
415
416 if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
417 AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
418 NULL, NULL);
419
420 if (wakeEvents & WL_SOCKET_MASK)
421 {
422 int ev;
423
424 ev = wakeEvents & WL_SOCKET_MASK;
425 AddWaitEventToSet(set, ev, sock, NULL, NULL);
426 }
427
428 rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
429
430 if (rc == 0)
431 ret |= WL_TIMEOUT;
432 else
433 {
434 ret |= event.events & (WL_LATCH_SET |
435 WL_POSTMASTER_DEATH |
436 WL_SOCKET_MASK);
437 }
438
439 FreeWaitEventSet(set);
440
441 return ret;
442 }
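/*
 * For instance, to wait up to 10 seconds for either the latch or incoming
 * data on a socket (sock being some connected pgsocket; sketch only):
 *
 *     int rc = WaitLatchOrSocket(MyLatch,
 *                                WL_LATCH_SET | WL_SOCKET_READABLE |
 *                                WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
 *                                sock, 10000L, PG_WAIT_EXTENSION);
 *
 *     if (rc & WL_TIMEOUT)
 *         ... give up or retry ...
 *     if (rc & WL_SOCKET_READABLE)
 *         ... data (or EOF/error) is available on sock ...
 */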
443
444 /*
445 * Sets a latch and wakes up anyone waiting on it.
446 *
447 * This is cheap if the latch is already set, otherwise not so much.
448 *
449 * NB: when calling this in a signal handler, be sure to save and restore
450 * errno around it. (That's standard practice in most signal handlers, of
451 * course, but we used to omit it in handlers that only set a flag.)
452 *
453 * NB: this function is called from critical sections and signal handlers so
454 * throwing an error is not a good idea.
455 */
456 void
457 SetLatch(Latch *latch)
458 {
459 #ifndef WIN32
460 pid_t owner_pid;
461 #else
462 HANDLE handle;
463 #endif
464
465 /*
466 * The memory barrier has to be placed here to ensure that any flag
467 * variables possibly changed by this process have been flushed to main
468 * memory, before we check/set is_set.
469 */
470 pg_memory_barrier();
471
472 /* Quick exit if already set */
473 if (latch->is_set)
474 return;
475
476 latch->is_set = true;
477
478 #ifndef WIN32
479
480 /*
481 * See if anyone's waiting for the latch. It can be the current process if
482 * we're in a signal handler. We use the self-pipe to wake up the
483 * poll()/epoll_wait() in that case. If it's another process, send a
484 * signal.
485 *
486 * Fetch owner_pid only once, in case the latch is concurrently getting
487 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
488 * guaranteed to be true! In practice, the effective range of pid_t fits
489 * in a 32 bit integer, and so should be atomic. In the worst case, we
490 * might end up signaling the wrong process. Even then, you're very
491 * unlucky if a process with that bogus pid exists and belongs to
492 * Postgres; and PG database processes should handle excess SIGUSR1
493 * interrupts without a problem anyhow.
494 *
495 * Another sort of race condition that's possible here is for a new
496 * process to own the latch immediately after we look, so we don't signal
497 * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
498 * the standard coding convention of waiting at the bottom of their loops,
499 * not the top, so that they'll correctly process latch-setting events
500 * that happen before they enter the loop.
501 */
502 owner_pid = latch->owner_pid;
503 if (owner_pid == 0)
504 return;
505 else if (owner_pid == MyProcPid)
506 {
507 if (waiting)
508 sendSelfPipeByte();
509 }
510 else
511 kill(owner_pid, SIGUSR1);
512 #else
513
514 /*
515 * See if anyone's waiting for the latch. It can be the current process if
516 * we're in a signal handler.
517 *
518 * Use a local variable here just in case somebody changes the event field
519 * concurrently (which really should not happen).
520 */
521 handle = latch->event;
522 if (handle)
523 {
524 SetEvent(handle);
525
526 /*
527 * Note that we silently ignore any errors. We might be in a signal
528 * handler or other critical path where it's not safe to call elog().
529 */
530 }
531 #endif
532
533 }
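/*
 * Per the NB above, a signal handler that sets a latch should save and
 * restore errno around the call; a sketch (handle_sigusr1 is hypothetical,
 * but this mirrors what procsignal_sigusr1_handler() does):
 *
 *     static void
 *     handle_sigusr1(SIGNAL_ARGS)
 *     {
 *         int save_errno = errno;
 *
 *         SetLatch(MyLatch);
 *
 *         errno = save_errno;
 *     }
 */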
534
535 /*
536 * Clear the latch. Calling WaitLatch after this will sleep, unless
537 * the latch is set again before the WaitLatch call.
538 */
539 void
540 ResetLatch(Latch *latch)
541 {
542 /* Only the owner should reset the latch */
543 Assert(latch->owner_pid == MyProcPid);
544
545 latch->is_set = false;
546
547 /*
548 * Ensure that the write to is_set gets flushed to main memory before we
549 * examine any flag variables. Otherwise a concurrent SetLatch might
550 * falsely conclude that it needn't signal us, even though we have missed
551 * seeing some flag updates that SetLatch was supposed to inform us of.
552 */
553 pg_memory_barrier();
554 }
555
556 /*
557 * Create a WaitEventSet with space for nevents different events to wait for.
558 *
559 * These events can then be efficiently waited upon together, using
560 * WaitEventSetWait().
561 */
562 WaitEventSet *
563 CreateWaitEventSet(MemoryContext context, int nevents)
564 {
565 WaitEventSet *set;
566 char *data;
567 Size sz = 0;
568
569 /*
570 * Use MAXALIGN size/alignment to guarantee that later uses of memory are
571 * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
572 * platforms, but earlier allocations like WaitEventSet and WaitEvent
573 * might not be sized to guarantee that when purely using sizeof().
574 */
575 sz += MAXALIGN(sizeof(WaitEventSet));
576 sz += MAXALIGN(sizeof(WaitEvent) * nevents);
577
578 #if defined(WAIT_USE_EPOLL)
579 sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
580 #elif defined(WAIT_USE_KQUEUE)
581 sz += MAXALIGN(sizeof(struct kevent) * nevents);
582 #elif defined(WAIT_USE_POLL)
583 sz += MAXALIGN(sizeof(struct pollfd) * nevents);
584 #elif defined(WAIT_USE_WIN32)
585 /* need space for the pgwin32_signal_event */
586 sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
587 #endif
588
589 data = (char *) MemoryContextAllocZero(context, sz);
590
591 set = (WaitEventSet *) data;
592 data += MAXALIGN(sizeof(WaitEventSet));
593
594 set->events = (WaitEvent *) data;
595 data += MAXALIGN(sizeof(WaitEvent) * nevents);
596
597 #if defined(WAIT_USE_EPOLL)
598 set->epoll_ret_events = (struct epoll_event *) data;
599 data += MAXALIGN(sizeof(struct epoll_event) * nevents);
600 #elif defined(WAIT_USE_KQUEUE)
601 set->kqueue_ret_events = (struct kevent *) data;
602 data += MAXALIGN(sizeof(struct kevent) * nevents);
603 #elif defined(WAIT_USE_POLL)
604 set->pollfds = (struct pollfd *) data;
605 data += MAXALIGN(sizeof(struct pollfd) * nevents);
606 #elif defined(WAIT_USE_WIN32)
607 set->handles = (HANDLE) data;
608 data += MAXALIGN(sizeof(HANDLE) * nevents);
609 #endif
610
611 set->latch = NULL;
612 set->nevents_space = nevents;
613 set->exit_on_postmaster_death = false;
614
615 #if defined(WAIT_USE_EPOLL)
616 if (!AcquireExternalFD())
617 {
618 /* treat this as though epoll_create1 itself returned EMFILE */
errno = EMFILE;
619 elog(ERROR, "epoll_create1 failed: %m");
620 }
621 #ifdef EPOLL_CLOEXEC
622 set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
623 if (set->epoll_fd < 0)
624 {
625 ReleaseExternalFD();
626 elog(ERROR, "epoll_create1 failed: %m");
627 }
628 #else
629 /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
630 set->epoll_fd = epoll_create(nevents);
631 if (set->epoll_fd < 0)
632 {
633 ReleaseExternalFD();
634 elog(ERROR, "epoll_create failed: %m");
635 }
636 if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
637 {
638 int save_errno = errno;
639
640 close(set->epoll_fd);
641 ReleaseExternalFD();
642 errno = save_errno;
643 elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
644 }
645 #endif /* EPOLL_CLOEXEC */
646 #elif defined(WAIT_USE_KQUEUE)
647 if (!AcquireExternalFD())
648 {
649 /* treat this as though kqueue itself returned EMFILE */
errno = EMFILE;
650 elog(ERROR, "kqueue failed: %m");
651 }
652 set->kqueue_fd = kqueue();
653 if (set->kqueue_fd < 0)
654 {
655 ReleaseExternalFD();
656 elog(ERROR, "kqueue failed: %m");
657 }
658 if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
659 {
660 int save_errno = errno;
661
662 close(set->kqueue_fd);
663 ReleaseExternalFD();
664 errno = save_errno;
665 elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
666 }
667 set->report_postmaster_not_running = false;
668 #elif defined(WAIT_USE_WIN32)
669
670 /*
671 * To handle signals while waiting, we need to add a win32 specific event.
672 * We accounted for the additional event at the top of this routine. See
673 * port/win32/signal.c for more details.
674 *
675 * Note: pgwin32_signal_event should be first to ensure that it will be
676 * reported when multiple events are set. We want to guarantee that
677 * pending signals are serviced.
678 */
679 set->handles[0] = pgwin32_signal_event;
680 StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
681 #endif
682
683 return set;
684 }
685
686 /*
687 * Free a previously created WaitEventSet.
688 *
689 * Note: preferably, this shouldn't have to free any resources that could be
690 * inherited across an exec(). If it did, we'd likely leak those resources in
691 * many scenarios. For the epoll case, we ensure that by setting FD_CLOEXEC
692 * when the FD is created. For the Windows case, we assume that the handles
693 * involved are non-inheritable.
694 */
695 void
696 FreeWaitEventSet(WaitEventSet *set)
697 {
698 #if defined(WAIT_USE_EPOLL)
699 close(set->epoll_fd);
700 ReleaseExternalFD();
701 #elif defined(WAIT_USE_KQUEUE)
702 close(set->kqueue_fd);
703 ReleaseExternalFD();
704 #elif defined(WAIT_USE_WIN32)
705 WaitEvent *cur_event;
706
707 for (cur_event = set->events;
708 cur_event < (set->events + set->nevents);
709 cur_event++)
710 {
711 if (cur_event->events & WL_LATCH_SET)
712 {
713 /* uses the latch's HANDLE */
714 }
715 else if (cur_event->events & WL_POSTMASTER_DEATH)
716 {
717 /* uses PostmasterHandle */
718 }
719 else
720 {
721 /* Clean up the event object we created for the socket */
722 WSAEventSelect(cur_event->fd, NULL, 0);
723 WSACloseEvent(set->handles[cur_event->pos + 1]);
724 }
725 }
726 #endif
727
728 pfree(set);
729 }
730
731 /* ---
732 * Add an event to the set. Possible events are:
733 * - WL_LATCH_SET: Wait for the latch to be set
734 * - WL_POSTMASTER_DEATH: Wait for postmaster to die
735 * - WL_SOCKET_READABLE: Wait for socket to become readable,
736 * can be combined in one event with other WL_SOCKET_* events
737 * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
738 * can be combined with other WL_SOCKET_* events
739 * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
740 * can be combined with other WL_SOCKET_* events (on non-Windows
741 * platforms, this is the same as WL_SOCKET_WRITEABLE)
742 * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
743 *
744 * Returns the offset in WaitEventSet->events (starting from 0), which can be
745 * used to modify previously added wait events using ModifyWaitEvent().
746 *
747 * In the WL_LATCH_SET case the latch must be owned by the current process,
748 * i.e. it must be a process-local latch initialized with InitLatch, or a
749 * shared latch associated with the current process by calling OwnLatch.
750 *
751 * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
752 * conditions cause the socket to be reported as readable/writable/connected,
753 * so that the caller can deal with the condition.
754 *
755 * The user_data pointer specified here will be set for the events returned
756 * by WaitEventSetWait(), allowing the caller to easily associate
757 * additional data with events.
758 */
759 int
760 AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
761 void *user_data)
762 {
763 WaitEvent *event;
764
765 /* not enough space */
766 Assert(set->nevents < set->nevents_space);
767
768 if (events == WL_EXIT_ON_PM_DEATH)
769 {
770 events = WL_POSTMASTER_DEATH;
771 set->exit_on_postmaster_death = true;
772 }
773
774 if (latch)
775 {
776 if (latch->owner_pid != MyProcPid)
777 elog(ERROR, "cannot wait on a latch owned by another process");
778 if (set->latch)
779 elog(ERROR, "cannot wait on more than one latch");
780 if ((events & WL_LATCH_SET) != WL_LATCH_SET)
781 elog(ERROR, "latch events only support being set");
782 }
783 else
784 {
785 if (events & WL_LATCH_SET)
786 elog(ERROR, "cannot wait on latch without a specified latch");
787 }
788
789 /* waiting for socket readiness without a socket indicates a bug */
790 if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
791 elog(ERROR, "cannot wait on socket event without a socket");
792
793 event = &set->events[set->nevents];
794 event->pos = set->nevents++;
795 event->fd = fd;
796 event->events = events;
797 event->user_data = user_data;
798 #ifdef WIN32
799 event->reset = false;
800 #endif
801
802 if (events == WL_LATCH_SET)
803 {
804 set->latch = latch;
805 set->latch_pos = event->pos;
806 #ifndef WIN32
807 event->fd = selfpipe_readfd;
808 #endif
809 }
810 else if (events == WL_POSTMASTER_DEATH)
811 {
812 #ifndef WIN32
813 event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
814 #endif
815 }
816
817 /* perform wait primitive specific initialization, if needed */
818 #if defined(WAIT_USE_EPOLL)
819 WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
820 #elif defined(WAIT_USE_KQUEUE)
821 WaitEventAdjustKqueue(set, event, 0);
822 #elif defined(WAIT_USE_POLL)
823 WaitEventAdjustPoll(set, event);
824 #elif defined(WAIT_USE_WIN32)
825 WaitEventAdjustWin32(set, event);
826 #endif
827
828 return event->pos;
829 }
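/*
 * Tying the API together, a sketch of the longer-lived wait set recommended
 * in the WaitLatchOrSocket() comments (sock and HandleSocketData() are
 * hypothetical):
 *
 *     WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 3);
 *     WaitEvent event;
 *
 *     AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
 *     AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 *                       NULL, NULL);
 *     AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);
 *
 *     for (;;)
 *     {
 *         ResetLatch(MyLatch);
 *         ... deal with any latch-signaled work here ...
 *         if (WaitEventSetWait(set, -1, &event, 1, PG_WAIT_EXTENSION) == 1 &&
 *             (event.events & WL_SOCKET_READABLE))
 *             HandleSocketData(event.fd);
 *     }
 */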
830
831 /*
832 * Change the event mask and, in the WL_LATCH_SET case, the latch associated
833 * with the WaitEvent.
834 *
835 * 'pos' is the id returned by AddWaitEventToSet.
836 */
837 void
838 ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
839 {
840 WaitEvent *event;
841 #if defined(WAIT_USE_KQUEUE)
842 int old_events;
843 #endif
844
845 Assert(pos < set->nevents);
846
847 event = &set->events[pos];
848 #if defined(WAIT_USE_KQUEUE)
849 old_events = event->events;
850 #endif
851
852 /*
853 * If neither the event mask nor the associated latch changes, return
854 * early. That's an important optimization for some sockets, where
855 * ModifyWaitEvent is frequently used to switch from waiting for reads to
856 * waiting on writes.
857 */
858 if (events == event->events &&
859 (!(event->events & WL_LATCH_SET) || set->latch == latch))
860 return;
861
862 if (event->events & WL_LATCH_SET &&
863 events != event->events)
864 {
865 /* we could allow disabling latch events for a while */
866 elog(ERROR, "cannot modify latch event");
867 }
868
869 if (event->events & WL_POSTMASTER_DEATH)
870 {
871 elog(ERROR, "cannot modify postmaster death event");
872 }
873
874 /* FIXME: validate event mask */
875 event->events = events;
876
877 if (events == WL_LATCH_SET)
878 {
879 set->latch = latch;
880 }
881
882 #if defined(WAIT_USE_EPOLL)
883 WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
884 #elif defined(WAIT_USE_KQUEUE)
885 WaitEventAdjustKqueue(set, event, old_events);
886 #elif defined(WAIT_USE_POLL)
887 WaitEventAdjustPoll(set, event);
888 #elif defined(WAIT_USE_WIN32)
889 WaitEventAdjustWin32(set, event);
890 #endif
891 }
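/*
 * For example, a caller that alternates between sending and receiving on a
 * socket might flip the event's direction cheaply, 'pos' being the value
 * AddWaitEventToSet() returned for that socket:
 *
 *     ModifyWaitEvent(set, pos, WL_SOCKET_WRITEABLE, NULL);
 *     ... flush output ...
 *     ModifyWaitEvent(set, pos, WL_SOCKET_READABLE, NULL);
 */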
892
893 #if defined(WAIT_USE_EPOLL)
894 /*
895 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
896 */
897 static void
898 WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
899 {
900 struct epoll_event epoll_ev;
901 int rc;
902
903 /* pointer to our event, returned by epoll_wait */
904 epoll_ev.data.ptr = event;
905 /* always wait for errors */
906 epoll_ev.events = EPOLLERR | EPOLLHUP;
907
908 /* prepare the epoll_event mask */
909 if (event->events == WL_LATCH_SET)
910 {
911 Assert(set->latch != NULL);
912 epoll_ev.events |= EPOLLIN;
913 }
914 else if (event->events == WL_POSTMASTER_DEATH)
915 {
916 epoll_ev.events |= EPOLLIN;
917 }
918 else
919 {
920 Assert(event->fd != PGINVALID_SOCKET);
921 Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
922
923 if (event->events & WL_SOCKET_READABLE)
924 epoll_ev.events |= EPOLLIN;
925 if (event->events & WL_SOCKET_WRITEABLE)
926 epoll_ev.events |= EPOLLOUT;
927 }
928
929 /*
930 * Even though unused, we also pass epoll_ev as the data argument if
931 * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
932 * requiring that, and actually it makes the code simpler...
933 */
934 rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
935
936 if (rc < 0)
937 ereport(ERROR,
938 (errcode_for_socket_access(),
939 /* translator: %s is a syscall name, such as "poll()" */
940 errmsg("%s failed: %m",
941 "epoll_ctl()")));
942 }
943 #endif
944
945 #if defined(WAIT_USE_POLL)
946 static void
947 WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
948 {
949 struct pollfd *pollfd = &set->pollfds[event->pos];
950
951 pollfd->revents = 0;
952 pollfd->fd = event->fd;
953
954 /* prepare pollfd entry once */
955 if (event->events == WL_LATCH_SET)
956 {
957 Assert(set->latch != NULL);
958 pollfd->events = POLLIN;
959 }
960 else if (event->events == WL_POSTMASTER_DEATH)
961 {
962 pollfd->events = POLLIN;
963 }
964 else
965 {
966 Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
967 pollfd->events = 0;
968 if (event->events & WL_SOCKET_READABLE)
969 pollfd->events |= POLLIN;
970 if (event->events & WL_SOCKET_WRITEABLE)
971 pollfd->events |= POLLOUT;
972 }
973
974 Assert(event->fd != PGINVALID_SOCKET);
975 }
976 #endif
977
978 #if defined(WAIT_USE_KQUEUE)
979
980 /*
981 * On most BSD family systems, the udata member of struct kevent is of type
982 * void *, so we could directly convert to/from WaitEvent *. Unfortunately,
983 * NetBSD has it as intptr_t, so here we wallpaper over that difference with
984 * an lvalue cast.
985 */
986 #define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
987
988 static inline void
989 WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
990 WaitEvent *event)
991 {
992 k_ev->ident = event->fd;
993 k_ev->filter = filter;
994 k_ev->flags = action;
995 k_ev->fflags = 0;
996 k_ev->data = 0;
997 AccessWaitEvent(k_ev) = event;
998 }
999
1000 static inline void
1001 WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
1002 {
1003 /* For now postmaster death can only be added, not removed. */
1004 k_ev->ident = PostmasterPid;
1005 k_ev->filter = EVFILT_PROC;
1006 k_ev->flags = EV_ADD;
1007 k_ev->fflags = NOTE_EXIT;
1008 k_ev->data = 0;
1009 AccessWaitEvent(k_ev) = event;
1010 }
1011
1012 /*
1013 * old_events is the previous event mask, used to compute what has changed.
1014 */
1015 static void
1016 WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
1017 {
1018 int rc;
1019 struct kevent k_ev[2];
1020 int count = 0;
1021 bool new_filt_read = false;
1022 bool old_filt_read = false;
1023 bool new_filt_write = false;
1024 bool old_filt_write = false;
1025
1026 if (old_events == event->events)
1027 return;
1028
1029 Assert(event->events != WL_LATCH_SET || set->latch != NULL);
1030 Assert(event->events == WL_LATCH_SET ||
1031 event->events == WL_POSTMASTER_DEATH ||
1032 (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
1033
1034 if (event->events == WL_POSTMASTER_DEATH)
1035 {
1036 /*
1037 * Unlike all the other implementations, we detect postmaster death
1038 * using process notification instead of waiting on the postmaster
1039 * alive pipe.
1040 */
1041 WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
1042 }
1043 else
1044 {
1045 /*
1046 * We need to compute the adds and deletes required to get from the
1047 * old event mask to the new event mask, since kevent treats readable
1048 * and writable as separate events.
1049 */
1050 if (old_events == WL_LATCH_SET ||
1051 (old_events & WL_SOCKET_READABLE))
1052 old_filt_read = true;
1053 if (event->events == WL_LATCH_SET ||
1054 (event->events & WL_SOCKET_READABLE))
1055 new_filt_read = true;
1056 if (old_events & WL_SOCKET_WRITEABLE)
1057 old_filt_write = true;
1058 if (event->events & WL_SOCKET_WRITEABLE)
1059 new_filt_write = true;
1060 if (old_filt_read && !new_filt_read)
1061 WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
1062 event);
1063 else if (!old_filt_read && new_filt_read)
1064 WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
1065 event);
1066 if (old_filt_write && !new_filt_write)
1067 WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
1068 event);
1069 else if (!old_filt_write && new_filt_write)
1070 WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
1071 event);
1072 }
1073
1074 Assert(count > 0);
1075 Assert(count <= 2);
1076
1077 rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
1078
1079 /*
1080 * When adding the postmaster's pid, we have to consider that it might
1081 * already have exited and perhaps even been replaced by another process
1082 * with the same pid. If so, we have to defer reporting this as an event
1083 * until the next call to WaitEventSetWaitBlock().
1084 */
1085
1086 if (rc < 0)
1087 {
1088 if (event->events == WL_POSTMASTER_DEATH &&
1089 (errno == ESRCH || errno == EACCES))
1090 set->report_postmaster_not_running = true;
1091 else
1092 ereport(ERROR,
1093 (errcode_for_socket_access(),
1094 /* translator: %s is a syscall name, such as "poll()" */
1095 errmsg("%s failed: %m",
1096 "kevent()")));
1097 }
1098 else if (event->events == WL_POSTMASTER_DEATH &&
1099 PostmasterPid != getppid() &&
1100 !PostmasterIsAlive())
1101 {
1102 /*
1103 * The extra PostmasterIsAliveInternal() check prevents false alarms
1104 * on systems that give a different value for getppid() while being
1105 * traced by a debugger.
1106 */
1107 set->report_postmaster_not_running = true;
1108 }
1109 }
1110
1111 #endif
1112
1113 #if defined(WAIT_USE_WIN32)
1114 static void
1115 WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
1116 {
1117 HANDLE *handle = &set->handles[event->pos + 1];
1118
1119 if (event->events == WL_LATCH_SET)
1120 {
1121 Assert(set->latch != NULL);
1122 *handle = set->latch->event;
1123 }
1124 else if (event->events == WL_POSTMASTER_DEATH)
1125 {
1126 *handle = PostmasterHandle;
1127 }
1128 else
1129 {
1130 int flags = FD_CLOSE; /* always check for errors/EOF */
1131
1132 if (event->events & WL_SOCKET_READABLE)
1133 flags |= FD_READ;
1134 if (event->events & WL_SOCKET_WRITEABLE)
1135 flags |= FD_WRITE;
1136 if (event->events & WL_SOCKET_CONNECTED)
1137 flags |= FD_CONNECT;
1138
1139 if (*handle == WSA_INVALID_EVENT)
1140 {
1141 *handle = WSACreateEvent();
1142 if (*handle == WSA_INVALID_EVENT)
1143 elog(ERROR, "failed to create event for socket: error code %u",
1144 WSAGetLastError());
1145 }
1146 if (WSAEventSelect(event->fd, *handle, flags) != 0)
1147 elog(ERROR, "failed to set up event for socket: error code %u",
1148 WSAGetLastError());
1149
1150 Assert(event->fd != PGINVALID_SOCKET);
1151 }
1152 }
1153 #endif
1154
1155 /*
1156 * Wait for events added to the set to happen, or until the timeout is
1157 * reached. At most nevents events are returned.
1158 *
1159 * If timeout = -1, block until an event occurs; if 0, check sockets for
1160 * readiness, but don't block; if > 0, block for at most timeout milliseconds.
1161 *
1162 * Returns the number of events that occurred, or 0 if the timeout was reached.
1163 *
1164 * Returned events will have the fd, pos, user_data fields set to the
1165 * values associated with the registered event.
1166 */
1167 int
1168 WaitEventSetWait(WaitEventSet *set, long timeout,
1169 WaitEvent *occurred_events, int nevents,
1170 uint32 wait_event_info)
1171 {
1172 int returned_events = 0;
1173 instr_time start_time;
1174 instr_time cur_time;
1175 long cur_timeout = -1;
1176
1177 Assert(nevents > 0);
1178
1179 /*
1180 * Initialize timeout if requested. We must record the current time so
1181 * that we can determine the remaining timeout if interrupted.
1182 */
1183 if (timeout >= 0)
1184 {
1185 INSTR_TIME_SET_CURRENT(start_time);
1186 Assert(timeout >= 0 && timeout <= INT_MAX);
1187 cur_timeout = timeout;
1188 }
1189
1190 pgstat_report_wait_start(wait_event_info);
1191
1192 #ifndef WIN32
1193 waiting = true;
1194 #else
1195 /* Ensure that signals are serviced even if latch is already set */
1196 pgwin32_dispatch_queued_signals();
1197 #endif
1198 while (returned_events == 0)
1199 {
1200 int rc;
1201
1202 /*
1203 * Check if the latch is set already. If so, leave the loop
1204 * immediately, avoiding blocking again. We don't attempt to report any
1205 * other events that might also be satisfied.
1206 *
1207 * If someone sets the latch between this and the
1208 * WaitEventSetWaitBlock() below, the setter will write a byte to the
1209 * pipe (or signal us and the signal handler will do that), and the
1210 * readiness routine will return immediately.
1211 *
1212 * On Unix, if there's a pending byte in the self-pipe, we'll notice it
1213 * whenever we block. Clearing the pipe only in that case avoids
1214 * having to drain it every time WaitLatchOrSocket() is used. Should
1215 * the pipe buffer fill up, we're still OK, because the pipe is in
1216 * nonblocking mode. That's unlikely to happen, because the self-pipe
1217 * isn't written unless we're blocking (waiting = true) or from inside
1218 * a signal handler in latch_sigusr1_handler().
1219 *
1220 * On Windows, we'll also notice if there's a pending event for the
1221 * latch when blocking, but there's no danger of anything filling up,
1222 * as "Setting an event that is already set has no effect".
1223 *
1224 * Note: we assume that the kernel calls involved in latch management
1225 * will provide adequate synchronization on machines with weak memory
1226 * ordering, so that we cannot miss seeing is_set if a notification
1227 * has already been queued.
1228 */
1229 if (set->latch && set->latch->is_set)
1230 {
1231 occurred_events->fd = PGINVALID_SOCKET;
1232 occurred_events->pos = set->latch_pos;
1233 occurred_events->user_data =
1234 set->events[set->latch_pos].user_data;
1235 occurred_events->events = WL_LATCH_SET;
1236 occurred_events++;
1237 returned_events++;
1238
1239 break;
1240 }
1241
1242 /*
1243 * Wait for events using the readiness primitive chosen at the top of
1244 * this file. If -1 is returned, a timeout has occurred; if 0, we have
1245 * to retry; anything >= 1 is the number of returned events.
1246 */
1247 rc = WaitEventSetWaitBlock(set, cur_timeout,
1248 occurred_events, nevents);
1249
1250 if (rc == -1)
1251 break; /* timeout occurred */
1252 else
1253 returned_events = rc;
1254
1255 /* If we're not done, update cur_timeout for next iteration */
1256 if (returned_events == 0 && timeout >= 0)
1257 {
1258 INSTR_TIME_SET_CURRENT(cur_time);
1259 INSTR_TIME_SUBTRACT(cur_time, start_time);
1260 cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
1261 if (cur_timeout <= 0)
1262 break;
1263 }
1264 }
1265 #ifndef WIN32
1266 waiting = false;
1267 #endif
1268
1269 pgstat_report_wait_end();
1270
1271 return returned_events;
1272 }
1273
1274
1275 #if defined(WAIT_USE_EPOLL)
1276
1277 /*
1278 * Wait using linux's epoll_wait(2).
1279 *
1280 * This is the preferable wait method, as several readiness notifications are
1281 * delivered without having to iterate through all of set->events. The
1282 * returned epoll_event structs contain a pointer to our WaitEvent,
1283 * making association easy.
1284 */
1285 static inline int
1286 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1287 WaitEvent *occurred_events, int nevents)
1288 {
1289 int returned_events = 0;
1290 int rc;
1291 WaitEvent *cur_event;
1292 struct epoll_event *cur_epoll_event;
1293
1294 /* Sleep */
1295 rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1296 nevents, cur_timeout);
1297
1298 /* Check return code */
1299 if (rc < 0)
1300 {
1301 /* EINTR is okay, otherwise complain */
1302 if (errno != EINTR)
1303 {
1304 waiting = false;
1305 ereport(ERROR,
1306 (errcode_for_socket_access(),
1307 /* translator: %s is a syscall name, such as "poll()" */
1308 errmsg("%s failed: %m",
1309 "epoll_wait()")));
1310 }
1311 return 0;
1312 }
1313 else if (rc == 0)
1314 {
1315 /* timeout exceeded */
1316 return -1;
1317 }
1318
1319 /*
1320 * At least one event occurred, iterate over the returned epoll events
1321 * until they're either all processed, or we've returned all the events
1322 * the caller desired.
1323 */
1324 for (cur_epoll_event = set->epoll_ret_events;
1325 cur_epoll_event < (set->epoll_ret_events + rc) &&
1326 returned_events < nevents;
1327 cur_epoll_event++)
1328 {
1329 /* epoll's data pointer is set to the associated WaitEvent */
1330 cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1331
1332 occurred_events->pos = cur_event->pos;
1333 occurred_events->user_data = cur_event->user_data;
1334 occurred_events->events = 0;
1335
1336 if (cur_event->events == WL_LATCH_SET &&
1337 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1338 {
1339 /* There's data in the self-pipe, clear it. */
1340 drainSelfPipe();
1341
1342 if (set->latch->is_set)
1343 {
1344 occurred_events->fd = PGINVALID_SOCKET;
1345 occurred_events->events = WL_LATCH_SET;
1346 occurred_events++;
1347 returned_events++;
1348 }
1349 }
1350 else if (cur_event->events == WL_POSTMASTER_DEATH &&
1351 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1352 {
1353 /*
1354 * We expect an EPOLLHUP when the remote end is closed, but
1355 * because we don't expect the pipe to become readable or to have
1356 * any errors either, treat those cases as postmaster death, too.
1357 *
1358 * Be paranoid about a spurious event signaling the postmaster as
1359 * being dead. There have been reports about that happening with
1360 * older primitives (select(2) to be specific), and a spurious
1361 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1362 * cost much.
1363 */
1364 if (!PostmasterIsAliveInternal())
1365 {
1366 if (set->exit_on_postmaster_death)
1367 proc_exit(1);
1368 occurred_events->fd = PGINVALID_SOCKET;
1369 occurred_events->events = WL_POSTMASTER_DEATH;
1370 occurred_events++;
1371 returned_events++;
1372 }
1373 }
1374 else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1375 {
1376 Assert(cur_event->fd != PGINVALID_SOCKET);
1377
1378 if ((cur_event->events & WL_SOCKET_READABLE) &&
1379 (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1380 {
1381 /* data available in socket, or EOF */
1382 occurred_events->events |= WL_SOCKET_READABLE;
1383 }
1384
1385 if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1386 (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1387 {
1388 /* writable, or EOF */
1389 occurred_events->events |= WL_SOCKET_WRITEABLE;
1390 }
1391
1392 if (occurred_events->events != 0)
1393 {
1394 occurred_events->fd = cur_event->fd;
1395 occurred_events++;
1396 returned_events++;
1397 }
1398 }
1399 }
1400
1401 return returned_events;
1402 }
1403
1404 #elif defined(WAIT_USE_KQUEUE)
1405
1406 /*
1407 * Wait using kevent(2) on BSD-family systems and macOS.
1408 *
1409 * For now this mirrors the epoll code, but in future it could modify the fd
1410 * set in the same call to kevent as it uses for waiting instead of doing that
1411 * with separate system calls.
1412 */
1413 static int
1414 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1415 WaitEvent *occurred_events, int nevents)
1416 {
1417 int returned_events = 0;
1418 int rc;
1419 WaitEvent *cur_event;
1420 struct kevent *cur_kqueue_event;
1421 struct timespec timeout;
1422 struct timespec *timeout_p;
1423
1424 if (cur_timeout < 0)
1425 timeout_p = NULL;
1426 else
1427 {
1428 timeout.tv_sec = cur_timeout / 1000;
1429 timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
1430 timeout_p = &timeout;
1431 }
1432
1433 /*
1434 * Report postmaster events discovered by WaitEventAdjustKqueue() or an
1435 * earlier call to WaitEventSetWait().
1436 */
1437 if (unlikely(set->report_postmaster_not_running))
1438 {
1439 if (set->exit_on_postmaster_death)
1440 proc_exit(1);
1441 occurred_events->fd = PGINVALID_SOCKET;
1442 occurred_events->events = WL_POSTMASTER_DEATH;
1443 return 1;
1444 }
1445
1446 /* Sleep */
1447 rc = kevent(set->kqueue_fd, NULL, 0,
1448 set->kqueue_ret_events, nevents,
1449 timeout_p);
1450
1451 /* Check return code */
1452 if (rc < 0)
1453 {
1454 /* EINTR is okay, otherwise complain */
1455 if (errno != EINTR)
1456 {
1457 waiting = false;
1458 ereport(ERROR,
1459 (errcode_for_socket_access(),
1460 /* translator: %s is a syscall name, such as "poll()" */
1461 errmsg("%s failed: %m",
1462 "kevent()")));
1463 }
1464 return 0;
1465 }
1466 else if (rc == 0)
1467 {
1468 /* timeout exceeded */
1469 return -1;
1470 }
1471
1472 /*
1473 * At least one event occurred, iterate over the returned kqueue events
1474 * until they're either all processed, or we've returned all the events
1475 * the caller desired.
1476 */
1477 for (cur_kqueue_event = set->kqueue_ret_events;
1478 cur_kqueue_event < (set->kqueue_ret_events + rc) &&
1479 returned_events < nevents;
1480 cur_kqueue_event++)
1481 {
1482 /* kevent's udata points to the associated WaitEvent */
1483 cur_event = AccessWaitEvent(cur_kqueue_event);
1484
1485 occurred_events->pos = cur_event->pos;
1486 occurred_events->user_data = cur_event->user_data;
1487 occurred_events->events = 0;
1488
1489 if (cur_event->events == WL_LATCH_SET &&
1490 cur_kqueue_event->filter == EVFILT_READ)
1491 {
1492 /* There's data in the self-pipe, clear it. */
1493 drainSelfPipe();
1494
1495 if (set->latch->is_set)
1496 {
1497 occurred_events->fd = PGINVALID_SOCKET;
1498 occurred_events->events = WL_LATCH_SET;
1499 occurred_events++;
1500 returned_events++;
1501 }
1502 }
1503 else if (cur_event->events == WL_POSTMASTER_DEATH &&
1504 cur_kqueue_event->filter == EVFILT_PROC &&
1505 (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
1506 {
1507 /*
1508 * The kernel will tell this kqueue object only once about the exit
1509 * of the postmaster, so let's remember that for next time so that
1510 * we provide level-triggered semantics.
1511 */
1512 set->report_postmaster_not_running = true;
1513
1514 if (set->exit_on_postmaster_death)
1515 proc_exit(1);
1516 occurred_events->fd = PGINVALID_SOCKET;
1517 occurred_events->events = WL_POSTMASTER_DEATH;
1518 occurred_events++;
1519 returned_events++;
1520 }
1521 else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1522 {
1523 Assert(cur_event->fd >= 0);
1524
1525 if ((cur_event->events & WL_SOCKET_READABLE) &&
1526 (cur_kqueue_event->filter == EVFILT_READ))
1527 {
1528 /* readable, or EOF */
1529 occurred_events->events |= WL_SOCKET_READABLE;
1530 }
1531
1532 if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1533 (cur_kqueue_event->filter == EVFILT_WRITE))
1534 {
1535 /* writable, or EOF */
1536 occurred_events->events |= WL_SOCKET_WRITEABLE;
1537 }
1538
1539 if (occurred_events->events != 0)
1540 {
1541 occurred_events->fd = cur_event->fd;
1542 occurred_events++;
1543 returned_events++;
1544 }
1545 }
1546 }
1547
1548 return returned_events;
1549 }
1550
1551 #elif defined(WAIT_USE_POLL)
1552
1553 /*
1554 * Wait using poll(2).
1555 *
1556 * This allows receiving readiness notifications for several events at once,
1557 * but requires iterating through all of set->pollfds.
1558 */
1559 static inline int
1560 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1561 WaitEvent *occurred_events, int nevents)
1562 {
1563 int returned_events = 0;
1564 int rc;
1565 WaitEvent *cur_event;
1566 struct pollfd *cur_pollfd;
1567
1568 /* Sleep */
1569 rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1570
1571 /* Check return code */
1572 if (rc < 0)
1573 {
1574 /* EINTR is okay, otherwise complain */
1575 if (errno != EINTR)
1576 {
1577 waiting = false;
1578 ereport(ERROR,
1579 (errcode_for_socket_access(),
1580 /* translator: %s is a syscall name, such as "poll()" */
1581 errmsg("%s failed: %m",
1582 "poll()")));
1583 }
1584 return 0;
1585 }
1586 else if (rc == 0)
1587 {
1588 /* timeout exceeded */
1589 return -1;
1590 }
1591
1592 for (cur_event = set->events, cur_pollfd = set->pollfds;
1593 cur_event < (set->events + set->nevents) &&
1594 returned_events < nevents;
1595 cur_event++, cur_pollfd++)
1596 {
1597 /* no activity on this FD, skip */
1598 if (cur_pollfd->revents == 0)
1599 continue;
1600
1601 occurred_events->pos = cur_event->pos;
1602 occurred_events->user_data = cur_event->user_data;
1603 occurred_events->events = 0;
1604
1605 if (cur_event->events == WL_LATCH_SET &&
1606 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1607 {
1608 /* There's data in the self-pipe, clear it. */
1609 drainSelfPipe();
1610
1611 if (set->latch->is_set)
1612 {
1613 occurred_events->fd = PGINVALID_SOCKET;
1614 occurred_events->events = WL_LATCH_SET;
1615 occurred_events++;
1616 returned_events++;
1617 }
1618 }
1619 else if (cur_event->events == WL_POSTMASTER_DEATH &&
1620 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1621 {
1622 /*
1623 * We expect a POLLHUP when the remote end is closed, but because
1624 * we don't expect the pipe to become readable or to have any
1625 * errors either, treat those cases as postmaster death, too.
1626 *
1627 * Be paranoid about a spurious event signaling the postmaster as
1628 * being dead. There have been reports about that happening with
1629 * older primitives (select(2) to be specific), and a spurious
1630 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1631 * cost much.
1632 */
1633 if (!PostmasterIsAliveInternal())
1634 {
1635 if (set->exit_on_postmaster_death)
1636 proc_exit(1);
1637 occurred_events->fd = PGINVALID_SOCKET;
1638 occurred_events->events = WL_POSTMASTER_DEATH;
1639 occurred_events++;
1640 returned_events++;
1641 }
1642 }
1643 else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1644 {
1645 int errflags = POLLHUP | POLLERR | POLLNVAL;
1646
1647 Assert(cur_event->fd >= PGINVALID_SOCKET);
1648
1649 if ((cur_event->events & WL_SOCKET_READABLE) &&
1650 (cur_pollfd->revents & (POLLIN | errflags)))
1651 {
1652 /* data available in socket, or EOF */
1653 occurred_events->events |= WL_SOCKET_READABLE;
1654 }
1655
1656 if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1657 (cur_pollfd->revents & (POLLOUT | errflags)))
1658 {
1659 /* writeable, or EOF */
1660 occurred_events->events |= WL_SOCKET_WRITEABLE;
1661 }
1662
1663 if (occurred_events->events != 0)
1664 {
1665 occurred_events->fd = cur_event->fd;
1666 occurred_events++;
1667 returned_events++;
1668 }
1669 }
1670 }
1671 return returned_events;
1672 }
1673
1674 #elif defined(WAIT_USE_WIN32)
1675
1676 /*
1677 * Wait using Windows' WaitForMultipleObjects().
1678 *
1679 * Unfortunately this will only ever return a single readiness notification at
1680 * a time. Note that while the official documentation for
1681 * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
1682 * with a single bWaitAll = FALSE call,
1683 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
1684 * that only one event is "consumed".
1685 */
1686 static inline int
1687 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1688 WaitEvent *occurred_events, int nevents)
1689 {
1690 int returned_events = 0;
1691 DWORD rc;
1692 WaitEvent *cur_event;
1693
1694 /* Reset any wait events that need it */
1695 for (cur_event = set->events;
1696 cur_event < (set->events + set->nevents);
1697 cur_event++)
1698 {
1699 if (cur_event->reset)
1700 {
1701 WaitEventAdjustWin32(set, cur_event);
1702 cur_event->reset = false;
1703 }
1704
1705 /*
1706 * Windows does not guarantee to log an FD_WRITE network event
1707 * indicating that more data can be sent unless the previous send()
1708 * failed with WSAEWOULDBLOCK. While our caller might well have made
1709 * such a call, we cannot assume that here. Therefore, if waiting for
1710 * write-ready, force the issue by doing a dummy send(). If the dummy
1711 * send() succeeds, assume that the socket is in fact write-ready, and
1712 * return immediately. Also, if it fails with something other than
1713 * WSAEWOULDBLOCK, return a write-ready indication to let our caller
1714 * deal with the error condition.
1715 */
1716 if (cur_event->events & WL_SOCKET_WRITEABLE)
1717 {
1718 char c;
1719 WSABUF buf;
1720 DWORD sent;
1721 int r;
1722
1723 buf.buf = &c;
1724 buf.len = 0;
1725
1726 r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
1727 if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
1728 {
1729 occurred_events->pos = cur_event->pos;
1730 occurred_events->user_data = cur_event->user_data;
1731 occurred_events->events = WL_SOCKET_WRITEABLE;
1732 occurred_events->fd = cur_event->fd;
1733 return 1;
1734 }
1735 }
1736 }
1737
1738 /*
1739 * Sleep.
1740 *
1741 * Need to wait for ->nevents + 1, because the signal event handle is in [0].
1742 */
1743 rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
1744 cur_timeout);
1745
1746 /* Check return code */
1747 if (rc == WAIT_FAILED)
1748 elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
1749 GetLastError());
1750 else if (rc == WAIT_TIMEOUT)
1751 {
1752 /* timeout exceeded */
1753 return -1;
1754 }
1755
1756 if (rc == WAIT_OBJECT_0)
1757 {
1758 /* Service newly-arrived signals */
1759 pgwin32_dispatch_queued_signals();
1760 return 0; /* retry */
1761 }
1762
1763 /*
1764 * With an offset of one, due to the always present pgwin32_signal_event,
1765 * the handle offset directly corresponds to a wait event.
1766 */
1767 cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
1768
1769 occurred_events->pos = cur_event->pos;
1770 occurred_events->user_data = cur_event->user_data;
1771 occurred_events->events = 0;
1772
1773 if (cur_event->events == WL_LATCH_SET)
1774 {
1775 if (!ResetEvent(set->latch->event))
1776 elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
1777
1778 if (set->latch->is_set)
1779 {
1780 occurred_events->fd = PGINVALID_SOCKET;
1781 occurred_events->events = WL_LATCH_SET;
1782 occurred_events++;
1783 returned_events++;
1784 }
1785 }
1786 else if (cur_event->events == WL_POSTMASTER_DEATH)
1787 {
1788 /*
1789 * Postmaster apparently died. Since the consequences of falsely
1790 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
1791 * the trouble to positively verify this with PostmasterIsAlive(),
1792 * even though there is no known reason to think that the event could
1793 * be falsely set on Windows.
1794 */
1795 if (!PostmasterIsAliveInternal())
1796 {
1797 if (set->exit_on_postmaster_death)
1798 proc_exit(1);
1799 occurred_events->fd = PGINVALID_SOCKET;
1800 occurred_events->events = WL_POSTMASTER_DEATH;
1801 occurred_events++;
1802 returned_events++;
1803 }
1804 }
1805 else if (cur_event->events & WL_SOCKET_MASK)
1806 {
1807 WSANETWORKEVENTS resEvents;
1808 HANDLE handle = set->handles[cur_event->pos + 1];
1809
1810 Assert(cur_event->fd);
1811
1812 occurred_events->fd = cur_event->fd;
1813
1814 ZeroMemory(&resEvents, sizeof(resEvents));
1815 if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
1816 elog(ERROR, "failed to enumerate network events: error code %u",
1817 WSAGetLastError());
1818 if ((cur_event->events & WL_SOCKET_READABLE) &&
1819 (resEvents.lNetworkEvents & FD_READ))
1820 {
1821 /* data available in socket */
1822 occurred_events->events |= WL_SOCKET_READABLE;
1823
1824 /*------
1825 * WaitForMultipleObjects doesn't guarantee that a read event will
1826 * be returned if the latch is set at the same time. Even if it
1827 * did, the caller might drop that event expecting it to reoccur
1828 * on next call. So, we must force the event to be reset if this
1829 * WaitEventSet is used again in order to avoid an indefinite
1830 * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
1831 * for the behavior of socket events.
1832 *------
1833 */
1834 cur_event->reset = true;
1835 }
1836 if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1837 (resEvents.lNetworkEvents & FD_WRITE))
1838 {
1839 /* writeable */
1840 occurred_events->events |= WL_SOCKET_WRITEABLE;
1841 }
1842 if ((cur_event->events & WL_SOCKET_CONNECTED) &&
1843 (resEvents.lNetworkEvents & FD_CONNECT))
1844 {
1845 /* connected */
1846 occurred_events->events |= WL_SOCKET_CONNECTED;
1847 }
1848 if (resEvents.lNetworkEvents & FD_CLOSE)
1849 {
1850 /* EOF/error, so signal all caller-requested socket flags */
1851 occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
1852 }
1853
1854 if (occurred_events->events != 0)
1855 {
1856 occurred_events++;
1857 returned_events++;
1858 }
1859 }
1860
1861 return returned_events;
1862 }
1863 #endif
1864
1865 /*
1866 * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
1867 *
1868 * Wake up WaitLatch, if we're waiting. (We might not be, since SIGUSR1 is
1869 * overloaded for multiple purposes; or we might not have reached WaitLatch
1870 * yet, in which case we don't need to fill the pipe either.)
1871 *
1872 * NB: when calling this in a signal handler, be sure to save and restore
1873 * errno around it.
1874 */
1875 #ifndef WIN32
1876 void
1877 latch_sigusr1_handler(void)
1878 {
1879 if (waiting)
1880 sendSelfPipeByte();
1881 }
1882 #endif /* !WIN32 */
1883
1884 /* Send one byte to the self-pipe, to wake up WaitLatch */
1885 #ifndef WIN32
1886 static void
1887 sendSelfPipeByte(void)
1888 {
1889 int rc;
1890 char dummy = 0;
1891
1892 retry:
1893 rc = write(selfpipe_writefd, &dummy, 1);
1894 if (rc < 0)
1895 {
1896 /* If interrupted by signal, just retry */
1897 if (errno == EINTR)
1898 goto retry;
1899
1900 /*
1901 * If the pipe is full, we don't need to retry; the data that's there
1902 * already is enough to wake up WaitLatch.
1903 */
1904 if (errno == EAGAIN || errno == EWOULDBLOCK)
1905 return;
1906
1907 /*
1908 * Oops, the write() failed for some other reason. We might be in a
1909 * signal handler, so it's not safe to elog(). We have no choice but
1910 * to silently ignore the error.
1911 */
1912 return;
1913 }
1914 }
1915 #endif /* !WIN32 */
1916
1917 /*
1918 * Read all available data from the self-pipe
1919 *
1920 * Note: this is only called when waiting = true. If it fails and doesn't
1921 * return, it must reset that flag first (though ideally, this will never
1922 * happen).
1923 */
1924 #ifndef WIN32
1925 static void
1926 drainSelfPipe(void)
1927 {
1928 /*
1929 * There shouldn't normally be more than one byte in the pipe, or maybe a
1930 * few bytes if multiple processes run SetLatch at the same instant.
1931 */
1932 char buf[16];
1933 int rc;
1934
1935 for (;;)
1936 {
1937 rc = read(selfpipe_readfd, buf, sizeof(buf));
1938 if (rc < 0)
1939 {
1940 if (errno == EAGAIN || errno == EWOULDBLOCK)
1941 break; /* the pipe is empty */
1942 else if (errno == EINTR)
1943 continue; /* retry */
1944 else
1945 {
1946 waiting = false;
1947 elog(ERROR, "read() on self-pipe failed: %m");
1948 }
1949 }
1950 else if (rc == 0)
1951 {
1952 waiting = false;
1953 elog(ERROR, "unexpected EOF on self-pipe");
1954 }
1955 else if (rc < sizeof(buf))
1956 {
1957 /* we successfully drained the pipe; no need to read() again */
1958 break;
1959 }
1960 /* else buffer wasn't big enough, so read again */
1961 }
1962 }
1963 #endif /* !WIN32 */
1964