/*-------------------------------------------------------------------------
 *
 * latch.c
 *	  Routines for inter-process latches
 *
 * The Unix implementation uses the so-called self-pipe trick to overcome the
 * race condition involved with poll() (or epoll_wait() on Linux) and setting
 * a global flag in the signal handler. When a latch is set and the current
 * process is waiting for it, the signal handler wakes up the poll() in
 * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
 * poll() on all platforms, and even on platforms where it does, a signal that
 * arrives just before the poll() call does not prevent poll() from entering
 * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
 * and causes poll() to return immediately even if the signal arrives before
 * poll() begins.
 *
 * When SetLatch is called from the same process that owns the latch,
 * SetLatch writes the byte directly to the pipe. If it's owned by another
 * process, SIGUSR1 is sent and the signal handler in the waiting process
 * writes the byte to the pipe on behalf of the signaling process.
 *
 * The Windows implementation uses Windows events that are inherited by all
 * postmaster child processes. There's no need for the self-pipe trick there.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/storage/ipc/latch.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <unistd.h>
#ifdef HAVE_SYS_EPOLL_H
#include <sys/epoll.h>
#endif
#ifdef HAVE_SYS_EVENT_H
#include <sys/event.h>
#endif
#ifdef HAVE_POLL_H
#include <poll.h>
#endif

#include "miscadmin.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "portability/instr_time.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/shmem.h"

/*
 * Select the fd readiness primitive to use. Normally the "most modern"
 * primitive supported by the OS will be used, but for testing it can be
 * useful to manually specify the used primitive.  If desired, just add a
 * define somewhere before this block.
 */
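
/*
 * For example, to force the fallback poll() implementation for testing, one
 * could add here (illustration only):
 *
 *	#define WAIT_USE_POLL
 */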
#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
	defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
/* don't overwrite manual choice */
#elif defined(HAVE_SYS_EPOLL_H)
#define WAIT_USE_EPOLL
#elif defined(HAVE_KQUEUE)
#define WAIT_USE_KQUEUE
#elif defined(HAVE_POLL)
#define WAIT_USE_POLL
#elif defined(WIN32)
#define WAIT_USE_WIN32
#else
#error "no wait set implementation available"
#endif

/* typedef in latch.h */
struct WaitEventSet
{
	int			nevents;		/* number of registered events */
	int			nevents_space;	/* maximum number of events in this set */

	/*
	 * Array, of nevents_space length, storing the definition of events this
	 * set is waiting for.
	 */
	WaitEvent  *events;

	/*
	 * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
	 * said latch, and latch_pos the offset in the ->events array. This is
	 * useful because we check the state of the latch before doing syscalls
	 * related to waiting.
	 */
	Latch	   *latch;
	int			latch_pos;

	/*
	 * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
	 * is set so that we'll exit immediately if postmaster death is detected,
	 * instead of returning.
	 */
	bool		exit_on_postmaster_death;

#if defined(WAIT_USE_EPOLL)
	int			epoll_fd;
	/* epoll_wait returns events in a user-provided array; allocate once */
	struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_KQUEUE)
	int			kqueue_fd;
	/* kevent returns events in a user-provided array; allocate once */
	struct kevent *kqueue_ret_events;
	bool		report_postmaster_not_running;
#elif defined(WAIT_USE_POLL)
	/* poll expects events to be waited on every poll() call; prepare once */
	struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

	/*
	 * Array of Windows events. The first element always contains
	 * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
	 * event->pos + 1).
	 */
	HANDLE	   *handles;
#endif
};

#ifndef WIN32
/* Are we currently in WaitLatch? The signal handler would like to know. */
static volatile sig_atomic_t waiting = false;

/* Read and write ends of the self-pipe */
static int	selfpipe_readfd = -1;
static int	selfpipe_writefd = -1;

/* Process owning the self-pipe --- needed for checking purposes */
static int	selfpipe_owner_pid = 0;

/* Private function prototypes */
static void sendSelfPipeByte(void);
static void drainSelfPipe(void);
#endif							/* WIN32 */

#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_KQUEUE)
static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
#endif

static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
										WaitEvent *occurred_events, int nevents);

/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 */
void
InitializeLatchSupport(void)
{
#ifndef WIN32
	int			pipefd[2];

	if (IsUnderPostmaster)
	{
		/*
		 * We might have inherited connections to a self-pipe created by the
		 * postmaster.  It's critical that child processes create their own
		 * self-pipes, of course, and we really want them to close the
		 * inherited FDs for safety's sake.
		 */
		if (selfpipe_owner_pid != 0)
		{
			/* Assert we go through here but once in a child process */
			Assert(selfpipe_owner_pid != MyProcPid);
			/* Release postmaster's pipe FDs; ignore any error */
			(void) close(selfpipe_readfd);
			(void) close(selfpipe_writefd);
			/* Clean up, just for safety's sake; we'll set these below */
			selfpipe_readfd = selfpipe_writefd = -1;
			selfpipe_owner_pid = 0;
			/* Keep fd.c's accounting straight */
			ReleaseExternalFD();
			ReleaseExternalFD();
		}
		else
		{
			/*
			 * Postmaster didn't create a self-pipe ... or else we're in an
			 * EXEC_BACKEND build, in which case it doesn't matter since the
			 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
			 * fd.c won't have state to clean up, either.
			 */
			Assert(selfpipe_readfd == -1);
		}
	}
	else
	{
		/* In postmaster or standalone backend, assert we do this but once */
		Assert(selfpipe_readfd == -1);
		Assert(selfpipe_owner_pid == 0);
	}

	/*
	 * Set up the self-pipe that allows a signal handler to wake up the
	 * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
	 * that SetLatch won't block if the event has already been set many times
	 * filling the kernel buffer. Make the read-end non-blocking too, so that
	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
	 * Also, make both FDs close-on-exec, since we surely do not want any
	 * child processes messing with them.
	 */
	if (pipe(pipefd) < 0)
		elog(FATAL, "pipe() failed: %m");
	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
	if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");

	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];
	selfpipe_owner_pid = MyProcPid;

	/* Tell fd.c about these two long-lived FDs */
	ReserveExternalFD();
	ReserveExternalFD();
#else
	/* currently, nothing to do here for Windows */
#endif
}

/*
 * Initialize a process-local latch.
 */
void
InitLatch(Latch *latch)
{
	latch->is_set = false;
	latch->owner_pid = MyProcPid;
	latch->is_shared = false;

#ifndef WIN32
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
#else
	latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
	if (latch->event == NULL)
		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif							/* WIN32 */
}

/*
 * Initialize a shared latch that can be set from other processes. The latch
 * is initially owned by no-one; use OwnLatch to associate it with the
 * current process.
 *
 * InitSharedLatch needs to be called in postmaster before forking child
 * processes, usually right after allocating the shared memory block
 * containing the latch with ShmemInitStruct. (The Unix implementation
 * doesn't actually require that, but the Windows one does.) Because of
 * this restriction, we have no concurrency issues to worry about here.
 *
 * Note that other handles created in this module are never marked as
 * inheritable.  Thus we do not need to worry about cleaning up child
 * process references to postmaster-private latches or WaitEventSets.
 */
void
InitSharedLatch(Latch *latch)
{
#ifdef WIN32
	SECURITY_ATTRIBUTES sa;

	/*
	 * Set up security attributes to specify that the events are inherited.
	 */
	ZeroMemory(&sa, sizeof(sa));
	sa.nLength = sizeof(sa);
	sa.bInheritHandle = TRUE;

	latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
	if (latch->event == NULL)
		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif

	latch->is_set = false;
	latch->owner_pid = 0;
	latch->is_shared = true;
}

/*
 * Associate a shared latch with the current process, allowing it to
 * wait on the latch.
 *
 * Although there is a sanity check for latch-already-owned, we don't do
 * any sort of locking here, meaning that we could fail to detect the error
 * if two processes try to own the same latch at about the same time.  If
 * there is any risk of that, caller must provide an interlock to prevent it.
 *
 * In any process that calls OwnLatch(), make sure that
 * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
 * as shared latches use SIGUSR1 for inter-process communication.
 */
void
OwnLatch(Latch *latch)
{
	/* Sanity checks */
	Assert(latch->is_shared);

#ifndef WIN32
	/* Assert InitializeLatchSupport has been called in this process */
	Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
#endif

	if (latch->owner_pid != 0)
		elog(ERROR, "latch already owned");

	latch->owner_pid = MyProcPid;
}

/*
 * Disown a shared latch currently owned by the current process.
 */
void
DisownLatch(Latch *latch)
{
	Assert(latch->is_shared);
	Assert(latch->owner_pid == MyProcPid);

	latch->owner_pid = 0;
}

/*
 * Wait for a given latch to be set, or for postmaster death, or until timeout
 * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
 * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
 * function returns immediately.
 *
 * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
 * is given.  Although it is declared as "long", we don't actually support
 * timeouts longer than INT_MAX milliseconds.  Note that some extra overhead
 * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
 *
 * The latch must be owned by the current process, ie. it must be a
 * process-local latch initialized with InitLatch, or a shared latch
 * associated with the current process by calling OwnLatch.
 *
 * Returns bit mask indicating which condition(s) caused the wake-up. Note
 * that if multiple wake-up conditions are true, there is no guarantee that
 * we return all of them in one call, but we will return at least one.
 */
int
WaitLatch(Latch *latch, int wakeEvents, long timeout,
		  uint32 wait_event_info)
{
	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
							 wait_event_info);
}
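
/*
 * Illustrative sketch of the canonical latch loop (cf. storage/latch.h);
 * "work_available" and the wait-event name are placeholders.  Waiting at the
 * bottom of the loop, after ResetLatch, is what makes the pattern race-free:
 *
 *	for (;;)
 *	{
 *		ResetLatch(MyLatch);
 *
 *		if (work_available())
 *			do_work();
 *
 *		(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
 *						 -1, WAIT_EVENT_CLIENT_READ);
 *		CHECK_FOR_INTERRUPTS();
 *	}
 */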

/*
 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
 * conditions.
 *
 * When waiting on a socket, EOF and error conditions always cause the socket
 * to be reported as readable/writable/connected, so that the caller can deal
 * with the condition.
 *
 * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
 * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
 * return value if the postmaster dies.  The latter is useful for rare cases
 * where some behavior other than immediate exit is needed.
 *
 * NB: These days this is just a wrapper around the WaitEventSet API. When
 * using a latch very frequently, consider creating a longer living
 * WaitEventSet instead; that's more efficient.
 */
int
WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
				  long timeout, uint32 wait_event_info)
{
	int			ret = 0;
	int			rc;
	WaitEvent	event;
	WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);

	if (wakeEvents & WL_TIMEOUT)
		Assert(timeout >= 0);
	else
		timeout = -1;

	if (wakeEvents & WL_LATCH_SET)
		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
						  latch, NULL);

	/* Postmaster-managed callers must handle postmaster death somehow. */
	Assert(!IsUnderPostmaster ||
		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
		   (wakeEvents & WL_POSTMASTER_DEATH));

	if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
		AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
						  NULL, NULL);

	if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
						  NULL, NULL);

	if (wakeEvents & WL_SOCKET_MASK)
	{
		int			ev;

		ev = wakeEvents & WL_SOCKET_MASK;
		AddWaitEventToSet(set, ev, sock, NULL, NULL);
	}

	rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);

	if (rc == 0)
		ret |= WL_TIMEOUT;
	else
	{
		ret |= event.events & (WL_LATCH_SET |
							   WL_POSTMASTER_DEATH |
							   WL_SOCKET_MASK);
	}

	FreeWaitEventSet(set);

	return ret;
}
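
/*
 * As the NB above says, frequent waiters should build a long-lived set once
 * and reuse it.  A rough sketch under those assumptions (error handling and
 * the wait-event name are illustrative):
 *
 *	WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 3);
 *
 *	AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
 *	AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
 *	AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);
 *
 *	for (;;)
 *	{
 *		WaitEvent	event;
 *
 *		(void) WaitEventSetWait(set, -1, &event, 1, WAIT_EVENT_CLIENT_READ);
 *		... dispatch on event.events ...
 *	}
 */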

/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it.  (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(Latch *latch)
{
#ifndef WIN32
	pid_t		owner_pid;
#else
	HANDLE		handle;
#endif

	/*
	 * The memory barrier has to be placed here to ensure that any flag
	 * variables possibly changed by this process have been flushed to main
	 * memory, before we check/set is_set.
	 */
	pg_memory_barrier();

	/* Quick exit if already set */
	if (latch->is_set)
		return;

	latch->is_set = true;

#ifndef WIN32

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler. We use the self-pipe to wake up the
	 * poll()/epoll_wait() in that case. If it's another process, send a
	 * signal.
	 *
	 * Fetch owner_pid only once, in case the latch is concurrently getting
	 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
	 * guaranteed to be true! In practice, the effective range of pid_t fits
	 * in a 32 bit integer, and so should be atomic. In the worst case, we
	 * might end up signaling the wrong process. Even then, you're very
	 * unlucky if a process with that bogus pid exists and belongs to
	 * Postgres; and PG database processes should handle excess SIGUSR1
	 * interrupts without a problem anyhow.
	 *
	 * Another sort of race condition that's possible here is for a new
	 * process to own the latch immediately after we look, so we don't signal
	 * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
	 * the standard coding convention of waiting at the bottom of their loops,
	 * not the top, so that they'll correctly process latch-setting events
	 * that happen before they enter the loop.
	 */
	owner_pid = latch->owner_pid;
	if (owner_pid == 0)
		return;
	else if (owner_pid == MyProcPid)
	{
		if (waiting)
			sendSelfPipeByte();
	}
	else
		kill(owner_pid, SIGUSR1);
#else

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler.
	 *
	 * Use a local variable here just in case somebody changes the event field
	 * concurrently (which really should not happen).
	 */
	handle = latch->event;
	if (handle)
	{
		SetEvent(handle);

		/*
		 * Note that we silently ignore any errors. We might be in a signal
		 * handler or other critical path where it's not safe to call elog().
		 */
	}
#endif

}
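
/*
 * Illustrating the errno NB above: a signal handler that sets a flag and a
 * latch should look roughly like this (sketch only; "got_SIGHUP" is a
 * hypothetical flag):
 *
 *	static void
 *	handle_sighup(SIGNAL_ARGS)
 *	{
 *		int			save_errno = errno;
 *
 *		got_SIGHUP = true;
 *		SetLatch(MyLatch);
 *
 *		errno = save_errno;
 *	}
 */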

/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 */
void
ResetLatch(Latch *latch)
{
	/* Only the owner should reset the latch */
	Assert(latch->owner_pid == MyProcPid);

	latch->is_set = false;

	/*
	 * Ensure that the write to is_set gets flushed to main memory before we
	 * examine any flag variables.  Otherwise a concurrent SetLatch might
	 * falsely conclude that it needn't signal us, even though we have missed
	 * seeing some flag updates that SetLatch was supposed to inform us of.
	 */
	pg_memory_barrier();
}
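
/*
 * To illustrate why the barriers in SetLatch and ResetLatch matter, consider
 * this sketch ("flag" stands for any shared state the waiter inspects):
 *
 *	waiter:	ResetLatch(MyLatch); if (flag) handle_it();
 *	setter:	flag = true; SetLatch(latch);
 *
 * The barriers guarantee that at least one side notices the other: either
 * the waiter sees "flag" set, or the setter sees is_set already cleared and
 * delivers a wakeup.  Without them, both checks could read stale values and
 * the wakeup could be lost.
 */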

/*
 * Create a WaitEventSet with space for nevents different events to wait for.
 *
 * These events can then be efficiently waited upon together, using
 * WaitEventSetWait().
 */
WaitEventSet *
CreateWaitEventSet(MemoryContext context, int nevents)
{
	WaitEventSet *set;
	char	   *data;
	Size		sz = 0;

	/*
	 * Use MAXALIGN size/alignment to guarantee that later uses of memory are
	 * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
	 * platforms, but earlier allocations like WaitEventSet and WaitEvent
	 * might not be sized to guarantee that when purely using sizeof().
	 */
	sz += MAXALIGN(sizeof(WaitEventSet));
	sz += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
	sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_KQUEUE)
	sz += MAXALIGN(sizeof(struct kevent) * nevents);
#elif defined(WAIT_USE_POLL)
	sz += MAXALIGN(sizeof(struct pollfd) * nevents);
#elif defined(WAIT_USE_WIN32)
	/* need space for the pgwin32_signal_event */
	sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
#endif

	data = (char *) MemoryContextAllocZero(context, sz);

	set = (WaitEventSet *) data;
	data += MAXALIGN(sizeof(WaitEventSet));

	set->events = (WaitEvent *) data;
	data += MAXALIGN(sizeof(WaitEvent) * nevents);

#if defined(WAIT_USE_EPOLL)
	set->epoll_ret_events = (struct epoll_event *) data;
	data += MAXALIGN(sizeof(struct epoll_event) * nevents);
#elif defined(WAIT_USE_KQUEUE)
	set->kqueue_ret_events = (struct kevent *) data;
	data += MAXALIGN(sizeof(struct kevent) * nevents);
#elif defined(WAIT_USE_POLL)
	set->pollfds = (struct pollfd *) data;
	data += MAXALIGN(sizeof(struct pollfd) * nevents);
#elif defined(WAIT_USE_WIN32)
	set->handles = (HANDLE *) data;
	data += MAXALIGN(sizeof(HANDLE) * nevents);
#endif

	set->latch = NULL;
	set->nevents_space = nevents;
	set->exit_on_postmaster_death = false;

#if defined(WAIT_USE_EPOLL)
	if (!AcquireExternalFD())
	{
		/* treat this as though epoll_create1 itself returned EMFILE */
		elog(ERROR, "epoll_create1 failed: %m");
	}
#ifdef EPOLL_CLOEXEC
	set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
	if (set->epoll_fd < 0)
	{
		ReleaseExternalFD();
		elog(ERROR, "epoll_create1 failed: %m");
	}
#else
	/* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
	set->epoll_fd = epoll_create(nevents);
	if (set->epoll_fd < 0)
	{
		ReleaseExternalFD();
		elog(ERROR, "epoll_create failed: %m");
	}
	if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
	{
		int			save_errno = errno;

		close(set->epoll_fd);
		ReleaseExternalFD();
		errno = save_errno;
		elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
	}
#endif							/* EPOLL_CLOEXEC */
#elif defined(WAIT_USE_KQUEUE)
	if (!AcquireExternalFD())
	{
		/* treat this as though kqueue itself returned EMFILE */
		elog(ERROR, "kqueue failed: %m");
	}
	set->kqueue_fd = kqueue();
	if (set->kqueue_fd < 0)
	{
		ReleaseExternalFD();
		elog(ERROR, "kqueue failed: %m");
	}
	if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
	{
		int			save_errno = errno;

		close(set->kqueue_fd);
		ReleaseExternalFD();
		errno = save_errno;
		elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
	}
	set->report_postmaster_not_running = false;
#elif defined(WAIT_USE_WIN32)

	/*
	 * To handle signals while waiting, we need to add a win32 specific event.
	 * We accounted for the additional event at the top of this routine. See
	 * port/win32/signal.c for more details.
	 *
	 * Note: pgwin32_signal_event should be first to ensure that it will be
	 * reported when multiple events are set.  We want to guarantee that
	 * pending signals are serviced.
	 */
	set->handles[0] = pgwin32_signal_event;
	StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
#endif

	return set;
}

/*
 * Free a previously created WaitEventSet.
 *
 * Note: preferably, this shouldn't have to free any resources that could be
 * inherited across an exec().  If it did, we'd likely leak those resources in
 * many scenarios.  For the epoll case, we ensure that by setting FD_CLOEXEC
 * when the FD is created.  For the Windows case, we assume that the handles
 * involved are non-inheritable.
 */
void
FreeWaitEventSet(WaitEventSet *set)
{
#if defined(WAIT_USE_EPOLL)
	close(set->epoll_fd);
	ReleaseExternalFD();
#elif defined(WAIT_USE_KQUEUE)
	close(set->kqueue_fd);
	ReleaseExternalFD();
#elif defined(WAIT_USE_WIN32)
	WaitEvent  *cur_event;

	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->events & WL_LATCH_SET)
		{
			/* uses the latch's HANDLE */
		}
		else if (cur_event->events & WL_POSTMASTER_DEATH)
		{
			/* uses PostmasterHandle */
		}
		else
		{
			/* Clean up the event object we created for the socket */
			WSAEventSelect(cur_event->fd, NULL, 0);
			WSACloseEvent(set->handles[cur_event->pos + 1]);
		}
	}
#endif

	pfree(set);
}

/* ---
 * Add an event to the set. Possible events are:
 * - WL_LATCH_SET: Wait for the latch to be set
 * - WL_POSTMASTER_DEATH: Wait for postmaster to die
 * - WL_SOCKET_READABLE: Wait for socket to become readable,
 *	 can be combined in one event with other WL_SOCKET_* events
 * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
 *	 can be combined with other WL_SOCKET_* events
 * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
 *	 can be combined with other WL_SOCKET_* events (on non-Windows
 *	 platforms, this is the same as WL_SOCKET_WRITEABLE)
 * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
 *
 * Returns the offset in WaitEventSet->events (starting from 0), which can be
 * used to modify previously added wait events using ModifyWaitEvent().
 *
 * In the WL_LATCH_SET case the latch must be owned by the current process,
 * i.e. it must be a process-local latch initialized with InitLatch, or a
 * shared latch associated with the current process by calling OwnLatch.
 *
 * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
 * conditions cause the socket to be reported as readable/writable/connected,
 * so that the caller can deal with the condition.
 *
 * The user_data pointer specified here will be set for the events returned
 * by WaitEventSetWait(), allowing the caller to easily associate additional
 * data with events.
 */
int
AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
				  void *user_data)
{
	WaitEvent  *event;

	/* not enough space */
	Assert(set->nevents < set->nevents_space);

	if (events == WL_EXIT_ON_PM_DEATH)
	{
		events = WL_POSTMASTER_DEATH;
		set->exit_on_postmaster_death = true;
	}

	if (latch)
	{
		if (latch->owner_pid != MyProcPid)
			elog(ERROR, "cannot wait on a latch owned by another process");
		if (set->latch)
			elog(ERROR, "cannot wait on more than one latch");
		if ((events & WL_LATCH_SET) != WL_LATCH_SET)
			elog(ERROR, "latch events only support being set");
	}
	else
	{
		if (events & WL_LATCH_SET)
			elog(ERROR, "cannot wait on latch without a specified latch");
	}

	/* waiting for socket readiness without a socket indicates a bug */
	if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
		elog(ERROR, "cannot wait on socket event without a socket");

	event = &set->events[set->nevents];
	event->pos = set->nevents++;
	event->fd = fd;
	event->events = events;
	event->user_data = user_data;
#ifdef WIN32
	event->reset = false;
#endif

	if (events == WL_LATCH_SET)
	{
		set->latch = latch;
		set->latch_pos = event->pos;
#ifndef WIN32
		event->fd = selfpipe_readfd;
#endif
	}
	else if (events == WL_POSTMASTER_DEATH)
	{
#ifndef WIN32
		event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
#endif
	}

	/* perform wait primitive specific initialization, if needed */
#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
#elif defined(WAIT_USE_KQUEUE)
	WaitEventAdjustKqueue(set, event, 0);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
	WaitEventAdjustWin32(set, event);
#endif

	return event->pos;
}

/*
 * Change the event mask and, in the WL_LATCH_SET case, the latch associated
 * with the WaitEvent.
 *
 * 'pos' is the id returned by AddWaitEventToSet.
 */
void
ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
{
	WaitEvent  *event;
#if defined(WAIT_USE_KQUEUE)
	int			old_events;
#endif

	Assert(pos < set->nevents);

	event = &set->events[pos];
#if defined(WAIT_USE_KQUEUE)
	old_events = event->events;
#endif

	/*
	 * If neither the event mask nor the associated latch changes, return
	 * early. That's an important optimization for some sockets, where
	 * ModifyWaitEvent is frequently used to switch from waiting for reads to
	 * waiting on writes.
	 */
	if (events == event->events &&
		(!(event->events & WL_LATCH_SET) || set->latch == latch))
		return;

	if (event->events & WL_LATCH_SET &&
		events != event->events)
	{
		/* we could allow disabling latch events for a while */
		elog(ERROR, "cannot modify latch event");
	}

	if (event->events & WL_POSTMASTER_DEATH)
	{
		elog(ERROR, "cannot modify postmaster death event");
	}

	/* FIXME: validate event mask */
	event->events = events;

	if (events == WL_LATCH_SET)
	{
		set->latch = latch;
	}

#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
#elif defined(WAIT_USE_KQUEUE)
	WaitEventAdjustKqueue(set, event, old_events);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
	WaitEventAdjustWin32(set, event);
#endif
}
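
/*
 * Typical use is toggling a socket between read- and write-waiting, e.g.
 * (sketch; "sock_pos" stands for the value previously returned by
 * AddWaitEventToSet for that socket):
 *
 *	ModifyWaitEvent(set, sock_pos, WL_SOCKET_WRITEABLE, NULL);
 *	... later, switch back ...
 *	ModifyWaitEvent(set, sock_pos, WL_SOCKET_READABLE, NULL);
 */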

#if defined(WAIT_USE_EPOLL)
/*
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
	struct epoll_event epoll_ev;
	int			rc;

	/* pointer to our event, returned by epoll_wait */
	epoll_ev.data.ptr = event;
	/* always wait for errors */
	epoll_ev.events = EPOLLERR | EPOLLHUP;

	/* prepare the event mask once */
	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		epoll_ev.events |= EPOLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		epoll_ev.events |= EPOLLIN;
	}
	else
	{
		Assert(event->fd != PGINVALID_SOCKET);
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));

		if (event->events & WL_SOCKET_READABLE)
			epoll_ev.events |= EPOLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			epoll_ev.events |= EPOLLOUT;
	}

	/*
	 * Even though unused, we also pass epoll_ev as the data argument if
	 * EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
	 * requiring that, and actually it makes the code simpler...
	 */
	rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);

	if (rc < 0)
		ereport(ERROR,
				(errcode_for_socket_access(),
		/* translator: %s is a syscall name, such as "poll()" */
				 errmsg("%s failed: %m",
						"epoll_ctl()")));
}
#endif

#if defined(WAIT_USE_POLL)
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
	struct pollfd *pollfd = &set->pollfds[event->pos];

	pollfd->revents = 0;
	pollfd->fd = event->fd;

	/* prepare pollfd entry once */
	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		pollfd->events = POLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		pollfd->events = POLLIN;
	}
	else
	{
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
		pollfd->events = 0;
		if (event->events & WL_SOCKET_READABLE)
			pollfd->events |= POLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			pollfd->events |= POLLOUT;
	}

	Assert(event->fd != PGINVALID_SOCKET);
}
#endif

#if defined(WAIT_USE_KQUEUE)

/*
 * On most BSD family systems, the udata member of struct kevent is of type
 * void *, so we could directly convert to/from WaitEvent *.  Unfortunately,
 * NetBSD has it as intptr_t, so here we wallpaper over that difference with
 * an lvalue cast.
 */
#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))

static inline void
WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
						 WaitEvent *event)
{
	k_ev->ident = event->fd;
	k_ev->filter = filter;
	k_ev->flags = action;
	k_ev->fflags = 0;
	k_ev->data = 0;
	AccessWaitEvent(k_ev) = event;
}

static inline void
WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
{
	/* For now postmaster death can only be added, not removed. */
	k_ev->ident = PostmasterPid;
	k_ev->filter = EVFILT_PROC;
	k_ev->flags = EV_ADD;
	k_ev->fflags = NOTE_EXIT;
	k_ev->data = 0;
	AccessWaitEvent(k_ev) = event;
}

/*
 * old_events is the previous event mask, used to compute what has changed.
 */
static void
WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
{
	int			rc;
	struct kevent k_ev[2];
	int			count = 0;
	bool		new_filt_read = false;
	bool		old_filt_read = false;
	bool		new_filt_write = false;
	bool		old_filt_write = false;

	if (old_events == event->events)
		return;

	Assert(event->events != WL_LATCH_SET || set->latch != NULL);
	Assert(event->events == WL_LATCH_SET ||
		   event->events == WL_POSTMASTER_DEATH ||
		   (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));

	if (event->events == WL_POSTMASTER_DEATH)
	{
		/*
		 * Unlike all the other implementations, we detect postmaster death
		 * using process notification instead of waiting on the postmaster
		 * alive pipe.
		 */
		WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
	}
	else
	{
		/*
		 * We need to compute the adds and deletes required to get from the
		 * old event mask to the new event mask, since kevent treats readable
		 * and writable as separate events.
		 */
		if (old_events == WL_LATCH_SET ||
			(old_events & WL_SOCKET_READABLE))
			old_filt_read = true;
		if (event->events == WL_LATCH_SET ||
			(event->events & WL_SOCKET_READABLE))
			new_filt_read = true;
		if (old_events & WL_SOCKET_WRITEABLE)
			old_filt_write = true;
		if (event->events & WL_SOCKET_WRITEABLE)
			new_filt_write = true;
		if (old_filt_read && !new_filt_read)
			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
									 event);
		else if (!old_filt_read && new_filt_read)
			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
									 event);
		if (old_filt_write && !new_filt_write)
			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
									 event);
		else if (!old_filt_write && new_filt_write)
			WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
									 event);
	}

	Assert(count > 0);
	Assert(count <= 2);

	rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);

	/*
	 * When adding the postmaster's pid, we have to consider that it might
	 * already have exited and perhaps even been replaced by another process
	 * with the same pid.  If so, we have to defer reporting this as an event
	 * until the next call to WaitEventSetWaitBlock().
	 */

	if (rc < 0)
	{
		if (event->events == WL_POSTMASTER_DEATH &&
			(errno == ESRCH || errno == EACCES))
			set->report_postmaster_not_running = true;
		else
			ereport(ERROR,
					(errcode_for_socket_access(),
			/* translator: %s is a syscall name, such as "poll()" */
					 errmsg("%s failed: %m",
							"kevent()")));
	}
	else if (event->events == WL_POSTMASTER_DEATH &&
			 PostmasterPid != getppid() &&
			 !PostmasterIsAlive())
	{
		/*
		 * The extra PostmasterIsAliveInternal() check prevents false alarms
		 * on systems that give a different value for getppid() while being
		 * traced by a debugger.
		 */
		set->report_postmaster_not_running = true;
	}
}

#endif

#if defined(WAIT_USE_WIN32)
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
	HANDLE	   *handle = &set->handles[event->pos + 1];

	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		*handle = set->latch->event;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		*handle = PostmasterHandle;
	}
	else
	{
		int			flags = FD_CLOSE;	/* always check for errors/EOF */

		if (event->events & WL_SOCKET_READABLE)
			flags |= FD_READ;
		if (event->events & WL_SOCKET_WRITEABLE)
			flags |= FD_WRITE;
		if (event->events & WL_SOCKET_CONNECTED)
			flags |= FD_CONNECT;

		if (*handle == WSA_INVALID_EVENT)
		{
			*handle = WSACreateEvent();
			if (*handle == WSA_INVALID_EVENT)
				elog(ERROR, "failed to create event for socket: error code %u",
					 WSAGetLastError());
		}
		if (WSAEventSelect(event->fd, *handle, flags) != 0)
			elog(ERROR, "failed to set up event for socket: error code %u",
				 WSAGetLastError());

		Assert(event->fd != PGINVALID_SOCKET);
	}
}
#endif

/*
 * Wait for events added to the set to happen, or until the timeout is
 * reached.  At most nevents occurred events are returned.
 *
 * If timeout = -1, block until an event occurs; if 0, check sockets for
 * readiness, but don't block; if > 0, block for at most timeout milliseconds.
 *
 * Returns the number of events occurred, or 0 if the timeout was reached.
 *
 * Returned events will have the fd, pos, user_data fields set to the
 * values associated with the registered event.
 */
int
WaitEventSetWait(WaitEventSet *set, long timeout,
				 WaitEvent *occurred_events, int nevents,
				 uint32 wait_event_info)
{
	int			returned_events = 0;
	instr_time	start_time;
	instr_time	cur_time;
	long		cur_timeout = -1;

	Assert(nevents > 0);

	/*
	 * Initialize timeout if requested.  We must record the current time so
	 * that we can determine the remaining timeout if interrupted.
	 */
	if (timeout >= 0)
	{
		INSTR_TIME_SET_CURRENT(start_time);
		Assert(timeout >= 0 && timeout <= INT_MAX);
		cur_timeout = timeout;
	}

	pgstat_report_wait_start(wait_event_info);

#ifndef WIN32
	waiting = true;
#else
	/* Ensure that signals are serviced even if latch is already set */
	pgwin32_dispatch_queued_signals();
#endif
	while (returned_events == 0)
	{
		int			rc;

		/*
		 * Check if the latch is set already. If so, leave the loop
		 * immediately, avoiding blocking again. We don't attempt to report
		 * any other events that might also be satisfied.
		 *
		 * If someone sets the latch between this and the
		 * WaitEventSetWaitBlock() below, the setter will write a byte to the
		 * pipe (or signal us and the signal handler will do that), and the
		 * readiness routine will return immediately.
		 *
		 * On Unix, if there's a pending byte in the self-pipe, we'll notice
		 * whenever blocking. Only clearing the pipe in that case avoids
		 * having to drain it every time WaitLatchOrSocket() is used. Should
		 * the pipe-buffer fill up we're still ok, because the pipe is in
		 * nonblocking mode. It's unlikely for that to happen, because the
		 * self-pipe isn't filled unless we're blocking (waiting = true), or
		 * from inside a signal handler in latch_sigusr1_handler().
		 *
		 * On Windows, we'll also notice if there's a pending event for the
		 * latch when blocking, but there's no danger of anything filling up,
		 * as "Setting an event that is already set has no effect".
		 *
		 * Note: we assume that the kernel calls involved in latch management
		 * will provide adequate synchronization on machines with weak memory
		 * ordering, so that we cannot miss seeing is_set if a notification
		 * has already been queued.
		 */
		if (set->latch && set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->pos = set->latch_pos;
			occurred_events->user_data =
				set->events[set->latch_pos].user_data;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;

			break;
		}

		/*
		 * Wait for events using the readiness primitive chosen at the top of
		 * this file. If -1 is returned, a timeout has occurred; if 0, we have
		 * to retry; anything >= 1 is the number of returned events.
		 */
		rc = WaitEventSetWaitBlock(set, cur_timeout,
								   occurred_events, nevents);

		if (rc == -1)
			break;				/* timeout occurred */
		else
			returned_events = rc;

		/* If we're not done, update cur_timeout for next iteration */
		if (returned_events == 0 && timeout >= 0)
		{
			INSTR_TIME_SET_CURRENT(cur_time);
			INSTR_TIME_SUBTRACT(cur_time, start_time);
			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
			if (cur_timeout <= 0)
				break;
		}
	}
#ifndef WIN32
	waiting = false;
#endif

	pgstat_report_wait_end();

	return returned_events;
}
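
/*
 * Callers interested in multiple simultaneous conditions can pass a larger
 * output array and iterate, e.g. (sketch only; the wait-event name is an
 * arbitrary example):
 *
 *	WaitEvent	events[3];
 *	int			n;
 *
 *	n = WaitEventSetWait(set, 1000, events, lengthof(events),
 *						 WAIT_EVENT_CLIENT_READ);
 *	for (int i = 0; i < n; i++)
 *		... inspect events[i].events and events[i].user_data ...
 *
 * with n == 0 indicating that the one-second timeout expired.
 */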
1273 
1274 
1275 #if defined(WAIT_USE_EPOLL)
1276 
1277 /*
1278  * Wait using linux's epoll_wait(2).
1279  *
1280  * This is the preferable wait method, as several readiness notifications are
1281  * delivered, without having to iterate through all of set->events. The return
1282  * epoll_event struct contain a pointer to our events, making association
1283  * easy.
1284  */
1285 static inline int
WaitEventSetWaitBlock(WaitEventSet * set,int cur_timeout,WaitEvent * occurred_events,int nevents)1286 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1287 					  WaitEvent *occurred_events, int nevents)
1288 {
1289 	int			returned_events = 0;
1290 	int			rc;
1291 	WaitEvent  *cur_event;
1292 	struct epoll_event *cur_epoll_event;
1293 
1294 	/* Sleep */
1295 	rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1296 					nevents, cur_timeout);
1297 
1298 	/* Check return code */
1299 	if (rc < 0)
1300 	{
1301 		/* EINTR is okay, otherwise complain */
1302 		if (errno != EINTR)
1303 		{
1304 			waiting = false;
1305 			ereport(ERROR,
1306 					(errcode_for_socket_access(),
1307 			/* translator: %s is a syscall name, such as "poll()" */
1308 					 errmsg("%s failed: %m",
1309 							"epoll_wait()")));
1310 		}
1311 		return 0;
1312 	}
1313 	else if (rc == 0)
1314 	{
1315 		/* timeout exceeded */
1316 		return -1;
1317 	}
1318 
1319 	/*
1320 	 * At least one event occurred, iterate over the returned epoll events
1321 	 * until they're either all processed, or we've returned all the events
1322 	 * the caller desired.
1323 	 */
1324 	for (cur_epoll_event = set->epoll_ret_events;
1325 		 cur_epoll_event < (set->epoll_ret_events + rc) &&
1326 		 returned_events < nevents;
1327 		 cur_epoll_event++)
1328 	{
1329 		/* epoll's data pointer is set to the associated WaitEvent */
1330 		cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1331 
1332 		occurred_events->pos = cur_event->pos;
1333 		occurred_events->user_data = cur_event->user_data;
1334 		occurred_events->events = 0;
1335 
1336 		if (cur_event->events == WL_LATCH_SET &&
1337 			cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1338 		{
1339 			/* There's data in the self-pipe, clear it. */
1340 			drainSelfPipe();
1341 
1342 			if (set->latch->is_set)
1343 			{
1344 				occurred_events->fd = PGINVALID_SOCKET;
1345 				occurred_events->events = WL_LATCH_SET;
1346 				occurred_events++;
1347 				returned_events++;
1348 			}
1349 		}
1350 		else if (cur_event->events == WL_POSTMASTER_DEATH &&
1351 				 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1352 		{
1353 			/*
1354 			 * We expect an EPOLLHUP when the remote end is closed, but
1355 			 * because we don't expect the pipe to become readable or to have
1356 			 * any errors either, treat those cases as postmaster death, too.
1357 			 *
1358 			 * Be paranoid about a spurious event signaling the postmaster as
1359 			 * being dead.  There have been reports about that happening with
1360 			 * older primitives (select(2) to be specific), and a spurious
1361 			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1362 			 * cost much.
1363 			 */
1364 			if (!PostmasterIsAliveInternal())
1365 			{
1366 				if (set->exit_on_postmaster_death)
1367 					proc_exit(1);
1368 				occurred_events->fd = PGINVALID_SOCKET;
1369 				occurred_events->events = WL_POSTMASTER_DEATH;
1370 				occurred_events++;
1371 				returned_events++;
1372 			}
1373 		}
1374 		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1375 		{
1376 			Assert(cur_event->fd != PGINVALID_SOCKET);
1377 
1378 			if ((cur_event->events & WL_SOCKET_READABLE) &&
1379 				(cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1380 			{
1381 				/* data available in socket, or EOF */
1382 				occurred_events->events |= WL_SOCKET_READABLE;
1383 			}
1384 
1385 			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1386 				(cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1387 			{
1388 				/* writable, or EOF */
1389 				occurred_events->events |= WL_SOCKET_WRITEABLE;
1390 			}
1391 
1392 			if (occurred_events->events != 0)
1393 			{
1394 				occurred_events->fd = cur_event->fd;
1395 				occurred_events++;
1396 				returned_events++;
1397 			}
1398 		}
1399 	}
1400 
1401 	return returned_events;
1402 }
1403 
1404 #elif defined(WAIT_USE_KQUEUE)
1405 
1406 /*
1407  * Wait using kevent(2) on BSD-family systems and macOS.
1408  *
1409  * For now this mirrors the epoll code, but in future it could modify the fd
1410  * set in the same call to kevent as it uses for waiting instead of doing that
1411  * with separate system calls.
1412  */
1413 static int
WaitEventSetWaitBlock(WaitEventSet * set,int cur_timeout,WaitEvent * occurred_events,int nevents)1414 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1415 					  WaitEvent *occurred_events, int nevents)
1416 {
1417 	int			returned_events = 0;
1418 	int			rc;
1419 	WaitEvent  *cur_event;
1420 	struct kevent *cur_kqueue_event;
1421 	struct timespec timeout;
1422 	struct timespec *timeout_p;
1423 
1424 	if (cur_timeout < 0)
1425 		timeout_p = NULL;
1426 	else
1427 	{
1428 		timeout.tv_sec = cur_timeout / 1000;
1429 		timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
1430 		timeout_p = &timeout;
1431 	}
1432 
1433 	/*
1434 	 * Report postmaster events discovered by WaitEventAdjustKqueue() or an
1435 	 * earlier call to WaitEventSetWait().
1436 	 */
1437 	if (unlikely(set->report_postmaster_not_running))
1438 	{
1439 		if (set->exit_on_postmaster_death)
1440 			proc_exit(1);
1441 		occurred_events->fd = PGINVALID_SOCKET;
1442 		occurred_events->events = WL_POSTMASTER_DEATH;
1443 		return 1;
1444 	}
1445 
1446 	/* Sleep */
1447 	rc = kevent(set->kqueue_fd, NULL, 0,
1448 				set->kqueue_ret_events, nevents,
1449 				timeout_p);
1450 
1451 	/* Check return code */
1452 	if (rc < 0)
1453 	{
1454 		/* EINTR is okay, otherwise complain */
1455 		if (errno != EINTR)
1456 		{
1457 			waiting = false;
1458 			ereport(ERROR,
1459 					(errcode_for_socket_access(),
1460 			/* translator: %s is a syscall name, such as "poll()" */
1461 					 errmsg("%s failed: %m",
1462 							"kevent()")));
1463 		}
1464 		return 0;
1465 	}
1466 	else if (rc == 0)
1467 	{
1468 		/* timeout exceeded */
1469 		return -1;
1470 	}
1471 
1472 	/*
1473 	 * At least one event occurred, iterate over the returned kqueue events
1474 	 * until they're either all processed, or we've returned all the events
1475 	 * the caller desired.
1476 	 */
1477 	for (cur_kqueue_event = set->kqueue_ret_events;
1478 		 cur_kqueue_event < (set->kqueue_ret_events + rc) &&
1479 		 returned_events < nevents;
1480 		 cur_kqueue_event++)
1481 	{
1482 		/* kevent's udata points to the associated WaitEvent */
1483 		cur_event = AccessWaitEvent(cur_kqueue_event);
1484 
1485 		occurred_events->pos = cur_event->pos;
1486 		occurred_events->user_data = cur_event->user_data;
1487 		occurred_events->events = 0;
1488 
1489 		if (cur_event->events == WL_LATCH_SET &&
1490 			cur_kqueue_event->filter == EVFILT_READ)
1491 		{
1492 			/* There's data in the self-pipe, clear it. */
1493 			drainSelfPipe();
1494 
1495 			if (set->latch->is_set)
1496 			{
1497 				occurred_events->fd = PGINVALID_SOCKET;
1498 				occurred_events->events = WL_LATCH_SET;
1499 				occurred_events++;
1500 				returned_events++;
1501 			}
1502 		}
1503 		else if (cur_event->events == WL_POSTMASTER_DEATH &&
1504 				 cur_kqueue_event->filter == EVFILT_PROC &&
1505 				 (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
1506 		{
1507 			/*
1508 			 * The kernel will tell this kqueue object only once about the exit
1509 			 * of the postmaster, so let's remember that for next time so that
1510 			 * we provide level-triggered semantics.
1511 			 */
1512 			set->report_postmaster_not_running = true;
1513 
1514 			if (set->exit_on_postmaster_death)
1515 				proc_exit(1);
1516 			occurred_events->fd = PGINVALID_SOCKET;
1517 			occurred_events->events = WL_POSTMASTER_DEATH;
1518 			occurred_events++;
1519 			returned_events++;
1520 		}
1521 		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1522 		{
1523 			Assert(cur_event->fd >= 0);
1524 
1525 			if ((cur_event->events & WL_SOCKET_READABLE) &&
1526 				(cur_kqueue_event->filter == EVFILT_READ))
1527 			{
1528 				/* readable, or EOF */
1529 				occurred_events->events |= WL_SOCKET_READABLE;
1530 			}
1531 
1532 			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1533 				(cur_kqueue_event->filter == EVFILT_WRITE))
1534 			{
1535 				/* writable, or EOF */
1536 				occurred_events->events |= WL_SOCKET_WRITEABLE;
1537 			}
1538 
1539 			if (occurred_events->events != 0)
1540 			{
1541 				occurred_events->fd = cur_event->fd;
1542 				occurred_events++;
1543 				returned_events++;
1544 			}
1545 		}
1546 	}
1547 
1548 	return returned_events;
1549 }
1550 
1551 #elif defined(WAIT_USE_POLL)
1552 
1553 /*
1554  * Wait using poll(2).
1555  *
1556  * This allows to receive readiness notifications for several events at once,
1557  * but requires iterating through all of set->pollfds.
1558  */
1559 static inline int
WaitEventSetWaitBlock(WaitEventSet * set,int cur_timeout,WaitEvent * occurred_events,int nevents)1560 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1561 					  WaitEvent *occurred_events, int nevents)
1562 {
1563 	int			returned_events = 0;
1564 	int			rc;
1565 	WaitEvent  *cur_event;
1566 	struct pollfd *cur_pollfd;
1567 
1568 	/* Sleep */
1569 	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1570 
1571 	/* Check return code */
1572 	if (rc < 0)
1573 	{
1574 		/* EINTR is okay, otherwise complain */
1575 		if (errno != EINTR)
1576 		{
1577 			waiting = false;
1578 			ereport(ERROR,
1579 					(errcode_for_socket_access(),
1580 			/* translator: %s is a syscall name, such as "poll()" */
1581 					 errmsg("%s failed: %m",
1582 							"poll()")));
1583 		}
1584 		return 0;
1585 	}
1586 	else if (rc == 0)
1587 	{
1588 		/* timeout exceeded */
1589 		return -1;
1590 	}
1591 
1592 	for (cur_event = set->events, cur_pollfd = set->pollfds;
1593 		 cur_event < (set->events + set->nevents) &&
1594 		 returned_events < nevents;
1595 		 cur_event++, cur_pollfd++)
1596 	{
1597 		/* no activity on this FD, skip */
1598 		if (cur_pollfd->revents == 0)
1599 			continue;
1600 
1601 		occurred_events->pos = cur_event->pos;
1602 		occurred_events->user_data = cur_event->user_data;
1603 		occurred_events->events = 0;
1604 
1605 		if (cur_event->events == WL_LATCH_SET &&
1606 			(cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1607 		{
1608 			/* There's data in the self-pipe, clear it. */
1609 			drainSelfPipe();
1610 
1611 			if (set->latch->is_set)
1612 			{
1613 				occurred_events->fd = PGINVALID_SOCKET;
1614 				occurred_events->events = WL_LATCH_SET;
1615 				occurred_events++;
1616 				returned_events++;
1617 			}
1618 		}
1619 		else if (cur_event->events == WL_POSTMASTER_DEATH &&
1620 				 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1621 		{
1622 			/*
1623 			 * We expect an POLLHUP when the remote end is closed, but because
1624 			 * we don't expect the pipe to become readable or to have any
1625 			 * errors either, treat those cases as postmaster death, too.
1626 			 *
1627 			 * Be paranoid about a spurious event signaling the postmaster as
1628 			 * being dead.  There have been reports about that happening with
1629 			 * older primitives (select(2) to be specific), and a spurious
1630 			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1631 			 * cost much.
1632 			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			int			errflags = POLLHUP | POLLERR | POLLNVAL;

			Assert(cur_event->fd >= PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_pollfd->revents & (POLLIN | errflags)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_pollfd->revents & (POLLOUT | errflags)))
			{
				/* writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}
	return returned_events;
}
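
/*
 * Illustrative sketch, not live code: how a caller might drive the
 * implementation above through the public wait-event-set API (see latch.h
 * for the real declarations).  Passing 0 as wait_event_info is a
 * placeholder; real callers pass a WAIT_EVENT_* constant for pg_stat
 * reporting.
 */
#if 0
static void
example_wait_for_events(pgsocket sock, long timeout_ms)
{
	WaitEventSet *set;
	WaitEvent	event;
	int			nevents;

	/* one slot per event we intend to register */
	set = CreateWaitEventSet(CurrentMemoryContext, 3);
	AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
	AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
	AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);

	/* block until something happens, then consume one report */
	nevents = WaitEventSetWait(set, timeout_ms, &event, 1, 0);
	if (nevents > 0 && (event.events & WL_LATCH_SET))
		ResetLatch(MyLatch);

	FreeWaitEventSet(set);
}
#endif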

#elif defined(WAIT_USE_WIN32)

/*
 * Wait using Windows' WaitForMultipleObjects().
 *
 * Unfortunately this will only ever return a single readiness notification at
 * a time.  Note that while the official documentation for
 * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
 * with a single bWaitAll = FALSE call,
 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
 * that only one event is "consumed".
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	DWORD		rc;
	WaitEvent  *cur_event;

	/* Reset any wait events that need it */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->reset)
		{
			WaitEventAdjustWin32(set, cur_event);
			cur_event->reset = false;
		}

		/*
		 * Windows does not guarantee to log an FD_WRITE network event
		 * indicating that more data can be sent unless the previous send()
		 * failed with WSAEWOULDBLOCK.  While our caller might well have made
		 * such a call, we cannot assume that here.  Therefore, if waiting for
		 * write-ready, force the issue by doing a dummy send().  If the dummy
		 * send() succeeds, assume that the socket is in fact write-ready, and
		 * return immediately.  Also, if it fails with something other than
		 * WSAEWOULDBLOCK, return a write-ready indication to let our caller
		 * deal with the error condition.
		 */
		if (cur_event->events & WL_SOCKET_WRITEABLE)
		{
			char		c;
			WSABUF		buf;
			DWORD		sent;
			int			r;

			buf.buf = &c;
			buf.len = 0;

			r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
			if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
			{
				occurred_events->pos = cur_event->pos;
				occurred_events->user_data = cur_event->user_data;
				occurred_events->events = WL_SOCKET_WRITEABLE;
				occurred_events->fd = cur_event->fd;
				return 1;
			}
		}
	}

	/*
	 * Sleep.
	 *
	 * Need to wait for ->nevents + 1 handles, because the signal event
	 * handle is in handles[0].
	 */
	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
								cur_timeout);

	/* Check return code */
	if (rc == WAIT_FAILED)
		elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
			 GetLastError());
	else if (rc == WAIT_TIMEOUT)
	{
		/* timeout exceeded */
		return -1;
	}

	if (rc == WAIT_OBJECT_0)
	{
		/* Service newly-arrived signals */
		pgwin32_dispatch_queued_signals();
		return 0;				/* retry */
	}

	/*
	 * With an offset of one, due to the always present pgwin32_signal_event,
	 * the handle offset directly corresponds to a wait event.
	 */
	cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];

	occurred_events->pos = cur_event->pos;
	occurred_events->user_data = cur_event->user_data;
	occurred_events->events = 0;

	if (cur_event->events == WL_LATCH_SET)
	{
		if (!ResetEvent(set->latch->event))
			elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());

		if (set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;
		}
	}
	else if (cur_event->events == WL_POSTMASTER_DEATH)
	{
		/*
		 * Postmaster apparently died.  Since the consequences of falsely
		 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
		 * the trouble to positively verify this with PostmasterIsAlive(),
		 * even though there is no known reason to think that the event could
		 * be falsely set on Windows.
		 */
		if (!PostmasterIsAliveInternal())
		{
			if (set->exit_on_postmaster_death)
				proc_exit(1);
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_POSTMASTER_DEATH;
			occurred_events++;
			returned_events++;
		}
	}
	else if (cur_event->events & WL_SOCKET_MASK)
	{
		WSANETWORKEVENTS resEvents;
		HANDLE		handle = set->handles[cur_event->pos + 1];

		Assert(cur_event->fd);

		occurred_events->fd = cur_event->fd;

		ZeroMemory(&resEvents, sizeof(resEvents));
		if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
			elog(ERROR, "failed to enumerate network events: error code %u",
				 WSAGetLastError());
		if ((cur_event->events & WL_SOCKET_READABLE) &&
			(resEvents.lNetworkEvents & FD_READ))
		{
			/* data available in socket */
			occurred_events->events |= WL_SOCKET_READABLE;

			/*------
			 * WaitForMultipleObjects doesn't guarantee that a read event will
			 * be returned if the latch is set at the same time.  Even if it
			 * did, the caller might drop that event expecting it to reoccur
			 * on the next call.  So, we must force the event to be reset if
			 * this WaitEventSet is used again in order to avoid an indefinite
			 * hang.  See https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
			 * for the behavior of socket events.
			 *------
			 */
			cur_event->reset = true;
		}
		if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
			(resEvents.lNetworkEvents & FD_WRITE))
		{
			/* writeable */
			occurred_events->events |= WL_SOCKET_WRITEABLE;
		}
		if ((cur_event->events & WL_SOCKET_CONNECTED) &&
			(resEvents.lNetworkEvents & FD_CONNECT))
		{
			/* connected */
			occurred_events->events |= WL_SOCKET_CONNECTED;
		}
		if (resEvents.lNetworkEvents & FD_CLOSE)
		{
			/* EOF/error, so signal all caller-requested socket flags */
			occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
		}

		if (occurred_events->events != 0)
		{
			occurred_events++;
			returned_events++;
		}
	}

	return returned_events;
}
#endif

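/*
 * Illustrative sketch, not live code: the canonical latch wait loop that
 * the platform-specific implementations above ultimately serve (cf. the
 * header comment in latch.h).  The latch must be reset *before* checking
 * for work; otherwise a SetLatch between the check and the reset would be
 * lost.  work_to_do() and do_stuff() are hypothetical placeholders, and 0
 * stands in for a real WAIT_EVENT_* wait_event_info value.
 */
#if 0
static void
example_main_loop(void)
{
	for (;;)
	{
		ResetLatch(MyLatch);

		if (work_to_do())		/* hypothetical condition */
			do_stuff();			/* hypothetical worker */

		(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
						 -1L, 0);
		CHECK_FOR_INTERRUPTS();
	}
}
#endif
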
/*
 * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
 *
 * Wake up WaitLatch, if we're waiting.  (We might not be, since SIGUSR1 is
 * overloaded for multiple purposes; or we might not have reached WaitLatch
 * yet, in which case we don't need to fill the pipe either.)
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it; see the sketch after this function.
 */
#ifndef WIN32
void
latch_sigusr1_handler(void)
{
	if (waiting)
		sendSelfPipeByte();
}
#endif							/* !WIN32 */
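
/*
 * Illustrative sketch, not live code: the save/restore-errno pattern the
 * NB above calls for.  The handler name is hypothetical; real handlers
 * such as procsignal_sigusr1_handler take this shape.
 */
#if 0
static void
example_sigusr1_handler(SIGNAL_ARGS)
{
	int			save_errno = errno;

	/* ... handle whatever else SIGUSR1 is overloaded for ... */

	latch_sigusr1_handler();

	errno = save_errno;
}
#endif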

/* Send one byte to the self-pipe, to wake up WaitLatch */
#ifndef WIN32
static void
sendSelfPipeByte(void)
{
	int			rc;
	char		dummy = 0;

retry:
	rc = write(selfpipe_writefd, &dummy, 1);
	if (rc < 0)
	{
		/* If interrupted by signal, just retry */
		if (errno == EINTR)
			goto retry;

		/*
		 * If the pipe is full, we don't need to retry; the data that's
		 * already there is enough to wake up WaitLatch.
		 */
		if (errno == EAGAIN || errno == EWOULDBLOCK)
			return;

		/*
		 * Oops, the write() failed for some other reason. We might be in a
		 * signal handler, so it's not safe to elog(). We have no choice but
		 * to silently ignore the error.
		 */
		return;
	}
}
#endif							/* !WIN32 */
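
/*
 * Illustrative sketch, not live code: the non-blocking pipe setup that
 * sendSelfPipeByte() above and drainSelfPipe() below rely on.  The real
 * initialization, including FD_CLOEXEC handling and EXEC_BACKEND
 * considerations, lives in InitializeLatchSupport() earlier in this file.
 */
#if 0
static void
example_init_self_pipe(void)
{
	int			pipefd[2];

	if (pipe(pipefd) < 0)
		elog(FATAL, "pipe() failed: %m");
	/* both ends must be non-blocking for the retry logic above to work */
	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");

	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];
}
#endif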

/*
 * Read all available data from the self-pipe
 *
 * Note: this is only called when waiting = true.  If it errors out (and
 * hence doesn't return), it must reset that flag first (though ideally,
 * this will never happen).
 */
#ifndef WIN32
static void
drainSelfPipe(void)
{
	/*
	 * There shouldn't normally be more than one byte in the pipe, or maybe a
	 * few bytes if multiple processes run SetLatch at the same instant.
	 */
	char		buf[16];
	int			rc;

	for (;;)
	{
		rc = read(selfpipe_readfd, buf, sizeof(buf));
		if (rc < 0)
		{
			if (errno == EAGAIN || errno == EWOULDBLOCK)
				break;			/* the pipe is empty */
			else if (errno == EINTR)
				continue;		/* retry */
			else
			{
				waiting = false;
				elog(ERROR, "read() on self-pipe failed: %m");
			}
		}
		else if (rc == 0)
		{
			waiting = false;
			elog(ERROR, "unexpected EOF on self-pipe");
		}
		else if (rc < sizeof(buf))
		{
			/* we successfully drained the pipe; no need to read() again */
			break;
		}
		/* else buffer wasn't big enough, so read again */
	}
}
#endif							/* !WIN32 */
