1 /*-------------------------------------------------------------------------
2  *
3  * latch.c
4  *	  Routines for inter-process latches
5  *
6  * The Unix implementation uses the so-called self-pipe trick to overcome the
7  * race condition involved with poll() (or epoll_wait() on linux) and setting
8  * a global flag in the signal handler. When a latch is set and the current
9  * process is waiting for it, the signal handler wakes up the poll() in
10  * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
11  * poll() on all platforms, and even on platforms where it does, a signal that
12  * arrives just before the poll() call does not prevent poll() from entering
13  * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
14  * and causes poll() to return immediately even if the signal arrives before
15  * poll() begins.
16  *
17  * When SetLatch is called from the same process that owns the latch,
18  * SetLatch writes the byte directly to the pipe. If it's owned by another
19  * process, SIGUSR1 is sent and the signal handler in the waiting process
20  * writes the byte to the pipe on behalf of the signaling process.
21  *
22  * The Windows implementation uses Windows events that are inherited by all
23  * postmaster child processes. There's no need for the self-pipe trick there.
24  *
25  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
26  * Portions Copyright (c) 1994, Regents of the University of California
27  *
28  * IDENTIFICATION
29  *	  src/backend/storage/ipc/latch.c
30  *
31  *-------------------------------------------------------------------------
32  */
33 #include "postgres.h"
34 
35 #include <fcntl.h>
36 #include <limits.h>
37 #include <signal.h>
38 #include <unistd.h>
39 #ifdef HAVE_SYS_EPOLL_H
40 #include <sys/epoll.h>
41 #endif
42 #ifdef HAVE_POLL_H
43 #include <poll.h>
44 #endif
45 
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "port/atomics.h"
49 #include "portability/instr_time.h"
50 #include "postmaster/postmaster.h"
51 #include "storage/ipc.h"
52 #include "storage/latch.h"
53 #include "storage/pmsignal.h"
54 #include "storage/shmem.h"
55 
56 /*
57  * Select the fd readiness primitive to use. Normally the "most modern"
58  * primitive supported by the OS will be used, but for testing it can be
59  * useful to manually specify the used primitive.  If desired, just add a
60  * define somewhere before this block.
61  */
62 #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
63 	defined(WAIT_USE_WIN32)
64 /* don't overwrite manual choice */
65 #elif defined(HAVE_SYS_EPOLL_H)
66 #define WAIT_USE_EPOLL
67 #elif defined(HAVE_POLL)
68 #define WAIT_USE_POLL
69 #elif WIN32
70 #define WAIT_USE_WIN32
71 #else
72 #error "no wait set implementation available"
73 #endif
74 
75 /* typedef in latch.h */
/* typedef in latch.h */
struct WaitEventSet
{
	int			nevents;		/* number of registered events */
	int			nevents_space;	/* maximum number of events in this set */

	/*
	 * Array, of nevents_space length, storing the definition of events this
	 * set is waiting for.
	 */
	WaitEvent  *events;

	/*
	 * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
	 * said latch, and latch_pos the offset in the ->events array. This is
	 * useful because we check the state of the latch before performing
	 * syscalls related to waiting.
	 */
	Latch	   *latch;
	int			latch_pos;

	/*
	 * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
	 * is set so that we'll exit immediately if postmaster death is detected,
	 * instead of returning.
	 */
	bool		exit_on_postmaster_death;

#if defined(WAIT_USE_EPOLL)
	int			epoll_fd;
	/* epoll_wait returns events in a user-provided array; allocate once */
	struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_POLL)
	/* poll expects events to be waited on every poll() call, prepare once */
	struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

	/*
	 * Array of windows events. The first element always contains
	 * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
	 * event->pos + 1).
	 */
	HANDLE	   *handles;
#endif
};
120 
#ifndef WIN32
/* Are we currently in WaitLatch? The signal handler would like to know. */
static volatile sig_atomic_t waiting = false;

/* Read and write ends of the self-pipe */
static int	selfpipe_readfd = -1;
static int	selfpipe_writefd = -1;

/* Process owning the self-pipe --- needed for checking purposes */
static int	selfpipe_owner_pid = 0;

/* Private function prototypes */
static void sendSelfPipeByte(void);
static void drainSelfPipe(void);
#endif							/* WIN32 */

/*
 * Wait-primitive-specific helper to (re)register one event with the kernel
 * facility in use; exactly one of these is compiled in.
 */
#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
#endif

static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
										WaitEvent *occurred_events, int nevents);
147 
/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 *
 * On Unix this creates the process's self-pipe; on Windows there is
 * currently nothing to do.
 */
void
InitializeLatchSupport(void)
{
#ifndef WIN32
	int			pipefd[2];

	if (IsUnderPostmaster)
	{
		/*
		 * We might have inherited connections to a self-pipe created by the
		 * postmaster.  It's critical that child processes create their own
		 * self-pipes, of course, and we really want them to close the
		 * inherited FDs for safety's sake.
		 */
		if (selfpipe_owner_pid != 0)
		{
			/* Assert we go through here but once in a child process */
			Assert(selfpipe_owner_pid != MyProcPid);
			/* Release postmaster's pipe FDs; ignore any error */
			(void) close(selfpipe_readfd);
			(void) close(selfpipe_writefd);
			/* Clean up, just for safety's sake; we'll set these below */
			selfpipe_readfd = selfpipe_writefd = -1;
			selfpipe_owner_pid = 0;
		}
		else
		{
			/*
			 * Postmaster didn't create a self-pipe ... or else we're in an
			 * EXEC_BACKEND build, in which case it doesn't matter since the
			 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
			 */
			Assert(selfpipe_readfd == -1);
		}
	}
	else
	{
		/* In postmaster or standalone backend, assert we do this but once */
		Assert(selfpipe_readfd == -1);
		Assert(selfpipe_owner_pid == 0);
	}

	/*
	 * Set up the self-pipe that allows a signal handler to wake up the
	 * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
	 * that SetLatch won't block if the event has already been set many times
	 * filling the kernel buffer. Make the read-end non-blocking too, so that
	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
	 * Also, make both FDs close-on-exec, since we surely do not want any
	 * child processes messing with them.
	 */
	if (pipe(pipefd) < 0)
		elog(FATAL, "pipe() failed: %m");
	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
	if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");

	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];
	selfpipe_owner_pid = MyProcPid;
#else
	/* currently, nothing to do here for Windows */
#endif
}
223 
224 /*
225  * Initialize a process-local latch.
226  */
227 void
InitLatch(Latch * latch)228 InitLatch(Latch *latch)
229 {
230 	latch->is_set = false;
231 	latch->owner_pid = MyProcPid;
232 	latch->is_shared = false;
233 
234 #ifndef WIN32
235 	/* Assert InitializeLatchSupport has been called in this process */
236 	Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
237 #else
238 	latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
239 	if (latch->event == NULL)
240 		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
241 #endif							/* WIN32 */
242 }
243 
244 /*
245  * Initialize a shared latch that can be set from other processes. The latch
246  * is initially owned by no-one; use OwnLatch to associate it with the
247  * current process.
248  *
249  * InitSharedLatch needs to be called in postmaster before forking child
250  * processes, usually right after allocating the shared memory block
251  * containing the latch with ShmemInitStruct. (The Unix implementation
252  * doesn't actually require that, but the Windows one does.) Because of
253  * this restriction, we have no concurrency issues to worry about here.
254  *
255  * Note that other handles created in this module are never marked as
256  * inheritable.  Thus we do not need to worry about cleaning up child
257  * process references to postmaster-private latches or WaitEventSets.
258  */
259 void
InitSharedLatch(Latch * latch)260 InitSharedLatch(Latch *latch)
261 {
262 #ifdef WIN32
263 	SECURITY_ATTRIBUTES sa;
264 
265 	/*
266 	 * Set up security attributes to specify that the events are inherited.
267 	 */
268 	ZeroMemory(&sa, sizeof(sa));
269 	sa.nLength = sizeof(sa);
270 	sa.bInheritHandle = TRUE;
271 
272 	latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
273 	if (latch->event == NULL)
274 		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
275 #endif
276 
277 	latch->is_set = false;
278 	latch->owner_pid = 0;
279 	latch->is_shared = true;
280 }
281 
282 /*
283  * Associate a shared latch with the current process, allowing it to
284  * wait on the latch.
285  *
286  * Although there is a sanity check for latch-already-owned, we don't do
287  * any sort of locking here, meaning that we could fail to detect the error
288  * if two processes try to own the same latch at about the same time.  If
289  * there is any risk of that, caller must provide an interlock to prevent it.
290  *
291  * In any process that calls OwnLatch(), make sure that
292  * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
293  * as shared latches use SIGUSR1 for inter-process communication.
294  */
295 void
OwnLatch(Latch * latch)296 OwnLatch(Latch *latch)
297 {
298 	/* Sanity checks */
299 	Assert(latch->is_shared);
300 
301 #ifndef WIN32
302 	/* Assert InitializeLatchSupport has been called in this process */
303 	Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
304 #endif
305 
306 	if (latch->owner_pid != 0)
307 		elog(ERROR, "latch already owned");
308 
309 	latch->owner_pid = MyProcPid;
310 }
311 
312 /*
313  * Disown a shared latch currently owned by the current process.
314  */
315 void
DisownLatch(Latch * latch)316 DisownLatch(Latch *latch)
317 {
318 	Assert(latch->is_shared);
319 	Assert(latch->owner_pid == MyProcPid);
320 
321 	latch->owner_pid = 0;
322 }
323 
324 /*
325  * Wait for a given latch to be set, or for postmaster death, or until timeout
326  * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
327  * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
328  * function returns immediately.
329  *
330  * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
331  * is given.  Although it is declared as "long", we don't actually support
332  * timeouts longer than INT_MAX milliseconds.  Note that some extra overhead
333  * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
334  *
335  * The latch must be owned by the current process, ie. it must be a
336  * process-local latch initialized with InitLatch, or a shared latch
337  * associated with the current process by calling OwnLatch.
338  *
339  * Returns bit mask indicating which condition(s) caused the wake-up. Note
340  * that if multiple wake-up conditions are true, there is no guarantee that
341  * we return all of them in one call, but we will return at least one.
342  */
343 int
WaitLatch(Latch * latch,int wakeEvents,long timeout,uint32 wait_event_info)344 WaitLatch(Latch *latch, int wakeEvents, long timeout,
345 		  uint32 wait_event_info)
346 {
347 	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
348 							 wait_event_info);
349 }
350 
351 /*
352  * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
353  * conditions.
354  *
355  * When waiting on a socket, EOF and error conditions always cause the socket
356  * to be reported as readable/writable/connected, so that the caller can deal
357  * with the condition.
358  *
359  * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
360  * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
361  * return value if the postmaster dies.  The latter is useful for rare cases
362  * where some behavior other than immediate exit is needed.
363  *
364  * NB: These days this is just a wrapper around the WaitEventSet API. When
365  * using a latch very frequently, consider creating a longer living
366  * WaitEventSet instead; that's more efficient.
367  */
368 int
WaitLatchOrSocket(Latch * latch,int wakeEvents,pgsocket sock,long timeout,uint32 wait_event_info)369 WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
370 				  long timeout, uint32 wait_event_info)
371 {
372 	int			ret = 0;
373 	int			rc;
374 	WaitEvent	event;
375 	WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
376 
377 	if (wakeEvents & WL_TIMEOUT)
378 		Assert(timeout >= 0);
379 	else
380 		timeout = -1;
381 
382 	if (wakeEvents & WL_LATCH_SET)
383 		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
384 						  latch, NULL);
385 
386 	/* Postmaster-managed callers must handle postmaster death somehow. */
387 	Assert(!IsUnderPostmaster ||
388 		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
389 		   (wakeEvents & WL_POSTMASTER_DEATH));
390 
391 	if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
392 		AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
393 						  NULL, NULL);
394 
395 	if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
396 		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
397 						  NULL, NULL);
398 
399 	if (wakeEvents & WL_SOCKET_MASK)
400 	{
401 		int			ev;
402 
403 		ev = wakeEvents & WL_SOCKET_MASK;
404 		AddWaitEventToSet(set, ev, sock, NULL, NULL);
405 	}
406 
407 	rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
408 
409 	if (rc == 0)
410 		ret |= WL_TIMEOUT;
411 	else
412 	{
413 		ret |= event.events & (WL_LATCH_SET |
414 							   WL_POSTMASTER_DEATH |
415 							   WL_SOCKET_MASK);
416 	}
417 
418 	FreeWaitEventSet(set);
419 
420 	return ret;
421 }
422 
/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it.  (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(Latch *latch)
{
#ifndef WIN32
	pid_t		owner_pid;
#else
	HANDLE		handle;
#endif

	/*
	 * The memory barrier has to be placed here to ensure that any flag
	 * variables possibly changed by this process have been flushed to main
	 * memory, before we check/set is_set.
	 */
	pg_memory_barrier();

	/* Quick exit if already set */
	if (latch->is_set)
		return;

	latch->is_set = true;

#ifndef WIN32

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler. We use the self-pipe to wake up the
	 * poll()/epoll_wait() in that case. If it's another process, send a
	 * signal.
	 *
	 * Fetch owner_pid only once, in case the latch is concurrently getting
	 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
	 * guaranteed to be true! In practice, the effective range of pid_t fits
	 * in a 32 bit integer, and so should be atomic. In the worst case, we
	 * might end up signaling the wrong process. Even then, you're very
	 * unlucky if a process with that bogus pid exists and belongs to
	 * Postgres; and PG database processes should handle excess SIGUSR1
	 * interrupts without a problem anyhow.
	 *
	 * Another sort of race condition that's possible here is for a new
	 * process to own the latch immediately after we look, so we don't signal
	 * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
	 * the standard coding convention of waiting at the bottom of their loops,
	 * not the top, so that they'll correctly process latch-setting events
	 * that happen before they enter the loop.
	 */
	owner_pid = latch->owner_pid;
	if (owner_pid == 0)
		return;					/* latch is currently unowned; nobody to wake */
	else if (owner_pid == MyProcPid)
	{
		/*
		 * We own the latch: if we're inside a wait ("waiting" is set by
		 * WaitEventSetWait), poke our own self-pipe so the sleeping
		 * poll()/epoll_wait() returns; otherwise is_set alone suffices.
		 */
		if (waiting)
			sendSelfPipeByte();
	}
	else
		kill(owner_pid, SIGUSR1);	/* owner's SIGUSR1 handler will write to
									 * its self-pipe on our behalf */
#else

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler.
	 *
	 * Use a local variable here just in case somebody changes the event field
	 * concurrently (which really should not happen).
	 */
	handle = latch->event;
	if (handle)
	{
		SetEvent(handle);

		/*
		 * Note that we silently ignore any errors. We might be in a signal
		 * handler or other critical path where it's not safe to call elog().
		 */
	}
#endif

}
513 
/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 */
void
ResetLatch(Latch *latch)
{
	/* Only the owner should reset the latch */
	Assert(latch->owner_pid == MyProcPid);

	latch->is_set = false;

	/*
	 * Ensure that the write to is_set gets flushed to main memory before we
	 * examine any flag variables.  Otherwise a concurrent SetLatch might
	 * falsely conclude that it needn't signal us, even though we have missed
	 * seeing some flag updates that SetLatch was supposed to inform us of.
	 * (This pairs with the barrier at the top of SetLatch.)
	 */
	pg_memory_barrier();
}
534 
535 /*
536  * Create a WaitEventSet with space for nevents different events to wait for.
537  *
538  * These events can then be efficiently waited upon together, using
539  * WaitEventSetWait().
540  */
541 WaitEventSet *
CreateWaitEventSet(MemoryContext context,int nevents)542 CreateWaitEventSet(MemoryContext context, int nevents)
543 {
544 	WaitEventSet *set;
545 	char	   *data;
546 	Size		sz = 0;
547 
548 	/*
549 	 * Use MAXALIGN size/alignment to guarantee that later uses of memory are
550 	 * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
551 	 * platforms, but earlier allocations like WaitEventSet and WaitEvent
552 	 * might not sized to guarantee that when purely using sizeof().
553 	 */
554 	sz += MAXALIGN(sizeof(WaitEventSet));
555 	sz += MAXALIGN(sizeof(WaitEvent) * nevents);
556 
557 #if defined(WAIT_USE_EPOLL)
558 	sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
559 #elif defined(WAIT_USE_POLL)
560 	sz += MAXALIGN(sizeof(struct pollfd) * nevents);
561 #elif defined(WAIT_USE_WIN32)
562 	/* need space for the pgwin32_signal_event */
563 	sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
564 #endif
565 
566 	data = (char *) MemoryContextAllocZero(context, sz);
567 
568 	set = (WaitEventSet *) data;
569 	data += MAXALIGN(sizeof(WaitEventSet));
570 
571 	set->events = (WaitEvent *) data;
572 	data += MAXALIGN(sizeof(WaitEvent) * nevents);
573 
574 #if defined(WAIT_USE_EPOLL)
575 	set->epoll_ret_events = (struct epoll_event *) data;
576 	data += MAXALIGN(sizeof(struct epoll_event) * nevents);
577 #elif defined(WAIT_USE_POLL)
578 	set->pollfds = (struct pollfd *) data;
579 	data += MAXALIGN(sizeof(struct pollfd) * nevents);
580 #elif defined(WAIT_USE_WIN32)
581 	set->handles = (HANDLE) data;
582 	data += MAXALIGN(sizeof(HANDLE) * nevents);
583 #endif
584 
585 	set->latch = NULL;
586 	set->nevents_space = nevents;
587 	set->exit_on_postmaster_death = false;
588 
589 #if defined(WAIT_USE_EPOLL)
590 #ifdef EPOLL_CLOEXEC
591 	set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
592 	if (set->epoll_fd < 0)
593 		elog(ERROR, "epoll_create1 failed: %m");
594 #else
595 	/* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
596 	set->epoll_fd = epoll_create(nevents);
597 	if (set->epoll_fd < 0)
598 		elog(ERROR, "epoll_create failed: %m");
599 	if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
600 		elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
601 #endif							/* EPOLL_CLOEXEC */
602 #elif defined(WAIT_USE_WIN32)
603 
604 	/*
605 	 * To handle signals while waiting, we need to add a win32 specific event.
606 	 * We accounted for the additional event at the top of this routine. See
607 	 * port/win32/signal.c for more details.
608 	 *
609 	 * Note: pgwin32_signal_event should be first to ensure that it will be
610 	 * reported when multiple events are set.  We want to guarantee that
611 	 * pending signals are serviced.
612 	 */
613 	set->handles[0] = pgwin32_signal_event;
614 	StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
615 #endif
616 
617 	return set;
618 }
619 
620 /*
621  * Free a previously created WaitEventSet.
622  *
623  * Note: preferably, this shouldn't have to free any resources that could be
624  * inherited across an exec().  If it did, we'd likely leak those resources in
625  * many scenarios.  For the epoll case, we ensure that by setting FD_CLOEXEC
626  * when the FD is created.  For the Windows case, we assume that the handles
627  * involved are non-inheritable.
628  */
629 void
FreeWaitEventSet(WaitEventSet * set)630 FreeWaitEventSet(WaitEventSet *set)
631 {
632 #if defined(WAIT_USE_EPOLL)
633 	close(set->epoll_fd);
634 #elif defined(WAIT_USE_WIN32)
635 	WaitEvent  *cur_event;
636 
637 	for (cur_event = set->events;
638 		 cur_event < (set->events + set->nevents);
639 		 cur_event++)
640 	{
641 		if (cur_event->events & WL_LATCH_SET)
642 		{
643 			/* uses the latch's HANDLE */
644 		}
645 		else if (cur_event->events & WL_POSTMASTER_DEATH)
646 		{
647 			/* uses PostmasterHandle */
648 		}
649 		else
650 		{
651 			/* Clean up the event object we created for the socket */
652 			WSAEventSelect(cur_event->fd, NULL, 0);
653 			WSACloseEvent(set->handles[cur_event->pos + 1]);
654 		}
655 	}
656 #endif
657 
658 	pfree(set);
659 }
660 
/* ---
 * Add an event to the set. Possible events are:
 * - WL_LATCH_SET: Wait for the latch to be set
 * - WL_POSTMASTER_DEATH: Wait for postmaster to die
 * - WL_SOCKET_READABLE: Wait for socket to become readable,
 *	 can be combined in one event with other WL_SOCKET_* events
 * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
 *	 can be combined with other WL_SOCKET_* events
 * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
 *	 can be combined with other WL_SOCKET_* events (on non-Windows
 *	 platforms, this is the same as WL_SOCKET_WRITEABLE)
 * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
 *
 * Returns the offset in WaitEventSet->events (starting from 0), which can be
 * used to modify previously added wait events using ModifyWaitEvent().
 *
 * In the WL_LATCH_SET case the latch must be owned by the current process,
 * i.e. it must be a process-local latch initialized with InitLatch, or a
 * shared latch associated with the current process by calling OwnLatch.
 *
 * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
 * conditions cause the socket to be reported as readable/writable/connected,
 * so that the caller can deal with the condition.
 *
 * The user_data pointer specified here will be set for the events returned
 * by WaitEventSetWait(), allowing to easily associate additional data with
 * events.
 */
int
AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
				  void *user_data)
{
	WaitEvent  *event;

	/* not enough space */
	Assert(set->nevents < set->nevents_space);

	/*
	 * WL_EXIT_ON_PM_DEATH is just WL_POSTMASTER_DEATH plus a
	 * set-wide instruction to exit rather than report the event.
	 */
	if (events == WL_EXIT_ON_PM_DEATH)
	{
		events = WL_POSTMASTER_DEATH;
		set->exit_on_postmaster_death = true;
	}

	if (latch)
	{
		if (latch->owner_pid != MyProcPid)
			elog(ERROR, "cannot wait on a latch owned by another process");
		if (set->latch)
			elog(ERROR, "cannot wait on more than one latch");
		if ((events & WL_LATCH_SET) != WL_LATCH_SET)
			elog(ERROR, "latch events only support being set");
	}
	else
	{
		if (events & WL_LATCH_SET)
			elog(ERROR, "cannot wait on latch without a specified latch");
	}

	/* waiting for socket readiness without a socket indicates a bug */
	if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
		elog(ERROR, "cannot wait on socket event without a socket");

	/* claim the next free slot and fill it in */
	event = &set->events[set->nevents];
	event->pos = set->nevents++;
	event->fd = fd;
	event->events = events;
	event->user_data = user_data;
#ifdef WIN32
	event->reset = false;
#endif

	if (events == WL_LATCH_SET)
	{
		set->latch = latch;
		set->latch_pos = event->pos;
#ifndef WIN32
		/* on Unix the latch is watched via the self-pipe's read end */
		event->fd = selfpipe_readfd;
#endif
	}
	else if (events == WL_POSTMASTER_DEATH)
	{
#ifndef WIN32
		/* postmaster death is detected via its life-sign pipe */
		event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
#endif
	}

	/* perform wait primitive specific initialization, if needed */
#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
	WaitEventAdjustWin32(set, event);
#endif

	return event->pos;
}
758 
/*
 * Change the event mask and, in the WL_LATCH_SET case, the latch associated
 * with the WaitEvent.
 *
 * 'pos' is the id returned by AddWaitEventToSet.
 *
 * Latch and postmaster-death events cannot have their mask changed; only
 * socket events (and the latch pointer itself) may be modified.
 */
void
ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
{
	WaitEvent  *event;

	Assert(pos < set->nevents);

	event = &set->events[pos];

	/*
	 * If neither the event mask nor the associated latch changes, return
	 * early. That's an important optimization for some sockets, where
	 * ModifyWaitEvent is frequently used to switch from waiting for reads to
	 * waiting on writes.
	 */
	if (events == event->events &&
		(!(event->events & WL_LATCH_SET) || set->latch == latch))
		return;

	if (event->events & WL_LATCH_SET &&
		events != event->events)
	{
		/* we could allow to disable latch events for a while */
		elog(ERROR, "cannot modify latch event");
	}

	if (event->events & WL_POSTMASTER_DEATH)
	{
		elog(ERROR, "cannot modify postmaster death event");
	}

	/* FIXME: validate event mask */
	event->events = events;

	if (events == WL_LATCH_SET)
	{
		/* swap in the new latch; mask was unchanged per the check above */
		set->latch = latch;
	}

	/* push the updated registration down to the wait primitive */
#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
	WaitEventAdjustWin32(set, event);
#endif
}
812 
#if defined(WAIT_USE_EPOLL)
/*
 * Register or update one event's registration in the set's epoll instance.
 *
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
	struct epoll_event ep_ev;
	int			rc;

	/* stash our event pointer; epoll_wait hands it back to us */
	ep_ev.data.ptr = event;
	/* always wait for errors */
	ep_ev.events = EPOLLERR | EPOLLHUP;

	if (event->events == WL_LATCH_SET ||
		event->events == WL_POSTMASTER_DEATH)
	{
		/* self-pipe or postmaster life-sign pipe: readability suffices */
		Assert(event->events != WL_LATCH_SET || set->latch != NULL);
		ep_ev.events |= EPOLLIN;
	}
	else
	{
		/* a socket event: translate the WL_* mask to epoll flags */
		Assert(event->fd != PGINVALID_SOCKET);
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));

		if (event->events & WL_SOCKET_READABLE)
			ep_ev.events |= EPOLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			ep_ev.events |= EPOLLOUT;
	}

	/*
	 * Even though unused, we also pass epoll_ev as the data argument if
	 * EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
	 * requiring that, and actually it makes the code simpler...
	 */
	rc = epoll_ctl(set->epoll_fd, action, event->fd, &ep_ev);

	if (rc < 0)
		ereport(ERROR,
				(errcode_for_socket_access(),
		/* translator: %s is a syscall name, such as "poll()" */
				 errmsg("%s failed: %m",
						"epoll_ctl()")));
}
#endif
864 
#if defined(WAIT_USE_POLL)
/*
 * (Re)build the pollfd entry corresponding to one event; poll() reuses the
 * prepared array on every call.
 */
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
	struct pollfd *pfd = &set->pollfds[event->pos];
	short		request = 0;

	pfd->revents = 0;
	pfd->fd = event->fd;

	if (event->events == WL_LATCH_SET ||
		event->events == WL_POSTMASTER_DEATH)
	{
		/* self-pipe or postmaster life-sign pipe: readability suffices */
		Assert(event->events != WL_LATCH_SET || set->latch != NULL);
		request = POLLIN;
	}
	else
	{
		/* a socket event: translate the WL_* mask to poll flags */
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
		if (event->events & WL_SOCKET_READABLE)
			request |= POLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			request |= POLLOUT;
	}

	pfd->events = request;

	Assert(event->fd != PGINVALID_SOCKET);
}
#endif
897 
#if defined(WAIT_USE_WIN32)
/*
 * Set up the Windows HANDLE for one event.
 *
 * Handle slots are offset by one because handles[0] always holds
 * pgwin32_signal_event (see CreateWaitEventSet).
 */
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
	HANDLE	   *handle = &set->handles[event->pos + 1];

	if (event->events == WL_LATCH_SET)
	{
		/* latch wakeups reuse the latch's own event handle */
		Assert(set->latch != NULL);
		*handle = set->latch->event;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		/* postmaster death is observed via its process handle */
		*handle = PostmasterHandle;
	}
	else
	{
		int			flags = FD_CLOSE;	/* always check for errors/EOF */

		if (event->events & WL_SOCKET_READABLE)
			flags |= FD_READ;
		if (event->events & WL_SOCKET_WRITEABLE)
			flags |= FD_WRITE;
		if (event->events & WL_SOCKET_CONNECTED)
			flags |= FD_CONNECT;

		/* create the per-socket event object on first use, then (re)select */
		if (*handle == WSA_INVALID_EVENT)
		{
			*handle = WSACreateEvent();
			if (*handle == WSA_INVALID_EVENT)
				elog(ERROR, "failed to create event for socket: error code %u",
					 WSAGetLastError());
		}
		if (WSAEventSelect(event->fd, *handle, flags) != 0)
			elog(ERROR, "failed to set up event for socket: error code %u",
				 WSAGetLastError());

		Assert(event->fd != PGINVALID_SOCKET);
	}
}
#endif
939 
940 /*
941  * Wait for events added to the set to happen, or until the timeout is
942  * reached.  At most nevents occurred events are returned.
943  *
944  * If timeout = -1, block until an event occurs; if 0, check sockets for
945  * readiness, but don't block; if > 0, block for at most timeout milliseconds.
946  *
947  * Returns the number of events occurred, or 0 if the timeout was reached.
948  *
949  * Returned events will have the fd, pos, user_data fields set to the
950  * values associated with the registered event.
951  */
int
WaitEventSetWait(WaitEventSet *set, long timeout,
				 WaitEvent *occurred_events, int nevents,
				 uint32 wait_event_info)
{
	int			returned_events = 0;
	instr_time	start_time;
	instr_time	cur_time;
	long		cur_timeout = -1;	/* -1 means block indefinitely */

	Assert(nevents > 0);

	/*
	 * Initialize timeout if requested.  We must record the current time so
	 * that we can determine the remaining timeout if interrupted.
	 */
	if (timeout >= 0)
	{
		INSTR_TIME_SET_CURRENT(start_time);
		Assert(timeout >= 0 && timeout <= INT_MAX);
		cur_timeout = timeout;
	}

	pgstat_report_wait_start(wait_event_info);

#ifndef WIN32
	/* tell latch_sigusr1_handler() to wake us via the self-pipe */
	waiting = true;
#else
	/* Ensure that signals are serviced even if latch is already set */
	pgwin32_dispatch_queued_signals();
#endif
	/* loop until we have at least one event to report, or time out */
	while (returned_events == 0)
	{
		int			rc;

		/*
		 * Check if the latch is set already. If so, leave the loop
		 * immediately, avoid blocking again. We don't attempt to report any
		 * other events that might also be satisfied.
		 *
		 * If someone sets the latch between this and the
		 * WaitEventSetWaitBlock() below, the setter will write a byte to the
		 * pipe (or signal us and the signal handler will do that), and the
		 * readiness routine will return immediately.
		 *
		 * On Unix, if there's a pending byte in the self pipe, we'll notice
		 * whenever blocking. Only clearing the pipe in that case avoids
		 * having to drain it every time WaitLatchOrSocket() is used. Should
		 * the pipe-buffer fill up we're still ok, because the pipe is in
		 * nonblocking mode. It's unlikely for that to happen, because the
		 * self pipe isn't filled unless we're blocking (waiting = true), or
		 * from inside a signal handler in latch_sigusr1_handler().
		 *
		 * On Windows, we'll also notice if there's a pending event for the
		 * latch when blocking, but there's no danger of anything filling up,
		 * as "Setting an event that is already set has no effect.".
		 *
		 * Note: we assume that the kernel calls involved in latch management
		 * will provide adequate synchronization on machines with weak memory
		 * ordering, so that we cannot miss seeing is_set if a notification
		 * has already been queued.
		 */
		if (set->latch && set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->pos = set->latch_pos;
			occurred_events->user_data =
				set->events[set->latch_pos].user_data;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;

			break;
		}

		/*
		 * Wait for events using the readiness primitive chosen at the top of
		 * this file. If -1 is returned, a timeout has occurred, if 0 we have
		 * to retry, everything >= 1 is the number of returned events.
		 */
		rc = WaitEventSetWaitBlock(set, cur_timeout,
								   occurred_events, nevents);

		if (rc == -1)
			break;				/* timeout occurred */
		else
			returned_events = rc;

		/* If we're not done, update cur_timeout for next iteration */
		if (returned_events == 0 && timeout >= 0)
		{
			INSTR_TIME_SET_CURRENT(cur_time);
			INSTR_TIME_SUBTRACT(cur_time, start_time);
			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
			if (cur_timeout <= 0)
				break;
		}
	}
#ifndef WIN32
	waiting = false;
#endif

	pgstat_report_wait_end();

	return returned_events;
}
1058 
1059 
1060 #if defined(WAIT_USE_EPOLL)
1061 
1062 /*
1063  * Wait using linux's epoll_wait(2).
1064  *
1065  * This is the preferable wait method, as several readiness notifications are
1066  * delivered, without having to iterate through all of set->events. The return
1067  * epoll_event struct contain a pointer to our events, making association
1068  * easy.
1069  */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct epoll_event *cur_epoll_event;

	/* Sleep */
	rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
					nevents, cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
			/* translator: %s is a syscall name, such as "poll()" */
					 errmsg("%s failed: %m",
							"epoll_wait()")));
		}
		return 0;				/* interrupted: caller retries */
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * At least one event occurred, iterate over the returned epoll events
	 * until they're either all processed, or we've returned all the events
	 * the caller desired.
	 */
	for (cur_epoll_event = set->epoll_ret_events;
		 cur_epoll_event < (set->epoll_ret_events + rc) &&
		 returned_events < nevents;
		 cur_epoll_event++)
	{
		/* epoll's data pointer is set to the associated WaitEvent */
		cur_event = (WaitEvent *) cur_epoll_event->data.ptr;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			/* only report the latch if it is actually set (spurious wakeup otherwise) */
			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/*
			 * We expect an EPOLLHUP when the remote end is closed, but
			 * because we don't expect the pipe to become readable or to have
			 * any errors either, treat those cases as postmaster death, too.
			 *
			 * Be paranoid about a spurious event signalling the postmaster as
			 * being dead.  There have been reports about that happening with
			 * older primitives (select(2) to be specific), and a spurious
			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
			 * cost much.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
			{
				/* writable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			/* advance only if we matched something the caller asked for */
			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
1188 
1189 #elif defined(WAIT_USE_POLL)
1190 
1191 /*
1192  * Wait using poll(2).
1193  *
1194  * This allows to receive readiness notifications for several events at once,
1195  * but requires iterating through all of set->pollfds.
1196  */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct pollfd *cur_pollfd;

	/* Sleep */
	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
			/* translator: %s is a syscall name, such as "poll()" */
					 errmsg("%s failed: %m",
							"poll()")));
		}
		return 0;				/* interrupted: caller retries */
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/* events and pollfds are parallel arrays; walk them in lockstep */
	for (cur_event = set->events, cur_pollfd = set->pollfds;
		 cur_event < (set->events + set->nevents) &&
		 returned_events < nevents;
		 cur_event++, cur_pollfd++)
	{
		/* no activity on this FD, skip */
		if (cur_pollfd->revents == 0)
			continue;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			(cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			/* only report the latch if it is actually set (spurious wakeup otherwise) */
			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/*
			 * We expect an POLLHUP when the remote end is closed, but because
			 * we don't expect the pipe to become readable or to have any
			 * errors either, treat those cases as postmaster death, too.
			 *
			 * Be paranoid about a spurious event signalling the postmaster as
			 * being dead.  There have been reports about that happening with
			 * older primitives (select(2) to be specific), and a spurious
			 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
			 * cost much.
			 */
			if (!PostmasterIsAliveInternal())
			{
				if (set->exit_on_postmaster_death)
					proc_exit(1);
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			int			errflags = POLLHUP | POLLERR | POLLNVAL;

			/*
			 * NOTE(review): the epoll variant asserts fd != PGINVALID_SOCKET
			 * here; ">=" is trivially true where PGINVALID_SOCKET is -1 —
			 * confirm whether "!=" was intended.
			 */
			Assert(cur_event->fd >= PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_pollfd->revents & (POLLIN | errflags)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_pollfd->revents & (POLLOUT | errflags)))
			{
				/* writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			/* advance only if we matched something the caller asked for */
			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}
	return returned_events;
}
1311 
1312 #elif defined(WAIT_USE_WIN32)
1313 
1314 /*
1315  * Wait using Windows' WaitForMultipleObjects().
1316  *
1317  * Unfortunately this will only ever return a single readiness notification at
1318  * a time.  Note that while the official documentation for
1319  * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
1320  * with a single bWaitAll = FALSE call,
1321  * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
1322  * that only one event is "consumed".
1323  */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	DWORD		rc;
	WaitEvent  *cur_event;

	/* Reset any wait events that need it */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->reset)
		{
			WaitEventAdjustWin32(set, cur_event);
			cur_event->reset = false;
		}

		/*
		 * Windows does not guarantee to log an FD_WRITE network event
		 * indicating that more data can be sent unless the previous send()
		 * failed with WSAEWOULDBLOCK.  While our caller might well have made
		 * such a call, we cannot assume that here.  Therefore, if waiting for
		 * write-ready, force the issue by doing a dummy send().  If the dummy
		 * send() succeeds, assume that the socket is in fact write-ready, and
		 * return immediately.  Also, if it fails with something other than
		 * WSAEWOULDBLOCK, return a write-ready indication to let our caller
		 * deal with the error condition.
		 */
		if (cur_event->events & WL_SOCKET_WRITEABLE)
		{
			char		c;
			WSABUF		buf;
			DWORD		sent;
			int			r;

			/* zero-length send: probes writability without transmitting data */
			buf.buf = &c;
			buf.len = 0;

			r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
			if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
			{
				occurred_events->pos = cur_event->pos;
				occurred_events->user_data = cur_event->user_data;
				occurred_events->events = WL_SOCKET_WRITEABLE;
				occurred_events->fd = cur_event->fd;
				return 1;
			}
		}
	}

	/*
	 * Sleep.
	 *
	 * Need to wait for ->nevents + 1, because signal handle is in [0].
	 */
	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
								cur_timeout);

	/* Check return code */
	if (rc == WAIT_FAILED)
		elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
			 GetLastError());
	else if (rc == WAIT_TIMEOUT)
	{
		/* timeout exceeded */
		return -1;
	}

	if (rc == WAIT_OBJECT_0)
	{
		/* Service newly-arrived signals */
		pgwin32_dispatch_queued_signals();
		return 0;				/* retry */
	}

	/*
	 * With an offset of one, due to the always present pgwin32_signal_event,
	 * the handle offset directly corresponds to a wait event.
	 */
	cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];

	occurred_events->pos = cur_event->pos;
	occurred_events->user_data = cur_event->user_data;
	occurred_events->events = 0;

	if (cur_event->events == WL_LATCH_SET)
	{
		if (!ResetEvent(set->latch->event))
			elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());

		/* only report the latch if it is actually set (spurious wakeup otherwise) */
		if (set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;
		}
	}
	else if (cur_event->events == WL_POSTMASTER_DEATH)
	{
		/*
		 * Postmaster apparently died.  Since the consequences of falsely
		 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
		 * the trouble to positively verify this with PostmasterIsAlive(),
		 * even though there is no known reason to think that the event could
		 * be falsely set on Windows.
		 */
		if (!PostmasterIsAliveInternal())
		{
			if (set->exit_on_postmaster_death)
				proc_exit(1);
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_POSTMASTER_DEATH;
			occurred_events++;
			returned_events++;
		}
	}
	else if (cur_event->events & WL_SOCKET_MASK)
	{
		WSANETWORKEVENTS resEvents;
		HANDLE		handle = set->handles[cur_event->pos + 1];

		Assert(cur_event->fd);

		occurred_events->fd = cur_event->fd;

		/* ask winsock which network events fired on this socket */
		ZeroMemory(&resEvents, sizeof(resEvents));
		if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
			elog(ERROR, "failed to enumerate network events: error code %u",
				 WSAGetLastError());
		if ((cur_event->events & WL_SOCKET_READABLE) &&
			(resEvents.lNetworkEvents & FD_READ))
		{
			/* data available in socket */
			occurred_events->events |= WL_SOCKET_READABLE;

			/*------
			 * WaitForMultipleObjects doesn't guarantee that a read event will
			 * be returned if the latch is set at the same time.  Even if it
			 * did, the caller might drop that event expecting it to reoccur
			 * on next call.  So, we must force the event to be reset if this
			 * WaitEventSet is used again in order to avoid an indefinite
			 * hang.  Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
			 * for the behavior of socket events.
			 *------
			 */
			cur_event->reset = true;
		}
		if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
			(resEvents.lNetworkEvents & FD_WRITE))
		{
			/* writeable */
			occurred_events->events |= WL_SOCKET_WRITEABLE;
		}
		if ((cur_event->events & WL_SOCKET_CONNECTED) &&
			(resEvents.lNetworkEvents & FD_CONNECT))
		{
			/* connected */
			occurred_events->events |= WL_SOCKET_CONNECTED;
		}
		if (resEvents.lNetworkEvents & FD_CLOSE)
		{
			/* EOF/error, so signal all caller-requested socket flags */
			occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
		}

		if (occurred_events->events != 0)
		{
			occurred_events++;
			returned_events++;
		}
	}

	return returned_events;
}
1501 #endif
1502 
1503 /*
1504  * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
1505  *
1506  * Wake up WaitLatch, if we're waiting.  (We might not be, since SIGUSR1 is
1507  * overloaded for multiple purposes; or we might not have reached WaitLatch
1508  * yet, in which case we don't need to fill the pipe either.)
1509  *
1510  * NB: when calling this in a signal handler, be sure to save and restore
1511  * errno around it.
1512  */
1513 #ifndef WIN32
void
latch_sigusr1_handler(void)
{
	/* Only write to the self-pipe if a wait loop is actually in progress. */
	if (waiting)
		sendSelfPipeByte();
}
1520 #endif							/* !WIN32 */
1521 
1522 /* Send one byte to the self-pipe, to wake up WaitLatch */
1523 #ifndef WIN32
1524 static void
sendSelfPipeByte(void)1525 sendSelfPipeByte(void)
1526 {
1527 	int			rc;
1528 	char		dummy = 0;
1529 
1530 retry:
1531 	rc = write(selfpipe_writefd, &dummy, 1);
1532 	if (rc < 0)
1533 	{
1534 		/* If interrupted by signal, just retry */
1535 		if (errno == EINTR)
1536 			goto retry;
1537 
1538 		/*
1539 		 * If the pipe is full, we don't need to retry, the data that's there
1540 		 * already is enough to wake up WaitLatch.
1541 		 */
1542 		if (errno == EAGAIN || errno == EWOULDBLOCK)
1543 			return;
1544 
1545 		/*
1546 		 * Oops, the write() failed for some other reason. We might be in a
1547 		 * signal handler, so it's not safe to elog(). We have no choice but
1548 		 * silently ignore the error.
1549 		 */
1550 		return;
1551 	}
1552 }
1553 #endif							/* !WIN32 */
1554 
1555 /*
1556  * Read all available data from the self-pipe
1557  *
1558  * Note: this is only called when waiting = true.  If it fails and doesn't
1559  * return, it must reset that flag first (though ideally, this will never
1560  * happen).
1561  */
1562 #ifndef WIN32
1563 static void
drainSelfPipe(void)1564 drainSelfPipe(void)
1565 {
1566 	/*
1567 	 * There shouldn't normally be more than one byte in the pipe, or maybe a
1568 	 * few bytes if multiple processes run SetLatch at the same instant.
1569 	 */
1570 	char		buf[16];
1571 	int			rc;
1572 
1573 	for (;;)
1574 	{
1575 		rc = read(selfpipe_readfd, buf, sizeof(buf));
1576 		if (rc < 0)
1577 		{
1578 			if (errno == EAGAIN || errno == EWOULDBLOCK)
1579 				break;			/* the pipe is empty */
1580 			else if (errno == EINTR)
1581 				continue;		/* retry */
1582 			else
1583 			{
1584 				waiting = false;
1585 				elog(ERROR, "read() on self-pipe failed: %m");
1586 			}
1587 		}
1588 		else if (rc == 0)
1589 		{
1590 			waiting = false;
1591 			elog(ERROR, "unexpected EOF on self-pipe");
1592 		}
1593 		else if (rc < sizeof(buf))
1594 		{
1595 			/* we successfully drained the pipe; no need to read() again */
1596 			break;
1597 		}
1598 		/* else buffer wasn't big enough, so read again */
1599 	}
1600 }
1601 #endif							/* !WIN32 */
1602