1 /*-------------------------------------------------------------------------
2 *
3 * latch.c
4 * Routines for inter-process latches
5 *
6 * The Unix implementation uses the so-called self-pipe trick to overcome the
7 * race condition involved with poll() (or epoll_wait() on linux) and setting
8 * a global flag in the signal handler. When a latch is set and the current
9 * process is waiting for it, the signal handler wakes up the poll() in
10 * WaitLatch by writing a byte to a pipe. A signal by itself doesn't interrupt
11 * poll() on all platforms, and even on platforms where it does, a signal that
12 * arrives just before the poll() call does not prevent poll() from entering
13 * sleep. An incoming byte on a pipe however reliably interrupts the sleep,
14 * and causes poll() to return immediately even if the signal arrives before
15 * poll() begins.
16 *
17 * When SetLatch is called from the same process that owns the latch,
18 * SetLatch writes the byte directly to the pipe. If it's owned by another
19 * process, SIGUSR1 is sent and the signal handler in the waiting process
20 * writes the byte to the pipe on behalf of the signaling process.
21 *
22 * The Windows implementation uses Windows events that are inherited by all
23 * postmaster child processes. There's no need for the self-pipe trick there.
24 *
25 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
26 * Portions Copyright (c) 1994, Regents of the University of California
27 *
28 * IDENTIFICATION
29 * src/backend/storage/ipc/latch.c
30 *
31 *-------------------------------------------------------------------------
32 */
33 #include "postgres.h"
34
35 #include <fcntl.h>
36 #include <limits.h>
37 #include <signal.h>
38 #include <unistd.h>
39 #ifdef HAVE_SYS_EPOLL_H
40 #include <sys/epoll.h>
41 #endif
42 #ifdef HAVE_POLL_H
43 #include <poll.h>
44 #endif
45
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "port/atomics.h"
49 #include "portability/instr_time.h"
50 #include "postmaster/postmaster.h"
51 #include "storage/latch.h"
52 #include "storage/pmsignal.h"
53 #include "storage/shmem.h"
54
55 /*
56 * Select the fd readiness primitive to use. Normally the "most modern"
57 * primitive supported by the OS will be used, but for testing it can be
58 * useful to manually specify the used primitive. If desired, just add a
59 * define somewhere before this block.
60 */
61 #if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
62 defined(WAIT_USE_WIN32)
63 /* don't overwrite manual choice */
64 #elif defined(HAVE_SYS_EPOLL_H)
65 #define WAIT_USE_EPOLL
66 #elif defined(HAVE_POLL)
67 #define WAIT_USE_POLL
68 #elif WIN32
69 #define WAIT_USE_WIN32
70 #else
71 #error "no wait set implementation available"
72 #endif
73
/* typedef in latch.h */
struct WaitEventSet
{
	int			nevents;		/* number of registered events */
	int			nevents_space;	/* maximum number of events in this set */

	/*
	 * Array, of nevents_space length, storing the definition of events this
	 * set is waiting for.
	 */
	WaitEvent  *events;

	/*
	 * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
	 * said latch, and latch_pos the offset in the ->events array.  This is
	 * useful because we check the state of the latch before performing any
	 * syscalls related to waiting.
	 */
	Latch	   *latch;
	int			latch_pos;

#if defined(WAIT_USE_EPOLL)
	int			epoll_fd;		/* epoll descriptor for this set */
	/* epoll_wait returns events in a user provided array, allocate once */
	struct epoll_event *epoll_ret_events;
#elif defined(WAIT_USE_POLL)
	/* poll expects events to be waited on every poll() call, prepare once */
	struct pollfd *pollfds;
#elif defined(WAIT_USE_WIN32)

	/*
	 * Array of windows events.  The first element always contains
	 * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
	 * event->pos + 1).
	 */
	HANDLE	   *handles;
#endif
};
112
#ifndef WIN32
/* Are we currently in WaitLatch? The signal handler would like to know. */
static volatile sig_atomic_t waiting = false;

/* Read and write ends of the self-pipe; -1 until InitializeLatchSupport */
static int	selfpipe_readfd = -1;
static int	selfpipe_writefd = -1;

/* Process owning the self-pipe --- needed for checking purposes */
static int	selfpipe_owner_pid = 0;

/* Private function prototypes */
static void sendSelfPipeByte(void);
static void drainSelfPipe(void);
#endif							/* WIN32 */

/* Wait-primitive-specific helpers; exactly one of these is compiled in */
#if defined(WAIT_USE_EPOLL)
static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
#elif defined(WAIT_USE_POLL)
static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
#elif defined(WAIT_USE_WIN32)
static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
#endif

static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents);
139
/*
 * Initialize the process-local latch infrastructure.
 *
 * This must be called once during startup of any process that can wait on
 * latches, before it issues any InitLatch() or OwnLatch() calls.
 */
void
InitializeLatchSupport(void)
{
#ifndef WIN32
	int			pipefd[2];

	if (IsUnderPostmaster)
	{
		/*
		 * We might have inherited connections to a self-pipe created by the
		 * postmaster.  It's critical that child processes create their own
		 * self-pipes, of course, and we really want them to close the
		 * inherited FDs for safety's sake.
		 */
		if (selfpipe_owner_pid != 0)
		{
			/* Assert we go through here but once in a child process */
			Assert(selfpipe_owner_pid != MyProcPid);
			/* Release postmaster's pipe FDs; ignore any error */
			(void) close(selfpipe_readfd);
			(void) close(selfpipe_writefd);
			/* Clean up, just for safety's sake; we'll set these below */
			selfpipe_readfd = selfpipe_writefd = -1;
			selfpipe_owner_pid = 0;
		}
		else
		{
			/*
			 * Postmaster didn't create a self-pipe ... or else we're in an
			 * EXEC_BACKEND build, in which case it doesn't matter since the
			 * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
			 */
			Assert(selfpipe_readfd == -1);
		}
	}
	else
	{
		/* In postmaster or standalone backend, assert we do this but once */
		Assert(selfpipe_readfd == -1);
		Assert(selfpipe_owner_pid == 0);
	}

	/*
	 * Set up the self-pipe that allows a signal handler to wake up the
	 * poll()/epoll_wait() in WaitLatch.  Make the write-end non-blocking, so
	 * that SetLatch won't block if the event has already been set many times
	 * filling the kernel buffer.  Make the read-end non-blocking too, so that
	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
	 * Also, make both FDs close-on-exec, since we surely do not want any
	 * child processes messing with them.
	 */
	if (pipe(pipefd) < 0)
		elog(FATAL, "pipe() failed: %m");
	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
		elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
	if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
	if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
		elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");

	selfpipe_readfd = pipefd[0];
	selfpipe_writefd = pipefd[1];
	selfpipe_owner_pid = MyProcPid;
#else
	/* currently, nothing to do here for Windows */
#endif
}
215
216 /*
217 * Initialize a process-local latch.
218 */
219 void
InitLatch(volatile Latch * latch)220 InitLatch(volatile Latch *latch)
221 {
222 latch->is_set = false;
223 latch->owner_pid = MyProcPid;
224 latch->is_shared = false;
225
226 #ifndef WIN32
227 /* Assert InitializeLatchSupport has been called in this process */
228 Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
229 #else
230 latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
231 if (latch->event == NULL)
232 elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
233 #endif /* WIN32 */
234 }
235
/*
 * Initialize a shared latch that can be set from other processes. The latch
 * is initially owned by no-one; use OwnLatch to associate it with the
 * current process.
 *
 * InitSharedLatch needs to be called in postmaster before forking child
 * processes, usually right after allocating the shared memory block
 * containing the latch with ShmemInitStruct. (The Unix implementation
 * doesn't actually require that, but the Windows one does.) Because of
 * this restriction, we have no concurrency issues to worry about here.
 *
 * Note that other handles created in this module are never marked as
 * inheritable. Thus we do not need to worry about cleaning up child
 * process references to postmaster-private latches or WaitEventSets.
 */
void
InitSharedLatch(volatile Latch *latch)
{
#ifdef WIN32
	SECURITY_ATTRIBUTES sa;

	/*
	 * Set up security attributes to specify that the events are inherited.
	 * (This is why the function must run before forking children: they must
	 * inherit this event handle.)
	 */
	ZeroMemory(&sa, sizeof(sa));
	sa.nLength = sizeof(sa);
	sa.bInheritHandle = TRUE;

	latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
	if (latch->event == NULL)
		elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
#endif

	/* No owner yet; OwnLatch() will claim it */
	latch->is_set = false;
	latch->owner_pid = 0;
	latch->is_shared = true;
}
273
274 /*
275 * Associate a shared latch with the current process, allowing it to
276 * wait on the latch.
277 *
278 * Although there is a sanity check for latch-already-owned, we don't do
279 * any sort of locking here, meaning that we could fail to detect the error
280 * if two processes try to own the same latch at about the same time. If
281 * there is any risk of that, caller must provide an interlock to prevent it.
282 *
283 * In any process that calls OwnLatch(), make sure that
284 * latch_sigusr1_handler() is called from the SIGUSR1 signal handler,
285 * as shared latches use SIGUSR1 for inter-process communication.
286 */
287 void
OwnLatch(volatile Latch * latch)288 OwnLatch(volatile Latch *latch)
289 {
290 /* Sanity checks */
291 Assert(latch->is_shared);
292
293 #ifndef WIN32
294 /* Assert InitializeLatchSupport has been called in this process */
295 Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
296 #endif
297
298 if (latch->owner_pid != 0)
299 elog(ERROR, "latch already owned");
300
301 latch->owner_pid = MyProcPid;
302 }
303
304 /*
305 * Disown a shared latch currently owned by the current process.
306 */
307 void
DisownLatch(volatile Latch * latch)308 DisownLatch(volatile Latch *latch)
309 {
310 Assert(latch->is_shared);
311 Assert(latch->owner_pid == MyProcPid);
312
313 latch->owner_pid = 0;
314 }
315
316 /*
317 * Wait for a given latch to be set, or for postmaster death, or until timeout
318 * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
319 * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
320 * function returns immediately.
321 *
322 * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
323 * is given. Although it is declared as "long", we don't actually support
324 * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
325 * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
326 *
327 * The latch must be owned by the current process, ie. it must be a
328 * process-local latch initialized with InitLatch, or a shared latch
329 * associated with the current process by calling OwnLatch.
330 *
331 * Returns bit mask indicating which condition(s) caused the wake-up. Note
332 * that if multiple wake-up conditions are true, there is no guarantee that
333 * we return all of them in one call, but we will return at least one.
334 */
335 int
WaitLatch(volatile Latch * latch,int wakeEvents,long timeout,uint32 wait_event_info)336 WaitLatch(volatile Latch *latch, int wakeEvents, long timeout,
337 uint32 wait_event_info)
338 {
339 return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
340 wait_event_info);
341 }
342
343 /*
344 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
345 * conditions.
346 *
347 * When waiting on a socket, EOF and error conditions always cause the socket
348 * to be reported as readable/writable/connected, so that the caller can deal
349 * with the condition.
350 *
351 * NB: These days this is just a wrapper around the WaitEventSet API. When
352 * using a latch very frequently, consider creating a longer living
353 * WaitEventSet instead; that's more efficient.
354 */
355 int
WaitLatchOrSocket(volatile Latch * latch,int wakeEvents,pgsocket sock,long timeout,uint32 wait_event_info)356 WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
357 long timeout, uint32 wait_event_info)
358 {
359 int ret = 0;
360 int rc;
361 WaitEvent event;
362 WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
363
364 if (wakeEvents & WL_TIMEOUT)
365 Assert(timeout >= 0);
366 else
367 timeout = -1;
368
369 if (wakeEvents & WL_LATCH_SET)
370 AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
371 (Latch *) latch, NULL);
372
373 if (wakeEvents & WL_POSTMASTER_DEATH && IsUnderPostmaster)
374 AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
375 NULL, NULL);
376
377 if (wakeEvents & WL_SOCKET_MASK)
378 {
379 int ev;
380
381 ev = wakeEvents & WL_SOCKET_MASK;
382 AddWaitEventToSet(set, ev, sock, NULL, NULL);
383 }
384
385 rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
386
387 if (rc == 0)
388 ret |= WL_TIMEOUT;
389 else
390 {
391 ret |= event.events & (WL_LATCH_SET |
392 WL_POSTMASTER_DEATH |
393 WL_SOCKET_MASK);
394 }
395
396 FreeWaitEventSet(set);
397
398 return ret;
399 }
400
/*
 * Sets a latch and wakes up anyone waiting on it.
 *
 * This is cheap if the latch is already set, otherwise not so much.
 *
 * NB: when calling this in a signal handler, be sure to save and restore
 * errno around it.  (That's standard practice in most signal handlers, of
 * course, but we used to omit it in handlers that only set a flag.)
 *
 * NB: this function is called from critical sections and signal handlers so
 * throwing an error is not a good idea.
 */
void
SetLatch(volatile Latch *latch)
{
#ifndef WIN32
	pid_t		owner_pid;
#else
	HANDLE		handle;
#endif

	/*
	 * The memory barrier has to be placed here to ensure that any flag
	 * variables possibly changed by this process have been flushed to main
	 * memory, before we check/set is_set.
	 */
	pg_memory_barrier();

	/* Quick exit if already set */
	if (latch->is_set)
		return;

	latch->is_set = true;

#ifndef WIN32

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler. We use the self-pipe to wake up the
	 * poll()/epoll_wait() in that case. If it's another process, send a
	 * signal.
	 *
	 * Fetch owner_pid only once, in case the latch is concurrently getting
	 * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
	 * guaranteed to be true! In practice, the effective range of pid_t fits
	 * in a 32 bit integer, and so should be atomic. In the worst case, we
	 * might end up signaling the wrong process. Even then, you're very
	 * unlucky if a process with that bogus pid exists and belongs to
	 * Postgres; and PG database processes should handle excess SIGUSR1
	 * interrupts without a problem anyhow.
	 *
	 * Another sort of race condition that's possible here is for a new
	 * process to own the latch immediately after we look, so we don't signal
	 * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
	 * the standard coding convention of waiting at the bottom of their loops,
	 * not the top, so that they'll correctly process latch-setting events
	 * that happen before they enter the loop.
	 */
	owner_pid = latch->owner_pid;
	if (owner_pid == 0)
		return;					/* nobody owns it; nothing to wake */
	else if (owner_pid == MyProcPid)
	{
		/* only write the wakeup byte if we're actually inside a wait */
		if (waiting)
			sendSelfPipeByte();
	}
	else
		kill(owner_pid, SIGUSR1);
#else

	/*
	 * See if anyone's waiting for the latch. It can be the current process if
	 * we're in a signal handler.
	 *
	 * Use a local variable here just in case somebody changes the event field
	 * concurrently (which really should not happen).
	 */
	handle = latch->event;
	if (handle)
	{
		SetEvent(handle);

		/*
		 * Note that we silently ignore any errors. We might be in a signal
		 * handler or other critical path where it's not safe to call elog().
		 */
	}
#endif

}
491
/*
 * Clear the latch. Calling WaitLatch after this will sleep, unless
 * the latch is set again before the WaitLatch call.
 */
void
ResetLatch(volatile Latch *latch)
{
	/* Only the owner should reset the latch */
	Assert(latch->owner_pid == MyProcPid);

	latch->is_set = false;

	/*
	 * Ensure that the write to is_set gets flushed to main memory before we
	 * examine any flag variables.  Otherwise a concurrent SetLatch might
	 * falsely conclude that it needn't signal us, even though we have missed
	 * seeing some flag updates that SetLatch was supposed to inform us of.
	 * (This pairs with the barrier at the top of SetLatch.)
	 */
	pg_memory_barrier();
}
512
513 /*
514 * Create a WaitEventSet with space for nevents different events to wait for.
515 *
516 * These events can then be efficiently waited upon together, using
517 * WaitEventSetWait().
518 */
519 WaitEventSet *
CreateWaitEventSet(MemoryContext context,int nevents)520 CreateWaitEventSet(MemoryContext context, int nevents)
521 {
522 WaitEventSet *set;
523 char *data;
524 Size sz = 0;
525
526 /*
527 * Use MAXALIGN size/alignment to guarantee that later uses of memory are
528 * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
529 * platforms, but earlier allocations like WaitEventSet and WaitEvent
530 * might not sized to guarantee that when purely using sizeof().
531 */
532 sz += MAXALIGN(sizeof(WaitEventSet));
533 sz += MAXALIGN(sizeof(WaitEvent) * nevents);
534
535 #if defined(WAIT_USE_EPOLL)
536 sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
537 #elif defined(WAIT_USE_POLL)
538 sz += MAXALIGN(sizeof(struct pollfd) * nevents);
539 #elif defined(WAIT_USE_WIN32)
540 /* need space for the pgwin32_signal_event */
541 sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
542 #endif
543
544 data = (char *) MemoryContextAllocZero(context, sz);
545
546 set = (WaitEventSet *) data;
547 data += MAXALIGN(sizeof(WaitEventSet));
548
549 set->events = (WaitEvent *) data;
550 data += MAXALIGN(sizeof(WaitEvent) * nevents);
551
552 #if defined(WAIT_USE_EPOLL)
553 set->epoll_ret_events = (struct epoll_event *) data;
554 data += MAXALIGN(sizeof(struct epoll_event) * nevents);
555 #elif defined(WAIT_USE_POLL)
556 set->pollfds = (struct pollfd *) data;
557 data += MAXALIGN(sizeof(struct pollfd) * nevents);
558 #elif defined(WAIT_USE_WIN32)
559 set->handles = (HANDLE) data;
560 data += MAXALIGN(sizeof(HANDLE) * nevents);
561 #endif
562
563 set->latch = NULL;
564 set->nevents_space = nevents;
565
566 #if defined(WAIT_USE_EPOLL)
567 #ifdef EPOLL_CLOEXEC
568 set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
569 if (set->epoll_fd < 0)
570 elog(ERROR, "epoll_create1 failed: %m");
571 #else
572 /* cope with ancient glibc lacking epoll_create1 (e.g., RHEL5) */
573 set->epoll_fd = epoll_create(nevents);
574 if (set->epoll_fd < 0)
575 elog(ERROR, "epoll_create failed: %m");
576 if (fcntl(set->epoll_fd, F_SETFD, FD_CLOEXEC) == -1)
577 elog(ERROR, "fcntl(F_SETFD) failed on epoll descriptor: %m");
578 #endif /* EPOLL_CLOEXEC */
579 #elif defined(WAIT_USE_WIN32)
580
581 /*
582 * To handle signals while waiting, we need to add a win32 specific event.
583 * We accounted for the additional event at the top of this routine. See
584 * port/win32/signal.c for more details.
585 *
586 * Note: pgwin32_signal_event should be first to ensure that it will be
587 * reported when multiple events are set. We want to guarantee that
588 * pending signals are serviced.
589 */
590 set->handles[0] = pgwin32_signal_event;
591 StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
592 #endif
593
594 return set;
595 }
596
597 /*
598 * Free a previously created WaitEventSet.
599 *
600 * Note: preferably, this shouldn't have to free any resources that could be
601 * inherited across an exec(). If it did, we'd likely leak those resources in
602 * many scenarios. For the epoll case, we ensure that by setting FD_CLOEXEC
603 * when the FD is created. For the Windows case, we assume that the handles
604 * involved are non-inheritable.
605 */
606 void
FreeWaitEventSet(WaitEventSet * set)607 FreeWaitEventSet(WaitEventSet *set)
608 {
609 #if defined(WAIT_USE_EPOLL)
610 close(set->epoll_fd);
611 #elif defined(WAIT_USE_WIN32)
612 WaitEvent *cur_event;
613
614 for (cur_event = set->events;
615 cur_event < (set->events + set->nevents);
616 cur_event++)
617 {
618 if (cur_event->events & WL_LATCH_SET)
619 {
620 /* uses the latch's HANDLE */
621 }
622 else if (cur_event->events & WL_POSTMASTER_DEATH)
623 {
624 /* uses PostmasterHandle */
625 }
626 else
627 {
628 /* Clean up the event object we created for the socket */
629 WSAEventSelect(cur_event->fd, NULL, 0);
630 WSACloseEvent(set->handles[cur_event->pos + 1]);
631 }
632 }
633 #endif
634
635 pfree(set);
636 }
637
/* ---
 * Add an event to the set. Possible events are:
 * - WL_LATCH_SET: Wait for the latch to be set
 * - WL_POSTMASTER_DEATH: Wait for postmaster to die
 * - WL_SOCKET_READABLE: Wait for socket to become readable,
 *   can be combined in one event with other WL_SOCKET_* events
 * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
 *   can be combined with other WL_SOCKET_* events
 * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
 *   can be combined with other WL_SOCKET_* events (on non-Windows
 *   platforms, this is the same as WL_SOCKET_WRITEABLE)
 *
 * Returns the offset in WaitEventSet->events (starting from 0), which can be
 * used to modify previously added wait events using ModifyWaitEvent().
 *
 * In the WL_LATCH_SET case the latch must be owned by the current process,
 * i.e. it must be a process-local latch initialized with InitLatch, or a
 * shared latch associated with the current process by calling OwnLatch.
 *
 * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
 * conditions cause the socket to be reported as readable/writable/connected,
 * so that the caller can deal with the condition.
 *
 * The user_data pointer specified here will be set for the events returned
 * by WaitEventSetWait(), allowing to easily associate additional data with
 * events.
 */
int
AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
				  void *user_data)
{
	WaitEvent  *event;

	/* not enough space */
	Assert(set->nevents < set->nevents_space);

	if (latch)
	{
		if (latch->owner_pid != MyProcPid)
			elog(ERROR, "cannot wait on a latch owned by another process");
		if (set->latch)
			elog(ERROR, "cannot wait on more than one latch");
		if ((events & WL_LATCH_SET) != WL_LATCH_SET)
			elog(ERROR, "latch events only support being set");
	}
	else
	{
		if (events & WL_LATCH_SET)
			elog(ERROR, "cannot wait on latch without a specified latch");
	}

	/* waiting for socket readiness without a socket indicates a bug */
	if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
		elog(ERROR, "cannot wait on socket event without a socket");

	/* claim the next free slot */
	event = &set->events[set->nevents];
	event->pos = set->nevents++;
	event->fd = fd;
	event->events = events;
	event->user_data = user_data;
#ifdef WIN32
	event->reset = false;
#endif

	if (events == WL_LATCH_SET)
	{
		set->latch = latch;
		set->latch_pos = event->pos;
#ifndef WIN32
		/* on Unix, latch wakeups arrive as bytes on the self-pipe */
		event->fd = selfpipe_readfd;
#endif
	}
	else if (events == WL_POSTMASTER_DEATH)
	{
#ifndef WIN32
		/* on Unix, postmaster death is detected via its lifetime pipe */
		event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
#endif
	}

	/* perform wait primitive specific initialization, if needed */
#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
	WaitEventAdjustWin32(set, event);
#endif

	return event->pos;
}
728
/*
 * Change the event mask and, in the WL_LATCH_SET case, the latch associated
 * with the WaitEvent.
 *
 * 'pos' is the id returned by AddWaitEventToSet.
 */
void
ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
{
	WaitEvent  *event;

	Assert(pos < set->nevents);

	event = &set->events[pos];

	/*
	 * If neither the event mask nor the associated latch changes, return
	 * early. That's an important optimization for some sockets, where
	 * ModifyWaitEvent is frequently used to switch from waiting for reads to
	 * waiting on writes.
	 */
	if (events == event->events &&
		(!(event->events & WL_LATCH_SET) || set->latch == latch))
		return;

	if (event->events & WL_LATCH_SET &&
		events != event->events)
	{
		/* we could allow to disable latch events for a while */
		elog(ERROR, "cannot modify latch event");
	}

	if (event->events & WL_POSTMASTER_DEATH)
	{
		elog(ERROR, "cannot modify postmaster death event");
	}

	/* FIXME: validate event mask */
	event->events = events;

	if (events == WL_LATCH_SET)
	{
		/* swap in the new latch; latch_pos is unchanged */
		set->latch = latch;
	}

	/* propagate the change to the underlying wait primitive */
#if defined(WAIT_USE_EPOLL)
	WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
#elif defined(WAIT_USE_POLL)
	WaitEventAdjustPoll(set, event);
#elif defined(WAIT_USE_WIN32)
	WaitEventAdjustWin32(set, event);
#endif
}
782
#if defined(WAIT_USE_EPOLL)
/*
 * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
 */
static void
WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
{
	struct epoll_event ev;

	/* have epoll_wait hand our WaitEvent straight back to us */
	ev.data.ptr = event;
	/* errors and hangups are always of interest */
	ev.events = EPOLLERR | EPOLLHUP;

	if (event->events == WL_LATCH_SET)
	{
		/* wakeups arrive as readable data (self-pipe byte) */
		Assert(set->latch != NULL);
		ev.events |= EPOLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		/* postmaster-alive pipe becomes readable on death */
		ev.events |= EPOLLIN;
	}
	else
	{
		Assert(event->fd != PGINVALID_SOCKET);
		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));

		ev.events |= (event->events & WL_SOCKET_READABLE) ? EPOLLIN : 0;
		ev.events |= (event->events & WL_SOCKET_WRITEABLE) ? EPOLLOUT : 0;
	}

	/*
	 * Even though unused, we also pass the event struct as the data argument
	 * if EPOLL_CTL_DEL is passed as action.  There used to be an epoll bug
	 * requiring that, and actually it makes the code simpler...
	 */
	if (epoll_ctl(set->epoll_fd, action, event->fd, &ev) < 0)
		ereport(ERROR,
				(errcode_for_socket_access(),
				 errmsg("epoll_ctl() failed: %m")));
}
#endif
832
#if defined(WAIT_USE_POLL)
/*
 * Refresh the pollfd entry backing 'event'; called on add and on modify.
 */
static void
WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
{
	struct pollfd *pfd = &set->pollfds[event->pos];

	pfd->fd = event->fd;
	pfd->revents = 0;

	if (event->events == WL_LATCH_SET)
	{
		/* latch wakeups arrive as readable data on the self-pipe */
		Assert(set->latch != NULL);
		pfd->events = POLLIN;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		/* postmaster-alive pipe becomes readable on death */
		pfd->events = POLLIN;
	}
	else
	{
		short		mask = 0;

		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
		if (event->events & WL_SOCKET_READABLE)
			mask |= POLLIN;
		if (event->events & WL_SOCKET_WRITEABLE)
			mask |= POLLOUT;
		pfd->events = mask;
	}

	Assert(event->fd != PGINVALID_SOCKET);
}
#endif
865
#if defined(WAIT_USE_WIN32)
/*
 * Install or refresh the Windows HANDLE backing 'event'.  Latch and
 * postmaster-death events borrow existing handles; socket events get a
 * WSA event object created on first use and re-selected on each change.
 */
static void
WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
{
	/* slot 0 is pgwin32_signal_event, so our handles are offset by one */
	HANDLE	   *handle = &set->handles[event->pos + 1];

	if (event->events == WL_LATCH_SET)
	{
		Assert(set->latch != NULL);
		*handle = set->latch->event;
	}
	else if (event->events == WL_POSTMASTER_DEATH)
	{
		*handle = PostmasterHandle;
	}
	else
	{
		int			flags = FD_CLOSE;	/* always check for errors/EOF */

		if (event->events & WL_SOCKET_READABLE)
			flags |= FD_READ;
		if (event->events & WL_SOCKET_WRITEABLE)
			flags |= FD_WRITE;
		if (event->events & WL_SOCKET_CONNECTED)
			flags |= FD_CONNECT;

		/*
		 * First use of this slot?  The set's memory is zero-initialized, and
		 * WSA_INVALID_EVENT is statically asserted to be NULL, so an unused
		 * slot compares equal to WSA_INVALID_EVENT.
		 */
		if (*handle == WSA_INVALID_EVENT)
		{
			*handle = WSACreateEvent();
			if (*handle == WSA_INVALID_EVENT)
				elog(ERROR, "failed to create event for socket: error code %u",
					 WSAGetLastError());
		}
		if (WSAEventSelect(event->fd, *handle, flags) != 0)
			elog(ERROR, "failed to set up event for socket: error code %u",
				 WSAGetLastError());

		Assert(event->fd != PGINVALID_SOCKET);
	}
}
#endif
907
908 /*
909 * Wait for events added to the set to happen, or until the timeout is
910 * reached. At most nevents occurred events are returned.
911 *
912 * If timeout = -1, block until an event occurs; if 0, check sockets for
913 * readiness, but don't block; if > 0, block for at most timeout milliseconds.
914 *
915 * Returns the number of events occurred, or 0 if the timeout was reached.
916 *
917 * Returned events will have the fd, pos, user_data fields set to the
918 * values associated with the registered event.
919 */
920 int
WaitEventSetWait(WaitEventSet * set,long timeout,WaitEvent * occurred_events,int nevents,uint32 wait_event_info)921 WaitEventSetWait(WaitEventSet *set, long timeout,
922 WaitEvent *occurred_events, int nevents,
923 uint32 wait_event_info)
924 {
925 int returned_events = 0;
926 instr_time start_time;
927 instr_time cur_time;
928 long cur_timeout = -1;
929
930 Assert(nevents > 0);
931
932 /*
933 * Initialize timeout if requested. We must record the current time so
934 * that we can determine the remaining timeout if interrupted.
935 */
936 if (timeout >= 0)
937 {
938 INSTR_TIME_SET_CURRENT(start_time);
939 Assert(timeout >= 0 && timeout <= INT_MAX);
940 cur_timeout = timeout;
941 }
942
943 pgstat_report_wait_start(wait_event_info);
944
945 #ifndef WIN32
946 waiting = true;
947 #else
948 /* Ensure that signals are serviced even if latch is already set */
949 pgwin32_dispatch_queued_signals();
950 #endif
951 while (returned_events == 0)
952 {
953 int rc;
954
955 /*
956 * Check if the latch is set already. If so, leave the loop
957 * immediately, avoid blocking again. We don't attempt to report any
958 * other events that might also be satisfied.
959 *
960 * If someone sets the latch between this and the
961 * WaitEventSetWaitBlock() below, the setter will write a byte to the
962 * pipe (or signal us and the signal handler will do that), and the
963 * readiness routine will return immediately.
964 *
965 * On unix, If there's a pending byte in the self pipe, we'll notice
966 * whenever blocking. Only clearing the pipe in that case avoids
967 * having to drain it every time WaitLatchOrSocket() is used. Should
968 * the pipe-buffer fill up we're still ok, because the pipe is in
969 * nonblocking mode. It's unlikely for that to happen, because the
970 * self pipe isn't filled unless we're blocking (waiting = true), or
971 * from inside a signal handler in latch_sigusr1_handler().
972 *
973 * On windows, we'll also notice if there's a pending event for the
974 * latch when blocking, but there's no danger of anything filling up,
975 * as "Setting an event that is already set has no effect.".
976 *
977 * Note: we assume that the kernel calls involved in latch management
978 * will provide adequate synchronization on machines with weak memory
979 * ordering, so that we cannot miss seeing is_set if a notification
980 * has already been queued.
981 */
982 if (set->latch && set->latch->is_set)
983 {
984 occurred_events->fd = PGINVALID_SOCKET;
985 occurred_events->pos = set->latch_pos;
986 occurred_events->user_data =
987 set->events[set->latch_pos].user_data;
988 occurred_events->events = WL_LATCH_SET;
989 occurred_events++;
990 returned_events++;
991
992 break;
993 }
994
995 /*
996 * Wait for events using the readiness primitive chosen at the top of
997 * this file. If -1 is returned, a timeout has occurred, if 0 we have
998 * to retry, everything >= 1 is the number of returned events.
999 */
1000 rc = WaitEventSetWaitBlock(set, cur_timeout,
1001 occurred_events, nevents);
1002
1003 if (rc == -1)
1004 break; /* timeout occurred */
1005 else
1006 returned_events = rc;
1007
1008 /* If we're not done, update cur_timeout for next iteration */
1009 if (returned_events == 0 && timeout >= 0)
1010 {
1011 INSTR_TIME_SET_CURRENT(cur_time);
1012 INSTR_TIME_SUBTRACT(cur_time, start_time);
1013 cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
1014 if (cur_timeout <= 0)
1015 break;
1016 }
1017 }
1018 #ifndef WIN32
1019 waiting = false;
1020 #endif
1021
1022 pgstat_report_wait_end();
1023
1024 return returned_events;
1025 }
1026
1027
1028 #if defined(WAIT_USE_EPOLL)
1029
1030 /*
1031 * Wait using linux's epoll_wait(2).
1032 *
1033 * This is the preferrable wait method, as several readiness notifications are
1034 * delivered, without having to iterate through all of set->events. The return
1035 * epoll_event struct contain a pointer to our events, making association
1036 * easy.
1037 */
1038 static inline int
WaitEventSetWaitBlock(WaitEventSet * set,int cur_timeout,WaitEvent * occurred_events,int nevents)1039 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1040 WaitEvent *occurred_events, int nevents)
1041 {
1042 int returned_events = 0;
1043 int rc;
1044 WaitEvent *cur_event;
1045 struct epoll_event *cur_epoll_event;
1046
1047 /* Sleep */
1048 rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
1049 nevents, cur_timeout);
1050
1051 /* Check return code */
1052 if (rc < 0)
1053 {
1054 /* EINTR is okay, otherwise complain */
1055 if (errno != EINTR)
1056 {
1057 waiting = false;
1058 ereport(ERROR,
1059 (errcode_for_socket_access(),
1060 errmsg("epoll_wait() failed: %m")));
1061 }
1062 return 0;
1063 }
1064 else if (rc == 0)
1065 {
1066 /* timeout exceeded */
1067 return -1;
1068 }
1069
1070 /*
1071 * At least one event occurred, iterate over the returned epoll events
1072 * until they're either all processed, or we've returned all the events
1073 * the caller desired.
1074 */
1075 for (cur_epoll_event = set->epoll_ret_events;
1076 cur_epoll_event < (set->epoll_ret_events + rc) &&
1077 returned_events < nevents;
1078 cur_epoll_event++)
1079 {
1080 /* epoll's data pointer is set to the associated WaitEvent */
1081 cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
1082
1083 occurred_events->pos = cur_event->pos;
1084 occurred_events->user_data = cur_event->user_data;
1085 occurred_events->events = 0;
1086
1087 if (cur_event->events == WL_LATCH_SET &&
1088 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1089 {
1090 /* There's data in the self-pipe, clear it. */
1091 drainSelfPipe();
1092
1093 if (set->latch->is_set)
1094 {
1095 occurred_events->fd = PGINVALID_SOCKET;
1096 occurred_events->events = WL_LATCH_SET;
1097 occurred_events++;
1098 returned_events++;
1099 }
1100 }
1101 else if (cur_event->events == WL_POSTMASTER_DEATH &&
1102 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
1103 {
1104 /*
1105 * We expect an EPOLLHUP when the remote end is closed, but
1106 * because we don't expect the pipe to become readable or to have
1107 * any errors either, treat those cases as postmaster death, too.
1108 *
1109 * Be paranoid about a spurious event signalling the postmaster as
1110 * being dead. There have been reports about that happening with
1111 * older primitives (select(2) to be specific), and a spurious
1112 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1113 * cost much.
1114 */
1115 if (!PostmasterIsAlive())
1116 {
1117 occurred_events->fd = PGINVALID_SOCKET;
1118 occurred_events->events = WL_POSTMASTER_DEATH;
1119 occurred_events++;
1120 returned_events++;
1121 }
1122 }
1123 else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1124 {
1125 Assert(cur_event->fd != PGINVALID_SOCKET);
1126
1127 if ((cur_event->events & WL_SOCKET_READABLE) &&
1128 (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
1129 {
1130 /* data available in socket, or EOF */
1131 occurred_events->events |= WL_SOCKET_READABLE;
1132 }
1133
1134 if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1135 (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
1136 {
1137 /* writable, or EOF */
1138 occurred_events->events |= WL_SOCKET_WRITEABLE;
1139 }
1140
1141 if (occurred_events->events != 0)
1142 {
1143 occurred_events->fd = cur_event->fd;
1144 occurred_events++;
1145 returned_events++;
1146 }
1147 }
1148 }
1149
1150 return returned_events;
1151 }
1152
1153 #elif defined(WAIT_USE_POLL)
1154
1155 /*
1156 * Wait using poll(2).
1157 *
1158 * This allows to receive readiness notifications for several events at once,
1159 * but requires iterating through all of set->pollfds.
1160 */
1161 static inline int
WaitEventSetWaitBlock(WaitEventSet * set,int cur_timeout,WaitEvent * occurred_events,int nevents)1162 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1163 WaitEvent *occurred_events, int nevents)
1164 {
1165 int returned_events = 0;
1166 int rc;
1167 WaitEvent *cur_event;
1168 struct pollfd *cur_pollfd;
1169
1170 /* Sleep */
1171 rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
1172
1173 /* Check return code */
1174 if (rc < 0)
1175 {
1176 /* EINTR is okay, otherwise complain */
1177 if (errno != EINTR)
1178 {
1179 waiting = false;
1180 ereport(ERROR,
1181 (errcode_for_socket_access(),
1182 errmsg("poll() failed: %m")));
1183 }
1184 return 0;
1185 }
1186 else if (rc == 0)
1187 {
1188 /* timeout exceeded */
1189 return -1;
1190 }
1191
1192 for (cur_event = set->events, cur_pollfd = set->pollfds;
1193 cur_event < (set->events + set->nevents) &&
1194 returned_events < nevents;
1195 cur_event++, cur_pollfd++)
1196 {
1197 /* no activity on this FD, skip */
1198 if (cur_pollfd->revents == 0)
1199 continue;
1200
1201 occurred_events->pos = cur_event->pos;
1202 occurred_events->user_data = cur_event->user_data;
1203 occurred_events->events = 0;
1204
1205 if (cur_event->events == WL_LATCH_SET &&
1206 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1207 {
1208 /* There's data in the self-pipe, clear it. */
1209 drainSelfPipe();
1210
1211 if (set->latch->is_set)
1212 {
1213 occurred_events->fd = PGINVALID_SOCKET;
1214 occurred_events->events = WL_LATCH_SET;
1215 occurred_events++;
1216 returned_events++;
1217 }
1218 }
1219 else if (cur_event->events == WL_POSTMASTER_DEATH &&
1220 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
1221 {
1222 /*
1223 * We expect an POLLHUP when the remote end is closed, but because
1224 * we don't expect the pipe to become readable or to have any
1225 * errors either, treat those cases as postmaster death, too.
1226 *
1227 * Be paranoid about a spurious event signalling the postmaster as
1228 * being dead. There have been reports about that happening with
1229 * older primitives (select(2) to be specific), and a spurious
1230 * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
1231 * cost much.
1232 */
1233 if (!PostmasterIsAlive())
1234 {
1235 occurred_events->fd = PGINVALID_SOCKET;
1236 occurred_events->events = WL_POSTMASTER_DEATH;
1237 occurred_events++;
1238 returned_events++;
1239 }
1240 }
1241 else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
1242 {
1243 int errflags = POLLHUP | POLLERR | POLLNVAL;
1244
1245 Assert(cur_event->fd >= PGINVALID_SOCKET);
1246
1247 if ((cur_event->events & WL_SOCKET_READABLE) &&
1248 (cur_pollfd->revents & (POLLIN | errflags)))
1249 {
1250 /* data available in socket, or EOF */
1251 occurred_events->events |= WL_SOCKET_READABLE;
1252 }
1253
1254 if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1255 (cur_pollfd->revents & (POLLOUT | errflags)))
1256 {
1257 /* writeable, or EOF */
1258 occurred_events->events |= WL_SOCKET_WRITEABLE;
1259 }
1260
1261 if (occurred_events->events != 0)
1262 {
1263 occurred_events->fd = cur_event->fd;
1264 occurred_events++;
1265 returned_events++;
1266 }
1267 }
1268 }
1269 return returned_events;
1270 }
1271
1272 #elif defined(WAIT_USE_WIN32)
1273
1274 /*
1275 * Wait using Windows' WaitForMultipleObjects().
1276 *
1277 * Unfortunately this will only ever return a single readiness notification at
1278 * a time. Note that while the official documentation for
1279 * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
1280 * with a single bWaitAll = FALSE call,
1281 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
1282 * that only one event is "consumed".
1283 */
1284 static inline int
WaitEventSetWaitBlock(WaitEventSet * set,int cur_timeout,WaitEvent * occurred_events,int nevents)1285 WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
1286 WaitEvent *occurred_events, int nevents)
1287 {
1288 int returned_events = 0;
1289 DWORD rc;
1290 WaitEvent *cur_event;
1291
1292 /* Reset any wait events that need it */
1293 for (cur_event = set->events;
1294 cur_event < (set->events + set->nevents);
1295 cur_event++)
1296 {
1297 if (cur_event->reset)
1298 {
1299 WaitEventAdjustWin32(set, cur_event);
1300 cur_event->reset = false;
1301 }
1302
1303 /*
1304 * Windows does not guarantee to log an FD_WRITE network event
1305 * indicating that more data can be sent unless the previous send()
1306 * failed with WSAEWOULDBLOCK. While our caller might well have made
1307 * such a call, we cannot assume that here. Therefore, if waiting for
1308 * write-ready, force the issue by doing a dummy send(). If the dummy
1309 * send() succeeds, assume that the socket is in fact write-ready, and
1310 * return immediately. Also, if it fails with something other than
1311 * WSAEWOULDBLOCK, return a write-ready indication to let our caller
1312 * deal with the error condition.
1313 */
1314 if (cur_event->events & WL_SOCKET_WRITEABLE)
1315 {
1316 char c;
1317 WSABUF buf;
1318 DWORD sent;
1319 int r;
1320
1321 buf.buf = &c;
1322 buf.len = 0;
1323
1324 r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
1325 if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
1326 {
1327 occurred_events->pos = cur_event->pos;
1328 occurred_events->user_data = cur_event->user_data;
1329 occurred_events->events = WL_SOCKET_WRITEABLE;
1330 occurred_events->fd = cur_event->fd;
1331 return 1;
1332 }
1333 }
1334 }
1335
1336 /*
1337 * Sleep.
1338 *
1339 * Need to wait for ->nevents + 1, because signal handle is in [0].
1340 */
1341 rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
1342 cur_timeout);
1343
1344 /* Check return code */
1345 if (rc == WAIT_FAILED)
1346 elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
1347 GetLastError());
1348 else if (rc == WAIT_TIMEOUT)
1349 {
1350 /* timeout exceeded */
1351 return -1;
1352 }
1353
1354 if (rc == WAIT_OBJECT_0)
1355 {
1356 /* Service newly-arrived signals */
1357 pgwin32_dispatch_queued_signals();
1358 return 0; /* retry */
1359 }
1360
1361 /*
1362 * With an offset of one, due to the always present pgwin32_signal_event,
1363 * the handle offset directly corresponds to a wait event.
1364 */
1365 cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
1366
1367 occurred_events->pos = cur_event->pos;
1368 occurred_events->user_data = cur_event->user_data;
1369 occurred_events->events = 0;
1370
1371 if (cur_event->events == WL_LATCH_SET)
1372 {
1373 if (!ResetEvent(set->latch->event))
1374 elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
1375
1376 if (set->latch->is_set)
1377 {
1378 occurred_events->fd = PGINVALID_SOCKET;
1379 occurred_events->events = WL_LATCH_SET;
1380 occurred_events++;
1381 returned_events++;
1382 }
1383 }
1384 else if (cur_event->events == WL_POSTMASTER_DEATH)
1385 {
1386 /*
1387 * Postmaster apparently died. Since the consequences of falsely
1388 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
1389 * the trouble to positively verify this with PostmasterIsAlive(),
1390 * even though there is no known reason to think that the event could
1391 * be falsely set on Windows.
1392 */
1393 if (!PostmasterIsAlive())
1394 {
1395 occurred_events->fd = PGINVALID_SOCKET;
1396 occurred_events->events = WL_POSTMASTER_DEATH;
1397 occurred_events++;
1398 returned_events++;
1399 }
1400 }
1401 else if (cur_event->events & WL_SOCKET_MASK)
1402 {
1403 WSANETWORKEVENTS resEvents;
1404 HANDLE handle = set->handles[cur_event->pos + 1];
1405
1406 Assert(cur_event->fd);
1407
1408 occurred_events->fd = cur_event->fd;
1409
1410 ZeroMemory(&resEvents, sizeof(resEvents));
1411 if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
1412 elog(ERROR, "failed to enumerate network events: error code %u",
1413 WSAGetLastError());
1414 if ((cur_event->events & WL_SOCKET_READABLE) &&
1415 (resEvents.lNetworkEvents & FD_READ))
1416 {
1417 /* data available in socket */
1418 occurred_events->events |= WL_SOCKET_READABLE;
1419
1420 /*------
1421 * WaitForMultipleObjects doesn't guarantee that a read event will
1422 * be returned if the latch is set at the same time. Even if it
1423 * did, the caller might drop that event expecting it to reoccur
1424 * on next call. So, we must force the event to be reset if this
1425 * WaitEventSet is used again in order to avoid an indefinite
1426 * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
1427 * for the behavior of socket events.
1428 *------
1429 */
1430 cur_event->reset = true;
1431 }
1432 if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
1433 (resEvents.lNetworkEvents & FD_WRITE))
1434 {
1435 /* writeable */
1436 occurred_events->events |= WL_SOCKET_WRITEABLE;
1437 }
1438 if ((cur_event->events & WL_SOCKET_CONNECTED) &&
1439 (resEvents.lNetworkEvents & FD_CONNECT))
1440 {
1441 /* connected */
1442 occurred_events->events |= WL_SOCKET_CONNECTED;
1443 }
1444 if (resEvents.lNetworkEvents & FD_CLOSE)
1445 {
1446 /* EOF/error, so signal all caller-requested socket flags */
1447 occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
1448 }
1449
1450 if (occurred_events->events != 0)
1451 {
1452 occurred_events++;
1453 returned_events++;
1454 }
1455 }
1456
1457 return returned_events;
1458 }
1459 #endif
1460
1461 /*
1462 * SetLatch uses SIGUSR1 to wake up the process waiting on the latch.
1463 *
1464 * Wake up WaitLatch, if we're waiting. (We might not be, since SIGUSR1 is
1465 * overloaded for multiple purposes; or we might not have reached WaitLatch
1466 * yet, in which case we don't need to fill the pipe either.)
1467 *
1468 * NB: when calling this in a signal handler, be sure to save and restore
1469 * errno around it.
1470 */
1471 #ifndef WIN32
1472 void
latch_sigusr1_handler(void)1473 latch_sigusr1_handler(void)
1474 {
1475 if (waiting)
1476 sendSelfPipeByte();
1477 }
1478 #endif /* !WIN32 */
1479
1480 /* Send one byte to the self-pipe, to wake up WaitLatch */
1481 #ifndef WIN32
1482 static void
sendSelfPipeByte(void)1483 sendSelfPipeByte(void)
1484 {
1485 int rc;
1486 char dummy = 0;
1487
1488 retry:
1489 rc = write(selfpipe_writefd, &dummy, 1);
1490 if (rc < 0)
1491 {
1492 /* If interrupted by signal, just retry */
1493 if (errno == EINTR)
1494 goto retry;
1495
1496 /*
1497 * If the pipe is full, we don't need to retry, the data that's there
1498 * already is enough to wake up WaitLatch.
1499 */
1500 if (errno == EAGAIN || errno == EWOULDBLOCK)
1501 return;
1502
1503 /*
1504 * Oops, the write() failed for some other reason. We might be in a
1505 * signal handler, so it's not safe to elog(). We have no choice but
1506 * silently ignore the error.
1507 */
1508 return;
1509 }
1510 }
1511 #endif /* !WIN32 */
1512
1513 /*
1514 * Read all available data from the self-pipe
1515 *
1516 * Note: this is only called when waiting = true. If it fails and doesn't
1517 * return, it must reset that flag first (though ideally, this will never
1518 * happen).
1519 */
1520 #ifndef WIN32
1521 static void
drainSelfPipe(void)1522 drainSelfPipe(void)
1523 {
1524 /*
1525 * There shouldn't normally be more than one byte in the pipe, or maybe a
1526 * few bytes if multiple processes run SetLatch at the same instant.
1527 */
1528 char buf[16];
1529 int rc;
1530
1531 for (;;)
1532 {
1533 rc = read(selfpipe_readfd, buf, sizeof(buf));
1534 if (rc < 0)
1535 {
1536 if (errno == EAGAIN || errno == EWOULDBLOCK)
1537 break; /* the pipe is empty */
1538 else if (errno == EINTR)
1539 continue; /* retry */
1540 else
1541 {
1542 waiting = false;
1543 elog(ERROR, "read() on self-pipe failed: %m");
1544 }
1545 }
1546 else if (rc == 0)
1547 {
1548 waiting = false;
1549 elog(ERROR, "unexpected EOF on self-pipe");
1550 }
1551 else if (rc < sizeof(buf))
1552 {
1553 /* we successfully drained the pipe; no need to read() again */
1554 break;
1555 }
1556 /* else buffer wasn't big enough, so read again */
1557 }
1558 }
1559 #endif /* !WIN32 */
1560