1 /*-------------------------------------------------------------------------
2 *
3 * parallel.c
4 *
5 * Parallel support for pg_dump and pg_restore
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/bin/pg_dump/parallel.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16 /*
17 * Parallel operation works like this:
18 *
19 * The original, master process calls ParallelBackupStart(), which forks off
20 * the desired number of worker processes, which each enter WaitForCommands().
21 *
22 * The master process dispatches an individual work item to one of the worker
23 * processes in DispatchJobForTocEntry(). We send a command string such as
24 * "DUMP 1234" or "RESTORE 1234", where 1234 is the TocEntry ID.
25 * The worker process receives and decodes the command and passes it to the
26 * routine pointed to by AH->WorkerJobDumpPtr or AH->WorkerJobRestorePtr,
27 * which are routines of the current archive format. That routine performs
28 * the required action (dump or restore) and returns an integer status code.
29 * This is passed back to the master where we pass it to the
30 * ParallelCompletionPtr callback function that was passed to
31 * DispatchJobForTocEntry(). The callback function does state updating
32 * for the master control logic in pg_backup_archiver.c.
33 *
34 * In principle additional archive-format-specific information might be needed
35 * in commands or worker status responses, but so far that hasn't proved
36 * necessary, since workers have full copies of the ArchiveHandle/TocEntry
37 * data structures. Remember that we have forked off the workers only after
38 * we have read in the catalog. That's why our worker processes can also
39 * access the catalog information. (In the Windows case, the workers are
40 * threads in the same process. To avoid problems, they work with cloned
41 * copies of the Archive data structure; see RunWorker().)
42 *
43 * In the master process, the workerStatus field for each worker has one of
44 * the following values:
45 * WRKR_NOT_STARTED: we've not yet forked this worker
46 * WRKR_IDLE: it's waiting for a command
47 * WRKR_WORKING: it's working on a command
48 * WRKR_TERMINATED: process ended
49 * The pstate->te[] entry for each worker is valid when it's in WRKR_WORKING
50 * state, and must be NULL in other states.
51 */
52
#include "postgres_fe.h"

#include <errno.h>
#ifndef WIN32
#include <sys/wait.h>
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#endif
#ifdef HAVE_SYS_SELECT_H
#include <sys/select.h>
#endif

#include "parallel.h"
#include "pg_backup_utils.h"

#include "fe_utils/string_utils.h"
#include "port/pg_bswap.h"
70
/* Names for the two slots of the descriptor array filled in by pipe(2) */
#define PIPE_READ	0
#define PIPE_WRITE	1

/* Value GetIdleWorker() hands back when it fails to find a slot */
#define NO_SLOT		(-1)
76
/* States a worker can be in, as tracked by the master */
typedef enum
{
	WRKR_NOT_STARTED = 0,		/* we've not yet forked this worker */
	WRKR_IDLE,					/* it's waiting for a command */
	WRKR_WORKING,				/* it's working on a command */
	WRKR_TERMINATED				/* process ended */
} T_WorkerStatus;

/* True for a worker that is either idle or busy with a command */
#define WORKER_IS_RUNNING(workerStatus) \
	((workerStatus) == WRKR_IDLE || (workerStatus) == WRKR_WORKING)
88
89 /*
90 * Private per-parallel-worker state (typedef for this is in parallel.h).
91 *
92 * Much of this is valid only in the master process (or, on Windows, should
93 * be touched only by the master thread). But the AH field should be touched
94 * only by workers. The pipe descriptors are valid everywhere.
95 */
96 struct ParallelSlot
97 {
98 T_WorkerStatus workerStatus; /* see enum above */
99
100 /* These fields are valid if workerStatus == WRKR_WORKING: */
101 ParallelCompletionPtr callback; /* function to call on completion */
102 void *callback_data; /* passthrough data for it */
103
104 ArchiveHandle *AH; /* Archive data worker is using */
105
106 int pipeRead; /* master's end of the pipes */
107 int pipeWrite;
108 int pipeRevRead; /* child's end of the pipes */
109 int pipeRevWrite;
110
111 /* Child process/thread identity info: */
112 #ifdef WIN32
113 uintptr_t hThread;
114 unsigned int threadId;
115 #else
116 pid_t pid;
117 #endif
118 };
119
#ifdef WIN32

/*
 * _beginthreadex() passes only a single argument to the thread function, so
 * everything a new worker thread needs is bundled into this struct.
 */
typedef struct
{
	ArchiveHandle *AH;			/* master database connection */
	ParallelSlot *slot;			/* this worker's parallel slot */
} WorkerInfo;

/* Pipe access on Windows */
static int	pgpipe(int handles[2]);
static int	piperead(int s, char *buf, int len);
#define pipewrite(a,b,c)	send(a,b,c,0)

#else							/* !WIN32 */

/* Pipe access everywhere else: thin wrappers over the system calls */
#define pgpipe(a)			pipe(a)
#define piperead(a,b,c)		read(a,b,c)
#define pipewrite(a,b,c)	write(a,b,c)

#endif							/* WIN32 */
145
146 /*
147 * State info for archive_close_connection() shutdown callback.
148 */
149 typedef struct ShutdownInformation
150 {
151 ParallelState *pstate;
152 Archive *AHX;
153 } ShutdownInformation;
154
155 static ShutdownInformation shutdown_info;
156
157 /*
158 * State info for signal handling.
159 * We assume signal_info initializes to zeroes.
160 *
161 * On Unix, myAH is the master DB connection in the master process, and the
162 * worker's own connection in worker processes. On Windows, we have only one
163 * instance of signal_info, so myAH is the master connection and the worker
164 * connections must be dug out of pstate->parallelSlot[].
165 */
166 typedef struct DumpSignalInformation
167 {
168 ArchiveHandle *myAH; /* database connection to issue cancel for */
169 ParallelState *pstate; /* parallel state, if any */
170 bool handler_set; /* signal handler set up in this process? */
171 #ifndef WIN32
172 bool am_worker; /* am I a worker process? */
173 #endif
174 } DumpSignalInformation;
175
176 static volatile DumpSignalInformation signal_info;
177
178 #ifdef WIN32
179 static CRITICAL_SECTION signal_info_lock;
180 #endif
181
/*
 * Emit a plain string on stderr using only write(2), so that it is usable
 * from a signal handler.  The write() result is deliberately discarded since
 * nothing useful could be done on failure; it is stored in a variable and
 * cast to void because some compilers complain otherwise.
 */
#define write_stderr(str) \
	do { \
		const char *ws_text_ = (str); \
		int		ws_rc_; \
		ws_rc_ = write(fileno(stderr), ws_text_, strlen(ws_text_)); \
		(void) ws_rc_; \
	} while (0)
194
195
#ifdef WIN32
/* private to this file: TLS index obtained in init_parallel_dump_utils() */
static DWORD tls_index;

/* visible outside this file (exit_nicely needs them) */
bool		parallel_init_done = false;
DWORD		mainThreadId;
#endif							/* WIN32 */
204
205 /* Local function prototypes */
206 static ParallelSlot *GetMyPSlot(ParallelState *pstate);
207 static void archive_close_connection(int code, void *arg);
208 static void ShutdownWorkersHard(ParallelState *pstate);
209 static void WaitForTerminatingWorkers(ParallelState *pstate);
210 static void setup_cancel_handler(void);
211 static void set_cancel_pstate(ParallelState *pstate);
212 static void set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH);
213 static void RunWorker(ArchiveHandle *AH, ParallelSlot *slot);
214 static int GetIdleWorker(ParallelState *pstate);
215 static bool HasEveryWorkerTerminated(ParallelState *pstate);
216 static void lockTableForWorker(ArchiveHandle *AH, TocEntry *te);
217 static void WaitForCommands(ArchiveHandle *AH, int pipefd[2]);
218 static bool ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate,
219 bool do_wait);
220 static char *getMessageFromMaster(int pipefd[2]);
221 static void sendMessageToMaster(int pipefd[2], const char *str);
222 static int select_loop(int maxFd, fd_set *workerset);
223 static char *getMessageFromWorker(ParallelState *pstate,
224 bool do_wait, int *worker);
225 static void sendMessageToWorker(ParallelState *pstate,
226 int worker, const char *str);
227 static char *readMessageFromPipe(int fd);
228
/* Does msg begin with prefix?  Note that prefix is evaluated twice. */
#define messageStartsWith(msg, prefix) \
	(strncmp((msg), (prefix), strlen(prefix)) == 0)
231
232
/*
 * One-time setup for parallel dump support.  This should be called early in
 * process startup; currently it is called whether or not parallel activity
 * is intended.
 *
 * Only the Windows build does anything here: it allocates the TLS index,
 * records the main thread's ID and starts up socket access.  If WSAStartup()
 * fails, the error is logged and we exit.  Calls after the first successful
 * one do nothing.
 */
void
init_parallel_dump_utils(void)
{
#ifdef WIN32
	WSADATA		wsaData;
	int			err;

	/* Already initialized?  Then there is nothing to do. */
	if (parallel_init_done)
		return;

	/* Prepare for threaded operation */
	tls_index = TlsAlloc();
	mainThreadId = GetCurrentThreadId();

	/* Initialize socket access */
	err = WSAStartup(MAKEWORD(2, 2), &wsaData);
	if (err != 0)
	{
		pg_log_error("WSAStartup failed: %d", err);
		exit_nicely(1);
	}

	parallel_init_done = true;
#endif
}
263
264 /*
265 * Find the ParallelSlot for the current worker process or thread.
266 *
267 * Returns NULL if no matching slot is found (this implies we're the master).
268 */
269 static ParallelSlot *
GetMyPSlot(ParallelState * pstate)270 GetMyPSlot(ParallelState *pstate)
271 {
272 int i;
273
274 for (i = 0; i < pstate->numWorkers; i++)
275 {
276 #ifdef WIN32
277 if (pstate->parallelSlot[i].threadId == GetCurrentThreadId())
278 #else
279 if (pstate->parallelSlot[i].pid == getpid())
280 #endif
281 return &(pstate->parallelSlot[i]);
282 }
283
284 return NULL;
285 }
286
/*
 * Thread-aware replacement for getLocalPQExpBuffer().
 *
 * Not reentrant, but it keeps memory leakage down: one buffer is consumed
 * per thread, which is much better than one per fmtId/fmtQualifiedId call.
 */
#ifdef WIN32
static PQExpBuffer
getThreadLocalPQExpBuffer(void)
{
	/*
	 * Using a static variable together with Tls goes awry, so both storage
	 * methods are supported and the static is left untouched once Tls is in
	 * use.  TlsGetValue() is relied on to return 0 for a value that has not
	 * been set yet.
	 */
	static PQExpBuffer s_id_return = NULL;
	PQExpBuffer id_return;

	id_return = parallel_init_done ?
		(PQExpBuffer) TlsGetValue(tls_index) : s_id_return;

	if (id_return != NULL)
	{
		/* Not the first call: reuse the buffer after wiping its contents */
		resetPQExpBuffer(id_return);
		return id_return;
	}

	/* First time through: make a buffer and remember it for next time */
	id_return = createPQExpBuffer();
	if (parallel_init_done)
		TlsSetValue(tls_index, id_return);
	else
		s_id_return = id_return;

	return id_return;
}
#endif							/* WIN32 */
328
329 /*
330 * pg_dump and pg_restore call this to register the cleanup handler
331 * as soon as they've created the ArchiveHandle.
332 */
333 void
on_exit_close_archive(Archive * AHX)334 on_exit_close_archive(Archive *AHX)
335 {
336 shutdown_info.AHX = AHX;
337 on_exit_nicely(archive_close_connection, &shutdown_info);
338 }
339
340 /*
341 * on_exit_nicely handler for shutting down database connections and
342 * worker processes cleanly.
343 */
344 static void
archive_close_connection(int code,void * arg)345 archive_close_connection(int code, void *arg)
346 {
347 ShutdownInformation *si = (ShutdownInformation *) arg;
348
349 if (si->pstate)
350 {
351 /* In parallel mode, must figure out who we are */
352 ParallelSlot *slot = GetMyPSlot(si->pstate);
353
354 if (!slot)
355 {
356 /*
357 * We're the master. Forcibly shut down workers, then close our
358 * own database connection, if any.
359 */
360 ShutdownWorkersHard(si->pstate);
361
362 if (si->AHX)
363 DisconnectDatabase(si->AHX);
364 }
365 else
366 {
367 /*
368 * We're a worker. Shut down our own DB connection if any. On
369 * Windows, we also have to close our communication sockets, to
370 * emulate what will happen on Unix when the worker process exits.
371 * (Without this, if this is a premature exit, the master would
372 * fail to detect it because there would be no EOF condition on
373 * the other end of the pipe.)
374 */
375 if (slot->AH)
376 DisconnectDatabase(&(slot->AH->public));
377
378 #ifdef WIN32
379 closesocket(slot->pipeRevRead);
380 closesocket(slot->pipeRevWrite);
381 #endif
382 }
383 }
384 else
385 {
386 /* Non-parallel operation: just kill the master DB connection */
387 if (si->AHX)
388 DisconnectDatabase(si->AHX);
389 }
390 }
391
392 /*
393 * Forcibly shut down any remaining workers, waiting for them to finish.
394 *
395 * Note that we don't expect to come here during normal exit (the workers
396 * should be long gone, and the ParallelState too). We're only here in a
397 * fatal() situation, so intervening to cancel active commands is
398 * appropriate.
399 */
400 static void
ShutdownWorkersHard(ParallelState * pstate)401 ShutdownWorkersHard(ParallelState *pstate)
402 {
403 int i;
404
405 /*
406 * Close our write end of the sockets so that any workers waiting for
407 * commands know they can exit. (Note: some of the pipeWrite fields might
408 * still be zero, if we failed to initialize all the workers. Hence, just
409 * ignore errors here.)
410 */
411 for (i = 0; i < pstate->numWorkers; i++)
412 closesocket(pstate->parallelSlot[i].pipeWrite);
413
414 /*
415 * Force early termination of any commands currently in progress.
416 */
417 #ifndef WIN32
418 /* On non-Windows, send SIGTERM to each worker process. */
419 for (i = 0; i < pstate->numWorkers; i++)
420 {
421 pid_t pid = pstate->parallelSlot[i].pid;
422
423 if (pid != 0)
424 kill(pid, SIGTERM);
425 }
426 #else
427
428 /*
429 * On Windows, send query cancels directly to the workers' backends. Use
430 * a critical section to ensure worker threads don't change state.
431 */
432 EnterCriticalSection(&signal_info_lock);
433 for (i = 0; i < pstate->numWorkers; i++)
434 {
435 ArchiveHandle *AH = pstate->parallelSlot[i].AH;
436 char errbuf[1];
437
438 if (AH != NULL && AH->connCancel != NULL)
439 (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
440 }
441 LeaveCriticalSection(&signal_info_lock);
442 #endif
443
444 /* Now wait for them to terminate. */
445 WaitForTerminatingWorkers(pstate);
446 }
447
448 /*
449 * Wait for all workers to terminate.
450 */
451 static void
WaitForTerminatingWorkers(ParallelState * pstate)452 WaitForTerminatingWorkers(ParallelState *pstate)
453 {
454 while (!HasEveryWorkerTerminated(pstate))
455 {
456 ParallelSlot *slot = NULL;
457 int j;
458
459 #ifndef WIN32
460 /* On non-Windows, use wait() to wait for next worker to end */
461 int status;
462 pid_t pid = wait(&status);
463
464 /* Find dead worker's slot, and clear the PID field */
465 for (j = 0; j < pstate->numWorkers; j++)
466 {
467 slot = &(pstate->parallelSlot[j]);
468 if (slot->pid == pid)
469 {
470 slot->pid = 0;
471 break;
472 }
473 }
474 #else /* WIN32 */
475 /* On Windows, we must use WaitForMultipleObjects() */
476 HANDLE *lpHandles = pg_malloc(sizeof(HANDLE) * pstate->numWorkers);
477 int nrun = 0;
478 DWORD ret;
479 uintptr_t hThread;
480
481 for (j = 0; j < pstate->numWorkers; j++)
482 {
483 if (WORKER_IS_RUNNING(pstate->parallelSlot[j].workerStatus))
484 {
485 lpHandles[nrun] = (HANDLE) pstate->parallelSlot[j].hThread;
486 nrun++;
487 }
488 }
489 ret = WaitForMultipleObjects(nrun, lpHandles, false, INFINITE);
490 Assert(ret != WAIT_FAILED);
491 hThread = (uintptr_t) lpHandles[ret - WAIT_OBJECT_0];
492 free(lpHandles);
493
494 /* Find dead worker's slot, and clear the hThread field */
495 for (j = 0; j < pstate->numWorkers; j++)
496 {
497 slot = &(pstate->parallelSlot[j]);
498 if (slot->hThread == hThread)
499 {
500 /* For cleanliness, close handles for dead threads */
501 CloseHandle((HANDLE) slot->hThread);
502 slot->hThread = (uintptr_t) INVALID_HANDLE_VALUE;
503 break;
504 }
505 }
506 #endif /* WIN32 */
507
508 /* On all platforms, update workerStatus and te[] as well */
509 Assert(j < pstate->numWorkers);
510 slot->workerStatus = WRKR_TERMINATED;
511 pstate->te[j] = NULL;
512 }
513 }
514
515
516 /*
517 * Code for responding to cancel interrupts (SIGINT, control-C, etc)
518 *
519 * This doesn't quite belong in this module, but it needs access to the
520 * ParallelState data, so there's not really a better place either.
521 *
522 * When we get a cancel interrupt, we could just die, but in pg_restore that
523 * could leave a SQL command (e.g., CREATE INDEX on a large table) running
524 * for a long time. Instead, we try to send a cancel request and then die.
525 * pg_dump probably doesn't really need this, but we might as well use it
526 * there too. Note that sending the cancel directly from the signal handler
527 * is safe because PQcancel() is written to make it so.
528 *
529 * In parallel operation on Unix, each process is responsible for canceling
530 * its own connection (this must be so because nobody else has access to it).
531 * Furthermore, the master process should attempt to forward its signal to
532 * each child. In simple manual use of pg_dump/pg_restore, forwarding isn't
533 * needed because typing control-C at the console would deliver SIGINT to
534 * every member of the terminal process group --- but in other scenarios it
535 * might be that only the master gets signaled.
536 *
537 * On Windows, the cancel handler runs in a separate thread, because that's
538 * how SetConsoleCtrlHandler works. We make it stop worker threads, send
539 * cancels on all active connections, and then return FALSE, which will allow
540 * the process to die. For safety's sake, we use a critical section to
541 * protect the PGcancel structures against being changed while the signal
542 * thread runs.
543 */
544
545 #ifndef WIN32
546
547 /*
548 * Signal handler (Unix only)
549 */
550 static void
sigTermHandler(SIGNAL_ARGS)551 sigTermHandler(SIGNAL_ARGS)
552 {
553 int i;
554 char errbuf[1];
555
556 /*
557 * Some platforms allow delivery of new signals to interrupt an active
558 * signal handler. That could muck up our attempt to send PQcancel, so
559 * disable the signals that setup_cancel_handler enabled.
560 */
561 pqsignal(SIGINT, SIG_IGN);
562 pqsignal(SIGTERM, SIG_IGN);
563 pqsignal(SIGQUIT, SIG_IGN);
564
565 /*
566 * If we're in the master, forward signal to all workers. (It seems best
567 * to do this before PQcancel; killing the master transaction will result
568 * in invalid-snapshot errors from active workers, which maybe we can
569 * quiet by killing workers first.) Ignore any errors.
570 */
571 if (signal_info.pstate != NULL)
572 {
573 for (i = 0; i < signal_info.pstate->numWorkers; i++)
574 {
575 pid_t pid = signal_info.pstate->parallelSlot[i].pid;
576
577 if (pid != 0)
578 kill(pid, SIGTERM);
579 }
580 }
581
582 /*
583 * Send QueryCancel if we have a connection to send to. Ignore errors,
584 * there's not much we can do about them anyway.
585 */
586 if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
587 (void) PQcancel(signal_info.myAH->connCancel, errbuf, sizeof(errbuf));
588
589 /*
590 * Report we're quitting, using nothing more complicated than write(2).
591 * When in parallel operation, only the master process should do this.
592 */
593 if (!signal_info.am_worker)
594 {
595 if (progname)
596 {
597 write_stderr(progname);
598 write_stderr(": ");
599 }
600 write_stderr("terminated by user\n");
601 }
602
603 /*
604 * And die, using _exit() not exit() because the latter will invoke atexit
605 * handlers that can fail if we interrupted related code.
606 */
607 _exit(1);
608 }
609
610 /*
611 * Enable cancel interrupt handler, if not already done.
612 */
613 static void
setup_cancel_handler(void)614 setup_cancel_handler(void)
615 {
616 /*
617 * When forking, signal_info.handler_set will propagate into the new
618 * process, but that's fine because the signal handler state does too.
619 */
620 if (!signal_info.handler_set)
621 {
622 signal_info.handler_set = true;
623
624 pqsignal(SIGINT, sigTermHandler);
625 pqsignal(SIGTERM, sigTermHandler);
626 pqsignal(SIGQUIT, sigTermHandler);
627 }
628 }
629
630 #else /* WIN32 */
631
632 /*
633 * Console interrupt handler --- runs in a newly-started thread.
634 *
635 * After stopping other threads and sending cancel requests on all open
636 * connections, we return FALSE which will allow the default ExitProcess()
637 * action to be taken.
638 */
639 static BOOL WINAPI
consoleHandler(DWORD dwCtrlType)640 consoleHandler(DWORD dwCtrlType)
641 {
642 int i;
643 char errbuf[1];
644
645 if (dwCtrlType == CTRL_C_EVENT ||
646 dwCtrlType == CTRL_BREAK_EVENT)
647 {
648 /* Critical section prevents changing data we look at here */
649 EnterCriticalSection(&signal_info_lock);
650
651 /*
652 * If in parallel mode, stop worker threads and send QueryCancel to
653 * their connected backends. The main point of stopping the worker
654 * threads is to keep them from reporting the query cancels as errors,
655 * which would clutter the user's screen. We needn't stop the master
656 * thread since it won't be doing much anyway. Do this before
657 * canceling the main transaction, else we might get invalid-snapshot
658 * errors reported before we can stop the workers. Ignore errors,
659 * there's not much we can do about them anyway.
660 */
661 if (signal_info.pstate != NULL)
662 {
663 for (i = 0; i < signal_info.pstate->numWorkers; i++)
664 {
665 ParallelSlot *slot = &(signal_info.pstate->parallelSlot[i]);
666 ArchiveHandle *AH = slot->AH;
667 HANDLE hThread = (HANDLE) slot->hThread;
668
669 /*
670 * Using TerminateThread here may leave some resources leaked,
671 * but it doesn't matter since we're about to end the whole
672 * process.
673 */
674 if (hThread != INVALID_HANDLE_VALUE)
675 TerminateThread(hThread, 0);
676
677 if (AH != NULL && AH->connCancel != NULL)
678 (void) PQcancel(AH->connCancel, errbuf, sizeof(errbuf));
679 }
680 }
681
682 /*
683 * Send QueryCancel to master connection, if enabled. Ignore errors,
684 * there's not much we can do about them anyway.
685 */
686 if (signal_info.myAH != NULL && signal_info.myAH->connCancel != NULL)
687 (void) PQcancel(signal_info.myAH->connCancel,
688 errbuf, sizeof(errbuf));
689
690 LeaveCriticalSection(&signal_info_lock);
691
692 /*
693 * Report we're quitting, using nothing more complicated than
694 * write(2). (We might be able to get away with using pg_log_*()
695 * here, but since we terminated other threads uncleanly above, it
696 * seems better to assume as little as possible.)
697 */
698 if (progname)
699 {
700 write_stderr(progname);
701 write_stderr(": ");
702 }
703 write_stderr("terminated by user\n");
704 }
705
706 /* Always return FALSE to allow signal handling to continue */
707 return FALSE;
708 }
709
710 /*
711 * Enable cancel interrupt handler, if not already done.
712 */
713 static void
setup_cancel_handler(void)714 setup_cancel_handler(void)
715 {
716 if (!signal_info.handler_set)
717 {
718 signal_info.handler_set = true;
719
720 InitializeCriticalSection(&signal_info_lock);
721
722 SetConsoleCtrlHandler(consoleHandler, TRUE);
723 }
724 }
725
726 #endif /* WIN32 */
727
728
729 /*
730 * set_archive_cancel_info
731 *
732 * Fill AH->connCancel with cancellation info for the specified database
733 * connection; or clear it if conn is NULL.
734 */
735 void
set_archive_cancel_info(ArchiveHandle * AH,PGconn * conn)736 set_archive_cancel_info(ArchiveHandle *AH, PGconn *conn)
737 {
738 PGcancel *oldConnCancel;
739
740 /*
741 * Activate the interrupt handler if we didn't yet in this process. On
742 * Windows, this also initializes signal_info_lock; therefore it's
743 * important that this happen at least once before we fork off any
744 * threads.
745 */
746 setup_cancel_handler();
747
748 /*
749 * On Unix, we assume that storing a pointer value is atomic with respect
750 * to any possible signal interrupt. On Windows, use a critical section.
751 */
752
753 #ifdef WIN32
754 EnterCriticalSection(&signal_info_lock);
755 #endif
756
757 /* Free the old one if we have one */
758 oldConnCancel = AH->connCancel;
759 /* be sure interrupt handler doesn't use pointer while freeing */
760 AH->connCancel = NULL;
761
762 if (oldConnCancel != NULL)
763 PQfreeCancel(oldConnCancel);
764
765 /* Set the new one if specified */
766 if (conn)
767 AH->connCancel = PQgetCancel(conn);
768
769 /*
770 * On Unix, there's only ever one active ArchiveHandle per process, so we
771 * can just set signal_info.myAH unconditionally. On Windows, do that
772 * only in the main thread; worker threads have to make sure their
773 * ArchiveHandle appears in the pstate data, which is dealt with in
774 * RunWorker().
775 */
776 #ifndef WIN32
777 signal_info.myAH = AH;
778 #else
779 if (mainThreadId == GetCurrentThreadId())
780 signal_info.myAH = AH;
781 #endif
782
783 #ifdef WIN32
784 LeaveCriticalSection(&signal_info_lock);
785 #endif
786 }
787
788 /*
789 * set_cancel_pstate
790 *
791 * Set signal_info.pstate to point to the specified ParallelState, if any.
792 * We need this mainly to have an interlock against Windows signal thread.
793 */
794 static void
set_cancel_pstate(ParallelState * pstate)795 set_cancel_pstate(ParallelState *pstate)
796 {
797 #ifdef WIN32
798 EnterCriticalSection(&signal_info_lock);
799 #endif
800
801 signal_info.pstate = pstate;
802
803 #ifdef WIN32
804 LeaveCriticalSection(&signal_info_lock);
805 #endif
806 }
807
808 /*
809 * set_cancel_slot_archive
810 *
811 * Set ParallelSlot's AH field to point to the specified archive, if any.
812 * We need this mainly to have an interlock against Windows signal thread.
813 */
814 static void
set_cancel_slot_archive(ParallelSlot * slot,ArchiveHandle * AH)815 set_cancel_slot_archive(ParallelSlot *slot, ArchiveHandle *AH)
816 {
817 #ifdef WIN32
818 EnterCriticalSection(&signal_info_lock);
819 #endif
820
821 slot->AH = AH;
822
823 #ifdef WIN32
824 LeaveCriticalSection(&signal_info_lock);
825 #endif
826 }
827
828
829 /*
830 * This function is called by both Unix and Windows variants to set up
831 * and run a worker process. Caller should exit the process (or thread)
832 * upon return.
833 */
834 static void
RunWorker(ArchiveHandle * AH,ParallelSlot * slot)835 RunWorker(ArchiveHandle *AH, ParallelSlot *slot)
836 {
837 int pipefd[2];
838
839 /* fetch child ends of pipes */
840 pipefd[PIPE_READ] = slot->pipeRevRead;
841 pipefd[PIPE_WRITE] = slot->pipeRevWrite;
842
843 /*
844 * Clone the archive so that we have our own state to work with, and in
845 * particular our own database connection.
846 *
847 * We clone on Unix as well as Windows, even though technically we don't
848 * need to because fork() gives us a copy in our own address space
849 * already. But CloneArchive resets the state information and also clones
850 * the database connection which both seem kinda helpful.
851 */
852 AH = CloneArchive(AH);
853
854 /* Remember cloned archive where signal handler can find it */
855 set_cancel_slot_archive(slot, AH);
856
857 /*
858 * Call the setup worker function that's defined in the ArchiveHandle.
859 */
860 (AH->SetupWorkerPtr) ((Archive *) AH);
861
862 /*
863 * Execute commands until done.
864 */
865 WaitForCommands(AH, pipefd);
866
867 /*
868 * Disconnect from database and clean up.
869 */
870 set_cancel_slot_archive(slot, NULL);
871 DisconnectDatabase(&(AH->public));
872 DeCloneArchive(AH);
873 }
874
/*
 * Entry point for worker threads on Windows.
 *
 * Takes ownership of the WorkerInfo allocated by the caller and frees it,
 * then runs the worker and ends the thread.
 */
#ifdef WIN32
static unsigned __stdcall
init_spawned_worker_win32(WorkerInfo *wi)
{
	/* Copy the arguments out so the transient struct can be released */
	WorkerInfo	args = *wi;

	free(wi);

	/* Run the worker ... */
	RunWorker(args.AH, args.slot);

	/* Exit the thread */
	_endthreadex(0);
	return 0;
}
#endif							/* WIN32 */
896
897 /*
898 * This function starts a parallel dump or restore by spawning off the worker
899 * processes. For Windows, it creates a number of threads; on Unix the
900 * workers are created with fork().
901 */
902 ParallelState *
ParallelBackupStart(ArchiveHandle * AH)903 ParallelBackupStart(ArchiveHandle *AH)
904 {
905 ParallelState *pstate;
906 int i;
907
908 Assert(AH->public.numWorkers > 0);
909
910 pstate = (ParallelState *) pg_malloc(sizeof(ParallelState));
911
912 pstate->numWorkers = AH->public.numWorkers;
913 pstate->te = NULL;
914 pstate->parallelSlot = NULL;
915
916 if (AH->public.numWorkers == 1)
917 return pstate;
918
919 /* Create status arrays, being sure to initialize all fields to 0 */
920 pstate->te = (TocEntry **)
921 pg_malloc0(pstate->numWorkers * sizeof(TocEntry *));
922 pstate->parallelSlot = (ParallelSlot *)
923 pg_malloc0(pstate->numWorkers * sizeof(ParallelSlot));
924
925 #ifdef WIN32
926 /* Make fmtId() and fmtQualifiedId() use thread-local storage */
927 getLocalPQExpBuffer = getThreadLocalPQExpBuffer;
928 #endif
929
930 /*
931 * Set the pstate in shutdown_info, to tell the exit handler that it must
932 * clean up workers as well as the main database connection. But we don't
933 * set this in signal_info yet, because we don't want child processes to
934 * inherit non-NULL signal_info.pstate.
935 */
936 shutdown_info.pstate = pstate;
937
938 /*
939 * Temporarily disable query cancellation on the master connection. This
940 * ensures that child processes won't inherit valid AH->connCancel
941 * settings and thus won't try to issue cancels against the master's
942 * connection. No harm is done if we fail while it's disabled, because
943 * the master connection is idle at this point anyway.
944 */
945 set_archive_cancel_info(AH, NULL);
946
947 /* Ensure stdio state is quiesced before forking */
948 fflush(NULL);
949
950 /* Create desired number of workers */
951 for (i = 0; i < pstate->numWorkers; i++)
952 {
953 #ifdef WIN32
954 WorkerInfo *wi;
955 uintptr_t handle;
956 #else
957 pid_t pid;
958 #endif
959 ParallelSlot *slot = &(pstate->parallelSlot[i]);
960 int pipeMW[2],
961 pipeWM[2];
962
963 /* Create communication pipes for this worker */
964 if (pgpipe(pipeMW) < 0 || pgpipe(pipeWM) < 0)
965 fatal("could not create communication channels: %m");
966
967 /* master's ends of the pipes */
968 slot->pipeRead = pipeWM[PIPE_READ];
969 slot->pipeWrite = pipeMW[PIPE_WRITE];
970 /* child's ends of the pipes */
971 slot->pipeRevRead = pipeMW[PIPE_READ];
972 slot->pipeRevWrite = pipeWM[PIPE_WRITE];
973
974 #ifdef WIN32
975 /* Create transient structure to pass args to worker function */
976 wi = (WorkerInfo *) pg_malloc(sizeof(WorkerInfo));
977
978 wi->AH = AH;
979 wi->slot = slot;
980
981 handle = _beginthreadex(NULL, 0, (void *) &init_spawned_worker_win32,
982 wi, 0, &(slot->threadId));
983 slot->hThread = handle;
984 slot->workerStatus = WRKR_IDLE;
985 #else /* !WIN32 */
986 pid = fork();
987 if (pid == 0)
988 {
989 /* we are the worker */
990 int j;
991
992 /* this is needed for GetMyPSlot() */
993 slot->pid = getpid();
994
995 /* instruct signal handler that we're in a worker now */
996 signal_info.am_worker = true;
997
998 /* close read end of Worker -> Master */
999 closesocket(pipeWM[PIPE_READ]);
1000 /* close write end of Master -> Worker */
1001 closesocket(pipeMW[PIPE_WRITE]);
1002
1003 /*
1004 * Close all inherited fds for communication of the master with
1005 * previously-forked workers.
1006 */
1007 for (j = 0; j < i; j++)
1008 {
1009 closesocket(pstate->parallelSlot[j].pipeRead);
1010 closesocket(pstate->parallelSlot[j].pipeWrite);
1011 }
1012
1013 /* Run the worker ... */
1014 RunWorker(AH, slot);
1015
1016 /* We can just exit(0) when done */
1017 exit(0);
1018 }
1019 else if (pid < 0)
1020 {
1021 /* fork failed */
1022 fatal("could not create worker process: %m");
1023 }
1024
1025 /* In Master after successful fork */
1026 slot->pid = pid;
1027 slot->workerStatus = WRKR_IDLE;
1028
1029 /* close read end of Master -> Worker */
1030 closesocket(pipeMW[PIPE_READ]);
1031 /* close write end of Worker -> Master */
1032 closesocket(pipeWM[PIPE_WRITE]);
1033 #endif /* WIN32 */
1034 }
1035
1036 /*
1037 * Having forked off the workers, disable SIGPIPE so that master isn't
1038 * killed if it tries to send a command to a dead worker. We don't want
1039 * the workers to inherit this setting, though.
1040 */
1041 #ifndef WIN32
1042 pqsignal(SIGPIPE, SIG_IGN);
1043 #endif
1044
1045 /*
1046 * Re-establish query cancellation on the master connection.
1047 */
1048 set_archive_cancel_info(AH, AH->connection);
1049
1050 /*
1051 * Tell the cancel signal handler to forward signals to worker processes,
1052 * too. (As with query cancel, we did not need this earlier because the
1053 * workers have not yet been given anything to do; if we die before this
1054 * point, any already-started workers will see EOF and quit promptly.)
1055 */
1056 set_cancel_pstate(pstate);
1057
1058 return pstate;
1059 }
1060
1061 /*
1062 * Close down a parallel dump or restore.
1063 */
1064 void
ParallelBackupEnd(ArchiveHandle * AH,ParallelState * pstate)1065 ParallelBackupEnd(ArchiveHandle *AH, ParallelState *pstate)
1066 {
1067 int i;
1068
1069 /* No work if non-parallel */
1070 if (pstate->numWorkers == 1)
1071 return;
1072
1073 /* There should not be any unfinished jobs */
1074 Assert(IsEveryWorkerIdle(pstate));
1075
1076 /* Close the sockets so that the workers know they can exit */
1077 for (i = 0; i < pstate->numWorkers; i++)
1078 {
1079 closesocket(pstate->parallelSlot[i].pipeRead);
1080 closesocket(pstate->parallelSlot[i].pipeWrite);
1081 }
1082
1083 /* Wait for them to exit */
1084 WaitForTerminatingWorkers(pstate);
1085
1086 /*
1087 * Unlink pstate from shutdown_info, so the exit handler will not try to
1088 * use it; and likewise unlink from signal_info.
1089 */
1090 shutdown_info.pstate = NULL;
1091 set_cancel_pstate(NULL);
1092
1093 /* Release state (mere neatnik-ism, since we're about to terminate) */
1094 free(pstate->te);
1095 free(pstate->parallelSlot);
1096 free(pstate);
1097 }
1098
1099 /*
1100 * These next four functions handle construction and parsing of the command
1101 * strings and response strings for parallel workers.
1102 *
1103 * Currently, these can be the same regardless of which archive format we are
1104 * processing. In future, we might want to let format modules override these
1105 * functions to add format-specific data to a command or response.
1106 */
1107
1108 /*
1109 * buildWorkerCommand: format a command string to send to a worker.
1110 *
1111 * The string is built in the caller-supplied buffer of size buflen.
1112 */
1113 static void
buildWorkerCommand(ArchiveHandle * AH,TocEntry * te,T_Action act,char * buf,int buflen)1114 buildWorkerCommand(ArchiveHandle *AH, TocEntry *te, T_Action act,
1115 char *buf, int buflen)
1116 {
1117 if (act == ACT_DUMP)
1118 snprintf(buf, buflen, "DUMP %d", te->dumpId);
1119 else if (act == ACT_RESTORE)
1120 snprintf(buf, buflen, "RESTORE %d", te->dumpId);
1121 else
1122 Assert(false);
1123 }
1124
1125 /*
1126 * parseWorkerCommand: interpret a command string in a worker.
1127 */
1128 static void
parseWorkerCommand(ArchiveHandle * AH,TocEntry ** te,T_Action * act,const char * msg)1129 parseWorkerCommand(ArchiveHandle *AH, TocEntry **te, T_Action *act,
1130 const char *msg)
1131 {
1132 DumpId dumpId;
1133 int nBytes;
1134
1135 if (messageStartsWith(msg, "DUMP "))
1136 {
1137 *act = ACT_DUMP;
1138 sscanf(msg, "DUMP %d%n", &dumpId, &nBytes);
1139 Assert(nBytes == strlen(msg));
1140 *te = getTocEntryByDumpId(AH, dumpId);
1141 Assert(*te != NULL);
1142 }
1143 else if (messageStartsWith(msg, "RESTORE "))
1144 {
1145 *act = ACT_RESTORE;
1146 sscanf(msg, "RESTORE %d%n", &dumpId, &nBytes);
1147 Assert(nBytes == strlen(msg));
1148 *te = getTocEntryByDumpId(AH, dumpId);
1149 Assert(*te != NULL);
1150 }
1151 else
1152 fatal("unrecognized command received from master: \"%s\"",
1153 msg);
1154 }
1155
1156 /*
1157 * buildWorkerResponse: format a response string to send to the master.
1158 *
1159 * The string is built in the caller-supplied buffer of size buflen.
1160 */
1161 static void
buildWorkerResponse(ArchiveHandle * AH,TocEntry * te,T_Action act,int status,char * buf,int buflen)1162 buildWorkerResponse(ArchiveHandle *AH, TocEntry *te, T_Action act, int status,
1163 char *buf, int buflen)
1164 {
1165 snprintf(buf, buflen, "OK %d %d %d",
1166 te->dumpId,
1167 status,
1168 status == WORKER_IGNORED_ERRORS ? AH->public.n_errors : 0);
1169 }
1170
1171 /*
1172 * parseWorkerResponse: parse the status message returned by a worker.
1173 *
1174 * Returns the integer status code, and may update fields of AH and/or te.
1175 */
1176 static int
parseWorkerResponse(ArchiveHandle * AH,TocEntry * te,const char * msg)1177 parseWorkerResponse(ArchiveHandle *AH, TocEntry *te,
1178 const char *msg)
1179 {
1180 DumpId dumpId;
1181 int nBytes,
1182 n_errors;
1183 int status = 0;
1184
1185 if (messageStartsWith(msg, "OK "))
1186 {
1187 sscanf(msg, "OK %d %d %d%n", &dumpId, &status, &n_errors, &nBytes);
1188
1189 Assert(dumpId == te->dumpId);
1190 Assert(nBytes == strlen(msg));
1191
1192 AH->public.n_errors += n_errors;
1193 }
1194 else
1195 fatal("invalid message received from worker: \"%s\"",
1196 msg);
1197
1198 return status;
1199 }
1200
1201 /*
1202 * Dispatch a job to some free worker.
1203 *
1204 * te is the TocEntry to be processed, act is the action to be taken on it.
1205 * callback is the function to call on completion of the job.
1206 *
1207 * If no worker is currently available, this will block, and previously
1208 * registered callback functions may be called.
1209 */
1210 void
DispatchJobForTocEntry(ArchiveHandle * AH,ParallelState * pstate,TocEntry * te,T_Action act,ParallelCompletionPtr callback,void * callback_data)1211 DispatchJobForTocEntry(ArchiveHandle *AH,
1212 ParallelState *pstate,
1213 TocEntry *te,
1214 T_Action act,
1215 ParallelCompletionPtr callback,
1216 void *callback_data)
1217 {
1218 int worker;
1219 char buf[256];
1220
1221 /* Get a worker, waiting if none are idle */
1222 while ((worker = GetIdleWorker(pstate)) == NO_SLOT)
1223 WaitForWorkers(AH, pstate, WFW_ONE_IDLE);
1224
1225 /* Construct and send command string */
1226 buildWorkerCommand(AH, te, act, buf, sizeof(buf));
1227
1228 sendMessageToWorker(pstate, worker, buf);
1229
1230 /* Remember worker is busy, and which TocEntry it's working on */
1231 pstate->parallelSlot[worker].workerStatus = WRKR_WORKING;
1232 pstate->parallelSlot[worker].callback = callback;
1233 pstate->parallelSlot[worker].callback_data = callback_data;
1234 pstate->te[worker] = te;
1235 }
1236
1237 /*
1238 * Find an idle worker and return its slot number.
1239 * Return NO_SLOT if none are idle.
1240 */
1241 static int
GetIdleWorker(ParallelState * pstate)1242 GetIdleWorker(ParallelState *pstate)
1243 {
1244 int i;
1245
1246 for (i = 0; i < pstate->numWorkers; i++)
1247 {
1248 if (pstate->parallelSlot[i].workerStatus == WRKR_IDLE)
1249 return i;
1250 }
1251 return NO_SLOT;
1252 }
1253
1254 /*
1255 * Return true iff no worker is running.
1256 */
1257 static bool
HasEveryWorkerTerminated(ParallelState * pstate)1258 HasEveryWorkerTerminated(ParallelState *pstate)
1259 {
1260 int i;
1261
1262 for (i = 0; i < pstate->numWorkers; i++)
1263 {
1264 if (WORKER_IS_RUNNING(pstate->parallelSlot[i].workerStatus))
1265 return false;
1266 }
1267 return true;
1268 }
1269
1270 /*
1271 * Return true iff every worker is in the WRKR_IDLE state.
1272 */
1273 bool
IsEveryWorkerIdle(ParallelState * pstate)1274 IsEveryWorkerIdle(ParallelState *pstate)
1275 {
1276 int i;
1277
1278 for (i = 0; i < pstate->numWorkers; i++)
1279 {
1280 if (pstate->parallelSlot[i].workerStatus != WRKR_IDLE)
1281 return false;
1282 }
1283 return true;
1284 }
1285
1286 /*
1287 * Acquire lock on a table to be dumped by a worker process.
1288 *
1289 * The master process is already holding an ACCESS SHARE lock. Ordinarily
1290 * it's no problem for a worker to get one too, but if anything else besides
1291 * pg_dump is running, there's a possible deadlock:
1292 *
1293 * 1) Master dumps the schema and locks all tables in ACCESS SHARE mode.
1294 * 2) Another process requests an ACCESS EXCLUSIVE lock (which is not granted
1295 * because the master holds a conflicting ACCESS SHARE lock).
1296 * 3) A worker process also requests an ACCESS SHARE lock to read the table.
1297 * The worker is enqueued behind the ACCESS EXCLUSIVE lock request.
1298 * 4) Now we have a deadlock, since the master is effectively waiting for
1299 * the worker. The server cannot detect that, however.
1300 *
1301 * To prevent an infinite wait, prior to touching a table in a worker, request
1302 * a lock in ACCESS SHARE mode but with NOWAIT. If we don't get the lock,
1303 * then we know that somebody else has requested an ACCESS EXCLUSIVE lock and
1304 * so we have a deadlock. We must fail the backup in that case.
1305 */
1306 static void
lockTableForWorker(ArchiveHandle * AH,TocEntry * te)1307 lockTableForWorker(ArchiveHandle *AH, TocEntry *te)
1308 {
1309 const char *qualId;
1310 PQExpBuffer query;
1311 PGresult *res;
1312
1313 /* Nothing to do for BLOBS */
1314 if (strcmp(te->desc, "BLOBS") == 0)
1315 return;
1316
1317 query = createPQExpBuffer();
1318
1319 qualId = fmtQualifiedId(te->namespace, te->tag);
1320
1321 appendPQExpBuffer(query, "LOCK TABLE %s IN ACCESS SHARE MODE NOWAIT",
1322 qualId);
1323
1324 res = PQexec(AH->connection, query->data);
1325
1326 if (!res || PQresultStatus(res) != PGRES_COMMAND_OK)
1327 fatal("could not obtain lock on relation \"%s\"\n"
1328 "This usually means that someone requested an ACCESS EXCLUSIVE lock "
1329 "on the table after the pg_dump parent process had gotten the "
1330 "initial ACCESS SHARE lock on the table.", qualId);
1331
1332 PQclear(res);
1333 destroyPQExpBuffer(query);
1334 }
1335
1336 /*
1337 * WaitForCommands: main routine for a worker process.
1338 *
1339 * Read and execute commands from the master until we see EOF on the pipe.
1340 */
1341 static void
WaitForCommands(ArchiveHandle * AH,int pipefd[2])1342 WaitForCommands(ArchiveHandle *AH, int pipefd[2])
1343 {
1344 char *command;
1345 TocEntry *te;
1346 T_Action act;
1347 int status = 0;
1348 char buf[256];
1349
1350 for (;;)
1351 {
1352 if (!(command = getMessageFromMaster(pipefd)))
1353 {
1354 /* EOF, so done */
1355 return;
1356 }
1357
1358 /* Decode the command */
1359 parseWorkerCommand(AH, &te, &act, command);
1360
1361 if (act == ACT_DUMP)
1362 {
1363 /* Acquire lock on this table within the worker's session */
1364 lockTableForWorker(AH, te);
1365
1366 /* Perform the dump command */
1367 status = (AH->WorkerJobDumpPtr) (AH, te);
1368 }
1369 else if (act == ACT_RESTORE)
1370 {
1371 /* Perform the restore command */
1372 status = (AH->WorkerJobRestorePtr) (AH, te);
1373 }
1374 else
1375 Assert(false);
1376
1377 /* Return status to master */
1378 buildWorkerResponse(AH, te, act, status, buf, sizeof(buf));
1379
1380 sendMessageToMaster(pipefd, buf);
1381
1382 /* command was pg_malloc'd and we are responsible for free()ing it. */
1383 free(command);
1384 }
1385 }
1386
1387 /*
1388 * Check for status messages from workers.
1389 *
1390 * If do_wait is true, wait to get a status message; otherwise, just return
1391 * immediately if there is none available.
1392 *
1393 * When we get a status message, we pass the status code to the callback
1394 * function that was specified to DispatchJobForTocEntry, then reset the
1395 * worker status to IDLE.
1396 *
1397 * Returns true if we collected a status message, else false.
1398 *
1399 * XXX is it worth checking for more than one status message per call?
1400 * It seems somewhat unlikely that multiple workers would finish at exactly
1401 * the same time.
1402 */
1403 static bool
ListenToWorkers(ArchiveHandle * AH,ParallelState * pstate,bool do_wait)1404 ListenToWorkers(ArchiveHandle *AH, ParallelState *pstate, bool do_wait)
1405 {
1406 int worker;
1407 char *msg;
1408
1409 /* Try to collect a status message */
1410 msg = getMessageFromWorker(pstate, do_wait, &worker);
1411
1412 if (!msg)
1413 {
1414 /* If do_wait is true, we must have detected EOF on some socket */
1415 if (do_wait)
1416 fatal("a worker process died unexpectedly");
1417 return false;
1418 }
1419
1420 /* Process it and update our idea of the worker's status */
1421 if (messageStartsWith(msg, "OK "))
1422 {
1423 ParallelSlot *slot = &pstate->parallelSlot[worker];
1424 TocEntry *te = pstate->te[worker];
1425 int status;
1426
1427 status = parseWorkerResponse(AH, te, msg);
1428 slot->callback(AH, te, status, slot->callback_data);
1429 slot->workerStatus = WRKR_IDLE;
1430 pstate->te[worker] = NULL;
1431 }
1432 else
1433 fatal("invalid message received from worker: \"%s\"",
1434 msg);
1435
1436 /* Free the string returned from getMessageFromWorker */
1437 free(msg);
1438
1439 return true;
1440 }
1441
1442 /*
1443 * Check for status results from workers, waiting if necessary.
1444 *
1445 * Available wait modes are:
1446 * WFW_NO_WAIT: reap any available status, but don't block
1447 * WFW_GOT_STATUS: wait for at least one more worker to finish
1448 * WFW_ONE_IDLE: wait for at least one worker to be idle
1449 * WFW_ALL_IDLE: wait for all workers to be idle
1450 *
1451 * Any received results are passed to the callback specified to
1452 * DispatchJobForTocEntry.
1453 *
1454 * This function is executed in the master process.
1455 */
1456 void
WaitForWorkers(ArchiveHandle * AH,ParallelState * pstate,WFW_WaitOption mode)1457 WaitForWorkers(ArchiveHandle *AH, ParallelState *pstate, WFW_WaitOption mode)
1458 {
1459 bool do_wait = false;
1460
1461 /*
1462 * In GOT_STATUS mode, always block waiting for a message, since we can't
1463 * return till we get something. In other modes, we don't block the first
1464 * time through the loop.
1465 */
1466 if (mode == WFW_GOT_STATUS)
1467 {
1468 /* Assert that caller knows what it's doing */
1469 Assert(!IsEveryWorkerIdle(pstate));
1470 do_wait = true;
1471 }
1472
1473 for (;;)
1474 {
1475 /*
1476 * Check for status messages, even if we don't need to block. We do
1477 * not try very hard to reap all available messages, though, since
1478 * there's unlikely to be more than one.
1479 */
1480 if (ListenToWorkers(AH, pstate, do_wait))
1481 {
1482 /*
1483 * If we got a message, we are done by definition for GOT_STATUS
1484 * mode, and we can also be certain that there's at least one idle
1485 * worker. So we're done in all but ALL_IDLE mode.
1486 */
1487 if (mode != WFW_ALL_IDLE)
1488 return;
1489 }
1490
1491 /* Check whether we must wait for new status messages */
1492 switch (mode)
1493 {
1494 case WFW_NO_WAIT:
1495 return; /* never wait */
1496 case WFW_GOT_STATUS:
1497 Assert(false); /* can't get here, because we waited */
1498 break;
1499 case WFW_ONE_IDLE:
1500 if (GetIdleWorker(pstate) != NO_SLOT)
1501 return;
1502 break;
1503 case WFW_ALL_IDLE:
1504 if (IsEveryWorkerIdle(pstate))
1505 return;
1506 break;
1507 }
1508
1509 /* Loop back, and this time wait for something to happen */
1510 do_wait = true;
1511 }
1512 }
1513
1514 /*
1515 * Read one command message from the master, blocking if necessary
1516 * until one is available, and return it as a malloc'd string.
1517 * On EOF, return NULL.
1518 *
1519 * This function is executed in worker processes.
1520 */
1521 static char *
getMessageFromMaster(int pipefd[2])1522 getMessageFromMaster(int pipefd[2])
1523 {
1524 return readMessageFromPipe(pipefd[PIPE_READ]);
1525 }
1526
1527 /*
1528 * Send a status message to the master.
1529 *
1530 * This function is executed in worker processes.
1531 */
1532 static void
sendMessageToMaster(int pipefd[2],const char * str)1533 sendMessageToMaster(int pipefd[2], const char *str)
1534 {
1535 int len = strlen(str) + 1;
1536
1537 if (pipewrite(pipefd[PIPE_WRITE], str, len) != len)
1538 fatal("could not write to the communication channel: %m");
1539 }
1540
/*
 * select_loop: wait until some descriptor in "workerset" becomes readable.
 *
 * maxFd is the highest descriptor number in the set.  select() is called
 * without a timeout, and is called again whenever it fails with EINTR
 * (WSAEINTR on Windows).  The caller's set is restored from a saved copy
 * before each attempt.
 *
 * Returns -1 on error, else the number of readable descriptors; on return
 * *workerset is as the final select() call left it.
 */
static int
select_loop(int maxFd, fd_set *workerset)
{
	const fd_set wanted = *workerset;
	int			nready;

	do
	{
		/* start each attempt from the original descriptor set */
		*workerset = wanted;
		nready = select(maxFd + 1, workerset, NULL, NULL, NULL);
	}
#ifndef WIN32
	while (nready < 0 && errno == EINTR);
#else
	while (nready == SOCKET_ERROR && WSAGetLastError() == WSAEINTR);
#endif

	return nready;
}
1568
1569
1570 /*
1571 * Check for messages from worker processes.
1572 *
1573 * If a message is available, return it as a malloc'd string, and put the
1574 * index of the sending worker in *worker.
1575 *
1576 * If nothing is available, wait if "do_wait" is true, else return NULL.
1577 *
1578 * If we detect EOF on any socket, we'll return NULL. It's not great that
1579 * that's hard to distinguish from the no-data-available case, but for now
1580 * our one caller is okay with that.
1581 *
1582 * This function is executed in the master process.
1583 */
1584 static char *
getMessageFromWorker(ParallelState * pstate,bool do_wait,int * worker)1585 getMessageFromWorker(ParallelState *pstate, bool do_wait, int *worker)
1586 {
1587 int i;
1588 fd_set workerset;
1589 int maxFd = -1;
1590 struct timeval nowait = {0, 0};
1591
1592 /* construct bitmap of socket descriptors for select() */
1593 FD_ZERO(&workerset);
1594 for (i = 0; i < pstate->numWorkers; i++)
1595 {
1596 if (!WORKER_IS_RUNNING(pstate->parallelSlot[i].workerStatus))
1597 continue;
1598 FD_SET(pstate->parallelSlot[i].pipeRead, &workerset);
1599 if (pstate->parallelSlot[i].pipeRead > maxFd)
1600 maxFd = pstate->parallelSlot[i].pipeRead;
1601 }
1602
1603 if (do_wait)
1604 {
1605 i = select_loop(maxFd, &workerset);
1606 Assert(i != 0);
1607 }
1608 else
1609 {
1610 if ((i = select(maxFd + 1, &workerset, NULL, NULL, &nowait)) == 0)
1611 return NULL;
1612 }
1613
1614 if (i < 0)
1615 fatal("select() failed: %m");
1616
1617 for (i = 0; i < pstate->numWorkers; i++)
1618 {
1619 char *msg;
1620
1621 if (!WORKER_IS_RUNNING(pstate->parallelSlot[i].workerStatus))
1622 continue;
1623 if (!FD_ISSET(pstate->parallelSlot[i].pipeRead, &workerset))
1624 continue;
1625
1626 /*
1627 * Read the message if any. If the socket is ready because of EOF,
1628 * we'll return NULL instead (and the socket will stay ready, so the
1629 * condition will persist).
1630 *
1631 * Note: because this is a blocking read, we'll wait if only part of
1632 * the message is available. Waiting a long time would be bad, but
1633 * since worker status messages are short and are always sent in one
1634 * operation, it shouldn't be a problem in practice.
1635 */
1636 msg = readMessageFromPipe(pstate->parallelSlot[i].pipeRead);
1637 *worker = i;
1638 return msg;
1639 }
1640 Assert(false);
1641 return NULL;
1642 }
1643
1644 /*
1645 * Send a command message to the specified worker process.
1646 *
1647 * This function is executed in the master process.
1648 */
1649 static void
sendMessageToWorker(ParallelState * pstate,int worker,const char * str)1650 sendMessageToWorker(ParallelState *pstate, int worker, const char *str)
1651 {
1652 int len = strlen(str) + 1;
1653
1654 if (pipewrite(pstate->parallelSlot[worker].pipeWrite, str, len) != len)
1655 {
1656 fatal("could not write to the communication channel: %m");
1657 }
1658 }
1659
/*
 * readMessageFromPipe: read one message from the given descriptor.
 *
 * Blocks until a complete message is available and returns it as a malloc'd
 * string.  Returns NULL if piperead() reports an error or a closed
 * connection before the message is complete.
 *
 * A "message" on the channel is just a null-terminated string.
 */
static char *
readMessageFromPipe(int fd)
{
	int			capacity = 64;	/* could be any number */
	int			used = 0;
	char	   *buf = (char *) pg_malloc(capacity);

	/*
	 * Were we to let piperead() fetch several bytes at once, it could in
	 * theory hand back pieces of more than one message.  (That can't
	 * actually occur, since neither master nor workers send more than one
	 * message without waiting for a reply, but we don't wish to assume that
	 * here.)  So, for simplicity, we read one byte at a time until the
	 * terminating '\0' shows up.  That is a bit inefficient, but these
	 * command and status strings are relatively short, so it shouldn't
	 * matter.
	 */
	for (;;)
	{
		int			nread;

		Assert(used < capacity);
		nread = piperead(fd, buf + used, 1);
		if (nread <= 0)
		{
			/* error, or the other end has closed the connection */
			pg_free(buf);
			return NULL;
		}

		Assert(nread == 1);

		/* The terminator completes the message */
		if (buf[used] == '\0')
			return buf;

		/* Keep the byte, and make sure there is room for the next one */
		if (++used == capacity)
		{
			capacity += 16;		/* could be any number */
			buf = (char *) pg_realloc(buf, capacity);
		}
	}
}
1711
1712 #ifdef WIN32
1713
1714 /*
1715 * This is a replacement version of pipe(2) for Windows which allows the pipe
1716 * handles to be used in select().
1717 *
1718 * Reads and writes on the pipe must go through piperead()/pipewrite().
1719 *
1720 * For consistency with Unix we declare the returned handles as "int".
1721 * This is okay even on WIN64 because system handles are not more than
1722 * 32 bits wide, but we do have to do some casting.
1723 */
1724 static int
pgpipe(int handles[2])1725 pgpipe(int handles[2])
1726 {
1727 pgsocket s,
1728 tmp_sock;
1729 struct sockaddr_in serv_addr;
1730 int len = sizeof(serv_addr);
1731
1732 /* We have to use the Unix socket invalid file descriptor value here. */
1733 handles[0] = handles[1] = -1;
1734
1735 /*
1736 * setup listen socket
1737 */
1738 if ((s = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
1739 {
1740 pg_log_error("pgpipe: could not create socket: error code %d",
1741 WSAGetLastError());
1742 return -1;
1743 }
1744
1745 memset((void *) &serv_addr, 0, sizeof(serv_addr));
1746 serv_addr.sin_family = AF_INET;
1747 serv_addr.sin_port = pg_hton16(0);
1748 serv_addr.sin_addr.s_addr = pg_hton32(INADDR_LOOPBACK);
1749 if (bind(s, (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
1750 {
1751 pg_log_error("pgpipe: could not bind: error code %d",
1752 WSAGetLastError());
1753 closesocket(s);
1754 return -1;
1755 }
1756 if (listen(s, 1) == SOCKET_ERROR)
1757 {
1758 pg_log_error("pgpipe: could not listen: error code %d",
1759 WSAGetLastError());
1760 closesocket(s);
1761 return -1;
1762 }
1763 if (getsockname(s, (SOCKADDR *) &serv_addr, &len) == SOCKET_ERROR)
1764 {
1765 pg_log_error("pgpipe: getsockname() failed: error code %d",
1766 WSAGetLastError());
1767 closesocket(s);
1768 return -1;
1769 }
1770
1771 /*
1772 * setup pipe handles
1773 */
1774 if ((tmp_sock = socket(AF_INET, SOCK_STREAM, 0)) == PGINVALID_SOCKET)
1775 {
1776 pg_log_error("pgpipe: could not create second socket: error code %d",
1777 WSAGetLastError());
1778 closesocket(s);
1779 return -1;
1780 }
1781 handles[1] = (int) tmp_sock;
1782
1783 if (connect(handles[1], (SOCKADDR *) &serv_addr, len) == SOCKET_ERROR)
1784 {
1785 pg_log_error("pgpipe: could not connect socket: error code %d",
1786 WSAGetLastError());
1787 closesocket(handles[1]);
1788 handles[1] = -1;
1789 closesocket(s);
1790 return -1;
1791 }
1792 if ((tmp_sock = accept(s, (SOCKADDR *) &serv_addr, &len)) == PGINVALID_SOCKET)
1793 {
1794 pg_log_error("pgpipe: could not accept connection: error code %d",
1795 WSAGetLastError());
1796 closesocket(handles[1]);
1797 handles[1] = -1;
1798 closesocket(s);
1799 return -1;
1800 }
1801 handles[0] = (int) tmp_sock;
1802
1803 closesocket(s);
1804 return 0;
1805 }
1806
1807 /*
1808 * Windows implementation of reading from a pipe.
1809 */
1810 static int
piperead(int s,char * buf,int len)1811 piperead(int s, char *buf, int len)
1812 {
1813 int ret = recv(s, buf, len, 0);
1814
1815 if (ret < 0 && WSAGetLastError() == WSAECONNRESET)
1816 {
1817 /* EOF on the pipe! */
1818 ret = 0;
1819 }
1820 return ret;
1821 }
1822
1823 #endif /* WIN32 */
1824