1 /***********************************************************************************************************************************
2 Backup Command
3 ***********************************************************************************************************************************/
4 #include "build.auto.h"
5 
6 #include <string.h>
7 #include <sys/stat.h>
8 #include <time.h>
9 #include <unistd.h>
10 
11 #include "command/archive/common.h"
12 #include "command/control/common.h"
13 #include "command/backup/backup.h"
14 #include "command/backup/common.h"
15 #include "command/backup/file.h"
16 #include "command/backup/protocol.h"
17 #include "command/check/common.h"
18 #include "command/stanza/common.h"
19 #include "common/crypto/cipherBlock.h"
20 #include "common/compress/helper.h"
21 #include "common/debug.h"
22 #include "common/io/filter/size.h"
23 #include "common/log.h"
24 #include "common/time.h"
25 #include "common/type/convert.h"
26 #include "common/type/json.h"
27 #include "config/config.h"
28 #include "db/helper.h"
29 #include "info/infoArchive.h"
30 #include "info/infoBackup.h"
31 #include "info/manifest.h"
32 #include "postgres/interface.h"
33 #include "postgres/version.h"
34 #include "protocol/helper.h"
35 #include "protocol/parallel.h"
36 #include "storage/helper.h"
37 #include "version.h"
38 
39 /**********************************************************************************************************************************
40 Generate a unique backup label that does not contain a timestamp from a previous backup
41 ***********************************************************************************************************************************/
42 static String *
backupLabelCreate(BackupType type,const String * backupLabelPrior,time_t timestamp)43 backupLabelCreate(BackupType type, const String *backupLabelPrior, time_t timestamp)
44 {
45     FUNCTION_LOG_BEGIN(logLevelTrace);
46         FUNCTION_LOG_PARAM(STRING_ID, type);
47         FUNCTION_LOG_PARAM(STRING, backupLabelPrior);
48         FUNCTION_LOG_PARAM(TIME, timestamp);
49     FUNCTION_LOG_END();
50 
51     ASSERT((type == backupTypeFull && backupLabelPrior == NULL) || (type != backupTypeFull && backupLabelPrior != NULL));
52     ASSERT(timestamp > 0);
53 
54     String *result = NULL;
55 
56     MEM_CONTEXT_TEMP_BEGIN()
57     {
58         const String *backupLabelLatest = NULL;
59 
60         // Get the newest backup
61         const StringList *backupList = strLstSort(
62             storageListP(
63                 storageRepo(), STRDEF(STORAGE_REPO_BACKUP),
64                 .expression = backupRegExpP(.full = true, .differential = true, .incremental = true)),
65             sortOrderDesc);
66 
67         if (!strLstEmpty(backupList))
68             backupLabelLatest = strLstGet(backupList, 0);
69 
70         // Get the newest history
71         const StringList *historyYearList = strLstSort(
72             storageListP(storageRepo(), STRDEF(STORAGE_REPO_BACKUP "/" BACKUP_PATH_HISTORY), .expression = STRDEF("^2[0-9]{3}$")),
73             sortOrderDesc);
74 
75         if (!strLstEmpty(historyYearList))
76         {
77             const StringList *historyList = strLstSort(
78                 storageListP(
79                     storageRepo(),
80                     strNewFmt(STORAGE_REPO_BACKUP "/" BACKUP_PATH_HISTORY "/%s", strZ(strLstGet(historyYearList, 0))),
81                     .expression = strNewFmt(
82                         "%s\\.manifest\\.%s$",
83                         strZ(backupRegExpP(.full = true, .differential = true, .incremental = true, .noAnchorEnd = true)),
84                         strZ(compressTypeStr(compressTypeGz)))),
85                 sortOrderDesc);
86 
87             if (!strLstEmpty(historyList))
88             {
89                 const String *historyLabelLatest = strLstGet(historyList, 0);
90 
91                 if (backupLabelLatest == NULL || strCmp(historyLabelLatest, backupLabelLatest) > 0)
92                     backupLabelLatest = historyLabelLatest;
93             }
94         }
95 
96         // Now that we have the latest label check if the provided timestamp will give us an even later label
97         result = backupLabelFormat(type, backupLabelPrior, timestamp);
98 
99         if (backupLabelLatest != NULL && strCmp(result, backupLabelLatest) <= 0)
100         {
101             // If that didn't give us a later label then add one second.  It's possible that two backups (they would need to be
102             // offline or halted online) have run very close together.
103             result = backupLabelFormat(type, backupLabelPrior, timestamp + 1);
104 
105             // If the label is still not latest then error.  There is probably a timezone change or massive clock skew.
106             if (strCmp(result, backupLabelLatest) <= 0)
107             {
108                 THROW_FMT(
109                     FormatError,
110                     "new backup label '%s' is not later than latest backup label '%s'\n"
111                     "HINT: has the timezone changed?\n"
112                     "HINT: is there clock skew?",
113                     strZ(result), strZ(backupLabelLatest));
114             }
115 
116             // If adding a second worked then sleep the remainder of the current second so we don't start early
117             sleepMSec(MSEC_PER_SEC - (timeMSec() % MSEC_PER_SEC));
118         }
119 
120         MEM_CONTEXT_PRIOR_BEGIN()
121         {
122             result = strDup(result);
123         }
124         MEM_CONTEXT_PRIOR_END();
125     }
126     MEM_CONTEXT_TEMP_END();
127 
128     FUNCTION_LOG_RETURN(STRING, result);
129 }
130 
131 /***********************************************************************************************************************************
132 Get the postgres database and storage objects
133 ***********************************************************************************************************************************/
134 #define FUNCTION_LOG_BACKUP_DATA_TYPE                                                                                              \
135     BackupData *
136 #define FUNCTION_LOG_BACKUP_DATA_FORMAT(value, buffer, bufferSize)                                                                 \
137     objToLog(value, "BackupData", buffer, bufferSize)
138 
139 typedef struct BackupData
140 {
141     unsigned int pgIdxPrimary;                                      // cfgOptGrpPg index of the primary
142     Db *dbPrimary;                                                  // Database connection to the primary
143     const Storage *storagePrimary;                                  // Storage object for the primary
144     const String *hostPrimary;                                      // Host name of the primary
145 
146     unsigned int pgIdxStandby;                                      // cfgOptGrpPg index of the standby
147     Db *dbStandby;                                                  // Database connection to the standby
148     const Storage *storageStandby;                                  // Storage object for the standby
149     const String *hostStandby;                                      // Host name of the standby
150 
151     unsigned int version;                                           // PostgreSQL version
152     unsigned int walSegmentSize;                                    // PostgreSQL wal segment size
153 } BackupData;
154 
155 static BackupData *
backupInit(const InfoBackup * infoBackup)156 backupInit(const InfoBackup *infoBackup)
157 {
158     FUNCTION_LOG_BEGIN(logLevelDebug);
159         FUNCTION_LOG_PARAM(INFO_BACKUP, infoBackup);
160     FUNCTION_LOG_END();
161 
162     ASSERT(infoBackup != NULL);
163 
164     // Initialize for offline backup
165     BackupData *result = memNew(sizeof(BackupData));
166     *result = (BackupData){0};
167 
168     // Check that the PostgreSQL version supports backup from standby. The check is done using the stanza info because pg_control
169     // cannot be loaded until a primary is found -- which will also lead to an error if the version does not support standby. If the
170     // pg_control version does not match the stanza version then there will be an error further down.
171     InfoPgData infoPg = infoPgDataCurrent(infoBackupPg(infoBackup));
172 
173     if (cfgOptionBool(cfgOptOnline) && cfgOptionBool(cfgOptBackupStandby) && infoPg.version < PG_VERSION_BACKUP_STANDBY)
174     {
175         THROW_FMT(
176             ConfigError, "option '" CFGOPT_BACKUP_STANDBY "' not valid for " PG_NAME " < %s",
177             strZ(pgVersionToStr(PG_VERSION_BACKUP_STANDBY)));
178     }
179 
180     // Don't allow backup from standby when offline
181     if (!cfgOptionBool(cfgOptOnline) && cfgOptionBool(cfgOptBackupStandby))
182     {
183         LOG_WARN(
184             "option " CFGOPT_BACKUP_STANDBY " is enabled but backup is offline - backups will be performed from the primary");
185         cfgOptionSet(cfgOptBackupStandby, cfgSourceParam, BOOL_FALSE_VAR);
186     }
187 
188     // Get database info when online
189     if (cfgOptionBool(cfgOptOnline))
190     {
191         bool backupStandby = cfgOptionBool(cfgOptBackupStandby);
192         DbGetResult dbInfo = dbGet(!backupStandby, true, backupStandby);
193 
194         result->pgIdxPrimary = dbInfo.primaryIdx;
195         result->dbPrimary = dbInfo.primary;
196 
197         if (backupStandby)
198         {
199             ASSERT(dbInfo.standby != NULL);
200 
201             result->pgIdxStandby = dbInfo.standbyIdx;
202             result->dbStandby = dbInfo.standby;
203             result->storageStandby = storagePgIdx(result->pgIdxStandby);
204             result->hostStandby = cfgOptionIdxStrNull(cfgOptPgHost, result->pgIdxStandby);
205         }
206     }
207 
208     // Add primary info
209     result->storagePrimary = storagePgIdx(result->pgIdxPrimary);
210     result->hostPrimary = cfgOptionIdxStrNull(cfgOptPgHost, result->pgIdxPrimary);
211 
212     // Get pg_control info from the primary
213     PgControl pgControl = pgControlFromFile(result->storagePrimary);
214 
215     result->version = pgControl.version;
216     result->walSegmentSize = pgControl.walSegmentSize;
217 
218     // Validate pg_control info against the stanza
219     if (result->version != infoPg.version || pgControl.systemId != infoPg.systemId)
220     {
221         THROW_FMT(
222             BackupMismatchError,
223             PG_NAME " version %s, system-id %" PRIu64 " do not match stanza version %s, system-id %" PRIu64 "\n"
224             "HINT: is this the correct stanza?", strZ(pgVersionToStr(pgControl.version)), pgControl.systemId,
225             strZ(pgVersionToStr(infoPg.version)), infoPg.systemId);
226     }
227 
228     // Only allow stop auto in PostgreSQL >= 9.3 and <= 9.5
229     if (cfgOptionBool(cfgOptStopAuto) && result->version < PG_VERSION_93)
230     {
231         LOG_WARN(CFGOPT_STOP_AUTO " option is only available in " PG_NAME " >= " PG_VERSION_93_STR);
232         cfgOptionSet(cfgOptStopAuto, cfgSourceParam, BOOL_FALSE_VAR);
233     }
234 
235     // Only allow start-fast option for PostgreSQL >= 8.4
236     if (cfgOptionBool(cfgOptStartFast) && result->version < PG_VERSION_84)
237     {
238         LOG_WARN(CFGOPT_START_FAST " option is only available in " PG_NAME " >= " PG_VERSION_84_STR);
239         cfgOptionSet(cfgOptStartFast, cfgSourceParam, BOOL_FALSE_VAR);
240     }
241 
242     // If checksum page is not explicity set then automatically enable it when checksums are available
243     if (!cfgOptionTest(cfgOptChecksumPage))
244     {
245         // If online then use the value in pg_control to set checksum-page
246         if (cfgOptionBool(cfgOptOnline))
247         {
248             cfgOptionSet(cfgOptChecksumPage, cfgSourceParam, VARBOOL(pgControl.pageChecksum));
249         }
250         // Else set to false.  An offline cluster is likely to have false positives so better if the user enables manually.
251         else
252             cfgOptionSet(cfgOptChecksumPage, cfgSourceParam, BOOL_FALSE_VAR);
253     }
254     // Else if checksums have been explicitly enabled but are not available then warn and reset. ??? We should be able to make this
255     // determination when offline as well, but the integration tests don't write pg_control accurately enough to support it.
256     else if (cfgOptionBool(cfgOptOnline) && !pgControl.pageChecksum && cfgOptionBool(cfgOptChecksumPage))
257     {
258         LOG_WARN(CFGOPT_CHECKSUM_PAGE " option set to true but checksums are not enabled on the cluster, resetting to false");
259         cfgOptionSet(cfgOptChecksumPage, cfgSourceParam, BOOL_FALSE_VAR);
260     }
261 
262     FUNCTION_LOG_RETURN(BACKUP_DATA, result);
263 }
264 
265 /**********************************************************************************************************************************
266 Get time from the database or locally depending on online
267 ***********************************************************************************************************************************/
268 static time_t
backupTime(BackupData * backupData,bool waitRemainder)269 backupTime(BackupData *backupData, bool waitRemainder)
270 {
271     FUNCTION_LOG_BEGIN(logLevelDebug);
272         FUNCTION_LOG_PARAM(BACKUP_DATA, backupData);
273         FUNCTION_LOG_PARAM(BOOL, waitRemainder);
274     FUNCTION_LOG_END();
275 
276     // Offline backups will just grab the time from the local system since the value of copyStart is not important in this context.
277     // No worries about causing a delta backup since switching online will do that anyway.
278     time_t result = time(NULL);
279 
280     // When online get the time from the database server
281     if (cfgOptionBool(cfgOptOnline))
282     {
283         // Get time from the database
284         TimeMSec timeMSec = dbTimeMSec(backupData->dbPrimary);
285         result = (time_t)(timeMSec / MSEC_PER_SEC);
286 
287         // Sleep the remainder of the second when requested (this is so copyStart is not subject to one second resolution issues)
288         if (waitRemainder)
289         {
290             unsigned int retry = 0;
291 
292             // Just to be safe we'll loop until PostgreSQL reports that we have slept long enough
293             do
294             {
295                 // Error if the clock has not advanced after several attempts
296                 if (retry == 3)
297                     THROW_FMT(KernelError, PG_NAME " clock has not advanced to the next second after %u tries", retry);
298 
299                 // Sleep remainder of current second
300                 sleepMSec(((TimeMSec)(result + 1) * MSEC_PER_SEC) - timeMSec);
301 
302                 // Check time again to be sure we slept long enough
303                 timeMSec = dbTimeMSec(backupData->dbPrimary);
304 
305                 // Increment retry to prevent an infinite loop
306                 retry++;
307             }
308             while ((time_t)(timeMSec / MSEC_PER_SEC) <= result);
309         }
310     }
311 
312     FUNCTION_LOG_RETURN(TIME, result);
313 }
314 
315 /***********************************************************************************************************************************
316 Create an incremental backup if type is not full and a compatible prior backup exists
317 ***********************************************************************************************************************************/
318 // Helper to find a compatible prior backup
319 static Manifest *
backupBuildIncrPrior(const InfoBackup * infoBackup)320 backupBuildIncrPrior(const InfoBackup *infoBackup)
321 {
322     FUNCTION_LOG_BEGIN(logLevelDebug);
323         FUNCTION_LOG_PARAM(INFO_BACKUP, infoBackup);
324     FUNCTION_LOG_END();
325 
326     ASSERT(infoBackup != NULL);
327 
328     Manifest *result = NULL;
329 
330     // No incremental if backup type is full
331     BackupType type = (BackupType)cfgOptionStrId(cfgOptType);
332 
333     if (type != backupTypeFull)
334     {
335         MEM_CONTEXT_TEMP_BEGIN()
336         {
337             InfoPgData infoPg = infoPgDataCurrent(infoBackupPg(infoBackup));
338             const String *backupLabelPrior = NULL;
339             unsigned int backupTotal = infoBackupDataTotal(infoBackup);
340 
341             for (unsigned int backupIdx = backupTotal - 1; backupIdx < backupTotal; backupIdx--)
342             {
343                  InfoBackupData backupPrior = infoBackupData(infoBackup, backupIdx);
344 
345                 // The prior backup for a diff must be full
346                 if (type == backupTypeDiff && backupPrior.backupType != backupTypeFull)
347                     continue;
348 
349                 // The backups must come from the same cluster ??? This should enable delta instead
350                 if (infoPg.id != backupPrior.backupPgId)
351                     continue;
352 
353                 // This backup is a candidate for prior
354                 backupLabelPrior = strDup(backupPrior.backupLabel);
355                 break;
356             }
357 
358             // If there is a prior backup then check that options for the new backup are compatible
359             if (backupLabelPrior != NULL)
360             {
361                 result = manifestLoadFile(
362                     storageRepo(), strNewFmt(STORAGE_REPO_BACKUP "/%s/" BACKUP_MANIFEST_FILE, strZ(backupLabelPrior)),
363                     cfgOptionStrId(cfgOptRepoCipherType), infoPgCipherPass(infoBackupPg(infoBackup)));
364                 const ManifestData *manifestPriorData = manifestData(result);
365 
366                 LOG_INFO_FMT(
367                     "last backup label = %s, version = %s", strZ(manifestData(result)->backupLabel),
368                     strZ(manifestData(result)->backrestVersion));
369 
370                 // Warn if compress-type option changed
371                 if (compressTypeEnum(cfgOptionStr(cfgOptCompressType)) != manifestPriorData->backupOptionCompressType)
372                 {
373                     LOG_WARN_FMT(
374                         "%s backup cannot alter " CFGOPT_COMPRESS_TYPE " option to '%s', reset to value in %s",
375                         strZ(cfgOptionDisplay(cfgOptType)), strZ(cfgOptionDisplay(cfgOptCompressType)), strZ(backupLabelPrior));
376 
377                     // Set the compression type back to whatever was in the prior backup.  This is not strictly needed since we
378                     // could store compression type on a per file basis, but it seems simplest and safest for now.
379                     cfgOptionSet(
380                         cfgOptCompressType, cfgSourceParam, VARSTR(compressTypeStr(manifestPriorData->backupOptionCompressType)));
381 
382                     // There's a small chance that the prior manifest is old enough that backupOptionCompressLevel was not recorded.
383                     // There's an even smaller chance that the user will also alter compression-type in this scenario right after
384                     // upgrading to a newer version. Because we judge this combination of events to be nearly impossible just assert
385                     // here so no test coverage is needed.
386                     CHECK(manifestPriorData->backupOptionCompressLevel != NULL);
387 
388                     // Set the compression level back to whatever was in the prior backup
389                     cfgOptionSet(cfgOptCompressLevel, cfgSourceParam, manifestPriorData->backupOptionCompressLevel);
390                 }
391 
392                 // Warn if hardlink option changed ??? Doesn't seem like this is needed?  Hardlinks are always to a directory that
393                 // is guaranteed to contain a real file -- like references.  Also annoying that if the full backup was not
394                 // hardlinked then an diff/incr can't be used because we need more testing.
395                 if (cfgOptionBool(cfgOptRepoHardlink) != manifestPriorData->backupOptionHardLink)
396                 {
397                     LOG_WARN_FMT(
398                         "%s backup cannot alter hardlink option to '%s', reset to value in %s",
399                         strZ(cfgOptionDisplay(cfgOptType)), strZ(cfgOptionDisplay(cfgOptRepoHardlink)), strZ(backupLabelPrior));
400                     cfgOptionSet(cfgOptRepoHardlink, cfgSourceParam, VARBOOL(manifestPriorData->backupOptionHardLink));
401                 }
402 
403                 // If not defined this backup was done in a version prior to page checksums being introduced.  Just set
404                 // checksum-page to false and move on without a warning.  Page checksums will start on the next full backup.
405                 if (manifestData(result)->backupOptionChecksumPage == NULL)
406                 {
407                     cfgOptionSet(cfgOptChecksumPage, cfgSourceParam, BOOL_FALSE_VAR);
408                 }
409                 // Don't allow the checksum-page option to change in a diff or incr backup.  This could be confusing as only
410                 // certain files would be checksummed and the list could be incomplete during reporting.
411                 else
412                 {
413                     bool checksumPagePrior = varBool(manifestData(result)->backupOptionChecksumPage);
414 
415                     // Warn if an incompatible setting was explicitly requested
416                     if (checksumPagePrior != cfgOptionBool(cfgOptChecksumPage))
417                     {
418                         LOG_WARN_FMT(
419                             "%s backup cannot alter '" CFGOPT_CHECKSUM_PAGE "' option to '%s', reset to '%s' from %s",
420                             strZ(cfgOptionDisplay(cfgOptType)), strZ(cfgOptionDisplay(cfgOptChecksumPage)),
421                             cvtBoolToConstZ(checksumPagePrior), strZ(manifestData(result)->backupLabel));
422                     }
423 
424                     cfgOptionSet(cfgOptChecksumPage, cfgSourceParam, VARBOOL(checksumPagePrior));
425                 }
426 
427                 manifestMove(result, memContextPrior());
428             }
429             else
430             {
431                 LOG_WARN_FMT("no prior backup exists, %s backup has been changed to full", strZ(cfgOptionDisplay(cfgOptType)));
432                 cfgOptionSet(cfgOptType, cfgSourceParam, VARSTR(strIdToStr(backupTypeFull)));
433             }
434         }
435         MEM_CONTEXT_TEMP_END();
436     }
437 
438     FUNCTION_LOG_RETURN(MANIFEST, result);
439 }
440 
441 static bool
backupBuildIncr(const InfoBackup * infoBackup,Manifest * manifest,Manifest * manifestPrior,const String * archiveStart)442 backupBuildIncr(const InfoBackup *infoBackup, Manifest *manifest, Manifest *manifestPrior, const String *archiveStart)
443 {
444     FUNCTION_LOG_BEGIN(logLevelDebug);
445         FUNCTION_LOG_PARAM(INFO_BACKUP, infoBackup);
446         FUNCTION_LOG_PARAM(MANIFEST, manifest);
447         FUNCTION_LOG_PARAM(MANIFEST, manifestPrior);
448         FUNCTION_LOG_PARAM(STRING, archiveStart);
449     FUNCTION_LOG_END();
450 
451     ASSERT(infoBackup != NULL);
452     ASSERT(manifest != NULL);
453 
454     bool result = false;
455 
456     // No incremental if no prior manifest
457     if (manifestPrior != NULL)
458     {
459         MEM_CONTEXT_TEMP_BEGIN()
460         {
461             // Move the manifest to this context so it will be freed when we are done
462             manifestMove(manifestPrior, MEM_CONTEXT_TEMP());
463 
464             // Build incremental manifest
465             manifestBuildIncr(manifest, manifestPrior, (BackupType)cfgOptionStrId(cfgOptType), archiveStart);
466 
467             // Set the cipher subpass from prior manifest since we want a single subpass for the entire backup set
468             manifestCipherSubPassSet(manifest, manifestCipherSubPass(manifestPrior));
469 
470             // Incremental was built
471             result = true;
472         }
473         MEM_CONTEXT_TEMP_END();
474     }
475 
476     FUNCTION_LOG_RETURN(BOOL, result);
477 }
478 
479 /***********************************************************************************************************************************
480 Check for a backup that can be resumed and merge into the manifest if found
481 ***********************************************************************************************************************************/
482 typedef struct BackupResumeData
483 {
484     Manifest *manifest;                                             // New manifest
485     const Manifest *manifestResume;                                 // Resumed manifest
486     const CompressType compressType;                                // Backup compression type
487     const bool delta;                                               // Is this a delta backup?
488     const String *backupPath;                                       // Path to the current level of the backup being cleaned
489     const String *manifestParentName;                               // Parent manifest name used to construct manifest name
490 } BackupResumeData;
491 
492 // Callback to clean invalid paths/files/links out of the resumable backup path
backupResumeCallback(void * data,const StorageInfo * info)493 void backupResumeCallback(void *data, const StorageInfo *info)
494 {
495     FUNCTION_TEST_BEGIN();
496         FUNCTION_TEST_PARAM_P(VOID, data);
497         FUNCTION_TEST_PARAM(STORAGE_INFO, *storageInfo);
498     FUNCTION_TEST_END();
499 
500     ASSERT(data != NULL);
501     ASSERT(info != NULL);
502 
503     BackupResumeData *resumeData = data;
504 
505     // Skip all . paths because they have already been handled on the previous level of recursion
506     if (strEq(info->name, DOT_STR))
507     {
508         FUNCTION_TEST_RETURN_VOID();
509         return;
510     }
511 
512     // Skip backup.manifest.copy -- it must be preserved to allow resume again if this process throws an error before writing the
513     // manifest for the first time
514     if (resumeData->manifestParentName == NULL && strEqZ(info->name, BACKUP_MANIFEST_FILE INFO_COPY_EXT))
515     {
516         FUNCTION_TEST_RETURN_VOID();
517         return;
518     }
519 
520     // Build the name used to lookup files in the manifest
521     const String *manifestName = resumeData->manifestParentName != NULL ?
522         strNewFmt("%s/%s", strZ(resumeData->manifestParentName), strZ(info->name)) : info->name;
523 
524     // Build the backup path used to remove files/links/paths that are invalid
525     const String *backupPath = strNewFmt("%s/%s", strZ(resumeData->backupPath), strZ(info->name));
526 
527     // Process file types
528     switch (info->type)
529     {
530         // Check paths
531         // -------------------------------------------------------------------------------------------------------------------------
532         case storageTypePath:
533         {
534             // If the path was not found in the new manifest then remove it
535             if (manifestPathFindDefault(resumeData->manifest, manifestName, NULL) == NULL)
536             {
537                 LOG_DETAIL_FMT("remove path '%s' from resumed backup", strZ(storagePathP(storageRepo(), backupPath)));
538                 storagePathRemoveP(storageRepoWrite(), backupPath, .recurse = true);
539             }
540             // Else recurse into the path
541             {
542                 BackupResumeData resumeDataSub = *resumeData;
543                 resumeDataSub.manifestParentName = manifestName;
544                 resumeDataSub.backupPath = backupPath;
545 
546                 storageInfoListP(
547                     storageRepo(), resumeDataSub.backupPath, backupResumeCallback, &resumeDataSub, .sortOrder = sortOrderAsc);
548             }
549 
550             break;
551         }
552 
553         // Check files
554         // -------------------------------------------------------------------------------------------------------------------------
555         case storageTypeFile:
556         {
557             // If the file is compressed then strip off the extension before doing the lookup
558             CompressType fileCompressType = compressTypeFromName(manifestName);
559 
560             if (fileCompressType != compressTypeNone)
561                 manifestName = compressExtStrip(manifestName, fileCompressType);
562 
563             // Find the file in both manifests
564             const ManifestFile *file = manifestFileFindDefault(resumeData->manifest, manifestName, NULL);
565             const ManifestFile *fileResume = manifestFileFindDefault(resumeData->manifestResume, manifestName, NULL);
566 
567             // Check if the file can be resumed or must be removed
568             const char *removeReason = NULL;
569 
570             if (fileCompressType != resumeData->compressType)
571                 removeReason = "mismatched compression type";
572             else if (file == NULL)
573                 removeReason = "missing in manifest";
574             else if (file->reference != NULL)
575                 removeReason = "reference in manifest";
576             else if (fileResume == NULL)
577                 removeReason = "missing in resumed manifest";
578             else if (fileResume->reference != NULL)
579                 removeReason = "reference in resumed manifest";
580             else if (fileResume->checksumSha1[0] == '\0')
581                 removeReason = "no checksum in resumed manifest";
582             else if (file->size != fileResume->size)
583                 removeReason = "mismatched size";
584             else if (!resumeData->delta && file->timestamp != fileResume->timestamp)
585                 removeReason = "mismatched timestamp";
586             else if (file->size == 0)
587                 // ??? don't resume zero size files because Perl wouldn't -- this can be removed after the migration)
588                 removeReason = "zero size";
589             else
590             {
591                 manifestFileUpdate(
592                     resumeData->manifest, manifestName, file->size, fileResume->sizeRepo, fileResume->checksumSha1, NULL,
593                     fileResume->checksumPage, fileResume->checksumPageError, fileResume->checksumPageErrorList);
594             }
595 
596             // Remove the file if it could not be resumed
597             if (removeReason != NULL)
598             {
599                 LOG_DETAIL_FMT(
600                     "remove file '%s' from resumed backup (%s)", strZ(storagePathP(storageRepo(), backupPath)), removeReason);
601                 storageRemoveP(storageRepoWrite(), backupPath);
602             }
603 
604             break;
605         }
606 
607         // Remove links.  We could check that the link has not changed and preserve it but it doesn't seem worth the extra testing.
608         // The link will be recreated during the backup if needed.
609         // -------------------------------------------------------------------------------------------------------------------------
610         case storageTypeLink:
611             storageRemoveP(storageRepoWrite(), backupPath);
612             break;
613 
614         // Remove special files
615         // -------------------------------------------------------------------------------------------------------------------------
616         case storageTypeSpecial:
617             LOG_WARN_FMT("remove special file '%s' from resumed backup", strZ(storagePathP(storageRepo(), backupPath)));
618             storageRemoveP(storageRepoWrite(), backupPath);
619             break;
620     }
621 
622     FUNCTION_TEST_RETURN_VOID();
623 }
624 
625 // Helper to find a resumable backup
626 static const Manifest *
backupResumeFind(const Manifest * manifest,const String * cipherPassBackup)627 backupResumeFind(const Manifest *manifest, const String *cipherPassBackup)
628 {
629     FUNCTION_LOG_BEGIN(logLevelDebug);
630         FUNCTION_LOG_PARAM(MANIFEST, manifest);
631         FUNCTION_TEST_PARAM(STRING, cipherPassBackup);
632     FUNCTION_LOG_END();
633 
634     ASSERT(manifest != NULL);
635 
636     Manifest *result = NULL;
637 
638     MEM_CONTEXT_TEMP_BEGIN()
639     {
640         // Only the last backup can be resumed
641         const StringList *backupList = strLstSort(
642             storageListP(
643                 storageRepo(), STRDEF(STORAGE_REPO_BACKUP),
644                 .expression = backupRegExpP(.full = true, .differential = true, .incremental = true)),
645             sortOrderDesc);
646 
647         if (!strLstEmpty(backupList))
648         {
649             const String *backupLabel = strLstGet(backupList, 0);
650             const String *manifestFile = strNewFmt(STORAGE_REPO_BACKUP "/%s/" BACKUP_MANIFEST_FILE, strZ(backupLabel));
651 
652             // Resumable backups do not have backup.manifest
653             if (!storageExistsP(storageRepo(), manifestFile))
654             {
655                 bool usable = false;
656                 const String *reason = STRDEF("partially deleted by prior resume or invalid");
657                 Manifest *manifestResume = NULL;
658 
659                 // Resumable backups must have backup.manifest.copy
660                 if (storageExistsP(storageRepo(), strNewFmt("%s" INFO_COPY_EXT, strZ(manifestFile))))
661                 {
662                     reason = STRDEF("resume is disabled");
663 
664                     // Attempt to read the manifest file in the resumable backup to see if it can be used. If any error at all
665                     // occurs then the backup will be considered unusable and a resume will not be attempted.
666                     if (cfgOptionBool(cfgOptResume))
667                     {
668                         reason = strNewFmt("unable to read %s" INFO_COPY_EXT, strZ(manifestFile));
669 
670                         TRY_BEGIN()
671                         {
672                             manifestResume = manifestLoadFile(
673                                 storageRepo(), manifestFile, cfgOptionStrId(cfgOptRepoCipherType), cipherPassBackup);
674                             const ManifestData *manifestResumeData = manifestData(manifestResume);
675 
676                             // Check pgBackRest version. This allows the resume implementation to be changed with each version of
677                             // pgBackRest at the expense of users losing a resumable back after an upgrade, which seems worth the
678                             // cost.
679                             if (!strEq(manifestResumeData->backrestVersion, manifestData(manifest)->backrestVersion))
680                             {
681                                 reason = strNewFmt(
682                                     "new " PROJECT_NAME " version '%s' does not match resumable " PROJECT_NAME " version '%s'",
683                                     strZ(manifestData(manifest)->backrestVersion), strZ(manifestResumeData->backrestVersion));
684                             }
685                             // Check backup type because new backup label must be the same type as resume backup label
686                             else if (manifestResumeData->backupType != cfgOptionStrId(cfgOptType))
687                             {
688                                 reason = strNewFmt(
689                                     "new backup type '%s' does not match resumable backup type '%s'",
690                                     strZ(cfgOptionDisplay(cfgOptType)), strZ(strIdToStr(manifestResumeData->backupType)));
691                             }
692                             // Check prior backup label ??? Do we really care about the prior backup label?
693                             else if (!strEq(manifestResumeData->backupLabelPrior, manifestData(manifest)->backupLabelPrior))
694                             {
695                                 reason = strNewFmt(
696                                     "new prior backup label '%s' does not match resumable prior backup label '%s'",
697                                     manifestResumeData->backupLabelPrior ? strZ(manifestResumeData->backupLabelPrior) : "<undef>",
698                                     manifestData(manifest)->backupLabelPrior ?
699                                         strZ(manifestData(manifest)->backupLabelPrior) : "<undef>");
700                             }
701                             // Check compression. Compression can't be changed between backups so resume won't work either.
702                             else if (
703                                 manifestResumeData->backupOptionCompressType != compressTypeEnum(cfgOptionStr(cfgOptCompressType)))
704                             {
705                                 reason = strNewFmt(
706                                     "new compression '%s' does not match resumable compression '%s'",
707                                     strZ(cfgOptionDisplay(cfgOptCompressType)),
708                                     strZ(compressTypeStr(manifestResumeData->backupOptionCompressType)));
709                             }
710                             else
711                                 usable = true;
712                         }
713                         CATCH_ANY()
714                         {
715                         }
716                         TRY_END();
717                     }
718                 }
719 
720                 // If the backup is usable then return the manifest
721                 if (usable)
722                 {
723                     result = manifestMove(manifestResume, memContextPrior());
724                 }
725                 // Else warn and remove the unusable backup
726                 else
727                 {
728                     LOG_WARN_FMT("backup '%s' cannot be resumed: %s", strZ(backupLabel), strZ(reason));
729 
730                     storagePathRemoveP(
731                         storageRepoWrite(), strNewFmt(STORAGE_REPO_BACKUP "/%s", strZ(backupLabel)), .recurse = true);
732                 }
733             }
734         }
735     }
736     MEM_CONTEXT_TEMP_END();
737 
738     FUNCTION_LOG_RETURN(MANIFEST, result);
739 }
740 
741 static bool
backupResume(Manifest * manifest,const String * cipherPassBackup)742 backupResume(Manifest *manifest, const String *cipherPassBackup)
743 {
744     FUNCTION_LOG_BEGIN(logLevelDebug);
745         FUNCTION_LOG_PARAM(MANIFEST, manifest);
746         FUNCTION_TEST_PARAM(STRING, cipherPassBackup);
747     FUNCTION_LOG_END();
748 
749     ASSERT(manifest != NULL);
750 
751     bool result = false;
752 
753     MEM_CONTEXT_TEMP_BEGIN()
754     {
755         const Manifest *manifestResume = backupResumeFind(manifest, cipherPassBackup);
756 
757         // If a resumable backup was found set the label and cipher subpass
758         if (manifestResume)
759         {
760             // Resuming
761             result = true;
762 
763             // Set the backup label to the resumed backup
764             manifestBackupLabelSet(manifest, manifestData(manifestResume)->backupLabel);
765 
766             LOG_WARN_FMT(
767                 "resumable backup %s of same type exists -- remove invalid files and resume",
768                 strZ(manifestData(manifest)->backupLabel));
769 
770             // If resuming a full backup then copy cipher subpass since it was used to encrypt the resumable files
771             if (manifestData(manifest)->backupType == backupTypeFull)
772                 manifestCipherSubPassSet(manifest, manifestCipherSubPass(manifestResume));
773 
774             // Clean resumed backup
775             BackupResumeData resumeData =
776             {
777                 .manifest = manifest,
778                 .manifestResume = manifestResume,
779                 .compressType = compressTypeEnum(cfgOptionStr(cfgOptCompressType)),
780                 .delta = cfgOptionBool(cfgOptDelta),
781                 .backupPath = strNewFmt(STORAGE_REPO_BACKUP "/%s", strZ(manifestData(manifest)->backupLabel)),
782             };
783 
784             storageInfoListP(storageRepo(), resumeData.backupPath, backupResumeCallback, &resumeData, .sortOrder = sortOrderAsc);
785         }
786     }
787     MEM_CONTEXT_TEMP_END();
788 
789     FUNCTION_LOG_RETURN(BOOL, result);
790 }
791 
792 /***********************************************************************************************************************************
793 Start the backup
794 ***********************************************************************************************************************************/
// Values collected by backupStart(). All members are only set for an online backup and are NULL otherwise.
typedef struct BackupStartResult
{
    String *lsn;                                                    // Copy of the lsn returned by dbBackupStart()
    String *walSegmentName;                                         // Copy of the WAL segment name returned by dbBackupStart()
    VariantList *dbList;                                            // Result of dbList() on the primary
    VariantList *tablespaceList;                                    // Result of dbTablespaceList() on the primary
} BackupStartResult;
802 
803 static BackupStartResult
backupStart(BackupData * backupData)804 backupStart(BackupData *backupData)
805 {
806     FUNCTION_LOG_BEGIN(logLevelDebug);
807         FUNCTION_LOG_PARAM(BACKUP_DATA, backupData);
808     FUNCTION_LOG_END();
809 
810     BackupStartResult result = {.lsn = NULL};
811 
812     MEM_CONTEXT_TEMP_BEGIN()
813     {
814         // If this is an offline backup
815         if (!cfgOptionBool(cfgOptOnline))
816         {
817             // Check if Postgres is running and if so only continue when forced
818             if (storageExistsP(backupData->storagePrimary, PG_FILE_POSTMASTERPID_STR))
819             {
820                 if (cfgOptionBool(cfgOptForce))
821                 {
822                     LOG_WARN(
823                         "--no-" CFGOPT_ONLINE " passed and " PG_FILE_POSTMASTERPID " exists but --" CFGOPT_FORCE " was passed so"
824                         " backup will continue though it looks like " PG_NAME " is running and the backup will probably not be"
825                         " consistent");
826                 }
827                 else
828                 {
829                     THROW(
830                         PgRunningError,
831                         "--no-" CFGOPT_ONLINE " passed but " PG_FILE_POSTMASTERPID " exists - looks like " PG_NAME " is running."
832                         " Shut down " PG_NAME " and try again, or use --force.");
833                 }
834             }
835         }
836         // Else start the backup normally
837         else
838         {
839             // Check database configuration
840             checkDbConfig(backupData->version, backupData->pgIdxPrimary, backupData->dbPrimary, false);
841 
842             // Start backup
843             LOG_INFO_FMT(
844                 "execute %sexclusive pg_start_backup(): backup begins after the %s checkpoint completes",
845                 backupData->version >= PG_VERSION_96 ? "non-" : "",
846                 cfgOptionBool(cfgOptStartFast) ? "requested immediate" : "next regular");
847 
848             DbBackupStartResult dbBackupStartResult = dbBackupStart(
849                 backupData->dbPrimary, cfgOptionBool(cfgOptStartFast), cfgOptionBool(cfgOptStopAuto));
850 
851             MEM_CONTEXT_PRIOR_BEGIN()
852             {
853                 result.lsn = strDup(dbBackupStartResult.lsn);
854                 result.walSegmentName = strDup(dbBackupStartResult.walSegmentName);
855                 result.dbList = dbList(backupData->dbPrimary);
856                 result.tablespaceList = dbTablespaceList(backupData->dbPrimary);
857             }
858             MEM_CONTEXT_PRIOR_END();
859 
860             LOG_INFO_FMT("backup start archive = %s, lsn = %s", strZ(result.walSegmentName), strZ(result.lsn));
861 
862             // Wait for replay on the standby to catch up
863             if (cfgOptionBool(cfgOptBackupStandby))
864             {
865                 LOG_INFO_FMT("wait for replay on the standby to reach %s", strZ(result.lsn));
866                 dbReplayWait(backupData->dbStandby, result.lsn, cfgOptionUInt64(cfgOptArchiveTimeout));
867                 LOG_INFO_FMT("replay on the standby reached %s", strZ(result.lsn));
868 
869                 // The standby db object won't be used anymore so free it
870                 dbFree(backupData->dbStandby);
871 
872                 // The standby protocol connection won't be used anymore so free it
873                 protocolRemoteFree(backupData->pgIdxStandby);
874             }
875         }
876     }
877     MEM_CONTEXT_TEMP_END();
878 
879     FUNCTION_LOG_RETURN_STRUCT(result);
880 }
881 
882 /***********************************************************************************************************************************
883 Stop the backup
884 ***********************************************************************************************************************************/
885 // Helper to write a file from a string to the repository and update the manifest
886 static void
backupFilePut(BackupData * backupData,Manifest * manifest,const String * name,time_t timestamp,const String * content)887 backupFilePut(BackupData *backupData, Manifest *manifest, const String *name, time_t timestamp, const String *content)
888 {
889     FUNCTION_LOG_BEGIN(logLevelDebug);
890         FUNCTION_LOG_PARAM(BACKUP_DATA, backupData);
891         FUNCTION_LOG_PARAM(MANIFEST, manifest);
892         FUNCTION_LOG_PARAM(STRING, name);
893         FUNCTION_LOG_PARAM(TIME, timestamp);
894         FUNCTION_LOG_PARAM(STRING, content);
895     FUNCTION_LOG_END();
896 
897     // Skip files with no content
898     if (content != NULL)
899     {
900         MEM_CONTEXT_TEMP_BEGIN()
901         {
902             // Create file
903             const String *manifestName = strNewFmt(MANIFEST_TARGET_PGDATA "/%s", strZ(name));
904             CompressType compressType = compressTypeEnum(cfgOptionStr(cfgOptCompressType));
905 
906             StorageWrite *write = storageNewWriteP(
907                 storageRepoWrite(),
908                 strNewFmt(
909                     STORAGE_REPO_BACKUP "/%s/%s%s", strZ(manifestData(manifest)->backupLabel), strZ(manifestName),
910                     strZ(compressExtStr(compressType))),
911                 .compressible = true);
912 
913             IoFilterGroup *filterGroup = ioWriteFilterGroup(storageWriteIo(write));
914 
915             // Add SHA1 filter
916             ioFilterGroupAdd(filterGroup, cryptoHashNew(HASH_TYPE_SHA1_STR));
917 
918             // Add compression
919             if (compressType != compressTypeNone)
920             {
921                 ioFilterGroupAdd(
922                     ioWriteFilterGroup(storageWriteIo(write)), compressFilter(compressType, cfgOptionInt(cfgOptCompressLevel)));
923             }
924 
925             // Add encryption filter if required
926             cipherBlockFilterGroupAdd(
927                 filterGroup, cfgOptionStrId(cfgOptRepoCipherType), cipherModeEncrypt, manifestCipherSubPass(manifest));
928 
929             // Add size filter last to calculate repo size
930             ioFilterGroupAdd(filterGroup, ioSizeNew());
931 
932             // Write file
933             storagePutP(write, BUFSTR(content));
934 
935             // Use base path to set ownership and mode
936             const ManifestPath *basePath = manifestPathFind(manifest, MANIFEST_TARGET_PGDATA_STR);
937 
938             // Add to manifest
939             ManifestFile file =
940             {
941                 .name = manifestName,
942                 .primary = true,
943                 .mode = basePath->mode & (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH),
944                 .user = basePath->user,
945                 .group = basePath->group,
946                 .size = strSize(content),
947                 .sizeRepo = varUInt64Force(ioFilterGroupResult(filterGroup, SIZE_FILTER_TYPE_STR)),
948                 .timestamp = timestamp,
949             };
950 
951             memcpy(
952                 file.checksumSha1, strZ(varStr(ioFilterGroupResult(filterGroup, CRYPTO_HASH_FILTER_TYPE_STR))),
953                 HASH_TYPE_SHA1_SIZE_HEX + 1);
954 
955             manifestFileAdd(manifest, &file);
956 
957             LOG_DETAIL_FMT("wrote '%s' file returned from pg_stop_backup()", strZ(name));
958         }
959         MEM_CONTEXT_TEMP_END();
960     }
961 
962     FUNCTION_LOG_RETURN_VOID();
963 }
964 
965 /*--------------------------------------------------------------------------------------------------------------------------------*/
// Values collected by backupStop(). lsn and walSegmentName are only set for an online backup and are NULL otherwise.
typedef struct BackupStopResult
{
    String *lsn;                                                    // Copy of the lsn returned by dbBackupStop()
    String *walSegmentName;                                         // Copy of the WAL segment name returned by dbBackupStop()
    time_t timestamp;                                               // Result of backupTime(), set for online and offline backups
} BackupStopResult;
972 
973 static BackupStopResult
backupStop(BackupData * backupData,Manifest * manifest)974 backupStop(BackupData *backupData, Manifest *manifest)
975 {
976     FUNCTION_LOG_BEGIN(logLevelDebug);
977         FUNCTION_LOG_PARAM(BACKUP_DATA, backupData);
978         FUNCTION_LOG_PARAM(MANIFEST, manifest);
979     FUNCTION_LOG_END();
980 
981     BackupStopResult result = {.lsn = NULL};
982 
983     if (cfgOptionBool(cfgOptOnline))
984     {
985         MEM_CONTEXT_TEMP_BEGIN()
986         {
987             // Stop the backup
988             LOG_INFO_FMT(
989                 "execute %sexclusive pg_stop_backup() and wait for all WAL segments to archive",
990                 backupData->version >= PG_VERSION_96 ? "non-" : "");
991 
992             DbBackupStopResult dbBackupStopResult = dbBackupStop(backupData->dbPrimary);
993 
994             MEM_CONTEXT_PRIOR_BEGIN()
995             {
996                 result.timestamp = backupTime(backupData, false);
997                 result.lsn = strDup(dbBackupStopResult.lsn);
998                 result.walSegmentName = strDup(dbBackupStopResult.walSegmentName);
999             }
1000             MEM_CONTEXT_PRIOR_END();
1001 
1002             LOG_INFO_FMT("backup stop archive = %s, lsn = %s", strZ(result.walSegmentName), strZ(result.lsn));
1003 
1004             // Save files returned by stop backup
1005             backupFilePut(backupData, manifest, STRDEF(PG_FILE_BACKUPLABEL), result.timestamp, dbBackupStopResult.backupLabel);
1006             backupFilePut(backupData, manifest, STRDEF(PG_FILE_TABLESPACEMAP), result.timestamp, dbBackupStopResult.tablespaceMap);
1007         }
1008         MEM_CONTEXT_TEMP_END();
1009     }
1010     else
1011         result.timestamp = backupTime(backupData, false);
1012 
1013     FUNCTION_LOG_RETURN_STRUCT(result);
1014 }
1015 
1016 /***********************************************************************************************************************************
1017 Log the results of a job and throw errors
1018 ***********************************************************************************************************************************/
1019 static uint64_t
backupJobResult(Manifest * manifest,const String * host,const String * const fileName,StringList * fileRemove,ProtocolParallelJob * const job,const uint64_t sizeTotal,uint64_t sizeCopied)1020 backupJobResult(
1021     Manifest *manifest, const String *host, const String *const fileName, StringList *fileRemove, ProtocolParallelJob *const job,
1022     const uint64_t sizeTotal, uint64_t sizeCopied)
1023 {
1024     FUNCTION_LOG_BEGIN(logLevelDebug);
1025         FUNCTION_LOG_PARAM(MANIFEST, manifest);
1026         FUNCTION_LOG_PARAM(STRING, host);
1027         FUNCTION_LOG_PARAM(STRING, fileName);
1028         FUNCTION_LOG_PARAM(STRING_LIST, fileRemove);
1029         FUNCTION_LOG_PARAM(PROTOCOL_PARALLEL_JOB, job);
1030         FUNCTION_LOG_PARAM(UINT64, sizeTotal);
1031         FUNCTION_LOG_PARAM(UINT64, sizeCopied);
1032     FUNCTION_LOG_END();
1033 
1034     ASSERT(manifest != NULL);
1035     ASSERT(fileName != NULL);
1036     ASSERT(fileRemove != NULL);
1037     ASSERT(job != NULL);
1038 
1039     // The job was successful
1040     if (protocolParallelJobErrorCode(job) == 0)
1041     {
1042         MEM_CONTEXT_TEMP_BEGIN()
1043         {
1044             const ManifestFile *const file = manifestFileFind(manifest, varStr(protocolParallelJobKey(job)));
1045             const unsigned int processId = protocolParallelJobProcessId(job);
1046 
1047             PackRead *const jobResult = protocolParallelJobResult(job);
1048             const BackupCopyResult copyResult = (BackupCopyResult)pckReadU32P(jobResult);
1049             const uint64_t copySize = pckReadU64P(jobResult);
1050             const uint64_t repoSize = pckReadU64P(jobResult);
1051             const String *const copyChecksum = pckReadStrP(jobResult);
1052             const KeyValue *const checksumPageResult = varKv(jsonToVar(pckReadStrP(jobResult, .defaultValue = NULL_STR)));
1053 
1054             // Increment backup copy progress
1055             sizeCopied += copySize;
1056 
1057             // Create log file name
1058             const String *fileLog = host == NULL ? fileName : strNewFmt("%s:%s", strZ(host), strZ(fileName));
1059 
1060             // Format log strings
1061             const String *const logProgress =
1062                 strNewFmt(
1063                     "%s, %" PRIu64 "%%", strZ(strSizeFormat(copySize)), sizeTotal == 0 ? 100 : sizeCopied * 100 / sizeTotal);
1064             const String *const logChecksum = copySize != 0 ? strNewFmt(" checksum %s", strZ(copyChecksum)) : EMPTY_STR;
1065 
1066             // If the file is in a prior backup and nothing changed, just log it
1067             if (copyResult == backupCopyResultNoOp)
1068             {
1069                 LOG_DETAIL_PID_FMT(
1070                     processId, "match file from prior backup %s (%s)%s", strZ(fileLog), strZ(logProgress), strZ(logChecksum));
1071             }
1072             // Else if the repo matched the expect checksum, just log it
1073             else if (copyResult == backupCopyResultChecksum)
1074             {
1075                 LOG_DETAIL_PID_FMT(
1076                     processId, "checksum resumed file %s (%s)%s", strZ(fileLog), strZ(logProgress), strZ(logChecksum));
1077             }
1078             // Else if the file was removed during backup add it to the list of files to be removed from the manifest when the
1079             // backup is complete.  It can't be removed right now because that will invalidate the pointers that are being used for
1080             // processing.
1081             else if (copyResult == backupCopyResultSkip)
1082             {
1083                 LOG_DETAIL_PID_FMT(processId, "skip file removed by database %s", strZ(fileLog));
1084                 strLstAdd(fileRemove, file->name);
1085             }
1086             // Else file was copied so update manifest
1087             else
1088             {
1089                 // If the file had to be recopied then warn that there may be an issue with corruption in the repository
1090                 // ??? This should really be below the message below for more context -- can be moved after the migration
1091                 // ??? The name should be a pg path not manifest name -- can be fixed after the migration
1092                 if (copyResult == backupCopyResultReCopy)
1093                 {
1094                     LOG_WARN_FMT(
1095                         "resumed backup file %s does not have expected checksum %s. The file will be recopied and backup will"
1096                         " continue but this may be an issue unless the resumed backup path in the repository is known to be"
1097                         " corrupted.\n"
1098                         "NOTE: this does not indicate a problem with the PostgreSQL page checksums.",
1099                         strZ(file->name), file->checksumSha1);
1100                 }
1101 
1102                 LOG_DETAIL_PID_FMT(processId, "backup file %s (%s)%s", strZ(fileLog), strZ(logProgress), strZ(logChecksum));
1103 
1104                 // If the file had page checksums calculated during the copy
1105                 ASSERT((!file->checksumPage && checksumPageResult == NULL) || (file->checksumPage && checksumPageResult != NULL));
1106 
1107                 bool checksumPageError = false;
1108                 const VariantList *checksumPageErrorList = NULL;
1109 
1110                 if (checksumPageResult != NULL)
1111                 {
1112                     // If the checksum was valid
1113                     if (!varBool(kvGet(checksumPageResult, VARSTRDEF("valid"))))
1114                     {
1115                         checksumPageError = true;
1116 
1117                         if (!varBool(kvGet(checksumPageResult, VARSTRDEF("align"))))
1118                         {
1119                             checksumPageErrorList = NULL;
1120 
1121                             // ??? Update formatting after migration
1122                             LOG_WARN_FMT(
1123                                 "page misalignment in file %s: file size %" PRIu64 " is not divisible by page size %u",
1124                                 strZ(fileLog), copySize, PG_PAGE_SIZE_DEFAULT);
1125                         }
1126                         else
1127                         {
1128                             // Format the page checksum errors
1129                             checksumPageErrorList = varVarLst(kvGet(checksumPageResult, VARSTRDEF("error")));
1130                             ASSERT(!varLstEmpty(checksumPageErrorList));
1131 
1132                             String *error = strNew();
1133                             unsigned int errorTotalMin = 0;
1134 
1135                             for (unsigned int errorIdx = 0; errorIdx < varLstSize(checksumPageErrorList); errorIdx++)
1136                             {
1137                                 const Variant *const errorItem = varLstGet(checksumPageErrorList, errorIdx);
1138 
1139                                 // Add a comma if this is not the first item
1140                                 if (errorIdx != 0)
1141                                     strCatZ(error, ", ");
1142 
1143                                 // If an error range
1144                                 if (varType(errorItem) == varTypeVariantList)
1145                                 {
1146                                     const VariantList *const errorItemList = varVarLst(errorItem);
1147                                     ASSERT(varLstSize(errorItemList) == 2);
1148 
1149                                     strCatFmt(
1150                                         error, "%" PRIu64 "-%" PRIu64, varUInt64(varLstGet(errorItemList, 0)),
1151                                         varUInt64(varLstGet(errorItemList, 1)));
1152                                     errorTotalMin += 2;
1153                                 }
1154                                 // Else a single error
1155                                 else
1156                                 {
1157                                     ASSERT(varType(errorItem) == varTypeUInt64);
1158 
1159                                     strCatFmt(error, "%" PRIu64, varUInt64(errorItem));
1160                                     errorTotalMin++;
1161                                 }
1162                             }
1163 
1164                             // Make message plural when appropriate
1165                             const String *const plural = errorTotalMin > 1 ? STRDEF("s") : EMPTY_STR;
1166 
1167                             // ??? Update formatting after migration
1168                             LOG_WARN_FMT(
1169                                 "invalid page checksum%s found in file %s at page%s %s", strZ(plural), strZ(fileLog), strZ(plural),
1170                                 strZ(error));
1171                         }
1172                     }
1173                 }
1174 
1175                 // Update file info and remove any reference to the file's existence in a prior backup
1176                 manifestFileUpdate(
1177                     manifest, file->name, copySize, repoSize, strZ(copyChecksum), VARSTR(NULL), file->checksumPage,
1178                     checksumPageError, checksumPageErrorList);
1179             }
1180         }
1181         MEM_CONTEXT_TEMP_END();
1182 
1183         // Free the job
1184         protocolParallelJobFree(job);
1185     }
1186     // Else the job errored
1187     else
1188         THROW_CODE(protocolParallelJobErrorCode(job), strZ(protocolParallelJobErrorMessage(job)));
1189 
1190     FUNCTION_LOG_RETURN(UINT64, sizeCopied);
1191 }
1192 
1193 /***********************************************************************************************************************************
1194 Save a copy of the backup manifest during processing to preserve checksums for a possible resume
1195 ***********************************************************************************************************************************/
1196 static void
backupManifestSaveCopy(Manifest * const manifest,const String * cipherPassBackup)1197 backupManifestSaveCopy(Manifest *const manifest, const String *cipherPassBackup)
1198 {
1199     FUNCTION_LOG_BEGIN(logLevelDebug);
1200         FUNCTION_LOG_PARAM(MANIFEST, manifest);
1201         FUNCTION_TEST_PARAM(STRING, cipherPassBackup);
1202     FUNCTION_LOG_END();
1203 
1204     ASSERT(manifest != NULL);
1205 
1206     MEM_CONTEXT_TEMP_BEGIN()
1207     {
1208         // Open file for write
1209         IoWrite *write = storageWriteIo(
1210             storageNewWriteP(
1211                 storageRepoWrite(),
1212                 strNewFmt(
1213                     STORAGE_REPO_BACKUP "/%s/" BACKUP_MANIFEST_FILE INFO_COPY_EXT, strZ(manifestData(manifest)->backupLabel))));
1214 
1215         // Add encryption filter if required
1216         cipherBlockFilterGroupAdd(
1217             ioWriteFilterGroup(write), cfgOptionStrId(cfgOptRepoCipherType), cipherModeEncrypt, cipherPassBackup);
1218 
1219         // Save file
1220         manifestSave(manifest, write);
1221     }
1222     MEM_CONTEXT_TEMP_END();
1223 
1224     FUNCTION_LOG_RETURN_VOID();
1225 }
1226 
1227 /***********************************************************************************************************************************
1228 Process the backup manifest
1229 ***********************************************************************************************************************************/
1230 // Comparator to order ManifestFile objects by size then name
1231 static int
backupProcessQueueComparator(const void * item1,const void * item2)1232 backupProcessQueueComparator(const void *item1, const void *item2)
1233 {
1234     FUNCTION_TEST_BEGIN();
1235         FUNCTION_TEST_PARAM_P(VOID, item1);
1236         FUNCTION_TEST_PARAM_P(VOID, item2);
1237     FUNCTION_TEST_END();
1238 
1239     ASSERT(item1 != NULL);
1240     ASSERT(item2 != NULL);
1241 
1242     // If the size differs then that's enough to determine order
1243     if ((*(ManifestFile **)item1)->size < (*(ManifestFile **)item2)->size)
1244         FUNCTION_TEST_RETURN(-1);
1245     else if ((*(ManifestFile **)item1)->size > (*(ManifestFile **)item2)->size)
1246         FUNCTION_TEST_RETURN(1);
1247 
1248     // If size is the same then use name to generate a deterministic ordering (names must be unique)
1249     FUNCTION_TEST_RETURN(strCmp((*(ManifestFile **)item1)->name, (*(ManifestFile **)item2)->name));
1250 }
1251 
1252 // Helper to generate the backup queues
1253 static uint64_t
backupProcessQueue(Manifest * manifest,List ** queueList)1254 backupProcessQueue(Manifest *manifest, List **queueList)
1255 {
1256     FUNCTION_LOG_BEGIN(logLevelDebug);
1257         FUNCTION_LOG_PARAM(MANIFEST, manifest);
1258         FUNCTION_LOG_PARAM_P(LIST, queueList);
1259     FUNCTION_LOG_END();
1260 
1261     ASSERT(manifest != NULL);
1262 
1263     uint64_t result = 0;
1264 
1265     MEM_CONTEXT_TEMP_BEGIN()
1266     {
1267         // Create list of process queue
1268         *queueList = lstNewP(sizeof(List *));
1269 
1270         // Generate the list of targets
1271         StringList *targetList = strLstNew();
1272         strLstAdd(targetList, STRDEF(MANIFEST_TARGET_PGDATA "/"));
1273 
1274         for (unsigned int targetIdx = 0; targetIdx < manifestTargetTotal(manifest); targetIdx++)
1275         {
1276             const ManifestTarget *target = manifestTarget(manifest, targetIdx);
1277 
1278             if (target->tablespaceId != 0)
1279                 strLstAdd(targetList, strNewFmt("%s/", strZ(target->name)));
1280         }
1281 
1282         // Generate the processing queues (there is always at least one)
1283         bool backupStandby = cfgOptionBool(cfgOptBackupStandby);
1284         unsigned int queueOffset = backupStandby ? 1 : 0;
1285 
1286         MEM_CONTEXT_BEGIN(lstMemContext(*queueList))
1287         {
1288             for (unsigned int queueIdx = 0; queueIdx < strLstSize(targetList) + queueOffset; queueIdx++)
1289             {
1290                 List *queue = lstNewP(sizeof(ManifestFile *), .comparator = backupProcessQueueComparator);
1291                 lstAdd(*queueList, &queue);
1292             }
1293         }
1294         MEM_CONTEXT_END();
1295 
1296         // Now put all files into the processing queues
1297         bool delta = cfgOptionBool(cfgOptDelta);
1298         uint64_t fileTotal = 0;
1299         bool pgControlFound = false;
1300 
1301         for (unsigned int fileIdx = 0; fileIdx < manifestFileTotal(manifest); fileIdx++)
1302         {
1303             const ManifestFile *file = manifestFile(manifest, fileIdx);
1304 
1305             // If the file is a reference it should only be backed up if delta and not zero size
1306             if (file->reference != NULL && (!delta || file->size == 0))
1307                 continue;
1308 
1309             // Is pg_control in the backup?
1310             if (strEq(file->name, STRDEF(MANIFEST_TARGET_PGDATA "/" PG_PATH_GLOBAL "/" PG_FILE_PGCONTROL)))
1311                 pgControlFound = true;
1312 
1313             // Files that must be copied from the primary are always put in queue 0 when backup from standby
1314             if (backupStandby && file->primary)
1315             {
1316                 lstAdd(*(List **)lstGet(*queueList, 0), &file);
1317             }
1318             // Else find the correct queue by matching the file to a target
1319             else
1320             {
1321                 // Find the target that contains this file
1322                 unsigned int targetIdx = 0;
1323 
1324                 do
1325                 {
1326                     // A target should always be found
1327                     CHECK(targetIdx < strLstSize(targetList));
1328 
1329                     if (strBeginsWith(file->name, strLstGet(targetList, targetIdx)))
1330                         break;
1331 
1332                     targetIdx++;
1333                 }
1334                 while (1);
1335 
1336                 // Add file to queue
1337                 lstAdd(*(List **)lstGet(*queueList, targetIdx + queueOffset), &file);
1338             }
1339 
1340             // Add size to total
1341             result += file->size;
1342 
1343             // Increment total files
1344             fileTotal++;
1345         }
1346 
1347         // pg_control should always be in an online backup
1348         if (!pgControlFound && cfgOptionBool(cfgOptOnline))
1349         {
1350             THROW(
1351                 FileMissingError,
1352                 PG_FILE_PGCONTROL " must be present in all online backups\n"
1353                 "HINT: is something wrong with the clock or filesystem timestamps?");
1354          }
1355 
1356         // If there are no files to backup then we'll exit with an error.  This could happen if the database is down and backup is
1357         // called with --no-online twice in a row.
1358         if (fileTotal == 0)
1359             THROW(FileMissingError, "no files have changed since the last backup - this seems unlikely");
1360 
1361         // Sort the queues
1362         for (unsigned int queueIdx = 0; queueIdx < lstSize(*queueList); queueIdx++)
1363             lstSort(*(List **)lstGet(*queueList, queueIdx), sortOrderDesc);
1364 
1365         // Move process queues to prior context
1366         lstMove(*queueList, memContextPrior());
1367     }
1368     MEM_CONTEXT_TEMP_END();
1369 
1370     FUNCTION_LOG_RETURN(UINT64, result);
1371 }
1372 
1373 // Helper to caculate the next queue to scan based on the client index
1374 static int
backupJobQueueNext(unsigned int clientIdx,int queueIdx,unsigned int queueTotal)1375 backupJobQueueNext(unsigned int clientIdx, int queueIdx, unsigned int queueTotal)
1376 {
1377     FUNCTION_TEST_BEGIN();
1378         FUNCTION_TEST_PARAM(UINT, clientIdx);
1379         FUNCTION_TEST_PARAM(INT, queueIdx);
1380         FUNCTION_TEST_PARAM(UINT, queueTotal);
1381     FUNCTION_TEST_END();
1382 
1383     // Move (forward or back) to the next queue
1384     queueIdx += clientIdx % 2 ? -1 : 1;
1385 
1386     // Deal with wrapping on either end
1387     if (queueIdx < 0)
1388         FUNCTION_TEST_RETURN((int)queueTotal - 1);
1389     else if (queueIdx == (int)queueTotal)
1390         FUNCTION_TEST_RETURN(0);
1391 
1392     FUNCTION_TEST_RETURN(queueIdx);
1393 }
1394 
// Callback to fetch backup jobs for the parallel executor
//
// BackupJobData is the state handed to backupJobCallback() through its void *data parameter. All fields except queueList are set
// once when the struct is initialized in backupProcess(); queueList is filled in afterwards by backupProcessQueue() and files are
// removed from its queues as jobs are handed out.
typedef struct BackupJobData
{
    const String *const backupLabel;                                // Backup label (defines the backup path)
    const bool backupStandby;                                       // Backup from standby
    // NOTE: backupJobCallback() does not read cipherType -- the cipher type it sends with each job is derived from cipherSubPass
    const CipherType cipherType;                                    // Cipher type
    const String *const cipherSubPass;                              // Passphrase used to encrypt files in the backup
    const CompressType compressType;                                // Backup compression type
    const int compressLevel;                                        // Compress level if backup is compressed
    const bool delta;                                               // Is this a checksum delta backup?
    const uint64_t lsnStart;                                        // Starting lsn for the backup

    // Each queue in the list is itself a List of ManifestFile pointers
    List *queueList;                                                // List of processing queues
} BackupJobData;
1409 
backupJobCallback(void * data,unsigned int clientIdx)1410 static ProtocolParallelJob *backupJobCallback(void *data, unsigned int clientIdx)
1411 {
1412     FUNCTION_TEST_BEGIN();
1413         FUNCTION_TEST_PARAM_P(VOID, data);
1414         FUNCTION_TEST_PARAM(UINT, clientIdx);
1415     FUNCTION_TEST_END();
1416 
1417     ASSERT(data != NULL);
1418 
1419     ProtocolParallelJob *result = NULL;
1420 
1421     MEM_CONTEXT_TEMP_BEGIN()
1422     {
1423         // Get a new job if there are any left
1424         BackupJobData *jobData = data;
1425 
1426         // Determine where to begin scanning the queue (we'll stop when we get back here).  When copying from the primary during
1427         // backup from standby only queue 0 will be used.
1428         unsigned int queueOffset = jobData->backupStandby && clientIdx > 0 ? 1 : 0;
1429         int queueIdx = jobData->backupStandby && clientIdx == 0 ?
1430             0 : (int)(clientIdx % (lstSize(jobData->queueList) - queueOffset));
1431         int queueEnd = queueIdx;
1432 
1433         do
1434         {
1435             List *queue = *(List **)lstGet(jobData->queueList, (unsigned int)queueIdx + queueOffset);
1436 
1437             if (!lstEmpty(queue))
1438             {
1439                 const ManifestFile *file = *(ManifestFile **)lstGet(queue, 0);
1440 
1441                 // Create backup job
1442                 ProtocolCommand *command = protocolCommandNew(PROTOCOL_COMMAND_BACKUP_FILE);
1443                 PackWrite *const param = protocolCommandParam(command);
1444 
1445                 pckWriteStrP(param, manifestPathPg(file->name));
1446                 pckWriteBoolP(param, !strEq(file->name, STRDEF(MANIFEST_TARGET_PGDATA "/" PG_PATH_GLOBAL "/" PG_FILE_PGCONTROL)));
1447                 pckWriteU64P(param, file->size);
1448                 pckWriteBoolP(param, !file->primary);
1449                 pckWriteStrP(param, file->checksumSha1[0] != 0 ? STR(file->checksumSha1) : NULL);
1450                 pckWriteBoolP(param, file->checksumPage);
1451                 pckWriteU64P(param, jobData->lsnStart);
1452                 pckWriteStrP(param, file->name);
1453                 pckWriteBoolP(param, file->reference != NULL);
1454                 pckWriteU32P(param, jobData->compressType);
1455                 pckWriteI32P(param, jobData->compressLevel);
1456                 pckWriteStrP(param, jobData->backupLabel);
1457                 pckWriteBoolP(param, jobData->delta);
1458                 pckWriteU64P(param, jobData->cipherSubPass == NULL ? cipherTypeNone : cipherTypeAes256Cbc);
1459                 pckWriteStrP(param, jobData->cipherSubPass);
1460 
1461                 // Remove job from the queue
1462                 lstRemoveIdx(queue, 0);
1463 
1464                 // Assign job to result
1465                 MEM_CONTEXT_PRIOR_BEGIN()
1466                 {
1467                     result = protocolParallelJobNew(VARSTR(file->name), command);
1468                 }
1469                 MEM_CONTEXT_PRIOR_END();
1470 
1471                 // Break out of the loop early since we found a job
1472                 break;
1473             }
1474 
1475             // Don't get next queue when copying from primary during backup from standby since the primary only has one queue
1476             if (!jobData->backupStandby || clientIdx > 0)
1477                 queueIdx = backupJobQueueNext(clientIdx, queueIdx, lstSize(jobData->queueList) - queueOffset);
1478         }
1479         while (queueIdx != queueEnd);
1480     }
1481     MEM_CONTEXT_TEMP_END();
1482 
1483     FUNCTION_TEST_RETURN(result);
1484 }
1485 
1486 static uint64_t
backupProcess(BackupData * backupData,Manifest * manifest,const String * lsnStart,const String * cipherPassBackup)1487 backupProcess(BackupData *backupData, Manifest *manifest, const String *lsnStart, const String *cipherPassBackup)
1488 {
1489     FUNCTION_LOG_BEGIN(logLevelDebug);
1490         FUNCTION_LOG_PARAM(BACKUP_DATA, backupData);
1491         FUNCTION_LOG_PARAM(MANIFEST, manifest);
1492         FUNCTION_LOG_PARAM(STRING, lsnStart);
1493         FUNCTION_TEST_PARAM(STRING, cipherPassBackup);
1494     FUNCTION_LOG_END();
1495 
1496     ASSERT(manifest != NULL);
1497 
1498     uint64_t sizeTotal = 0;
1499 
1500     MEM_CONTEXT_TEMP_BEGIN()
1501     {
1502         // Get backup info
1503         const BackupType backupType = manifestData(manifest)->backupType;
1504         const String *const backupLabel = manifestData(manifest)->backupLabel;
1505         const String *const backupPathExp = strNewFmt(STORAGE_REPO_BACKUP "/%s", strZ(backupLabel));
1506         bool hardLink = cfgOptionBool(cfgOptRepoHardlink) && storageFeature(storageRepoWrite(), storageFeatureHardLink);
1507         bool backupStandby = cfgOptionBool(cfgOptBackupStandby);
1508 
1509         // If this is a full backup or hard-linked and paths are supported then create all paths explicitly so that empty paths will
1510         // exist in to repo.  Also create tablspace symlinks when symlinks are available,  This makes it possible for the user to
1511         // make a copy of the backup path and get a valid cluster.
1512         if (backupType == backupTypeFull || hardLink)
1513         {
1514             // Create paths when available
1515             if (storageFeature(storageRepoWrite(), storageFeaturePath))
1516             {
1517                 for (unsigned int pathIdx = 0; pathIdx < manifestPathTotal(manifest); pathIdx++)
1518                 {
1519                     storagePathCreateP(
1520                         storageRepoWrite(),
1521                         strNewFmt("%s/%s", strZ(backupPathExp), strZ(manifestPath(manifest, pathIdx)->name)));
1522                 }
1523             }
1524 
1525             // Create tablespace symlinks when available
1526             if (storageFeature(storageRepoWrite(), storageFeatureSymLink))
1527             {
1528                 for (unsigned int targetIdx = 0; targetIdx < manifestTargetTotal(manifest); targetIdx++)
1529                 {
1530                     const ManifestTarget *const target = manifestTarget(manifest, targetIdx);
1531 
1532                     if (target->tablespaceId != 0)
1533                     {
1534                         const String *const link = storagePathP(
1535                             storageRepo(),
1536                             strNewFmt("%s/" MANIFEST_TARGET_PGDATA "/%s", strZ(backupPathExp), strZ(target->name)));
1537                         const String *const linkDestination = strNewFmt(
1538                             "../../" MANIFEST_TARGET_PGTBLSPC "/%u", target->tablespaceId);
1539 
1540                         THROW_ON_SYS_ERROR_FMT(
1541                             symlink(strZ(linkDestination), strZ(link)) == -1, FileOpenError,
1542                             "unable to create symlink '%s' to '%s'", strZ(link), strZ(linkDestination));
1543                     }
1544                 }
1545             }
1546         }
1547 
1548         // Generate processing queues
1549         BackupJobData jobData =
1550         {
1551             .backupLabel = backupLabel,
1552             .backupStandby = backupStandby,
1553             .compressType = compressTypeEnum(cfgOptionStr(cfgOptCompressType)),
1554             .compressLevel = cfgOptionInt(cfgOptCompressLevel),
1555             .cipherType = cfgOptionStrId(cfgOptRepoCipherType),
1556             .cipherSubPass = manifestCipherSubPass(manifest),
1557             .delta = cfgOptionBool(cfgOptDelta),
1558             .lsnStart = cfgOptionBool(cfgOptOnline) ? pgLsnFromStr(lsnStart) : 0xFFFFFFFFFFFFFFFF,
1559         };
1560 
1561         sizeTotal = backupProcessQueue(manifest, &jobData.queueList);
1562 
1563         // Create the parallel executor
1564         ProtocolParallel *parallelExec = protocolParallelNew(
1565             cfgOptionUInt64(cfgOptProtocolTimeout) / 2, backupJobCallback, &jobData);
1566 
1567         // First client is always on the primary
1568         protocolParallelClientAdd(parallelExec, protocolLocalGet(protocolStorageTypePg, backupData->pgIdxPrimary, 1));
1569 
1570         // Create the rest of the clients on the primary or standby depending on the value of backup-standby.  Note that standby
1571         // backups don't count the primary client in process-max.
1572         unsigned int processMax = cfgOptionUInt(cfgOptProcessMax) + (backupStandby ? 1 : 0);
1573         unsigned int pgIdx = backupStandby ? backupData->pgIdxStandby : backupData->pgIdxPrimary;
1574 
1575         for (unsigned int processIdx = 2; processIdx <= processMax; processIdx++)
1576             protocolParallelClientAdd(parallelExec, protocolLocalGet(protocolStorageTypePg, pgIdx, processIdx));
1577 
1578         // Maintain a list of files that need to be removed from the manifest when the backup is complete
1579         StringList *fileRemove = strLstNew();
1580 
1581         // Determine how often the manifest will be saved (every one percent or threshold size, whichever is greater)
1582         uint64_t manifestSaveLast = 0;
1583         uint64_t manifestSaveSize = sizeTotal / 100;
1584 
1585         if (manifestSaveSize < cfgOptionUInt64(cfgOptManifestSaveThreshold))
1586             manifestSaveSize = cfgOptionUInt64(cfgOptManifestSaveThreshold);
1587 
1588         // Process jobs
1589         uint64_t sizeCopied = 0;
1590 
1591         MEM_CONTEXT_TEMP_RESET_BEGIN()
1592         {
1593             do
1594             {
1595                 unsigned int completed = protocolParallelProcess(parallelExec);
1596 
1597                 for (unsigned int jobIdx = 0; jobIdx < completed; jobIdx++)
1598                 {
1599                     ProtocolParallelJob *job = protocolParallelResult(parallelExec);
1600 
1601                     sizeCopied = backupJobResult(
1602                         manifest,
1603                         backupStandby && protocolParallelJobProcessId(job) > 1 ? backupData->hostStandby : backupData->hostPrimary,
1604                         storagePathP(
1605                             protocolParallelJobProcessId(job) > 1 ? storagePgIdx(pgIdx) : backupData->storagePrimary,
1606                             manifestPathPg(manifestFileFind(manifest, varStr(protocolParallelJobKey(job)))->name)),
1607                         fileRemove, job, sizeTotal, sizeCopied);
1608                 }
1609 
1610                 // A keep-alive is required here for the remote holding open the backup connection
1611                 protocolKeepAlive();
1612 
1613                 // Save the manifest periodically to preserve checksums for resume
1614                 if (sizeCopied - manifestSaveLast >= manifestSaveSize)
1615                 {
1616                     backupManifestSaveCopy(manifest, cipherPassBackup);
1617                     manifestSaveLast = sizeCopied;
1618                 }
1619 
1620                 // Reset the memory context occasionally so we don't use too much memory or slow down processing
1621                 MEM_CONTEXT_TEMP_RESET(1000);
1622             }
1623             while (!protocolParallelDone(parallelExec));
1624         }
1625         MEM_CONTEXT_TEMP_END();
1626 
1627 #ifdef DEBUG
1628         // Ensure that all processing queues are empty
1629         for (unsigned int queueIdx = 0; queueIdx < lstSize(jobData.queueList); queueIdx++)
1630             ASSERT(lstEmpty(*(List **)lstGet(jobData.queueList, queueIdx)));
1631 #endif
1632 
1633         // Remove files from the manifest that were removed during the backup.  This must happen after processing to avoid
1634         // invalidating pointers by deleting items from the list.
1635         for (unsigned int fileRemoveIdx = 0; fileRemoveIdx < strLstSize(fileRemove); fileRemoveIdx++)
1636             manifestFileRemove(manifest, strLstGet(fileRemove, fileRemoveIdx));
1637 
1638         // Log references or create hardlinks for all files
1639         const char *const compressExt = strZ(compressExtStr(jobData.compressType));
1640 
1641         for (unsigned int fileIdx = 0; fileIdx < manifestFileTotal(manifest); fileIdx++)
1642         {
1643             const ManifestFile *const file = manifestFile(manifest, fileIdx);
1644 
1645             // If the file has a reference, then it was not copied since it can be retrieved from the referenced backup. However,
1646             // if hardlinking is enabled the link will need to be created.
1647             if (file->reference != NULL)
1648             {
1649                 // If hardlinking is enabled then create a hardlink for files that have not changed since the last backup
1650                 if (hardLink)
1651                 {
1652                     LOG_DETAIL_FMT("hardlink %s to %s",  strZ(file->name), strZ(file->reference));
1653 
1654                     const String *const linkName = storagePathP(
1655                         storageRepo(), strNewFmt("%s/%s%s", strZ(backupPathExp), strZ(file->name), compressExt));
1656                     const String *const linkDestination =  storagePathP(
1657                         storageRepo(),
1658                         strNewFmt(STORAGE_REPO_BACKUP "/%s/%s%s", strZ(file->reference), strZ(file->name), compressExt));
1659 
1660                     THROW_ON_SYS_ERROR_FMT(
1661                         link(strZ(linkDestination), strZ(linkName)) == -1, FileOpenError,
1662                         "unable to create hardlink '%s' to '%s'", strZ(linkName), strZ(linkDestination));
1663                 }
1664                 // Else log the reference. With delta, it is possible that references may have been removed if a file needed to be
1665                 // recopied.
1666                 else
1667                     LOG_DETAIL_FMT("reference %s to %s", strZ(file->name), strZ(file->reference));
1668             }
1669         }
1670 
1671         // Sync backup paths if required
1672         if (storageFeature(storageRepoWrite(), storageFeaturePathSync))
1673         {
1674             for (unsigned int pathIdx = 0; pathIdx < manifestPathTotal(manifest); pathIdx++)
1675             {
1676                 const String *const path = strNewFmt("%s/%s", strZ(backupPathExp), strZ(manifestPath(manifest, pathIdx)->name));
1677 
1678                 if (backupType == backupTypeFull || hardLink || storagePathExistsP(storageRepo(), path))
1679                     storagePathSyncP(storageRepoWrite(), path);
1680             }
1681         }
1682     }
1683     MEM_CONTEXT_TEMP_END();
1684 
1685     FUNCTION_LOG_RETURN(UINT64, sizeTotal);
1686 }
1687 
1688 /***********************************************************************************************************************************
1689 Check and copy WAL segments required to make the backup consistent
1690 ***********************************************************************************************************************************/
1691 static void
backupArchiveCheckCopy(Manifest * manifest,unsigned int walSegmentSize,const String * cipherPassBackup)1692 backupArchiveCheckCopy(Manifest *manifest, unsigned int walSegmentSize, const String *cipherPassBackup)
1693 {
1694     FUNCTION_LOG_BEGIN(logLevelDebug);
1695         FUNCTION_LOG_PARAM(MANIFEST, manifest);
1696         FUNCTION_LOG_PARAM(UINT, walSegmentSize);
1697         FUNCTION_TEST_PARAM(STRING, cipherPassBackup);
1698     FUNCTION_LOG_END();
1699 
1700     ASSERT(manifest != NULL);
1701 
1702     // If archive logs are required to complete the backup, then check them.  This is the default, but can be overridden if the
1703     // archive logs are going to a different server.  Be careful of disabling this option because there is no way to verify that the
1704     // backup will be consistent - at least not here.
1705     if (cfgOptionBool(cfgOptOnline) && cfgOptionBool(cfgOptArchiveCheck))
1706     {
1707         MEM_CONTEXT_TEMP_BEGIN()
1708         {
1709             unsigned int timeline = cvtZToUIntBase(strZ(strSubN(manifestData(manifest)->archiveStart, 0, 8)), 16);
1710             uint64_t lsnStart = pgLsnFromStr(manifestData(manifest)->lsnStart);
1711             uint64_t lsnStop = pgLsnFromStr(manifestData(manifest)->lsnStop);
1712 
1713             LOG_INFO_FMT(
1714                 "check archive for segment(s) %s:%s", strZ(pgLsnToWalSegment(timeline, lsnStart, walSegmentSize)),
1715                 strZ(pgLsnToWalSegment(timeline, lsnStop, walSegmentSize)));
1716 
1717             // Save the backup manifest before getting archive logs in case of failure
1718             backupManifestSaveCopy(manifest, cipherPassBackup);
1719 
1720             // Use base path to set ownership and mode
1721             const ManifestPath *basePath = manifestPathFind(manifest, MANIFEST_TARGET_PGDATA_STR);
1722 
1723             // Loop through all the segments in the lsn range
1724             InfoArchive *infoArchive = infoArchiveLoadFile(
1725                 storageRepo(), INFO_ARCHIVE_PATH_FILE_STR, cfgOptionStrId(cfgOptRepoCipherType),
1726                 cfgOptionStrNull(cfgOptRepoCipherPass));
1727             const String *archiveId = infoArchiveId(infoArchive);
1728 
1729             StringList *walSegmentList = pgLsnRangeToWalSegmentList(
1730                 manifestData(manifest)->pgVersion, timeline, lsnStart, lsnStop, walSegmentSize);
1731 
1732             for (unsigned int walSegmentIdx = 0; walSegmentIdx < strLstSize(walSegmentList); walSegmentIdx++)
1733             {
1734                 MEM_CONTEXT_TEMP_BEGIN()
1735                 {
1736                     const String *walSegment = strLstGet(walSegmentList, walSegmentIdx);
1737 
1738                     // Find the actual wal segment file in the archive
1739                     const String *archiveFile = walSegmentFind(
1740                         storageRepo(), archiveId, walSegment,  cfgOptionUInt64(cfgOptArchiveTimeout));
1741 
1742                     if (cfgOptionBool(cfgOptArchiveCopy))
1743                     {
1744                         // Copy can be a pretty expensive operation so log it
1745                         LOG_DETAIL_FMT("copy segment %s to backup", strZ(walSegment));
1746 
1747                         // Get compression type of the WAL segment and backup
1748                         CompressType archiveCompressType = compressTypeFromName(archiveFile);
1749                         CompressType backupCompressType = compressTypeEnum(cfgOptionStr(cfgOptCompressType));
1750 
1751                         // Open the archive file
1752                         StorageRead *read = storageNewReadP(
1753                             storageRepo(), strNewFmt(STORAGE_REPO_ARCHIVE "/%s/%s", strZ(archiveId), strZ(archiveFile)));
1754                         IoFilterGroup *filterGroup = ioReadFilterGroup(storageReadIo(read));
1755 
1756                         // Decrypt with archive key if encrypted
1757                         cipherBlockFilterGroupAdd(
1758                             filterGroup, cfgOptionStrId(cfgOptRepoCipherType), cipherModeDecrypt,
1759                             infoArchiveCipherPass(infoArchive));
1760 
1761                         // Compress/decompress if archive and backup do not have the same compression settings
1762                         if (archiveCompressType != backupCompressType)
1763                         {
1764                             if (archiveCompressType != compressTypeNone)
1765                                 ioFilterGroupAdd(filterGroup, decompressFilter(archiveCompressType));
1766 
1767                             if (backupCompressType != compressTypeNone)
1768                             {
1769                                 ioFilterGroupAdd(
1770                                     filterGroup, compressFilter(backupCompressType, cfgOptionInt(cfgOptCompressLevel)));
1771                             }
1772                         }
1773 
1774                         // Encrypt with backup key if encrypted
1775                         cipherBlockFilterGroupAdd(
1776                             filterGroup, cfgOptionStrId(cfgOptRepoCipherType), cipherModeEncrypt, manifestCipherSubPass(manifest));
1777 
1778                         // Add size filter last to calculate repo size
1779                         ioFilterGroupAdd(filterGroup, ioSizeNew());
1780 
1781                         // Copy the file
1782                         const String *manifestName = strNewFmt(
1783                             MANIFEST_TARGET_PGDATA "/%s/%s", strZ(pgWalPath(manifestData(manifest)->pgVersion)), strZ(walSegment));
1784 
1785                         storageCopyP(
1786                             read,
1787                             storageNewWriteP(
1788                                 storageRepoWrite(),
1789                                 strNewFmt(
1790                                     STORAGE_REPO_BACKUP "/%s/%s%s", strZ(manifestData(manifest)->backupLabel), strZ(manifestName),
1791                                     strZ(compressExtStr(compressTypeEnum(cfgOptionStr(cfgOptCompressType)))))));
1792 
1793                         // Add to manifest
1794                         ManifestFile file =
1795                         {
1796                             .name = manifestName,
1797                             .primary = true,
1798                             .mode = basePath->mode & (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH),
1799                             .user = basePath->user,
1800                             .group = basePath->group,
1801                             .size = walSegmentSize,
1802                             .sizeRepo = varUInt64Force(ioFilterGroupResult(filterGroup, SIZE_FILTER_TYPE_STR)),
1803                             .timestamp = manifestData(manifest)->backupTimestampStop,
1804                         };
1805 
1806                         memcpy(file.checksumSha1, strZ(strSubN(archiveFile, 25, 40)), HASH_TYPE_SHA1_SIZE_HEX + 1);
1807 
1808                         manifestFileAdd(manifest, &file);
1809                     }
1810                 }
1811                 MEM_CONTEXT_TEMP_END();
1812 
1813                 // A keep-alive is required here for the remote holding the backup lock
1814                 protocolKeepAlive();
1815             }
1816         }
1817         MEM_CONTEXT_TEMP_END();
1818     }
1819 
1820     FUNCTION_LOG_RETURN_VOID();
1821 }
1822 
1823 /***********************************************************************************************************************************
1824 Save and update all files required to complete the backup
1825 ***********************************************************************************************************************************/
1826 static void
backupComplete(InfoBackup * const infoBackup,Manifest * const manifest)1827 backupComplete(InfoBackup *const infoBackup, Manifest *const manifest)
1828 {
1829     FUNCTION_LOG_BEGIN(logLevelDebug);
1830         FUNCTION_LOG_PARAM(INFO_BACKUP, infoBackup);
1831         FUNCTION_LOG_PARAM(MANIFEST, manifest);
1832     FUNCTION_LOG_END();
1833 
1834     ASSERT(manifest != NULL);
1835 
1836     MEM_CONTEXT_TEMP_BEGIN()
1837     {
1838         const String *const backupLabel = manifestData(manifest)->backupLabel;
1839 
1840         // Validation and final save of the backup manifest.  Validate in strict mode to catch as many potential issues as possible.
1841         // -------------------------------------------------------------------------------------------------------------------------
1842         manifestValidate(manifest, true);
1843 
1844         backupManifestSaveCopy(manifest, infoPgCipherPass(infoBackupPg(infoBackup)));
1845 
1846         storageCopy(
1847             storageNewReadP(
1848                 storageRepo(), strNewFmt(STORAGE_REPO_BACKUP "/%s/" BACKUP_MANIFEST_FILE INFO_COPY_EXT, strZ(backupLabel))),
1849             storageNewWriteP(
1850                 storageRepoWrite(), strNewFmt(STORAGE_REPO_BACKUP "/%s/" BACKUP_MANIFEST_FILE, strZ(backupLabel))));
1851 
1852         // Copy a compressed version of the manifest to history. If the repo is encrypted then the passphrase to open the manifest
1853         // is required.  We can't just do a straight copy since the destination needs to be compressed and that must happen before
1854         // encryption in order to be efficient. Compression will always be gz for compatibility and since it is always available.
1855         // -------------------------------------------------------------------------------------------------------------------------
1856         StorageRead *manifestRead = storageNewReadP(
1857                 storageRepo(), strNewFmt(STORAGE_REPO_BACKUP "/%s/" BACKUP_MANIFEST_FILE, strZ(backupLabel)));
1858 
1859         cipherBlockFilterGroupAdd(
1860             ioReadFilterGroup(storageReadIo(manifestRead)), cfgOptionStrId(cfgOptRepoCipherType), cipherModeDecrypt,
1861             infoPgCipherPass(infoBackupPg(infoBackup)));
1862 
1863         StorageWrite *manifestWrite = storageNewWriteP(
1864                 storageRepoWrite(),
1865                 strNewFmt(
1866                     STORAGE_REPO_BACKUP "/" BACKUP_PATH_HISTORY "/%s/%s.manifest%s", strZ(strSubN(backupLabel, 0, 4)),
1867                     strZ(backupLabel), strZ(compressExtStr(compressTypeGz))));
1868 
1869         ioFilterGroupAdd(ioWriteFilterGroup(storageWriteIo(manifestWrite)), compressFilter(compressTypeGz, 9));
1870 
1871         cipherBlockFilterGroupAdd(
1872             ioWriteFilterGroup(storageWriteIo(manifestWrite)), cfgOptionStrId(cfgOptRepoCipherType), cipherModeEncrypt,
1873             infoPgCipherPass(infoBackupPg(infoBackup)));
1874 
1875         storageCopyP(manifestRead, manifestWrite);
1876 
1877         // Sync history path if required
1878         if (storageFeature(storageRepoWrite(), storageFeaturePathSync))
1879             storagePathSyncP(storageRepoWrite(), STRDEF(STORAGE_REPO_BACKUP "/" BACKUP_PATH_HISTORY));
1880 
1881         // Create a symlink to the most recent backup if supported.  This link is purely informational for the user and is never
1882         // used by us since symlinks are not supported on all storage types.
1883         // -------------------------------------------------------------------------------------------------------------------------
1884         backupLinkLatest(backupLabel, cfgOptionGroupIdxDefault(cfgOptGrpRepo));
1885 
1886         // Add manifest and save backup.info (infoBackupSaveFile() is responsible for proper syncing)
1887         // -------------------------------------------------------------------------------------------------------------------------
1888         infoBackupDataAdd(infoBackup, manifest);
1889 
1890         infoBackupSaveFile(
1891             infoBackup, storageRepoWrite(), INFO_BACKUP_PATH_FILE_STR, cfgOptionStrId(cfgOptRepoCipherType),
1892             cfgOptionStrNull(cfgOptRepoCipherPass));
1893     }
1894     MEM_CONTEXT_TEMP_END();
1895 
1896     FUNCTION_LOG_RETURN_VOID();
1897 }
1898 
1899 /**********************************************************************************************************************************/
1900 void
cmdBackup(void)1901 cmdBackup(void)
1902 {
1903     FUNCTION_LOG_VOID(logLevelDebug);
1904 
1905     // Verify the repo is local
1906     repoIsLocalVerify();
1907 
1908     // Test for stop file
1909     lockStopTest();
1910 
1911     MEM_CONTEXT_TEMP_BEGIN()
1912     {
1913         // If the repo option was not provided and more than one repo is configured, then log the default repo chosen
1914         if (!cfgOptionTest(cfgOptRepo) && cfgOptionGroupIdxTotal(cfgOptGrpRepo) > 1)
1915         {
1916             LOG_INFO_FMT(
1917                 "repo option not specified, defaulting to repo%u",
1918                 cfgOptionGroupIdxToKey(cfgOptGrpRepo, cfgOptionGroupIdxDefault(cfgOptGrpRepo)));
1919         }
1920 
1921         // Load backup.info
1922         InfoBackup *infoBackup = infoBackupLoadFileReconstruct(
1923             storageRepo(), INFO_BACKUP_PATH_FILE_STR, cfgOptionStrId(cfgOptRepoCipherType), cfgOptionStrNull(cfgOptRepoCipherPass));
1924         InfoPgData infoPg = infoPgDataCurrent(infoBackupPg(infoBackup));
1925         const String *cipherPassBackup = infoPgCipherPass(infoBackupPg(infoBackup));
1926 
1927         // Get pg storage and database objects
1928         BackupData *backupData = backupInit(infoBackup);
1929 
1930         // Get the start timestamp which will later be written into the manifest to track total backup time
1931         time_t timestampStart = backupTime(backupData, false);
1932 
1933         // Check if there is a prior manifest when backup type is diff/incr
1934         Manifest *manifestPrior = backupBuildIncrPrior(infoBackup);
1935 
1936         // Start the backup
1937         BackupStartResult backupStartResult = backupStart(backupData);
1938 
1939         // Build the manifest
1940         Manifest *manifest = manifestNewBuild(
1941             backupData->storagePrimary, infoPg.version, infoPg.catalogVersion, cfgOptionBool(cfgOptOnline),
1942             cfgOptionBool(cfgOptChecksumPage), strLstNewVarLst(cfgOptionLst(cfgOptExclude)), backupStartResult.tablespaceList);
1943 
1944         // Validate the manifest using the copy start time
1945         manifestBuildValidate(
1946             manifest, cfgOptionBool(cfgOptDelta), backupTime(backupData, true), compressTypeEnum(cfgOptionStr(cfgOptCompressType)));
1947 
1948         // Build an incremental backup if type is not full (manifestPrior will be freed in this call)
1949         if (!backupBuildIncr(infoBackup, manifest, manifestPrior, backupStartResult.walSegmentName))
1950             manifestCipherSubPassSet(manifest, cipherPassGen(cfgOptionStrId(cfgOptRepoCipherType)));
1951 
1952         // Set delta if it is not already set and the manifest requires it
1953         if (!cfgOptionBool(cfgOptDelta) && varBool(manifestData(manifest)->backupOptionDelta))
1954             cfgOptionSet(cfgOptDelta, cfgSourceParam, BOOL_TRUE_VAR);
1955 
1956         // Resume a backup when possible
1957         if (!backupResume(manifest, cipherPassBackup))
1958         {
1959             manifestBackupLabelSet(
1960                 manifest,
1961                 backupLabelCreate(
1962                     (BackupType)cfgOptionStrId(cfgOptType), manifestData(manifest)->backupLabelPrior, timestampStart));
1963         }
1964 
1965         // Save the manifest before processing starts
1966         backupManifestSaveCopy(manifest, cipherPassBackup);
1967 
1968         // Process the backup manifest
1969         uint64_t backupSizeTotal = backupProcess(backupData, manifest, backupStartResult.lsn, cipherPassBackup);
1970 
1971         // Stop the backup
1972         BackupStopResult backupStopResult = backupStop(backupData, manifest);
1973 
1974         // Complete manifest
1975         manifestBuildComplete(
1976             manifest, timestampStart, backupStartResult.lsn, backupStartResult.walSegmentName, backupStopResult.timestamp,
1977             backupStopResult.lsn, backupStopResult.walSegmentName, infoPg.id, infoPg.systemId, backupStartResult.dbList,
1978             cfgOptionBool(cfgOptOnline) && cfgOptionBool(cfgOptArchiveCheck),
1979             !cfgOptionBool(cfgOptOnline) || (cfgOptionBool(cfgOptArchiveCheck) && cfgOptionBool(cfgOptArchiveCopy)),
1980             cfgOptionUInt(cfgOptBufferSize), cfgOptionUInt(cfgOptCompressLevel), cfgOptionUInt(cfgOptCompressLevelNetwork),
1981             cfgOptionBool(cfgOptRepoHardlink), cfgOptionUInt(cfgOptProcessMax), cfgOptionBool(cfgOptBackupStandby));
1982 
1983         // The primary db object won't be used anymore so free it
1984         dbFree(backupData->dbPrimary);
1985 
1986         // Check and copy WAL segments required to make the backup consistent
1987         backupArchiveCheckCopy(manifest, backupData->walSegmentSize, cipherPassBackup);
1988 
1989         // The primary protocol connection won't be used anymore so free it. This needs to happen after backupArchiveCheckCopy() so
1990         // the backup lock is held on the remote which allows conditional archiving based on the backup lock. Any further access to
1991         // the primary storage object may result in an error (likely eof).
1992         protocolRemoteFree(backupData->pgIdxPrimary);
1993 
1994         // Complete the backup
1995         LOG_INFO_FMT("new backup label = %s", strZ(manifestData(manifest)->backupLabel));
1996         backupComplete(infoBackup, manifest);
1997 
1998         // Backup info
1999         LOG_INFO_FMT(
2000             "%s backup size = %s, file total = %u", strZ(strIdToStr(manifestData(manifest)->backupType)),
2001             strZ(strSizeFormat(backupSizeTotal)), manifestFileTotal(manifest));
2002     }
2003     MEM_CONTEXT_TEMP_END();
2004 
2005     FUNCTION_LOG_RETURN_VOID();
2006 }
2007