1 /*------------------------------------------------------------------------- 2 * 3 * reinit.c 4 * Reinitialization of unlogged relations 5 * 6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 7 * Portions Copyright (c) 1994, Regents of the University of California 8 * 9 * IDENTIFICATION 10 * src/backend/storage/file/reinit.c 11 * 12 *------------------------------------------------------------------------- 13 */ 14 15 #include "postgres.h" 16 17 #include <unistd.h> 18 19 #include "common/relpath.h" 20 #include "storage/copydir.h" 21 #include "storage/fd.h" 22 #include "storage/reinit.h" 23 #include "utils/hsearch.h" 24 #include "utils/memutils.h" 25 26 static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, 27 int op); 28 static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, 29 int op); 30 31 typedef struct 32 { 33 char oid[OIDCHARS + 1]; 34 } unlogged_relation_entry; 35 36 /* 37 * Reset unlogged relations from before the last restart. 38 * 39 * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any 40 * relation with an "init" fork, except for the "init" fork itself. 41 * 42 * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main 43 * fork. 44 */ 45 void 46 ResetUnloggedRelations(int op) 47 { 48 char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)]; 49 DIR *spc_dir; 50 struct dirent *spc_de; 51 MemoryContext tmpctx, 52 oldctx; 53 54 /* Log it. */ 55 elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d", 56 (op & UNLOGGED_RELATION_CLEANUP) != 0, 57 (op & UNLOGGED_RELATION_INIT) != 0); 58 59 /* 60 * Just to be sure we don't leak any memory, let's create a temporary 61 * memory context for this operation. 62 */ 63 tmpctx = AllocSetContextCreate(CurrentMemoryContext, 64 "ResetUnloggedRelations", 65 ALLOCSET_DEFAULT_SIZES); 66 oldctx = MemoryContextSwitchTo(tmpctx); 67 68 /* 69 * First process unlogged files in pg_default ($PGDATA/base) 70 */ 71 ResetUnloggedRelationsInTablespaceDir("base", op); 72 73 /* 74 * Cycle through directories for all non-default tablespaces. 75 */ 76 spc_dir = AllocateDir("pg_tblspc"); 77 78 while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL) 79 { 80 if (strcmp(spc_de->d_name, ".") == 0 || 81 strcmp(spc_de->d_name, "..") == 0) 82 continue; 83 84 snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", 85 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); 86 ResetUnloggedRelationsInTablespaceDir(temp_path, op); 87 } 88 89 FreeDir(spc_dir); 90 91 /* 92 * Restore memory context. 93 */ 94 MemoryContextSwitchTo(oldctx); 95 MemoryContextDelete(tmpctx); 96 } 97 98 /* 99 * Process one tablespace directory for ResetUnloggedRelations 100 */ 101 static void 102 ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) 103 { 104 DIR *ts_dir; 105 struct dirent *de; 106 char dbspace_path[MAXPGPATH * 2]; 107 108 ts_dir = AllocateDir(tsdirname); 109 110 /* 111 * If we get ENOENT on a tablespace directory, log it and return. This 112 * can happen if a previous DROP TABLESPACE crashed between removing the 113 * tablespace directory and removing the symlink in pg_tblspc. We don't 114 * really want to prevent database startup in that scenario, so let it 115 * pass instead. Any other type of error will be reported by ReadDir 116 * (causing a startup failure). 117 */ 118 if (ts_dir == NULL && errno == ENOENT) 119 { 120 ereport(LOG, 121 (errcode_for_file_access(), 122 errmsg("could not open directory \"%s\": %m", 123 tsdirname))); 124 return; 125 } 126 127 while ((de = ReadDir(ts_dir, tsdirname)) != NULL) 128 { 129 /* 130 * We're only interested in the per-database directories, which have 131 * numeric names. Note that this code will also (properly) ignore "." 132 * and "..". 133 */ 134 if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) 135 continue; 136 137 snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s", 138 tsdirname, de->d_name); 139 ResetUnloggedRelationsInDbspaceDir(dbspace_path, op); 140 } 141 142 FreeDir(ts_dir); 143 } 144 145 /* 146 * Process one per-dbspace directory for ResetUnloggedRelations 147 */ 148 static void 149 ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) 150 { 151 DIR *dbspace_dir; 152 struct dirent *de; 153 char rm_path[MAXPGPATH * 2]; 154 155 /* Caller must specify at least one operation. */ 156 Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); 157 158 /* 159 * Cleanup is a two-pass operation. First, we go through and identify all 160 * the files with init forks. Then, we go through again and nuke 161 * everything with the same OID except the init fork. 162 */ 163 if ((op & UNLOGGED_RELATION_CLEANUP) != 0) 164 { 165 HTAB *hash; 166 HASHCTL ctl; 167 168 /* 169 * It's possible that someone could create a ton of unlogged relations 170 * in the same database & tablespace, so we'd better use a hash table 171 * rather than an array or linked list to keep track of which files 172 * need to be reset. Otherwise, this cleanup operation would be 173 * O(n^2). 174 */ 175 memset(&ctl, 0, sizeof(ctl)); 176 ctl.keysize = sizeof(unlogged_relation_entry); 177 ctl.entrysize = sizeof(unlogged_relation_entry); 178 hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM); 179 180 /* Scan the directory. */ 181 dbspace_dir = AllocateDir(dbspacedirname); 182 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) 183 { 184 ForkNumber forkNum; 185 int oidchars; 186 unlogged_relation_entry ent; 187 188 /* Skip anything that doesn't look like a relation data file. */ 189 if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, 190 &forkNum)) 191 continue; 192 193 /* Also skip it unless this is the init fork. */ 194 if (forkNum != INIT_FORKNUM) 195 continue; 196 197 /* 198 * Put the OID portion of the name into the hash table, if it 199 * isn't already. 200 */ 201 memset(ent.oid, 0, sizeof(ent.oid)); 202 memcpy(ent.oid, de->d_name, oidchars); 203 hash_search(hash, &ent, HASH_ENTER, NULL); 204 } 205 206 /* Done with the first pass. */ 207 FreeDir(dbspace_dir); 208 209 /* 210 * If we didn't find any init forks, there's no point in continuing; 211 * we can bail out now. 212 */ 213 if (hash_get_num_entries(hash) == 0) 214 { 215 hash_destroy(hash); 216 return; 217 } 218 219 /* 220 * Now, make a second pass and remove anything that matches. 221 */ 222 dbspace_dir = AllocateDir(dbspacedirname); 223 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) 224 { 225 ForkNumber forkNum; 226 int oidchars; 227 bool found; 228 unlogged_relation_entry ent; 229 230 /* Skip anything that doesn't look like a relation data file. */ 231 if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, 232 &forkNum)) 233 continue; 234 235 /* We never remove the init fork. */ 236 if (forkNum == INIT_FORKNUM) 237 continue; 238 239 /* 240 * See whether the OID portion of the name shows up in the hash 241 * table. 242 */ 243 memset(ent.oid, 0, sizeof(ent.oid)); 244 memcpy(ent.oid, de->d_name, oidchars); 245 hash_search(hash, &ent, HASH_FIND, &found); 246 247 /* If so, nuke it! */ 248 if (found) 249 { 250 snprintf(rm_path, sizeof(rm_path), "%s/%s", 251 dbspacedirname, de->d_name); 252 if (unlink(rm_path) < 0) 253 ereport(ERROR, 254 (errcode_for_file_access(), 255 errmsg("could not remove file \"%s\": %m", 256 rm_path))); 257 else 258 elog(DEBUG2, "unlinked file \"%s\"", rm_path); 259 } 260 } 261 262 /* Cleanup is complete. */ 263 FreeDir(dbspace_dir); 264 hash_destroy(hash); 265 } 266 267 /* 268 * Initialization happens after cleanup is complete: we copy each init 269 * fork file to the corresponding main fork file. Note that if we are 270 * asked to do both cleanup and init, we may never get here: if the 271 * cleanup code determines that there are no init forks in this dbspace, 272 * it will return before we get to this point. 273 */ 274 if ((op & UNLOGGED_RELATION_INIT) != 0) 275 { 276 /* Scan the directory. */ 277 dbspace_dir = AllocateDir(dbspacedirname); 278 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) 279 { 280 ForkNumber forkNum; 281 int oidchars; 282 char oidbuf[OIDCHARS + 1]; 283 char srcpath[MAXPGPATH * 2]; 284 char dstpath[MAXPGPATH]; 285 286 /* Skip anything that doesn't look like a relation data file. */ 287 if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, 288 &forkNum)) 289 continue; 290 291 /* Also skip it unless this is the init fork. */ 292 if (forkNum != INIT_FORKNUM) 293 continue; 294 295 /* Construct source pathname. */ 296 snprintf(srcpath, sizeof(srcpath), "%s/%s", 297 dbspacedirname, de->d_name); 298 299 /* Construct destination pathname. */ 300 memcpy(oidbuf, de->d_name, oidchars); 301 oidbuf[oidchars] = '\0'; 302 snprintf(dstpath, sizeof(dstpath), "%s/%s%s", 303 dbspacedirname, oidbuf, de->d_name + oidchars + 1 + 304 strlen(forkNames[INIT_FORKNUM])); 305 306 /* OK, we're ready to perform the actual copy. */ 307 elog(DEBUG2, "copying %s to %s", srcpath, dstpath); 308 copy_file(srcpath, dstpath); 309 } 310 311 FreeDir(dbspace_dir); 312 313 /* 314 * copy_file() above has already called pg_flush_data() on the files 315 * it created. Now we need to fsync those files, because a checkpoint 316 * won't do it for us while we're in recovery. We do this in a 317 * separate pass to allow the kernel to perform all the flushes 318 * (especially the metadata ones) at once. 319 */ 320 dbspace_dir = AllocateDir(dbspacedirname); 321 while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) 322 { 323 ForkNumber forkNum; 324 int oidchars; 325 char oidbuf[OIDCHARS + 1]; 326 char mainpath[MAXPGPATH]; 327 328 /* Skip anything that doesn't look like a relation data file. */ 329 if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars, 330 &forkNum)) 331 continue; 332 333 /* Also skip it unless this is the init fork. */ 334 if (forkNum != INIT_FORKNUM) 335 continue; 336 337 /* Construct main fork pathname. */ 338 memcpy(oidbuf, de->d_name, oidchars); 339 oidbuf[oidchars] = '\0'; 340 snprintf(mainpath, sizeof(mainpath), "%s/%s%s", 341 dbspacedirname, oidbuf, de->d_name + oidchars + 1 + 342 strlen(forkNames[INIT_FORKNUM])); 343 344 fsync_fname(mainpath, false); 345 } 346 347 FreeDir(dbspace_dir); 348 349 /* 350 * Lastly, fsync the database directory itself, ensuring the 351 * filesystem remembers the file creations and deletions we've done. 352 * We don't bother with this during a call that does only 353 * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we 354 * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step 355 * too at the next startup attempt. 356 */ 357 fsync_fname(dbspacedirname, true); 358 } 359 } 360 361 /* 362 * Basic parsing of putative relation filenames. 363 * 364 * This function returns true if the file appears to be in the correct format 365 * for a non-temporary relation and false otherwise. 366 * 367 * NB: If this function returns true, the caller is entitled to assume that 368 * *oidchars has been set to the a value no more than OIDCHARS, and thus 369 * that a buffer of OIDCHARS+1 characters is sufficient to hold the OID 370 * portion of the filename. This is critical to protect against a possible 371 * buffer overrun. 372 */ 373 bool 374 parse_filename_for_nontemp_relation(const char *name, int *oidchars, 375 ForkNumber *fork) 376 { 377 int pos; 378 379 /* Look for a non-empty string of digits (that isn't too long). */ 380 for (pos = 0; isdigit((unsigned char) name[pos]); ++pos) 381 ; 382 if (pos == 0 || pos > OIDCHARS) 383 return false; 384 *oidchars = pos; 385 386 /* Check for a fork name. */ 387 if (name[pos] != '_') 388 *fork = MAIN_FORKNUM; 389 else 390 { 391 int forkchar; 392 393 forkchar = forkname_chars(&name[pos + 1], fork); 394 if (forkchar <= 0) 395 return false; 396 pos += forkchar + 1; 397 } 398 399 /* Check for a segment number. */ 400 if (name[pos] == '.') 401 { 402 int segchar; 403 404 for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) 405 ; 406 if (segchar <= 1) 407 return false; 408 pos += segchar; 409 } 410 411 /* Now we should be at the end. */ 412 if (name[pos] != '\0') 413 return false; 414 return true; 415 } 416