1 /*-------------------------------------------------------------------------
2  *
3  * reinit.c
4  *	  Reinitialization of unlogged relations
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/storage/file/reinit.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <unistd.h>
18 
19 #include "common/relpath.h"
20 #include "storage/copydir.h"
21 #include "storage/fd.h"
22 #include "storage/reinit.h"
23 #include "utils/hsearch.h"
24 #include "utils/memutils.h"
25 
/*
 * Forward declarations of the file-local workers: one handles a single
 * per-database directory, the other a whole tablespace directory.
 */
static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
											   int op);
static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
												  int op);
30 
31 typedef struct
32 {
33 	char		oid[OIDCHARS + 1];
34 } unlogged_relation_entry;
35 
36 /*
37  * Reset unlogged relations from before the last restart.
38  *
39  * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
40  * relation with an "init" fork, except for the "init" fork itself.
41  *
42  * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
43  * fork.
44  */
45 void
ResetUnloggedRelations(int op)46 ResetUnloggedRelations(int op)
47 {
48 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
49 	DIR		   *spc_dir;
50 	struct dirent *spc_de;
51 	MemoryContext tmpctx,
52 				oldctx;
53 
54 	/* Log it. */
55 	elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
56 		 (op & UNLOGGED_RELATION_CLEANUP) != 0,
57 		 (op & UNLOGGED_RELATION_INIT) != 0);
58 
59 	/*
60 	 * Just to be sure we don't leak any memory, let's create a temporary
61 	 * memory context for this operation.
62 	 */
63 	tmpctx = AllocSetContextCreate(CurrentMemoryContext,
64 								   "ResetUnloggedRelations",
65 								   ALLOCSET_DEFAULT_SIZES);
66 	oldctx = MemoryContextSwitchTo(tmpctx);
67 
68 	/*
69 	 * First process unlogged files in pg_default ($PGDATA/base)
70 	 */
71 	ResetUnloggedRelationsInTablespaceDir("base", op);
72 
73 	/*
74 	 * Cycle through directories for all non-default tablespaces.
75 	 */
76 	spc_dir = AllocateDir("pg_tblspc");
77 
78 	while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
79 	{
80 		if (strcmp(spc_de->d_name, ".") == 0 ||
81 			strcmp(spc_de->d_name, "..") == 0)
82 			continue;
83 
84 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
85 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
86 		ResetUnloggedRelationsInTablespaceDir(temp_path, op);
87 	}
88 
89 	FreeDir(spc_dir);
90 
91 	/*
92 	 * Restore memory context.
93 	 */
94 	MemoryContextSwitchTo(oldctx);
95 	MemoryContextDelete(tmpctx);
96 }
97 
98 /*
99  * Process one tablespace directory for ResetUnloggedRelations
100  */
101 static void
ResetUnloggedRelationsInTablespaceDir(const char * tsdirname,int op)102 ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
103 {
104 	DIR		   *ts_dir;
105 	struct dirent *de;
106 	char		dbspace_path[MAXPGPATH * 2];
107 
108 	ts_dir = AllocateDir(tsdirname);
109 
110 	/*
111 	 * If we get ENOENT on a tablespace directory, log it and return.  This
112 	 * can happen if a previous DROP TABLESPACE crashed between removing the
113 	 * tablespace directory and removing the symlink in pg_tblspc.  We don't
114 	 * really want to prevent database startup in that scenario, so let it
115 	 * pass instead.  Any other type of error will be reported by ReadDir
116 	 * (causing a startup failure).
117 	 */
118 	if (ts_dir == NULL && errno == ENOENT)
119 	{
120 		ereport(LOG,
121 				(errcode_for_file_access(),
122 				 errmsg("could not open directory \"%s\": %m",
123 						tsdirname)));
124 		return;
125 	}
126 
127 	while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
128 	{
129 		/*
130 		 * We're only interested in the per-database directories, which have
131 		 * numeric names.  Note that this code will also (properly) ignore "."
132 		 * and "..".
133 		 */
134 		if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
135 			continue;
136 
137 		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
138 				 tsdirname, de->d_name);
139 		ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
140 	}
141 
142 	FreeDir(ts_dir);
143 }
144 
145 /*
146  * Process one per-dbspace directory for ResetUnloggedRelations
147  */
148 static void
ResetUnloggedRelationsInDbspaceDir(const char * dbspacedirname,int op)149 ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
150 {
151 	DIR		   *dbspace_dir;
152 	struct dirent *de;
153 	char		rm_path[MAXPGPATH * 2];
154 
155 	/* Caller must specify at least one operation. */
156 	Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);
157 
158 	/*
159 	 * Cleanup is a two-pass operation.  First, we go through and identify all
160 	 * the files with init forks.  Then, we go through again and nuke
161 	 * everything with the same OID except the init fork.
162 	 */
163 	if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
164 	{
165 		HTAB	   *hash;
166 		HASHCTL		ctl;
167 
168 		/*
169 		 * It's possible that someone could create a ton of unlogged relations
170 		 * in the same database & tablespace, so we'd better use a hash table
171 		 * rather than an array or linked list to keep track of which files
172 		 * need to be reset.  Otherwise, this cleanup operation would be
173 		 * O(n^2).
174 		 */
175 		memset(&ctl, 0, sizeof(ctl));
176 		ctl.keysize = sizeof(unlogged_relation_entry);
177 		ctl.entrysize = sizeof(unlogged_relation_entry);
178 		hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM);
179 
180 		/* Scan the directory. */
181 		dbspace_dir = AllocateDir(dbspacedirname);
182 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
183 		{
184 			ForkNumber	forkNum;
185 			int			oidchars;
186 			unlogged_relation_entry ent;
187 
188 			/* Skip anything that doesn't look like a relation data file. */
189 			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
190 													 &forkNum))
191 				continue;
192 
193 			/* Also skip it unless this is the init fork. */
194 			if (forkNum != INIT_FORKNUM)
195 				continue;
196 
197 			/*
198 			 * Put the OID portion of the name into the hash table, if it
199 			 * isn't already.
200 			 */
201 			memset(ent.oid, 0, sizeof(ent.oid));
202 			memcpy(ent.oid, de->d_name, oidchars);
203 			hash_search(hash, &ent, HASH_ENTER, NULL);
204 		}
205 
206 		/* Done with the first pass. */
207 		FreeDir(dbspace_dir);
208 
209 		/*
210 		 * If we didn't find any init forks, there's no point in continuing;
211 		 * we can bail out now.
212 		 */
213 		if (hash_get_num_entries(hash) == 0)
214 		{
215 			hash_destroy(hash);
216 			return;
217 		}
218 
219 		/*
220 		 * Now, make a second pass and remove anything that matches.
221 		 */
222 		dbspace_dir = AllocateDir(dbspacedirname);
223 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
224 		{
225 			ForkNumber	forkNum;
226 			int			oidchars;
227 			bool		found;
228 			unlogged_relation_entry ent;
229 
230 			/* Skip anything that doesn't look like a relation data file. */
231 			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
232 													 &forkNum))
233 				continue;
234 
235 			/* We never remove the init fork. */
236 			if (forkNum == INIT_FORKNUM)
237 				continue;
238 
239 			/*
240 			 * See whether the OID portion of the name shows up in the hash
241 			 * table.
242 			 */
243 			memset(ent.oid, 0, sizeof(ent.oid));
244 			memcpy(ent.oid, de->d_name, oidchars);
245 			hash_search(hash, &ent, HASH_FIND, &found);
246 
247 			/* If so, nuke it! */
248 			if (found)
249 			{
250 				snprintf(rm_path, sizeof(rm_path), "%s/%s",
251 						 dbspacedirname, de->d_name);
252 				if (unlink(rm_path) < 0)
253 					ereport(ERROR,
254 							(errcode_for_file_access(),
255 							 errmsg("could not remove file \"%s\": %m",
256 									rm_path)));
257 				else
258 					elog(DEBUG2, "unlinked file \"%s\"", rm_path);
259 			}
260 		}
261 
262 		/* Cleanup is complete. */
263 		FreeDir(dbspace_dir);
264 		hash_destroy(hash);
265 	}
266 
267 	/*
268 	 * Initialization happens after cleanup is complete: we copy each init
269 	 * fork file to the corresponding main fork file.  Note that if we are
270 	 * asked to do both cleanup and init, we may never get here: if the
271 	 * cleanup code determines that there are no init forks in this dbspace,
272 	 * it will return before we get to this point.
273 	 */
274 	if ((op & UNLOGGED_RELATION_INIT) != 0)
275 	{
276 		/* Scan the directory. */
277 		dbspace_dir = AllocateDir(dbspacedirname);
278 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
279 		{
280 			ForkNumber	forkNum;
281 			int			oidchars;
282 			char		oidbuf[OIDCHARS + 1];
283 			char		srcpath[MAXPGPATH * 2];
284 			char		dstpath[MAXPGPATH];
285 
286 			/* Skip anything that doesn't look like a relation data file. */
287 			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
288 													 &forkNum))
289 				continue;
290 
291 			/* Also skip it unless this is the init fork. */
292 			if (forkNum != INIT_FORKNUM)
293 				continue;
294 
295 			/* Construct source pathname. */
296 			snprintf(srcpath, sizeof(srcpath), "%s/%s",
297 					 dbspacedirname, de->d_name);
298 
299 			/* Construct destination pathname. */
300 			memcpy(oidbuf, de->d_name, oidchars);
301 			oidbuf[oidchars] = '\0';
302 			snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
303 					 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
304 					 strlen(forkNames[INIT_FORKNUM]));
305 
306 			/* OK, we're ready to perform the actual copy. */
307 			elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
308 			copy_file(srcpath, dstpath);
309 		}
310 
311 		FreeDir(dbspace_dir);
312 
313 		/*
314 		 * copy_file() above has already called pg_flush_data() on the files
315 		 * it created. Now we need to fsync those files, because a checkpoint
316 		 * won't do it for us while we're in recovery. We do this in a
317 		 * separate pass to allow the kernel to perform all the flushes
318 		 * (especially the metadata ones) at once.
319 		 */
320 		dbspace_dir = AllocateDir(dbspacedirname);
321 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
322 		{
323 			ForkNumber	forkNum;
324 			int			oidchars;
325 			char		oidbuf[OIDCHARS + 1];
326 			char		mainpath[MAXPGPATH];
327 
328 			/* Skip anything that doesn't look like a relation data file. */
329 			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
330 													 &forkNum))
331 				continue;
332 
333 			/* Also skip it unless this is the init fork. */
334 			if (forkNum != INIT_FORKNUM)
335 				continue;
336 
337 			/* Construct main fork pathname. */
338 			memcpy(oidbuf, de->d_name, oidchars);
339 			oidbuf[oidchars] = '\0';
340 			snprintf(mainpath, sizeof(mainpath), "%s/%s%s",
341 					 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
342 					 strlen(forkNames[INIT_FORKNUM]));
343 
344 			fsync_fname(mainpath, false);
345 		}
346 
347 		FreeDir(dbspace_dir);
348 
349 		/*
350 		 * Lastly, fsync the database directory itself, ensuring the
351 		 * filesystem remembers the file creations and deletions we've done.
352 		 * We don't bother with this during a call that does only
353 		 * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we
354 		 * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step
355 		 * too at the next startup attempt.
356 		 */
357 		fsync_fname(dbspacedirname, true);
358 	}
359 }
360 
361 /*
362  * Basic parsing of putative relation filenames.
363  *
364  * This function returns true if the file appears to be in the correct format
365  * for a non-temporary relation and false otherwise.
366  *
367  * NB: If this function returns true, the caller is entitled to assume that
368  * *oidchars has been set to the a value no more than OIDCHARS, and thus
369  * that a buffer of OIDCHARS+1 characters is sufficient to hold the OID
370  * portion of the filename.  This is critical to protect against a possible
371  * buffer overrun.
372  */
373 bool
parse_filename_for_nontemp_relation(const char * name,int * oidchars,ForkNumber * fork)374 parse_filename_for_nontemp_relation(const char *name, int *oidchars,
375 									ForkNumber *fork)
376 {
377 	int			pos;
378 
379 	/* Look for a non-empty string of digits (that isn't too long). */
380 	for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
381 		;
382 	if (pos == 0 || pos > OIDCHARS)
383 		return false;
384 	*oidchars = pos;
385 
386 	/* Check for a fork name. */
387 	if (name[pos] != '_')
388 		*fork = MAIN_FORKNUM;
389 	else
390 	{
391 		int			forkchar;
392 
393 		forkchar = forkname_chars(&name[pos + 1], fork);
394 		if (forkchar <= 0)
395 			return false;
396 		pos += forkchar + 1;
397 	}
398 
399 	/* Check for a segment number. */
400 	if (name[pos] == '.')
401 	{
402 		int			segchar;
403 
404 		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
405 			;
406 		if (segchar <= 1)
407 			return false;
408 		pos += segchar;
409 	}
410 
411 	/* Now we should be at the end. */
412 	if (name[pos] != '\0')
413 		return false;
414 	return true;
415 }
416