1 /*-------------------------------------------------------------------------
2  *
3  * reinit.c
4  *	  Reinitialization of unlogged relations
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/storage/file/reinit.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <unistd.h>
18 
19 #include "catalog/catalog.h"
20 #include "common/relpath.h"
21 #include "storage/copydir.h"
22 #include "storage/fd.h"
23 #include "storage/reinit.h"
24 #include "utils/hsearch.h"
25 #include "utils/memutils.h"
26 
27 static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
28 									  int op);
29 static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
30 								   int op);
31 static bool parse_filename_for_nontemp_relation(const char *name,
32 									int *oidchars, ForkNumber *fork);
33 
34 typedef struct
35 {
36 	char		oid[OIDCHARS + 1];
37 } unlogged_relation_entry;
38 
39 /*
40  * Reset unlogged relations from before the last restart.
41  *
42  * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
43  * relation with an "init" fork, except for the "init" fork itself.
44  *
45  * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
46  * fork.
47  */
48 void
ResetUnloggedRelations(int op)49 ResetUnloggedRelations(int op)
50 {
51 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
52 	DIR		   *spc_dir;
53 	struct dirent *spc_de;
54 	MemoryContext tmpctx,
55 				oldctx;
56 
57 	/* Log it. */
58 	elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
59 		 (op & UNLOGGED_RELATION_CLEANUP) != 0,
60 		 (op & UNLOGGED_RELATION_INIT) != 0);
61 
62 	/*
63 	 * Just to be sure we don't leak any memory, let's create a temporary
64 	 * memory context for this operation.
65 	 */
66 	tmpctx = AllocSetContextCreate(CurrentMemoryContext,
67 								   "ResetUnloggedRelations",
68 								   ALLOCSET_DEFAULT_SIZES);
69 	oldctx = MemoryContextSwitchTo(tmpctx);
70 
71 	/*
72 	 * First process unlogged files in pg_default ($PGDATA/base)
73 	 */
74 	ResetUnloggedRelationsInTablespaceDir("base", op);
75 
76 	/*
77 	 * Cycle through directories for all non-default tablespaces.
78 	 */
79 	spc_dir = AllocateDir("pg_tblspc");
80 
81 	while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
82 	{
83 		if (strcmp(spc_de->d_name, ".") == 0 ||
84 			strcmp(spc_de->d_name, "..") == 0)
85 			continue;
86 
87 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
88 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
89 		ResetUnloggedRelationsInTablespaceDir(temp_path, op);
90 	}
91 
92 	FreeDir(spc_dir);
93 
94 	/*
95 	 * Restore memory context.
96 	 */
97 	MemoryContextSwitchTo(oldctx);
98 	MemoryContextDelete(tmpctx);
99 }
100 
101 /* Process one tablespace directory for ResetUnloggedRelations */
102 static void
ResetUnloggedRelationsInTablespaceDir(const char * tsdirname,int op)103 ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
104 {
105 	DIR		   *ts_dir;
106 	struct dirent *de;
107 	char		dbspace_path[MAXPGPATH * 2];
108 
109 	ts_dir = AllocateDir(tsdirname);
110 	if (ts_dir == NULL)
111 	{
112 		/* anything except ENOENT is fishy */
113 		if (errno != ENOENT)
114 			elog(LOG,
115 				 "could not open tablespace directory \"%s\": %m",
116 				 tsdirname);
117 		return;
118 	}
119 
120 	while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
121 	{
122 		int			i = 0;
123 
124 		/*
125 		 * We're only interested in the per-database directories, which have
126 		 * numeric names.  Note that this code will also (properly) ignore "."
127 		 * and "..".
128 		 */
129 		while (isdigit((unsigned char) de->d_name[i]))
130 			++i;
131 		if (de->d_name[i] != '\0' || i == 0)
132 			continue;
133 
134 		snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
135 				 tsdirname, de->d_name);
136 		ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
137 	}
138 
139 	FreeDir(ts_dir);
140 }
141 
142 /* Process one per-dbspace directory for ResetUnloggedRelations */
143 static void
ResetUnloggedRelationsInDbspaceDir(const char * dbspacedirname,int op)144 ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
145 {
146 	DIR		   *dbspace_dir;
147 	struct dirent *de;
148 	char		rm_path[MAXPGPATH * 2];
149 
150 	/* Caller must specify at least one operation. */
151 	Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);
152 
153 	/*
154 	 * Cleanup is a two-pass operation.  First, we go through and identify all
155 	 * the files with init forks.  Then, we go through again and nuke
156 	 * everything with the same OID except the init fork.
157 	 */
158 	if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
159 	{
160 		HTAB	   *hash = NULL;
161 		HASHCTL		ctl;
162 
163 		/* Open the directory. */
164 		dbspace_dir = AllocateDir(dbspacedirname);
165 		if (dbspace_dir == NULL)
166 		{
167 			elog(LOG,
168 				 "could not open dbspace directory \"%s\": %m",
169 				 dbspacedirname);
170 			return;
171 		}
172 
173 		/*
174 		 * It's possible that someone could create a ton of unlogged relations
175 		 * in the same database & tablespace, so we'd better use a hash table
176 		 * rather than an array or linked list to keep track of which files
177 		 * need to be reset.  Otherwise, this cleanup operation would be
178 		 * O(n^2).
179 		 */
180 		ctl.keysize = sizeof(unlogged_relation_entry);
181 		ctl.entrysize = sizeof(unlogged_relation_entry);
182 		hash = hash_create("unlogged hash", 32, &ctl, HASH_ELEM);
183 
184 		/* Scan the directory. */
185 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
186 		{
187 			ForkNumber	forkNum;
188 			int			oidchars;
189 			unlogged_relation_entry ent;
190 
191 			/* Skip anything that doesn't look like a relation data file. */
192 			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
193 													 &forkNum))
194 				continue;
195 
196 			/* Also skip it unless this is the init fork. */
197 			if (forkNum != INIT_FORKNUM)
198 				continue;
199 
200 			/*
201 			 * Put the OID portion of the name into the hash table, if it
202 			 * isn't already.
203 			 */
204 			memset(ent.oid, 0, sizeof(ent.oid));
205 			memcpy(ent.oid, de->d_name, oidchars);
206 			hash_search(hash, &ent, HASH_ENTER, NULL);
207 		}
208 
209 		/* Done with the first pass. */
210 		FreeDir(dbspace_dir);
211 
212 		/*
213 		 * If we didn't find any init forks, there's no point in continuing;
214 		 * we can bail out now.
215 		 */
216 		if (hash_get_num_entries(hash) == 0)
217 		{
218 			hash_destroy(hash);
219 			return;
220 		}
221 
222 		/*
223 		 * Now, make a second pass and remove anything that matches. First,
224 		 * reopen the directory.
225 		 */
226 		dbspace_dir = AllocateDir(dbspacedirname);
227 		if (dbspace_dir == NULL)
228 		{
229 			elog(LOG,
230 				 "could not open dbspace directory \"%s\": %m",
231 				 dbspacedirname);
232 			hash_destroy(hash);
233 			return;
234 		}
235 
236 		/* Scan the directory. */
237 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
238 		{
239 			ForkNumber	forkNum;
240 			int			oidchars;
241 			bool		found;
242 			unlogged_relation_entry ent;
243 
244 			/* Skip anything that doesn't look like a relation data file. */
245 			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
246 													 &forkNum))
247 				continue;
248 
249 			/* We never remove the init fork. */
250 			if (forkNum == INIT_FORKNUM)
251 				continue;
252 
253 			/*
254 			 * See whether the OID portion of the name shows up in the hash
255 			 * table.
256 			 */
257 			memset(ent.oid, 0, sizeof(ent.oid));
258 			memcpy(ent.oid, de->d_name, oidchars);
259 			hash_search(hash, &ent, HASH_FIND, &found);
260 
261 			/* If so, nuke it! */
262 			if (found)
263 			{
264 				snprintf(rm_path, sizeof(rm_path), "%s/%s",
265 						 dbspacedirname, de->d_name);
266 
267 				/*
268 				 * It's tempting to actually throw an error here, but since
269 				 * this code gets run during database startup, that could
270 				 * result in the database failing to start.  (XXX Should we do
271 				 * it anyway?)
272 				 */
273 				if (unlink(rm_path))
274 					elog(LOG, "could not unlink file \"%s\": %m", rm_path);
275 				else
276 					elog(DEBUG2, "unlinked file \"%s\"", rm_path);
277 			}
278 		}
279 
280 		/* Cleanup is complete. */
281 		FreeDir(dbspace_dir);
282 		hash_destroy(hash);
283 	}
284 
285 	/*
286 	 * Initialization happens after cleanup is complete: we copy each init
287 	 * fork file to the corresponding main fork file.  Note that if we are
288 	 * asked to do both cleanup and init, we may never get here: if the
289 	 * cleanup code determines that there are no init forks in this dbspace,
290 	 * it will return before we get to this point.
291 	 */
292 	if ((op & UNLOGGED_RELATION_INIT) != 0)
293 	{
294 		/* Open the directory. */
295 		dbspace_dir = AllocateDir(dbspacedirname);
296 		if (dbspace_dir == NULL)
297 		{
298 			/* we just saw this directory, so it really ought to be there */
299 			elog(LOG,
300 				 "could not open dbspace directory \"%s\": %m",
301 				 dbspacedirname);
302 			return;
303 		}
304 
305 		/* Scan the directory. */
306 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
307 		{
308 			ForkNumber	forkNum;
309 			int			oidchars;
310 			char		oidbuf[OIDCHARS + 1];
311 			char		srcpath[MAXPGPATH * 2];
312 			char		dstpath[MAXPGPATH];
313 
314 			/* Skip anything that doesn't look like a relation data file. */
315 			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
316 													 &forkNum))
317 				continue;
318 
319 			/* Also skip it unless this is the init fork. */
320 			if (forkNum != INIT_FORKNUM)
321 				continue;
322 
323 			/* Construct source pathname. */
324 			snprintf(srcpath, sizeof(srcpath), "%s/%s",
325 					 dbspacedirname, de->d_name);
326 
327 			/* Construct destination pathname. */
328 			memcpy(oidbuf, de->d_name, oidchars);
329 			oidbuf[oidchars] = '\0';
330 			snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
331 					 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
332 					 strlen(forkNames[INIT_FORKNUM]));
333 
334 			/* OK, we're ready to perform the actual copy. */
335 			elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
336 			copy_file(srcpath, dstpath);
337 		}
338 
339 		FreeDir(dbspace_dir);
340 
341 		/*
342 		 * copy_file() above has already called pg_flush_data() on the files
343 		 * it created. Now we need to fsync those files, because a checkpoint
344 		 * won't do it for us while we're in recovery. We do this in a
345 		 * separate pass to allow the kernel to perform all the flushes
346 		 * (especially the metadata ones) at once.
347 		 */
348 		dbspace_dir = AllocateDir(dbspacedirname);
349 		if (dbspace_dir == NULL)
350 		{
351 			/* we just saw this directory, so it really ought to be there */
352 			elog(LOG,
353 				 "could not open dbspace directory \"%s\": %m",
354 				 dbspacedirname);
355 			return;
356 		}
357 
358 		while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
359 		{
360 			ForkNumber	forkNum;
361 			int			oidchars;
362 			char		oidbuf[OIDCHARS + 1];
363 			char		mainpath[MAXPGPATH];
364 
365 			/* Skip anything that doesn't look like a relation data file. */
366 			if (!parse_filename_for_nontemp_relation(de->d_name, &oidchars,
367 													 &forkNum))
368 				continue;
369 
370 			/* Also skip it unless this is the init fork. */
371 			if (forkNum != INIT_FORKNUM)
372 				continue;
373 
374 			/* Construct main fork pathname. */
375 			memcpy(oidbuf, de->d_name, oidchars);
376 			oidbuf[oidchars] = '\0';
377 			snprintf(mainpath, sizeof(mainpath), "%s/%s%s",
378 					 dbspacedirname, oidbuf, de->d_name + oidchars + 1 +
379 					 strlen(forkNames[INIT_FORKNUM]));
380 
381 			fsync_fname(mainpath, false);
382 		}
383 
384 		FreeDir(dbspace_dir);
385 
386 		fsync_fname(dbspacedirname, true);
387 	}
388 }
389 
390 /*
391  * Basic parsing of putative relation filenames.
392  *
393  * This function returns true if the file appears to be in the correct format
394  * for a non-temporary relation and false otherwise.
395  *
396  * NB: If this function returns true, the caller is entitled to assume that
397  * *oidchars has been set to the a value no more than OIDCHARS, and thus
398  * that a buffer of OIDCHARS+1 characters is sufficient to hold the OID
399  * portion of the filename.  This is critical to protect against a possible
400  * buffer overrun.
401  */
402 static bool
parse_filename_for_nontemp_relation(const char * name,int * oidchars,ForkNumber * fork)403 parse_filename_for_nontemp_relation(const char *name, int *oidchars,
404 									ForkNumber *fork)
405 {
406 	int			pos;
407 
408 	/* Look for a non-empty string of digits (that isn't too long). */
409 	for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
410 		;
411 	if (pos == 0 || pos > OIDCHARS)
412 		return false;
413 	*oidchars = pos;
414 
415 	/* Check for a fork name. */
416 	if (name[pos] != '_')
417 		*fork = MAIN_FORKNUM;
418 	else
419 	{
420 		int			forkchar;
421 
422 		forkchar = forkname_chars(&name[pos + 1], fork);
423 		if (forkchar <= 0)
424 			return false;
425 		pos += forkchar + 1;
426 	}
427 
428 	/* Check for a segment number. */
429 	if (name[pos] == '.')
430 	{
431 		int			segchar;
432 
433 		for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
434 			;
435 		if (segchar <= 1)
436 			return false;
437 		pos += segchar;
438 	}
439 
440 	/* Now we should be at the end. */
441 	if (name[pos] != '\0')
442 		return false;
443 	return true;
444 }
445