1 /*-------------------------------------------------------------------------
2  *
3  * buffile.c
4  *	  Management of large buffered temporary files.
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * IDENTIFICATION
10  *	  src/backend/storage/file/buffile.c
11  *
12  * NOTES:
13  *
14  * BufFiles provide a very incomplete emulation of stdio atop virtual Files
15  * (as managed by fd.c).  Currently, we only support the buffered-I/O
16  * aspect of stdio: a read or write of the low-level File occurs only
17  * when the buffer is filled or emptied.  This is an even bigger win
18  * for virtual Files than for ordinary kernel files, since reducing the
19  * frequency with which a virtual File is touched reduces "thrashing"
20  * of opening/closing file descriptors.
21  *
22  * Note that BufFile structs are allocated with palloc(), and therefore
23  * will go away automatically at query/transaction end.  Since the underlying
24  * virtual Files are made with OpenTemporaryFile, all resources for
25  * the file are certain to be cleaned up even if processing is aborted
26  * by ereport(ERROR).  The data structures required are made in the
27  * palloc context that was current when the BufFile was created, and
28  * any external resources such as temp files are owned by the ResourceOwner
29  * that was current at that time.
30  *
31  * BufFile also supports temporary files that exceed the OS file size limit
32  * (by opening multiple fd.c temporary files).  This is an essential feature
33  * for sorts and hashjoins on large amounts of data.
34  *
35  * BufFile supports temporary files that can be made read-only and shared with
36  * other backends, as infrastructure for parallel execution.  Such files need
37  * to be created as a member of a SharedFileSet that all participants are
38  * attached to.
39  *-------------------------------------------------------------------------
40  */
41 
42 #include "postgres.h"
43 
44 #include "commands/tablespace.h"
45 #include "executor/instrument.h"
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "storage/buf_internals.h"
49 #include "storage/buffile.h"
50 #include "storage/fd.h"
51 #include "utils/resowner.h"
52 
/*
 * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
 * The reason is that we'd like large BufFiles to be spread across multiple
 * tablespaces when available.
 *
 * MAX_PHYSICAL_FILESIZE is the per-segment limit in bytes (0x40000000, i.e.
 * 1 GB); BUFFILE_SEG_SIZE is that same limit expressed as a count of
 * BLCKSZ-sized blocks.
 */
#define MAX_PHYSICAL_FILESIZE	0x40000000
#define BUFFILE_SEG_SIZE		(MAX_PHYSICAL_FILESIZE / BLCKSZ)
60 
/*
 * This data structure represents a buffered file that consists of one or
 * more physical files (each accessed through a virtual file descriptor
 * managed by fd.c).
 *
 * The logical file is the concatenation of the segments in files[]; a
 * logical position is expressed as a (segment index, offset in segment)
 * pair.
 */
struct BufFile
{
	int			numFiles;		/* number of physical files in set */
	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
	File	   *files;			/* palloc'd array with numFiles entries */

	bool		isInterXact;	/* keep open over transactions? */
	bool		dirty;			/* does buffer need to be written? */
	bool		readOnly;		/* has the file been set to read only? */

	SharedFileSet *fileset;		/* space for segment files if shared; NULL
								 * for a non-shared BufFile */
	const char *name;			/* name of this BufFile if shared (a
								 * pstrdup'd copy); NULL otherwise */

	/*
	 * resowner is the ResourceOwner to use for underlying temp files.  (We
	 * don't need to remember the memory context we're using explicitly,
	 * because after creation we only repalloc our arrays larger.)
	 */
	ResourceOwner resowner;

	/*
	 * "current pos" is position of start of buffer within the logical file.
	 * Position as seen by user of BufFile is (curFile, curOffset + pos).
	 */
	int			curFile;		/* file index (0..n) part of current pos */
	off_t		curOffset;		/* offset part of current pos */
	int			pos;			/* next read/write position in buffer */
	int			nbytes;			/* total # of valid bytes in buffer */
	PGAlignedBlock buffer;		/* staging area for data read from or to be
								 * written to the current segment */
};
96 
/* Forward declarations of the file-local helpers defined further down. */
static BufFile *makeBufFileCommon(int nfiles);
static BufFile *makeBufFile(File firstfile);
static void extendBufFile(BufFile *file);
static void BufFileLoadBuffer(BufFile *file);
static void BufFileDumpBuffer(BufFile *file);
static void BufFileFlush(BufFile *file);
static File MakeNewSharedSegment(BufFile *file, int segment);
104 
105 /*
106  * Create BufFile and perform the common initialization.
107  */
108 static BufFile *
makeBufFileCommon(int nfiles)109 makeBufFileCommon(int nfiles)
110 {
111 	BufFile    *file = (BufFile *) palloc(sizeof(BufFile));
112 
113 	file->numFiles = nfiles;
114 	file->isInterXact = false;
115 	file->dirty = false;
116 	file->resowner = CurrentResourceOwner;
117 	file->curFile = 0;
118 	file->curOffset = 0L;
119 	file->pos = 0;
120 	file->nbytes = 0;
121 
122 	return file;
123 }
124 
125 /*
126  * Create a BufFile given the first underlying physical file.
127  * NOTE: caller must set isInterXact if appropriate.
128  */
129 static BufFile *
makeBufFile(File firstfile)130 makeBufFile(File firstfile)
131 {
132 	BufFile    *file = makeBufFileCommon(1);
133 
134 	file->files = (File *) palloc(sizeof(File));
135 	file->files[0] = firstfile;
136 	file->readOnly = false;
137 	file->fileset = NULL;
138 	file->name = NULL;
139 
140 	return file;
141 }
142 
143 /*
144  * Add another component temp file.
145  */
146 static void
extendBufFile(BufFile * file)147 extendBufFile(BufFile *file)
148 {
149 	File		pfile;
150 	ResourceOwner oldowner;
151 
152 	/* Be sure to associate the file with the BufFile's resource owner */
153 	oldowner = CurrentResourceOwner;
154 	CurrentResourceOwner = file->resowner;
155 
156 	if (file->fileset == NULL)
157 		pfile = OpenTemporaryFile(file->isInterXact);
158 	else
159 		pfile = MakeNewSharedSegment(file, file->numFiles);
160 
161 	Assert(pfile >= 0);
162 
163 	CurrentResourceOwner = oldowner;
164 
165 	file->files = (File *) repalloc(file->files,
166 									(file->numFiles + 1) * sizeof(File));
167 	file->files[file->numFiles] = pfile;
168 	file->numFiles++;
169 }
170 
171 /*
172  * Create a BufFile for a new temporary file (which will expand to become
173  * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
174  * written to it).
175  *
176  * If interXact is true, the temp file will not be automatically deleted
177  * at end of transaction.
178  *
179  * Note: if interXact is true, the caller had better be calling us in a
180  * memory context, and with a resource owner, that will survive across
181  * transaction boundaries.
182  */
183 BufFile *
BufFileCreateTemp(bool interXact)184 BufFileCreateTemp(bool interXact)
185 {
186 	BufFile    *file;
187 	File		pfile;
188 
189 	/*
190 	 * Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
191 	 * Possibly the caller will have done this already, but it seems useful to
192 	 * double-check here.  Failure to do this at all would result in the temp
193 	 * files always getting placed in the default tablespace, which is a
194 	 * pretty hard-to-detect bug.  Callers may prefer to do it earlier if they
195 	 * want to be sure that any required catalog access is done in some other
196 	 * resource context.
197 	 */
198 	PrepareTempTablespaces();
199 
200 	pfile = OpenTemporaryFile(interXact);
201 	Assert(pfile >= 0);
202 
203 	file = makeBufFile(pfile);
204 	file->isInterXact = interXact;
205 
206 	return file;
207 }
208 
209 /*
210  * Build the name for a given segment of a given BufFile.
211  */
212 static void
SharedSegmentName(char * name,const char * buffile_name,int segment)213 SharedSegmentName(char *name, const char *buffile_name, int segment)
214 {
215 	snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
216 }
217 
218 /*
219  * Create a new segment file backing a shared BufFile.
220  */
221 static File
MakeNewSharedSegment(BufFile * buffile,int segment)222 MakeNewSharedSegment(BufFile *buffile, int segment)
223 {
224 	char		name[MAXPGPATH];
225 	File		file;
226 
227 	/*
228 	 * It is possible that there are files left over from before a crash
229 	 * restart with the same name.  In order for BufFileOpenShared() not to
230 	 * get confused about how many segments there are, we'll unlink the next
231 	 * segment number if it already exists.
232 	 */
233 	SharedSegmentName(name, buffile->name, segment + 1);
234 	SharedFileSetDelete(buffile->fileset, name, true);
235 
236 	/* Create the new segment. */
237 	SharedSegmentName(name, buffile->name, segment);
238 	file = SharedFileSetCreate(buffile->fileset, name);
239 
240 	/* SharedFileSetCreate would've errored out */
241 	Assert(file > 0);
242 
243 	return file;
244 }
245 
246 /*
247  * Create a BufFile that can be discovered and opened read-only by other
248  * backends that are attached to the same SharedFileSet using the same name.
249  *
250  * The naming scheme for shared BufFiles is left up to the calling code.  The
251  * name will appear as part of one or more filenames on disk, and might
252  * provide clues to administrators about which subsystem is generating
253  * temporary file data.  Since each SharedFileSet object is backed by one or
254  * more uniquely named temporary directory, names don't conflict with
255  * unrelated SharedFileSet objects.
256  */
257 BufFile *
BufFileCreateShared(SharedFileSet * fileset,const char * name)258 BufFileCreateShared(SharedFileSet *fileset, const char *name)
259 {
260 	BufFile    *file;
261 
262 	file = makeBufFileCommon(1);
263 	file->fileset = fileset;
264 	file->name = pstrdup(name);
265 	file->files = (File *) palloc(sizeof(File));
266 	file->files[0] = MakeNewSharedSegment(file, 0);
267 	file->readOnly = false;
268 
269 	return file;
270 }
271 
272 /*
273  * Open a file that was previously created in another backend (or this one)
274  * with BufFileCreateShared in the same SharedFileSet using the same name.
275  * The backend that created the file must have called BufFileClose() or
276  * BufFileExportShared() to make sure that it is ready to be opened by other
277  * backends and render it read-only.
278  */
279 BufFile *
BufFileOpenShared(SharedFileSet * fileset,const char * name)280 BufFileOpenShared(SharedFileSet *fileset, const char *name)
281 {
282 	BufFile    *file;
283 	char		segment_name[MAXPGPATH];
284 	Size		capacity = 16;
285 	File	   *files;
286 	int			nfiles = 0;
287 
288 	files = palloc(sizeof(File) * capacity);
289 
290 	/*
291 	 * We don't know how many segments there are, so we'll probe the
292 	 * filesystem to find out.
293 	 */
294 	for (;;)
295 	{
296 		/* See if we need to expand our file segment array. */
297 		if (nfiles + 1 > capacity)
298 		{
299 			capacity *= 2;
300 			files = repalloc(files, sizeof(File) * capacity);
301 		}
302 		/* Try to load a segment. */
303 		SharedSegmentName(segment_name, name, nfiles);
304 		files[nfiles] = SharedFileSetOpen(fileset, segment_name);
305 		if (files[nfiles] <= 0)
306 			break;
307 		++nfiles;
308 
309 		CHECK_FOR_INTERRUPTS();
310 	}
311 
312 	/*
313 	 * If we didn't find any files at all, then no BufFile exists with this
314 	 * name.
315 	 */
316 	if (nfiles == 0)
317 		ereport(ERROR,
318 				(errcode_for_file_access(),
319 				 errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
320 						segment_name, name)));
321 
322 	file = makeBufFileCommon(nfiles);
323 	file->files = files;
324 	file->readOnly = true;		/* Can't write to files opened this way */
325 	file->fileset = fileset;
326 	file->name = pstrdup(name);
327 
328 	return file;
329 }
330 
331 /*
332  * Delete a BufFile that was created by BufFileCreateShared in the given
333  * SharedFileSet using the given name.
334  *
335  * It is not necessary to delete files explicitly with this function.  It is
336  * provided only as a way to delete files proactively, rather than waiting for
337  * the SharedFileSet to be cleaned up.
338  *
339  * Only one backend should attempt to delete a given name, and should know
340  * that it exists and has been exported or closed.
341  */
342 void
BufFileDeleteShared(SharedFileSet * fileset,const char * name)343 BufFileDeleteShared(SharedFileSet *fileset, const char *name)
344 {
345 	char		segment_name[MAXPGPATH];
346 	int			segment = 0;
347 	bool		found = false;
348 
349 	/*
350 	 * We don't know how many segments the file has.  We'll keep deleting
351 	 * until we run out.  If we don't manage to find even an initial segment,
352 	 * raise an error.
353 	 */
354 	for (;;)
355 	{
356 		SharedSegmentName(segment_name, name, segment);
357 		if (!SharedFileSetDelete(fileset, segment_name, true))
358 			break;
359 		found = true;
360 		++segment;
361 
362 		CHECK_FOR_INTERRUPTS();
363 	}
364 
365 	if (!found)
366 		elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name);
367 }
368 
369 /*
370  * BufFileExportShared --- flush and make read-only, in preparation for sharing.
371  */
372 void
BufFileExportShared(BufFile * file)373 BufFileExportShared(BufFile *file)
374 {
375 	/* Must be a file belonging to a SharedFileSet. */
376 	Assert(file->fileset != NULL);
377 
378 	/* It's probably a bug if someone calls this twice. */
379 	Assert(!file->readOnly);
380 
381 	BufFileFlush(file);
382 	file->readOnly = true;
383 }
384 
385 /*
386  * Close a BufFile
387  *
388  * Like fclose(), this also implicitly FileCloses the underlying File.
389  */
390 void
BufFileClose(BufFile * file)391 BufFileClose(BufFile *file)
392 {
393 	int			i;
394 
395 	/* flush any unwritten data */
396 	BufFileFlush(file);
397 	/* close and delete the underlying file(s) */
398 	for (i = 0; i < file->numFiles; i++)
399 		FileClose(file->files[i]);
400 	/* release the buffer space */
401 	pfree(file->files);
402 	pfree(file);
403 }
404 
405 /*
406  * BufFileLoadBuffer
407  *
408  * Load some data into buffer, if possible, starting from curOffset.
409  * At call, must have dirty = false, pos and nbytes = 0.
410  * On exit, nbytes is number of bytes loaded.
411  */
412 static void
BufFileLoadBuffer(BufFile * file)413 BufFileLoadBuffer(BufFile *file)
414 {
415 	File		thisfile;
416 
417 	/*
418 	 * Advance to next component file if necessary and possible.
419 	 */
420 	if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
421 		file->curFile + 1 < file->numFiles)
422 	{
423 		file->curFile++;
424 		file->curOffset = 0L;
425 	}
426 
427 	/*
428 	 * Read whatever we can get, up to a full bufferload.
429 	 */
430 	thisfile = file->files[file->curFile];
431 	file->nbytes = FileRead(thisfile,
432 							file->buffer.data,
433 							sizeof(file->buffer),
434 							file->curOffset,
435 							WAIT_EVENT_BUFFILE_READ);
436 	if (file->nbytes < 0)
437 	{
438 		file->nbytes = 0;
439 		ereport(ERROR,
440 				(errcode_for_file_access(),
441 				 errmsg("could not read file \"%s\": %m",
442 						FilePathName(thisfile))));
443 	}
444 
445 	/* we choose not to advance curOffset here */
446 
447 	if (file->nbytes > 0)
448 		pgBufferUsage.temp_blks_read++;
449 }
450 
451 /*
452  * BufFileDumpBuffer
453  *
454  * Dump buffer contents starting at curOffset.
455  * At call, should have dirty = true, nbytes > 0.
456  * On exit, dirty is cleared if successful write, and curOffset is advanced.
457  */
458 static void
BufFileDumpBuffer(BufFile * file)459 BufFileDumpBuffer(BufFile *file)
460 {
461 	int			wpos = 0;
462 	int			bytestowrite;
463 	File		thisfile;
464 
465 	/*
466 	 * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
467 	 * crosses a component-file boundary; so we need a loop.
468 	 */
469 	while (wpos < file->nbytes)
470 	{
471 		off_t		availbytes;
472 
473 		/*
474 		 * Advance to next component file if necessary and possible.
475 		 */
476 		if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
477 		{
478 			while (file->curFile + 1 >= file->numFiles)
479 				extendBufFile(file);
480 			file->curFile++;
481 			file->curOffset = 0L;
482 		}
483 
484 		/*
485 		 * Determine how much we need to write into this file.
486 		 */
487 		bytestowrite = file->nbytes - wpos;
488 		availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
489 
490 		if ((off_t) bytestowrite > availbytes)
491 			bytestowrite = (int) availbytes;
492 
493 		thisfile = file->files[file->curFile];
494 		bytestowrite = FileWrite(thisfile,
495 								 file->buffer.data + wpos,
496 								 bytestowrite,
497 								 file->curOffset,
498 								 WAIT_EVENT_BUFFILE_WRITE);
499 		if (bytestowrite <= 0)
500 			ereport(ERROR,
501 					(errcode_for_file_access(),
502 					 errmsg("could not write to file \"%s\": %m",
503 							FilePathName(thisfile))));
504 		file->curOffset += bytestowrite;
505 		wpos += bytestowrite;
506 
507 		pgBufferUsage.temp_blks_written++;
508 	}
509 	file->dirty = false;
510 
511 	/*
512 	 * At this point, curOffset has been advanced to the end of the buffer,
513 	 * ie, its original value + nbytes.  We need to make it point to the
514 	 * logical file position, ie, original value + pos, in case that is less
515 	 * (as could happen due to a small backwards seek in a dirty buffer!)
516 	 */
517 	file->curOffset -= (file->nbytes - file->pos);
518 	if (file->curOffset < 0)	/* handle possible segment crossing */
519 	{
520 		file->curFile--;
521 		Assert(file->curFile >= 0);
522 		file->curOffset += MAX_PHYSICAL_FILESIZE;
523 	}
524 
525 	/*
526 	 * Now we can set the buffer empty without changing the logical position
527 	 */
528 	file->pos = 0;
529 	file->nbytes = 0;
530 }
531 
532 /*
533  * BufFileRead
534  *
535  * Like fread() except we assume 1-byte element size and report I/O errors via
536  * ereport().
537  */
538 size_t
BufFileRead(BufFile * file,void * ptr,size_t size)539 BufFileRead(BufFile *file, void *ptr, size_t size)
540 {
541 	size_t		nread = 0;
542 	size_t		nthistime;
543 
544 	BufFileFlush(file);
545 
546 	while (size > 0)
547 	{
548 		if (file->pos >= file->nbytes)
549 		{
550 			/* Try to load more data into buffer. */
551 			file->curOffset += file->pos;
552 			file->pos = 0;
553 			file->nbytes = 0;
554 			BufFileLoadBuffer(file);
555 			if (file->nbytes <= 0)
556 				break;			/* no more data available */
557 		}
558 
559 		nthistime = file->nbytes - file->pos;
560 		if (nthistime > size)
561 			nthistime = size;
562 		Assert(nthistime > 0);
563 
564 		memcpy(ptr, file->buffer.data + file->pos, nthistime);
565 
566 		file->pos += nthistime;
567 		ptr = (void *) ((char *) ptr + nthistime);
568 		size -= nthistime;
569 		nread += nthistime;
570 	}
571 
572 	return nread;
573 }
574 
575 /*
576  * BufFileWrite
577  *
578  * Like fwrite() except we assume 1-byte element size and report errors via
579  * ereport().
580  */
581 size_t
BufFileWrite(BufFile * file,void * ptr,size_t size)582 BufFileWrite(BufFile *file, void *ptr, size_t size)
583 {
584 	size_t		nwritten = 0;
585 	size_t		nthistime;
586 
587 	Assert(!file->readOnly);
588 
589 	while (size > 0)
590 	{
591 		if (file->pos >= BLCKSZ)
592 		{
593 			/* Buffer full, dump it out */
594 			if (file->dirty)
595 				BufFileDumpBuffer(file);
596 			else
597 			{
598 				/* Hmm, went directly from reading to writing? */
599 				file->curOffset += file->pos;
600 				file->pos = 0;
601 				file->nbytes = 0;
602 			}
603 		}
604 
605 		nthistime = BLCKSZ - file->pos;
606 		if (nthistime > size)
607 			nthistime = size;
608 		Assert(nthistime > 0);
609 
610 		memcpy(file->buffer.data + file->pos, ptr, nthistime);
611 
612 		file->dirty = true;
613 		file->pos += nthistime;
614 		if (file->nbytes < file->pos)
615 			file->nbytes = file->pos;
616 		ptr = (void *) ((char *) ptr + nthistime);
617 		size -= nthistime;
618 		nwritten += nthistime;
619 	}
620 
621 	return nwritten;
622 }
623 
624 /*
625  * BufFileFlush
626  *
627  * Like fflush(), except that I/O errors are reported with ereport().
628  */
629 static void
BufFileFlush(BufFile * file)630 BufFileFlush(BufFile *file)
631 {
632 	if (file->dirty)
633 		BufFileDumpBuffer(file);
634 
635 	Assert(!file->dirty);
636 }
637 
638 /*
639  * BufFileSeek
640  *
641  * Like fseek(), except that target position needs two values in order to
642  * work when logical filesize exceeds maximum value representable by off_t.
643  * We do not support relative seeks across more than that, however.
644  * I/O errors are reported by ereport().
645  *
646  * Result is 0 if OK, EOF if not.  Logical position is not moved if an
647  * impossible seek is attempted.
648  */
649 int
BufFileSeek(BufFile * file,int fileno,off_t offset,int whence)650 BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
651 {
652 	int			newFile;
653 	off_t		newOffset;
654 
655 	switch (whence)
656 	{
657 		case SEEK_SET:
658 			if (fileno < 0)
659 				return EOF;
660 			newFile = fileno;
661 			newOffset = offset;
662 			break;
663 		case SEEK_CUR:
664 
665 			/*
666 			 * Relative seek considers only the signed offset, ignoring
667 			 * fileno. Note that large offsets (> 1 GB) risk overflow in this
668 			 * add, unless we have 64-bit off_t.
669 			 */
670 			newFile = file->curFile;
671 			newOffset = (file->curOffset + file->pos) + offset;
672 			break;
673 #ifdef NOT_USED
674 		case SEEK_END:
675 			/* could be implemented, not needed currently */
676 			break;
677 #endif
678 		default:
679 			elog(ERROR, "invalid whence: %d", whence);
680 			return EOF;
681 	}
682 	while (newOffset < 0)
683 	{
684 		if (--newFile < 0)
685 			return EOF;
686 		newOffset += MAX_PHYSICAL_FILESIZE;
687 	}
688 	if (newFile == file->curFile &&
689 		newOffset >= file->curOffset &&
690 		newOffset <= file->curOffset + file->nbytes)
691 	{
692 		/*
693 		 * Seek is to a point within existing buffer; we can just adjust
694 		 * pos-within-buffer, without flushing buffer.  Note this is OK
695 		 * whether reading or writing, but buffer remains dirty if we were
696 		 * writing.
697 		 */
698 		file->pos = (int) (newOffset - file->curOffset);
699 		return 0;
700 	}
701 	/* Otherwise, must reposition buffer, so flush any dirty data */
702 	BufFileFlush(file);
703 
704 	/*
705 	 * At this point and no sooner, check for seek past last segment. The
706 	 * above flush could have created a new segment, so checking sooner would
707 	 * not work (at least not with this code).
708 	 */
709 
710 	/* convert seek to "start of next seg" to "end of last seg" */
711 	if (newFile == file->numFiles && newOffset == 0)
712 	{
713 		newFile--;
714 		newOffset = MAX_PHYSICAL_FILESIZE;
715 	}
716 	while (newOffset > MAX_PHYSICAL_FILESIZE)
717 	{
718 		if (++newFile >= file->numFiles)
719 			return EOF;
720 		newOffset -= MAX_PHYSICAL_FILESIZE;
721 	}
722 	if (newFile >= file->numFiles)
723 		return EOF;
724 	/* Seek is OK! */
725 	file->curFile = newFile;
726 	file->curOffset = newOffset;
727 	file->pos = 0;
728 	file->nbytes = 0;
729 	return 0;
730 }
731 
732 void
BufFileTell(BufFile * file,int * fileno,off_t * offset)733 BufFileTell(BufFile *file, int *fileno, off_t *offset)
734 {
735 	*fileno = file->curFile;
736 	*offset = file->curOffset + file->pos;
737 }
738 
739 /*
740  * BufFileSeekBlock --- block-oriented seek
741  *
742  * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
743  * the file.  Note that users of this interface will fail if their files
744  * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
745  * with tables bigger than that, either...
746  *
747  * Result is 0 if OK, EOF if not.  Logical position is not moved if an
748  * impossible seek is attempted.
749  */
750 int
BufFileSeekBlock(BufFile * file,long blknum)751 BufFileSeekBlock(BufFile *file, long blknum)
752 {
753 	return BufFileSeek(file,
754 					   (int) (blknum / BUFFILE_SEG_SIZE),
755 					   (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
756 					   SEEK_SET);
757 }
758 
759 #ifdef NOT_USED
760 /*
761  * BufFileTellBlock --- block-oriented tell
762  *
763  * Any fractional part of a block in the current seek position is ignored.
764  */
765 long
BufFileTellBlock(BufFile * file)766 BufFileTellBlock(BufFile *file)
767 {
768 	long		blknum;
769 
770 	blknum = (file->curOffset + file->pos) / BLCKSZ;
771 	blknum += file->curFile * BUFFILE_SEG_SIZE;
772 	return blknum;
773 }
774 
775 #endif
776 
777 /*
778  * Return the current shared BufFile size.
779  *
780  * Counts any holes left behind by BufFileAppend as part of the size.
781  * ereport()s on failure.
782  */
783 int64
BufFileSize(BufFile * file)784 BufFileSize(BufFile *file)
785 {
786 	int64		lastFileSize;
787 
788 	Assert(file->fileset != NULL);
789 
790 	/* Get the size of the last physical file. */
791 	lastFileSize = FileSize(file->files[file->numFiles - 1]);
792 	if (lastFileSize < 0)
793 		ereport(ERROR,
794 				(errcode_for_file_access(),
795 				 errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
796 						FilePathName(file->files[file->numFiles - 1]),
797 						file->name)));
798 
799 	return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) +
800 		lastFileSize;
801 }
802 
803 /*
804  * Append the contents of source file (managed within shared fileset) to
805  * end of target file (managed within same shared fileset).
806  *
807  * Note that operation subsumes ownership of underlying resources from
808  * "source".  Caller should never call BufFileClose against source having
809  * called here first.  Resource owners for source and target must match,
810  * too.
811  *
812  * This operation works by manipulating lists of segment files, so the
813  * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
814  * boundary, typically creating empty holes before the boundary.  These
815  * areas do not contain any interesting data, and cannot be read from by
816  * caller.
817  *
818  * Returns the block number within target where the contents of source
819  * begins.  Caller should apply this as an offset when working off block
820  * positions that are in terms of the original BufFile space.
821  */
822 long
BufFileAppend(BufFile * target,BufFile * source)823 BufFileAppend(BufFile *target, BufFile *source)
824 {
825 	long		startBlock = target->numFiles * BUFFILE_SEG_SIZE;
826 	int			newNumFiles = target->numFiles + source->numFiles;
827 	int			i;
828 
829 	Assert(target->fileset != NULL);
830 	Assert(source->readOnly);
831 	Assert(!source->dirty);
832 	Assert(source->fileset != NULL);
833 
834 	if (target->resowner != source->resowner)
835 		elog(ERROR, "could not append BufFile with non-matching resource owner");
836 
837 	target->files = (File *)
838 		repalloc(target->files, sizeof(File) * newNumFiles);
839 	for (i = target->numFiles; i < newNumFiles; i++)
840 		target->files[i] = source->files[i - target->numFiles];
841 	target->numFiles = newNumFiles;
842 
843 	return startBlock;
844 }
845