1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.c
4  *	  Generalized tuple sorting routines.
5  *
6  * This module handles sorting of heap tuples, index tuples, or single
7  * Datums (and could easily support other kinds of sortable objects,
8  * if necessary).  It works efficiently for both small and large amounts
9  * of data.  Small amounts are sorted in-memory using qsort().  Large
10  * amounts are sorted using temporary files and a standard external sort
11  * algorithm.
12  *
13  * See Knuth, volume 3, for more than you want to know about the external
14  * sorting algorithm.  Historically, we divided the input into sorted runs
15  * using replacement selection, in the form of a priority tree implemented
16  * as a heap (essentially his Algorithm 5.2.3H), but now we only do that
17  * for the first run, and only if the run would otherwise end up being very
18  * short.  We merge the runs using polyphase merge, Knuth's Algorithm
19  * 5.4.2D.  The logical "tapes" used by Algorithm D are implemented by
20  * logtape.c, which avoids space wastage by recycling disk space as soon
21  * as each block is read from its "tape".
22  *
23  * We do not use Knuth's recommended data structure (Algorithm 5.4.1R) for
24  * the replacement selection, because it uses a fixed number of records
25  * in memory at all times.  Since we are dealing with tuples that may vary
26  * considerably in size, we want to be able to vary the number of records
27  * kept in memory to ensure full utilization of the allowed sort memory
28  * space.  So, we keep the tuples in a variable-size heap, with the next
29  * record to go out at the top of the heap.  Like Algorithm 5.4.1R, each
30  * record is stored with the run number that it must go into, and we use
31  * (run number, key) as the ordering key for the heap.  When the run number
32  * at the top of the heap changes, we know that no more records of the prior
33  * run are left in the heap.  Note that there are in practice only ever two
34  * distinct run numbers, because, since PostgreSQL 9.6, we only use
35  * replacement selection to form the first run.
36  *
37  * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small
38  * customizations) is only used with the aim of producing just one run,
39  * thereby avoiding all merging.  Only the first run can use replacement
40  * selection, which is why there are now only two possible valid run
41  * numbers, and why heapification is customized to not distinguish between
42  * tuples in the second run (those will be quicksorted).  We generally
43  * prefer a simple hybrid sort-merge strategy, where runs are sorted in much
44  * the same way as the entire input of an internal sort is sorted (using
45  * qsort()).  The replacement_sort_tuples GUC controls the limited remaining
46  * use of replacement selection for the first run.
47  *
48  * There are several reasons to favor a hybrid sort-merge strategy.
49  * Maintaining a priority tree/heap has poor CPU cache characteristics.
50  * Furthermore, the growth in main memory sizes has greatly diminished the
51  * value of having runs that are larger than available memory, even in the
52  * case where there is partially sorted input and runs can be made far
53  * larger by using a heap.  In most cases, a single-pass merge step is all
54  * that is required even when runs are no larger than available memory.
55  * Avoiding multiple merge passes was traditionally considered to be the
56  * major advantage of using replacement selection.
57  *
58  * The approximate amount of memory allowed for any one sort operation
59  * is specified in kilobytes by the caller (most pass work_mem).  Initially,
60  * we absorb tuples and simply store them in an unsorted array as long as
61  * we haven't exceeded workMem.  If we reach the end of the input without
62  * exceeding workMem, we sort the array using qsort() and subsequently return
63  * tuples just by scanning the tuple array sequentially.  If we do exceed
64  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
65  * When tuples are dumped in batch after quicksorting, we begin a new run
66  * with a new output tape (selected per Algorithm D).  After the end of the
67  * input is reached, we dump out remaining tuples in memory into a final run
68  * (or two, when replacement selection is still used), then merge the runs
69  * using Algorithm D.
70  *
71  * When merging runs, we use a heap containing just the frontmost tuple from
72  * each source run; we repeatedly output the smallest tuple and replace it
73  * with the next tuple from its source tape (if any).  When the heap empties,
74  * the merge is complete.  The basic merge algorithm thus needs very little
75  * memory --- only M tuples for an M-way merge, and M is constrained to a
76  * small number.  However, we can still make good use of our full workMem
77  * allocation by pre-reading additional blocks from each source tape.  Without
78  * prereading, our access pattern to the temporary file would be very erratic;
79  * on average we'd read one block from each of M source tapes during the same
80  * time that we're writing M blocks to the output tape, so there is no
81  * sequentiality of access at all, defeating the read-ahead methods used by
82  * most Unix kernels.  Worse, the output tape gets written into a very random
83  * sequence of blocks of the temp file, ensuring that things will be even
84  * worse when it comes time to read that tape.  A straightforward merge pass
85  * thus ends up doing a lot of waiting for disk seeks.  We can improve matters
86  * by prereading from each source tape sequentially, loading about workMem/M
87  * bytes from each tape in turn, and making the sequential blocks immediately
88  * available for reuse.  This approach helps to localize both read and write
89  * accesses.  The pre-reading is handled by logtape.c; we just tell it how
90  * much memory to use for the buffers.
91  *
92  * When the caller requests random access to the sort result, we form
93  * the final sorted run on a logical tape which is then "frozen", so
94  * that we can access it randomly.  When the caller does not need random
95  * access, we return from tuplesort_performsort() as soon as we are down
96  * to one run per logical tape.  The final merge is then performed
97  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
98  * saves one cycle of writing all the data out to disk and reading it in.
99  *
100  * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
101  * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
102  * to Knuth's figure 70 (section 5.4.2).  However, Knuth is assuming that
103  * tape drives are expensive beasts, and in particular that there will always
104  * be many more runs than tape drives.  In our implementation a "tape drive"
105  * doesn't cost much more than a few Kb of memory buffers, so we can afford
106  * to have lots of them.  In particular, if we can have as many tape drives
107  * as sorted runs, we can eliminate any repeated I/O at all.  In the current
108  * code we determine the number of tapes M on the basis of workMem: we want
109  * workMem/M to be large enough that we read a fair amount of data each time
110  * we preread from a tape, so as to maintain the locality of access described
111  * above.  Nonetheless, with large workMem we can have many tapes (but not
112  * too many -- see the comments in tuplesort_merge_order).
113  *
114  *
115  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
116  * Portions Copyright (c) 1994, Regents of the University of California
117  *
118  * IDENTIFICATION
119  *	  src/backend/utils/sort/tuplesort.c
120  *
121  *-------------------------------------------------------------------------
122  */
123 
124 #include "postgres.h"
125 
126 #include <limits.h>
127 
128 #include "access/htup_details.h"
129 #include "access/nbtree.h"
130 #include "access/hash.h"
131 #include "catalog/index.h"
132 #include "catalog/pg_am.h"
133 #include "commands/tablespace.h"
134 #include "executor/executor.h"
135 #include "miscadmin.h"
136 #include "pg_trace.h"
137 #include "utils/datum.h"
138 #include "utils/logtape.h"
139 #include "utils/lsyscache.h"
140 #include "utils/memutils.h"
141 #include "utils/pg_rusage.h"
142 #include "utils/rel.h"
143 #include "utils/sortsupport.h"
144 #include "utils/tuplesort.h"
145 
146 
147 /* sort-type codes for sort__start probes */
148 #define HEAP_SORT		0
149 #define INDEX_SORT		1
150 #define DATUM_SORT		2
151 #define CLUSTER_SORT	3
152 
153 /* GUC variables */
154 #ifdef TRACE_SORT
155 bool		trace_sort = false;
156 #endif
157 
158 #ifdef DEBUG_BOUNDED_SORT
159 bool		optimize_bounded_sort = true;
160 #endif
161 
162 
163 /*
164  * The objects we actually sort are SortTuple structs.  These contain
165  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
166  * which is a separate palloc chunk --- we assume it is just one chunk and
167  * can be freed by a simple pfree() (except during merge, when we use a
168  * simple slab allocator).  SortTuples also contain the tuple's first key
169  * column in Datum/nullflag format, and an index integer.
170  *
171  * Storing the first key column lets us save heap_getattr or index_getattr
172  * calls during tuple comparisons.  We could extract and save all the key
173  * columns not just the first, but this would increase code complexity and
174  * overhead, and wouldn't actually save any comparison cycles in the common
175  * case where the first key determines the comparison result.  Note that
176  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
177  *
178  * There is one special case: when the sort support infrastructure provides an
179  * "abbreviated key" representation, where the key is (typically) a pass by
180  * value proxy for a pass by reference type.  In this case, the abbreviated key
181  * is stored in datum1 in place of the actual first key column.
182  *
183  * When sorting single Datums, the data value is represented directly by
184  * datum1/isnull1 for pass by value types (or null values).  If the datatype is
185  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
186  * palloc'd data value, otherwise "tuple" is NULL.  The value of datum1 is then
187  * either the same pointer as "tuple", or is an abbreviated key value as
188  * described above.  Accordingly, "tuple" is always used in preference to
189  * datum1 as the authoritative value for pass-by-reference cases.
190  *
191  * While building initial runs, tupindex holds the tuple's run number.
192  * Historically, the run number could meaningfully distinguish many runs, but
193  * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement
194  * selection is always abandoned after the first run; no other run number
195  * should be represented here.  During merge passes, we re-use it to hold the
196  * input tape number that each tuple in the heap was read from.  tupindex goes
197  * unused if the sort occurs entirely in memory.
198  */
199 typedef struct
200 {
201 	void	   *tuple;			/* the tuple itself */
202 	Datum		datum1;			/* value of first key column */
203 	bool		isnull1;		/* is first key column NULL? */
204 	int			tupindex;		/* see notes above */
205 } SortTuple;
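
/*
 * To illustrate how datum1/isnull1 are used (a sketch of the pattern the
 * comparetup_XXX routines follow, not the routines themselves): the first
 * key, or its abbreviation, is compared without touching the tuple, and
 * only on a tie do we deform the tuple to compare the remaining keys.
 *
 *		compare = ApplySortComparator(a->datum1, a->isnull1,
 *									  b->datum1, b->isnull1,
 *									  state->sortKeys);
 *		if (compare != 0)
 *			return compare;
 *		... otherwise fetch further columns with heap_getattr/index_getattr ...
 */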
206 
207 /*
208  * During merge, we use a pre-allocated set of fixed-size slots to hold
209  * tuples, to avoid palloc/pfree overhead.
210  *
211  * Merge doesn't require a lot of memory, so we can afford to waste some,
212  * by using gratuitously-sized slots.  If a tuple is larger than 1 kB, the
213  * palloc() overhead is not significant anymore.
214  *
215  * 'nextfree' is valid when this chunk is in the free list.  When in use, the
216  * slot holds a tuple.
217  */
218 #define SLAB_SLOT_SIZE 1024
219 
220 typedef union SlabSlot
221 {
222 	union SlabSlot *nextfree;
223 	char		buffer[SLAB_SLOT_SIZE];
224 } SlabSlot;
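
/*
 * A sketch of how a slot is handed out (readtup_alloc() is the real code;
 * this is only illustrative): small tuples pop a slot off the free list,
 * while anything larger than SLAB_SLOT_SIZE falls back to an ordinary
 * allocation.
 *
 *		SlabSlot   *buf = state->slabFreeHead;
 *
 *		if (tuplen > SLAB_SLOT_SIZE || buf == NULL)
 *			return MemoryContextAlloc(state->sortcontext, tuplen);
 *		state->slabFreeHead = buf->nextfree;
 *		return buf;
 */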
225 
226 /*
227  * Possible states of a Tuplesort object.  These denote the states that
228  * persist between calls of Tuplesort routines.
229  */
230 typedef enum
231 {
232 	TSS_INITIAL,				/* Loading tuples; still within memory limit */
233 	TSS_BOUNDED,				/* Loading tuples into bounded-size heap */
234 	TSS_BUILDRUNS,				/* Loading tuples; writing to tape */
235 	TSS_SORTEDINMEM,			/* Sort completed entirely in memory */
236 	TSS_SORTEDONTAPE,			/* Sort completed, final run is on tape */
237 	TSS_FINALMERGE				/* Performing final merge on-the-fly */
238 } TupSortStatus;
239 
240 /*
241  * Parameters for calculation of number of tapes to use --- see inittapes()
242  * and tuplesort_merge_order().
243  *
244  * In this calculation we assume that each tape will cost us about one block's
245  * worth of buffer space.  This ignores the overhead of all the other data
246  * structures needed for each tape, but it's probably close enough.
247  *
248  * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
249  * tape during a preread cycle (see discussion at top of file).
250  */
251 #define MINORDER		6		/* minimum merge order */
252 #define MAXORDER		500		/* maximum merge order */
253 #define TAPE_BUFFER_OVERHEAD		BLCKSZ
254 #define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
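
/*
 * Illustrative arithmetic (a rough sketch; the merge-order calculation in
 * tuplesort_merge_order() is authoritative): with the default BLCKSZ of
 * 8192, MERGE_BUFFER_SIZE is 256 kB and TAPE_BUFFER_OVERHEAD is 8 kB, so
 * each merge input tape wants roughly 264 kB of workMem.  A workMem of
 * 64 MB therefore supports a merge order in the neighborhood of 248 tapes,
 * comfortably inside the MINORDER..MAXORDER range, while a 1 MB workMem
 * would be clamped up to MINORDER.
 */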
255 
256 /*
257  * Run numbers, used during external sort operations.
258  *
259  * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun.
260  */
261 #define RUN_FIRST		0
262 #define HEAP_RUN_NEXT	INT_MAX
263 #define RUN_SECOND		1
264 
265 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
266 									Tuplesortstate *state);
267 
268 /*
269  * Private state of a Tuplesort operation.
270  */
271 struct Tuplesortstate
272 {
273 	TupSortStatus status;		/* enumerated value as shown above */
274 	int			nKeys;			/* number of columns in sort key */
275 	bool		randomAccess;	/* did caller request random access? */
276 	bool		bounded;		/* did caller specify a maximum number of
277 								 * tuples to return? */
278 	bool		boundUsed;		/* true if we made use of a bounded heap */
279 	int			bound;			/* if bounded, the maximum number of tuples */
280 	bool		tuples;			/* Can SortTuple.tuple ever be set? */
281 	int64		availMem;		/* remaining memory available, in bytes */
282 	int64		allowedMem;		/* total memory allowed, in bytes */
283 	int			maxTapes;		/* number of tapes (Knuth's T) */
284 	int			tapeRange;		/* maxTapes-1 (Knuth's P) */
285 	MemoryContext sortcontext;	/* memory context holding most sort data */
286 	MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
287 	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
288 
289 	/*
290 	 * These function pointers decouple the routines that must know what kind
291 	 * of tuple we are sorting from the routines that don't need to know it.
292 	 * They are set up by the tuplesort_begin_xxx routines.
293 	 *
294 	 * Function to compare two tuples; result is per qsort() convention, ie:
295 	 * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
296 	 * qsort_arg_comparator.
297 	 */
298 	SortTupleComparator comparetup;
299 
300 	/*
301 	 * Function to copy a supplied input tuple into palloc'd space and set up
302 	 * its SortTuple representation (ie, set tuple/datum1/isnull1).  Also,
303 	 * state->availMem must be decreased by the amount of space used for the
304 	 * tuple copy (note the SortTuple struct itself is not counted).
305 	 */
306 	void		(*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
307 
308 	/*
309 	 * Function to write a stored tuple onto tape.  The representation of the
310 	 * tuple on tape need not be the same as it is in memory; requirements on
311 	 * the tape representation are given below.  Unless the slab allocator is
312 	 * used, after writing the tuple, pfree() the out-of-line data (not the
313 	 * SortTuple struct!), and increase state->availMem by the amount of
314 	 * memory space thereby released.
315 	 */
316 	void		(*writetup) (Tuplesortstate *state, int tapenum,
317 							 SortTuple *stup);
318 
319 	/*
320 	 * Function to read a stored tuple from tape back into memory. 'len' is
321 	 * the already-read length of the stored tuple.  The tuple is allocated
322 	 * from the slab memory arena, or is palloc'd, see readtup_alloc().
323 	 */
324 	void		(*readtup) (Tuplesortstate *state, SortTuple *stup,
325 							int tapenum, unsigned int len);
326 
327 	/*
328 	 * This array holds the tuples now in sort memory.  If we are in state
329 	 * INITIAL, the tuples are in no particular order; if we are in state
330 	 * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
331 	 * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
332 	 * H.  In state SORTEDONTAPE, the array is not used.
333 	 */
334 	SortTuple  *memtuples;		/* array of SortTuple structs */
335 	int			memtupcount;	/* number of tuples currently present */
336 	int			memtupsize;		/* allocated length of memtuples array */
337 	bool		growmemtuples;	/* memtuples' growth still underway? */
338 
339 	/*
340 	 * Memory for tuples is sometimes allocated using a simple slab allocator,
341 	 * rather than with palloc().  Currently, we switch to slab allocation
342 	 * when we start merging.  Merging only needs to keep a small, fixed
343 	 * number of tuples in memory at any time, so we can avoid the
344 	 * palloc/pfree overhead by recycling a fixed number of fixed-size slots
345 	 * to hold the tuples.
346 	 *
347 	 * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE
348 	 * slots.  The allocation is sized to have one slot per tape, plus one
349 	 * additional slot.  We need that many slots to hold all the tuples kept
350 	 * in the heap during merge, plus the one most recently returned from the
351 	 * sort with tuplesort_gettuple.
352 	 *
353 	 * Initially, all the slots are kept in a linked list of free slots.  When
354 	 * a tuple is read from a tape, it is put in the next available slot, if
355 	 * it fits.  If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
356 	 * instead.
357 	 *
358 	 * When we're done processing a tuple, we return the slot to the free
359 	 * list, or pfree() it if it was palloc'd.  We know that a tuple was
360 	 * allocated from the slab if its pointer value is between
361 	 * slabMemoryBegin and slabMemoryEnd.
362 	 *
363 	 * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
364 	 * tracking memory usage is not used.
365 	 */
366 	bool		slabAllocatorUsed;
367 
368 	char	   *slabMemoryBegin;	/* beginning of slab memory arena */
369 	char	   *slabMemoryEnd;	/* end of slab memory arena */
370 	SlabSlot   *slabFreeHead;	/* head of free list */
371 
372 	/* Buffer size to use for reading input tapes, during merge. */
373 	size_t		read_buffer_size;
374 
375 	/*
376 	 * When we return a tuple to the caller in tuplesort_gettuple_XXX, that
377 	 * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE
378 	 * modes), we remember the tuple in 'lastReturnedTuple', so that we can
379 	 * recycle the memory on next gettuple call.
380 	 */
381 	void	   *lastReturnedTuple;
382 
383 	/*
384 	 * While building initial runs, this indicates if the replacement
385 	 * selection strategy is in use.  When it isn't, then a simple hybrid
386 	 * sort-merge strategy is in use instead (runs are quicksorted).
387 	 */
388 	bool		replaceActive;
389 
390 	/*
391 	 * While building initial runs, this is the current output run number
392 	 * (starting at RUN_FIRST).  Afterwards, it is the number of initial runs
393 	 * we made.
394 	 */
395 	int			currentRun;
396 
397 	/*
398 	 * Unless otherwise noted, all pointer variables below are pointers to
399 	 * arrays of length maxTapes, holding per-tape data.
400 	 */
401 
402 	/*
403 	 * This variable is only used during merge passes.  mergeactive[i] is true
404 	 * if we are reading an input run from (actual) tape number i and have not
405 	 * yet exhausted that run.
406 	 */
407 	bool	   *mergeactive;	/* active input run source? */
408 
409 	/*
410 	 * Variables for Algorithm D.  Note that destTape is a "logical" tape
411 	 * number, ie, an index into the tp_xxx[] arrays.  Be careful to keep
412 	 * "logical" and "actual" tape numbers straight!
413 	 */
414 	int			Level;			/* Knuth's l */
415 	int			destTape;		/* current output tape (Knuth's j, less 1) */
416 	int		   *tp_fib;			/* Target Fibonacci run counts (A[]) */
417 	int		   *tp_runs;		/* # of real runs on each tape */
418 	int		   *tp_dummy;		/* # of dummy runs for each tape (D[]) */
419 	int		   *tp_tapenum;		/* Actual tape numbers (TAPE[]) */
420 	int			activeTapes;	/* # of active input tapes in merge pass */
421 
422 	/*
423 	 * These variables are used after completion of sorting to keep track of
424 	 * the next tuple to return.  (In the tape case, the tape's current read
425 	 * position is also critical state.)
426 	 */
427 	int			result_tape;	/* actual tape number of finished output */
428 	int			current;		/* array index (only used if SORTEDINMEM) */
429 	bool		eof_reached;	/* reached EOF (needed for cursors) */
430 
431 	/* markpos_xxx holds marked position for mark and restore */
432 	long		markpos_block;	/* tape block# (only used if SORTEDONTAPE) */
433 	int			markpos_offset; /* saved "current", or offset in tape block */
434 	bool		markpos_eof;	/* saved "eof_reached" */
435 
436 	/*
437 	 * The sortKeys variable is used by every case other than the hash index
438 	 * case; it is set by tuplesort_begin_xxx.  tupDesc is only used by the
439 	 * MinimalTuple and CLUSTER routines, though.
440 	 */
441 	TupleDesc	tupDesc;
442 	SortSupport sortKeys;		/* array of length nKeys */
443 
444 	/*
445 	 * This variable is shared by the single-key MinimalTuple case and the
446 	 * Datum case (which both use qsort_ssup()).  Otherwise it's NULL.
447 	 */
448 	SortSupport onlyKey;
449 
450 	/*
451 	 * Additional state for managing "abbreviated key" sortsupport routines
452 	 * (which currently may be used by all cases except the hash index case).
453 	 * Tracks the intervals at which the optimization's effectiveness is
454 	 * tested.
455 	 */
456 	int64		abbrevNext;		/* Tuple # at which to next check
457 								 * applicability */
458 
459 	/*
460 	 * These variables are specific to the CLUSTER case; they are set by
461 	 * tuplesort_begin_cluster.
462 	 */
463 	IndexInfo  *indexInfo;		/* info about index being used for reference */
464 	EState	   *estate;			/* for evaluating index expressions */
465 
466 	/*
467 	 * These variables are specific to the IndexTuple case; they are set by
468 	 * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
469 	 */
470 	Relation	heapRel;		/* table the index is being built on */
471 	Relation	indexRel;		/* index being built */
472 
473 	/* These are specific to the index_btree subcase: */
474 	bool		enforceUnique;	/* complain if we find duplicate tuples */
475 
476 	/* These are specific to the index_hash subcase: */
477 	uint32		high_mask;		/* masks for sortable part of hash code */
478 	uint32		low_mask;
479 	uint32		max_buckets;
480 
481 	/*
482 	 * These variables are specific to the Datum case; they are set by
483 	 * tuplesort_begin_datum and used only by the DatumTuple routines.
484 	 */
485 	Oid			datumType;
486 	/* we need typelen in order to know how to copy the Datums. */
487 	int			datumTypeLen;
488 
489 	/*
490 	 * Resource snapshot for time of sort start.
491 	 */
492 #ifdef TRACE_SORT
493 	PGRUsage	ru_start;
494 #endif
495 };
496 
497 /*
498  * Is the given tuple allocated from the slab memory arena?
499  */
500 #define IS_SLAB_SLOT(state, tuple) \
501 	((char *) (tuple) >= (state)->slabMemoryBegin && \
502 	 (char *) (tuple) < (state)->slabMemoryEnd)
503 
504 /*
505  * Return the given tuple to the slab memory free list, or free it
506  * if it was palloc'd.
507  */
508 #define RELEASE_SLAB_SLOT(state, tuple) \
509 	do { \
510 		SlabSlot *buf = (SlabSlot *) tuple; \
511 		\
512 		if (IS_SLAB_SLOT((state), buf)) \
513 		{ \
514 			buf->nextfree = (state)->slabFreeHead; \
515 			(state)->slabFreeHead = buf; \
516 		} else \
517 			pfree(buf); \
518 	} while(0)
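
/*
 * Usage example (a sketch; tuplesort_gettuple_common() contains the real
 * call sites): once the previously-returned tuple is no longer needed, its
 * slot is recycled with
 *
 *		if (state->lastReturnedTuple)
 *		{
 *			RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
 *			state->lastReturnedTuple = NULL;
 *		}
 */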
519 
520 #define COMPARETUP(state,a,b)	((*(state)->comparetup) (a, b, state))
521 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
522 #define WRITETUP(state,tape,stup)	((*(state)->writetup) (state, tape, stup))
523 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
524 #define LACKMEM(state)		((state)->availMem < 0 && !(state)->slabAllocatorUsed)
525 #define USEMEM(state,amt)	((state)->availMem -= (amt))
526 #define FREEMEM(state,amt)	((state)->availMem += (amt))
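
/*
 * The intended usage pattern for USEMEM/FREEMEM (a sketch; the real
 * copytup_XXX and writetup_XXX routines appear further below): charge the
 * actual chunk size, as reported by GetMemoryChunkSpace(), when a tuple
 * copy is made, and credit it back when the copy is released.
 *
 *		tuple = ... copy the caller's tuple into tuplecontext ...;
 *		USEMEM(state, GetMemoryChunkSpace(tuple));
 *		...
 *		FREEMEM(state, GetMemoryChunkSpace(tuple));
 *		pfree(tuple);
 */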
527 
528 /*
529  * NOTES about on-tape representation of tuples:
530  *
531  * We require the first "unsigned int" of a stored tuple to be the total size
532  * on-tape of the tuple, including itself (so it is never zero; an all-zero
533  * unsigned int is used to delimit runs).  The remainder of the stored tuple
534  * may or may not match the in-memory representation of the tuple ---
535  * any conversion needed is the job of the writetup and readtup routines.
536  *
537  * If state->randomAccess is true, then the stored representation of the
538  * tuple must be followed by another "unsigned int" that is a copy of the
539  * length --- so the total tape space used is actually sizeof(unsigned int)
540  * more than the stored length value.  This allows read-backwards.  When
541  * randomAccess is not true, the write/read routines may omit the extra
542  * length word.
543  *
544  * writetup is expected to write both length words as well as the tuple
545  * data.  When readtup is called, the tape is positioned just after the
546  * front length word; readtup must read the tuple data and advance past
547  * the back length word (if present).
548  *
549  * The write/read routines can make use of the tuple description data
550  * stored in the Tuplesortstate record, if needed.  They are also expected
551  * to adjust state->availMem by the amount of memory space (not tape space!)
552  * released or consumed.  There is no error return from either writetup
553  * or readtup; they should ereport() on failure.
554  *
555  *
556  * NOTES about memory consumption calculations:
557  *
558  * We count space allocated for tuples against the workMem limit, plus
559  * the space used by the variable-size memtuples array.  Fixed-size space
560  * is not counted; it's small enough to not be interesting.
561  *
562  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
563  * rather than the originally-requested size.  This is important since
564  * palloc can add substantial overhead.  It's not a complete answer since
565  * we won't count any wasted space in palloc allocation blocks, but it's
566  * a lot better than what we were doing before 7.3.  As of 9.6, a
567  * separate memory context is used for caller passed tuples.  Resetting
568  * it at certain key increments significantly ameliorates fragmentation.
569  * Note that this places a responsibility on readtup and copytup routines
570  * to use the right memory context for these tuples (and to not use the
571  * reset context for anything whose lifetime needs to span multiple
572  * external sort runs).
573  */
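
/*
 * As a concrete illustration of the on-tape layout rules above (a sketch
 * only; each writetup_XXX routine below has its own details), writing a
 * tuple whose on-tape body is "tupbody" of "tupbodylen" bytes looks
 * roughly like:
 *
 *		unsigned int tuplen = tupbodylen + sizeof(tuplen);
 *
 *		LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
 *		LogicalTapeWrite(state->tapeset, tapenum, tupbody, tupbodylen);
 *		if (state->randomAccess)
 *			LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
 */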
574 
575 /* When using this macro, beware of double evaluation of len */
576 #define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \
577 	do { \
578 		if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \
579 			elog(ERROR, "unexpected end of data"); \
580 	} while(0)
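
/*
 * For example (a sketch of how the readtup_XXX routines use it), reading
 * "tupbodylen" bytes of tuple data that follow the leading length word:
 *
 *		LogicalTapeReadExact(state->tapeset, tapenum, tupbody, tupbodylen);
 */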
581 
582 
583 static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
584 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
585 static bool consider_abort_common(Tuplesortstate *state);
586 static bool useselection(Tuplesortstate *state);
587 static void inittapes(Tuplesortstate *state);
588 static void selectnewtape(Tuplesortstate *state);
589 static void init_slab_allocator(Tuplesortstate *state, int numSlots);
590 static void mergeruns(Tuplesortstate *state);
591 static void mergeonerun(Tuplesortstate *state);
592 static void beginmerge(Tuplesortstate *state);
593 static bool mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup);
594 static void dumptuples(Tuplesortstate *state, bool alltuples);
595 static void dumpbatch(Tuplesortstate *state, bool alltuples);
596 static void make_bounded_heap(Tuplesortstate *state);
597 static void sort_bounded_heap(Tuplesortstate *state);
598 static void tuplesort_sort_memtuples(Tuplesortstate *state);
599 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
600 					  bool checkIndex);
601 static void tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple,
602 						   bool checkIndex);
603 static void tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex);
604 static void reversedirection(Tuplesortstate *state);
605 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
606 static void markrunend(Tuplesortstate *state, int tapenum);
607 static void *readtup_alloc(Tuplesortstate *state, Size tuplen);
608 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
609 				Tuplesortstate *state);
610 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
611 static void writetup_heap(Tuplesortstate *state, int tapenum,
612 			  SortTuple *stup);
613 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
614 			 int tapenum, unsigned int len);
615 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
616 				   Tuplesortstate *state);
617 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
618 static void writetup_cluster(Tuplesortstate *state, int tapenum,
619 				 SortTuple *stup);
620 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
621 				int tapenum, unsigned int len);
622 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
623 					   Tuplesortstate *state);
624 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
625 					  Tuplesortstate *state);
626 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
627 static void writetup_index(Tuplesortstate *state, int tapenum,
628 			   SortTuple *stup);
629 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
630 			  int tapenum, unsigned int len);
631 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
632 				 Tuplesortstate *state);
633 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
634 static void writetup_datum(Tuplesortstate *state, int tapenum,
635 			   SortTuple *stup);
636 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
637 			  int tapenum, unsigned int len);
638 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
639 
640 /*
641  * Special versions of qsort just for SortTuple objects.  qsort_tuple() sorts
642  * any variant of SortTuples, using the appropriate comparetup function.
643  * qsort_ssup() is specialized for the case where the comparetup function
644  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
645  * and Datum sorts.
646  */
647 #include "qsort_tuple.c"
648 
649 
650 /*
651  *		tuplesort_begin_xxx
652  *
653  * Initialize for a tuple sort operation.
654  *
655  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
656  * zero or more times, then call tuplesort_performsort when all the tuples
657  * have been supplied.  After performsort, retrieve the tuples in sorted
658  * order by calling tuplesort_getXXX until it returns false/NULL.  (If random
659  * access was requested, rescan, markpos, and restorepos can also be called.)
660  * Call tuplesort_end to terminate the operation and release memory/disk space.
661  *
662  * Each variant of tuplesort_begin has a workMem parameter specifying the
663  * maximum number of kilobytes of RAM to use before spilling data to disk.
664  * (The normal value of this parameter is work_mem, but some callers use
665  * other values.)  Each variant also has a randomAccess parameter specifying
666  * whether the caller needs non-sequential access to the sort result.
667  */
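
/*
 * A typical calling sequence, shown schematically (argument lists are
 * abbreviated, and fetch_next_slot()/process_slot() merely stand in for
 * however the caller produces and consumes tuples):
 *
 *		state = tuplesort_begin_heap(tupDesc, nkeys, attNums, sortOperators,
 *									 sortCollations, nullsFirstFlags,
 *									 work_mem, false);
 *		while ((slot = fetch_next_slot()) != NULL)
 *			tuplesort_puttupleslot(state, slot);
 *		tuplesort_performsort(state);
 *		while (tuplesort_gettupleslot(state, true, ...))
 *			process_slot(slot);
 *		tuplesort_end(state);
 */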
668 
669 static Tuplesortstate *
670 tuplesort_begin_common(int workMem, bool randomAccess)
671 {
672 	Tuplesortstate *state;
673 	MemoryContext sortcontext;
674 	MemoryContext tuplecontext;
675 	MemoryContext oldcontext;
676 
677 	/*
678 	 * Create a working memory context for this sort operation. All data
679 	 * needed by the sort will live inside this context.
680 	 */
681 	sortcontext = AllocSetContextCreate(CurrentMemoryContext,
682 										"TupleSort main",
683 										ALLOCSET_DEFAULT_SIZES);
684 
685 	/*
686 	 * Caller tuple (e.g. IndexTuple) memory context.
687 	 *
688 	 * A dedicated child context used exclusively for caller passed tuples
689 	 * eases memory management.  Resetting at key points reduces
690 	 * fragmentation. Note that the memtuples array of SortTuples is allocated
691 	 * in the parent context, not this context, because there is no need to
692 	 * free memtuples early.
693 	 */
694 	tuplecontext = AllocSetContextCreate(sortcontext,
695 										 "Caller tuples",
696 										 ALLOCSET_DEFAULT_SIZES);
697 
698 	/*
699 	 * Make the Tuplesortstate within the per-sort context.  This way, we
700 	 * don't need a separate pfree() operation for it at shutdown.
701 	 */
702 	oldcontext = MemoryContextSwitchTo(sortcontext);
703 
704 	state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
705 
706 #ifdef TRACE_SORT
707 	if (trace_sort)
708 		pg_rusage_init(&state->ru_start);
709 #endif
710 
711 	state->status = TSS_INITIAL;
712 	state->randomAccess = randomAccess;
713 	state->bounded = false;
714 	state->tuples = true;
715 	state->boundUsed = false;
716 	state->allowedMem = workMem * (int64) 1024;
717 	state->availMem = state->allowedMem;
718 	state->sortcontext = sortcontext;
719 	state->tuplecontext = tuplecontext;
720 	state->tapeset = NULL;
721 
722 	state->memtupcount = 0;
723 
724 	/*
725 	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
726 	 * see comments in grow_memtuples().
727 	 */
728 	state->memtupsize = Max(1024,
729 							ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
730 
731 	state->growmemtuples = true;
732 	state->slabAllocatorUsed = false;
733 	state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
734 
735 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
736 
737 	/* workMem must be large enough for the minimal memtuples array */
738 	if (LACKMEM(state))
739 		elog(ERROR, "insufficient memory allowed for sort");
740 
741 	state->currentRun = RUN_FIRST;
742 
743 	/*
744 	 * maxTapes, tapeRange, and Algorithm D variables will be initialized by
745 	 * inittapes(), if needed
746 	 */
747 
748 	state->result_tape = -1;	/* flag that result tape has not been formed */
749 
750 	MemoryContextSwitchTo(oldcontext);
751 
752 	return state;
753 }
754 
755 Tuplesortstate *
756 tuplesort_begin_heap(TupleDesc tupDesc,
757 					 int nkeys, AttrNumber *attNums,
758 					 Oid *sortOperators, Oid *sortCollations,
759 					 bool *nullsFirstFlags,
760 					 int workMem, bool randomAccess)
761 {
762 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
763 	MemoryContext oldcontext;
764 	int			i;
765 
766 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
767 
768 	AssertArg(nkeys > 0);
769 
770 #ifdef TRACE_SORT
771 	if (trace_sort)
772 		elog(LOG,
773 			 "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
774 			 nkeys, workMem, randomAccess ? 't' : 'f');
775 #endif
776 
777 	state->nKeys = nkeys;
778 
779 	TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
780 								false,	/* no unique check */
781 								nkeys,
782 								workMem,
783 								randomAccess);
784 
785 	state->comparetup = comparetup_heap;
786 	state->copytup = copytup_heap;
787 	state->writetup = writetup_heap;
788 	state->readtup = readtup_heap;
789 
790 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
791 	state->abbrevNext = 10;
792 
793 	/* Prepare SortSupport data for each column */
794 	state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
795 
796 	for (i = 0; i < nkeys; i++)
797 	{
798 		SortSupport sortKey = state->sortKeys + i;
799 
800 		AssertArg(attNums[i] != 0);
801 		AssertArg(sortOperators[i] != 0);
802 
803 		sortKey->ssup_cxt = CurrentMemoryContext;
804 		sortKey->ssup_collation = sortCollations[i];
805 		sortKey->ssup_nulls_first = nullsFirstFlags[i];
806 		sortKey->ssup_attno = attNums[i];
807 		/* Convey if abbreviation optimization is applicable in principle */
808 		sortKey->abbreviate = (i == 0);
809 
810 		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
811 	}
812 
813 	/*
814 	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
815 	 * tie-breaker comparisons may be required.  Typically, the optimization
816 	 * is only of value to pass-by-value types anyway, whereas abbreviated
817 	 * keys are typically only of value to pass-by-reference types.
818 	 */
819 	if (nkeys == 1 && !state->sortKeys->abbrev_converter)
820 		state->onlyKey = state->sortKeys;
821 
822 	MemoryContextSwitchTo(oldcontext);
823 
824 	return state;
825 }
826 
827 Tuplesortstate *
828 tuplesort_begin_cluster(TupleDesc tupDesc,
829 						Relation indexRel,
830 						int workMem, bool randomAccess)
831 {
832 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
833 	ScanKey		indexScanKey;
834 	MemoryContext oldcontext;
835 	int			i;
836 
837 	Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
838 
839 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
840 
841 #ifdef TRACE_SORT
842 	if (trace_sort)
843 		elog(LOG,
844 			 "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
845 			 RelationGetNumberOfAttributes(indexRel),
846 			 workMem, randomAccess ? 't' : 'f');
847 #endif
848 
849 	state->nKeys = RelationGetNumberOfAttributes(indexRel);
850 
851 	TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
852 								false,	/* no unique check */
853 								state->nKeys,
854 								workMem,
855 								randomAccess);
856 
857 	state->comparetup = comparetup_cluster;
858 	state->copytup = copytup_cluster;
859 	state->writetup = writetup_cluster;
860 	state->readtup = readtup_cluster;
861 	state->abbrevNext = 10;
862 
863 	state->indexInfo = BuildIndexInfo(indexRel);
864 
865 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
866 
867 	indexScanKey = _bt_mkscankey_nodata(indexRel);
868 
869 	if (state->indexInfo->ii_Expressions != NULL)
870 	{
871 		TupleTableSlot *slot;
872 		ExprContext *econtext;
873 
874 		/*
875 		 * We will need to use FormIndexDatum to evaluate the index
876 		 * expressions.  To do that, we need an EState, as well as a
877 		 * TupleTableSlot to put the table tuples into.  The econtext's
878 		 * scantuple has to point to that slot, too.
879 		 */
880 		state->estate = CreateExecutorState();
881 		slot = MakeSingleTupleTableSlot(tupDesc);
882 		econtext = GetPerTupleExprContext(state->estate);
883 		econtext->ecxt_scantuple = slot;
884 	}
885 
886 	/* Prepare SortSupport data for each column */
887 	state->sortKeys = (SortSupport) palloc0(state->nKeys *
888 											sizeof(SortSupportData));
889 
890 	for (i = 0; i < state->nKeys; i++)
891 	{
892 		SortSupport sortKey = state->sortKeys + i;
893 		ScanKey		scanKey = indexScanKey + i;
894 		int16		strategy;
895 
896 		sortKey->ssup_cxt = CurrentMemoryContext;
897 		sortKey->ssup_collation = scanKey->sk_collation;
898 		sortKey->ssup_nulls_first =
899 			(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
900 		sortKey->ssup_attno = scanKey->sk_attno;
901 		/* Convey if abbreviation optimization is applicable in principle */
902 		sortKey->abbreviate = (i == 0);
903 
904 		AssertState(sortKey->ssup_attno != 0);
905 
906 		strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
907 			BTGreaterStrategyNumber : BTLessStrategyNumber;
908 
909 		PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
910 	}
911 
912 	_bt_freeskey(indexScanKey);
913 
914 	MemoryContextSwitchTo(oldcontext);
915 
916 	return state;
917 }
918 
919 Tuplesortstate *
920 tuplesort_begin_index_btree(Relation heapRel,
921 							Relation indexRel,
922 							bool enforceUnique,
923 							int workMem, bool randomAccess)
924 {
925 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
926 	ScanKey		indexScanKey;
927 	MemoryContext oldcontext;
928 	int			i;
929 
930 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
931 
932 #ifdef TRACE_SORT
933 	if (trace_sort)
934 		elog(LOG,
935 			 "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
936 			 enforceUnique ? 't' : 'f',
937 			 workMem, randomAccess ? 't' : 'f');
938 #endif
939 
940 	state->nKeys = RelationGetNumberOfAttributes(indexRel);
941 
942 	TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
943 								enforceUnique,
944 								state->nKeys,
945 								workMem,
946 								randomAccess);
947 
948 	state->comparetup = comparetup_index_btree;
949 	state->copytup = copytup_index;
950 	state->writetup = writetup_index;
951 	state->readtup = readtup_index;
952 	state->abbrevNext = 10;
953 
954 	state->heapRel = heapRel;
955 	state->indexRel = indexRel;
956 	state->enforceUnique = enforceUnique;
957 
958 	indexScanKey = _bt_mkscankey_nodata(indexRel);
959 	state->nKeys = RelationGetNumberOfAttributes(indexRel);
960 
961 	/* Prepare SortSupport data for each column */
962 	state->sortKeys = (SortSupport) palloc0(state->nKeys *
963 											sizeof(SortSupportData));
964 
965 	for (i = 0; i < state->nKeys; i++)
966 	{
967 		SortSupport sortKey = state->sortKeys + i;
968 		ScanKey		scanKey = indexScanKey + i;
969 		int16		strategy;
970 
971 		sortKey->ssup_cxt = CurrentMemoryContext;
972 		sortKey->ssup_collation = scanKey->sk_collation;
973 		sortKey->ssup_nulls_first =
974 			(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
975 		sortKey->ssup_attno = scanKey->sk_attno;
976 		/* Convey if abbreviation optimization is applicable in principle */
977 		sortKey->abbreviate = (i == 0);
978 
979 		AssertState(sortKey->ssup_attno != 0);
980 
981 		strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
982 			BTGreaterStrategyNumber : BTLessStrategyNumber;
983 
984 		PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
985 	}
986 
987 	_bt_freeskey(indexScanKey);
988 
989 	MemoryContextSwitchTo(oldcontext);
990 
991 	return state;
992 }
993 
994 Tuplesortstate *
995 tuplesort_begin_index_hash(Relation heapRel,
996 						   Relation indexRel,
997 						   uint32 high_mask,
998 						   uint32 low_mask,
999 						   uint32 max_buckets,
1000 						   int workMem, bool randomAccess)
1001 {
1002 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
1003 	MemoryContext oldcontext;
1004 
1005 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
1006 
1007 #ifdef TRACE_SORT
1008 	if (trace_sort)
1009 		elog(LOG,
1010 			 "begin index sort: high_mask = 0x%x, low_mask = 0x%x, "
1011 			 "max_buckets = 0x%x, workMem = %d, randomAccess = %c",
1012 			 high_mask,
1013 			 low_mask,
1014 			 max_buckets,
1015 			 workMem, randomAccess ? 't' : 'f');
1016 #endif
1017 
1018 	state->nKeys = 1;			/* Only one sort column, the hash code */
1019 
1020 	state->comparetup = comparetup_index_hash;
1021 	state->copytup = copytup_index;
1022 	state->writetup = writetup_index;
1023 	state->readtup = readtup_index;
1024 
1025 	state->heapRel = heapRel;
1026 	state->indexRel = indexRel;
1027 
1028 	state->high_mask = high_mask;
1029 	state->low_mask = low_mask;
1030 	state->max_buckets = max_buckets;
1031 
1032 	MemoryContextSwitchTo(oldcontext);
1033 
1034 	return state;
1035 }
1036 
1037 Tuplesortstate *
1038 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
1039 					  bool nullsFirstFlag,
1040 					  int workMem, bool randomAccess)
1041 {
1042 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
1043 	MemoryContext oldcontext;
1044 	int16		typlen;
1045 	bool		typbyval;
1046 
1047 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
1048 
1049 #ifdef TRACE_SORT
1050 	if (trace_sort)
1051 		elog(LOG,
1052 			 "begin datum sort: workMem = %d, randomAccess = %c",
1053 			 workMem, randomAccess ? 't' : 'f');
1054 #endif
1055 
1056 	state->nKeys = 1;			/* always a one-column sort */
1057 
1058 	TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
1059 								false,	/* no unique check */
1060 								1,
1061 								workMem,
1062 								randomAccess);
1063 
1064 	state->comparetup = comparetup_datum;
1065 	state->copytup = copytup_datum;
1066 	state->writetup = writetup_datum;
1067 	state->readtup = readtup_datum;
1068 	state->abbrevNext = 10;
1069 
1070 	state->datumType = datumType;
1071 
1072 	/* lookup necessary attributes of the datum type */
1073 	get_typlenbyval(datumType, &typlen, &typbyval);
1074 	state->datumTypeLen = typlen;
1075 	state->tuples = !typbyval;
1076 
1077 	/* Prepare SortSupport data */
1078 	state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
1079 
1080 	state->sortKeys->ssup_cxt = CurrentMemoryContext;
1081 	state->sortKeys->ssup_collation = sortCollation;
1082 	state->sortKeys->ssup_nulls_first = nullsFirstFlag;
1083 
1084 	/*
1085 	 * Abbreviation is possible here only for by-reference types.  In theory,
1086 	 * a pass-by-value datatype could have an abbreviated form that is cheaper
1087 	 * to compare.  In a tuple sort, we could support that, because we can
1088 	 * always extract the original datum from the tuple as needed.  Here, we
1089 	 * can't, because a datum sort only stores a single copy of the datum; the
1090 	 * "tuple" field of each sortTuple is NULL.
1091 	 */
1092 	state->sortKeys->abbreviate = !typbyval;
1093 
1094 	PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
1095 
1096 	/*
1097 	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
1098 	 * tie-breaker comparisons may be required.  Typically, the optimization
1099 	 * is only of value to pass-by-value types anyway, whereas abbreviated
1100 	 * keys are typically only of value to pass-by-reference types.
1101 	 */
1102 	if (!state->sortKeys->abbrev_converter)
1103 		state->onlyKey = state->sortKeys;
1104 
1105 	MemoryContextSwitchTo(oldcontext);
1106 
1107 	return state;
1108 }
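
/*
 * Example (a sketch with illustrative arguments): a caller sorting bare
 * int4 Datums for a top-N computation might set up the sort as
 *
 *		state = tuplesort_begin_datum(INT4OID, Int4LessOperator,
 *									  InvalidOid, false, work_mem, false);
 *		tuplesort_set_bound(state, N);
 *
 * and then feed values in with tuplesort_putdatum().
 */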
1109 
1110 /*
1111  * tuplesort_set_bound
1112  *
1113  *	Advise tuplesort that at most the first N result tuples are required.
1114  *
1115  * Must be called before inserting any tuples.  (Actually, we could allow it
1116  * as long as the sort hasn't spilled to disk, but there seems no need for
1117  * delayed calls at the moment.)
1118  *
1119  * This is a hint only. The tuplesort may still return more tuples than
1120  * requested.
1121  */
1122 void
1123 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
1124 {
1125 	/* Assert we're called before loading any tuples */
1126 	Assert(state->status == TSS_INITIAL);
1127 	Assert(state->memtupcount == 0);
1128 	Assert(!state->bounded);
1129 
1130 #ifdef DEBUG_BOUNDED_SORT
1131 	/* Honor GUC setting that disables the feature (for easy testing) */
1132 	if (!optimize_bounded_sort)
1133 		return;
1134 #endif
1135 
1136 	/* We want to be able to compute bound * 2, so limit the setting */
1137 	if (bound > (int64) (INT_MAX / 2))
1138 		return;
1139 
1140 	state->bounded = true;
1141 	state->bound = (int) bound;
1142 
1143 	/*
1144 	 * Bounded sorts are not an effective target for abbreviated key
1145 	 * optimization.  Disable by setting state to be consistent with no
1146 	 * abbreviation support.
1147 	 */
1148 	state->sortKeys->abbrev_converter = NULL;
1149 	if (state->sortKeys->abbrev_full_comparator)
1150 		state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
1151 
1152 	/* Not strictly necessary, but be tidy */
1153 	state->sortKeys->abbrev_abort = NULL;
1154 	state->sortKeys->abbrev_full_comparator = NULL;
1155 }
1156 
1157 /*
1158  * tuplesort_end
1159  *
1160  *	Release resources and clean up.
1161  *
1162  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
1163  * pointing to garbage.  Be careful not to attempt to use or free such
1164  * pointers afterwards!
1165  */
1166 void
1167 tuplesort_end(Tuplesortstate *state)
1168 {
1169 	/* context swap probably not needed, but let's be safe */
1170 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1171 
1172 #ifdef TRACE_SORT
1173 	long		spaceUsed;
1174 
1175 	if (state->tapeset)
1176 		spaceUsed = LogicalTapeSetBlocks(state->tapeset);
1177 	else
1178 		spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
1179 #endif
1180 
1181 	/*
1182 	 * Delete temporary "tape" files, if any.
1183 	 *
1184 	 * Note: want to include this in reported total cost of sort, hence need
1185 	 * for two #ifdef TRACE_SORT sections.
1186 	 */
1187 	if (state->tapeset)
1188 		LogicalTapeSetClose(state->tapeset);
1189 
1190 #ifdef TRACE_SORT
1191 	if (trace_sort)
1192 	{
1193 		if (state->tapeset)
1194 			elog(LOG, "external sort ended, %ld disk blocks used: %s",
1195 				 spaceUsed, pg_rusage_show(&state->ru_start));
1196 		else
1197 			elog(LOG, "internal sort ended, %ld KB used: %s",
1198 				 spaceUsed, pg_rusage_show(&state->ru_start));
1199 	}
1200 
1201 	TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
1202 #else
1203 
1204 	/*
1205 	 * If you disabled TRACE_SORT, you can still probe sort__done, but you
1206 	 * ain't getting space-used stats.
1207 	 */
1208 	TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
1209 #endif
1210 
1211 	/* Free any execution state created for CLUSTER case */
1212 	if (state->estate != NULL)
1213 	{
1214 		ExprContext *econtext = GetPerTupleExprContext(state->estate);
1215 
1216 		ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
1217 		FreeExecutorState(state->estate);
1218 	}
1219 
1220 	MemoryContextSwitchTo(oldcontext);
1221 
1222 	/*
1223 	 * Free the per-sort memory context, thereby releasing all working memory,
1224 	 * including the Tuplesortstate struct itself.
1225 	 */
1226 	MemoryContextDelete(state->sortcontext);
1227 }
1228 
1229 /*
1230  * Grow the memtuples[] array, if possible within our memory constraint.  We
1231  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1232  * limit.  Return TRUE if we were able to enlarge the array, FALSE if not.
1233  *
1234  * Normally, at each increment we double the size of the array.  When doing
1235  * that would exceed a limit, we attempt one last, smaller increase (and then
1236  * clear the growmemtuples flag so we don't try any more).  That allows us to
1237  * use memory as fully as permitted; sticking to the pure doubling rule could
1238  * result in almost half going unused.  Because availMem moves around with
1239  * tuple addition/removal, we need some rule to prevent making repeated small
1240  * increases in memtupsize, which would just be useless thrashing.  The
1241  * growmemtuples flag accomplishes that and also prevents useless
1242  * recalculations in this function.
1243  */
1244 static bool
1245 grow_memtuples(Tuplesortstate *state)
1246 {
1247 	int			newmemtupsize;
1248 	int			memtupsize = state->memtupsize;
1249 	int64		memNowUsed = state->allowedMem - state->availMem;
1250 
1251 	/* Forget it if we've already maxed out memtuples, per comment above */
1252 	if (!state->growmemtuples)
1253 		return false;
1254 
1255 	/* Select new value of memtupsize */
1256 	if (memNowUsed <= state->availMem)
1257 	{
1258 		/*
1259 		 * We've used no more than half of allowedMem; double our usage,
1260 		 * clamping at INT_MAX tuples.
1261 		 */
1262 		if (memtupsize < INT_MAX / 2)
1263 			newmemtupsize = memtupsize * 2;
1264 		else
1265 		{
1266 			newmemtupsize = INT_MAX;
1267 			state->growmemtuples = false;
1268 		}
1269 	}
1270 	else
1271 	{
1272 		/*
1273 		 * This will be the last increment of memtupsize.  Abandon doubling
1274 		 * strategy and instead increase as much as we safely can.
1275 		 *
1276 		 * To stay within allowedMem, we can't increase memtupsize by more
1277 		 * than availMem / sizeof(SortTuple) elements.  In practice, we want
1278 		 * to increase it by considerably less, because we need to leave some
1279 		 * space for the tuples to which the new array slots will refer.  We
1280 		 * assume the new tuples will be about the same size as the tuples
1281 		 * we've already seen, and thus we can extrapolate from the space
1282 		 * consumption so far to estimate an appropriate new size for the
1283 		 * memtuples array.  The optimal value might be higher or lower than
1284 		 * this estimate, but it's hard to know that in advance.  We again
1285 		 * clamp at INT_MAX tuples.
1286 		 *
1287 		 * This calculation is safe against enlarging the array so much that
1288 		 * LACKMEM becomes true, because the memory currently used includes
1289 		 * the present array; thus, there would be enough allowedMem for the
1290 		 * new array elements even if no other memory were currently used.
1291 		 *
1292 		 * We do the arithmetic in float8, because otherwise the product of
1293 		 * memtupsize and allowedMem could overflow.  Any inaccuracy in the
1294 		 * result should be insignificant; but even if we computed a
1295 		 * completely insane result, the checks below will prevent anything
1296 		 * really bad from happening.
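		 *
		 * For example, with allowedMem = 4 MB and memNowUsed = 3 MB, the
		 * grow_ratio is about 1.33, so a memtuples array of 100000 slots
		 * would be enlarged to roughly 133000 slots rather than doubled.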
1297 		 */
1298 		double		grow_ratio;
1299 
1300 		grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1301 		if (memtupsize * grow_ratio < INT_MAX)
1302 			newmemtupsize = (int) (memtupsize * grow_ratio);
1303 		else
1304 			newmemtupsize = INT_MAX;
1305 
1306 		/* We won't make any further enlargement attempts */
1307 		state->growmemtuples = false;
1308 	}
1309 
1310 	/* Must enlarge array by at least one element, else report failure */
1311 	if (newmemtupsize <= memtupsize)
1312 		goto noalloc;
1313 
1314 	/*
1315 	 * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize.  Clamp
1316 	 * to ensure our request won't be rejected.  Note that we can easily
1317 	 * exhaust address space before facing this outcome.  (This is presently
1318 	 * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1319 	 * don't rely on that at this distance.)
1320 	 */
1321 	if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1322 	{
1323 		newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1324 		state->growmemtuples = false;	/* can't grow any more */
1325 	}
1326 
1327 	/*
1328 	 * We need to be sure that we do not cause LACKMEM to become true, else
1329 	 * the space management algorithm will go nuts.  The code above should
1330 	 * never generate a dangerous request, but to be safe, check explicitly
1331 	 * that the array growth fits within availMem.  (We could still cause
1332 	 * LACKMEM if the memory chunk overhead associated with the memtuples
1333 	 * array were to increase.  That shouldn't happen because we chose the
1334 	 * initial array size large enough to ensure that palloc will be treating
1335 	 * both old and new arrays as separate chunks.  But we'll check LACKMEM
1336 	 * explicitly below just in case.)
1337 	 */
1338 	if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1339 		goto noalloc;
1340 
1341 	/* OK, do it */
1342 	FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1343 	state->memtupsize = newmemtupsize;
1344 	state->memtuples = (SortTuple *)
1345 		repalloc_huge(state->memtuples,
1346 					  state->memtupsize * sizeof(SortTuple));
1347 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1348 	if (LACKMEM(state))
1349 		elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1350 	return true;
1351 
1352 noalloc:
1353 	/* If for any reason we didn't realloc, shut off future attempts */
1354 	state->growmemtuples = false;
1355 	return false;
1356 }
1357 
1358 /*
1359  * Accept one tuple while collecting input data for sort.
1360  *
1361  * Note that the input data is always copied; the caller need not save it.
1362  */
1363 void
1364 tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
1365 {
1366 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1367 	SortTuple	stup;
1368 
1369 	/*
1370 	 * Copy the given tuple into memory we control, and decrease availMem.
1371 	 * Then call the common code.
1372 	 */
1373 	COPYTUP(state, &stup, (void *) slot);
1374 
1375 	puttuple_common(state, &stup);
1376 
1377 	MemoryContextSwitchTo(oldcontext);
1378 }
1379 
1380 /*
1381  * Accept one tuple while collecting input data for sort.
1382  *
1383  * Note that the input data is always copied; the caller need not save it.
1384  */
1385 void
1386 tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
1387 {
1388 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1389 	SortTuple	stup;
1390 
1391 	/*
1392 	 * Copy the given tuple into memory we control, and decrease availMem.
1393 	 * Then call the common code.
1394 	 */
1395 	COPYTUP(state, &stup, (void *) tup);
1396 
1397 	puttuple_common(state, &stup);
1398 
1399 	MemoryContextSwitchTo(oldcontext);
1400 }
1401 
1402 /*
1403  * Collect one index tuple while collecting input data for sort, building
1404  * it from caller-supplied values.
1405  */
1406 void
1407 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
1408 							  ItemPointer self, Datum *values,
1409 							  bool *isnull)
1410 {
1411 	MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1412 	SortTuple	stup;
1413 	Datum		original;
1414 	IndexTuple	tuple;
1415 
1416 	stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
1417 	tuple = ((IndexTuple) stup.tuple);
1418 	tuple->t_tid = *self;
1419 	USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1420 	/* set up first-column key value */
1421 	original = index_getattr(tuple,
1422 							 1,
1423 							 RelationGetDescr(state->indexRel),
1424 							 &stup.isnull1);
1425 
1426 	MemoryContextSwitchTo(state->sortcontext);
1427 
1428 	if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1)
1429 	{
1430 		/*
1431 		 * Store ordinary Datum representation, or NULL value.  If there is a
1432 		 * converter, it won't expect NULL values, and the cost model is not
1433 		 * required to account for NULLs, so in that case we avoid calling the
1434 		 * converter and just set datum1 to the zeroed representation (to be
1435 		 * consistent, and to support cheap inequality tests for NULL
1436 		 * abbreviated keys).
1437 		 */
1438 		stup.datum1 = original;
1439 	}
1440 	else if (!consider_abort_common(state))
1441 	{
1442 		/* Store abbreviated key representation */
1443 		stup.datum1 = state->sortKeys->abbrev_converter(original,
1444 														state->sortKeys);
1445 	}
1446 	else
1447 	{
1448 		/* Abort abbreviation */
1449 		int			i;
1450 
1451 		stup.datum1 = original;
1452 
1453 		/*
1454 		 * Set state to be consistent with never trying abbreviation.
1455 		 *
1456 		 * Alter datum1 representation in already-copied tuples, so as to
1457 		 * ensure a consistent representation (current tuple was just
1458 		 * handled).  It does not matter if some dumped tuples are already
1459 		 * sorted on tape, since serialized tuples lack abbreviated keys
1460 		 * (TSS_BUILDRUNS state prevents control reaching here in any case).
1461 		 */
1462 		for (i = 0; i < state->memtupcount; i++)
1463 		{
1464 			SortTuple  *mtup = &state->memtuples[i];
1465 
1466 			tuple = mtup->tuple;
1467 			mtup->datum1 = index_getattr(tuple,
1468 										 1,
1469 										 RelationGetDescr(state->indexRel),
1470 										 &mtup->isnull1);
1471 		}
1472 	}
1473 
1474 	puttuple_common(state, &stup);
1475 
1476 	MemoryContextSwitchTo(oldcontext);
1477 }
1478 
1479 /*
1480  * Accept one Datum while collecting input data for sort.
1481  *
1482  * If the Datum is pass-by-ref type, the value will be copied.
1483  */
1484 void
1485 tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
1486 {
1487 	MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1488 	SortTuple	stup;
1489 
1490 	/*
1491 	 * Pass-by-value types or null values are just stored directly in
1492 	 * stup.datum1 (and stup.tuple is not used and set to NULL).
1493 	 *
1494 	 * Non-null pass-by-reference values need to be copied into memory we
1495 	 * control, and possibly abbreviated. The copied value is pointed to by
1496 	 * stup.tuple and is treated as the canonical copy (e.g. to return via
1497 	 * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
1498 	 * abbreviated value if abbreviation is happening, otherwise it's
1499 	 * identical to stup.tuple.
1500 	 */
1501 
1502 	if (isNull || !state->tuples)
1503 	{
1504 		/*
1505 		 * Set datum1 to zeroed representation for NULLs (to be consistent,
1506 		 * and to support cheap inequality tests for NULL abbreviated keys).
1507 		 */
1508 		stup.datum1 = !isNull ? val : (Datum) 0;
1509 		stup.isnull1 = isNull;
1510 		stup.tuple = NULL;		/* no separate storage */
1511 		MemoryContextSwitchTo(state->sortcontext);
1512 	}
1513 	else
1514 	{
1515 		Datum		original = datumCopy(val, false, state->datumTypeLen);
1516 
1517 		stup.isnull1 = false;
1518 		stup.tuple = DatumGetPointer(original);
1519 		USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1520 		MemoryContextSwitchTo(state->sortcontext);
1521 
1522 		if (!state->sortKeys->abbrev_converter)
1523 		{
1524 			stup.datum1 = original;
1525 		}
1526 		else if (!consider_abort_common(state))
1527 		{
1528 			/* Store abbreviated key representation */
1529 			stup.datum1 = state->sortKeys->abbrev_converter(original,
1530 															state->sortKeys);
1531 		}
1532 		else
1533 		{
1534 			/* Abort abbreviation */
1535 			int			i;
1536 
1537 			stup.datum1 = original;
1538 
1539 			/*
1540 			 * Set state to be consistent with never trying abbreviation.
1541 			 *
1542 			 * Alter datum1 representation in already-copied tuples, so as to
1543 			 * ensure a consistent representation (current tuple was just
1544 			 * handled).  It does not matter if some dumped tuples are already
1545 			 * sorted on tape, since serialized tuples lack abbreviated keys
1546 			 * (TSS_BUILDRUNS state prevents control reaching here in any
1547 			 * case).
1548 			 */
1549 			for (i = 0; i < state->memtupcount; i++)
1550 			{
1551 				SortTuple  *mtup = &state->memtuples[i];
1552 
1553 				mtup->datum1 = PointerGetDatum(mtup->tuple);
1554 			}
1555 		}
1556 	}
1557 
1558 	puttuple_common(state, &stup);
1559 
1560 	MemoryContextSwitchTo(oldcontext);
1561 }
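/*
 * For orientation, a hedged sketch of how a caller typically drives the
 * Datum path above.  This is only an illustration of the expected call
 * sequence, not code from this file; the tuplesort_begin_datum() arguments
 * are the caller's responsibility and are elided here:
 *
 *		Tuplesortstate *sortstate = tuplesort_begin_datum(... per caller ...);
 *
 *		for each input value:
 *			tuplesort_putdatum(sortstate, value, value_is_null);
 *
 *		tuplesort_performsort(sortstate);
 *
 *		while (tuplesort_getdatum(sortstate, true, &value, &isnull, NULL))
 *			... consume value ...
 *
 *		tuplesort_end(sortstate);
 */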
1562 
1563 /*
1564  * Shared code for tuple and datum cases.
1565  */
1566 static void
1567 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
1568 {
1569 	switch (state->status)
1570 	{
1571 		case TSS_INITIAL:
1572 
1573 			/*
1574 			 * Save the tuple into the unsorted array.  First, grow the array
1575 			 * as needed.  Note that we try to grow the array when there is
1576 			 * still one free slot remaining --- if we fail, there'll still be
1577 			 * room to store the incoming tuple, and then we'll switch to
1578 			 * tape-based operation.
1579 			 */
1580 			if (state->memtupcount >= state->memtupsize - 1)
1581 			{
1582 				(void) grow_memtuples(state);
1583 				Assert(state->memtupcount < state->memtupsize);
1584 			}
1585 			state->memtuples[state->memtupcount++] = *tuple;
1586 
1587 			/*
1588 			 * Check if it's time to switch over to a bounded heapsort. We do
1589 			 * so if the input tuple count exceeds twice the desired tuple
1590 			 * count (this is a heuristic for where heapsort becomes cheaper
1591 			 * than a quicksort), or if we've just filled workMem and have
1592 			 * enough tuples to meet the bound.
1593 			 *
1594 			 * Note that once we enter TSS_BOUNDED state we will always try to
1595 			 * complete the sort that way.  In the worst case, if later input
1596 			 * tuples are larger than earlier ones, this might cause us to
1597 			 * exceed workMem significantly.
1598 			 */
1599 			if (state->bounded &&
1600 				(state->memtupcount > state->bound * 2 ||
1601 				 (state->memtupcount > state->bound && LACKMEM(state))))
1602 			{
1603 #ifdef TRACE_SORT
1604 				if (trace_sort)
1605 					elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1606 						 state->memtupcount,
1607 						 pg_rusage_show(&state->ru_start));
1608 #endif
1609 				make_bounded_heap(state);
1610 				return;
1611 			}
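			/*
			 * Illustrative example of the switch condition above (numbers
			 * are hypothetical): for a query with LIMIT 100, bound is 100,
			 * so we switch to the bounded heap once the 201st tuple arrives,
			 * or as soon as workMem fills up after at least 101 tuples have
			 * been collected.
			 */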
1612 
1613 			/*
1614 			 * Done if we still fit in available memory and have array slots.
1615 			 */
1616 			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1617 				return;
1618 
1619 			/*
1620 			 * Nope; time to switch to tape-based operation.
1621 			 */
1622 			inittapes(state);
1623 
1624 			/*
1625 			 * Dump tuples until we are back under the limit.
1626 			 */
1627 			dumptuples(state, false);
1628 			break;
1629 
1630 		case TSS_BOUNDED:
1631 
1632 			/*
1633 			 * We don't want to grow the array here, so check whether the new
1634 			 * tuple can be discarded before putting it in.  This should be a
1635 			 * good speed optimization, too, since when there are many more
1636 			 * input tuples than the bound, most input tuples can be discarded
1637 			 * with just this one comparison.  Note that because we currently
1638 			 * have the sort direction reversed, we must check for <= not >=.
1639 			 */
1640 			if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1641 			{
1642 				/* new tuple <= top of the heap, so we can discard it */
1643 				free_sort_tuple(state, tuple);
1644 				CHECK_FOR_INTERRUPTS();
1645 			}
1646 			else
1647 			{
1648 				/* discard top of heap, replacing it with the new tuple */
1649 				free_sort_tuple(state, &state->memtuples[0]);
1650 				tuple->tupindex = 0;	/* not used */
1651 				tuplesort_heap_replace_top(state, tuple, false);
1652 			}
1653 			break;
1654 
1655 		case TSS_BUILDRUNS:
1656 
1657 			/*
1658 			 * Insert the tuple into the heap, with run number currentRun if
1659 			 * it can go into the current run, else HEAP_RUN_NEXT.  The tuple
1660 			 * can go into the current run if it is >= the first
1661 			 * not-yet-output tuple.  (Actually, it could go into the current
1662 			 * run if it is >= the most recently output tuple ... but that
1663 			 * would require keeping around the tuple we last output, and it's
1664 			 * simplest to let writetup free each tuple as soon as it's
1665 			 * written.)
1666 			 *
1667 			 * Note that this only applies when:
1668 			 *
1669 			 * - currentRun is RUN_FIRST
1670 			 *
1671 			 * - Replacement selection is in use (typically it is never used).
1672 			 *
1673 			 * When these two conditions are not both true, all tuples are
1674 			 * appended indifferently, much like the TSS_INITIAL case.
1675 			 *
1676 			 * There should always be room to store the incoming tuple.
1677 			 */
1678 			Assert(!state->replaceActive || state->memtupcount > 0);
1679 			if (state->replaceActive &&
1680 				COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
1681 			{
1682 				Assert(state->currentRun == RUN_FIRST);
1683 
1684 				/*
1685 				 * Insert tuple into first, fully heapified run.
1686 				 *
1687 				 * Unlike classic replacement selection, which this module was
1688 				 * previously based on, only RUN_FIRST tuples are fully
1689 				 * heapified.  Any second/next run tuples are appended
1690 				 * indifferently.  While HEAP_RUN_NEXT tuples may be sifted
1691 				 * out of the way of first run tuples, COMPARETUP() will never
1692 				 * be called for the run's tuples during sifting (only our
1693 				 * initial COMPARETUP() call is required for the tuple, to
1694 				 * determine that the tuple does not belong in RUN_FIRST).
1695 				 */
1696 				tuple->tupindex = state->currentRun;
1697 				tuplesort_heap_insert(state, tuple, true);
1698 			}
1699 			else
1700 			{
1701 				/*
1702 				 * Tuple was determined to not belong to heapified RUN_FIRST,
1703 				 * or replacement selection not in play.  Append the tuple to
1704 				 * memtuples indifferently.
1705 				 *
1706 				 * dumptuples() does not trust that the next run's tuples are
1707 				 * heapified.  Anything past the first run will always be
1708 				 * quicksorted even when replacement selection is initially
1709 				 * used.  (When it's never used, every tuple still takes this
1710 				 * path.)
1711 				 */
1712 				tuple->tupindex = HEAP_RUN_NEXT;
1713 				state->memtuples[state->memtupcount++] = *tuple;
1714 			}
1715 
1716 			/*
1717 			 * If we are over the memory limit, dump tuples till we're under.
1718 			 */
1719 			dumptuples(state, false);
1720 			break;
1721 
1722 		default:
1723 			elog(ERROR, "invalid tuplesort state");
1724 			break;
1725 	}
1726 }
1727 
1728 static bool
1729 consider_abort_common(Tuplesortstate *state)
1730 {
1731 	Assert(state->sortKeys[0].abbrev_converter != NULL);
1732 	Assert(state->sortKeys[0].abbrev_abort != NULL);
1733 	Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
1734 
1735 	/*
1736 	 * Check effectiveness of abbreviation optimization.  Consider aborting
1737 	 * when still within memory limit.
1738 	 */
1739 	if (state->status == TSS_INITIAL &&
1740 		state->memtupcount >= state->abbrevNext)
1741 	{
1742 		state->abbrevNext *= 2;
1743 
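		/*
		 * Because abbrevNext doubles on each check, effectiveness is
		 * re-evaluated at geometrically spaced tuple counts, so only
		 * O(log N) checks are performed over the whole input.
		 */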
1744 		/*
1745 		 * Check opclass-supplied abbreviation abort routine.  It may indicate
1746 		 * that abbreviation should not proceed.
1747 		 */
1748 		if (!state->sortKeys->abbrev_abort(state->memtupcount,
1749 										   state->sortKeys))
1750 			return false;
1751 
1752 		/*
1753 		 * Finally, restore authoritative comparator, and indicate that
1754 		 * abbreviation is not in play by setting abbrev_converter to NULL
1755 		 */
1756 		state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
1757 		state->sortKeys[0].abbrev_converter = NULL;
1758 		/* Not strictly necessary, but be tidy */
1759 		state->sortKeys[0].abbrev_abort = NULL;
1760 		state->sortKeys[0].abbrev_full_comparator = NULL;
1761 
1762 		/* Give up - expect original pass-by-value representation */
1763 		return true;
1764 	}
1765 
1766 	return false;
1767 }
1768 
1769 /*
1770  * All tuples have been provided; finish the sort.
1771  */
1772 void
1773 tuplesort_performsort(Tuplesortstate *state)
1774 {
1775 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1776 
1777 #ifdef TRACE_SORT
1778 	if (trace_sort)
1779 		elog(LOG, "performsort starting: %s",
1780 			 pg_rusage_show(&state->ru_start));
1781 #endif
1782 
1783 	switch (state->status)
1784 	{
1785 		case TSS_INITIAL:
1786 
1787 			/*
1788 			 * We were able to accumulate all the tuples within the allowed
1789 			 * amount of memory.  Just qsort 'em and we're done.
1790 			 */
1791 			tuplesort_sort_memtuples(state);
1792 			state->current = 0;
1793 			state->eof_reached = false;
1794 			state->markpos_offset = 0;
1795 			state->markpos_eof = false;
1796 			state->status = TSS_SORTEDINMEM;
1797 			break;
1798 
1799 		case TSS_BOUNDED:
1800 
1801 			/*
1802 			 * We were able to accumulate all the tuples required for output
1803 			 * in memory, using a heap to eliminate excess tuples.  Now we
1804 			 * have to transform the heap to a properly-sorted array.
1805 			 */
1806 			sort_bounded_heap(state);
1807 			state->current = 0;
1808 			state->eof_reached = false;
1809 			state->markpos_offset = 0;
1810 			state->markpos_eof = false;
1811 			state->status = TSS_SORTEDINMEM;
1812 			break;
1813 
1814 		case TSS_BUILDRUNS:
1815 
1816 			/*
1817 			 * Finish tape-based sort.  First, flush all tuples remaining in
1818 			 * memory out to tape; then merge until we have a single remaining
1819 			 * run (or, if !randomAccess, one run per tape). Note that
1820 			 * mergeruns sets the correct state->status.
1821 			 */
1822 			dumptuples(state, true);
1823 			mergeruns(state);
1824 			state->eof_reached = false;
1825 			state->markpos_block = 0L;
1826 			state->markpos_offset = 0;
1827 			state->markpos_eof = false;
1828 			break;
1829 
1830 		default:
1831 			elog(ERROR, "invalid tuplesort state");
1832 			break;
1833 	}
1834 
1835 #ifdef TRACE_SORT
1836 	if (trace_sort)
1837 	{
1838 		if (state->status == TSS_FINALMERGE)
1839 			elog(LOG, "performsort done (except %d-way final merge): %s",
1840 				 state->activeTapes,
1841 				 pg_rusage_show(&state->ru_start));
1842 		else
1843 			elog(LOG, "performsort done: %s",
1844 				 pg_rusage_show(&state->ru_start));
1845 	}
1846 #endif
1847 
1848 	MemoryContextSwitchTo(oldcontext);
1849 }
1850 
1851 /*
1852  * Internal routine to fetch the next tuple in either forward or back
1853  * direction into *stup.  Returns FALSE if no more tuples.
1854  * Returned tuple belongs to tuplesort memory context, and must not be freed
1855  * by caller.  Note that fetched tuple is stored in memory that may be
1856  * recycled by any future fetch.
1857  */
1858 static bool
1859 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
1860 						  SortTuple *stup)
1861 {
1862 	unsigned int tuplen;
1863 	size_t		nmoved;
1864 
1865 	switch (state->status)
1866 	{
1867 		case TSS_SORTEDINMEM:
1868 			Assert(forward || state->randomAccess);
1869 			Assert(!state->slabAllocatorUsed);
1870 			if (forward)
1871 			{
1872 				if (state->current < state->memtupcount)
1873 				{
1874 					*stup = state->memtuples[state->current++];
1875 					return true;
1876 				}
1877 				state->eof_reached = true;
1878 
1879 				/*
1880 				 * Complain if caller tries to retrieve more tuples than
1881 				 * originally asked for in a bounded sort.  This is because
1882 				 * returning EOF here might be the wrong thing.
1883 				 */
1884 				if (state->bounded && state->current >= state->bound)
1885 					elog(ERROR, "retrieved too many tuples in a bounded sort");
1886 
1887 				return false;
1888 			}
1889 			else
1890 			{
1891 				if (state->current <= 0)
1892 					return false;
1893 
1894 				/*
1895 				 * If all tuples have already been fetched, return the last
1896 				 * tuple; otherwise, the tuple before the last one returned.
1897 				 */
1898 				if (state->eof_reached)
1899 					state->eof_reached = false;
1900 				else
1901 				{
1902 					state->current--;	/* last returned tuple */
1903 					if (state->current <= 0)
1904 						return false;
1905 				}
1906 				*stup = state->memtuples[state->current - 1];
1907 				return true;
1908 			}
1909 			break;
1910 
1911 		case TSS_SORTEDONTAPE:
1912 			Assert(forward || state->randomAccess);
1913 			Assert(state->slabAllocatorUsed);
1914 
1915 			/*
1916 			 * The slot that held the tuple that we returned in previous
1917 			 * gettuple call can now be reused.
1918 			 */
1919 			if (state->lastReturnedTuple)
1920 			{
1921 				RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
1922 				state->lastReturnedTuple = NULL;
1923 			}
1924 
1925 			if (forward)
1926 			{
1927 				if (state->eof_reached)
1928 					return false;
1929 
1930 				if ((tuplen = getlen(state, state->result_tape, true)) != 0)
1931 				{
1932 					READTUP(state, stup, state->result_tape, tuplen);
1933 
1934 					/*
1935 					 * Remember the tuple we return, so that we can recycle
1936 					 * its memory on next call.  (This can be NULL, in the
1937 					 * !state->tuples case).
1938 					 */
1939 					state->lastReturnedTuple = stup->tuple;
1940 
1941 					return true;
1942 				}
1943 				else
1944 				{
1945 					state->eof_reached = true;
1946 					return false;
1947 				}
1948 			}
1949 
1950 			/*
1951 			 * Backward.
1952 			 *
1953 			 * If all tuples have already been fetched, return the last tuple;
1954 			 * otherwise, the tuple before the last one returned.
1955 			 */
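			/*
			 * Sketch of the on-tape layout this backward logic assumes
			 * (based on a reading of the WRITETUP/READTUP routines elsewhere
			 * in this file; shown only for orientation).  When randomAccess
			 * was requested, each tuple is written as
			 *
			 *		[length word][tuple body][length word]
			 *
			 * and the run ends with a zero length word, so backing up by
			 * sizeof(unsigned int) multiples lands on a trailing length
			 * word, from which the previous tuple can be located and
			 * re-read.
			 */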
1956 			if (state->eof_reached)
1957 			{
1958 				/*
1959 				 * Seek position is pointing just past the zero tuplen at the
1960 				 * end of file; back up to fetch last tuple's ending length
1961 				 * word.  If seek fails we must have a completely empty file.
1962 				 */
1963 				nmoved = LogicalTapeBackspace(state->tapeset,
1964 											  state->result_tape,
1965 											  2 * sizeof(unsigned int));
1966 				if (nmoved == 0)
1967 					return false;
1968 				else if (nmoved != 2 * sizeof(unsigned int))
1969 					elog(ERROR, "unexpected tape position");
1970 				state->eof_reached = false;
1971 			}
1972 			else
1973 			{
1974 				/*
1975 				 * Back up and fetch previously-returned tuple's ending length
1976 				 * word.  If seek fails, assume we are at start of file.
1977 				 */
1978 				nmoved = LogicalTapeBackspace(state->tapeset,
1979 											  state->result_tape,
1980 											  sizeof(unsigned int));
1981 				if (nmoved == 0)
1982 					return false;
1983 				else if (nmoved != sizeof(unsigned int))
1984 					elog(ERROR, "unexpected tape position");
1985 				tuplen = getlen(state, state->result_tape, false);
1986 
1987 				/*
1988 				 * Back up to get ending length word of tuple before it.
1989 				 */
1990 				nmoved = LogicalTapeBackspace(state->tapeset,
1991 											  state->result_tape,
1992 											  tuplen + 2 * sizeof(unsigned int));
1993 				if (nmoved == tuplen + sizeof(unsigned int))
1994 				{
1995 					/*
1996 					 * We backed up over the previous tuple, but there was no
1997 					 * ending length word before it.  That means that the prev
1998 					 * tuple is the first tuple in the file.  It is now the
1999 					 * next to read in forward direction (not obviously right,
2000 					 * but that is what in-memory case does).
2001 					 */
2002 					return false;
2003 				}
2004 				else if (nmoved != tuplen + 2 * sizeof(unsigned int))
2005 					elog(ERROR, "bogus tuple length in backward scan");
2006 			}
2007 
2008 			tuplen = getlen(state, state->result_tape, false);
2009 
2010 			/*
2011 			 * Now we have the length of the prior tuple, back up and read it.
2012 			 * Note: READTUP expects we are positioned after the initial
2013 			 * length word of the tuple, so back up to that point.
2014 			 */
2015 			nmoved = LogicalTapeBackspace(state->tapeset,
2016 										  state->result_tape,
2017 										  tuplen);
2018 			if (nmoved != tuplen)
2019 				elog(ERROR, "bogus tuple length in backward scan");
2020 			READTUP(state, stup, state->result_tape, tuplen);
2021 
2022 			/*
2023 			 * Remember the tuple we return, so that we can recycle its memory
2024 			 * on next call. (This can be NULL, in the Datum case).
2025 			 */
2026 			state->lastReturnedTuple = stup->tuple;
2027 
2028 			return true;
2029 
2030 		case TSS_FINALMERGE:
2031 			Assert(forward);
2032 			/* We are managing memory ourselves, with the slab allocator. */
2033 			Assert(state->slabAllocatorUsed);
2034 
2035 			/*
2036 			 * The slab slot holding the tuple that we returned in previous
2037 			 * gettuple call can now be reused.
2038 			 */
2039 			if (state->lastReturnedTuple)
2040 			{
2041 				RELEASE_SLAB_SLOT(state, state->lastReturnedTuple);
2042 				state->lastReturnedTuple = NULL;
2043 			}
2044 
2045 			/*
2046 			 * This code should match the inner loop of mergeonerun().
2047 			 */
2048 			if (state->memtupcount > 0)
2049 			{
2050 				int			srcTape = state->memtuples[0].tupindex;
2051 				SortTuple	newtup;
2052 
2053 				*stup = state->memtuples[0];
2054 
2055 				/*
2056 				 * Remember the tuple we return, so that we can recycle its
2057 				 * memory on next call. (This can be NULL, in the Datum case).
2058 				 */
2059 				state->lastReturnedTuple = stup->tuple;
2060 
2061 				/*
2062 				 * Pull next tuple from tape, and replace the returned tuple
2063 				 * at top of the heap with it.
2064 				 */
2065 				if (!mergereadnext(state, srcTape, &newtup))
2066 				{
2067 					/*
2068 					 * If no more data, we've reached end of run on this tape.
2069 					 * Remove the top node from the heap.
2070 					 */
2071 					tuplesort_heap_delete_top(state, false);
2072 
2073 					/*
2074 					 * Rewind to free the read buffer.  It'd go away at the
2075 					 * end of the sort anyway, but better to release the
2076 					 * memory early.
2077 					 */
2078 					LogicalTapeRewindForWrite(state->tapeset, srcTape);
2079 					return true;
2080 				}
2081 				newtup.tupindex = srcTape;
2082 				tuplesort_heap_replace_top(state, &newtup, false);
2083 				return true;
2084 			}
2085 			return false;
2086 
2087 		default:
2088 			elog(ERROR, "invalid tuplesort state");
2089 			return false;		/* keep compiler quiet */
2090 	}
2091 }
2092 
2093 /*
2094  * Fetch the next tuple in either forward or back direction.
2095  * If successful, put tuple in slot and return TRUE; else, clear the slot
2096  * and return FALSE.
2097  *
2098  * Caller may optionally be passed back abbreviated value (on TRUE return
2099  * value) when abbreviation was used, which can be used to cheaply avoid
2100  * equality checks that might otherwise be required.  Caller can safely make a
2101  * determination of "non-equal tuple" based on simple binary inequality.  A
2102  * NULL value in leading attribute will set abbreviated value to zeroed
2103  * representation, which caller may rely on in abbreviated inequality check.
2104  *
2105  * If copy is true, the slot receives a tuple that's been copied into the
2106  * caller's memory context, so that it will stay valid regardless of future
2107  * manipulations of the tuplesort's state (up to and including deleting the
2108  * tuplesort).  If copy is false, the slot will just receive a pointer to a
2109  * tuple held within the tuplesort, which is more efficient, but only safe for
2110  * callers that are prepared to have any subsequent manipulation of the
2111  * tuplesort's state invalidate slot contents.
2112  */
2113 bool
2114 tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
2115 					   TupleTableSlot *slot, Datum *abbrev)
2116 {
2117 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2118 	SortTuple	stup;
2119 
2120 	if (!tuplesort_gettuple_common(state, forward, &stup))
2121 		stup.tuple = NULL;
2122 
2123 	MemoryContextSwitchTo(oldcontext);
2124 
2125 	if (stup.tuple)
2126 	{
2127 		/* Record abbreviated key for caller */
2128 		if (state->sortKeys->abbrev_converter && abbrev)
2129 			*abbrev = stup.datum1;
2130 
2131 		if (copy)
2132 			stup.tuple = heap_copy_minimal_tuple((MinimalTuple) stup.tuple);
2133 
2134 		ExecStoreMinimalTuple((MinimalTuple) stup.tuple, slot, copy);
2135 		return true;
2136 	}
2137 	else
2138 	{
2139 		ExecClearTuple(slot);
2140 		return false;
2141 	}
2142 }
2143 
2144 /*
2145  * Fetch the next tuple in either forward or back direction.
2146  * Returns NULL if no more tuples.  Returned tuple belongs to tuplesort memory
2147  * context, and must not be freed by caller.  Caller may not rely on tuple
2148  * remaining valid after any further manipulation of tuplesort.
2149  */
2150 HeapTuple
2151 tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
2152 {
2153 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2154 	SortTuple	stup;
2155 
2156 	if (!tuplesort_gettuple_common(state, forward, &stup))
2157 		stup.tuple = NULL;
2158 
2159 	MemoryContextSwitchTo(oldcontext);
2160 
2161 	return stup.tuple;
2162 }
2163 
2164 /*
2165  * Fetch the next index tuple in either forward or back direction.
2166  * Returns NULL if no more tuples.  Returned tuple belongs to tuplesort memory
2167  * context, and must not be freed by caller.  Caller may not rely on tuple
2168  * remaining valid after any further manipulation of tuplesort.
2169  */
2170 IndexTuple
2171 tuplesort_getindextuple(Tuplesortstate *state, bool forward)
2172 {
2173 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2174 	SortTuple	stup;
2175 
2176 	if (!tuplesort_gettuple_common(state, forward, &stup))
2177 		stup.tuple = NULL;
2178 
2179 	MemoryContextSwitchTo(oldcontext);
2180 
2181 	return (IndexTuple) stup.tuple;
2182 }
2183 
2184 /*
2185  * Fetch the next Datum in either forward or back direction.
2186  * Returns FALSE if no more datums.
2187  *
2188  * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
2189  * in caller's context, and is now owned by the caller (this differs from
2190  * similar routines for other types of tuplesorts).
2191  *
2192  * Caller may optionally be passed back abbreviated value (on TRUE return
2193  * value) when abbreviation was used, which can be used to cheaply avoid
2194  * equality checks that might otherwise be required.  Caller can safely make a
2195  * determination of "non-equal tuple" based on simple binary inequality.  A
2196  * NULL value will have a zeroed abbreviated value representation, which caller
2197  * may rely on in abbreviated inequality check.
2198  */
2199 bool
2200 tuplesort_getdatum(Tuplesortstate *state, bool forward,
2201 				   Datum *val, bool *isNull, Datum *abbrev)
2202 {
2203 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2204 	SortTuple	stup;
2205 
2206 	if (!tuplesort_gettuple_common(state, forward, &stup))
2207 	{
2208 		MemoryContextSwitchTo(oldcontext);
2209 		return false;
2210 	}
2211 
2212 	/* Ensure we copy into caller's memory context */
2213 	MemoryContextSwitchTo(oldcontext);
2214 
2215 	/* Record abbreviated key for caller */
2216 	if (state->sortKeys->abbrev_converter && abbrev)
2217 		*abbrev = stup.datum1;
2218 
2219 	if (stup.isnull1 || !state->tuples)
2220 	{
2221 		*val = stup.datum1;
2222 		*isNull = stup.isnull1;
2223 	}
2224 	else
2225 	{
2226 		/* use stup.tuple because stup.datum1 may be an abbreviation */
2227 		*val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen);
2228 		*isNull = false;
2229 	}
2230 
2231 	return true;
2232 }
2233 
2234 /*
2235  * Advance over N tuples in either forward or back direction,
2236  * without returning any data.  N==0 is a no-op.
2237  * Returns TRUE if successful, FALSE if ran out of tuples.
2238  */
2239 bool
2240 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
2241 {
2242 	MemoryContext oldcontext;
2243 
2244 	/*
2245 	 * We don't actually support backwards skip yet, because no callers need
2246 	 * it.  The API is designed to allow for that later, though.
2247 	 */
2248 	Assert(forward);
2249 	Assert(ntuples >= 0);
2250 
2251 	switch (state->status)
2252 	{
2253 		case TSS_SORTEDINMEM:
2254 			if (state->memtupcount - state->current >= ntuples)
2255 			{
2256 				state->current += ntuples;
2257 				return true;
2258 			}
2259 			state->current = state->memtupcount;
2260 			state->eof_reached = true;
2261 
2262 			/*
2263 			 * Complain if caller tries to retrieve more tuples than
2264 			 * originally asked for in a bounded sort.  This is because
2265 			 * returning EOF here might be the wrong thing.
2266 			 */
2267 			if (state->bounded && state->current >= state->bound)
2268 				elog(ERROR, "retrieved too many tuples in a bounded sort");
2269 
2270 			return false;
2271 
2272 		case TSS_SORTEDONTAPE:
2273 		case TSS_FINALMERGE:
2274 
2275 			/*
2276 			 * We could probably optimize these cases better, but for now it's
2277 			 * not worth the trouble.
2278 			 */
2279 			oldcontext = MemoryContextSwitchTo(state->sortcontext);
2280 			while (ntuples-- > 0)
2281 			{
2282 				SortTuple	stup;
2283 
2284 				if (!tuplesort_gettuple_common(state, forward, &stup))
2285 				{
2286 					MemoryContextSwitchTo(oldcontext);
2287 					return false;
2288 				}
2289 				CHECK_FOR_INTERRUPTS();
2290 			}
2291 			MemoryContextSwitchTo(oldcontext);
2292 			return true;
2293 
2294 		default:
2295 			elog(ERROR, "invalid tuplesort state");
2296 			return false;		/* keep compiler quiet */
2297 	}
2298 }
2299 
2300 /*
2301  * tuplesort_merge_order - report merge order we'll use for given memory
2302  * (note: "merge order" just means the number of input tapes in the merge).
2303  *
2304  * This is exported for use by the planner.  allowedMem is in bytes.
2305  */
2306 int
2307 tuplesort_merge_order(int64 allowedMem)
2308 {
2309 	int			mOrder;
2310 
2311 	/*
2312 	 * We need one tape for each merge input, plus another one for the output,
2313 	 * and each of these tapes needs buffer space.  In addition we want
2314 	 * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't
2315 	 * count).
2316 	 *
2317 	 * Note: you might be thinking we need to account for the memtuples[]
2318 	 * array in this calculation, but we effectively treat that as part of the
2319 	 * MERGE_BUFFER_SIZE workspace.
2320 	 */
2321 	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
2322 		(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
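	/*
	 * Hedged example with made-up figures (the real values of
	 * TAPE_BUFFER_OVERHEAD and MERGE_BUFFER_SIZE are defined earlier in this
	 * file): if each tape's buffer overhead were 32kB and its merge
	 * workspace 256kB, then an allowedMem of 4MB would yield roughly
	 * (4MB - 32kB) / 288kB = 14 input tapes, before the MINORDER/MAXORDER
	 * clamping below is applied.
	 */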
2323 
2324 	/*
2325 	 * Even in minimum memory, use at least a MINORDER merge.  On the other
2326 	 * hand, even when we have lots of memory, do not use more than a MAXORDER
2327 	 * merge.  Tapes are pretty cheap, but they're not entirely free.  Each
2328 	 * additional tape reduces the amount of memory available to build runs,
2329 	 * which in turn can cause the same sort to need more runs, which makes
2330 	 * merging slower even if it can still be done in a single pass.  Also,
2331 	 * high order merges are quite slow due to CPU cache effects; it can be
2332 	 * faster to pay the I/O cost of a polyphase merge than to perform a
2333 	 * single merge pass across many hundreds of tapes.
2334 	 */
2335 	mOrder = Max(mOrder, MINORDER);
2336 	mOrder = Min(mOrder, MAXORDER);
2337 
2338 	return mOrder;
2339 }
2340 
2341 /*
2342  * useselection - determine algorithm to use to sort first run.
2343  *
2344  * It can sometimes be useful to use the replacement selection algorithm if it
2345  * results in one large run, and there is little available workMem.  See
2346  * remarks on RUN_SECOND optimization within dumptuples().
2347  */
2348 static bool
2349 useselection(Tuplesortstate *state)
2350 {
2351 	/*
2352 	 * memtupsize might be noticeably higher than memtupcount here in atypical
2353 	 * cases.  It seems slightly preferable to not allow recent outliers to
2354 	 * impact this determination.  Note that caller's trace_sort output
2355 	 * reports memtupcount instead.
2356 	 */
2357 	if (state->memtupsize <= replacement_sort_tuples)
2358 		return true;
2359 
2360 	return false;
2361 }
2362 
2363 /*
2364  * inittapes - initialize for tape sorting.
2365  *
2366  * This is called only if we have found we don't have room to sort in memory.
2367  */
2368 static void
2369 inittapes(Tuplesortstate *state)
2370 {
2371 	int			maxTapes,
2372 				j;
2373 	int64		tapeSpace;
2374 
2375 	/* Compute number of tapes to use: merge order plus 1 */
2376 	maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
2377 
2378 	state->maxTapes = maxTapes;
2379 	state->tapeRange = maxTapes - 1;
2380 
2381 #ifdef TRACE_SORT
2382 	if (trace_sort)
2383 		elog(LOG, "switching to external sort with %d tapes: %s",
2384 			 maxTapes, pg_rusage_show(&state->ru_start));
2385 #endif
2386 
2387 	/*
2388 	 * Decrease availMem to reflect the space needed for tape buffers, when
2389 	 * writing the initial runs; but don't decrease it to the point that we
2390 	 * have no room for tuples.  (That case is only likely to occur if sorting
2391 	 * pass-by-value Datums; in all other scenarios the memtuples[] array is
2392 	 * unlikely to occupy more than half of allowedMem.  In the pass-by-value
2393 	 * case it's not important to account for tuple space, so we don't care if
2394 	 * LACKMEM becomes inaccurate.)
2395 	 */
2396 	tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
2397 
2398 	if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
2399 		USEMEM(state, tapeSpace);
2400 
2401 	/*
2402 	 * Make sure that the temp file(s) underlying the tape set are created in
2403 	 * suitable temp tablespaces.
2404 	 */
2405 	PrepareTempTablespaces();
2406 
2407 	/*
2408 	 * Create the tape set and allocate the per-tape data arrays.
2409 	 */
2410 	state->tapeset = LogicalTapeSetCreate(maxTapes);
2411 
2412 	state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
2413 	state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
2414 	state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
2415 	state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
2416 	state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
2417 
2418 	/*
2419 	 * Give replacement selection a try based on user setting.  There will be
2420 	 * a switch to a simple hybrid sort-merge strategy after the first run
2421 	 * (iff we could not output one long run).
2422 	 */
2423 	state->replaceActive = useselection(state);
2424 
2425 	if (state->replaceActive)
2426 	{
2427 		/*
2428 		 * Convert the unsorted contents of memtuples[] into a heap. Each
2429 		 * tuple is marked as belonging to run number zero.
2430 		 *
2431 		 * NOTE: we pass false for checkIndex since there's no point in
2432 		 * comparing indexes in this step, even though we do intend the
2433 		 * indexes to be part of the sort key...
2434 		 */
2435 		int			ntuples = state->memtupcount;
2436 
2437 #ifdef TRACE_SORT
2438 		if (trace_sort)
2439 			elog(LOG, "replacement selection will sort %d first run tuples",
2440 				 state->memtupcount);
2441 #endif
2442 		state->memtupcount = 0; /* make the heap empty */
2443 
2444 		for (j = 0; j < ntuples; j++)
2445 		{
2446 			/* Must copy source tuple to avoid possible overwrite */
2447 			SortTuple	stup = state->memtuples[j];
2448 
2449 			stup.tupindex = RUN_FIRST;
2450 			tuplesort_heap_insert(state, &stup, false);
2451 		}
2452 		Assert(state->memtupcount == ntuples);
2453 	}
2454 
2455 	state->currentRun = RUN_FIRST;
2456 
2457 	/*
2458 	 * Initialize variables of Algorithm D (step D1).
2459 	 */
2460 	for (j = 0; j < maxTapes; j++)
2461 	{
2462 		state->tp_fib[j] = 1;
2463 		state->tp_runs[j] = 0;
2464 		state->tp_dummy[j] = 1;
2465 		state->tp_tapenum[j] = j;
2466 	}
2467 	state->tp_fib[state->tapeRange] = 0;
2468 	state->tp_dummy[state->tapeRange] = 0;
2469 
2470 	state->Level = 1;
2471 	state->destTape = 0;
2472 
2473 	state->status = TSS_BUILDRUNS;
2474 }
2475 
2476 /*
2477  * selectnewtape -- select new tape for new initial run.
2478  *
2479  * This is called after finishing a run when we know another run
2480  * must be started.  This implements steps D3, D4 of Algorithm D.
2481  */
2482 static void
2483 selectnewtape(Tuplesortstate *state)
2484 {
2485 	int			j;
2486 	int			a;
2487 
2488 	/* Step D3: advance j (destTape) */
2489 	if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
2490 	{
2491 		state->destTape++;
2492 		return;
2493 	}
2494 	if (state->tp_dummy[state->destTape] != 0)
2495 	{
2496 		state->destTape = 0;
2497 		return;
2498 	}
2499 
2500 	/* Step D4: increase level */
2501 	state->Level++;
2502 	a = state->tp_fib[0];
2503 	for (j = 0; j < state->tapeRange; j++)
2504 	{
2505 		state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j];
2506 		state->tp_fib[j] = a + state->tp_fib[j + 1];
2507 	}
2508 	state->destTape = 0;
2509 }
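/*
 * Worked example of the level increase above, for 3 input tapes plus one
 * output tape (tapeRange = 3).  Starting from the initial distribution set
 * up by inittapes(), each pass through step D4 produces the next "perfect"
 * polyphase distribution of runs across the input tapes:
 *
 *		Level 1: tp_fib = {1, 1, 1}		(3 runs)
 *		Level 2: tp_fib = {2, 2, 1}		(5 runs)
 *		Level 3: tp_fib = {4, 3, 2}		(9 runs)
 *
 * Dummy runs cover the gap when the actual number of real runs falls short
 * of the target for the current level.
 */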
2510 
2511 /*
2512  * Initialize the slab allocation arena, for the given number of slots.
2513  */
2514 static void
2515 init_slab_allocator(Tuplesortstate *state, int numSlots)
2516 {
2517 	if (numSlots > 0)
2518 	{
2519 		char	   *p;
2520 		int			i;
2521 
2522 		state->slabMemoryBegin = palloc(numSlots * SLAB_SLOT_SIZE);
2523 		state->slabMemoryEnd = state->slabMemoryBegin +
2524 			numSlots * SLAB_SLOT_SIZE;
2525 		state->slabFreeHead = (SlabSlot *) state->slabMemoryBegin;
2526 		USEMEM(state, numSlots * SLAB_SLOT_SIZE);
2527 
2528 		p = state->slabMemoryBegin;
2529 		for (i = 0; i < numSlots - 1; i++)
2530 		{
2531 			((SlabSlot *) p)->nextfree = (SlabSlot *) (p + SLAB_SLOT_SIZE);
2532 			p += SLAB_SLOT_SIZE;
2533 		}
2534 		((SlabSlot *) p)->nextfree = NULL;
2535 	}
2536 	else
2537 	{
2538 		state->slabMemoryBegin = state->slabMemoryEnd = NULL;
2539 		state->slabFreeHead = NULL;
2540 	}
2541 	state->slabAllocatorUsed = true;
2542 }
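/*
 * After initialization the arena is a singly-linked free list threaded
 * through the slots themselves; for example, with numSlots = 3:
 *
 *		slot0.nextfree -> slot1
 *		slot1.nextfree -> slot2
 *		slot2.nextfree -> NULL
 *
 * with slabFreeHead pointing at slot0.  (This just illustrates the loop in
 * the function above.)
 */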
2543 
2544 /*
2545  * mergeruns -- merge all the completed initial runs.
2546  *
2547  * This implements steps D5, D6 of Algorithm D.  All input data has
2548  * already been written to initial runs on tape (see dumptuples).
2549  */
2550 static void
2551 mergeruns(Tuplesortstate *state)
2552 {
2553 	int			tapenum,
2554 				svTape,
2555 				svRuns,
2556 				svDummy;
2557 	int			numTapes;
2558 	int			numInputTapes;
2559 
2560 	Assert(state->status == TSS_BUILDRUNS);
2561 	Assert(state->memtupcount == 0);
2562 
2563 	if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
2564 	{
2565 		/*
2566 		 * If there are multiple runs to be merged, when we go to read back
2567 		 * tuples from disk, abbreviated keys will not have been stored, and
2568 		 * we don't care to regenerate them.  Disable abbreviation from this
2569 		 * point on.
2570 		 */
2571 		state->sortKeys->abbrev_converter = NULL;
2572 		state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
2573 
2574 		/* Not strictly necessary, but be tidy */
2575 		state->sortKeys->abbrev_abort = NULL;
2576 		state->sortKeys->abbrev_full_comparator = NULL;
2577 	}
2578 
2579 	/*
2580 	 * Reset tuple memory.  We've freed all the tuples that we previously
2581 	 * allocated.  We will use the slab allocator from now on.
2582 	 */
2583 	MemoryContextDelete(state->tuplecontext);
2584 	state->tuplecontext = NULL;
2585 
2586 	/*
2587 	 * We no longer need a large memtuples array.  (We will allocate a smaller
2588 	 * one for the heap later.)
2589 	 */
2590 	FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
2591 	pfree(state->memtuples);
2592 	state->memtuples = NULL;
2593 
2594 	/*
2595 	 * If we had fewer runs than tapes, refund the memory that we imagined we
2596 	 * would need for the tape buffers of the unused tapes.
2597 	 *
2598 	 * numTapes and numInputTapes reflect the actual number of tapes we will
2599 	 * use.  Note that the output tape's tape number is maxTapes - 1, so the
2600 	 * tape numbers of the used tapes are not consecutive, and you cannot just
2601 	 * loop from 0 to numTapes to visit all used tapes!
2602 	 */
2603 	if (state->Level == 1)
2604 	{
2605 		numInputTapes = state->currentRun;
2606 		numTapes = numInputTapes + 1;
2607 		FREEMEM(state, (state->maxTapes - numTapes) * TAPE_BUFFER_OVERHEAD);
2608 	}
2609 	else
2610 	{
2611 		numInputTapes = state->tapeRange;
2612 		numTapes = state->maxTapes;
2613 	}
2614 
2615 	/*
2616 	 * Initialize the slab allocator.  We need one slab slot per input tape,
2617 	 * for the tuples in the heap, plus one to hold the tuple last returned
2618 	 * from tuplesort_gettuple.  (If we're sorting pass-by-val Datums,
2619 	 * however, we don't need to allocate anything.)
2620 	 *
2621 	 * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
2622 	 * to track memory usage of individual tuples.
2623 	 */
2624 	if (state->tuples)
2625 		init_slab_allocator(state, numInputTapes + 1);
2626 	else
2627 		init_slab_allocator(state, 0);
2628 
2629 	/*
2630 	 * If we produced only one initial run (quite likely if the total data
2631 	 * volume is between 1X and 2X workMem when replacement selection is used,
2632 	 * but something we particularly count on when input is presorted), we can
2633 	 * just use that tape as the finished output, rather than doing a useless
2634 	 * merge.  (This obvious optimization is not in Knuth's algorithm.)
2635 	 */
2636 	if (state->currentRun == RUN_SECOND)
2637 	{
2638 		state->result_tape = state->tp_tapenum[state->destTape];
2639 		/* must freeze and rewind the finished output tape */
2640 		LogicalTapeFreeze(state->tapeset, state->result_tape);
2641 		state->status = TSS_SORTEDONTAPE;
2642 		return;
2643 	}
2644 
2645 	/*
2646 	 * Allocate a new 'memtuples' array, for the heap.  It will hold one tuple
2647 	 * from each input tape.
2648 	 */
2649 	state->memtupsize = numInputTapes;
2650 	state->memtuples = (SortTuple *) palloc(numInputTapes * sizeof(SortTuple));
2651 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
2652 
2653 	/*
2654 	 * Use all the remaining memory we have available for read buffers among
2655 	 * the input tapes.
2656 	 *
2657 	 * We do this only after checking for the case that we produced only one
2658 	 * initial run, because there is no need to use a large read buffer when
2659 	 * we're reading from a single tape.  With one tape, the I/O pattern will
2660 	 * be the same regardless of the buffer size.
2661 	 *
2662 	 * We don't try to "rebalance" the memory among tapes, when we start a new
2663 	 * merge phase, even if some tapes are inactive in the new phase.  That
2664 	 * would be hard, because logtape.c doesn't know where one run ends and
2665 	 * another begins.  When a new merge phase begins, and a tape doesn't
2666 	 * participate in it, its buffer nevertheless already contains tuples from
2667 	 * the next run on the same tape, so we cannot release the buffer.  That's
2668 	 * OK in practice; merge performance isn't that sensitive to the amount of
2669 	 * buffer space used, and most merge phases use all or almost all tapes
2670 	 * anyway.
2671 	 */
2672 #ifdef TRACE_SORT
2673 	if (trace_sort)
2674 		elog(LOG, "using " INT64_FORMAT " KB of memory for read buffers among %d input tapes",
2675 			 (state->availMem) / 1024, numInputTapes);
2676 #endif
2677 
2678 	state->read_buffer_size = Max(state->availMem / numInputTapes, 0);
2679 	USEMEM(state, state->read_buffer_size * numInputTapes);
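	/*
	 * For instance (illustrative figures only), with 6MB of availMem
	 * remaining and 6 input tapes, each tape gets a 1MB read buffer.  The
	 * Max(..., 0) above guards against availMem having gone (harmlessly)
	 * negative.
	 */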
2680 
2681 	/* End of step D2: rewind all output tapes to prepare for merging */
2682 	for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2683 		LogicalTapeRewindForRead(state->tapeset, tapenum, state->read_buffer_size);
2684 
2685 	for (;;)
2686 	{
2687 		/*
2688 		 * At this point we know that tape[T] is empty.  If there's just one
2689 		 * (real or dummy) run left on each input tape, then only one merge
2690 		 * pass remains.  If we don't have to produce a materialized sorted
2691 		 * tape, we can stop at this point and do the final merge on-the-fly.
2692 		 */
2693 		if (!state->randomAccess)
2694 		{
2695 			bool		allOneRun = true;
2696 
2697 			Assert(state->tp_runs[state->tapeRange] == 0);
2698 			for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2699 			{
2700 				if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
2701 				{
2702 					allOneRun = false;
2703 					break;
2704 				}
2705 			}
2706 			if (allOneRun)
2707 			{
2708 				/* Tell logtape.c we won't be writing anymore */
2709 				LogicalTapeSetForgetFreeSpace(state->tapeset);
2710 				/* Initialize for the final merge pass */
2711 				beginmerge(state);
2712 				state->status = TSS_FINALMERGE;
2713 				return;
2714 			}
2715 		}
2716 
2717 		/* Step D5: merge runs onto tape[T] until tape[P] is empty */
2718 		while (state->tp_runs[state->tapeRange - 1] ||
2719 			   state->tp_dummy[state->tapeRange - 1])
2720 		{
2721 			bool		allDummy = true;
2722 
2723 			for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2724 			{
2725 				if (state->tp_dummy[tapenum] == 0)
2726 				{
2727 					allDummy = false;
2728 					break;
2729 				}
2730 			}
2731 
2732 			if (allDummy)
2733 			{
2734 				state->tp_dummy[state->tapeRange]++;
2735 				for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2736 					state->tp_dummy[tapenum]--;
2737 			}
2738 			else
2739 				mergeonerun(state);
2740 		}
2741 
2742 		/* Step D6: decrease level */
2743 		if (--state->Level == 0)
2744 			break;
2745 		/* rewind output tape T to use as new input */
2746 		LogicalTapeRewindForRead(state->tapeset, state->tp_tapenum[state->tapeRange],
2747 								 state->read_buffer_size);
2748 		/* rewind used-up input tape P, and prepare it for write pass */
2749 		LogicalTapeRewindForWrite(state->tapeset, state->tp_tapenum[state->tapeRange - 1]);
2750 		state->tp_runs[state->tapeRange - 1] = 0;
2751 
2752 		/*
2753 		 * reassign tape units per step D6; note we no longer care about A[]
2754 		 */
2755 		svTape = state->tp_tapenum[state->tapeRange];
2756 		svDummy = state->tp_dummy[state->tapeRange];
2757 		svRuns = state->tp_runs[state->tapeRange];
2758 		for (tapenum = state->tapeRange; tapenum > 0; tapenum--)
2759 		{
2760 			state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1];
2761 			state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1];
2762 			state->tp_runs[tapenum] = state->tp_runs[tapenum - 1];
2763 		}
2764 		state->tp_tapenum[0] = svTape;
2765 		state->tp_dummy[0] = svDummy;
2766 		state->tp_runs[0] = svRuns;
2767 	}
2768 
2769 	/*
2770 	 * Done.  Knuth says that the result is on TAPE[1], but since we exited
2771 	 * the loop without performing the last iteration of step D6, we have not
2772 	 * rearranged the tape unit assignment, and therefore the result is on
2773 	 * TAPE[T].  We need to do it this way so that we can freeze the final
2774 	 * output tape while rewinding it.  The last iteration of step D6 would be
2775 	 * a waste of cycles anyway...
2776 	 */
2777 	state->result_tape = state->tp_tapenum[state->tapeRange];
2778 	LogicalTapeFreeze(state->tapeset, state->result_tape);
2779 	state->status = TSS_SORTEDONTAPE;
2780 
2781 	/* Release the read buffers of all the other tapes, by rewinding them. */
2782 	for (tapenum = 0; tapenum < state->maxTapes; tapenum++)
2783 	{
2784 		if (tapenum != state->result_tape)
2785 			LogicalTapeRewindForWrite(state->tapeset, tapenum);
2786 	}
2787 }
2788 
2789 /*
2790  * Merge one run from each input tape, except ones with dummy runs.
2791  *
2792  * This is the inner loop of Algorithm D step D5.  We know that the
2793  * output tape is TAPE[T].
2794  */
2795 static void
2796 mergeonerun(Tuplesortstate *state)
2797 {
2798 	int			destTape = state->tp_tapenum[state->tapeRange];
2799 	int			srcTape;
2800 
2801 	/*
2802 	 * Start the merge by loading one tuple from each active source tape into
2803 	 * the heap.  We can also decrease the input run/dummy run counts.
2804 	 */
2805 	beginmerge(state);
2806 
2807 	/*
2808 	 * Execute merge by repeatedly extracting lowest tuple in heap, writing it
2809 	 * out, and replacing it with next tuple from same tape (if there is
2810 	 * another one).
2811 	 */
2812 	while (state->memtupcount > 0)
2813 	{
2814 		SortTuple	stup;
2815 
2816 		/* write the tuple to destTape */
2817 		srcTape = state->memtuples[0].tupindex;
2818 		WRITETUP(state, destTape, &state->memtuples[0]);
2819 
2820 		/* recycle the slot of the tuple we just wrote out, for the next read */
2821 		if (state->memtuples[0].tuple)
2822 			RELEASE_SLAB_SLOT(state, state->memtuples[0].tuple);
2823 
2824 		/*
2825 		 * pull next tuple from the tape, and replace the written-out tuple in
2826 		 * the heap with it.
2827 		 */
2828 		if (mergereadnext(state, srcTape, &stup))
2829 		{
2830 			stup.tupindex = srcTape;
2831 			tuplesort_heap_replace_top(state, &stup, false);
2832 
2833 		}
2834 		else
2835 			tuplesort_heap_delete_top(state, false);
2836 	}
2837 
2838 	/*
2839 	 * When the heap empties, we're done.  Write an end-of-run marker on the
2840 	 * output tape, and increment its count of real runs.
2841 	 */
2842 	markrunend(state, destTape);
2843 	state->tp_runs[state->tapeRange]++;
2844 
2845 #ifdef TRACE_SORT
2846 	if (trace_sort)
2847 		elog(LOG, "finished %d-way merge step: %s", state->activeTapes,
2848 			 pg_rusage_show(&state->ru_start));
2849 #endif
2850 }
2851 
2852 /*
2853  * beginmerge - initialize for a merge pass
2854  *
2855  * We decrease the counts of real and dummy runs for each tape, and mark
2856  * which tapes contain active input runs in mergeactive[].  Then, fill the
2857  * merge heap with the first tuple from each active tape.
2858  */
2859 static void
2860 beginmerge(Tuplesortstate *state)
2861 {
2862 	int			activeTapes;
2863 	int			tapenum;
2864 	int			srcTape;
2865 
2866 	/* Heap should be empty here */
2867 	Assert(state->memtupcount == 0);
2868 
2869 	/* Adjust run counts and mark the active tapes */
2870 	memset(state->mergeactive, 0,
2871 		   state->maxTapes * sizeof(*state->mergeactive));
2872 	activeTapes = 0;
2873 	for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2874 	{
2875 		if (state->tp_dummy[tapenum] > 0)
2876 			state->tp_dummy[tapenum]--;
2877 		else
2878 		{
2879 			Assert(state->tp_runs[tapenum] > 0);
2880 			state->tp_runs[tapenum]--;
2881 			srcTape = state->tp_tapenum[tapenum];
2882 			state->mergeactive[srcTape] = true;
2883 			activeTapes++;
2884 		}
2885 	}
2886 	Assert(activeTapes > 0);
2887 	state->activeTapes = activeTapes;
2888 
2889 	/* Load the merge heap with the first tuple from each input tape */
2890 	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2891 	{
2892 		SortTuple	tup;
2893 
2894 		if (mergereadnext(state, srcTape, &tup))
2895 		{
2896 			tup.tupindex = srcTape;
2897 			tuplesort_heap_insert(state, &tup, false);
2898 		}
2899 	}
2900 }
2901 
2902 /*
2903  * mergereadnext - read next tuple from one merge input tape
2904  *
2905  * Returns false on EOF.
2906  */
2907 static bool
2908 mergereadnext(Tuplesortstate *state, int srcTape, SortTuple *stup)
2909 {
2910 	unsigned int tuplen;
2911 
2912 	if (!state->mergeactive[srcTape])
2913 		return false;			/* tape's run is already exhausted */
2914 
2915 	/* read next tuple, if any */
2916 	if ((tuplen = getlen(state, srcTape, true)) == 0)
2917 	{
2918 		state->mergeactive[srcTape] = false;
2919 		return false;
2920 	}
2921 	READTUP(state, stup, srcTape, tuplen);
2922 
2923 	return true;
2924 }
2925 
2926 /*
2927  * dumptuples - remove tuples from memtuples and write to tape
2928  *
2929  * This is used during initial-run building, but not during merging.
2930  *
2931  * When alltuples = false and replacement selection is still active, dump
2932  * only enough tuples to get under the availMem limit (and leave at least
2933  * one tuple in memtuples, since puttuple will then assume it is a heap that
2934  * has a tuple to compare to).  We always insist there be at least one free
2935  * slot in the memtuples[] array.
2936  *
2937  * When alltuples = true, dump everything currently in memory.  (This
2938  * case is only used at end of input data, although in practice only the
2939  * first run could fail to dump all tuples when we LACKMEM(), and only
2940  * when replacement selection is active.)
2941  *
2942  * If, when replacement selection is active, we see that the tuple run
2943  * number at the top of the heap has changed, start a new run.  This must be
2944  * the first run, because replacement selection is always abandoned for all
2945  * further runs.
2946  */
2947 static void
2948 dumptuples(Tuplesortstate *state, bool alltuples)
2949 {
2950 	while (alltuples ||
2951 		   (LACKMEM(state) && state->memtupcount > 1) ||
2952 		   state->memtupcount >= state->memtupsize)
2953 	{
2954 		if (state->replaceActive)
2955 		{
2956 			/*
2957 			 * Still holding out for a case favorable to replacement
2958 			 * selection.  Still spilling incrementally using the heap.
2959 			 *
2960 			 * Dump the heap's frontmost entry, and remove it from the heap.
2961 			 */
2962 			Assert(state->memtupcount > 0);
2963 			WRITETUP(state, state->tp_tapenum[state->destTape],
2964 					 &state->memtuples[0]);
2965 			tuplesort_heap_delete_top(state, true);
2966 		}
2967 		else
2968 		{
2969 			/*
2970 			 * Once committed to quicksorting runs, never incrementally spill
2971 			 */
2972 			dumpbatch(state, alltuples);
2973 			break;
2974 		}
2975 
2976 		/*
2977 		 * If top run number has changed, we've finished the current run (this
2978 		 * can only be the first run), and will no longer spill incrementally.
2979 		 */
2980 		if (state->memtupcount == 0 ||
2981 			state->memtuples[0].tupindex == HEAP_RUN_NEXT)
2982 		{
2983 			markrunend(state, state->tp_tapenum[state->destTape]);
2984 			Assert(state->currentRun == RUN_FIRST);
2985 			state->currentRun++;
2986 			state->tp_runs[state->destTape]++;
2987 			state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
2988 
2989 #ifdef TRACE_SORT
2990 			if (trace_sort)
2991 				elog(LOG, "finished incrementally writing %s run %d to tape %d: %s",
2992 					 (state->memtupcount == 0) ? "only" : "first",
2993 					 state->currentRun, state->destTape,
2994 					 pg_rusage_show(&state->ru_start));
2995 #endif
2996 
2997 			/*
2998 			 * Done if heap is empty, which is possible when there is only one
2999 			 * long run.
3000 			 */
3001 			Assert(state->currentRun == RUN_SECOND);
3002 			if (state->memtupcount == 0)
3003 			{
3004 				/*
3005 				 * Replacement selection best case; no final merge required,
3006 				 * because there was only one initial run (second run has no
3007 				 * tuples).  See RUN_SECOND case in mergeruns().
3008 				 */
3009 				break;
3010 			}
3011 
3012 			/*
3013 			 * Abandon replacement selection for second run (as well as any
3014 			 * subsequent runs).
3015 			 */
3016 			state->replaceActive = false;
3017 
3018 			/*
3019 			 * First tuple of next run should not be heapified, and so will
3020 			 * bear placeholder run number.  In practice this must actually be
3021 			 * the second run, which just became the currentRun, so we're
3022 			 * clear to quicksort and dump the tuples in batch next time
3023 			 * memtuples becomes full.
3024 			 */
3025 			Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT);
3026 			selectnewtape(state);
3027 		}
3028 	}
3029 }
3030 
3031 /*
3032  * dumpbatch - sort and dump all memtuples, forming one run on tape
3033  *
3034  * Second or subsequent runs are never heapified by this module (although
3035  * heapification still respects run number differences between the first and
3036  * second runs), and a heap (replacement selection priority queue) is often
3037  * avoided in the first place.
3038  */
3039 static void
3040 dumpbatch(Tuplesortstate *state, bool alltuples)
3041 {
3042 	int			memtupwrite;
3043 	int			i;
3044 
3045 	/*
3046 	 * Final call might require no sorting, in rare cases where we just so
3047 	 * happen to have previously LACKMEM()'d at the point where exactly all
3048 	 * remaining tuples are loaded into memory, just before input was
3049 	 * exhausted.
3050 	 *
3051 	 * In general, short final runs are quite possible.  Rather than allowing
3052 	 * a special case where there was a superfluous selectnewtape() call (i.e.
3053 	 * a call with no subsequent run actually written to destTape), we prefer
3054 	 * to write out a 0 tuple run.
3055 	 *
3056 	 * mergereadnext() is prepared for 0 tuple runs, and will reliably mark
3057 	 * the tape inactive for the merge when called from beginmerge().  This
3058 	 * case is therefore similar to the case where mergeonerun() finds a dummy
3059 	 * run for the tape, and so doesn't need to merge a run from the tape (or
3060 	 * conceptually "merges" the dummy run, if you prefer).  According to
3061 	 * Knuth, Algorithm D "isn't strictly optimal" in its method of
3062 	 * distribution and dummy run assignment; this edge case seems very
3063 	 * unlikely to make that appreciably worse.
3064 	 */
3065 	Assert(state->status == TSS_BUILDRUNS);
3066 
3067 	/*
3068 	 * It seems unlikely that this limit will ever be exceeded, but take no
3069 	 * chances
3070 	 */
3071 	if (state->currentRun == INT_MAX)
3072 		ereport(ERROR,
3073 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
3074 				 errmsg("cannot have more than %d runs for an external sort",
3075 						INT_MAX)));
3076 
3077 	state->currentRun++;
3078 
3079 #ifdef TRACE_SORT
3080 	if (trace_sort)
3081 		elog(LOG, "starting quicksort of run %d: %s",
3082 			 state->currentRun, pg_rusage_show(&state->ru_start));
3083 #endif
3084 
3085 	/*
3086 	 * Sort all tuples accumulated within the allowed amount of memory for
3087 	 * this run using quicksort
3088 	 */
3089 	tuplesort_sort_memtuples(state);
3090 
3091 #ifdef TRACE_SORT
3092 	if (trace_sort)
3093 		elog(LOG, "finished quicksort of run %d: %s",
3094 			 state->currentRun, pg_rusage_show(&state->ru_start));
3095 #endif
3096 
3097 	memtupwrite = state->memtupcount;
3098 	for (i = 0; i < memtupwrite; i++)
3099 	{
3100 		WRITETUP(state, state->tp_tapenum[state->destTape],
3101 				 &state->memtuples[i]);
3102 		state->memtupcount--;
3103 	}
3104 
3105 	/*
3106 	 * Reset tuple memory.  We've freed all of the tuples that we previously
3107 	 * allocated.  It's important to avoid fragmentation when there is a stark
3108 	 * change in the sizes of incoming tuples.  Fragmentation due to
3109 	 * AllocSetFree's bucketing by size class might be particularly bad if
3110 	 * this step wasn't taken.
3111 	 */
3112 	MemoryContextReset(state->tuplecontext);
3113 
3114 	markrunend(state, state->tp_tapenum[state->destTape]);
3115 	state->tp_runs[state->destTape]++;
3116 	state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
3117 
3118 #ifdef TRACE_SORT
3119 	if (trace_sort)
3120 		elog(LOG, "finished writing run %d to tape %d: %s",
3121 			 state->currentRun, state->destTape,
3122 			 pg_rusage_show(&state->ru_start));
3123 #endif
3124 
3125 	if (!alltuples)
3126 		selectnewtape(state);
3127 }
3128 
3129 /*
3130  * tuplesort_rescan		- rewind and replay the scan
3131  */
3132 void
3133 tuplesort_rescan(Tuplesortstate *state)
3134 {
3135 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3136 
3137 	Assert(state->randomAccess);
3138 
3139 	switch (state->status)
3140 	{
3141 		case TSS_SORTEDINMEM:
3142 			state->current = 0;
3143 			state->eof_reached = false;
3144 			state->markpos_offset = 0;
3145 			state->markpos_eof = false;
3146 			break;
3147 		case TSS_SORTEDONTAPE:
3148 			LogicalTapeRewindForRead(state->tapeset,
3149 									 state->result_tape,
3150 									 0);
3151 			state->eof_reached = false;
3152 			state->markpos_block = 0L;
3153 			state->markpos_offset = 0;
3154 			state->markpos_eof = false;
3155 			break;
3156 		default:
3157 			elog(ERROR, "invalid tuplesort state");
3158 			break;
3159 	}
3160 
3161 	MemoryContextSwitchTo(oldcontext);
3162 }
3163 
3164 /*
3165  * tuplesort_markpos	- saves current position in the merged sort file
3166  */
3167 void
3168 tuplesort_markpos(Tuplesortstate *state)
3169 {
3170 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3171 
3172 	Assert(state->randomAccess);
3173 
3174 	switch (state->status)
3175 	{
3176 		case TSS_SORTEDINMEM:
3177 			state->markpos_offset = state->current;
3178 			state->markpos_eof = state->eof_reached;
3179 			break;
3180 		case TSS_SORTEDONTAPE:
3181 			LogicalTapeTell(state->tapeset,
3182 							state->result_tape,
3183 							&state->markpos_block,
3184 							&state->markpos_offset);
3185 			state->markpos_eof = state->eof_reached;
3186 			break;
3187 		default:
3188 			elog(ERROR, "invalid tuplesort state");
3189 			break;
3190 	}
3191 
3192 	MemoryContextSwitchTo(oldcontext);
3193 }
3194 
3195 /*
3196  * tuplesort_restorepos - restores current position in merged sort file to
3197  *						  last saved position
3198  */
3199 void
3200 tuplesort_restorepos(Tuplesortstate *state)
3201 {
3202 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3203 
3204 	Assert(state->randomAccess);
3205 
3206 	switch (state->status)
3207 	{
3208 		case TSS_SORTEDINMEM:
3209 			state->current = state->markpos_offset;
3210 			state->eof_reached = state->markpos_eof;
3211 			break;
3212 		case TSS_SORTEDONTAPE:
3213 			LogicalTapeSeek(state->tapeset,
3214 							state->result_tape,
3215 							state->markpos_block,
3216 							state->markpos_offset);
3217 			state->eof_reached = state->markpos_eof;
3218 			break;
3219 		default:
3220 			elog(ERROR, "invalid tuplesort state");
3221 			break;
3222 	}
3223 
3224 	MemoryContextSwitchTo(oldcontext);
3225 }
3226 
3227 /*
3228  * tuplesort_get_stats - extract summary statistics
3229  *
3230  * This can be called after tuplesort_performsort() finishes to obtain
3231  * printable summary information about how the sort was performed.
3232  * spaceUsed is measured in kilobytes.
3233  */
3234 void
3235 tuplesort_get_stats(Tuplesortstate *state,
3236 					const char **sortMethod,
3237 					const char **spaceType,
3238 					long *spaceUsed)
3239 {
3240 	/*
3241 	 * Note: it might seem we should provide both memory and disk usage for a
3242 	 * disk-based sort.  However, the current code doesn't track memory space
3243 	 * accurately once we have begun to return tuples to the caller (since we
3244 	 * don't account for pfree's the caller is expected to do), so we cannot
3245 	 * rely on availMem in a disk sort.  This does not seem worth the overhead
3246 	 * to fix.  Is it worth creating an API for the memory context code to
3247 	 * tell us how much is actually used in sortcontext?
3248 	 */
3249 	if (state->tapeset)
3250 	{
3251 		*spaceType = "Disk";
3252 		*spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024);
3253 	}
3254 	else
3255 	{
3256 		*spaceType = "Memory";
3257 		*spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
3258 	}
3259 
3260 	switch (state->status)
3261 	{
3262 		case TSS_SORTEDINMEM:
3263 			if (state->boundUsed)
3264 				*sortMethod = "top-N heapsort";
3265 			else
3266 				*sortMethod = "quicksort";
3267 			break;
3268 		case TSS_SORTEDONTAPE:
3269 			*sortMethod = "external sort";
3270 			break;
3271 		case TSS_FINALMERGE:
3272 			*sortMethod = "external merge";
3273 			break;
3274 		default:
3275 			*sortMethod = "still in progress";
3276 			break;
3277 	}
3278 }
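
/*
 * Worked example of the spaceUsed arithmetic above (illustrative only):
 * with the default BLCKSZ of 8192, each logical tape block counts for
 * BLCKSZ / 1024 = 8kB, so a tape set of 1250 blocks is reported as
 * 10000kB of "Disk"; for in-memory sorts, allowedMem - availMem is
 * rounded up to whole kilobytes.  These strings and numbers are what the
 * EXPLAIN ANALYZE code ultimately presents, along the lines of
 * "Sort Method: quicksort  Memory: ...kB" (the exact formatting is the
 * caller's business, not this module's).
 */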
3279 
3280 
3281 /*
3282  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
3283  *
3284  * Compare two SortTuples.  If checkIndex is true, use the tuple index
3285  * as the front of the sort key; otherwise, compare on the key alone.
3286  *
3287  * Note that for checkIndex callers, the heap invariant is never
3288  * maintained beyond the first run, and so there are no COMPARETUP()
3289  * calls needed to distinguish tuples in HEAP_RUN_NEXT.
3290  */
3291 
3292 #define HEAPCOMPARE(tup1,tup2) \
3293 	(checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \
3294 					(tup1)->tupindex == HEAP_RUN_NEXT) ? \
3295 	 ((tup1)->tupindex) - ((tup2)->tupindex) : \
3296 	 COMPARETUP(state, tup1, tup2))
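
/*
 * Illustrative reading of HEAPCOMPARE() for checkIndex callers: whenever a
 * HEAP_RUN_NEXT tuple is involved, or the run numbers simply differ, the
 * integer difference of the tupindex fields decides the order on its own,
 * so first-run tuples sort ahead of HEAP_RUN_NEXT tuples and two
 * HEAP_RUN_NEXT tuples compare as equal, all without calling COMPARETUP().
 * Only tuples that both belong to the first run are compared on their keys.
 */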
3297 
3298 /*
3299  * Convert the existing unordered array of SortTuples to a bounded heap,
3300  * discarding all but the smallest "state->bound" tuples.
3301  *
3302  * When working with a bounded heap, we want to keep the largest entry
3303  * at the root (array entry zero), instead of the smallest as in the normal
3304  * sort case.  This allows us to discard the largest entry cheaply.
3305  * Therefore, we temporarily reverse the sort direction.
3306  *
3307  * We assume that all entries in a bounded heap will always have tupindex
3308  * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse
3309  * the direction of comparison for tupindexes.
3310  */
3311 static void
3312 make_bounded_heap(Tuplesortstate *state)
3313 {
3314 	int			tupcount = state->memtupcount;
3315 	int			i;
3316 
3317 	Assert(state->status == TSS_INITIAL);
3318 	Assert(state->bounded);
3319 	Assert(tupcount >= state->bound);
3320 
3321 	/* Reverse sort direction so largest entry will be at root */
3322 	reversedirection(state);
3323 
3324 	state->memtupcount = 0;		/* make the heap empty */
3325 	for (i = 0; i < tupcount; i++)
3326 	{
3327 		if (state->memtupcount < state->bound)
3328 		{
3329 			/* Insert next tuple into heap */
3330 			/* Must copy source tuple to avoid possible overwrite */
3331 			SortTuple	stup = state->memtuples[i];
3332 
3333 			stup.tupindex = 0;	/* not used */
3334 			tuplesort_heap_insert(state, &stup, false);
3335 		}
3336 		else
3337 		{
3338 			/*
3339 			 * The heap is full.  Replace the largest entry with the new
3340 			 * tuple, or just discard it, if it's larger than anything already
3341 			 * in the heap.
3342 			 */
3343 			if (COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
3344 			{
3345 				free_sort_tuple(state, &state->memtuples[i]);
3346 				CHECK_FOR_INTERRUPTS();
3347 			}
3348 			else
3349 				tuplesort_heap_replace_top(state, &state->memtuples[i], false);
3350 		}
3351 	}
3352 
3353 	Assert(state->memtupcount == state->bound);
3354 	state->status = TSS_BOUNDED;
3355 }
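
/*
 * Worked example of the bounded-heap logic above (illustrative only):
 * sorting ascending with bound = 3 and inputs 5, 9, 1, 7, 3, the first
 * three values are heapified under the reversed direction, leaving 9 at
 * the root.  Input 7 beats the current root (7 < 9 in the original
 * direction), so tuplesort_heap_replace_top() discards 9 and the root
 * becomes 7; input 3 likewise displaces 7, leaving {1, 3, 5} with 5 at
 * the root.  sort_bounded_heap() then unheapifies those survivors into
 * ascending order.
 */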
3356 
3357 /*
3358  * Convert the bounded heap to a properly-sorted array
3359  */
3360 static void
3361 sort_bounded_heap(Tuplesortstate *state)
3362 {
3363 	int			tupcount = state->memtupcount;
3364 
3365 	Assert(state->status == TSS_BOUNDED);
3366 	Assert(state->bounded);
3367 	Assert(tupcount == state->bound);
3368 
3369 	/*
3370 	 * We can unheapify in place because each delete-top call will remove the
3371 	 * largest entry, which we can promptly store in the newly freed slot at
3372 	 * the end.  Once we're down to a single-entry heap, we're done.
3373 	 */
3374 	while (state->memtupcount > 1)
3375 	{
3376 		SortTuple	stup = state->memtuples[0];
3377 
3378 		/* this sifts-up the next-largest entry and decreases memtupcount */
3379 		tuplesort_heap_delete_top(state, false);
3380 		state->memtuples[state->memtupcount] = stup;
3381 	}
3382 	state->memtupcount = tupcount;
3383 
3384 	/*
3385 	 * Reverse sort direction back to the original state.  This is not
3386 	 * actually necessary but seems like a good idea for tidiness.
3387 	 */
3388 	reversedirection(state);
3389 
3390 	state->status = TSS_SORTEDINMEM;
3391 	state->boundUsed = true;
3392 }
3393 
3394 /*
3395  * Sort all memtuples using specialized qsort() routines.
3396  *
3397  * Quicksort is used for small in-memory sorts.  Quicksort is also generally
3398  * preferred to replacement selection for generating runs during external sort
3399  * operations, although replacement selection is sometimes used for the first
3400  * run.
3401  */
3402 static void
3403 tuplesort_sort_memtuples(Tuplesortstate *state)
3404 {
3405 	if (state->memtupcount > 1)
3406 	{
3407 		/* Can we use the single-key sort function? */
3408 		if (state->onlyKey != NULL)
3409 			qsort_ssup(state->memtuples, state->memtupcount,
3410 					   state->onlyKey);
3411 		else
3412 			qsort_tuple(state->memtuples,
3413 						state->memtupcount,
3414 						state->comparetup,
3415 						state);
3416 	}
3417 }
3418 
3419 /*
3420  * Insert a new tuple into an empty or existing heap, maintaining the
3421  * heap invariant.  Caller is responsible for ensuring there's room.
3422  *
3423  * Note: For some callers, tuple points to a memtuples[] entry above the
3424  * end of the heap.  This is safe as long as it's not immediately adjacent
3425  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
3426  * is, it might get overwritten before being moved into the heap!
3427  */
3428 static void
3429 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
3430 					  bool checkIndex)
3431 {
3432 	SortTuple  *memtuples;
3433 	int			j;
3434 
3435 	memtuples = state->memtuples;
3436 	Assert(state->memtupcount < state->memtupsize);
3437 	Assert(!checkIndex || tuple->tupindex == RUN_FIRST);
3438 
3439 	CHECK_FOR_INTERRUPTS();
3440 
3441 	/*
3442 	 * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
3443 	 * using 1-based array indexes, not 0-based.
3444 	 */
3445 	j = state->memtupcount++;
3446 	while (j > 0)
3447 	{
3448 		int			i = (j - 1) >> 1;
3449 
3450 		if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0)
3451 			break;
3452 		memtuples[j] = memtuples[i];
3453 		j = i;
3454 	}
3455 	memtuples[j] = *tuple;
3456 }
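
/*
 * Index arithmetic used by the heap routines (0-based): the parent of slot
 * j is (j - 1) / 2, and the children of slot i are 2*i + 1 and 2*i + 2.
 * That is why the sift-up above uses (j - 1) >> 1 while
 * tuplesort_heap_replace_top() walks downward via 2 * i + 1; Knuth's
 * 1-based formulation would use j / 2 and 2*i, 2*i + 1 instead.
 */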
3457 
3458 /*
3459  * Remove the tuple at state->memtuples[0] from the heap.  Decrement
3460  * memtupcount, and sift up to maintain the heap invariant.
3461  *
3462  * The caller has already free'd the tuple the top node points to,
3463  * if necessary.
3464  */
3465 static void
3466 tuplesort_heap_delete_top(Tuplesortstate *state, bool checkIndex)
3467 {
3468 	SortTuple  *memtuples = state->memtuples;
3469 	SortTuple  *tuple;
3470 
3471 	Assert(!checkIndex || state->currentRun == RUN_FIRST);
3472 	if (--state->memtupcount <= 0)
3473 		return;
3474 
3475 	/*
3476 	 * Remove the last tuple in the heap, and re-insert it, by replacing the
3477 	 * current top node with it.
3478 	 */
3479 	tuple = &memtuples[state->memtupcount];
3480 	tuplesort_heap_replace_top(state, tuple, checkIndex);
3481 }
3482 
3483 /*
3484  * Replace the tuple at state->memtuples[0] with a new tuple.  Sift up to
3485  * maintain the heap invariant.
3486  *
3487  * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
3488  * Heapsort, steps H3-H8).
3489  */
3490 static void
3491 tuplesort_heap_replace_top(Tuplesortstate *state, SortTuple *tuple,
3492 						   bool checkIndex)
3493 {
3494 	SortTuple  *memtuples = state->memtuples;
3495 	unsigned int i,
3496 				n;
3497 
3498 	Assert(!checkIndex || state->currentRun == RUN_FIRST);
3499 	Assert(state->memtupcount >= 1);
3500 
3501 	CHECK_FOR_INTERRUPTS();
3502 
3503 	/*
3504 	 * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
3505 	 * This prevents overflow in the "2 * i + 1" calculation, since at the top
3506 	 * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
3507 	 */
3508 	n = state->memtupcount;
3509 	i = 0;						/* i is where the "hole" is */
3510 	for (;;)
3511 	{
3512 		unsigned int j = 2 * i + 1;
3513 
3514 		if (j >= n)
3515 			break;
3516 		if (j + 1 < n &&
3517 			HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0)
3518 			j++;
3519 		if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0)
3520 			break;
3521 		memtuples[i] = memtuples[j];
3522 		i = j;
3523 	}
3524 	memtuples[i] = *tuple;
3525 }
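
/*
 * Illustrative trace of the loop above (not compiled): with a min-heap
 * memtuples holding keys [3, 5, 4, 9, 8, 7] and a replacement key of 10,
 * the hole starts at slot 0; the smaller child 4 (slot 2) moves up, then
 * its child 7 (slot 5) moves up, and 10 lands in slot 5, giving
 * [4, 5, 7, 9, 8, 10], which again satisfies the heap invariant.
 */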
3526 
3527 /*
3528  * Function to reverse the sort direction from its current state
3529  *
3530  * It is not safe to call this when performing hash tuplesorts
3531  */
3532 static void
3533 reversedirection(Tuplesortstate *state)
3534 {
3535 	SortSupport sortKey = state->sortKeys;
3536 	int			nkey;
3537 
3538 	for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
3539 	{
3540 		sortKey->ssup_reverse = !sortKey->ssup_reverse;
3541 		sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
3542 	}
3543 }
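
/*
 * For example, a key declared ASC NULLS LAST compares as if it were
 * DESC NULLS FIRST after one call here; a second call restores the
 * original behavior, which is how sort_bounded_heap() undoes the flip
 * performed by make_bounded_heap().
 */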
3544 
3545 
3546 /*
3547  * Tape interface routines
3548  */
3549 
3550 static unsigned int
3551 getlen(Tuplesortstate *state, int tapenum, bool eofOK)
3552 {
3553 	unsigned int len;
3554 
3555 	if (LogicalTapeRead(state->tapeset, tapenum,
3556 						&len, sizeof(len)) != sizeof(len))
3557 		elog(ERROR, "unexpected end of tape");
3558 	if (len == 0 && !eofOK)
3559 		elog(ERROR, "unexpected end of data");
3560 	return len;
3561 }
3562 
3563 static void
3564 markrunend(Tuplesortstate *state, int tapenum)
3565 {
3566 	unsigned int len = 0;
3567 
3568 	LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
3569 }
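
/*
 * Illustrative sketch of the resulting on-tape run layout (assuming
 * randomAccess; without it the trailing length words are omitted):
 *
 *	  [len][tuple body][len] [len][tuple body][len] ... [0]
 *
 * Each leading length word written by the writetup routines counts itself
 * plus the body, the optional trailing copy lets backward scans step over
 * the tuple, and the final zero word written here by markrunend()
 * terminates the run, which getlen() reports as EOF when eofOK is true.
 */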
3570 
3571 /*
3572  * Get memory for tuple from within READTUP() routine.
3573  *
3574  * We use next free slot from the slab allocator, or palloc() if the tuple
3575  * is too large for that.
3576  */
3577 static void *
3578 readtup_alloc(Tuplesortstate *state, Size tuplen)
3579 {
3580 	SlabSlot   *buf;
3581 
3582 	/*
3583 	 * We pre-allocate enough slots in the slab arena that we should never run
3584 	 * out.
3585 	 */
3586 	Assert(state->slabFreeHead);
3587 
3588 	if (tuplen > SLAB_SLOT_SIZE || !state->slabFreeHead)
3589 		return MemoryContextAlloc(state->sortcontext, tuplen);
3590 	else
3591 	{
3592 		buf = state->slabFreeHead;
3593 		/* Reuse this slot */
3594 		state->slabFreeHead = buf->nextfree;
3595 
3596 		return buf;
3597 	}
3598 }
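
/*
 * The Assert above documents the expectation that the slab arena never
 * runs dry during merging; the explicit !state->slabFreeHead test is kept
 * anyway so that a production (non-assert) build simply falls back to an
 * ordinary allocation in sortcontext instead of dereferencing a NULL
 * free-list head.
 */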
3599 
3600 
3601 /*
3602  * Routines specialized for HeapTuple (actually MinimalTuple) case
3603  */
3604 
3605 static int
3606 comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
3607 {
3608 	SortSupport sortKey = state->sortKeys;
3609 	HeapTupleData ltup;
3610 	HeapTupleData rtup;
3611 	TupleDesc	tupDesc;
3612 	int			nkey;
3613 	int32		compare;
3614 	AttrNumber	attno;
3615 	Datum		datum1,
3616 				datum2;
3617 	bool		isnull1,
3618 				isnull2;
3619 
3620 
3621 	/* Compare the leading sort key */
3622 	compare = ApplySortComparator(a->datum1, a->isnull1,
3623 								  b->datum1, b->isnull1,
3624 								  sortKey);
3625 	if (compare != 0)
3626 		return compare;
3627 
3628 	/* Compare additional sort keys */
3629 	ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3630 	ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
3631 	rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3632 	rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
3633 	tupDesc = state->tupDesc;
3634 
3635 	if (sortKey->abbrev_converter)
3636 	{
3637 		attno = sortKey->ssup_attno;
3638 
3639 		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3640 		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3641 
3642 		compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3643 												datum2, isnull2,
3644 												sortKey);
3645 		if (compare != 0)
3646 			return compare;
3647 	}
3648 
3649 	sortKey++;
3650 	for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
3651 	{
3652 		attno = sortKey->ssup_attno;
3653 
3654 		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3655 		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3656 
3657 		compare = ApplySortComparator(datum1, isnull1,
3658 									  datum2, isnull2,
3659 									  sortKey);
3660 		if (compare != 0)
3661 			return compare;
3662 	}
3663 
3664 	return 0;
3665 }
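
/*
 * Note on the abbreviated-key handling above (shared by the other
 * comparetup_* routines): datum1 may hold an abbreviated key, so when the
 * abbreviated values compare equal and an abbrev_converter is in use, the
 * authoritative first-column values must be re-fetched from the tuples and
 * compared with ApplySortAbbrevFullComparator() before falling through to
 * the remaining sort keys.
 */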
3666 
3667 static void
3668 copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
3669 {
3670 	/*
3671 	 * We expect the passed "tup" to be a TupleTableSlot, and form a
3672 	 * MinimalTuple using the exported interface for that.
3673 	 */
3674 	TupleTableSlot *slot = (TupleTableSlot *) tup;
3675 	Datum		original;
3676 	MinimalTuple tuple;
3677 	HeapTupleData htup;
3678 	MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3679 
3680 	/* copy the tuple into sort storage */
3681 	tuple = ExecCopySlotMinimalTuple(slot);
3682 	stup->tuple = (void *) tuple;
3683 	USEMEM(state, GetMemoryChunkSpace(tuple));
3684 	/* set up first-column key value */
3685 	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3686 	htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3687 	original = heap_getattr(&htup,
3688 							state->sortKeys[0].ssup_attno,
3689 							state->tupDesc,
3690 							&stup->isnull1);
3691 
3692 	MemoryContextSwitchTo(oldcontext);
3693 
3694 	if (!state->sortKeys->abbrev_converter || stup->isnull1)
3695 	{
3696 		/*
3697 		 * Store ordinary Datum representation, or NULL value.  If there is a
3698 		 * converter it won't expect NULL values, and cost model is not
3699 		 * required to account for NULL, so in that case we avoid calling
3700 		 * converter and just set datum1 to zeroed representation (to be
3701 		 * consistent, and to support cheap inequality tests for NULL
3702 		 * abbreviated keys).
3703 		 */
3704 		stup->datum1 = original;
3705 	}
3706 	else if (!consider_abort_common(state))
3707 	{
3708 		/* Store abbreviated key representation */
3709 		stup->datum1 = state->sortKeys->abbrev_converter(original,
3710 														 state->sortKeys);
3711 	}
3712 	else
3713 	{
3714 		/* Abort abbreviation */
3715 		int			i;
3716 
3717 		stup->datum1 = original;
3718 
3719 		/*
3720 		 * Set state to be consistent with never trying abbreviation.
3721 		 *
3722 		 * Alter datum1 representation in already-copied tuples, so as to
3723 		 * ensure a consistent representation (current tuple was just
3724 		 * handled).  It does not matter if some dumped tuples are already
3725 		 * sorted on tape, since serialized tuples lack abbreviated keys
3726 		 * (TSS_BUILDRUNS state prevents control reaching here in any case).
3727 		 */
3728 		for (i = 0; i < state->memtupcount; i++)
3729 		{
3730 			SortTuple  *mtup = &state->memtuples[i];
3731 
3732 			htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
3733 				MINIMAL_TUPLE_OFFSET;
3734 			htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
3735 											 MINIMAL_TUPLE_OFFSET);
3736 
3737 			mtup->datum1 = heap_getattr(&htup,
3738 										state->sortKeys[0].ssup_attno,
3739 										state->tupDesc,
3740 										&mtup->isnull1);
3741 		}
3742 	}
3743 }
3744 
3745 static void
3746 writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
3747 {
3748 	MinimalTuple tuple = (MinimalTuple) stup->tuple;
3749 
3750 	/* the part of the MinimalTuple we'll write: */
3751 	char	   *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3752 	unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
3753 
3754 	/* total on-disk footprint: */
3755 	unsigned int tuplen = tupbodylen + sizeof(int);
3756 
3757 	LogicalTapeWrite(state->tapeset, tapenum,
3758 					 (void *) &tuplen, sizeof(tuplen));
3759 	LogicalTapeWrite(state->tapeset, tapenum,
3760 					 (void *) tupbody, tupbodylen);
3761 	if (state->randomAccess)	/* need trailing length word? */
3762 		LogicalTapeWrite(state->tapeset, tapenum,
3763 						 (void *) &tuplen, sizeof(tuplen));
3764 
3765 	if (!state->slabAllocatorUsed)
3766 	{
3767 		FREEMEM(state, GetMemoryChunkSpace(tuple));
3768 		heap_free_minimal_tuple(tuple);
3769 	}
3770 }
3771 
3772 static void
3773 readtup_heap(Tuplesortstate *state, SortTuple *stup,
3774 			 int tapenum, unsigned int len)
3775 {
3776 	unsigned int tupbodylen = len - sizeof(int);
3777 	unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
3778 	MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tuplen);
3779 	char	   *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
3780 	HeapTupleData htup;
3781 
3782 	/* read in the tuple proper */
3783 	tuple->t_len = tuplen;
3784 	LogicalTapeReadExact(state->tapeset, tapenum,
3785 						 tupbody, tupbodylen);
3786 	if (state->randomAccess)	/* need trailing length word? */
3787 		LogicalTapeReadExact(state->tapeset, tapenum,
3788 							 &tuplen, sizeof(tuplen));
3789 	stup->tuple = (void *) tuple;
3790 	/* set up first-column key value */
3791 	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
3792 	htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
3793 	stup->datum1 = heap_getattr(&htup,
3794 								state->sortKeys[0].ssup_attno,
3795 								state->tupDesc,
3796 								&stup->isnull1);
3797 }
3798 
3799 /*
3800  * Routines specialized for the CLUSTER case (HeapTuple data, with
3801  * comparisons per a btree index definition)
3802  */
3803 
3804 static int
3805 comparetup_cluster(const SortTuple *a, const SortTuple *b,
3806 				   Tuplesortstate *state)
3807 {
3808 	SortSupport sortKey = state->sortKeys;
3809 	HeapTuple	ltup;
3810 	HeapTuple	rtup;
3811 	TupleDesc	tupDesc;
3812 	int			nkey;
3813 	int32		compare;
3814 	Datum		datum1,
3815 				datum2;
3816 	bool		isnull1,
3817 				isnull2;
3818 	AttrNumber	leading = state->indexInfo->ii_KeyAttrNumbers[0];
3819 
3820 	/* Be prepared to compare additional sort keys */
3821 	ltup = (HeapTuple) a->tuple;
3822 	rtup = (HeapTuple) b->tuple;
3823 	tupDesc = state->tupDesc;
3824 
3825 	/* Compare the leading sort key, if it's simple */
3826 	if (leading != 0)
3827 	{
3828 		compare = ApplySortComparator(a->datum1, a->isnull1,
3829 									  b->datum1, b->isnull1,
3830 									  sortKey);
3831 		if (compare != 0)
3832 			return compare;
3833 
3834 		if (sortKey->abbrev_converter)
3835 		{
3836 			datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
3837 			datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
3838 
3839 			compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3840 													datum2, isnull2,
3841 													sortKey);
3842 		}
3843 		if (compare != 0 || state->nKeys == 1)
3844 			return compare;
3845 		/* Compare additional columns the hard way */
3846 		sortKey++;
3847 		nkey = 1;
3848 	}
3849 	else
3850 	{
3851 		/* Must compare all keys the hard way */
3852 		nkey = 0;
3853 	}
3854 
3855 	if (state->indexInfo->ii_Expressions == NULL)
3856 	{
3857 		/* If not expression index, just compare the proper heap attrs */
3858 
3859 		for (; nkey < state->nKeys; nkey++, sortKey++)
3860 		{
3861 			AttrNumber	attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
3862 
3863 			datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
3864 			datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
3865 
3866 			compare = ApplySortComparator(datum1, isnull1,
3867 										  datum2, isnull2,
3868 										  sortKey);
3869 			if (compare != 0)
3870 				return compare;
3871 		}
3872 	}
3873 	else
3874 	{
3875 		/*
3876 		 * In the expression index case, compute the whole index tuple and
3877 		 * then compare values.  It would perhaps be faster to compute only as
3878 		 * many columns as we need to compare, but that would require
3879 		 * duplicating all the logic in FormIndexDatum.
3880 		 */
3881 		Datum		l_index_values[INDEX_MAX_KEYS];
3882 		bool		l_index_isnull[INDEX_MAX_KEYS];
3883 		Datum		r_index_values[INDEX_MAX_KEYS];
3884 		bool		r_index_isnull[INDEX_MAX_KEYS];
3885 		TupleTableSlot *ecxt_scantuple;
3886 
3887 		/* Reset context each time to prevent memory leakage */
3888 		ResetPerTupleExprContext(state->estate);
3889 
3890 		ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
3891 
3892 		ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false);
3893 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3894 					   l_index_values, l_index_isnull);
3895 
3896 		ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false);
3897 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
3898 					   r_index_values, r_index_isnull);
3899 
3900 		for (; nkey < state->nKeys; nkey++, sortKey++)
3901 		{
3902 			compare = ApplySortComparator(l_index_values[nkey],
3903 										  l_index_isnull[nkey],
3904 										  r_index_values[nkey],
3905 										  r_index_isnull[nkey],
3906 										  sortKey);
3907 			if (compare != 0)
3908 				return compare;
3909 		}
3910 	}
3911 
3912 	return 0;
3913 }
3914 
3915 static void
3916 copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
3917 {
3918 	HeapTuple	tuple = (HeapTuple) tup;
3919 	Datum		original;
3920 	MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
3921 
3922 	/* copy the tuple into sort storage */
3923 	tuple = heap_copytuple(tuple);
3924 	stup->tuple = (void *) tuple;
3925 	USEMEM(state, GetMemoryChunkSpace(tuple));
3926 
3927 	MemoryContextSwitchTo(oldcontext);
3928 
3929 	/*
3930 	 * set up first-column key value, and potentially abbreviate, if it's a
3931 	 * simple column
3932 	 */
3933 	if (state->indexInfo->ii_KeyAttrNumbers[0] == 0)
3934 		return;
3935 
3936 	original = heap_getattr(tuple,
3937 							state->indexInfo->ii_KeyAttrNumbers[0],
3938 							state->tupDesc,
3939 							&stup->isnull1);
3940 
3941 	if (!state->sortKeys->abbrev_converter || stup->isnull1)
3942 	{
3943 		/*
3944 		 * Store ordinary Datum representation, or NULL value.  If there is a
3945 		 * converter it won't expect NULL values, and cost model is not
3946 		 * required to account for NULL, so in that case we avoid calling
3947 		 * converter and just set datum1 to zeroed representation (to be
3948 		 * consistent, and to support cheap inequality tests for NULL
3949 		 * abbreviated keys).
3950 		 */
3951 		stup->datum1 = original;
3952 	}
3953 	else if (!consider_abort_common(state))
3954 	{
3955 		/* Store abbreviated key representation */
3956 		stup->datum1 = state->sortKeys->abbrev_converter(original,
3957 														 state->sortKeys);
3958 	}
3959 	else
3960 	{
3961 		/* Abort abbreviation */
3962 		int			i;
3963 
3964 		stup->datum1 = original;
3965 
3966 		/*
3967 		 * Set state to be consistent with never trying abbreviation.
3968 		 *
3969 		 * Alter datum1 representation in already-copied tuples, so as to
3970 		 * ensure a consistent representation (current tuple was just
3971 		 * handled).  It does not matter if some dumped tuples are already
3972 		 * sorted on tape, since serialized tuples lack abbreviated keys
3973 		 * (TSS_BUILDRUNS state prevents control reaching here in any case).
3974 		 */
3975 		for (i = 0; i < state->memtupcount; i++)
3976 		{
3977 			SortTuple  *mtup = &state->memtuples[i];
3978 
3979 			tuple = (HeapTuple) mtup->tuple;
3980 			mtup->datum1 = heap_getattr(tuple,
3981 										state->indexInfo->ii_KeyAttrNumbers[0],
3982 										state->tupDesc,
3983 										&mtup->isnull1);
3984 		}
3985 	}
3986 }
3987 
3988 static void
3989 writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
3990 {
3991 	HeapTuple	tuple = (HeapTuple) stup->tuple;
3992 	unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
3993 
3994 	/* We need to store t_self, but not other fields of HeapTupleData */
3995 	LogicalTapeWrite(state->tapeset, tapenum,
3996 					 &tuplen, sizeof(tuplen));
3997 	LogicalTapeWrite(state->tapeset, tapenum,
3998 					 &tuple->t_self, sizeof(ItemPointerData));
3999 	LogicalTapeWrite(state->tapeset, tapenum,
4000 					 tuple->t_data, tuple->t_len);
4001 	if (state->randomAccess)	/* need trailing length word? */
4002 		LogicalTapeWrite(state->tapeset, tapenum,
4003 						 &tuplen, sizeof(tuplen));
4004 
4005 	if (!state->slabAllocatorUsed)
4006 	{
4007 		FREEMEM(state, GetMemoryChunkSpace(tuple));
4008 		heap_freetuple(tuple);
4009 	}
4010 }
4011 
4012 static void
4013 readtup_cluster(Tuplesortstate *state, SortTuple *stup,
4014 				int tapenum, unsigned int tuplen)
4015 {
4016 	unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
4017 	HeapTuple	tuple = (HeapTuple) readtup_alloc(state,
4018 												  t_len + HEAPTUPLESIZE);
4019 
4020 	/* Reconstruct the HeapTupleData header */
4021 	tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
4022 	tuple->t_len = t_len;
4023 	LogicalTapeReadExact(state->tapeset, tapenum,
4024 						 &tuple->t_self, sizeof(ItemPointerData));
4025 	/* We don't currently bother to reconstruct t_tableOid */
4026 	tuple->t_tableOid = InvalidOid;
4027 	/* Read in the tuple body */
4028 	LogicalTapeReadExact(state->tapeset, tapenum,
4029 						 tuple->t_data, tuple->t_len);
4030 	if (state->randomAccess)	/* need trailing length word? */
4031 		LogicalTapeReadExact(state->tapeset, tapenum,
4032 							 &tuplen, sizeof(tuplen));
4033 	stup->tuple = (void *) tuple;
4034 	/* set up first-column key value, if it's a simple column */
4035 	if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
4036 		stup->datum1 = heap_getattr(tuple,
4037 									state->indexInfo->ii_KeyAttrNumbers[0],
4038 									state->tupDesc,
4039 									&stup->isnull1);
4040 }
4041 
4042 /*
4043  * Routines specialized for IndexTuple case
4044  *
4045  * The btree and hash cases require separate comparison functions, but the
4046  * IndexTuple representation is the same so the copy/write/read support
4047  * functions can be shared.
4048  */
4049 
4050 static int
4051 comparetup_index_btree(const SortTuple *a, const SortTuple *b,
4052 					   Tuplesortstate *state)
4053 {
4054 	/*
4055 	 * This is similar to comparetup_heap(), but expects index tuples.  There
4056 	 * is also special handling for enforcing uniqueness, and special
4057 	 * treatment for equal keys at the end.
4058 	 */
4059 	SortSupport sortKey = state->sortKeys;
4060 	IndexTuple	tuple1;
4061 	IndexTuple	tuple2;
4062 	int			keysz;
4063 	TupleDesc	tupDes;
4064 	bool		equal_hasnull = false;
4065 	int			nkey;
4066 	int32		compare;
4067 	Datum		datum1,
4068 				datum2;
4069 	bool		isnull1,
4070 				isnull2;
4071 
4072 
4073 	/* Compare the leading sort key */
4074 	compare = ApplySortComparator(a->datum1, a->isnull1,
4075 								  b->datum1, b->isnull1,
4076 								  sortKey);
4077 	if (compare != 0)
4078 		return compare;
4079 
4080 	/* Compare additional sort keys */
4081 	tuple1 = (IndexTuple) a->tuple;
4082 	tuple2 = (IndexTuple) b->tuple;
4083 	keysz = state->nKeys;
4084 	tupDes = RelationGetDescr(state->indexRel);
4085 
4086 	if (sortKey->abbrev_converter)
4087 	{
4088 		datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
4089 		datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);
4090 
4091 		compare = ApplySortAbbrevFullComparator(datum1, isnull1,
4092 												datum2, isnull2,
4093 												sortKey);
4094 		if (compare != 0)
4095 			return compare;
4096 	}
4097 
4098 	/* they are equal, so we only need to examine one null flag */
4099 	if (a->isnull1)
4100 		equal_hasnull = true;
4101 
4102 	sortKey++;
4103 	for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
4104 	{
4105 		datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
4106 		datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);
4107 
4108 		compare = ApplySortComparator(datum1, isnull1,
4109 									  datum2, isnull2,
4110 									  sortKey);
4111 		if (compare != 0)
4112 			return compare;		/* done when we find unequal attributes */
4113 
4114 		/* they are equal, so we only need to examine one null flag */
4115 		if (isnull1)
4116 			equal_hasnull = true;
4117 	}
4118 
4119 	/*
4120 	 * If btree has asked us to enforce uniqueness, complain if two equal
4121 	 * tuples are detected (unless there was at least one NULL field).
4122 	 *
4123 	 * It is sufficient to make the test here, because if two tuples are equal
4124 	 * they *must* get compared at some stage of the sort --- otherwise the
4125 	 * sort algorithm wouldn't have checked whether one must appear before the
4126 	 * other.
4127 	 */
4128 	if (state->enforceUnique && !equal_hasnull)
4129 	{
4130 		Datum		values[INDEX_MAX_KEYS];
4131 		bool		isnull[INDEX_MAX_KEYS];
4132 		char	   *key_desc;
4133 
4134 		/*
4135 		 * Some rather brain-dead implementations of qsort (such as the one in
4136 		 * QNX 4) will sometimes call the comparison routine to compare a
4137 		 * value to itself, but we always use our own implementation, which
4138 		 * does not.
4139 		 */
4140 		Assert(tuple1 != tuple2);
4141 
4142 		index_deform_tuple(tuple1, tupDes, values, isnull);
4143 
4144 		key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);
4145 
4146 		ereport(ERROR,
4147 				(errcode(ERRCODE_UNIQUE_VIOLATION),
4148 				 errmsg("could not create unique index \"%s\"",
4149 						RelationGetRelationName(state->indexRel)),
4150 				 key_desc ? errdetail("Key %s is duplicated.", key_desc) :
4151 				 errdetail("Duplicate keys exist."),
4152 				 errtableconstraint(state->heapRel,
4153 									RelationGetRelationName(state->indexRel))));
4154 	}
4155 
4156 	/*
4157 	 * If key values are equal, we sort on ItemPointer.  This does not affect
4158 	 * validity of the finished index, but it may be useful to have index
4159 	 * scans in physical order.
4160 	 */
4161 	{
4162 		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4163 		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4164 
4165 		if (blk1 != blk2)
4166 			return (blk1 < blk2) ? -1 : 1;
4167 	}
4168 	{
4169 		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4170 		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4171 
4172 		if (pos1 != pos2)
4173 			return (pos1 < pos2) ? -1 : 1;
4174 	}
4175 
4176 	/* ItemPointer values should never be equal */
4177 	Assert(false);
4178 
4179 	return 0;
4180 }
4181 
4182 static int
4183 comparetup_index_hash(const SortTuple *a, const SortTuple *b,
4184 					  Tuplesortstate *state)
4185 {
4186 	Bucket		bucket1;
4187 	Bucket		bucket2;
4188 	IndexTuple	tuple1;
4189 	IndexTuple	tuple2;
4190 
4191 	/*
4192 	 * Fetch hash keys and mask off bits we don't want to sort by. We know
4193 	 * that the first column of the index tuple is the hash key.
4194 	 */
4195 	Assert(!a->isnull1);
4196 	bucket1 = _hash_hashkey2bucket(DatumGetUInt32(a->datum1),
4197 								   state->max_buckets, state->high_mask,
4198 								   state->low_mask);
4199 	Assert(!b->isnull1);
4200 	bucket2 = _hash_hashkey2bucket(DatumGetUInt32(b->datum1),
4201 								   state->max_buckets, state->high_mask,
4202 								   state->low_mask);
4203 	if (bucket1 > bucket2)
4204 		return 1;
4205 	else if (bucket1 < bucket2)
4206 		return -1;
4207 
4208 	/*
4209 	 * If hash values are equal, we sort on ItemPointer.  This does not affect
4210 	 * validity of the finished index, but it may be useful to have index
4211 	 * scans in physical order.
4212 	 */
4213 	tuple1 = (IndexTuple) a->tuple;
4214 	tuple2 = (IndexTuple) b->tuple;
4215 
4216 	{
4217 		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
4218 		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);
4219 
4220 		if (blk1 != blk2)
4221 			return (blk1 < blk2) ? -1 : 1;
4222 	}
4223 	{
4224 		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
4225 		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);
4226 
4227 		if (pos1 != pos2)
4228 			return (pos1 < pos2) ? -1 : 1;
4229 	}
4230 
4231 	/* ItemPointer values should never be equal */
4232 	Assert(false);
4233 
4234 	return 0;
4235 }
4236 
4237 static void
4238 copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
4239 {
4240 	IndexTuple	tuple = (IndexTuple) tup;
4241 	unsigned int tuplen = IndexTupleSize(tuple);
4242 	IndexTuple	newtuple;
4243 	Datum		original;
4244 
4245 	/* copy the tuple into sort storage */
4246 	newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen);
4247 	memcpy(newtuple, tuple, tuplen);
4248 	USEMEM(state, GetMemoryChunkSpace(newtuple));
4249 	stup->tuple = (void *) newtuple;
4250 	/* set up first-column key value */
4251 	original = index_getattr(newtuple,
4252 							 1,
4253 							 RelationGetDescr(state->indexRel),
4254 							 &stup->isnull1);
4255 
4256 	if (!state->sortKeys->abbrev_converter || stup->isnull1)
4257 	{
4258 		/*
4259 		 * Store ordinary Datum representation, or NULL value.  If there is a
4260 		 * converter it won't expect NULL values, and cost model is not
4261 		 * required to account for NULL, so in that case we avoid calling
4262 		 * converter and just set datum1 to zeroed representation (to be
4263 		 * consistent, and to support cheap inequality tests for NULL
4264 		 * abbreviated keys).
4265 		 */
4266 		stup->datum1 = original;
4267 	}
4268 	else if (!consider_abort_common(state))
4269 	{
4270 		/* Store abbreviated key representation */
4271 		stup->datum1 = state->sortKeys->abbrev_converter(original,
4272 														 state->sortKeys);
4273 	}
4274 	else
4275 	{
4276 		/* Abort abbreviation */
4277 		int			i;
4278 
4279 		stup->datum1 = original;
4280 
4281 		/*
4282 		 * Set state to be consistent with never trying abbreviation.
4283 		 *
4284 		 * Alter datum1 representation in already-copied tuples, so as to
4285 		 * ensure a consistent representation (current tuple was just
4286 		 * handled).  It does not matter if some dumped tuples are already
4287 		 * sorted on tape, since serialized tuples lack abbreviated keys
4288 		 * (TSS_BUILDRUNS state prevents control reaching here in any case).
4289 		 */
4290 		for (i = 0; i < state->memtupcount; i++)
4291 		{
4292 			SortTuple  *mtup = &state->memtuples[i];
4293 
4294 			tuple = (IndexTuple) mtup->tuple;
4295 			mtup->datum1 = index_getattr(tuple,
4296 										 1,
4297 										 RelationGetDescr(state->indexRel),
4298 										 &mtup->isnull1);
4299 		}
4300 	}
4301 }
4302 
4303 static void
4304 writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
4305 {
4306 	IndexTuple	tuple = (IndexTuple) stup->tuple;
4307 	unsigned int tuplen;
4308 
4309 	tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
4310 	LogicalTapeWrite(state->tapeset, tapenum,
4311 					 (void *) &tuplen, sizeof(tuplen));
4312 	LogicalTapeWrite(state->tapeset, tapenum,
4313 					 (void *) tuple, IndexTupleSize(tuple));
4314 	if (state->randomAccess)	/* need trailing length word? */
4315 		LogicalTapeWrite(state->tapeset, tapenum,
4316 						 (void *) &tuplen, sizeof(tuplen));
4317 
4318 	if (!state->slabAllocatorUsed)
4319 	{
4320 		FREEMEM(state, GetMemoryChunkSpace(tuple));
4321 		pfree(tuple);
4322 	}
4323 }
4324 
4325 static void
4326 readtup_index(Tuplesortstate *state, SortTuple *stup,
4327 			  int tapenum, unsigned int len)
4328 {
4329 	unsigned int tuplen = len - sizeof(unsigned int);
4330 	IndexTuple	tuple = (IndexTuple) readtup_alloc(state, tuplen);
4331 
4332 	LogicalTapeReadExact(state->tapeset, tapenum,
4333 						 tuple, tuplen);
4334 	if (state->randomAccess)	/* need trailing length word? */
4335 		LogicalTapeReadExact(state->tapeset, tapenum,
4336 							 &tuplen, sizeof(tuplen));
4337 	stup->tuple = (void *) tuple;
4338 	/* set up first-column key value */
4339 	stup->datum1 = index_getattr(tuple,
4340 								 1,
4341 								 RelationGetDescr(state->indexRel),
4342 								 &stup->isnull1);
4343 }
4344 
4345 /*
4346  * Routines specialized for DatumTuple case
4347  */
4348 
4349 static int
4350 comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
4351 {
4352 	int			compare;
4353 
4354 	compare = ApplySortComparator(a->datum1, a->isnull1,
4355 								  b->datum1, b->isnull1,
4356 								  state->sortKeys);
4357 	if (compare != 0)
4358 		return compare;
4359 
4360 	/* if we have abbreviations, then "tuple" has the original value */
4361 
4362 	if (state->sortKeys->abbrev_converter)
4363 		compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
4364 												PointerGetDatum(b->tuple), b->isnull1,
4365 												state->sortKeys);
4366 
4367 	return compare;
4368 }
4369 
4370 static void
4371 copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
4372 {
4373 	/* Not currently needed */
4374 	elog(ERROR, "copytup_datum() should not be called");
4375 }
4376 
4377 static void
4378 writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
4379 {
4380 	void	   *waddr;
4381 	unsigned int tuplen;
4382 	unsigned int writtenlen;
4383 
4384 	if (stup->isnull1)
4385 	{
4386 		waddr = NULL;
4387 		tuplen = 0;
4388 	}
4389 	else if (!state->tuples)
4390 	{
4391 		waddr = &stup->datum1;
4392 		tuplen = sizeof(Datum);
4393 	}
4394 	else
4395 	{
4396 		waddr = stup->tuple;
4397 		tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen);
4398 		Assert(tuplen != 0);
4399 	}
4400 
4401 	writtenlen = tuplen + sizeof(unsigned int);
4402 
4403 	LogicalTapeWrite(state->tapeset, tapenum,
4404 					 (void *) &writtenlen, sizeof(writtenlen));
4405 	LogicalTapeWrite(state->tapeset, tapenum,
4406 					 waddr, tuplen);
4407 	if (state->randomAccess)	/* need trailing length word? */
4408 		LogicalTapeWrite(state->tapeset, tapenum,
4409 						 (void *) &writtenlen, sizeof(writtenlen));
4410 
4411 	if (!state->slabAllocatorUsed && stup->tuple)
4412 	{
4413 		FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4414 		pfree(stup->tuple);
4415 	}
4416 }
4417 
4418 static void
4419 readtup_datum(Tuplesortstate *state, SortTuple *stup,
4420 			  int tapenum, unsigned int len)
4421 {
4422 	unsigned int tuplen = len - sizeof(unsigned int);
4423 
4424 	if (tuplen == 0)
4425 	{
4426 		/* it's NULL */
4427 		stup->datum1 = (Datum) 0;
4428 		stup->isnull1 = true;
4429 		stup->tuple = NULL;
4430 	}
4431 	else if (!state->tuples)
4432 	{
4433 		Assert(tuplen == sizeof(Datum));
4434 		LogicalTapeReadExact(state->tapeset, tapenum,
4435 							 &stup->datum1, tuplen);
4436 		stup->isnull1 = false;
4437 		stup->tuple = NULL;
4438 	}
4439 	else
4440 	{
4441 		void	   *raddr = readtup_alloc(state, tuplen);
4442 
4443 		LogicalTapeReadExact(state->tapeset, tapenum,
4444 							 raddr, tuplen);
4445 		stup->datum1 = PointerGetDatum(raddr);
4446 		stup->isnull1 = false;
4447 		stup->tuple = raddr;
4448 	}
4449 
4450 	if (state->randomAccess)	/* need trailing length word? */
4451 		LogicalTapeReadExact(state->tapeset, tapenum,
4452 							 &tuplen, sizeof(tuplen));
4453 }
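
/*
 * The writetup_datum()/readtup_datum() pair above encodes three cases
 * symmetrically: a zero-length payload means NULL; a sizeof(Datum)
 * payload with state->tuples false is a pass-by-value datum read straight
 * back into datum1; anything else is a pass-by-reference value whose bytes
 * are the datum itself, with stup->tuple pointing at the reconstructed
 * copy.
 */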
4454 
4455 /*
4456  * Convenience routine to free a tuple previously loaded into sort memory
4457  */
4458 static void
4459 free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
4460 {
4461 	if (stup->tuple)
4462 	{
4463 		FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
4464 		pfree(stup->tuple);
4465 		stup->tuple = NULL;
4466 	}
4467 }
4468