1 /*-------------------------------------------------------------------------
2  *
3  * tuplesort.c
4  *	  Generalized tuple sorting routines.
5  *
6  * This module handles sorting of heap tuples, index tuples, or single
7  * Datums (and could easily support other kinds of sortable objects,
8  * if necessary).  It works efficiently for both small and large amounts
9  * of data.  Small amounts are sorted in-memory using qsort().  Large
10  * amounts are sorted using temporary files and a standard external sort
11  * algorithm.
12  *
13  * See Knuth, volume 3, for more than you want to know about the external
14  * sorting algorithm.  Historically, we divided the input into sorted runs
15  * using replacement selection, in the form of a priority tree implemented
16  * as a heap (essentially his Algorithm 5.2.3H), but now we only do that
17  * for the first run, and only if the run would otherwise end up being very
18  * short.  We merge the runs using polyphase merge, Knuth's Algorithm
19  * 5.4.2D.  The logical "tapes" used by Algorithm D are implemented by
20  * logtape.c, which avoids space wastage by recycling disk space as soon
21  * as each block is read from its "tape".
22  *
23  * We do not use Knuth's recommended data structure (Algorithm 5.4.1R) for
24  * the replacement selection, because it uses a fixed number of records
25  * in memory at all times.  Since we are dealing with tuples that may vary
26  * considerably in size, we want to be able to vary the number of records
27  * kept in memory to ensure full utilization of the allowed sort memory
28  * space.  So, we keep the tuples in a variable-size heap, with the next
29  * record to go out at the top of the heap.  Like Algorithm 5.4.1R, each
30  * record is stored with the run number that it must go into, and we use
31  * (run number, key) as the ordering key for the heap.  When the run number
32  * at the top of the heap changes, we know that no more records of the prior
33  * run are left in the heap.  Note that there are in practice only ever two
34  * distinct run numbers, because, since PostgreSQL 9.6, we only use
35  * replacement selection to form the first run.
36  *
37  * In PostgreSQL 9.6, a heap (based on Knuth's Algorithm H, with some small
38  * customizations) is only used with the aim of producing just one run,
39  * thereby avoiding all merging.  Only the first run can use replacement
40  * selection, which is why there are now only two possible valid run
41  * numbers, and why heapification is customized to not distinguish between
42  * tuples in the second run (those will be quicksorted).  We generally
43  * prefer a simple hybrid sort-merge strategy, where runs are sorted in much
44  * the same way as the entire input of an internal sort is sorted (using
45  * qsort()).  The replacement_sort_tuples GUC controls the limited remaining
46  * use of replacement selection for the first run.
47  *
48  * There are several reasons to favor a hybrid sort-merge strategy.
49  * Maintaining a priority tree/heap has poor CPU cache characteristics.
50  * Furthermore, the growth in main memory sizes has greatly diminished the
51  * value of having runs that are larger than available memory, even in the
52  * case where there is partially sorted input and runs can be made far
53  * larger by using a heap.  In most cases, a single-pass merge step is all
54  * that is required even when runs are no larger than available memory.
55  * Avoiding multiple merge passes was traditionally considered to be the
56  * major advantage of using replacement selection.
57  *
58  * The approximate amount of memory allowed for any one sort operation
59  * is specified in kilobytes by the caller (most pass work_mem).  Initially,
60  * we absorb tuples and simply store them in an unsorted array as long as
61  * we haven't exceeded workMem.  If we reach the end of the input without
62  * exceeding workMem, we sort the array using qsort() and subsequently return
63  * tuples just by scanning the tuple array sequentially.  If we do exceed
64  * workMem, we begin to emit tuples into sorted runs in temporary tapes.
65  * When tuples are dumped in batch after quicksorting, we begin a new run
66  * with a new output tape (selected per Algorithm D).  After the end of the
67  * input is reached, we dump out remaining tuples in memory into a final run
68  * (or two, when replacement selection is still used), then merge the runs
69  * using Algorithm D.
70  *
71  * When merging runs, we use a heap containing just the frontmost tuple from
72  * each source run; we repeatedly output the smallest tuple and insert the
73  * next tuple from its source tape (if any).  When the heap empties, the merge
74  * is complete.  The basic merge algorithm thus needs very little memory ---
75  * only M tuples for an M-way merge, and M is constrained to a small number.
76  * However, we can still make good use of our full workMem allocation by
77  * pre-reading additional tuples from each source tape.  Without prereading,
78  * our access pattern to the temporary file would be very erratic; on average
79  * we'd read one block from each of M source tapes during the same time that
80  * we're writing M blocks to the output tape, so there is no sequentiality of
81  * access at all, defeating the read-ahead methods used by most Unix kernels.
82  * Worse, the output tape gets written into a very random sequence of blocks
83  * of the temp file, ensuring that things will be even worse when it comes
84  * time to read that tape.  A straightforward merge pass thus ends up doing a
85  * lot of waiting for disk seeks.  We can improve matters by prereading from
86  * each source tape sequentially, loading about workMem/M bytes from each tape
87  * in turn.  Then we run the merge algorithm, writing but not reading until
88  * one of the preloaded tuple series runs out.  Then we switch back to preread
89  * mode, fill memory again, and repeat.  This approach helps to localize both
90  * read and write accesses.
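 *
 * As a rough arithmetic illustration of the preread sizing just described
 * (example numbers only, nothing computed by this file): given workMem of
 * 64MB and a 16-way merge, each preread cycle loads on the order of 4MB
 * sequentially from one tape before moving on to the next, instead of
 * interleaving single-block reads across all 16 input tapes.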
91  *
92  * When the caller requests random access to the sort result, we form
93  * the final sorted run on a logical tape which is then "frozen", so
94  * that we can access it randomly.  When the caller does not need random
95  * access, we return from tuplesort_performsort() as soon as we are down
96  * to one run per logical tape.  The final merge is then performed
97  * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
98  * saves one cycle of writing all the data out to disk and reading it in.
99  *
100  * Before Postgres 8.2, we always used a seven-tape polyphase merge, on the
101  * grounds that 7 is the "sweet spot" on the tapes-to-passes curve according
102  * to Knuth's figure 70 (section 5.4.2).  However, Knuth is assuming that
103  * tape drives are expensive beasts, and in particular that there will always
104  * be many more runs than tape drives.  In our implementation a "tape drive"
105  * doesn't cost much more than a few Kb of memory buffers, so we can afford
106  * to have lots of them.  In particular, if we can have as many tape drives
107  * as sorted runs, we can eliminate any repeated I/O at all.  In the current
108  * code we determine the number of tapes M on the basis of workMem: we want
109  * workMem/M to be large enough that we read a fair amount of data each time
110  * we preread from a tape, so as to maintain the locality of access described
111  * above.  Nonetheless, with large workMem we can have many tapes.
112  *
113  *
114  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
115  * Portions Copyright (c) 1994, Regents of the University of California
116  *
117  * IDENTIFICATION
118  *	  src/backend/utils/sort/tuplesort.c
119  *
120  *-------------------------------------------------------------------------
121  */
122 
123 #include "postgres.h"
124 
125 #include <limits.h>
126 
127 #include "access/htup_details.h"
128 #include "access/nbtree.h"
129 #include "catalog/index.h"
130 #include "catalog/pg_am.h"
131 #include "commands/tablespace.h"
132 #include "executor/executor.h"
133 #include "miscadmin.h"
134 #include "pg_trace.h"
135 #include "utils/datum.h"
136 #include "utils/logtape.h"
137 #include "utils/lsyscache.h"
138 #include "utils/memutils.h"
139 #include "utils/pg_rusage.h"
140 #include "utils/rel.h"
141 #include "utils/sortsupport.h"
142 #include "utils/tuplesort.h"
143 
144 
145 /* sort-type codes for sort__start probes */
146 #define HEAP_SORT		0
147 #define INDEX_SORT		1
148 #define DATUM_SORT		2
149 #define CLUSTER_SORT	3
150 
151 /* GUC variables */
152 #ifdef TRACE_SORT
153 bool		trace_sort = false;
154 #endif
155 
156 #ifdef DEBUG_BOUNDED_SORT
157 bool		optimize_bounded_sort = true;
158 #endif
159 
160 
161 /*
162  * The objects we actually sort are SortTuple structs.  These contain
163  * a pointer to the tuple proper (might be a MinimalTuple or IndexTuple),
164  * which is a separate palloc chunk --- we assume it is just one chunk and
165  * can be freed by a simple pfree() (except during final on-the-fly merge,
166  * when memory is used in batch).  SortTuples also contain the tuple's
167  * first key column in Datum/nullflag format, and an index integer.
168  *
169  * Storing the first key column lets us save heap_getattr or index_getattr
170  * calls during tuple comparisons.  We could extract and save all the key
171  * columns not just the first, but this would increase code complexity and
172  * overhead, and wouldn't actually save any comparison cycles in the common
173  * case where the first key determines the comparison result.  Note that
174  * for a pass-by-reference datatype, datum1 points into the "tuple" storage.
175  *
176  * There is one special case: when the sort support infrastructure provides an
177  * "abbreviated key" representation, where the key is (typically) a pass by
178  * value proxy for a pass by reference type.  In this case, the abbreviated key
179  * is stored in datum1 in place of the actual first key column.
180  *
181  * When sorting single Datums, the data value is represented directly by
182  * datum1/isnull1 for pass by value types (or null values).  If the datatype is
183  * pass-by-reference and isnull1 is false, then "tuple" points to a separately
184  * palloc'd data value, otherwise "tuple" is NULL.  The value of datum1 is then
185  * either the same pointer as "tuple", or is an abbreviated key value as
186  * described above.  Accordingly, "tuple" is always used in preference to
187  * datum1 as the authoritative value for pass-by-reference cases.
188  *
189  * While building initial runs, tupindex holds the tuple's run number.
190  * Historically, the run number could meaningfully distinguish many runs, but
191  * it now only distinguishes RUN_FIRST and HEAP_RUN_NEXT, since replacement
192  * selection is always abandoned after the first run; no other run number
193  * should be represented here.  During merge passes, we re-use it to hold the
194  * input tape number that each tuple in the heap was read from, or to hold the
195  * index of the next tuple pre-read from the same tape in the case of pre-read
196  * entries.  tupindex goes unused if the sort occurs entirely in memory.
197  */
198 typedef struct
199 {
200 	void	   *tuple;			/* the tuple itself */
201 	Datum		datum1;			/* value of first key column */
202 	bool		isnull1;		/* is first key column NULL? */
203 	int			tupindex;		/* see notes above */
204 } SortTuple;
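
/*
 * To illustrate how the stored first key column is used (a hedged sketch of
 * the general pattern only, not a copy of the comparetup_* routines defined
 * later in this file): a comparator can resolve most comparisons from
 * datum1/isnull1 alone, and deform the underlying tuples only when the
 * leading keys compare equal.
 *
 *		compare = ApplySortComparator(a->datum1, a->isnull1,
 *									  b->datum1, b->isnull1,
 *									  &state->sortKeys[0]);
 *		if (compare != 0)
 *			return compare;
 *		... only now fetch the remaining key columns with heap_getattr() or
 *		index_getattr() and compare them ...
 */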
205 
206 
207 /*
208  * Possible states of a Tuplesort object.  These denote the states that
209  * persist between calls of Tuplesort routines.
210  */
211 typedef enum
212 {
213 	TSS_INITIAL,				/* Loading tuples; still within memory limit */
214 	TSS_BOUNDED,				/* Loading tuples into bounded-size heap */
215 	TSS_BUILDRUNS,				/* Loading tuples; writing to tape */
216 	TSS_SORTEDINMEM,			/* Sort completed entirely in memory */
217 	TSS_SORTEDONTAPE,			/* Sort completed, final run is on tape */
218 	TSS_FINALMERGE				/* Performing final merge on-the-fly */
219 } TupSortStatus;
220 
221 /*
222  * Parameters for calculation of number of tapes to use --- see inittapes()
223  * and tuplesort_merge_order().
224  *
225  * In this calculation we assume that each tape will cost us about 3 blocks
226  * worth of buffer space (which is an underestimate for very large data
227  * volumes, but it's probably close enough --- see logtape.c).
228  *
229  * MERGE_BUFFER_SIZE is how much data we'd like to read from each input
230  * tape during a preread cycle (see discussion at top of file).
231  */
232 #define MINORDER		6		/* minimum merge order */
233 #define TAPE_BUFFER_OVERHEAD		(BLCKSZ * 3)
234 #define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
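
/*
 * As a worked example of the constants above (assuming the default 8kB
 * BLCKSZ; the numbers are illustrative only): each tape is charged about
 * 24kB of buffer space, and each input tape additionally wants a preread
 * buffer of about 256kB, so a sort given 64MB of workMem can afford a merge
 * order on the order of two hundred tapes, far more than Knuth's seven.
 */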
235 
236  /*
237   * Run numbers, used during external sort operations.
238   *
239   * HEAP_RUN_NEXT is only used for SortTuple.tupindex, never state.currentRun.
240   */
241 #define RUN_FIRST		0
242 #define HEAP_RUN_NEXT	INT_MAX
243 #define RUN_SECOND		1
244 
245 typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
246 												Tuplesortstate *state);
247 
248 /*
249  * Private state of a Tuplesort operation.
250  */
251 struct Tuplesortstate
252 {
253 	TupSortStatus status;		/* enumerated value as shown above */
254 	int			nKeys;			/* number of columns in sort key */
255 	bool		randomAccess;	/* did caller request random access? */
256 	bool		bounded;		/* did caller specify a maximum number of
257 								 * tuples to return? */
258 	bool		boundUsed;		/* true if we made use of a bounded heap */
259 	int			bound;			/* if bounded, the maximum number of tuples */
260 	bool		tuples;			/* Can SortTuple.tuple ever be set? */
261 	int64		availMem;		/* remaining memory available, in bytes */
262 	int64		allowedMem;		/* total memory allowed, in bytes */
263 	int			maxTapes;		/* number of tapes (Knuth's T) */
264 	int			tapeRange;		/* maxTapes-1 (Knuth's P) */
265 	MemoryContext sortcontext;	/* memory context holding most sort data */
266 	MemoryContext tuplecontext; /* sub-context of sortcontext for tuple data */
267 	LogicalTapeSet *tapeset;	/* logtape.c object for tapes in a temp file */
268 
269 	/*
270 	 * These function pointers decouple the routines that must know what kind
271 	 * of tuple we are sorting from the routines that don't need to know it.
272 	 * They are set up by the tuplesort_begin_xxx routines.
273 	 *
274 	 * Function to compare two tuples; result is per qsort() convention, ie:
275 	 * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
276 	 * qsort_arg_comparator.
277 	 */
278 	SortTupleComparator comparetup;
279 
280 	/*
281 	 * Function to copy a supplied input tuple into palloc'd space and set up
282 	 * its SortTuple representation (ie, set tuple/datum1/isnull1).  Also,
283 	 * state->availMem must be decreased by the amount of space used for the
284 	 * tuple copy (note the SortTuple struct itself is not counted).
285 	 */
286 	void		(*copytup) (Tuplesortstate *state, SortTuple *stup, void *tup);
287 
288 	/*
289 	 * Function to write a stored tuple onto tape.  The representation of the
290 	 * tuple on tape need not be the same as it is in memory; requirements on
291 	 * the tape representation are given below.  After writing the tuple,
292 	 * pfree() the out-of-line data (not the SortTuple struct!), and increase
293 	 * state->availMem by the amount of memory space thereby released.
294 	 */
295 	void		(*writetup) (Tuplesortstate *state, int tapenum,
296 										 SortTuple *stup);
297 
298 	/*
299 	 * Function to read a stored tuple from tape back into memory. 'len' is
300 	 * the already-read length of the stored tuple.  Create a palloc'd copy,
301 	 * initialize tuple/datum1/isnull1 in the target SortTuple struct, and
302 	 * decrease state->availMem by the amount of memory space consumed. (See
303 	 * batchUsed notes for details on how memory is handled when incremental
304 	 * accounting is abandoned.)
305 	 */
306 	void		(*readtup) (Tuplesortstate *state, SortTuple *stup,
307 										int tapenum, unsigned int len);
308 
309 	/*
310 	 * Function to move a caller tuple.  This is usually implemented as a
311 	 * memmove() shim, but the function may also perform additional fix-up of
312 	 * the caller tuple where needed.  Batch memory support requires the movement
313 	 * of caller tuples from one location in memory to another.
314 	 */
315 	void		(*movetup) (void *dest, void *src, unsigned int len);
316 
317 	/*
318 	 * This array holds the tuples now in sort memory.  If we are in state
319 	 * INITIAL, the tuples are in no particular order; if we are in state
320 	 * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
321 	 * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
322 	 * H.  (Note that memtupcount only counts the tuples that are part of the
323 	 * heap --- during merge passes, memtuples[] entries beyond tapeRange are
324 	 * never in the heap and are used to hold pre-read tuples.)  In state
325 	 * SORTEDONTAPE, the array is not used.
326 	 */
327 	SortTuple  *memtuples;		/* array of SortTuple structs */
328 	int			memtupcount;	/* number of tuples currently present */
329 	int			memtupsize;		/* allocated length of memtuples array */
330 	bool		growmemtuples;	/* memtuples' growth still underway? */
331 
332 	/*
333 	 * Memory for tuples is sometimes allocated in batch, rather than
334 	 * incrementally.  This implies that incremental memory accounting has
335 	 * been abandoned.  Currently, this only happens for the final on-the-fly
336 	 * merge step.  Large batch allocations can store tuples (e.g.
337 	 * IndexTuples) without palloc() fragmentation and other overhead.
338 	 */
339 	bool		batchUsed;
340 
341 	/*
342 	 * While building initial runs, this indicates if the replacement
343 	 * selection strategy is in use.  When it isn't, then a simple hybrid
344 	 * sort-merge strategy is in use instead (runs are quicksorted).
345 	 */
346 	bool		replaceActive;
347 
348 	/*
349 	 * While building initial runs, this is the current output run number
350 	 * (starting at RUN_FIRST).  Afterwards, it is the number of initial runs
351 	 * we made.
352 	 */
353 	int			currentRun;
354 
355 	/*
356 	 * Unless otherwise noted, all pointer variables below are pointers to
357 	 * arrays of length maxTapes, holding per-tape data.
358 	 */
359 
360 	/*
361 	 * These variables are only used during merge passes.  mergeactive[i] is
362 	 * true if we are reading an input run from (actual) tape number i and
363 	 * have not yet exhausted that run.  mergenext[i] is the memtuples index
364 	 * of the next pre-read tuple (next to be loaded into the heap) for tape
365 	 * i, or 0 if we are out of pre-read tuples.  mergelast[i] similarly
366 	 * points to the last pre-read tuple from each tape.  mergeavailslots[i]
367 	 * is the number of unused memtuples[] slots reserved for tape i, and
368 	 * mergeavailmem[i] is the amount of unused space allocated for tape i.
369 	 * mergefreelist and mergefirstfree keep track of unused locations in the
370 	 * memtuples[] array.  The memtuples[].tupindex fields link together
371 	 * pre-read tuples for each tape as well as recycled locations in
372 	 * mergefreelist. It is OK to use 0 as a null link in these lists, because
373 	 * memtuples[0] is part of the merge heap and is never a pre-read tuple.
374 	 */
375 	bool	   *mergeactive;	/* active input run source? */
376 	int		   *mergenext;		/* first preread tuple for each source */
377 	int		   *mergelast;		/* last preread tuple for each source */
378 	int		   *mergeavailslots;	/* slots left for prereading each tape */
379 	int64	   *mergeavailmem;	/* availMem for prereading each tape */
380 	int			mergefreelist;	/* head of freelist of recycled slots */
381 	int			mergefirstfree; /* first slot never used in this merge */
382 
383 	/*
384 	 * Per-tape batch state, when final on-the-fly merge consumes memory from
385 	 * just a few large allocations.
386 	 *
387 	 * Aside from the general benefits of performing fewer individual retail
388 	 * palloc() calls, this also helps make merging more cache efficient,
389 	 * since each tape's tuples must naturally be accessed sequentially (in
390 	 * sorted order).
391 	 */
392 	int64		spacePerTape;	/* Space (memory) for tuples (not slots) */
393 	char	  **mergetuples;	/* Each tape's memory allocation */
394 	char	  **mergecurrent;	/* Current offset into each tape's memory */
395 	char	  **mergetail;		/* Last item's start point for each tape */
396 	char	  **mergeoverflow;	/* Retail palloc() "overflow" for each tape */
397 
398 	/*
399 	 * Variables for Algorithm D.  Note that destTape is a "logical" tape
400 	 * number, ie, an index into the tp_xxx[] arrays.  Be careful to keep
401 	 * "logical" and "actual" tape numbers straight!
402 	 */
403 	int			Level;			/* Knuth's l */
404 	int			destTape;		/* current output tape (Knuth's j, less 1) */
405 	int		   *tp_fib;			/* Target Fibonacci run counts (A[]) */
406 	int		   *tp_runs;		/* # of real runs on each tape */
407 	int		   *tp_dummy;		/* # of dummy runs for each tape (D[]) */
408 	int		   *tp_tapenum;		/* Actual tape numbers (TAPE[]) */
409 	int			activeTapes;	/* # of active input tapes in merge pass */
410 
411 	/*
412 	 * These variables are used after completion of sorting to keep track of
413 	 * the next tuple to return.  (In the tape case, the tape's current read
414 	 * position is also critical state.)
415 	 */
416 	int			result_tape;	/* actual tape number of finished output */
417 	int			current;		/* array index (only used if SORTEDINMEM) */
418 	bool		eof_reached;	/* reached EOF (needed for cursors) */
419 
420 	/* markpos_xxx holds marked position for mark and restore */
421 	long		markpos_block;	/* tape block# (only used if SORTEDONTAPE) */
422 	int			markpos_offset; /* saved "current", or offset in tape block */
423 	bool		markpos_eof;	/* saved "eof_reached" */
424 
425 	/*
426 	 * The sortKeys variable is used by every case other than the hash index
427 	 * case; it is set by tuplesort_begin_xxx.  tupDesc is only used by the
428 	 * MinimalTuple and CLUSTER routines, though.
429 	 */
430 	TupleDesc	tupDesc;
431 	SortSupport sortKeys;		/* array of length nKeys */
432 
433 	/*
434 	 * This variable is shared by the single-key MinimalTuple case and the
435 	 * Datum case (which both use qsort_ssup()).  Otherwise it's NULL.
436 	 */
437 	SortSupport onlyKey;
438 
439 	/*
440 	 * Additional state for managing "abbreviated key" sortsupport routines
441 	 * (which currently may be used by all cases except the hash index case).
442 	 * Tracks the intervals at which the optimization's effectiveness is
443 	 * tested.
444 	 */
445 	int64		abbrevNext;		/* Tuple # at which to next check
446 								 * applicability */
447 
448 	/*
449 	 * These variables are specific to the CLUSTER case; they are set by
450 	 * tuplesort_begin_cluster.
451 	 */
452 	IndexInfo  *indexInfo;		/* info about index being used for reference */
453 	EState	   *estate;			/* for evaluating index expressions */
454 
455 	/*
456 	 * These variables are specific to the IndexTuple case; they are set by
457 	 * tuplesort_begin_index_xxx and used only by the IndexTuple routines.
458 	 */
459 	Relation	heapRel;		/* table the index is being built on */
460 	Relation	indexRel;		/* index being built */
461 
462 	/* These are specific to the index_btree subcase: */
463 	bool		enforceUnique;	/* complain if we find duplicate tuples */
464 
465 	/* These are specific to the index_hash subcase: */
466 	uint32		hash_mask;		/* mask for sortable part of hash code */
467 
468 	/*
469 	 * These variables are specific to the Datum case; they are set by
470 	 * tuplesort_begin_datum and used only by the DatumTuple routines.
471 	 */
472 	Oid			datumType;
473 	/* we need typelen in order to know how to copy the Datums. */
474 	int			datumTypeLen;
475 
476 	/*
477 	 * Resource snapshot for time of sort start.
478 	 */
479 #ifdef TRACE_SORT
480 	PGRUsage	ru_start;
481 #endif
482 };
483 
484 #define COMPARETUP(state,a,b)	((*(state)->comparetup) (a, b, state))
485 #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
486 #define WRITETUP(state,tape,stup)	((*(state)->writetup) (state, tape, stup))
487 #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len))
488 #define MOVETUP(dest,src,len) ((*(state)->movetup) (dest, src, len))
489 #define LACKMEM(state)		((state)->availMem < 0 && !(state)->batchUsed)
490 #define USEMEM(state,amt)	((state)->availMem -= (amt))
491 #define FREEMEM(state,amt)	((state)->availMem += (amt))
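
/*
 * For example (a sketch of the convention only, following the copytup and
 * writetup contracts described above): a copytup routine that has just
 * palloc'd a copy of the caller's tuple typically charges it to the sort
 * with
 *
 *		USEMEM(state, GetMemoryChunkSpace(stup->tuple));
 *
 * and the corresponding writetup routine, after pfree'ing that copy, gives
 * the same amount back via FREEMEM().
 */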
492 
493 /*
494  * NOTES about on-tape representation of tuples:
495  *
496  * We require the first "unsigned int" of a stored tuple to be the total size
497  * on-tape of the tuple, including itself (so it is never zero; an all-zero
498  * unsigned int is used to delimit runs).  The remainder of the stored tuple
499  * may or may not match the in-memory representation of the tuple ---
500  * any conversion needed is the job of the writetup and readtup routines.
501  *
502  * If state->randomAccess is true, then the stored representation of the
503  * tuple must be followed by another "unsigned int" that is a copy of the
504  * length --- so the total tape space used is actually sizeof(unsigned int)
505  * more than the stored length value.  This allows read-backwards.  When
506  * randomAccess is not true, the write/read routines may omit the extra
507  * length word.
508  *
509  * writetup is expected to write both length words as well as the tuple
510  * data.  When readtup is called, the tape is positioned just after the
511  * front length word; readtup must read the tuple data and advance past
512  * the back length word (if present).
513  *
514  * The write/read routines can make use of the tuple description data
515  * stored in the Tuplesortstate record, if needed.  They are also expected
516  * to adjust state->availMem by the amount of memory space (not tape space!)
517  * released or consumed.  There is no error return from either writetup
518  * or readtup; they should ereport() on failure.
519  *
520  *
521  * NOTES about memory consumption calculations:
522  *
523  * We count space allocated for tuples against the workMem limit, plus
524  * the space used by the variable-size memtuples array.  Fixed-size space
525  * is not counted; it's small enough to not be interesting.
526  *
527  * Note that we count actual space used (as shown by GetMemoryChunkSpace)
528  * rather than the originally-requested size.  This is important since
529  * palloc can add substantial overhead.  It's not a complete answer since
530  * we won't count any wasted space in palloc allocation blocks, but it's
531  * a lot better than what we were doing before 7.3.  As of 9.6, a
532  * separate memory context is used for caller passed tuples.  Resetting
533  * it at certain key increments significantly ameliorates fragmentation.
534  * Note that this places a responsibility on readtup and copytup routines
535  * to use the right memory context for these tuples (and to not use the
536  * reset context for anything whose lifetime needs to span multiple
537  * external sort runs).
538  */
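
/*
 * To make the on-tape convention above concrete, writing a tuple whose body
 * occupies "bodylen" bytes looks roughly like this (a hedged sketch only;
 * the actual writetup_* routines below differ in how they obtain the body
 * and release memory):
 *
 *		unsigned int tuplen = bodylen + sizeof(tuplen);
 *
 *		LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
 *		LogicalTapeWrite(state->tapeset, tapenum, body, bodylen);
 *		if (state->randomAccess)
 *			LogicalTapeWrite(state->tapeset, tapenum, &tuplen, sizeof(tuplen));
 */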
539 
540 /* When using this macro, beware of double evaluation of len */
541 #define LogicalTapeReadExact(tapeset, tapenum, ptr, len) \
542 	do { \
543 		if (LogicalTapeRead(tapeset, tapenum, ptr, len) != (size_t) (len)) \
544 			elog(ERROR, "unexpected end of data"); \
545 	} while(0)
546 
547 
548 static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess);
549 static void puttuple_common(Tuplesortstate *state, SortTuple *tuple);
550 static bool consider_abort_common(Tuplesortstate *state);
551 static bool useselection(Tuplesortstate *state);
552 static void inittapes(Tuplesortstate *state);
553 static void selectnewtape(Tuplesortstate *state);
554 static void mergeruns(Tuplesortstate *state);
555 static void mergeonerun(Tuplesortstate *state);
556 static void beginmerge(Tuplesortstate *state, bool finalMergeBatch);
557 static void batchmemtuples(Tuplesortstate *state);
558 static void mergebatch(Tuplesortstate *state, int64 spacePerTape);
559 static void mergebatchone(Tuplesortstate *state, int srcTape,
560 			  SortTuple *stup, bool *should_free);
561 static void mergebatchfreetape(Tuplesortstate *state, int srcTape,
562 				   SortTuple *rtup, bool *should_free);
563 static void *mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen);
564 static void mergepreread(Tuplesortstate *state);
565 static void mergeprereadone(Tuplesortstate *state, int srcTape);
566 static void dumptuples(Tuplesortstate *state, bool alltuples);
567 static void dumpbatch(Tuplesortstate *state, bool alltuples);
568 static void make_bounded_heap(Tuplesortstate *state);
569 static void sort_bounded_heap(Tuplesortstate *state);
570 static void tuplesort_sort_memtuples(Tuplesortstate *state);
571 static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
572 					  int tupleindex, bool checkIndex);
573 static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex);
574 static void reversedirection(Tuplesortstate *state);
575 static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
576 static void markrunend(Tuplesortstate *state, int tapenum);
577 static void *readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen);
578 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
579 				Tuplesortstate *state);
580 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
581 static void writetup_heap(Tuplesortstate *state, int tapenum,
582 			  SortTuple *stup);
583 static void readtup_heap(Tuplesortstate *state, SortTuple *stup,
584 			 int tapenum, unsigned int len);
585 static void movetup_heap(void *dest, void *src, unsigned int len);
586 static int comparetup_cluster(const SortTuple *a, const SortTuple *b,
587 				   Tuplesortstate *state);
588 static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup);
589 static void writetup_cluster(Tuplesortstate *state, int tapenum,
590 				 SortTuple *stup);
591 static void readtup_cluster(Tuplesortstate *state, SortTuple *stup,
592 				int tapenum, unsigned int len);
593 static void movetup_cluster(void *dest, void *src, unsigned int len);
594 static int comparetup_index_btree(const SortTuple *a, const SortTuple *b,
595 					   Tuplesortstate *state);
596 static int comparetup_index_hash(const SortTuple *a, const SortTuple *b,
597 					  Tuplesortstate *state);
598 static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup);
599 static void writetup_index(Tuplesortstate *state, int tapenum,
600 			   SortTuple *stup);
601 static void readtup_index(Tuplesortstate *state, SortTuple *stup,
602 			  int tapenum, unsigned int len);
603 static void movetup_index(void *dest, void *src, unsigned int len);
604 static int comparetup_datum(const SortTuple *a, const SortTuple *b,
605 				 Tuplesortstate *state);
606 static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup);
607 static void writetup_datum(Tuplesortstate *state, int tapenum,
608 			   SortTuple *stup);
609 static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
610 			  int tapenum, unsigned int len);
611 static void movetup_datum(void *dest, void *src, unsigned int len);
612 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
613 
614 /*
615  * Special versions of qsort just for SortTuple objects.  qsort_tuple() sorts
616  * any variant of SortTuples, using the appropriate comparetup function.
617  * qsort_ssup() is specialized for the case where the comparetup function
618  * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
619  * and Datum sorts.
620  */
621 #include "qsort_tuple.c"
622 
623 
624 /*
625  *		tuplesort_begin_xxx
626  *
627  * Initialize for a tuple sort operation.
628  *
629  * After calling tuplesort_begin, the caller should call tuplesort_putXXX
630  * zero or more times, then call tuplesort_performsort when all the tuples
631  * have been supplied.  After performsort, retrieve the tuples in sorted
632  * order by calling tuplesort_getXXX until it returns false/NULL.  (If random
633  * access was requested, rescan, markpos, and restorepos can also be called.)
634  * Call tuplesort_end to terminate the operation and release memory/disk space.
635  *
636  * Each variant of tuplesort_begin has a workMem parameter specifying the
637  * maximum number of kilobytes of RAM to use before spilling data to disk.
638  * (The normal value of this parameter is work_mem, but some callers use
639  * other values.)  Each variant also has a randomAccess parameter specifying
640  * whether the caller needs non-sequential access to the sort result.
641  */
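
/*
 * A minimal caller-side sketch of the sequence described above (argument
 * lists are abbreviated and partly hypothetical; see tuplesort.h for the
 * exact signatures):
 *
 *		state = tuplesort_begin_heap(tupDesc, nkeys, attNums, sortOperators,
 *									 sortCollations, nullsFirstFlags,
 *									 work_mem, false);
 *		while (... more input ...)
 *			tuplesort_puttupleslot(state, slot);
 *		tuplesort_performsort(state);
 *		while (tuplesort_gettupleslot(state, true, slot))
 *			... process the sorted tuple now in slot ...
 *		tuplesort_end(state);
 */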
642 
643 static Tuplesortstate *
644 tuplesort_begin_common(int workMem, bool randomAccess)
645 {
646 	Tuplesortstate *state;
647 	MemoryContext sortcontext;
648 	MemoryContext tuplecontext;
649 	MemoryContext oldcontext;
650 
651 	/*
652 	 * Create a working memory context for this sort operation. All data
653 	 * needed by the sort will live inside this context.
654 	 */
655 	sortcontext = AllocSetContextCreate(CurrentMemoryContext,
656 										"TupleSort main",
657 										ALLOCSET_DEFAULT_SIZES);
658 
659 	/*
660 	 * Caller tuple (e.g. IndexTuple) memory context.
661 	 *
662 	 * A dedicated child context used exclusively for caller passed tuples
663 	 * eases memory management.  Resetting at key points reduces
664 	 * fragmentation. Note that the memtuples array of SortTuples is allocated
665 	 * in the parent context, not this context, because there is no need to
666 	 * free memtuples early.
667 	 */
668 	tuplecontext = AllocSetContextCreate(sortcontext,
669 										 "Caller tuples",
670 										 ALLOCSET_DEFAULT_SIZES);
671 
672 	/*
673 	 * Make the Tuplesortstate within the per-sort context.  This way, we
674 	 * don't need a separate pfree() operation for it at shutdown.
675 	 */
676 	oldcontext = MemoryContextSwitchTo(sortcontext);
677 
678 	state = (Tuplesortstate *) palloc0(sizeof(Tuplesortstate));
679 
680 #ifdef TRACE_SORT
681 	if (trace_sort)
682 		pg_rusage_init(&state->ru_start);
683 #endif
684 
685 	state->status = TSS_INITIAL;
686 	state->randomAccess = randomAccess;
687 	state->bounded = false;
688 	state->tuples = true;
689 	state->boundUsed = false;
690 	state->allowedMem = workMem * (int64) 1024;
691 	state->availMem = state->allowedMem;
692 	state->sortcontext = sortcontext;
693 	state->tuplecontext = tuplecontext;
694 	state->tapeset = NULL;
695 
696 	state->memtupcount = 0;
697 
698 	/*
699 	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
700 	 * see comments in grow_memtuples().
701 	 */
702 	state->memtupsize = Max(1024,
703 						ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1);
704 
705 	state->growmemtuples = true;
706 	state->batchUsed = false;
707 	state->memtuples = (SortTuple *) palloc(state->memtupsize * sizeof(SortTuple));
708 
709 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
710 
711 	/* workMem must be large enough for the minimal memtuples array */
712 	if (LACKMEM(state))
713 		elog(ERROR, "insufficient memory allowed for sort");
714 
715 	state->currentRun = RUN_FIRST;
716 
717 	/*
718 	 * maxTapes, tapeRange, and Algorithm D variables will be initialized by
719 	 * inittapes(), if needed
720 	 */
721 
722 	state->result_tape = -1;	/* flag that result tape has not been formed */
723 
724 	MemoryContextSwitchTo(oldcontext);
725 
726 	return state;
727 }
728 
729 Tuplesortstate *
730 tuplesort_begin_heap(TupleDesc tupDesc,
731 					 int nkeys, AttrNumber *attNums,
732 					 Oid *sortOperators, Oid *sortCollations,
733 					 bool *nullsFirstFlags,
734 					 int workMem, bool randomAccess)
735 {
736 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
737 	MemoryContext oldcontext;
738 	int			i;
739 
740 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
741 
742 	AssertArg(nkeys > 0);
743 
744 #ifdef TRACE_SORT
745 	if (trace_sort)
746 		elog(LOG,
747 			 "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
748 			 nkeys, workMem, randomAccess ? 't' : 'f');
749 #endif
750 
751 	state->nKeys = nkeys;
752 
753 	TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
754 								false,	/* no unique check */
755 								nkeys,
756 								workMem,
757 								randomAccess);
758 
759 	state->comparetup = comparetup_heap;
760 	state->copytup = copytup_heap;
761 	state->writetup = writetup_heap;
762 	state->readtup = readtup_heap;
763 	state->movetup = movetup_heap;
764 
765 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
766 	state->abbrevNext = 10;
767 
768 	/* Prepare SortSupport data for each column */
769 	state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
770 
771 	for (i = 0; i < nkeys; i++)
772 	{
773 		SortSupport sortKey = state->sortKeys + i;
774 
775 		AssertArg(attNums[i] != 0);
776 		AssertArg(sortOperators[i] != 0);
777 
778 		sortKey->ssup_cxt = CurrentMemoryContext;
779 		sortKey->ssup_collation = sortCollations[i];
780 		sortKey->ssup_nulls_first = nullsFirstFlags[i];
781 		sortKey->ssup_attno = attNums[i];
782 		/* Convey if abbreviation optimization is applicable in principle */
783 		sortKey->abbreviate = (i == 0);
784 
785 		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
786 	}
787 
788 	/*
789 	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
790 	 * tie-breaker comparisons may be required.  Typically, the optimization
791 	 * is only of value to pass-by-value types anyway, whereas abbreviated
792 	 * keys are typically only of value to pass-by-reference types.
793 	 */
794 	if (nkeys == 1 && !state->sortKeys->abbrev_converter)
795 		state->onlyKey = state->sortKeys;
796 
797 	MemoryContextSwitchTo(oldcontext);
798 
799 	return state;
800 }
801 
802 Tuplesortstate *
803 tuplesort_begin_cluster(TupleDesc tupDesc,
804 						Relation indexRel,
805 						int workMem, bool randomAccess)
806 {
807 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
808 	ScanKey		indexScanKey;
809 	MemoryContext oldcontext;
810 	int			i;
811 
812 	Assert(indexRel->rd_rel->relam == BTREE_AM_OID);
813 
814 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
815 
816 #ifdef TRACE_SORT
817 	if (trace_sort)
818 		elog(LOG,
819 			 "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
820 			 RelationGetNumberOfAttributes(indexRel),
821 			 workMem, randomAccess ? 't' : 'f');
822 #endif
823 
824 	state->nKeys = RelationGetNumberOfAttributes(indexRel);
825 
826 	TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT,
827 								false,	/* no unique check */
828 								state->nKeys,
829 								workMem,
830 								randomAccess);
831 
832 	state->comparetup = comparetup_cluster;
833 	state->copytup = copytup_cluster;
834 	state->writetup = writetup_cluster;
835 	state->readtup = readtup_cluster;
836 	state->movetup = movetup_cluster;
837 	state->abbrevNext = 10;
838 
839 	state->indexInfo = BuildIndexInfo(indexRel);
840 
841 	state->tupDesc = tupDesc;	/* assume we need not copy tupDesc */
842 
843 	indexScanKey = _bt_mkscankey_nodata(indexRel);
844 
845 	if (state->indexInfo->ii_Expressions != NULL)
846 	{
847 		TupleTableSlot *slot;
848 		ExprContext *econtext;
849 
850 		/*
851 		 * We will need to use FormIndexDatum to evaluate the index
852 		 * expressions.  To do that, we need an EState, as well as a
853 		 * TupleTableSlot to put the table tuples into.  The econtext's
854 		 * scantuple has to point to that slot, too.
855 		 */
856 		state->estate = CreateExecutorState();
857 		slot = MakeSingleTupleTableSlot(tupDesc);
858 		econtext = GetPerTupleExprContext(state->estate);
859 		econtext->ecxt_scantuple = slot;
860 	}
861 
862 	/* Prepare SortSupport data for each column */
863 	state->sortKeys = (SortSupport) palloc0(state->nKeys *
864 											sizeof(SortSupportData));
865 
866 	for (i = 0; i < state->nKeys; i++)
867 	{
868 		SortSupport sortKey = state->sortKeys + i;
869 		ScanKey		scanKey = indexScanKey + i;
870 		int16		strategy;
871 
872 		sortKey->ssup_cxt = CurrentMemoryContext;
873 		sortKey->ssup_collation = scanKey->sk_collation;
874 		sortKey->ssup_nulls_first =
875 			(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
876 		sortKey->ssup_attno = scanKey->sk_attno;
877 		/* Convey if abbreviation optimization is applicable in principle */
878 		sortKey->abbreviate = (i == 0);
879 
880 		AssertState(sortKey->ssup_attno != 0);
881 
882 		strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
883 			BTGreaterStrategyNumber : BTLessStrategyNumber;
884 
885 		PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
886 	}
887 
888 	_bt_freeskey(indexScanKey);
889 
890 	MemoryContextSwitchTo(oldcontext);
891 
892 	return state;
893 }
894 
895 Tuplesortstate *
896 tuplesort_begin_index_btree(Relation heapRel,
897 							Relation indexRel,
898 							bool enforceUnique,
899 							int workMem, bool randomAccess)
900 {
901 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
902 	ScanKey		indexScanKey;
903 	MemoryContext oldcontext;
904 	int			i;
905 
906 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
907 
908 #ifdef TRACE_SORT
909 	if (trace_sort)
910 		elog(LOG,
911 			 "begin index sort: unique = %c, workMem = %d, randomAccess = %c",
912 			 enforceUnique ? 't' : 'f',
913 			 workMem, randomAccess ? 't' : 'f');
914 #endif
915 
916 	state->nKeys = RelationGetNumberOfAttributes(indexRel);
917 
918 	TRACE_POSTGRESQL_SORT_START(INDEX_SORT,
919 								enforceUnique,
920 								state->nKeys,
921 								workMem,
922 								randomAccess);
923 
924 	state->comparetup = comparetup_index_btree;
925 	state->copytup = copytup_index;
926 	state->writetup = writetup_index;
927 	state->readtup = readtup_index;
928 	state->movetup = movetup_index;
929 	state->abbrevNext = 10;
930 
931 	state->heapRel = heapRel;
932 	state->indexRel = indexRel;
933 	state->enforceUnique = enforceUnique;
934 
935 	indexScanKey = _bt_mkscankey_nodata(indexRel);
936 	state->nKeys = RelationGetNumberOfAttributes(indexRel);
937 
938 	/* Prepare SortSupport data for each column */
939 	state->sortKeys = (SortSupport) palloc0(state->nKeys *
940 											sizeof(SortSupportData));
941 
942 	for (i = 0; i < state->nKeys; i++)
943 	{
944 		SortSupport sortKey = state->sortKeys + i;
945 		ScanKey		scanKey = indexScanKey + i;
946 		int16		strategy;
947 
948 		sortKey->ssup_cxt = CurrentMemoryContext;
949 		sortKey->ssup_collation = scanKey->sk_collation;
950 		sortKey->ssup_nulls_first =
951 			(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
952 		sortKey->ssup_attno = scanKey->sk_attno;
953 		/* Convey if abbreviation optimization is applicable in principle */
954 		sortKey->abbreviate = (i == 0);
955 
956 		AssertState(sortKey->ssup_attno != 0);
957 
958 		strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
959 			BTGreaterStrategyNumber : BTLessStrategyNumber;
960 
961 		PrepareSortSupportFromIndexRel(indexRel, strategy, sortKey);
962 	}
963 
964 	_bt_freeskey(indexScanKey);
965 
966 	MemoryContextSwitchTo(oldcontext);
967 
968 	return state;
969 }
970 
971 Tuplesortstate *
972 tuplesort_begin_index_hash(Relation heapRel,
973 						   Relation indexRel,
974 						   uint32 hash_mask,
975 						   int workMem, bool randomAccess)
976 {
977 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
978 	MemoryContext oldcontext;
979 
980 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
981 
982 #ifdef TRACE_SORT
983 	if (trace_sort)
984 		elog(LOG,
985 		"begin index sort: hash_mask = 0x%x, workMem = %d, randomAccess = %c",
986 			 hash_mask,
987 			 workMem, randomAccess ? 't' : 'f');
988 #endif
989 
990 	state->nKeys = 1;			/* Only one sort column, the hash code */
991 
992 	state->comparetup = comparetup_index_hash;
993 	state->copytup = copytup_index;
994 	state->writetup = writetup_index;
995 	state->readtup = readtup_index;
996 	state->movetup = movetup_index;
997 
998 	state->heapRel = heapRel;
999 	state->indexRel = indexRel;
1000 
1001 	state->hash_mask = hash_mask;
1002 
1003 	MemoryContextSwitchTo(oldcontext);
1004 
1005 	return state;
1006 }
1007 
1008 Tuplesortstate *
1009 tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
1010 					  bool nullsFirstFlag,
1011 					  int workMem, bool randomAccess)
1012 {
1013 	Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess);
1014 	MemoryContext oldcontext;
1015 	int16		typlen;
1016 	bool		typbyval;
1017 
1018 	oldcontext = MemoryContextSwitchTo(state->sortcontext);
1019 
1020 #ifdef TRACE_SORT
1021 	if (trace_sort)
1022 		elog(LOG,
1023 			 "begin datum sort: workMem = %d, randomAccess = %c",
1024 			 workMem, randomAccess ? 't' : 'f');
1025 #endif
1026 
1027 	state->nKeys = 1;			/* always a one-column sort */
1028 
1029 	TRACE_POSTGRESQL_SORT_START(DATUM_SORT,
1030 								false,	/* no unique check */
1031 								1,
1032 								workMem,
1033 								randomAccess);
1034 
1035 	state->comparetup = comparetup_datum;
1036 	state->copytup = copytup_datum;
1037 	state->writetup = writetup_datum;
1038 	state->readtup = readtup_datum;
1039 	state->movetup = movetup_datum;
1040 	state->abbrevNext = 10;
1041 
1042 	state->datumType = datumType;
1043 
1044 	/* lookup necessary attributes of the datum type */
1045 	get_typlenbyval(datumType, &typlen, &typbyval);
1046 	state->datumTypeLen = typlen;
1047 	state->tuples = !typbyval;
1048 
1049 	/* Prepare SortSupport data */
1050 	state->sortKeys = (SortSupport) palloc0(sizeof(SortSupportData));
1051 
1052 	state->sortKeys->ssup_cxt = CurrentMemoryContext;
1053 	state->sortKeys->ssup_collation = sortCollation;
1054 	state->sortKeys->ssup_nulls_first = nullsFirstFlag;
1055 
1056 	/*
1057 	 * Abbreviation is possible here only for by-reference types.  In theory,
1058 	 * a pass-by-value datatype could have an abbreviated form that is cheaper
1059 	 * to compare.  In a tuple sort, we could support that, because we can
1060 	 * always extract the original datum from the tuple as needed.  Here, we
1061 	 * can't, because a datum sort only stores a single copy of the datum; the
1062 	 * "tuple" field of each sortTuple is NULL.
1063 	 */
1064 	state->sortKeys->abbreviate = !typbyval;
1065 
1066 	PrepareSortSupportFromOrderingOp(sortOperator, state->sortKeys);
1067 
1068 	/*
1069 	 * The "onlyKey" optimization cannot be used with abbreviated keys, since
1070 	 * tie-breaker comparisons may be required.  Typically, the optimization
1071 	 * is only of value to pass-by-value types anyway, whereas abbreviated
1072 	 * keys are typically only of value to pass-by-reference types.
1073 	 */
1074 	if (!state->sortKeys->abbrev_converter)
1075 		state->onlyKey = state->sortKeys;
1076 
1077 	MemoryContextSwitchTo(oldcontext);
1078 
1079 	return state;
1080 }
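
/*
 * For instance, a hypothetical caller sorting plain int4 Datums (shown only
 * to illustrate the Datum variant; operator and argument details are
 * assumptions, see tuplesort.h and pg_operator.h) might do roughly:
 *
 *		state = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
 *									  false, work_mem, false);
 *		tuplesort_putdatum(state, Int32GetDatum(x), false);
 *		...
 *		tuplesort_performsort(state);
 *		while (tuplesort_getdatum(state, true, &val, &isNull))
 *			...
 *		tuplesort_end(state);
 */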
1081 
1082 /*
1083  * tuplesort_set_bound
1084  *
1085  *	Advise tuplesort that at most the first N result tuples are required.
1086  *
1087  * Must be called before inserting any tuples.  (Actually, we could allow it
1088  * as long as the sort hasn't spilled to disk, but there seems no need for
1089  * delayed calls at the moment.)
1090  *
1091  * This is a hint only. The tuplesort may still return more tuples than
1092  * requested.
1093  */
1094 void
1095 tuplesort_set_bound(Tuplesortstate *state, int64 bound)
1096 {
1097 	/* Assert we're called before loading any tuples */
1098 	Assert(state->status == TSS_INITIAL);
1099 	Assert(state->memtupcount == 0);
1100 	Assert(!state->bounded);
1101 
1102 #ifdef DEBUG_BOUNDED_SORT
1103 	/* Honor GUC setting that disables the feature (for easy testing) */
1104 	if (!optimize_bounded_sort)
1105 		return;
1106 #endif
1107 
1108 	/* We want to be able to compute bound * 2, so limit the setting */
1109 	if (bound > (int64) (INT_MAX / 2))
1110 		return;
1111 
1112 	state->bounded = true;
1113 	state->bound = (int) bound;
1114 
1115 	/*
1116 	 * Bounded sorts are not an effective target for abbreviated key
1117 	 * optimization.  Disable by setting state to be consistent with no
1118 	 * abbreviation support.
1119 	 */
1120 	state->sortKeys->abbrev_converter = NULL;
1121 	if (state->sortKeys->abbrev_full_comparator)
1122 		state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
1123 
1124 	/* Not strictly necessary, but be tidy */
1125 	state->sortKeys->abbrev_abort = NULL;
1126 	state->sortKeys->abbrev_full_comparator = NULL;
1127 }
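
/*
 * For example, an ORDER BY ... LIMIT 100 plan can call
 * tuplesort_set_bound(state, 100) right after tuplesort_begin_heap(),
 * letting the sort switch to a bounded heap (TSS_BOUNDED) that retains only
 * the 100 best tuples seen so far rather than accumulating the whole input.
 */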
1128 
1129 /*
1130  * tuplesort_end
1131  *
1132  *	Release resources and clean up.
1133  *
1134  * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
1135  * pointing to garbage.  Be careful not to attempt to use or free such
1136  * pointers afterwards!
1137  */
1138 void
1139 tuplesort_end(Tuplesortstate *state)
1140 {
1141 	/* context swap probably not needed, but let's be safe */
1142 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1143 
1144 #ifdef TRACE_SORT
1145 	long		spaceUsed;
1146 
1147 	if (state->tapeset)
1148 		spaceUsed = LogicalTapeSetBlocks(state->tapeset);
1149 	else
1150 		spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
1151 #endif
1152 
1153 	/*
1154 	 * Delete temporary "tape" files, if any.
1155 	 *
1156 	 * Note: want to include this in reported total cost of sort, hence need
1157 	 * for two #ifdef TRACE_SORT sections.
1158 	 */
1159 	if (state->tapeset)
1160 		LogicalTapeSetClose(state->tapeset);
1161 
1162 #ifdef TRACE_SORT
1163 	if (trace_sort)
1164 	{
1165 		if (state->tapeset)
1166 			elog(LOG, "external sort ended, %ld disk blocks used: %s",
1167 				 spaceUsed, pg_rusage_show(&state->ru_start));
1168 		else
1169 			elog(LOG, "internal sort ended, %ld KB used: %s",
1170 				 spaceUsed, pg_rusage_show(&state->ru_start));
1171 	}
1172 
1173 	TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, spaceUsed);
1174 #else
1175 
1176 	/*
1177 	 * If you disabled TRACE_SORT, you can still probe sort__done, but you
1178 	 * ain't getting space-used stats.
1179 	 */
1180 	TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L);
1181 #endif
1182 
1183 	/* Free any execution state created for CLUSTER case */
1184 	if (state->estate != NULL)
1185 	{
1186 		ExprContext *econtext = GetPerTupleExprContext(state->estate);
1187 
1188 		ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple);
1189 		FreeExecutorState(state->estate);
1190 	}
1191 
1192 	MemoryContextSwitchTo(oldcontext);
1193 
1194 	/*
1195 	 * Free the per-sort memory context, thereby releasing all working memory,
1196 	 * including the Tuplesortstate struct itself.
1197 	 */
1198 	MemoryContextDelete(state->sortcontext);
1199 }
1200 
1201 /*
1202  * Grow the memtuples[] array, if possible within our memory constraint.  We
1203  * must not exceed INT_MAX tuples in memory or the caller-provided memory
1204  * limit.  Return TRUE if we were able to enlarge the array, FALSE if not.
1205  *
1206  * Normally, at each increment we double the size of the array.  When doing
1207  * that would exceed a limit, we attempt one last, smaller increase (and then
1208  * clear the growmemtuples flag so we don't try any more).  That allows us to
1209  * use memory as fully as permitted; sticking to the pure doubling rule could
1210  * result in almost half going unused.  Because availMem moves around with
1211  * tuple addition/removal, we need some rule to prevent making repeated small
1212  * increases in memtupsize, which would just be useless thrashing.  The
1213  * growmemtuples flag accomplishes that and also prevents useless
1214  * recalculations in this function.
1215  */
1216 static bool
1217 grow_memtuples(Tuplesortstate *state)
1218 {
1219 	int			newmemtupsize;
1220 	int			memtupsize = state->memtupsize;
1221 	int64		memNowUsed = state->allowedMem - state->availMem;
1222 
1223 	/* Forget it if we've already maxed out memtuples, per comment above */
1224 	if (!state->growmemtuples)
1225 		return false;
1226 
1227 	/* Select new value of memtupsize */
1228 	if (memNowUsed <= state->availMem)
1229 	{
1230 		/*
1231 		 * We've used no more than half of allowedMem; double our usage,
1232 		 * clamping at INT_MAX tuples.
1233 		 */
1234 		if (memtupsize < INT_MAX / 2)
1235 			newmemtupsize = memtupsize * 2;
1236 		else
1237 		{
1238 			newmemtupsize = INT_MAX;
1239 			state->growmemtuples = false;
1240 		}
1241 	}
1242 	else
1243 	{
1244 		/*
1245 		 * This will be the last increment of memtupsize.  Abandon doubling
1246 		 * strategy and instead increase as much as we safely can.
1247 		 *
1248 		 * To stay within allowedMem, we can't increase memtupsize by more
1249 		 * than availMem / sizeof(SortTuple) elements.  In practice, we want
1250 		 * to increase it by considerably less, because we need to leave some
1251 		 * space for the tuples to which the new array slots will refer.  We
1252 		 * assume the new tuples will be about the same size as the tuples
1253 		 * we've already seen, and thus we can extrapolate from the space
1254 		 * consumption so far to estimate an appropriate new size for the
1255 		 * memtuples array.  The optimal value might be higher or lower than
1256 		 * this estimate, but it's hard to know that in advance.  We again
1257 		 * clamp at INT_MAX tuples.
1258 		 *
1259 		 * This calculation is safe against enlarging the array so much that
1260 		 * LACKMEM becomes true, because the memory currently used includes
1261 		 * the present array; thus, there would be enough allowedMem for the
1262 		 * new array elements even if no other memory were currently used.
1263 		 *
1264 		 * We do the arithmetic in float8, because otherwise the product of
1265 		 * memtupsize and allowedMem could overflow.  Any inaccuracy in the
1266 		 * result should be insignificant; but even if we computed a
1267 		 * completely insane result, the checks below will prevent anything
1268 		 * really bad from happening.
1269 		 */
1270 		double		grow_ratio;
1271 
1272 		grow_ratio = (double) state->allowedMem / (double) memNowUsed;
1273 		if (memtupsize * grow_ratio < INT_MAX)
1274 			newmemtupsize = (int) (memtupsize * grow_ratio);
1275 		else
1276 			newmemtupsize = INT_MAX;
1277 
1278 		/* We won't make any further enlargement attempts */
1279 		state->growmemtuples = false;
1280 	}
1281 
1282 	/* Must enlarge array by at least one element, else report failure */
1283 	if (newmemtupsize <= memtupsize)
1284 		goto noalloc;
1285 
1286 	/*
1287 	 * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize.  Clamp
1288 	 * to ensure our request won't be rejected.  Note that we can easily
1289 	 * exhaust address space before facing this outcome.  (This is presently
1290 	 * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1291 	 * don't rely on that at this distance.)
1292 	 */
1293 	if ((Size) newmemtupsize >= MaxAllocHugeSize / sizeof(SortTuple))
1294 	{
1295 		newmemtupsize = (int) (MaxAllocHugeSize / sizeof(SortTuple));
1296 		state->growmemtuples = false;	/* can't grow any more */
1297 	}
1298 
1299 	/*
1300 	 * We need to be sure that we do not cause LACKMEM to become true, else
1301 	 * the space management algorithm will go nuts.  The code above should
1302 	 * never generate a dangerous request, but to be safe, check explicitly
1303 	 * that the array growth fits within availMem.  (We could still cause
1304 	 * LACKMEM if the memory chunk overhead associated with the memtuples
1305 	 * array were to increase.  That shouldn't happen because we chose the
1306 	 * initial array size large enough to ensure that palloc will be treating
1307 	 * both old and new arrays as separate chunks.  But we'll check LACKMEM
1308 	 * explicitly below just in case.)
1309 	 */
1310 	if (state->availMem < (int64) ((newmemtupsize - memtupsize) * sizeof(SortTuple)))
1311 		goto noalloc;
1312 
1313 	/* OK, do it */
1314 	FREEMEM(state, GetMemoryChunkSpace(state->memtuples));
1315 	state->memtupsize = newmemtupsize;
1316 	state->memtuples = (SortTuple *)
1317 		repalloc_huge(state->memtuples,
1318 					  state->memtupsize * sizeof(SortTuple));
1319 	USEMEM(state, GetMemoryChunkSpace(state->memtuples));
1320 	if (LACKMEM(state))
1321 		elog(ERROR, "unexpected out-of-memory situation in tuplesort");
1322 	return true;
1323 
1324 noalloc:
1325 	/* If for any reason we didn't realloc, shut off future attempts */
1326 	state->growmemtuples = false;
1327 	return false;
1328 }
1329 
1330 /*
1331  * Accept one tuple while collecting input data for sort.
1332  *
1333  * Note that the input data is always copied; the caller need not save it.
1334  */
1335 void
1336 tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
1337 {
1338 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1339 	SortTuple	stup;
1340 
1341 	/*
1342 	 * Copy the given tuple into memory we control, and decrease availMem.
1343 	 * Then call the common code.
1344 	 */
1345 	COPYTUP(state, &stup, (void *) slot);
1346 
1347 	puttuple_common(state, &stup);
1348 
1349 	MemoryContextSwitchTo(oldcontext);
1350 }
1351 
1352 /*
1353  * Accept one tuple while collecting input data for sort.
1354  *
1355  * Note that the input data is always copied; the caller need not save it.
1356  */
1357 void
1358 tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
1359 {
1360 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1361 	SortTuple	stup;
1362 
1363 	/*
1364 	 * Copy the given tuple into memory we control, and decrease availMem.
1365 	 * Then call the common code.
1366 	 */
1367 	COPYTUP(state, &stup, (void *) tup);
1368 
1369 	puttuple_common(state, &stup);
1370 
1371 	MemoryContextSwitchTo(oldcontext);
1372 }
1373 
1374 /*
1375  * Collect one index tuple while collecting input data for sort, building
1376  * it from caller-supplied values.
1377  */
1378 void
1379 tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel,
1380 							  ItemPointer self, Datum *values,
1381 							  bool *isnull)
1382 {
1383 	MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1384 	SortTuple	stup;
1385 	Datum		original;
1386 	IndexTuple	tuple;
1387 
1388 	stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
1389 	tuple = ((IndexTuple) stup.tuple);
1390 	tuple->t_tid = *self;
1391 	USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1392 	/* set up first-column key value */
1393 	original = index_getattr(tuple,
1394 							 1,
1395 							 RelationGetDescr(state->indexRel),
1396 							 &stup.isnull1);
1397 
1398 	MemoryContextSwitchTo(state->sortcontext);
1399 
1400 	if (!state->sortKeys || !state->sortKeys->abbrev_converter || stup.isnull1)
1401 	{
1402 		/*
1403 		 * Store ordinary Datum representation, or NULL value.  If there is a
1404 		 * converter, it won't expect NULL values, and the cost model is not
1405 		 * required to account for NULLs, so in that case we avoid calling the
1406 		 * converter and just set datum1 to the zeroed representation (to be
1407 		 * consistent, and to support cheap inequality tests for NULL
1408 		 * abbreviated keys).
1409 		 */
1410 		stup.datum1 = original;
1411 	}
1412 	else if (!consider_abort_common(state))
1413 	{
1414 		/* Store abbreviated key representation */
1415 		stup.datum1 = state->sortKeys->abbrev_converter(original,
1416 														state->sortKeys);
1417 	}
1418 	else
1419 	{
1420 		/* Abort abbreviation */
1421 		int			i;
1422 
1423 		stup.datum1 = original;
1424 
1425 		/*
1426 		 * Set state to be consistent with never trying abbreviation.
1427 		 *
1428 		 * Alter datum1 representation in already-copied tuples, so as to
1429 		 * ensure a consistent representation (current tuple was just
1430 		 * handled).  It does not matter if some dumped tuples are already
1431 		 * sorted on tape, since serialized tuples lack abbreviated keys
1432 		 * (TSS_BUILDRUNS state prevents control reaching here in any case).
1433 		 */
1434 		for (i = 0; i < state->memtupcount; i++)
1435 		{
1436 			SortTuple  *mtup = &state->memtuples[i];
1437 
1438 			tuple = mtup->tuple;
1439 			mtup->datum1 = index_getattr(tuple,
1440 										 1,
1441 										 RelationGetDescr(state->indexRel),
1442 										 &mtup->isnull1);
1443 		}
1444 	}
1445 
1446 	puttuple_common(state, &stup);
1447 
1448 	MemoryContextSwitchTo(oldcontext);
1449 }
1450 
1451 /*
1452  * Accept one Datum while collecting input data for sort.
1453  *
1454  * If the Datum is pass-by-ref type, the value will be copied.
1455  */
1456 void
1457 tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull)
1458 {
1459 	MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
1460 	SortTuple	stup;
1461 
1462 	/*
1463 	 * Pass-by-value types or null values are just stored directly in
1464 	 * stup.datum1 (and stup.tuple is not used and set to NULL).
1465 	 *
1466 	 * Non-null pass-by-reference values need to be copied into memory we
1467 	 * control, and possibly abbreviated. The copied value is pointed to by
1468 	 * stup.tuple and is treated as the canonical copy (e.g. to return via
1469 	 * tuplesort_getdatum or when writing to tape); stup.datum1 gets the
1470 	 * abbreviated value if abbreviation is happening, otherwise it's
1471 	 * identical to stup.tuple.
1472 	 */
1473 
1474 	if (isNull || !state->tuples)
1475 	{
1476 		/*
1477 		 * Set datum1 to zeroed representation for NULLs (to be consistent,
1478 		 * and to support cheap inequality tests for NULL abbreviated keys).
1479 		 */
1480 		stup.datum1 = !isNull ? val : (Datum) 0;
1481 		stup.isnull1 = isNull;
1482 		stup.tuple = NULL;		/* no separate storage */
1483 		MemoryContextSwitchTo(state->sortcontext);
1484 	}
1485 	else
1486 	{
1487 		Datum		original = datumCopy(val, false, state->datumTypeLen);
1488 
1489 		stup.isnull1 = false;
1490 		stup.tuple = DatumGetPointer(original);
1491 		USEMEM(state, GetMemoryChunkSpace(stup.tuple));
1492 		MemoryContextSwitchTo(state->sortcontext);
1493 
1494 		if (!state->sortKeys->abbrev_converter)
1495 		{
1496 			stup.datum1 = original;
1497 		}
1498 		else if (!consider_abort_common(state))
1499 		{
1500 			/* Store abbreviated key representation */
1501 			stup.datum1 = state->sortKeys->abbrev_converter(original,
1502 															state->sortKeys);
1503 		}
1504 		else
1505 		{
1506 			/* Abort abbreviation */
1507 			int			i;
1508 
1509 			stup.datum1 = original;
1510 
1511 			/*
1512 			 * Set state to be consistent with never trying abbreviation.
1513 			 *
1514 			 * Alter datum1 representation in already-copied tuples, so as to
1515 			 * ensure a consistent representation (current tuple was just
1516 			 * handled).  It does not matter if some dumped tuples are already
1517 			 * sorted on tape, since serialized tuples lack abbreviated keys
1518 			 * (TSS_BUILDRUNS state prevents control reaching here in any
1519 			 * case).
1520 			 */
1521 			for (i = 0; i < state->memtupcount; i++)
1522 			{
1523 				SortTuple  *mtup = &state->memtuples[i];
1524 
1525 				mtup->datum1 = PointerGetDatum(mtup->tuple);
1526 			}
1527 		}
1528 	}
1529 
1530 	puttuple_common(state, &stup);
1531 
1532 	MemoryContextSwitchTo(oldcontext);
1533 }
1534 
1535 /*
1536  * Shared code for tuple and datum cases.
1537  */
1538 static void
1539 puttuple_common(Tuplesortstate *state, SortTuple *tuple)
1540 {
1541 	switch (state->status)
1542 	{
1543 		case TSS_INITIAL:
1544 
1545 			/*
1546 			 * Save the tuple into the unsorted array.  First, grow the array
1547 			 * as needed.  Note that we try to grow the array when there is
1548 			 * still one free slot remaining --- if we fail, there'll still be
1549 			 * room to store the incoming tuple, and then we'll switch to
1550 			 * tape-based operation.
1551 			 */
1552 			if (state->memtupcount >= state->memtupsize - 1)
1553 			{
1554 				(void) grow_memtuples(state);
1555 				Assert(state->memtupcount < state->memtupsize);
1556 			}
1557 			state->memtuples[state->memtupcount++] = *tuple;
1558 
1559 			/*
1560 			 * Check if it's time to switch over to a bounded heapsort. We do
1561 			 * so if the input tuple count exceeds twice the desired tuple
1562 			 * count (this is a heuristic for where heapsort becomes cheaper
1563 			 * than a quicksort), or if we've just filled workMem and have
1564 			 * enough tuples to meet the bound.
1565 			 *
1566 			 * Note that once we enter TSS_BOUNDED state we will always try to
1567 			 * complete the sort that way.  In the worst case, if later input
1568 			 * tuples are larger than earlier ones, this might cause us to
1569 			 * exceed workMem significantly.
1570 			 */
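			/*
			 * Editor's illustration: for a query with LIMIT 100 (bound =
			 * 100), the switch happens once the 201st tuple arrives, or as
			 * soon as workMem fills after more than 100 tuples have been
			 * collected.
			 */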
1571 			if (state->bounded &&
1572 				(state->memtupcount > state->bound * 2 ||
1573 				 (state->memtupcount > state->bound && LACKMEM(state))))
1574 			{
1575 #ifdef TRACE_SORT
1576 				if (trace_sort)
1577 					elog(LOG, "switching to bounded heapsort at %d tuples: %s",
1578 						 state->memtupcount,
1579 						 pg_rusage_show(&state->ru_start));
1580 #endif
1581 				make_bounded_heap(state);
1582 				return;
1583 			}
1584 
1585 			/*
1586 			 * Done if we still fit in available memory and have array slots.
1587 			 */
1588 			if (state->memtupcount < state->memtupsize && !LACKMEM(state))
1589 				return;
1590 
1591 			/*
1592 			 * Nope; time to switch to tape-based operation.
1593 			 */
1594 			inittapes(state);
1595 
1596 			/*
1597 			 * Dump tuples until we are back under the limit.
1598 			 */
1599 			dumptuples(state, false);
1600 			break;
1601 
1602 		case TSS_BOUNDED:
1603 
1604 			/*
1605 			 * We don't want to grow the array here, so check whether the new
1606 			 * tuple can be discarded before putting it in.  This should be a
1607 			 * good speed optimization, too, since when there are many more
1608 			 * input tuples than the bound, most input tuples can be discarded
1609 			 * with just this one comparison.  Note that because we currently
1610 			 * have the sort direction reversed, we must check for <= not >=.
1611 			 */
1612 			if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
1613 			{
1614 				/* new tuple <= top of the heap, so we can discard it */
1615 				free_sort_tuple(state, tuple);
1616 				CHECK_FOR_INTERRUPTS();
1617 			}
1618 			else
1619 			{
1620 				/* discard top of heap, sift up, insert new tuple */
1621 				free_sort_tuple(state, &state->memtuples[0]);
1622 				tuplesort_heap_siftup(state, false);
1623 				tuplesort_heap_insert(state, tuple, 0, false);
1624 			}
1625 			break;
1626 
1627 		case TSS_BUILDRUNS:
1628 
1629 			/*
1630 			 * Insert the tuple into the heap, with run number currentRun if
1631 			 * it can go into the current run, else HEAP_RUN_NEXT.  The tuple
1632 			 * can go into the current run if it is >= the first
1633 			 * not-yet-output tuple.  (Actually, it could go into the current
1634 			 * run if it is >= the most recently output tuple ... but that
1635 			 * would require keeping around the tuple we last output, and it's
1636 			 * simplest to let writetup free each tuple as soon as it's
1637 			 * written.)
1638 			 *
1639 			 * Note that this only applies when:
1640 			 *
1641 			 * - currentRun is RUN_FIRST
1642 			 *
1643 			 * - Replacement selection is in use (typically it is never used).
1644 			 *
1645 			 * When these two conditions are not both true, all tuples are
1646 			 * appended indifferently, much like the TSS_INITIAL case.
1647 			 *
1648 			 * There should always be room to store the incoming tuple.
1649 			 */
1650 			Assert(!state->replaceActive || state->memtupcount > 0);
1651 			if (state->replaceActive &&
1652 				COMPARETUP(state, tuple, &state->memtuples[0]) >= 0)
1653 			{
1654 				Assert(state->currentRun == RUN_FIRST);
1655 
1656 				/*
1657 				 * Insert tuple into first, fully heapified run.
1658 				 *
1659 				 * Unlike classic replacement selection, which this module was
1660 				 * previously based on, only RUN_FIRST tuples are fully
1661 				 * heapified.  Any second/next run tuples are appended
1662 				 * indifferently.  While HEAP_RUN_NEXT tuples may be sifted
1663 				 * out of the way of first run tuples, COMPARETUP() will never
1664 				 * be called for the run's tuples during sifting (only our
1665 				 * initial COMPARETUP() call is required for the tuple, to
1666 				 * determine that the tuple does not belong in RUN_FIRST).
1667 				 */
1668 				tuplesort_heap_insert(state, tuple, state->currentRun, true);
1669 			}
1670 			else
1671 			{
1672 				/*
1673 				 * Tuple was determined to not belong to heapified RUN_FIRST,
1674 				 * or replacement selection not in play.  Append the tuple to
1675 				 * memtuples indifferently.
1676 				 *
1677 				 * dumptuples() does not trust that the next run's tuples are
1678 				 * heapified.  Anything past the first run will always be
1679 				 * quicksorted even when replacement selection is initially
1680 				 * used.  (When it's never used, every tuple still takes this
1681 				 * path.)
1682 				 */
1683 				tuple->tupindex = HEAP_RUN_NEXT;
1684 				state->memtuples[state->memtupcount++] = *tuple;
1685 			}
1686 
1687 			/*
1688 			 * If we are over the memory limit, dump tuples till we're under.
1689 			 */
1690 			dumptuples(state, false);
1691 			break;
1692 
1693 		default:
1694 			elog(ERROR, "invalid tuplesort state");
1695 			break;
1696 	}
1697 }
1698 
1699 static bool
1700 consider_abort_common(Tuplesortstate *state)
1701 {
1702 	Assert(state->sortKeys[0].abbrev_converter != NULL);
1703 	Assert(state->sortKeys[0].abbrev_abort != NULL);
1704 	Assert(state->sortKeys[0].abbrev_full_comparator != NULL);
1705 
1706 	/*
1707 	 * Check effectiveness of abbreviation optimization.  Consider aborting
1708 	 * when still within memory limit.
1709 	 */
1710 	if (state->status == TSS_INITIAL &&
1711 		state->memtupcount >= state->abbrevNext)
1712 	{
1713 		state->abbrevNext *= 2;
1714 
1715 		/*
1716 		 * Check opclass-supplied abbreviation abort routine.  It may indicate
1717 		 * that abbreviation should not proceed.
1718 		 */
1719 		if (!state->sortKeys->abbrev_abort(state->memtupcount,
1720 										   state->sortKeys))
1721 			return false;
1722 
1723 		/*
1724 		 * Finally, restore authoritative comparator, and indicate that
1725 		 * abbreviation is not in play by setting abbrev_converter to NULL
1726 		 */
1727 		state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator;
1728 		state->sortKeys[0].abbrev_converter = NULL;
1729 		/* Not strictly necessary, but be tidy */
1730 		state->sortKeys[0].abbrev_abort = NULL;
1731 		state->sortKeys[0].abbrev_full_comparator = NULL;
1732 
1733 		/* Give up - expect original pass-by-value representation */
1734 		return true;
1735 	}
1736 
1737 	return false;
1738 }
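
/*
 * Editor's note: because abbrevNext doubles after each check above, and the
 * initial value set by the tuplesort_begin_* routines is small (10, if memory
 * serves), the abort decision is revisited at roughly 10, 20, 40, 80, ...
 * tuples, i.e. only O(log N) times over N input tuples.
 */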
1739 
1740 /*
1741  * All tuples have been provided; finish the sort.
1742  */
1743 void
1744 tuplesort_performsort(Tuplesortstate *state)
1745 {
1746 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
1747 
1748 #ifdef TRACE_SORT
1749 	if (trace_sort)
1750 		elog(LOG, "performsort starting: %s",
1751 			 pg_rusage_show(&state->ru_start));
1752 #endif
1753 
1754 	switch (state->status)
1755 	{
1756 		case TSS_INITIAL:
1757 
1758 			/*
1759 			 * We were able to accumulate all the tuples within the allowed
1760 			 * amount of memory.  Just qsort 'em and we're done.
1761 			 */
1762 			tuplesort_sort_memtuples(state);
1763 			state->current = 0;
1764 			state->eof_reached = false;
1765 			state->markpos_offset = 0;
1766 			state->markpos_eof = false;
1767 			state->status = TSS_SORTEDINMEM;
1768 			break;
1769 
1770 		case TSS_BOUNDED:
1771 
1772 			/*
1773 			 * We were able to accumulate all the tuples required for output
1774 			 * in memory, using a heap to eliminate excess tuples.  Now we
1775 			 * have to transform the heap to a properly-sorted array.
1776 			 */
1777 			sort_bounded_heap(state);
1778 			state->current = 0;
1779 			state->eof_reached = false;
1780 			state->markpos_offset = 0;
1781 			state->markpos_eof = false;
1782 			state->status = TSS_SORTEDINMEM;
1783 			break;
1784 
1785 		case TSS_BUILDRUNS:
1786 
1787 			/*
1788 			 * Finish tape-based sort.  First, flush all tuples remaining in
1789 			 * memory out to tape; then merge until we have a single remaining
1790 			 * run (or, if !randomAccess, one run per tape). Note that
1791 			 * mergeruns sets the correct state->status.
1792 			 */
1793 			dumptuples(state, true);
1794 			mergeruns(state);
1795 			state->eof_reached = false;
1796 			state->markpos_block = 0L;
1797 			state->markpos_offset = 0;
1798 			state->markpos_eof = false;
1799 			break;
1800 
1801 		default:
1802 			elog(ERROR, "invalid tuplesort state");
1803 			break;
1804 	}
1805 
1806 #ifdef TRACE_SORT
1807 	if (trace_sort)
1808 	{
1809 		if (state->status == TSS_FINALMERGE)
1810 			elog(LOG, "performsort done (except %d-way final merge): %s",
1811 				 state->activeTapes,
1812 				 pg_rusage_show(&state->ru_start));
1813 		else
1814 			elog(LOG, "performsort done: %s",
1815 				 pg_rusage_show(&state->ru_start));
1816 	}
1817 #endif
1818 
1819 	MemoryContextSwitchTo(oldcontext);
1820 }
1821 
1822 /*
1823  * Internal routine to fetch the next tuple in either forward or back
1824  * direction into *stup.  Returns FALSE if no more tuples.
1825  * If *should_free is set, the caller must pfree stup.tuple when done with it.
1826  * Otherwise, caller should not use tuple following next call here.
1827  *
1828  * Note:  Public tuplesort fetch routine callers cannot rely on tuple being
1829  * allocated in their own memory context when should_free is TRUE.  It may be
1830  * necessary to create a new copy of the tuple to meet the requirements of
1831  * public fetch routine callers.
1832  */
1833 static bool
1834 tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
1835 						  SortTuple *stup, bool *should_free)
1836 {
1837 	unsigned int tuplen;
1838 
1839 	switch (state->status)
1840 	{
1841 		case TSS_SORTEDINMEM:
1842 			Assert(forward || state->randomAccess);
1843 			Assert(!state->batchUsed);
1844 			*should_free = false;
1845 			if (forward)
1846 			{
1847 				if (state->current < state->memtupcount)
1848 				{
1849 					*stup = state->memtuples[state->current++];
1850 					return true;
1851 				}
1852 				state->eof_reached = true;
1853 
1854 				/*
1855 				 * Complain if caller tries to retrieve more tuples than
1856 				 * originally asked for in a bounded sort.  This is because
1857 				 * returning EOF here might be the wrong thing.
1858 				 */
1859 				if (state->bounded && state->current >= state->bound)
1860 					elog(ERROR, "retrieved too many tuples in a bounded sort");
1861 
1862 				return false;
1863 			}
1864 			else
1865 			{
1866 				if (state->current <= 0)
1867 					return false;
1868 
1869 				/*
1870 				 * If all tuples have already been fetched, return the last tuple;
1871 				 * otherwise return the one just before the last tuple returned.
1872 				 */
1873 				if (state->eof_reached)
1874 					state->eof_reached = false;
1875 				else
1876 				{
1877 					state->current--;	/* last returned tuple */
1878 					if (state->current <= 0)
1879 						return false;
1880 				}
1881 				*stup = state->memtuples[state->current - 1];
1882 				return true;
1883 			}
1884 			break;
1885 
1886 		case TSS_SORTEDONTAPE:
1887 			Assert(forward || state->randomAccess);
1888 			Assert(!state->batchUsed);
1889 			*should_free = true;
1890 			if (forward)
1891 			{
1892 				if (state->eof_reached)
1893 					return false;
1894 				if ((tuplen = getlen(state, state->result_tape, true)) != 0)
1895 				{
1896 					READTUP(state, stup, state->result_tape, tuplen);
1897 					return true;
1898 				}
1899 				else
1900 				{
1901 					state->eof_reached = true;
1902 					return false;
1903 				}
1904 			}
1905 
1906 			/*
1907 			 * Backward.
1908 			 *
1909 			 * If all tuples have already been fetched, return the last tuple;
1910 			 * otherwise return the one just before the last tuple returned.
1911 			 */
1912 			if (state->eof_reached)
1913 			{
1914 				/*
1915 				 * Seek position is pointing just past the zero tuplen at the
1916 				 * end of file; back up to fetch last tuple's ending length
1917 				 * word.  If seek fails we must have a completely empty file.
1918 				 */
1919 				if (!LogicalTapeBackspace(state->tapeset,
1920 										  state->result_tape,
1921 										  2 * sizeof(unsigned int)))
1922 					return false;
1923 				state->eof_reached = false;
1924 			}
1925 			else
1926 			{
1927 				/*
1928 				 * Back up and fetch previously-returned tuple's ending length
1929 				 * word.  If seek fails, assume we are at start of file.
1930 				 */
1931 				if (!LogicalTapeBackspace(state->tapeset,
1932 										  state->result_tape,
1933 										  sizeof(unsigned int)))
1934 					return false;
1935 				tuplen = getlen(state, state->result_tape, false);
1936 
1937 				/*
1938 				 * Back up to get ending length word of tuple before it.
1939 				 */
1940 				if (!LogicalTapeBackspace(state->tapeset,
1941 										  state->result_tape,
1942 										  tuplen + 2 * sizeof(unsigned int)))
1943 				{
1944 					/*
1945 					 * If that fails, presumably the prev tuple is the first
1946 					 * in the file.  Back up so that it becomes next to read
1947 					 * in forward direction (not obviously right, but that is
1948 					 * what in-memory case does).
1949 					 */
1950 					if (!LogicalTapeBackspace(state->tapeset,
1951 											  state->result_tape,
1952 											  tuplen + sizeof(unsigned int)))
1953 						elog(ERROR, "bogus tuple length in backward scan");
1954 					return false;
1955 				}
1956 			}
1957 
1958 			tuplen = getlen(state, state->result_tape, false);
1959 
1960 			/*
1961 			 * Now we have the length of the prior tuple, back up and read it.
1962 			 * Note: READTUP expects we are positioned after the initial
1963 			 * length word of the tuple, so back up to that point.
1964 			 */
1965 			if (!LogicalTapeBackspace(state->tapeset,
1966 									  state->result_tape,
1967 									  tuplen))
1968 				elog(ERROR, "bogus tuple length in backward scan");
1969 			READTUP(state, stup, state->result_tape, tuplen);
1970 			return true;
1971 
1972 		case TSS_FINALMERGE:
1973 			Assert(forward);
1974 			Assert(state->batchUsed || !state->tuples);
1975 			/* For now, assume tuple is stored in tape's batch memory */
1976 			*should_free = false;
1977 
1978 			/*
1979 			 * This code should match the inner loop of mergeonerun().
1980 			 */
1981 			if (state->memtupcount > 0)
1982 			{
1983 				int			srcTape = state->memtuples[0].tupindex;
1984 				int			tupIndex;
1985 				SortTuple  *newtup;
1986 
1987 				/*
1988 				 * Returned tuple is still counted in our memory space most of
1989 				 * the time.  See mergebatchone() for discussion of why caller
1990 				 * may occasionally be required to free returned tuple, and
1991 				 * how preread memory is managed with regard to edge cases
1992 				 * more generally.
1993 				 */
1994 				*stup = state->memtuples[0];
1995 				tuplesort_heap_siftup(state, false);
1996 				if ((tupIndex = state->mergenext[srcTape]) == 0)
1997 				{
1998 					/*
1999 					 * out of preloaded data on this tape, try to read more
2000 					 *
2001 					 * Unlike mergeonerun(), we only preload from the single
2002 					 * tape that's run dry, though not before preparing its
2003 					 * batch memory for a new round of sequential consumption.
2004 					 * See mergepreread() comments.
2005 					 */
2006 					if (state->batchUsed)
2007 						mergebatchone(state, srcTape, stup, should_free);
2008 
2009 					mergeprereadone(state, srcTape);
2010 
2011 					/*
2012 					 * if still no data, we've reached end of run on this tape
2013 					 */
2014 					if ((tupIndex = state->mergenext[srcTape]) == 0)
2015 					{
2016 						/* Free tape's buffer, avoiding dangling pointer */
2017 						if (state->batchUsed)
2018 							mergebatchfreetape(state, srcTape, stup, should_free);
2019 						return true;
2020 					}
2021 				}
2022 				/* pull next preread tuple from list, insert in heap */
2023 				newtup = &state->memtuples[tupIndex];
2024 				state->mergenext[srcTape] = newtup->tupindex;
2025 				if (state->mergenext[srcTape] == 0)
2026 					state->mergelast[srcTape] = 0;
2027 				tuplesort_heap_insert(state, newtup, srcTape, false);
2028 				/* put the now-unused memtuples entry on the freelist */
2029 				newtup->tupindex = state->mergefreelist;
2030 				state->mergefreelist = tupIndex;
2031 				state->mergeavailslots[srcTape]++;
2032 				return true;
2033 			}
2034 			return false;
2035 
2036 		default:
2037 			elog(ERROR, "invalid tuplesort state");
2038 			return false;		/* keep compiler quiet */
2039 	}
2040 }
2041 
2042 /*
2043  * Fetch the next tuple in either forward or back direction.
2044  * If successful, put tuple in slot and return TRUE; else, clear the slot
2045  * and return FALSE.
2046  *
2047  * Caller may optionally be passed back abbreviated value (on TRUE return
2048  * value) when abbreviation was used, which can be used to cheaply avoid
2049  * equality checks that might otherwise be required.  Caller can safely make a
2050  * determination of "non-equal tuple" based on simple binary inequality.  A
2051  * NULL value in leading attribute will set abbreviated value to zeroed
2052  * representation, which caller may rely on in abbreviated inequality check.
2053  *
2054  * The slot receives a tuple that's been copied into the caller's memory
2055  * context, so that it will stay valid regardless of future manipulations of
2056  * the tuplesort's state (up to and including deleting the tuplesort).
2057  * This differs from similar routines for other types of tuplesorts.
2058  */
2059 bool
2060 tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
2061 					   TupleTableSlot *slot, Datum *abbrev)
2062 {
2063 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2064 	SortTuple	stup;
2065 	bool		should_free;
2066 
2067 	if (!tuplesort_gettuple_common(state, forward, &stup, &should_free))
2068 		stup.tuple = NULL;
2069 
2070 	MemoryContextSwitchTo(oldcontext);
2071 
2072 	if (stup.tuple)
2073 	{
2074 		/* Record abbreviated key for caller */
2075 		if (state->sortKeys->abbrev_converter && abbrev)
2076 			*abbrev = stup.datum1;
2077 
2078 		/*
2079 		 * Callers rely on tuple being in their own memory context, which is
2080 		 * not guaranteed by tuplesort_gettuple_common(), even when should_free
2081 		 * is set to TRUE.  We must always copy here, since our interface does
2082 		 * not allow callers to opt into an arrangement where tuple memory can go
2083 		 * away on the next call here, or after tuplesort_end() is called.
2084 		 */
2085 		ExecStoreMinimalTuple(heap_copy_minimal_tuple((MinimalTuple) stup.tuple),
2086 							  slot, true);
2087 
2088 		/*
2089 		 * Free local copy if needed.  It would be very invasive to get
2090 		 * tuplesort_gettuple_common() to allocate tuple in caller's context
2091 		 * for us, so we just do this instead.
2092 		 */
2093 		if (should_free)
2094 			pfree(stup.tuple);
2095 
2096 		return true;
2097 	}
2098 	else
2099 	{
2100 		ExecClearTuple(slot);
2101 		return false;
2102 	}
2103 }
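
/*
 * Editor's illustration of typical caller usage (a sketch, not part of the
 * original file):
 *
 *		tuplesort_performsort(state);
 *		while (tuplesort_gettupleslot(state, true, slot, NULL))
 *		{
 *			... process the copied tuple now stored in slot ...
 *		}
 *		tuplesort_end(state);
 *
 * Passing NULL for the abbrev argument simply means the caller has no use for
 * the abbreviated key.
 */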
2104 
2105 /*
2106  * Fetch the next tuple in either forward or back direction.
2107  * Returns NULL if no more tuples.  If *should_free is set, the
2108  * caller must pfree the returned tuple when done with it.
2109  * If it is not set, caller should not use tuple following next
2110  * call here.  It's never okay to use it after tuplesort_end().
2111  */
2112 HeapTuple
2113 tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free)
2114 {
2115 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2116 	SortTuple	stup;
2117 
2118 	if (!tuplesort_gettuple_common(state, forward, &stup, should_free))
2119 		stup.tuple = NULL;
2120 
2121 	MemoryContextSwitchTo(oldcontext);
2122 
2123 	return stup.tuple;
2124 }
2125 
2126 /*
2127  * Fetch the next index tuple in either forward or back direction.
2128  * Returns NULL if no more tuples.  If *should_free is set, the
2129  * caller must pfree the returned tuple when done with it.
2130  * If it is not set, caller should not use tuple following next
2131  * call here.  It's never okay to use it after tuplesort_end().
2132  */
2133 IndexTuple
2134 tuplesort_getindextuple(Tuplesortstate *state, bool forward,
2135 						bool *should_free)
2136 {
2137 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2138 	SortTuple	stup;
2139 
2140 	if (!tuplesort_gettuple_common(state, forward, &stup, should_free))
2141 		stup.tuple = NULL;
2142 
2143 	MemoryContextSwitchTo(oldcontext);
2144 
2145 	return (IndexTuple) stup.tuple;
2146 }
2147 
2148 /*
2149  * Fetch the next Datum in either forward or back direction.
2150  * Returns FALSE if no more datums.
2151  *
2152  * If the Datum is pass-by-ref type, the returned value is freshly palloc'd
2153  * in caller's context, and is now owned by the caller (this differs from
2154  * similar routines for other types of tuplesorts).
2155  *
2156  * Caller may optionally be passed back abbreviated value (on TRUE return
2157  * value) when abbreviation was used, which can be used to cheaply avoid
2158  * equality checks that might otherwise be required.  Caller can safely make a
2159  * determination of "non-equal tuple" based on simple binary inequality.  A
2160  * NULL value will have a zeroed abbreviated value representation, which caller
2161  * may rely on in abbreviated inequality check.
2162  */
2163 bool
2164 tuplesort_getdatum(Tuplesortstate *state, bool forward,
2165 				   Datum *val, bool *isNull, Datum *abbrev)
2166 {
2167 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
2168 	SortTuple	stup;
2169 	bool		should_free;
2170 
2171 	if (!tuplesort_gettuple_common(state, forward, &stup, &should_free))
2172 	{
2173 		MemoryContextSwitchTo(oldcontext);
2174 		return false;
2175 	}
2176 
2177 	/* Ensure we copy into caller's memory context */
2178 	MemoryContextSwitchTo(oldcontext);
2179 
2180 	/* Record abbreviated key for caller */
2181 	if (state->sortKeys->abbrev_converter && abbrev)
2182 		*abbrev = stup.datum1;
2183 
2184 	if (stup.isnull1 || !state->tuples)
2185 	{
2186 		*val = stup.datum1;
2187 		*isNull = stup.isnull1;
2188 	}
2189 	else
2190 	{
2191 		/*
2192 		 * Callers rely on datum being in their own memory context, which is
2193 		 * not guaranteed by tuplesort_gettuple_common(), even when should_free
2194 		 * is set to TRUE.  We must always copy here, since our interface does
2195 		 * not allow callers to opt into an arrangement where tuple memory can go
2196 		 * away on the next call here, or after tuplesort_end() is called.
2197 		 *
2198 		 * Use stup.tuple because stup.datum1 may be an abbreviation.
2199 		 */
2200 		*val = datumCopy(PointerGetDatum(stup.tuple), false, state->datumTypeLen);
2201 		*isNull = false;
2202 
2203 		/*
2204 		 * Free local copy if needed.  It would be very invasive to get
2205 		 * tuplesort_gettuple_common() to allocate tuple in caller's context
2206 		 * for us, so we just do this instead.
2207 		 */
2208 		if (should_free)
2209 			pfree(stup.tuple);
2210 	}
2211 
2212 	return true;
2213 }
2214 
2215 /*
2216  * Advance over N tuples in either forward or back direction,
2217  * without returning any data.  N==0 is a no-op.
2218  * Returns TRUE if successful, FALSE if ran out of tuples.
2219  */
2220 bool
2221 tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, bool forward)
2222 {
2223 	MemoryContext oldcontext;
2224 
2225 	/*
2226 	 * We don't actually support backwards skip yet, because no callers need
2227 	 * it.  The API is designed to allow for that later, though.
2228 	 */
2229 	Assert(forward);
2230 	Assert(ntuples >= 0);
2231 
2232 	switch (state->status)
2233 	{
2234 		case TSS_SORTEDINMEM:
2235 			if (state->memtupcount - state->current >= ntuples)
2236 			{
2237 				state->current += ntuples;
2238 				return true;
2239 			}
2240 			state->current = state->memtupcount;
2241 			state->eof_reached = true;
2242 
2243 			/*
2244 			 * Complain if caller tries to retrieve more tuples than
2245 			 * originally asked for in a bounded sort.  This is because
2246 			 * returning EOF here might be the wrong thing.
2247 			 */
2248 			if (state->bounded && state->current >= state->bound)
2249 				elog(ERROR, "retrieved too many tuples in a bounded sort");
2250 
2251 			return false;
2252 
2253 		case TSS_SORTEDONTAPE:
2254 		case TSS_FINALMERGE:
2255 
2256 			/*
2257 			 * We could probably optimize these cases better, but for now it's
2258 			 * not worth the trouble.
2259 			 */
2260 			oldcontext = MemoryContextSwitchTo(state->sortcontext);
2261 			while (ntuples-- > 0)
2262 			{
2263 				SortTuple	stup;
2264 				bool		should_free;
2265 
2266 				if (!tuplesort_gettuple_common(state, forward,
2267 											   &stup, &should_free))
2268 				{
2269 					MemoryContextSwitchTo(oldcontext);
2270 					return false;
2271 				}
2272 				if (should_free && stup.tuple)
2273 					pfree(stup.tuple);
2274 				CHECK_FOR_INTERRUPTS();
2275 			}
2276 			MemoryContextSwitchTo(oldcontext);
2277 			return true;
2278 
2279 		default:
2280 			elog(ERROR, "invalid tuplesort state");
2281 			return false;		/* keep compiler quiet */
2282 	}
2283 }
2284 
2285 /*
2286  * tuplesort_merge_order - report merge order we'll use for given memory
2287  * (note: "merge order" just means the number of input tapes in the merge).
2288  *
2289  * This is exported for use by the planner.  allowedMem is in bytes.
2290  */
2291 int
2292 tuplesort_merge_order(int64 allowedMem)
2293 {
2294 	int			mOrder;
2295 
2296 	/*
2297 	 * We need one tape for each merge input, plus another one for the output,
2298 	 * and each of these tapes needs buffer space.  In addition we want
2299 	 * MERGE_BUFFER_SIZE workspace per input tape (but the output tape doesn't
2300 	 * count).
2301 	 *
2302 	 * Note: you might be thinking we need to account for the memtuples[]
2303 	 * array in this calculation, but we effectively treat that as part of the
2304 	 * MERGE_BUFFER_SIZE workspace.
2305 	 */
2306 	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
2307 		(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
2308 
2309 	/* Even in minimum memory, use at least a MINORDER merge */
2310 	mOrder = Max(mOrder, MINORDER);
2311 
2312 	return mOrder;
2313 }
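
/*
 * Editor's worked example (assuming the usual definitions of
 * TAPE_BUFFER_OVERHEAD and MERGE_BUFFER_SIZE as 3 and 32 blocks of 8 kB,
 * i.e. 24 kB and 256 kB): with allowedMem of 4 MB,
 *
 *		mOrder = (4194304 - 24576) / (262144 + 24576) = 14
 *
 * so the planner would assume a 14-way merge; smaller memory budgets are
 * clamped up to MINORDER.
 */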
2314 
2315 /*
2316  * useselection - determine algorithm to use to sort first run.
2317  *
2318  * It can sometimes be useful to use the replacement selection algorithm if it
2319  * results in one large run, and there is little available workMem.  See
2320  * remarks on RUN_SECOND optimization within dumptuples().
2321  */
2322 static bool
2323 useselection(Tuplesortstate *state)
2324 {
2325 	/*
2326 	 * memtupsize might be noticeably higher than memtupcount here in atypical
2327 	 * cases.  It seems slightly preferable to not allow recent outliers to
2328 	 * impact this determination.  Note that caller's trace_sort output
2329 	 * reports memtupcount instead.
2330 	 */
2331 	if (state->memtupsize <= replacement_sort_tuples)
2332 		return true;
2333 
2334 	return false;
2335 }
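
/*
 * Editor's note: with the default replacement_sort_tuples setting (150000,
 * if memory serves), replacement selection is only attempted when the
 * memtuples array had grown to no more than that many slots by the time we
 * switch to tape-based operation, i.e. when workMem is comparatively small.
 */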
2336 
2337 /*
2338  * inittapes - initialize for tape sorting.
2339  *
2340  * This is called only if we have found we don't have room to sort in memory.
2341  */
2342 static void
2343 inittapes(Tuplesortstate *state)
2344 {
2345 	int			maxTapes,
2346 				j;
2347 	int64		tapeSpace;
2348 
2349 	/* Compute number of tapes to use: merge order plus 1 */
2350 	maxTapes = tuplesort_merge_order(state->allowedMem) + 1;
2351 
2352 	/*
2353 	 * We must have at least 2*maxTapes slots in the memtuples[] array, else
2354 	 * we'd not have room for merge heap plus preread.  It seems unlikely that
2355 	 * this case would ever occur, but be safe.
2356 	 */
2357 	maxTapes = Min(maxTapes, state->memtupsize / 2);
2358 
2359 	state->maxTapes = maxTapes;
2360 	state->tapeRange = maxTapes - 1;
2361 
2362 #ifdef TRACE_SORT
2363 	if (trace_sort)
2364 		elog(LOG, "switching to external sort with %d tapes: %s",
2365 			 maxTapes, pg_rusage_show(&state->ru_start));
2366 #endif
2367 
2368 	/*
2369 	 * Decrease availMem to reflect the space needed for tape buffers; but
2370 	 * don't decrease it to the point that we have no room for tuples. (That
2371 	 * case is only likely to occur if sorting pass-by-value Datums; in all
2372 	 * other scenarios the memtuples[] array is unlikely to occupy more than
2373 	 * half of allowedMem.  In the pass-by-value case it's not important to
2374 	 * account for tuple space, so we don't care if LACKMEM becomes
2375 	 * inaccurate.)
2376 	 */
2377 	tapeSpace = (int64) maxTapes * TAPE_BUFFER_OVERHEAD;
2378 
2379 	if (tapeSpace + GetMemoryChunkSpace(state->memtuples) < state->allowedMem)
2380 		USEMEM(state, tapeSpace);
2381 
2382 	/*
2383 	 * Make sure that the temp file(s) underlying the tape set are created in
2384 	 * suitable temp tablespaces.
2385 	 */
2386 	PrepareTempTablespaces();
2387 
2388 	/*
2389 	 * Create the tape set and allocate the per-tape data arrays.
2390 	 */
2391 	state->tapeset = LogicalTapeSetCreate(maxTapes);
2392 
2393 	state->mergeactive = (bool *) palloc0(maxTapes * sizeof(bool));
2394 	state->mergenext = (int *) palloc0(maxTapes * sizeof(int));
2395 	state->mergelast = (int *) palloc0(maxTapes * sizeof(int));
2396 	state->mergeavailslots = (int *) palloc0(maxTapes * sizeof(int));
2397 	state->mergeavailmem = (int64 *) palloc0(maxTapes * sizeof(int64));
2398 	state->mergetuples = (char **) palloc0(maxTapes * sizeof(char *));
2399 	state->mergecurrent = (char **) palloc0(maxTapes * sizeof(char *));
2400 	state->mergetail = (char **) palloc0(maxTapes * sizeof(char *));
2401 	state->mergeoverflow = (char **) palloc0(maxTapes * sizeof(char *));
2402 	state->tp_fib = (int *) palloc0(maxTapes * sizeof(int));
2403 	state->tp_runs = (int *) palloc0(maxTapes * sizeof(int));
2404 	state->tp_dummy = (int *) palloc0(maxTapes * sizeof(int));
2405 	state->tp_tapenum = (int *) palloc0(maxTapes * sizeof(int));
2406 
2407 	/*
2408 	 * Give replacement selection a try based on user setting.  There will be
2409 	 * a switch to a simple hybrid sort-merge strategy after the first run
2410 	 * (iff we could not output one long run).
2411 	 */
2412 	state->replaceActive = useselection(state);
2413 
2414 	if (state->replaceActive)
2415 	{
2416 		/*
2417 		 * Convert the unsorted contents of memtuples[] into a heap. Each
2418 		 * tuple is marked as belonging to run number zero.
2419 		 *
2420 		 * NOTE: we pass false for checkIndex since there's no point in
2421 		 * comparing indexes in this step, even though we do intend the
2422 		 * indexes to be part of the sort key...
2423 		 */
2424 		int			ntuples = state->memtupcount;
2425 
2426 #ifdef TRACE_SORT
2427 		if (trace_sort)
2428 			elog(LOG, "replacement selection will sort %d first run tuples",
2429 				 state->memtupcount);
2430 #endif
2431 		state->memtupcount = 0; /* make the heap empty */
2432 
2433 		for (j = 0; j < ntuples; j++)
2434 		{
2435 			/* Must copy source tuple to avoid possible overwrite */
2436 			SortTuple	stup = state->memtuples[j];
2437 
2438 			tuplesort_heap_insert(state, &stup, 0, false);
2439 		}
2440 		Assert(state->memtupcount == ntuples);
2441 	}
2442 
2443 	state->currentRun = RUN_FIRST;
2444 
2445 	/*
2446 	 * Initialize variables of Algorithm D (step D1).
2447 	 */
2448 	for (j = 0; j < maxTapes; j++)
2449 	{
2450 		state->tp_fib[j] = 1;
2451 		state->tp_runs[j] = 0;
2452 		state->tp_dummy[j] = 1;
2453 		state->tp_tapenum[j] = j;
2454 	}
2455 	state->tp_fib[state->tapeRange] = 0;
2456 	state->tp_dummy[state->tapeRange] = 0;
2457 
2458 	state->Level = 1;
2459 	state->destTape = 0;
2460 
2461 	state->status = TSS_BUILDRUNS;
2462 }
2463 
2464 /*
2465  * selectnewtape -- select new tape for new initial run.
2466  *
2467  * This is called after finishing a run when we know another run
2468  * must be started.  This implements steps D3, D4 of Algorithm D.
2469  */
2470 static void
2471 selectnewtape(Tuplesortstate *state)
2472 {
2473 	int			j;
2474 	int			a;
2475 
2476 	/* Step D3: advance j (destTape) */
2477 	if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape + 1])
2478 	{
2479 		state->destTape++;
2480 		return;
2481 	}
2482 	if (state->tp_dummy[state->destTape] != 0)
2483 	{
2484 		state->destTape = 0;
2485 		return;
2486 	}
2487 
2488 	/* Step D4: increase level */
2489 	state->Level++;
2490 	a = state->tp_fib[0];
2491 	for (j = 0; j < state->tapeRange; j++)
2492 	{
2493 		state->tp_dummy[j] = a + state->tp_fib[j + 1] - state->tp_fib[j];
2494 		state->tp_fib[j] = a + state->tp_fib[j + 1];
2495 	}
2496 	state->destTape = 0;
2497 }
2498 
2499 /*
2500  * mergeruns -- merge all the completed initial runs.
2501  *
2502  * This implements steps D5, D6 of Algorithm D.  All input data has
2503  * already been written to initial runs on tape (see dumptuples).
2504  */
2505 static void
2506 mergeruns(Tuplesortstate *state)
2507 {
2508 	int			tapenum,
2509 				svTape,
2510 				svRuns,
2511 				svDummy;
2512 
2513 	Assert(state->status == TSS_BUILDRUNS);
2514 	Assert(state->memtupcount == 0);
2515 
2516 	if (state->sortKeys != NULL && state->sortKeys->abbrev_converter != NULL)
2517 	{
2518 		/*
2519 		 * If there are multiple runs to be merged, when we go to read back
2520 		 * tuples from disk, abbreviated keys will not have been stored, and
2521 		 * we don't care to regenerate them.  Disable abbreviation from this
2522 		 * point on.
2523 		 */
2524 		state->sortKeys->abbrev_converter = NULL;
2525 		state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator;
2526 
2527 		/* Not strictly necessary, but be tidy */
2528 		state->sortKeys->abbrev_abort = NULL;
2529 		state->sortKeys->abbrev_full_comparator = NULL;
2530 	}
2531 
2532 	/*
2533 	 * If we produced only one initial run (quite likely if the total data
2534 	 * volume is between 1X and 2X workMem when replacement selection is used,
2535 	 * but something we particularly count on when input is presorted), we can
2536 	 * just use that tape as the finished output, rather than doing a useless
2537 	 * merge.  (This obvious optimization is not in Knuth's algorithm.)
2538 	 */
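	/*
	 * Editor's illustration: sorting, say, 1.5 GB of nearly-sorted input with
	 * 1 GB of workMem can leave replacement selection with a single run, in
	 * which case the branch below finishes the sort with no merge pass at
	 * all.
	 */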
2539 	if (state->currentRun == RUN_SECOND)
2540 	{
2541 		state->result_tape = state->tp_tapenum[state->destTape];
2542 		/* must freeze and rewind the finished output tape */
2543 		LogicalTapeFreeze(state->tapeset, state->result_tape);
2544 		state->status = TSS_SORTEDONTAPE;
2545 		return;
2546 	}
2547 
2548 	/* End of step D2: rewind all output tapes to prepare for merging */
2549 	for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2550 		LogicalTapeRewind(state->tapeset, tapenum, false);
2551 
2552 	for (;;)
2553 	{
2554 		/*
2555 		 * At this point we know that tape[T] is empty.  If there's just one
2556 		 * (real or dummy) run left on each input tape, then only one merge
2557 		 * pass remains.  If we don't have to produce a materialized sorted
2558 		 * tape, we can stop at this point and do the final merge on-the-fly.
2559 		 */
2560 		if (!state->randomAccess)
2561 		{
2562 			bool		allOneRun = true;
2563 
2564 			Assert(state->tp_runs[state->tapeRange] == 0);
2565 			for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2566 			{
2567 				if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
2568 				{
2569 					allOneRun = false;
2570 					break;
2571 				}
2572 			}
2573 			if (allOneRun)
2574 			{
2575 				/* Tell logtape.c we won't be writing anymore */
2576 				LogicalTapeSetForgetFreeSpace(state->tapeset);
2577 				/* Initialize for the final merge pass */
2578 				beginmerge(state, state->tuples);
2579 				state->status = TSS_FINALMERGE;
2580 				return;
2581 			}
2582 		}
2583 
2584 		/* Step D5: merge runs onto tape[T] until tape[P] is empty */
2585 		while (state->tp_runs[state->tapeRange - 1] ||
2586 			   state->tp_dummy[state->tapeRange - 1])
2587 		{
2588 			bool		allDummy = true;
2589 
2590 			for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2591 			{
2592 				if (state->tp_dummy[tapenum] == 0)
2593 				{
2594 					allDummy = false;
2595 					break;
2596 				}
2597 			}
2598 
2599 			if (allDummy)
2600 			{
2601 				state->tp_dummy[state->tapeRange]++;
2602 				for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2603 					state->tp_dummy[tapenum]--;
2604 			}
2605 			else
2606 				mergeonerun(state);
2607 		}
2608 
2609 		/* Step D6: decrease level */
2610 		if (--state->Level == 0)
2611 			break;
2612 		/* rewind output tape T to use as new input */
2613 		LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange],
2614 						  false);
2615 		/* rewind used-up input tape P, and prepare it for write pass */
2616 		LogicalTapeRewind(state->tapeset, state->tp_tapenum[state->tapeRange - 1],
2617 						  true);
2618 		state->tp_runs[state->tapeRange - 1] = 0;
2619 
2620 		/*
2621 		 * reassign tape units per step D6; note we no longer care about A[]
2622 		 */
2623 		svTape = state->tp_tapenum[state->tapeRange];
2624 		svDummy = state->tp_dummy[state->tapeRange];
2625 		svRuns = state->tp_runs[state->tapeRange];
2626 		for (tapenum = state->tapeRange; tapenum > 0; tapenum--)
2627 		{
2628 			state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum - 1];
2629 			state->tp_dummy[tapenum] = state->tp_dummy[tapenum - 1];
2630 			state->tp_runs[tapenum] = state->tp_runs[tapenum - 1];
2631 		}
2632 		state->tp_tapenum[0] = svTape;
2633 		state->tp_dummy[0] = svDummy;
2634 		state->tp_runs[0] = svRuns;
2635 	}
2636 
2637 	/*
2638 	 * Done.  Knuth says that the result is on TAPE[1], but since we exited
2639 	 * the loop without performing the last iteration of step D6, we have not
2640 	 * rearranged the tape unit assignment, and therefore the result is on
2641 	 * TAPE[T].  We need to do it this way so that we can freeze the final
2642 	 * output tape while rewinding it.  The last iteration of step D6 would be
2643 	 * a waste of cycles anyway...
2644 	 */
2645 	state->result_tape = state->tp_tapenum[state->tapeRange];
2646 	LogicalTapeFreeze(state->tapeset, state->result_tape);
2647 	state->status = TSS_SORTEDONTAPE;
2648 }
2649 
2650 /*
2651  * Merge one run from each input tape, except ones with dummy runs.
2652  *
2653  * This is the inner loop of Algorithm D step D5.  We know that the
2654  * output tape is TAPE[T].
2655  */
2656 static void
2657 mergeonerun(Tuplesortstate *state)
2658 {
2659 	int			destTape = state->tp_tapenum[state->tapeRange];
2660 	int			srcTape;
2661 	int			tupIndex;
2662 	SortTuple  *tup;
2663 	int64		priorAvail,
2664 				spaceFreed;
2665 
2666 	/*
2667 	 * Start the merge by loading one tuple from each active source tape into
2668 	 * the heap.  We can also decrease the input run/dummy run counts.
2669 	 */
2670 	beginmerge(state, false);
2671 
2672 	/*
2673 	 * Execute merge by repeatedly extracting lowest tuple in heap, writing it
2674 	 * out, and replacing it with next tuple from same tape (if there is
2675 	 * another one).
2676 	 */
2677 	while (state->memtupcount > 0)
2678 	{
2679 		/* write the tuple to destTape */
2680 		priorAvail = state->availMem;
2681 		srcTape = state->memtuples[0].tupindex;
2682 		WRITETUP(state, destTape, &state->memtuples[0]);
2683 		/* writetup adjusted total free space, now fix per-tape space */
2684 		spaceFreed = state->availMem - priorAvail;
2685 		state->mergeavailmem[srcTape] += spaceFreed;
2686 		/* compact the heap */
2687 		tuplesort_heap_siftup(state, false);
2688 		if ((tupIndex = state->mergenext[srcTape]) == 0)
2689 		{
2690 			/* out of preloaded data on this tape, try to read more */
2691 			mergepreread(state);
2692 			/* if still no data, we've reached end of run on this tape */
2693 			if ((tupIndex = state->mergenext[srcTape]) == 0)
2694 				continue;
2695 		}
2696 		/* pull next preread tuple from list, insert in heap */
2697 		tup = &state->memtuples[tupIndex];
2698 		state->mergenext[srcTape] = tup->tupindex;
2699 		if (state->mergenext[srcTape] == 0)
2700 			state->mergelast[srcTape] = 0;
2701 		tuplesort_heap_insert(state, tup, srcTape, false);
2702 		/* put the now-unused memtuples entry on the freelist */
2703 		tup->tupindex = state->mergefreelist;
2704 		state->mergefreelist = tupIndex;
2705 		state->mergeavailslots[srcTape]++;
2706 	}
2707 
2708 	/*
2709 	 * Reset tuple memory.  We've freed all of the tuples that we previously
2710 	 * allocated, but AllocSetFree will have put those chunks of memory on
2711 	 * particular free lists, bucketed by size class.  Thus, although all of
2712 	 * that memory is free, it is effectively fragmented.  Resetting the
2713 	 * context gets us out from under that problem.
2714 	 */
2715 	MemoryContextReset(state->tuplecontext);
2716 
2717 	/*
2718 	 * When the heap empties, we're done.  Write an end-of-run marker on the
2719 	 * output tape, and increment its count of real runs.
2720 	 */
2721 	markrunend(state, destTape);
2722 	state->tp_runs[state->tapeRange]++;
2723 
2724 #ifdef TRACE_SORT
2725 	if (trace_sort)
2726 		elog(LOG, "finished %d-way merge step: %s", state->activeTapes,
2727 			 pg_rusage_show(&state->ru_start));
2728 #endif
2729 }
2730 
2731 /*
2732  * beginmerge - initialize for a merge pass
2733  *
2734  * We decrease the counts of real and dummy runs for each tape, and mark
2735  * which tapes contain active input runs in mergeactive[].  Then, load
2736  * as many tuples as we can from each active input tape, and finally
2737  * fill the merge heap with the first tuple from each active tape.
2738  *
2739  * finalMergeBatch indicates if this is the beginning of a final on-the-fly
2740  * merge where a batched allocation of tuple memory is required.
2741  */
2742 static void
2743 beginmerge(Tuplesortstate *state, bool finalMergeBatch)
2744 {
2745 	int			activeTapes;
2746 	int			tapenum;
2747 	int			srcTape;
2748 	int			slotsPerTape;
2749 	int64		spacePerTape;
2750 
2751 	/* Heap should be empty here */
2752 	Assert(state->memtupcount == 0);
2753 
2754 	/* Adjust run counts and mark the active tapes */
2755 	memset(state->mergeactive, 0,
2756 		   state->maxTapes * sizeof(*state->mergeactive));
2757 	activeTapes = 0;
2758 	for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
2759 	{
2760 		if (state->tp_dummy[tapenum] > 0)
2761 			state->tp_dummy[tapenum]--;
2762 		else
2763 		{
2764 			Assert(state->tp_runs[tapenum] > 0);
2765 			state->tp_runs[tapenum]--;
2766 			srcTape = state->tp_tapenum[tapenum];
2767 			state->mergeactive[srcTape] = true;
2768 			activeTapes++;
2769 		}
2770 	}
2771 	state->activeTapes = activeTapes;
2772 
2773 	/* Clear merge-pass state variables */
2774 	memset(state->mergenext, 0,
2775 		   state->maxTapes * sizeof(*state->mergenext));
2776 	memset(state->mergelast, 0,
2777 		   state->maxTapes * sizeof(*state->mergelast));
2778 	state->mergefreelist = 0;	/* nothing in the freelist */
2779 	state->mergefirstfree = activeTapes;		/* 1st slot avail for preread */
2780 
2781 	if (finalMergeBatch)
2782 	{
2783 		/* Free outright buffers for tape never actually allocated */
2784 		FREEMEM(state, (state->maxTapes - activeTapes) * TAPE_BUFFER_OVERHEAD);
2785 
2786 		/*
2787 		 * Grow memtuples one last time, since the palloc() overhead no longer
2788 		 * incurred can make a big difference
2789 		 */
2790 		batchmemtuples(state);
2791 	}
2792 
2793 	/*
2794 	 * Initialize space allocation to let each active input tape have an equal
2795 	 * share of preread space.
2796 	 */
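	/*
	 * Editor's illustration: with 6 active tapes and 12 MB of availMem
	 * remaining, each tape would receive about 2 MB of preread space (rounded
	 * down to a MAXALIGN boundary) plus an equal share of the free memtuples
	 * slots.
	 */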
2797 	Assert(activeTapes > 0);
2798 	slotsPerTape = (state->memtupsize - state->mergefirstfree) / activeTapes;
2799 	Assert(slotsPerTape > 0);
2800 	spacePerTape = MAXALIGN_DOWN(state->availMem / activeTapes);
2801 	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2802 	{
2803 		if (state->mergeactive[srcTape])
2804 		{
2805 			state->mergeavailslots[srcTape] = slotsPerTape;
2806 			state->mergeavailmem[srcTape] = spacePerTape;
2807 		}
2808 	}
2809 
2810 	/*
2811 	 * Preallocate tuple batch memory for each tape.  This is the memory used
2812 	 * for tuples themselves (not SortTuples), so it's never used by
2813 	 * pass-by-value datum sorts.  Memory allocation is performed here at most
2814 	 * once per sort, just in advance of the final on-the-fly merge step.
2815 	 */
2816 	if (finalMergeBatch)
2817 		mergebatch(state, spacePerTape);
2818 
2819 	/*
2820 	 * Preread as many tuples as possible (and at least one) from each active
2821 	 * tape
2822 	 */
2823 	mergepreread(state);
2824 
2825 	/* Load the merge heap with the first tuple from each input tape */
2826 	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2827 	{
2828 		int			tupIndex = state->mergenext[srcTape];
2829 		SortTuple  *tup;
2830 
2831 		if (tupIndex)
2832 		{
2833 			tup = &state->memtuples[tupIndex];
2834 			state->mergenext[srcTape] = tup->tupindex;
2835 			if (state->mergenext[srcTape] == 0)
2836 				state->mergelast[srcTape] = 0;
2837 			tuplesort_heap_insert(state, tup, srcTape, false);
2838 			/* put the now-unused memtuples entry on the freelist */
2839 			tup->tupindex = state->mergefreelist;
2840 			state->mergefreelist = tupIndex;
2841 			state->mergeavailslots[srcTape]++;
2842 
2843 #ifdef TRACE_SORT
2844 			if (trace_sort && finalMergeBatch)
2845 			{
2846 				int64		perTapeKB = (spacePerTape + 1023) / 1024;
2847 				int64		usedSpaceKB;
2848 				int			usedSlots;
2849 
2850 				/*
2851 				 * Report how effective batchmemtuples() was in balancing the
2852 				 * number of slots against the need for memory for the
2853 				 * underlying tuples (e.g. IndexTuples).  The big preread of
2854 				 * all tapes when switching to FINALMERGE state should be
2855 				 * fairly representative of memory utilization during the
2856 				 * final merge step, and in any case is the only point at
2857 				 * which all tapes are guaranteed to have depleted either
2858 				 * their batch memory allowance or slot allowance.  Ideally,
2859 				 * both will be completely depleted for every tape by now.
2860 				 */
2861 				usedSpaceKB = (state->mergecurrent[srcTape] -
2862 							   state->mergetuples[srcTape] + 1023) / 1024;
2863 				usedSlots = slotsPerTape - state->mergeavailslots[srcTape];
2864 
2865 				elog(LOG, "tape %d initially used " INT64_FORMAT " KB of "
2866 					 INT64_FORMAT " KB batch (%2.3f) and %d out of %d slots "
2867 					 "(%2.3f)", srcTape,
2868 					 usedSpaceKB, perTapeKB,
2869 					 (double) usedSpaceKB / (double) perTapeKB,
2870 					 usedSlots, slotsPerTape,
2871 					 (double) usedSlots / (double) slotsPerTape);
2872 			}
2873 #endif
2874 		}
2875 	}
2876 }
2877 
2878 /*
2879  * batchmemtuples - grow memtuples without palloc overhead
2880  *
2881  * When called, availMem should be approximately the amount of memory we'd
2882  * require to allocate memtupsize - memtupcount tuples (not SortTuples/slots)
2883  * that were allocated with palloc() overhead, and in doing so use up all
2884  * allocated slots.  However, though slots and tuple memory are in balance
2885  * following the last grow_memtuples() call, that's predicated on the observed
2886  * average tuple size for the "final" grow_memtuples() call, which includes
2887  * palloc overhead.  During the final merge pass, where we will arrange to
2888  * squeeze out the palloc overhead, we might need more slots in the memtuples
2889  * array.
2890  *
2891  * To make that happen, arrange for the amount of remaining memory to be
2892  * exactly equal to the palloc overhead multiplied by the current size of
2893  * the memtuples array, force the grow_memtuples flag back to true (it's
2894  * probably but not necessarily false on entry to this routine), and then
2895  * call grow_memtuples.  This simulates loading enough tuples to fill the
2896  * whole memtuples array and then having some space left over because of the
2897  * elided palloc overhead.  We expect that grow_memtuples() will conclude that
2898  * it can't double the size of the memtuples array but that it can increase
2899  * it by some percentage; but if it does decide to double it, that just means
2900  * that we've never managed to use many slots in the memtuples array, in which
2901  * case doubling it shouldn't hurt anything anyway.
2902  */
2903 static void
2904 batchmemtuples(Tuplesortstate *state)
2905 {
2906 	int64		refund;
2907 	int64		availMemLessRefund;
2908 	int			memtupsize = state->memtupsize;
2909 
2910 	/* Caller error if we have no tapes */
2911 	Assert(state->activeTapes > 0);
2912 
2913 	/* For simplicity, assume no memtuples are actually currently counted */
2914 	Assert(state->memtupcount == 0);
2915 
2916 	/*
2917 	 * Refund STANDARDCHUNKHEADERSIZE per tuple.
2918 	 *
2919 	 * This sometimes fails to make memory use perfectly balanced, but it
2920 	 * should never make the situation worse.  Note that Assert-enabled builds
2921 	 * get a larger refund, due to a varying STANDARDCHUNKHEADERSIZE.
2922 	 */
2923 	refund = memtupsize * STANDARDCHUNKHEADERSIZE;
2924 	availMemLessRefund = state->availMem - refund;
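	/*
	 * Purely illustrative arithmetic (assumed, not measured, figures): with
	 * memtupsize = 100,000 slots and a 16-byte standard chunk header, as on
	 * a typical 64-bit non-assert build, the refund comes to 1,600,000
	 * bytes, i.e. roughly 1.5 MB of accounting headroom handed to
	 * grow_memtuples() below.
	 */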
2925 
2926 	/*
2927 	 * We need to be sure that we do not cause LACKMEM to become true, else
2928 	 * the batch allocation size could be calculated as negative, causing
2929 	 * havoc.  Hence, if availMemLessRefund is negative at this point, we must
2930 	 * do nothing.  Moreover, if it's positive but rather small, there's
2931 	 * little point in proceeding because we could only increase memtuples by
2932 	 * a small amount, not worth the cost of the repallocs.  We somewhat
2933 	 * arbitrarily set the threshold at ALLOCSET_DEFAULT_INITSIZE per tape.
2934 	 * (Note that this does not represent any assumption about tuple sizes.)
2935 	 */
2936 	if (availMemLessRefund <=
2937 		(int64) state->activeTapes * ALLOCSET_DEFAULT_INITSIZE)
2938 		return;
2939 
2940 	/*
2941 	 * To establish balanced memory use after refunding palloc overhead,
2942 	 * temporarily have our accounting indicate that we've allocated all
2943 	 * memory we're allowed to less that refund, and call grow_memtuples() to
2944 	 * have it increase the number of slots.
2945 	 */
2946 	state->growmemtuples = true;
2947 	USEMEM(state, availMemLessRefund);
2948 	(void) grow_memtuples(state);
2949 	state->growmemtuples = false;
2950 	/* availMem must stay accurate for spacePerTape calculation */
2951 	FREEMEM(state, availMemLessRefund);
2952 	if (LACKMEM(state))
2953 		elog(ERROR, "unexpected out-of-memory situation in tuplesort");
2954 
2955 #ifdef TRACE_SORT
2956 	if (trace_sort)
2957 	{
2958 		Size		OldKb = (memtupsize * sizeof(SortTuple) + 1023) / 1024;
2959 		Size		NewKb = (state->memtupsize * sizeof(SortTuple) + 1023) / 1024;
2960 
2961 		elog(LOG, "grew memtuples %1.2fx from %d (%zu KB) to %d (%zu KB) for final merge",
2962 			 (double) NewKb / (double) OldKb,
2963 			 memtupsize, OldKb,
2964 			 state->memtupsize, NewKb);
2965 	}
2966 #endif
2967 }
2968 
2969 /*
2970  * mergebatch - initialize tuple memory in batch
2971  *
2972  * This allows sequential access to sorted tuples buffered in memory from
2973  * tapes/runs on disk during a final on-the-fly merge step.  Note that the
2974  * memory is not used for SortTuples, but for the underlying tuples (e.g.
2975  * MinimalTuples).
2976  *
2977  * Note that when batch memory is used, there is a simple division of space
2978  * into large buffers (one per active tape).  The conventional incremental
2979  * memory accounting (calling USEMEM() and FREEMEM()) is abandoned.  Instead,
2980  * when each tape's memory budget is exceeded, a retail palloc() "overflow" is
2981  * performed, which is then immediately detected in a way that is analogous to
2982  * LACKMEM().  This keeps each tape's use of memory fair, which is always a
2983  * goal.
2984  */
2985 static void
2986 mergebatch(Tuplesortstate *state, int64 spacePerTape)
2987 {
2988 	int			srcTape;
2989 
2990 	Assert(state->activeTapes > 0);
2991 	Assert(state->tuples);
2992 
2993 	/*
2994 	 * For the purposes of tuplesort's memory accounting, the batch allocation
2995 	 * is special, and regular memory accounting through USEMEM() calls is
2996 	 * abandoned (see mergeprereadone()).
2997 	 */
2998 	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
2999 	{
3000 		char	   *mergetuples;
3001 
3002 		if (!state->mergeactive[srcTape])
3003 			continue;
3004 
3005 		/* Allocate buffer for each active tape */
3006 		mergetuples = MemoryContextAllocHuge(state->tuplecontext,
3007 											 spacePerTape);
3008 
3009 		/* Initialize state for tape */
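		/*
		 * Pointer roles within each tape's buffer: mergetuples[] is the
		 * start of the allocation, mergecurrent[] is the next free byte
		 * (advanced by mergebatchalloc()), mergetail[] is the start of the
		 * most recently allocated tuple, and mergeoverflow[] records any
		 * retail palloc() fallback made after the buffer fills up.
		 */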
3010 		state->mergetuples[srcTape] = mergetuples;
3011 		state->mergecurrent[srcTape] = mergetuples;
3012 		state->mergetail[srcTape] = mergetuples;
3013 		state->mergeoverflow[srcTape] = NULL;
3014 	}
3015 
3016 	state->batchUsed = true;
3017 	state->spacePerTape = spacePerTape;
3018 }
3019 
3020 /*
3021  * mergebatchone - prepare batch memory for one merge input tape
3022  *
3023  * This is called following the exhaustion of preread tuples for one input
3024  * tape.  All that actually occurs is that the state for the source tape is
3025  * reset to indicate that all memory may be reused.
3026  *
3027  * This routine must deal with fixing up the tuple that is about to be returned
3028  * to the client, due to "overflow" allocations.
3029  */
3030 static void
3031 mergebatchone(Tuplesortstate *state, int srcTape, SortTuple *rtup,
3032 			  bool *should_free)
3033 {
3034 	Assert(state->batchUsed);
3035 
3036 	/*
3037 	 * Tuple about to be returned to caller ("stup") is final preread tuple
3038 	 * from tape, just removed from the top of the heap.  Special steps around
3039 	 * memory management must be performed for that tuple, to make sure it
3040 	 * isn't overwritten early.
3041 	 */
3042 	if (!state->mergeoverflow[srcTape])
3043 	{
3044 		Size		tupLen;
3045 
3046 		/*
3047 		 * Mark tuple buffer range for reuse, but be careful to move final,
3048 		 * tail tuple to start of space for next run so that it's available to
3049 		 * caller when stup is returned, and remains available at least until
3050 		 * the next tuple is requested.
3051 		 */
3052 		tupLen = state->mergecurrent[srcTape] - state->mergetail[srcTape];
3053 		state->mergecurrent[srcTape] = state->mergetuples[srcTape];
3054 		MOVETUP(state->mergecurrent[srcTape], state->mergetail[srcTape],
3055 				tupLen);
3056 
3057 		/* Make SortTuple at top of the merge heap point to new tuple */
3058 		rtup->tuple = (void *) state->mergecurrent[srcTape];
3059 
3060 		state->mergetail[srcTape] = state->mergecurrent[srcTape];
3061 		state->mergecurrent[srcTape] += tupLen;
3062 	}
3063 	else
3064 	{
3065 		/*
3066 		 * Handle an "overflow" retail palloc.
3067 		 *
3068 		 * This is needed when we run out of tuple memory for the tape.
3069 		 */
3070 		state->mergecurrent[srcTape] = state->mergetuples[srcTape];
3071 		state->mergetail[srcTape] = state->mergetuples[srcTape];
3072 
3073 		if (rtup->tuple)
3074 		{
3075 			Assert(rtup->tuple == (void *) state->mergeoverflow[srcTape]);
3076 			/* Caller should free palloc'd tuple */
3077 			*should_free = true;
3078 		}
3079 		state->mergeoverflow[srcTape] = NULL;
3080 	}
3081 }
3082 
3083 /*
3084  * mergebatchfreetape - handle final clean-up for batch memory once tape is
3085  * about to become exhausted
3086  *
3087  * All tuples are returned from tape, but a single final tuple, *rtup, is to be
3088  * passed back to caller.  Free tape's batch allocation buffer while ensuring
3089  * that the final tuple is managed appropriately.
3090  */
3091 static void
3092 mergebatchfreetape(Tuplesortstate *state, int srcTape, SortTuple *rtup,
3093 				   bool *should_free)
3094 {
3095 	Assert(state->batchUsed);
3096 	Assert(state->status == TSS_FINALMERGE);
3097 
3098 	/*
3099 	 * Tuple may or may not already be an overflow allocation from
3100 	 * mergebatchone()
3101 	 */
3102 	if (!*should_free && rtup->tuple)
3103 	{
3104 		/*
3105 		 * Final tuple still in tape's batch allocation.
3106 		 *
3107 		 * Return palloc()'d copy to caller, and have it freed in a similar
3108 		 * manner to overflow allocation.  Otherwise, we'd free batch memory
3109 		 * and pass back a pointer to garbage.  Note that we deliberately
3110 		 * allocate this in the parent tuplesort context, to be on the safe
3111 		 * side.
3112 		 */
3113 		Size		tuplen;
3114 		void	   *oldTuple = rtup->tuple;
3115 
3116 		tuplen = state->mergecurrent[srcTape] - state->mergetail[srcTape];
3117 		rtup->tuple = MemoryContextAlloc(state->sortcontext, tuplen);
3118 		MOVETUP(rtup->tuple, oldTuple, tuplen);
3119 		*should_free = true;
3120 	}
3121 
3122 	/* Free spacePerTape-sized buffer */
3123 	pfree(state->mergetuples[srcTape]);
3124 }
3125 
3126 /*
3127  * mergebatchalloc - allocate memory for one tuple using a batch memory
3128  * "logical allocation".
3129  *
3130  * This is used for the final on-the-fly merge phase only.  READTUP() routines
3131  * receive memory from here in place of palloc() and USEMEM() calls.
3132  *
3133  * Tuple tapenum is passed, ensuring each tape's tuples are stored in sorted,
3134  * contiguous order (while allowing safe reuse of memory made available to
3135  * each tape).  This maximizes locality of access as tuples are returned by
3136  * final merge.
3137  *
3138  * Caller must not subsequently attempt to free memory returned here.  In
3139  * general, only mergebatch* functions know about how memory returned from
3140  * here should be freed, and this function's caller must ensure that batch
3141  * memory management code will definitely have the opportunity to do the right
3142  * thing during the final on-the-fly merge.
3143  */
3144 static void *
3145 mergebatchalloc(Tuplesortstate *state, int tapenum, Size tuplen)
3146 {
3147 	Size		reserve_tuplen = MAXALIGN(tuplen);
3148 	char	   *ret;
3149 
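	/*
	 * For illustration only, assuming a platform with 8-byte MAXALIGN: a
	 * 61-byte tuple reserves 64 bytes of the tape's buffer, keeping every
	 * pointer handed back from here suitably aligned for the next
	 * allocation.
	 */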
3150 	/* Should overflow at most once before mergebatchone() call: */
3151 	Assert(state->mergeoverflow[tapenum] == NULL);
3152 	Assert(state->batchUsed);
3153 
3154 	/* It should be possible to use precisely spacePerTape memory at once */
3155 	if (state->mergecurrent[tapenum] + reserve_tuplen <=
3156 		state->mergetuples[tapenum] + state->spacePerTape)
3157 	{
3158 		/*
3159 		 * Usual case -- caller is returned pointer into its tape's buffer,
3160 		 * and an offset from that point is recorded as where tape has
3161 		 * consumed up to for current round of preloading.
3162 		 */
3163 		ret = state->mergetail[tapenum] = state->mergecurrent[tapenum];
3164 		state->mergecurrent[tapenum] += reserve_tuplen;
3165 	}
3166 	else
3167 	{
3168 		/*
3169 		 * Allocate memory, and record as tape's overflow allocation.  This
3170 		 * will be detected quickly, in a similar fashion to a LACKMEM()
3171 		 * condition, and should not happen again before a new round of
3172 		 * preloading for caller's tape.  Note that we deliberately allocate
3173 		 * this in the parent tuplesort context, to be on the safe side.
3174 		 *
3175 		 * Sometimes, this does not happen because merging runs out of slots
3176 		 * before running out of memory.
3177 		 */
3178 		ret = state->mergeoverflow[tapenum] =
3179 			MemoryContextAlloc(state->sortcontext, tuplen);
3180 	}
3181 
3182 	return ret;
3183 }
3184 
3185 /*
3186  * mergepreread - load tuples from merge input tapes
3187  *
3188  * This routine exists to improve sequentiality of reads during a merge pass,
3189  * as explained in the header comments of this file.  Load tuples from each
3190  * active source tape until the tape's run is exhausted or it has used up
3191  * its fair share of available memory.  In any case, we guarantee that there
3192  * is at least one preread tuple available from each unexhausted input tape.
3193  *
3194  * We invoke this routine at the start of a merge pass for initial load,
3195  * and then whenever any tape's preread data runs out.  Note that we load
3196  * as much data as possible from all tapes, not just the one that ran out.
3197  * This is because logtape.c works best with a usage pattern that alternates
3198  * between reading a lot of data and writing a lot of data, so whenever we
3199  * are forced to read, we should fill working memory completely.
3200  *
3201  * In FINALMERGE state, we *don't* use this routine, but instead just preread
3202  * from the single tape that ran dry.  There's no read/write alternation in
3203  * that state and so no point in scanning through all the tapes to fix one.
3204  * (Moreover, there may be quite a lot of inactive tapes in that state, since
3205  * we might have had many fewer runs than tapes.  In a regular tape-to-tape
3206  * merge we can expect most of the tapes to be active.  Plus, only
3207  * FINALMERGE state has to consider memory management for a batch
3208  * allocation.)
3209  */
3210 static void
3211 mergepreread(Tuplesortstate *state)
3212 {
3213 	int			srcTape;
3214 
3215 	for (srcTape = 0; srcTape < state->maxTapes; srcTape++)
3216 		mergeprereadone(state, srcTape);
3217 }
3218 
3219 /*
3220  * mergeprereadone - load tuples from one merge input tape
3221  *
3222  * Read tuples from the specified tape until it has used up its free memory
3223  * or array slots; but ensure that we have at least one tuple, if any are
3224  * to be had.
3225  */
3226 static void
3227 mergeprereadone(Tuplesortstate *state, int srcTape)
3228 {
3229 	unsigned int tuplen;
3230 	SortTuple	stup;
3231 	int			tupIndex;
3232 	int64		priorAvail,
3233 				spaceUsed;
3234 
3235 	if (!state->mergeactive[srcTape])
3236 		return;					/* tape's run is already exhausted */
3237 
3238 	/*
3239 	 * Manage per-tape availMem.  Only actually matters when batch memory not
3240 	 * in use.
3241 	 */
3242 	priorAvail = state->availMem;
3243 	state->availMem = state->mergeavailmem[srcTape];
3244 
3245 	/*
3246 	 * When batch memory is in use (i.e. during the final on-the-fly merge),
3247 	 * only the mergeoverflow test is relevant; otherwise, only LACKMEM() matters.
3248 	 */
3249 	while ((state->mergeavailslots[srcTape] > 0 &&
3250 			state->mergeoverflow[srcTape] == NULL && !LACKMEM(state)) ||
3251 		   state->mergenext[srcTape] == 0)
3252 	{
3253 		/* read next tuple, if any */
3254 		if ((tuplen = getlen(state, srcTape, true)) == 0)
3255 		{
3256 			state->mergeactive[srcTape] = false;
3257 			break;
3258 		}
3259 		READTUP(state, &stup, srcTape, tuplen);
3260 		/* find a free slot in memtuples[] for it */
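		/*
		 * (Free memtuples[] slots are chained through their tupindex fields,
		 * with mergefreelist as the head of the chain; mergefirstfree marks
		 * the first slot that has never been handed out at all.)
		 */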
3261 		tupIndex = state->mergefreelist;
3262 		if (tupIndex)
3263 			state->mergefreelist = state->memtuples[tupIndex].tupindex;
3264 		else
3265 		{
3266 			tupIndex = state->mergefirstfree++;
3267 			Assert(tupIndex < state->memtupsize);
3268 		}
3269 		state->mergeavailslots[srcTape]--;
3270 		/* store tuple, append to list for its tape */
3271 		stup.tupindex = 0;
3272 		state->memtuples[tupIndex] = stup;
3273 		if (state->mergelast[srcTape])
3274 			state->memtuples[state->mergelast[srcTape]].tupindex = tupIndex;
3275 		else
3276 			state->mergenext[srcTape] = tupIndex;
3277 		state->mergelast[srcTape] = tupIndex;
3278 	}
3279 	/* update per-tape and global availmem counts */
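	/*
	 * Illustrative example (assumed figures): if this preread consumed 8 KB
	 * of the tape's budget, spaceUsed comes out as 8192, the tape's
	 * mergeavailmem[] entry drops by 8 KB, and the global availMem ends up
	 * at priorAvail minus that same 8 KB.
	 */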
3280 	spaceUsed = state->mergeavailmem[srcTape] - state->availMem;
3281 	state->mergeavailmem[srcTape] = state->availMem;
3282 	state->availMem = priorAvail - spaceUsed;
3283 }
3284 
3285 /*
3286  * dumptuples - remove tuples from memtuples and write to tape
3287  *
3288  * This is used during initial-run building, but not during merging.
3289  *
3290  * When alltuples = false and replacement selection is still active, dump
3291  * only enough tuples to get under the availMem limit (and leave at least
3292  * one tuple in memtuples, since puttuple will then assume it is a heap that
3293  * has a tuple to compare to).  We always insist there be at least one free
3294  * slot in the memtuples[] array.
3295  *
3296  * When alltuples = true, dump everything currently in memory.  (This
3297  * case is only used at end of input data, although in practice only the
3298  * first run could fail to dump all tuples when we LACKMEM(), and only
3299  * when replacement selection is active.)
3300  *
3301  * If, when replacement selection is active, we see that the tuple run
3302  * number at the top of the heap has changed, start a new run.  This must be
3303  * the first run, because replacement selection is always abandoned for all
3304  * further runs.
3305  */
3306 static void
3307 dumptuples(Tuplesortstate *state, bool alltuples)
3308 {
3309 	while (alltuples ||
3310 		   (LACKMEM(state) && state->memtupcount > 1) ||
3311 		   state->memtupcount >= state->memtupsize)
3312 	{
3313 		if (state->replaceActive)
3314 		{
3315 			/*
3316 			 * Still holding out for a case favorable to replacement
3317 			 * selection. Still incrementally spilling using heap.
3318 			 *
3319 			 * Dump the heap's frontmost entry, and sift up to remove it from
3320 			 * the heap.
3321 			 */
3322 			Assert(state->memtupcount > 0);
3323 			WRITETUP(state, state->tp_tapenum[state->destTape],
3324 					 &state->memtuples[0]);
3325 			tuplesort_heap_siftup(state, true);
3326 		}
3327 		else
3328 		{
3329 			/*
3330 			 * Once committed to quicksorting runs, never incrementally spill
3331 			 */
3332 			dumpbatch(state, alltuples);
3333 			break;
3334 		}
3335 
3336 		/*
3337 		 * If top run number has changed, we've finished the current run (this
3338 		 * can only be the first run), and will no longer spill incrementally.
3339 		 */
3340 		if (state->memtupcount == 0 ||
3341 			state->memtuples[0].tupindex == HEAP_RUN_NEXT)
3342 		{
3343 			markrunend(state, state->tp_tapenum[state->destTape]);
3344 			Assert(state->currentRun == RUN_FIRST);
3345 			state->currentRun++;
3346 			state->tp_runs[state->destTape]++;
3347 			state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
3348 
3349 #ifdef TRACE_SORT
3350 			if (trace_sort)
3351 				elog(LOG, "finished incrementally writing %s run %d to tape %d: %s",
3352 					 (state->memtupcount == 0) ? "only" : "first",
3353 					 state->currentRun, state->destTape,
3354 					 pg_rusage_show(&state->ru_start));
3355 #endif
3356 
3357 			/*
3358 			 * Done if heap is empty, which is possible when there is only one
3359 			 * long run.
3360 			 */
3361 			Assert(state->currentRun == RUN_SECOND);
3362 			if (state->memtupcount == 0)
3363 			{
3364 				/*
3365 				 * Replacement selection best case; no final merge required,
3366 				 * because there was only one initial run (second run has no
3367 				 * tuples).  See RUN_SECOND case in mergeruns().
3368 				 */
3369 				break;
3370 			}
3371 
3372 			/*
3373 			 * Abandon replacement selection for second run (as well as any
3374 			 * subsequent runs).
3375 			 */
3376 			state->replaceActive = false;
3377 
3378 			/*
3379 			 * First tuple of next run should not be heapified, and so will
3380 			 * bear placeholder run number.  In practice this must actually be
3381 			 * the second run, which just became the currentRun, so we're
3382 			 * clear to quicksort and dump the tuples in batch next time
3383 			 * memtuples becomes full.
3384 			 */
3385 			Assert(state->memtuples[0].tupindex == HEAP_RUN_NEXT);
3386 			selectnewtape(state);
3387 		}
3388 	}
3389 }
3390 
3391 /*
3392  * dumpbatch - sort and dump all memtuples, forming one run on tape
3393  *
3394  * Second or subsequent runs are never heapified by this module (although
3395  * heapification still respects run number differences between the first and
3396  * second runs), and a heap (replacement selection priority queue) is often
3397  * avoided in the first place.
3398  */
3399 static void
3400 dumpbatch(Tuplesortstate *state, bool alltuples)
3401 {
3402 	int			memtupwrite;
3403 	int			i;
3404 
3405 	/*
3406 	 * Final call might require no sorting, in rare cases where we just so
3407 	 * happen to have previously LACKMEM()'d at the point where exactly all
3408 	 * remaining tuples are loaded into memory, just before input was
3409 	 * exhausted.
3410 	 *
3411 	 * In general, short final runs are quite possible.  Rather than allowing
3412 	 * a special case where there was a superfluous selectnewtape() call (i.e.
3413 	 * a call with no subsequent run actually written to destTape), we prefer
3414 	 * to write out a 0 tuple run.
3415 	 *
3416 	 * mergepreread()/mergeprereadone() are prepared for 0 tuple runs, and
3417 	 * will reliably mark the tape inactive for the merge when called from
3418 	 * beginmerge().  This case is therefore similar to the case where
3419 	 * mergeonerun() finds a dummy run for the tape, and so doesn't need to
3420 	 * merge a run from the tape (or conceptually "merges" the dummy run, if
3421 	 * you prefer).  According to Knuth, Algorithm D "isn't strictly optimal"
3422 	 * in its method of distribution and dummy run assignment; this edge case
3423 	 * seems very unlikely to make that appreciably worse.
3424 	 */
3425 	Assert(state->status == TSS_BUILDRUNS);
3426 
3427 	/*
3428 	 * It seems unlikely that this limit will ever be exceeded, but take no
3429 	 * chances
3430 	 */
3431 	if (state->currentRun == INT_MAX)
3432 		ereport(ERROR,
3433 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
3434 				 errmsg("cannot have more than %d runs for an external sort",
3435 						INT_MAX)));
3436 
3437 	state->currentRun++;
3438 
3439 #ifdef TRACE_SORT
3440 	if (trace_sort)
3441 		elog(LOG, "starting quicksort of run %d: %s",
3442 			 state->currentRun, pg_rusage_show(&state->ru_start));
3443 #endif
3444 
3445 	/*
3446 	 * Sort all tuples accumulated within the allowed amount of memory for
3447 	 * this run using quicksort
3448 	 */
3449 	tuplesort_sort_memtuples(state);
3450 
3451 #ifdef TRACE_SORT
3452 	if (trace_sort)
3453 		elog(LOG, "finished quicksort of run %d: %s",
3454 			 state->currentRun, pg_rusage_show(&state->ru_start));
3455 #endif
3456 
3457 	memtupwrite = state->memtupcount;
3458 	for (i = 0; i < memtupwrite; i++)
3459 	{
3460 		WRITETUP(state, state->tp_tapenum[state->destTape],
3461 				 &state->memtuples[i]);
3462 		state->memtupcount--;
3463 	}
3464 
3465 	/*
3466 	 * Reset tuple memory.  We've freed all of the tuples that we previously
3467 	 * allocated.  It's important to avoid fragmentation when there is a stark
3468 	 * change in allocation patterns due to the use of batch memory.
3469 	 * Fragmentation due to AllocSetFree's bucketing by size class might be
3470 	 * particularly bad if this step wasn't taken.
3471 	 */
3472 	MemoryContextReset(state->tuplecontext);
3473 
3474 	markrunend(state, state->tp_tapenum[state->destTape]);
3475 	state->tp_runs[state->destTape]++;
3476 	state->tp_dummy[state->destTape]--; /* per Alg D step D2 */
3477 
3478 #ifdef TRACE_SORT
3479 	if (trace_sort)
3480 		elog(LOG, "finished writing run %d to tape %d: %s",
3481 			 state->currentRun, state->destTape,
3482 			 pg_rusage_show(&state->ru_start));
3483 #endif
3484 
3485 	if (!alltuples)
3486 		selectnewtape(state);
3487 }
3488 
3489 /*
3490  * tuplesort_rescan		- rewind and replay the scan
3491  */
3492 void
3493 tuplesort_rescan(Tuplesortstate *state)
3494 {
3495 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3496 
3497 	Assert(state->randomAccess);
3498 
3499 	switch (state->status)
3500 	{
3501 		case TSS_SORTEDINMEM:
3502 			state->current = 0;
3503 			state->eof_reached = false;
3504 			state->markpos_offset = 0;
3505 			state->markpos_eof = false;
3506 			break;
3507 		case TSS_SORTEDONTAPE:
3508 			LogicalTapeRewind(state->tapeset,
3509 							  state->result_tape,
3510 							  false);
3511 			state->eof_reached = false;
3512 			state->markpos_block = 0L;
3513 			state->markpos_offset = 0;
3514 			state->markpos_eof = false;
3515 			break;
3516 		default:
3517 			elog(ERROR, "invalid tuplesort state");
3518 			break;
3519 	}
3520 
3521 	MemoryContextSwitchTo(oldcontext);
3522 }
3523 
3524 /*
3525  * tuplesort_markpos	- saves current position in the merged sort file
3526  */
3527 void
3528 tuplesort_markpos(Tuplesortstate *state)
3529 {
3530 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3531 
3532 	Assert(state->randomAccess);
3533 
3534 	switch (state->status)
3535 	{
3536 		case TSS_SORTEDINMEM:
3537 			state->markpos_offset = state->current;
3538 			state->markpos_eof = state->eof_reached;
3539 			break;
3540 		case TSS_SORTEDONTAPE:
3541 			LogicalTapeTell(state->tapeset,
3542 							state->result_tape,
3543 							&state->markpos_block,
3544 							&state->markpos_offset);
3545 			state->markpos_eof = state->eof_reached;
3546 			break;
3547 		default:
3548 			elog(ERROR, "invalid tuplesort state");
3549 			break;
3550 	}
3551 
3552 	MemoryContextSwitchTo(oldcontext);
3553 }
3554 
3555 /*
3556  * tuplesort_restorepos - restores current position in merged sort file to
3557  *						  last saved position
3558  */
3559 void
3560 tuplesort_restorepos(Tuplesortstate *state)
3561 {
3562 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
3563 
3564 	Assert(state->randomAccess);
3565 
3566 	switch (state->status)
3567 	{
3568 		case TSS_SORTEDINMEM:
3569 			state->current = state->markpos_offset;
3570 			state->eof_reached = state->markpos_eof;
3571 			break;
3572 		case TSS_SORTEDONTAPE:
3573 			if (!LogicalTapeSeek(state->tapeset,
3574 								 state->result_tape,
3575 								 state->markpos_block,
3576 								 state->markpos_offset))
3577 				elog(ERROR, "tuplesort_restorepos failed");
3578 			state->eof_reached = state->markpos_eof;
3579 			break;
3580 		default:
3581 			elog(ERROR, "invalid tuplesort state");
3582 			break;
3583 	}
3584 
3585 	MemoryContextSwitchTo(oldcontext);
3586 }
3587 
3588 /*
3589  * tuplesort_get_stats - extract summary statistics
3590  *
3591  * This can be called after tuplesort_performsort() finishes to obtain
3592  * printable summary information about how the sort was performed.
3593  * spaceUsed is measured in kilobytes.
3594  */
3595 void
3596 tuplesort_get_stats(Tuplesortstate *state,
3597 					const char **sortMethod,
3598 					const char **spaceType,
3599 					long *spaceUsed)
3600 {
3601 	/*
3602 	 * Note: it might seem we should provide both memory and disk usage for a
3603 	 * disk-based sort.  However, the current code doesn't track memory space
3604 	 * accurately once we have begun to return tuples to the caller (since we
3605 	 * don't account for pfree's the caller is expected to do), so we cannot
3606 	 * rely on availMem in a disk sort.  This does not seem worth the overhead
3607 	 * to fix.  Is it worth creating an API for the memory context code to
3608 	 * tell us how much is actually used in sortcontext?
3609 	 */
3610 	if (state->tapeset)
3611 	{
3612 		*spaceType = "Disk";
3613 		*spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024);
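		/* e.g. with the default 8 kB BLCKSZ, each tape block counts as 8 kB */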
3614 	}
3615 	else
3616 	{
3617 		*spaceType = "Memory";
3618 		*spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
3619 	}
3620 
3621 	switch (state->status)
3622 	{
3623 		case TSS_SORTEDINMEM:
3624 			if (state->boundUsed)
3625 				*sortMethod = "top-N heapsort";
3626 			else
3627 				*sortMethod = "quicksort";
3628 			break;
3629 		case TSS_SORTEDONTAPE:
3630 			*sortMethod = "external sort";
3631 			break;
3632 		case TSS_FINALMERGE:
3633 			*sortMethod = "external merge";
3634 			break;
3635 		default:
3636 			*sortMethod = "still in progress";
3637 			break;
3638 	}
3639 }
3640 
3641 
3642 /*
3643  * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
3644  *
3645  * Compare two SortTuples.  If checkIndex is true, use the tuple index
3646  * as the front of the sort key; otherwise, no.
3647  *
3648  * Note that for checkIndex callers, the heap invariant is never
3649  * maintained beyond the first run, and so there are no COMPARETUP()
3650  * calls needed to distinguish tuples in HEAP_RUN_NEXT.
3651  */
3652 
3653 #define HEAPCOMPARE(tup1,tup2) \
3654 	(checkIndex && ((tup1)->tupindex != (tup2)->tupindex || \
3655 					(tup1)->tupindex == HEAP_RUN_NEXT) ? \
3656 	 ((tup1)->tupindex) - ((tup2)->tupindex) : \
3657 	 COMPARETUP(state, tup1, tup2))
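/*
 * For illustration, with checkIndex true: a RUN_FIRST tuple always sorts
 * ahead of a HEAP_RUN_NEXT tuple purely on the run-number difference, and
 * two HEAP_RUN_NEXT tuples compare as equal (difference zero), so neither
 * case reaches COMPARETUP(); only two RUN_FIRST tuples fall through to a
 * real key comparison.
 */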
3658 
3659 /*
3660  * Convert the existing unordered array of SortTuples to a bounded heap,
3661  * discarding all but the smallest "state->bound" tuples.
3662  *
3663  * When working with a bounded heap, we want to keep the largest entry
3664  * at the root (array entry zero), instead of the smallest as in the normal
3665  * sort case.  This allows us to discard the largest entry cheaply.
3666  * Therefore, we temporarily reverse the sort direction.
3667  *
3668  * We assume that all entries in a bounded heap will always have tupindex
3669  * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse
3670  * the direction of comparison for tupindexes.
3671  */
3672 static void
3673 make_bounded_heap(Tuplesortstate *state)
3674 {
3675 	int			tupcount = state->memtupcount;
3676 	int			i;
3677 
3678 	Assert(state->status == TSS_INITIAL);
3679 	Assert(state->bounded);
3680 	Assert(tupcount >= state->bound);
3681 
3682 	/* Reverse sort direction so largest entry will be at root */
3683 	reversedirection(state);
3684 
3685 	state->memtupcount = 0;		/* make the heap empty */
3686 	for (i = 0; i < tupcount; i++)
3687 	{
3688 		if (state->memtupcount >= state->bound &&
3689 		  COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0)
3690 		{
3691 			/* New tuple would just get thrown out, so skip it */
3692 			free_sort_tuple(state, &state->memtuples[i]);
3693 			CHECK_FOR_INTERRUPTS();
3694 		}
3695 		else
3696 		{
3697 			/* Insert next tuple into heap */
3698 			/* Must copy source tuple to avoid possible overwrite */
3699 			SortTuple	stup = state->memtuples[i];
3700 
3701 			tuplesort_heap_insert(state, &stup, 0, false);
3702 
3703 			/* If heap too full, discard largest entry */
3704 			if (state->memtupcount > state->bound)
3705 			{
3706 				free_sort_tuple(state, &state->memtuples[0]);
3707 				tuplesort_heap_siftup(state, false);
3708 			}
3709 		}
3710 	}
3711 
3712 	Assert(state->memtupcount == state->bound);
3713 	state->status = TSS_BOUNDED;
3714 }
3715 
3716 /*
3717  * Convert the bounded heap to a properly-sorted array
3718  */
3719 static void
3720 sort_bounded_heap(Tuplesortstate *state)
3721 {
3722 	int			tupcount = state->memtupcount;
3723 
3724 	Assert(state->status == TSS_BOUNDED);
3725 	Assert(state->bounded);
3726 	Assert(tupcount == state->bound);
3727 
3728 	/*
3729 	 * We can unheapify in place because each sift-up will remove the largest
3730 	 * entry, which we can promptly store in the newly freed slot at the end.
3731 	 * Once we're down to a single-entry heap, we're done.
3732 	 */
3733 	while (state->memtupcount > 1)
3734 	{
3735 		SortTuple	stup = state->memtuples[0];
3736 
3737 		/* this sifts-up the next-largest entry and decreases memtupcount */
3738 		tuplesort_heap_siftup(state, false);
3739 		state->memtuples[state->memtupcount] = stup;
3740 	}
3741 	state->memtupcount = tupcount;
3742 
3743 	/*
3744 	 * Reverse sort direction back to the original state.  This is not
3745 	 * actually necessary but seems like a good idea for tidiness.
3746 	 */
3747 	reversedirection(state);
3748 
3749 	state->status = TSS_SORTEDINMEM;
3750 	state->boundUsed = true;
3751 }
3752 
3753 /*
3754  * Sort all memtuples using specialized qsort() routines.
3755  *
3756  * Quicksort is used for small in-memory sorts.  Quicksort is also generally
3757  * preferred to replacement selection for generating runs during external sort
3758  * operations, although replacement selection is sometimes used for the first
3759  * run.
3760  */
3761 static void
3762 tuplesort_sort_memtuples(Tuplesortstate *state)
3763 {
3764 	if (state->memtupcount > 1)
3765 	{
3766 		/* Can we use the single-key sort function? */
3767 		if (state->onlyKey != NULL)
3768 			qsort_ssup(state->memtuples, state->memtupcount,
3769 					   state->onlyKey);
3770 		else
3771 			qsort_tuple(state->memtuples,
3772 						state->memtupcount,
3773 						state->comparetup,
3774 						state);
3775 	}
3776 }
3777 
3778 /*
3779  * Insert a new tuple into an empty or existing heap, maintaining the
3780  * heap invariant.  Caller is responsible for ensuring there's room.
3781  *
3782  * Note: we assume *tuple is a temporary variable that can be scribbled on.
3783  * For some callers, tuple actually points to a memtuples[] entry above the
3784  * end of the heap.  This is safe as long as it's not immediately adjacent
3785  * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
3786  * is, it might get overwritten before being moved into the heap!
3787  */
3788 static void
3789 tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple,
3790 					  int tupleindex, bool checkIndex)
3791 {
3792 	SortTuple  *memtuples;
3793 	int			j;
3794 
3795 	/*
3796 	 * Save the tupleindex --- see notes above about writing on *tuple. It's a
3797 	 * historical artifact that tupleindex is passed as a separate argument
3798 	 * and not in *tuple, but it's notationally convenient so let's leave it
3799 	 * that way.
3800 	 */
3801 	tuple->tupindex = tupleindex;
3802 
3803 	memtuples = state->memtuples;
3804 	Assert(state->memtupcount < state->memtupsize);
3805 	Assert(!checkIndex || tupleindex == RUN_FIRST);
3806 
3807 	CHECK_FOR_INTERRUPTS();
3808 
3809 	/*
3810 	 * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
3811 	 * using 1-based array indexes, not 0-based.
3812 	 */
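	/*
	 * Concretely, with 0-based indexes the children of slot i live at
	 * 2*i + 1 and 2*i + 2, so the parent of slot j is (j - 1) / 2, which is
	 * what the shift below computes.
	 */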
3813 	j = state->memtupcount++;
3814 	while (j > 0)
3815 	{
3816 		int			i = (j - 1) >> 1;
3817 
3818 		if (HEAPCOMPARE(tuple, &memtuples[i]) >= 0)
3819 			break;
3820 		memtuples[j] = memtuples[i];
3821 		j = i;
3822 	}
3823 	memtuples[j] = *tuple;
3824 }
3825 
3826 /*
3827  * The tuple at state->memtuples[0] has been removed from the heap.
3828  * Decrement memtupcount, and sift up to maintain the heap invariant.
3829  */
3830 static void
3831 tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex)
3832 {
3833 	SortTuple  *memtuples = state->memtuples;
3834 	SortTuple  *tuple;
3835 	unsigned int i,
3836 				n;
3837 
3838 	Assert(!checkIndex || state->currentRun == RUN_FIRST);
3839 	if (--state->memtupcount <= 0)
3840 		return;
3841 
3842 	CHECK_FOR_INTERRUPTS();
3843 
3844 	/*
3845 	 * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
3846 	 * This prevents overflow in the "2 * i + 1" calculation, since at the top
3847 	 * of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
3848 	 */
3849 	n = state->memtupcount;
3850 	tuple = &memtuples[n];		/* tuple that must be reinserted */
3851 	i = 0;						/* i is where the "hole" is */
3852 	for (;;)
3853 	{
3854 		unsigned int j = 2 * i + 1;
3855 
3856 		if (j >= n)
3857 			break;
3858 		if (j + 1 < n &&
3859 			HEAPCOMPARE(&memtuples[j], &memtuples[j + 1]) > 0)
3860 			j++;
3861 		if (HEAPCOMPARE(tuple, &memtuples[j]) <= 0)
3862 			break;
3863 		memtuples[i] = memtuples[j];
3864 		i = j;
3865 	}
3866 	memtuples[i] = *tuple;
3867 }
3868 
3869 /*
3870  * Function to reverse the sort direction from its current state
3871  *
3872  * It is not safe to call this when performing hash tuplesorts
3873  */
3874 static void
3875 reversedirection(Tuplesortstate *state)
3876 {
3877 	SortSupport sortKey = state->sortKeys;
3878 	int			nkey;
3879 
3880 	for (nkey = 0; nkey < state->nKeys; nkey++, sortKey++)
3881 	{
3882 		sortKey->ssup_reverse = !sortKey->ssup_reverse;
3883 		sortKey->ssup_nulls_first = !sortKey->ssup_nulls_first;
3884 	}
3885 }
3886 
3887 
3888 /*
3889  * Tape interface routines
3890  */
3891 
3892 static unsigned int
3893 getlen(Tuplesortstate *state, int tapenum, bool eofOK)
3894 {
3895 	unsigned int len;
3896 
3897 	if (LogicalTapeRead(state->tapeset, tapenum,
3898 						&len, sizeof(len)) != sizeof(len))
3899 		elog(ERROR, "unexpected end of tape");
3900 	if (len == 0 && !eofOK)
3901 		elog(ERROR, "unexpected end of data");
3902 	return len;
3903 }
3904 
3905 static void
3906 markrunend(Tuplesortstate *state, int tapenum)
3907 {
3908 	unsigned int len = 0;
3909 
3910 	LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len));
3911 }
3912 
3913 /*
3914  * Get memory for tuple from within READTUP() routine.  Allocate
3915  * memory and account for that, or consume from tape's batch
3916  * allocation.
3917  *
3918  * Memory returned here in the final on-the-fly merge case is recycled
3919  * from tape's batch allocation.  Otherwise, callers must pfree() or
3920  * reset tuple child memory context, and account for that with a
3921  * FREEMEM().  Currently, this only ever needs to happen in WRITETUP()
3922  * routines.
3923  */
3924 static void *
3925 readtup_alloc(Tuplesortstate *state, int tapenum, Size tuplen)
3926 {
3927 	if (state->batchUsed)
3928 	{
3929 		/*
3930 		 * No USEMEM() call, because during final on-the-fly merge accounting
3931 		 * is based on tape-private state. ("Overflow" allocations are
3932 		 * detected as an indication that a new round of preloading is
3933 		 * required. Preloading marks existing contents of tape's batch buffer
3934 		 * for reuse.)
3935 		 */
3936 		return mergebatchalloc(state, tapenum, tuplen);
3937 	}
3938 	else
3939 	{
3940 		char	   *ret;
3941 
3942 		/* Batch allocation yet to be performed */
3943 		ret = MemoryContextAlloc(state->tuplecontext, tuplen);
3944 		USEMEM(state, GetMemoryChunkSpace(ret));
3945 		return ret;
3946 	}
3947 }
3948 
3949 
3950 /*
3951  * Routines specialized for HeapTuple (actually MinimalTuple) case
3952  */
3953 
3954 static int
3955 comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
3956 {
3957 	SortSupport sortKey = state->sortKeys;
3958 	HeapTupleData ltup;
3959 	HeapTupleData rtup;
3960 	TupleDesc	tupDesc;
3961 	int			nkey;
3962 	int32		compare;
3963 	AttrNumber	attno;
3964 	Datum		datum1,
3965 				datum2;
3966 	bool		isnull1,
3967 				isnull2;
3968 
3969 
3970 	/* Compare the leading sort key */
3971 	compare = ApplySortComparator(a->datum1, a->isnull1,
3972 								  b->datum1, b->isnull1,
3973 								  sortKey);
3974 	if (compare != 0)
3975 		return compare;
3976 
3977 	/* Compare additional sort keys */
3978 	ltup.t_len = ((MinimalTuple) a->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3979 	ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET);
3980 	rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET;
3981 	rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET);
3982 	tupDesc = state->tupDesc;
3983 
3984 	if (sortKey->abbrev_converter)
3985 	{
3986 		attno = sortKey->ssup_attno;
3987 
3988 		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
3989 		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
3990 
3991 		compare = ApplySortAbbrevFullComparator(datum1, isnull1,
3992 												datum2, isnull2,
3993 												sortKey);
3994 		if (compare != 0)
3995 			return compare;
3996 	}
3997 
3998 	sortKey++;
3999 	for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++)
4000 	{
4001 		attno = sortKey->ssup_attno;
4002 
4003 		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
4004 		datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2);
4005 
4006 		compare = ApplySortComparator(datum1, isnull1,
4007 									  datum2, isnull2,
4008 									  sortKey);
4009 		if (compare != 0)
4010 			return compare;
4011 	}
4012 
4013 	return 0;
4014 }
4015 
4016 static void
4017 copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
4018 {
4019 	/*
4020 	 * We expect the passed "tup" to be a TupleTableSlot, and form a
4021 	 * MinimalTuple using the exported interface for that.
4022 	 */
4023 	TupleTableSlot *slot = (TupleTableSlot *) tup;
4024 	Datum		original;
4025 	MinimalTuple tuple;
4026 	HeapTupleData htup;
4027 	MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
4028 
4029 	/* copy the tuple into sort storage */
4030 	tuple = ExecCopySlotMinimalTuple(slot);
4031 	stup->tuple = (void *) tuple;
4032 	USEMEM(state, GetMemoryChunkSpace(tuple));
4033 	/* set up first-column key value */
4034 	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
4035 	htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
4036 	original = heap_getattr(&htup,
4037 							state->sortKeys[0].ssup_attno,
4038 							state->tupDesc,
4039 							&stup->isnull1);
4040 
4041 	MemoryContextSwitchTo(oldcontext);
4042 
4043 	if (!state->sortKeys->abbrev_converter || stup->isnull1)
4044 	{
4045 		/*
4046 		 * Store ordinary Datum representation, or NULL value.  If there is a
4047 		 * converter it won't expect NULL values, and cost model is not
4048 		 * required to account for NULL, so in that case we avoid calling
4049 		 * converter and just set datum1 to zeroed representation (to be
4050 		 * consistent, and to support cheap inequality tests for NULL
4051 		 * abbreviated keys).
4052 		 */
4053 		stup->datum1 = original;
4054 	}
4055 	else if (!consider_abort_common(state))
4056 	{
4057 		/* Store abbreviated key representation */
4058 		stup->datum1 = state->sortKeys->abbrev_converter(original,
4059 														 state->sortKeys);
4060 	}
4061 	else
4062 	{
4063 		/* Abort abbreviation */
4064 		int			i;
4065 
4066 		stup->datum1 = original;
4067 
4068 		/*
4069 		 * Set state to be consistent with never trying abbreviation.
4070 		 *
4071 		 * Alter datum1 representation in already-copied tuples, so as to
4072 		 * ensure a consistent representation (current tuple was just
4073 		 * handled).  It does not matter if some dumped tuples are already
4074 		 * sorted on tape, since serialized tuples lack abbreviated keys
4075 		 * (TSS_BUILDRUNS state prevents control reaching here in any case).
4076 		 */
4077 		for (i = 0; i < state->memtupcount; i++)
4078 		{
4079 			SortTuple  *mtup = &state->memtuples[i];
4080 
4081 			htup.t_len = ((MinimalTuple) mtup->tuple)->t_len +
4082 				MINIMAL_TUPLE_OFFSET;
4083 			htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple -
4084 											 MINIMAL_TUPLE_OFFSET);
4085 
4086 			mtup->datum1 = heap_getattr(&htup,
4087 										state->sortKeys[0].ssup_attno,
4088 										state->tupDesc,
4089 										&mtup->isnull1);
4090 		}
4091 	}
4092 }
4093 
4094 static void
4095 writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup)
4096 {
4097 	MinimalTuple tuple = (MinimalTuple) stup->tuple;
4098 
4099 	/* the part of the MinimalTuple we'll write: */
4100 	char	   *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
4101 	unsigned int tupbodylen = tuple->t_len - MINIMAL_TUPLE_DATA_OFFSET;
4102 
4103 	/* total on-disk footprint: */
4104 	unsigned int tuplen = tupbodylen + sizeof(int);
4105 
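	/*
	 * Sketch of the resulting tape record: a leading length word (counted in
	 * tuplen itself), then the tuple body starting at
	 * MINIMAL_TUPLE_DATA_OFFSET, then, for randomAccess sorts only, a
	 * trailing copy of the length word so the run can also be read backwards.
	 */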
4106 	LogicalTapeWrite(state->tapeset, tapenum,
4107 					 (void *) &tuplen, sizeof(tuplen));
4108 	LogicalTapeWrite(state->tapeset, tapenum,
4109 					 (void *) tupbody, tupbodylen);
4110 	if (state->randomAccess)	/* need trailing length word? */
4111 		LogicalTapeWrite(state->tapeset, tapenum,
4112 						 (void *) &tuplen, sizeof(tuplen));
4113 
4114 	FREEMEM(state, GetMemoryChunkSpace(tuple));
4115 	heap_free_minimal_tuple(tuple);
4116 }
4117 
4118 static void
4119 readtup_heap(Tuplesortstate *state, SortTuple *stup,
4120 			 int tapenum, unsigned int len)
4121 {
4122 	unsigned int tupbodylen = len - sizeof(int);
4123 	unsigned int tuplen = tupbodylen + MINIMAL_TUPLE_DATA_OFFSET;
4124 	MinimalTuple tuple = (MinimalTuple) readtup_alloc(state, tapenum, tuplen);
4125 	char	   *tupbody = (char *) tuple + MINIMAL_TUPLE_DATA_OFFSET;
4126 	HeapTupleData htup;
4127 
4128 	/* read in the tuple proper */
4129 	tuple->t_len = tuplen;
4130 	LogicalTapeReadExact(state->tapeset, tapenum,
4131 						 tupbody, tupbodylen);
4132 	if (state->randomAccess)	/* need trailing length word? */
4133 		LogicalTapeReadExact(state->tapeset, tapenum,
4134 							 &tuplen, sizeof(tuplen));
4135 	stup->tuple = (void *) tuple;
4136 	/* set up first-column key value */
4137 	htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
4138 	htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
4139 	stup->datum1 = heap_getattr(&htup,
4140 								state->sortKeys[0].ssup_attno,
4141 								state->tupDesc,
4142 								&stup->isnull1);
4143 }
4144 
4145 static void
4146 movetup_heap(void *dest, void *src, unsigned int len)
4147 {
4148 	memmove(dest, src, len);
4149 }
4150 
4151 /*
4152  * Routines specialized for the CLUSTER case (HeapTuple data, with
4153  * comparisons per a btree index definition)
4154  */
4155 
4156 static int
4157 comparetup_cluster(const SortTuple *a, const SortTuple *b,
4158 				   Tuplesortstate *state)
4159 {
4160 	SortSupport sortKey = state->sortKeys;
4161 	HeapTuple	ltup;
4162 	HeapTuple	rtup;
4163 	TupleDesc	tupDesc;
4164 	int			nkey;
4165 	int32		compare;
4166 	Datum		datum1,
4167 				datum2;
4168 	bool		isnull1,
4169 				isnull2;
4170 	AttrNumber	leading = state->indexInfo->ii_KeyAttrNumbers[0];
4171 
4172 	/* Be prepared to compare additional sort keys */
4173 	ltup = (HeapTuple) a->tuple;
4174 	rtup = (HeapTuple) b->tuple;
4175 	tupDesc = state->tupDesc;
4176 
4177 	/* Compare the leading sort key, if it's simple */
4178 	if (leading != 0)
4179 	{
4180 		compare = ApplySortComparator(a->datum1, a->isnull1,
4181 									  b->datum1, b->isnull1,
4182 									  sortKey);
4183 		if (compare != 0)
4184 			return compare;
4185 
4186 		if (sortKey->abbrev_converter)
4187 		{
4188 			datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1);
4189 			datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2);
4190 
4191 			compare = ApplySortAbbrevFullComparator(datum1, isnull1,
4192 													datum2, isnull2,
4193 													sortKey);
4194 		}
4195 		if (compare != 0 || state->nKeys == 1)
4196 			return compare;
4197 		/* Compare additional columns the hard way */
4198 		sortKey++;
4199 		nkey = 1;
4200 	}
4201 	else
4202 	{
4203 		/* Must compare all keys the hard way */
4204 		nkey = 0;
4205 	}
4206 
4207 	if (state->indexInfo->ii_Expressions == NULL)
4208 	{
4209 		/* If not expression index, just compare the proper heap attrs */
4210 
4211 		for (; nkey < state->nKeys; nkey++, sortKey++)
4212 		{
4213 			AttrNumber	attno = state->indexInfo->ii_KeyAttrNumbers[nkey];
4214 
4215 			datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1);
4216 			datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2);
4217 
4218 			compare = ApplySortComparator(datum1, isnull1,
4219 										  datum2, isnull2,
4220 										  sortKey);
4221 			if (compare != 0)
4222 				return compare;
4223 		}
4224 	}
4225 	else
4226 	{
4227 		/*
4228 		 * In the expression index case, compute the whole index tuple and
4229 		 * then compare values.  It would perhaps be faster to compute only as
4230 		 * many columns as we need to compare, but that would require
4231 		 * duplicating all the logic in FormIndexDatum.
4232 		 */
4233 		Datum		l_index_values[INDEX_MAX_KEYS];
4234 		bool		l_index_isnull[INDEX_MAX_KEYS];
4235 		Datum		r_index_values[INDEX_MAX_KEYS];
4236 		bool		r_index_isnull[INDEX_MAX_KEYS];
4237 		TupleTableSlot *ecxt_scantuple;
4238 
4239 		/* Reset context each time to prevent memory leakage */
4240 		ResetPerTupleExprContext(state->estate);
4241 
4242 		ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple;
4243 
4244 		ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false);
4245 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
4246 					   l_index_values, l_index_isnull);
4247 
4248 		ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false);
4249 		FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate,
4250 					   r_index_values, r_index_isnull);
4251 
4252 		for (; nkey < state->nKeys; nkey++, sortKey++)
4253 		{
4254 			compare = ApplySortComparator(l_index_values[nkey],
4255 										  l_index_isnull[nkey],
4256 										  r_index_values[nkey],
4257 										  r_index_isnull[nkey],
4258 										  sortKey);
4259 			if (compare != 0)
4260 				return compare;
4261 		}
4262 	}
4263 
4264 	return 0;
4265 }
4266 
4267 static void
4268 copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup)
4269 {
4270 	HeapTuple	tuple = (HeapTuple) tup;
4271 	Datum		original;
4272 	MemoryContext oldcontext = MemoryContextSwitchTo(state->tuplecontext);
4273 
4274 	/* copy the tuple into sort storage */
4275 	tuple = heap_copytuple(tuple);
4276 	stup->tuple = (void *) tuple;
4277 	USEMEM(state, GetMemoryChunkSpace(tuple));
4278 
4279 	MemoryContextSwitchTo(oldcontext);
4280 
4281 	/*
4282 	 * set up first-column key value, and potentially abbreviate, if it's a
4283 	 * simple column
4284 	 */
4285 	if (state->indexInfo->ii_KeyAttrNumbers[0] == 0)
4286 		return;
4287 
4288 	original = heap_getattr(tuple,
4289 							state->indexInfo->ii_KeyAttrNumbers[0],
4290 							state->tupDesc,
4291 							&stup->isnull1);
4292 
4293 	if (!state->sortKeys->abbrev_converter || stup->isnull1)
4294 	{
4295 		/*
4296 		 * Store ordinary Datum representation, or NULL value.  If there is a
4297 		 * converter it won't expect NULL values, and cost model is not
4298 		 * required to account for NULL, so in that case we avoid calling
4299 		 * converter and just set datum1 to zeroed representation (to be
4300 		 * consistent, and to support cheap inequality tests for NULL
4301 		 * abbreviated keys).
4302 		 */
4303 		stup->datum1 = original;
4304 	}
4305 	else if (!consider_abort_common(state))
4306 	{
4307 		/* Store abbreviated key representation */
4308 		stup->datum1 = state->sortKeys->abbrev_converter(original,
4309 														 state->sortKeys);
4310 	}
4311 	else
4312 	{
4313 		/* Abort abbreviation */
4314 		int			i;
4315 
4316 		stup->datum1 = original;
4317 
4318 		/*
4319 		 * Set state to be consistent with never trying abbreviation.
4320 		 *
4321 		 * Alter datum1 representation in already-copied tuples, so as to
4322 		 * ensure a consistent representation (current tuple was just
4323 		 * handled).  It does not matter if some dumped tuples are already
4324 		 * sorted on tape, since serialized tuples lack abbreviated keys
4325 		 * (TSS_BUILDRUNS state prevents control reaching here in any case).
4326 		 */
4327 		for (i = 0; i < state->memtupcount; i++)
4328 		{
4329 			SortTuple  *mtup = &state->memtuples[i];
4330 
4331 			tuple = (HeapTuple) mtup->tuple;
4332 			mtup->datum1 = heap_getattr(tuple,
4333 									  state->indexInfo->ii_KeyAttrNumbers[0],
4334 										state->tupDesc,
4335 										&mtup->isnull1);
4336 		}
4337 	}
4338 }
4339 
4340 static void
4341 writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup)
4342 {
4343 	HeapTuple	tuple = (HeapTuple) stup->tuple;
4344 	unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int);
4345 
4346 	/* We need to store t_self, but not other fields of HeapTupleData */
4347 	LogicalTapeWrite(state->tapeset, tapenum,
4348 					 &tuplen, sizeof(tuplen));
4349 	LogicalTapeWrite(state->tapeset, tapenum,
4350 					 &tuple->t_self, sizeof(ItemPointerData));
4351 	LogicalTapeWrite(state->tapeset, tapenum,
4352 					 tuple->t_data, tuple->t_len);
4353 	if (state->randomAccess)	/* need trailing length word? */
4354 		LogicalTapeWrite(state->tapeset, tapenum,
4355 						 &tuplen, sizeof(tuplen));
4356 
4357 	FREEMEM(state, GetMemoryChunkSpace(tuple));
4358 	heap_freetuple(tuple);
4359 }
4360 
4361 static void
4362 readtup_cluster(Tuplesortstate *state, SortTuple *stup,
4363 				int tapenum, unsigned int tuplen)
4364 {
4365 	unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int);
4366 	HeapTuple	tuple = (HeapTuple) readtup_alloc(state,
4367 												  tapenum,
4368 												  t_len + HEAPTUPLESIZE);
4369 
4370 	/* Reconstruct the HeapTupleData header */
4371 	tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
4372 	tuple->t_len = t_len;
4373 	LogicalTapeReadExact(state->tapeset, tapenum,
4374 						 &tuple->t_self, sizeof(ItemPointerData));
4375 	/* We don't currently bother to reconstruct t_tableOid */
4376 	tuple->t_tableOid = InvalidOid;
4377 	/* Read in the tuple body */
4378 	LogicalTapeReadExact(state->tapeset, tapenum,
4379 						 tuple->t_data, tuple->t_len);
4380 	if (state->randomAccess)	/* need trailing length word? */
4381 		LogicalTapeReadExact(state->tapeset, tapenum,
4382 							 &tuplen, sizeof(tuplen));
4383 	stup->tuple = (void *) tuple;
4384 	/* set up first-column key value, if it's a simple column */
4385 	if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
4386 		stup->datum1 = heap_getattr(tuple,
4387 									state->indexInfo->ii_KeyAttrNumbers[0],
4388 									state->tupDesc,
4389 									&stup->isnull1);
4390 }
4391 
4392 static void
4393 movetup_cluster(void *dest, void *src, unsigned int len)
4394 {
4395 	HeapTuple	tuple;
4396 
4397 	memmove(dest, src, len);
4398 
4399 	/* Repoint the HeapTupleData header */
4400 	tuple = (HeapTuple) dest;
4401 	tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
4402 }
4403 
4404 
4405 /*
4406  * Routines specialized for IndexTuple case
4407  *
4408  * The btree and hash cases require separate comparison functions, but the
4409  * IndexTuple representation is the same so the copy/write/read support
4410  * functions can be shared.
4411  */
4412 
4413 static int
4414 comparetup_index_btree(const SortTuple *a, const SortTuple *b,
4415 					   Tuplesortstate *state)
4416 {
4417 	/*
4418 	 * This is similar to comparetup_heap(), but expects index tuples.  There
4419 	 * is also special handling for enforcing uniqueness, and special
4420 	 * treatment for equal keys at the end.
4421 	 */
4422 	SortSupport sortKey = state->sortKeys;
4423 	IndexTuple	tuple1;
4424 	IndexTuple	tuple2;
4425 	int			keysz;
4426 	TupleDesc	tupDes;
4427 	bool		equal_hasnull = false;
4428 	int			nkey;
4429 	int32		compare;
4430 	Datum		datum1,
4431 				datum2;
4432 	bool		isnull1,
4433 				isnull2;
4434 
4435 
4436 	/* Compare the leading sort key */
4437 	compare = ApplySortComparator(a->datum1, a->isnull1,
4438 								  b->datum1, b->isnull1,
4439 								  sortKey);
4440 	if (compare != 0)
4441 		return compare;
4442 
4443 	/* Compare additional sort keys */
4444 	tuple1 = (IndexTuple) a->tuple;
4445 	tuple2 = (IndexTuple) b->tuple;
4446 	keysz = state->nKeys;
4447 	tupDes = RelationGetDescr(state->indexRel);
4448 
	if (sortKey->abbrev_converter)
	{
		datum1 = index_getattr(tuple1, 1, tupDes, &isnull1);
		datum2 = index_getattr(tuple2, 1, tupDes, &isnull2);

		compare = ApplySortAbbrevFullComparator(datum1, isnull1,
												datum2, isnull2,
												sortKey);
		if (compare != 0)
			return compare;
	}

	/* they are equal, so we only need to examine one null flag */
	if (a->isnull1)
		equal_hasnull = true;

	sortKey++;
	for (nkey = 2; nkey <= keysz; nkey++, sortKey++)
	{
		datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1);
		datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2);

		compare = ApplySortComparator(datum1, isnull1,
									  datum2, isnull2,
									  sortKey);
		if (compare != 0)
			return compare;		/* done when we find unequal attributes */

		/* they are equal, so we only need to examine one null flag */
		if (isnull1)
			equal_hasnull = true;
	}

	/*
	 * If btree has asked us to enforce uniqueness, complain if two equal
	 * tuples are detected (unless there was at least one NULL field).
	 *
	 * It is sufficient to make the test here, because if two tuples are equal
	 * they *must* get compared at some stage of the sort --- otherwise the
	 * sort algorithm wouldn't have checked whether one must appear before the
	 * other.
	 */
	if (state->enforceUnique && !equal_hasnull)
	{
		Datum		values[INDEX_MAX_KEYS];
		bool		isnull[INDEX_MAX_KEYS];
		char	   *key_desc;

		/*
		 * Some rather brain-dead implementations of qsort (such as the one in
		 * QNX 4) will sometimes call the comparison routine to compare a
		 * value to itself, but we always use our own implementation, which
		 * does not.
		 */
		Assert(tuple1 != tuple2);

		index_deform_tuple(tuple1, tupDes, values, isnull);

		key_desc = BuildIndexValueDescription(state->indexRel, values, isnull);

		ereport(ERROR,
				(errcode(ERRCODE_UNIQUE_VIOLATION),
				 errmsg("could not create unique index \"%s\"",
						RelationGetRelationName(state->indexRel)),
				 key_desc ? errdetail("Key %s is duplicated.", key_desc) :
				 errdetail("Duplicate keys exist."),
				 errtableconstraint(state->heapRel,
									RelationGetRelationName(state->indexRel))));
	}

	/*
	 * If key values are equal, we sort on ItemPointer.  This does not affect
	 * validity of the finished index, but it may be useful to have index
	 * scans in physical order.
	 */
	{
		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);

		if (blk1 != blk2)
			return (blk1 < blk2) ? -1 : 1;
	}
	{
		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);

		if (pos1 != pos2)
			return (pos1 < pos2) ? -1 : 1;
	}

	/* ItemPointer values should never be equal */
	Assert(false);

	return 0;
}

static int
comparetup_index_hash(const SortTuple *a, const SortTuple *b,
					  Tuplesortstate *state)
{
	uint32		hash1;
	uint32		hash2;
	IndexTuple	tuple1;
	IndexTuple	tuple2;

	/*
	 * Fetch hash keys and mask off bits we don't want to sort by. We know
	 * that the first column of the index tuple is the hash key.
	 */
	Assert(!a->isnull1);
	hash1 = DatumGetUInt32(a->datum1) & state->hash_mask;
	Assert(!b->isnull1);
	hash2 = DatumGetUInt32(b->datum1) & state->hash_mask;

	if (hash1 > hash2)
		return 1;
	else if (hash1 < hash2)
		return -1;

	/*
	 * If hash values are equal, we sort on ItemPointer.  This does not affect
	 * validity of the finished index, but it may be useful to have index
	 * scans in physical order.
	 */
	tuple1 = (IndexTuple) a->tuple;
	tuple2 = (IndexTuple) b->tuple;

	{
		BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid);
		BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid);

		if (blk1 != blk2)
			return (blk1 < blk2) ? -1 : 1;
	}
	{
		OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid);
		OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid);

		if (pos1 != pos2)
			return (pos1 < pos2) ? -1 : 1;
	}

	/* ItemPointer values should never be equal */
	Assert(false);

	return 0;
}

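/*
 * copytup_index - copy an IndexTuple into the sort's memory context and fill
 * in its SortTuple, setting datum1 to the first key column (or to its
 * abbreviated form when an abbreviation converter is configured and has not
 * been abandoned).
 */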
static void
copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup)
{
	IndexTuple	tuple = (IndexTuple) tup;
	unsigned int tuplen = IndexTupleSize(tuple);
	IndexTuple	newtuple;
	Datum		original;

	/* copy the tuple into sort storage */
	newtuple = (IndexTuple) MemoryContextAlloc(state->tuplecontext, tuplen);
	memcpy(newtuple, tuple, tuplen);
	USEMEM(state, GetMemoryChunkSpace(newtuple));
	stup->tuple = (void *) newtuple;
	/* set up first-column key value */
	original = index_getattr(newtuple,
							 1,
							 RelationGetDescr(state->indexRel),
							 &stup->isnull1);

	if (!state->sortKeys->abbrev_converter || stup->isnull1)
	{
		/*
		 * Store ordinary Datum representation, or NULL value.  If there is a
		 * converter it won't expect NULL values, and cost model is not
		 * required to account for NULL, so in that case we avoid calling
		 * converter and just set datum1 to zeroed representation (to be
		 * consistent, and to support cheap inequality tests for NULL
		 * abbreviated keys).
		 */
		stup->datum1 = original;
	}
	else if (!consider_abort_common(state))
	{
		/* Store abbreviated key representation */
		stup->datum1 = state->sortKeys->abbrev_converter(original,
														 state->sortKeys);
	}
	else
	{
		/* Abort abbreviation */
		int			i;

		stup->datum1 = original;

		/*
		 * Set state to be consistent with never trying abbreviation.
		 *
		 * Alter datum1 representation in already-copied tuples, so as to
		 * ensure a consistent representation (current tuple was just
		 * handled).  It does not matter if some dumped tuples are already
		 * sorted on tape, since serialized tuples lack abbreviated keys
		 * (TSS_BUILDRUNS state prevents control reaching here in any case).
		 */
		for (i = 0; i < state->memtupcount; i++)
		{
			SortTuple  *mtup = &state->memtuples[i];

			tuple = (IndexTuple) mtup->tuple;
			mtup->datum1 = index_getattr(tuple,
										 1,
										 RelationGetDescr(state->indexRel),
										 &mtup->isnull1);
		}
	}
}

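/*
 * writetup_index - dump an IndexTuple to tape.
 *
 * Illustrative sketch of the resulting tape record, which readtup_index
 * reverses:
 *
 *		unsigned int tuplen		-- total length, counting this word itself
 *		IndexTuple body			-- IndexTupleSize(tuple) bytes
 *		unsigned int tuplen		-- repeated only when randomAccess, so the
 *								   tape can also be read backwards
 */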
static void
writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup)
{
	IndexTuple	tuple = (IndexTuple) stup->tuple;
	unsigned int tuplen;

	tuplen = IndexTupleSize(tuple) + sizeof(tuplen);
	LogicalTapeWrite(state->tapeset, tapenum,
					 (void *) &tuplen, sizeof(tuplen));
	LogicalTapeWrite(state->tapeset, tapenum,
					 (void *) tuple, IndexTupleSize(tuple));
	if (state->randomAccess)	/* need trailing length word? */
		LogicalTapeWrite(state->tapeset, tapenum,
						 (void *) &tuplen, sizeof(tuplen));

	FREEMEM(state, GetMemoryChunkSpace(tuple));
	pfree(tuple);
}

static void
readtup_index(Tuplesortstate *state, SortTuple *stup,
			  int tapenum, unsigned int len)
{
	unsigned int tuplen = len - sizeof(unsigned int);
	IndexTuple	tuple = (IndexTuple) readtup_alloc(state, tapenum, tuplen);

	LogicalTapeReadExact(state->tapeset, tapenum,
						 tuple, tuplen);
	if (state->randomAccess)	/* need trailing length word? */
		LogicalTapeReadExact(state->tapeset, tapenum,
							 &tuplen, sizeof(tuplen));
	stup->tuple = (void *) tuple;
	/* set up first-column key value */
	stup->datum1 = index_getattr(tuple,
								 1,
								 RelationGetDescr(state->indexRel),
								 &stup->isnull1);
}

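/*
 * movetup_index - relocate an IndexTuple within the sort arena.  Unlike
 * movetup_cluster, an IndexTuple holds no pointers into itself, so a plain
 * memmove() suffices.
 */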
static void
movetup_index(void *dest, void *src, unsigned int len)
{
	memmove(dest, src, len);
}

/*
 * Routines specialized for DatumTuple case
 */

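/*
 * comparetup_datum - comparator for the single-Datum case.  datum1 may hold
 * an abbreviated key; in that case the authoritative pass-by-reference value
 * lives in stup->tuple and is consulted to resolve abbreviated "ties".
 */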
static int
comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
{
	int			compare;

	compare = ApplySortComparator(a->datum1, a->isnull1,
								  b->datum1, b->isnull1,
								  state->sortKeys);
	if (compare != 0)
		return compare;

	/* if we have abbreviations, then "tuple" has the original value */

	if (state->sortKeys->abbrev_converter)
		compare = ApplySortAbbrevFullComparator(PointerGetDatum(a->tuple), a->isnull1,
												PointerGetDatum(b->tuple), b->isnull1,
												state->sortKeys);

	return compare;
}

static void
copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup)
{
	/* Not currently needed */
	elog(ERROR, "copytup_datum() should not be called");
}

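/*
 * writetup_datum - dump a Datum to tape.
 *
 * Sketch of the payload, as written by the code below: a NULL is stored as a
 * zero-length payload; a pass-by-value datum as the raw Datum bytes; and a
 * pass-by-reference datum as its datumGetSize() bytes.  As elsewhere, the
 * leading length word counts itself, and it is repeated at the end of the
 * record when randomAccess is enabled.  readtup_datum reverses this layout.
 */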
static void
writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup)
{
	void	   *waddr;
	unsigned int tuplen;
	unsigned int writtenlen;

	if (stup->isnull1)
	{
		waddr = NULL;
		tuplen = 0;
	}
	else if (!state->tuples)
	{
		waddr = &stup->datum1;
		tuplen = sizeof(Datum);
	}
	else
	{
		waddr = stup->tuple;
		tuplen = datumGetSize(PointerGetDatum(stup->tuple), false, state->datumTypeLen);
		Assert(tuplen != 0);
	}

	writtenlen = tuplen + sizeof(unsigned int);

	LogicalTapeWrite(state->tapeset, tapenum,
					 (void *) &writtenlen, sizeof(writtenlen));
	LogicalTapeWrite(state->tapeset, tapenum,
					 waddr, tuplen);
	if (state->randomAccess)	/* need trailing length word? */
		LogicalTapeWrite(state->tapeset, tapenum,
						 (void *) &writtenlen, sizeof(writtenlen));

	if (stup->tuple)
	{
		FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
		pfree(stup->tuple);
		stup->tuple = NULL;
	}
}

static void
readtup_datum(Tuplesortstate *state, SortTuple *stup,
			  int tapenum, unsigned int len)
{
	unsigned int tuplen = len - sizeof(unsigned int);

	if (tuplen == 0)
	{
		/* it's NULL */
		stup->datum1 = (Datum) 0;
		stup->isnull1 = true;
		stup->tuple = NULL;
	}
	else if (!state->tuples)
	{
		Assert(tuplen == sizeof(Datum));
		LogicalTapeReadExact(state->tapeset, tapenum,
							 &stup->datum1, tuplen);
		stup->isnull1 = false;
		stup->tuple = NULL;
	}
	else
	{
		void	   *raddr = readtup_alloc(state, tapenum, tuplen);

		LogicalTapeReadExact(state->tapeset, tapenum,
							 raddr, tuplen);
		stup->datum1 = PointerGetDatum(raddr);
		stup->isnull1 = false;
		stup->tuple = raddr;
	}

	if (state->randomAccess)	/* need trailing length word? */
		LogicalTapeReadExact(state->tapeset, tapenum,
							 &tuplen, sizeof(tuplen));
}

static void
movetup_datum(void *dest, void *src, unsigned int len)
{
	memmove(dest, src, len);
}

/*
 * Convenience routine to free a tuple previously loaded into sort memory
 */
static void
free_sort_tuple(Tuplesortstate *state, SortTuple *stup)
{
	if (stup->tuple)
	{
		FREEMEM(state, GetMemoryChunkSpace(stup->tuple));
		pfree(stup->tuple);
	}
}