/*-------------------------------------------------------------------------
 *
 * tuplesort.h
 *	  Generalized tuple sorting routines.
 *
 * This module handles sorting of heap tuples, index tuples, or single
 * Datums (and could easily support other kinds of sortable objects,
 * if necessary).  It works efficiently for both small and large amounts
 * of data.  Small amounts are sorted in-memory using qsort().  Large
 * amounts are sorted using temporary files and a standard external sort
 * algorithm.  Parallel sorts use a variant of this external sort
 * algorithm, and are typically only used for large amounts of data.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/utils/tuplesort.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef TUPLESORT_H
#define TUPLESORT_H

#include "access/itup.h"
#include "executor/tuptable.h"
#include "storage/dsm.h"
#include "utils/relcache.h"


/*
 * Tuplesortstate and Sharedsort are opaque types whose details are not
 * known outside tuplesort.c.
 */
typedef struct Tuplesortstate Tuplesortstate;
typedef struct Sharedsort Sharedsort;

/*
 * Tuplesort parallel coordination state, allocated by each participant in
 * local memory.  Participant caller initializes everything.  See usage notes
 * below.
 */
typedef struct SortCoordinateData
{
	/* Worker process?  If not, must be leader. */
	bool		isWorker;

	/*
	 * Number of participants known to have been launched, as passed in by
	 * the leader process (workers set this to -1).  The count includes any
	 * state within the leader needed for it to participate as a worker.
	 */
	int			nParticipants;

	/* Private opaque state (points to shared memory) */
	Sharedsort *sharedsort;
}			SortCoordinateData;

typedef struct SortCoordinateData *SortCoordinate;
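
/*
 * Illustrative sketch (not a declaration in this header): how a worker and
 * the leader might each fill in a SortCoordinateData before calling a
 * tuplesort_begin* routine.  Here "sharedsort" is assumed to point into
 * shared memory set up with tuplesort_initialize_shared() (declared below),
 * and "nlaunched" is a hypothetical count of workers known launched.
 *
 * In a worker process:
 *		coordinate->isWorker = true;
 *		coordinate->nParticipants = -1;
 *		coordinate->sharedsort = sharedsort;
 *
 * In the leader process:
 *		coordinate->isWorker = false;
 *		coordinate->nParticipants = nlaunched;
 *		coordinate->sharedsort = sharedsort;
 */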

/*
 * Data structures for reporting sort statistics.  Note that
 * TuplesortInstrumentation can't contain any pointers because we
 * sometimes put it in shared memory.
 *
 * The parallel-sort infrastructure relies on having a zero TuplesortMethod
 * to indicate that a worker never did anything, so we assign zero to
 * SORT_TYPE_STILL_IN_PROGRESS.  The other values of this enum can be
 * OR'ed together to represent a situation where different workers used
 * different methods, so we need a separate bit for each one.  Keep the
 * NUM_TUPLESORTMETHODS constant in sync with the number of bits!
 */
typedef enum
{
	SORT_TYPE_STILL_IN_PROGRESS = 0,
	SORT_TYPE_TOP_N_HEAPSORT = 1 << 0,
	SORT_TYPE_QUICKSORT = 1 << 1,
	SORT_TYPE_EXTERNAL_SORT = 1 << 2,
	SORT_TYPE_EXTERNAL_MERGE = 1 << 3
} TuplesortMethod;

#define NUM_TUPLESORTMETHODS 4
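
/*
 * Illustrative sketch: since the nonzero TuplesortMethod values are distinct
 * bits, methods reported by several workers can be OR'ed together and later
 * decoded bit by bit.  tuplesort_method_name() is declared below; "combined"
 * here is a hypothetical accumulation of worker methods.
 *
 *		int			i;
 *
 *		for (i = 0; i < NUM_TUPLESORTMETHODS; i++)
 *		{
 *			TuplesortMethod bit = (TuplesortMethod) (1 << i);
 *
 *			if (combined & bit)
 *				elog(LOG, "sort method used: %s", tuplesort_method_name(bit));
 *		}
 */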

typedef enum
{
	SORT_SPACE_TYPE_DISK,
	SORT_SPACE_TYPE_MEMORY
} TuplesortSpaceType;

typedef struct TuplesortInstrumentation
{
	TuplesortMethod sortMethod; /* sort algorithm used */
	TuplesortSpaceType spaceType;	/* type of space spaceUsed represents */
	int64		spaceUsed;		/* space consumption, in kB */
} TuplesortInstrumentation;
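
/*
 * Illustrative sketch: reading back instrumentation once a sort has run.
 * tuplesort_get_stats(), tuplesort_method_name() and
 * tuplesort_space_type_name() are declared below; "state" is assumed to be
 * a Tuplesortstate on which tuplesort_performsort() has been called.
 *
 *		TuplesortInstrumentation stats;
 *
 *		tuplesort_get_stats(state, &stats);
 *		elog(LOG, "Sort Method: %s  %s: " INT64_FORMAT "kB",
 *			 tuplesort_method_name(stats.sortMethod),
 *			 tuplesort_space_type_name(stats.spaceType),
 *			 stats.spaceUsed);
 */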


/*
 * We provide multiple interfaces to what is essentially the same code,
 * since different callers have different data to be sorted and want to
 * specify the sort key information differently.  There are two APIs for
 * sorting HeapTuples and two more for sorting IndexTuples.  Yet another
 * API supports sorting bare Datums.
 *
 * Serial sort callers should pass NULL for their coordinate argument.
 *
 * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
 * preserve the system columns (tuple identity and transaction visibility
 * info).  The sort keys are specified by column numbers within the tuples
 * and sort operator OIDs.  We save some cycles by passing and returning the
 * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
 * have to be converted to MinimalTuples).  This API works well for sorts
 * executed as parts of plan trees.
 *
 * The "cluster" API stores/sorts full HeapTuples including all visibility
 * info.  The sort keys are specified by reference to a btree index that is
 * defined on the relation to be sorted.  Note that putheaptuple/getheaptuple
 * go with this API, not the "begin_heap" one!
 *
 * The "index_btree" API stores/sorts IndexTuples (preserving all their
 * header fields).  The sort keys are specified by a btree index definition.
 *
 * The "index_hash" API is similar to index_btree, but the tuples are
 * actually sorted by their hash codes, not the raw data.
 *
 * Parallel sort callers are required to coordinate multiple tuplesort states
 * in a leader process and one or more worker processes.  The leader process
 * must launch workers, and have each perform an independent "partial"
 * tuplesort, typically fed by the parallel heap interface.  The leader later
 * produces the final output (internally, it merges runs output by workers).
 *
 * Callers must do the following to perform a sort in parallel using multiple
 * worker processes (a condensed code sketch of these steps follows this
 * comment):
 *
 * 1. Request tuplesort-private shared memory for n workers.  Use
 *    tuplesort_estimate_shared() to get the required size.
 * 2. Have leader process initialize allocated shared memory using
 *    tuplesort_initialize_shared().  Launch workers.
 * 3. Initialize a coordinate argument within the leader process and within
 *    each worker process.  This has a pointer to the shared
 *    tuplesort-private structure, as well as some caller-initialized fields.
 *    Leader's coordinate argument reliably indicates number of workers
 *    launched (this is unused by workers).
 * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine
 *    (passing the coordinate argument) within each worker.  The workMem
 *    arguments need not be identical.  All other arguments should match
 *    exactly, though.
 * 5. tuplesort_attach_shared() should be called by all workers.  Feed tuples
 *    to each worker, and call tuplesort_performsort() within each when input
 *    is exhausted.
 * 6. Call tuplesort_end() in each worker process.  Worker processes can shut
 *    down once tuplesort_end() returns.
 * 7. Begin a tuplesort in the leader using the same tuplesort_begin*
 *    routine, passing a leader-appropriate coordinate argument (this can
 *    happen as early as during step 3, actually, since we only need to know
 *    the number of workers successfully launched).  The leader must now wait
 *    for workers to finish.  Caller must use own mechanism for ensuring that
 *    next step isn't reached until all workers have called and returned from
 *    tuplesort_performsort().  (Note that it's okay if workers have already
 *    also called tuplesort_end() by then.)
 * 8. Call tuplesort_performsort() in leader.  Consume output using the
 *    appropriate tuplesort_get* routine.  Leader can skip this step if
 *    tuplesort turns out to be unnecessary.
 * 9. Call tuplesort_end() in leader.
 *
 * This division of labor assumes nothing about how input tuples are produced,
 * but does require that caller combine the state of multiple tuplesorts for
 * any purpose other than producing the final output.  For example, callers
 * must consider that tuplesort_get_stats() reports on only one worker's role
 * in a sort (or the leader's role), and not statistics for the sort as a
 * whole.
 *
 * Note that callers may use the leader process to sort runs as if it were an
 * independent worker process (prior to the process performing a leader sort
 * to produce the final sorted output).  Doing so only requires a second
 * "partial" tuplesort within the leader process, initialized like that of a
 * worker process.  The steps above don't touch on this directly.  The only
 * difference is that the tuplesort_attach_shared() call is never needed within
 * the leader process, because the backend as a whole holds the shared fileset
 * reference.  A worker Tuplesortstate in the leader is expected to do exactly
 * the same amount of total initial processing work as a worker process
 * Tuplesortstate, since the leader process has nothing else to do before
 * workers finish.
 *
 * Note that only a very small amount of memory will be allocated prior to
 * the leader state first consuming input, and that workers will free the
 * vast majority of their memory upon returning from tuplesort_performsort().
 * Callers can rely on this to arrange for memory to be used in a way that
 * respects a workMem-style budget across an entire parallel sort operation.
 *
 * Callers are responsible for parallel safety in general.  However, they
 * can at least rely on there being no parallel safety hazards within
 * tuplesort, because tuplesort thinks of the sort as several independent
 * sorts whose results are combined.  Since, in general, the behavior of
 * sort operators is immutable, caller need only worry about the parallel
 * safety of whatever the process is through which input tuples are
 * generated (typically, caller uses a parallel heap scan).
 */
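
/*
 * Condensed code sketch of steps 1-9 above.  This is illustrative only:
 * error handling, tuple feeding, and the wait-for-workers mechanism are
 * elided, and "seg", "nworkers", and "nlaunched" are assumed to come from
 * the caller's parallel infrastructure.
 *
 * Leader, steps 1 and 2:
 *		Size		size = tuplesort_estimate_shared(nworkers);
 *		Sharedsort *sharedsort = ...allocate "size" bytes within "seg"...;
 *
 *		tuplesort_initialize_shared(sharedsort, nworkers, seg);
 *		...launch workers...
 *
 * Each worker, steps 3 through 6:
 *		SortCoordinateData coord = {true, -1, sharedsort};
 *		Tuplesortstate *wstate;
 *
 *		wstate = tuplesort_begin_heap(tupDesc, nkeys, attNums, sortOperators,
 *									  collations, nullsFirstFlags, work_mem,
 *									  &coord, false);
 *		tuplesort_attach_shared(sharedsort, seg);
 *		...feed tuples...
 *		tuplesort_performsort(wstate);
 *		tuplesort_end(wstate);
 *
 * Leader, steps 7 through 9 (after waiting for all workers):
 *		SortCoordinateData leadercoord = {false, nlaunched, sharedsort};
 *		Tuplesortstate *lstate;
 *
 *		lstate = tuplesort_begin_heap(tupDesc, nkeys, attNums, sortOperators,
 *									  collations, nullsFirstFlags, work_mem,
 *									  &leadercoord, false);
 *		tuplesort_performsort(lstate);
 *		...consume output with tuplesort_gettupleslot()...
 *		tuplesort_end(lstate);
 */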

extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
											int nkeys, AttrNumber *attNums,
											Oid *sortOperators, Oid *sortCollations,
											bool *nullsFirstFlags,
											int workMem, SortCoordinate coordinate,
											bool randomAccess);
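
/*
 * Illustrative sketch of the serial "heap" API: sort on a single int4
 * column, ascending, NULLs last, with no parallel coordination.
 * Int4LessOperator is from catalog/pg_operator_d.h; "tupDesc" and the
 * slots fed in are assumed to come from the caller.
 *
 *		AttrNumber	attNums[] = {1};
 *		Oid			sortOperators[] = {Int4LessOperator};
 *		Oid			collations[] = {InvalidOid};
 *		bool		nullsFirst[] = {false};
 *		Tuplesortstate *state;
 *
 *		state = tuplesort_begin_heap(tupDesc, 1, attNums, sortOperators,
 *									 collations, nullsFirst, work_mem,
 *									 NULL, false);
 *		...tuplesort_puttupleslot() for each input slot...
 *		tuplesort_performsort(state);
 *		...tuplesort_gettupleslot() until it returns false...
 *		tuplesort_end(state);
 */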
extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
											   Relation indexRel, int workMem,
											   SortCoordinate coordinate, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
												   Relation indexRel,
												   bool enforceUnique,
												   int workMem, SortCoordinate coordinate,
												   bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
												  Relation indexRel,
												  uint32 high_mask,
												  uint32 low_mask,
												  uint32 max_buckets,
												  int workMem, SortCoordinate coordinate,
												  bool randomAccess);
extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
											 Oid sortOperator, Oid sortCollation,
											 bool nullsFirstFlag,
											 int workMem, SortCoordinate coordinate,
											 bool randomAccess);
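
/*
 * Illustrative sketch of the Datum API: a serial, forward-only sort of
 * int4 Datums.  INT4OID is from catalog/pg_type_d.h; passing NULL for the
 * "abbrev" argument of tuplesort_getdatum() (declared below) is fine when
 * abbreviated key values aren't needed.
 *
 *		Tuplesortstate *state;
 *		Datum		val;
 *		bool		isnull;
 *
 *		state = tuplesort_begin_datum(INT4OID, Int4LessOperator, InvalidOid,
 *									  false, work_mem, NULL, false);
 *		tuplesort_putdatum(state, Int32GetDatum(42), false);
 *		tuplesort_putdatum(state, Int32GetDatum(7), false);
 *		tuplesort_performsort(state);
 *		while (tuplesort_getdatum(state, true, &val, &isnull, NULL))
 *			elog(LOG, "next: %d", DatumGetInt32(val));
 *		tuplesort_end(state);
 */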

extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
extern bool tuplesort_used_bound(Tuplesortstate *state);
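
/*
 * Illustrative sketch: a bounded ("top N") sort, as used for LIMIT-style
 * processing.  The bound must be set after tuplesort_begin* and before any
 * tuples are fed in; with a small enough bound the sort can switch to
 * SORT_TYPE_TOP_N_HEAPSORT, which tuplesort_used_bound() then reports.
 *
 *		state = tuplesort_begin_heap(...);
 *		tuplesort_set_bound(state, 10);
 *		...feed tuples, tuplesort_performsort(), read at most 10 tuples...
 */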

extern void tuplesort_puttupleslot(Tuplesortstate *state,
								   TupleTableSlot *slot);
extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
										  Relation rel, ItemPointer self,
										  Datum *values, bool *isnull);
extern void tuplesort_putdatum(Tuplesortstate *state, Datum val,
							   bool isNull);

extern void tuplesort_performsort(Tuplesortstate *state);

extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
								   bool copy, TupleTableSlot *slot, Datum *abbrev);
extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
							   Datum *val, bool *isNull, Datum *abbrev);

extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
								 bool forward);

extern void tuplesort_end(Tuplesortstate *state);

extern void tuplesort_reset(Tuplesortstate *state);

extern void tuplesort_get_stats(Tuplesortstate *state,
								TuplesortInstrumentation *stats);
extern const char *tuplesort_method_name(TuplesortMethod m);
extern const char *tuplesort_space_type_name(TuplesortSpaceType t);

extern int	tuplesort_merge_order(int64 allowedMem);

extern Size tuplesort_estimate_shared(int nworkers);
extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
										dsm_segment *seg);
extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);

/*
 * These routines may only be called if randomAccess was specified 'true'.
 * Likewise, backwards scan in gettuple/getdatum is only allowed if
 * randomAccess was specified.  Note that parallel sorts do not support
 * randomAccess.
 */

extern void tuplesort_rescan(Tuplesortstate *state);
extern void tuplesort_markpos(Tuplesortstate *state);
extern void tuplesort_restorepos(Tuplesortstate *state);
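
/*
 * Illustrative sketch: mark/restore with a randomAccess sort, e.g. for the
 * inner side of a merge join.  "state" must have been begun with
 * randomAccess = true (and hence cannot be a parallel sort).
 *
 *		tuplesort_performsort(state);
 *		tuplesort_markpos(state);
 *		...read some tuples forward...
 *		tuplesort_restorepos(state);
 *		...re-read from the marked position, or rewind entirely:...
 *		tuplesort_rescan(state);
 */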

#endif							/* TUPLESORT_H */