/*-------------------------------------------------------------------------
 *
 * tuplesort.h
 *	  Generalized tuple sorting routines.
 *
 * This module handles sorting of heap tuples, index tuples, or single
 * Datums (and could easily support other kinds of sortable objects,
 * if necessary).  It works efficiently for both small and large amounts
 * of data.  Small amounts are sorted in-memory using qsort().  Large
 * amounts are sorted using temporary files and a standard external sort
 * algorithm.  Parallel sorts use a variant of this external sort
 * algorithm, and are typically only used for large amounts of data.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/utils/tuplesort.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef TUPLESORT_H
#define TUPLESORT_H

#include "access/itup.h"
#include "executor/tuptable.h"
#include "storage/dsm.h"
#include "utils/relcache.h"


/*
 * Tuplesortstate and Sharedsort are opaque types whose details are not
 * known outside tuplesort.c.
 */
typedef struct Tuplesortstate Tuplesortstate;
typedef struct Sharedsort Sharedsort;

/*
 * Tuplesort parallel coordination state, allocated by each participant in
 * local memory.  Participant caller initializes everything.  See usage notes
 * below.
 */
typedef struct SortCoordinateData
{
	/* Worker process?  If not, must be leader. */
	bool		isWorker;

	/*
	 * Leader-process-passed number of participants known launched (workers
	 * set this to -1).  Includes state within leader needed for it to
	 * participate as a worker, if any.
	 */
	int			nParticipants;

	/* Private opaque state (points to shared memory) */
	Sharedsort *sharedsort;
} SortCoordinateData;

typedef struct SortCoordinateData *SortCoordinate;

/*
 * Data structures for reporting sort statistics.  Note that
 * TuplesortInstrumentation can't contain any pointers because we
 * sometimes put it in shared memory.
 *
 * The parallel-sort infrastructure relies on having a zero TuplesortMethod
 * to indicate that a worker never did anything, so we assign zero to
 * SORT_TYPE_STILL_IN_PROGRESS.  The other values of this enum can be
 * OR'ed together to represent a situation where different workers used
 * different methods, so we need a separate bit for each one.  Keep the
 * NUM_TUPLESORTMETHODS constant in sync with the number of bits!
 */
typedef enum
{
	SORT_TYPE_STILL_IN_PROGRESS = 0,
	SORT_TYPE_TOP_N_HEAPSORT = 1 << 0,
	SORT_TYPE_QUICKSORT = 1 << 1,
	SORT_TYPE_EXTERNAL_SORT = 1 << 2,
	SORT_TYPE_EXTERNAL_MERGE = 1 << 3
} TuplesortMethod;

#define NUM_TUPLESORTMETHODS 4

typedef enum
{
	SORT_SPACE_TYPE_DISK,
	SORT_SPACE_TYPE_MEMORY
} TuplesortSpaceType;

typedef struct TuplesortInstrumentation
{
	TuplesortMethod sortMethod; /* sort algorithm used */
	TuplesortSpaceType spaceType;	/* type of space spaceUsed represents */
	int64		spaceUsed;		/* space consumption, in kB */
} TuplesortInstrumentation;


/*
 * We provide multiple interfaces to what is essentially the same code,
 * since different callers have different data to be sorted and want to
 * specify the sort key information differently.  There are two APIs for
 * sorting HeapTuples and two more for sorting IndexTuples.  Yet another
 * API supports sorting bare Datums.
 *
 * Serial sort callers should pass NULL for their coordinate argument.
 *
 * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't
 * preserve the system columns (tuple identity and transaction visibility
 * info).  The sort keys are specified by column numbers within the tuples
 * and sort operator OIDs.  We save some cycles by passing and returning the
 * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd
 * have to be converted to MinimalTuples).  This API works well for sorts
 * executed as parts of plan trees.
 *
 * The "cluster" API stores/sorts full HeapTuples including all visibility
 * info.  The sort keys are specified by reference to a btree index that is
 * defined on the relation to be sorted.  Note that putheaptuple/getheaptuple
 * go with this API, not the "begin_heap" one!
 *
 * The "index_btree" API stores/sorts IndexTuples (preserving all their
 * header fields).  The sort keys are specified by a btree index definition.
 *
 * The "index_hash" API is similar to index_btree, but the tuples are
 * actually sorted by their hash codes not the raw data.
 *
 * Parallel sort callers are required to coordinate multiple tuplesort states
 * in a leader process and one or more worker processes.  The leader process
 * must launch workers, and have each perform an independent "partial"
 * tuplesort, typically fed by the parallel heap interface.  The leader later
 * produces the final output (internally, it merges runs output by workers).
 *
 * Callers must do the following to perform a sort in parallel using multiple
 * worker processes:
 *
 * 1. Request tuplesort-private shared memory for n workers.  Use
 *    tuplesort_estimate_shared() to get the required size.
 * 2. Have leader process initialize allocated shared memory using
 *    tuplesort_initialize_shared().  Launch workers.
 * 3. Initialize a coordinate argument within both the leader process, and
 *    for each worker process.  This has a pointer to the shared
 *    tuplesort-private structure, as well as some caller-initialized fields.
 *    Leader's coordinate argument reliably indicates number of workers
 *    launched (this is unused by workers).
 * 4. Begin a tuplesort using some appropriate tuplesort_begin* routine,
 *    (passing the coordinate argument) within each worker.  The workMem
 *    arguments need not be identical.  All other arguments should match
 *    exactly, though.
 * 5. tuplesort_attach_shared() should be called by all workers.  Feed tuples
 *    to each worker, and call tuplesort_performsort() within each when input
 *    is exhausted.
 * 6. Call tuplesort_end() in each worker process.  Worker processes can shut
 *    down once tuplesort_end() returns.
 * 7. Begin a tuplesort in the leader using the same tuplesort_begin*
 *    routine, passing a leader-appropriate coordinate argument (this can
 *    happen as early as during step 3, actually, since we only need to know
 *    the number of workers successfully launched).  The leader must now wait
 *    for workers to finish.  Caller must use own mechanism for ensuring that
 *    next step isn't reached until all workers have called and returned from
 *    tuplesort_performsort().  (Note that it's okay if workers have already
 *    also called tuplesort_end() by then.)
 * 8. Call tuplesort_performsort() in leader.  Consume output using the
 *    appropriate tuplesort_get* routine.  Leader can skip this step if
 *    tuplesort turns out to be unnecessary.
 * 9. Call tuplesort_end() in leader.
 *
 * This division of labor assumes nothing about how input tuples are produced,
 * but does require that caller combine the state of multiple tuplesorts for
 * any purpose other than producing the final output.  For example, callers
 * must consider that tuplesort_get_stats() reports on only one worker's role
 * in a sort (or the leader's role), and not statistics for the sort as a
 * whole.
 *
 * Note that callers may use the leader process to sort runs as if it was an
 * independent worker process (prior to the process performing a leader sort
 * to produce the final sorted output).  Doing so only requires a second
 * "partial" tuplesort within the leader process, initialized like that of a
 * worker process.  The steps above don't touch on this directly.  The only
 * difference is that the tuplesort_attach_shared() call is never needed within
 * leader process, because the backend as a whole holds the shared fileset
 * reference.  A worker Tuplesortstate in leader is expected to do exactly the
 * same amount of total initial processing work as a worker process
 * Tuplesortstate, since the leader process has nothing else to do before
 * workers finish.
 *
 * Note that only a very small amount of memory will be allocated prior to
 * the leader state first consuming input, and that workers will free the
 * vast majority of their memory upon returning from tuplesort_performsort().
 * Callers can rely on this to arrange for memory to be used in a way that
 * respects a workMem-style budget across an entire parallel sort operation.
 *
 * Callers are responsible for parallel safety in general.  However, they
 * can at least rely on there being no parallel safety hazards within
 * tuplesort, because tuplesort thinks of the sort as several independent
 * sorts whose results are combined.  Since, in general, the behavior of
 * sort operators is immutable, caller need only worry about the parallel
 * safety of whatever the process is through which input tuples are
 * generated (typically, caller uses a parallel heap scan).
 */

extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc,
											int nkeys, AttrNumber *attNums,
											Oid *sortOperators, Oid *sortCollations,
											bool *nullsFirstFlags,
											int workMem, SortCoordinate coordinate,
											bool randomAccess);
extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc,
											   Relation indexRel, int workMem,
											   SortCoordinate coordinate, bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
												   Relation indexRel,
												   bool enforceUnique,
												   int workMem, SortCoordinate coordinate,
												   bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
												  Relation indexRel,
												  uint32 high_mask,
												  uint32 low_mask,
												  uint32 max_buckets,
												  int workMem, SortCoordinate coordinate,
												  bool randomAccess);
extern Tuplesortstate *tuplesort_begin_index_gist(Relation heapRel,
												  Relation indexRel,
												  int workMem, SortCoordinate coordinate,
												  bool randomAccess);
extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
											 Oid sortOperator, Oid sortCollation,
											 bool nullsFirstFlag,
											 int workMem, SortCoordinate coordinate,
											 bool randomAccess);

extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
extern bool tuplesort_used_bound(Tuplesortstate *state);

extern void tuplesort_puttupleslot(Tuplesortstate *state,
								   TupleTableSlot *slot);
extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
										  Relation rel, ItemPointer self,
										  Datum *values, bool *isnull);
extern void tuplesort_putdatum(Tuplesortstate *state, Datum val,
							   bool isNull);

extern void tuplesort_performsort(Tuplesortstate *state);

extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
								   bool copy, TupleTableSlot *slot, Datum *abbrev);
extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
							   Datum *val, bool *isNull, Datum *abbrev);

extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
								 bool forward);

extern void tuplesort_end(Tuplesortstate *state);

extern void tuplesort_reset(Tuplesortstate *state);

extern void tuplesort_get_stats(Tuplesortstate *state,
								TuplesortInstrumentation *stats);
extern const char *tuplesort_method_name(TuplesortMethod m);
extern const char *tuplesort_space_type_name(TuplesortSpaceType t);

extern int	tuplesort_merge_order(int64 allowedMem);

extern Size tuplesort_estimate_shared(int nworkers);
extern void tuplesort_initialize_shared(Sharedsort *shared, int nWorkers,
										dsm_segment *seg);
extern void tuplesort_attach_shared(Sharedsort *shared, dsm_segment *seg);

/*
 * These routines may only be called if randomAccess was specified 'true'.
 * Likewise, backwards scan in gettuple/getdatum is only allowed if
 * randomAccess was specified.  Note that parallel sorts do not support
 * randomAccess.
 */

extern void tuplesort_rescan(Tuplesortstate *state);
extern void tuplesort_markpos(Tuplesortstate *state);
extern void tuplesort_restorepos(Tuplesortstate *state);

#endif							/* TUPLESORT_H */