1 /*-------------------------------------------------------------------------
2  *
3  * columnar.h
4  *
5  * Type and function declarations for Columnar
6  *
7  * Copyright (c) Citus Data, Inc.
8  *
9  *-------------------------------------------------------------------------
10  */
11 
12 #ifndef COLUMNAR_H
13 #define COLUMNAR_H
14 #include "postgres.h"
15 
16 #include "fmgr.h"
17 #include "lib/stringinfo.h"
18 #include "nodes/parsenodes.h"
19 #include "storage/bufpage.h"
20 #include "storage/lockdefs.h"
21 #include "storage/relfilenode.h"
22 #include "utils/relcache.h"
23 #include "utils/snapmgr.h"
24 
25 #include "columnar/columnar_compression.h"
26 #include "columnar/columnar_metadata.h"
27 
/* Option names recognized for columnar tables */
#define OPTION_NAME_COMPRESSION_TYPE    "compression"
#define OPTION_NAME_STRIPE_ROW_COUNT    "stripe_row_limit"
#define OPTION_NAME_CHUNK_ROW_COUNT     "chunk_group_row_limit"

/*
 * Smallest and largest values permitted for the option parameters:
 * rows per stripe, rows per chunk group, and the compression level.
 */
#define STRIPE_ROW_COUNT_MINIMUM        1000
#define STRIPE_ROW_COUNT_MAXIMUM        10000000
#define CHUNK_ROW_COUNT_MINIMUM         1000
#define CHUNK_ROW_COUNT_MAXIMUM         100000
#define COMPRESSION_LEVEL_MIN           1
#define COMPRESSION_LEVEL_MAX           19
40 
/* Version numbers that make up the columnar file signature */
#define COLUMNAR_VERSION_MAJOR              2
#define COLUMNAR_VERSION_MINOR              0

/* Miscellaneous constants */
#define COLUMNAR_TUPLE_COST_MULTIPLIER      10
#define COLUMNAR_POSTSCRIPT_SIZE_LENGTH     1
#define COLUMNAR_POSTSCRIPT_SIZE_MAX        256

/* What remains of a block (BLCKSZ) once the page header size is subtracted */
#define COLUMNAR_BYTES_PER_PAGE             (BLCKSZ - SizeOfPageHeaderData)
50 
51 /*
52  * ColumnarOptions holds the option values to be used when reading or writing
53  * a columnar table. To resolve these values, we first check foreign table's options,
54  * and if not present, we then fall back to the default values specified above.
55  */
56 typedef struct ColumnarOptions
57 {
58 	uint64 stripeRowCount;
59 	uint32 chunkRowCount;
60 	CompressionType compressionType;
61 	int compressionLevel;
62 } ColumnarOptions;
63 
64 
65 /*
66  * ColumnarTableDDLContext holds the instance variable for the TableDDLCommandFunction
67  * instance described below.
68  */
69 typedef struct ColumnarTableDDLContext
70 {
71 	char *schemaName;
72 	char *relationName;
73 	ColumnarOptions options;
74 } ColumnarTableDDLContext;
75 
76 
77 /* ColumnChunkSkipNode contains statistics for a ColumnChunkData. */
78 typedef struct ColumnChunkSkipNode
79 {
80 	/* statistics about values of a column chunk */
81 	bool hasMinMax;
82 	Datum minimumValue;
83 	Datum maximumValue;
84 	uint64 rowCount;
85 
86 	/*
87 	 * Offsets and sizes of value and exists streams in the column data.
88 	 * These enable us to skip reading suppressed row chunks, and start reading
89 	 * a chunk without reading previous chunks.
90 	 */
91 	uint64 valueChunkOffset;
92 	uint64 valueLength;
93 	uint64 existsChunkOffset;
94 	uint64 existsLength;
95 
96 	/*
97 	 * This is used for (1) determining destination size when decompressing,
98 	 * (2) calculating compression rates when logging stats.
99 	 */
100 	uint64 decompressedValueSize;
101 
102 	CompressionType valueCompressionType;
103 	int valueCompressionLevel;
104 } ColumnChunkSkipNode;
105 
106 
107 /*
108  * StripeSkipList can be used for skipping row chunks. It contains a column chunk
109  * skip node for each chunk of each column. chunkSkipNodeArray[column][chunk]
110  * is the entry for the specified column chunk.
111  */
112 typedef struct StripeSkipList
113 {
114 	ColumnChunkSkipNode **chunkSkipNodeArray;
115 	uint32 *chunkGroupRowCounts;
116 	uint32 columnCount;
117 	uint32 chunkCount;
118 } StripeSkipList;
119 
120 
121 /*
122  * ChunkData represents a chunk of data for multiple columns. valueArray stores
123  * the values of data, and existsArray stores whether a value is present.
124  * valueBuffer is used to store (uncompressed) serialized values
125  * referenced by Datum's in valueArray. It is only used for by-reference Datum's.
126  * There is a one-to-one correspondence between valueArray and existsArray.
127  */
128 typedef struct ChunkData
129 {
130 	uint32 rowCount;
131 	uint32 columnCount;
132 
133 	/*
134 	 * Following are indexed by [column][row]. If a column is not projected,
135 	 * then existsArray[column] and valueArray[column] are NULL.
136 	 */
137 	bool **existsArray;
138 	Datum **valueArray;
139 
140 	/* valueBuffer keeps actual data for type-by-reference datums from valueArray. */
141 	StringInfo *valueBufferArray;
142 } ChunkData;
143 
144 
145 /*
146  * ColumnChunkBuffers represents a chunk of serialized data in a column.
147  * valueBuffer stores the serialized values of data, and existsBuffer stores
148  * serialized value of presence information. valueCompressionType contains
149  * compression type if valueBuffer is compressed. Finally rowCount has
150  * the number of rows in this chunk.
151  */
152 typedef struct ColumnChunkBuffers
153 {
154 	StringInfo existsBuffer;
155 	StringInfo valueBuffer;
156 	CompressionType valueCompressionType;
157 	uint64 decompressedValueSize;
158 } ColumnChunkBuffers;
159 
160 
161 /*
162  * ColumnBuffers represents data buffers for a column in a row stripe. Each
163  * column is made of multiple column chunks.
164  */
165 typedef struct ColumnBuffers
166 {
167 	ColumnChunkBuffers **chunkBuffersArray;
168 } ColumnBuffers;
169 
170 
171 /* StripeBuffers represents data for a row stripe. */
172 typedef struct StripeBuffers
173 {
174 	uint32 columnCount;
175 	uint32 rowCount;
176 	ColumnBuffers **columnBuffersArray;
177 
178 	uint32 *selectedChunkGroupRowCounts;
179 } StripeBuffers;
180 
181 
/* Result type of StripeWriteState(); tells in which write state a stripe is. */
enum StripeWriteStateEnum
{
	/* the stripe write has been flushed to disk, hence the stripe is readable */
	STRIPE_WRITE_FLUSHED,

	/*
	 * The writer transaction aborted, be it before or after inserting
	 * into columnar.stripe.
	 */
	STRIPE_WRITE_ABORTED,

	/*
	 * The writer transaction has not finished yet. Whether the writer is
	 * the current transaction of the current backend or some other backend
	 * is not known.
	 */
	STRIPE_WRITE_IN_PROGRESS
};

typedef enum StripeWriteStateEnum StripeWriteStateEnum;
201 
202 
/*
 * Opaque state handles. Only the incomplete struct types are introduced
 * here; the struct bodies are not part of this header.
 */

/* state of a columnar scan */
typedef struct ColumnarReadState ColumnarReadState;

/* state of a columnar write operation */
typedef struct ColumnarWriteState ColumnarWriteState;
211 
212 extern int columnar_compression;
213 extern int columnar_stripe_row_limit;
214 extern int columnar_chunk_group_row_limit;
215 extern int columnar_compression_level;
216 
217 extern void columnar_init_gucs(void);
218 
219 extern CompressionType ParseCompressionType(const char *compressionTypeString);
220 
221 /* Function declarations for writing to a columnar table */
222 extern ColumnarWriteState * ColumnarBeginWrite(RelFileNode relfilenode,
223 											   ColumnarOptions options,
224 											   TupleDesc tupleDescriptor);
225 extern uint64 ColumnarWriteRow(ColumnarWriteState *state, Datum *columnValues,
226 							   bool *columnNulls);
227 extern void ColumnarFlushPendingWrites(ColumnarWriteState *state);
228 extern void ColumnarEndWrite(ColumnarWriteState *state);
229 extern bool ContainsPendingWrites(ColumnarWriteState *state);
230 extern MemoryContext ColumnarWritePerTupleContext(ColumnarWriteState *state);
231 
232 /* Function declarations for reading from columnar table */
233 
234 /* functions applicable for both sequential and random access */
235 extern ColumnarReadState * ColumnarBeginRead(Relation relation,
236 											 TupleDesc tupleDescriptor,
237 											 List *projectedColumnList,
238 											 List *qualConditions,
239 											 MemoryContext scanContext,
240 											 Snapshot snaphot,
241 											 bool randomAccess);
242 extern void ColumnarReadFlushPendingWrites(ColumnarReadState *readState);
243 extern void ColumnarEndRead(ColumnarReadState *state);
244 extern void ColumnarResetRead(ColumnarReadState *readState);
245 
246 /* functions only applicable for sequential access */
247 extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues,
248 								bool *columnNulls, uint64 *rowNumber);
249 extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state);
250 extern void ColumnarRescan(ColumnarReadState *readState, List *scanQual);
251 
252 /* functions only applicable for random access */
253 extern void ColumnarReadRowByRowNumberOrError(ColumnarReadState *readState,
254 											  uint64 rowNumber, Datum *columnValues,
255 											  bool *columnNulls);
256 extern bool ColumnarReadRowByRowNumber(ColumnarReadState *readState,
257 									   uint64 rowNumber, Datum *columnValues,
258 									   bool *columnNulls);
259 
260 /* Function declarations for common functions */
261 extern FmgrInfo * GetFunctionInfoOrNull(Oid typeId, Oid accessMethodId,
262 										int16 procedureId);
263 extern ChunkData * CreateEmptyChunkData(uint32 columnCount, bool *columnMask,
264 										uint32 chunkGroupRowCount);
265 extern void FreeChunkData(ChunkData *chunkData);
266 extern uint64 ColumnarTableRowCount(Relation relation);
267 extern const char * CompressionTypeStr(CompressionType type);
268 
269 /* columnar_metadata_tables.c */
270 extern void InitColumnarOptions(Oid regclass);
271 extern void SetColumnarOptions(Oid regclass, ColumnarOptions *options);
272 extern bool DeleteColumnarTableOptions(Oid regclass, bool missingOk);
273 extern bool ReadColumnarOptions(Oid regclass, ColumnarOptions *options);
274 extern bool IsColumnarTableAmTable(Oid relationId);
275 
276 /* columnar_metadata_tables.c */
277 extern void DeleteMetadataRows(RelFileNode relfilenode);
278 extern uint64 ColumnarMetadataNewStorageId(void);
279 extern uint64 GetHighestUsedAddress(RelFileNode relfilenode);
280 extern EmptyStripeReservation * ReserveEmptyStripe(Relation rel, uint64 columnCount,
281 												   uint64 chunkGroupRowCount,
282 												   uint64 stripeRowCount);
283 extern StripeMetadata * CompleteStripeReservation(Relation rel, uint64 stripeId,
284 												  uint64 sizeBytes, uint64 rowCount,
285 												  uint64 chunkCount);
286 extern void SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe,
287 							   StripeSkipList *stripeSkipList,
288 							   TupleDesc tupleDescriptor);
289 extern void SaveChunkGroups(RelFileNode relfilenode, uint64 stripe,
290 							List *chunkGroupRowCounts);
291 extern StripeSkipList * ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe,
292 										   TupleDesc tupleDescriptor,
293 										   uint32 chunkCount,
294 										   Snapshot snapshot);
295 extern StripeMetadata * FindNextStripeByRowNumber(Relation relation, uint64 rowNumber,
296 												  Snapshot snapshot);
297 extern StripeMetadata * FindStripeByRowNumber(Relation relation, uint64 rowNumber,
298 											  Snapshot snapshot);
299 extern StripeMetadata * FindStripeWithMatchingFirstRowNumber(Relation relation,
300 															 uint64 rowNumber,
301 															 Snapshot snapshot);
302 extern StripeWriteStateEnum StripeWriteState(StripeMetadata *stripeMetadata);
303 extern uint64 StripeGetHighestRowNumber(StripeMetadata *stripeMetadata);
304 extern StripeMetadata * FindStripeWithHighestRowNumber(Relation relation,
305 													   Snapshot snapshot);
306 extern Datum columnar_relation_storageid(PG_FUNCTION_ARGS);
307 
308 
309 /* write_state_management.c */
310 extern ColumnarWriteState * columnar_init_write_state(Relation relation, TupleDesc
311 													  tupdesc,
312 													  SubTransactionId currentSubXid);
313 extern void FlushWriteStateForRelfilenode(Oid relfilenode, SubTransactionId
314 										  currentSubXid);
315 extern void FlushWriteStateForAllRels(SubTransactionId currentSubXid, SubTransactionId
316 									  parentSubXid);
317 extern void DiscardWriteStateForAllRels(SubTransactionId currentSubXid, SubTransactionId
318 										parentSubXid);
319 extern void MarkRelfilenodeDropped(Oid relfilenode, SubTransactionId currentSubXid);
320 extern void NonTransactionDropWriteState(Oid relfilenode);
321 extern bool PendingWritesInUpperTransactions(Oid relfilenode,
322 											 SubTransactionId currentSubXid);
323 extern MemoryContext GetWriteContextForDebug(void);
324 
325 
326 #endif /* COLUMNAR_H */
327