1 /*------------------------------------------------------------------------- 2 * 3 * columnar.h 4 * 5 * Type and function declarations for Columnar 6 * 7 * Copyright (c) Citus Data, Inc. 8 * 9 *------------------------------------------------------------------------- 10 */ 11 12 #ifndef COLUMNAR_H 13 #define COLUMNAR_H 14 #include "postgres.h" 15 16 #include "fmgr.h" 17 #include "lib/stringinfo.h" 18 #include "nodes/parsenodes.h" 19 #include "storage/bufpage.h" 20 #include "storage/lockdefs.h" 21 #include "storage/relfilenode.h" 22 #include "utils/relcache.h" 23 #include "utils/snapmgr.h" 24 25 #include "columnar/columnar_compression.h" 26 #include "columnar/columnar_metadata.h" 27 28 /* Defines for valid option names */ 29 #define OPTION_NAME_COMPRESSION_TYPE "compression" 30 #define OPTION_NAME_STRIPE_ROW_COUNT "stripe_row_limit" 31 #define OPTION_NAME_CHUNK_ROW_COUNT "chunk_group_row_limit" 32 33 /* Limits for option parameters */ 34 #define STRIPE_ROW_COUNT_MINIMUM 1000 35 #define STRIPE_ROW_COUNT_MAXIMUM 10000000 36 #define CHUNK_ROW_COUNT_MINIMUM 1000 37 #define CHUNK_ROW_COUNT_MAXIMUM 100000 38 #define COMPRESSION_LEVEL_MIN 1 39 #define COMPRESSION_LEVEL_MAX 19 40 41 /* Columnar file signature */ 42 #define COLUMNAR_VERSION_MAJOR 2 43 #define COLUMNAR_VERSION_MINOR 0 44 45 /* miscellaneous defines */ 46 #define COLUMNAR_TUPLE_COST_MULTIPLIER 10 47 #define COLUMNAR_POSTSCRIPT_SIZE_LENGTH 1 48 #define COLUMNAR_POSTSCRIPT_SIZE_MAX 256 49 #define COLUMNAR_BYTES_PER_PAGE (BLCKSZ - SizeOfPageHeaderData) 50 51 /* 52 * ColumnarOptions holds the option values to be used when reading or writing 53 * a columnar table. To resolve these values, we first check foreign table's options, 54 * and if not present, we then fall back to the default values specified above. 55 */ 56 typedef struct ColumnarOptions 57 { 58 uint64 stripeRowCount; 59 uint32 chunkRowCount; 60 CompressionType compressionType; 61 int compressionLevel; 62 } ColumnarOptions; 63 64 65 /* 66 * ColumnarTableDDLContext holds the instance variable for the TableDDLCommandFunction 67 * instance described below. 68 */ 69 typedef struct ColumnarTableDDLContext 70 { 71 char *schemaName; 72 char *relationName; 73 ColumnarOptions options; 74 } ColumnarTableDDLContext; 75 76 77 /* ColumnChunkSkipNode contains statistics for a ColumnChunkData. */ 78 typedef struct ColumnChunkSkipNode 79 { 80 /* statistics about values of a column chunk */ 81 bool hasMinMax; 82 Datum minimumValue; 83 Datum maximumValue; 84 uint64 rowCount; 85 86 /* 87 * Offsets and sizes of value and exists streams in the column data. 88 * These enable us to skip reading suppressed row chunks, and start reading 89 * a chunk without reading previous chunks. 90 */ 91 uint64 valueChunkOffset; 92 uint64 valueLength; 93 uint64 existsChunkOffset; 94 uint64 existsLength; 95 96 /* 97 * This is used for (1) determining destination size when decompressing, 98 * (2) calculating compression rates when logging stats. 99 */ 100 uint64 decompressedValueSize; 101 102 CompressionType valueCompressionType; 103 int valueCompressionLevel; 104 } ColumnChunkSkipNode; 105 106 107 /* 108 * StripeSkipList can be used for skipping row chunks. It contains a column chunk 109 * skip node for each chunk of each column. chunkSkipNodeArray[column][chunk] 110 * is the entry for the specified column chunk. 111 */ 112 typedef struct StripeSkipList 113 { 114 ColumnChunkSkipNode **chunkSkipNodeArray; 115 uint32 *chunkGroupRowCounts; 116 uint32 columnCount; 117 uint32 chunkCount; 118 } StripeSkipList; 119 120 121 /* 122 * ChunkData represents a chunk of data for multiple columns. valueArray stores 123 * the values of data, and existsArray stores whether a value is present. 124 * valueBuffer is used to store (uncompressed) serialized values 125 * referenced by Datum's in valueArray. It is only used for by-reference Datum's. 126 * There is a one-to-one correspondence between valueArray and existsArray. 127 */ 128 typedef struct ChunkData 129 { 130 uint32 rowCount; 131 uint32 columnCount; 132 133 /* 134 * Following are indexed by [column][row]. If a column is not projected, 135 * then existsArray[column] and valueArray[column] are NULL. 136 */ 137 bool **existsArray; 138 Datum **valueArray; 139 140 /* valueBuffer keeps actual data for type-by-reference datums from valueArray. */ 141 StringInfo *valueBufferArray; 142 } ChunkData; 143 144 145 /* 146 * ColumnChunkBuffers represents a chunk of serialized data in a column. 147 * valueBuffer stores the serialized values of data, and existsBuffer stores 148 * serialized value of presence information. valueCompressionType contains 149 * compression type if valueBuffer is compressed. Finally rowCount has 150 * the number of rows in this chunk. 151 */ 152 typedef struct ColumnChunkBuffers 153 { 154 StringInfo existsBuffer; 155 StringInfo valueBuffer; 156 CompressionType valueCompressionType; 157 uint64 decompressedValueSize; 158 } ColumnChunkBuffers; 159 160 161 /* 162 * ColumnBuffers represents data buffers for a column in a row stripe. Each 163 * column is made of multiple column chunks. 164 */ 165 typedef struct ColumnBuffers 166 { 167 ColumnChunkBuffers **chunkBuffersArray; 168 } ColumnBuffers; 169 170 171 /* StripeBuffers represents data for a row stripe. */ 172 typedef struct StripeBuffers 173 { 174 uint32 columnCount; 175 uint32 rowCount; 176 ColumnBuffers **columnBuffersArray; 177 178 uint32 *selectedChunkGroupRowCounts; 179 } StripeBuffers; 180 181 182 /* return value of StripeWriteState to decide stripe write state */ 183 typedef enum StripeWriteStateEnum 184 { 185 /* stripe write is flushed to disk, so it's readable */ 186 STRIPE_WRITE_FLUSHED, 187 188 /* 189 * Writer transaction did abort either before inserting into 190 * columnar.stripe or after. 191 */ 192 STRIPE_WRITE_ABORTED, 193 194 /* 195 * Writer transaction is still in-progress. Note that it is not certain 196 * if it is being written by current backend's current transaction or 197 * another backend. 198 */ 199 STRIPE_WRITE_IN_PROGRESS 200 } StripeWriteStateEnum; 201 202 203 /* ColumnarReadState represents state of a columnar scan. */ 204 struct ColumnarReadState; 205 typedef struct ColumnarReadState ColumnarReadState; 206 207 208 /* ColumnarWriteState represents state of a columnar write operation. */ 209 struct ColumnarWriteState; 210 typedef struct ColumnarWriteState ColumnarWriteState; 211 212 extern int columnar_compression; 213 extern int columnar_stripe_row_limit; 214 extern int columnar_chunk_group_row_limit; 215 extern int columnar_compression_level; 216 217 extern void columnar_init_gucs(void); 218 219 extern CompressionType ParseCompressionType(const char *compressionTypeString); 220 221 /* Function declarations for writing to a columnar table */ 222 extern ColumnarWriteState * ColumnarBeginWrite(RelFileNode relfilenode, 223 ColumnarOptions options, 224 TupleDesc tupleDescriptor); 225 extern uint64 ColumnarWriteRow(ColumnarWriteState *state, Datum *columnValues, 226 bool *columnNulls); 227 extern void ColumnarFlushPendingWrites(ColumnarWriteState *state); 228 extern void ColumnarEndWrite(ColumnarWriteState *state); 229 extern bool ContainsPendingWrites(ColumnarWriteState *state); 230 extern MemoryContext ColumnarWritePerTupleContext(ColumnarWriteState *state); 231 232 /* Function declarations for reading from columnar table */ 233 234 /* functions applicable for both sequential and random access */ 235 extern ColumnarReadState * ColumnarBeginRead(Relation relation, 236 TupleDesc tupleDescriptor, 237 List *projectedColumnList, 238 List *qualConditions, 239 MemoryContext scanContext, 240 Snapshot snaphot, 241 bool randomAccess); 242 extern void ColumnarReadFlushPendingWrites(ColumnarReadState *readState); 243 extern void ColumnarEndRead(ColumnarReadState *state); 244 extern void ColumnarResetRead(ColumnarReadState *readState); 245 246 /* functions only applicable for sequential access */ 247 extern bool ColumnarReadNextRow(ColumnarReadState *state, Datum *columnValues, 248 bool *columnNulls, uint64 *rowNumber); 249 extern int64 ColumnarReadChunkGroupsFiltered(ColumnarReadState *state); 250 extern void ColumnarRescan(ColumnarReadState *readState, List *scanQual); 251 252 /* functions only applicable for random access */ 253 extern void ColumnarReadRowByRowNumberOrError(ColumnarReadState *readState, 254 uint64 rowNumber, Datum *columnValues, 255 bool *columnNulls); 256 extern bool ColumnarReadRowByRowNumber(ColumnarReadState *readState, 257 uint64 rowNumber, Datum *columnValues, 258 bool *columnNulls); 259 260 /* Function declarations for common functions */ 261 extern FmgrInfo * GetFunctionInfoOrNull(Oid typeId, Oid accessMethodId, 262 int16 procedureId); 263 extern ChunkData * CreateEmptyChunkData(uint32 columnCount, bool *columnMask, 264 uint32 chunkGroupRowCount); 265 extern void FreeChunkData(ChunkData *chunkData); 266 extern uint64 ColumnarTableRowCount(Relation relation); 267 extern const char * CompressionTypeStr(CompressionType type); 268 269 /* columnar_metadata_tables.c */ 270 extern void InitColumnarOptions(Oid regclass); 271 extern void SetColumnarOptions(Oid regclass, ColumnarOptions *options); 272 extern bool DeleteColumnarTableOptions(Oid regclass, bool missingOk); 273 extern bool ReadColumnarOptions(Oid regclass, ColumnarOptions *options); 274 extern bool IsColumnarTableAmTable(Oid relationId); 275 276 /* columnar_metadata_tables.c */ 277 extern void DeleteMetadataRows(RelFileNode relfilenode); 278 extern uint64 ColumnarMetadataNewStorageId(void); 279 extern uint64 GetHighestUsedAddress(RelFileNode relfilenode); 280 extern EmptyStripeReservation * ReserveEmptyStripe(Relation rel, uint64 columnCount, 281 uint64 chunkGroupRowCount, 282 uint64 stripeRowCount); 283 extern StripeMetadata * CompleteStripeReservation(Relation rel, uint64 stripeId, 284 uint64 sizeBytes, uint64 rowCount, 285 uint64 chunkCount); 286 extern void SaveStripeSkipList(RelFileNode relfilenode, uint64 stripe, 287 StripeSkipList *stripeSkipList, 288 TupleDesc tupleDescriptor); 289 extern void SaveChunkGroups(RelFileNode relfilenode, uint64 stripe, 290 List *chunkGroupRowCounts); 291 extern StripeSkipList * ReadStripeSkipList(RelFileNode relfilenode, uint64 stripe, 292 TupleDesc tupleDescriptor, 293 uint32 chunkCount, 294 Snapshot snapshot); 295 extern StripeMetadata * FindNextStripeByRowNumber(Relation relation, uint64 rowNumber, 296 Snapshot snapshot); 297 extern StripeMetadata * FindStripeByRowNumber(Relation relation, uint64 rowNumber, 298 Snapshot snapshot); 299 extern StripeMetadata * FindStripeWithMatchingFirstRowNumber(Relation relation, 300 uint64 rowNumber, 301 Snapshot snapshot); 302 extern StripeWriteStateEnum StripeWriteState(StripeMetadata *stripeMetadata); 303 extern uint64 StripeGetHighestRowNumber(StripeMetadata *stripeMetadata); 304 extern StripeMetadata * FindStripeWithHighestRowNumber(Relation relation, 305 Snapshot snapshot); 306 extern Datum columnar_relation_storageid(PG_FUNCTION_ARGS); 307 308 309 /* write_state_management.c */ 310 extern ColumnarWriteState * columnar_init_write_state(Relation relation, TupleDesc 311 tupdesc, 312 SubTransactionId currentSubXid); 313 extern void FlushWriteStateForRelfilenode(Oid relfilenode, SubTransactionId 314 currentSubXid); 315 extern void FlushWriteStateForAllRels(SubTransactionId currentSubXid, SubTransactionId 316 parentSubXid); 317 extern void DiscardWriteStateForAllRels(SubTransactionId currentSubXid, SubTransactionId 318 parentSubXid); 319 extern void MarkRelfilenodeDropped(Oid relfilenode, SubTransactionId currentSubXid); 320 extern void NonTransactionDropWriteState(Oid relfilenode); 321 extern bool PendingWritesInUpperTransactions(Oid relfilenode, 322 SubTransactionId currentSubXid); 323 extern MemoryContext GetWriteContextForDebug(void); 324 325 326 #endif /* COLUMNAR_H */ 327