1 /*- 2 * Copyright (c) 2014-2018 MongoDB, Inc. 3 * Copyright (c) 2008-2014 WiredTiger, Inc. 4 * All rights reserved. 5 * 6 * See the file LICENSE for redistribution information. 7 */ 8 9 /* 10 * Initialize a static WT_CURSOR structure. 11 */ 12 #define WT_CURSOR_STATIC_INIT(n, \ 13 get_key, \ 14 get_value, \ 15 set_key, \ 16 set_value, \ 17 compare, \ 18 equals, \ 19 next, \ 20 prev, \ 21 reset, \ 22 search, \ 23 search_near, \ 24 insert, \ 25 modify, \ 26 update, \ 27 remove, \ 28 reserve, \ 29 reconfigure, \ 30 cache, \ 31 reopen, \ 32 close) \ 33 static const WT_CURSOR n = { \ 34 NULL, /* session */ \ 35 NULL, /* uri */ \ 36 NULL, /* key_format */ \ 37 NULL, /* value_format */ \ 38 get_key, \ 39 get_value, \ 40 set_key, \ 41 set_value, \ 42 compare, \ 43 equals, \ 44 next, \ 45 prev, \ 46 reset, \ 47 search, \ 48 search_near, \ 49 insert, \ 50 modify, \ 51 update, \ 52 remove, \ 53 reserve, \ 54 close, \ 55 reconfigure, \ 56 cache, \ 57 reopen, \ 58 0, /* uri_hash */ \ 59 { NULL, NULL }, /* TAILQ_ENTRY q */ \ 60 0, /* recno key */ \ 61 { 0 }, /* recno raw buffer */ \ 62 NULL, /* json_private */ \ 63 NULL, /* lang_private */ \ 64 { NULL, 0, NULL, 0, 0 }, /* WT_ITEM key */ \ 65 { NULL, 0, NULL, 0, 0 }, /* WT_ITEM value */ \ 66 0, /* int saved_err */ \ 67 NULL, /* internal_uri */ \ 68 0 /* uint32_t flags */ \ 69 } 70 71 struct __wt_cursor_backup { 72 WT_CURSOR iface; 73 74 size_t next; /* Cursor position */ 75 WT_FSTREAM *bfs; /* Backup file stream */ 76 uint32_t maxid; /* Maximum log file ID seen */ 77 78 char **list; /* List of files to be copied. */ 79 size_t list_allocated; 80 size_t list_next; 81 82 /* AUTOMATIC FLAG VALUE GENERATION START */ 83 #define WT_CURBACKUP_LOCKER 0x1u /* Hot-backup started */ 84 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 85 uint8_t flags; 86 }; 87 #define WT_CURSOR_BACKUP_ID(cursor) (((WT_CURSOR_BACKUP *)(cursor))->maxid) 88 89 struct __wt_cursor_btree { 90 WT_CURSOR iface; 91 92 /* 93 * The btree field is safe to use when the cursor is open. When the 94 * cursor is cached, the btree may be closed, so it is only safe 95 * initially to look at the underlying data handle. 96 */ 97 WT_BTREE *btree; /* Enclosing btree */ 98 WT_DATA_HANDLE *dhandle; /* Data handle for the btree */ 99 100 /* 101 * The following fields are set by the search functions as a precursor 102 * to page modification: we have a page, a WT_COL/WT_ROW slot on the 103 * page, an insert head, insert list and a skiplist stack (the stack of 104 * skiplist entries leading to the insert point). The search functions 105 * also return the relationship of the search key to the found key. 106 */ 107 WT_REF *ref; /* Current page */ 108 uint32_t slot; /* WT_COL/WT_ROW 0-based slot */ 109 110 WT_INSERT_HEAD *ins_head; /* Insert chain head */ 111 WT_INSERT *ins; /* Current insert node */ 112 /* Search stack */ 113 WT_INSERT **ins_stack[WT_SKIP_MAXDEPTH]; 114 115 /* Next item(s) found during search */ 116 WT_INSERT *next_stack[WT_SKIP_MAXDEPTH]; 117 118 uint32_t page_deleted_count; /* Deleted items on the page */ 119 120 uint64_t recno; /* Record number */ 121 122 /* 123 * Next-random cursors can optionally be configured to step through a 124 * percentage of the total leaf pages to their next value. Note the 125 * configured value and the calculated number of leaf pages to skip. 126 */ 127 uint64_t next_random_leaf_skip; 128 u_int next_random_sample_size; 129 130 /* 131 * The search function sets compare to: 132 * < 1 if the found key is less than the specified key 133 * 0 if the found key matches the specified key 134 * > 1 if the found key is larger than the specified key 135 */ 136 int compare; 137 138 /* 139 * A key returned from a binary search or cursor movement on a row-store 140 * page; if we find an exact match on a row-store leaf page in a search 141 * operation, keep a copy of key we built during the search to avoid 142 * doing the additional work of getting the key again for return to the 143 * application. Note, this only applies to exact matches when searching 144 * disk-image structures, so it's not, for example, a key from an insert 145 * list. Additionally, this structure is used to build keys when moving 146 * a cursor through a row-store leaf page. 147 */ 148 WT_ITEM *row_key, _row_key; 149 150 /* 151 * It's relatively expensive to calculate the last record on a variable- 152 * length column-store page because of the repeat values. Calculate it 153 * once per page and cache it. This value doesn't include the skiplist 154 * of appended entries on the last page. 155 */ 156 uint64_t last_standard_recno; 157 158 /* 159 * For row-store pages, we need a single item that tells us the part of 160 * the page we're walking (otherwise switching from next to prev and 161 * vice-versa is just too complicated), so we map the WT_ROW and 162 * WT_INSERT_HEAD insert array slots into a single name space: slot 1 163 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is 164 * WT_INSERT_HEAD[0], and so on. This means WT_INSERT lists are 165 * odd-numbered slots, and WT_ROW array slots are even-numbered slots. 166 */ 167 uint32_t row_iteration_slot; /* Row-store iteration slot */ 168 169 /* 170 * Variable-length column-store values are run-length encoded and may 171 * be overflow values or Huffman encoded. To avoid repeatedly reading 172 * overflow values or decompressing encoded values, process it once and 173 * store the result in a temporary buffer. The cip_saved field is used 174 * to determine if we've switched columns since our last cursor call. 175 */ 176 WT_COL *cip_saved; /* Last iteration reference */ 177 178 /* 179 * We don't instantiate prefix-compressed keys on pages where there's no 180 * Huffman encoding because we don't want to waste memory if only moving 181 * a cursor through the page, and it's faster to build keys while moving 182 * through the page than to roll-forward from a previously instantiated 183 * key (we don't instantiate all of the keys, just the ones at binary 184 * search points). We can't use the application's WT_CURSOR key field 185 * as a copy of the last-returned key because it may have been altered 186 * by the API layer, for example, dump cursors. Instead we store the 187 * last-returned key in a temporary buffer. The rip_saved field is used 188 * to determine if the key in the temporary buffer has the prefix needed 189 * for building the current key. 190 */ 191 WT_ROW *rip_saved; /* Last-returned key reference */ 192 193 /* 194 * A temporary buffer for caching RLE values for column-store files (if 195 * RLE is non-zero, then we don't unpack the value every time we move 196 * to the next cursor position, we re-use the unpacked value we stored 197 * here the first time we hit the value). 198 * 199 * A temporary buffer for building on-page keys when searching row-store 200 * files. 201 */ 202 WT_ITEM *tmp, _tmp; 203 204 /* 205 * The update structure allocated by the row- and column-store modify 206 * functions, used to avoid a data copy in the WT_CURSOR.update call. 207 */ 208 WT_UPDATE *modify_update; 209 210 /* 211 * Fixed-length column-store items are a single byte, and it's simpler 212 * and cheaper to allocate the space for it now than keep checking to 213 * see if we need to grow the buffer. 214 */ 215 uint8_t v; /* Fixed-length return value */ 216 217 uint8_t append_tree; /* Cursor appended to the tree */ 218 219 #ifdef HAVE_DIAGNOSTIC 220 /* Check that cursor next/prev never returns keys out-of-order. */ 221 WT_ITEM *lastkey, _lastkey; 222 uint64_t lastrecno; 223 #endif 224 225 /* AUTOMATIC FLAG VALUE GENERATION START */ 226 #define WT_CBT_ACTIVE 0x001u /* Active in the tree */ 227 #define WT_CBT_ITERATE_APPEND 0x002u /* Col-store: iterating append list */ 228 #define WT_CBT_ITERATE_NEXT 0x004u /* Next iteration configuration */ 229 #define WT_CBT_ITERATE_PREV 0x008u /* Prev iteration configuration */ 230 #define WT_CBT_NO_TXN 0x010u /* Non-txn cursor (e.g. a checkpoint) */ 231 #define WT_CBT_READ_ONCE 0x020u /* Page in with WT_READ_WONT_NEED */ 232 #define WT_CBT_RETRY_NEXT 0x040u /* Next, resulted in prepare conflict */ 233 #define WT_CBT_RETRY_PREV 0x080u /* Prev, resulted in prepare conflict */ 234 #define WT_CBT_SEARCH_SMALLEST 0x100u /* Row-store: small-key insert list */ 235 #define WT_CBT_VAR_ONPAGE_MATCH 0x200u /* Var-store: on-page recno match */ 236 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 237 238 #define WT_CBT_POSITION_MASK /* Flags associated with position */ \ 239 (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ 240 WT_CBT_RETRY_NEXT | WT_CBT_RETRY_PREV | WT_CBT_SEARCH_SMALLEST | \ 241 WT_CBT_VAR_ONPAGE_MATCH) 242 243 uint32_t flags; 244 }; 245 246 struct __wt_cursor_bulk { 247 WT_CURSOR_BTREE cbt; 248 249 /* 250 * Variable-length column store compares values during bulk load as 251 * part of RLE compression, row-store compares keys during bulk load 252 * to avoid corruption. 253 */ 254 bool first_insert; /* First insert */ 255 WT_ITEM last; /* Last key/value inserted */ 256 257 /* 258 * Additional column-store bulk load support. 259 */ 260 uint64_t recno; /* Record number */ 261 uint64_t rle; /* Variable-length RLE counter */ 262 263 /* 264 * Additional fixed-length column store bitmap bulk load support: 265 * current entry in memory chunk count, and the maximum number of 266 * records per chunk. 267 */ 268 bool bitmap; /* Bitmap bulk load */ 269 uint32_t entry; /* Entry count */ 270 uint32_t nrecs; /* Max records per chunk */ 271 272 void *reconcile; /* Reconciliation support */ 273 WT_REF *ref; /* The leaf page */ 274 WT_PAGE *leaf; 275 }; 276 277 struct __wt_cursor_config { 278 WT_CURSOR iface; 279 }; 280 281 struct __wt_cursor_data_source { 282 WT_CURSOR iface; 283 284 WT_COLLATOR *collator; /* Configured collator */ 285 int collator_owned; /* Collator needs to be terminated */ 286 287 WT_CURSOR *source; /* Application-owned cursor */ 288 }; 289 290 struct __wt_cursor_dump { 291 WT_CURSOR iface; 292 293 WT_CURSOR *child; 294 }; 295 296 struct __wt_cursor_index { 297 WT_CURSOR iface; 298 299 WT_TABLE *table; 300 WT_INDEX *index; 301 const char *key_plan, *value_plan; 302 303 WT_CURSOR *child; 304 WT_CURSOR **cg_cursors; 305 uint8_t *cg_needvalue; 306 }; 307 308 /* 309 * A join iterator structure is used to generate candidate primary keys. It 310 * is the responsibility of the caller of the iterator to filter these 311 * primary key against the other conditions of the join before returning 312 * them the caller of WT_CURSOR::next. 313 * 314 * For a conjunction join (the default), entry_count will be 1, meaning that 315 * the iterator only consumes the first entry (WT_CURSOR_JOIN_ENTRY). That 316 * is, it successively returns primary keys from a cursor for the first 317 * index that was joined. When the values returned by that cursor are 318 * exhausted, the iterator has completed. For a disjunction join, 319 * exhausting a cursor just means that the iterator advances to the next 320 * entry. If the next entry represents an index, a new cursor is opened and 321 * primary keys from that index are then successively returned. 322 * 323 * When positioned on an entry that represents a nested join, a new child 324 * iterator is created that will be bound to the nested WT_CURSOR_JOIN. 325 * That iterator is then used to generate candidate primary keys. When its 326 * iteration is completed, that iterator is destroyed and the parent 327 * iterator advances to the next entry. Thus, depending on how deeply joins 328 * are nested, a similarly deep stack of iterators is created. 329 */ 330 struct __wt_cursor_join_iter { 331 WT_SESSION_IMPL *session; 332 WT_CURSOR_JOIN *cjoin; 333 WT_CURSOR_JOIN_ENTRY *entry; 334 WT_CURSOR_JOIN_ITER *child; 335 WT_CURSOR *cursor; /* has null projection */ 336 WT_ITEM *curkey; /* primary key */ 337 WT_ITEM idxkey; 338 u_int entry_pos; /* the current entry */ 339 u_int entry_count; /* entries to walk */ 340 u_int end_pos; /* the current endpoint */ 341 u_int end_count; /* endpoints to walk */ 342 u_int end_skip; /* when testing for inclusion */ 343 /* can we skip current end? */ 344 bool positioned; 345 bool is_equal; 346 }; 347 348 /* 349 * A join endpoint represents a positioned cursor that is 'captured' by a 350 * WT_SESSION::join call. 351 */ 352 struct __wt_cursor_join_endpoint { 353 WT_ITEM key; 354 uint8_t recno_buf[10]; /* holds packed recno */ 355 WT_CURSOR *cursor; 356 357 /* AUTOMATIC FLAG VALUE GENERATION START */ 358 #define WT_CURJOIN_END_EQ 0x1u /* include values == cursor */ 359 #define WT_CURJOIN_END_GT 0x2u /* include values > cursor */ 360 #define WT_CURJOIN_END_LT 0x4u /* include values < cursor */ 361 #define WT_CURJOIN_END_OWN_CURSOR 0x8u /* must close cursor */ 362 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 363 #define WT_CURJOIN_END_GE (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ) 364 #define WT_CURJOIN_END_LE (WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ) 365 uint8_t flags; /* range for this endpoint */ 366 }; 367 #define WT_CURJOIN_END_RANGE(endp) \ 368 ((endp)->flags & \ 369 (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_LT)) 370 371 /* 372 * Each join entry typically represents an index's participation in a join. 373 * For example, if 'k' is an index, then "t.k > 10 && t.k < 20" would be 374 * represented by a single entry, with two endpoints. When the index and 375 * subjoin fields are NULL, the join is on the main table. When subjoin is 376 * non-NULL, there is a nested join clause. 377 */ 378 struct __wt_cursor_join_entry { 379 WT_INDEX *index; 380 WT_CURSOR *main; /* raw main table cursor */ 381 WT_CURSOR_JOIN *subjoin; /* a nested join clause */ 382 WT_BLOOM *bloom; /* Bloom filter handle */ 383 char *repack_format; /* target format for repack */ 384 uint32_t bloom_bit_count; /* bits per item in bloom */ 385 uint32_t bloom_hash_count; /* hash functions in bloom */ 386 uint64_t count; /* approx number of matches */ 387 388 /* AUTOMATIC FLAG VALUE GENERATION START */ 389 #define WT_CURJOIN_ENTRY_BLOOM 0x1u /* use a bloom filter */ 390 #define WT_CURJOIN_ENTRY_DISJUNCTION 0x2u /* endpoints are or-ed */ 391 #define WT_CURJOIN_ENTRY_FALSE_POSITIVES 0x4u /* don't filter false pos */ 392 #define WT_CURJOIN_ENTRY_OWN_BLOOM 0x8u /* this entry owns the bloom */ 393 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 394 uint8_t flags; 395 396 WT_CURSOR_JOIN_ENDPOINT *ends; /* reference endpoints */ 397 size_t ends_allocated; 398 u_int ends_next; 399 400 WT_JOIN_STATS stats; /* Join statistics */ 401 }; 402 403 struct __wt_cursor_join { 404 WT_CURSOR iface; 405 406 WT_TABLE *table; 407 const char *projection; 408 WT_CURSOR *main; /* main table with projection */ 409 WT_CURSOR_JOIN *parent; /* parent of nested group */ 410 WT_CURSOR_JOIN_ITER *iter; /* chain of iterators */ 411 WT_CURSOR_JOIN_ENTRY *entries; 412 size_t entries_allocated; 413 u_int entries_next; 414 uint8_t recno_buf[10]; /* holds packed recno */ 415 416 /* AUTOMATIC FLAG VALUE GENERATION START */ 417 #define WT_CURJOIN_DISJUNCTION 0x1u /* Entries are or-ed */ 418 #define WT_CURJOIN_ERROR 0x2u /* Error in initialization */ 419 #define WT_CURJOIN_INITIALIZED 0x4u /* Successful initialization */ 420 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 421 uint8_t flags; 422 }; 423 424 struct __wt_cursor_json { 425 char *key_buf; /* JSON formatted string */ 426 char *value_buf; /* JSON formatted string */ 427 WT_CONFIG_ITEM key_names; /* Names of key columns */ 428 WT_CONFIG_ITEM value_names; /* Names of value columns */ 429 }; 430 431 struct __wt_cursor_log { 432 WT_CURSOR iface; 433 434 WT_LSN *cur_lsn; /* LSN of current record */ 435 WT_LSN *next_lsn; /* LSN of next record */ 436 WT_ITEM *logrec; /* Copy of record for cursor */ 437 WT_ITEM *opkey, *opvalue; /* Op key/value copy */ 438 const uint8_t *stepp, *stepp_end; /* Pointer within record */ 439 uint8_t *packed_key; /* Packed key for 'raw' interface */ 440 uint8_t *packed_value; /* Packed value for 'raw' interface */ 441 uint32_t step_count; /* Intra-record count */ 442 uint32_t rectype; /* Record type */ 443 uint64_t txnid; /* Record txnid */ 444 445 /* AUTOMATIC FLAG VALUE GENERATION START */ 446 #define WT_CURLOG_ARCHIVE_LOCK 0x1u /* Archive lock held */ 447 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 448 uint8_t flags; 449 }; 450 451 struct __wt_cursor_metadata { 452 WT_CURSOR iface; 453 454 WT_CURSOR *file_cursor; /* Queries of regular metadata */ 455 WT_CURSOR *create_cursor; /* Extra cursor for create option */ 456 457 /* AUTOMATIC FLAG VALUE GENERATION START */ 458 #define WT_MDC_CREATEONLY 0x1u 459 #define WT_MDC_ONMETADATA 0x2u 460 #define WT_MDC_POSITIONED 0x4u 461 /* AUTOMATIC FLAG VALUE GENERATION STOP */ 462 uint8_t flags; 463 }; 464 465 struct __wt_join_stats_group { 466 const char *desc_prefix; /* Prefix appears before description */ 467 WT_CURSOR_JOIN *join_cursor; 468 ssize_t join_cursor_entry; /* Position in entries */ 469 WT_JOIN_STATS join_stats; 470 }; 471 472 struct __wt_cursor_stat { 473 WT_CURSOR iface; 474 475 bool notinitialized; /* Cursor not initialized */ 476 bool notpositioned; /* Cursor not positioned */ 477 478 int64_t *stats; /* Statistics */ 479 int stats_base; /* Base statistics value */ 480 int stats_count; /* Count of statistics values */ 481 int (*stats_desc)(WT_CURSOR_STAT *, int, const char **); 482 /* Statistics descriptions */ 483 int (*next_set)(WT_SESSION_IMPL *, WT_CURSOR_STAT *, bool, 484 bool); /* Advance to next set */ 485 486 union { /* Copies of the statistics */ 487 WT_DSRC_STATS dsrc_stats; 488 WT_CONNECTION_STATS conn_stats; 489 WT_JOIN_STATS_GROUP join_stats_group; 490 } u; 491 492 const char **cfg; /* Original cursor configuration */ 493 char *desc_buf; /* Saved description string */ 494 495 int key; /* Current stats key */ 496 uint64_t v; /* Current stats value */ 497 WT_ITEM pv; /* Current stats value (string) */ 498 499 /* Options declared in flags.py, shared by WT_CONNECTION::stat_flags */ 500 uint32_t flags; 501 }; 502 503 /* 504 * WT_CURSOR_STATS -- 505 * Return a reference to a statistic cursor's stats structures. 506 */ 507 #define WT_CURSOR_STATS(cursor) \ 508 (((WT_CURSOR_STAT *)(cursor))->stats) 509 510 struct __wt_cursor_table { 511 WT_CURSOR iface; 512 513 WT_TABLE *table; 514 const char *plan; 515 516 const char **cfg; /* Saved configuration string */ 517 518 WT_CURSOR **cg_cursors; 519 WT_ITEM *cg_valcopy; /* 520 * Copies of column group values, for 521 * overlapping set_value calls. 522 */ 523 WT_CURSOR **idx_cursors; 524 }; 525 526 #define WT_CURSOR_PRIMARY(cursor) \ 527 (((WT_CURSOR_TABLE *)(cursor))->cg_cursors[0]) 528 529 #define WT_CURSOR_RECNO(cursor) WT_STREQ((cursor)->key_format, "r") 530 531 #define WT_CURSOR_RAW_OK \ 532 (WT_CURSTD_DUMP_HEX | WT_CURSTD_DUMP_PRINT | WT_CURSTD_RAW) 533