1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of TokuDB
6
7
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
TokuDB is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 TokuDB is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with TokuDB. If not, see <http://www.gnu.org/licenses/>.
21
22 ======= */
23
24 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
25
26 #include "hatoku_hton.h"
27 #include "hatoku_cmp.h"
28 #include "tokudb_buffer.h"
29 #include "tokudb_status.h"
30 #include "tokudb_card.h"
31 #include "ha_tokudb.h"
32 #include "sql_db.h"
33
// Performance-schema instrumentation keys for the per-share mutex and the
// num_DBs read/write lock; presumably registered with PFS during hton
// initialization elsewhere — not shown in this file section.
pfs_key_t ha_tokudb_mutex_key;
pfs_key_t num_DBs_lock_key;
36
#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
// Return the number of key parts of 'key' including any "extended" parts
// (primary-key columns implicitly appended to secondary keys). The member
// holding this count differs between MySQL and MariaDB, hence the
// version dispatch below.
static inline uint get_ext_key_parts(const KEY *key) {
#if (50609 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799)
    return key->actual_key_parts;
#elif defined(MARIADB_BASE_VERSION)
    return key->ext_key_parts;
#else
// Fix: give the bare #error a diagnostic so an unsupported server version
// fails the build with an explanation instead of a blank error.
#error "get_ext_key_parts: unsupported server version (need MySQL 5.6/5.7 or MariaDB)"
#endif
}
#endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
49
// Process-wide registry of open shares keyed by full table name; guarded
// by _open_tables_mutex, which is created in static_init() and destroyed
// in static_destroy().
std::unordered_map<std::string, TOKUDB_SHARE*> TOKUDB_SHARE::_open_tables;
tokudb::thread::mutex_t* TOKUDB_SHARE::_open_tables_mutex;

// NullS-terminated list of file extensions owned by this engine; returned
// by ha_tokudb::bas_ext() below.
static const char* ha_tokudb_exts[] = {
    ha_tokudb_ext,
    NullS
};
57
58 //
59 // This offset is calculated starting from AFTER the NULL bytes
60 //
get_fixed_field_size(KEY_AND_COL_INFO * kc_info,TABLE_SHARE * table_share,uint keynr)61 static inline uint32_t get_fixed_field_size(
62 KEY_AND_COL_INFO* kc_info,
63 TABLE_SHARE* table_share,
64 uint keynr) {
65
66 uint offset = 0;
67 for (uint i = 0; i < table_share->fields; i++) {
68 if (is_fixed_field(kc_info, i) &&
69 !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
70 offset += kc_info->field_lengths[i];
71 }
72 }
73 return offset;
74 }
75
76
get_len_of_offsets(KEY_AND_COL_INFO * kc_info,TABLE_SHARE * table_share,uint keynr)77 static inline uint32_t get_len_of_offsets(
78 KEY_AND_COL_INFO* kc_info,
79 TABLE_SHARE* table_share,
80 uint keynr) {
81
82 uint len = 0;
83 for (uint i = 0; i < table_share->fields; i++) {
84 if (is_variable_field(kc_info, i) &&
85 !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
86 len += kc_info->num_offset_bytes;
87 }
88 }
89 return len;
90 }
91
92
allocate_key_and_col_info(TABLE_SHARE * table_share,KEY_AND_COL_INFO * kc_info)93 static int allocate_key_and_col_info(
94 TABLE_SHARE* table_share,
95 KEY_AND_COL_INFO* kc_info) {
96
97 int error;
98 //
99 // initialize all of the bitmaps
100 //
101 for (uint i = 0; i < MAX_KEY + 1; i++) {
102 error =
103 bitmap_init(
104 &kc_info->key_filters[i],
105 NULL,
106 table_share->fields,
107 false);
108 if (error) {
109 goto exit;
110 }
111 }
112
113 //
114 // create the field lengths
115 //
116 kc_info->multi_ptr = tokudb::memory::multi_malloc(
117 MYF(MY_WME+MY_ZEROFILL),
118 &kc_info->field_types, (uint)(table_share->fields * sizeof (uint8_t)),
119 &kc_info->field_lengths, (uint)(table_share->fields * sizeof (uint16_t)),
120 &kc_info->length_bytes, (uint)(table_share->fields * sizeof (uint8_t)),
121 &kc_info->blob_fields, (uint)(table_share->fields * sizeof (uint32_t)),
122 NullS);
123 if (kc_info->multi_ptr == NULL) {
124 error = ENOMEM;
125 goto exit;
126 }
127 exit:
128 if (error) {
129 for (uint i = 0; MAX_KEY + 1; i++) {
130 bitmap_free(&kc_info->key_filters[i]);
131 }
132 tokudb::memory::free(kc_info->multi_ptr);
133 }
134 return error;
135 }
136
free_key_and_col_info(KEY_AND_COL_INFO * kc_info)137 static void free_key_and_col_info (KEY_AND_COL_INFO* kc_info) {
138 for (uint i = 0; i < MAX_KEY+1; i++) {
139 bitmap_free(&kc_info->key_filters[i]);
140 }
141
142 for (uint i = 0; i < MAX_KEY+1; i++) {
143 tokudb::memory::free(kc_info->cp_info[i]);
144 kc_info->cp_info[i] = NULL; // 3144
145 }
146
147 tokudb::memory::free(kc_info->multi_ptr);
148 kc_info->field_types = NULL;
149 kc_info->field_lengths = NULL;
150 kc_info->length_bytes = NULL;
151 kc_info->blob_fields = NULL;
152 }
153
154
// One-time process initialization: create the mutex that guards the
// global open-table map. Must run before any share is opened.
void TOKUDB_SHARE::static_init() {
    assert_always(_open_tables.size() == 0);
    _open_tables_mutex = new tokudb::thread::mutex_t();
}
static_destroy()159 void TOKUDB_SHARE::static_destroy() {
160 for (auto it = _open_tables.cbegin(); it != _open_tables.cend(); it++) {
161 TOKUDB_TRACE("_open_tables %s %p", it->first.c_str(), it->second);
162 TOKUDB_SHARE* share = it->second;
163 share->destroy();
164 delete share;
165 }
166 _open_tables.clear();
167 assert_always(_open_tables.size() == 0);
168 delete _open_tables_mutex;
169 }
get_state_string(share_state_t state)170 const char* TOKUDB_SHARE::get_state_string(share_state_t state) {
171 static const char* state_string[] = {
172 "CLOSED",
173 "OPENED",
174 "ERROR"
175 };
176 assert_always(state == CLOSED || state == OPENED || state == ERROR);
177 return state_string[state];
178 }
// Route share allocation through the tokudb memory subsystem.
// MY_ZEROFILL means a freshly new'd share starts fully zeroed, which
// get_share() relies on for NULL-initialized pointers.
void* TOKUDB_SHARE::operator new(size_t sz) {
    return tokudb::memory::malloc(sz, MYF(MY_WME|MY_ZEROFILL|MY_FAE));
}
void TOKUDB_SHARE::operator delete(void* p) { tokudb::memory::free(p); }
// Only the PFS-instrumented locks are constructed here; all other members
// are initialized later by init().
TOKUDB_SHARE::TOKUDB_SHARE()
    : _num_DBs_lock(num_DBs_lock_key), _mutex(ha_tokudb_mutex_key) {}
// Initialize a freshly created share for 'table_name'. Resets the usage
// counter and state, and splits the name into database/table components
// via tokudb_split_dname (table_name appears to be the server's internal
// "./db/table" path form — confirm against callers).
void TOKUDB_SHARE::init(const char* table_name) {
    _use_count = 0;
    thr_lock_init(&_thr_lock);
    _state = CLOSED;
    _row_delta_activity = 0;
    _allow_auto_analysis = true;

    _full_table_name.append(table_name);

    // dictionary-name component is not needed here, only db/table parts
    String tmp_dictionary_name;
    tokudb_split_dname(
        table_name,
        _database_name,
        _table_name,
        tmp_dictionary_name);

    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
// Tear down a share that is no longer referenced. The share must be
// unused (_use_count == 0) and either cleanly CLOSED or in ERROR state;
// anything else is a refcounting bug and aborts.
void TOKUDB_SHARE::destroy() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    assert_always(_use_count == 0);
    assert_always(
        _state == TOKUDB_SHARE::CLOSED || _state == TOKUDB_SHARE::ERROR);
    thr_lock_delete(&_thr_lock);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
// Look up — and, if create_new, create — the share for 'table_name' in
// the global open-table map, under _open_tables_mutex. On success the
// share's refcount has been bumped via addref(); NULL is returned only
// when the share does not exist and create_new is false. A non-NULL
// 'data' is registered with the share's THR_LOCK.
TOKUDB_SHARE* TOKUDB_SHARE::get_share(const char* table_name,
                                      THR_LOCK_DATA* data,
                                      bool create_new) {
    std::string find_table_name(table_name);
    mutex_t_lock(*_open_tables_mutex);
    auto it = _open_tables.find(find_table_name);
    TOKUDB_SHARE *share = nullptr;
    if (it != _open_tables.end()) {
        share = it->second;
        // the map key and the share's own name must agree
        assert_always(strcmp(table_name, share->full_table_name()) == 0);
    }
    TOKUDB_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_SHARE,
        "existing share[%s] %s:share[%p]",
        table_name,
        share == NULL ? "not found" : "found",
        share);

    if (!share) {
        if (create_new == false)
            goto exit;  // no share and not allowed to create: return NULL
        // create share and fill it with all zeroes
        // hence, all pointers are initialized to NULL
        share = new TOKUDB_SHARE;
        assert_always(share);

        share->init(table_name);

        _open_tables.insert({find_table_name, share});
    }

    share->addref();

    if (data)
        thr_lock_data_init(&(share->_thr_lock), data, NULL);

exit:
    mutex_t_unlock(*_open_tables_mutex);
    return share;
}
// Remove 'share' from the global open-table map and destroy/delete it.
// The share must currently be present in the map (n == 1 asserted);
// destruction happens while still holding _open_tables_mutex so no other
// thread can find the share mid-teardown.
void TOKUDB_SHARE::drop_share(TOKUDB_SHARE* share) {
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_SHARE,
                           "share[%p]:file[%s]:state[%s]:use_count[%d]",
                           share,
                           share->_full_table_name.ptr(),
                           get_state_string(share->_state),
                           share->_use_count);

    mutex_t_lock(*_open_tables_mutex);
    size_t n = _open_tables.erase(std::string(share->full_table_name()));
    assert_always(n == 1);
    share->destroy();
    delete share;
    mutex_t_unlock(*_open_tables_mutex);
}
// Bump the share's reference count and return its current state.
// NOTE(review): lock() is taken here and deliberately NOT released —
// the caller appears responsible for calling unlock() after inspecting
// the returned state; confirm against call sites (e.g. get_share()).
TOKUDB_SHARE::share_state_t TOKUDB_SHARE::addref() {
    TOKUDB_SHARE_TRACE_FOR_FLAGS((TOKUDB_DEBUG_ENTER & TOKUDB_DEBUG_SHARE),
                                 "file[%s]:state[%s]:use_count[%d]",
                                 _full_table_name.ptr(),
                                 get_state_string(_state),
                                 _use_count);

    lock();
    _use_count++;

    return _state;
}
// Drop one reference to the share. When the last reference is released
// and the share is OPENED, close every open DB dictionary and the status
// block, free cardinality/key metadata, and move the share back to
// CLOSED. Returns 0, or the last dictionary close() error.
int TOKUDB_SHARE::release() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    int error, result = 0;

    mutex_t_lock(_mutex);
    assert_always(_use_count != 0);
    _use_count--;
    if (_use_count == 0 && _state == TOKUDB_SHARE::OPENED) {
        // number of open DB's may not be equal to number of keys we have
        // because add_index may have added some. So, we loop through entire
        // array and close any non-NULL value. It is imperative that we reset
        // a DB to NULL once we are done with it.
        for (uint i = 0; i < sizeof(key_file)/sizeof(key_file[0]); i++) {
            if (key_file[i]) {
                TOKUDB_TRACE_FOR_FLAGS(
                    TOKUDB_DEBUG_OPEN,
                    "dbclose:%p",
                    key_file[i]);
                error = key_file[i]->close(key_file[i], 0);
                assert_always(error == 0);
                // NOTE(review): only reachable if assert_always is compiled
                // out in this build; otherwise a failing close aborts above.
                if (error) {
                    result = error;
                }
                if (key_file[i] == file)
                    file = NULL;
                key_file[i] = NULL;
            }
        }

        error = tokudb::metadata::close(&status_block);
        assert_always(error == 0);

        free_key_and_col_info(&kc_info);

        // release cached cardinality statistics
        if (_rec_per_key) {
            tokudb::memory::free(_rec_per_key);
            _rec_per_key = NULL;
            _rec_per_keys = 0;
        }

        // release per-key descriptor names, then the descriptor array
        for (uint i = 0; i < _keys; i++) {
            tokudb::memory::free(_key_descriptors[i]._name);
        }
        tokudb::memory::free(_key_descriptors);
        _keys = _max_key_parts = 0; _key_descriptors = NULL;

        _state = TOKUDB_SHARE::CLOSED;
    }
    mutex_t_unlock(_mutex);

    TOKUDB_SHARE_DBUG_RETURN(result);
}
// Fold a statement's row changes into the share's row-count estimate and
// delta-activity counter, and trigger an automatic ANALYZE once the
// accumulated change exceeds tokudb_auto_analyze percent of _rows.
void TOKUDB_SHARE::update_row_count(
    THD* thd,
    uint64_t added,
    uint64_t deleted,
    uint64_t updated) {

    uint64_t delta = added + deleted + updated;
    lock();
    // clamp at zero instead of letting the unsigned estimate underflow
    if (deleted > added && _rows < (deleted - added)) {
        _rows = 0;
    } else {
        _rows += added - deleted;
    }
    _row_delta_activity += delta;
    // avoid the counter sticking at all-ones when it wraps
    if (_row_delta_activity == (uint64_t)~0)
        _row_delta_activity = 1;

    ulonglong auto_threshold = tokudb::sysvars::auto_analyze(thd);
    if (delta && auto_threshold > 0 && _allow_auto_analysis) {
        ulonglong pct_of_rows_changed_to_trigger;
        pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100);
        if (_row_delta_activity >= pct_of_rows_changed_to_trigger) {
            // build the log message before analyze_standard touches state
            char msg[200];
            snprintf(msg,
                     sizeof(msg),
                     "TokuDB: Auto %s analysis for %s, delta_activity %llu is "
                     "greater than %llu percent of %llu rows.",
                     tokudb::sysvars::analyze_in_background(thd) > 0
                         ? "scheduling background"
                         : "running foreground",
                     full_table_name(),
                     _row_delta_activity,
                     auto_threshold,
                     (ulonglong)(_rows));

            // analyze_standard will unlock _mutex regardless of success/failure
            int ret = analyze_standard(thd, NULL);
            if (ret == 0) {
                sql_print_information("%s - succeeded.", msg);
            } else {
                sql_print_information(
                    "%s - failed, likely a job already running.",
                    msg);
            }
        }
    }
    unlock();
}
// Publish the share's cached cardinality statistics (_rec_per_key) into
// the server's per-key rec_per_key arrays on 'table', scaled by
// tokudb_cardinality_scale_percent. Values are clamped to a minimum of 1,
// and the last part of a unique key is always 1 by definition.
void TOKUDB_SHARE::set_cardinality_counts_in_table(TABLE* table) {
    lock();
    // flat cursor over _rec_per_key, which is laid out key-by-key
    uint32_t next_key_part = 0;
    for (uint32_t i = 0; i < table->s->keys; i++) {
        KEY* key = &table->key_info[i];
        bool is_unique_key =
            (i == table->s->primary_key) || (key->flags & HA_NOSAME);

        for (uint32_t j = 0; j < get_ext_key_parts(key); j++) {
            if (j >= key->user_defined_key_parts) {
                // MySQL 'hidden' keys, really needs deeper investigation
                // into MySQL hidden keys vs TokuDB hidden keys
                key->rec_per_key[j] = 1;
                continue;
            }

            assert_always(next_key_part < _rec_per_keys);
            ulong val = _rec_per_key[next_key_part++];
            val = (val * tokudb::sysvars::cardinality_scale_percent) / 100;
            if (val == 0 || _rows == 0 ||
                (is_unique_key && j == get_ext_key_parts(key) - 1)) {
                val = 1;
            }
            key->rec_per_key[j] = val;
        }
    }
    unlock();
}
418
// Guard used by cursor-based handler methods: if the cursor has been
// invalidated, record the last cursor error and bail to the function's
// local 'cleanup' label. NOTE: expands to a bare if-statement, so take
// care with surrounding if/else (dangling-else hazard).
#define HANDLE_INVALID_CURSOR() \
    if (cursor == NULL) { \
        error = last_cursor_error; \
        goto cleanup; \
    }
424
// Engine name reported to the server (e.g. in SHOW TABLE STATUS).
const char *ha_tokudb::table_type() const {
    return tokudb_hton_name;
}

// All TokuDB indexes are presented to the optimizer as BTREE.
const char *ha_tokudb::index_type(TOKUDB_UNUSED(uint inx)) {
    return "BTREE";
}
432
/*
 * returns NULL terminated file extension string
 */
const char **ha_tokudb::bas_ext() const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBUG_RETURN(ha_tokudb_exts);
}
440
// True when the current statement is INSERT IGNORE (IGNORE flag set and
// duplicate handling is plain DUP_ERROR, i.e. not REPLACE/ODKU).
static inline bool is_insert_ignore (THD* thd) {
    //
    // from http://lists.mysql.com/internals/37735
    //
    return thd->lex->ignore && thd->lex->duplicates == DUP_ERROR;
}

// True when the current statement is REPLACE INTO.
static inline bool is_replace_into(THD* thd) {
    return thd->lex->duplicates == DUP_REPLACE;
}
451
do_ignore_flag_optimization(THD * thd,TABLE * table,bool opt_eligible)452 static inline bool do_ignore_flag_optimization(
453 THD* thd,
454 TABLE* table,
455 bool opt_eligible) {
456
457 bool do_opt = false;
458 if (opt_eligible &&
459 (is_replace_into(thd) || is_insert_ignore(thd)) &&
460 tokudb::sysvars::pk_insert_mode(thd) == 1 &&
461 !table->triggers &&
462 !(mysql_bin_log.is_open() &&
463 thd->variables.binlog_format != BINLOG_FORMAT_STMT)) {
464 do_opt = true;
465 }
466 return do_opt;
467 }
468
// Capability flags for this handler; TokuDB supports both row- and
// statement-based binlogging on top of its base flag set.
ulonglong ha_tokudb::table_flags() const {
    return int_table_flags | HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
}
472
//
// Returns a bit mask of capabilities of the key or its part specified by
// the arguments. The capabilities are defined in sql/handler.h.
//
ulong ha_tokudb::index_flags(uint idx,
                             TOKUDB_UNUSED(uint part),
                             TOKUDB_UNUSED(bool all_parts)) const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(table_share);
    ulong flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
                   HA_KEYREAD_ONLY | HA_READ_RANGE | HA_DO_INDEX_COND_PUSHDOWN);
    // clustering keys carry the full row, so advertise that to the optimizer
    if (key_is_clustering(&table_share->key_info[idx])) {
        flags |= HA_CLUSTERED_INDEX;
    }
    DBUG_RETURN(flags);
}
489
490
//
// struct that will be used as a context for smart DBT callbacks
// contains parameters needed to complete the smart DBT cursor call
//
typedef struct smart_dbt_info {
    ha_tokudb* ha; //instance to ha_tokudb needed for reading the row
    uchar* buf; // output buffer where row will be written
    uint keynr; // index into share->key_file that represents DB we are currently operating on
} *SMART_DBT_INFO;

// context for bulk-fetch smart DBT callbacks
typedef struct smart_dbt_bf_info {
    ha_tokudb* ha;        // handler instance performing the scan
    bool need_val;        // whether the callback needs the row value (not just the key)
    int direction;        // scan direction
    THD* thd;             // session the scan runs under
    uchar* buf;           // output buffer for the row
    DBT* key_to_compare;  // key used as the comparison boundary for the fetch
} *SMART_DBT_BF_INFO;

// context for index_read callbacks: wraps smart_dbt_info together with
// the original search key so callbacks can prefix-compare against it
typedef struct index_read_info {
    struct smart_dbt_info smart_dbt_info;
    int cmp;       // prefix-compare result; 0 means the key still matches
    DBT* orig_key; // the key the search started from
} *INDEX_READ_INFO;
515
//
// smart DBT callback function for optimize
// in optimize, we want to flatten DB by doing
// a full table scan. Therefore, we don't
// want to actually do anything with the data, hence
// callback does nothing
//
static int smart_dbt_do_nothing(TOKUDB_UNUSED(DBT const* key),
                                TOKUDB_UNUSED(DBT const* row),
                                TOKUDB_UNUSED(void* context)) {
    return 0;
}
528
// Smart DBT callback for point queries: extract the hidden primary key
// (if any) and unpack the full row into the caller's buffer.
static int
smart_dbt_callback_rowread_ptquery (DBT const *key, DBT const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    return info->ha->read_row_callback(info->buf,info->keynr,row,key);
}
535
//
// Smart DBT callback function in case where we have a covering index
// (the row value is unused — everything comes from the key)
//
static int smart_dbt_callback_keyread(DBT const* key,
                                      DBT TOKUDB_UNUSED(const* row),
                                      void* context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    info->ha->read_key_only(info->buf,info->keynr,key);
    return 0;
}
547
548 //
549 // Smart DBT callback function in case where we do NOT have a covering index
550 //
551 static int
smart_dbt_callback_rowread(DBT const * key,DBT const * row,void * context)552 smart_dbt_callback_rowread(DBT const *key, DBT const *row, void *context) {
553 int error = 0;
554 SMART_DBT_INFO info = (SMART_DBT_INFO)context;
555 info->ha->extract_hidden_primary_key(info->keynr, key);
556 error = info->ha->read_primary_key(info->buf,info->keynr,row,key);
557 return error;
558 }
559
//
// Smart DBT callback function in case where we have a covering index.
// Stops the read (returns 0 without touching the buffer) once the key no
// longer matches the original search prefix.
//
static int smart_dbt_callback_ir_keyread(DBT const* key,
                                         TOKUDB_UNUSED(DBT const* row),
                                         void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_keyread(key, row, &ir_info->smart_dbt_info);
}
574
// Callback that only records whether the found key prefix-matches the
// original search key (in ir_info->cmp); it never reads the row.
static int smart_dbt_callback_lookup(DBT const* key,
                                     TOKUDB_UNUSED(DBT const* row),
                                     void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    return 0;
}
583
584
585 //
586 // Smart DBT callback function in case where we do NOT have a covering index
587 //
588 static int
smart_dbt_callback_ir_rowread(DBT const * key,DBT const * row,void * context)589 smart_dbt_callback_ir_rowread(DBT const *key, DBT const *row, void *context) {
590 INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
591 ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
592 if (ir_info->cmp) {
593 return 0;
594 }
595 return smart_dbt_callback_rowread(key, row, &ir_info->smart_dbt_info);
596 }
597
//
// macro for Smart DBT callback function,
// so we do not need to put this long line of code in multiple places
// (selects the covering-index vs full-row variant of each callback)
//
#define SMART_DBT_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_keyread : smart_dbt_callback_rowread )
#define SMART_DBT_IR_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_ir_keyread : smart_dbt_callback_ir_rowread )

//
// macro that modifies read flag for cursor operations depending on whether
// we have preacquired lock or not
// (relies on local variables range_lock_grabbed / use_write_locks)
//
#define SET_PRELOCK_FLAG(flg) ((flg) | (range_lock_grabbed ? (use_write_locks ? DB_PRELOCKED_WRITE : DB_PRELOCKED) : 0))
610
//
// This method retrieves the value of the auto increment column of a record in MySQL format
// This was basically taken from MyISAM
// Parameters:
//      type - the type of the auto increment column (e.g. int, float, double...)
//      offset - offset into the record where the auto increment column is stored
//      [in]    record - MySQL row whose auto increment value we want to extract
// Returns:
//      The value of the auto increment column in record
//
static ulonglong retrieve_auto_increment(uint16 type, uint32 offset,const uchar *record)
{
    const uchar *key;     /* Key */
    ulonglong   unsigned_autoinc = 0;  /* Unsigned auto-increment */
    longlong      signed_autoinc = 0;  /* Signed auto-increment */
    enum { unsigned_type, signed_type } autoinc_type;
    float float_tmp;   /* Temporary variable */
    double double_tmp; /* Temporary variable */

    key = ((uchar *) record) + offset;

    /* Set default autoincrement type */
    autoinc_type = unsigned_type;

    // decode the column according to its key type, using the korr macros
    // so the result is independent of host byte order
    switch (type) {
    case HA_KEYTYPE_INT8:
        signed_autoinc   = (longlong) *(char*)key;
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_BINARY:
        unsigned_autoinc = (ulonglong) *(uchar*) key;
        break;

    case HA_KEYTYPE_SHORT_INT:
        signed_autoinc   = (longlong) sint2korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_USHORT_INT:
        unsigned_autoinc = (ulonglong) uint2korr(key);
        break;

    case HA_KEYTYPE_LONG_INT:
        signed_autoinc   = (longlong) sint4korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_ULONG_INT:
        unsigned_autoinc = (ulonglong) uint4korr(key);
        break;

    case HA_KEYTYPE_INT24:
        signed_autoinc   = (longlong) sint3korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_UINT24:
        unsigned_autoinc = (ulonglong) tokudb_uint3korr(key);
        break;

    case HA_KEYTYPE_LONGLONG:
        signed_autoinc   = sint8korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_ULONGLONG:
        unsigned_autoinc = uint8korr(key);
        break;

    /* The remaining two cases should not be used but are included for
       compatibility */
    case HA_KEYTYPE_FLOAT:
        float4get(float_tmp, key);  /* Note: float4get is a macro */
        signed_autoinc   = (longlong) float_tmp;
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_DOUBLE:
        float8get(double_tmp, key); /* Note: float8get is a macro */
        signed_autoinc   = (longlong) double_tmp;
        autoinc_type     = signed_type;
        break;

    default:
        assert_unreachable();
    }

    // auto-increment values are never negative; clamp signed results to 0
    if (signed_autoinc < 0) {
        signed_autoinc = 0;
    }

    return autoinc_type == unsigned_type ?
        unsigned_autoinc : (ulonglong) signed_autoinc;
}
706
// Byte offset of 'field' within the MySQL record buffer (record[0]).
static inline ulong field_offset(Field* field, TABLE* table) {
    return((ulong) (field->ptr - table->record[0]));
}
710
tx_to_toku_iso(ulong tx_isolation)711 static inline HA_TOKU_ISO_LEVEL tx_to_toku_iso(ulong tx_isolation) {
712 if (tx_isolation == ISO_READ_UNCOMMITTED) {
713 return hatoku_iso_read_uncommitted;
714 }
715 else if (tx_isolation == ISO_READ_COMMITTED) {
716 return hatoku_iso_read_committed;
717 }
718 else if (tx_isolation == ISO_REPEATABLE_READ) {
719 return hatoku_iso_repeatable_read;
720 }
721 else {
722 return hatoku_iso_serializable;
723 }
724 }
725
toku_iso_to_txn_flag(HA_TOKU_ISO_LEVEL lvl)726 static inline uint32_t toku_iso_to_txn_flag (HA_TOKU_ISO_LEVEL lvl) {
727 if (lvl == hatoku_iso_read_uncommitted) {
728 return DB_READ_UNCOMMITTED;
729 }
730 else if (lvl == hatoku_iso_read_committed) {
731 return DB_READ_COMMITTED;
732 }
733 else if (lvl == hatoku_iso_repeatable_read) {
734 return DB_TXN_SNAPSHOT;
735 }
736 else {
737 return 0;
738 }
739 }
740
filter_key_part_compare(const void * left,const void * right)741 static int filter_key_part_compare (const void* left, const void* right) {
742 FILTER_KEY_PART_INFO* left_part= (FILTER_KEY_PART_INFO *)left;
743 FILTER_KEY_PART_INFO* right_part = (FILTER_KEY_PART_INFO *)right;
744 return left_part->offset - right_part->offset;
745 }
746
//
// Be very careful with parameters passed to this function. Who knows
// if key, table have proper info set. I had to verify by checking
// in the debugger.
//
// Sets a bit in 'key_filter' for every table field that is fully covered
// by 'key' (so the field need not be stored again in the row value).
// Prefix key parts (partial varchar/char coverage) do NOT set the bit,
// because the key holds only a prefix of the column.
void set_key_filter(
    MY_BITMAP* key_filter,
    KEY* key,
    TABLE* table,
    bool get_offset_from_keypart) {

    FILTER_KEY_PART_INFO parts[MAX_REF_PARTS];
    uint curr_skip_index = 0;

    for (uint i = 0; i < key->user_defined_key_parts; i++) {
        //
        // horrendous hack due to bugs in mysql, basically
        // we cannot always reliably get the offset from the same source
        //
        parts[i].offset =
            get_offset_from_keypart ?
                key->key_part[i].offset :
                field_offset(key->key_part[i].field, table);
        parts[i].part_index = i;
    }
    // sort key parts by record offset so they can be merged against the
    // fields below, which are visited in record-offset order
    qsort(
        parts,                          // start of array
        key->user_defined_key_parts,    //num elements
        sizeof(*parts),                 //size of each element
        filter_key_part_compare);

    for (uint i = 0; i < table->s->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (curr_skip_index < key->user_defined_key_parts) {
            uint curr_skip_offset = 0;
            curr_skip_offset = parts[curr_skip_index].offset;
            if (curr_skip_offset == curr_field_offset) {
                //
                // we have hit a field that is a portion of the primary key
                //
                uint curr_key_index = parts[curr_skip_index].part_index;
                curr_skip_index++;
                //
                // only choose to continue over the key if the key's length matches the field's length
                // otherwise, we may have a situation where the column is a varchar(10), the
                // key is only the first 3 characters, and we end up losing the last 7 bytes of the
                // column
                //
                TOKU_TYPE toku_type = mysql_to_toku_type(field);
                switch (toku_type) {
                case toku_type_blob:
                    // blobs are never fully covered by a key part
                    break;
                case toku_type_varbinary:
                case toku_type_varstring:
                case toku_type_fixbinary:
                case toku_type_fixstring:
                    if (key->key_part[curr_key_index].length == field->field_length) {
                        bitmap_set_bit(key_filter,i);
                    }
                    break;
                default:
                    bitmap_set_bit(key_filter,i);
                    break;
                }
            }
        }
    }
}
816
// Copy a fixed-length field of 'num_bytes' bytes from the MySQL row into
// the TokuDB row; returns the advanced destination pointer. The common
// sizes are enumerated so the compiler can emit constant-size memcpy's
// (presumably a codegen optimization); the default arm handles any size.
static inline uchar* pack_fixed_field(
    uchar* to_tokudb,
    const uchar* from_mysql,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_tokudb, from_mysql, 1);
        break;
    case (2):
        memcpy(to_tokudb, from_mysql, 2);
        break;
    case (3):
        memcpy(to_tokudb, from_mysql, 3);
        break;
    case (4):
        memcpy(to_tokudb, from_mysql, 4);
        break;
    case (8):
        memcpy(to_tokudb, from_mysql, 8);
        break;
    default:
        memcpy(to_tokudb, from_mysql, num_bytes);
        break;
    }
    return to_tokudb+num_bytes;
}
845
// Inverse of pack_fixed_field: copy a fixed-length field of 'num_bytes'
// bytes from the TokuDB row back into the MySQL row; returns the
// advanced source pointer. Common sizes are enumerated for the same
// constant-size-memcpy reason as in pack_fixed_field.
static inline const uchar* unpack_fixed_field(
    uchar* to_mysql,
    const uchar* from_tokudb,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_mysql, from_tokudb, 1);
        break;
    case (2):
        memcpy(to_mysql, from_tokudb, 2);
        break;
    case (3):
        memcpy(to_mysql, from_tokudb, 3);
        break;
    case (4):
        memcpy(to_mysql, from_tokudb, 4);
        break;
    case (8):
        memcpy(to_mysql, from_tokudb, 8);
        break;
    default:
        memcpy(to_mysql, from_tokudb, num_bytes);
        break;
    }
    return from_tokudb+num_bytes;
}
874
// Write one variable-length field into the TokuDB row: copies the data
// and records its END offset (relative to to_tokudb_offset_start) into
// the offset table, using 1 or 2 bytes per entry. Returns the data write
// position advanced past this field.
static inline uchar* write_var_field(
    uchar* to_tokudb_offset_ptr, //location where offset data is going to be written
    uchar* to_tokudb_data, // location where data is going to be written
    uchar* to_tokudb_offset_start, //location where offset starts, IS THIS A BAD NAME????
    const uchar * data, // the data to write
    uint32_t data_length, // length of data to write
    uint32_t offset_bytes // number of offset bytes
    )
{
    memcpy(to_tokudb_data, data, data_length);
    //
    // for offset, we pack the offset where the data ENDS!
    //
    uint32_t offset = to_tokudb_data + data_length - to_tokudb_offset_start;
    switch(offset_bytes) {
    case (1):
        to_tokudb_offset_ptr[0] = (uchar)offset;
        break;
    case (2):
        int2store(to_tokudb_offset_ptr,offset);
        break;
    default:
        // offset tables only ever use 1 or 2 bytes per entry
        assert_unreachable();
        break;
    }
    return to_tokudb_data + data_length;
}
902
get_var_data_length(const uchar * from_mysql,uint32_t mysql_length_bytes)903 static inline uint32_t get_var_data_length(
904 const uchar * from_mysql,
905 uint32_t mysql_length_bytes
906 )
907 {
908 uint32_t data_length;
909 switch(mysql_length_bytes) {
910 case(1):
911 data_length = from_mysql[0];
912 break;
913 case(2):
914 data_length = uint2korr(from_mysql);
915 break;
916 default:
917 assert_unreachable();
918 }
919 return data_length;
920 }
921
// Pack one variable-length MySQL field into the TokuDB row: decodes the
// MySQL length prefix, then delegates to write_var_field() to copy the
// payload and record its end-offset. Returns the advanced data pointer.
static inline uchar* pack_var_field(
    uchar* to_tokudb_offset_ptr, //location where offset data is going to be written
    uchar* to_tokudb_data, // pointer to where tokudb data should be written
    uchar* to_tokudb_offset_start, //location where data starts, IS THIS A BAD NAME????
    const uchar * from_mysql, // mysql data
    uint32_t mysql_length_bytes, //number of bytes used to store length in from_mysql
    uint32_t offset_bytes //number of offset_bytes used in tokudb row
    )
{
    uint data_length = get_var_data_length(from_mysql, mysql_length_bytes);
    // payload starts right after the MySQL length prefix
    return write_var_field(
        to_tokudb_offset_ptr,
        to_tokudb_data,
        to_tokudb_offset_start,
        from_mysql + mysql_length_bytes,
        data_length,
        offset_bytes
        );
}
941
// Unpack one variable-length field from the TokuDB row into MySQL format:
// writes the 1- or 2-byte MySQL length prefix, then the payload bytes.
static inline void unpack_var_field(
    uchar* to_mysql,
    const uchar* from_tokudb_data,
    uint32_t from_tokudb_data_len,
    uint32_t mysql_length_bytes
    )
{
    //
    // store the length
    //
    switch (mysql_length_bytes) {
    case(1):
        to_mysql[0] = (uchar)from_tokudb_data_len;
        break;
    case(2):
        int2store(to_mysql, from_tokudb_data_len);
        break;
    default:
        assert_unreachable();
    }
    //
    // store the data
    //
    memcpy(to_mysql+mysql_length_bytes, from_tokudb_data, from_tokudb_data_len);
}
967
// Pack a blob field into the TokuDB row. MySQL stores a blob as a 1-4
// byte length prefix followed by a POINTER to the data; this copies the
// prefix, reads the pointer, then inlines the pointed-to bytes. Returns
// the destination pointer advanced past prefix + payload.
static uchar* pack_toku_field_blob(
    uchar* to_tokudb,
    const uchar* from_mysql,
    Field* field
    )
{
    uint32_t len_bytes = field->row_pack_length();
    uint32_t length = 0;
    uchar* data_ptr = NULL;
    memcpy(to_tokudb, from_mysql, len_bytes);

    // decode the blob length from its variable-width prefix
    switch (len_bytes) {
    case (1):
        length = (uint32_t)(*from_mysql);
        break;
    case (2):
        length = uint2korr(from_mysql);
        break;
    case (3):
        length = tokudb_uint3korr(from_mysql);
        break;
    case (4):
        length = uint4korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }

    if (length > 0) {
        // the bytes after the prefix hold a pointer to the blob payload
        memcpy((uchar *)(&data_ptr), from_mysql + len_bytes, sizeof(uchar*));
        memcpy(to_tokudb + len_bytes, data_ptr, length);
    }
    return (to_tokudb + len_bytes + length);
}
1002
create_tokudb_trx_data_instance(tokudb_trx_data ** out_trx)1003 static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) {
1004 int error;
1005 tokudb_trx_data* trx = (tokudb_trx_data *) tokudb::memory::malloc(
1006 sizeof(*trx),
1007 MYF(MY_ZEROFILL));
1008 if (!trx) {
1009 error = ENOMEM;
1010 goto cleanup;
1011 }
1012
1013 *out_trx = trx;
1014 error = 0;
1015 cleanup:
1016 return error;
1017 }
1018
// Row-generation callback helper: build the key (and, for clustering
// dictionaries, the value) that dest_db needs from a primary row
// (src_key, src_val).
//
// The packing recipe lives in dest_db's descriptor blob: the first uint32
// is an offset to the key description, which itself starts with a uint32
// total size (including the 4 size bytes) followed by the description
// proper.  A second description (for the clustering value) follows the
// first.
//
// dest_key/dest_val buffers are managed with DB_DBT_REALLOC; a zero flags
// field means "not yet initialized by us".
//
// Returns 0 (the packing helpers assert internally on failure).
static inline int tokudb_generate_row(DB* dest_db,
                                      TOKUDB_UNUSED(DB* src_db),
                                      DBT* dest_key,
                                      DBT* dest_val,
                                      const DBT* src_key,
                                      const DBT* src_val) {
    int error;

    DB* curr_db = dest_db;
    uchar* row_desc = NULL;
    uint32_t desc_size;
    uchar* buff = NULL;
    uint32_t max_key_len = 0;

    // locate the key description inside the descriptor blob
    row_desc = (uchar *)curr_db->descriptor->dbt.data;
    row_desc += (*(uint32_t *)row_desc);
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;

    if (is_key_pk(row_desc)) {
        // primary key: no repacking needed — hand the source DBTs through.
        // First release any buffer we own from a previous DB_DBT_REALLOC
        // use, since we are about to point at memory we do not own
        // (flags reset to 0 below signals that).
        if (dest_key->flags == DB_DBT_REALLOC && dest_key->data != NULL) {
            free(dest_key->data);
        }
        if (dest_val != NULL) {
            if (dest_val->flags == DB_DBT_REALLOC && dest_val->data != NULL) {
                free(dest_val->data);
            }
        }
        dest_key->data = src_key->data;
        dest_key->size = src_key->size;
        dest_key->flags = 0;
        if (dest_val != NULL) {
            dest_val->data = src_val->data;
            dest_val->size = src_val->size;
            dest_val->flags = 0;
        }
        error = 0;
        goto cleanup;
    }
    // at this point, we need to create the key/val and set it
    // in the DBTs
    if (dest_key->flags == 0) {
        // first use of this DBT: take ownership of its buffer
        dest_key->ulen = 0;
        dest_key->size = 0;
        dest_key->data = NULL;
        dest_key->flags = DB_DBT_REALLOC;
    }
    if (dest_key->flags == DB_DBT_REALLOC) {
        // grow the buffer to the worst-case packed key size
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;

        if (max_key_len > dest_key->ulen) {
            void* old_ptr = dest_key->data;
            void* new_ptr = NULL;
            new_ptr = realloc(old_ptr, max_key_len);
            assert_always(new_ptr);
            dest_key->data = new_ptr;
            dest_key->ulen = max_key_len;
        }

        buff = (uchar *)dest_key->data;
        assert_always(buff != nullptr);
        assert_always(max_key_len > 0);
    } else {
        // any other flags value would mean the caller set up the DBT in a
        // way this generator does not support
        assert_unreachable();
    }

    dest_key->size = pack_key_from_desc(buff, row_desc, desc_size, src_key,
                                        src_val);
    assert_always(dest_key->ulen >= dest_key->size);
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY)) &&
        !max_key_len) {
        // debug-only recomputation when the fast path above was skipped
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;
    }
    if (max_key_len) {
        assert_always(max_key_len >= dest_key->size);
    }

    // advance to the second (clustering value) description
    row_desc += desc_size;
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;
    if (dest_val != NULL) {
        if (!is_key_clustering(desc_size) || src_val->size == 0) {
            // non-clustering secondary keys carry an empty value
            dest_val->size = 0;
        } else {
            uchar* buff = NULL;
            if (dest_val->flags == 0) {
                dest_val->ulen = 0;
                dest_val->size = 0;
                dest_val->data = NULL;
                dest_val->flags = DB_DBT_REALLOC;
            }
            if (dest_val->flags == DB_DBT_REALLOC){
                // clustering value is a subset of the primary value, so
                // src_val->size is an upper bound on the buffer we need
                if (dest_val->ulen < src_val->size) {
                    void* old_ptr = dest_val->data;
                    void* new_ptr = NULL;
                    new_ptr = realloc(old_ptr, src_val->size);
                    assert_always(new_ptr);
                    dest_val->data = new_ptr;
                    dest_val->ulen = src_val->size;
                }
                buff = (uchar *)dest_val->data;
                assert_always(buff != NULL);
            } else {
                assert_unreachable();
            }
            dest_val->size = pack_clustering_val_from_desc(
                buff,
                row_desc,
                desc_size,
                src_val);
            assert_always(dest_val->ulen >= dest_val->size);
        }
    }
    error = 0;
cleanup:
    return error;
}
1138
generate_row_for_del(DB * dest_db,DB * src_db,DBT_ARRAY * dest_key_arrays,const DBT * src_key,const DBT * src_val)1139 static int generate_row_for_del(
1140 DB *dest_db,
1141 DB *src_db,
1142 DBT_ARRAY *dest_key_arrays,
1143 const DBT *src_key,
1144 const DBT *src_val
1145 )
1146 {
1147 DBT* dest_key = &dest_key_arrays->dbts[0];
1148 return tokudb_generate_row(
1149 dest_db,
1150 src_db,
1151 dest_key,
1152 NULL,
1153 src_key,
1154 src_val
1155 );
1156 }
1157
1158
generate_row_for_put(DB * dest_db,DB * src_db,DBT_ARRAY * dest_key_arrays,DBT_ARRAY * dest_val_arrays,const DBT * src_key,const DBT * src_val)1159 static int generate_row_for_put(
1160 DB *dest_db,
1161 DB *src_db,
1162 DBT_ARRAY *dest_key_arrays,
1163 DBT_ARRAY *dest_val_arrays,
1164 const DBT *src_key,
1165 const DBT *src_val
1166 )
1167 {
1168 DBT* dest_key = &dest_key_arrays->dbts[0];
1169 DBT *dest_val = (dest_val_arrays == NULL) ? NULL : &dest_val_arrays->dbts[0];
1170 return tokudb_generate_row(
1171 dest_db,
1172 src_db,
1173 dest_key,
1174 dest_val,
1175 src_key,
1176 src_val
1177 );
1178 }
1179
// Handler constructor: establishes the capability flags advertised to the
// server and zero/default-initializes every per-handler member.  No engine
// resources are acquired here; that happens in open().
ha_tokudb::ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg):handler(hton, table_arg) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    share = NULL;
    // capabilities this storage engine reports to the optimizer/executor
    int_table_flags = HA_REC_NOT_IN_SEQ | HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS
        | HA_PRIMARY_KEY_IN_READ_INDEX | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
        | HA_FILE_BASED | HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX
        | HA_CAN_WRITE_DURING_OPTIMIZE | HA_ONLINE_ANALYZE;
    // buffers allocated lazily in open()
    alloc_ptr = NULL;
    rec_buff = NULL;
    rec_update_buff = NULL;
    transaction = NULL;
    cursor = NULL;
    // per-query column projection state
    fixed_cols_for_query = NULL;
    var_cols_for_query = NULL;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;
    unpack_entire_row = true;
    read_blobs = false;
    read_key = false;
    // statement row counters
    added_rows = 0;
    deleted_rows = 0;
    updated_rows = 0;
    last_dup_key = UINT_MAX;
    using_ignore = false;
    using_ignore_no_key = false;
    last_cursor_error = 0;
    range_lock_grabbed = false;
    blob_buff = NULL;
    num_blob_bytes = 0;
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    // DBT arrays used for multi-dictionary put/del operations
    memset(mult_key_dbt_array, 0, sizeof(mult_key_dbt_array));
    memset(mult_rec_dbt_array, 0, sizeof(mult_rec_dbt_array));
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_key_dbt_array[i], 1);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_rec_dbt_array[i], 1);
    }
    loader = NULL;
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    lock.type = TL_IGNORE;
    // default flags for every possible dictionary (keys + hidden pk)
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_del_flags[i] = DB_DELETE_ANY;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    use_write_locks = false;
    // bulk-fetch range query buffer state
    range_query_buff = NULL;
    size_range_query_buff = 0;
    bytes_used_in_range_query_buff = 0;
    curr_range_query_buff_offset = 0;
    doing_bulk_fetch = false;
    prelocked_left_range_size = 0;
    prelocked_right_range_size = 0;
    tokudb_active_index = MAX_KEY;
    invalidate_icp();
    trx_handler_list.data = this;
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    // read-free-replication row-event state
    in_rpl_write_rows = in_rpl_delete_rows = in_rpl_update_rows = false;
#endif  // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
1246
~ha_tokudb()1247 ha_tokudb::~ha_tokudb() {
1248 TOKUDB_HANDLER_DBUG_ENTER("");
1249 for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
1250 toku_dbt_array_destroy(&mult_key_dbt_array[i]);
1251 }
1252 for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
1253 toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
1254 }
1255 TOKUDB_HANDLER_DBUG_VOID_RETURN;
1256 }
1257
1258 //
1259 // states if table has an auto increment column, if so, sets index where auto inc column is to index
1260 // Parameters:
1261 // [out] index - if auto inc exists, then this param is set to where it exists in table, if not, then unchanged
1262 // Returns:
1263 // true if auto inc column exists, false otherwise
1264 //
has_auto_increment_flag(uint * index)1265 bool ha_tokudb::has_auto_increment_flag(uint* index) {
1266 //
1267 // check to see if we have auto increment field
1268 //
1269 bool ai_found = false;
1270 uint ai_index = 0;
1271 for (uint i = 0; i < table_share->fields; i++, ai_index++) {
1272 Field* field = table->field[i];
1273 if (field->flags & AUTO_INCREMENT_FLAG) {
1274 ai_found = true;
1275 *index = ai_index;
1276 break;
1277 }
1278 }
1279 return ai_found;
1280 }
1281
open_status_dictionary(DB ** ptr,const char * name,DB_TXN * txn)1282 static int open_status_dictionary(DB** ptr, const char* name, DB_TXN* txn) {
1283 int error;
1284 char* newname = NULL;
1285 size_t newname_len = get_max_dict_name_path_length(name);
1286 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
1287 if (newname == NULL) {
1288 error = ENOMEM;
1289 goto cleanup;
1290 }
1291 make_name(newname, newname_len, name, "status");
1292 TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "open:%s", newname);
1293
1294 error = tokudb::metadata::open(db_env, ptr, newname, txn);
1295 cleanup:
1296 tokudb::memory::free(newname);
1297 return error;
1298 }
1299
//
// Create and open the "main" (primary) dictionary for this table, caching
// the handle in both share->file and share->key_file[primary_key].
// Parameters:
//  [in] name - table name used to build the dictionary path
//       is_read_only - open with DB_RDONLY when true
//  [in] txn - transaction to open under
// Returns 0 on success; on failure any partially created handle is closed
// and the share pointers are reset to NULL.
//
int ha_tokudb::open_main_dictionary(
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error;
    char* newname = NULL;
    size_t newname_len = 0;
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;

    // must not already be open
    assert_always(share->file == NULL);
    assert_always(share->key_file[primary_key] == NULL);
    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(
        newname_len,
        MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto exit;
    }
    make_name(newname, newname_len, name, "main");

    error = db_create(&share->file, db_env, 0);
    if (error) {
        goto exit;
    }
    // the primary key shares the main dictionary's handle
    share->key_file[primary_key] = share->file;

    error =
        share->file->open(
            share->file,
            txn,
            newname,
            NULL,
            DB_BTREE,
            open_flags,
            S_IWUSR);
    if (error) {
        goto exit;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        share->file);

    error = 0;
exit:
    if (error) {
        // unwind: close and clear anything created above so a retry starts
        // from a clean state
        if (share->file) {
            int r = share->file->close(
                share->file,
                0
                );
            assert_always(r==0);
            share->file = NULL;
            share->key_file[primary_key] = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}
1363
1364 //
1365 // Open a secondary table, the key will be a secondary index, the data will
1366 // be a primary key
1367 //
// Open one secondary-index dictionary ("key-<index name>") for this table.
// Parameters:
//  [out] ptr - receives the opened DB handle
//  [in]  key_info - the MySQL key this dictionary backs
//  [in]  name - table name used to build the dictionary path
//        is_read_only - open with DB_RDONLY when true
//  [in]  txn - transaction to open under
// Returns 0 on success; on failure the handle is closed, *ptr is reset to
// NULL, and my_errno carries the error for legacy callers.
int ha_tokudb::open_secondary_dictionary(
    DB** ptr,
    KEY* key_info,
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error = ENOSYS;
    char dict_name[MAX_DICT_NAME_LEN];
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;
    char* newname = NULL;
    size_t newname_len = 0;

    // secondary dictionaries are named after the MySQL index
    sprintf(dict_name, "key-%s", key_info->name.str);

    newname_len = get_max_dict_name_path_length(name);
    newname =
        (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, dict_name);


    if ((error = db_create(ptr, db_env, 0))) {
        my_errno = error;
        goto cleanup;
    }


    error = (*ptr)->open(*ptr, txn, newname, NULL, DB_BTREE, open_flags, S_IWUSR);
    if (error) {
        my_errno = error;
        goto cleanup;
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        *ptr);
cleanup:
    if (error) {
        // unwind a partially created handle so the caller sees NULL
        if (*ptr) {
            int r = (*ptr)->close(*ptr, 0);
            assert_always(r==0);
            *ptr = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}
1420
// Build the per-key column packing info (cp_info) for key `keynr`:
// for every field NOT covered by the key filter, record either its fixed
// offset within the packed fixed-field area or its ordinal among the
// variable-length fields.  Also computes the key's multi-column pack info
// (total fixed-field size and offset-table length).
// Returns 0 on success or ENOMEM.
static int initialize_col_pack_info(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    int error = ENOSYS;
    //
    // set up the cp_info
    //
    assert_always(kc_info->cp_info[keynr] == NULL);
    kc_info->cp_info[keynr] = (COL_PACK_INFO*)tokudb::memory::malloc(
        table_share->fields * sizeof(COL_PACK_INFO),
        MYF(MY_WME | MY_ZEROFILL));
    if (kc_info->cp_info[keynr] == NULL) {
        error = ENOMEM;
        goto exit;
    }
    {
        uint32_t curr_fixed_offset = 0;
        uint32_t curr_var_index = 0;
        for (uint j = 0; j < table_share->fields; j++) {
            COL_PACK_INFO* curr = &kc_info->cp_info[keynr][j];
            //
            // need to set the offsets / indexes
            // offsets are calculated AFTER the NULL bytes
            //
            if (!bitmap_is_set(&kc_info->key_filters[keynr],j)) {
                // fixed fields get a byte offset; variable fields get an
                // index into the offset table
                if (is_fixed_field(kc_info, j)) {
                    curr->col_pack_val = curr_fixed_offset;
                    curr_fixed_offset += kc_info->field_lengths[j];
                }
                else if (is_variable_field(kc_info, j)) {
                    curr->col_pack_val = curr_var_index;
                    curr_var_index++;
                }
            }
        }

        //
        // set up the mcp_info
        //
        kc_info->mcp_info[keynr].fixed_field_size = get_fixed_field_size(
            kc_info,
            table_share,
            keynr
            );
        kc_info->mcp_info[keynr].len_of_offsets = get_len_of_offsets(
            kc_info,
            table_share,
            keynr
            );

        error = 0;
    }
exit:
    return error;
}
1474
1475 // reset the kc_info state at keynr
reset_key_and_col_info(KEY_AND_COL_INFO * kc_info,uint keynr)1476 static void reset_key_and_col_info(KEY_AND_COL_INFO *kc_info, uint keynr) {
1477 bitmap_clear_all(&kc_info->key_filters[keynr]);
1478 tokudb::memory::free(kc_info->cp_info[keynr]);
1479 kc_info->cp_info[keynr] = NULL;
1480 kc_info->mcp_info[keynr] = (MULTI_COL_PACK_INFO) { 0, 0 };
1481 }
1482
// Populate the KEY_AND_COL_INFO for a table: classify every field as
// fixed / variable / blob, size the row-format offset entries, and build
// the per-key field filters plus packing info for the primary key and all
// clustering keys.
// Parameters:
//  [in]  table_share / table - MySQL table metadata
//  [out] kc_info - structure being initialized (must be pre-allocated)
//        hidden_primary_key - nonzero when the table has no declared PK
//        primary_key - index number of the PK (== table_share->keys when hidden)
// Returns 0 on success, else an error from initialize_col_pack_info.
static int initialize_key_and_col_info(
    TABLE_SHARE* table_share,
    TABLE* table,
    KEY_AND_COL_INFO* kc_info,
    uint hidden_primary_key,
    uint primary_key) {

    int error = 0;
    uint32_t curr_blob_field_index = 0;
    uint32_t max_var_bytes = 0;
    //
    // fill in the field lengths. 0 means it is a variable sized field length
    // fill in length_bytes, 0 means it is fixed or blob
    //
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table_share->field[i];
        TOKU_TYPE toku_type = mysql_to_toku_type(field);
        uint32 pack_length = 0;
        switch (toku_type) {
        case toku_type_int:
        case toku_type_double:
        case toku_type_float:
        case toku_type_fixbinary:
        case toku_type_fixstring:
            pack_length = field->pack_length();
            // field_lengths is 16 bits wide, so the pack length must fit
            assert_always(pack_length < 1<<16);
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD;
            kc_info->field_lengths[i] = (uint16_t)pack_length;
            kc_info->length_bytes[i] = 0;
            break;
        case toku_type_blob:
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] = 0;
            kc_info->blob_fields[curr_blob_field_index] = i;
            curr_blob_field_index++;
            break;
        case toku_type_varstring:
        case toku_type_varbinary:
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] =
                (uchar)((Field_varstring*)field)->length_bytes;
            // track total variable bytes to size the offset entries below
            max_var_bytes += field->field_length;
            break;
        default:
            assert_unreachable();
        }
    }
    kc_info->num_blobs = curr_blob_field_index;

    //
    // initialize share->num_offset_bytes
    // because MAX_REF_LENGTH is 65536, we
    // can safely set num_offset_bytes to 1 or 2
    //
    if (max_var_bytes < 256) {
        kc_info->num_offset_bytes = 1;
    } else {
        kc_info->num_offset_bytes = 2;
    }

    // one extra iteration covers the hidden primary key when present
    for (uint i = 0;
         i < table_share->keys + tokudb_test(hidden_primary_key);
         i++) {
        //
        // do the cluster/primary key filtering calculations
        //
        // a hidden PK covers no declared fields, so it gets no filter
        if (!(i==primary_key && hidden_primary_key)) {
            if (i == primary_key) {
                set_key_filter(
                    &kc_info->key_filters[primary_key],
                    &table_share->key_info[primary_key],
                    table,
                    true);
            } else {
                set_key_filter(
                    &kc_info->key_filters[i],
                    &table_share->key_info[i],
                    table,
                    true);
                // secondary rows also omit the PK fields (stored in the key)
                if (!hidden_primary_key) {
                    set_key_filter(
                        &kc_info->key_filters[i],
                        &table_share->key_info[primary_key],
                        table,
                        true);
                }
            }
        }
        // packing info is only needed where a row value is stored: the
        // primary dictionary and clustering secondaries
        if (i == primary_key || key_is_clustering(&table_share->key_info[i])) {
            error = initialize_col_pack_info(kc_info, table_share, i);
            if (error) {
                goto exit;
            }
        }
    }
exit:
    return error;
}
1583
can_replace_into_be_fast(TABLE_SHARE * table_share,KEY_AND_COL_INFO * kc_info,uint pk)1584 bool ha_tokudb::can_replace_into_be_fast(
1585 TABLE_SHARE* table_share,
1586 KEY_AND_COL_INFO* kc_info,
1587 uint pk) {
1588
1589 uint curr_num_DBs = table_share->keys + tokudb_test(hidden_primary_key);
1590 bool ret_val;
1591 if (curr_num_DBs == 1) {
1592 ret_val = true;
1593 goto exit;
1594 }
1595 ret_val = true;
1596 for (uint curr_index = 0; curr_index < table_share->keys; curr_index++) {
1597 if (curr_index == pk) continue;
1598 KEY* curr_key_info = &table_share->key_info[curr_index];
1599 for (uint i = 0; i < curr_key_info->user_defined_key_parts; i++) {
1600 uint16 curr_field_index = curr_key_info->key_part[i].field->field_index;
1601 if (!bitmap_is_set(&kc_info->key_filters[curr_index],curr_field_index)) {
1602 ret_val = false;
1603 goto exit;
1604 }
1605 if (bitmap_is_set(&kc_info->key_filters[curr_index], curr_field_index) &&
1606 !bitmap_is_set(&kc_info->key_filters[pk], curr_field_index)) {
1607 ret_val = false;
1608 goto exit;
1609 }
1610
1611 }
1612 }
1613 exit:
1614 return ret_val;
1615 }
1616
// One-time initialization of the TOKUDB_SHARE for this table: verify the
// on-disk metadata version (and frm image where enabled), build the key/
// column packing info, open the main and secondary dictionaries, compute
// ref_length, and load row-count / auto-increment / cardinality state.
// Runs under the share lock held by open(); uses the caller's DDL
// transaction when one exists, otherwise a private one committed at exit.
// Returns 0 on success, else an error code (partially opened state is left
// for the share's error teardown).
int ha_tokudb::initialize_share(const char* name, int mode) {
    int error = 0;
    uint64_t num_rows = 0;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    THD* thd = ha_thd();
    tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
    // during CREATE TABLE piggyback on the statement's sub transaction;
    // otherwise run under a private txn committed at exit
    if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
        txn = trx->sub_sp_level;
    }
    else {
        do_commit = true;
        error = txn_begin(db_env, 0, &txn, 0, thd);
        if (error) { goto exit; }
    }


    error = get_status(txn);
    if (error) {
        goto exit;
    }
    // refuse to open tables written by an incompatible metadata version
    if (share->version != HA_TOKU_VERSION) {
        error = ENOSYS;
        goto exit;
    }

#if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
#if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
    // verify frm data for non-partitioned tables
    if (TOKU_PARTITION_WRITE_FRM_DATA || table->part_info == NULL) {
        error = verify_frm_data(table->s->path.str, txn);
        if (error)
            goto exit;
    } else {
        // remove the frm data for partitions since we are not maintaining it
        error = remove_frm_data(share->status_block, txn);
        if (error)
            goto exit;
    }
#else
    error = verify_frm_data(table->s->path.str, txn);
    if (error)
        goto exit;
#endif  // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
#endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA

    error =
        initialize_key_and_col_info(
            table_share,
            table,
            &share->kc_info,
            hidden_primary_key,
            primary_key);
    if (error) { goto exit; }

    error = open_main_dictionary(name, mode == O_RDONLY, txn);
    if (error) {
        goto exit;
    }

    share->has_unique_keys = false;
    share->_keys = table_share->keys;
    share->_max_key_parts = table_share->key_parts;
    // NOTE(review): this malloc result is not NULL-checked before the
    // dereferences below — confirm whether MY_ZEROFILL malloc can fail here
    share->_key_descriptors =
        (TOKUDB_SHARE::key_descriptor_t*)tokudb::memory::malloc(
            sizeof(TOKUDB_SHARE::key_descriptor_t) * share->_keys,
            MYF(MY_ZEROFILL));

    /* Open other keys;  These are part of the share structure */
    for (uint i = 0; i < table_share->keys; i++) {
        share->_key_descriptors[i]._parts =
            table_share->key_info[i].user_defined_key_parts;
        if (i == primary_key) {
            share->_key_descriptors[i]._is_unique = true;
            share->_key_descriptors[i]._name = tokudb::memory::strdup("primary", 0);
        } else {
            share->_key_descriptors[i]._is_unique = false;
            share->_key_descriptors[i]._name =
                tokudb::memory::strdup(table_share->key_info[i].name.str, 0);
        }

        if (table_share->key_info[i].flags & HA_NOSAME) {
            share->_key_descriptors[i]._is_unique = true;
            share->has_unique_keys = true;
        }
        if (i != primary_key) {
            error =
                open_secondary_dictionary(
                    &share->key_file[i],
                    &table_share->key_info[i],
                    name,
                    mode == O_RDONLY,
                    txn);
            if (error) {
                goto exit;
            }
        }
    }
    share->replace_into_fast =
        can_replace_into_be_fast(
            table_share,
            &share->kc_info,
            primary_key);

    share->pk_has_string = false;
    if (!hidden_primary_key) {
        //
        // We need to set the ref_length to start at 5, to account for
        // the "infinity byte" in keys, and for placing the DBT size in the first four bytes
        //
        ref_length = sizeof(uint32_t) + sizeof(uchar);
        KEY_PART_INFO* key_part = table->key_info[primary_key].key_part;
        KEY_PART_INFO* end =
            key_part + table->key_info[primary_key].user_defined_key_parts;
        for (; key_part != end; key_part++) {
            ref_length += key_part->field->max_packed_col_length(key_part->length);
            TOKU_TYPE toku_type = mysql_to_toku_type(key_part->field);
            // string-typed PK parts disable some optimizations elsewhere
            if (toku_type == toku_type_fixstring ||
                toku_type == toku_type_varstring ||
                toku_type == toku_type_blob
                )
            {
                share->pk_has_string = true;
            }
        }
        share->status |= STATUS_PRIMARY_KEY_INIT;
    }
    share->ref_length = ref_length;

    error = estimate_num_rows(share->file, &num_rows, txn);
    //
    // estimate_num_rows should not fail under normal conditions
    //
    if (error == 0) {
        share->set_row_count(num_rows, true);
    } else {
        goto exit;
    }
    //
    // initialize auto increment data
    //
    share->has_auto_inc = has_auto_increment_flag(&share->ai_field_index);
    if (share->has_auto_inc) {
        init_auto_increment();
    }

    // an empty table is eligible for an optimistic whole-table lock on the
    // first bulk operation
    if (may_table_be_empty(txn)) {
        share->try_table_lock = true;
    } else {
        share->try_table_lock = false;
    }

    share->num_DBs = table_share->keys + tokudb_test(hidden_primary_key);

    init_hidden_prim_key_info(txn);

    // initialize cardinality info from the status dictionary
    {
        uint32_t rec_per_keys = tokudb::compute_total_key_parts(table_share);
        uint64_t* rec_per_key =
            (uint64_t*)tokudb::memory::malloc(
                rec_per_keys * sizeof(uint64_t),
                MYF(MY_FAE));
        error =
            tokudb::get_card_from_status(
                share->status_block,
                txn,
                rec_per_keys,
                rec_per_key);
        if (error) {
            // no stored cardinality yet: fall back to all-zero stats
            memset(rec_per_key, 0, sizeof(ulonglong) * rec_per_keys);
        }
        share->init_cardinality_counts(rec_per_keys, rec_per_key);
    }

    error = 0;
exit:
    if (do_commit && txn) {
        commit_txn(txn,0);
    }
    return error;
}
1799
1800 //
1801 // Creates and opens a handle to a table which already exists in a tokudb
1802 // database.
1803 // Parameters:
1804 // [in] name - table name
1805 // mode - seems to specify if table is read only
1806 // test_if_locked - unused
1807 // Returns:
1808 // 0 on success
1809 // 1 on error
1810 //
//
// Creates and opens a handle to a table which already exists in a tokudb
// database.
// Parameters:
//      [in]    name - table name
//              mode - seems to specify if table is read only
//              test_if_locked - unused
// Returns:
//      0 on success
//      1 on error
//
int ha_tokudb::open(const char *name, int mode, uint test_if_locked) {
    TOKUDB_HANDLER_DBUG_ENTER("%s %o %u", name, mode, test_if_locked);
    THD* thd = ha_thd();

    int error = 0;
    int ret_val = 0;

    transaction = NULL;
    cursor = NULL;


    /* Open primary key */
    hidden_primary_key = 0;
    if ((primary_key = table_share->primary_key) >= MAX_KEY) {
        // No primary key: the engine maintains a hidden one whose slot is
        // placed just past the declared keys
        primary_key = table_share->keys;
        key_used_on_scan = MAX_KEY;
        hidden_primary_key = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        ref_length = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t);
    }
    else {
        key_used_on_scan = primary_key;
    }

    /* Need some extra memory in case of packed keys */
    // the "+ 1" is for the first byte that states +/- infinity
    // multiply everything by 2 to account for clustered keys having a key and primary key together
    max_key_length = 2*(table_share->max_key_length + MAX_REF_PARTS * 3 + sizeof(uchar));
    // one allocation backs all per-handler buffers; freed via alloc_ptr
    alloc_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME),
        &key_buff, max_key_length,
        &key_buff2, max_key_length,
        &key_buff3, max_key_length,
        &key_buff4, max_key_length,
        &prelocked_left_range, max_key_length,
        &prelocked_right_range, max_key_length,
        &primary_key_buff, (hidden_primary_key ? 0 : max_key_length),
        &fixed_cols_for_query, table_share->fields*sizeof(uint32_t),
        &var_cols_for_query, table_share->fields*sizeof(uint32_t),
        NullS);
    if (alloc_ptr == NULL) {
        ret_val = 1;
        goto exit;
    }

    size_range_query_buff = tokudb::sysvars::read_buf_size(thd);
    range_query_buff =
        (uchar*)tokudb::memory::malloc(size_range_query_buff, MYF(MY_WME));
    if (range_query_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_rec_buff_length = table_share->rec_buff_length +
        table_share->fields;
    rec_buff = (uchar *) tokudb::memory::malloc(
        alloced_rec_buff_length,
        MYF(MY_WME));
    if (rec_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_update_rec_buff_length = alloced_rec_buff_length;
    rec_update_buff = (uchar*)tokudb::memory::malloc(
        alloced_update_rec_buff_length,
        MYF(MY_WME));
    if (rec_update_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    // lookup or create share
    share = TOKUDB_SHARE::get_share(name, &lock, true);
    assert_always(share);

    if (share->state() != TOKUDB_SHARE::OPENED) {
        // means we're responsible for the transition to OPENED, ERROR or CLOSED

        ret_val = allocate_key_and_col_info(table_share, &share->kc_info);
        if (ret_val == 0) {
            ret_val = initialize_share(name, mode);
        }

        if (ret_val == 0) {
            share->set_state(TOKUDB_SHARE::OPENED);
        } else {
            // publish the failure so concurrent openers do not reuse a
            // half-initialized share
            free_key_and_col_info(&share->kc_info);
            share->set_state(TOKUDB_SHARE::ERROR);
        }
        share->unlock();
    } else {
        // got an already OPENED instance
        share->unlock();
    }

    if (share->state() == TOKUDB_SHARE::ERROR) {
        share->release();
        goto exit;
    }

    assert_always(share->state() == TOKUDB_SHARE::OPENED);

    ref_length = share->ref_length;     // If second open

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "tokudbopen:%p:share=%p:file=%p:table=%p:table->s=%p:%d",
        this,
        share,
        share->file,
        table,
        table->s,
        share->use_count());

    key_read = false;
    stats.block_size = 1<<20;    // QQQ Tokudb DB block size

    info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

exit:
    if (ret_val) {
        // failed open: free everything allocated above
        tokudb::memory::free(range_query_buff);
        range_query_buff = NULL;
        tokudb::memory::free(alloc_ptr);
        alloc_ptr = NULL;
        tokudb::memory::free(rec_buff);
        rec_buff = NULL;
        tokudb::memory::free(rec_update_buff);
        rec_update_buff = NULL;

        if (error) {
            my_errno = error;
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(ret_val);
}
1948
1949 //
1950 // estimate the number of rows in a DB
1951 // Parameters:
1952 // [in] db - DB whose number of rows will be estimated
1953 // [out] num_rows - number of estimated rows in db
1954 // Returns:
1955 // 0 on success
1956 // error otherwise
1957 //
estimate_num_rows(DB * db,uint64_t * num_rows,DB_TXN * txn)1958 int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) {
1959 int error = ENOSYS;
1960 bool do_commit = false;
1961 DB_BTREE_STAT64 dict_stats;
1962 DB_TXN* txn_to_use = NULL;
1963
1964 if (txn == NULL) {
1965 error = txn_begin(db_env, 0, &txn_to_use, DB_READ_UNCOMMITTED, ha_thd());
1966 if (error) goto cleanup;
1967 do_commit = true;
1968 }
1969 else {
1970 txn_to_use = txn;
1971 }
1972
1973 error = db->stat64(db, txn_to_use, &dict_stats);
1974 if (error) { goto cleanup; }
1975
1976 *num_rows = dict_stats.bt_ndata;
1977 error = 0;
1978 cleanup:
1979 if (do_commit) {
1980 commit_txn(txn_to_use, 0);
1981 txn_to_use = NULL;
1982 }
1983 return error;
1984 }
1985
1986
write_to_status(DB * db,HA_METADATA_KEY curr_key_data,void * data,uint size,DB_TXN * txn)1987 int ha_tokudb::write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn ){
1988 return write_metadata(db, &curr_key_data, sizeof curr_key_data, data, size, txn);
1989 }
1990
remove_from_status(DB * db,HA_METADATA_KEY curr_key_data,DB_TXN * txn)1991 int ha_tokudb::remove_from_status(DB *db, HA_METADATA_KEY curr_key_data, DB_TXN *txn) {
1992 return remove_metadata(db, &curr_key_data, sizeof curr_key_data, txn);
1993 }
1994
remove_metadata(DB * db,void * key_data,uint key_size,DB_TXN * transaction)1995 int ha_tokudb::remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction){
1996 int error;
1997 DBT key;
1998 DB_TXN* txn = NULL;
1999 bool do_commit = false;
2000 //
2001 // transaction to be used for putting metadata into status.tokudb
2002 //
2003 if (transaction == NULL) {
2004 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
2005 if (error) {
2006 goto cleanup;
2007 }
2008 do_commit = true;
2009 }
2010 else {
2011 txn = transaction;
2012 }
2013
2014 memset(&key, 0, sizeof(key));
2015 key.data = key_data;
2016 key.size = key_size;
2017 error = db->del(db, txn, &key, DB_DELETE_ANY);
2018 if (error) {
2019 goto cleanup;
2020 }
2021
2022 error = 0;
2023 cleanup:
2024 if (do_commit && txn) {
2025 if (!error) {
2026 commit_txn(txn, DB_TXN_NOSYNC);
2027 }
2028 else {
2029 abort_txn(txn);
2030 }
2031 }
2032 return error;
2033 }
2034
2035 //
2036 // helper function to write a piece of metadata in to status.tokudb
2037 //
write_metadata(DB * db,void * key_data,uint key_size,void * val_data,uint val_size,DB_TXN * transaction)2038 int ha_tokudb::write_metadata(DB* db, void* key_data, uint key_size, void* val_data, uint val_size, DB_TXN* transaction ){
2039 int error;
2040 DBT key;
2041 DBT value;
2042 DB_TXN* txn = NULL;
2043 bool do_commit = false;
2044 //
2045 // transaction to be used for putting metadata into status.tokudb
2046 //
2047 if (transaction == NULL) {
2048 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
2049 if (error) {
2050 goto cleanup;
2051 }
2052 do_commit = true;
2053 }
2054 else {
2055 txn = transaction;
2056 }
2057
2058 memset(&key, 0, sizeof(key));
2059 memset(&value, 0, sizeof(value));
2060 key.data = key_data;
2061 key.size = key_size;
2062 value.data = val_data;
2063 value.size = val_size;
2064 error = db->put(db, txn, &key, &value, 0);
2065 if (error) {
2066 goto cleanup;
2067 }
2068
2069 error = 0;
2070 cleanup:
2071 if (do_commit && txn) {
2072 if (!error) {
2073 commit_txn(txn, DB_TXN_NOSYNC);
2074 }
2075 else {
2076 abort_txn(txn);
2077 }
2078 }
2079 return error;
2080 }
2081
2082 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
// Store this table's frm (table definition) image in status.tokudb under
// the hatoku_frm_data key, so later opens can detect schema changes.
int ha_tokudb::write_frm_data(DB* db, DB_TXN* txn, const char* frm_name) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %p %s", db, txn, frm_name);

    uchar* frm_data = NULL;
    size_t frm_len = 0;
    int error = 0;

    // Obtain the frm image: MariaDB 10.x exposes it via the table share,
    // older servers read it from the .frm file on disk.
#if 100000 <= MYSQL_VERSION_ID
    error = table_share->read_frm_image((const uchar**)&frm_data,&frm_len);
    if (error) { goto cleanup; }
#else
    error = readfrm(frm_name,&frm_data,&frm_len);
    if (error) { goto cleanup; }
#endif

    error = write_to_status(db,hatoku_frm_data,frm_data,(uint)frm_len, txn);
    if (error) { goto cleanup; }

    error = 0;
cleanup:
    // frm_data was allocated by read_frm_image/readfrm; free unconditionally
    tokudb::memory::free(frm_data);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
2106
// Delete the stored frm image (hatoku_frm_data key) from the status dictionary.
int ha_tokudb::remove_frm_data(DB *db, DB_TXN *txn) {
    return remove_from_status(db, hatoku_frm_data, txn);
}
2110
smart_dbt_callback_verify_frm(TOKUDB_UNUSED (DBT const * key),DBT const * row,void * context)2111 static int smart_dbt_callback_verify_frm(TOKUDB_UNUSED(DBT const* key),
2112 DBT const* row,
2113 void* context) {
2114 DBT* stored_frm = (DBT *)context;
2115 stored_frm->size = row->size;
2116 stored_frm->data = (uchar *)tokudb::memory::malloc(row->size, MYF(MY_WME));
2117 assert_always(stored_frm->data);
2118 memcpy(stored_frm->data, row->data, row->size);
2119 return 0;
2120 }
2121
// Compare the frm image MySQL currently has for this table against the
// copy stored in status.tokudb.
// Returns:
//      0 if they match (or none was stored yet, in which case the current
//      image is written), HA_ERR_TABLE_DEF_CHANGED on mismatch, other
//      nonzero on error.
int ha_tokudb::verify_frm_data(const char* frm_name, DB_TXN* txn) {
    TOKUDB_HANDLER_DBUG_ENTER("%s", frm_name);
    uchar* mysql_frm_data = NULL;
    size_t mysql_frm_len = 0;
    DBT key = {};
    DBT stored_frm = {};
    int error = 0;
    HA_METADATA_KEY curr_key = hatoku_frm_data;

    // get the frm data from MySQL
#if 100000 <= MYSQL_VERSION_ID
    error = table_share->read_frm_image((const uchar**)&mysql_frm_data,&mysql_frm_len);
    if (error) {
        goto cleanup;
    }
#else
    error = readfrm(frm_name,&mysql_frm_data,&mysql_frm_len);
    if (error) {
        goto cleanup;
    }
#endif

    // fetch the stored copy; the callback mallocs stored_frm.data for us
    key.data = &curr_key;
    key.size = sizeof(curr_key);
    error = share->status_block->getf_set(
        share->status_block,
        txn,
        0,
        &key,
        smart_dbt_callback_verify_frm,
        &stored_frm
        );
    if (error == DB_NOTFOUND) {
        // if not found, write it
        error = write_frm_data(share->status_block, txn, frm_name);
        goto cleanup;
    } else if (error) {
        goto cleanup;
    }

    // byte-compare the stored image against the live one
    if (stored_frm.size != mysql_frm_len || memcmp(stored_frm.data, mysql_frm_data, stored_frm.size)) {
        error = HA_ERR_TABLE_DEF_CHANGED;
        goto cleanup;
    }

    error = 0;
cleanup:
    tokudb::memory::free(mysql_frm_data);
    tokudb::memory::free(stored_frm.data);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
2173 #endif // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
2174
//
// Updates status.tokudb with a new max value used for the auto increment column
// Parameters:
//      [in]    db - this will always be status.tokudb
//      val - the new maximum auto-increment value to persist
// Returns:
//      0 on success, error otherwise
//
// Passes a NULL transaction, so write_to_status runs in its own txn.
//
int ha_tokudb::update_max_auto_inc(DB* db, ulonglong val){
    return write_to_status(db,hatoku_max_ai,&val,sizeof(val), NULL);
}
2187
//
// Writes the initial auto increment value, as specified by create table
// so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
// then the value 100 will be stored here in val
// Parameters:
//      [in]    db - this will always be status.tokudb
//      val - value to store
//      [in]    txn - transaction under which the status write is performed
// Returns:
//      0 on success, error otherwise
//
//
int ha_tokudb::write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn){
    return write_to_status(db,hatoku_ai_create_value,&val,sizeof(val), txn);
}
2202
2203
//
// Closes a handle to a table.
// Thin wrapper: all teardown work is done by __close().
//
int ha_tokudb::close() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int r = __close();
    TOKUDB_HANDLER_DBUG_RETURN(r);
}
2212
// Free all per-handler buffers and DBT arrays, reset handler state, and
// drop this handler's reference on the shared TOKUDB_SHARE.
int ha_tokudb::__close() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "close:%p", this);
    tokudb::memory::free(rec_buff);
    tokudb::memory::free(rec_update_buff);
    tokudb::memory::free(blob_buff);
    tokudb::memory::free(alloc_ptr);
    tokudb::memory::free(range_query_buff);
    // destroy every element of the key/record DBT arrays
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_key_dbt_array[i]);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
    }
    // NULL the freed pointers so a later free cannot double-free
    rec_buff = NULL;
    rec_update_buff = NULL;
    alloc_ptr = NULL;
    ha_tokudb::reset();
    // releasing the share may destroy it if this was the last reference
    int retval = share->release();
    TOKUDB_HANDLER_DBUG_RETURN(retval);
}
2234
2235 //
2236 // Reallocate record buffer (rec_buff) if needed
2237 // If not needed, does nothing
2238 // Parameters:
2239 // length - size of buffer required for rec_buff
2240 //
fix_rec_buff_for_blob(ulong length)2241 bool ha_tokudb::fix_rec_buff_for_blob(ulong length) {
2242 if (!rec_buff || (length > alloced_rec_buff_length)) {
2243 uchar* newptr = (uchar*)tokudb::memory::realloc(
2244 (void*)rec_buff,
2245 length,
2246 MYF(MY_ALLOW_ZERO_PTR));
2247 if (!newptr)
2248 return 1;
2249 rec_buff = newptr;
2250 alloced_rec_buff_length = length;
2251 }
2252 return 0;
2253 }
2254
2255 //
2256 // Reallocate record buffer (rec_buff) if needed
2257 // If not needed, does nothing
2258 // Parameters:
2259 // length - size of buffer required for rec_buff
2260 //
fix_rec_update_buff_for_blob(ulong length)2261 bool ha_tokudb::fix_rec_update_buff_for_blob(ulong length) {
2262 if (!rec_update_buff || (length > alloced_update_rec_buff_length)) {
2263 uchar* newptr = (uchar*)tokudb::memory::realloc(
2264 (void*)rec_update_buff,
2265 length,
2266 MYF(MY_ALLOW_ZERO_PTR));
2267 if (!newptr)
2268 return 1;
2269 rec_update_buff= newptr;
2270 alloced_update_rec_buff_length = length;
2271 }
2272 return 0;
2273 }
2274
/* Calculate max length needed for row */
// Upper bound on the packed size of `buf`: fixed reclength plus two bytes
// of per-field overhead, plus the actual stored length of each blob.
ulong ha_tokudb::max_row_length(const uchar * buf) {
    ulong length = table_share->reclength + table_share->fields * 2;
    uint *ptr, *end;
    // walk the share's array of blob field indexes
    for (ptr = table_share->blob_field, end = ptr + table_share->blob_fields; ptr != end; ptr++) {
        Field_blob *blob = ((Field_blob *) table->field[*ptr]);
        length += blob->get_length((uchar *) (buf + field_offset(blob, table))) + 2;
    }
    return length;
}
2285
//
// Pack a row for storage.
// If the row is of fixed length, just store the row 'as is'.
// If not, we will generate a packed row suitable for storage.
// This will only fail if we don't have enough memory to pack the row,
// which may only happen in rows with blobs, as the default row length is
// pre-allocated.
// Parameters:
//      [out]   row - DBT filled in to point at the packed row
//      [in]    record - row in MySQL format
//      index - which dictionary's key/column info (kc_info) to pack for
//      [out]   row_buff - buffer where the packed row is written
// Returns:
//      0 on success (currently the only outcome reachable here)
//
int ha_tokudb::pack_row_in_buff(
    DBT * row,
    const uchar* record,
    uint index,
    uchar* row_buff
    )
{
    uchar* fixed_field_ptr = NULL;
    uchar* var_field_offset_ptr = NULL;
    uchar* start_field_data_ptr = NULL;
    uchar* var_field_data_ptr = NULL;
    int r = ENOSYS;
    memset((void *) row, 0, sizeof(*row));

    // temporarily allow access to all columns regardless of the bitmaps
    MY_BITMAP *old_map = dbug_tmp_use_all_columns(table, &table->write_set);

    // Copy null bytes
    // packed layout: null bytes | fixed fields | var-field offsets | var data | blobs
    memcpy(row_buff, record, table_share->null_bytes);
    fixed_field_ptr = row_buff + table_share->null_bytes;
    var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
    start_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
    var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;

    // assert that when the hidden primary key exists, primary_key_offsets is NULL
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        // fields filtered for this index are not stored in the row
        if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
            continue;
        }
        if (is_fixed_field(&share->kc_info, i)) {
            fixed_field_ptr = pack_fixed_field(
                fixed_field_ptr,
                record + curr_field_offset,
                share->kc_info.field_lengths[i]
                );
        }
        else if (is_variable_field(&share->kc_info, i)) {
            var_field_data_ptr = pack_var_field(
                var_field_offset_ptr,
                var_field_data_ptr,
                start_field_data_ptr,
                record + curr_field_offset,
                share->kc_info.length_bytes[i],
                share->kc_info.num_offset_bytes
                );
            var_field_offset_ptr += share->kc_info.num_offset_bytes;
        }
    }

    // blobs are appended after all variable-length field data
    for (uint i = 0; i < share->kc_info.num_blobs; i++) {
        Field* field = table->field[share->kc_info.blob_fields[i]];
        var_field_data_ptr = pack_toku_field_blob(
            var_field_data_ptr,
            record + field_offset(field, table),
            field
            );
    }

    row->data = row_buff;
    row->size = (size_t) (var_field_data_ptr - row_buff);
    r = 0;

    dbug_tmp_restore_column_map(&table->write_set, old_map);
    return r;
}
2368
2369
// Pack `record` into this handler's pre-allocated rec_buff.
int ha_tokudb::pack_row(
    DBT * row,
    const uchar* record,
    uint index
    )
{
    return pack_row_in_buff(row,record,index,rec_buff);
}
2378
// Pack `record` into the separate rec_update_buff, used for the old image
// of a row during an update (so rec_buff can hold the new image).
int ha_tokudb::pack_old_row_for_update(
    DBT * row,
    const uchar* record,
    uint index
    )
{
    return pack_row_in_buff(row,record,index,rec_update_buff);
}
2387
2388
//
// Unpack the blob section of a packed row into `record`.
// Parameters:
//      [out]   record - row in MySQL format receiving the blob data
//      [in]    from_tokudb_blob - start of the packed blob bytes
//      num_bytes - size of the packed blob section
//      check_bitmap - if true, blobs not in the read/write sets are skipped
// Returns:
//      0 on success, ENOMEM if blob_buff cannot grow, negative sentinel
//      values (-3000000 / -4000000) if the packed data fails bounds checks
//
int ha_tokudb::unpack_blobs(
    uchar* record,
    const uchar* from_tokudb_blob,
    uint32_t num_bytes,
    bool check_bitmap
    )
{
    uint error = 0;
    uchar* ptr = NULL;
    const uchar* buff = NULL;
    //
    // assert that num_bytes > 0 iff share->num_blobs > 0
    //
    assert_always( !((share->kc_info.num_blobs == 0) && (num_bytes > 0)) );
    // grow the persistent blob buffer if this row's blob section is larger
    if (num_bytes > num_blob_bytes) {
        ptr = (uchar*)tokudb::memory::realloc(
            (void*)blob_buff, num_bytes,
            MYF(MY_ALLOW_ZERO_PTR));
        if (ptr == NULL) {
            error = ENOMEM;
            goto exit;
        }
        blob_buff = ptr;
        num_blob_bytes = num_bytes;
    }

    // copy into blob_buff so record's blob pointers stay valid after the
    // source row buffer is reused
    memcpy(blob_buff, from_tokudb_blob, num_bytes);
    buff= blob_buff;
    for (uint i = 0; i < share->kc_info.num_blobs; i++) {
        uint32_t curr_field_index = share->kc_info.blob_fields[i];
        bool skip = check_bitmap ?
            !(bitmap_is_set(table->read_set,curr_field_index) ||
                bitmap_is_set(table->write_set,curr_field_index)) :
            false;
        Field* field = table->field[curr_field_index];
        uint32_t len_bytes = field->row_pack_length();
        const uchar* end_buff = unpack_toku_field_blob(
            record + field_offset(field, table),
            buff,
            len_bytes,
            skip
            );
        // verify that the pointers to the blobs are all contained within the blob_buff
        if (!(blob_buff <= buff && end_buff <= blob_buff + num_bytes)) {
            error = -3000000;
            goto exit;
        }
        buff = end_buff;
    }
    // verify that the entire blob buffer was parsed
    if (share->kc_info.num_blobs > 0 && !(num_bytes > 0 && buff == blob_buff + num_bytes)) {
        error = -4000000;
        goto exit;
    }

    error = 0;
exit:
    return error;
}
2448
//
// take the row passed in as a DBT*, and convert it into a row in MySQL format in record
// Parameters:
//      [out]   record - row in MySQL format
//      [in]    row - row stored in DBT to be converted
//      [in]    key - the key this row was fetched with; supplies the key
//              columns, which are not repeated in the packed row
//      index - index into key_file the row and key belong to
// Returns:
//      0 on success, error otherwise
//
int ha_tokudb::unpack_row(
    uchar* record,
    DBT const *row,
    DBT const *key,
    uint index
    )
{
    //
    // two cases, fixed length row, and variable length row
    // fixed length row is first below
    //
    /* Copy null bits */
    int error = 0;
    const uchar* fixed_field_ptr = (const uchar *) row->data;
    const uchar* var_field_offset_ptr = NULL;
    const uchar* var_field_data_ptr = NULL;
    uint32_t data_end_offset = 0;
    memcpy(record, fixed_field_ptr, table_share->null_bytes);
    fixed_field_ptr += table_share->null_bytes;

    // packed layout: null bytes | fixed fields | var-field offsets | var data | blobs
    var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
    var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;

    //
    // unpack the key, if necessary
    //
    if (!(hidden_primary_key && index == primary_key)) {
        unpack_key(record,key,index);
    }

    uint32_t last_offset = 0;
    //
    // we have two methods of unpacking, one if we need to unpack the entire row
    // the second if we unpack a subset of the entire row
    // first method here is if we unpack the entire row
    //
    if (unpack_entire_row) {
        //
        // fill in parts of record that are not part of the key
        //
        for (uint i = 0; i < table_share->fields; i++) {
            Field* field = table->field[i];
            if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
                continue;
            }

            if (is_fixed_field(&share->kc_info, i)) {
                fixed_field_ptr = unpack_fixed_field(
                    record + field_offset(field, table),
                    fixed_field_ptr,
                    share->kc_info.field_lengths[i]
                    );
            }
            //
            // here, we DO modify var_field_data_ptr or var_field_offset_ptr
            // as we unpack variable sized fields
            //
            else if (is_variable_field(&share->kc_info, i)) {
                // offsets are stored in 1 or 2 bytes each
                switch (share->kc_info.num_offset_bytes) {
                case (1):
                    data_end_offset = var_field_offset_ptr[0];
                    break;
                case (2):
                    data_end_offset = uint2korr(var_field_offset_ptr);
                    break;
                default:
                    assert_unreachable();
                }
                unpack_var_field(
                    record + field_offset(field, table),
                    var_field_data_ptr,
                    data_end_offset - last_offset,
                    share->kc_info.length_bytes[i]
                    );
                var_field_offset_ptr += share->kc_info.num_offset_bytes;
                var_field_data_ptr += data_end_offset - last_offset;
                last_offset = data_end_offset;
            }
        }
        // whatever remains after the var data is the blob section
        error = unpack_blobs(
            record,
            var_field_data_ptr,
            row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
            false
            );
        if (error) {
            goto exit;
        }
    }
    //
    // in this case, we unpack only what is specified
    // in fixed_cols_for_query and var_cols_for_query
    //
    else {
        //
        // first the fixed fields
        //
        for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
            uint field_index = fixed_cols_for_query[i];
            Field* field = table->field[field_index];
            unpack_fixed_field(
                record + field_offset(field, table),
                fixed_field_ptr + share->kc_info.cp_info[index][field_index].col_pack_val,
                share->kc_info.field_lengths[field_index]
                );
        }

        //
        // now the var fields
        // here, we do NOT modify var_field_data_ptr or var_field_offset_ptr
        //
        for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
            uint field_index = var_cols_for_query[i];
            Field* field = table->field[field_index];
            uint32_t var_field_index = share->kc_info.cp_info[index][field_index].col_pack_val;
            uint32_t data_start_offset;
            uint32_t field_len;

            get_var_field_info(
                &field_len,
                &data_start_offset,
                var_field_index,
                var_field_offset_ptr,
                share->kc_info.num_offset_bytes
                );

            unpack_var_field(
                record + field_offset(field, table),
                var_field_data_ptr + data_start_offset,
                field_len,
                share->kc_info.length_bytes[field_index]
                );
        }

        if (read_blobs) {
            //
            // now the blobs
            //
            get_blob_field_info(
                &data_end_offset,
                share->kc_info.mcp_info[index].len_of_offsets,
                var_field_data_ptr,
                share->kc_info.num_offset_bytes
                );

            var_field_data_ptr += data_end_offset;
            error = unpack_blobs(
                record,
                var_field_data_ptr,
                row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
                true
                );
            if (error) {
                goto exit;
            }
        }
    }
    error = 0;
exit:
    return error;
}
2616
//
// Unpack one packed key (at `data`) into `record` in MySQL row format.
// Parameters:
//      [in]    key_info - descriptor of the key being unpacked
//      [out]   record - row in MySQL format (null bits and key columns set)
//      [in]    data - packed key bytes, already past the infinity byte
// Returns:
//      number of bytes of `data` consumed
//
uint32_t ha_tokudb::place_key_into_mysql_buff(
    KEY* key_info,
    uchar* record,
    uchar* data) {

    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    uchar* pos = data;

    for (; key_part != end; key_part++) {
        // nullable parts are prefixed by one marker byte in the packed key
        if (key_part->field->null_bit) {
            uint null_offset = get_null_offset(table, key_part->field);
            if (*pos++ == NULL_COL_VAL) { // Null value
                //
                // We don't need to reset the record data as we will not access it
                // if the null data is set
                //
                record[null_offset] |= key_part->field->null_bit;
                continue;
            }
            record[null_offset] &= ~key_part->field->null_bit;
        }
#if !defined(MARIADB_BASE_VERSION)
        //
        // HOPEFULLY TEMPORARY
        //
        assert_always(table->s->db_low_byte_first);
#endif
        pos = unpack_toku_key_field(
            record + field_offset(key_part->field, table),
            pos,
            key_part->field,
            key_part->length
            );
    }
    return pos-data;
}
2654
//
// Store the key and the primary key into the row
// Parameters:
//      [out]   record - key stored in MySQL format
//      [in]    key - key stored in DBT to be converted
//      index   -index into key_file that represents the DB
//              unpacking a key of
//
void ha_tokudb::unpack_key(uchar * record, DBT const *key, uint index) {
    uint32_t bytes_read;
    // +1 skips the leading "infinity" byte of the packed key
    uchar *pos = (uchar *) key->data + 1;
    bytes_read = place_key_into_mysql_buff(
        &table->key_info[index],
        record,
        pos
        );
    if( (index != primary_key) && !hidden_primary_key) {
        //
        // also unpack primary key, which is appended after the
        // secondary key parts in the packed key
        //
        place_key_into_mysql_buff(
            &table->key_info[primary_key],
            record,
            pos+bytes_read
            );
    }
}
2682
//
// Pack the key columns of `record` into `buff` in TokuDB key format.
// Parameters:
//      [in]    key_info - descriptor of the key to pack
//      [out]   buff - destination buffer for the packed key parts
//      [in]    record - row in MySQL format
//      [out]   has_null - set to true if any packed key part was NULL
//      key_length - budget of key bytes; the loop stops once it is spent
// Returns:
//      number of bytes written to buff
//
uint32_t ha_tokudb::place_key_into_dbt_buff(
    KEY* key_info,
    uchar* buff,
    const uchar* record,
    bool* has_null,
    int key_length) {

    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    uchar* curr_buff = buff;
    *has_null = false;
    for (; key_part != end && key_length > 0; key_part++) {
        //
        // accessing key_part->field->null_bit instead off key_part->null_bit
        // because key_part->null_bit is not set in add_index
        // filed ticket 862 to look into this
        //
        if (key_part->field->null_bit) {
            /* Store 0 if the key part is a NULL part */
            uint null_offset = get_null_offset(table, key_part->field);
            if (record[null_offset] & key_part->field->null_bit) {
                *curr_buff++ = NULL_COL_VAL;
                *has_null = true;
                continue;
            }
            *curr_buff++ = NONNULL_COL_VAL;        // Store NOT NULL marker
        }
#if !defined(MARIADB_BASE_VERSION)
        //
        // HOPEFULLY TEMPORARY
        //
        assert_always(table->s->db_low_byte_first);
#endif
        //
        // accessing field_offset(key_part->field) instead off key_part->offset
        // because key_part->offset is SET INCORRECTLY in add_index
        // filed ticket 862 to look into this
        //
        curr_buff = pack_toku_key_field(
            curr_buff,
            (uchar *) (record + field_offset(key_part->field, table)),
            key_part->field,
            key_part->length
            );
        key_length -= key_part->length;
    }
    return curr_buff - buff;
}
2731
2732
2733
//
// Create a packed key from a row. This key will be written as such
// to the index tree.  This will never fail as the key buffer is pre-allocated.
// Parameters:
//      [out]   key - DBT that holds the key
//      [in]    key_info - holds data about the key, such as it's length and offset into record
//      [out]   buff - buffer that will hold the data for key (unless
//                  we have a hidden primary key)
//      [in]    record - row from which to create the key
//      [out]   has_null - set if any key column was NULL
//      dont_pack_pk - if false, the primary key (or hidden key) is
//                  appended after the key parts
//      key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
//      inf_byte - "infinity" marker written as the first byte of the key
// Returns:
//      the parameter key
//

DBT* ha_tokudb::create_dbt_key_from_key(
    DBT * key,
    KEY* key_info,
    uchar * buff,
    const uchar * record,
    bool* has_null,
    bool dont_pack_pk,
    int key_length,
    uint8_t inf_byte
    )
{
    uint32_t size = 0;
    uchar* tmp_buff = buff;
    // temporarily allow access to all columns regardless of the bitmaps
    MY_BITMAP *old_map = dbug_tmp_use_all_columns(table, &table->write_set);

    key->data = buff;

    //
    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity or zero. For this, because we are creating key
    // from a row, there is no way that columns can be missing, so in practice,
    // this will be meaningless. Might as well put in a value
    //
    *tmp_buff++ = inf_byte;
    size++;
    size += place_key_into_dbt_buff(
        key_info,
        tmp_buff,
        record,
        has_null,
        key_length
        );
    if (!dont_pack_pk) {
        // append the primary key (or hidden key) after the key parts
        tmp_buff = buff + size;
        if (hidden_primary_key) {
            memcpy(tmp_buff, current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
            size += TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        }
        else {
            bool tmp_bool = false;
            size += place_key_into_dbt_buff(
                &table->key_info[primary_key],
                tmp_buff,
                record,
                &tmp_bool,
                MAX_KEY_LENGTH //this parameter does not matter
                );
        }
    }

    key->size = size;
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(&table->write_set, old_map);
    return key;
}
2803
2804
2805 //
2806 // Create a packed key from a row. This key will be written as such
2807 // to the index tree. This will never fail as the key buffer is pre-allocated.
2808 // Parameters:
2809 // [out] key - DBT that holds the key
2810 // keynr - index for which to create the key
2811 // [out] buff - buffer that will hold the data for key (unless
2812 // we have a hidden primary key)
2813 // [in] record - row from which to create the key
2814 // [out] has_null - says if the key has a NULL value for one of its columns
2815 // key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2816 // Returns:
2817 // the parameter key
2818 //
create_dbt_key_from_table(DBT * key,uint keynr,uchar * buff,const uchar * record,bool * has_null,int key_length)2819 DBT *ha_tokudb::create_dbt_key_from_table(
2820 DBT * key,
2821 uint keynr,
2822 uchar * buff,
2823 const uchar * record,
2824 bool* has_null,
2825 int key_length
2826 )
2827 {
2828 TOKUDB_HANDLER_DBUG_ENTER("");
2829 memset((void *) key, 0, sizeof(*key));
2830 if (hidden_primary_key && keynr == primary_key) {
2831 key->data = buff;
2832 memcpy(buff, ¤t_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2833 key->size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2834 *has_null = false;
2835 DBUG_RETURN(key);
2836 }
2837 DBUG_RETURN(create_dbt_key_from_key(key, &table->key_info[keynr],buff,record, has_null, (keynr == primary_key), key_length, COL_ZERO));
2838 }
2839
// Build a lookup key: like create_dbt_key_from_key but with no primary
// key appended and a COL_NEG_INF infinity byte.
DBT* ha_tokudb::create_dbt_key_for_lookup(
    DBT * key,
    KEY* key_info,
    uchar * buff,
    const uchar * record,
    bool* has_null,
    int key_length
    )
{
    TOKUDB_HANDLER_DBUG_ENTER("");
    // override the infinity byte, needed in case the pk is a string
    // to make sure that the cursor that uses this key properly positions
    // it at the right location. If the table stores "D", but we look up for "d",
    // and the infinity byte is 0, then we will skip the "D", because
    // in bytes, "d" > "D".
    DBT* ret = create_dbt_key_from_key(key, key_info, buff, record, has_null, true, key_length, COL_NEG_INF);
    DBUG_RETURN(ret);
}
2858
//
// Create a packed key from from a MySQL unpacked key (like the one that is
// sent from the index_read() This key is to be used to read a row
// Parameters:
//      [out]   key - DBT that holds the key
//      keynr - index for which to pack the key
//      [out]   buff - buffer that will hold the data for key
//      [in]    key_ptr - MySQL unpacked key
//      key_length - length of key_ptr
//      inf_byte - "infinity" marker written as the first byte of the key
// Returns:
//      the parameter key
//
DBT* ha_tokudb::pack_key(
    DBT* key,
    uint keynr,
    uchar* buff,
    const uchar* key_ptr,
    uint key_length,
    int8_t inf_byte) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "key %p %u:%2.2x inf=%d",
        key_ptr,
        key_length,
        key_length > 0 ? key_ptr[0] : 0,
        inf_byte);
#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
    // secondary keys with a real PK may carry extended key parts
    if (keynr != primary_key && !tokudb_test(hidden_primary_key)) {
        DBUG_RETURN(pack_ext_key(key, keynr, buff, key_ptr, key_length, inf_byte));
    }
#endif // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
    KEY* key_info = &table->key_info[keynr];
    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    // temporarily allow access to all columns regardless of the bitmaps
    MY_BITMAP* old_map = dbug_tmp_use_all_columns(table, &table->write_set);

    memset((void *) key, 0, sizeof(*key));
    key->data = buff;

    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity
    *buff++ = (uchar)inf_byte;

    for (; key_part != end && (int) key_length > 0; key_part++) {
        uint offset = 0;
        // MySQL's unpacked key prefixes nullable parts with a flag byte
        if (key_part->null_bit) {
            if (!(*key_ptr == 0)) {
                *buff++ = NULL_COL_VAL;
                key_length -= key_part->store_length;
                key_ptr += key_part->store_length;
                continue;
            }
            *buff++ = NONNULL_COL_VAL;
            offset = 1;         // Data is at key_ptr+1
        }
#if !defined(MARIADB_BASE_VERSION)
        assert_always(table->s->db_low_byte_first);
#endif
        buff = pack_key_toku_key_field(
            buff,
            (uchar *) key_ptr + offset,
            key_part->field,
            key_part->length
            );

        key_ptr += key_part->store_length;
        key_length -= key_part->store_length;
    }

    key->size = (buff - (uchar *) key->data);
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(&table->write_set, old_map);
    DBUG_RETURN(key);
}
2933
2934 #if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
// Pack a secondary-key search key; when key_length extends beyond the
// user-defined SK parts, the remaining primary-key parts of MySQL's
// "extended" key are appended in PK order.
DBT* ha_tokudb::pack_ext_key(
    DBT* key,
    uint keynr,
    uchar* buff,
    const uchar* key_ptr,
    uint key_length,
    int8_t inf_byte) {

    TOKUDB_HANDLER_DBUG_ENTER("");

    // build a list of PK parts that are in the SK. we will use this list to build the
    // extended key if necessary.
    KEY* pk_key_info = &table->key_info[primary_key];
    uint pk_parts = pk_key_info->user_defined_key_parts;
    uint pk_next = 0;
    // variable-length array (gcc extension): one slot per PK part
    struct {
        const uchar *key_ptr;
        KEY_PART_INFO *key_part;
    } pk_info[pk_parts];

    KEY* key_info = &table->key_info[keynr];
    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    // temporarily allow access to all columns regardless of the bitmaps
    MY_BITMAP* old_map = dbug_tmp_use_all_columns(table, &table->write_set);

    memset((void *) key, 0, sizeof(*key));
    key->data = buff;

    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity
    *buff++ = (uchar)inf_byte;

    for (; key_part != end && (int) key_length > 0; key_part++) {
        // if the SK part is part of the PK, then append it to the list.
        if (key_part->field->part_of_key.is_set(primary_key)) {
            assert_always(pk_next < pk_parts);
            pk_info[pk_next].key_ptr = key_ptr;
            pk_info[pk_next].key_part = key_part;
            pk_next++;
        }
        uint offset = 0;
        if (key_part->null_bit) {
            if (!(*key_ptr == 0)) {
                *buff++ = NULL_COL_VAL;
                key_length -= key_part->store_length;
                key_ptr += key_part->store_length;
                continue;
            }
            *buff++ = NONNULL_COL_VAL;
            offset = 1;         // Data is at key_ptr+1
        }
#if !defined(MARIADB_BASE_VERSION)
        assert_always(table->s->db_low_byte_first);
#endif
        buff = pack_key_toku_key_field(
            buff,
            (uchar *) key_ptr + offset,
            key_part->field,
            key_part->length
            );

        key_ptr += key_part->store_length;
        key_length -= key_part->store_length;
    }

    if (key_length > 0) {
        assert_always(key_part == end);
        end = key_info->key_part + get_ext_key_parts(key_info);

        // pack PK in order of PK key parts
        for (uint pk_index = 0;
             key_part != end && (int) key_length > 0 && pk_index < pk_parts;
             pk_index++) {
            uint i;
            // reuse the SK's value of this PK part if we recorded one above
            for (i = 0; i < pk_next; i++) {
                if (pk_info[i].key_part->fieldnr ==
                    pk_key_info->key_part[pk_index].fieldnr)
                    break;
            }
            if (i < pk_next) {
                const uchar *this_key_ptr = pk_info[i].key_ptr;
                KEY_PART_INFO *this_key_part = pk_info[i].key_part;
                buff = pack_key_toku_key_field(
                    buff,
                    (uchar*)this_key_ptr,
                    this_key_part->field,
                    this_key_part->length);
            } else {
                // otherwise consume the next part of the extended key
                buff = pack_key_toku_key_field(
                    buff,
                    (uchar*)key_ptr,
                    key_part->field,
                    key_part->length);
                key_ptr += key_part->store_length;
                key_length -= key_part->store_length;
                key_part++;
            }
        }
    }

    key->size = (buff - (uchar *) key->data);
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(&table->write_set, old_map);
    DBUG_RETURN(key);
}
3040 #endif // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
3041
//
// get max used hidden primary key value
//
// Positions a cursor at the last row (DB_LAST) of the primary dictionary
// and caches that hidden-key value in share->auto_ident. Done only once
// per share, guarded by the STATUS_PRIMARY_KEY_INIT flag.
void ha_tokudb::init_hidden_prim_key_info(DB_TXN *txn) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    if (!(share->status & STATUS_PRIMARY_KEY_INIT)) {
        int error = 0;
        DBC* c = NULL;
        error = share->key_file[primary_key]->cursor(
            share->key_file[primary_key],
            txn,
            &c,
            0);
        assert_always(error == 0);
        DBT key,val;
        memset(&key, 0, sizeof(key));
        memset(&val, 0, sizeof(val));
        // last row in key order holds the largest hidden key used so far
        error = c->c_get(c, &key, &val, DB_LAST);
        if (error == 0) {
            assert_always(key.size == TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
            share->auto_ident = hpk_char_to_num((uchar *)key.data);
        }
        // DB_NOTFOUND (empty table) falls through: auto_ident stays as-is
        error = c->c_close(c);
        assert_always(error == 0);
        share->status |= STATUS_PRIMARY_KEY_INIT;
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
3070
3071
3072
3073 /** @brief
3074 Get metadata info stored in status.tokudb
3075 */
get_status(DB_TXN * txn)3076 int ha_tokudb::get_status(DB_TXN* txn) {
3077 TOKUDB_HANDLER_DBUG_ENTER("");
3078 DBT key, value;
3079 HA_METADATA_KEY curr_key;
3080 int error;
3081
3082 //
3083 // open status.tokudb
3084 //
3085 if (!share->status_block) {
3086 error =
3087 open_status_dictionary(
3088 &share->status_block,
3089 share->full_table_name(),
3090 txn);
3091 if (error) {
3092 goto cleanup;
3093 }
3094 }
3095
3096 //
3097 // transaction to be used for putting metadata into status.tokudb
3098 //
3099 memset(&key, 0, sizeof(key));
3100 memset(&value, 0, sizeof(value));
3101 key.data = &curr_key;
3102 key.size = sizeof(curr_key);
3103 value.flags = DB_DBT_USERMEM;
3104
3105 assert_always(share->status_block);
3106 //
3107 // get version
3108 //
3109 value.ulen = sizeof(share->version);
3110 value.data = &share->version;
3111 curr_key = hatoku_new_version;
3112 error = share->status_block->get(
3113 share->status_block,
3114 txn,
3115 &key,
3116 &value,
3117 0
3118 );
3119 if (error == DB_NOTFOUND) {
3120 //
3121 // hack to keep handle the issues of going back and forth
3122 // between 5.0.3 to 5.0.4
3123 // the problem with going back and forth
3124 // is with storing the frm file, 5.0.4 stores it, 5.0.3 does not
3125 // so, if a user goes back and forth and alters the schema
3126 // the frm stored can get out of sync with the schema of the table
3127 // This can cause issues.
3128 // To take care of this, we are doing this versioning work here.
3129 // We change the key that stores the version.
3130 // In 5.0.3, it is hatoku_old_version, in 5.0.4 it is hatoku_new_version
3131 // When we encounter a table that does not have hatoku_new_version
3132 // set, we give it the right one, and overwrite the old one with zero.
3133 // This ensures that 5.0.3 cannot open the table. Once it has been opened by 5.0.4
3134 //
3135 uint dummy_version = 0;
3136 share->version = HA_TOKU_ORIG_VERSION;
3137 error = write_to_status(
3138 share->status_block,
3139 hatoku_new_version,
3140 &share->version,
3141 sizeof(share->version),
3142 txn
3143 );
3144 if (error) { goto cleanup; }
3145 error = write_to_status(
3146 share->status_block,
3147 hatoku_old_version,
3148 &dummy_version,
3149 sizeof(dummy_version),
3150 txn
3151 );
3152 if (error) { goto cleanup; }
3153 }
3154 else if (error || value.size != sizeof(share->version)) {
3155 if (error == 0) {
3156 error = HA_ERR_INTERNAL_ERROR;
3157 }
3158 goto cleanup;
3159 }
3160 //
3161 // get capabilities
3162 //
3163 curr_key = hatoku_capabilities;
3164 value.ulen = sizeof(share->capabilities);
3165 value.data = &share->capabilities;
3166 error = share->status_block->get(
3167 share->status_block,
3168 txn,
3169 &key,
3170 &value,
3171 0
3172 );
3173 if (error == DB_NOTFOUND) {
3174 share->capabilities= 0;
3175 }
3176 else if (error || value.size != sizeof(share->version)) {
3177 if (error == 0) {
3178 error = HA_ERR_INTERNAL_ERROR;
3179 }
3180 goto cleanup;
3181 }
3182
3183 error = 0;
3184 cleanup:
3185 TOKUDB_HANDLER_DBUG_RETURN(error);
3186 }
3187
/** @brief
    Return an estimate of the number of rows in the table.
    Used when sorting to allocate buffers and by the optimizer.
    This is used in filesort.cc.
*/
estimate_rows_upper_bound()3193 ha_rows ha_tokudb::estimate_rows_upper_bound() {
3194 TOKUDB_HANDLER_DBUG_ENTER("");
3195 DBUG_RETURN(share->row_count() + HA_TOKUDB_EXTRA_ROWS);
3196 }
3197
3198 //
3199 // Function that compares two primary keys that were saved as part of rnd_pos
3200 // and ::position
3201 //
cmp_ref(const uchar * ref1,const uchar * ref2)3202 int ha_tokudb::cmp_ref(const uchar * ref1, const uchar * ref2) {
3203 int ret_val = 0;
3204 bool read_string = false;
3205 ret_val = tokudb_compare_two_keys(
3206 ref1 + sizeof(uint32_t),
3207 *(uint32_t *)ref1,
3208 ref2 + sizeof(uint32_t),
3209 *(uint32_t *)ref2,
3210 (uchar *)share->file->descriptor->dbt.data + 4,
3211 *(uint32_t *)share->file->descriptor->dbt.data - 4,
3212 false,
3213 &read_string
3214 );
3215 return ret_val;
3216 }
3217
check_if_incompatible_data(HA_CREATE_INFO * info,uint table_changes)3218 bool ha_tokudb::check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) {
3219 //
3220 // This is a horrendous hack for now, as copied by InnoDB.
3221 // This states that if the auto increment create field has changed,
3222 // via a "alter table foo auto_increment=new_val", that this
3223 // change is incompatible, and to rebuild the entire table
3224 // This will need to be fixed
3225 //
3226 if ((info->used_fields & HA_CREATE_USED_AUTO) &&
3227 info->auto_increment_value != 0) {
3228
3229 return COMPATIBLE_DATA_NO;
3230 }
3231 if (table_changes != IS_EQUAL_YES)
3232 return COMPATIBLE_DATA_NO;
3233 return COMPATIBLE_DATA_YES;
3234 }
3235
3236 //
3237 // Method that is called before the beginning of many calls
3238 // to insert rows (ha_tokudb::write_row). There is no guarantee
3239 // that start_bulk_insert is called, however there is a guarantee
3240 // that if start_bulk_insert is called, then end_bulk_insert may be
3241 // called as well.
3242 // Parameters:
// [in]   rows - an estimate of the number of rows that will be inserted
//               if number of rows is unknown (such as if doing
//               "insert into foo select * from bar"), then rows
//               will be 0
3247 //
3248 //
3249 // This function returns true if the table MAY be empty.
3250 // It is NOT meant to be a 100% check for emptiness.
3251 // This is used for a bulk load optimization.
3252 //
bool ha_tokudb::may_table_be_empty(DB_TXN *txn) {
    // Best-effort emptiness probe used to enable the bulk loader: returns
    // true only when a one-row scan finds nothing. A false result (also
    // returned on any error) just means the optimization is skipped.
    int error;
    bool ret_val = false;
    DBC* tmp_cursor = NULL;
    DB_TXN* tmp_txn = NULL;

    // The scan direction (or whether to scan at all) is controlled by the
    // tokudb_empty_scan session variable.
    const int empty_scan = tokudb::sysvars::empty_scan(ha_thd());
    if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_DISABLED)
        goto cleanup;

    // If the caller did not supply a transaction, open (and later commit)
    // a temporary one just for this probe.
    if (txn == NULL) {
        error = txn_begin(db_env, 0, &tmp_txn, 0, ha_thd());
        if (error) {
            goto cleanup;
        }
        txn = tmp_txn;
    }

    error = share->file->cursor(share->file, txn, &tmp_cursor, 0);
    if (error)
        goto cleanup;
    // Allow the scan to be interrupted if the client connection is killed.
    tmp_cursor->c_set_check_interrupt_callback(tmp_cursor, tokudb_killed_thd_callback, ha_thd());
    // Fetch a single row from whichever end the session variable selects;
    // DB_NOTFOUND means the dictionary is empty.
    if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_LR)
        error = tmp_cursor->c_getf_next(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
    else
        error = tmp_cursor->c_getf_prev(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
    error = map_to_handler_error(error);
    if (error == DB_NOTFOUND)
        ret_val = true;
    else
        ret_val = false;
    error = 0;

cleanup:
    if (tmp_cursor) {
        int r = tmp_cursor->c_close(tmp_cursor);
        assert_always(r == 0);
        tmp_cursor = NULL;
    }
    if (tmp_txn) {
        // Read-only probe: committing the temporary txn is sufficient.
        commit_txn(tmp_txn, 0);
        tmp_txn = NULL;
    }
    return ret_val;
}
3298
#if MYSQL_VERSION_ID >= 100000
// MariaDB 10.x passes an extra flags argument; both variants share the body.
void ha_tokudb::start_bulk_insert(ha_rows rows, uint flags) {
    TOKUDB_HANDLER_DBUG_ENTER("%llu %u txn %p", (unsigned long long) rows, flags, transaction);
#else
void ha_tokudb::start_bulk_insert(ha_rows rows) {
    TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction);
#endif
    THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    // Defer auto-increment metadata writes until end_bulk_insert().
    delay_updating_ai_metadata = true;
    ai_metadata_update_required = false;
    abort_loader = false;

    // Hold the num-DBs read lock for the whole bulk insert so the set of
    // dictionaries cannot change underneath us (released in
    // end_bulk_insert; write_row periodically cycles it).
    rwlock_t_lock_read(share->_num_DBs_lock);
    uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
    num_DBs_locked_in_bulk = true;
    lock_count = 0;

    // Only attempt the loader/table-lock optimization for multi-row (or
    // unknown-size) inserts, and only once per share (try_table_lock).
    if ((rows == 0 || rows > 1) && share->try_table_lock) {
        if (tokudb::sysvars::prelock_empty(thd) &&
            may_table_be_empty(transaction) &&
            transaction != NULL) {
            // Statements that may need row-level conflict handling
            // (IGNORE/REPLACE, auto-inc in key) just take a table lock;
            // plain inserts into an empty table use the bulk loader.
            if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR
                || table->s->next_number_key_offset) {
                acquire_table_lock(transaction, lock_write);
            } else {
                mult_dbt_flags[primary_key] = 0;
                // Unless unique checks are relaxed, make the loader fail
                // on duplicate primary keys.
                if (!thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) && !hidden_primary_key) {
                    mult_put_flags[primary_key] = DB_NOOVERWRITE;
                }
                uint32_t loader_flags = (tokudb::sysvars::load_save_space(thd)) ?
                    LOADER_COMPRESS_INTERMEDIATES : 0;

                int error = db_env->create_loader(
                    db_env,
                    transaction,
                    &loader,
                    NULL, // no src_db needed
                    curr_num_DBs,
                    share->key_file,
                    mult_put_flags,
                    mult_dbt_flags,
                    loader_flags
                    );
                if (error) {
                    assert_always(loader == NULL);
                    goto exit_try_table_lock;
                }

                // Context shared with the loader callbacks below.
                lc.thd = thd;
                lc.ha = this;

                error = loader->set_poll_function(
                    loader, ha_tokudb::bulk_insert_poll, &lc);
                assert_always(!error);

                error = loader->set_error_callback(
                    loader, ha_tokudb::loader_dup, &lc);
                assert_always(!error);

                trx->stmt_progress.using_loader = true;
            }
        }
exit_try_table_lock:
        // One-shot: don't retry the optimization for this share until a
        // later event (e.g. aborted load) re-enables it.
        share->lock();
        share->try_table_lock = false;
        share->unlock();
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
3369 int ha_tokudb::bulk_insert_poll(void* extra, float progress) {
3370 LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
3371 if (thd_killed(context->thd)) {
3372 snprintf(context->write_status_msg,
3373 sizeof(context->write_status_msg),
3374 "The process has been killed, aborting bulk load.");
3375 return ER_ABORTING_CONNECTION;
3376 }
3377 float percentage = progress * 100;
3378 snprintf(context->write_status_msg,
3379 sizeof(context->write_status_msg),
3380 "Loading of data t %s about %.1f%% done",
3381 context->ha->share->full_table_name(),
3382 percentage);
3383 thd_proc_info(context->thd, context->write_status_msg);
3384 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
3385 thd_progress_report(context->thd, (unsigned long long)percentage, 100);
3386 #endif
3387 return 0;
3388 }
3389 void ha_tokudb::loader_add_index_err(TOKUDB_UNUSED(DB* db),
3390 TOKUDB_UNUSED(int i),
3391 TOKUDB_UNUSED(int err),
3392 TOKUDB_UNUSED(DBT* key),
3393 TOKUDB_UNUSED(DBT* val),
3394 void* error_extra) {
3395 LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3396 assert_always(context->ha);
3397 context->ha->set_loader_error(err);
3398 }
3399 void ha_tokudb::loader_dup(TOKUDB_UNUSED(DB* db),
3400 TOKUDB_UNUSED(int i),
3401 int err,
3402 DBT* key,
3403 TOKUDB_UNUSED(DBT* val),
3404 void* error_extra) {
3405 LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3406 assert_always(context->ha);
3407 context->ha->set_loader_error(err);
3408 if (err == DB_KEYEXIST) {
3409 context->ha->set_dup_value_for_pk(key);
3410 }
3411 }
3412
3413 //
3414 // Method that is called at the end of many calls to insert rows
3415 // (ha_tokudb::write_row). If start_bulk_insert is called, then
3416 // this is guaranteed to be called.
3417 //
3418 int ha_tokudb::end_bulk_insert(TOKUDB_UNUSED(bool abort)) {
3419 TOKUDB_HANDLER_DBUG_ENTER("");
3420 int error = 0;
3421 THD* thd = ha_thd();
3422 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3423 bool using_loader = (loader != NULL);
3424 if (ai_metadata_update_required) {
3425 share->lock();
3426 error = update_max_auto_inc(share->status_block, share->last_auto_increment);
3427 share->unlock();
3428 if (error) { goto cleanup; }
3429 }
3430 delay_updating_ai_metadata = false;
3431 ai_metadata_update_required = false;
3432 loader_error = 0;
3433 if (loader) {
3434 if (!abort_loader && !thd_kill_level(thd)) {
3435 DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", {
3436 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3437 thd_proc_info(thd, "DBUG sleep");
3438 my_sleep(20000000);
3439 thd_proc_info(thd, orig_proc_info);
3440 });
3441 error = loader->close(loader);
3442 loader = NULL;
3443 if (error) {
3444 if (thd_kill_level(thd)) {
3445 my_error(ER_QUERY_INTERRUPTED, MYF(0));
3446 }
3447 goto cleanup;
3448 }
3449
3450 for (uint i = 0; i < table_share->keys; i++) {
3451 if (table_share->key_info[i].flags & HA_NOSAME) {
3452 bool is_unique;
3453 if (i == primary_key && !share->pk_has_string) {
3454 continue;
3455 }
3456 error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i],
3457 DB_PRELOCKED_WRITE);
3458 if (error) goto cleanup;
3459 if (!is_unique) {
3460 error = HA_ERR_FOUND_DUPP_KEY;
3461 last_dup_key = i;
3462 goto cleanup;
3463 }
3464 }
3465 }
3466 }
3467 else {
3468 error = sprintf(write_status_msg, "aborting bulk load");
3469 thd_proc_info(thd, write_status_msg);
3470 loader->abort(loader);
3471 loader = NULL;
3472 share->try_table_lock = true;
3473 }
3474 }
3475
3476 cleanup:
3477 if (num_DBs_locked_in_bulk) {
3478 share->_num_DBs_lock.unlock();
3479 }
3480 num_DBs_locked_in_bulk = false;
3481 lock_count = 0;
3482 if (loader) {
3483 error = sprintf(write_status_msg, "aborting bulk load");
3484 thd_proc_info(thd, write_status_msg);
3485 loader->abort(loader);
3486 loader = NULL;
3487 }
3488 abort_loader = false;
3489 memset(&lc, 0, sizeof(lc));
3490 if (error || loader_error) {
3491 my_errno = error ? error : loader_error;
3492 if (using_loader) {
3493 share->try_table_lock = true;
3494 }
3495 }
3496 trx->stmt_progress.using_loader = false;
3497 thd_proc_info(thd, 0);
3498 TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
3499 }
3500
3501 int ha_tokudb::end_bulk_insert() {
3502 return end_bulk_insert( false );
3503 }
3504
// Scan an index with two staggered cursors (cursor2 always one row ahead
// of cursor1) and compare each adjacent pair of keys; since the index is
// sorted, any duplicate must be adjacent. Sets *is_unique accordingly.
// NULL-containing keys are never counted as duplicates (SQL semantics).
int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) {
    int error;
    DBC* tmp_cursor1 = NULL;
    DBC* tmp_cursor2 = NULL;
    DBT key1, key2, val, packed_key1, packed_key2;
    uint64_t cnt = 0;
    char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound.
    THD* thd = ha_thd();
    const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
    memset(&key1, 0, sizeof(key1));
    memset(&key2, 0, sizeof(key2));
    memset(&val, 0, sizeof(val));
    memset(&packed_key1, 0, sizeof(packed_key1));
    memset(&packed_key2, 0, sizeof(packed_key2));
    *is_unique = true;

    error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE);
    if (error) { goto cleanup; }

    error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE);
    if (error) { goto cleanup; }

    // Position cursor1 on the first row; an empty index is trivially unique.
    error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
    if (error == DB_NOTFOUND) {
        *is_unique = true;
        error = 0;
        goto cleanup;
    }
    else if (error) { goto cleanup; }
    // Advance cursor2 twice so it sits one row ahead of cursor1.
    error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
    if (error) { goto cleanup; }

    error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
    if (error == DB_NOTFOUND) {
        // Single-row index: unique by definition.
        *is_unique = true;
        error = 0;
        goto cleanup;
    }
    else if (error) { goto cleanup; }

    while (error != DB_NOTFOUND) {
        bool has_null1;
        bool has_null2;
        int cmp;
        // Unpack both stored keys into MySQL row format (skipping the
        // leading infinity/flag byte), then re-pack just the user-visible
        // key columns so the comparison ignores the appended PK suffix.
        place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1);
        place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1);

        create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1);
        create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2);

        if (!has_null1 && !has_null2) {
            cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2);
            if (cmp == 0) {
                // Duplicate found; leave the offending key unpacked in
                // record[0] for error reporting by the caller.
                memcpy(key_buff, key1.data, key1.size);
                place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1);
                *is_unique = false;
                break;
            }
        }

        // Step both cursors forward; cursor2 reaching the end terminates
        // the loop after this iteration's compare.
        error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
        if (error) { goto cleanup; }
        error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
        if (error && (error != DB_NOTFOUND)) { goto cleanup; }

        cnt++;
        // Periodically report progress and honor KILL.
        if ((cnt % 10000) == 0) {
            sprintf(
                status_msg,
                "Verifying index uniqueness: Checked %llu of %llu rows in key-%s.",
                (long long unsigned) cnt,
                share->row_count(),
                key_info->name.str);
            thd_proc_info(thd, status_msg);
            if (thd_kill_level(thd)) {
                my_error(ER_QUERY_INTERRUPTED, MYF(0));
                error = ER_QUERY_INTERRUPTED;
                goto cleanup;
            }
        }
    }

    error = 0;

cleanup:
    thd_proc_info(thd, orig_proc_info);
    if (tmp_cursor1) {
        tmp_cursor1->c_close(tmp_cursor1);
        tmp_cursor1 = NULL;
    }
    if (tmp_cursor2) {
        tmp_cursor2->c_close(tmp_cursor2);
        tmp_cursor2 = NULL;
    }
    return error;
}
3601
// Check whether 'record' would violate the uniqueness of key 'key_info'
// (dictionary 'dict_index'): packs the key, point-queries the index, and
// reports via *is_unique. Keys containing NULL are always considered
// unique (SQL semantics).
int ha_tokudb::is_val_unique(bool* is_unique, const uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn) {
    int error = 0;
    bool has_null;
    DBC* tmp_cursor = NULL;

    DBT key; memset((void *)&key, 0, sizeof(key));
    create_dbt_key_from_key(&key, key_info, key_buff2, record, &has_null, true, MAX_KEY_LENGTH, COL_NEG_INF);
    if (has_null) {
        error = 0;
        *is_unique = true;
        goto cleanup;
    }

    error = share->key_file[dict_index]->cursor(share->key_file[dict_index], txn, &tmp_cursor, DB_SERIALIZABLE | DB_RMW);
    if (error) {
        goto cleanup;
    } else {
        // prelock (key,-inf),(key,+inf) so that the subsequent key lookup does not overlock
        uint flags = 0;
        DBT key_right; memset(&key_right, 0, sizeof key_right);
        create_dbt_key_from_key(&key_right, key_info, key_buff3, record, &has_null, true, MAX_KEY_LENGTH, COL_POS_INF);
        error = tmp_cursor->c_set_bounds(tmp_cursor, &key, &key_right, true, DB_NOTFOUND);
        if (error == 0) {
            // Prelock succeeded, so the point query can skip re-locking.
            flags = DB_PRELOCKED | DB_PRELOCKED_WRITE;
        }

        // lookup key and check unique prefix
        struct smart_dbt_info info;
        info.ha = this;
        info.buf = NULL;
        info.keynr = dict_index;

        struct index_read_info ir_info;
        ir_info.orig_key = &key;
        ir_info.smart_dbt_info = info;

        // Find the first row at or after 'key'; the callback records in
        // ir_info.cmp whether that row's key prefix differs from ours.
        error = tmp_cursor->c_getf_set_range(tmp_cursor, flags, &key, smart_dbt_callback_lookup, &ir_info);
        if (error == DB_NOTFOUND) {
            // Nothing at or after the key: no conflict possible.
            *is_unique = true;
            error = 0;
            goto cleanup;
        }
        else if (error) {
            error = map_to_handler_error(error);
            goto cleanup;
        }
        if (ir_info.cmp) {
            // First row found has a different key prefix: unique.
            *is_unique = true;
        }
        else {
            *is_unique = false;
        }
    }
    error = 0;

cleanup:
    if (tmp_cursor) {
        int r = tmp_cursor->c_close(tmp_cursor);
        assert_always(r==0);
        tmp_cursor = NULL;
    }
    return error;
}
3665
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
// Read-free-replication build: uniqueness checks on slave threads can be
// skipped or artificially delayed (for testing) via session variables.

// Sleep on slave threads when tokudb_rpl_unique_checks_delay is set
// (test hook for exercising replication timing).
static void maybe_do_unique_checks_delay_fn(THD *thd) {
    if (thd->slave_thread) {
        uint64_t delay_ms = tokudb::sysvars::rpl_unique_checks_delay(thd);
        if (delay_ms)
            usleep(delay_ms * 1000);
    }
}

#define maybe_do_unique_checks_delay(__thd) \
    (maybe_do_unique_checks_delay_fn(__thd))

// Delay only when the operation's put flags match __flags_check.
#define maybe_do_unique_checks_delay_if_flags_set( \
    __thd, __flags_set, __flags_check)             \
    { if (((__flags_set) & DB_OPFLAGS_MASK) ==     \
        (__flags_check)) maybe_do_unique_checks_delay_fn(__thd); }

// Skipping unique checks on a slave is only considered safe when the
// server is read-only (or that safety check is disabled).
static bool need_read_only(THD *thd) {
    return opt_readonly || !tokudb::sysvars::rpl_check_readonly(thd);
}

// Decide whether unique checks should run: slave threads handling
// replication events may skip them when tokudb_rpl_unique_checks is off;
// everyone else honors OPTION_RELAXED_UNIQUE_CHECKS.
static bool do_unique_checks_fn(THD *thd, bool do_rpl_event) {
    if (do_rpl_event &&
        thd->slave_thread &&
        need_read_only(thd) &&
        !tokudb::sysvars::rpl_unique_checks(thd)) {
        return false;
    } else {
        return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
    }
}

#define do_unique_checks(__thd, __flags) \
    (do_unique_checks_fn(__thd, __flags))

#else
// Non-RFR build: no delays, and unique checks depend only on the
// session's OPTION_RELAXED_UNIQUE_CHECKS flag.

#define maybe_do_unique_checks_delay(__thd) ((void)0)

#define maybe_do_unique_checks_delay_if_flags_set( \
    __thd, __flags_set, __flags_check)             \
    ((void)0)

static bool do_unique_checks_fn(THD *thd) {
    return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
}

#define do_unique_checks(__thd, _flags) \
    (do_unique_checks_fn(__thd))

#endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3717
3718 int ha_tokudb::do_uniqueness_checks(const uchar* record, DB_TXN* txn, THD* thd) {
3719 int error = 0;
3720 //
3721 // first do uniqueness checks
3722 //
3723 if (share->has_unique_keys && do_unique_checks(thd, in_rpl_write_rows)) {
3724 DBUG_EXECUTE_IF("tokudb_crash_if_rpl_does_uniqueness_check",
3725 DBUG_ASSERT(0););
3726 for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3727 bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
3728 bool is_unique = false;
3729 //
3730 // don't need to do check for primary key that don't have strings
3731 //
3732 if (keynr == primary_key && !share->pk_has_string) {
3733 continue;
3734 }
3735 if (!is_unique_key) {
3736 continue;
3737 }
3738
3739 maybe_do_unique_checks_delay(thd);
3740
3741 //
3742 // if unique key, check uniqueness constraint
3743 // but, we do not need to check it if the key has a null
3744 // and we do not need to check it if unique_checks is off
3745 //
3746 error = is_val_unique(&is_unique, record, &table->key_info[keynr], keynr, txn);
3747 if (error) {
3748 goto cleanup;
3749 }
3750 if (!is_unique) {
3751 error = DB_KEYEXIST;
3752 last_dup_key = keynr;
3753 goto cleanup;
3754 }
3755 }
3756 }
3757 cleanup:
3758 return error;
3759 }
3760
// Debug-only self check (TOKUDB_DEBUG_CHECK_KEY): verifies that packing
// secondary keys and clustering values from the row descriptors stored
// in each dictionary produces byte-identical results to packing them
// directly from the MySQL record. Asserts on any mismatch.
void ha_tokudb::test_row_packing(const uchar* record, DBT* pk_key, DBT* pk_val) {
    int error;
    DBT row, key;
    //
    // variables for testing key packing, only used in some debug modes
    //
    uchar* tmp_pk_key_data = NULL;
    uchar* tmp_pk_val_data = NULL;
    DBT tmp_pk_key;
    DBT tmp_pk_val;
    bool has_null;
    int cmp;

    memset(&tmp_pk_key, 0, sizeof(DBT));
    memset(&tmp_pk_val, 0, sizeof(DBT));

    //
    // use copies of the pk key/val so descriptor-based packing cannot be
    // influenced by the original buffers
    //
    tmp_pk_key_data = (uchar*)tokudb::memory::malloc(pk_key->size, MYF(MY_WME));
    assert_always(tmp_pk_key_data);
    tmp_pk_val_data = (uchar*)tokudb::memory::malloc(pk_val->size, MYF(MY_WME));
    assert_always(tmp_pk_val_data);
    memcpy(tmp_pk_key_data, pk_key->data, pk_key->size);
    memcpy(tmp_pk_val_data, pk_val->data, pk_val->size);
    tmp_pk_key.data = tmp_pk_key_data;
    tmp_pk_key.size = pk_key->size;
    tmp_pk_val.data = tmp_pk_val_data;
    tmp_pk_val.size = pk_val->size;

    for (uint keynr = 0; keynr < table_share->keys; keynr++) {
        uint32_t tmp_num_bytes = 0;
        uchar* row_desc = NULL;
        uint32_t desc_size = 0;

        // only secondary keys are derived from the pk row
        if (keynr == primary_key) {
            continue;
        }

        // reference result: key packed directly from the MySQL record
        create_dbt_key_from_table(&key, keynr, key_buff2, record, &has_null);

        //
        // TEST: pack the same key from the dictionary's row descriptor.
        // Descriptor layout: first a length-prefixed section (skipped),
        // then a length-prefixed key-packing section (length includes its
        // own 4-byte prefix).
        //
        row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
        row_desc += (*(uint32_t *)row_desc);
        desc_size = (*(uint32_t *)row_desc) - 4;
        row_desc += 4;
        tmp_num_bytes = pack_key_from_desc(
            key_buff3,
            row_desc,
            desc_size,
            &tmp_pk_key,
            &tmp_pk_val
            );
        assert_always(tmp_num_bytes == key.size);
        cmp = memcmp(key_buff3,key_buff2,tmp_num_bytes);
        assert_always(cmp == 0);

        //
        // test key packing of clustering keys
        //
        if (key_is_clustering(&table->key_info[keynr])) {
            // reference result: row packed directly from the record
            error = pack_row(&row, (const uchar *) record, keynr);
            assert_always(error == 0);
            uchar* tmp_buff = NULL;
            tmp_buff = (uchar*)tokudb::memory::malloc(
                alloced_rec_buff_length,
                MYF(MY_WME));
            assert_always(tmp_buff);
            // the clustering-value section is the third length-prefixed
            // section of the descriptor
            row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
            row_desc += (*(uint32_t *)row_desc);
            row_desc += (*(uint32_t *)row_desc);
            desc_size = (*(uint32_t *)row_desc) - 4;
            row_desc += 4;
            tmp_num_bytes = pack_clustering_val_from_desc(
                tmp_buff,
                row_desc,
                desc_size,
                &tmp_pk_val
                );
            assert_always(tmp_num_bytes == row.size);
            cmp = memcmp(tmp_buff,rec_buff,tmp_num_bytes);
            assert_always(cmp == 0);
            tokudb::memory::free(tmp_buff);
        }
    }

    //
    // re-pack the pk value (pack_row mutates rec_buff/row state) and
    // confirm it still matches the copy taken at entry
    //
    error = pack_row(pk_val, (const uchar *) record, primary_key);
    assert_always(pk_val->size == tmp_pk_val.size);
    cmp = memcmp(pk_val->data, tmp_pk_val_data, pk_val->size);
    assert_always( cmp == 0);

    tokudb::memory::free(tmp_pk_key_data);
    tokudb::memory::free(tmp_pk_val_data);
}
3860
3861 // set the put flags for the main dictionary
3862 void ha_tokudb::set_main_dict_put_flags(THD* thd, bool opt_eligible, uint32_t* put_flags) {
3863 uint32_t old_prelock_flags = 0;
3864 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3865 bool in_hot_index = share->num_DBs > curr_num_DBs;
3866 bool using_ignore_flag_opt = do_ignore_flag_optimization(thd, table, share->replace_into_fast && !using_ignore_no_key);
3867 //
3868 // optimization for "REPLACE INTO..." (and "INSERT IGNORE") command
3869 // if the command is "REPLACE INTO" and the only table
3870 // is the main table (or all indexes are a subset of the pk),
3871 // then we can simply insert the element
3872 // with DB_YESOVERWRITE. If the element does not exist,
3873 // it will act as a normal insert, and if it does exist, it
3874 // will act as a replace, which is exactly what REPLACE INTO is supposed
3875 // to do. We cannot do this if otherwise, because then we lose
3876 // consistency between indexes
3877 //
3878 if (hidden_primary_key)
3879 {
3880 *put_flags = old_prelock_flags;
3881 }
3882 else if (!do_unique_checks(thd, in_rpl_write_rows | in_rpl_update_rows) && !is_replace_into(thd) && !is_insert_ignore(thd))
3883 {
3884 *put_flags = old_prelock_flags;
3885 }
3886 else if (using_ignore_flag_opt && is_replace_into(thd)
3887 && !in_hot_index)
3888 {
3889 *put_flags = old_prelock_flags;
3890 }
3891 else if (opt_eligible && using_ignore_flag_opt && is_insert_ignore(thd)
3892 && !in_hot_index)
3893 {
3894 *put_flags = DB_NOOVERWRITE_NO_ERROR | old_prelock_flags;
3895 }
3896 else
3897 {
3898 *put_flags = DB_NOOVERWRITE | old_prelock_flags;
3899 }
3900 }
3901
3902 int ha_tokudb::insert_row_to_main_dictionary(
3903 DBT* pk_key,
3904 DBT* pk_val,
3905 DB_TXN* txn) {
3906
3907 int error = 0;
3908 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3909 assert_always(curr_num_DBs == 1);
3910
3911 uint32_t put_flags = mult_put_flags[primary_key];
3912 THD *thd = ha_thd();
3913 set_main_dict_put_flags(thd, true, &put_flags);
3914
3915 // for test, make unique checks have a very long duration
3916 maybe_do_unique_checks_delay_if_flags_set(thd, put_flags, DB_NOOVERWRITE);
3917
3918 error = share->file->put(share->file, txn, pk_key, pk_val, put_flags);
3919 if (error) {
3920 last_dup_key = primary_key;
3921 goto cleanup;
3922 }
3923
3924 cleanup:
3925 return error;
3926 }
3927
// Insert the packed pk row into the main dictionary and generate/insert
// the corresponding rows for every secondary dictionary, either via the
// environment's put_multiple or (for the INSERT IGNORE optimization)
// one dictionary at a time.
int ha_tokudb::insert_rows_to_dictionaries_mult(DBT* pk_key, DBT* pk_val, DB_TXN* txn, THD* thd) {
    int error = 0;
    uint curr_num_DBs = share->num_DBs;
    set_main_dict_put_flags(thd, true, &mult_put_flags[primary_key]);
    uint32_t flags = mult_put_flags[primary_key];

    // for test, make unique checks have a very long duration
    maybe_do_unique_checks_delay_if_flags_set(thd, flags, DB_NOOVERWRITE);

    // the insert ignore optimization uses DB_NOOVERWRITE_NO_ERROR,
    // which is not allowed with env->put_multiple.
    // we have to insert the rows one by one in this case.
    if (flags & DB_NOOVERWRITE_NO_ERROR) {
        DB * src_db = share->key_file[primary_key];
        for (uint32_t i = 0; i < curr_num_DBs; i++) {
            DB * db = share->key_file[i];
            if (i == primary_key) {
                // if it's the primary key, insert the rows
                // as they are.
                error = db->put(db, txn, pk_key, pk_val, flags);
            } else {
                // generate a row for secondary keys.
                // use our multi put key/rec buffers
                // just as the ydb layer would have in
                // env->put_multiple(), except that
                // we will just do a put() right away.
                error = tokudb_generate_row(db, src_db,
                                            &mult_key_dbt_array[i].dbts[0], &mult_rec_dbt_array[i].dbts[0],
                                            pk_key, pk_val);
                if (error != 0) {
                    goto out;
                }
                error = db->put(db, txn, &mult_key_dbt_array[i].dbts[0],
                                &mult_rec_dbt_array[i].dbts[0], flags);
            }
            if (error != 0) {
                goto out;
            }
        }
    } else {
        // not insert ignore, so we can use put multiple
        error = db_env->put_multiple(
            db_env,
            share->key_file[primary_key],
            txn,
            pk_key,
            pk_val,
            curr_num_DBs,
            share->key_file,
            mult_key_dbt_array,
            mult_rec_dbt_array,
            mult_put_flags
            );
    }

out:
    //
    // We break if we hit an error, unless it is a dup key error
    // and MySQL told us to ignore duplicate key errors
    //
    if (error) {
        // NOTE(review): any failure is attributed to the primary key here,
        // even when a secondary-dictionary put failed — presumably for
        // duplicate-key reporting; verify against callers.
        last_dup_key = primary_key;
    }
    return error;
}
3993
3994 //
3995 // Stores a row in the table, called when handling an INSERT query
3996 // Parameters:
3997 // [in] record - a row in MySQL format
3998 // Returns:
3999 // 0 on success
4000 // error otherwise
4001 //
4002 int ha_tokudb::write_row(const uchar * record) {
4003 TOKUDB_HANDLER_DBUG_ENTER("%p", record);
4004
4005 DBT row, prim_key;
4006 int error;
4007 THD *thd = ha_thd();
4008 bool has_null;
4009 DB_TXN* sub_trans = NULL;
4010 DB_TXN* txn = NULL;
4011 tokudb_trx_data *trx = NULL;
4012 uint curr_num_DBs;
4013 bool create_sub_trans = false;
4014 bool num_DBs_locked = false;
4015
4016 //
4017 // some crap that needs to be done because MySQL does not properly abstract
4018 // this work away from us, namely filling in auto increment and setting auto timestamp
4019 //
4020 #if MYSQL_VERSION_ID < 50600
4021 if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT) {
4022 table->timestamp_field->set_time();
4023 }
4024 #endif
4025 if (table->next_number_field && record == table->record[0]) {
4026 error = update_auto_increment();
4027 if (error)
4028 goto cleanup;
4029 }
4030
4031 //
4032 // check to see if some value for the auto increment column that is bigger
4033 // than anything else til now is being used. If so, update the metadata to reflect it
4034 // the goal here is we never want to have a dup key error due to a bad increment
4035 // of the auto inc field.
4036 //
4037 if (share->has_auto_inc && record == table->record[0]) {
4038 share->lock();
4039 ulonglong curr_auto_inc = retrieve_auto_increment(
4040 table->field[share->ai_field_index]->key_type(),
4041 field_offset(table->field[share->ai_field_index], table),
4042 record);
4043 if (curr_auto_inc > share->last_auto_increment) {
4044 share->last_auto_increment = curr_auto_inc;
4045 if (delay_updating_ai_metadata) {
4046 ai_metadata_update_required = true;
4047 } else {
4048 update_max_auto_inc(
4049 share->status_block,
4050 share->last_auto_increment);
4051 }
4052 }
4053 share->unlock();
4054 }
4055
4056 //
4057 // grab reader lock on numDBs_lock
4058 //
4059 if (!num_DBs_locked_in_bulk) {
4060 rwlock_t_lock_read(share->_num_DBs_lock);
4061 num_DBs_locked = true;
4062 } else {
4063 lock_count++;
4064 if (lock_count >= 2000) {
4065 share->_num_DBs_lock.unlock();
4066 rwlock_t_lock_read(share->_num_DBs_lock);
4067 lock_count = 0;
4068 }
4069 }
4070 curr_num_DBs = share->num_DBs;
4071
4072 if (hidden_primary_key) {
4073 get_auto_primary_key(current_ident);
4074 }
4075
4076 if (table_share->blob_fields) {
4077 if (fix_rec_buff_for_blob(max_row_length(record))) {
4078 error = HA_ERR_OUT_OF_MEM;
4079 goto cleanup;
4080 }
4081 }
4082
4083 create_dbt_key_from_table(&prim_key, primary_key, primary_key_buff, record, &has_null);
4084 if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4085 goto cleanup;
4086 }
4087
4088 create_sub_trans = (using_ignore && !(do_ignore_flag_optimization(thd,table,share->replace_into_fast && !using_ignore_no_key)));
4089 if (create_sub_trans) {
4090 error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4091 if (error) {
4092 goto cleanup;
4093 }
4094 }
4095 txn = create_sub_trans ? sub_trans : transaction;
4096 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_TXN, "txn %p", txn);
4097 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY))) {
4098 test_row_packing(record,&prim_key,&row);
4099 }
4100 if (loader) {
4101 error = loader->put(loader, &prim_key, &row);
4102 if (error) {
4103 abort_loader = true;
4104 goto cleanup;
4105 }
4106 } else {
4107 error = do_uniqueness_checks(record, txn, thd);
4108 if (error) {
4109 // for #4633
4110 // if we have a duplicate key error, let's check the primary key to see
4111 // if there is a duplicate there. If so, set last_dup_key to the pk
4112 if (error == DB_KEYEXIST && !tokudb_test(hidden_primary_key) && last_dup_key != primary_key) {
4113 int r = share->file->getf_set(share->file, txn, DB_SERIALIZABLE, &prim_key, smart_dbt_do_nothing, NULL);
4114 if (r == 0) {
4115 // if we get no error, that means the row
4116 // was found and this is a duplicate key,
4117 // so we set last_dup_key
4118 last_dup_key = primary_key;
4119 } else if (r != DB_NOTFOUND) {
4120 // if some other error is returned, return that to the user.
4121 error = r;
4122 }
4123 }
4124 goto cleanup;
4125 }
4126 if (curr_num_DBs == 1) {
4127 error = insert_row_to_main_dictionary(&prim_key, &row, txn);
4128 if (error) { goto cleanup; }
4129 } else {
4130 error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
4131 if (error) { goto cleanup; }
4132 }
4133 if (error == 0) {
4134 uint64_t full_row_size = prim_key.size + row.size;
4135 toku_hton_update_primary_key_bytes_inserted(full_row_size);
4136 }
4137 }
4138
4139 trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4140 if (!error) {
4141 added_rows++;
4142 trx->stmt_progress.inserted++;
4143 track_progress(thd);
4144 }
4145 cleanup:
4146 if (num_DBs_locked) {
4147 share->_num_DBs_lock.unlock();
4148 }
4149 if (error == DB_KEYEXIST) {
4150 error = HA_ERR_FOUND_DUPP_KEY;
4151 }
4152 if (sub_trans) {
4153 // no point in recording error value of abort.
4154 // nothing we can do about it anyway and it is not what
4155 // we want to return.
4156 if (error) {
4157 abort_txn(sub_trans);
4158 }
4159 else {
4160 commit_txn(sub_trans, DB_TXN_NOSYNC);
4161 }
4162 }
4163 TOKUDB_HANDLER_DBUG_RETURN(error);
4164 }
4165
4166 /* Compare if a key in a row has changed */
4167 bool ha_tokudb::key_changed(uint keynr, const uchar * old_row, const uchar * new_row) {
4168 DBT old_key;
4169 DBT new_key;
4170 memset((void *) &old_key, 0, sizeof(old_key));
4171 memset((void *) &new_key, 0, sizeof(new_key));
4172
4173 bool has_null;
4174 create_dbt_key_from_table(&new_key, keynr, key_buff2, new_row, &has_null);
4175 create_dbt_key_for_lookup(&old_key,&table->key_info[keynr], key_buff3, old_row, &has_null);
4176 return tokudb_prefix_cmp_dbt_key(share->key_file[keynr], &old_key, &new_key);
4177 }
4178
//
// Updates a row in the table, called when handling an UPDATE query
// Parameters:
// [in] old_row - row to be updated, in MySQL format
// [in] new_row - new row, in MySQL format
// Returns:
// 0 on success
// error otherwise
//
int ha_tokudb::update_row(const uchar * old_row, const uchar * new_row) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBT prim_key, old_prim_key, prim_row, old_prim_row;
    int UNINIT_VAR(error);
    bool has_null;
    THD* thd = ha_thd();
    DB_TXN* sub_trans = NULL;
    DB_TXN* txn = NULL;
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    uint curr_num_DBs;

    memset((void *) &prim_key, 0, sizeof(prim_key));
    memset((void *) &old_prim_key, 0, sizeof(old_prim_key));
    memset((void *) &prim_row, 0, sizeof(prim_row));
    memset((void *) &old_prim_row, 0, sizeof(old_prim_row));

#if MYSQL_VERSION_ID < 50600
    // pre-5.6 servers require the engine to maintain auto-set TIMESTAMP
    // columns on update
    if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) {
        table->timestamp_field->set_time();
    }
#endif
    //
    // check to see if some value for the auto increment column that is bigger
    // than anything else til now is being used. If so, update the metadata to reflect it
    // the goal here is we never want to have a dup key error due to a bad increment
    // of the auto inc field.
    //
    if (share->has_auto_inc && new_row == table->record[0]) {
        share->lock();
        ulonglong curr_auto_inc = retrieve_auto_increment(
            table->field[share->ai_field_index]->key_type(),
            field_offset(table->field[share->ai_field_index], table),
            new_row
            );
        if (curr_auto_inc > share->last_auto_increment) {
            // persist the new maximum first; only cache it in the share
            // when the metadata write succeeded
            error = update_max_auto_inc(share->status_block, curr_auto_inc);
            if (!error) {
                share->last_auto_increment = curr_auto_inc;
            }
        }
        share->unlock();
    }

    //
    // grab reader lock on numDBs_lock
    //
    bool num_DBs_locked = false;
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
        num_DBs_locked = true;
    }
    // snapshot the dictionary count while the read lock protects it
    curr_num_DBs = share->num_DBs;

    // with IGNORE semantics the update runs in a sub-transaction so a
    // failed row can be rolled back without aborting the whole statement
    if (using_ignore) {
        error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
        if (error) {
            goto cleanup;
        }
    }
    txn = using_ignore ? sub_trans : transaction;

    if (hidden_primary_key) {
        // a hidden primary key never changes, so old and new keys are equal
        memset((void *) &prim_key, 0, sizeof(prim_key));
        prim_key.data = (void *) current_ident;
        prim_key.size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        old_prim_key = prim_key;
    }
    else {
        create_dbt_key_from_table(&prim_key, primary_key, key_buff, new_row, &has_null);
        create_dbt_key_from_table(&old_prim_key, primary_key, primary_key_buff, old_row, &has_null);
    }

    // do uniqueness checks
    if (share->has_unique_keys && do_unique_checks(thd, in_rpl_update_rows)) {
        for (uint keynr = 0; keynr < table_share->keys; keynr++) {
            bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
            // the primary dictionary enforces PK uniqueness itself unless
            // the PK contains string columns (pk_has_string)
            if (keynr == primary_key && !share->pk_has_string) {
                continue;
            }
            if (is_unique_key) {
                // only probe for duplicates when the key value changed
                bool key_ch = key_changed(keynr, old_row, new_row);
                if (key_ch) {
                    bool is_unique;
                    error = is_val_unique(&is_unique, new_row, &table->key_info[keynr], keynr, txn);
                    if (error) goto cleanup;
                    if (!is_unique) {
                        error = DB_KEYEXIST;
                        last_dup_key = keynr;
                        goto cleanup;
                    }
                }
            }
        }
    }

    if (table_share->blob_fields) {
        // both pack buffers must be large enough for rows containing blobs
        if (fix_rec_buff_for_blob(max_row_length(new_row))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
        if (fix_rec_update_buff_for_blob(max_row_length(old_row))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
    }

    error = pack_row(&prim_row, new_row, primary_key);
    if (error) { goto cleanup; }

    error = pack_old_row_for_update(&old_prim_row, old_row, primary_key);
    if (error) { goto cleanup; }

    set_main_dict_put_flags(thd, false, &mult_put_flags[primary_key]);

    // for test, make unique checks have a very long duration
    if ((mult_put_flags[primary_key] & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
        maybe_do_unique_checks_delay(thd);

    // atomically apply the update to the primary dictionary and every
    // secondary index in one multi-operation
    error = db_env->update_multiple(
        db_env,
        share->key_file[primary_key],
        txn,
        &old_prim_key,
        &old_prim_row,
        &prim_key,
        &prim_row,
        curr_num_DBs,
        share->key_file,
        mult_put_flags,
        2*curr_num_DBs,
        mult_key_dbt_array,
        curr_num_DBs,
        mult_rec_dbt_array
        );

    if (error == DB_KEYEXIST) {
        last_dup_key = primary_key;
    }
    else if (!error) {
        updated_rows++;
        trx->stmt_progress.updated++;
        track_progress(thd);
    }


cleanup:
    if (num_DBs_locked) {
        share->_num_DBs_lock.unlock();
    }
    if (error == DB_KEYEXIST) {
        // translate the engine duplicate-key error to the handler error
        error = HA_ERR_FOUND_DUPP_KEY;
    }
    if (sub_trans) {
        // no point in recording error value of abort.
        // nothing we can do about it anyway and it is not what
        // we want to return.
        if (error) {
            abort_txn(sub_trans);
        }
        else {
            commit_txn(sub_trans, DB_TXN_NOSYNC);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4353
//
// Deletes a row in the table, called when handling a DELETE query
// Parameters:
// [in] record - row to be deleted, in MySQL format
// Returns:
// 0 on success
// error otherwise
//
int ha_tokudb::delete_row(const uchar * record) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = ENOSYS;
    DBT row, prim_key;
    bool has_null;
    THD* thd = ha_thd();
    uint curr_num_DBs;
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);

    //
    // grab reader lock on numDBs_lock
    //
    bool num_DBs_locked = false;
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
        num_DBs_locked = true;
    }
    // snapshot the dictionary count while the read lock protects it
    curr_num_DBs = share->num_DBs;

    create_dbt_key_from_table(&prim_key, primary_key, key_buff, record, &has_null);
    if (table_share->blob_fields) {
        // grow the pack buffer so it can hold a row containing blobs
        if (fix_rec_buff_for_blob(max_row_length(record))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
    }
    if ((error = pack_row(&row, (const uchar *) record, primary_key))){
        goto cleanup;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "all %p stmt %p sub_sp_level %p transaction %p",
        trx->all,
        trx->stmt,
        trx->sub_sp_level,
        transaction);

    // atomically delete the row from the primary dictionary and every
    // secondary index in one multi-operation
    error =
        db_env->del_multiple(
            db_env,
            share->key_file[primary_key],
            transaction,
            &prim_key,
            &row,
            curr_num_DBs,
            share->key_file,
            mult_key_dbt_array,
            mult_del_flags);

    if (error) {
        DBUG_PRINT("error", ("Got error %d", error));
    } else {
        deleted_rows++;
        trx->stmt_progress.deleted++;
        track_progress(thd);
    }
cleanup:
    if (num_DBs_locked) {
        share->_num_DBs_lock.unlock();
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4425
4426 //
4427 // takes as input table->read_set and table->write_set
4428 // and puts list of field indexes that need to be read in
4429 // unpack_row in the member variables fixed_cols_for_query
4430 // and var_cols_for_query
4431 //
4432 void ha_tokudb::set_query_columns(uint keynr) {
4433 uint32_t curr_fixed_col_index = 0;
4434 uint32_t curr_var_col_index = 0;
4435 read_key = false;
4436 read_blobs = false;
4437 //
4438 // i know this is probably confusing and will need to be explained better
4439 //
4440 uint key_index = 0;
4441
4442 if (keynr == primary_key || keynr == MAX_KEY) {
4443 key_index = primary_key;
4444 }
4445 else {
4446 key_index = (key_is_clustering(&table->key_info[keynr]) ? keynr : primary_key);
4447 }
4448 for (uint i = 0; i < table_share->fields; i++) {
4449 if (bitmap_is_set(table->read_set,i) ||
4450 bitmap_is_set(table->write_set,i)
4451 )
4452 {
4453 if (bitmap_is_set(&share->kc_info.key_filters[key_index],i)) {
4454 read_key = true;
4455 }
4456 else {
4457 //
4458 // if fixed field length
4459 //
4460 if (is_fixed_field(&share->kc_info, i)) {
4461 //
4462 // save the offset into the list
4463 //
4464 fixed_cols_for_query[curr_fixed_col_index] = i;
4465 curr_fixed_col_index++;
4466 }
4467 //
4468 // varchar or varbinary
4469 //
4470 else if (is_variable_field(&share->kc_info, i)) {
4471 var_cols_for_query[curr_var_col_index] = i;
4472 curr_var_col_index++;
4473 }
4474 //
4475 // it is a blob
4476 //
4477 else {
4478 read_blobs = true;
4479 }
4480 }
4481 }
4482 }
4483 num_fixed_cols_for_query = curr_fixed_col_index;
4484 num_var_cols_for_query = curr_var_col_index;
4485 }
4486
4487 void ha_tokudb::column_bitmaps_signal() {
4488 //
4489 // if we have max number of indexes, then MAX_KEY == primary_key
4490 //
4491 if (tokudb_active_index != MAX_KEY || tokudb_active_index == primary_key) {
4492 set_query_columns(tokudb_active_index);
4493 }
4494 }
4495
4496 //
4497 // Notification that a scan of entire secondary table is about
4498 // to take place. Will pre acquire table read lock
4499 // Returns:
4500 // 0 on success
4501 // error otherwise
4502 //
4503 int ha_tokudb::prepare_index_scan() {
4504 TOKUDB_HANDLER_DBUG_ENTER("");
4505 int error = 0;
4506 HANDLE_INVALID_CURSOR();
4507 error = prelock_range(NULL, NULL);
4508 if (error) { last_cursor_error = error; goto cleanup; }
4509
4510 range_lock_grabbed = true;
4511 error = 0;
4512 cleanup:
4513 TOKUDB_HANDLER_DBUG_RETURN(error);
4514 }
4515
4516 static bool index_key_is_null(
4517 TABLE* table,
4518 uint keynr,
4519 const uchar* key,
4520 uint key_len) {
4521
4522 bool key_can_be_null = false;
4523 KEY* key_info = &table->key_info[keynr];
4524 KEY_PART_INFO* key_part = key_info->key_part;
4525 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
4526 for (; key_part != end; key_part++) {
4527 if (key_part->null_bit) {
4528 key_can_be_null = true;
4529 break;
4530 }
4531 }
4532 return key_can_be_null && key_len > 0 && key[0] != 0;
4533 }
4534
4535 // Return true if bulk fetch can be used
4536 static bool tokudb_do_bulk_fetch(THD *thd) {
4537 switch (thd_sql_command(thd)) {
4538 case SQLCOM_SELECT:
4539 case SQLCOM_CREATE_TABLE:
4540 case SQLCOM_INSERT_SELECT:
4541 case SQLCOM_REPLACE_SELECT:
4542 case SQLCOM_DELETE:
4543 return tokudb::sysvars::bulk_fetch(thd) != 0;
4544 default:
4545 return false;
4546 }
4547 }
4548
//
// Notification that a range query getting all elements that equal a key
// to take place. Will pre acquire read lock
// Returns:
// 0 on success
// error otherwise
//
int ha_tokudb::prepare_index_key_scan(const uchar * key, uint key_len) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %u", key, key_len);
    int error = 0;
    DBT start_key, end_key;
    THD* thd = ha_thd();
    HANDLE_INVALID_CURSOR();
    // bound the range [key..key]: the same key packed with negative and
    // positive infinity suffixes covers every row equal to the key prefix
    pack_key(&start_key, tokudb_active_index, prelocked_left_range, key, key_len, COL_NEG_INF);
    prelocked_left_range_size = start_key.size;
    pack_key(&end_key, tokudb_active_index, prelocked_right_range, key, key_len, COL_POS_INF);
    prelocked_right_range_size = end_key.size;

    error = cursor->c_set_bounds(
        cursor,
        &start_key,
        &end_key,
        true,
        (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
        );

    if (error){
        goto cleanup;
    }

    range_lock_grabbed = true;
    // remember whether the locked range was a NULL key; index_read uses
    // this to decide when the restriction must be dropped
    range_lock_grabbed_null = index_key_is_null(table, tokudb_active_index, key, key_len);
    doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
    bulk_fetch_iteration = 0;
    rows_fetched_using_bulk_fetch = 0;
    error = 0;
cleanup:
    if (error) {
        error = map_to_handler_error(error);
        last_cursor_error = error;
        //
        // cursor should be initialized here, but in case it is not,
        // we still check
        //
        if (cursor) {
            int r = cursor->c_close(cursor);
            assert_always(r==0);
            cursor = NULL;
            remove_from_trx_handler_list();
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4602
4603 void ha_tokudb::invalidate_bulk_fetch() {
4604 bytes_used_in_range_query_buff= 0;
4605 curr_range_query_buff_offset = 0;
4606 icp_went_out_of_range = false;
4607 }
4608
4609 void ha_tokudb::invalidate_icp() {
4610 toku_pushed_idx_cond = NULL;
4611 toku_pushed_idx_cond_keyno = MAX_KEY;
4612 icp_went_out_of_range = false;
4613 }
4614
//
// Initializes local cursor on DB with index keynr
// Parameters:
// keynr - key (index) number
// sorted - 1 if result MUST be sorted according to index
// Returns:
// 0 on success
// error otherwise
//
int ha_tokudb::index_init(uint keynr, bool sorted) {
    TOKUDB_HANDLER_DBUG_ENTER("%d %u txn %p", keynr, sorted, transaction);

    int error;
    THD* thd = ha_thd();
    DBUG_PRINT("enter", ("table: '%s' key: %d", table_share->table_name.str, keynr));

    /*
    Under some very rare conditions (like full joins) we may already have
    an active cursor at this point
    */
    if (cursor) {
        DBUG_PRINT("note", ("Closing active cursor"));
        int r = cursor->c_close(cursor);
        assert_always(r==0);
        remove_from_trx_handler_list();
    }
    active_index = keynr;

    if (active_index < MAX_KEY) {
        DBUG_ASSERT(keynr <= table->s->keys);
    } else {
        DBUG_ASSERT(active_index == MAX_KEY);
        // MAX_KEY means "no specific index": operate on the primary dictionary
        keynr = primary_key;
    }
    tokudb_active_index = keynr;

#if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    // a clustering key stores whole rows, so force full-row reads on it
    if (keynr < table->s->keys && table->key_info[keynr].option_struct->clustering)
        key_read = false;
#endif // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING

    last_cursor_error = 0;
    range_lock_grabbed = false;
    range_lock_grabbed_null = false;
    DBUG_ASSERT(share->key_file[keynr]);
    cursor_flags = get_cursor_isolation_flags(lock.type, thd);
    if (use_write_locks) {
        cursor_flags |= DB_RMW;
    }
    if (tokudb::sysvars::disable_prefetching(thd)) {
        cursor_flags |= DBC_DISABLE_PREFETCHING;
    }
    if (lock.type == TL_READ_WITH_SHARED_LOCKS) {
        cursor_flags |= DB_LOCKING_READ;
    }
    if ((error = share->key_file[keynr]->cursor(share->key_file[keynr],
                                                transaction, &cursor,
                                                cursor_flags))) {
        // map engine-level failures onto handler errors the server knows
        if (error == TOKUDB_MVCC_DICTIONARY_TOO_NEW) {
            error = HA_ERR_TABLE_DEF_CHANGED;
            my_error(ER_TABLE_DEF_CHANGED, MYF(0));
        }
        if (error == DB_LOCK_NOTGRANTED) {
            error = HA_ERR_LOCK_WAIT_TIMEOUT;
            my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
        }
        table->status = STATUS_NOT_FOUND;
        error = map_to_handler_error(error);
        last_cursor_error = error;
        cursor = NULL; // Safety
        goto exit;
    }
    cursor->c_set_check_interrupt_callback(cursor, tokudb_killed_thd_callback, thd);
    memset((void *) &last_key, 0, sizeof(last_key));

    add_to_trx_handler_list();

    if (thd_sql_command(thd) == SQLCOM_SELECT) {
        // SELECT only needs the referenced columns unpacked
        set_query_columns(keynr);
        unpack_entire_row = false;
    }
    else {
        unpack_entire_row = true;
    }
    invalidate_bulk_fetch();
    doing_bulk_fetch = false;
    maybe_index_scan = false;
    error = 0;
exit:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4706
4707 //
4708 // closes the local cursor
4709 //
4710 int ha_tokudb::index_end() {
4711 TOKUDB_HANDLER_DBUG_ENTER("");
4712 range_lock_grabbed = false;
4713 range_lock_grabbed_null = false;
4714 if (cursor) {
4715 DBUG_PRINT("enter", ("table: '%s'", table_share->table_name.str));
4716 int r = cursor->c_close(cursor);
4717 assert_always(r==0);
4718 cursor = NULL;
4719 remove_from_trx_handler_list();
4720 last_cursor_error = 0;
4721 }
4722 active_index = tokudb_active_index = MAX_KEY;
4723
4724 //
4725 // reset query variables
4726 //
4727 unpack_entire_row = true;
4728 read_blobs = true;
4729 read_key = true;
4730 num_fixed_cols_for_query = 0;
4731 num_var_cols_for_query = 0;
4732
4733 invalidate_bulk_fetch();
4734 invalidate_icp();
4735 doing_bulk_fetch = false;
4736 close_dsmrr();
4737
4738 TOKUDB_HANDLER_DBUG_RETURN(0);
4739 }
4740
4741 int ha_tokudb::handle_cursor_error(int error, int err_to_return) {
4742 TOKUDB_HANDLER_DBUG_ENTER("");
4743 if (error) {
4744 error = map_to_handler_error(error);
4745 last_cursor_error = error;
4746 table->status = STATUS_NOT_FOUND;
4747 if (error == DB_NOTFOUND) {
4748 error = err_to_return;
4749 }
4750 }
4751 TOKUDB_HANDLER_DBUG_RETURN(error);
4752 }
4753
4754
4755 //
4756 // Helper function for read_row and smart_dbt_callback_xxx functions
4757 // When using a hidden primary key, upon reading a row,
4758 // we set the current_ident field to whatever the primary key we retrieved
4759 // was
4760 //
4761 void ha_tokudb::extract_hidden_primary_key(uint keynr, DBT const *found_key) {
4762 //
4763 // extract hidden primary key to current_ident
4764 //
4765 if (hidden_primary_key) {
4766 if (keynr == primary_key) {
4767 memcpy(current_ident, (char *) found_key->data, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
4768 }
4769 //
4770 // if secondary key, hidden primary key is at end of found_key
4771 //
4772 else {
4773 memcpy(
4774 current_ident,
4775 (char *) found_key->data + found_key->size - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
4776 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
4777 );
4778 }
4779 }
4780 }
4781
4782
4783 int ha_tokudb::read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4784 assert_always(keynr == primary_key);
4785 return unpack_row(buf, row,found_key, keynr);
4786 }
4787
4788 //
4789 // Reads the contents of row and found_key, DBT's retrieved from the DB associated to keynr, into buf
4790 // This function assumes that we are using a covering index, as a result, if keynr is the primary key,
4791 // we do not read row into buf
4792 // Parameters:
4793 // [out] buf - buffer for the row, in MySQL format
4794 // keynr - index into key_file that represents DB we are currently operating on.
4795 // [in] row - the row that has been read from the preceding DB call
4796 // [in] found_key - key used to retrieve the row
4797 //
4798 void ha_tokudb::read_key_only(uchar * buf, uint keynr, DBT const *found_key) {
4799 TOKUDB_HANDLER_DBUG_ENTER("");
4800 table->status = 0;
4801 //
4802 // only case when we do not unpack the key is if we are dealing with the main dictionary
4803 // of a table with a hidden primary key
4804 //
4805 if (!(hidden_primary_key && keynr == primary_key)) {
4806 unpack_key(buf, found_key, keynr);
4807 }
4808 TOKUDB_HANDLER_DBUG_VOID_RETURN;
4809 }
4810
4811 //
4812 // Helper function used to try to retrieve the entire row
4813 // If keynr is associated with the main table, reads contents of found_key and row into buf, otherwise,
4814 // makes copy of primary key and saves it to last_key. This can later be used to retrieve the entire row
4815 // Parameters:
4816 // [out] buf - buffer for the row, in MySQL format
4817 // keynr - index into key_file that represents DB we are currently operating on.
4818 // [in] row - the row that has been read from the preceding DB call
4819 // [in] found_key - key used to retrieve the row
4820 //
4821 int ha_tokudb::read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4822 TOKUDB_HANDLER_DBUG_ENTER("");
4823 int error = 0;
4824 table->status = 0;
4825 //
4826 // case where we read from secondary table that is not clustered
4827 //
4828 if (keynr != primary_key && !key_is_clustering(&table->key_info[keynr])) {
4829 bool has_null;
4830 //
4831 // create a DBT that has the same data as row, this is inefficient
4832 // extract_hidden_primary_key MUST have been called before this
4833 //
4834 memset((void *) &last_key, 0, sizeof(last_key));
4835 if (!hidden_primary_key) {
4836 unpack_key(buf, found_key, keynr);
4837 }
4838 create_dbt_key_from_table(
4839 &last_key,
4840 primary_key,
4841 key_buff,
4842 buf,
4843 &has_null
4844 );
4845 }
4846 //
4847 // else read from clustered/primary key
4848 //
4849 else {
4850 error = unpack_row(buf, row, found_key, keynr);
4851 if (error) { goto exit; }
4852 }
4853 if (found_key) { DBUG_DUMP("read row key", (uchar *) found_key->data, found_key->size); }
4854 error = 0;
4855 exit:
4856 TOKUDB_HANDLER_DBUG_RETURN(error);
4857 }
4858
//
// This function reads an entire row into buf. This function also assumes that
// the key needed to retrieve the row is stored in the member variable last_key
// Parameters:
// [out] buf - buffer for the row, in MySQL format
// Returns:
// 0 on success, error otherwise
//
int ha_tokudb::read_full_row(uchar * buf) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    struct smart_dbt_info info;
    info.ha = this;
    info.buf = buf;
    info.keynr = primary_key;
    //
    // assumes key is stored in this->last_key
    //

    // point lookup in the primary dictionary; the callback unpacks the
    // row into info.buf
    error = share->file->getf_set(share->file,
                                  transaction,
                                  cursor_flags,
                                  &last_key,
                                  smart_dbt_callback_rowread_ptquery,
                                  &info);

    DBUG_EXECUTE_IF("tokudb_fake_db_notfound_error_in_read_full_row", {
        error = DB_NOTFOUND;
    });

    if (error) {
        if (error == DB_LOCK_NOTGRANTED) {
            error = HA_ERR_LOCK_WAIT_TIMEOUT;
        } else if (error == DB_NOTFOUND) {
            // a secondary-index entry with no matching primary row means
            // the table is inconsistent: report corruption
            error = HA_ERR_CRASHED;
            if (tokudb_active_index < share->_keys) {
                sql_print_error(
                    "ha_tokudb::read_full_row on table %s cound not locate "
                    "record in PK that matches record found in key %s",
                    share->full_table_name(),
                    share->_key_descriptors[tokudb_active_index]._name);
            } else {
                sql_print_error(
                    "ha_tokudb::read_full_row on table %s cound not locate "
                    "record in PK that matches record found in key %d",
                    share->full_table_name(),
                    tokudb_active_index);
            }
        }
        table->status = STATUS_NOT_FOUND;
    }

    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4913
4914
4915 //
4916 // Reads the next row matching to the key, on success, advances cursor
4917 // Parameters:
4918 // [out] buf - buffer for the next row, in MySQL format
4919 // [in] key - key value
4920 // keylen - length of key
4921 // Returns:
4922 // 0 on success
4923 // HA_ERR_END_OF_FILE if not found
4924 // error otherwise
4925 //
4926 int ha_tokudb::index_next_same(uchar* buf, const uchar* key, uint keylen) {
4927 TOKUDB_HANDLER_DBUG_ENTER("");
4928
4929 DBT curr_key;
4930 DBT found_key;
4931 bool has_null;
4932 int cmp;
4933 // create the key that will be used to compare with what is found
4934 // in order to figure out if we should return an error
4935 pack_key(&curr_key, tokudb_active_index, key_buff2, key, keylen, COL_ZERO);
4936 int error = get_next(buf, 1, &curr_key, key_read);
4937 if (error) {
4938 goto cleanup;
4939 }
4940 //
4941 // now do the comparison
4942 //
4943 create_dbt_key_from_table(
4944 &found_key,
4945 tokudb_active_index,
4946 key_buff3,buf,
4947 &has_null);
4948 cmp =
4949 tokudb_prefix_cmp_dbt_key(
4950 share->key_file[tokudb_active_index],
4951 &curr_key,
4952 &found_key);
4953 if (cmp) {
4954 error = HA_ERR_END_OF_FILE;
4955 }
4956
4957 cleanup:
4958 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
4959 TOKUDB_HANDLER_DBUG_RETURN(error);
4960 }
4961
4962
//
// According to InnoDB handlerton: Positions an index cursor to the index
// specified in keynr. Fetches the row if any
// Parameters:
// [out] buf - buffer for the returned row
// [in] key - key value, according to InnoDB, if NULL,
// position cursor at start or end of index,
// not sure if this is done now
// key_len - length of key
// find_flag - according to InnoDB, search flags from my_base.h
// Returns:
// 0 on success
// HA_ERR_KEY_NOT_FOUND if not found (per InnoDB),
// we seem to return HA_ERR_END_OF_FILE if find_flag != HA_READ_KEY_EXACT
// TODO: investigate this for correctness
// error otherwise
//
int ha_tokudb::index_read(
    uchar* buf,
    const uchar* key,
    uint key_len,
    enum ha_rkey_function find_flag) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "key %p %u:%2.2x find=%u",
        key,
        key_len,
        key ? key[0] : 0,
        find_flag);
    invalidate_bulk_fetch();
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
        TOKUDB_DBUG_DUMP("mysql key=", key, key_len);
    }
    DBT row;
    DBT lookup_key;
    int error = 0;
    uint32_t flags = 0;
    THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    struct smart_dbt_info info;
    struct index_read_info ir_info;

    HANDLE_INVALID_CURSOR();

    // if we locked a non-null key range and we now have a null key, then
    // remove the bounds from the cursor
    if (range_lock_grabbed &&
        !range_lock_grabbed_null &&
        index_key_is_null(table, tokudb_active_index, key, key_len)) {
        range_lock_grabbed = range_lock_grabbed_null = false;
        cursor->c_remove_restriction(cursor);
    }

    memset((void *) &row, 0, sizeof(row));

    info.ha = this;
    info.buf = buf;
    info.keynr = tokudb_active_index;

    // ir_info additionally records how the found key compared to the
    // search key (ir_info.cmp); used by the exact-match cases below
    ir_info.smart_dbt_info = info;
    ir_info.cmp = 0;

    flags = SET_PRELOCK_FLAG(0);
    switch (find_flag) {
    case HA_READ_KEY_EXACT: /* Find first record else error */ {
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        DBT lookup_bound;
        pack_key(&lookup_bound, tokudb_active_index, key_buff4, key, key_len, COL_POS_INF);
        if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
            TOKUDB_DBUG_DUMP("tokudb key=", lookup_key.data, lookup_key.size);
        }
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range_with_bound(cursor, flags, &lookup_key, &lookup_bound, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        // a nonzero comparison means the row found is not an exact match
        if (ir_info.cmp) {
            error = DB_NOTFOUND;
        }
        break;
    }
    case HA_READ_AFTER_KEY: /* Find next rec. after key-record */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_BEFORE_KEY: /* Find next rec. before key-record */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_KEY_OR_NEXT: /* Record or next record */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    //
    // This case does not seem to ever be used, it is ok for it to be slow
    //
    case HA_READ_KEY_OR_PREV: /* Record or previous */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        if (error == DB_NOTFOUND) {
            // nothing at or after the key: fall back to the last row
            error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
        }
        else if (ir_info.cmp) {
            // landed past the key: step back one row
            error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
        }
        break;
    case HA_READ_PREFIX_LAST_OR_PREV: /* Last or prev key with the same prefix */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_PREFIX_LAST:
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        // prefix must match exactly for this mode
        if (ir_info.cmp) {
            error = DB_NOTFOUND;
        }
        break;
    default:
        TOKUDB_HANDLER_TRACE("unsupported:%d", find_flag);
        error = HA_ERR_UNSUPPORTED;
        break;
    }
    error = handle_cursor_error(error, HA_ERR_KEY_NOT_FOUND);
    // for a non-covering, non-clustering secondary index, fetch the full
    // row from the primary dictionary (key saved by read_primary_key)
    if (!error && !key_read && tokudb_active_index != primary_key && !key_is_clustering(&table->key_info[tokudb_active_index])) {
        error = read_full_row(buf);
    }

    if (TOKUDB_UNLIKELY(error && TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ERROR))) {
        TOKUDB_HANDLER_TRACE("error:%d:%d", error, find_flag);
    }
    trx->stmt_progress.queried++;
    track_progress(thd);

cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
5098
5099
// Unpack the next buffered row out of range_query_buff (filled by
// fill_range_query_buf() during bulk fetch) into buf, advancing
// curr_range_query_buff_offset past the consumed entry.
//
// Entry layout in the buffer (see fill_range_query_buf):
//   [uint32_t key_size][key bytes]
// followed, when need_val is true, by either
//   [uint32_t val_size][val bytes]                      (unpack_entire_row)
// or the pre-filtered pieces of the row:
//   [null bytes][fixed fields][per var field: uint32_t len + data]
//   [uint32_t blob_size][blob data]                     (only if read_blobs)
//
// Parameters:
//   [out] buf   - destination row, in MySQL row format
//   need_val    - true when value data beyond the key must be produced
//   do_key_read - covering-index read: only key data is needed
// Returns:
//   0 on success, error otherwise
int ha_tokudb::read_data_from_range_query_buff(uchar* buf, bool need_val, bool do_key_read) {
    // buffer has the next row, get it from there
    int error;
    uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
    DBT curr_key;
    memset((void *) &curr_key, 0, sizeof(curr_key));

    // get key info: a 4-byte length prefix followed by the packed key
    uint32_t key_size = *(uint32_t *)curr_pos;
    curr_pos += sizeof(key_size);
    uchar* curr_key_buff = curr_pos;
    curr_pos += key_size;

    curr_key.data = curr_key_buff;
    curr_key.size = key_size;

    // if this is a covering index, this is all we need
    if (do_key_read) {
        assert_always(!need_val);
        extract_hidden_primary_key(tokudb_active_index, &curr_key);
        read_key_only(buf, tokudb_active_index, &curr_key);
        error = 0;
    }
    // we need to get more data
    else {
        DBT curr_val;
        memset((void *) &curr_val, 0, sizeof(curr_val));
        uchar* curr_val_buff = NULL;
        uint32_t val_size = 0;
        // in this case, we don't have a val, we are simply extracting the pk
        if (!need_val) {
            curr_val.data = curr_val_buff;
            curr_val.size = val_size;
            extract_hidden_primary_key(tokudb_active_index, &curr_key);
            error = read_primary_key( buf, tokudb_active_index, &curr_val, &curr_key);
        }
        else {
            extract_hidden_primary_key(tokudb_active_index, &curr_key);
            // need to extract a val and place it into buf
            if (unpack_entire_row) {
                // get val info: a 4-byte length prefix followed by the packed value
                val_size = *(uint32_t *)curr_pos;
                curr_pos += sizeof(val_size);
                curr_val_buff = curr_pos;
                curr_pos += val_size;
                curr_val.data = curr_val_buff;
                curr_val.size = val_size;
                error = unpack_row(buf,&curr_val, &curr_key, tokudb_active_index);
            }
            else {
                // only the columns needed by the query were buffered;
                // unpack the key columns first (unless the key is the
                // hidden primary key, which has no user-visible columns)
                if (!(hidden_primary_key && tokudb_active_index == primary_key)) {
                    unpack_key(buf,&curr_key,tokudb_active_index);
                }
                // read rows we care about

                // first the null bytes;
                memcpy(buf, curr_pos, table_share->null_bytes);
                curr_pos += table_share->null_bytes;

                // now the fixed sized rows
                for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
                    uint field_index = fixed_cols_for_query[i];
                    Field* field = table->field[field_index];
                    unpack_fixed_field(
                        buf + field_offset(field, table),
                        curr_pos,
                        share->kc_info.field_lengths[field_index]
                        );
                    curr_pos += share->kc_info.field_lengths[field_index];
                }
                // now the variable sized rows: each was buffered as a
                // 4-byte length followed by the data
                for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
                    uint field_index = var_cols_for_query[i];
                    Field* field = table->field[field_index];
                    uint32_t field_len = *(uint32_t *)curr_pos;
                    curr_pos += sizeof(field_len);
                    unpack_var_field(
                        buf + field_offset(field, table),
                        curr_pos,
                        field_len,
                        share->kc_info.length_bytes[field_index]
                        );
                    curr_pos += field_len;
                }
                // now the blobs: one size prefix covers all buffered blob data
                if (read_blobs) {
                    uint32_t blob_size = *(uint32_t *)curr_pos;
                    curr_pos += sizeof(blob_size);
                    error = unpack_blobs(
                        buf,
                        curr_pos,
                        blob_size,
                        true
                        );
                    curr_pos += blob_size;
                    if (error) {
                        // the buffer is unusable past a failed unpack;
                        // throw the whole bulk-fetch buffer away
                        invalidate_bulk_fetch();
                        goto exit;
                    }
                }
                error = 0;
            }
        }
    }

    // remember how far we consumed so the next call starts at the next entry
    curr_range_query_buff_offset = curr_pos - range_query_buff;
exit:
    return error;
}
5209
5210 static int smart_dbt_bf_callback(
5211 DBT const* key,
5212 DBT const* row,
5213 void* context) {
5214 SMART_DBT_BF_INFO info = (SMART_DBT_BF_INFO)context;
5215 return
5216 info->ha->fill_range_query_buf(
5217 info->need_val,
5218 key,
5219 row,
5220 info->direction,
5221 info->thd,
5222 info->buf,
5223 info->key_to_compare);
5224 }
5225
5226 check_result_t ha_tokudb::toku_handler_index_cond_check(
5227 Item* pushed_idx_cond) {
5228
5229 check_result_t res;
5230 if (end_range) {
5231 int cmp;
5232 #ifdef MARIADB_BASE_VERSION
5233 cmp = compare_key2(end_range);
5234 #else
5235 cmp = compare_key_icp(end_range);
5236 #endif
5237 if (cmp > 0) {
5238 return CHECK_OUT_OF_RANGE;
5239 }
5240 }
5241 res = pushed_idx_cond->val_int() ? CHECK_POS : CHECK_NEG;
5242 return res;
5243 }
5244
5245 // fill in the range query buf for bulk fetch
5246 int ha_tokudb::fill_range_query_buf(
5247 bool need_val,
5248 DBT const* key,
5249 DBT const* row,
5250 int direction,
5251 THD* thd,
5252 uchar* buf,
5253 DBT* key_to_compare) {
5254
5255 int error;
5256 //
5257 // first put the value into range_query_buf
5258 //
5259 uint32_t size_remaining =
5260 size_range_query_buff - bytes_used_in_range_query_buff;
5261 uint32_t size_needed;
5262 uint32_t user_defined_size = tokudb::sysvars::read_buf_size(thd);
5263 uchar* curr_pos = NULL;
5264
5265 if (key_to_compare) {
5266 int cmp = tokudb_prefix_cmp_dbt_key(
5267 share->key_file[tokudb_active_index],
5268 key_to_compare,
5269 key);
5270 if (cmp) {
5271 icp_went_out_of_range = true;
5272 error = 0;
5273 goto cleanup;
5274 }
5275 }
5276
5277 // if we have an index condition pushed down, we check it
5278 if (toku_pushed_idx_cond &&
5279 (tokudb_active_index == toku_pushed_idx_cond_keyno)) {
5280 unpack_key(buf, key, tokudb_active_index);
5281 check_result_t result =
5282 toku_handler_index_cond_check(toku_pushed_idx_cond);
5283
5284 // If we have reason to stop, we set icp_went_out_of_range and get out
5285 // otherwise, if we simply see that the current key is no match,
5286 // we tell the cursor to continue and don't store
5287 // the key locally
5288 if (result == CHECK_OUT_OF_RANGE || thd_kill_level(thd)) {
5289 icp_went_out_of_range = true;
5290 error = 0;
5291 DEBUG_SYNC(ha_thd(), "tokudb_icp_asc_scan_out_of_range");
5292 goto cleanup;
5293 } else if (result == CHECK_NEG) {
5294 // Optimizer change for MyRocks also benefits us here in TokuDB as
5295 // opt_range.cc QUICK_SELECT::get_next now sets end_range during
5296 // descending scan. We should not ever hit this condition, but
5297 // leaving this code in to prevent any possibility of a descending
5298 // scan to the beginning of an index and catch any possibility
5299 // in debug builds with an assertion
5300 assert_debug(!(!end_range && direction < 0));
5301 if (!end_range &&
5302 direction < 0) {
5303 cancel_pushed_idx_cond();
5304 }
5305 error = TOKUDB_CURSOR_CONTINUE;
5306 goto cleanup;
5307 }
5308 }
5309
5310 // at this point, if ICP is on, we have verified that the key is one
5311 // we are interested in, so we proceed with placing the data
5312 // into the range query buffer
5313
5314 if (need_val) {
5315 if (unpack_entire_row) {
5316 size_needed = 2*sizeof(uint32_t) + key->size + row->size;
5317 } else {
5318 // this is an upper bound
5319 size_needed =
5320 // size of key length
5321 sizeof(uint32_t) +
5322 // key and row
5323 key->size + row->size +
5324 // lengths of varchars stored
5325 num_var_cols_for_query * (sizeof(uint32_t)) +
5326 // length of blobs
5327 sizeof(uint32_t);
5328 }
5329 } else {
5330 size_needed = sizeof(uint32_t) + key->size;
5331 }
5332 if (size_remaining < size_needed) {
5333 range_query_buff =
5334 static_cast<uchar*>(tokudb::memory::realloc(
5335 static_cast<void*>(range_query_buff),
5336 bytes_used_in_range_query_buff + size_needed,
5337 MYF(MY_WME)));
5338 if (range_query_buff == NULL) {
5339 error = ENOMEM;
5340 invalidate_bulk_fetch();
5341 goto cleanup;
5342 }
5343 size_range_query_buff = bytes_used_in_range_query_buff + size_needed;
5344 }
5345 //
5346 // now we know we have the size, let's fill the buffer, starting with the key
5347 //
5348 curr_pos = range_query_buff + bytes_used_in_range_query_buff;
5349
5350 *reinterpret_cast<uint32_t*>(curr_pos) = key->size;
5351 curr_pos += sizeof(uint32_t);
5352 memcpy(curr_pos, key->data, key->size);
5353 curr_pos += key->size;
5354 if (need_val) {
5355 if (unpack_entire_row) {
5356 *reinterpret_cast<uint32_t*>(curr_pos) = row->size;
5357 curr_pos += sizeof(uint32_t);
5358 memcpy(curr_pos, row->data, row->size);
5359 curr_pos += row->size;
5360 } else {
5361 // need to unpack just the data we care about
5362 const uchar* fixed_field_ptr = static_cast<const uchar*>(row->data);
5363 fixed_field_ptr += table_share->null_bytes;
5364
5365 const uchar* var_field_offset_ptr = NULL;
5366 const uchar* var_field_data_ptr = NULL;
5367
5368 var_field_offset_ptr =
5369 fixed_field_ptr +
5370 share->kc_info.mcp_info[tokudb_active_index].fixed_field_size;
5371 var_field_data_ptr =
5372 var_field_offset_ptr +
5373 share->kc_info.mcp_info[tokudb_active_index].len_of_offsets;
5374
5375 // first the null bytes
5376 memcpy(curr_pos, row->data, table_share->null_bytes);
5377 curr_pos += table_share->null_bytes;
5378 // now the fixed fields
5379 //
5380 // first the fixed fields
5381 //
5382 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5383 uint field_index = fixed_cols_for_query[i];
5384 memcpy(
5385 curr_pos,
5386 fixed_field_ptr + share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val,
5387 share->kc_info.field_lengths[field_index]);
5388 curr_pos += share->kc_info.field_lengths[field_index];
5389 }
5390
5391 //
5392 // now the var fields
5393 //
5394 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5395 uint field_index = var_cols_for_query[i];
5396 uint32_t var_field_index =
5397 share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val;
5398 uint32_t data_start_offset;
5399 uint32_t field_len;
5400
5401 get_var_field_info(
5402 &field_len,
5403 &data_start_offset,
5404 var_field_index,
5405 var_field_offset_ptr,
5406 share->kc_info.num_offset_bytes);
5407 memcpy(curr_pos, &field_len, sizeof(field_len));
5408 curr_pos += sizeof(field_len);
5409 memcpy(
5410 curr_pos,
5411 var_field_data_ptr + data_start_offset,
5412 field_len);
5413 curr_pos += field_len;
5414 }
5415
5416 if (read_blobs) {
5417 uint32_t blob_offset = 0;
5418 uint32_t data_size = 0;
5419 //
5420 // now the blobs
5421 //
5422 get_blob_field_info(
5423 &blob_offset,
5424 share->kc_info.mcp_info[tokudb_active_index].len_of_offsets,
5425 var_field_data_ptr,
5426 share->kc_info.num_offset_bytes);
5427 data_size =
5428 row->size -
5429 blob_offset -
5430 static_cast<uint32_t>((var_field_data_ptr -
5431 static_cast<const uchar*>(row->data)));
5432 memcpy(curr_pos, &data_size, sizeof(data_size));
5433 curr_pos += sizeof(data_size);
5434 memcpy(curr_pos, var_field_data_ptr + blob_offset, data_size);
5435 curr_pos += data_size;
5436 }
5437 }
5438 }
5439
5440 bytes_used_in_range_query_buff = curr_pos - range_query_buff;
5441 assert_always(bytes_used_in_range_query_buff <= size_range_query_buff);
5442
5443 //
5444 // now determine if we should continue with the bulk fetch
5445 // we want to stop under these conditions:
5446 // - we overran the prelocked range
5447 // - we are close to the end of the buffer
5448 // - we have fetched an exponential amount of rows with
5449 // respect to the bulk fetch iteration, which is initialized
5450 // to 0 in index_init() and prelock_range().
5451
5452 rows_fetched_using_bulk_fetch++;
5453 // if the iteration is less than the number of possible shifts on
5454 // a 64 bit integer, check that we haven't exceeded this iterations
5455 // row fetch upper bound.
5456 if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5457 uint64_t row_fetch_upper_bound = 1LLU << bulk_fetch_iteration;
5458 assert_always(row_fetch_upper_bound > 0);
5459 if (rows_fetched_using_bulk_fetch >= row_fetch_upper_bound) {
5460 error = 0;
5461 goto cleanup;
5462 }
5463 }
5464
5465 if (bytes_used_in_range_query_buff +
5466 table_share->rec_buff_length >
5467 user_defined_size) {
5468 error = 0;
5469 goto cleanup;
5470 }
5471 if (direction > 0) {
5472 // compare what we got to the right endpoint of prelocked range
5473 // because we are searching keys in ascending order
5474 if (prelocked_right_range_size == 0) {
5475 error = TOKUDB_CURSOR_CONTINUE;
5476 goto cleanup;
5477 }
5478 DBT right_range;
5479 memset(&right_range, 0, sizeof(right_range));
5480 right_range.size = prelocked_right_range_size;
5481 right_range.data = prelocked_right_range;
5482 int cmp = tokudb_cmp_dbt_key(
5483 share->key_file[tokudb_active_index],
5484 key,
5485 &right_range);
5486 error = (cmp > 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5487 } else {
5488 // compare what we got to the left endpoint of prelocked range
5489 // because we are searching keys in descending order
5490 if (prelocked_left_range_size == 0) {
5491 error = TOKUDB_CURSOR_CONTINUE;
5492 goto cleanup;
5493 }
5494 DBT left_range;
5495 memset(&left_range, 0, sizeof(left_range));
5496 left_range.size = prelocked_left_range_size;
5497 left_range.data = prelocked_left_range;
5498 int cmp = tokudb_cmp_dbt_key(
5499 share->key_file[tokudb_active_index],
5500 key,
5501 &left_range);
5502 error = (cmp < 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5503 }
5504 cleanup:
5505 return error;
5506 }
5507
// Core row-fetch engine shared by index_next/index_prev/rnd_next.
// Serves rows out of the bulk-fetch buffer when possible, refilling it
// via the smart_dbt_bf_callback when it runs dry; falls back to a plain
// one-row cursor read when bulk fetch is off.
//
// Parameters:
//   [out] buf      - destination row, in MySQL row format
//   direction      - > 0 to advance forward, < 0 to go backward
//   key_to_compare - optional key prefix bounding the scan (index_next_same)
//   do_key_read    - covering-index read: only key data is needed
// Returns:
//   0 on success, HA_ERR_END_OF_FILE at the end of the scan, error otherwise
int ha_tokudb::get_next(
    uchar* buf,
    int direction,
    DBT* key_to_compare,
    bool do_key_read) {

    int error = 0;
    HANDLE_INVALID_CURSOR();

    // first call after index_first/index_last: decide now whether this
    // turned into a full index scan that should be prelocked
    if (maybe_index_scan) {
        maybe_index_scan = false;
        if (!range_lock_grabbed) {
            error = prepare_index_scan();
        }
    }

    if (!error) {
        uint32_t flags = SET_PRELOCK_FLAG(0);

        // we need to read the val of what we retrieve if
        // we do NOT have a covering index AND we are using a clustering secondary
        // key
        bool need_val =
            (do_key_read == 0) &&
            (tokudb_active_index == primary_key ||
             key_is_clustering(&table->key_info[tokudb_active_index]));

        // serve from the bulk-fetch buffer if it still holds unread rows
        if ((bytes_used_in_range_query_buff -
             curr_range_query_buff_offset) > 0) {
            error = read_data_from_range_query_buff(buf, need_val, do_key_read);
        } else if (icp_went_out_of_range) {
            // the previous refill stopped because the pushed index
            // condition ran out of range: the scan is over
            icp_went_out_of_range = false;
            error = HA_ERR_END_OF_FILE;
        } else {
            invalidate_bulk_fetch();
            if (doing_bulk_fetch) {
                struct smart_dbt_bf_info bf_info;
                bf_info.ha = this;
                // you need the val if you have a clustering index and key_read is not 0;
                bf_info.direction = direction;
                bf_info.thd = ha_thd();
                bf_info.need_val = need_val;
                bf_info.buf = buf;
                bf_info.key_to_compare = key_to_compare;
                //
                // call c_getf_next with purpose of filling in range_query_buff
                //
                rows_fetched_using_bulk_fetch = 0;
                // it is expected that we can do ICP in the smart_dbt_bf_callback
                // as a result, it's possible we don't return any data because
                // none of the rows matched the index condition. Therefore, we need
                // this while loop. icp_out_of_range will be set if we hit a row that
                // the index condition states is out of our range. When that hits,
                // we know all the data in the buffer is the last data we will retrieve
                while (bytes_used_in_range_query_buff == 0 &&
                       !icp_went_out_of_range && error == 0) {
                    if (direction > 0) {
                        error =
                            cursor->c_getf_next(
                                cursor,
                                flags,
                                smart_dbt_bf_callback,
                                &bf_info);
                    } else {
                        error =
                            cursor->c_getf_prev(
                                cursor,
                                flags,
                                smart_dbt_bf_callback,
                                &bf_info);
                    }
                }
                // if there is no data set and we went out of range,
                // then there is nothing to return
                if (bytes_used_in_range_query_buff == 0 &&
                    icp_went_out_of_range) {
                    icp_went_out_of_range = false;
                    error = HA_ERR_END_OF_FILE;
                }
                // grow the per-refill row quota exponentially (see
                // fill_range_query_buf for how it is enforced)
                if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
                    bulk_fetch_iteration++;
                }

                error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
                if (error) {
                    goto cleanup;
                }

                //
                // now that range_query_buff is filled, read an element
                //
                error =
                    read_data_from_range_query_buff(buf, need_val, do_key_read);
            } else {
                // bulk fetch disabled: plain single-row cursor read
                struct smart_dbt_info info;
                info.ha = this;
                info.buf = buf;
                info.keynr = tokudb_active_index;

                if (direction > 0) {
                    error =
                        cursor->c_getf_next(
                            cursor,
                            flags,
                            SMART_DBT_CALLBACK(do_key_read),
                            &info);
                } else {
                    error =
                        cursor->c_getf_prev(
                            cursor,
                            flags,
                            SMART_DBT_CALLBACK(do_key_read),
                            &info);
                }
                error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
            }
        }
    }

    //
    // at this point, one of two things has happened
    // either we have unpacked the data into buf, and we
    // are done, or we have unpacked the primary key
    // into last_key, and we use the code below to
    // read the full row by doing a point query into the
    // main table.
    //
    if (!error &&
        !do_key_read &&
        (tokudb_active_index != primary_key) &&
        !key_is_clustering(&table->key_info[tokudb_active_index])) {
        error = read_full_row(buf);
    }

    // account for the read and honor a pending kill
    if (!error) {
        THD *thd = ha_thd();
        tokudb_trx_data* trx =
            static_cast<tokudb_trx_data*>(thd_get_ha_data(thd, tokudb_hton));
        trx->stmt_progress.queried++;
        track_progress(thd);
        if (thd_kill_level(thd))
            error = ER_ABORTING_CONNECTION;
    }
cleanup:
    return error;
}
5654
5655
5656 //
5657 // Reads the next row from the active index (cursor) into buf, and advances cursor
5658 // Parameters:
5659 // [out] buf - buffer for the next row, in MySQL format
5660 // Returns:
5661 // 0 on success
5662 // HA_ERR_END_OF_FILE if not found
5663 // error otherwise
5664 //
5665 int ha_tokudb::index_next(uchar * buf) {
5666 TOKUDB_HANDLER_DBUG_ENTER("");
5667 int error = get_next(buf, 1, NULL, key_read);
5668 TOKUDB_HANDLER_DBUG_RETURN(error);
5669 }
5670
5671
5672 int ha_tokudb::index_read_last(uchar * buf, const uchar * key, uint key_len) {
5673 return(index_read(buf, key, key_len, HA_READ_PREFIX_LAST));
5674 }
5675
5676
5677 //
5678 // Reads the previous row from the active index (cursor) into buf, and advances cursor
5679 // Parameters:
5680 // [out] buf - buffer for the next row, in MySQL format
5681 // Returns:
5682 // 0 on success
5683 // HA_ERR_END_OF_FILE if not found
5684 // error otherwise
5685 //
5686 int ha_tokudb::index_prev(uchar * buf) {
5687 TOKUDB_HANDLER_DBUG_ENTER("");
5688 int error = get_next(buf, -1, NULL, key_read);
5689 TOKUDB_HANDLER_DBUG_RETURN(error);
5690 }
5691
5692 //
5693 // Reads the first row from the active index (cursor) into buf, and advances cursor
5694 // Parameters:
5695 // [out] buf - buffer for the next row, in MySQL format
5696 // Returns:
5697 // 0 on success
5698 // HA_ERR_END_OF_FILE if not found
5699 // error otherwise
5700 //
5701 int ha_tokudb::index_first(uchar * buf) {
5702 TOKUDB_HANDLER_DBUG_ENTER("");
5703 invalidate_bulk_fetch();
5704 int error = 0;
5705 struct smart_dbt_info info;
5706 uint32_t flags = SET_PRELOCK_FLAG(0);
5707 THD* thd = ha_thd();
5708 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);;
5709 HANDLE_INVALID_CURSOR();
5710
5711 info.ha = this;
5712 info.buf = buf;
5713 info.keynr = tokudb_active_index;
5714
5715 error = cursor->c_getf_first(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5716 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5717
5718 //
5719 // still need to get entire contents of the row if operation done on
5720 // secondary DB and it was NOT a covering index
5721 //
5722 if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5723 error = read_full_row(buf);
5724 }
5725 if (trx) {
5726 trx->stmt_progress.queried++;
5727 }
5728 track_progress(thd);
5729 maybe_index_scan = true;
5730 cleanup:
5731 TOKUDB_HANDLER_DBUG_RETURN(error);
5732 }
5733
5734 //
5735 // Reads the last row from the active index (cursor) into buf, and advances cursor
5736 // Parameters:
5737 // [out] buf - buffer for the next row, in MySQL format
5738 // Returns:
5739 // 0 on success
5740 // HA_ERR_END_OF_FILE if not found
5741 // error otherwise
5742 //
5743 int ha_tokudb::index_last(uchar * buf) {
5744 TOKUDB_HANDLER_DBUG_ENTER("");
5745 invalidate_bulk_fetch();
5746 int error = 0;
5747 struct smart_dbt_info info;
5748 uint32_t flags = SET_PRELOCK_FLAG(0);
5749 THD* thd = ha_thd();
5750 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);;
5751 HANDLE_INVALID_CURSOR();
5752
5753 info.ha = this;
5754 info.buf = buf;
5755 info.keynr = tokudb_active_index;
5756
5757 error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5758 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5759 //
5760 // still need to get entire contents of the row if operation done on
5761 // secondary DB and it was NOT a covering index
5762 //
5763 if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5764 error = read_full_row(buf);
5765 }
5766
5767 if (trx) {
5768 trx->stmt_progress.queried++;
5769 }
5770 track_progress(thd);
5771 maybe_index_scan = true;
5772 cleanup:
5773 TOKUDB_HANDLER_DBUG_RETURN(error);
5774 }
5775
5776 //
5777 // Initialize a scan of the table (which is why index_init is called on primary_key)
5778 // Parameters:
5779 // scan - unused
5780 // Returns:
5781 // 0 on success
5782 // error otherwise
5783 //
5784 int ha_tokudb::rnd_init(bool scan) {
5785 TOKUDB_HANDLER_DBUG_ENTER("");
5786 int error = 0;
5787 range_lock_grabbed = false;
5788 error = index_init(MAX_KEY, 0);
5789 if (error) { goto cleanup;}
5790
5791 if (scan) {
5792 error = prelock_range(NULL, NULL);
5793 if (error) { goto cleanup; }
5794
5795 // only want to set range_lock_grabbed to true after index_init
5796 // successfully executed for two reasons:
5797 // 1) index_init will reset it to false anyway
5798 // 2) if it fails, we don't want prelocking on,
5799 range_lock_grabbed = true;
5800 }
5801
5802 error = 0;
5803 cleanup:
5804 if (error) {
5805 index_end();
5806 last_cursor_error = error;
5807 }
5808 TOKUDB_HANDLER_DBUG_RETURN(error);
5809 }
5810
5811 //
5812 // End a scan of the table
5813 //
5814 int ha_tokudb::rnd_end() {
5815 TOKUDB_HANDLER_DBUG_ENTER("");
5816 range_lock_grabbed = false;
5817 TOKUDB_HANDLER_DBUG_RETURN(index_end());
5818 }
5819
5820
5821 //
5822 // Read the next row in a table scan
5823 // Parameters:
5824 // [out] buf - buffer for the next row, in MySQL format
5825 // Returns:
5826 // 0 on success
5827 // HA_ERR_END_OF_FILE if not found
5828 // error otherwise
5829 //
5830 int ha_tokudb::rnd_next(uchar * buf) {
5831 TOKUDB_HANDLER_DBUG_ENTER("");
5832 int error = get_next(buf, 1, NULL, false);
5833 TOKUDB_HANDLER_DBUG_RETURN(error);
5834 }
5835
5836
5837 void ha_tokudb::track_progress(THD* thd) {
5838 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5839 if (trx) {
5840 ulonglong num_written = trx->stmt_progress.inserted +
5841 trx->stmt_progress.updated +
5842 trx->stmt_progress.deleted;
5843 bool update_status =
5844 (trx->stmt_progress.queried &&
5845 tokudb::sysvars::read_status_frequency &&
5846 (trx->stmt_progress.queried %
5847 tokudb::sysvars::read_status_frequency) == 0) ||
5848 (num_written && tokudb::sysvars::write_status_frequency &&
5849 (num_written % tokudb::sysvars::write_status_frequency) == 0);
5850 if (update_status) {
5851 char *next_status = write_status_msg;
5852 bool first = true;
5853 int r;
5854 if (trx->stmt_progress.queried) {
5855 r = sprintf(
5856 next_status,
5857 "Queried about %llu row%s",
5858 trx->stmt_progress.queried,
5859 trx->stmt_progress.queried == 1 ? "" : "s");
5860 assert_always(r >= 0);
5861 next_status += r;
5862 first = false;
5863 }
5864 if (trx->stmt_progress.inserted) {
5865 if (trx->stmt_progress.using_loader) {
5866 r = sprintf(
5867 next_status,
5868 "%sFetched about %llu row%s, loading data still remains",
5869 first ? "" : ", ",
5870 trx->stmt_progress.inserted,
5871 trx->stmt_progress.inserted == 1 ? "" : "s");
5872 } else {
5873 r = sprintf(
5874 next_status,
5875 "%sInserted about %llu row%s",
5876 first ? "" : ", ",
5877 trx->stmt_progress.inserted,
5878 trx->stmt_progress.inserted == 1 ? "" : "s");
5879 }
5880 assert_always(r >= 0);
5881 next_status += r;
5882 first = false;
5883 }
5884 if (trx->stmt_progress.updated) {
5885 r = sprintf(
5886 next_status,
5887 "%sUpdated about %llu row%s",
5888 first ? "" : ", ",
5889 trx->stmt_progress.updated,
5890 trx->stmt_progress.updated == 1 ? "" : "s");
5891 assert_always(r >= 0);
5892 next_status += r;
5893 first = false;
5894 }
5895 if (trx->stmt_progress.deleted) {
5896 r = sprintf(
5897 next_status,
5898 "%sDeleted about %llu row%s",
5899 first ? "" : ", ",
5900 trx->stmt_progress.deleted,
5901 trx->stmt_progress.deleted == 1 ? "" : "s");
5902 assert_always(r >= 0);
5903 next_status += r;
5904 first = false;
5905 }
5906 if (!first)
5907 thd_proc_info(thd, write_status_msg);
5908 }
5909 }
5910 }
5911
5912
5913 DBT *ha_tokudb::get_pos(DBT * to, uchar * pos) {
5914 TOKUDB_HANDLER_DBUG_ENTER("");
5915 /* We don't need to set app_data here */
5916 memset((void *) to, 0, sizeof(*to));
5917 to->data = pos + sizeof(uint32_t);
5918 to->size = *(uint32_t *)pos;
5919 DBUG_DUMP("key", (const uchar *) to->data, to->size);
5920 DBUG_RETURN(to);
5921 }
5922
5923 // Retrieves a row with based on the primary key saved in pos
5924 // Returns:
5925 // 0 on success
5926 // HA_ERR_KEY_NOT_FOUND if not found
5927 // error otherwise
5928 int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
5929 TOKUDB_HANDLER_DBUG_ENTER("");
5930 DBT db_pos;
5931 int error = 0;
5932 struct smart_dbt_info info;
5933 bool old_unpack_entire_row = unpack_entire_row;
5934 DBT* key = get_pos(&db_pos, pos);
5935
5936 unpack_entire_row = true;
5937 tokudb_active_index = MAX_KEY;
5938
5939 THD *thd = ha_thd();
5940 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5941 // test rpl slave by inducing a delay before the point query
5942 if (thd->slave_thread && (in_rpl_delete_rows || in_rpl_update_rows)) {
5943 DBUG_EXECUTE_IF("tokudb_crash_if_rpl_looks_up_row", DBUG_ASSERT(0););
5944 uint64_t delay_ms = tokudb::sysvars::rpl_lookup_rows_delay(thd);
5945 if (delay_ms)
5946 usleep(delay_ms * 1000);
5947 }
5948 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5949
5950 info.ha = this;
5951 info.buf = buf;
5952 info.keynr = primary_key;
5953
5954 error = share->file->getf_set(share->file, transaction,
5955 get_cursor_isolation_flags(lock.type, thd),
5956 key, smart_dbt_callback_rowread_ptquery, &info);
5957
5958 if (error == DB_NOTFOUND) {
5959 error = HA_ERR_KEY_NOT_FOUND;
5960 goto cleanup;
5961 }
5962 cleanup:
5963 unpack_entire_row = old_unpack_entire_row;
5964 TOKUDB_HANDLER_DBUG_RETURN(error);
5965 }
5966
// Acquire a range lock on [start_key, end_key] of the active index and
// prime the bulk-fetch state. A NULL endpoint means negative/positive
// infinity respectively. The packed endpoints are kept in
// prelocked_left_range/prelocked_right_range so fill_range_query_buf can
// later detect when a scan overruns the locked range.
// Returns 0 on success, error otherwise; on failure the cursor is closed.
int ha_tokudb::prelock_range(const key_range *start_key, const key_range *end_key) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
    THD* thd = ha_thd();

    int error = 0;
    DBT start_dbt_key;
    DBT end_dbt_key;
    uchar* start_key_buff  = prelocked_left_range;
    uchar* end_key_buff = prelocked_right_range;

    memset((void *) &start_dbt_key, 0, sizeof(start_dbt_key));
    memset((void *) &end_dbt_key, 0, sizeof(end_dbt_key));

    HANDLE_INVALID_CURSOR();
    if (start_key) {
        // an exclusive start (HA_READ_AFTER_KEY) packs with the positive
        // infinity suffix so equal keys fall outside the bound; everything
        // else packs inclusive (negative infinity suffix)
        switch (start_key->flag) {
        case HA_READ_AFTER_KEY:
            pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_POS_INF);
            break;
        default:
            pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_NEG_INF);
            break;
        }
        prelocked_left_range_size = start_dbt_key.size;
    }
    else {
        // size of 0 means infinity
        prelocked_left_range_size = 0;
    }

    if (end_key) {
        // mirror image of the start-key packing: exclusive end
        // (HA_READ_BEFORE_KEY) excludes equal keys
        switch (end_key->flag) {
        case HA_READ_BEFORE_KEY:
            pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_NEG_INF);
            break;
        default:
            pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_POS_INF);
            break;
        }
        prelocked_right_range_size = end_dbt_key.size;
    }
    else {
        // size of 0 means infinity
        prelocked_right_range_size = 0;
    }

    // take the lock on the whole requested range; NOTE(review): the last
    // argument appears to make serializable cursors fail with DB_NOTFOUND
    // on an empty range -- confirm against the fractal tree c_set_bounds API
    error = cursor->c_set_bounds(
        cursor,
        start_key ? &start_dbt_key : share->key_file[tokudb_active_index]->dbt_neg_infty(),
        end_key ? &end_dbt_key : share->key_file[tokudb_active_index]->dbt_pos_infty(),
        true,
        (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
        );
    if (error) {
        error = map_to_handler_error(error);
        last_cursor_error = error;
        //
        // cursor should be initialized here, but in case it is not, we still check
        //
        if (cursor) {
            int r = cursor->c_close(cursor);
            assert_always(r==0);
            cursor = NULL;
            remove_from_trx_handler_list();
        }
        goto cleanup;
    }

    // at this point, determine if we will be doing bulk fetch
    doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
    bulk_fetch_iteration = 0;
    rows_fetched_using_bulk_fetch = 0;

cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6041
6042 //
6043 // Prelock range if possible, start_key is leftmost, end_key is rightmost
6044 // whether scanning forward or backward. This function is called by MySQL
6045 // for backward range queries (in QUICK_SELECT_DESC::get_next).
6046 // Forward scans use read_range_first()/read_range_next().
6047 //
6048 int ha_tokudb::prepare_range_scan( const key_range *start_key, const key_range *end_key) {
6049 TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
6050 int error = prelock_range(start_key, end_key);
6051 if (!error) {
6052 range_lock_grabbed = true;
6053 }
6054 TOKUDB_HANDLER_DBUG_RETURN(error);
6055 }
6056
6057 int ha_tokudb::read_range_first(
6058 const key_range *start_key,
6059 const key_range *end_key,
6060 bool eq_range,
6061 bool sorted)
6062 {
6063 TOKUDB_HANDLER_DBUG_ENTER("%p %p %u %u", start_key, end_key, eq_range, sorted);
6064 int error = prelock_range(start_key, end_key);
6065 if (error) { goto cleanup; }
6066 range_lock_grabbed = true;
6067
6068 error = handler::read_range_first(start_key, end_key, eq_range, sorted);
6069 cleanup:
6070 TOKUDB_HANDLER_DBUG_RETURN(error);
6071 }
6072
6073 int ha_tokudb::read_range_next()
6074 {
6075 TOKUDB_HANDLER_DBUG_ENTER("");
6076 int error;
6077 error = handler::read_range_next();
6078 if (error) {
6079 range_lock_grabbed = false;
6080 }
6081 TOKUDB_HANDLER_DBUG_RETURN(error);
6082 }
6083
6084
6085
6086 /*
6087 Set a reference to the current record in (ref,ref_length).
6088
6089 SYNOPSIS
6090 ha_tokudb::position()
6091 record The current record buffer
6092
6093 DESCRIPTION
6094 The BDB handler stores the primary key in (ref,ref_length).
6095 There is either an explicit primary key, or an implicit (hidden)
6096 primary key.
6097 During open(), 'ref_length' is calculated as the maximum primary
6098 key length. When an actual key is shorter than that, the rest of
6099 the buffer must be cleared out. The row cannot be identified, if
6100 garbage follows behind the end of the key. There is no length
6101 field for the current key, so that the whole ref_length is used
6102 for comparison.
6103
6104 RETURN
6105 nothing
6106 */
6107 void ha_tokudb::position(const uchar * record) {
6108 TOKUDB_HANDLER_DBUG_ENTER("");
6109 DBT key;
6110 if (hidden_primary_key) {
6111 DBUG_ASSERT(ref_length == (TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t)));
6112 memcpy(ref + sizeof(uint32_t), current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
6113 *(uint32_t *)ref = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
6114 }
6115 else {
6116 bool has_null;
6117 //
6118 // save the data
6119 //
6120 create_dbt_key_from_table(&key, primary_key, ref + sizeof(uint32_t), record, &has_null);
6121 //
6122 // save the size of data in the first four bytes of ref
6123 //
6124 memcpy(ref, &key.size, sizeof(uint32_t));
6125 }
6126 /*
6127 tokudb doesn't always write the last byte. Don't that cause problems with
6128 MariaDB
6129 */
6130 MEM_MAKE_DEFINED(ref, ref_length);
6131 TOKUDB_HANDLER_DBUG_VOID_RETURN;
6132 }
6133
6134 //
6135 // Per InnoDB: Returns statistics information of the table to the MySQL interpreter,
6136 // in various fields of the handle object.
6137 // Return:
6138 // 0, always success
6139 //
// flag is a bitmask of HA_STATUS_* requests; only the requested groups of
// statistics are refreshed.  Unless HA_STATUS_NO_LOCK is set, a
// DB_READ_UNCOMMITTED transaction is opened to read fresh stat64 numbers
// from the dictionaries; it is committed in 'cleanup'.
int ha_tokudb::info(uint flag) {
    TOKUDB_HANDLER_DBUG_ENTER("%d", flag);
    int error = 0;
#if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    // a clustering key carries the full row, so it can cover any query
    for (uint i=0; i < table->s->keys; i++)
        if (key_is_clustering(&table->key_info[i]))
            table->covering_keys.set_bit(i);
#endif // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    DB_TXN* txn = NULL;
    if (flag & HA_STATUS_VARIABLE) {
        // start from the cached row count; refined via stat64 below when
        // we are allowed to take a transaction
        stats.records = share->row_count() + share->rows_from_locked_table;
        stats.deleted = 0;
        if (!(flag & HA_STATUS_NO_LOCK)) {

            error = txn_begin(db_env, NULL, &txn, DB_READ_UNCOMMITTED, ha_thd());
            if (error) {
                goto cleanup;
            }

            // we should always have a primary key
            assert_always(share->file != NULL);

            DB_BTREE_STAT64 dict_stats;
            error = share->file->stat64(share->file, txn, &dict_stats);
            if (error) {
                goto cleanup;
            }
            share->set_row_count(dict_stats.bt_ndata, false);
            stats.records = dict_stats.bt_ndata;
            stats.create_time = dict_stats.bt_create_time_sec;
            stats.update_time = dict_stats.bt_modify_time_sec;
            stats.check_time = dict_stats.bt_verify_time_sec;
            stats.data_file_length = dict_stats.bt_dsize;
            stats.delete_length = dict_stats.bt_fsize - dict_stats.bt_dsize;
            if (hidden_primary_key) {
                //
                // in this case, we have a hidden primary key, do not
                // want to report space taken up by the hidden primary key to the user
                //
                uint64_t hpk_space =
                    TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH * dict_stats.bt_ndata;
                stats.data_file_length =
                    (hpk_space > stats.data_file_length) ?
                        0 : stats.data_file_length - hpk_space;
            } else {
                //
                // one infinity byte per key needs to be subtracted
                //
                uint64_t inf_byte_space = dict_stats.bt_ndata;
                stats.data_file_length =
                    (inf_byte_space > stats.data_file_length) ?
                        0 : stats.data_file_length - inf_byte_space;
            }

            stats.mean_rec_length =
                stats.records ?
                    (ulong)(stats.data_file_length/stats.records) : 0;
            stats.index_file_length = 0;
            // curr_num_DBs is the number of keys we have, according
            // to the mysql layer. if drop index is running concurrently
            // with info() (it can, because info does not take table locks),
            // then it could be the case that one of the dbs was dropped
            // and set to NULL before mysql was able to set table->s->keys
            // accordingly.
            //
            // we should just ignore any DB * that is NULL.
            //
            // this solution is much simpler than trying to maintain an
            // accurate number of valid keys at the handlerton layer.
            uint curr_num_DBs =
                table->s->keys + tokudb_test(hidden_primary_key);
            for (uint i = 0; i < curr_num_DBs; i++) {
                // skip the primary key, skip dropped indexes
                if (i == primary_key || share->key_file[i] == NULL) {
                    continue;
                }
                error = share->key_file[i]->stat64(
                    share->key_file[i], txn, &dict_stats);
                if (error) {
                    goto cleanup;
                }
                stats.index_file_length += dict_stats.bt_dsize;
                stats.delete_length +=
                    dict_stats.bt_fsize - dict_stats.bt_dsize;
            }
        }

        /*
          The following comment and logic has been taken from InnoDB and
          an old hack was removed that forced to always set stats.records > 0
          ---
          The MySQL optimizer seems to assume in a left join that n_rows
          is an accurate estimate if it is zero. Of course, it is not,
          since we do not have any locks on the rows yet at this phase.
          Since SHOW TABLE STATUS seems to call this function with the
          HA_STATUS_TIME flag set, while the left join optimizer does not
          set that flag, we add one to a zero value if the flag is not
          set. That way SHOW TABLE STATUS will show the best estimate,
          while the optimizer never sees the table empty. */
        if (stats.records == 0 && !(flag & HA_STATUS_TIME)) {
            stats.records++;
        }
    }
    if ((flag & HA_STATUS_CONST)) {
        // effectively unlimited (2^63 - 1)
        stats.max_data_file_length = 9223372036854775807ULL;
    }
    if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST)) {
        share->set_cardinality_counts_in_table(table);
    }

    /* Don't return key if we got an error for the internal primary key */
    if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys) {
        errkey = last_dup_key;
    }

    // report the next value the auto-increment column would get
    if (flag & HA_STATUS_AUTO && table->found_next_number_field) {
        THD* thd = table->in_use;
        struct system_variables* variables = &thd->variables;
        stats.auto_increment_value =
            share->last_auto_increment + variables->auto_increment_increment;
    }
    error = 0;
cleanup:
    if (txn != NULL) {
        commit_txn(txn, DB_TXN_NOSYNC);
        txn = NULL;
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6269
6270 //
6271 // Per InnoDB: Tells something additional to the handler about how to do things.
6272 //
6273 int ha_tokudb::extra(enum ha_extra_function operation) {
6274 TOKUDB_HANDLER_DBUG_ENTER("%d", operation);
6275 switch (operation) {
6276 case HA_EXTRA_RESET_STATE:
6277 reset();
6278 break;
6279 case HA_EXTRA_KEYREAD:
6280 key_read = true; // Query satisfied with key
6281 break;
6282 case HA_EXTRA_NO_KEYREAD:
6283 key_read = false;
6284 break;
6285 case HA_EXTRA_IGNORE_DUP_KEY:
6286 using_ignore = true;
6287 break;
6288 case HA_EXTRA_NO_IGNORE_DUP_KEY:
6289 using_ignore = false;
6290 break;
6291 case HA_EXTRA_IGNORE_NO_KEY:
6292 using_ignore_no_key = true;
6293 break;
6294 case HA_EXTRA_NO_IGNORE_NO_KEY:
6295 using_ignore_no_key = false;
6296 break;
6297 case HA_EXTRA_NOT_USED:
6298 case HA_EXTRA_PREPARE_FOR_RENAME:
6299 break; // must do nothing and return 0
6300 default:
6301 break;
6302 }
6303 TOKUDB_HANDLER_DBUG_RETURN(0);
6304 }
6305
6306 int ha_tokudb::reset() {
6307 TOKUDB_HANDLER_DBUG_ENTER("");
6308 key_read = false;
6309 using_ignore = false;
6310 using_ignore_no_key = false;
6311 reset_dsmrr();
6312 invalidate_icp();
6313 TOKUDB_HANDLER_DBUG_RETURN(0);
6314 }
6315
6316 //
6317 // helper function that iterates through all DB's
6318 // and grabs a lock (either read or write, but not both)
6319 // Parameters:
6320 // [in] trans - transaction to be used to pre acquire the lock
6321 // lt - type of lock to get, either lock_read or lock_write
6322 // Returns:
6323 // 0 on success
6324 // error otherwise
6325 //
int ha_tokudb::acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %s", trans, lt == lock_read ? "r" : "w");
    int error = ENOSYS;
    // _num_DBs_lock guards share->num_DBs / share->key_file[]; during bulk
    // operations the caller already holds it, so don't re-take it here
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
    }
    uint curr_num_DBs = share->num_DBs;
    if (lt == lock_read) {
        // read locks need no pre-acquisition; nothing to do
        error = 0;
        goto cleanup;
    } else if (lt == lock_write) {
        // pre-acquire a write lock on every dictionary of the table
        for (uint i = 0; i < curr_num_DBs; i++) {
            DB* db = share->key_file[i];
            error = db->pre_acquire_table_lock(db, trans);
            if (error == EINVAL)
                TOKUDB_HANDLER_TRACE("%d db=%p trans=%p", i, db, trans);
            if (error) break;
        }
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
        if (error) goto cleanup;
    } else {
        // unknown lock type
        error = ENOSYS;
        goto cleanup;
    }

    error = 0;
cleanup:
    if (!num_DBs_locked_in_bulk) {
        share->_num_DBs_lock.unlock();
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6358
// Creates the transaction(s) needed for a statement on this table:
// a master transaction (trx->all) when running inside an explicit
// multi-statement transaction and none exists yet, and always a statement
// transaction (trx->stmt / trx->sub_sp_level).  Registers the handlerton
// with the server's transaction coordinator.  Returns 0 or an error code;
// on error any already-created master transaction is left open.
int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
    int error;
    ulong tx_isolation = thd_tx_isolation(thd);
    HA_TOKU_ISO_LEVEL toku_iso_level = tx_to_toku_iso(tx_isolation);
    bool is_autocommit = !thd_test_options(
        thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);

    /* First table lock, start transaction */
    // DDL statements are excluded: they manage their own transactions
    if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) &&
        !trx->all &&
        (thd_sql_command(thd) != SQLCOM_CREATE_TABLE) &&
        (thd_sql_command(thd) != SQLCOM_DROP_TABLE) &&
        (thd_sql_command(thd) != SQLCOM_DROP_INDEX) &&
        (thd_sql_command(thd) != SQLCOM_CREATE_INDEX) &&
        (thd_sql_command(thd) != SQLCOM_ALTER_TABLE)) {
        /* QQQ We have to start a master transaction */
        // DBUG_PRINT("trans", ("starting transaction all "));
        uint32_t txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
#if 50614 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
        if (thd_tx_is_read_only(thd)) {
            txn_begin_flags |= DB_TXN_READ_ONLY;
        }
#endif
        if ((error = txn_begin(db_env, NULL, &trx->all, txn_begin_flags, thd))) {
            goto cleanup;
        }
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "created master %p",
            trx->all);
        trx->sp_level = trx->all;
        // register for two-phase commit of the whole transaction
        trans_register_ha(thd, true, tokudb_hton, 0);
    }
    DBUG_PRINT("trans", ("starting transaction stmt"));
    if (trx->stmt) {
        // a statement txn should not already exist at this point
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "warning:stmt=%p",
            trx->stmt);
    }
    uint32_t txn_begin_flags;
    if (trx->all == NULL) {
        txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
        //
        // if the isolation level that the user has set is serializable,
        // but autocommit is on and this is just a select,
        // then we can go ahead and set the isolation level to
        // be a snapshot read, because we can serialize
        // the transaction to be the point in time at which the snapshot began.
        //
        if (txn_begin_flags == 0 && is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT) {
            txn_begin_flags = DB_TXN_SNAPSHOT;
        }
        // plain autocommit SELECTs can use a cheaper read-only transaction
        if (is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT &&
            !thd->in_sub_stmt && lock.type <= TL_READ_NO_INSERT &&
            !thd->lex->uses_stored_routines()) {
            txn_begin_flags |= DB_TXN_READ_ONLY;
        }
    } else {
        // child transaction inherits the master's isolation level
        txn_begin_flags = DB_INHERIT_ISOLATION;
    }
    error = txn_begin(db_env, trx->sp_level, &trx->stmt, txn_begin_flags, thd);
    if (error) {
        /* We leave the possible master transaction open */
        goto cleanup;
    }
    trx->sub_sp_level = trx->stmt;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "created stmt %p sp_level %p",
        trx->sp_level,
        trx->stmt);
    reset_stmt_progress(&trx->stmt_progress);
    // register for commit/rollback of just this statement
    trans_register_ha(thd, false, tokudb_hton, 0);
cleanup:
    return error;
}
6436
// Human-readable name of an fcntl-style lock type, for trace output.
// Unknown values map to "?".
static const char *lock_type_str(int lock_type) {
    switch (lock_type) {
    case F_RDLCK: return "F_RDLCK";
    case F_WRLCK: return "F_WRLCK";
    case F_UNLCK: return "F_UNLCK";
    default:      return "?";
    }
}
6443
6444 /*
6445 As MySQL will execute an external lock for every new table it uses
6446 we can use this to start the transactions.
6447 If we are in auto_commit mode we just need to start a transaction
6448 for the statement to be able to rollback the statement.
6449 If not, we have to start a master transaction if there doesn't exist
6450 one from before.
6451 */
6452 //
6453 // Parameters:
6454 // [in] thd - handle to the user thread
6455 // lock_type - the type of lock
6456 // Returns:
6457 // 0 on success
6458 // error otherwise
6459 //
int ha_tokudb::external_lock(THD * thd, int lock_type) {
    TOKUDB_HANDLER_DBUG_ENTER(
        "cmd %d lock %d %s %s",
        thd_sql_command(thd),
        lock_type,
        lock_type_str(lock_type),
        share->full_table_name());
    // emit the same trace line under the LOCK debug flag if the ENTER flag
    // did not already print it
    if (TOKUDB_UNLIKELY(!TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ENTER) &&
        TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_LOCK))) {
        TOKUDB_HANDLER_TRACE(
            "cmd %d lock %d %s %s",
            thd_sql_command(thd),
            lock_type,
            lock_type_str(lock_type),
            share->full_table_name());
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());

    int error = 0;
    // lazily create the per-connection trx data on first use
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    if (!trx) {
        error = create_tokudb_trx_data_instance(&trx);
        if (error) { goto cleanup; }
        thd_set_ha_data(thd, tokudb_hton, trx);
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "trx %p %p %p %p %u %u",
        trx->all,
        trx->stmt,
        trx->sp_level,
        trx->sub_sp_level,
        trx->tokudb_lock_count,
        trx->create_lock_count);

    if (trx->all == NULL) {
        trx->sp_level = NULL;
    }
    if (lock_type != F_UNLCK) {
        // acquiring a lock on this table
        use_write_locks = false;
        if (lock_type == F_WRLCK) {
            use_write_locks = true;
        }
        if (!trx->stmt) {
            transaction = NULL;    // Safety
            error = create_txn(thd, trx);
            if (error) {
                goto cleanup;
            }
            // remember how many table locks existed when the stmt txn was
            // created, so the unlock path knows when to commit it
            trx->create_lock_count = trx->tokudb_lock_count;
        }
        transaction = trx->sub_sp_level;
        trx->tokudb_lock_count++;
    } else {
        // releasing the lock: fold this handler's row-count deltas into the
        // shared statistics
        share->update_row_count(thd, added_rows, deleted_rows, updated_rows);
        added_rows = 0;
        deleted_rows = 0;
        updated_rows = 0;
        share->rows_from_locked_table = 0;
        if (trx->tokudb_lock_count > 0) {
            if (--trx->tokudb_lock_count <= trx->create_lock_count) {
                trx->create_lock_count = 0;
                if (trx->stmt) {
                    /*
                      F_UNLCK is done without a transaction commit / rollback.
                      This happens if the thread didn't update any rows
                      We must in this case commit the work to keep the row locks
                    */
                    DBUG_PRINT("trans", ("commiting non-updating transaction"));
                    reset_stmt_progress(&trx->stmt_progress);
                    commit_txn(trx->stmt, 0);
                    trx->stmt = NULL;
                    trx->sub_sp_level = NULL;
                }
            }
            transaction = NULL;
        }
    }
cleanup:
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6543
6544 /*
6545 When using LOCK TABLE's external_lock is only called when the actual
6546 TABLE LOCK is done.
6547 Under LOCK TABLES, each used tables will force a call to start_stmt.
6548 */
int ha_tokudb::start_stmt(THD* thd, thr_lock_type lock_type) {
    TOKUDB_HANDLER_DBUG_ENTER(
        "cmd %d lock %d %s",
        thd_sql_command(thd),
        lock_type,
        share->full_table_name());

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());

    int error = 0;
    // lazily create the per-connection trx data on first use
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    if (!trx) {
        error = create_tokudb_trx_data_instance(&trx);
        if (error) { goto cleanup; }
        thd_set_ha_data(thd, tokudb_hton, trx);
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "trx %p %p %p %p %u %u",
        trx->all,
        trx->stmt,
        trx->sp_level,
        trx->sub_sp_level,
        trx->tokudb_lock_count,
        trx->create_lock_count);

    /*
      note that trx->stmt may have been already initialized as start_stmt()
      is called for *each table* not for each storage engine,
      and there could be many bdb tables referenced in the query
    */
    if (!trx->stmt) {
        error = create_txn(thd, trx);
        if (error) {
            goto cleanup;
        }
        // see external_lock(): used to decide when to commit the stmt txn
        trx->create_lock_count = trx->tokudb_lock_count;
    } else {
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "trx->stmt %p already existed",
            trx->stmt);
    }
    // expose the net row-count delta of this locked table to info()
    if (added_rows > deleted_rows) {
        share->rows_from_locked_table = added_rows - deleted_rows;
    }
    transaction = trx->sub_sp_level;
    trans_register_ha(thd, false, tokudb_hton, 0);
cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6601
6602
6603 uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd) {
6604 uint sql_command = thd_sql_command(thd);
6605 bool in_lock_tables = thd_in_lock_tables(thd);
6606
6607 //
6608 // following InnoDB's lead and having checksum command use a snapshot read if told
6609 //
6610 if (sql_command == SQLCOM_CHECKSUM) {
6611 return 0;
6612 }
6613 else if ((lock_type == TL_READ && in_lock_tables) ||
6614 (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
6615 sql_command != SQLCOM_SELECT ||
6616 (sql_command == SQLCOM_SELECT && lock_type >= TL_WRITE_ALLOW_WRITE)) { // select for update
6617 ulong tx_isolation = thd_tx_isolation(thd);
6618 // pattern matched from InnoDB
6619 if ( (tx_isolation == ISO_READ_COMMITTED || tx_isolation == ISO_READ_UNCOMMITTED) &&
6620 (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) &&
6621 (sql_command == SQLCOM_INSERT_SELECT
6622 || sql_command == SQLCOM_REPLACE_SELECT
6623 || sql_command == SQLCOM_UPDATE
6624 || sql_command == SQLCOM_CREATE_TABLE) )
6625 {
6626 return 0;
6627 }
6628 else {
6629 return DB_SERIALIZABLE;
6630 }
6631 }
6632 else {
6633 return 0;
6634 }
6635 }
6636
6637 /*
6638 The idea with handler::store_lock() is the following:
6639
6640 The statement decided which locks we should need for the table
6641 for updates/deletes/inserts we get WRITE locks, for SELECT... we get
6642 read locks.
6643
6644 Before adding the lock into the table lock handler (see thr_lock.c)
6645 mysqld calls store lock with the requested locks. Store lock can now
6646 modify a write lock to a read lock (or some other lock), ignore the
6647 lock (if we don't want to use MySQL table locks at all) or add locks
6648 for many tables (like we do when we are using a MERGE handler).
6649
  TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which
  signals that we are doing WRITES, but we are still allowing other
  readers and writers).
6653
6654 When releasing locks, store_lock() are also called. In this case one
6655 usually doesn't have to do anything.
6656
6657 In some exceptional cases MySQL may send a request for a TL_IGNORE;
6658 This means that we are requesting the same lock as last time and this
6659 should also be ignored. (This may happen when someone does a flush
6660 table when we have opened a part of the tables, in which case mysqld
6661 closes and reopens the tables and tries to get the same locks at last
6662 time). In the future we will probably try to remove this.
6663 */
6664
THR_LOCK_DATA* *ha_tokudb::store_lock(
    THD* thd,
    THR_LOCK_DATA** to,
    enum thr_lock_type lock_type) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "lock_type=%d cmd=%d",
        lock_type,
        thd_sql_command(thd));
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_LOCK,
        "lock_type=%d cmd=%d",
        lock_type,
        thd_sql_command(thd));

    // TL_IGNORE means "same lock as last time"; TL_UNLOCK on lock.type
    // means we are acquiring (not releasing) — only then may we downgrade
    if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
        enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd);
        if (!thd->in_lock_tables) {
            if (sql_command == SQLCOM_CREATE_INDEX &&
                tokudb::sysvars::create_index_online(thd)) {
                // hot indexing
                // only downgrade if no index build is already in flight
                // (num_DBs would exceed the expected count otherwise)
                rwlock_t_lock_read(share->_num_DBs_lock);
                if (share->num_DBs ==
                    (table->s->keys + tokudb_test(hidden_primary_key))) {
                    lock_type = TL_WRITE_ALLOW_WRITE;
                }
                share->_num_DBs_lock.unlock();
            } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
                        lock_type <= TL_WRITE) &&
                        sql_command != SQLCOM_TRUNCATE &&
                        !thd_tablespace_op(thd)) {
                // allow concurrent writes
                lock_type = TL_WRITE_ALLOW_WRITE;
            } else if (sql_command == SQLCOM_OPTIMIZE &&
                       lock_type == TL_READ_NO_INSERT) {
                // hot optimize table
                lock_type = TL_READ;
            }
        }
        lock.type = lock_type;
    }
    *to++ = &lock;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_LOCK,
        "lock_type=%d",
        lock_type);
    TOKUDB_HANDLER_DBUG_RETURN_PTR(to);
}
6713
6714 static toku_compression_method get_compression_method(DB* file) {
6715 enum toku_compression_method method;
6716 int r = file->get_compression_method(file, &method);
6717 assert_always(r == 0);
6718 return method;
6719 }
6720
6721 #if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
6722 TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6723 enum row_type ha_tokudb::get_row_type() const {
6724 toku_compression_method compression_method = get_compression_method(share->file);
6725 return toku_compression_method_to_row_type(compression_method);
6726 }
6727 #endif // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
6728 // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6729
// Creates and opens one fractal-tree dictionary, applies the tuning
// parameters (page sizes, fanout, compression), installs the row
// descriptor, then closes the handle.  A zero block_size/read_block_size/
// fanout means "leave the engine default".  Returns 0 or an error code;
// my_errno is only set for the initial db_create failure.
static int create_sub_table(
    const char* table_name,
    DBT* row_descriptor,
    DB_TXN* txn,
    uint32_t block_size,
    uint32_t read_block_size,
    toku_compression_method compression_method,
    bool is_hot_index,
    uint32_t fanout) {

    TOKUDB_DBUG_ENTER("");
    int error;
    DB *file = NULL;
    uint32_t create_flags;


    error = db_create(&file, db_env, 0);
    if (error) {
        DBUG_PRINT("error", ("Got error: %d when creating table", error));
        my_errno = error;
        goto exit;
    }


    if (block_size != 0) {
        error = file->set_pagesize(file, block_size);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting block size %u for table '%s'",
                    error,
                    block_size,
                    table_name));
            goto exit;
        }
    }
    if (read_block_size != 0) {
        error = file->set_readpagesize(file, read_block_size);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting read block size %u for table '%s'",
                    error,
                    read_block_size,
                    table_name));
            goto exit;
        }
    }
    if (fanout != 0) {
        error = file->set_fanout(file, fanout);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting fanout %u for table '%s'",
                    error,
                    fanout,
                    table_name));
            goto exit;
        }
    }
    error = file->set_compression_method(file, compression_method);
    if (error != 0) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when setting compression type %u for table '%s'",
                error,
                compression_method,
                table_name));
        goto exit;
    }

    // DB_EXCL: creation must fail if the dictionary already exists
    create_flags =
        DB_THREAD | DB_CREATE | DB_EXCL | (is_hot_index ? DB_IS_HOT_INDEX : 0);
    error =
        file->open(
            file,
            txn,
            table_name,
            NULL,
            DB_BTREE,
            create_flags,
            my_umask);
    if (error) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when opening table '%s'", error, table_name));
        goto exit;
    }

    // attach the key-comparison/packing descriptor to the new dictionary
    error =
        file->change_descriptor(
            file,
            txn,
            row_descriptor,
            (is_hot_index ? DB_IS_HOT_INDEX |
                DB_UPDATE_CMP_DESCRIPTOR :
                DB_UPDATE_CMP_DESCRIPTOR));
    if (error) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when setting row descriptor for table '%s'",
                error,
                table_name));
        goto exit;
    }

    error = 0;
exit:
    // always close the handle; callers reopen the dictionary as needed
    if (file) {
        int r = file->close(file, 0);
        assert_always(r==0);
    }
    TOKUDB_DBUG_RETURN(error);
}
6844
// Fills create_info with values the user did not specify explicitly:
// the next auto-increment value and (when compiled in) the row format
// that corresponds to this table's compression method.
void ha_tokudb::update_create_info(HA_CREATE_INFO* create_info) {
    if (share->has_auto_inc) {
        info(HA_STATUS_AUTO);   // refresh stats.auto_increment_value
        if (!(create_info->used_fields & HA_CREATE_USED_AUTO) ||
            create_info->auto_increment_value < stats.auto_increment_value) {
            create_info->auto_increment_value = stats.auto_increment_value;
        }
    }
#if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
    TOKU_INCLUDE_ROW_TYPE_COMPRESSION
    if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
        // show create table asks us to update this create_info, this makes it
        // so we'll always show what compression type we're using
        create_info->row_type = get_row_type();
        // optionally hide the default (zlib) row format from SHOW CREATE
        if (create_info->row_type == ROW_TYPE_TOKU_ZLIB &&
            tokudb::sysvars::hide_default_row_format(ha_thd()) != 0) {
            create_info->row_type = ROW_TYPE_DEFAULT;
        }
    }
#endif  // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
        // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
}
6867
6868 //
6869 // removes key name from status.tokudb.
6870 // needed for when we are dropping indexes, so that
6871 // during drop table, we do not attempt to remove already dropped
6872 // indexes because we did not keep status.tokudb in sync with list of indexes.
6873 //
6874 int ha_tokudb::remove_key_name_from_status(DB* status_block, const char* key_name, DB_TXN* txn) {
6875 int error;
6876 uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6877 HA_METADATA_KEY md_key = hatoku_key_name;
6878 memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6879 //
6880 // put index name in status.tokudb
6881 //
6882 memcpy(
6883 status_key_info + sizeof(HA_METADATA_KEY),
6884 key_name,
6885 strlen(key_name) + 1
6886 );
6887 error = remove_metadata(
6888 status_block,
6889 status_key_info,
6890 sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6891 txn
6892 );
6893 return error;
6894 }
6895
6896 //
6897 // writes the key name in status.tokudb, so that we may later delete or rename
6898 // the dictionary associated with key_name
6899 //
6900 int ha_tokudb::write_key_name_to_status(DB* status_block, const char* key_name,
6901 DB_TXN* txn) {
6902 int error;
6903 uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6904 HA_METADATA_KEY md_key = hatoku_key_name;
6905 memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6906 //
6907 // put index name in status.tokudb
6908 //
6909 memcpy(
6910 status_key_info + sizeof(HA_METADATA_KEY),
6911 key_name,
6912 strlen(key_name) + 1
6913 );
6914 error = write_metadata(
6915 status_block,
6916 status_key_info,
6917 sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6918 NULL,
6919 0,
6920 txn
6921 );
6922 return error;
6923 }
6924
6925 //
6926 // some tracing moved out of ha_tokudb::create, because ::create was
6927 // getting cluttered
6928 //
6929 void ha_tokudb::trace_create_table_info(TABLE* form) {
6930 uint i;
6931 //
6932 // tracing information about what type of table we are creating
6933 //
6934 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_OPEN))) {
6935 for (i = 0; i < form->s->fields; i++) {
6936 Field *field = form->s->field[i];
6937 TOKUDB_HANDLER_TRACE(
6938 "field:%d:%s:type=%d:flags=%x",
6939 i,
6940 field->field_name.str,
6941 field->type(),
6942 field->flags);
6943 }
6944 for (i = 0; i < form->s->keys; i++) {
6945 KEY *key = &form->key_info[i];
6946 TOKUDB_HANDLER_TRACE(
6947 "key:%d:%s:%d",
6948 i,
6949 key->name.str,
6950 key->user_defined_key_parts);
6951 uint p;
6952 for (p = 0; p < key->user_defined_key_parts; p++) {
6953 KEY_PART_INFO* key_part = &key->key_part[p];
6954 Field* field = key_part->field;
6955 TOKUDB_HANDLER_TRACE(
6956 "key:%d:%d:length=%d:%s:type=%d:flags=%x",
6957 i,
6958 p,
6959 key_part->length,
6960 field->field_name.str,
6961 field->type(),
6962 field->flags);
6963 }
6964 }
6965 }
6966 }
6967
6968 static uint32_t get_max_desc_size(KEY_AND_COL_INFO* kc_info, TABLE* form) {
6969 uint32_t max_row_desc_buff_size;
6970 // upper bound of key comparison descriptor
6971 max_row_desc_buff_size = 2*(form->s->fields * 6)+10;
6972 // upper bound for sec. key part
6973 max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info);
6974 // upper bound for clustering val part
6975 max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s);
6976 return max_row_desc_buff_size;
6977 }
6978
6979 static uint32_t create_secondary_key_descriptor(
6980 uchar* buf,
6981 KEY* key_info,
6982 KEY* prim_key,
6983 uint hpk,
6984 TABLE* form,
6985 uint primary_key,
6986 uint32_t keynr,
6987 KEY_AND_COL_INFO* kc_info) {
6988
6989 uchar* ptr = NULL;
6990
6991 ptr = buf;
6992 ptr += create_toku_key_descriptor(
6993 ptr,
6994 false,
6995 key_info,
6996 hpk,
6997 prim_key
6998 );
6999
7000 ptr += create_toku_secondary_key_pack_descriptor(
7001 ptr,
7002 hpk,
7003 primary_key,
7004 form->s,
7005 form,
7006 kc_info,
7007 key_info,
7008 prim_key
7009 );
7010
7011 ptr += create_toku_clustering_val_pack_descriptor(
7012 ptr,
7013 primary_key,
7014 form->s,
7015 kc_info,
7016 keynr,
7017 key_is_clustering(key_info)
7018 );
7019 return ptr - buf;
7020 }
7021
7022
7023 //
7024 // creates dictionary for secondary index, with key description key_info, all using txn
7025 //
int ha_tokudb::create_secondary_dictionary(
    const char* name,
    TABLE* form,
    KEY* key_info,
    DB_TXN* txn,
    KEY_AND_COL_INFO* kc_info,
    uint32_t keynr,
    bool is_hot_index,
    toku_compression_method compression_method) {

    int error;
    DBT row_descriptor;
    uchar* row_desc_buff = NULL;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY* prim_key = NULL;
    char dict_name[MAX_DICT_NAME_LEN];
    uint32_t max_row_desc_buff_size;
    // hpk doubles as both a flag and the length of the hidden primary key
    uint hpk= (form->s->primary_key >= MAX_KEY) ?
        TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    uint32_t block_size;
    uint32_t read_block_size;
    uint32_t fanout;
    THD* thd = ha_thd();

    memset(&row_descriptor, 0, sizeof(row_descriptor));

    max_row_desc_buff_size = get_max_desc_size(kc_info,form);

    row_desc_buff = (uchar*)tokudb::memory::malloc(
        max_row_desc_buff_size,
        MYF(MY_WME));
    if (row_desc_buff == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    // dictionary is named "key-<index name>"
    // NOTE(review): assumes "key-" + the index name always fits in
    // MAX_DICT_NAME_LEN — confirm against MySQL's identifier length limit
    sprintf(dict_name, "key-%s", key_info->name.str);
    make_name(newname, newname_len, name, dict_name);

    prim_key = (hpk) ? NULL : &form->key_info[primary_key];

    //
    // setup the row descriptor
    //
    row_descriptor.data = row_desc_buff;
    //
    // save data necessary for key comparisons
    //
    row_descriptor.size = create_secondary_key_descriptor(
        row_desc_buff,
        key_info,
        prim_key,
        hpk,
        form,
        primary_key,
        keynr,
        kc_info);
    assert_always(row_descriptor.size <= max_row_desc_buff_size);

    block_size = tokudb::sysvars::block_size(thd);
    read_block_size = tokudb::sysvars::read_block_size(thd);
    fanout = tokudb::sysvars::fanout(thd);

    error = create_sub_table(
        newname,
        &row_descriptor,
        txn,
        block_size,
        read_block_size,
        compression_method,
        is_hot_index,
        fanout);
cleanup:
    tokudb::memory::free(newname);
    tokudb::memory::free(row_desc_buff);
    return error;
}
7111
7112
7113 static uint32_t create_main_key_descriptor(
7114 uchar* buf,
7115 KEY* prim_key,
7116 uint hpk,
7117 uint primary_key,
7118 TABLE* form,
7119 KEY_AND_COL_INFO* kc_info) {
7120
7121 uchar* ptr = buf;
7122 ptr += create_toku_key_descriptor(
7123 ptr,
7124 hpk,
7125 prim_key,
7126 false,
7127 NULL);
7128
7129 ptr += create_toku_main_key_pack_descriptor(ptr);
7130
7131 ptr += create_toku_clustering_val_pack_descriptor(
7132 ptr,
7133 primary_key,
7134 form->s,
7135 kc_info,
7136 primary_key,
7137 false);
7138 return ptr - buf;
7139 }
7140
7141 //
7142 // create and close the main dictionarr with name of "name" using table form, all within
7143 // transaction txn.
7144 //
int ha_tokudb::create_main_dictionary(
    const char* name,
    TABLE* form,
    DB_TXN* txn,
    KEY_AND_COL_INFO* kc_info,
    toku_compression_method compression_method) {

    int error;
    DBT row_descriptor;
    uchar* row_desc_buff = NULL;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY* prim_key = NULL;
    uint32_t max_row_desc_buff_size;
    // hpk doubles as both a flag and the length of the hidden primary key
    uint hpk = (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    uint32_t block_size;
    uint32_t read_block_size;
    uint32_t fanout;
    THD* thd = ha_thd();

    memset(&row_descriptor, 0, sizeof(row_descriptor));
    max_row_desc_buff_size = get_max_desc_size(kc_info, form);

    row_desc_buff = (uchar*)tokudb::memory::malloc(
        max_row_desc_buff_size,
        MYF(MY_WME));
    if (row_desc_buff == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    // the primary dictionary is always named "main"
    make_name(newname, newname_len, name, "main");

    prim_key = (hpk) ? NULL : &form->key_info[primary_key];

    //
    // setup the row descriptor
    //
    row_descriptor.data = row_desc_buff;
    //
    // save data necessary for key comparisons
    //
    row_descriptor.size = create_main_key_descriptor(
        row_desc_buff,
        prim_key,
        hpk,
        primary_key,
        form,
        kc_info);
    assert_always(row_descriptor.size <= max_row_desc_buff_size);

    block_size = tokudb::sysvars::block_size(thd);
    read_block_size = tokudb::sysvars::read_block_size(thd);
    fanout = tokudb::sysvars::fanout(thd);

    /* Create the main table that will hold the real rows */
    error = create_sub_table(
        newname,
        &row_descriptor,
        txn,
        block_size,
        read_block_size,
        compression_method,
        false,
        fanout);
cleanup:
    tokudb::memory::free(newname);
    tokudb::memory::free(row_desc_buff);
    return error;
}
7222
7223 //
7224 // Creates a new table
7225 // Parameters:
7226 // [in] name - table name
7227 // [in] form - info on table, columns and indexes
7228 // [in] create_info - more info on table, CURRENTLY UNUSED
7229 // Returns:
7230 // 0 on success
7231 // error otherwise
7232 //
7233 int ha_tokudb::create(
7234 const char* name,
7235 TABLE* form,
7236 HA_CREATE_INFO* create_info) {
7237
7238 TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7239
7240 int error;
7241 DB *status_block = NULL;
7242 uint version;
7243 uint capabilities;
7244 DB_TXN* txn = NULL;
7245 bool do_commit = false;
7246 char* newname = NULL;
7247 size_t newname_len = 0;
7248 KEY_AND_COL_INFO kc_info;
7249 tokudb_trx_data *trx = NULL;
7250 THD* thd = ha_thd();
7251
7252 String database_name, table_name, dictionary_name;
7253 tokudb_split_dname(name, database_name, table_name, dictionary_name);
7254 if (database_name.is_empty() || table_name.is_empty()) {
7255 push_warning_printf(thd,
7256 Sql_condition::WARN_LEVEL_WARN,
7257 ER_TABLE_NAME,
7258 "TokuDB: Table Name or Database Name is empty");
7259 DBUG_RETURN(ER_TABLE_NAME);
7260 }
7261
7262 memset(&kc_info, 0, sizeof(kc_info));
7263
7264 #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100999
7265 // TokuDB does not support discover_table_names() and writes no files
7266 // in the database directory, so automatic filename-based
7267 // discover_table_names() doesn't work either. So, it must force .frm
7268 // file to disk.
7269 error= form->s->write_frm_image();
7270 #endif
7271
7272 #if defined(TOKU_INCLUDE_OPTION_STRUCTS) && TOKU_INCLUDE_OPTION_STRUCTS
7273 const tokudb::sysvars::row_format_t row_format =
7274 (tokudb::sysvars::row_format_t)form->s->option_struct->row_format;
7275 #else
7276 // TDB-76 : CREATE TABLE ... LIKE ... does not use source row_format on
7277 // target table
7278 // Original code would only use create_info->row_type if
7279 // create_info->used_fields & HA_CREATE_USED_ROW_FORMAT was true. This
7280 // would cause us to skip transferring the row_format for a table created
7281 // via CREATE TABLE tn LIKE tn. We also take on more InnoDB like behavior
7282 // and throw a warning if we get a row_format that we can't translate into
7283 // a known TokuDB row_format.
7284 tokudb::sysvars::row_format_t row_format =
7285 tokudb::sysvars::row_format(thd);
7286
7287 if ((create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) ||
7288 create_info->row_type != ROW_TYPE_DEFAULT) {
7289 row_format = row_type_to_row_format(create_info->row_type);
7290 if (row_format == tokudb::sysvars::SRV_ROW_FORMAT_DEFAULT &&
7291 create_info->row_type != ROW_TYPE_DEFAULT) {
7292 push_warning(thd,
7293 Sql_condition::WARN_LEVEL_WARN,
7294 ER_ILLEGAL_HA_CREATE_OPTION,
7295 "TokuDB: invalid ROW_FORMAT specifier.");
7296 }
7297 }
7298 #endif // defined(TOKU_INCLUDE_OPTION_STRUCTS) && TOKU_INCLUDE_OPTION_STRUCTS
7299 const toku_compression_method compression_method =
7300 row_format_to_toku_compression_method(row_format);
7301 bool create_from_engine = (create_info->table_options & HA_OPTION_CREATE_FROM_ENGINE);
7302 if (error) { goto cleanup; }
7303 if (create_from_engine) {
7304 // table already exists, nothing to do
7305 error = 0;
7306 goto cleanup;
7307 }
7308
7309 // validate the fields in the table. If the table has fields
7310 // we do not support that came from an old version of MySQL,
7311 // gracefully return an error
7312 for (uint32_t i = 0; i < form->s->fields; i++) {
7313 Field* field = table_share->field[i];
7314 if (!field_valid_for_tokudb_table(field)) {
7315 sql_print_error("Table %s has an invalid field %s, that was created "
7316 "with an old version of MySQL. This field is no longer supported. "
7317 "This is probably due to an alter table engine=TokuDB. To load this "
7318 "table, do a dump and load",
7319 name,
7320 field->field_name.str
7321 );
7322 error = HA_ERR_UNSUPPORTED;
7323 goto cleanup;
7324 }
7325 }
7326
7327 newname_len = get_max_dict_name_path_length(name);
7328 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7329 if (newname == NULL) {
7330 error = ENOMEM;
7331 goto cleanup;
7332 }
7333
7334 trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
7335 if (trx && trx->sub_sp_level &&
7336 thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
7337 txn = trx->sub_sp_level;
7338 } else {
7339 do_commit = true;
7340 error = txn_begin(db_env, 0, &txn, 0, thd);
7341 if (error) {
7342 goto cleanup;
7343 }
7344 }
7345
7346 primary_key = form->s->primary_key;
7347 hidden_primary_key = (primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7348 if (hidden_primary_key) {
7349 primary_key = form->s->keys;
7350 }
7351
7352 /* do some tracing */
7353 trace_create_table_info(form);
7354
7355 /* Create status.tokudb and save relevant metadata */
7356 make_name(newname, newname_len, name, "status");
7357
7358 error = tokudb::metadata::create(db_env, &status_block, newname, txn);
7359 if (error) { goto cleanup; }
7360
7361 version = HA_TOKU_VERSION;
7362 error = write_to_status(
7363 status_block,
7364 hatoku_new_version,
7365 &version,
7366 sizeof(version),
7367 txn);
7368 if (error) {
7369 goto cleanup;
7370 }
7371
7372 capabilities = HA_TOKU_CAP;
7373 error = write_to_status(
7374 status_block,
7375 hatoku_capabilities,
7376 &capabilities,
7377 sizeof(capabilities),
7378 txn);
7379 if (error) {
7380 goto cleanup;
7381 }
7382
7383 error = write_auto_inc_create(
7384 status_block,
7385 create_info->auto_increment_value,
7386 txn);
7387 if (error) {
7388 goto cleanup;
7389 }
7390
7391 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
7392 #if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
7393 if (TOKU_PARTITION_WRITE_FRM_DATA || form->part_info == NULL) {
7394 error = write_frm_data(status_block, txn, form->s->path.str);
7395 if (error) {
7396 goto cleanup;
7397 }
7398 }
7399 #else
7400 error = write_frm_data(status_block, txn, form->s->path.str);
7401 if (error) {
7402 goto cleanup;
7403 }
7404 #endif // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
7405 #endif // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
7406
7407 error = allocate_key_and_col_info(form->s, &kc_info);
7408 if (error) {
7409 goto cleanup;
7410 }
7411
7412 error = initialize_key_and_col_info(
7413 form->s,
7414 form,
7415 &kc_info,
7416 hidden_primary_key,
7417 primary_key);
7418 if (error) {
7419 goto cleanup;
7420 }
7421
7422 error = create_main_dictionary(
7423 name,
7424 form,
7425 txn,
7426 &kc_info,
7427 compression_method);
7428 if (error) {
7429 goto cleanup;
7430 }
7431
7432
7433 for (uint i = 0; i < form->s->keys; i++) {
7434 if (i != primary_key) {
7435 error = create_secondary_dictionary(
7436 name,
7437 form,
7438 &form->key_info[i],
7439 txn,
7440 &kc_info,
7441 i,
7442 false,
7443 compression_method);
7444 if (error) {
7445 goto cleanup;
7446 }
7447
7448 error = write_key_name_to_status(
7449 status_block,
7450 form->key_info[i].name.str,
7451 txn);
7452 if (error) {
7453 goto cleanup;
7454 }
7455 }
7456 }
7457
7458 error = 0;
7459 cleanup:
7460 if (status_block != NULL) {
7461 int r = tokudb::metadata::close(&status_block);
7462 assert_always(r==0);
7463 }
7464 free_key_and_col_info(&kc_info);
7465 if (do_commit && txn) {
7466 if (error) {
7467 abort_txn(txn);
7468 } else {
7469 commit_txn(txn,0);
7470 }
7471 }
7472 tokudb::memory::free(newname);
7473 TOKUDB_HANDLER_DBUG_RETURN(error);
7474 }
7475
7476 int ha_tokudb::discard_or_import_tablespace(TOKUDB_UNUSED(my_bool discard)) {
7477 /*
7478 if (discard) {
7479 my_errno=HA_ERR_WRONG_COMMAND;
7480 return my_errno;
7481 }
7482 return add_table_to_metadata(share->table_name);
7483 */
7484 my_errno=HA_ERR_WRONG_COMMAND;
7485 return my_errno;
7486 }
7487
7488
7489 //
7490 // deletes from_name or renames from_name to to_name, all using transaction txn.
7491 // is_delete specifies which we are doing
7492 // is_key specifies if it is a secondary index (and hence a "key-" needs to be prepended) or
7493 // if it is not a secondary index
7494 //
7495 int ha_tokudb::delete_or_rename_dictionary(
7496 const char* from_name,
7497 const char* to_name,
7498 const char* secondary_name,
7499 bool is_key,
7500 DB_TXN* txn,
7501 bool is_delete) {
7502
7503 int error;
7504 char dict_name[MAX_DICT_NAME_LEN];
7505 char* new_from_name = NULL;
7506 size_t new_from_name_len = 0;
7507 char* new_to_name = NULL;
7508 size_t new_to_name_len = 0;
7509 assert_always(txn);
7510
7511 new_from_name_len = get_max_dict_name_path_length(from_name);
7512 new_from_name = (char*)tokudb::memory::malloc(
7513 new_from_name_len,
7514 MYF(MY_WME));
7515 if (new_from_name == NULL) {
7516 error = ENOMEM;
7517 goto cleanup;
7518 }
7519 if (!is_delete) {
7520 assert_always(to_name);
7521 new_to_name_len = get_max_dict_name_path_length(to_name);
7522 new_to_name = (char*)tokudb::memory::malloc(
7523 new_to_name_len,
7524 MYF(MY_WME));
7525 if (new_to_name == NULL) {
7526 error = ENOMEM;
7527 goto cleanup;
7528 }
7529 }
7530
7531 if (is_key) {
7532 sprintf(dict_name, "key-%s", secondary_name);
7533 make_name(new_from_name, new_from_name_len, from_name, dict_name);
7534 } else {
7535 make_name(new_from_name, new_from_name_len, from_name, secondary_name);
7536 }
7537 if (!is_delete) {
7538 if (is_key) {
7539 sprintf(dict_name, "key-%s", secondary_name);
7540 make_name(new_to_name, new_to_name_len, to_name, dict_name);
7541 } else {
7542 make_name(new_to_name, new_to_name_len, to_name, secondary_name);
7543 }
7544 }
7545
7546 if (is_delete) {
7547 error = db_env->dbremove(db_env, txn, new_from_name, NULL, 0);
7548 } else {
7549 error = db_env->dbrename(
7550 db_env,
7551 txn,
7552 new_from_name,
7553 NULL,
7554 new_to_name,
7555 0);
7556 }
7557 if (error) {
7558 goto cleanup;
7559 }
7560
7561 cleanup:
7562 tokudb::memory::free(new_from_name);
7563 tokudb::memory::free(new_to_name);
7564 return error;
7565 }
7566
7567
7568 //
7569 // deletes or renames a table. if is_delete is true, then we delete, and to_name can be NULL
7570 // if is_delete is false, then to_name must be non-NULL, as we are renaming the table.
7571 //
int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete) {
    THD *thd = ha_thd();
    int error;
    DB* status_db = NULL;
    DBC* status_cursor = NULL;
    DB_TXN* txn = NULL;
    DBT curr_key;
    DBT curr_val;
    memset(&curr_key, 0, sizeof(curr_key));
    memset(&curr_val, 0, sizeof(curr_val));

    // run inside the statement's transaction when this is part of a
    // CREATE TABLE; otherwise txn_begin below starts a top-level txn
    DB_TXN *parent_txn = NULL;
    tokudb_trx_data *trx = NULL;
    trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
        parent_txn = trx->sub_sp_level;
    }

    error = txn_begin(db_env, parent_txn, &txn, 0, thd);
    if (error) { goto cleanup; }

    //
    // open status db,
    // create cursor,
    // for each name read out of there, create a db and delete or rename it
    //
    error = open_status_dictionary(&status_db, from_name, txn);
    if (error) { goto cleanup; }

    error = status_db->cursor(status_db, txn, &status_cursor, 0);
    if (error) { goto cleanup; }
    // allow KILL of the session to interrupt the scan
    status_cursor->c_set_check_interrupt_callback(status_cursor, tokudb_killed_thd_callback, thd);

    // iterate the status dictionary; every hatoku_key_name entry names a
    // secondary-index dictionary that must be removed/renamed as well
    while (error != DB_NOTFOUND) {
        error = status_cursor->c_get(status_cursor, &curr_key, &curr_val, DB_NEXT);
        if (error && error != DB_NOTFOUND) {
            error = map_to_handler_error(error);
            goto cleanup;
        }
        if (error == DB_NOTFOUND) {
            break;
        }
        HA_METADATA_KEY mk = *(HA_METADATA_KEY *)curr_key.data;
        if (mk != hatoku_key_name) {
            continue;
        }
        // the key name follows the HA_METADATA_KEY tag in the key bytes
        error = delete_or_rename_dictionary(from_name, to_name, (char *)((char *)curr_key.data + sizeof(HA_METADATA_KEY)), true, txn, is_delete);
        if (error) { goto cleanup; }
    }

    //
    // delete or rename main.tokudb
    //
    error = delete_or_rename_dictionary(from_name, to_name, "main", false, txn, is_delete);
    if (error) { goto cleanup; }

    // close cursor and db before touching status.tokudb itself
    error = status_cursor->c_close(status_cursor);
    assert_always(error==0);
    status_cursor = NULL;
    if (error) { goto cleanup; }

    error = status_db->close(status_db, 0);
    assert_always(error == 0);
    status_db = NULL;

    //
    // delete or rename status.tokudb
    //
    error = delete_or_rename_dictionary(from_name, to_name, "status", false, txn, is_delete);
    if (error) { goto cleanup; }

    my_errno = error;
cleanup:
    if (status_cursor) {
        int r = status_cursor->c_close(status_cursor);
        assert_always(r==0);
    }
    if (status_db) {
        int r = status_db->close(status_db, 0);
        assert_always(r==0);
    }
    // commit on success so the whole multi-dictionary operation is atomic;
    // abort rolls back any partial deletes/renames
    if (txn) {
        if (error) {
            abort_txn(txn);
        }
        else {
            commit_txn(txn, 0);
        }
    }
    return error;
}
7663
7664
7665 //
7666 // Drops table
7667 // Parameters:
7668 // [in] name - name of table to be deleted
7669 // Returns:
7670 // 0 on success
7671 // error otherwise
7672 //
7673 int ha_tokudb::delete_table(const char *name) {
7674 TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7675 TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(name, NULL, false);
7676 if (share) {
7677 share->unlock();
7678 share->release();
7679 // this should be enough to handle locking as the higher level MDL
7680 // on this table should prevent any new analyze tasks.
7681 share->cancel_background_jobs();
7682 TOKUDB_SHARE::drop_share(share);
7683 }
7684
7685 int error;
7686 error = delete_or_rename_table(name, NULL, true);
7687 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7688 error == DB_LOCK_NOTGRANTED) {
7689 sql_print_error(
7690 "Could not delete table %s because another transaction has "
7691 "accessed the table. To drop the table, make sure no "
7692 "transactions touch the table.",
7693 name);
7694 }
7695 TOKUDB_HANDLER_DBUG_RETURN(error);
7696 }
7697
7698 static bool tokudb_check_db_dir_exist_from_table_name(const char *table_name) {
7699 DBUG_ASSERT(table_name);
7700 bool mysql_dir_exists;
7701 char db_name[FN_REFLEN];
7702 const char *db_name_begin = strchr(table_name, FN_LIBCHAR);
7703 const char *db_name_end = strrchr(table_name, FN_LIBCHAR);
7704 DBUG_ASSERT(db_name_begin);
7705 DBUG_ASSERT(db_name_end);
7706 DBUG_ASSERT(db_name_begin != db_name_end);
7707
7708 ++db_name_begin;
7709 size_t db_name_size = db_name_end - db_name_begin;
7710
7711 DBUG_ASSERT(db_name_size < FN_REFLEN);
7712
7713 memcpy(db_name, db_name_begin, db_name_size);
7714 db_name[db_name_size] = '\0';
7715
7716 // At this point, db_name contains the MySQL formatted database name.
7717 // This is exactly the same format that would come into us through a
7718 // CREATE TABLE. Some charaters (like ':' for example) might be expanded
7719 // into hex (':' would papear as "@003a").
7720 // We need to check that the MySQL destination database directory exists.
7721 mysql_dir_exists = (my_access(db_name, F_OK) == 0);
7722
7723 return mysql_dir_exists;
7724 }
7725
7726 //
7727 // renames table from "from" to "to"
7728 // Parameters:
7729 // [in] name - old name of table
7730 // [in] to - new name of table
7731 // Returns:
7732 // 0 on success
7733 // error otherwise
7734 //
int ha_tokudb::rename_table(const char *from, const char *to) {
    TOKUDB_HANDLER_DBUG_ENTER("%s %s", from, to);
    // Tear down any cached share for the source table before renaming.
    TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(from, NULL, false);
    if (share) {
        share->unlock();
        share->release();
        // this should be enough to handle locking as the higher level MDL
        // on this table should prevent any new analyze tasks.
        share->cancel_background_jobs();
        TOKUDB_SHARE::drop_share(share);
    }
    int error;
    // the destination database directory must already exist on disk
    bool to_db_dir_exist = tokudb_check_db_dir_exist_from_table_name(to);
    if (!to_db_dir_exist) {
        sql_print_error(
            "Could not rename table from %s to %s because "
            "destination db does not exist",
            from,
            to);
#ifndef __WIN__
        /* Small hack. tokudb_check_db_dir_exist_from_table_name calls
         * my_access, which sets my_errno on Windows, but doesn't on
         * unix. Set it for unix too.
         */
        my_errno= errno;
#endif
        error= my_errno;
    }
    else {
        // rename all dictionaries (main, status, per-key) in one transaction
        error = delete_or_rename_table(from, to, false);
        if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
            error == DB_LOCK_NOTGRANTED) {
            sql_print_error(
                "Could not rename table from %s to %s because another transaction "
                "has accessed the table. To rename the table, make sure no "
                "transactions touch the table.",
                from,
                to);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
7777
7778
7779 /*
7780 Returns estimate on number of seeks it will take to read through the table
7781 This is to be comparable to the number returned by records_in_range so
7782 that we can decide if we should scan the table or use keys.
7783 */
7784 /// QQQ why divide by 3
7785 double ha_tokudb::scan_time() {
7786 TOKUDB_HANDLER_DBUG_ENTER("");
7787 double ret_val = (double)stats.records / 3;
7788 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7789 TOKUDB_DEBUG_RETURN,
7790 "return %" PRIu64 " %f",
7791 (uint64_t)stats.records,
7792 ret_val);
7793 DBUG_RETURN(ret_val);
7794 }
7795
7796 double ha_tokudb::keyread_time(uint index, uint ranges, ha_rows rows)
7797 {
7798 TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7799 double cost;
7800 if (index == primary_key || is_clustering_key(index)) {
7801 cost = read_time(index, ranges, rows);
7802 DBUG_RETURN(cost);
7803 }
7804 /*
7805 It is assumed that we will read trough the whole key range and that all
7806 key blocks are half full (normally things are much better). It is also
7807 assumed that each time we read the next key from the index, the handler
7808 performs a random seek, thus the cost is proportional to the number of
7809 blocks read. This model does not take into account clustered indexes -
7810 engines that support that (e.g. InnoDB) may want to overwrite this method.
7811 */
7812 cost= handler::keyread_time(index, ranges, rows);
7813 TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(cost);
7814 }
7815
7816 //
7817 // Calculate the time it takes to read a set of ranges through an index
7818 // This enables us to optimize reads for clustered indexes.
7819 // Implementation pulled from InnoDB
7820 // Parameters:
7821 // index - index to use
7822 // ranges - number of ranges
7823 // rows - estimated number of rows in the range
7824 // Returns:
7825 // estimated time measured in disk seeks
7826 //
double ha_tokudb::read_time(
    uint index,
    uint ranges,
    ha_rows rows
    )
{
    TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
    double total_scan;
    double ret_val;
    bool is_primary = (index == primary_key);
    bool is_clustering;

    //
    // in case for hidden primary key, this is called
    //
    if (index >= table_share->keys) {
        ret_val = handler::read_time(index, ranges, rows);
        goto cleanup;
    }

    is_clustering = key_is_clustering(&table->key_info[index]);


    //
    // if it is not the primary key, and it is not a clustering key, then return handler::read_time
    //
    if (!(is_primary || is_clustering)) {
        ret_val = handler::read_time(index, ranges, rows);
        goto cleanup;
    }

    //
    // for primary key and for clustered keys, return a fraction of scan_time()
    //
    total_scan = scan_time();

    // if the range covers (at least) the whole table, charge a full scan;
    // the tiny epsilon slightly penalizes clustering keys vs. the PK
    if (stats.records <= rows) {
        ret_val = is_clustering ? total_scan + 0.00001 : total_scan;
        goto cleanup;
    }

    //
    // one disk seek per range plus the proportional scan time of the rows
    //
    ret_val = (ranges + (double) rows / (double) stats.records * total_scan);
    ret_val = is_clustering ? ret_val + 0.00001 : ret_val;

cleanup:
    TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
}
7877
7878 double ha_tokudb::index_only_read_time(uint keynr, double records) {
7879 TOKUDB_HANDLER_DBUG_ENTER("%u %f", keynr, records);
7880 double ret_val = keyread_time(keynr, 1, (ha_rows)records);
7881 TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7882 }
7883
7884 //
7885 // Estimates the number of index records in a range. In case of errors, return
7886 // HA_TOKUDB_RANGE_COUNT instead of HA_POS_ERROR. This was behavior
7887 // when we got the handlerton from MySQL.
7888 // Parameters:
7889 // keynr -index to use
7890 // [in] start_key - low end of the range
7891 // [in] end_key - high end of the range
7892 // Returns:
7893 // 0 - There are no matching keys in the given range
7894 // number > 0 - There are approximately number matching rows in the range
7895 // HA_POS_ERROR - Something is wrong with the index tree
7896 //
ha_rows ha_tokudb::records_in_range(uint keynr, const key_range* start_key,
                                    const key_range* end_key,
                                    page_range *pages) {
    TOKUDB_HANDLER_DBUG_ENTER("%d %p %p", keynr, start_key, end_key);
    DBT *pleft_key, *pright_key;
    DBT left_key, right_key;
    ha_rows ret_val = HA_TOKUDB_RANGE_COUNT;
    DB *kfile = share->key_file[keynr];
    uint64_t rows = 0;
    int error;

    // get start_rows and end_rows values so that we can estimate range
    // when calling key_range64, the only value we can trust is the value for less
    // The reason is that the key being passed in may be a prefix of keys in the DB
    // As a result, equal may be 0 and greater may actually be equal+greater
    // So, we call key_range64 on the key, and the key that is after it.
    if (!start_key && !end_key) {
        // unbounded range: estimate the whole table
        error = estimate_num_rows(share->file, &rows, transaction);
        if (error) {
            ret_val = HA_TOKUDB_RANGE_COUNT;
            goto cleanup;
        }
        ret_val = (rows <= 1) ? 1 : rows;
        goto cleanup;
    }
    // pack the MySQL range bounds into DBTs; the infinity byte makes a
    // prefix bound inclusive or exclusive as appropriate for the flag
    if (start_key) {
        uchar inf_byte = (start_key->flag == HA_READ_KEY_EXACT) ? COL_NEG_INF : COL_POS_INF;
        pack_key(&left_key, keynr, key_buff, start_key->key, start_key->length, inf_byte);
        pleft_key = &left_key;
    } else {
        pleft_key = NULL;
    }
    if (end_key) {
        uchar inf_byte = (end_key->flag == HA_READ_BEFORE_KEY) ? COL_NEG_INF : COL_POS_INF;
        pack_key(&right_key, keynr, key_buff2, end_key->key, end_key->length, inf_byte);
        pright_key = &right_key;
    } else {
        pright_key = NULL;
    }
    // keys_range64 can not handle a degenerate range (left_key > right_key), so we filter here
    if (pleft_key && pright_key && tokudb_cmp_dbt_key(kfile, pleft_key, pright_key) > 0) {
        rows = 0;
    } else {
        uint64_t less, equal1, middle, equal2, greater;
        bool is_exact;
        error = kfile->keys_range64(kfile, transaction, pleft_key, pright_key,
                                    &less, &equal1, &middle, &equal2, &greater, &is_exact);
        if (error) {
            ret_val = HA_TOKUDB_RANGE_COUNT;
            goto cleanup;
        }
        // "middle" counts the keys strictly inside the two bounds
        rows = middle;
    }

    // MySQL thinks a return value of 0 means there are exactly 0 rows
    // Therefore, always return non-zero so this assumption is not made
    ret_val = (ha_rows) (rows <= 1 ? 1 : rows);

cleanup:
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_RETURN,
        "return %" PRIu64 " %" PRIu64,
        (uint64_t)ret_val,
        rows);
    DBUG_RETURN(ret_val);
}
7963
7964
7965 //
7966 // Initializes the auto-increment data in the local "share" object to the
7967 // greater of two values: what's stored in the metadata or the last inserted
7968 // auto-increment field (if auto-increment field is the first field of a key).
7969 //
void ha_tokudb::init_auto_increment() {
    int error;
    DB_TXN* txn = NULL;

    error = txn_begin(db_env, 0, &txn, 0, ha_thd());
    if (error) {
        // cannot read the status dictionary; fall back to 0
        share->last_auto_increment = 0;
    } else {
        HA_METADATA_KEY key_val;
        DBT key;
        memset(&key, 0, sizeof(key));
        key.data = &key_val;
        key.size = sizeof(key_val);
        DBT value;
        memset(&value, 0, sizeof(value));
        // DB_DBT_USERMEM: results are written directly into the share
        // members pointed to by value.data below
        value.flags = DB_DBT_USERMEM;

        // Retrieve the initial auto increment value, as specified by create table
        // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
        // then the value 100 should be stored here
        key_val = hatoku_ai_create_value;
        value.ulen = sizeof(share->auto_inc_create_value);
        value.data = &share->auto_inc_create_value;
        error = share->status_block->get(share->status_block, txn, &key, &value, 0);

        if (error || value.size != sizeof(share->auto_inc_create_value)) {
            share->auto_inc_create_value = 0;
        }

        // Retrieve hatoku_max_ai, which is max value used by auto increment
        // column so far, the max value could have been auto generated (e.g. insert (NULL))
        // or it could have been manually inserted by user (e.g. insert (345))
        key_val = hatoku_max_ai;
        value.ulen = sizeof(share->last_auto_increment);
        value.data = &share->last_auto_increment;
        error = share->status_block->get(share->status_block, txn, &key, &value, 0);

        if (error || value.size != sizeof(share->last_auto_increment)) {
            // no max recorded yet: seed from the CREATE TABLE value so the
            // next generated value is auto_inc_create_value itself
            if (share->auto_inc_create_value)
                share->last_auto_increment = share->auto_inc_create_value - 1;
            else
                share->last_auto_increment = 0;
        }

        commit_txn(txn, 0);
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_AUTO_INCREMENT,
        "init auto increment:%lld",
        share->last_auto_increment);
}
8021
void ha_tokudb::get_auto_increment(
    ulonglong offset,
    ulonglong increment,
    ulonglong nb_desired_values,
    ulonglong* first_value,
    ulonglong* nb_reserved_values) {

    TOKUDB_HANDLER_DBUG_ENTER("");
    ulonglong nr;
    bool over;

    // when the auto-increment column is not the first keypart, defer to
    // the generic handler implementation
    if (table->s->next_number_key_offset)
    {
        handler::get_auto_increment(offset, increment, nb_desired_values, first_value, nb_reserved_values);
        DBUG_VOID_RETURN;
    }

    share->lock();

    if (share->auto_inc_create_value > share->last_auto_increment) {
        // AUTO_INCREMENT=N from CREATE/ALTER has not been handed out yet
        nr = share->auto_inc_create_value;
        over = false;
        share->last_auto_increment = share->auto_inc_create_value;
    } else {
        // NOTE(review): offset is not applied on this path; only the
        // increment step is — confirm this matches upstream intent.
        nr = share->last_auto_increment + increment;
        // unsigned wraparound check
        over = nr < share->last_auto_increment;
        if (over)
            nr = ULONGLONG_MAX;
    }
    if (!over) {
        // reserve the whole batch of nb_desired_values values at once
        share->last_auto_increment = nr + (nb_desired_values - 1)*increment;
        if (delay_updating_ai_metadata) {
            // bulk-load path: persist the max lazily at end of statement
            ai_metadata_update_required = true;
        } else {
            update_max_auto_inc(
                share->status_block,
                share->last_auto_increment);
        }
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_AUTO_INCREMENT,
        "get_auto_increment(%lld,%lld,%lld): got:%lld:%lld",
        offset,
        increment,
        nb_desired_values,
        nr,
        nb_desired_values);
    *first_value = nr;
    *nb_reserved_values = nb_desired_values;
    share->unlock();
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
8074
8075 bool ha_tokudb::is_optimize_blocking() {
8076 return false;
8077 }
8078
8079 bool ha_tokudb::is_auto_inc_singleton(){
8080 return false;
8081 }
8082
8083
8084 // Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
8085 // With a transaction, drops dictionaries associated with indexes in key_num
8086 //
8087 //
8088 // Adds indexes to the table. Takes the array of KEY passed in key_info, and creates
8089 // DB's that will go at the end of share->key_file. THE IMPLICIT ASSUMPTION HERE is
8090 // that the table will be modified and that these added keys will be appended to the end
8091 // of the array table->key_info
8092 // Parameters:
8093 // [in] table_arg - table that is being modified, seems to be identical to this->table
8094 // [in] key_info - array of KEY's to be added
8095 // num_of_keys - number of keys to be added, number of elements in key_info
8096 // Returns:
8097 // 0 on success, error otherwise
8098 //
8099 int ha_tokudb::tokudb_add_index(
8100 TABLE* table_arg,
8101 KEY* key_info,
8102 uint num_of_keys,
8103 DB_TXN* txn,
8104 bool* inc_num_DBs,
8105 bool* modified_DBs) {
8106
8107 TOKUDB_HANDLER_DBUG_ENTER("");
8108 assert_always(txn);
8109
8110 int error;
8111 uint curr_index = 0;
8112 DBC* tmp_cursor = NULL;
8113 int cursor_ret_val = 0;
8114 DBT curr_pk_key, curr_pk_val;
8115 THD* thd = ha_thd();
8116 DB_LOADER* loader = NULL;
8117 DB_INDEXER* indexer = NULL;
8118 bool loader_save_space = tokudb::sysvars::load_save_space(thd);
8119 bool use_hot_index = (lock.type == TL_WRITE_ALLOW_WRITE);
8120 uint32_t loader_flags = loader_save_space ? LOADER_COMPRESS_INTERMEDIATES : 0;
8121 uint32_t indexer_flags = 0;
8122 uint32_t mult_db_flags[MAX_KEY + 1] = {0};
8123 uint32_t mult_put_flags[MAX_KEY + 1];
8124 uint32_t mult_dbt_flags[MAX_KEY + 1];
8125 bool creating_hot_index = false;
8126 struct loader_context lc;
8127 memset(&lc, 0, sizeof lc);
8128 lc.thd = thd;
8129 lc.ha = this;
8130 loader_error = 0;
8131 bool rw_lock_taken = false;
8132 *inc_num_DBs = false;
8133 *modified_DBs = false;
8134 invalidate_bulk_fetch();
8135 unpack_entire_row = true; // for bulk fetching rows
8136 for (uint32_t i = 0; i < MAX_KEY+1; i++) {
8137 mult_put_flags[i] = 0;
8138 mult_dbt_flags[i] = DB_DBT_REALLOC;
8139 }
8140 //
8141 // number of DB files we have open currently, before add_index is executed
8142 //
8143 uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8144
8145 //
8146 // get the row type to use for the indexes we're adding
8147 //
8148 toku_compression_method compression_method =
8149 get_compression_method(share->file);
8150
8151 //
8152 // status message to be shown in "show process list"
8153 //
8154 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
8155 // buffer of 200 should be a good upper bound.
8156 char status_msg[MAX_ALIAS_NAME + 200];
8157 // variable that stores number of elements inserted thus far
8158 ulonglong num_processed = 0;
8159 thd_proc_info(thd, "Adding indexes");
8160
8161 //
8162 // in unpack_row, MySQL passes a buffer that is this long,
8163 // so this length should be good enough for us as well
8164 //
8165 memset((void *) &curr_pk_key, 0, sizeof(curr_pk_key));
8166 memset((void *) &curr_pk_val, 0, sizeof(curr_pk_val));
8167
8168 //
8169 // The files for secondary tables are derived from the name of keys
8170 // If we try to add a key with the same name as an already existing key,
8171 // We can crash. So here we check if any of the keys added has the same
8172 // name of an existing key, and if so, we fail gracefully
8173 //
8174 for (uint i = 0; i < num_of_keys; i++) {
8175 for (uint j = 0; j < table_arg->s->keys; j++) {
8176 if (strcmp(key_info[i].name.str,
8177 table_arg->key_info[j].name.str) == 0) {
8178 error = HA_ERR_WRONG_COMMAND;
8179 goto cleanup;
8180 }
8181 }
8182 }
8183
8184 rwlock_t_lock_write(share->_num_DBs_lock);
8185 rw_lock_taken = true;
8186 //
8187 // open all the DB files and set the appropriate variables in share
8188 // they go to the end of share->key_file
8189 //
8190 creating_hot_index =
8191 use_hot_index && num_of_keys == 1 &&
8192 (key_info[0].flags & HA_NOSAME) == 0;
8193 if (use_hot_index && (share->num_DBs > curr_num_DBs)) {
8194 //
8195 // already have hot index in progress, get out
8196 //
8197 error = HA_ERR_INTERNAL_ERROR;
8198 goto cleanup;
8199 }
8200 curr_index = curr_num_DBs;
8201 *modified_DBs = true;
8202 for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8203 if (key_is_clustering(&key_info[i])) {
8204 set_key_filter(
8205 &share->kc_info.key_filters[curr_index],
8206 &key_info[i],
8207 table_arg,
8208 false);
8209 if (!hidden_primary_key) {
8210 set_key_filter(
8211 &share->kc_info.key_filters[curr_index],
8212 &table_arg->key_info[primary_key],
8213 table_arg,
8214 false);
8215 }
8216
8217 error = initialize_col_pack_info(
8218 &share->kc_info,
8219 table_arg->s,
8220 curr_index);
8221 if (error) {
8222 goto cleanup;
8223 }
8224 }
8225
8226
8227 error = create_secondary_dictionary(
8228 share->full_table_name(),
8229 table_arg,
8230 &key_info[i],
8231 txn,
8232 &share->kc_info,
8233 curr_index,
8234 creating_hot_index,
8235 compression_method);
8236 if (error) {
8237 goto cleanup;
8238 }
8239
8240 error = open_secondary_dictionary(
8241 &share->key_file[curr_index],
8242 &key_info[i],
8243 share->full_table_name(),
8244 false,
8245 txn);
8246 if (error) {
8247 goto cleanup;
8248 }
8249 }
8250
8251 if (creating_hot_index) {
8252 share->num_DBs++;
8253 *inc_num_DBs = true;
8254 error = db_env->create_indexer(
8255 db_env,
8256 txn,
8257 &indexer,
8258 share->file,
8259 num_of_keys,
8260 &share->key_file[curr_num_DBs],
8261 mult_db_flags,
8262 indexer_flags);
8263 if (error) {
8264 goto cleanup;
8265 }
8266
8267 error = indexer->set_poll_function(
8268 indexer, ha_tokudb::tokudb_add_index_poll, &lc);
8269 if (error) {
8270 goto cleanup;
8271 }
8272
8273 error = indexer->set_error_callback(
8274 indexer, ha_tokudb::loader_add_index_err, &lc);
8275 if (error) {
8276 goto cleanup;
8277 }
8278
8279 share->_num_DBs_lock.unlock();
8280 rw_lock_taken = false;
8281
8282 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8283 // initialize a one phase progress report.
8284 // incremental reports are done in the indexer's callback function.
8285 thd_progress_init(thd, 1);
8286 #endif
8287
8288 error = indexer->build(indexer);
8289
8290 if (error) {
8291 goto cleanup;
8292 }
8293
8294 rwlock_t_lock_write(share->_num_DBs_lock);
8295 error = indexer->close(indexer);
8296 share->_num_DBs_lock.unlock();
8297 if (error) {
8298 goto cleanup;
8299 }
8300 indexer = NULL;
8301 } else {
8302 DBUG_ASSERT(table->mdl_ticket->get_type() >= MDL_SHARED_NO_WRITE);
8303 share->_num_DBs_lock.unlock();
8304 rw_lock_taken = false;
8305 prelocked_right_range_size = 0;
8306 prelocked_left_range_size = 0;
8307 struct smart_dbt_bf_info bf_info;
8308 bf_info.ha = this;
8309 // you need the val if you have a clustering index and key_read is not 0;
8310 bf_info.direction = 1;
8311 bf_info.thd = ha_thd();
8312 bf_info.need_val = true;
8313 bf_info.key_to_compare = NULL;
8314
8315 error = db_env->create_loader(
8316 db_env,
8317 txn,
8318 &loader,
8319 NULL, // no src_db needed
8320 num_of_keys,
8321 &share->key_file[curr_num_DBs],
8322 mult_put_flags,
8323 mult_dbt_flags,
8324 loader_flags);
8325 if (error) {
8326 goto cleanup;
8327 }
8328
8329 error =
8330 loader->set_poll_function(loader, ha_tokudb::bulk_insert_poll, &lc);
8331 if (error) {
8332 goto cleanup;
8333 }
8334
8335 error = loader->set_error_callback(
8336 loader, ha_tokudb::loader_add_index_err, &lc);
8337 if (error) {
8338 goto cleanup;
8339 }
8340 //
8341 // scan primary table, create each secondary key, add to each DB
8342 //
8343 error = share->file->cursor(
8344 share->file,
8345 txn,
8346 &tmp_cursor,
8347 DB_SERIALIZABLE);
8348 if (error) {
8349 tmp_cursor = NULL; // Safety
8350 goto cleanup;
8351 }
8352
8353 //
8354 // grab some locks to make this go faster
8355 // first a global read lock on the main DB, because
8356 // we intend to scan the entire thing
8357 //
8358 error = tmp_cursor->c_set_bounds(
8359 tmp_cursor,
8360 share->file->dbt_neg_infty(),
8361 share->file->dbt_pos_infty(),
8362 true,
8363 0);
8364 if (error) {
8365 goto cleanup;
8366 }
8367
8368 // set the bulk fetch iteration to its max so that adding an
8369 // index fills the bulk fetch buffer every time. we do not
8370 // want it to grow exponentially fast.
8371 rows_fetched_using_bulk_fetch = 0;
8372 bulk_fetch_iteration = HA_TOKU_BULK_FETCH_ITERATION_MAX;
8373 cursor_ret_val = tmp_cursor->c_getf_next(
8374 tmp_cursor,
8375 DB_PRELOCKED,
8376 smart_dbt_bf_callback,
8377 &bf_info);
8378
8379 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8380 // initialize a two phase progress report.
8381 // first phase: putting rows into the loader
8382 thd_progress_init(thd, 2);
8383 #endif
8384
8385 while (cursor_ret_val != DB_NOTFOUND ||
8386 ((bytes_used_in_range_query_buff -
8387 curr_range_query_buff_offset) > 0)) {
8388 if ((bytes_used_in_range_query_buff -
8389 curr_range_query_buff_offset) == 0) {
8390 invalidate_bulk_fetch(); // reset the buffers
8391 cursor_ret_val = tmp_cursor->c_getf_next(
8392 tmp_cursor,
8393 DB_PRELOCKED,
8394 smart_dbt_bf_callback,
8395 &bf_info);
8396 if (cursor_ret_val != DB_NOTFOUND && cursor_ret_val != 0) {
8397 error = cursor_ret_val;
8398 goto cleanup;
8399 }
8400 }
8401 // do this check in case the the c_getf_next did not put anything
8402 // into the buffer because there was no more data
8403 if ((bytes_used_in_range_query_buff -
8404 curr_range_query_buff_offset) == 0) {
8405 break;
8406 }
8407 // at this point, we know the range query buffer has at least one
8408 // key/val pair
8409 uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
8410
8411 uint32_t key_size = *(uint32_t *)curr_pos;
8412 curr_pos += sizeof(key_size);
8413 uchar* curr_key_buff = curr_pos;
8414 curr_pos += key_size;
8415 curr_pk_key.data = curr_key_buff;
8416 curr_pk_key.size = key_size;
8417
8418 uint32_t val_size = *(uint32_t *)curr_pos;
8419 curr_pos += sizeof(val_size);
8420 uchar* curr_val_buff = curr_pos;
8421 curr_pos += val_size;
8422 curr_pk_val.data = curr_val_buff;
8423 curr_pk_val.size = val_size;
8424
8425 curr_range_query_buff_offset = curr_pos - range_query_buff;
8426
8427 error = loader->put(loader, &curr_pk_key, &curr_pk_val);
8428 if (error) {
8429 goto cleanup;
8430 }
8431
8432 num_processed++;
8433
8434 if ((num_processed % 1000) == 0) {
8435 sprintf(
8436 status_msg,
8437 "Adding indexes: Fetched %llu of about %llu rows, loading "
8438 "of data still remains.",
8439 num_processed,
8440 (long long unsigned)share->row_count());
8441 thd_proc_info(thd, status_msg);
8442
8443 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8444 thd_progress_report(
8445 thd,
8446 num_processed,
8447 (long long unsigned)share->row_count());
8448 #endif
8449
8450 if (thd_kill_level(thd)) {
8451 error = ER_ABORTING_CONNECTION;
8452 goto cleanup;
8453 }
8454 }
8455 }
8456 error = tmp_cursor->c_close(tmp_cursor);
8457 assert_always(error==0);
8458 tmp_cursor = NULL;
8459
8460 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8461 // next progress report phase: closing the loader.
8462 // incremental reports are done in the loader's callback function.
8463 thd_progress_next_stage(thd);
8464 #endif
8465
8466 error = loader->close(loader);
8467 loader = NULL;
8468
8469 if (error) goto cleanup;
8470 }
8471 curr_index = curr_num_DBs;
8472 for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8473 if (key_info[i].flags & HA_NOSAME) {
8474 bool is_unique;
8475 error = is_index_unique(
8476 &is_unique,
8477 txn,
8478 share->key_file[curr_index],
8479 &key_info[i],
8480 creating_hot_index ? 0 : DB_PRELOCKED_WRITE);
8481 if (error)
8482 goto cleanup;
8483 if (!is_unique) {
8484 error = HA_ERR_FOUND_DUPP_KEY;
8485 last_dup_key = i;
8486 goto cleanup;
8487 }
8488 }
8489 }
8490
8491 share->lock();
8492 //
8493 // We have an accurate row count, might as well update share->rows
8494 //
8495 if(!creating_hot_index) {
8496 share->set_row_count(num_processed, true);
8497 }
8498 //
8499 // now write stuff to status.tokudb
8500 //
8501 for (uint i = 0; i < num_of_keys; i++) {
8502 write_key_name_to_status(share->status_block, key_info[i].name.str, txn);
8503 }
8504 share->unlock();
8505
8506 error = 0;
8507 cleanup:
8508 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8509 thd_progress_end(thd);
8510 #endif
8511 if (rw_lock_taken) {
8512 share->_num_DBs_lock.unlock();
8513 rw_lock_taken = false;
8514 }
8515 if (tmp_cursor) {
8516 int r = tmp_cursor->c_close(tmp_cursor);
8517 assert_always(r==0);
8518 tmp_cursor = NULL;
8519 }
8520 if (loader != NULL) {
8521 sprintf(status_msg, "aborting creation of indexes.");
8522 thd_proc_info(thd, status_msg);
8523 loader->abort(loader);
8524 }
8525 if (indexer != NULL) {
8526 sprintf(status_msg, "aborting creation of indexes.");
8527 thd_proc_info(thd, status_msg);
8528 rwlock_t_lock_write(share->_num_DBs_lock);
8529 indexer->abort(indexer);
8530 share->_num_DBs_lock.unlock();
8531 }
8532 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8533 error == DB_LOCK_NOTGRANTED) {
8534 sql_print_error(
8535 "Could not add indexes to table %s because another transaction has "
8536 "accessed the table. To add indexes, make sure no transactions "
8537 "touch the table.",
8538 share->full_table_name());
8539 }
8540 thd_proc_info(thd, orig_proc_info);
8541 TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
8542 }
8543 int ha_tokudb::tokudb_add_index_poll(void* extra, float progress) {
8544 LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
8545 if (thd_killed(context->thd)) {
8546 snprintf(context->write_status_msg,
8547 sizeof(context->write_status_msg),
8548 "The process has been killed, aborting add index.");
8549 return ER_ABORTING_CONNECTION;
8550 }
8551 float percentage = progress * 100;
8552 snprintf(context->write_status_msg,
8553 sizeof(context->write_status_msg),
8554 "Adding of indexes to %s about %.1f%% done",
8555 context->ha->share->full_table_name(),
8556 percentage);
8557 thd_proc_info(context->thd, context->write_status_msg);
8558 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8559 thd_progress_report(context->thd, (unsigned long long)percentage, 100);
8560 #endif
8561 return 0;
8562 }
8563
8564 //
8565 // Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
8566 // Closes added indexes in case of error in error path of add_index and alter_table_phase2
8567 //
8568 void ha_tokudb::restore_add_index(
8569 TABLE* table_arg,
8570 uint num_of_keys,
8571 bool incremented_numDBs,
8572 bool modified_DBs) {
8573
8574 uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8575 uint curr_index = 0;
8576
8577 //
8578 // need to restore num_DBs, and we have to do it before we close the dictionaries
8579 // so that there is not a window
8580 //
8581 if (incremented_numDBs) {
8582 rwlock_t_lock_write(share->_num_DBs_lock);
8583 share->num_DBs--;
8584 }
8585 if (modified_DBs) {
8586 curr_index = curr_num_DBs;
8587 for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8588 reset_key_and_col_info(&share->kc_info, curr_index);
8589 }
8590 curr_index = curr_num_DBs;
8591 for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8592 if (share->key_file[curr_index]) {
8593 int r = share->key_file[curr_index]->close(
8594 share->key_file[curr_index],
8595 0);
8596 assert_always(r==0);
8597 share->key_file[curr_index] = NULL;
8598 }
8599 }
8600 }
8601 if (incremented_numDBs) {
8602 share->_num_DBs_lock.unlock();
8603 }
8604 }
8605
8606 //
8607 // Internal function called by ha_tokudb::prepare_drop_index and ha_tokudb::alter_table_phase2
8608 // With a transaction, drops dictionaries associated with indexes in key_num
8609 //
8610 int ha_tokudb::drop_indexes(uint* key_num,
8611 uint num_of_keys,
8612 KEY* key_info,
8613 DB_TXN* txn) {
8614 TOKUDB_HANDLER_DBUG_ENTER("");
8615 assert_always(txn);
8616
8617 int error = 0;
8618 for (uint i = 0; i < num_of_keys; i++) {
8619 uint curr_index = key_num[i];
8620 error = share->key_file[curr_index]->pre_acquire_fileops_lock(
8621 share->key_file[curr_index],
8622 txn);
8623 if (error != 0) {
8624 goto cleanup;
8625 }
8626 }
8627 for (uint i = 0; i < num_of_keys; i++) {
8628 uint curr_index = key_num[i];
8629 int r = share->key_file[curr_index]->close(share->key_file[curr_index],0);
8630 assert_always(r==0);
8631 share->key_file[curr_index] = NULL;
8632
8633 error = remove_key_name_from_status(
8634 share->status_block,
8635 key_info[curr_index].name.str,
8636 txn);
8637 if (error) {
8638 goto cleanup;
8639 }
8640
8641 error = delete_or_rename_dictionary(
8642 share->full_table_name(),
8643 NULL,
8644 key_info[curr_index].name.str,
8645 true,
8646 txn,
8647 true);
8648 if (error) {
8649 goto cleanup;
8650 }
8651 }
8652
8653 cleanup:
8654 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8655 error == DB_LOCK_NOTGRANTED) {
8656 sql_print_error(
8657 "Could not drop indexes from table %s because another transaction "
8658 "has accessed the table. To drop indexes, make sure no "
8659 "transactions touch the table.",
8660 share->full_table_name());
8661 }
8662 TOKUDB_HANDLER_DBUG_RETURN(error);
8663 }
8664
8665 //
8666 // Internal function called by ha_tokudb::prepare_drop_index and
8667 // ha_tokudb::alter_table_phase2
8668 // Restores dropped indexes in case of error in error path of
8669 // prepare_drop_index and alter_table_phase2
8670 //
8671 void ha_tokudb::restore_drop_indexes(uint* key_num, uint num_of_keys) {
8672 //
8673 // reopen closed dictionaries
8674 //
8675 for (uint i = 0; i < num_of_keys; i++) {
8676 int r;
8677 uint curr_index = key_num[i];
8678 if (share->key_file[curr_index] == NULL) {
8679 r = open_secondary_dictionary(
8680 &share->key_file[curr_index],
8681 &table_share->key_info[curr_index],
8682 share->full_table_name(),
8683 false,
8684 NULL);
8685 assert_always(!r);
8686 }
8687 }
8688 }
8689
8690 int ha_tokudb::map_to_handler_error(int error) {
8691 switch (error) {
8692 case DB_LOCK_DEADLOCK:
8693 error = HA_ERR_LOCK_DEADLOCK;
8694 break;
8695 case DB_LOCK_NOTGRANTED:
8696 error = HA_ERR_LOCK_WAIT_TIMEOUT;
8697 break;
8698 #if defined(HA_ERR_DISK_FULL)
8699 case ENOSPC:
8700 error = HA_ERR_DISK_FULL;
8701 break;
8702 #endif
8703 case DB_KEYEXIST:
8704 error = HA_ERR_FOUND_DUPP_KEY;
8705 break;
8706 #if defined(HA_ALTER_ERROR)
8707 case HA_ALTER_ERROR:
8708 error = HA_ERR_UNSUPPORTED;
8709 break;
8710 #endif
8711 case TOKUDB_INTERRUPTED:
8712 error = ER_QUERY_INTERRUPTED;
8713 break;
8714 case TOKUDB_OUT_OF_LOCKS:
8715 error = HA_ERR_LOCK_TABLE_FULL;
8716 break;
8717 }
8718 return error;
8719 }
8720
8721 void ha_tokudb::print_error(int error, myf errflag) {
8722 error = map_to_handler_error(error);
8723 handler::print_error(error, errflag);
8724 }
8725
8726 //
8727 // truncate's dictionary associated with keynr index using transaction txn
8728 // does so by deleting and then recreating the dictionary in the context
8729 // of a transaction
8730 //
8731 int ha_tokudb::truncate_dictionary(uint keynr, DB_TXN* txn) {
8732 int error;
8733 bool is_pk = (keynr == primary_key);
8734
8735 toku_compression_method compression_method =
8736 get_compression_method(share->key_file[keynr]);
8737 error = share->key_file[keynr]->close(share->key_file[keynr], 0);
8738 assert_always(error == 0);
8739
8740 share->key_file[keynr] = NULL;
8741 if (is_pk) {
8742 share->file = NULL;
8743 }
8744
8745 if (is_pk) {
8746 error = delete_or_rename_dictionary(
8747 share->full_table_name(),
8748 NULL,
8749 "main",
8750 false, //is_key
8751 txn,
8752 true); // is a delete
8753 if (error) {
8754 goto cleanup;
8755 }
8756 } else {
8757 error = delete_or_rename_dictionary(
8758 share->full_table_name(),
8759 NULL,
8760 table_share->key_info[keynr].name.str,
8761 true, //is_key
8762 txn,
8763 true); // is a delete
8764 if (error) {
8765 goto cleanup;
8766 }
8767 }
8768
8769 if (is_pk) {
8770 error = create_main_dictionary(
8771 share->full_table_name(),
8772 table,
8773 txn,
8774 &share->kc_info,
8775 compression_method);
8776 } else {
8777 error = create_secondary_dictionary(
8778 share->full_table_name(),
8779 table,
8780 &table_share->key_info[keynr],
8781 txn,
8782 &share->kc_info,
8783 keynr,
8784 false,
8785 compression_method);
8786 }
8787 if (error) {
8788 goto cleanup;
8789 }
8790
8791 cleanup:
8792 return error;
8793 }
8794
// Handler API entry point for TRUNCATE TABLE (MySQL 5.5+); delegates
// to delete_all_rows_internal, which drops and recreates every
// dictionary inside one transaction.
int ha_tokudb::truncate() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = delete_all_rows_internal();
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
8801
8802 // delete all rows from a table
8803 //
8804 // effects: delete all of the rows in the main dictionary and all of the
8805 // indices. this must be atomic, so we use the statement transaction
8806 // for all of the truncate operations.
8807 // locks: if we have an exclusive table write lock, all of the concurrency
8808 // issues go away.
8809 // returns: 0 if success
8810 int ha_tokudb::delete_all_rows() {
8811 TOKUDB_HANDLER_DBUG_ENTER("");
8812 int error = 0;
8813 if (thd_sql_command(ha_thd()) != SQLCOM_TRUNCATE) {
8814 share->try_table_lock = true;
8815 error = HA_ERR_WRONG_COMMAND;
8816 }
8817 if (error == 0)
8818 error = delete_all_rows_internal();
8819 TOKUDB_HANDLER_DBUG_RETURN(error);
8820 }
8821
// Delete every row of the table by truncating the main dictionary and
// all secondary index dictionaries inside a single transaction, then
// reset the cached row count and auto-increment value.  On any error
// the transaction is aborted; in all cases every dictionary is
// reopened before returning so the share is left usable.
int ha_tokudb::delete_all_rows_internal() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    uint curr_num_DBs = 0;
    DB_TXN* txn = NULL;

    // this should be enough to handle locking as the higher level MDL
    // on this table should prevent any new analyze tasks.
    share->cancel_background_jobs();

    error = txn_begin(db_env, 0, &txn, 0, ha_thd());
    if (error) {
        goto cleanup;
    }

    // Lock every dictionary (fileops + table lock) up front so the
    // truncate either runs exclusively or fails before any dictionary
    // has been dropped.
    curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
    for (uint i = 0; i < curr_num_DBs; i++) {
        error = share->key_file[i]->pre_acquire_fileops_lock(
            share->key_file[i],
            txn);
        if (error) {
            goto cleanup;
        }
        error = share->key_file[i]->pre_acquire_table_lock(
            share->key_file[i],
            txn);
        if (error) {
            goto cleanup;
        }
    }
    // Drop and recreate each dictionary within txn.
    for (uint i = 0; i < curr_num_DBs; i++) {
        error = truncate_dictionary(i, txn);
        if (error) {
            goto cleanup;
        }
    }

    DEBUG_SYNC(ha_thd(), "tokudb_after_truncate_all_dictionarys");

    // zap the row count
    if (error == 0) {
        share->set_row_count(0, false);
        // update auto increment
        share->last_auto_increment = 0;
        // calling write_to_status directly because we need to use txn
        write_to_status(
            share->status_block,
            hatoku_max_ai,
            &share->last_auto_increment,
            sizeof(share->last_auto_increment),
            txn);
    }

    share->try_table_lock = true;
cleanup:
    // Commit on success, abort on failure — the truncate is atomic.
    if (txn) {
        if (error) {
            abort_txn(txn);
        } else {
            commit_txn(txn,0);
        }
    }

    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(
            TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not truncate table %s because another transaction has "
            "accessed the table. To truncate the table, make sure no "
            "transactions touch the table.",
            share->full_table_name());
    }
    //
    // regardless of errors, need to reopen the DB's
    //
    for (uint i = 0; i < curr_num_DBs; i++) {
        int r = 0;
        if (share->key_file[i] == NULL) {
            if (i != primary_key) {
                r = open_secondary_dictionary(
                    &share->key_file[i],
                    &table_share->key_info[i],
                    share->full_table_name(),
                    false,
                    NULL);
                assert_always(!r);
            } else {
                r = open_main_dictionary(
                    share->full_table_name(),
                    false,
                    NULL);
                assert_always(!r);
            }
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
8919
// Record an error raised from a bulk-loader callback; loader_error is
// folded into the return value of the index-build path (see the
// "error ? error : loader_error" return in tokudb_add_index).
void ha_tokudb::set_loader_error(int err) {
    loader_error = err;
}
8923
// On a duplicate-key error against the primary key: unpack the
// offending key into record[0] so the server can report the duplicate
// row, and remember which key failed.  Never valid when the table has
// a hidden primary key (nothing meaningful to unpack).
void ha_tokudb::set_dup_value_for_pk(DBT* key) {
    assert_always(!hidden_primary_key);
    unpack_key(table->record[0],key,primary_key);
    last_dup_key = primary_key;
}
8929
// Close the disk-sweep MRR implementation.  Compiled in only for
// MariaDB and MySQL 5.6, matching the ha_tokudb_mrr_*.cc inclusion
// at the bottom of this file.
void ha_tokudb::close_dsmrr() {
#ifdef MARIADB_BASE_VERSION
    ds_mrr.dsmrr_close();
#elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
    ds_mrr.dsmrr_close();
#endif
}
8937
// Reset disk-sweep MRR state between scans.
// NOTE(review): the MariaDB branch calls dsmrr_close() while the
// MySQL 5.6 branch calls reset() — presumably MariaDB's DsMrr_impl
// has no reset(); confirm against the respective DS-MRR APIs.
void ha_tokudb::reset_dsmrr() {
#ifdef MARIADB_BASE_VERSION
    ds_mrr.dsmrr_close();
#elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
    ds_mrr.reset();
#endif
}
8945
8946 // we cache the information so we can do filtering ourselves,
8947 // but as far as MySQL knows, we are not doing any filtering,
8948 // so if we happen to miss filtering a row that does not match
8949 // idx_cond_arg, MySQL will catch it.
8950 // This allows us the ability to deal with only index_next and index_prev,
8951 // and not need to worry about other index_XXX functions
8952 Item* ha_tokudb::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) {
8953 toku_pushed_idx_cond_keyno = keyno_arg;
8954 toku_pushed_idx_cond = idx_cond_arg;
8955 return idx_cond_arg;
8956 }
8957
// Drop any pushed index condition: invalidate our cached ICP state
// first, then let the base handler clear its own bookkeeping.
void ha_tokudb::cancel_pushed_idx_cond() {
    invalidate_icp();
    handler::cancel_pushed_idx_cond();
}
8962
8963 void ha_tokudb::cleanup_txn(DB_TXN *txn) {
8964 if (transaction == txn && cursor) {
8965 int r = cursor->c_close(cursor);
8966 assert_always(r == 0);
8967 cursor = NULL;
8968 }
8969 }
8970
// Register this handler on the connection's tokudb_trx_data handler
// list.  Assumes the trx has already been attached to the THD by the
// hton (trx is dereferenced without a NULL check).
void ha_tokudb::add_to_trx_handler_list() {
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    trx->handlers = list_add(trx->handlers, &trx_handler_list);
}
8976
// Counterpart of add_to_trx_handler_list: unlink this handler from the
// connection's tokudb_trx_data handler list.
void ha_tokudb::remove_from_trx_handler_list() {
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    trx->handlers = list_delete(trx->handlers, &trx_handler_list);
}
8982
8983 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
// Replication hook: mark that write-rows event processing has begun
// (flag is consumed elsewhere, e.g. by the write path).
void ha_tokudb::rpl_before_write_rows() {
    in_rpl_write_rows = true;
}
8987
// Replication hook: mark that write-rows event processing has ended.
void ha_tokudb::rpl_after_write_rows() {
    in_rpl_write_rows = false;
}
8991
// Replication hook: mark that delete-rows event processing has begun
// (checked by rpl_lookup_rows below).
void ha_tokudb::rpl_before_delete_rows() {
    in_rpl_delete_rows = true;
}
8995
// Replication hook: mark that delete-rows event processing has ended.
void ha_tokudb::rpl_after_delete_rows() {
    in_rpl_delete_rows = false;
}
8999
// Replication hook: mark that update-rows event processing has begun
// (checked by rpl_lookup_rows below).
void ha_tokudb::rpl_before_update_rows() {
    in_rpl_update_rows = true;
}
9003
// Replication hook: mark that update-rows event processing has ended.
void ha_tokudb::rpl_after_update_rows() {
    in_rpl_update_rows = false;
}
9007
9008 bool ha_tokudb::rpl_lookup_rows() {
9009 if (!in_rpl_delete_rows && !in_rpl_update_rows)
9010 return true;
9011 else
9012 return tokudb::sysvars::rpl_lookup_rows(ha_thd());
9013 }
9014 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
9015
9016 // table admin
9017 #include "ha_tokudb_admin.cc"
9018
9019 // update functions
9020 #include "tokudb_update_fun.cc"
9021
9022 // fast updates
9023 #include "ha_tokudb_update.cc"
9024
9025 // alter table code for various mysql distros
9026 #include "ha_tokudb_alter_55.cc"
9027 #include "ha_tokudb_alter_56.cc"
9028
9029 // mrr
9030 #ifdef MARIADB_BASE_VERSION
9031 #include "ha_tokudb_mrr_maria.cc"
9032 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
9033 #include "ha_tokudb_mrr_mysql.cc"
9034 #endif
9035
9036 // key comparisons
9037 #include "hatoku_cmp.cc"
9038
9039 // handlerton
9040 #include "hatoku_hton.cc"
9041
9042 // generate template functions
9043 namespace tokudb {
9044 template size_t vlq_encode_ui(uint32_t n, void *p, size_t s);
9045 template size_t vlq_decode_ui(uint32_t *np, void *p, size_t s);
9046 template size_t vlq_encode_ui(uint64_t n, void *p, size_t s);
9047 template size_t vlq_decode_ui(uint64_t *np, void *p, size_t s);
9048 };
9049