1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of TokuDB
6
7
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9
10 TokuDB is free software: you can redistribute it and/or modify
11 it under the terms of the GNU General Public License, version 2,
12 as published by the Free Software Foundation.
13
14 TokuDB is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with TokuDB. If not, see <http://www.gnu.org/licenses/>.
21
22 ======= */
23
24 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
25
26 #include "hatoku_hton.h"
27 #include "tokudb_buffer.h"
28 #include "tokudb_status.h"
29 #include "ha_tokudb.h"
30 #include "ha_tokupart.h"
31 #include "hatoku_cmp.h"
32 #include "partition_info.h"
33 #include "partitioning/partition_base.h"
34 #include "sql_db.h"
35 #include "sql_parse.h"
36 #include "sql_table.h"
37 #include "table.h"
38 #include "tokudb_card.h"
39
40 #include "mysql/psi/mysql_file.h"
41
// Performance-schema instrumentation keys for the share mutex and the
// num_DBs lock (pfs_key_t — presumably registered with PFS elsewhere).
pfs_key_t ha_tokudb_mutex_key;
pfs_key_t num_DBs_lock_key;

// Global registry of open table shares, keyed by full table name.
// All access is guarded by _open_tables_mutex.
std::unordered_map<std::string, TOKUDB_SHARE*> TOKUDB_SHARE::_open_tables;
tokudb::thread::mutex_t TOKUDB_SHARE::_open_tables_mutex;

// NULL-terminated list of file extensions owned by this engine;
// returned by ha_tokudb::bas_ext().
static const char* ha_tokudb_exts[] = {
    ha_tokudb_ext,
    NullS
};
52
53 //
54 // This offset is calculated starting from AFTER the NULL bytes
55 //
get_fixed_field_size(KEY_AND_COL_INFO * kc_info,TABLE_SHARE * table_share,uint keynr)56 static inline uint32_t get_fixed_field_size(
57 KEY_AND_COL_INFO* kc_info,
58 TABLE_SHARE* table_share,
59 uint keynr) {
60
61 uint offset = 0;
62 for (uint i = 0; i < table_share->fields; i++) {
63 if (is_fixed_field(kc_info, i) &&
64 !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
65 offset += kc_info->field_lengths[i];
66 }
67 }
68 return offset;
69 }
70
71
get_len_of_offsets(KEY_AND_COL_INFO * kc_info,TABLE_SHARE * table_share,uint keynr)72 static inline uint32_t get_len_of_offsets(
73 KEY_AND_COL_INFO* kc_info,
74 TABLE_SHARE* table_share,
75 uint keynr) {
76
77 uint len = 0;
78 for (uint i = 0; i < table_share->fields; i++) {
79 if (is_variable_field(kc_info, i) &&
80 !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
81 len += kc_info->num_offset_bytes;
82 }
83 }
84 return len;
85 }
86
87
allocate_key_and_col_info(TABLE_SHARE * table_share,KEY_AND_COL_INFO * kc_info)88 static int allocate_key_and_col_info(
89 TABLE_SHARE* table_share,
90 KEY_AND_COL_INFO* kc_info) {
91
92 int error;
93 //
94 // initialize all of the bitmaps
95 //
96 for (uint i = 0; i < MAX_KEY + 1; i++) {
97 error =
98 bitmap_init(
99 &kc_info->key_filters[i],
100 NULL,
101 table_share->fields,
102 false);
103 if (error) {
104 goto exit;
105 }
106 }
107
108 //
109 // create the field lengths
110 //
111 kc_info->multi_ptr = tokudb::memory::multi_malloc(
112 MYF(MY_WME+MY_ZEROFILL),
113 &kc_info->field_types, (uint)(table_share->fields * sizeof (uint8_t)),
114 &kc_info->field_lengths, (uint)(table_share->fields * sizeof (uint16_t)),
115 &kc_info->length_bytes, (uint)(table_share->fields * sizeof (uint8_t)),
116 &kc_info->blob_fields, (uint)(table_share->fields * sizeof (uint32_t)),
117 NullS);
118 if (kc_info->multi_ptr == NULL) {
119 error = ENOMEM;
120 goto exit;
121 }
122 exit:
123 if (error) {
124 for (uint i = 0; MAX_KEY + 1; i++) {
125 bitmap_free(&kc_info->key_filters[i]);
126 }
127 tokudb::memory::free(kc_info->multi_ptr);
128 }
129 return error;
130 }
131
free_key_and_col_info(KEY_AND_COL_INFO * kc_info)132 static void free_key_and_col_info (KEY_AND_COL_INFO* kc_info) {
133 for (uint i = 0; i < MAX_KEY+1; i++) {
134 bitmap_free(&kc_info->key_filters[i]);
135 }
136
137 for (uint i = 0; i < MAX_KEY+1; i++) {
138 tokudb::memory::free(kc_info->cp_info[i]);
139 kc_info->cp_info[i] = NULL; // 3144
140 }
141
142 tokudb::memory::free(kc_info->multi_ptr);
143 kc_info->field_types = NULL;
144 kc_info->field_lengths = NULL;
145 kc_info->length_bytes = NULL;
146 kc_info->blob_fields = NULL;
147 }
148
149
// One-time process startup check: the global share registry must start empty.
void TOKUDB_SHARE::static_init() {
    assert_always(_open_tables.size() == 0);
}
static_destroy()153 void TOKUDB_SHARE::static_destroy() {
154 for (auto it = _open_tables.cbegin(); it != _open_tables.cend(); it++) {
155 TOKUDB_TRACE("_open_tables %s %p", it->first.c_str(), it->second);
156 TOKUDB_SHARE* share = it->second;
157 share->destroy();
158 delete share;
159 }
160 _open_tables.clear();
161 assert_always(_open_tables.size() == 0);
162 }
get_state_string(share_state_t state)163 const char* TOKUDB_SHARE::get_state_string(share_state_t state) {
164 static const char* state_string[] = {
165 "CLOSED",
166 "OPENED",
167 "ERROR"
168 };
169 assert_always(state == CLOSED || state == OPENED || state == ERROR);
170 return state_string[state];
171 }
// Shares are allocated zero-filled (MY_ZEROFILL), so members not set in
// init() start as 0/NULL; MY_FAE makes allocation failure fatal.
void* TOKUDB_SHARE::operator new(size_t sz) {
    return tokudb::memory::malloc(sz, MYF(MY_WME|MY_ZEROFILL|MY_FAE));
}
// Matching deallocation for the engine's custom operator new.
void TOKUDB_SHARE::operator delete(void* p) { tokudb::memory::free(p); }
// Only the PFS-instrumented locks need explicit construction; remaining
// members are zero-filled by this class's operator new (MY_ZEROFILL).
TOKUDB_SHARE::TOKUDB_SHARE()
    : _num_DBs_lock(num_DBs_lock_key), _mutex(ha_tokudb_mutex_key) {}
// (Re)initialize a freshly allocated share for the given full table name.
// Splits the name into database/table/dictionary components via
// tokudb_split_dname and resets the bookkeeping counters.
void TOKUDB_SHARE::init(const char* table_name) {
    _use_count = 0;
    thr_lock_init(&_thr_lock);
    _state = CLOSED;
    _row_delta_activity = 0;
    _allow_auto_analysis = true;

    _full_table_name.append(table_name);

    // dictionary part is split out but not retained on the share
    String tmp_dictionary_name;
    tokudb_split_dname(
        table_name,
        _database_name,
        _table_name,
        tmp_dictionary_name);

    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
// Final teardown of a share. Legal only once the share is unreferenced
// (_use_count == 0) and already CLOSED or in ERROR state.
void TOKUDB_SHARE::destroy() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    assert_always(_use_count == 0);
    assert_always(
        _state == TOKUDB_SHARE::CLOSED || _state == TOKUDB_SHARE::ERROR);
    thr_lock_delete(&_thr_lock);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
get_share(const char * table_name,THR_LOCK_DATA * data,bool create_new)212 TOKUDB_SHARE* TOKUDB_SHARE::get_share(const char* table_name,
213 THR_LOCK_DATA* data,
214 bool create_new) {
215 std::string find_table_name(table_name);
216 mutex_t_lock(_open_tables_mutex);
217 auto it = _open_tables.find(find_table_name);
218 TOKUDB_SHARE *share = nullptr;
219 if (it != _open_tables.end()) {
220 share = it->second;
221 assert_always(strcmp(table_name, share->full_table_name()) == 0);
222 }
223 TOKUDB_TRACE_FOR_FLAGS(
224 TOKUDB_DEBUG_SHARE,
225 "existing share[%s] %s:share[%p]",
226 table_name,
227 share == NULL ? "not found" : "found",
228 share);
229
230 if (!share) {
231 if (create_new == false)
232 goto exit;
233 // create share and fill it with all zeroes
234 // hence, all pointers are initialized to NULL
235 share = new TOKUDB_SHARE;
236 assert_always(share);
237
238 share->init(table_name);
239
240 _open_tables.insert({find_table_name, share});
241 }
242
243 share->addref();
244
245 if (data)
246 thr_lock_data_init(&(share->_thr_lock), data, NULL);
247
248 exit:
249 mutex_t_unlock(_open_tables_mutex);
250 return share;
251 }
// Remove `share` from the global registry and free it. The share must be
// registered exactly once under its full table name.
void TOKUDB_SHARE::drop_share(TOKUDB_SHARE* share) {
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_SHARE,
                           "share[%p]:file[%s]:state[%s]:use_count[%d]",
                           share,
                           share->_full_table_name.ptr(),
                           get_state_string(share->_state),
                           share->_use_count);

    mutex_t_lock(_open_tables_mutex);
    size_t n = _open_tables.erase(std::string(share->full_table_name()));
    assert_always(n == 1);
    share->destroy();
    delete share;
    mutex_t_unlock(_open_tables_mutex);
}
// Add a reference to the share and report its current state.
// NOTE(review): lock() is taken here and intentionally NOT released before
// returning — the caller owns the share mutex on return and must unlock it.
TOKUDB_SHARE::share_state_t TOKUDB_SHARE::addref() {
    TOKUDB_SHARE_TRACE_FOR_FLAGS((TOKUDB_DEBUG_ENTER & TOKUDB_DEBUG_SHARE),
                                 "file[%s]:state[%s]:use_count[%d]",
                                 _full_table_name.ptr(),
                                 get_state_string(_state),
                                 _use_count);

    lock();
    _use_count++;

    return _state;
}
// Drop one reference. When the last reference goes away and the share is
// OPENED, close every dictionary handle, close the status dictionary, free
// all cached metadata (key/col info, cardinality, key descriptors), and
// move the share to CLOSED. Returns 0 or the first close() error seen.
int TOKUDB_SHARE::release() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    int error, result = 0;

    mutex_t_lock(_mutex);
    assert_always(_use_count != 0);
    _use_count--;
    if (_use_count == 0 && _state == TOKUDB_SHARE::OPENED) {
        // number of open DB's may not be equal to number of keys we have
        // because add_index may have added some. So, we loop through entire
        // array and close any non-NULL value. It is imperative that we reset
        // a DB to NULL once we are done with it.
        for (uint i = 0; i < sizeof(key_file)/sizeof(key_file[0]); i++) {
            if (key_file[i]) {
                TOKUDB_TRACE_FOR_FLAGS(
                    TOKUDB_DEBUG_OPEN,
                    "dbclose:%p",
                    key_file[i]);
                error = key_file[i]->close(key_file[i], 0);
                assert_always(error == 0);
                // NOTE(review): if assert_always aborts on failure this
                // branch is unreachable; kept for non-fatal assert builds
                if (error) {
                    result = error;
                }
                // keep the primary `file` alias in sync with key_file[]
                if (key_file[i] == file)
                    file = NULL;
                key_file[i] = NULL;
            }
        }

        error = tokudb::metadata::close(&status_block);
        assert_always(error == 0);

        free_key_and_col_info(&kc_info);

        // cached cardinality statistics
        if (_rec_per_key) {
            tokudb::memory::free(_rec_per_key);
            _rec_per_key = NULL;
            _rec_per_keys = 0;
        }

        // cached key descriptors; each descriptor owns its name string
        for (uint i = 0; i < _keys; i++) {
            tokudb::memory::free(_key_descriptors[i]._name);
        }
        tokudb::memory::free(_key_descriptors);
        _keys = _max_key_parts = 0; _key_descriptors = NULL;

        _state = TOKUDB_SHARE::CLOSED;
    }
    mutex_t_unlock(_mutex);

    TOKUDB_SHARE_DBUG_RETURN(result);
}
// Fold a statement's row changes into the share's cached row count and the
// change-activity counter; may trigger an automatic ANALYZE when activity
// crosses the tokudb auto_analyze threshold (percent of cached row count).
void TOKUDB_SHARE::update_row_count(
    THD* thd,
    uint64_t added,
    uint64_t deleted,
    uint64_t updated) {

    uint64_t delta = added + deleted + updated;
    lock();
    // clamp so the unsigned row count can not wrap below zero
    if (deleted > added && _rows < (deleted - added)) {
        _rows = 0;
    } else {
        _rows += added - deleted;
    }
    _row_delta_activity += delta;
    // ~0 appears to be treated as a reserved value; skip it on wraparound
    if (_row_delta_activity == (uint64_t)~0)
        _row_delta_activity = 1;

    ulonglong auto_threshold = tokudb::sysvars::auto_analyze(thd);
    if (delta && auto_threshold > 0 && _allow_auto_analysis) {
        ulonglong pct_of_rows_changed_to_trigger;
        pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100);
        if (TOKUDB_UNLIKELY(_row_delta_activity >= pct_of_rows_changed_to_trigger)) {
            char msg[200];
            snprintf(msg,
                     sizeof(msg),
                     "TokuDB: Auto %s analysis for %s, delta_activity %llu is "
                     "greater than %llu percent of %llu rows.",
                     tokudb::sysvars::analyze_in_background(thd) > 0
                         ? "scheduling background"
                         : "running foreground",
                     full_table_name(),
                     _row_delta_activity,
                     auto_threshold,
                     (ulonglong)(_rows));

            // analyze_standard will unlock _mutex regardless of success/failure
            int ret = analyze_standard(thd, NULL);
            if (TOKUDB_UNLIKELY(ret == 0 && tokudb::sysvars::debug > 0)) {
                sql_print_information("%s - succeeded.", msg);
            } else if (TOKUDB_UNLIKELY(ret != 0)) {
                sql_print_information(
                    "%s - failed, likely a job already running.",
                    msg);
            }
        }
    }
    unlock();
}
// Copy the share's cached cardinality stats (_rec_per_key) into the TABLE's
// per-key records-per-key arrays, scaled by cardinality_scale_percent.
// Holds the share lock for the duration of the copy.
void TOKUDB_SHARE::set_cardinality_counts_in_table(TABLE* table) {
    lock();
    // _rec_per_key holds one entry per user-defined key part, in key order
    uint32_t next_key_part = 0;
    for (uint32_t i = 0; i < table->s->keys; i++) {
        KEY* key = &table->key_info[i];
        bool is_unique_key =
            (i == table->s->primary_key) || (key->flags & HA_NOSAME);

        /* Check if this index supports index statistics. */
        if (!key->supports_records_per_key()) {
            continue;
        }

        for (uint32_t j = 0; j < key->actual_key_parts; j++) {
            if (j >= key->user_defined_key_parts) {
                // MySQL 'hidden' keys, really needs deeper investigation
                // into MySQL hidden keys vs TokuDB hidden keys
                key->set_records_per_key(j, 1.0);
                key->rec_per_key[j] = 1;
                continue;
            }

            assert_always(next_key_part < _rec_per_keys);
            ulong val = _rec_per_key[next_key_part++];
            val = (val * tokudb::sysvars::cardinality_scale_percent) / 100;
            // the full prefix of a unique key has cardinality 1 by definition;
            // 0 would be meaningless to the optimizer, so floor at 1
            if (val == 0 || _rows == 0 ||
                (is_unique_key && j == key->actual_key_parts - 1)) {
                val = 1;
            }
            key->set_records_per_key(
                j,
                static_cast<rec_per_key_t>(val));
            key->rec_per_key[j] = val;
        }
    }
    unlock();
}
420
// Guard used inside cursor-based handler methods: if the cursor has been
// invalidated (NULLed after a prior failure), surface the saved error and
// jump to the function's local cleanup label. Relies on the caller having
// `cursor`, `error`, `last_cursor_error`, and a `cleanup:` label in scope.
#define HANDLE_INVALID_CURSOR() \
    if (cursor == NULL) { \
        error = last_cursor_error; \
        goto cleanup; \
    }
426
// handler API: the storage-engine name reported for this table.
const char *ha_tokudb::table_type() const {
    return tokudb_hton_name;
}
430
// handler API: every TokuDB index reports as "BTREE", regardless of inx.
const char *ha_tokudb::index_type(TOKUDB_UNUSED(uint inx)) {
    return "BTREE";
}
434
435 /*
436 * returns NULL terminated file extension string
437 */
// handler API: NULL-terminated list of file extensions this engine owns.
const char **ha_tokudb::bas_ext() const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBUG_RETURN(ha_tokudb_exts);
}
442
// True when the running statement is INSERT IGNORE: the IGNORE flag is set
// while the duplicate-handling mode is still DUP_ERROR (REPLACE and
// ON DUPLICATE KEY UPDATE use other duplicates modes).
static inline bool is_insert_ignore (THD* thd) {
    //
    // from http://lists.mysql.com/internals/37735
    //
    return thd->lex->is_ignore() && thd->lex->duplicates == DUP_ERROR;
}
449
// True when the running statement is REPLACE [INTO] (duplicates replaced).
static inline bool is_replace_into(THD* thd) {
    return thd->lex->duplicates == DUP_REPLACE;
}
453
// handler API: engine capability flags; TokuDB supports both row-based and
// statement-based binlogging on top of its base flag set.
ulonglong ha_tokudb::table_flags() const {
    return int_table_flags | HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
}
457
458 //
459 // Returns a bit mask of capabilities of the key or its part specified by
460 // the arguments. The capabilities are defined in sql/handler.h.
461 //
// handler API: capability bits for index `idx` (part/all_parts unused).
// All TokuDB indexes support ordered scans in both directions, key-only
// reads, range reads, and index condition pushdown; clustering indexes
// additionally advertise HA_CLUSTERED_INDEX.
ulong ha_tokudb::index_flags(uint idx,
                             TOKUDB_UNUSED(uint part),
                             TOKUDB_UNUSED(bool all_parts)) const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(table_share);
    ulong flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
                   HA_KEYREAD_ONLY | HA_READ_RANGE | HA_DO_INDEX_COND_PUSHDOWN);
    if (key_is_clustering(&table_share->key_info[idx])) {
        flags |= HA_CLUSTERED_INDEX;
    }
    DBUG_RETURN(flags);
}
474
475
476 //
477 // struct that will be used as a context for smart DBT callbacks
478 // contains parameters needed to complete the smart DBT cursor call
479 //
typedef struct smart_dbt_info {
    ha_tokudb* ha; //instance to ha_tokudb needed for reading the row
    uchar* buf; // output buffer where row will be written
    uint keynr; // index into share->key_file that represents DB we are currently operating on
} *SMART_DBT_INFO;

// context for bulk-fetch ("bf") smart DBT callbacks
typedef struct smart_dbt_bf_info {
    ha_tokudb* ha;       // handler instance doing the read
    bool need_val;       // whether the row value must be materialized too
    int direction;       // scan direction — presumably +1/-1; confirm at callers
    THD* thd;            // connection running the scan
    uchar* buf;          // output buffer for the row
    DBT* key_to_compare; // boundary key for the scan — verify semantics at callers
} *SMART_DBT_BF_INFO;

// context for index_read-style callbacks that must also prefix-compare the
// key found by the cursor against the key originally searched for
typedef struct index_read_info {
    struct smart_dbt_info smart_dbt_info;
    int cmp;       // result of prefix_cmp_dbts(orig_key, found key)
    DBT* orig_key; // the key the caller searched with
} *INDEX_READ_INFO;
500
501 //
502 // smart DBT callback function for optimize
503 // in optimize, we want to flatten DB by doing
504 // a full table scan. Therefore, we don't
505 // want to actually do anything with the data, hence
506 // callback does nothing
507 //
static int smart_dbt_do_nothing(TOKUDB_UNUSED(DBT const* key),
                                TOKUDB_UNUSED(DBT const* row),
                                TOKUDB_UNUSED(void* context)) {
    // intentionally empty: the cursor traversal itself is the work
    return 0;
}
513
// Point-query callback: recover the hidden primary key (if the table has
// one) from `key`, then unpack `row` into info->buf in MySQL row format.
static int
smart_dbt_callback_rowread_ptquery (DBT const *key, DBT const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    return info->ha->read_row_callback(info->buf,info->keynr,row,key);
}
520
521 //
522 // Smart DBT callback function in case where we have a covering index
523 //
//
// Smart DBT callback function in case where we have a covering index:
// the whole result can be produced from the key alone, so the row value
// is ignored.
//
static int smart_dbt_callback_keyread(DBT const* key,
                                      DBT TOKUDB_UNUSED(const* row),
                                      void* context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    info->ha->read_key_only(info->buf,info->keynr,key);
    return 0;
}
532
533 //
534 // Smart DBT callback function in case where we do NOT have a covering index
535 //
536 static int
smart_dbt_callback_rowread(DBT const * key,DBT const * row,void * context)537 smart_dbt_callback_rowread(DBT const *key, DBT const *row, void *context) {
538 int error = 0;
539 SMART_DBT_INFO info = (SMART_DBT_INFO)context;
540 info->ha->extract_hidden_primary_key(info->keynr, key);
541 error = info->ha->read_primary_key(info->buf,info->keynr,row,key);
542 return error;
543 }
544
545 //
546 // Smart DBT callback function in case where we have a covering index
547 //
smart_dbt_callback_ir_keyread(DBT const * key,TOKUDB_UNUSED (DBT const * row),void * context)548 static int smart_dbt_callback_ir_keyread(DBT const* key,
549 TOKUDB_UNUSED(DBT const* row),
550 void* context) {
551 INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
552 ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
553 if (ir_info->cmp) {
554 return 0;
555 }
556 return smart_dbt_callback_keyread(key, row, &ir_info->smart_dbt_info);
557 }
558
// Comparison-only callback: records whether the found key still matches the
// requested key prefix (in ir_info->cmp) without reading any row data.
static int smart_dbt_callback_lookup(DBT const* key,
                                     TOKUDB_UNUSED(DBT const* row),
                                     void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    return 0;
}
566
567
568 //
569 // Smart DBT callback function in case where we do NOT have a covering index
570 //
// index_read variant of the non-covering callback: prefix-compare first;
// only a key that still matches the requested prefix gets its row unpacked.
static int
smart_dbt_callback_ir_rowread(DBT const *key, DBT const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        // prefix no longer matches: stop without touching the buffer
        return 0;
    }
    return smart_dbt_callback_rowread(key, row, &ir_info->smart_dbt_info);
}
580
581 //
582 // macro for Smart DBT callback function,
583 // so we do not need to put this long line of code in multiple places
584 //
// Select the plain callback pair based on whether a key-only read suffices.
#define SMART_DBT_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_keyread : smart_dbt_callback_rowread )
// Same selection for the index_read (prefix-comparing) callback pair.
#define SMART_DBT_IR_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_ir_keyread : smart_dbt_callback_ir_rowread )

//
// macro that modifies read flag for cursor operations depending on whether
// we have preacquired lock or not; reads the handler members
// range_lock_grabbed and use_write_locks at the expansion site
//
#define SET_PRELOCK_FLAG(flg) ((flg) | (range_lock_grabbed ? (use_write_locks ? DB_PRELOCKED_WRITE : DB_PRELOCKED) : 0))
593
594 //
595 // This method retrieves the value of the auto increment column of a record in MySQL format
596 // This was basically taken from MyISAM
597 // Parameters:
598 // type - the type of the auto increment column (e.g. int, float, double...)
599 // offset - offset into the record where the auto increment column is stored
600 // [in] record - MySQL row whose auto increment value we want to extract
601 // Returns:
602 // The value of the auto increment column in record
603 //
static ulonglong retrieve_auto_increment(uint16 type, uint32 offset,const uchar *record)
{
    const uchar *key;                       /* Key (field bytes inside record) */
    ulonglong unsigned_autoinc = 0;         /* Unsigned auto-increment */
    longlong signed_autoinc = 0;            /* Signed auto-increment */
    enum { unsigned_type, signed_type } autoinc_type;
    float float_tmp;                        /* Temporary variable */
    double double_tmp;                      /* Temporary variable */

    key = ((uchar *) record) + offset;

    /* Set default autoincrement type */
    autoinc_type = unsigned_type;

    // Decode the column value according to its key type; the *korr macros
    // read little-endian MySQL on-disk/record format.
    switch (type) {
    case HA_KEYTYPE_INT8:
        signed_autoinc = (longlong) *(char*)key;
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_BINARY:
        unsigned_autoinc = (ulonglong) *(uchar*) key;
        break;

    case HA_KEYTYPE_SHORT_INT:
        signed_autoinc = (longlong) sint2korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_USHORT_INT:
        unsigned_autoinc = (ulonglong) uint2korr(key);
        break;

    case HA_KEYTYPE_LONG_INT:
        signed_autoinc = (longlong) sint4korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_ULONG_INT:
        unsigned_autoinc = (ulonglong) uint4korr(key);
        break;

    case HA_KEYTYPE_INT24:
        signed_autoinc = (longlong) sint3korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_UINT24:
        unsigned_autoinc = (ulonglong) tokudb_uint3korr(key);
        break;

    case HA_KEYTYPE_LONGLONG:
        signed_autoinc = sint8korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_ULONGLONG:
        unsigned_autoinc = uint8korr(key);
        break;

    /* The remaining two cases should not be used but are included for
       compatibility */
    case HA_KEYTYPE_FLOAT:
        float4get(&float_tmp, key);  /* Note: float4get is a macro */
        signed_autoinc = (longlong) float_tmp;
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_DOUBLE:
        float8get(&double_tmp, key); /* Note: float8get is a macro */
        signed_autoinc = (longlong) double_tmp;
        autoinc_type = signed_type;
        break;

    default:
        assert_unreachable();
    }

    // negative auto-increment values are clamped to 0 before returning
    if (signed_autoinc < 0) {
        signed_autoinc = 0;
    }

    return autoinc_type == unsigned_type ?
        unsigned_autoinc : (ulonglong) signed_autoinc;
}
689
// Byte offset of `field` within a MySQL record, relative to table->record[0].
static inline ulong field_offset(Field* field, TABLE* table) {
    return((ulong) (field->ptr - table->record[0]));
}
693
tx_to_toku_iso(ulong tx_isolation)694 static inline HA_TOKU_ISO_LEVEL tx_to_toku_iso(ulong tx_isolation) {
695 if (tx_isolation == ISO_READ_UNCOMMITTED) {
696 return hatoku_iso_read_uncommitted;
697 }
698 else if (tx_isolation == ISO_READ_COMMITTED) {
699 return hatoku_iso_read_committed;
700 }
701 else if (tx_isolation == ISO_REPEATABLE_READ) {
702 return hatoku_iso_repeatable_read;
703 }
704 else {
705 return hatoku_iso_serializable;
706 }
707 }
708
toku_iso_to_txn_flag(HA_TOKU_ISO_LEVEL lvl)709 static inline uint32_t toku_iso_to_txn_flag (HA_TOKU_ISO_LEVEL lvl) {
710 if (lvl == hatoku_iso_read_uncommitted) {
711 return DB_READ_UNCOMMITTED;
712 }
713 else if (lvl == hatoku_iso_read_committed) {
714 return DB_READ_COMMITTED;
715 }
716 else if (lvl == hatoku_iso_repeatable_read) {
717 return DB_TXN_SNAPSHOT;
718 }
719 else {
720 return 0;
721 }
722 }
723
filter_key_part_compare(const void * left,const void * right)724 static int filter_key_part_compare (const void* left, const void* right) {
725 FILTER_KEY_PART_INFO* left_part= (FILTER_KEY_PART_INFO *)left;
726 FILTER_KEY_PART_INFO* right_part = (FILTER_KEY_PART_INFO *)right;
727 return left_part->offset - right_part->offset;
728 }
729
730 //
731 // Be very careful with parameters passed to this function. Who knows
732 // if key, table have proper info set. I had to verify by checking
733 // in the debugger.
734 //
// Mark in `key_filter` every table field that is fully covered by `key`,
// so those fields can be omitted from the packed row value. A field is
// only filtered when the key part stores the field's complete value
// (never for blobs, and for var/fix strings only when the part length
// equals the full field length). `get_offset_from_keypart` selects which
// of two offset sources to trust (see the hack note below).
void set_key_filter(
    MY_BITMAP* key_filter,
    KEY* key,
    TABLE* table,
    bool get_offset_from_keypart) {

    FILTER_KEY_PART_INFO parts[MAX_REF_PARTS];
    uint curr_skip_index = 0;

    for (uint i = 0; i < key->user_defined_key_parts; i++) {
        //
        // horrendous hack due to bugs in mysql, basically
        // we cannot always reliably get the offset from the same source
        //
        parts[i].offset =
            get_offset_from_keypart ?
                key->key_part[i].offset :
                field_offset(key->key_part[i].field, table);
        parts[i].part_index = i;
    }
    // sort key parts by record offset so they can be matched against the
    // fields (which are walked in record-offset order below)
    qsort(
        parts,                       // start of array
        key->user_defined_key_parts, //num elements
        sizeof(*parts),              //size of each element
        filter_key_part_compare);

    for (uint i = 0; i < table->s->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (curr_skip_index < key->user_defined_key_parts) {
            uint curr_skip_offset = 0;
            curr_skip_offset = parts[curr_skip_index].offset;
            if (curr_skip_offset == curr_field_offset) {
                //
                // we have hit a field that is a portion of the primary key
                //
                uint curr_key_index = parts[curr_skip_index].part_index;
                curr_skip_index++;
                //
                // only choose to continue over the key if the key's length matches the field's length
                // otherwise, we may have a situation where the column is a varchar(10), the
                // key is only the first 3 characters, and we end up losing the last 7 bytes of the
                // column
                //
                TOKU_TYPE toku_type = mysql_to_toku_type(field);
                switch (toku_type) {
                case toku_type_blob:
                    // blobs are never filtered: the key cannot hold the payload
                    break;
                case toku_type_varbinary:
                case toku_type_varstring:
                case toku_type_fixbinary:
                case toku_type_fixstring:
                    if (key->key_part[curr_key_index].length == field->field_length) {
                        bitmap_set_bit(key_filter,i);
                    }
                    break;
                default:
                    bitmap_set_bit(key_filter,i);
                    break;
                }
            }
        }
    }
}
799
pack_fixed_field(uchar * to_tokudb,const uchar * from_mysql,uint32_t num_bytes)800 static inline uchar* pack_fixed_field(
801 uchar* to_tokudb,
802 const uchar* from_mysql,
803 uint32_t num_bytes
804 )
805 {
806 switch (num_bytes) {
807 case (1):
808 memcpy(to_tokudb, from_mysql, 1);
809 break;
810 case (2):
811 memcpy(to_tokudb, from_mysql, 2);
812 break;
813 case (3):
814 memcpy(to_tokudb, from_mysql, 3);
815 break;
816 case (4):
817 memcpy(to_tokudb, from_mysql, 4);
818 break;
819 case (8):
820 memcpy(to_tokudb, from_mysql, 8);
821 break;
822 default:
823 memcpy(to_tokudb, from_mysql, num_bytes);
824 break;
825 }
826 return to_tokudb+num_bytes;
827 }
828
unpack_fixed_field(uchar * to_mysql,const uchar * from_tokudb,uint32_t num_bytes)829 static inline const uchar* unpack_fixed_field(
830 uchar* to_mysql,
831 const uchar* from_tokudb,
832 uint32_t num_bytes
833 )
834 {
835 switch (num_bytes) {
836 case (1):
837 memcpy(to_mysql, from_tokudb, 1);
838 break;
839 case (2):
840 memcpy(to_mysql, from_tokudb, 2);
841 break;
842 case (3):
843 memcpy(to_mysql, from_tokudb, 3);
844 break;
845 case (4):
846 memcpy(to_mysql, from_tokudb, 4);
847 break;
848 case (8):
849 memcpy(to_mysql, from_tokudb, 8);
850 break;
851 default:
852 memcpy(to_mysql, from_tokudb, num_bytes);
853 break;
854 }
855 return from_tokudb+num_bytes;
856 }
857
// Append one variable-length field to the tokudb row and record its offset.
// Returns the new end-of-data position.
static inline uchar* write_var_field(
    uchar* to_tokudb_offset_ptr,   // where this field's offset entry is written
    uchar* to_tokudb_data,         // where the field payload is written
    uchar* to_tokudb_offset_start, // base position offsets are measured from
    const uchar * data,            // the data to write
    uint32_t data_length,          // length of data to write
    uint32_t offset_bytes          // width (1 or 2) of each offset entry
    )
{
    memcpy(to_tokudb_data, data, data_length);
    //
    // for offset, we pack the offset where the data ENDS!
    //
    uint32_t offset = to_tokudb_data + data_length - to_tokudb_offset_start;
    switch(offset_bytes) {
    case (1):
        to_tokudb_offset_ptr[0] = (uchar)offset;
        break;
    case (2):
        int2store(to_tokudb_offset_ptr,offset);
        break;
    default:
        // rows only ever use 1- or 2-byte offsets
        assert_unreachable();
        break;
    }
    return to_tokudb_data + data_length;
}
885
get_var_data_length(const uchar * from_mysql,uint32_t mysql_length_bytes)886 static inline uint32_t get_var_data_length(
887 const uchar * from_mysql,
888 uint32_t mysql_length_bytes
889 )
890 {
891 uint32_t data_length;
892 switch(mysql_length_bytes) {
893 case(1):
894 data_length = from_mysql[0];
895 break;
896 case(2):
897 data_length = uint2korr(from_mysql);
898 break;
899 default:
900 assert_unreachable();
901 }
902 return data_length;
903 }
904
pack_var_field(uchar * to_tokudb_offset_ptr,uchar * to_tokudb_data,uchar * to_tokudb_offset_start,const uchar * from_mysql,uint32_t mysql_length_bytes,uint32_t offset_bytes)905 static inline uchar* pack_var_field(
906 uchar* to_tokudb_offset_ptr, //location where offset data is going to be written
907 uchar* to_tokudb_data, // pointer to where tokudb data should be written
908 uchar* to_tokudb_offset_start, //location where data starts, IS THIS A BAD NAME????
909 const uchar * from_mysql, // mysql data
910 uint32_t mysql_length_bytes, //number of bytes used to store length in from_mysql
911 uint32_t offset_bytes //number of offset_bytes used in tokudb row
912 )
913 {
914 uint data_length = get_var_data_length(from_mysql, mysql_length_bytes);
915 return write_var_field(
916 to_tokudb_offset_ptr,
917 to_tokudb_data,
918 to_tokudb_offset_start,
919 from_mysql + mysql_length_bytes,
920 data_length,
921 offset_bytes
922 );
923 }
924
unpack_var_field(uchar * to_mysql,const uchar * from_tokudb_data,uint32_t from_tokudb_data_len,uint32_t mysql_length_bytes)925 static inline void unpack_var_field(
926 uchar* to_mysql,
927 const uchar* from_tokudb_data,
928 uint32_t from_tokudb_data_len,
929 uint32_t mysql_length_bytes
930 )
931 {
932 //
933 // store the length
934 //
935 switch (mysql_length_bytes) {
936 case(1):
937 to_mysql[0] = (uchar)from_tokudb_data_len;
938 break;
939 case(2):
940 int2store(to_mysql, from_tokudb_data_len);
941 break;
942 default:
943 assert_unreachable();
944 }
945 //
946 // store the data
947 //
948 memcpy(to_mysql+mysql_length_bytes, from_tokudb_data, from_tokudb_data_len);
949 }
950
// Pack a MySQL blob field into the tokudb row. In the mysql record a blob
// is stored as a 1-4 byte length prefix followed by a POINTER to the
// payload; tokudb stores the length prefix followed by the payload inline.
// Returns the write position just past the packed blob.
static uchar* pack_toku_field_blob(
    uchar* to_tokudb,
    const uchar* from_mysql,
    Field* field
    )
{
    uint32_t len_bytes = field->row_pack_length();
    uint32_t length = 0;
    uchar* data_ptr = NULL;
    // copy the length prefix verbatim
    memcpy(to_tokudb, from_mysql, len_bytes);

    // decode the little-endian length prefix (1-4 bytes)
    switch (len_bytes) {
    case (1):
        length = (uint32_t)(*from_mysql);
        break;
    case (2):
        length = uint2korr(from_mysql);
        break;
    case (3):
        length = tokudb_uint3korr(from_mysql);
        break;
    case (4):
        length = uint4korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }

    if (length > 0) {
        // the payload pointer sits right after the length prefix in the
        // mysql record; fetch it and copy the payload inline
        memcpy((uchar *)(&data_ptr), from_mysql + len_bytes, sizeof(uchar*));
        memcpy(to_tokudb + len_bytes, data_ptr, length);
    }
    return (to_tokudb + len_bytes + length);
}
985
create_tokudb_trx_data_instance(tokudb_trx_data ** out_trx)986 static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) {
987 int error;
988 tokudb_trx_data* trx = (tokudb_trx_data *) tokudb::memory::malloc(
989 sizeof(*trx),
990 MYF(MY_ZEROFILL));
991 if (!trx) {
992 error = ENOMEM;
993 goto cleanup;
994 }
995
996 *out_trx = trx;
997 error = 0;
998 cleanup:
999 return error;
1000 }
1001
// Core row-generation routine shared by the put/del callbacks below.
// Given a primary row (src_key/src_val), derive the key -- and, for
// clustering indexes, the value -- destined for dictionary dest_db.
// Packing is driven by the "row descriptor" blob stored in dest_db's
// descriptor DBT.  dest_val may be NULL when only a key is needed
// (deletes).  Always returns 0; the packing helpers assert on failure.
static inline int tokudb_generate_row(DB* dest_db,
                                      TOKUDB_UNUSED(DB* src_db),
                                      DBT* dest_key,
                                      DBT* dest_val,
                                      const DBT* src_key,
                                      const DBT* src_val) {
    int error;

    DB* curr_db = dest_db;
    uchar* row_desc = NULL;
    uint32_t desc_size;
    uchar* buff = NULL;
    uint32_t max_key_len = 0;

    // the descriptor begins with a uint32 offset to the key descriptor;
    // the key descriptor itself begins with its own uint32 length, which
    // includes the 4 length bytes themselves
    row_desc = (uchar *)curr_db->descriptor->dbt.data;
    row_desc += (*(uint32_t *)row_desc);
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;

    if (is_key_pk(row_desc)) {
        // primary key: the source key/val pass through unchanged.  Free
        // any buffer we own, then alias the source DBTs; flags = 0 marks
        // the memory as not owned by this DBT.
        if (dest_key->flags == DB_DBT_REALLOC && dest_key->data != NULL) {
            free(dest_key->data);
        }
        if (dest_val != NULL) {
            if (dest_val->flags == DB_DBT_REALLOC && dest_val->data != NULL) {
                free(dest_val->data);
            }
        }
        dest_key->data = src_key->data;
        dest_key->size = src_key->size;
        dest_key->flags = 0;
        if (dest_val != NULL) {
            dest_val->data = src_val->data;
            dest_val->size = src_val->size;
            dest_val->flags = 0;
        }
        error = 0;
        goto cleanup;
    }
    // at this point, we need to create the key/val and set it
    // in the DBTs
    // flags == 0 means the DBT was last used for the pk pass-through
    // above; convert it back to a REALLOC-managed buffer
    if (dest_key->flags == 0) {
        dest_key->ulen = 0;
        dest_key->size = 0;
        dest_key->data = NULL;
        dest_key->flags = DB_DBT_REALLOC;
    }
    if (dest_key->flags == DB_DBT_REALLOC) {
        // grow the destination buffer to the worst-case packed key size
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;

        if (max_key_len > dest_key->ulen) {
            void* old_ptr = dest_key->data;
            void* new_ptr = NULL;
            new_ptr = realloc(old_ptr, max_key_len);
            assert_always(new_ptr);
            dest_key->data = new_ptr;
            dest_key->ulen = max_key_len;
        }

        buff = (uchar *)dest_key->data;
        assert_always(buff != nullptr);
        assert_always(max_key_len > 0);
    } else {
        assert_unreachable();
    }

    dest_key->size = pack_key_from_desc(buff, row_desc, desc_size, src_key,
                                        src_val);
    assert_always(dest_key->ulen >= dest_key->size);
    // with TOKUDB_DEBUG_CHECK_KEY, recompute the bound so the sanity
    // check below always runs, even when no realloc was needed
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY)) &&
        !max_key_len) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;
    }
    if (max_key_len) {
        assert_always(max_key_len >= dest_key->size);
    }

    // advance to the value descriptor that follows the key descriptor
    row_desc += desc_size;
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;
    if (dest_val != NULL) {
        if (!is_key_clustering(desc_size) || src_val->size == 0) {
            // non-clustering keys store an empty value
            dest_val->size = 0;
        } else {
            uchar* buff = NULL;
            if (dest_val->flags == 0) {
                dest_val->ulen = 0;
                dest_val->size = 0;
                dest_val->data = NULL;
                dest_val->flags = DB_DBT_REALLOC;
            }
            if (dest_val->flags == DB_DBT_REALLOC){
                // a clustering value is never larger than the source row
                if (dest_val->ulen < src_val->size) {
                    void* old_ptr = dest_val->data;
                    void* new_ptr = NULL;
                    new_ptr = realloc(old_ptr, src_val->size);
                    assert_always(new_ptr);
                    dest_val->data = new_ptr;
                    dest_val->ulen = src_val->size;
                }
                buff = (uchar *)dest_val->data;
                assert_always(buff != NULL);
            } else {
                assert_unreachable();
            }
            dest_val->size = pack_clustering_val_from_desc(
                buff,
                row_desc,
                desc_size,
                src_val);
            assert_always(dest_val->ulen >= dest_val->size);
        }
    }
    error = 0;
cleanup:
    return error;
}
1121
generate_row_for_del(DB * dest_db,DB * src_db,DBT_ARRAY * dest_key_arrays,const DBT * src_key,const DBT * src_val)1122 static int generate_row_for_del(
1123 DB *dest_db,
1124 DB *src_db,
1125 DBT_ARRAY *dest_key_arrays,
1126 const DBT *src_key,
1127 const DBT *src_val
1128 )
1129 {
1130 DBT* dest_key = &dest_key_arrays->dbts[0];
1131 return tokudb_generate_row(
1132 dest_db,
1133 src_db,
1134 dest_key,
1135 NULL,
1136 src_key,
1137 src_val
1138 );
1139 }
1140
1141
generate_row_for_put(DB * dest_db,DB * src_db,DBT_ARRAY * dest_key_arrays,DBT_ARRAY * dest_val_arrays,const DBT * src_key,const DBT * src_val)1142 static int generate_row_for_put(
1143 DB *dest_db,
1144 DB *src_db,
1145 DBT_ARRAY *dest_key_arrays,
1146 DBT_ARRAY *dest_val_arrays,
1147 const DBT *src_key,
1148 const DBT *src_val
1149 )
1150 {
1151 DBT* dest_key = &dest_key_arrays->dbts[0];
1152 DBT *dest_val = (dest_val_arrays == NULL) ? NULL : &dest_val_arrays->dbts[0];
1153 return tokudb_generate_row(
1154 dest_db,
1155 src_db,
1156 dest_key,
1157 dest_val,
1158 src_key,
1159 src_val
1160 );
1161 }
1162
//
// Constructor: puts every per-handler field into a safe empty state.
// No dictionaries are opened and no share is acquired here; that happens
// later in open().
//
ha_tokudb::ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg):handler(hton, table_arg) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    share = NULL;
    // capabilities advertised to the MySQL server for this handler
    int_table_flags = HA_REC_NOT_IN_SEQ | HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS
        | HA_PRIMARY_KEY_IN_READ_INDEX | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
        | HA_FILE_BASED | HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX
        | HA_CAN_WRITE_DURING_OPTIMIZE | HA_ONLINE_ANALYZE;
    alloc_ptr = NULL;
    rec_buff = NULL;
    rec_update_buff = NULL;
    transaction = NULL;
    cursor = NULL;
    // column-projection state used when only part of a row is read
    fixed_cols_for_query = NULL;
    var_cols_for_query = NULL;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;
    unpack_entire_row = true;
    read_blobs = false;
    read_key = false;
    added_rows = 0;
    deleted_rows = 0;
    updated_rows = 0;
    last_dup_key = UINT_MAX;
    using_ignore = false;
    using_ignore_no_key = false;
    last_cursor_error = 0;
    range_lock_grabbed = false;
    blob_buff = NULL;
    num_blob_bytes = 0;
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    // one DBT array per possible dictionary; each starts with capacity 1
    memset(mult_key_dbt_array, 0, sizeof(mult_key_dbt_array));
    memset(mult_rec_dbt_array, 0, sizeof(mult_rec_dbt_array));
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_key_dbt_array[i], 1);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_rec_dbt_array[i], 1);
    }
    loader = NULL;
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    lock.type = TL_IGNORE;
    // per-dictionary flags used with DB->put_multiple / del_multiple
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_del_flags[i] = DB_DELETE_ANY;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    use_write_locks = false;
    // bulk-fetch range-query buffer state
    range_query_buff = NULL;
    size_range_query_buff = 0;
    bytes_used_in_range_query_buff = 0;
    curr_range_query_buff_offset = 0;
    doing_bulk_fetch = false;
    prelocked_left_range_size = 0;
    prelocked_right_range_size = 0;
    tokudb_active_index = MAX_KEY;
    invalidate_icp();
    trx_handler_list.data = this;
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    in_rpl_write_rows = in_rpl_delete_rows = in_rpl_update_rows = false;
#endif  // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
1229
~ha_tokudb()1230 ha_tokudb::~ha_tokudb() {
1231 TOKUDB_HANDLER_DBUG_ENTER("");
1232 for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
1233 toku_dbt_array_destroy(&mult_key_dbt_array[i]);
1234 }
1235 for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
1236 toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
1237 }
1238 TOKUDB_HANDLER_DBUG_VOID_RETURN;
1239 }
1240
1241 //
1242 // states if table has an auto increment column, if so, sets index where auto inc column is to index
1243 // Parameters:
1244 // [out] index - if auto inc exists, then this param is set to where it exists in table, if not, then unchanged
1245 // Returns:
1246 // true if auto inc column exists, false otherwise
1247 //
has_auto_increment_flag(uint * index)1248 bool ha_tokudb::has_auto_increment_flag(uint* index) {
1249 //
1250 // check to see if we have auto increment field
1251 //
1252 bool ai_found = false;
1253 uint ai_index = 0;
1254 for (uint i = 0; i < table_share->fields; i++, ai_index++) {
1255 Field* field = table->field[i];
1256 if (field->flags & AUTO_INCREMENT_FLAG) {
1257 ai_found = true;
1258 *index = ai_index;
1259 break;
1260 }
1261 }
1262 return ai_found;
1263 }
1264
open_status_dictionary(DB ** ptr,const char * name,DB_TXN * txn)1265 static int open_status_dictionary(DB** ptr, const char* name, DB_TXN* txn) {
1266 int error;
1267 char* newname = NULL;
1268 size_t newname_len = get_max_dict_name_path_length(name);
1269 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
1270 if (newname == NULL) {
1271 error = ENOMEM;
1272 goto cleanup;
1273 }
1274 make_name(newname, newname_len, name, "status");
1275 TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "open:%s", newname);
1276
1277 error = tokudb::metadata::open(db_env, ptr, newname, txn);
1278 cleanup:
1279 tokudb::memory::free(newname);
1280 return error;
1281 }
1282
//
// Opens the table's "main" dictionary, which stores the primary rows.
// share->file and share->key_file[primary_key] become aliases for the
// same DB handle.  On any failure the partially-opened handle is closed
// and both share pointers are reset to NULL so a retry starts clean.
// Returns 0 on success, an errno/engine error code otherwise.
//
int ha_tokudb::open_main_dictionary(
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error;
    char* newname = NULL;
    size_t newname_len = 0;
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;

    // the share must not already have an open main dictionary
    assert_always(share->file == NULL);
    assert_always(share->key_file[primary_key] == NULL);
    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(
        newname_len,
        MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto exit;
    }
    make_name(newname, newname_len, name, "main");

    error = db_create(&share->file, db_env, 0);
    if (error) {
        goto exit;
    }
    // the primary key's key_file entry aliases the main dictionary
    share->key_file[primary_key] = share->file;

    error =
        share->file->open(
            share->file,
            txn,
            newname,
            NULL,
            DB_BTREE,
            open_flags,
            is_read_only ? 0 : S_IWUSR);
    if (error) {
        goto exit;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        share->file);

    error = 0;
exit:
    if (error) {
        // undo a partial open so a later retry starts from a clean state
        if (share->file) {
            int r = share->file->close(
                share->file,
                0
                );
            assert_always(r==0);
            share->file = NULL;
            share->key_file[primary_key] = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}
1346
1347 //
1348 // Open a secondary table, the key will be a secondary index, the data will
1349 // be a primary key
1350 //
open_secondary_dictionary(DB ** ptr,KEY * key_info,const char * name,bool is_read_only,DB_TXN * txn)1351 int ha_tokudb::open_secondary_dictionary(
1352 DB** ptr,
1353 KEY* key_info,
1354 const char* name,
1355 bool is_read_only,
1356 DB_TXN* txn) {
1357
1358 int error = ENOSYS;
1359 char dict_name[MAX_DICT_NAME_LEN];
1360 uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;
1361 char* newname = NULL;
1362 size_t newname_len = 0;
1363
1364 sprintf(dict_name, "key-%s", key_info->name);
1365
1366 newname_len = get_max_dict_name_path_length(name);
1367 newname =
1368 (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME|MY_ZEROFILL));
1369 if (newname == NULL) {
1370 error = ENOMEM;
1371 goto cleanup;
1372 }
1373 make_name(newname, newname_len, name, dict_name);
1374
1375
1376 if ((error = db_create(ptr, db_env, 0))) {
1377 set_my_errno(error);
1378 goto cleanup;
1379 }
1380
1381
1382 error = (*ptr)->open(*ptr, txn, newname, NULL, DB_BTREE, open_flags, is_read_only ? 0 : S_IWUSR);
1383 if (error) {
1384 set_my_errno(error);
1385 goto cleanup;
1386 }
1387 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
1388 TOKUDB_DEBUG_OPEN,
1389 "open:%s:file=%p",
1390 newname,
1391 *ptr);
1392 cleanup:
1393 if (error) {
1394 if (*ptr) {
1395 int r = (*ptr)->close(*ptr, 0);
1396 assert_always(r==0);
1397 *ptr = NULL;
1398 }
1399 }
1400 tokudb::memory::free(newname);
1401 return error;
1402 }
1403
// Build the per-column pack info (cp_info) and multi-column pack info
// (mcp_info) for dictionary keynr.  For each column NOT filtered out of
// this key's row image, record either its fixed-size offset (relative to
// the end of the NULL bytes) or its ordinal among variable-size fields.
// Returns 0 on success, ENOMEM if the cp_info allocation fails.
static int initialize_col_pack_info(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    int error = ENOSYS;
    //
    // set up the cp_info
    //
    assert_always(kc_info->cp_info[keynr] == NULL);
    kc_info->cp_info[keynr] = (COL_PACK_INFO*)tokudb::memory::malloc(
        table_share->fields * sizeof(COL_PACK_INFO),
        MYF(MY_WME | MY_ZEROFILL));
    if (kc_info->cp_info[keynr] == NULL) {
        error = ENOMEM;
        goto exit;
    }
    {
        uint32_t curr_fixed_offset = 0;
        uint32_t curr_var_index = 0;
        for (uint j = 0; j < table_share->fields; j++) {
            COL_PACK_INFO* curr = &kc_info->cp_info[keynr][j];
            //
            // need to set the offsets / indexes
            // offsets are calculated AFTER the NULL bytes
            //
            if (!bitmap_is_set(&kc_info->key_filters[keynr],j)) {
                // filtered columns keep the zero-filled default
                if (is_fixed_field(kc_info, j)) {
                    curr->col_pack_val = curr_fixed_offset;
                    curr_fixed_offset += kc_info->field_lengths[j];
                }
                else if (is_variable_field(kc_info, j)) {
                    curr->col_pack_val = curr_var_index;
                    curr_var_index++;
                }
            }
        }

        //
        // set up the mcp_info
        //
        kc_info->mcp_info[keynr].fixed_field_size = get_fixed_field_size(
            kc_info,
            table_share,
            keynr
            );
        kc_info->mcp_info[keynr].len_of_offsets = get_len_of_offsets(
            kc_info,
            table_share,
            keynr
            );

        error = 0;
    }
exit:
    return error;
}
1457
1458 // reset the kc_info state at keynr
reset_key_and_col_info(KEY_AND_COL_INFO * kc_info,uint keynr)1459 static void reset_key_and_col_info(KEY_AND_COL_INFO *kc_info, uint keynr) {
1460 bitmap_clear_all(&kc_info->key_filters[keynr]);
1461 tokudb::memory::free(kc_info->cp_info[keynr]);
1462 kc_info->cp_info[keynr] = NULL;
1463 kc_info->mcp_info[keynr] = (MULTI_COL_PACK_INFO) { 0, 0 };
1464 }
1465
// Populate kc_info from the table definition: classify every column as
// fixed / variable / blob, size the variable-field offset encoding, set
// up the per-key column filters, and build pack info for the primary key
// and every clustering key.  Returns 0 on success or an error from
// initialize_col_pack_info().
static int initialize_key_and_col_info(
    TABLE_SHARE* table_share,
    TABLE* table,
    KEY_AND_COL_INFO* kc_info,
    uint hidden_primary_key,
    uint primary_key) {

    int error = 0;
    uint32_t curr_blob_field_index = 0;
    uint32_t max_var_bytes = 0;
    //
    // fill in the field lengths. 0 means it is a variable sized field length
    // fill in length_bytes, 0 means it is fixed or blob
    //
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table_share->field[i];
        TOKU_TYPE toku_type = mysql_to_toku_type(field);
        uint32 pack_length = 0;
        switch (toku_type) {
        case toku_type_int:
        case toku_type_double:
        case toku_type_float:
        case toku_type_fixbinary:
        case toku_type_fixstring:
            // fixed-size field: remember its exact pack length
            pack_length = field->pack_length();
            assert_always(pack_length < 1<<16);
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD;
            kc_info->field_lengths[i] = (uint16_t)pack_length;
            kc_info->length_bytes[i] = 0;
            break;
        case toku_type_blob:
            // blobs are tracked by index in blob_fields[]
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] = 0;
            kc_info->blob_fields[curr_blob_field_index] = i;
            curr_blob_field_index++;
            break;
        case toku_type_varstring:
        case toku_type_varbinary:
            // variable field: record how many bytes encode its length
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] =
                (uchar)((Field_varstring*)field)->length_bytes;
            max_var_bytes += field->field_length;
            break;
        default:
            assert_unreachable();
        }
    }
    kc_info->num_blobs = curr_blob_field_index;

    //
    // initialize share->num_offset_bytes
    // because MAX_REF_LENGTH is 65536, we
    // can safely set num_offset_bytes to 1 or 2
    //
    if (max_var_bytes < 256) {
        kc_info->num_offset_bytes = 1;
    } else {
        kc_info->num_offset_bytes = 2;
    }

    for (uint i = 0;
         i < table_share->keys + tokudb_test(hidden_primary_key);
         i++) {
        //
        // do the cluster/primary key filtering calculations
        //
        // a hidden primary key has no MySQL key definition, so it gets
        // no filter of its own
        if (!(i==primary_key && hidden_primary_key)) {
            if (i == primary_key) {
                set_key_filter(
                    &kc_info->key_filters[primary_key],
                    &table_share->key_info[primary_key],
                    table,
                    true);
            } else {
                set_key_filter(
                    &kc_info->key_filters[i],
                    &table_share->key_info[i],
                    table,
                    true);
                // secondary keys also filter out the pk columns, since
                // those are carried in the key, not the value
                if (!hidden_primary_key) {
                    set_key_filter(
                        &kc_info->key_filters[i],
                        &table_share->key_info[primary_key],
                        table,
                        true);
                }
            }
        }
        // only dictionaries that store row data need pack info
        if (i == primary_key || key_is_clustering(&table_share->key_info[i])) {
            error = initialize_col_pack_info(kc_info, table_share, i);
            if (error) {
                goto exit;
            }
        }
    }
exit:
    return error;
}
1566
initialize_share(const char * name,int mode)1567 int ha_tokudb::initialize_share(const char* name, int mode) {
1568
1569 int error = 0;
1570 uint64_t num_rows = 0;
1571 DB_TXN* txn = NULL;
1572 bool do_commit = false;
1573 THD* thd = ha_thd();
1574 tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
1575 if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
1576 txn = trx->sub_sp_level;
1577 }
1578 else {
1579 do_commit = true;
1580 error = txn_begin(db_env, 0, &txn, 0, thd);
1581 if (error) { goto exit; }
1582 }
1583
1584
1585 error = get_status(txn);
1586 if (error) {
1587 goto exit;
1588 }
1589 if (share->version != HA_TOKU_VERSION) {
1590 error = ENOSYS;
1591 goto exit;
1592 }
1593
1594 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
1595 #if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
1596 // verify frm data for non-partitioned tables
1597 if (table->part_info == NULL) {
1598 error = verify_frm_data(table->s->path.str, txn);
1599 if (error)
1600 goto exit;
1601 } else {
1602 // remove the frm data for partitions since we are not maintaining it
1603 error = remove_frm_data(share->status_block, txn);
1604 if (error)
1605 goto exit;
1606 }
1607 #else
1608 error = verify_frm_data(table->s->path.str, txn);
1609 if (error)
1610 goto exit;
1611 #endif // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
1612 #endif // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
1613
1614 error =
1615 initialize_key_and_col_info(
1616 table_share,
1617 table,
1618 &share->kc_info,
1619 hidden_primary_key,
1620 primary_key);
1621 if (error) { goto exit; }
1622
1623
1624 error = open_main_dictionary(name, mode == O_RDONLY, txn);
1625 if (error) {
1626 goto exit;
1627 }
1628
1629 share->has_unique_keys = false;
1630 share->_keys = table_share->keys;
1631 share->_max_key_parts = table_share->key_parts;
1632 share->_key_descriptors =
1633 (TOKUDB_SHARE::key_descriptor_t*)tokudb::memory::malloc(
1634 sizeof(TOKUDB_SHARE::key_descriptor_t) * share->_keys,
1635 MYF(MY_ZEROFILL));
1636
1637 /* Open other keys; These are part of the share structure */
1638 for (uint i = 0; i < table_share->keys; i++) {
1639 share->_key_descriptors[i]._parts =
1640 table_share->key_info[i].user_defined_key_parts;
1641 if (i == primary_key) {
1642 share->_key_descriptors[i]._is_unique = true;
1643 share->_key_descriptors[i]._name =
1644 tokudb::memory::strdup("primary", 0);
1645 } else {
1646 share->_key_descriptors[i]._is_unique = false;
1647 share->_key_descriptors[i]._name =
1648 tokudb::memory::strdup(table_share->key_info[i].name, 0);
1649 }
1650
1651 if (table_share->key_info[i].flags & HA_NOSAME) {
1652 share->_key_descriptors[i]._is_unique = true;
1653 share->has_unique_keys = true;
1654 }
1655 if (i != primary_key) {
1656 error =
1657 open_secondary_dictionary(
1658 &share->key_file[i],
1659 &table_share->key_info[i],
1660 name,
1661 mode == O_RDONLY,
1662 txn);
1663 if (error) {
1664 goto exit;
1665 }
1666 }
1667 }
1668
1669 share->pk_has_string = false;
1670 if (!hidden_primary_key) {
1671 //
1672 // We need to set the ref_length to start at 5, to account for
1673 // the "infinity byte" in keys, and for placing the DBT size in the
1674 // first four bytes
1675 //
1676 ref_length = sizeof(uint32_t) + sizeof(uchar);
1677 KEY_PART_INFO* key_part = table->key_info[primary_key].key_part;
1678 KEY_PART_INFO* end =
1679 key_part + table->key_info[primary_key].user_defined_key_parts;
1680 for (; key_part != end; key_part++) {
1681 uint field_length = key_part->field->pack_length();
1682 field_length += (field_length > 255 ? 2 : 1);
1683 ref_length += field_length;
1684 TOKU_TYPE toku_type = mysql_to_toku_type(key_part->field);
1685 if (toku_type == toku_type_fixstring ||
1686 toku_type == toku_type_varstring ||
1687 toku_type == toku_type_blob
1688 )
1689 {
1690 share->pk_has_string = true;
1691 }
1692 }
1693 share->status |= STATUS_PRIMARY_KEY_INIT;
1694 }
1695 share->ref_length = ref_length;
1696
1697 error = estimate_num_rows(share->file, &num_rows, txn);
1698 //
1699 // estimate_num_rows should not fail under normal conditions
1700 //
1701 if (error == 0) {
1702 share->set_row_count(num_rows, true);
1703 } else {
1704 goto exit;
1705 }
1706 //
1707 // initialize auto increment data
1708 //
1709 share->has_auto_inc = has_auto_increment_flag(&share->ai_field_index);
1710 if (share->has_auto_inc) {
1711 init_auto_increment();
1712 }
1713
1714 if (may_table_be_empty(txn)) {
1715 share->try_table_lock = true;
1716 } else {
1717 share->try_table_lock = false;
1718 }
1719
1720 share->num_DBs = table_share->keys + tokudb_test(hidden_primary_key);
1721
1722 init_hidden_prim_key_info(txn);
1723
1724 // initialize cardinality info from the status dictionary
1725 {
1726 uint32_t rec_per_keys = tokudb::compute_total_key_parts(table_share);
1727 uint64_t* rec_per_key =
1728 (uint64_t*)tokudb::memory::malloc(
1729 rec_per_keys * sizeof(uint64_t),
1730 MYF(MY_FAE));
1731 error =
1732 tokudb::get_card_from_status(
1733 share->status_block,
1734 txn,
1735 rec_per_keys,
1736 rec_per_key);
1737 if (error) {
1738 memset(rec_per_key, 0, sizeof(ulonglong) * rec_per_keys);
1739 }
1740 share->init_cardinality_counts(rec_per_keys, rec_per_key);
1741 }
1742
1743 error = 0;
1744 exit:
1745 if (do_commit && txn) {
1746 commit_txn(txn,0);
1747 }
1748 return error;
1749 }
1750
1751 //
1752 // Creates and opens a handle to a table which already exists in a tokudb
1753 // database.
1754 // Parameters:
1755 // [in] name - table name
1756 // mode - seems to specify if table is read only
1757 // test_if_locked - unused
1758 // Returns:
1759 // 0 on success
1760 // 1 on error
1761 //
//
// Creates and opens a handle to a table which already exists in a tokudb
// database.
// Parameters:
//      [in]   name - table name
//             mode - seems to specify if table is read only
//             test_if_locked - unused
// Returns:
//      0 on success
//      1 on error
//
int ha_tokudb::open(const char *name, int mode, uint test_if_locked) {
    TOKUDB_HANDLER_DBUG_ENTER("%s %o %u", name, mode, test_if_locked);
    THD* thd = ha_thd();

    int error = 0;
    int ret_val = 0;

    transaction = NULL;
    cursor = NULL;

    /* Open primary key */
    hidden_primary_key = 0;
    if ((primary_key = table_share->primary_key) >= MAX_KEY) {
        // No primary key: use a hidden one stored after the user keys
        primary_key = table_share->keys;
        key_used_on_scan = MAX_KEY;
        hidden_primary_key = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        ref_length = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t);
    }
    else {
        key_used_on_scan = primary_key;
    }

    /* Need some extra memory in case of packed keys */
    // the "+ 1" is for the first byte that states +/- infinity
    // multiply everything by 2 to account for clustered keys having a key and primary key together
    max_key_length = 2*(table_share->max_key_length + MAX_REF_PARTS * 3 + sizeof(uchar));
    // one allocation backing all the fixed-size per-handler buffers;
    // freed as a unit through alloc_ptr
    alloc_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME),
        &key_buff, max_key_length,
        &key_buff2, max_key_length,
        &key_buff3, max_key_length,
        &key_buff4, max_key_length,
        &prelocked_left_range, max_key_length,
        &prelocked_right_range, max_key_length,
        &primary_key_buff, (hidden_primary_key ? 0 : max_key_length),
        &fixed_cols_for_query, table_share->fields*sizeof(uint32_t),
        &var_cols_for_query, table_share->fields*sizeof(uint32_t),
        NullS);
    if (alloc_ptr == NULL) {
        ret_val = 1;
        goto exit;
    }

    size_range_query_buff = tokudb::sysvars::read_buf_size(thd);
    range_query_buff =
        (uchar*)tokudb::memory::malloc(size_range_query_buff, MYF(MY_WME));
    if (range_query_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_rec_buff_length = table_share->rec_buff_length +
        table_share->fields;
    rec_buff = (uchar *) tokudb::memory::malloc(
        alloced_rec_buff_length,
        MYF(MY_WME));
    if (rec_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_update_rec_buff_length = alloced_rec_buff_length;
    rec_update_buff = (uchar*)tokudb::memory::malloc(
        alloced_update_rec_buff_length,
        MYF(MY_WME));
    if (rec_update_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    // lookup or create share
    share = TOKUDB_SHARE::get_share(name, &lock, true);
    assert_always(share);

    if (share->state() != TOKUDB_SHARE::OPENED) {
        // means we're responsible for the transition to OPENED, ERROR or CLOSED
        // (get_share returned the share locked in this case)

        ret_val = allocate_key_and_col_info(table_share, &share->kc_info);
        if (ret_val == 0) {
            ret_val = initialize_share(name, mode);
        }

        if (ret_val == 0) {
            share->set_state(TOKUDB_SHARE::OPENED);
        } else {
            free_key_and_col_info(&share->kc_info);
            share->set_state(TOKUDB_SHARE::ERROR);
        }
        share->unlock();
    } else {
        // got an already OPENED instance
        share->unlock();
    }

    if (share->state() == TOKUDB_SHARE::ERROR) {
        // either our own initialization failed (ret_val != 0) or another
        // opener already marked the share bad; drop our reference.
        // NOTE(review): when a concurrent opener set ERROR, ret_val may
        // still be 0 here -- confirm callers treat that as intended
        share->release();
        goto exit;
    }

    assert_always(share->state() == TOKUDB_SHARE::OPENED);

    ref_length = share->ref_length;     // If second open

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "tokudbopen:%p:share=%p:file=%p:table=%p:table->s=%p:%d",
        this,
        share,
        share->file,
        table,
        table->s,
        share->use_count());

    key_read = false;
    stats.block_size = 1<<20;    // QQQ Tokudb DB block size

    info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

exit:
    if (ret_val) {
        // free everything allocated above; tokudb::memory::free is
        // NULL-safe
        tokudb::memory::free(range_query_buff);
        range_query_buff = NULL;
        tokudb::memory::free(alloc_ptr);
        alloc_ptr = NULL;
        tokudb::memory::free(rec_buff);
        rec_buff = NULL;
        tokudb::memory::free(rec_update_buff);
        rec_update_buff = NULL;

        if (error) {
            set_my_errno(error);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(ret_val);
}
1898
1899 //
1900 // estimate the number of rows in a DB
1901 // Parameters:
1902 // [in] db - DB whose number of rows will be estimated
1903 // [out] num_rows - number of estimated rows in db
1904 // Returns:
1905 // 0 on success
1906 // error otherwise
1907 //
estimate_num_rows(DB * db,uint64_t * num_rows,DB_TXN * txn)1908 int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) {
1909 int error = ENOSYS;
1910 bool do_commit = false;
1911 DB_BTREE_STAT64 dict_stats;
1912 DB_TXN* txn_to_use = NULL;
1913
1914 if (txn == NULL) {
1915 error = txn_begin(db_env, 0, &txn_to_use, DB_READ_UNCOMMITTED, ha_thd());
1916 if (error) goto cleanup;
1917 do_commit = true;
1918 }
1919 else {
1920 txn_to_use = txn;
1921 }
1922
1923 error = db->stat64(db, txn_to_use, &dict_stats);
1924 if (error) { goto cleanup; }
1925
1926 *num_rows = dict_stats.bt_ndata;
1927 error = 0;
1928 cleanup:
1929 if (do_commit) {
1930 commit_txn(txn_to_use, 0);
1931 txn_to_use = NULL;
1932 }
1933 return error;
1934 }
1935
1936
// Store one metadata entry, keyed by an HA_METADATA_KEY, in the status
// dictionary; thin wrapper over write_metadata().
int ha_tokudb::write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn ){
    return write_metadata(db, &curr_key_data, sizeof curr_key_data, data, size, txn);
}
1940
// Delete one metadata entry, keyed by an HA_METADATA_KEY, from the
// status dictionary; thin wrapper over remove_metadata().
int ha_tokudb::remove_from_status(DB *db, HA_METADATA_KEY curr_key_data, DB_TXN *txn) {
    return remove_metadata(db, &curr_key_data, sizeof curr_key_data, txn);
}
1944
remove_metadata(DB * db,void * key_data,uint key_size,DB_TXN * transaction)1945 int ha_tokudb::remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction){
1946 int error;
1947 DBT key;
1948 DB_TXN* txn = NULL;
1949 bool do_commit = false;
1950 //
1951 // transaction to be used for putting metadata into status.tokudb
1952 //
1953 if (transaction == NULL) {
1954 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
1955 if (error) {
1956 goto cleanup;
1957 }
1958 do_commit = true;
1959 }
1960 else {
1961 txn = transaction;
1962 }
1963
1964 memset(&key, 0, sizeof(key));
1965 key.data = key_data;
1966 key.size = key_size;
1967 error = db->del(db, txn, &key, DB_DELETE_ANY);
1968 if (error) {
1969 goto cleanup;
1970 }
1971
1972 error = 0;
1973 cleanup:
1974 if (do_commit && txn) {
1975 if (!error) {
1976 commit_txn(txn, DB_TXN_NOSYNC);
1977 }
1978 else {
1979 abort_txn(txn);
1980 }
1981 }
1982 return error;
1983 }
1984
1985 //
1986 // helper function to write a piece of metadata in to status.tokudb
1987 //
write_metadata(DB * db,void * key_data,uint key_size,void * val_data,uint val_size,DB_TXN * transaction)1988 int ha_tokudb::write_metadata(DB* db, void* key_data, uint key_size, void* val_data, uint val_size, DB_TXN* transaction ){
1989 int error;
1990 DBT key;
1991 DBT value;
1992 DB_TXN* txn = NULL;
1993 bool do_commit = false;
1994 //
1995 // transaction to be used for putting metadata into status.tokudb
1996 //
1997 if (transaction == NULL) {
1998 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
1999 if (error) {
2000 goto cleanup;
2001 }
2002 do_commit = true;
2003 }
2004 else {
2005 txn = transaction;
2006 }
2007
2008 memset(&key, 0, sizeof(key));
2009 memset(&value, 0, sizeof(value));
2010 key.data = key_data;
2011 key.size = key_size;
2012 value.data = val_data;
2013 value.size = val_size;
2014 error = db->put(db, txn, &key, &value, 0);
2015 if (error) {
2016 goto cleanup;
2017 }
2018
2019 error = 0;
2020 cleanup:
2021 if (do_commit && txn) {
2022 if (!error) {
2023 commit_txn(txn, DB_TXN_NOSYNC);
2024 }
2025 else {
2026 abort_txn(txn);
2027 }
2028 }
2029 return error;
2030 }
2031
2032 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
write_frm_data(DB * db,DB_TXN * txn,const char * frm_name)2033 int ha_tokudb::write_frm_data(DB* db, DB_TXN* txn, const char* frm_name) {
2034 TOKUDB_HANDLER_DBUG_ENTER("%p %p %s", db, txn, frm_name);
2035
2036 uchar* frm_data = NULL;
2037 size_t frm_len = 0;
2038 int error = 0;
2039
2040 error = readfrm(frm_name,&frm_data,&frm_len);
2041 if (error) { goto cleanup; }
2042
2043 error = write_to_status(db,hatoku_frm_data,frm_data,(uint)frm_len, txn);
2044 if (error) { goto cleanup; }
2045
2046 error = 0;
2047 cleanup:
2048 tokudb::memory::free(frm_data);
2049 TOKUDB_HANDLER_DBUG_RETURN(error);
2050 }
2051
// Delete the stored frm blob from the status dictionary (used for
// partitioned tables, whose frm data is not maintained here).
int ha_tokudb::remove_frm_data(DB *db, DB_TXN *txn) {
    return remove_from_status(db, hatoku_frm_data, txn);
}
2055
smart_dbt_callback_verify_frm(TOKUDB_UNUSED (DBT const * key),DBT const * row,void * context)2056 static int smart_dbt_callback_verify_frm(TOKUDB_UNUSED(DBT const* key),
2057 DBT const* row,
2058 void* context) {
2059 DBT* stored_frm = (DBT *)context;
2060 stored_frm->size = row->size;
2061 stored_frm->data = (uchar *)tokudb::memory::malloc(row->size, MYF(MY_WME));
2062 assert_always(stored_frm->data);
2063 memcpy(stored_frm->data, row->data, row->size);
2064 return 0;
2065 }
2066
// Compare the .frm data MySQL has on disk with the copy stored in the
// status dictionary.  If no copy is stored yet, store it now.  Returns 0
// when they match (or were just written), HA_ERR_TABLE_DEF_CHANGED when
// they differ, or another error from readfrm/getf_set.
int ha_tokudb::verify_frm_data(const char* frm_name, DB_TXN* txn) {
    TOKUDB_HANDLER_DBUG_ENTER("%s", frm_name);
    uchar* mysql_frm_data = NULL;
    size_t mysql_frm_len = 0;
    DBT key = {};
    DBT stored_frm = {};
    int error = 0;
    HA_METADATA_KEY curr_key = hatoku_frm_data;

    // get the frm data from MySQL
    error = readfrm(frm_name,&mysql_frm_data,&mysql_frm_len);
    if (error) {
        goto cleanup;
    }

    // fetch the engine's stored copy; the callback mallocs into
    // stored_frm, which is freed in cleanup
    key.data = &curr_key;
    key.size = sizeof(curr_key);
    error = share->status_block->getf_set(
        share->status_block,
        txn,
        0,
        &key,
        smart_dbt_callback_verify_frm,
        &stored_frm
        );
    if (error == DB_NOTFOUND) {
        // if not found, write it
        error = write_frm_data(share->status_block, txn, frm_name);
        goto cleanup;
    } else if (error) {
        goto cleanup;
    }

    // a length or byte-wise mismatch means the table definition changed
    // behind the engine's back
    if (stored_frm.size != mysql_frm_len || memcmp(stored_frm.data, mysql_frm_data, stored_frm.size)) {
        error = HA_ERR_TABLE_DEF_CHANGED;
        goto cleanup;
    }

    error = 0;
cleanup:
    tokudb::memory::free(mysql_frm_data);
    tokudb::memory::free(stored_frm.data);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
2111 #endif // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
2112
2113 //
2114 // Updates status.tokudb with a new max value used for the auto increment column
2115 // Parameters:
2116 // [in] db - this will always be status.tokudb
2117 // val - value to store
2118 // Returns:
2119 // 0 on success, error otherwise
2120 //
2121 //
update_max_auto_inc(DB * db,ulonglong val)2122 int ha_tokudb::update_max_auto_inc(DB* db, ulonglong val){
2123 return write_to_status(db,hatoku_max_ai,&val,sizeof(val), NULL);
2124 }
2125
2126 //
2127 // Writes the initial auto increment value, as specified by create table
2128 // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
2129 // then the value 100 will be stored here in val
2130 // Parameters:
2131 // [in] db - this will always be status.tokudb
2132 // val - value to store
2133 // Returns:
2134 // 0 on success, error otherwise
2135 //
2136 //
write_auto_inc_create(DB * db,ulonglong val,DB_TXN * txn)2137 int ha_tokudb::write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn){
2138 return write_to_status(db,hatoku_ai_create_value,&val,sizeof(val), txn);
2139 }
2140
2141
2142 //
2143 // Closes a handle to a table.
2144 //
close()2145 int ha_tokudb::close() {
2146 TOKUDB_HANDLER_DBUG_ENTER("");
2147 int r = __close();
2148 TOKUDB_HANDLER_DBUG_RETURN(r);
2149 }
2150
__close()2151 int ha_tokudb::__close() {
2152 TOKUDB_HANDLER_DBUG_ENTER("");
2153 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "close:%p", this);
2154 tokudb::memory::free(rec_buff);
2155 tokudb::memory::free(rec_update_buff);
2156 tokudb::memory::free(blob_buff);
2157 tokudb::memory::free(alloc_ptr);
2158 tokudb::memory::free(range_query_buff);
2159 for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
2160 toku_dbt_array_destroy(&mult_key_dbt_array[i]);
2161 }
2162 for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
2163 toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
2164 }
2165 rec_buff = NULL;
2166 rec_update_buff = NULL;
2167 alloc_ptr = NULL;
2168 ha_tokudb::reset();
2169 int retval = share->release();
2170 TOKUDB_HANDLER_DBUG_RETURN(retval);
2171 }
2172
2173 //
2174 // Reallocate record buffer (rec_buff) if needed
2175 // If not needed, does nothing
2176 // Parameters:
2177 // length - size of buffer required for rec_buff
2178 //
fix_rec_buff_for_blob(ulong length)2179 bool ha_tokudb::fix_rec_buff_for_blob(ulong length) {
2180 if (!rec_buff || (length > alloced_rec_buff_length)) {
2181 uchar* newptr = (uchar*)tokudb::memory::realloc(
2182 (void*)rec_buff,
2183 length,
2184 MYF(MY_ALLOW_ZERO_PTR));
2185 if (!newptr)
2186 return 1;
2187 rec_buff = newptr;
2188 alloced_rec_buff_length = length;
2189 }
2190 return 0;
2191 }
2192
2193 //
2194 // Reallocate record buffer (rec_buff) if needed
2195 // If not needed, does nothing
2196 // Parameters:
2197 // length - size of buffer required for rec_buff
2198 //
fix_rec_update_buff_for_blob(ulong length)2199 bool ha_tokudb::fix_rec_update_buff_for_blob(ulong length) {
2200 if (!rec_update_buff || (length > alloced_update_rec_buff_length)) {
2201 uchar* newptr = (uchar*)tokudb::memory::realloc(
2202 (void*)rec_update_buff,
2203 length,
2204 MYF(MY_ALLOW_ZERO_PTR));
2205 if (!newptr)
2206 return 1;
2207 rec_update_buff= newptr;
2208 alloced_update_rec_buff_length = length;
2209 }
2210 return 0;
2211 }
2212
2213 /* Calculate max length needed for row */
max_row_length(const uchar * buf)2214 ulong ha_tokudb::max_row_length(const uchar * buf) {
2215 ulong length = table_share->reclength + table_share->fields * 2;
2216 uint *ptr, *end;
2217 for (ptr = table_share->blob_field, end = ptr + table_share->blob_fields; ptr != end; ptr++) {
2218 Field_blob *blob = ((Field_blob *) table->field[*ptr]);
2219 length += blob->get_length((uchar *) (buf + field_offset(blob, table))) + 2;
2220 }
2221 return length;
2222 }
2223
//
// Pack a MySQL-format row into the TokuDB on-disk row format.
// If the row is of fixed length, the data is stored essentially 'as is';
// otherwise a packed row is generated. Layout of the packed row:
//   null bytes | fixed fields | var-field offsets | var-field data | blobs
// Columns that are part of this index's key (per key_filters) are
// skipped, since they are already stored in the key itself.
// Parameters:
//      [out]   row - DBT set to point at the packed row in row_buff
//      [in]    record - row in MySQL format
//              index - index into key_file whose filters/layout to use
//      [out]   row_buff - caller-supplied buffer receiving the packed row
// Returns 0 (packing into a pre-sized buffer cannot currently fail).
//
int ha_tokudb::pack_row_in_buff(
    DBT * row,
    const uchar* record,
    uint index,
    uchar* row_buff
    )
{
    uchar* fixed_field_ptr = NULL;
    uchar* var_field_offset_ptr = NULL;
    uchar* start_field_data_ptr = NULL;
    uchar* var_field_data_ptr = NULL;
    int r = ENOSYS;
    memset((void *) row, 0, sizeof(*row));

    // temporarily mark all columns readable so the pack helpers may
    // touch fields outside the current write set
    my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);

    // Copy null bytes
    memcpy(row_buff, record, table_share->null_bytes);
    fixed_field_ptr = row_buff + table_share->null_bytes;
    var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
    start_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
    var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;

    // assert that when the hidden primary key exists, primary_key_offsets is NULL
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        // fields covered by this index's key are not duplicated in the row
        if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
            continue;
        }
        if (is_fixed_field(&share->kc_info, i)) {
            fixed_field_ptr = pack_fixed_field(
                fixed_field_ptr,
                record + curr_field_offset,
                share->kc_info.field_lengths[i]
                );
        }
        else if (is_variable_field(&share->kc_info, i)) {
            // pack_var_field writes both the data and its end offset;
            // we advance the offset cursor ourselves
            var_field_data_ptr = pack_var_field(
                var_field_offset_ptr,
                var_field_data_ptr,
                start_field_data_ptr,
                record + curr_field_offset,
                share->kc_info.length_bytes[i],
                share->kc_info.num_offset_bytes
                );
            var_field_offset_ptr += share->kc_info.num_offset_bytes;
        }
    }

    // blobs are appended last, after all variable-length field data
    for (uint i = 0; i < share->kc_info.num_blobs; i++) {
        Field* field = table->field[share->kc_info.blob_fields[i]];
        var_field_data_ptr = pack_toku_field_blob(
            var_field_data_ptr,
            record + field_offset(field, table),
            field
            );
    }

    row->data = row_buff;
    row->size = (size_t) (var_field_data_ptr - row_buff);
    r = 0;

    dbug_tmp_restore_column_map(table->write_set, old_map);
    return r;
}
2306
2307
pack_row(DBT * row,const uchar * record,uint index)2308 int ha_tokudb::pack_row(
2309 DBT * row,
2310 const uchar* record,
2311 uint index
2312 )
2313 {
2314 return pack_row_in_buff(row,record,index,rec_buff);
2315 }
2316
pack_old_row_for_update(DBT * row,const uchar * record,uint index)2317 int ha_tokudb::pack_old_row_for_update(
2318 DBT * row,
2319 const uchar* record,
2320 uint index
2321 )
2322 {
2323 return pack_row_in_buff(row,record,index,rec_update_buff);
2324 }
2325
2326
//
// Unpacks the packed blob section of a row into `record`. The packed
// bytes are first copied into the handler-owned blob_buff (grown as
// needed) so that the blob pointers written into `record` stay valid
// after the source row's memory is gone.
// Parameters:
//      [out]   record - row in MySQL format; blob fields are set here
//      [in]    from_tokudb_blob - start of the packed blob data
//              num_bytes - total size of the packed blob data
//              check_bitmap - if true, blobs in neither read_set nor
//                  write_set are skipped rather than unpacked
// Returns 0 on success, ENOMEM if blob_buff cannot grow, or a negative
// sentinel (-3000000 / -4000000) if the packed data is inconsistent.
//
int ha_tokudb::unpack_blobs(
    uchar* record,
    const uchar* from_tokudb_blob,
    uint32_t num_bytes,
    bool check_bitmap
    )
{
    uint error = 0;
    uchar* ptr = NULL;
    const uchar* buff = NULL;
    //
    // assert that num_bytes > 0 iff share->num_blobs > 0
    //
    assert_always( !((share->kc_info.num_blobs == 0) && (num_bytes > 0)) );
    // grow blob_buff if the packed data is larger than what we hold now
    if (num_bytes > num_blob_bytes) {
        ptr = (uchar*)tokudb::memory::realloc(
            (void*)blob_buff, num_bytes,
            MYF(MY_ALLOW_ZERO_PTR));
        if (ptr == NULL) {
            error = ENOMEM;
            goto exit;
        }
        blob_buff = ptr;
        num_blob_bytes = num_bytes;
    }

    memcpy(blob_buff, from_tokudb_blob, num_bytes);
    buff= blob_buff;
    for (uint i = 0; i < share->kc_info.num_blobs; i++) {
        uint32_t curr_field_index = share->kc_info.blob_fields[i];
        // when check_bitmap is set, skip blobs the query does not touch
        bool skip = check_bitmap ?
            !(bitmap_is_set(table->read_set,curr_field_index) ||
                bitmap_is_set(table->write_set,curr_field_index)) :
            false;
        Field* field = table->field[curr_field_index];
        uint32_t len_bytes = field->row_pack_length();
        const uchar* end_buff = unpack_toku_field_blob(
            record + field_offset(field, table),
            buff,
            len_bytes,
            skip
            );
        // verify that the pointers to the blobs are all contained within the blob_buff
        if (!(blob_buff <= buff && end_buff <= blob_buff + num_bytes)) {
            error = -3000000;
            goto exit;
        }
        buff = end_buff;
    }
    // verify that the entire blob buffer was parsed
    if (share->kc_info.num_blobs > 0 && !(num_bytes > 0 && buff == blob_buff + num_bytes)) {
        error = -4000000;
        goto exit;
    }

    error = 0;
exit:
    return error;
}
2386
//
// Take the row stored as a DBT and convert it into a row in MySQL
// format in `record`. The key is unpacked too (unless the index is the
// hidden primary key, which stores no user columns).
// Parameters:
//      [out]   record - row in MySQL format
//      [in]    row - row stored in DBT to be converted
//      [in]    key - key the row was found under (source of key columns)
//              index - index into key_file the row/key belong to
// Returns 0 on success, else an error from unpack_blobs().
//
int ha_tokudb::unpack_row(
    uchar* record,
    DBT const *row,
    DBT const *key,
    uint index
    )
{
    //
    // two cases, fixed length row, and variable length row
    // fixed length row is first below
    //
    /* Copy null bits */
    int error = 0;
    const uchar* fixed_field_ptr = (const uchar *) row->data;
    const uchar* var_field_offset_ptr = NULL;
    const uchar* var_field_data_ptr = NULL;
    uint32_t data_end_offset = 0;
    memcpy(record, fixed_field_ptr, table_share->null_bytes);
    fixed_field_ptr += table_share->null_bytes;

    // layout mirrors pack_row_in_buff: fixed fields, then the var-field
    // offset table, then the var-field data
    var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
    var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;

    //
    // unpack the key, if necessary
    //
    if (!(hidden_primary_key && index == primary_key)) {
        unpack_key(record,key,index);
    }

    uint32_t last_offset = 0;
    //
    // we have two methods of unpacking, one if we need to unpack the entire row
    // the second if we unpack a subset of the entire row
    // first method here is if we unpack the entire row
    //
    if (unpack_entire_row) {
        //
        // fill in parts of record that are not part of the key
        //
        for (uint i = 0; i < table_share->fields; i++) {
            Field* field = table->field[i];
            // key-covered columns were already filled in by unpack_key
            if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
                continue;
            }

            if (is_fixed_field(&share->kc_info, i)) {
                fixed_field_ptr = unpack_fixed_field(
                    record + field_offset(field, table),
                    fixed_field_ptr,
                    share->kc_info.field_lengths[i]
                    );
            }
            //
            // here, we DO modify var_field_data_ptr or var_field_offset_ptr
            // as we unpack variable sized fields
            //
            else if (is_variable_field(&share->kc_info, i)) {
                // stored offsets are cumulative end offsets; field length
                // is the delta from the previous field's end
                switch (share->kc_info.num_offset_bytes) {
                case (1):
                    data_end_offset = var_field_offset_ptr[0];
                    break;
                case (2):
                    data_end_offset = uint2korr(var_field_offset_ptr);
                    break;
                default:
                    assert_unreachable();
                }
                unpack_var_field(
                    record + field_offset(field, table),
                    var_field_data_ptr,
                    data_end_offset - last_offset,
                    share->kc_info.length_bytes[i]
                    );
                var_field_offset_ptr += share->kc_info.num_offset_bytes;
                var_field_data_ptr += data_end_offset - last_offset;
                last_offset = data_end_offset;
            }
        }
        // everything after the var-field data is blob data
        error = unpack_blobs(
            record,
            var_field_data_ptr,
            row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
            false
            );
        if (error) {
            goto exit;
        }
    }
    //
    // in this case, we unpack only what is specified
    // in fixed_cols_for_query and var_cols_for_query
    //
    else {
        //
        // first the fixed fields
        //
        for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
            uint field_index = fixed_cols_for_query[i];
            Field* field = table->field[field_index];
            // col_pack_val is the field's precomputed offset in the
            // fixed-field section, so fields can be read directly
            unpack_fixed_field(
                record + field_offset(field, table),
                fixed_field_ptr + share->kc_info.cp_info[index][field_index].col_pack_val,
                share->kc_info.field_lengths[field_index]
                );
        }

        //
        // now the var fields
        // here, we do NOT modify var_field_data_ptr or var_field_offset_ptr
        //
        for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
            uint field_index = var_cols_for_query[i];
            Field* field = table->field[field_index];
            uint32_t var_field_index = share->kc_info.cp_info[index][field_index].col_pack_val;
            uint32_t data_start_offset;
            uint32_t field_len;

            get_var_field_info(
                &field_len,
                &data_start_offset,
                var_field_index,
                var_field_offset_ptr,
                share->kc_info.num_offset_bytes
                );

            unpack_var_field(
                record + field_offset(field, table),
                var_field_data_ptr + data_start_offset,
                field_len,
                share->kc_info.length_bytes[field_index]
                );
        }

        if (read_blobs) {
            //
            // now the blobs
            //
            get_blob_field_info(
                &data_end_offset,
                share->kc_info.mcp_info[index].len_of_offsets,
                var_field_data_ptr,
                share->kc_info.num_offset_bytes
                );

            var_field_data_ptr += data_end_offset;
            error = unpack_blobs(
                record,
                var_field_data_ptr,
                row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
                true
                );
            if (error) {
                goto exit;
            }
        }
    }
    error = 0;
exit:
    return error;
}
2554
//
// Decodes one packed key from `data` back into the MySQL row buffer
// `record`, maintaining each column's null bit from the per-column
// NULL marker byte.
// Parameters:
//      [in]    key_info - description of the key being unpacked
//      [out]   record - row in MySQL format receiving the key columns
//      [in]    data - packed key bytes (without the leading infinity byte)
// Returns the number of bytes of `data` consumed.
//
uint32_t ha_tokudb::place_key_into_mysql_buff(
    KEY* key_info,
    uchar* record,
    uchar* data) {

    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    uchar* pos = data;

    for (; key_part != end; key_part++) {
        // nullable columns are prefixed by a one-byte NULL marker
        if (key_part->field->null_bit) {
            uint null_offset = get_null_offset(table, key_part->field);
            if (*pos++ == NULL_COL_VAL) { // Null value
                //
                // We don't need to reset the record data as we will not access it
                // if the null data is set
                //
                record[null_offset] |= key_part->field->null_bit;
                continue;
            }
            record[null_offset] &= ~key_part->field->null_bit;
        }
        //
        // HOPEFULLY TEMPORARY
        //
        assert_always(table->s->db_low_byte_first);
        pos = unpack_toku_key_field(
            record + field_offset(key_part->field, table),
            pos,
            key_part->field,
            key_part->length
            );
    }
    return pos-data;
}
2590
//
// Store the key (and, for secondary indexes, the appended primary key)
// into the row.
// Parameters:
//      [out]   record - key stored in MySQL format
//      [in]    key - key stored in DBT to be converted
//              index - index into key_file that represents the DB
//                      unpacking a key of
//
void ha_tokudb::unpack_key(uchar * record, DBT const *key, uint index) {
    uint32_t bytes_read;
    // skip the leading infinity byte; key columns start at offset 1
    uchar *pos = (uchar *) key->data + 1;
    bytes_read = place_key_into_mysql_buff(
        &table->key_info[index],
        record,
        pos
        );
    if( (index != primary_key) && !hidden_primary_key) {
        //
        // also unpack primary key
        //
        // secondary keys carry the primary key appended after their own
        // columns; bytes_read tells us where it begins
        place_key_into_mysql_buff(
            &table->key_info[primary_key],
            record,
            pos+bytes_read
            );
    }
}
2618
//
// Packs the key columns of `record` into `buff` in TokuDB key format:
// a one-byte NULL/NOT-NULL marker for nullable columns followed by the
// packed column value. Stops when key parts or key_length run out.
// Parameters:
//      [in]    key_info - description of the key being packed
//      [out]   buff - destination buffer for the packed key
//      [in]    record - row in MySQL format supplying the column values
//      [out]   has_null - set true if any packed column was NULL
//              key_length - remaining budget of key bytes to pack
// Returns the number of bytes written into `buff`.
//
uint32_t ha_tokudb::place_key_into_dbt_buff(
    KEY* key_info,
    uchar* buff,
    const uchar* record,
    bool* has_null,
    int key_length) {

    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    uchar* curr_buff = buff;
    *has_null = false;
    for (; key_part != end && key_length > 0; key_part++) {
        //
        // accessing key_part->field->null_bit instead off key_part->null_bit
        // because key_part->null_bit is not set in add_index
        // filed ticket 862 to look into this
        //
        if (key_part->field->null_bit) {
            /* Store 0 if the key part is a NULL part */
            uint null_offset = get_null_offset(table, key_part->field);
            if (record[null_offset] & key_part->field->null_bit) {
                *curr_buff++ = NULL_COL_VAL;
                *has_null = true;
                // NULL columns store only the marker byte, no data
                continue;
            }
            *curr_buff++ = NONNULL_COL_VAL;        // Store NOT NULL marker
        }
        //
        // HOPEFULLY TEMPORARY
        //
        assert_always(table->s->db_low_byte_first);
        //
        // accessing field_offset(key_part->field) instead off key_part->offset
        // because key_part->offset is SET INCORRECTLY in add_index
        // filed ticket 862 to look into this
        //
        curr_buff = pack_toku_key_field(
            curr_buff,
            (uchar *) (record + field_offset(key_part->field, table)),
            key_part->field,
            key_part->length
            );
        key_length -= key_part->length;
    }
    return curr_buff - buff;
}
2665
2666
2667
//
// Create a packed key from a row. This key will be written as such
// to the index tree. This will never fail as the key buffer is pre-allocated.
// Layout: infinity byte | packed key columns | (optional) primary key.
// Parameters:
//      [out]   key - DBT that holds the key
//      [in]    key_info - holds data about the key, such as it's length and offset into record
//      [out]   buff - buffer that will hold the data for key (unless
//                  we have a hidden primary key)
//      [in]    record - row from which to create the key
//      [out]   has_null - set true if any key column was NULL
//              dont_pack_pk - if true, the primary key is NOT appended
//              key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
//              inf_byte - infinity byte placed first (COL_ZERO/NEG_INF/POS_INF)
// Returns:
//      the parameter key
//

DBT* ha_tokudb::create_dbt_key_from_key(
    DBT * key,
    KEY* key_info,
    uchar * buff,
    const uchar * record,
    bool* has_null,
    bool dont_pack_pk,
    int key_length,
    uint8_t inf_byte
    )
{
    uint32_t size = 0;
    uchar* tmp_buff = buff;
    my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);

    key->data = buff;

    //
    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity or zero. For this, because we are creating key
    // from a row, there is no way that columns can be missing, so in practice,
    // this will be meaningless. Might as well put in a value
    //
    *tmp_buff++ = inf_byte;
    size++;
    size += place_key_into_dbt_buff(
        key_info,
        tmp_buff,
        record,
        has_null,
        key_length
        );
    if (!dont_pack_pk) {
        tmp_buff = buff + size;
        if (hidden_primary_key) {
            // hidden PK: append the auto-generated identifier bytes
            memcpy(tmp_buff, current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
            size += TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        }
        else {
            bool tmp_bool = false;
            size += place_key_into_dbt_buff(
                &table->key_info[primary_key],
                tmp_buff,
                record,
                &tmp_bool,
                MAX_KEY_LENGTH //this parameter does not matter
                );
        }
    }

    key->size = size;
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(table->write_set, old_map);
    return key;
}
2737
2738
2739 //
2740 // Create a packed key from a row. This key will be written as such
2741 // to the index tree. This will never fail as the key buffer is pre-allocated.
2742 // Parameters:
2743 // [out] key - DBT that holds the key
2744 // keynr - index for which to create the key
2745 // [out] buff - buffer that will hold the data for key (unless
2746 // we have a hidden primary key)
2747 // [in] record - row from which to create the key
2748 // [out] has_null - says if the key has a NULL value for one of its columns
2749 // key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2750 // Returns:
2751 // the parameter key
2752 //
create_dbt_key_from_table(DBT * key,uint keynr,uchar * buff,const uchar * record,bool * has_null,int key_length)2753 DBT *ha_tokudb::create_dbt_key_from_table(
2754 DBT * key,
2755 uint keynr,
2756 uchar * buff,
2757 const uchar * record,
2758 bool* has_null,
2759 int key_length
2760 )
2761 {
2762 TOKUDB_HANDLER_DBUG_ENTER("");
2763 memset((void *) key, 0, sizeof(*key));
2764 if (hidden_primary_key && keynr == primary_key) {
2765 key->data = buff;
2766 memcpy(buff, ¤t_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2767 key->size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2768 *has_null = false;
2769 DBUG_RETURN(key);
2770 }
2771 DBUG_RETURN(create_dbt_key_from_key(key, &table->key_info[keynr],buff,record, has_null, (keynr == primary_key), key_length, COL_ZERO));
2772 }
2773
create_dbt_key_for_lookup(DBT * key,KEY * key_info,uchar * buff,const uchar * record,bool * has_null,int key_length)2774 DBT* ha_tokudb::create_dbt_key_for_lookup(
2775 DBT * key,
2776 KEY* key_info,
2777 uchar * buff,
2778 const uchar * record,
2779 bool* has_null,
2780 int key_length
2781 )
2782 {
2783 TOKUDB_HANDLER_DBUG_ENTER("");
2784 // override the infinity byte, needed in case the pk is a string
2785 // to make sure that the cursor that uses this key properly positions
2786 // it at the right location. If the table stores "D", but we look up for "d",
2787 // and the infinity byte is 0, then we will skip the "D", because
2788 // in bytes, "d" > "D".
2789 DBT* ret = create_dbt_key_from_key(key, key_info, buff, record, has_null, true, key_length, COL_NEG_INF);
2790 DBUG_RETURN(ret);
2791 }
2792
//
// Create a packed key from from a MySQL unpacked key (like the one that is
// sent from the index_read() This key is to be used to read a row
// Parameters:
//      [out]   key - DBT that holds the key
//              keynr - index for which to pack the key
//      [out]   buff - buffer that will hold the data for key
//      [in]    key_ptr - MySQL unpacked key
//              key_length - length of key_ptr
//              inf_byte - infinity byte placed first; states whether
//                  missing trailing columns compare as -inf or +inf
// Returns:
//      the parameter key
//
DBT* ha_tokudb::pack_key(
    DBT* key,
    uint keynr,
    uchar* buff,
    const uchar* key_ptr,
    uint key_length,
    int8_t inf_byte) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "key %p %u:%2.2x inf=%d",
        key_ptr,
        key_length,
        key_length > 0 ? key_ptr[0] : 0,
        inf_byte);
#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
    // secondary keys on tables with a real PK may carry extended (PK)
    // columns; delegate to pack_ext_key for those
    if (keynr != primary_key && !tokudb_test(hidden_primary_key)) {
        DBUG_RETURN(pack_ext_key(
            key,
            keynr,
            buff,
            key_ptr,
            key_length,
            inf_byte));
    }
#endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
    KEY* key_info = &table->key_info[keynr];
    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);

    memset(key, 0, sizeof(*key));
    key->data = buff;

    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity
    *buff++ = (uchar)inf_byte;

    for (; key_part != end && (int) key_length > 0; key_part++) {
        uint offset = 0;
        if (key_part->null_bit) {
            // MySQL marks NULL with a leading non-zero byte in key_ptr
            if (!(*key_ptr == 0)) {
                *buff++ = NULL_COL_VAL;
                key_length -= key_part->store_length;
                key_ptr += key_part->store_length;
                continue;
            }
            *buff++ = NONNULL_COL_VAL;
            offset = 1;         // Data is at key_ptr+1
        }
        assert_always(table->s->db_low_byte_first);
        buff = pack_key_toku_key_field(
            buff,
            (uchar *) key_ptr + offset,
            key_part->field,
            key_part->length
            );

        key_ptr += key_part->store_length;
        key_length -= key_part->store_length;
    }

    key->size = (buff - (uchar *) key->data);
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(table->write_set, old_map);
    DBUG_RETURN(key);
}
2871
2872 #if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
//
// Packs a MySQL unpacked key for a secondary index that may include
// extended (primary key) columns. The leading user-defined key parts
// are packed as in pack_key(); if key_length indicates more data
// remains, the trailing PK columns are packed in PK key-part order,
// reusing values already seen among the SK parts where possible.
// Parameters mirror pack_key(); returns the parameter key.
//
DBT* ha_tokudb::pack_ext_key(
    DBT* key,
    uint keynr,
    uchar* buff,
    const uchar* key_ptr,
    uint key_length,
    int8_t inf_byte) {

    TOKUDB_HANDLER_DBUG_ENTER("");

    // build a list of PK parts that are in the SK.  we will use this list to
    // build the extended key if necessary.
    KEY* pk_key_info = &table->key_info[primary_key];
    uint pk_parts = pk_key_info->user_defined_key_parts;
    uint pk_next = 0;
    struct {
        const uchar *key_ptr;
        KEY_PART_INFO *key_part;
    } pk_info[pk_parts];

    KEY* key_info = &table->key_info[keynr];
    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);

    memset((void *) key, 0, sizeof(*key));
    key->data = buff;

    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity
    *buff++ = (uchar)inf_byte;

    for (; key_part != end && (int) key_length > 0; key_part++) {
        // if the SK part is part of the PK, then append it to the list.
        if (key_part->field->part_of_key.is_set(primary_key)) {
            assert_always(pk_next < pk_parts);
            pk_info[pk_next].key_ptr = key_ptr;
            pk_info[pk_next].key_part = key_part;
            pk_next++;
        }
        uint offset = 0;
        if (key_part->null_bit) {
            // MySQL marks NULL with a leading non-zero byte in key_ptr
            if (!(*key_ptr == 0)) {
                *buff++ = NULL_COL_VAL;
                key_length -= key_part->store_length;
                key_ptr += key_part->store_length;
                continue;
            }
            *buff++ = NONNULL_COL_VAL;
            offset = 1;         // Data is at key_ptr+1
        }
        assert_always(table->s->db_low_byte_first);
        buff = pack_key_toku_key_field(
            buff,
            (uchar *) key_ptr + offset,
            key_part->field,
            key_part->length
            );

        key_ptr += key_part->store_length;
        key_length -= key_part->store_length;
    }

    // leftover key_length means the caller supplied extended PK columns
    if (key_length > 0) {
        assert_always(key_part == end);
#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
        end = key_info->key_part + key_info->actual_key_parts;
#else
        end = key_info->key_part;
#endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS

        // pack PK in order of PK key parts
        for (uint pk_index = 0;
             key_part != end && (int) key_length > 0 && pk_index < pk_parts;
             pk_index++) {
            uint i;
            // prefer the value recorded for this PK field among the SK
            // parts; otherwise consume the next extended part from key_ptr
            for (i = 0; i < pk_next; i++) {
                if (pk_info[i].key_part->fieldnr ==
                    pk_key_info->key_part[pk_index].fieldnr)
                    break;
            }
            if (i < pk_next) {
                const uchar *this_key_ptr = pk_info[i].key_ptr;
                KEY_PART_INFO *this_key_part = pk_info[i].key_part;
                buff = pack_key_toku_key_field(
                    buff,
                    (uchar*)this_key_ptr,
                    this_key_part->field,
                    this_key_part->length);
            } else {
                buff = pack_key_toku_key_field(
                    buff,
                    (uchar*)key_ptr,
                    key_part->field,
                    key_part->length);
                key_ptr += key_part->store_length;
                key_length -= key_part->store_length;
                key_part++;
            }
        }
    }

    key->size = (buff - (uchar *) key->data);
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(table->write_set, old_map);
    DBUG_RETURN(key);
}
2980 #endif // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2981
//
// get max used hidden primary key value
// Initializes share->auto_ident from the last row of the primary index;
// runs only once per share (guarded by STATUS_PRIMARY_KEY_INIT).
//
void ha_tokudb::init_hidden_prim_key_info(DB_TXN *txn) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    if (!(share->status & STATUS_PRIMARY_KEY_INIT)) {
        int error = 0;
        DBC* c = NULL;
        error = share->key_file[primary_key]->cursor(
            share->key_file[primary_key],
            txn,
            &c,
            0);
        assert_always(error == 0);
        DBT key,val;
        memset(&key, 0, sizeof(key));
        memset(&val, 0, sizeof(val));
        // DB_LAST: the last row holds the largest hidden identifier
        // handed out so far
        error = c->c_get(c, &key, &val, DB_LAST);
        if (error == 0) {
            assert_always(key.size == TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
            share->auto_ident = hpk_char_to_num((uchar *)key.data);
        }
        // an empty table (DB_NOTFOUND) leaves auto_ident untouched
        error = c->c_close(c);
        assert_always(error == 0);
        share->status |= STATUS_PRIMARY_KEY_INIT;
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
3010
3011
3012
3013 /** @brief
3014 Get metadata info stored in status.tokudb
3015 */
get_status(DB_TXN * txn)3016 int ha_tokudb::get_status(DB_TXN* txn) {
3017 TOKUDB_HANDLER_DBUG_ENTER("");
3018 DBT key, value;
3019 HA_METADATA_KEY curr_key;
3020 int error;
3021
3022 //
3023 // open status.tokudb
3024 //
3025 if (!share->status_block) {
3026 error =
3027 open_status_dictionary(
3028 &share->status_block,
3029 share->full_table_name(),
3030 txn);
3031 if (error) {
3032 goto cleanup;
3033 }
3034 }
3035
3036 //
3037 // transaction to be used for putting metadata into status.tokudb
3038 //
3039 memset(&key, 0, sizeof(key));
3040 memset(&value, 0, sizeof(value));
3041 key.data = &curr_key;
3042 key.size = sizeof(curr_key);
3043 value.flags = DB_DBT_USERMEM;
3044
3045 assert_always(share->status_block);
3046 //
3047 // get version
3048 //
3049 value.ulen = sizeof(share->version);
3050 value.data = &share->version;
3051 curr_key = hatoku_new_version;
3052 error = share->status_block->get(
3053 share->status_block,
3054 txn,
3055 &key,
3056 &value,
3057 0
3058 );
3059 if (error == DB_NOTFOUND) {
3060 //
3061 // hack to keep handle the issues of going back and forth
3062 // between 5.0.3 to 5.0.4
3063 // the problem with going back and forth
3064 // is with storing the frm file, 5.0.4 stores it, 5.0.3 does not
3065 // so, if a user goes back and forth and alters the schema
3066 // the frm stored can get out of sync with the schema of the table
3067 // This can cause issues.
3068 // To take care of this, we are doing this versioning work here.
3069 // We change the key that stores the version.
3070 // In 5.0.3, it is hatoku_old_version, in 5.0.4 it is hatoku_new_version
3071 // When we encounter a table that does not have hatoku_new_version
3072 // set, we give it the right one, and overwrite the old one with zero.
3073 // This ensures that 5.0.3 cannot open the table. Once it has been opened by 5.0.4
3074 //
3075 uint dummy_version = 0;
3076 share->version = HA_TOKU_ORIG_VERSION;
3077 error = write_to_status(
3078 share->status_block,
3079 hatoku_new_version,
3080 &share->version,
3081 sizeof(share->version),
3082 txn
3083 );
3084 if (error) { goto cleanup; }
3085 error = write_to_status(
3086 share->status_block,
3087 hatoku_old_version,
3088 &dummy_version,
3089 sizeof(dummy_version),
3090 txn
3091 );
3092 if (error) { goto cleanup; }
3093 }
3094 else if (error || value.size != sizeof(share->version)) {
3095 if (error == 0) {
3096 error = HA_ERR_INTERNAL_ERROR;
3097 }
3098 goto cleanup;
3099 }
3100 //
3101 // get capabilities
3102 //
3103 curr_key = hatoku_capabilities;
3104 value.ulen = sizeof(share->capabilities);
3105 value.data = &share->capabilities;
3106 error = share->status_block->get(
3107 share->status_block,
3108 txn,
3109 &key,
3110 &value,
3111 0
3112 );
3113 if (error == DB_NOTFOUND) {
3114 share->capabilities= 0;
3115 }
3116 else if (error || value.size != sizeof(share->version)) {
3117 if (error == 0) {
3118 error = HA_ERR_INTERNAL_ERROR;
3119 }
3120 goto cleanup;
3121 }
3122
3123 error = 0;
3124 cleanup:
3125 TOKUDB_HANDLER_DBUG_RETURN(error);
3126 }
3127
/** @brief
    Return an estimate of the number of rows in the table.
    Used when sorting to allocate buffers and by the optimizer.
    This is used in filesort.cc.
*/
estimate_rows_upper_bound()3133 ha_rows ha_tokudb::estimate_rows_upper_bound() {
3134 TOKUDB_HANDLER_DBUG_ENTER("");
3135 DBUG_RETURN(share->row_count() + HA_TOKUDB_EXTRA_ROWS);
3136 }
3137
3138 //
3139 // Function that compares two primary keys that were saved as part of rnd_pos
3140 // and ::position
3141 //
cmp_ref(const uchar * ref1,const uchar * ref2)3142 int ha_tokudb::cmp_ref(const uchar * ref1, const uchar * ref2) {
3143 int ret_val = 0;
3144 bool read_string = false;
3145 ret_val = tokudb_compare_two_keys(
3146 ref1 + sizeof(uint32_t),
3147 *(uint32_t *)ref1,
3148 ref2 + sizeof(uint32_t),
3149 *(uint32_t *)ref2,
3150 (uchar *)share->file->descriptor->dbt.data + 4,
3151 *(uint32_t *)share->file->descriptor->dbt.data - 4,
3152 false,
3153 &read_string
3154 );
3155 return ret_val;
3156 }
3157
check_if_incompatible_data(HA_CREATE_INFO * info,uint table_changes)3158 bool ha_tokudb::check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) {
3159 //
3160 // This is a horrendous hack for now, as copied by InnoDB.
3161 // This states that if the auto increment create field has changed,
3162 // via a "alter table foo auto_increment=new_val", that this
3163 // change is incompatible, and to rebuild the entire table
3164 // This will need to be fixed
3165 //
3166 if ((info->used_fields & HA_CREATE_USED_AUTO) &&
3167 info->auto_increment_value != 0) {
3168
3169 return COMPATIBLE_DATA_NO;
3170 }
3171 if (table_changes != IS_EQUAL_YES)
3172 return COMPATIBLE_DATA_NO;
3173 return COMPATIBLE_DATA_YES;
3174 }
3175
//
// Method that is called before the beginning of many calls
// to insert rows (ha_tokudb::write_row). There is no guarantee
// that start_bulk_insert is called, however there is a guarantee
// that if start_bulk_insert is called, then end_bulk_insert may be
// called as well.
// Parameters:
//  [in]    rows - an estimate of the number of rows that will be inserted;
//                 if the number of rows is unknown (such as when doing
//                 "insert into foo select * from bar"), then rows
//                 will be 0
//
//
// This function returns true if the table MAY be empty.
// It is NOT meant to be a 100% check for emptiness.
// This is used for a bulk load optimization.
//
may_table_be_empty(DB_TXN * txn)3193 bool ha_tokudb::may_table_be_empty(DB_TXN *txn) {
3194 int error;
3195 bool ret_val = false;
3196 DBC* tmp_cursor = NULL;
3197 DB_TXN* tmp_txn = NULL;
3198
3199 const int empty_scan = tokudb::sysvars::empty_scan(ha_thd());
3200 if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_DISABLED)
3201 goto cleanup;
3202
3203 if (txn == NULL) {
3204 error = txn_begin(db_env, 0, &tmp_txn, 0, ha_thd());
3205 if (error) {
3206 goto cleanup;
3207 }
3208 txn = tmp_txn;
3209 }
3210
3211 error = share->file->cursor(share->file, txn, &tmp_cursor, 0);
3212 if (error)
3213 goto cleanup;
3214 tmp_cursor->c_set_check_interrupt_callback(tmp_cursor, tokudb_killed_thd_callback, ha_thd());
3215 if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_LR)
3216 error = tmp_cursor->c_getf_next(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3217 else
3218 error = tmp_cursor->c_getf_prev(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3219 error = map_to_handler_error(error);
3220 if (error == DB_NOTFOUND)
3221 ret_val = true;
3222 else
3223 ret_val = false;
3224 error = 0;
3225
3226 cleanup:
3227 if (tmp_cursor) {
3228 int r = tmp_cursor->c_close(tmp_cursor);
3229 assert_always(r == 0);
3230 tmp_cursor = NULL;
3231 }
3232 if (tmp_txn) {
3233 commit_txn(tmp_txn, 0);
3234 tmp_txn = NULL;
3235 }
3236 return ret_val;
3237 }
3238
void ha_tokudb::start_bulk_insert(ha_rows rows) {
    TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction);
    THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    // Defer auto-increment metadata writes until end_bulk_insert.
    delay_updating_ai_metadata = true;
    ai_metadata_update_required = false;
    abort_loader = false;

    // Take the num-DBs read lock for the duration of the bulk load; it is
    // released in end_bulk_insert() (num_DBs_locked_in_bulk records that).
    rwlock_t_lock_read(share->_num_DBs_lock);
    uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
    num_DBs_locked_in_bulk = true;
    lock_count = 0;

    // Only try the loader / table-lock optimization for multi-row inserts
    // (rows == 0 means the count is unknown) and only once per share.
    if ((rows == 0 || rows > 1) && share->try_table_lock) {
        if (tokudb::sysvars::prelock_empty(thd) &&
            may_table_be_empty(transaction) &&
            transaction != NULL) {
            // IGNORE/REPLACE-style duplicate handling is incompatible with
            // the bulk loader, so those statements just prelock the table.
            if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR) {
                acquire_table_lock(transaction, lock_write);
            } else {
                mult_dbt_flags[primary_key] = 0;
                // Let the loader detect pk duplicates unless unique checks
                // are relaxed or the pk is hidden (always unique).
                if (!thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) && !hidden_primary_key) {
                    mult_put_flags[primary_key] = DB_NOOVERWRITE;
                }
                uint32_t loader_flags = (tokudb::sysvars::load_save_space(thd)) ?
                    LOADER_COMPRESS_INTERMEDIATES : 0;

                int error = db_env->create_loader(
                    db_env,
                    transaction,
                    &loader,
                    NULL, // no src_db needed
                    curr_num_DBs,
                    share->key_file,
                    mult_put_flags,
                    mult_dbt_flags,
                    loader_flags
                    );
                if (error) {
                    assert_always(loader == NULL);
                    goto exit_try_table_lock;
                }

                lc.thd = thd;
                lc.ha = this;

                // Progress reporting and duplicate-key callbacks for the load.
                error = loader->set_poll_function(
                    loader, ha_tokudb::bulk_insert_poll, &lc);
                assert_always(!error);

                error = loader->set_error_callback(
                    loader, ha_tokudb::loader_dup, &lc);
                assert_always(!error);

                trx->stmt_progress.using_loader = true;
            }
        }
exit_try_table_lock:
        share->lock();
        share->try_table_lock = false;
        share->unlock();
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
bulk_insert_poll(void * extra,float progress)3303 int ha_tokudb::bulk_insert_poll(void* extra, float progress) {
3304 LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
3305 if (thd_killed(context->thd)) {
3306 snprintf(context->write_status_msg,
3307 sizeof(context->write_status_msg),
3308 "The process has been killed, aborting bulk load.");
3309 return ER_ABORTING_CONNECTION;
3310 }
3311 float percentage = progress * 100;
3312 snprintf(context->write_status_msg,
3313 sizeof(context->write_status_msg),
3314 "Loading of data t %s about %.1f%% done",
3315 context->ha->share->full_table_name(),
3316 percentage);
3317 thd_proc_info(context->thd, context->write_status_msg);
3318 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
3319 thd_progress_report(context->thd, (unsigned long long)percentage, 100);
3320 #endif
3321 return 0;
3322 }
loader_add_index_err(TOKUDB_UNUSED (DB * db),TOKUDB_UNUSED (int i),int err,TOKUDB_UNUSED (DBT * key),TOKUDB_UNUSED (DBT * val),void * error_extra)3323 void ha_tokudb::loader_add_index_err(TOKUDB_UNUSED(DB* db),
3324 TOKUDB_UNUSED(int i),
3325 int err,
3326 TOKUDB_UNUSED(DBT* key),
3327 TOKUDB_UNUSED(DBT* val),
3328 void* error_extra) {
3329 LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3330 assert_always(context->ha);
3331 context->ha->set_loader_error(err);
3332 }
loader_dup(TOKUDB_UNUSED (DB * db),TOKUDB_UNUSED (int i),int err,DBT * key,TOKUDB_UNUSED (DBT * val),void * error_extra)3333 void ha_tokudb::loader_dup(TOKUDB_UNUSED(DB* db),
3334 TOKUDB_UNUSED(int i),
3335 int err,
3336 DBT* key,
3337 TOKUDB_UNUSED(DBT* val),
3338 void* error_extra) {
3339 LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3340 assert_always(context->ha);
3341 context->ha->set_loader_error(err);
3342 if (err == DB_KEYEXIST) {
3343 context->ha->set_dup_value_for_pk(key);
3344 }
3345 }
3346
3347 //
3348 // Method that is called at the end of many calls to insert rows
3349 // (ha_tokudb::write_row). If start_bulk_insert is called, then
3350 // this is guaranteed to be called.
3351 //
int ha_tokudb::end_bulk_insert(TOKUDB_UNUSED(bool abort)) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    bool using_loader = (loader != NULL);
    // Flush the auto-increment high-water mark deferred by start_bulk_insert.
    if (ai_metadata_update_required) {
        share->lock();
        error = update_max_auto_inc(share->status_block, share->last_auto_increment);
        share->unlock();
        if (error) { goto cleanup; }
    }
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    loader_error = 0;
    if (loader) {
        // abort_loader is set by write_row() when a loader->put() failed;
        // a killed session also abandons the load.
        if (!abort_loader && !thd_killed(thd)) {
            DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", {
                const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
                thd_proc_info(thd, "DBUG sleep");
                my_sleep(20000000);
                thd_proc_info(thd, orig_proc_info);
            });
            // close() finalizes the load, writing all dictionaries.
            error = loader->close(loader);
            loader = NULL;
            if (error) {
                if (thd_killed(thd)) {
                    my_error(ER_QUERY_INTERRUPTED, MYF(0));
                }
                goto cleanup;
            }

            // The loader does not enforce uniqueness of secondary keys, so
            // verify every unique index after the load completes.
            for (uint i = 0; i < table_share->keys; i++) {
                if (table_share->key_info[i].flags & HA_NOSAME) {
                    bool is_unique;
                    // pk uniqueness (no string columns) was already
                    // enforced during the load via DB_NOOVERWRITE
                    if (i == primary_key && !share->pk_has_string) {
                        continue;
                    }
                    error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i],
                                            DB_PRELOCKED_WRITE);
                    if (error) goto cleanup;
                    if (!is_unique) {
                        error = HA_ERR_FOUND_DUPP_KEY;
                        last_dup_key = i;
                        goto cleanup;
                    }
                }
            }
        }
        else {
            // Load aborted: discard the loader's buffered rows and allow
            // the table-lock optimization to be retried later.
            error = sprintf(write_status_msg, "aborting bulk load");
            thd_proc_info(thd, write_status_msg);
            loader->abort(loader);
            loader = NULL;
            share->try_table_lock = true;
        }
    }

cleanup:
    // Release the num-DBs read lock acquired in start_bulk_insert().
    if (num_DBs_locked_in_bulk) {
        share->_num_DBs_lock.unlock();
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    // If the loader is still open here, one of the checks above failed
    // after a successful close path was abandoned; abort it now.
    if (loader) {
        error = sprintf(write_status_msg, "aborting bulk load");
        thd_proc_info(thd, write_status_msg);
        loader->abort(loader);
        loader = NULL;
    }
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    if (error || loader_error) {
        set_my_errno(error ? error : loader_error);
        if (using_loader) {
            share->try_table_lock = true;
        }
    }
    trx->stmt_progress.using_loader = false;
    thd_proc_info(thd, 0);
    TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
}
3434
end_bulk_insert()3435 int ha_tokudb::end_bulk_insert() {
3436 return end_bulk_insert( false );
3437 }
3438
// Scan an entire index and report whether all of its keys are unique.
// Used after a bulk load, since the loader cannot enforce secondary-key
// uniqueness itself. Walks the index with two cursors, tmp_cursor2 kept
// one row ahead of tmp_cursor1, and compares each adjacent pair of keys.
// On success *is_unique holds the verdict; returns 0 or an error code.
int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) {
    int error;
    DBC* tmp_cursor1 = NULL;
    DBC* tmp_cursor2 = NULL;
    DBT key1, key2, val, packed_key1, packed_key2;
    uint64_t cnt = 0;
    char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound.
    THD* thd = ha_thd();
    const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
    memset(&key1, 0, sizeof(key1));
    memset(&key2, 0, sizeof(key2));
    memset(&val, 0, sizeof(val));
    memset(&packed_key1, 0, sizeof(packed_key1));
    memset(&packed_key2, 0, sizeof(packed_key2));
    *is_unique = true;

    error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE);
    if (error) { goto cleanup; }

    error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE);
    if (error) { goto cleanup; }

    // Position cursor1 on the first row; an empty index is trivially unique.
    error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
    if (error == DB_NOTFOUND) {
        *is_unique = true;
        error = 0;
        goto cleanup;
    }
    else if (error) { goto cleanup; }
    // Advance cursor2 twice so it sits on the second row.
    error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
    if (error) { goto cleanup; }

    error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
    if (error == DB_NOTFOUND) {
        // Single-row index: trivially unique.
        *is_unique = true;
        error = 0;
        goto cleanup;
    }
    else if (error) { goto cleanup; }

    while (error != DB_NOTFOUND) {
        bool has_null1;
        bool has_null2;
        int cmp;
        // NOTE(review): the stored key appears to carry a one-byte prefix
        // before the packed key data, hence the "+ 1" offsets below —
        // confirm against the key-packing format.
        place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1);
        place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1);

        create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1);
        create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2);

        // Keys containing NULL never violate uniqueness.
        if (!has_null1 && !has_null2) {
            cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2);
            if (cmp == 0) {
                // Duplicate found; leave the offending key unpacked in
                // record[0] so the caller can report it.
                memcpy(key_buff, key1.data, key1.size);
                place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1);
                *is_unique = false;
                break;
            }
        }

        // Step both cursors forward one row.
        error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
        if (error) { goto cleanup; }
        error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
        if (error && (error != DB_NOTFOUND)) { goto cleanup; }

        cnt++;
        // Publish progress and honor KILL every 10000 rows.
        if ((cnt % 10000) == 0) {
            sprintf(
                status_msg,
                "Verifying index uniqueness: Checked %llu of %llu rows in key-%s.",
                (long long unsigned) cnt,
                share->row_count(),
                key_info->name);
            thd_proc_info(thd, status_msg);
            if (thd_killed(thd)) {
                my_error(ER_QUERY_INTERRUPTED, MYF(0));
                error = ER_QUERY_INTERRUPTED;
                goto cleanup;
            }
        }
    }

    error = 0;

cleanup:
    thd_proc_info(thd, orig_proc_info);
    if (tmp_cursor1) {
        tmp_cursor1->c_close(tmp_cursor1);
        tmp_cursor1 = NULL;
    }
    if (tmp_cursor2) {
        tmp_cursor2->c_close(tmp_cursor2);
        tmp_cursor2 = NULL;
    }
    return error;
}
3535
// Check whether inserting `record` would violate the uniqueness of the
// key described by key_info in dictionary dict_index. Sets *is_unique.
// Keys containing NULL are treated as unique by SQL semantics.
// Returns 0 on success or a handler/engine error code.
int ha_tokudb::is_val_unique(bool* is_unique, uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn) {
    int error = 0;
    bool has_null;
    DBC* tmp_cursor = NULL;

    // Pack the candidate key with a -inf suffix so it sorts before any
    // existing row sharing the same prefix.
    DBT key; memset((void *)&key, 0, sizeof(key));
    create_dbt_key_from_key(&key, key_info, key_buff2, record, &has_null, true, MAX_KEY_LENGTH, COL_NEG_INF);
    if (has_null) {
        error = 0;
        *is_unique = true;
        goto cleanup;
    }

    error = share->key_file[dict_index]->cursor(share->key_file[dict_index], txn, &tmp_cursor, DB_SERIALIZABLE | DB_RMW);
    if (error) {
        goto cleanup;
    } else {
        // prelock (key,-inf),(key,+inf) so that the subsequent key lookup does not overlock
        uint flags = 0;
        DBT key_right; memset(&key_right, 0, sizeof key_right);
        create_dbt_key_from_key(&key_right, key_info, key_buff3, record, &has_null, true, MAX_KEY_LENGTH, COL_POS_INF);
        error = tmp_cursor->c_set_bounds(tmp_cursor, &key, &key_right, true, DB_NOTFOUND);
        if (error == 0) {
            // Bounds locked successfully; the get below can skip
            // per-row lock acquisition.
            flags = DB_PRELOCKED | DB_PRELOCKED_WRITE;
        }

        // lookup key and check unique prefix
        struct smart_dbt_info info;
        info.ha = this;
        info.buf = NULL;
        info.keynr = dict_index;

        struct index_read_info ir_info;
        ir_info.orig_key = &key;
        ir_info.smart_dbt_info = info;

        error = tmp_cursor->c_getf_set_range(tmp_cursor, flags, &key, smart_dbt_callback_lookup, &ir_info);
        if (error == DB_NOTFOUND) {
            // No row at or after the candidate key: unique.
            *is_unique = true;
            error = 0;
            goto cleanup;
        }
        else if (error) {
            error = map_to_handler_error(error);
            goto cleanup;
        }
        // ir_info.cmp != 0 means the row found does not share the
        // candidate's key prefix, so the value is unique.
        if (ir_info.cmp) {
            *is_unique = true;
        }
        else {
            *is_unique = false;
        }
    }
    error = 0;

cleanup:
    if (tmp_cursor) {
        int r = tmp_cursor->c_close(tmp_cursor);
        assert_always(r==0);
        tmp_cursor = NULL;
    }
    return error;
}
3599
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
// Read-free-replication support: on a replication slave, optionally stall
// before a unique check by tokudb_rpl_unique_checks_delay milliseconds
// (used to exercise timing-sensitive code in tests).
static void maybe_do_unique_checks_delay_fn(THD *thd) {
    if (thd->slave_thread) {
        uint64_t delay_ms = tokudb::sysvars::rpl_unique_checks_delay(thd);
        if (delay_ms)
            usleep(delay_ms * 1000);
    }
}

#define maybe_do_unique_checks_delay(__thd) \
    (maybe_do_unique_checks_delay_fn(__thd))

// Apply the delay only when the operation-flag bits of __flags_set match
// __flags_check (e.g. only for DB_NOOVERWRITE puts).
#define maybe_do_unique_checks_delay_if_flags_set( \
    __thd, __flags_set, __flags_check) \
    { if (((__flags_set) & DB_OPFLAGS_MASK) == \
        (__flags_check)) maybe_do_unique_checks_delay_fn(__thd); }

// True when the server is globally read-only, or when read-only checking
// for replication is disabled via tokudb_rpl_check_readonly.
static bool need_read_only(THD *thd) {
    return opt_readonly || !tokudb::sysvars::rpl_check_readonly(thd);
}

// Decide whether unique checks should run for this statement. A slave
// applying a replicated row event may skip them when
// tokudb_rpl_unique_checks is off; otherwise honor the session's
// unique_checks setting.
static bool do_unique_checks_fn(THD *thd, bool do_rpl_event) {
    if (do_rpl_event &&
        thd->slave_thread &&
        need_read_only(thd) &&
        !tokudb::sysvars::rpl_unique_checks(thd)) {
        return false;
    } else {
        return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
    }
}

#define do_unique_checks(__thd, __flags) \
    (do_unique_checks_fn(__thd, __flags))

#else

// Without RFR support the delay hooks compile away to no-ops and unique
// checks depend only on the session's unique_checks setting.
#define maybe_do_unique_checks_delay(__thd) ((void)0)

#define maybe_do_unique_checks_delay_if_flags_set( \
    __thd, __flags_set, __flags_check) \
    ((void)0)

static bool do_unique_checks_fn(THD *thd) {
    return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
}

#define do_unique_checks(__thd, _flags) \
    (do_unique_checks_fn(__thd))

#endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3651
do_uniqueness_checks(uchar * record,DB_TXN * txn,THD * thd)3652 int ha_tokudb::do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd) {
3653 int error = 0;
3654 //
3655 // first do uniqueness checks
3656 //
3657 if (share->has_unique_keys && do_unique_checks(thd, in_rpl_write_rows)) {
3658 DBUG_EXECUTE_IF("tokudb_crash_if_rpl_does_uniqueness_check",
3659 assert(0););
3660 for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3661 bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
3662 bool is_unique = false;
3663 //
3664 // don't need to do check for primary key that don't have strings
3665 //
3666 if (keynr == primary_key && !share->pk_has_string) {
3667 continue;
3668 }
3669 if (!is_unique_key) {
3670 continue;
3671 }
3672
3673 maybe_do_unique_checks_delay(thd);
3674
3675 //
3676 // if unique key, check uniqueness constraint
3677 // but, we do not need to check it if the key has a null
3678 // and we do not need to check it if unique_checks is off
3679 //
3680 error = is_val_unique(&is_unique, record, &table->key_info[keynr], keynr, txn);
3681 if (error) {
3682 goto cleanup;
3683 }
3684 if (!is_unique) {
3685 error = DB_KEYEXIST;
3686 last_dup_key = keynr;
3687 goto cleanup;
3688 }
3689 }
3690 }
3691 cleanup:
3692 return error;
3693 }
3694
// Debug-only self-check (TOKUDB_DEBUG_CHECK_KEY): verify that packing
// secondary keys and clustering values from the dictionary descriptors
// produces exactly the same bytes as packing them directly from the
// MySQL row. Asserts on any mismatch.
void ha_tokudb::test_row_packing(uchar* record, DBT* pk_key, DBT* pk_val) {
    int error;
    DBT row, key;
    //
    // variables for testing key packing, only used in some debug modes
    //
    uchar* tmp_pk_key_data = NULL;
    uchar* tmp_pk_val_data = NULL;
    DBT tmp_pk_key;
    DBT tmp_pk_val;
    bool has_null;
    int cmp;

    memset(&tmp_pk_key, 0, sizeof(DBT));
    memset(&tmp_pk_val, 0, sizeof(DBT));

    //
    // make private copies of the pk pair so the descriptor-driven packing
    // cannot be influenced by the caller's buffers
    //
    tmp_pk_key_data = (uchar*)tokudb::memory::malloc(pk_key->size, MYF(MY_WME));
    assert_always(tmp_pk_key_data);
    tmp_pk_val_data = (uchar*)tokudb::memory::malloc(pk_val->size, MYF(MY_WME));
    assert_always(tmp_pk_val_data);
    memcpy(tmp_pk_key_data, pk_key->data, pk_key->size);
    memcpy(tmp_pk_val_data, pk_val->data, pk_val->size);
    tmp_pk_key.data = tmp_pk_key_data;
    tmp_pk_key.size = pk_key->size;
    tmp_pk_val.data = tmp_pk_val_data;
    tmp_pk_val.size = pk_val->size;

    for (uint keynr = 0; keynr < table_share->keys; keynr++) {
        uint32_t tmp_num_bytes = 0;
        uchar* row_desc = NULL;
        uint32_t desc_size = 0;

        // only secondary keys are derived from the pk pair
        if (keynr == primary_key) {
            continue;
        }

        create_dbt_key_from_table(&key, keynr, key_buff2, record, &has_null);

        //
        // TEST
        //
        // NOTE(review): the descriptor DBT appears to hold consecutive
        // length-prefixed sections; the second one is the key-packing
        // description — confirm against the descriptor layout.
        row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
        row_desc += (*(uint32_t *)row_desc);
        desc_size = (*(uint32_t *)row_desc) - 4;
        row_desc += 4;
        tmp_num_bytes = pack_key_from_desc(
            key_buff3,
            row_desc,
            desc_size,
            &tmp_pk_key,
            &tmp_pk_val
            );
        // descriptor-driven packing must agree with direct packing
        assert_always(tmp_num_bytes == key.size);
        cmp = memcmp(key_buff3,key_buff2,tmp_num_bytes);
        assert_always(cmp == 0);

        //
        // test key packing of clustering keys
        //
        if (key_is_clustering(&table->key_info[keynr])) {
            error = pack_row(&row, (const uchar *) record, keynr);
            assert_always(error == 0);
            uchar* tmp_buff = NULL;
            tmp_buff = (uchar*)tokudb::memory::malloc(
                alloced_rec_buff_length,
                MYF(MY_WME));
            assert_always(tmp_buff);
            // skip to the third length-prefixed section for the
            // clustering-value description
            row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
            row_desc += (*(uint32_t *)row_desc);
            row_desc += (*(uint32_t *)row_desc);
            desc_size = (*(uint32_t *)row_desc) - 4;
            row_desc += 4;
            tmp_num_bytes = pack_clustering_val_from_desc(
                tmp_buff,
                row_desc,
                desc_size,
                &tmp_pk_val
                );
            assert_always(tmp_num_bytes == row.size);
            cmp = memcmp(tmp_buff,rec_buff,tmp_num_bytes);
            assert_always(cmp == 0);
            tokudb::memory::free(tmp_buff);
        }
    }

    //
    // repack the pk value and confirm it matches the saved copy
    //
    error = pack_row(pk_val, (const uchar *) record, primary_key);
    assert_always(pk_val->size == tmp_pk_val.size);
    cmp = memcmp(pk_val->data, tmp_pk_val_data, pk_val->size);
    assert_always( cmp == 0);

    tokudb::memory::free(tmp_pk_key_data);
    tokudb::memory::free(tmp_pk_val_data);
}
3794
3795 // set the put flags for the main dictionary
set_main_dict_put_flags(THD * thd,uint32_t * put_flags)3796 void ha_tokudb::set_main_dict_put_flags(THD* thd,
3797 uint32_t* put_flags) {
3798 uint32_t old_prelock_flags = 0;
3799
3800 if (hidden_primary_key ||
3801 (!do_unique_checks(thd, in_rpl_write_rows | in_rpl_update_rows) &&
3802 !is_replace_into(thd) && !is_insert_ignore(thd))) {
3803 *put_flags = old_prelock_flags;
3804 } else {
3805 *put_flags = DB_NOOVERWRITE | old_prelock_flags;
3806 }
3807 }
3808
insert_row_to_main_dictionary(DBT * pk_key,DBT * pk_val,DB_TXN * txn)3809 int ha_tokudb::insert_row_to_main_dictionary(
3810 DBT* pk_key,
3811 DBT* pk_val,
3812 DB_TXN* txn) {
3813
3814 int error = 0;
3815 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3816 assert_always(curr_num_DBs == 1);
3817
3818 uint32_t put_flags = mult_put_flags[primary_key];
3819 THD *thd = ha_thd();
3820 set_main_dict_put_flags(thd, &put_flags);
3821
3822 // for test, make unique checks have a very long duration
3823 maybe_do_unique_checks_delay_if_flags_set(thd, put_flags, DB_NOOVERWRITE);
3824
3825 error = share->file->put(share->file, txn, pk_key, pk_val, put_flags);
3826 if (error) {
3827 last_dup_key = primary_key;
3828 goto cleanup;
3829 }
3830
3831 cleanup:
3832 return error;
3833 }
3834
// Insert the packed pk pair into all dictionaries (primary + secondary).
// Normally uses the environment's atomic put_multiple; when the insert-
// ignore optimization requested DB_NOOVERWRITE_NO_ERROR, falls back to
// generating and putting each row individually, since put_multiple does
// not accept that flag. On failure, last_dup_key is set to the pk.
int ha_tokudb::insert_rows_to_dictionaries_mult(
    DBT* pk_key,
    DBT* pk_val,
    DB_TXN* txn,
    THD* thd) {

    int error = 0;
    uint curr_num_DBs = share->num_DBs;
    set_main_dict_put_flags(thd, &mult_put_flags[primary_key]);
    uint32_t flags = mult_put_flags[primary_key];

    // for test, make unique checks have a very long duration
    maybe_do_unique_checks_delay_if_flags_set(thd, flags, DB_NOOVERWRITE);

    // the insert ignore optimization uses DB_NOOVERWRITE_NO_ERROR,
    // which is not allowed with env->put_multiple.
    // we have to insert the rows one by one in this case.
    if (flags & DB_NOOVERWRITE_NO_ERROR) {
        DB * src_db = share->key_file[primary_key];
        for (uint32_t i = 0; i < curr_num_DBs; i++) {
            DB * db = share->key_file[i];
            if (i == primary_key) {
                // if it's the primary key, insert the rows
                // as they are.
                error = db->put(db, txn, pk_key, pk_val, flags);
            } else {
                // generate a row for secondary keys.
                // use our multi put key/rec buffers
                // just as the ydb layer would have in
                // env->put_multiple(), except that
                // we will just do a put() right away.
                error =
                    tokudb_generate_row(
                        db,
                        src_db,
                        &mult_key_dbt_array[i].dbts[0],
                        &mult_rec_dbt_array[i].dbts[0],
                        pk_key,
                        pk_val);
                if (error != 0) {
                    goto out;
                }
                error =
                    db->put(
                        db,
                        txn,
                        &mult_key_dbt_array[i].dbts[0],
                        &mult_rec_dbt_array[i].dbts[0],
                        flags);
            }
            if (error != 0) {
                goto out;
            }
        }
    } else {
        // not insert ignore, so we can use put multiple
        error =
            db_env->put_multiple(
                db_env,
                share->key_file[primary_key],
                txn,
                pk_key,
                pk_val,
                curr_num_DBs,
                share->key_file,
                mult_key_dbt_array,
                mult_rec_dbt_array,
                mult_put_flags);
    }

out:
    //
    // We break if we hit an error, unless it is a dup key error
    // and MySQL told us to ignore duplicate key errors
    //
    if (error) {
        last_dup_key = primary_key;
    }
    return error;
}
3915
3916 //
3917 // Stores a row in the table, called when handling an INSERT query
3918 // Parameters:
3919 // [in] record - a row in MySQL format
3920 // Returns:
3921 // 0 on success
3922 // error otherwise
3923 //
// Store a row in the table (INSERT path). `record` is the row in MySQL
// format. Handles auto-increment bookkeeping, hidden-pk generation,
// uniqueness checks, the bulk-loader fast path, and IGNORE-style
// sub-transactions. Returns 0 on success or a handler error code.
int ha_tokudb::write_row(uchar * record) {
    TOKUDB_HANDLER_DBUG_ENTER("%p", record);

    DBT row, prim_key;
    int error;
    THD *thd = ha_thd();
    bool has_null;
    DB_TXN* sub_trans = nullptr;
    DB_TXN* txn = nullptr;
    tokudb_trx_data* trx = nullptr;
    uint curr_num_DBs;
    bool num_DBs_locked = false;

    //
    // some crap that needs to be done because MySQL does not properly abstract
    // this work away from us, namely filling in auto increment and setting
    // auto timestamp
    //
    ha_statistic_increment(&SSV::ha_write_count);
    if (table->next_number_field && record == table->record[0]) {
        error = update_auto_increment();
        if (error)
            goto cleanup;
    }

    //
    // check to see if some value for the auto increment column that is bigger
    // than anything else til now is being used. If so, update the metadata to
    // reflect it the goal here is we never want to have a dup key error due to
    // a bad increment of the auto inc field.
    //
    if (share->has_auto_inc && record == table->record[0]) {
        share->lock();
        ulonglong curr_auto_inc = retrieve_auto_increment(
            table->field[share->ai_field_index]->key_type(),
            field_offset(table->field[share->ai_field_index], table),
            record);
        if (curr_auto_inc > share->last_auto_increment) {
            share->last_auto_increment = curr_auto_inc;
            if (delay_updating_ai_metadata) {
                // bulk load in progress: end_bulk_insert flushes this
                ai_metadata_update_required = true;
            } else {
                update_max_auto_inc(
                    share->status_block,
                    share->last_auto_increment);
            }
        }
        share->unlock();
    }

    //
    // grab reader lock on numDBs_lock
    //
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
        num_DBs_locked = true;
    } else {
        // Bulk mode already holds the read lock from start_bulk_insert.
        // NOTE(review): it is periodically released and re-acquired here,
        // presumably so writers of _num_DBs_lock are not starved during a
        // long load — confirm intent.
        lock_count++;
        if (lock_count >= 2000) {
            share->_num_DBs_lock.unlock();
            rwlock_t_lock_read(share->_num_DBs_lock);
            lock_count = 0;
        }
    }
    curr_num_DBs = share->num_DBs;

    // hidden primary keys get a fresh auto-generated value per row
    if (hidden_primary_key) {
        get_auto_primary_key(current_ident);
    }

    // make sure rec_buff can hold this row's blobs before packing
    if (table_share->blob_fields) {
        if (fix_rec_buff_for_blob(max_row_length(record))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
    }

    create_dbt_key_from_table(
        &prim_key,
        primary_key,
        primary_key_buff,
        record,
        &has_null);
    if ((error = pack_row(&row, (const uchar*)record, primary_key))) {
        goto cleanup;
    }

    // IGNORE-style statements write each row in a child transaction so a
    // failed row can be rolled back individually.
    if (using_ignore) {
        error = txn_begin(
            db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
        if (error) {
            goto cleanup;
        }
    }

    txn = using_ignore ? sub_trans : transaction;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_TXN, "txn %p", txn);
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY))) {
        test_row_packing(record,&prim_key,&row);
    }
    if (loader) {
        // bulk-load fast path: rows are buffered by the loader; a failed
        // put aborts the whole load in end_bulk_insert()
        error = loader->put(loader, &prim_key, &row);
        if (error) {
            abort_loader = true;
            goto cleanup;
        }
    } else {
        error = do_uniqueness_checks(record, txn, thd);
        if (error) {
            // for #4633
            // if we have a duplicate key error, let's check the primary key to
            // see if there is a duplicate there. If so, set last_dup_key to the
            // pk
            if (error == DB_KEYEXIST &&
                !tokudb_test(hidden_primary_key) &&
                last_dup_key != primary_key) {
                int r =
                    share->file->getf_set(
                        share->file,
                        txn,
                        DB_SERIALIZABLE,
                        &prim_key,
                        smart_dbt_do_nothing,
                        NULL);
                if (r == 0) {
                    // if we get no error, that means the row
                    // was found and this is a duplicate key,
                    // so we set last_dup_key
                    last_dup_key = primary_key;
                } else if (r != DB_NOTFOUND) {
                    // if some other error is returned, return that to the user.
                    error = r;
                }
            }
            goto cleanup;
        }
        // single dictionary (pk only) takes the simple put path
        if (curr_num_DBs == 1) {
            error = insert_row_to_main_dictionary(&prim_key, &row, txn);
            if (error) { goto cleanup; }
        } else {
            error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
            if (error) { goto cleanup; }
        }
        if (error == 0) {
            uint64_t full_row_size = prim_key.size + row.size;
            toku_hton_update_primary_key_bytes_inserted(full_row_size);
        }
    }

    trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    if (!error) {
        added_rows++;
        trx->stmt_progress.inserted++;
        track_progress(thd);
    }
cleanup:
    if (num_DBs_locked) {
        share->_num_DBs_lock.unlock();
    }
    if (error == DB_KEYEXIST) {
        error = HA_ERR_FOUND_DUPP_KEY;
    }
    if (sub_trans) {
        // no point in recording error value of abort.
        // nothing we can do about it anyway and it is not what
        // we want to return.
        if (error) {
            abort_txn(sub_trans);
        } else {
            commit_txn(sub_trans, DB_TXN_NOSYNC);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4098
4099 /* Compare if a key in a row has changed */
key_changed(uint keynr,const uchar * old_row,const uchar * new_row)4100 bool ha_tokudb::key_changed(uint keynr, const uchar * old_row, const uchar * new_row) {
4101 DBT old_key;
4102 DBT new_key;
4103 memset((void *) &old_key, 0, sizeof(old_key));
4104 memset((void *) &new_key, 0, sizeof(new_key));
4105
4106 bool has_null;
4107 create_dbt_key_from_table(&new_key, keynr, key_buff2, new_row, &has_null);
4108 create_dbt_key_for_lookup(&old_key,&table->key_info[keynr], key_buff3, old_row, &has_null);
4109 return tokudb_prefix_cmp_dbt_key(share->key_file[keynr], &old_key, &new_key);
4110 }
4111
4112 //
4113 // Updates a row in the table, called when handling an UPDATE query
4114 // Parameters:
4115 // [in] old_row - row to be updated, in MySQL format
4116 // [in] new_row - new row, in MySQL format
4117 // Returns:
4118 // 0 on success
4119 // error otherwise
4120 //
int ha_tokudb::update_row(const uchar * old_row, uchar * new_row) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    // packed primary key/row images, before (old_*) and after the update
    DBT prim_key, old_prim_key, prim_row, old_prim_row;
    int error = 0;
    bool has_null;
    THD* thd = ha_thd();
    DB_TXN* sub_trans = NULL;
    DB_TXN* txn = NULL;
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    uint curr_num_DBs;

    memset((void *) &prim_key, 0, sizeof(prim_key));
    memset((void *) &old_prim_key, 0, sizeof(old_prim_key));
    memset((void *) &prim_row, 0, sizeof(prim_row));
    memset((void *) &old_prim_row, 0, sizeof(old_prim_row));

    ha_statistic_increment(&SSV::ha_update_count);
    //
    // check to see if some value for the auto increment column that is bigger
    // than anything else til now is being used. If so, update the metadata to reflect it
    // the goal here is we never want to have a dup key error due to a bad increment
    // of the auto inc field.
    //
    if (share->has_auto_inc && new_row == table->record[0]) {
        share->lock();
        ulonglong curr_auto_inc = retrieve_auto_increment(
            table->field[share->ai_field_index]->key_type(),
            field_offset(table->field[share->ai_field_index], table),
            new_row
            );
        if (curr_auto_inc > share->last_auto_increment) {
            error = update_max_auto_inc(share->status_block, curr_auto_inc);
            if (!error) {
                share->last_auto_increment = curr_auto_inc;
            }
        }
        share->unlock();
    }

    //
    // grab reader lock on numDBs_lock
    //
    // (held so the set of open dictionaries cannot change while we fan out
    // the update to every index; skipped when a bulk operation already holds it)
    bool num_DBs_locked = false;
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
        num_DBs_locked = true;
    }
    curr_num_DBs = share->num_DBs;

    // under IGNORE, run in a child transaction so a failed row can be rolled
    // back without aborting the statement's transaction
    if (using_ignore) {
        error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
        if (error) {
            goto cleanup;
        }
    }
    txn = using_ignore ? sub_trans : transaction;

    if (hidden_primary_key) {
        // hidden pk never changes on update, so old and new keys are identical
        memset((void *) &prim_key, 0, sizeof(prim_key));
        prim_key.data = (void *) current_ident;
        prim_key.size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        old_prim_key = prim_key;
    }
    else {
        create_dbt_key_from_table(&prim_key, primary_key, key_buff, new_row, &has_null);
        create_dbt_key_from_table(&old_prim_key, primary_key, primary_key_buff, old_row, &has_null);
    }

    // do uniqueness checks
    if (share->has_unique_keys && do_unique_checks(thd, in_rpl_update_rows)) {
        for (uint keynr = 0; keynr < table_share->keys; keynr++) {
            bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
            // pk uniqueness is enforced by the dictionary itself unless string
            // collation can make distinct byte strings compare equal
            if (keynr == primary_key && !share->pk_has_string) {
                continue;
            }
            if (is_unique_key) {
                // only probe the index when the key value actually changed
                bool key_ch = key_changed(keynr, old_row, new_row);
                if (key_ch) {
                    bool is_unique;
                    error = is_val_unique(&is_unique, new_row, &table->key_info[keynr], keynr, txn);
                    if (error) goto cleanup;
                    if (!is_unique) {
                        error = DB_KEYEXIST;
                        last_dup_key = keynr;
                        goto cleanup;
                    }
                }
            }
        }
    }

    // grow the pack buffers up front if blobs can make the row arbitrarily large
    if (table_share->blob_fields) {
        if (fix_rec_buff_for_blob(max_row_length(new_row))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
        if (fix_rec_update_buff_for_blob(max_row_length(old_row))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
    }

    error = pack_row(&prim_row, new_row, primary_key);
    if (error) { goto cleanup; }

    error = pack_old_row_for_update(&old_prim_row, old_row, primary_key);
    if (error) { goto cleanup; }

    set_main_dict_put_flags(thd, &mult_put_flags[primary_key]);

    // for test, make unique checks have a very long duration
    if ((mult_put_flags[primary_key] & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
        maybe_do_unique_checks_delay(thd);

    // atomically apply the update to the main dictionary and all secondaries
    error = db_env->update_multiple(
        db_env,
        share->key_file[primary_key],
        txn,
        &old_prim_key,
        &old_prim_row,
        &prim_key,
        &prim_row,
        curr_num_DBs,
        share->key_file,
        mult_put_flags,
        2*curr_num_DBs,
        mult_key_dbt_array,
        curr_num_DBs,
        mult_rec_dbt_array
        );

    if (error == DB_KEYEXIST) {
        last_dup_key = primary_key;
    }
    else if (!error) {
        updated_rows++;
        trx->stmt_progress.updated++;
        track_progress(thd);
    }


cleanup:
    if (num_DBs_locked) {
        share->_num_DBs_lock.unlock();
    }
    // translate the storage-layer duplicate error into the handler-level one
    if (error == DB_KEYEXIST) {
        error = HA_ERR_FOUND_DUPP_KEY;
    }
    if (sub_trans) {
        // no point in recording error value of abort.
        // nothing we can do about it anyway and it is not what
        // we want to return.
        if (error) {
            abort_txn(sub_trans);
        }
        else {
            commit_txn(sub_trans, DB_TXN_NOSYNC);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4282
4283 //
4284 // Deletes a row in the table, called when handling a DELETE query
4285 // Parameters:
4286 // [in] record - row to be deleted, in MySQL format
4287 // Returns:
4288 // 0 on success
4289 // error otherwise
4290 //
int ha_tokudb::delete_row(const uchar * record) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = ENOSYS;
    DBT row, prim_key;
    bool has_null;
    THD* thd = ha_thd();
    uint curr_num_DBs;
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);

    ha_statistic_increment(&SSV::ha_delete_count);

    //
    // grab reader lock on numDBs_lock
    //
    // (keeps the set of open dictionaries stable while the delete fans out
    // to every index; skipped when a bulk operation already holds it)
    bool num_DBs_locked = false;
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
        num_DBs_locked = true;
    }
    curr_num_DBs = share->num_DBs;

    // pack the primary key of the row to delete into key_buff
    create_dbt_key_from_table(&prim_key, primary_key, key_buff, record, &has_null);
    if (table_share->blob_fields) {
        // blobs can make the packed row arbitrarily large; grow the buffer first
        if (fix_rec_buff_for_blob(max_row_length(record))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
    }
    // the full row image is needed so secondary keys can be derived for deletion
    if ((error = pack_row(&row, (const uchar *) record, primary_key))){
        goto cleanup;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "all %p stmt %p sub_sp_level %p transaction %p",
        trx->all,
        trx->stmt,
        trx->sub_sp_level,
        transaction);

    // atomically delete from the main dictionary and all secondaries
    error =
        db_env->del_multiple(
            db_env,
            share->key_file[primary_key],
            transaction,
            &prim_key,
            &row,
            curr_num_DBs,
            share->key_file,
            mult_key_dbt_array,
            mult_del_flags);

    if (error) {
        DBUG_PRINT("error", ("Got error %d", error));
    } else {
        deleted_rows++;
        trx->stmt_progress.deleted++;
        track_progress(thd);
    }
cleanup:
    if (num_DBs_locked) {
        share->_num_DBs_lock.unlock();
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4356
4357 //
4358 // takes as input table->read_set and table->write_set
4359 // and puts list of field indexes that need to be read in
4360 // unpack_row in the member variables fixed_cols_for_query
4361 // and var_cols_for_query
4362 //
set_query_columns(uint keynr)4363 void ha_tokudb::set_query_columns(uint keynr) {
4364 uint32_t curr_fixed_col_index = 0;
4365 uint32_t curr_var_col_index = 0;
4366 read_key = false;
4367 read_blobs = false;
4368 //
4369 // i know this is probably confusing and will need to be explained better
4370 //
4371 uint key_index = 0;
4372
4373 if (keynr == primary_key || keynr == MAX_KEY) {
4374 key_index = primary_key;
4375 }
4376 else {
4377 key_index = (key_is_clustering(&table->key_info[keynr]) ? keynr : primary_key);
4378 }
4379 for (uint i = 0; i < table_share->fields; i++) {
4380 if (bitmap_is_set(table->read_set,i) ||
4381 bitmap_is_set(table->write_set,i)
4382 )
4383 {
4384 if (bitmap_is_set(&share->kc_info.key_filters[key_index],i)) {
4385 read_key = true;
4386 }
4387 else {
4388 //
4389 // if fixed field length
4390 //
4391 if (is_fixed_field(&share->kc_info, i)) {
4392 //
4393 // save the offset into the list
4394 //
4395 fixed_cols_for_query[curr_fixed_col_index] = i;
4396 curr_fixed_col_index++;
4397 }
4398 //
4399 // varchar or varbinary
4400 //
4401 else if (is_variable_field(&share->kc_info, i)) {
4402 var_cols_for_query[curr_var_col_index] = i;
4403 curr_var_col_index++;
4404 }
4405 //
4406 // it is a blob
4407 //
4408 else {
4409 read_blobs = true;
4410 }
4411 }
4412 }
4413 }
4414 num_fixed_cols_for_query = curr_fixed_col_index;
4415 num_var_cols_for_query = curr_var_col_index;
4416 }
4417
// Called by the server when table->read_set/write_set change; recomputes
// the per-query column lists for the currently active index.
void ha_tokudb::column_bitmaps_signal() {
    //
    // if we have max number of indexes, then MAX_KEY == primary_key
    //
    // NOTE(review): the first clause of this condition is true for every real
    // index, so the check only skips set_query_columns when
    // tokudb_active_index == MAX_KEY and MAX_KEY != primary_key -- confirm
    // this matches the intent stated in the comment above.
    if (tokudb_active_index != MAX_KEY || tokudb_active_index == primary_key) {
        set_query_columns(tokudb_active_index);
    }
}
4426
4427 //
4428 // Notification that a scan of entire secondary table is about
4429 // to take place. Will pre acquire table read lock
4430 // Returns:
4431 // 0 on success
4432 // error otherwise
4433 //
prepare_index_scan()4434 int ha_tokudb::prepare_index_scan() {
4435 TOKUDB_HANDLER_DBUG_ENTER("");
4436 int error = 0;
4437 HANDLE_INVALID_CURSOR();
4438 error = prelock_range(NULL, NULL);
4439 if (error) { last_cursor_error = error; goto cleanup; }
4440
4441 range_lock_grabbed = true;
4442 error = 0;
4443 cleanup:
4444 TOKUDB_HANDLER_DBUG_RETURN(error);
4445 }
4446
index_key_is_null(TABLE * table,uint keynr,const uchar * key,uint key_len)4447 static bool index_key_is_null(
4448 TABLE* table,
4449 uint keynr,
4450 const uchar* key,
4451 uint key_len) {
4452
4453 bool key_can_be_null = false;
4454 KEY* key_info = &table->key_info[keynr];
4455 KEY_PART_INFO* key_part = key_info->key_part;
4456 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
4457 for (; key_part != end; key_part++) {
4458 if (key_part->null_bit) {
4459 key_can_be_null = true;
4460 break;
4461 }
4462 }
4463 return key_can_be_null && key_len > 0 && key[0] != 0;
4464 }
4465
4466 // Return true if bulk fetch can be used
tokudb_do_bulk_fetch(THD * thd)4467 static bool tokudb_do_bulk_fetch(THD *thd) {
4468 switch (thd_sql_command(thd)) {
4469 case SQLCOM_SELECT:
4470 case SQLCOM_CREATE_TABLE:
4471 case SQLCOM_INSERT_SELECT:
4472 case SQLCOM_REPLACE_SELECT:
4473 case SQLCOM_DELETE:
4474 return tokudb::sysvars::bulk_fetch(thd) != 0;
4475 default:
4476 return false;
4477 }
4478 }
4479
4480 //
4481 // Notification that a range query getting all elements that equal a key
4482 // to take place. Will pre acquire read lock
4483 // Returns:
4484 // 0 on success
4485 // error otherwise
4486 //
int ha_tokudb::prepare_index_key_scan(const uchar * key, uint key_len) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %u", key, key_len);
    int error = 0;
    DBT start_key, end_key;
    THD* thd = ha_thd();
    HANDLE_INVALID_CURSOR();
    // bound the cursor to [key|-inf, key|+inf]: every row whose key prefix
    // equals 'key'; the packed bounds live in the prelocked_*_range buffers
    pack_key(&start_key, tokudb_active_index, prelocked_left_range, key, key_len, COL_NEG_INF);
    prelocked_left_range_size = start_key.size;
    pack_key(&end_key, tokudb_active_index, prelocked_right_range, key, key_len, COL_POS_INF);
    prelocked_right_range_size = end_key.size;

    error = cursor->c_set_bounds(
        cursor,
        &start_key,
        &end_key,
        true,
        // serializable cursors lock the range even when it is empty
        (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
        );

    if (error){
        goto cleanup;
    }

    range_lock_grabbed = true;
    // remember whether the locked range was the null range; index_read uses
    // this to drop the bounds if a later probe uses a null key
    range_lock_grabbed_null = index_key_is_null(table, tokudb_active_index, key, key_len);
    doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
    bulk_fetch_iteration = 0;
    rows_fetched_using_bulk_fetch = 0;
    error = 0;
cleanup:
    if (error) {
        error = map_to_handler_error(error);
        last_cursor_error = error;
        //
        // cursor should be initialized here, but in case it is not,
        // we still check
        //
        if (cursor) {
            // a failed prelock invalidates the cursor; close it now
            int r = cursor->c_close(cursor);
            assert_always(r==0);
            cursor = NULL;
            remove_from_trx_handler_list();
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4533
invalidate_bulk_fetch()4534 void ha_tokudb::invalidate_bulk_fetch() {
4535 bytes_used_in_range_query_buff= 0;
4536 curr_range_query_buff_offset = 0;
4537 icp_went_out_of_range = false;
4538 }
4539
invalidate_icp()4540 void ha_tokudb::invalidate_icp() {
4541 toku_pushed_idx_cond = NULL;
4542 toku_pushed_idx_cond_keyno = MAX_KEY;
4543 icp_went_out_of_range = false;
4544 }
4545
4546 //
4547 // Initializes local cursor on DB with index keynr
4548 // Parameters:
4549 // keynr - key (index) number
4550 // sorted - 1 if result MUST be sorted according to index
4551 // Returns:
4552 // 0 on success
4553 // error otherwise
4554 //
int ha_tokudb::index_init(uint keynr, bool sorted) {
    TOKUDB_HANDLER_DBUG_ENTER("%d %u txn %p", keynr, sorted, transaction);

    int error;
    THD* thd = ha_thd();
    DBUG_PRINT("enter", ("table: '%s' key: %d", table_share->table_name.str, keynr));

    /*
       Under some very rare conditions (like full joins) we may already have
       an active cursor at this point
     */
    if (cursor) {
        DBUG_PRINT("note", ("Closing active cursor"));
        int r = cursor->c_close(cursor);
        assert_always(r==0);
        remove_from_trx_handler_list();
    }
    active_index = keynr;

    // MAX_KEY means "no index chosen": scan via the primary dictionary
    if (active_index < MAX_KEY) {
        assert(keynr <= table->s->keys);
    } else {
        assert(active_index == MAX_KEY);
        keynr = primary_key;
    }
    tokudb_active_index = keynr;

#if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    // clustering keys store whole rows, so key-only reads are unnecessary
    if (keynr < table->s->keys && table->key_info[keynr].option_struct->clustering)
        key_read = false;
#endif  // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING

    last_cursor_error = 0;
    range_lock_grabbed = false;
    range_lock_grabbed_null = false;
    assert(share->key_file[keynr]);
    // build cursor flags from lock type and session settings
    cursor_flags = get_cursor_isolation_flags(lock.type, thd);
    if (use_write_locks) {
        cursor_flags |= DB_RMW;
    }
    if (tokudb::sysvars::disable_prefetching(thd)) {
        cursor_flags |= DBC_DISABLE_PREFETCHING;
    }
    if (lock.type == TL_READ_WITH_SHARED_LOCKS) {
        cursor_flags |= DB_LOCKING_READ;
    }
    if ((error = share->key_file[keynr]->cursor(share->key_file[keynr],
                                                transaction, &cursor,
                                                cursor_flags))) {
        // map storage-layer failures to handler errors the server understands
        if (error == TOKUDB_MVCC_DICTIONARY_TOO_NEW) {
            error = HA_ERR_TABLE_DEF_CHANGED;
            my_error(ER_TABLE_DEF_CHANGED, MYF(0));
        }
        if (error == DB_LOCK_NOTGRANTED) {
            error = HA_ERR_LOCK_WAIT_TIMEOUT;
            my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
        }
        table->status = STATUS_NOT_FOUND;
        error = map_to_handler_error(error);
        last_cursor_error = error;
        cursor = NULL;             // Safety
        goto exit;
    }
    cursor->c_set_check_interrupt_callback(cursor, tokudb_killed_thd_callback, thd);
    memset((void *) &last_key, 0, sizeof(last_key));

    add_to_trx_handler_list();

    // SELECTs unpack only the queried columns; everything else (e.g. UPDATE)
    // needs the entire row image
    if (thd_sql_command(thd) == SQLCOM_SELECT) {
        set_query_columns(keynr);
        unpack_entire_row = false;
    }
    else {
        unpack_entire_row = true;
    }
    invalidate_bulk_fetch();
    doing_bulk_fetch = false;
    maybe_index_scan = false;
    error = 0;
exit:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4637
4638 //
4639 // closes the local cursor
4640 //
int ha_tokudb::index_end() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    range_lock_grabbed = false;
    range_lock_grabbed_null = false;
    if (cursor) {
        DBUG_PRINT("enter", ("table: '%s'", table_share->table_name.str));
        int r = cursor->c_close(cursor);
        assert_always(r==0);
        cursor = NULL;
        remove_from_trx_handler_list();
        last_cursor_error = 0;
    }
    active_index = tokudb_active_index = MAX_KEY;

    //
    // reset query variables
    //
    // restore the conservative defaults set_query_columns() may have narrowed
    unpack_entire_row = true;
    read_blobs = true;
    read_key = true;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;

    invalidate_bulk_fetch();
    invalidate_icp();
    doing_bulk_fetch = false;
    ds_mrr.dsmrr_close();

    TOKUDB_HANDLER_DBUG_RETURN(0);
}
4671
handle_cursor_error(int error,int err_to_return)4672 int ha_tokudb::handle_cursor_error(int error, int err_to_return) {
4673 TOKUDB_HANDLER_DBUG_ENTER("");
4674 if (error) {
4675 error = map_to_handler_error(error);
4676 last_cursor_error = error;
4677 table->status = STATUS_NOT_FOUND;
4678 if (error == DB_NOTFOUND) {
4679 error = err_to_return;
4680 }
4681 }
4682 TOKUDB_HANDLER_DBUG_RETURN(error);
4683 }
4684
4685
4686 //
4687 // Helper function for read_row and smart_dbt_callback_xxx functions
4688 // When using a hidden primary key, upon reading a row,
4689 // we set the current_ident field to whatever the primary key we retrieved
4690 // was
4691 //
extract_hidden_primary_key(uint keynr,DBT const * found_key)4692 void ha_tokudb::extract_hidden_primary_key(uint keynr, DBT const *found_key) {
4693 //
4694 // extract hidden primary key to current_ident
4695 //
4696 if (hidden_primary_key) {
4697 if (keynr == primary_key) {
4698 memcpy(current_ident, (char *) found_key->data, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
4699 }
4700 //
4701 // if secondary key, hidden primary key is at end of found_key
4702 //
4703 else {
4704 memcpy(
4705 current_ident,
4706 (char *) found_key->data + found_key->size - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
4707 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
4708 );
4709 }
4710 }
4711 }
4712
4713
read_row_callback(uchar * buf,uint keynr,DBT const * row,DBT const * found_key)4714 int ha_tokudb::read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4715 assert_always(keynr == primary_key);
4716 return unpack_row(buf, row,found_key, keynr);
4717 }
4718
4719 //
4720 // Reads the contents of row and found_key, DBT's retrieved from the DB associated to keynr, into buf
4721 // This function assumes that we are using a covering index, as a result, if keynr is the primary key,
4722 // we do not read row into buf
4723 // Parameters:
4724 // [out] buf - buffer for the row, in MySQL format
4725 // keynr - index into key_file that represents DB we are currently operating on.
4726 // [in] row - the row that has been read from the preceding DB call
4727 // [in] found_key - key used to retrieve the row
4728 //
read_key_only(uchar * buf,uint keynr,DBT const * found_key)4729 void ha_tokudb::read_key_only(uchar * buf, uint keynr, DBT const *found_key) {
4730 TOKUDB_HANDLER_DBUG_ENTER("");
4731 table->status = 0;
4732 //
4733 // only case when we do not unpack the key is if we are dealing with the main dictionary
4734 // of a table with a hidden primary key
4735 //
4736 if (!(hidden_primary_key && keynr == primary_key)) {
4737 unpack_key(buf, found_key, keynr);
4738 }
4739 TOKUDB_HANDLER_DBUG_VOID_RETURN;
4740 }
4741
4742 //
4743 // Helper function used to try to retrieve the entire row
4744 // If keynr is associated with the main table, reads contents of found_key and row into buf, otherwise,
4745 // makes copy of primary key and saves it to last_key. This can later be used to retrieve the entire row
4746 // Parameters:
4747 // [out] buf - buffer for the row, in MySQL format
4748 // keynr - index into key_file that represents DB we are currently operating on.
4749 // [in] row - the row that has been read from the preceding DB call
4750 // [in] found_key - key used to retrieve the row
4751 //
int ha_tokudb::read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    table->status = 0;
    //
    // case where we read from secondary table that is not clustered
    //
    if (keynr != primary_key && !key_is_clustering(&table->key_info[keynr])) {
        bool has_null;
        //
        // create a DBT that has the same data as row, this is inefficient
        // extract_hidden_primary_key MUST have been called before this
        //
        memset((void *) &last_key, 0, sizeof(last_key));
        if (!hidden_primary_key) {
            unpack_key(buf, found_key, keynr);
        }
        // save the pk into last_key so read_full_row() can fetch the rest later
        create_dbt_key_from_table(
            &last_key,
            primary_key,
            key_buff,
            buf,
            &has_null
            );
    }
    //
    // else read from clustered/primary key
    //
    else {
        // the dictionary stores the whole row; unpack it directly into buf
        error = unpack_row(buf, row, found_key, keynr);
        if (error) { goto exit; }
    }
    if (found_key) { DBUG_DUMP("read row key", (uchar *) found_key->data, found_key->size); }
    error = 0;
exit:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4789
4790 //
4791 // This function reads an entire row into buf. This function also assumes that
4792 // the key needed to retrieve the row is stored in the member variable last_key
4793 // Parameters:
4794 // [out] buf - buffer for the row, in MySQL format
4795 // Returns:
4796 // 0 on success, error otherwise
4797 //
read_full_row(uchar * buf)4798 int ha_tokudb::read_full_row(uchar * buf) {
4799 TOKUDB_HANDLER_DBUG_ENTER("");
4800 int error = 0;
4801 struct smart_dbt_info info;
4802 info.ha = this;
4803 info.buf = buf;
4804 info.keynr = primary_key;
4805 //
4806 // assumes key is stored in this->last_key
4807 //
4808
4809 error = share->file->getf_set(share->file,
4810 transaction,
4811 cursor_flags,
4812 &last_key,
4813 smart_dbt_callback_rowread_ptquery,
4814 &info);
4815
4816 DBUG_EXECUTE_IF("tokudb_fake_db_notfound_error_in_read_full_row", {
4817 error = DB_NOTFOUND;
4818 });
4819
4820 if (error) {
4821 if (error == DB_LOCK_NOTGRANTED) {
4822 error = HA_ERR_LOCK_WAIT_TIMEOUT;
4823 } else if (error == DB_NOTFOUND) {
4824 error = HA_ERR_CRASHED;
4825 if (tokudb_active_index < share->_keys) {
4826 sql_print_error(
4827 "ha_tokudb::read_full_row on table %s cound not locate "
4828 "record in PK that matches record found in key %s",
4829 share->full_table_name(),
4830 share->_key_descriptors[tokudb_active_index]._name);
4831 } else {
4832 sql_print_error(
4833 "ha_tokudb::read_full_row on table %s cound not locate "
4834 "record in PK that matches record found in key %d",
4835 share->full_table_name(),
4836 tokudb_active_index);
4837 }
4838 }
4839 table->status = STATUS_NOT_FOUND;
4840 }
4841
4842 TOKUDB_HANDLER_DBUG_RETURN(error);
4843 }
4844
4845
4846 //
4847 // Reads the next row matching to the key, on success, advances cursor
4848 // Parameters:
4849 // [out] buf - buffer for the next row, in MySQL format
4850 // [in] key - key value
4851 // keylen - length of key
4852 // Returns:
4853 // 0 on success
4854 // HA_ERR_END_OF_FILE if not found
4855 // error otherwise
4856 //
int ha_tokudb::index_next_same(uchar* buf, const uchar* key, uint keylen) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    ha_statistic_increment(&SSV::ha_read_next_count);

    DBT curr_key;
    DBT found_key;
    bool has_null;
    int cmp;
    // create the key that will be used to compare with what is found
    // in order to figure out if we should return an error
    pack_key(&curr_key, tokudb_active_index, key_buff2, key, keylen, COL_ZERO);
    // advance the cursor one row forward and read it into buf
    int error = get_next(buf, 1, &curr_key, key_read);
    if (error) {
        goto cleanup;
    }
    //
    // now do the comparison
    //
    // re-pack the key from the row just read and prefix-compare it against
    // the search key; a mismatch means we walked past the matching range
    create_dbt_key_from_table(
        &found_key,
        tokudb_active_index,
        key_buff3,buf,
        &has_null);
    cmp =
        tokudb_prefix_cmp_dbt_key(
            share->key_file[tokudb_active_index],
            &curr_key,
            &found_key);
    if (cmp) {
        error = HA_ERR_END_OF_FILE;
    }

cleanup:
    error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4893
4894
4895 //
4896 // According to InnoDB handlerton: Positions an index cursor to the index
4897 // specified in keynr. Fetches the row if any
4898 // Parameters:
4899 // [out] buf - buffer for the returned row
4900 // [in] key - key value, according to InnoDB, if NULL,
4901 // position cursor at start or end of index,
4902 // not sure if this is done now
4903 // key_len - length of key
4904 // find_flag - according to InnoDB, search flags from my_base.h
4905 // Returns:
4906 // 0 on success
4907 // HA_ERR_KEY_NOT_FOUND if not found (per InnoDB),
4908 // we seem to return HA_ERR_END_OF_FILE if find_flag != HA_READ_KEY_EXACT
4909 // TODO: investigate this for correctness
4910 // error otherwise
4911 //
int ha_tokudb::index_read(
    uchar* buf,
    const uchar* key,
    uint key_len,
    enum ha_rkey_function find_flag) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "key %p %u:%2.2x find=%u",
        key,
        key_len,
        key ? key[0] : 0,
        find_flag);
    invalidate_bulk_fetch();
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
        TOKUDB_DBUG_DUMP("mysql key=", key, key_len);
    }
    DBT row;
    DBT lookup_key;
    int error = 0;
    uint32_t flags = 0;
    THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    struct smart_dbt_info info;
    struct index_read_info ir_info;

    HANDLE_INVALID_CURSOR();

    // if we locked a non-null key range and we now have a null key, then
    // remove the bounds from the cursor
    if (range_lock_grabbed &&
        !range_lock_grabbed_null &&
        index_key_is_null(table, tokudb_active_index, key, key_len)) {
        range_lock_grabbed = range_lock_grabbed_null = false;
        cursor->c_remove_restriction(cursor);
    }

    ha_statistic_increment(&SSV::ha_read_key_count);
    memset((void *) &row, 0, sizeof(row));

    // context passed through the smart-dbt callbacks so they can unpack
    // the fetched row into buf for this index
    info.ha = this;
    info.buf = buf;
    info.keynr = tokudb_active_index;

    // ir_info additionally records whether the found key prefix-matched
    // the original search key (ir_info.cmp != 0 means "no match")
    ir_info.smart_dbt_info = info;
    ir_info.cmp = 0;

    flags = SET_PRELOCK_FLAG(0);
    switch (find_flag) {
    case HA_READ_KEY_EXACT: /* Find first record else error */ {
        // search [key|-inf, key|+inf]: bound the range scan so an exact-match
        // probe cannot walk past rows with a different key prefix
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        DBT lookup_bound;
        pack_key(&lookup_bound, tokudb_active_index, key_buff4, key, key_len, COL_POS_INF);
        if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
            TOKUDB_DBUG_DUMP("tokudb key=", lookup_key.data, lookup_key.size);
        }
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range_with_bound(cursor, flags, &lookup_key, &lookup_bound, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        if (ir_info.cmp) {
            error = DB_NOTFOUND;
        }
        break;
    }
    case HA_READ_AFTER_KEY: /* Find next rec. after key-record */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_BEFORE_KEY: /* Find next rec. before key-record */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_KEY_OR_NEXT: /* Record or next record */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    //
    // This case does not seem to ever be used, it is ok for it to be slow
    //
    case HA_READ_KEY_OR_PREV: /* Record or previous */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        if (error == DB_NOTFOUND) {
            // nothing at or after key: the previous record is the table's last
            error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
        }
        else if (ir_info.cmp) {
            // landed past key: step back one record
            error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
        }
        break;
    case HA_READ_PREFIX_LAST_OR_PREV: /* Last or prev key with the same prefix */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_PREFIX_LAST:
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        if (ir_info.cmp) {
            error = DB_NOTFOUND;
        }
        break;
    default:
        TOKUDB_HANDLER_TRACE("unsupported:%d", find_flag);
        error = HA_ERR_UNSUPPORTED;
        break;
    }
    error = handle_cursor_error(error, HA_ERR_KEY_NOT_FOUND);
    // non-covering secondary index read: fetch the rest of the row via the pk
    if (!error && !key_read && tokudb_active_index != primary_key && !key_is_clustering(&table->key_info[tokudb_active_index])) {
        error = read_full_row(buf);
    }

    if (TOKUDB_UNLIKELY(error && TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ERROR))) {
        TOKUDB_HANDLER_TRACE("error:%d:%d", error, find_flag);
    }
    trx->stmt_progress.queried++;
    track_progress(thd);

cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
5031
5032
// Decodes the next buffered row out of range_query_buff (filled by a prior
// bulk fetch) into buf, advancing curr_range_query_buff_offset past it.
// Buffer layout per entry: [key_size][key bytes] then, when a value was
// stored, either [val_size][val bytes] (entire-row mode) or the projected
// columns (null bytes, fixed fields, var fields, optional blobs).
// Parameters:
//      [out]   buf         - destination row, in MySQL format
//      need_val            - true if a value portion must be decoded
//      do_key_read         - true if the key alone covers the query
// Returns 0 on success, error otherwise.
int ha_tokudb::read_data_from_range_query_buff(uchar* buf, bool need_val, bool do_key_read) {
    // buffer has the next row, get it from there
    int error;
    uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
    DBT curr_key;
    memset((void *) &curr_key, 0, sizeof(curr_key));

    // get key info
    uint32_t key_size = *(uint32_t *)curr_pos;
    curr_pos += sizeof(key_size);
    uchar* curr_key_buff = curr_pos;
    curr_pos += key_size;

    curr_key.data = curr_key_buff;
    curr_key.size = key_size;

    // if this is a covering index, this is all we need
    if (do_key_read) {
        assert_always(!need_val);
        extract_hidden_primary_key(tokudb_active_index, &curr_key);
        read_key_only(buf, tokudb_active_index, &curr_key);
        error = 0;
    }
    // we need to get more data
    else {
        DBT curr_val;
        memset((void *) &curr_val, 0, sizeof(curr_val));
        uchar* curr_val_buff = NULL;
        uint32_t val_size = 0;
        // in this case, we don't have a val, we are simply extracting the pk
        if (!need_val) {
            curr_val.data = curr_val_buff;
            curr_val.size = val_size;
            extract_hidden_primary_key(tokudb_active_index, &curr_key);
            error = read_primary_key( buf, tokudb_active_index, &curr_val, &curr_key);
        }
        else {
            extract_hidden_primary_key(tokudb_active_index, &curr_key);
            // need to extract a val and place it into buf
            if (unpack_entire_row) {
                // get val info
                val_size = *(uint32_t *)curr_pos;
                curr_pos += sizeof(val_size);
                curr_val_buff = curr_pos;
                curr_pos += val_size;
                curr_val.data = curr_val_buff;
                curr_val.size = val_size;
                error = unpack_row(buf,&curr_val, &curr_key, tokudb_active_index);
            }
            else {
                // projected-columns mode: the buffer holds only the fields
                // selected by set_query_columns(), in a fixed order
                if (!(hidden_primary_key && tokudb_active_index == primary_key)) {
                    unpack_key(buf,&curr_key,tokudb_active_index);
                }
                // read rows we care about

                // first the null bytes;
                memcpy(buf, curr_pos, table_share->null_bytes);
                curr_pos += table_share->null_bytes;

                // now the fixed sized rows
                for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
                    uint field_index = fixed_cols_for_query[i];
                    Field* field = table->field[field_index];
                    unpack_fixed_field(
                        buf + field_offset(field, table),
                        curr_pos,
                        share->kc_info.field_lengths[field_index]
                        );
                    curr_pos += share->kc_info.field_lengths[field_index];
                }
                // now the variable sized rows
                for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
                    uint field_index = var_cols_for_query[i];
                    Field* field = table->field[field_index];
                    // each var field is stored as [length][bytes]
                    uint32_t field_len = *(uint32_t *)curr_pos;
                    curr_pos += sizeof(field_len);
                    unpack_var_field(
                        buf + field_offset(field, table),
                        curr_pos,
                        field_len,
                        share->kc_info.length_bytes[field_index]
                        );
                    curr_pos += field_len;
                }
                // now the blobs
                if (read_blobs) {
                    uint32_t blob_size = *(uint32_t *)curr_pos;
                    curr_pos += sizeof(blob_size);
                    error = unpack_blobs(
                        buf,
                        curr_pos,
                        blob_size,
                        true
                        );
                    curr_pos += blob_size;
                    if (error) {
                        // a decode failure leaves the buffer position
                        // unreliable; discard the rest of the batch
                        invalidate_bulk_fetch();
                        goto exit;
                    }
                }
                error = 0;
            }
        }
    }

    // remember where the next buffered row starts
    curr_range_query_buff_offset = curr_pos - range_query_buff;
exit:
    return error;
}
5142
smart_dbt_bf_callback(DBT const * key,DBT const * row,void * context)5143 static int smart_dbt_bf_callback(
5144 DBT const* key,
5145 DBT const* row,
5146 void* context) {
5147 SMART_DBT_BF_INFO info = (SMART_DBT_BF_INFO)context;
5148 return
5149 info->ha->fill_range_query_buf(
5150 info->need_val,
5151 key,
5152 row,
5153 info->direction,
5154 info->thd,
5155 info->buf,
5156 info->key_to_compare);
5157 }
5158
toku_handler_index_cond_check(Item * pushed_idx_cond)5159 enum icp_result ha_tokudb::toku_handler_index_cond_check(
5160 Item* pushed_idx_cond) {
5161
5162 enum icp_result res;
5163 if (end_range) {
5164 int cmp;
5165 cmp = compare_key_icp(end_range);
5166 if (cmp > 0) {
5167 return ICP_OUT_OF_RANGE;
5168 }
5169 }
5170 res = pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
5171 return res;
5172 }
5173
5174 // fill in the range query buf for bulk fetch
// fill in the range query buf for bulk fetch
//
// Called from smart_dbt_bf_callback for each row the cursor produces.
// Appends the (key[, value]) pair to range_query_buff, growing the buffer
// if needed, then decides whether the cursor should keep going
// (TOKUDB_CURSOR_CONTINUE) or stop (0). Also applies index condition
// pushdown and prelocked-range checks before storing anything.
// Parameters:
//      need_val - also store value data (primary/clustering index read)
//      key/row - current cursor position, engine format
//      direction - >0 ascending scan, <0 descending
//      thd - connection, for kill checks and sysvar reads
//      buf - MySQL record buffer, used as scratch for ICP evaluation
//      key_to_compare - if non-NULL, stop once the key's prefix no longer
//          matches it (index_next_same-style scans)
// Returns:
//      0 to stop the cursor, TOKUDB_CURSOR_CONTINUE to keep fetching,
//      ENOMEM on allocation failure, error otherwise
int ha_tokudb::fill_range_query_buf(
    bool need_val,
    DBT const* key,
    DBT const* row,
    int direction,
    THD* thd,
    uchar* buf,
    DBT* key_to_compare) {

    int error;
    //
    // first put the value into range_query_buf
    //
    uint32_t size_remaining =
        size_range_query_buff - bytes_used_in_range_query_buff;
    uint32_t size_needed;
    uint32_t user_defined_size = tokudb::sysvars::read_buf_size(thd);
    uchar* curr_pos = NULL;

    // prefix mismatch means we have walked past the keys the caller wants
    if (key_to_compare) {
        int cmp = tokudb_prefix_cmp_dbt_key(
            share->key_file[tokudb_active_index],
            key_to_compare,
            key);
        if (cmp) {
            icp_went_out_of_range = true;
            error = 0;
            goto cleanup;
        }
    }

    // if we have an index condition pushed down, we check it
    if (toku_pushed_idx_cond &&
        (tokudb_active_index == toku_pushed_idx_cond_keyno)) {
        // unpack into buf so the Item tree can evaluate against the record
        unpack_key(buf, key, tokudb_active_index);
        enum icp_result result =
            toku_handler_index_cond_check(toku_pushed_idx_cond);

        // If we have reason to stop, we set icp_went_out_of_range and get out
        // otherwise, if we simply see that the current key is no match,
        // we tell the cursor to continue and don't store
        // the key locally
        if (result == ICP_OUT_OF_RANGE || thd_killed(thd)) {
            icp_went_out_of_range = true;
            error = 0;
            DEBUG_SYNC(ha_thd(), "tokudb_icp_asc_scan_out_of_range");
            goto cleanup;
        } else if (result == ICP_NO_MATCH) {
            // Optimizer change for MyRocks also benefits us here in TokuDB as
            // opt_range.cc QUICK_SELECT::get_next now sets end_range during
            // descending scan. We should not ever hit this condition, but
            // leaving this code in to prevent any possibility of a descending
            // scan to the beginning of an index and catch any possibility
            // in debug builds with an assertion
            assert_debug(!(!end_range && direction < 0));
            if (!end_range &&
                direction < 0) {
                cancel_pushed_idx_cond();
            }
            error = TOKUDB_CURSOR_CONTINUE;
            goto cleanup;
        }
    }

    // at this point, if ICP is on, we have verified that the key is one
    // we are interested in, so we proceed with placing the data
    // into the range query buffer

    if (need_val) {
        if (unpack_entire_row) {
            size_needed = 2*sizeof(uint32_t) + key->size + row->size;
        } else {
            // this is an upper bound
            size_needed =
                // size of key length
                sizeof(uint32_t) +
                // key and row
                key->size + row->size +
                // lengths of varchars stored
                num_var_cols_for_query * (sizeof(uint32_t)) +
                // length of blobs
                sizeof(uint32_t);
        }
    } else {
        size_needed = sizeof(uint32_t) + key->size;
    }
    // grow the buffer just enough to hold this entry if it does not fit
    if (size_remaining < size_needed) {
        range_query_buff =
            static_cast<uchar*>(tokudb::memory::realloc(
                static_cast<void*>(range_query_buff),
                bytes_used_in_range_query_buff + size_needed,
                MYF(MY_WME)));
        if (range_query_buff == NULL) {
            error = ENOMEM;
            invalidate_bulk_fetch();
            goto cleanup;
        }
        size_range_query_buff = bytes_used_in_range_query_buff + size_needed;
    }
    //
    // now we know we have the size, let's fill the buffer, starting with the key
    //
    curr_pos = range_query_buff + bytes_used_in_range_query_buff;

    *reinterpret_cast<uint32_t*>(curr_pos) = key->size;
    curr_pos += sizeof(uint32_t);
    memcpy(curr_pos, key->data, key->size);
    curr_pos += key->size;
    if (need_val) {
        if (unpack_entire_row) {
            // store the full packed row: length prefix then bytes
            *reinterpret_cast<uint32_t*>(curr_pos) = row->size;
            curr_pos += sizeof(uint32_t);
            memcpy(curr_pos, row->data, row->size);
            curr_pos += row->size;
        } else {
            // need to unpack just the data we care about
            // packed row layout: null bytes, fixed fields, var-field offset
            // table, then var-field/blob data
            const uchar* fixed_field_ptr = static_cast<const uchar*>(row->data);
            fixed_field_ptr += table_share->null_bytes;

            const uchar* var_field_offset_ptr = NULL;
            const uchar* var_field_data_ptr = NULL;

            var_field_offset_ptr =
                fixed_field_ptr +
                share->kc_info.mcp_info[tokudb_active_index].fixed_field_size;
            var_field_data_ptr =
                var_field_offset_ptr +
                share->kc_info.mcp_info[tokudb_active_index].len_of_offsets;

            // first the null bytes
            memcpy(curr_pos, row->data, table_share->null_bytes);
            curr_pos += table_share->null_bytes;
            // now the fixed fields
            //
            // first the fixed fields
            //
            for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
                uint field_index = fixed_cols_for_query[i];
                memcpy(
                    curr_pos,
                    fixed_field_ptr + share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val,
                    share->kc_info.field_lengths[field_index]);
                curr_pos += share->kc_info.field_lengths[field_index];
            }

            //
            // now the var fields
            //
            for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
                uint field_index = var_cols_for_query[i];
                uint32_t var_field_index =
                    share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val;
                uint32_t data_start_offset;
                uint32_t field_len;

                get_var_field_info(
                    &field_len,
                    &data_start_offset,
                    var_field_index,
                    var_field_offset_ptr,
                    share->kc_info.num_offset_bytes);
                // store each var field as [uint32 length][bytes]
                memcpy(curr_pos, &field_len, sizeof(field_len));
                curr_pos += sizeof(field_len);
                memcpy(
                    curr_pos,
                    var_field_data_ptr + data_start_offset,
                    field_len);
                curr_pos += field_len;
            }

            if (read_blobs) {
                uint32_t blob_offset = 0;
                uint32_t data_size = 0;
                //
                // now the blobs
                //
                get_blob_field_info(
                    &blob_offset,
                    share->kc_info.mcp_info[tokudb_active_index].len_of_offsets,
                    var_field_data_ptr,
                    share->kc_info.num_offset_bytes);
                // blob data is everything between the blob offset and the
                // end of the packed row
                data_size =
                    row->size -
                    blob_offset -
                    static_cast<uint32_t>((var_field_data_ptr -
                        static_cast<const uchar*>(row->data)));
                memcpy(curr_pos, &data_size, sizeof(data_size));
                curr_pos += sizeof(data_size);
                memcpy(curr_pos, var_field_data_ptr + blob_offset, data_size);
                curr_pos += data_size;
            }
        }
    }

    bytes_used_in_range_query_buff = curr_pos - range_query_buff;
    assert_always(bytes_used_in_range_query_buff <= size_range_query_buff);

    //
    // now determine if we should continue with the bulk fetch
    // we want to stop under these conditions:
    //  - we overran the prelocked range
    //  - we are close to the end of the buffer
    //  - we have fetched an exponential amount of rows with
    //  respect to the bulk fetch iteration, which is initialized
    //  to 0 in index_init() and prelock_range().

    rows_fetched_using_bulk_fetch++;
    // if the iteration is less than the number of possible shifts on
    // a 64 bit integer, check that we haven't exceeded this iterations
    // row fetch upper bound.
    if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
        uint64_t row_fetch_upper_bound = 1LLU << bulk_fetch_iteration;
        assert_always(row_fetch_upper_bound > 0);
        if (rows_fetched_using_bulk_fetch >= row_fetch_upper_bound) {
            error = 0;
            goto cleanup;
        }
    }

    // stop if another max-size row might not fit within the user's
    // configured read buffer size
    if (bytes_used_in_range_query_buff +
        table_share->rec_buff_length >
        user_defined_size) {
        error = 0;
        goto cleanup;
    }
    if (direction > 0) {
        // compare what we got to the right endpoint of prelocked range
        // because we are searching keys in ascending order
        if (prelocked_right_range_size == 0) {
            error = TOKUDB_CURSOR_CONTINUE;
            goto cleanup;
        }
        DBT right_range;
        memset(&right_range, 0, sizeof(right_range));
        right_range.size = prelocked_right_range_size;
        right_range.data = prelocked_right_range;
        int cmp = tokudb_cmp_dbt_key(
            share->key_file[tokudb_active_index],
            key,
            &right_range);
        error = (cmp > 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
    } else {
        // compare what we got to the left endpoint of prelocked range
        // because we are searching keys in descending order
        if (prelocked_left_range_size == 0) {
            error = TOKUDB_CURSOR_CONTINUE;
            goto cleanup;
        }
        DBT left_range;
        memset(&left_range, 0, sizeof(left_range));
        left_range.size = prelocked_left_range_size;
        left_range.data = prelocked_left_range;
        int cmp = tokudb_cmp_dbt_key(
            share->key_file[tokudb_active_index],
            key,
            &left_range);
        error = (cmp < 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
    }
cleanup:
    return error;
}
5436
// Common driver for index_next/index_prev/rnd_next: returns the next row in
// the given direction, serving from the bulk-fetch buffer when possible and
// otherwise advancing the cursor (with or without bulk fetch).
// Parameters:
//      [out]   buf - output buffer for the row, in MySQL format
//              direction - >0 forward scan, <0 backward scan
//              key_to_compare - if non-NULL, restrict the scan to keys whose
//                  prefix matches it (index_next_same-style)
//              do_key_read - covering index read; unpack from key only
// Returns:
//      0 on success, HA_ERR_END_OF_FILE at end of scan, error otherwise
int ha_tokudb::get_next(
    uchar* buf,
    int direction,
    DBT* key_to_compare,
    bool do_key_read) {

    int error = 0;
    HANDLE_INVALID_CURSOR();

    // a previous index_first/index_last deferred the range lock; grab it now
    if (maybe_index_scan) {
        maybe_index_scan = false;
        if (!range_lock_grabbed) {
            error = prepare_index_scan();
        }
    }

    if (!error) {
        uint32_t flags = SET_PRELOCK_FLAG(0);

        // we need to read the val of what we retrieve if
        // we do NOT have a covering index AND we are using a clustering secondary
        // key
        bool need_val =
            (do_key_read == 0) &&
            (tokudb_active_index == primary_key ||
             key_is_clustering(&table->key_info[tokudb_active_index]));

        // serve from the bulk-fetch buffer if it still has unread entries
        if ((bytes_used_in_range_query_buff -
             curr_range_query_buff_offset) > 0) {
            error = read_data_from_range_query_buff(buf, need_val, do_key_read);
        } else if (icp_went_out_of_range) {
            // buffer drained and the previous fill stopped because ICP said
            // we left the range: the scan is over
            icp_went_out_of_range = false;
            error = HA_ERR_END_OF_FILE;
        } else {
            invalidate_bulk_fetch();
            if (doing_bulk_fetch) {
                struct smart_dbt_bf_info bf_info;
                bf_info.ha = this;
                // you need the val if you have a clustering index and key_read is not 0;
                bf_info.direction = direction;
                bf_info.thd = ha_thd();
                bf_info.need_val = need_val;
                bf_info.buf = buf;
                bf_info.key_to_compare = key_to_compare;
                //
                // call c_getf_next with purpose of filling in range_query_buff
                //
                rows_fetched_using_bulk_fetch = 0;
                // it is expected that we can do ICP in the smart_dbt_bf_callback
                // as a result, it's possible we don't return any data because
                // none of the rows matched the index condition. Therefore, we need
                // this while loop. icp_out_of_range will be set if we hit a row that
                // the index condition states is out of our range. When that hits,
                // we know all the data in the buffer is the last data we will retrieve
                while (bytes_used_in_range_query_buff == 0 &&
                       !icp_went_out_of_range && error == 0) {
                    if (direction > 0) {
                        error =
                            cursor->c_getf_next(
                                cursor,
                                flags,
                                smart_dbt_bf_callback,
                                &bf_info);
                    } else {
                        error =
                            cursor->c_getf_prev(
                                cursor,
                                flags,
                                smart_dbt_bf_callback,
                                &bf_info);
                    }
                }
                // if there is no data set and we went out of range,
                // then there is nothing to return
                if (bytes_used_in_range_query_buff == 0 &&
                    icp_went_out_of_range) {
                    icp_went_out_of_range = false;
                    error = HA_ERR_END_OF_FILE;
                }
                // grow the per-iteration row cap (exponential ramp-up of
                // bulk fetch batch sizes)
                if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
                    bulk_fetch_iteration++;
                }

                error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
                if (error) {
                    goto cleanup;
                }

                //
                // now that range_query_buff is filled, read an element
                //
                error =
                    read_data_from_range_query_buff(buf, need_val, do_key_read);
            } else {
                // no bulk fetch: plain single-row cursor step
                struct smart_dbt_info info;
                info.ha = this;
                info.buf = buf;
                info.keynr = tokudb_active_index;

                if (direction > 0) {
                    error =
                        cursor->c_getf_next(
                            cursor,
                            flags,
                            SMART_DBT_CALLBACK(do_key_read),
                            &info);
                } else {
                    error =
                        cursor->c_getf_prev(
                            cursor,
                            flags,
                            SMART_DBT_CALLBACK(do_key_read),
                            &info);
                }
                error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
            }
        }
    }

    //
    // at this point, one of two things has happened
    // either we have unpacked the data into buf, and we
    // are done, or we have unpacked the primary key
    // into last_key, and we use the code below to
    // read the full row by doing a point query into the
    // main table.
    //
    if (!error &&
        !do_key_read &&
        (tokudb_active_index != primary_key) &&
        !key_is_clustering(&table->key_info[tokudb_active_index])) {
        error = read_full_row(buf);
    }

    if (!error) {
        THD *thd = ha_thd();
        // NOTE(review): trx is dereferenced without a NULL check here,
        // unlike index_first/index_last which guard with `if (trx)` —
        // presumably a transaction always exists on this path; confirm.
        tokudb_trx_data* trx =
            static_cast<tokudb_trx_data*>(thd_get_ha_data(thd, tokudb_hton));
        trx->stmt_progress.queried++;
        track_progress(thd);
        if (thd_killed(thd))
            error = ER_ABORTING_CONNECTION;
    }
cleanup:
    return error;
}
5583
5584
5585 //
5586 // Reads the next row from the active index (cursor) into buf, and advances cursor
5587 // Parameters:
5588 // [out] buf - buffer for the next row, in MySQL format
5589 // Returns:
5590 // 0 on success
5591 // HA_ERR_END_OF_FILE if not found
5592 // error otherwise
5593 //
index_next(uchar * buf)5594 int ha_tokudb::index_next(uchar * buf) {
5595 TOKUDB_HANDLER_DBUG_ENTER("");
5596 ha_statistic_increment(&SSV::ha_read_next_count);
5597 int error = get_next(buf, 1, NULL, key_read);
5598 TOKUDB_HANDLER_DBUG_RETURN(error);
5599 }
5600
5601
index_read_last(uchar * buf,const uchar * key,uint key_len)5602 int ha_tokudb::index_read_last(uchar * buf, const uchar * key, uint key_len) {
5603 return(index_read(buf, key, key_len, HA_READ_PREFIX_LAST));
5604 }
5605
5606
5607 //
5608 // Reads the previous row from the active index (cursor) into buf, and advances cursor
5609 // Parameters:
5610 // [out] buf - buffer for the next row, in MySQL format
5611 // Returns:
5612 // 0 on success
5613 // HA_ERR_END_OF_FILE if not found
5614 // error otherwise
5615 //
index_prev(uchar * buf)5616 int ha_tokudb::index_prev(uchar * buf) {
5617 TOKUDB_HANDLER_DBUG_ENTER("");
5618 ha_statistic_increment(&SSV::ha_read_prev_count);
5619 int error = get_next(buf, -1, NULL, key_read);
5620 TOKUDB_HANDLER_DBUG_RETURN(error);
5621 }
5622
5623 //
5624 // Reads the first row from the active index (cursor) into buf, and advances cursor
5625 // Parameters:
5626 // [out] buf - buffer for the next row, in MySQL format
5627 // Returns:
5628 // 0 on success
5629 // HA_ERR_END_OF_FILE if not found
5630 // error otherwise
5631 //
index_first(uchar * buf)5632 int ha_tokudb::index_first(uchar * buf) {
5633 TOKUDB_HANDLER_DBUG_ENTER("");
5634 invalidate_bulk_fetch();
5635 int error = 0;
5636 struct smart_dbt_info info;
5637 uint32_t flags = SET_PRELOCK_FLAG(0);
5638 THD* thd = ha_thd();
5639 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);;
5640 HANDLE_INVALID_CURSOR();
5641
5642 ha_statistic_increment(&SSV::ha_read_first_count);
5643
5644 info.ha = this;
5645 info.buf = buf;
5646 info.keynr = tokudb_active_index;
5647
5648 error = cursor->c_getf_first(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5649 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5650
5651 //
5652 // still need to get entire contents of the row if operation done on
5653 // secondary DB and it was NOT a covering index
5654 //
5655 if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5656 error = read_full_row(buf);
5657 }
5658 if (trx) {
5659 trx->stmt_progress.queried++;
5660 }
5661 track_progress(thd);
5662 maybe_index_scan = true;
5663 cleanup:
5664 TOKUDB_HANDLER_DBUG_RETURN(error);
5665 }
5666
5667 //
5668 // Reads the last row from the active index (cursor) into buf, and advances cursor
5669 // Parameters:
5670 // [out] buf - buffer for the next row, in MySQL format
5671 // Returns:
5672 // 0 on success
5673 // HA_ERR_END_OF_FILE if not found
5674 // error otherwise
5675 //
index_last(uchar * buf)5676 int ha_tokudb::index_last(uchar * buf) {
5677 TOKUDB_HANDLER_DBUG_ENTER("");
5678 invalidate_bulk_fetch();
5679 int error = 0;
5680 struct smart_dbt_info info;
5681 uint32_t flags = SET_PRELOCK_FLAG(0);
5682 THD* thd = ha_thd();
5683 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);;
5684 HANDLE_INVALID_CURSOR();
5685
5686 ha_statistic_increment(&SSV::ha_read_last_count);
5687
5688 info.ha = this;
5689 info.buf = buf;
5690 info.keynr = tokudb_active_index;
5691
5692 error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5693 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5694 //
5695 // still need to get entire contents of the row if operation done on
5696 // secondary DB and it was NOT a covering index
5697 //
5698 if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5699 error = read_full_row(buf);
5700 }
5701
5702 if (trx) {
5703 trx->stmt_progress.queried++;
5704 }
5705 track_progress(thd);
5706 maybe_index_scan = true;
5707 cleanup:
5708 TOKUDB_HANDLER_DBUG_RETURN(error);
5709 }
5710
5711 //
5712 // Initialize a scan of the table (which is why index_init is called on primary_key)
5713 // Parameters:
5714 // scan - unused
5715 // Returns:
5716 // 0 on success
5717 // error otherwise
5718 //
rnd_init(bool scan)5719 int ha_tokudb::rnd_init(bool scan) {
5720 TOKUDB_HANDLER_DBUG_ENTER("");
5721 int error = 0;
5722 range_lock_grabbed = false;
5723 error = index_init(MAX_KEY, 0);
5724 if (error) { goto cleanup;}
5725
5726 if (scan) {
5727 error = prelock_range(NULL, NULL);
5728 if (error) { goto cleanup; }
5729
5730 // only want to set range_lock_grabbed to true after index_init
5731 // successfully executed for two reasons:
5732 // 1) index_init will reset it to false anyway
5733 // 2) if it fails, we don't want prelocking on,
5734 range_lock_grabbed = true;
5735 }
5736
5737 error = 0;
5738 cleanup:
5739 if (error) {
5740 index_end();
5741 last_cursor_error = error;
5742 }
5743 TOKUDB_HANDLER_DBUG_RETURN(error);
5744 }
5745
5746 //
5747 // End a scan of the table
5748 //
rnd_end()5749 int ha_tokudb::rnd_end() {
5750 TOKUDB_HANDLER_DBUG_ENTER("");
5751 range_lock_grabbed = false;
5752 TOKUDB_HANDLER_DBUG_RETURN(index_end());
5753 }
5754
5755
5756 //
5757 // Read the next row in a table scan
5758 // Parameters:
5759 // [out] buf - buffer for the next row, in MySQL format
5760 // Returns:
5761 // 0 on success
5762 // HA_ERR_END_OF_FILE if not found
5763 // error otherwise
5764 //
rnd_next(uchar * buf)5765 int ha_tokudb::rnd_next(uchar * buf) {
5766 TOKUDB_HANDLER_DBUG_ENTER("");
5767 ha_statistic_increment(&SSV::ha_read_rnd_next_count);
5768 int error = get_next(buf, 1, NULL, false);
5769 TOKUDB_HANDLER_DBUG_RETURN(error);
5770 }
5771
5772
track_progress(THD * thd)5773 void ha_tokudb::track_progress(THD* thd) {
5774 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5775 if (trx) {
5776 ulonglong num_written = trx->stmt_progress.inserted +
5777 trx->stmt_progress.updated +
5778 trx->stmt_progress.deleted;
5779 bool update_status =
5780 (trx->stmt_progress.queried &&
5781 tokudb::sysvars::read_status_frequency &&
5782 (trx->stmt_progress.queried %
5783 tokudb::sysvars::read_status_frequency) == 0) ||
5784 (num_written && tokudb::sysvars::write_status_frequency &&
5785 (num_written % tokudb::sysvars::write_status_frequency) == 0);
5786 if (update_status) {
5787 char *next_status = write_status_msg;
5788 bool first = true;
5789 int r;
5790 if (trx->stmt_progress.queried) {
5791 r = sprintf(
5792 next_status,
5793 "Queried about %llu row%s",
5794 trx->stmt_progress.queried,
5795 trx->stmt_progress.queried == 1 ? "" : "s");
5796 assert_always(r >= 0);
5797 next_status += r;
5798 first = false;
5799 }
5800 if (trx->stmt_progress.inserted) {
5801 if (trx->stmt_progress.using_loader) {
5802 r = sprintf(
5803 next_status,
5804 "%sFetched about %llu row%s, loading data still remains",
5805 first ? "" : ", ",
5806 trx->stmt_progress.inserted,
5807 trx->stmt_progress.inserted == 1 ? "" : "s");
5808 } else {
5809 r = sprintf(
5810 next_status,
5811 "%sInserted about %llu row%s",
5812 first ? "" : ", ",
5813 trx->stmt_progress.inserted,
5814 trx->stmt_progress.inserted == 1 ? "" : "s");
5815 }
5816 assert_always(r >= 0);
5817 next_status += r;
5818 first = false;
5819 }
5820 if (trx->stmt_progress.updated) {
5821 r = sprintf(
5822 next_status,
5823 "%sUpdated about %llu row%s",
5824 first ? "" : ", ",
5825 trx->stmt_progress.updated,
5826 trx->stmt_progress.updated == 1 ? "" : "s");
5827 assert_always(r >= 0);
5828 next_status += r;
5829 first = false;
5830 }
5831 if (trx->stmt_progress.deleted) {
5832 r = sprintf(
5833 next_status,
5834 "%sDeleted about %llu row%s",
5835 first ? "" : ", ",
5836 trx->stmt_progress.deleted,
5837 trx->stmt_progress.deleted == 1 ? "" : "s");
5838 assert_always(r >= 0);
5839 next_status += r;
5840 first = false;
5841 }
5842 if (!first)
5843 thd_proc_info(thd, write_status_msg);
5844 }
5845 }
5846 }
5847
5848
get_pos(DBT * to,uchar * pos)5849 DBT *ha_tokudb::get_pos(DBT * to, uchar * pos) {
5850 TOKUDB_HANDLER_DBUG_ENTER("");
5851 /* We don't need to set app_data here */
5852 memset((void *) to, 0, sizeof(*to));
5853 to->data = pos + sizeof(uint32_t);
5854 to->size = *(uint32_t *)pos;
5855 DBUG_DUMP("key", (const uchar *) to->data, to->size);
5856 DBUG_RETURN(to);
5857 }
5858
5859 // Retrieves a row with based on the primary key saved in pos
5860 // Returns:
5861 // 0 on success
5862 // HA_ERR_KEY_NOT_FOUND if not found
5863 // error otherwise
rnd_pos(uchar * buf,uchar * pos)5864 int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
5865 TOKUDB_HANDLER_DBUG_ENTER("");
5866 DBT db_pos;
5867 int error = 0;
5868 struct smart_dbt_info info;
5869 bool old_unpack_entire_row = unpack_entire_row;
5870 DBT* key = get_pos(&db_pos, pos);
5871
5872 unpack_entire_row = true;
5873 ha_statistic_increment(&SSV::ha_read_rnd_count);
5874 tokudb_active_index = MAX_KEY;
5875
5876 THD *thd = ha_thd();
5877 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5878 // test rpl slave by inducing a delay before the point query
5879 if (thd->slave_thread && (in_rpl_delete_rows || in_rpl_update_rows)) {
5880 DBUG_EXECUTE_IF("tokudb_crash_if_rpl_looks_up_row", assert(0););
5881 uint64_t delay_ms = tokudb::sysvars::rpl_lookup_rows_delay(thd);
5882 if (delay_ms)
5883 usleep(delay_ms * 1000);
5884 }
5885 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5886
5887 info.ha = this;
5888 info.buf = buf;
5889 info.keynr = primary_key;
5890
5891 error = share->file->getf_set(share->file, transaction,
5892 get_cursor_isolation_flags(lock.type, thd),
5893 key, smart_dbt_callback_rowread_ptquery, &info);
5894
5895 if (error == DB_NOTFOUND) {
5896 error = HA_ERR_KEY_NOT_FOUND;
5897 goto cleanup;
5898 }
5899 cleanup:
5900 unpack_entire_row = old_unpack_entire_row;
5901 TOKUDB_HANDLER_DBUG_RETURN(error);
5902 }
5903
prelock_range(const key_range * start_key,const key_range * end_key)5904 int ha_tokudb::prelock_range(const key_range *start_key, const key_range *end_key) {
5905 TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
5906 THD* thd = ha_thd();
5907
5908 int error = 0;
5909 DBT start_dbt_key;
5910 DBT end_dbt_key;
5911 uchar* start_key_buff = prelocked_left_range;
5912 uchar* end_key_buff = prelocked_right_range;
5913
5914 memset((void *) &start_dbt_key, 0, sizeof(start_dbt_key));
5915 memset((void *) &end_dbt_key, 0, sizeof(end_dbt_key));
5916
5917 HANDLE_INVALID_CURSOR();
5918 if (start_key) {
5919 switch (start_key->flag) {
5920 case HA_READ_AFTER_KEY:
5921 pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_POS_INF);
5922 break;
5923 default:
5924 pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_NEG_INF);
5925 break;
5926 }
5927 prelocked_left_range_size = start_dbt_key.size;
5928 }
5929 else {
5930 prelocked_left_range_size = 0;
5931 }
5932
5933 if (end_key) {
5934 switch (end_key->flag) {
5935 case HA_READ_BEFORE_KEY:
5936 pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_NEG_INF);
5937 break;
5938 default:
5939 pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_POS_INF);
5940 break;
5941 }
5942 prelocked_right_range_size = end_dbt_key.size;
5943 }
5944 else {
5945 prelocked_right_range_size = 0;
5946 }
5947
5948 error = cursor->c_set_bounds(
5949 cursor,
5950 start_key ? &start_dbt_key : share->key_file[tokudb_active_index]->dbt_neg_infty(),
5951 end_key ? &end_dbt_key : share->key_file[tokudb_active_index]->dbt_pos_infty(),
5952 true,
5953 (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
5954 );
5955 if (error) {
5956 error = map_to_handler_error(error);
5957 last_cursor_error = error;
5958 //
5959 // cursor should be initialized here, but in case it is not, we still check
5960 //
5961 if (cursor) {
5962 int r = cursor->c_close(cursor);
5963 assert_always(r==0);
5964 cursor = NULL;
5965 remove_from_trx_handler_list();
5966 }
5967 goto cleanup;
5968 }
5969
5970 // at this point, determine if we will be doing bulk fetch
5971 doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
5972 bulk_fetch_iteration = 0;
5973 rows_fetched_using_bulk_fetch = 0;
5974
5975 cleanup:
5976 TOKUDB_HANDLER_DBUG_RETURN(error);
5977 }
5978
5979 //
5980 // Prelock range if possible, start_key is leftmost, end_key is rightmost
5981 // whether scanning forward or backward. This function is called by MySQL
5982 // for backward range queries (in QUICK_SELECT_DESC::get_next).
5983 // Forward scans use read_range_first()/read_range_next().
5984 //
prepare_range_scan(const key_range * start_key,const key_range * end_key)5985 int ha_tokudb::prepare_range_scan( const key_range *start_key, const key_range *end_key) {
5986 TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
5987 int error = prelock_range(start_key, end_key);
5988 if (!error) {
5989 range_lock_grabbed = true;
5990 }
5991 TOKUDB_HANDLER_DBUG_RETURN(error);
5992 }
5993
read_range_first(const key_range * start_key,const key_range * end_key,bool eq_range,bool sorted)5994 int ha_tokudb::read_range_first(
5995 const key_range *start_key,
5996 const key_range *end_key,
5997 bool eq_range,
5998 bool sorted)
5999 {
6000 TOKUDB_HANDLER_DBUG_ENTER("%p %p %u %u", start_key, end_key, eq_range, sorted);
6001 int error = prelock_range(start_key, end_key);
6002 if (error) { goto cleanup; }
6003 range_lock_grabbed = true;
6004
6005 error = handler::read_range_first(start_key, end_key, eq_range, sorted);
6006 cleanup:
6007 TOKUDB_HANDLER_DBUG_RETURN(error);
6008 }
6009
read_range_next()6010 int ha_tokudb::read_range_next()
6011 {
6012 TOKUDB_HANDLER_DBUG_ENTER("");
6013 int error;
6014 error = handler::read_range_next();
6015 if (error) {
6016 range_lock_grabbed = false;
6017 }
6018 TOKUDB_HANDLER_DBUG_RETURN(error);
6019 }
6020
6021
6022
6023 /*
6024 Set a reference to the current record in (ref,ref_length).
6025
6026 SYNOPSIS
6027 ha_tokudb::position()
6028 record The current record buffer
6029
6030 DESCRIPTION
6031 The BDB handler stores the primary key in (ref,ref_length).
6032 There is either an explicit primary key, or an implicit (hidden)
6033 primary key.
6034 During open(), 'ref_length' is calculated as the maximum primary
6035 key length. When an actual key is shorter than that, the rest of
6036 the buffer must be cleared out. The row cannot be identified, if
6037 garbage follows behind the end of the key. There is no length
6038 field for the current key, so that the whole ref_length is used
6039 for comparison.
6040
6041 RETURN
6042 nothing
6043 */
position(const uchar * record)6044 void ha_tokudb::position(const uchar * record) {
6045 TOKUDB_HANDLER_DBUG_ENTER("");
6046 DBT key;
6047 if (hidden_primary_key) {
6048 assert(ref_length == (TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t)));
6049 memcpy(ref + sizeof(uint32_t), current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
6050 *(uint32_t *)ref = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
6051 }
6052 else {
6053 bool has_null;
6054 //
6055 // save the data
6056 //
6057 create_dbt_key_from_table(&key, primary_key, ref + sizeof(uint32_t), record, &has_null);
6058 //
6059 // save the size of data in the first four bytes of ref
6060 //
6061 memcpy(ref, &key.size, sizeof(uint32_t));
6062 }
6063 TOKUDB_HANDLER_DBUG_VOID_RETURN;
6064 }
6065
6066 //
6067 // Per InnoDB: Returns statistics information of the table to the MySQL interpreter,
6068 // in various fields of the handle object.
6069 // Return:
6070 // 0, always success
6071 //
int ha_tokudb::info(uint flag) {
    TOKUDB_HANDLER_DBUG_ENTER("%d", flag);
    int error = 0;
#if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    // clustering keys carry the row data, so they can satisfy any query
    // without a primary-key lookup; advertise them as covering
    for (uint i=0; i < table->s->keys; i++)
        if (key_is_clustering(&table->key_info[i]))
            table->covering_keys.set_bit(i);
#endif // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    DB_TXN* txn = NULL;
    if (flag & HA_STATUS_VARIABLE) {
        // start from the cached count; refined below when we may touch the tree
        stats.records = share->row_count() + share->rows_from_locked_table;
        stats.deleted = 0;
        if (!(flag & HA_STATUS_NO_LOCK)) {
            // stat64 needs a transaction; READ_UNCOMMITTED so we never block
            error = txn_begin(db_env, NULL, &txn, DB_READ_UNCOMMITTED, ha_thd());
            if (error) {
                goto cleanup;
            }

            // we should always have a primary key
            assert_always(share->file != NULL);

            DB_BTREE_STAT64 dict_stats;
            error = share->file->stat64(share->file, txn, &dict_stats);
            if (error) {
                goto cleanup;
            }
            // refresh the cached row count from the engine's own statistics
            share->set_row_count(dict_stats.bt_ndata, false);
            stats.records = dict_stats.bt_ndata;
            stats.create_time = dict_stats.bt_create_time_sec;
            stats.update_time = dict_stats.bt_modify_time_sec;
            stats.check_time = dict_stats.bt_verify_time_sec;
            stats.data_file_length = dict_stats.bt_dsize;
            // fsize - dsize = allocated-but-unused bytes in the dictionary
            stats.delete_length = dict_stats.bt_fsize - dict_stats.bt_dsize;
            if (hidden_primary_key) {
                //
                // in this case, we have a hidden primary key, do not
                // want to report space taken up by the hidden primary key to the user
                //
                uint64_t hpk_space =
                    TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH * dict_stats.bt_ndata;
                stats.data_file_length =
                    (hpk_space > stats.data_file_length) ?
                    0 : stats.data_file_length - hpk_space;
            } else {
                //
                // one infinity byte per key needs to be subtracted
                //
                uint64_t inf_byte_space = dict_stats.bt_ndata;
                stats.data_file_length =
                    (inf_byte_space > stats.data_file_length) ?
                    0 : stats.data_file_length - inf_byte_space;
            }

            stats.mean_rec_length =
                stats.records ?
                (ulong)(stats.data_file_length/stats.records) : 0;
            stats.index_file_length = 0;
            // curr_num_DBs is the number of keys we have, according
            // to the mysql layer. if drop index is running concurrently
            // with info() (it can, because info does not take table locks),
            // then it could be the case that one of the dbs was dropped
            // and set to NULL before mysql was able to set table->s->keys
            // accordingly.
            //
            // we should just ignore any DB * that is NULL.
            //
            // this solution is much simpler than trying to maintain an
            // accurate number of valid keys at the handlerton layer.
            uint curr_num_DBs =
                table->s->keys + tokudb_test(hidden_primary_key);
            for (uint i = 0; i < curr_num_DBs; i++) {
                // skip the primary key, skip dropped indexes
                if (i == primary_key || share->key_file[i] == NULL) {
                    continue;
                }
                error = share->key_file[i]->stat64(
                    share->key_file[i], txn, &dict_stats);
                if (error) {
                    goto cleanup;
                }
                stats.index_file_length += dict_stats.bt_dsize;
                stats.delete_length +=
                    dict_stats.bt_fsize - dict_stats.bt_dsize;
            }
        }

        /*
          The following comment and logic has been taken from InnoDB and
          an old hack was removed that forced to always set stats.records > 0
          ---
          The MySQL optimizer seems to assume in a left join that n_rows
          is an accurate estimate if it is zero. Of course, it is not,
          since we do not have any locks on the rows yet at this phase.
          Since SHOW TABLE STATUS seems to call this function with the
          HA_STATUS_TIME flag set, while the left join optimizer does not
          set that flag, we add one to a zero value if the flag is not
          set. That way SHOW TABLE STATUS will show the best estimate,
          while the optimizer never sees the table empty. */
        if (stats.records == 0 && !(flag & HA_STATUS_TIME)) {
            stats.records++;
        }
    }
    if ((flag & HA_STATUS_CONST)) {
        // effectively unlimited data file size (2^63 - 1)
        stats.max_data_file_length = 9223372036854775807ULL;
        share->set_cardinality_counts_in_table(table);
    }

    /* Don't return key if we got an error for the internal primary key */
    if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys) {
        errkey = last_dup_key;
    }

    if (flag & HA_STATUS_AUTO && table->found_next_number_field) {
        THD* thd = table->in_use;
        struct system_variables* variables = &thd->variables;
        // report the next value an insert would receive, honoring the
        // session's auto_increment_increment
        stats.auto_increment_value =
            share->last_auto_increment + variables->auto_increment_increment;
    }
    error = 0;
cleanup:
    if (txn != NULL) {
        // read-only stats txn: NOSYNC commit, nothing needs to be durable
        commit_txn(txn, DB_TXN_NOSYNC);
        txn = NULL;
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6198
6199 //
6200 // Per InnoDB: Tells something additional to the handler about how to do things.
6201 //
extra(enum ha_extra_function operation)6202 int ha_tokudb::extra(enum ha_extra_function operation) {
6203 TOKUDB_HANDLER_DBUG_ENTER("%d", operation);
6204 switch (operation) {
6205 case HA_EXTRA_RESET_STATE:
6206 reset();
6207 break;
6208 case HA_EXTRA_KEYREAD:
6209 key_read = true; // Query satisfied with key
6210 break;
6211 case HA_EXTRA_NO_KEYREAD:
6212 key_read = false;
6213 break;
6214 case HA_EXTRA_IGNORE_DUP_KEY:
6215 using_ignore = true;
6216 break;
6217 case HA_EXTRA_NO_IGNORE_DUP_KEY:
6218 using_ignore = false;
6219 break;
6220 case HA_EXTRA_IGNORE_NO_KEY:
6221 using_ignore_no_key = true;
6222 break;
6223 case HA_EXTRA_NO_IGNORE_NO_KEY:
6224 using_ignore_no_key = false;
6225 break;
6226 case HA_EXTRA_NOT_USED:
6227 case HA_EXTRA_PREPARE_FOR_RENAME:
6228 break; // must do nothing and return 0
6229 default:
6230 break;
6231 }
6232 TOKUDB_HANDLER_DBUG_RETURN(0);
6233 }
6234
reset()6235 int ha_tokudb::reset() {
6236 TOKUDB_HANDLER_DBUG_ENTER("");
6237 key_read = false;
6238 using_ignore = false;
6239 using_ignore_no_key = false;
6240 ds_mrr.reset();
6241 invalidate_icp();
6242 TOKUDB_HANDLER_DBUG_RETURN(0);
6243 }
6244
6245 //
6246 // helper function that iterates through all DB's
6247 // and grabs a lock (either read or write, but not both)
6248 // Parameters:
6249 // [in] trans - transaction to be used to pre acquire the lock
6250 // lt - type of lock to get, either lock_read or lock_write
6251 // Returns:
6252 // 0 on success
6253 // error otherwise
6254 //
acquire_table_lock(DB_TXN * trans,TABLE_LOCK_TYPE lt)6255 int ha_tokudb::acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt) {
6256 TOKUDB_HANDLER_DBUG_ENTER("%p %s", trans, lt == lock_read ? "r" : "w");
6257 int error = ENOSYS;
6258 if (!num_DBs_locked_in_bulk) {
6259 rwlock_t_lock_read(share->_num_DBs_lock);
6260 }
6261 uint curr_num_DBs = share->num_DBs;
6262 if (lt == lock_read) {
6263 error = 0;
6264 goto cleanup;
6265 } else if (lt == lock_write) {
6266 for (uint i = 0; i < curr_num_DBs; i++) {
6267 DB* db = share->key_file[i];
6268 error = db->pre_acquire_table_lock(db, trans);
6269 if (error == EINVAL)
6270 TOKUDB_HANDLER_TRACE("%d db=%p trans=%p", i, db, trans);
6271 if (error) break;
6272 }
6273 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
6274 if (error) goto cleanup;
6275 } else {
6276 error = ENOSYS;
6277 goto cleanup;
6278 }
6279
6280 error = 0;
6281 cleanup:
6282 if (!num_DBs_locked_in_bulk) {
6283 share->_num_DBs_lock.unlock();
6284 }
6285 TOKUDB_HANDLER_DBUG_RETURN(error);
6286 }
6287
//
// Creates the transaction(s) a statement runs under: when autocommit is
// off (and the statement is not DDL), a long-lived "master" txn (trx->all)
// is started first; a statement-level txn (trx->stmt) is then always
// started, either standalone or as a child of the master.
// Returns 0 on success, a TokuDB error code otherwise; on failure any
// already-created master transaction is left open for the caller.
//
int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
    int error;
    ulong tx_isolation = thd_tx_isolation(thd);
    HA_TOKU_ISO_LEVEL toku_iso_level = tx_to_toku_iso(tx_isolation);
    bool is_autocommit = !thd_test_options(
        thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);

    /* First table lock, start transaction */
    // DDL statements are excluded: they manage their own transactions
    if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) &&
        !trx->all &&
        (thd_sql_command(thd) != SQLCOM_CREATE_TABLE) &&
        (thd_sql_command(thd) != SQLCOM_DROP_TABLE) &&
        (thd_sql_command(thd) != SQLCOM_DROP_INDEX) &&
        (thd_sql_command(thd) != SQLCOM_CREATE_INDEX) &&
        (thd_sql_command(thd) != SQLCOM_ALTER_TABLE)) {
        /* QQQ We have to start a master transaction */
        // DBUG_PRINT("trans", ("starting transaction all "));
        uint32_t txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
        if (thd_tx_is_read_only(thd)) {
            txn_begin_flags |= DB_TXN_READ_ONLY;
        }
        if ((error = txn_begin(db_env, NULL, &trx->all, txn_begin_flags, thd))) {
            goto cleanup;
        }
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "created master %p",
            trx->all);
        trx->sp_level = trx->all;
        // register with the server so it issues commit/rollback callbacks
        trans_register_ha(thd, true, tokudb_hton, NULL);
    }
    DBUG_PRINT("trans", ("starting transaction stmt"));
    if (trx->stmt) {
        // a leftover statement txn indicates unbalanced begin/commit calls
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "warning:stmt=%p",
            trx->stmt);
    }
    uint32_t txn_begin_flags;
    if (trx->all == NULL) {
        txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
        //
        // if the isolation level that the user has set is serializable,
        // but autocommit is on and this is just a select,
        // then we can go ahead and set the isolation level to
        // be a snapshot read, because we can serialize
        // the transaction to be the point in time at which the snapshot began.
        //
        if (txn_begin_flags == 0 && is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT) {
            txn_begin_flags = DB_TXN_SNAPSHOT;
        }
        // a plain autocommit SELECT that cannot write may run read-only
        if (is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT &&
            !thd->in_sub_stmt && lock.type <= TL_READ_NO_INSERT &&
            !thd->lex->uses_stored_routines()) {
            txn_begin_flags |= DB_TXN_READ_ONLY;
        }
    } else {
        // child txn: inherit the master transaction's isolation level
        txn_begin_flags = DB_INHERIT_ISOLATION;
    }
    error = txn_begin(db_env, trx->sp_level, &trx->stmt, txn_begin_flags, thd);
    if (error) {
        /* We leave the possible master transaction open */
        goto cleanup;
    }
    trx->sub_sp_level = trx->stmt;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "created stmt %p sp_level %p",
        trx->sp_level,
        trx->stmt);
    reset_stmt_progress(&trx->stmt_progress);
    trans_register_ha(thd, false, tokudb_hton, NULL);
cleanup:
    return error;
}
6363
//
// Human-readable name of an fcntl-style lock type, for trace output only.
//
static const char *lock_type_str(int lock_type) {
    switch (lock_type) {
    case F_RDLCK: return "F_RDLCK";
    case F_WRLCK: return "F_WRLCK";
    case F_UNLCK: return "F_UNLCK";
    default:      return "?";
    }
}
6370
6371 /*
6372 As MySQL will execute an external lock for every new table it uses
6373 we can use this to start the transactions.
6374 If we are in auto_commit mode we just need to start a transaction
6375 for the statement to be able to rollback the statement.
6376 If not, we have to start a master transaction if there doesn't exist
6377 one from before.
6378 */
6379 //
6380 // Parameters:
6381 // [in] thd - handle to the user thread
6382 // lock_type - the type of lock
6383 // Returns:
6384 // 0 on success
6385 // error otherwise
6386 //
int ha_tokudb::external_lock(THD * thd, int lock_type) {
    TOKUDB_HANDLER_DBUG_ENTER(
        "cmd %d lock %d %s %s",
        thd_sql_command(thd),
        lock_type,
        lock_type_str(lock_type),
        share->full_table_name());
    // emit the lock trace line even when ENTER tracing is off
    if (TOKUDB_UNLIKELY(!TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ENTER) &&
                        TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_LOCK))) {
        TOKUDB_HANDLER_TRACE(
            "cmd %d lock %d %s %s",
            thd_sql_command(thd),
            lock_type,
            lock_type_str(lock_type),
            share->full_table_name());
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s",
                                   thd->query().str);

    int error = 0;
    // lazily allocate the per-connection transaction bookkeeping
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    if (!trx) {
        error = create_tokudb_trx_data_instance(&trx);
        if (error) { goto cleanup; }
        thd_set_ha_data(thd, tokudb_hton, trx);
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "trx %p %p %p %p %u %u",
        trx->all,
        trx->stmt,
        trx->sp_level,
        trx->sub_sp_level,
        trx->tokudb_lock_count,
        trx->create_lock_count);

    // no master txn means there is no savepoint level to track either
    if (trx->all == NULL) {
        trx->sp_level = NULL;
    }
    if (lock_type != F_UNLCK) {
        // acquiring: remember whether this statement intends to write
        use_write_locks = false;
        if (lock_type == F_WRLCK) {
            use_write_locks = true;
        }
        if (!trx->stmt) {
            transaction = NULL; // Safety
            error = create_txn(thd, trx);
            if (error) {
                goto cleanup;
            }
            // remember how many tables were locked when the txn was made,
            // so we know when the last of them is unlocked again
            trx->create_lock_count = trx->tokudb_lock_count;
        }
        transaction = trx->sub_sp_level;
        trx->tokudb_lock_count++;
    } else {
        // releasing: fold this handler's row-count deltas into the share
        share->update_row_count(thd, added_rows, deleted_rows, updated_rows);
        added_rows = 0;
        deleted_rows = 0;
        updated_rows = 0;
        share->rows_from_locked_table = 0;
        if (trx->tokudb_lock_count > 0) {
            if (--trx->tokudb_lock_count <= trx->create_lock_count) {
                trx->create_lock_count = 0;
                if (trx->stmt) {
                    /*
                      F_UNLCK is done without a transaction commit / rollback.
                      This happens if the thread didn't update any rows
                      We must in this case commit the work to keep the row locks
                    */
                    DBUG_PRINT("trans", ("committing non-updating transaction"));
                    reset_stmt_progress(&trx->stmt_progress);
                    commit_txn(trx->stmt, 0);
                    trx->stmt = NULL;
                    trx->sub_sp_level = NULL;
                }
            }
            transaction = NULL;
        }
    }
cleanup:
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6471
6472 /*
6473 When using LOCK TABLE's external_lock is only called when the actual
6474 TABLE LOCK is done.
6475 Under LOCK TABLES, each used tables will force a call to start_stmt.
6476 */
start_stmt(THD * thd,thr_lock_type lock_type)6477 int ha_tokudb::start_stmt(THD* thd, thr_lock_type lock_type) {
6478 TOKUDB_HANDLER_DBUG_ENTER(
6479 "cmd %d lock %d %s",
6480 thd_sql_command(thd),
6481 lock_type,
6482 share->full_table_name());
6483
6484 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s",
6485 thd->query().str);
6486
6487 int error = 0;
6488 tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
6489 if (!trx) {
6490 error = create_tokudb_trx_data_instance(&trx);
6491 if (error) { goto cleanup; }
6492 thd_set_ha_data(thd, tokudb_hton, trx);
6493 }
6494
6495 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6496 TOKUDB_DEBUG_TXN,
6497 "trx %p %p %p %p %u %u",
6498 trx->all,
6499 trx->stmt,
6500 trx->sp_level,
6501 trx->sub_sp_level,
6502 trx->tokudb_lock_count,
6503 trx->create_lock_count);
6504
6505 /*
6506 note that trx->stmt may have been already initialized as start_stmt()
6507 is called for *each table* not for each storage engine,
6508 and there could be many bdb tables referenced in the query
6509 */
6510 if (!trx->stmt) {
6511 error = create_txn(thd, trx);
6512 if (error) {
6513 goto cleanup;
6514 }
6515 trx->create_lock_count = trx->tokudb_lock_count;
6516 } else {
6517 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6518 TOKUDB_DEBUG_TXN,
6519 "trx->stmt %p already existed",
6520 trx->stmt);
6521 }
6522 if (added_rows > deleted_rows) {
6523 share->rows_from_locked_table = added_rows - deleted_rows;
6524 }
6525 transaction = trx->sub_sp_level;
6526 trans_register_ha(thd, false, tokudb_hton, NULL);
6527 cleanup:
6528 TOKUDB_HANDLER_DBUG_RETURN(error);
6529 }
6530
6531
get_cursor_isolation_flags(enum thr_lock_type lock_type,THD * thd)6532 uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd) {
6533 uint sql_command = thd_sql_command(thd);
6534 bool in_lock_tables = thd_in_lock_tables(thd);
6535
6536 //
6537 // following InnoDB's lead and having checksum command use a snapshot read if told
6538 //
6539 if (sql_command == SQLCOM_CHECKSUM) {
6540 return 0;
6541 }
6542 else if ((lock_type == TL_READ && in_lock_tables) ||
6543 (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
6544 sql_command != SQLCOM_SELECT ||
6545 (sql_command == SQLCOM_SELECT && lock_type >= TL_WRITE_ALLOW_WRITE)) { // select for update
6546 ulong tx_isolation = thd_tx_isolation(thd);
6547 // pattern matched from InnoDB
6548 if ( (tx_isolation == ISO_READ_COMMITTED || tx_isolation == ISO_READ_UNCOMMITTED) &&
6549 (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) &&
6550 (sql_command == SQLCOM_INSERT_SELECT
6551 || sql_command == SQLCOM_REPLACE_SELECT
6552 || sql_command == SQLCOM_UPDATE
6553 || sql_command == SQLCOM_CREATE_TABLE) )
6554 {
6555 return 0;
6556 }
6557 else {
6558 return DB_SERIALIZABLE;
6559 }
6560 }
6561 else {
6562 return 0;
6563 }
6564 }
6565
6566 /*
6567 The idea with handler::store_lock() is the following:
6568
6569 The statement decided which locks we should need for the table
6570 for updates/deletes/inserts we get WRITE locks, for SELECT... we get
6571 read locks.
6572
6573 Before adding the lock into the table lock handler (see thr_lock.c)
6574 mysqld calls store lock with the requested locks. Store lock can now
6575 modify a write lock to a read lock (or some other lock), ignore the
6576 lock (if we don't want to use MySQL table locks at all) or add locks
6577 for many tables (like we do when we are using a MERGE handler).
6578
  TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which
  signals that we are doing WRITES, but we are still allowing other
  readers and writers).
6582
6583 When releasing locks, store_lock() are also called. In this case one
6584 usually doesn't have to do anything.
6585
6586 In some exceptional cases MySQL may send a request for a TL_IGNORE;
6587 This means that we are requesting the same lock as last time and this
6588 should also be ignored. (This may happen when someone does a flush
6589 table when we have opened a part of the tables, in which case mysqld
6590 closes and reopens the tables and tries to get the same locks at last
6591 time). In the future we will probably try to remove this.
6592 */
6593
THR_LOCK_DATA* *ha_tokudb::store_lock(
    THD* thd,
    THR_LOCK_DATA** to,
    enum thr_lock_type lock_type) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "lock_type=%d cmd=%d",
        lock_type,
        thd_sql_command(thd));
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_LOCK,
        "lock_type=%d cmd=%d",
        lock_type,
        thd_sql_command(thd));

    // TL_IGNORE means "same lock as last time", and a non-TL_UNLOCK
    // lock.type means locks are being released: keep lock.type untouched
    // in both cases
    if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
        enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd);
        if (!thd->in_lock_tables) {
            if (sql_command == SQLCOM_CREATE_INDEX &&
                tokudb::sysvars::create_index_online(thd)) {
                // hot indexing
                rwlock_t_lock_read(share->_num_DBs_lock);
                // only allow concurrent writes while the hot indexer has
                // not yet added its extra dictionary
                if (share->num_DBs ==
                    (table->s->keys + tokudb_test(hidden_primary_key))) {
                    lock_type = TL_WRITE_ALLOW_WRITE;
                }
                share->_num_DBs_lock.unlock();
            } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
                lock_type <= TL_WRITE) &&
                sql_command != SQLCOM_TRUNCATE &&
                !thd_tablespace_op(thd)) {
                // allow concurrent writes
                lock_type = TL_WRITE_ALLOW_WRITE;
            } else if (sql_command == SQLCOM_OPTIMIZE &&
                lock_type == TL_READ_NO_INSERT) {
                // hot optimize table
                lock_type = TL_READ;
            }
        }
        lock.type = lock_type;
    }
    // hand our (single) lock slot back to the server
    *to++ = &lock;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_LOCK,
        "lock_type=%d",
        lock_type);
    TOKUDB_HANDLER_DBUG_RETURN_PTR(to);
}
6642
get_compression_method(DB * file)6643 static toku_compression_method get_compression_method(DB* file) {
6644 enum toku_compression_method method;
6645 int r = file->get_compression_method(file, &method);
6646 assert_always(r == 0);
6647 return method;
6648 }
6649
6650 #if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
6651 TOKU_INCLUDE_ROW_TYPE_COMPRESSION
get_row_type() const6652 enum row_type ha_tokudb::get_row_type() const {
6653 toku_compression_method compression_method = get_compression_method(share->file);
6654 return toku_compression_method_to_row_type(compression_method);
6655 }
6656 #endif // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
6657 // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6658
//
// Creates, configures, opens and then closes one dictionary (fractal tree)
// with the given name inside transaction txn. The row_descriptor (key
// comparison / packing metadata) is installed on the new dictionary.
// block_size, read_block_size and fanout are only applied when non-zero.
// Returns 0 on success, a TokuDB error code otherwise.
//
static int create_sub_table(
    const char* table_name,
    DBT* row_descriptor,
    DB_TXN* txn,
    uint32_t block_size,
    uint32_t read_block_size,
    toku_compression_method compression_method,
    bool is_hot_index,
    uint32_t fanout) {

    TOKUDB_DBUG_ENTER("");
    int error;
    DB *file = NULL;
    uint32_t create_flags;


    error = db_create(&file, db_env, 0);
    if (error) {
        DBUG_PRINT("error", ("Got error: %d when creating table", error));
        set_my_errno(error);
        goto exit;
    }


    // all set_* calls below must happen before the dictionary is opened
    if (block_size != 0) {
        error = file->set_pagesize(file, block_size);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting block size %u for table '%s'",
                    error,
                    block_size,
                    table_name));
            goto exit;
        }
    }
    if (read_block_size != 0) {
        error = file->set_readpagesize(file, read_block_size);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting read block size %u for table '%s'",
                    error,
                    read_block_size,
                    table_name));
            goto exit;
        }
    }
    if (fanout != 0) {
        error = file->set_fanout(file, fanout);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting fanout %u for table '%s'",
                    error,
                    fanout,
                    table_name));
            goto exit;
        }
    }
    error = file->set_compression_method(file, compression_method);
    if (error != 0) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when setting compression type %u for table '%s'",
                error,
                compression_method,
                table_name));
        goto exit;
    }

    // DB_EXCL: creation must fail if the dictionary already exists
    create_flags =
        DB_THREAD | DB_CREATE | DB_EXCL | (is_hot_index ? DB_IS_HOT_INDEX : 0);
    error =
        file->open(
            file,
            txn,
            table_name,
            NULL,
            DB_BTREE,
            create_flags,
            my_umask);
    if (error) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when opening table '%s'", error, table_name));
        goto exit;
    }

    // install the key-comparison/packing descriptor on the new dictionary
    error =
        file->change_descriptor(
            file,
            txn,
            row_descriptor,
            (is_hot_index ? DB_IS_HOT_INDEX |
                DB_UPDATE_CMP_DESCRIPTOR :
                DB_UPDATE_CMP_DESCRIPTOR));
    if (error) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when setting row descriptor for table '%s'",
                error,
                table_name));
        goto exit;
    }

    error = 0;
exit:
    // the handle is always closed here; callers reopen the dictionary later
    if (file) {
        int r = file->close(file, 0);
        assert_always(r==0);
    }
    TOKUDB_DBUG_RETURN(error);
}
6773
//
// Fills create_info with this table's live state so that SHOW CREATE TABLE
// and ALTER TABLE reflect the current auto_increment counter and the row
// format (compression) actually in effect.
//
void ha_tokudb::update_create_info(HA_CREATE_INFO* create_info) {
    if (share->has_auto_inc) {
        info(HA_STATUS_AUTO);
        // only move the value forward; never report less than what the
        // user explicitly requested in the CREATE/ALTER statement
        if (!(create_info->used_fields & HA_CREATE_USED_AUTO) ||
            create_info->auto_increment_value < stats.auto_increment_value) {
            create_info->auto_increment_value = stats.auto_increment_value;
        }
    }
#if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
    TOKU_INCLUDE_ROW_TYPE_COMPRESSION
    if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
        // show create table asks us to update this create_info, this makes it
        // so we'll always show what compression type we're using
        create_info->row_type = get_row_type();
        // optionally hide the default (zlib) format behind ROW_TYPE_DEFAULT
        if (create_info->row_type == ROW_TYPE_TOKU_ZLIB &&
            tokudb::sysvars::hide_default_row_format(ha_thd()) != 0) {
            create_info->row_type = ROW_TYPE_DEFAULT;
        }
    }
#endif // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
       // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
}
6796
6797 //
6798 // removes key name from status.tokudb.
6799 // needed for when we are dropping indexes, so that
6800 // during drop table, we do not attempt to remove already dropped
6801 // indexes because we did not keep status.tokudb in sync with list of indexes.
6802 //
remove_key_name_from_status(DB * status_block,char * key_name,DB_TXN * txn)6803 int ha_tokudb::remove_key_name_from_status(DB* status_block, char* key_name, DB_TXN* txn) {
6804 int error;
6805 uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6806 HA_METADATA_KEY md_key = hatoku_key_name;
6807 memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6808 //
6809 // put index name in status.tokudb
6810 //
6811 memcpy(
6812 status_key_info + sizeof(HA_METADATA_KEY),
6813 key_name,
6814 strlen(key_name) + 1
6815 );
6816 error = remove_metadata(
6817 status_block,
6818 status_key_info,
6819 sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6820 txn
6821 );
6822 return error;
6823 }
6824
6825 //
6826 // writes the key name in status.tokudb, so that we may later delete or rename
6827 // the dictionary associated with key_name
6828 //
write_key_name_to_status(DB * status_block,char * key_name,DB_TXN * txn)6829 int ha_tokudb::write_key_name_to_status(DB* status_block, char* key_name, DB_TXN* txn) {
6830 int error;
6831 uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6832 HA_METADATA_KEY md_key = hatoku_key_name;
6833 memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6834 //
6835 // put index name in status.tokudb
6836 //
6837 memcpy(
6838 status_key_info + sizeof(HA_METADATA_KEY),
6839 key_name,
6840 strlen(key_name) + 1
6841 );
6842 error = write_metadata(
6843 status_block,
6844 status_key_info,
6845 sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6846 NULL,
6847 0,
6848 txn
6849 );
6850 return error;
6851 }
6852
6853 //
6854 // some tracing moved out of ha_tokudb::create, because ::create was
6855 // getting cluttered
6856 //
trace_create_table_info(TABLE * form)6857 void ha_tokudb::trace_create_table_info(TABLE* form) {
6858 uint i;
6859 //
6860 // tracing information about what type of table we are creating
6861 //
6862 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_OPEN))) {
6863 for (i = 0; i < form->s->fields; i++) {
6864 Field *field = form->s->field[i];
6865 TOKUDB_HANDLER_TRACE(
6866 "field:%d:%s:type=%d:flags=%x",
6867 i,
6868 field->field_name,
6869 field->type(),
6870 field->flags);
6871 }
6872 for (i = 0; i < form->s->keys; i++) {
6873 KEY *key = &form->s->key_info[i];
6874 TOKUDB_HANDLER_TRACE(
6875 "key:%d:%s:%d",
6876 i,
6877 key->name,
6878 key->user_defined_key_parts);
6879 uint p;
6880 for (p = 0; p < key->user_defined_key_parts; p++) {
6881 KEY_PART_INFO* key_part = &key->key_part[p];
6882 Field* field = key_part->field;
6883 TOKUDB_HANDLER_TRACE(
6884 "key:%d:%d:length=%d:%s:type=%d:flags=%x",
6885 i,
6886 p,
6887 key_part->length,
6888 field->field_name,
6889 field->type(),
6890 field->flags);
6891 }
6892 }
6893 }
6894 }
6895
get_max_desc_size(KEY_AND_COL_INFO * kc_info,TABLE * form)6896 static uint32_t get_max_desc_size(KEY_AND_COL_INFO* kc_info, TABLE* form) {
6897 uint32_t max_row_desc_buff_size;
6898 // upper bound of key comparison descriptor
6899 max_row_desc_buff_size = 2*(form->s->fields * 6)+10;
6900 // upper bound for sec. key part
6901 max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info);
6902 // upper bound for clustering val part
6903 max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s);
6904 return max_row_desc_buff_size;
6905 }
6906
create_secondary_key_descriptor(uchar * buf,KEY * key_info,KEY * prim_key,uint hpk,TABLE * form,uint primary_key,uint32_t keynr,KEY_AND_COL_INFO * kc_info)6907 static uint32_t create_secondary_key_descriptor(
6908 uchar* buf,
6909 KEY* key_info,
6910 KEY* prim_key,
6911 uint hpk,
6912 TABLE* form,
6913 uint primary_key,
6914 uint32_t keynr,
6915 KEY_AND_COL_INFO* kc_info) {
6916
6917 uchar* ptr = NULL;
6918
6919 ptr = buf;
6920 ptr += create_toku_key_descriptor(
6921 ptr,
6922 false,
6923 key_info,
6924 hpk,
6925 prim_key
6926 );
6927
6928 ptr += create_toku_secondary_key_pack_descriptor(
6929 ptr,
6930 hpk,
6931 primary_key,
6932 form->s,
6933 form,
6934 kc_info,
6935 key_info,
6936 prim_key
6937 );
6938
6939 ptr += create_toku_clustering_val_pack_descriptor(
6940 ptr,
6941 primary_key,
6942 form->s,
6943 kc_info,
6944 keynr,
6945 key_is_clustering(key_info)
6946 );
6947 return ptr - buf;
6948 }
6949
6950
6951 //
6952 // creates dictionary for secondary index, with key description key_info, all using txn
6953 //
create_secondary_dictionary(const char * name,TABLE * form,KEY * key_info,DB_TXN * txn,KEY_AND_COL_INFO * kc_info,uint32_t keynr,bool is_hot_index,toku_compression_method compression_method)6954 int ha_tokudb::create_secondary_dictionary(
6955 const char* name,
6956 TABLE* form,
6957 KEY* key_info,
6958 DB_TXN* txn,
6959 KEY_AND_COL_INFO* kc_info,
6960 uint32_t keynr,
6961 bool is_hot_index,
6962 toku_compression_method compression_method) {
6963
6964 int error;
6965 DBT row_descriptor;
6966 uchar* row_desc_buff = NULL;
6967 char* newname = NULL;
6968 size_t newname_len = 0;
6969 KEY* prim_key = NULL;
6970 char dict_name[MAX_DICT_NAME_LEN];
6971 uint32_t max_row_desc_buff_size;
6972 uint hpk= (form->s->primary_key >= MAX_KEY) ?
6973 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
6974 uint32_t block_size;
6975 uint32_t read_block_size;
6976 uint32_t fanout;
6977 THD* thd = ha_thd();
6978
6979 memset(&row_descriptor, 0, sizeof(row_descriptor));
6980
6981 max_row_desc_buff_size = get_max_desc_size(kc_info,form);
6982
6983 row_desc_buff = (uchar*)tokudb::memory::malloc(
6984 max_row_desc_buff_size,
6985 MYF(MY_WME));
6986 if (row_desc_buff == NULL) {
6987 error = ENOMEM;
6988 goto cleanup;
6989 }
6990
6991 newname_len = get_max_dict_name_path_length(name);
6992 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
6993 if (newname == NULL) {
6994 error = ENOMEM;
6995 goto cleanup;
6996 }
6997
6998 sprintf(dict_name, "key-%s", key_info->name);
6999 make_name(newname, newname_len, name, dict_name);
7000
7001 prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
7002
7003 //
7004 // setup the row descriptor
7005 //
7006 row_descriptor.data = row_desc_buff;
7007 //
7008 // save data necessary for key comparisons
7009 //
7010 row_descriptor.size = create_secondary_key_descriptor(
7011 row_desc_buff,
7012 key_info,
7013 prim_key,
7014 hpk,
7015 form,
7016 primary_key,
7017 keynr,
7018 kc_info);
7019 assert_always(row_descriptor.size <= max_row_desc_buff_size);
7020
7021 block_size = tokudb::sysvars::block_size(thd);
7022 read_block_size = tokudb::sysvars::read_block_size(thd);
7023 fanout = tokudb::sysvars::fanout(thd);
7024
7025 error = create_sub_table(
7026 newname,
7027 &row_descriptor,
7028 txn,
7029 block_size,
7030 read_block_size,
7031 compression_method,
7032 is_hot_index,
7033 fanout);
7034 cleanup:
7035 tokudb::memory::free(newname);
7036 tokudb::memory::free(row_desc_buff);
7037 return error;
7038 }
7039
7040
create_main_key_descriptor(uchar * buf,KEY * prim_key,uint hpk,uint primary_key,TABLE * form,KEY_AND_COL_INFO * kc_info)7041 static uint32_t create_main_key_descriptor(
7042 uchar* buf,
7043 KEY* prim_key,
7044 uint hpk,
7045 uint primary_key,
7046 TABLE* form,
7047 KEY_AND_COL_INFO* kc_info) {
7048
7049 uchar* ptr = buf;
7050 ptr += create_toku_key_descriptor(
7051 ptr,
7052 hpk,
7053 prim_key,
7054 false,
7055 NULL);
7056
7057 ptr += create_toku_main_key_pack_descriptor(ptr);
7058
7059 ptr += create_toku_clustering_val_pack_descriptor(
7060 ptr,
7061 primary_key,
7062 form->s,
7063 kc_info,
7064 primary_key,
7065 false);
7066 return ptr - buf;
7067 }
7068
7069 //
7070 // create and close the main dictionarr with name of "name" using table form, all within
7071 // transaction txn.
7072 //
int ha_tokudb::create_main_dictionary(
    const char* name,
    TABLE* form,
    DB_TXN* txn,
    KEY_AND_COL_INFO* kc_info,
    toku_compression_method compression_method) {

    int error;
    DBT row_descriptor;
    uchar* row_desc_buff = NULL;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY* prim_key = NULL;
    uint32_t max_row_desc_buff_size;
    // use a hidden primary key when the table defines no explicit one
    uint hpk = (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    uint32_t block_size;
    uint32_t read_block_size;
    uint32_t fanout;
    THD* thd = ha_thd();

    memset(&row_descriptor, 0, sizeof(row_descriptor));
    max_row_desc_buff_size = get_max_desc_size(kc_info, form);

    row_desc_buff = (uchar*)tokudb::memory::malloc(
        max_row_desc_buff_size,
        MYF(MY_WME));
    if (row_desc_buff == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    // the main dictionary is always stored under the fixed name "main"
    make_name(newname, newname_len, name, "main");

    prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];

    //
    // setup the row descriptor
    //
    row_descriptor.data = row_desc_buff;
    //
    // save data necessary for key comparisons
    //
    row_descriptor.size = create_main_key_descriptor(
        row_desc_buff,
        prim_key,
        hpk,
        primary_key,
        form,
        kc_info);
    assert_always(row_descriptor.size <= max_row_desc_buff_size);

    // per-session tuning knobs for the new dictionary
    block_size = tokudb::sysvars::block_size(thd);
    read_block_size = tokudb::sysvars::read_block_size(thd);
    fanout = tokudb::sysvars::fanout(thd);

    /* Create the main table that will hold the real rows */
    error = create_sub_table(
        newname,
        &row_descriptor,
        txn,
        block_size,
        read_block_size,
        compression_method,
        false,
        fanout);
cleanup:
    tokudb::memory::free(newname);
    tokudb::memory::free(row_desc_buff);
    return error;
}
7150
7151 //
7152 // Creates a new table
7153 // Parameters:
7154 // [in] name - table name
7155 // [in] form - info on table, columns and indexes
7156 // [in] create_info - more info on table, CURRENTLY UNUSED
7157 // Returns:
7158 // 0 on success
7159 // error otherwise
7160 //
create(const char * name,TABLE * form,HA_CREATE_INFO * create_info)7161 int ha_tokudb::create(
7162 const char* name,
7163 TABLE* form,
7164 HA_CREATE_INFO* create_info) {
7165
7166 TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7167
7168 int error;
7169 DB *status_block = NULL;
7170 uint version;
7171 uint capabilities;
7172 DB_TXN* txn = NULL;
7173 bool do_commit = false;
7174 char* newname = NULL;
7175 size_t newname_len = 0;
7176 KEY_AND_COL_INFO kc_info;
7177 tokudb_trx_data *trx = NULL;
7178 THD* thd = ha_thd();
7179
7180 String database_name, table_name, dictionary_name;
7181 tokudb_split_dname(name, database_name, table_name, dictionary_name);
7182 if (database_name.is_empty() || table_name.is_empty()) {
7183 push_warning(thd,
7184 Sql_condition::SL_WARNING,
7185 ER_TABLE_NAME,
7186 "TokuDB: Table Name or Database Name is empty");
7187 DBUG_RETURN(ER_TABLE_NAME);
7188 }
7189
7190 memset(&kc_info, 0, sizeof(kc_info));
7191
7192 // TDB-76 : CREATE TABLE ... LIKE ... does not use source row_format on
7193 // target table
7194 // Original code would only use create_info->row_type if
7195 // create_info->used_fields & HA_CREATE_USED_ROW_FORMAT was true. This
7196 // would cause us to skip transferring the row_format for a table created
7197 // via CREATE TABLE tn LIKE tn. We also take on more InnoDB like behavior
7198 // and throw a warning if we get a row_format that we can't translate into
7199 // a known TokuDB row_format.
7200 tokudb::sysvars::row_format_t row_format =
7201 tokudb::sysvars::row_format(thd);
7202
7203 if ((create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) ||
7204 create_info->row_type != ROW_TYPE_DEFAULT) {
7205 row_format = row_type_to_row_format(create_info->row_type);
7206 if (row_format == tokudb::sysvars::SRV_ROW_FORMAT_DEFAULT &&
7207 create_info->row_type != ROW_TYPE_DEFAULT) {
7208 push_warning(thd,
7209 Sql_condition::SL_WARNING,
7210 ER_ILLEGAL_HA_CREATE_OPTION,
7211 "TokuDB: invalid ROW_FORMAT specifier.");
7212 }
7213 }
7214 const toku_compression_method compression_method =
7215 row_format_to_toku_compression_method(row_format);
7216
7217 bool create_from_engine = (create_info->table_options & HA_OPTION_CREATE_FROM_ENGINE);
7218 if (create_from_engine) {
7219 // table already exists, nothing to do
7220 error = 0;
7221 goto cleanup;
7222 }
7223
7224 // validate the fields in the table. If the table has fields
7225 // we do not support that came from an old version of MySQL,
7226 // gracefully return an error
7227 for (uint32_t i = 0; i < form->s->fields; i++) {
7228 Field* field = table_share->field[i];
7229 if (!field_valid_for_tokudb_table(field)) {
7230 sql_print_error("Table %s has an invalid field %s, that was created "
7231 "with an old version of MySQL. This field is no longer supported. "
7232 "This is probably due to an alter table engine=TokuDB. To load this "
7233 "table, do a dump and load",
7234 name,
7235 field->field_name
7236 );
7237 error = HA_ERR_UNSUPPORTED;
7238 goto cleanup;
7239 }
7240 }
7241
7242 newname_len = get_max_dict_name_path_length(name);
7243 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7244 if (newname == NULL) {
7245 error = ENOMEM;
7246 goto cleanup;
7247 }
7248
7249 trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
7250 if (trx && trx->sub_sp_level &&
7251 thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
7252 txn = trx->sub_sp_level;
7253 } else {
7254 do_commit = true;
7255 error = txn_begin(db_env, 0, &txn, 0, thd);
7256 if (error) {
7257 goto cleanup;
7258 }
7259 }
7260
7261 primary_key = form->s->primary_key;
7262 hidden_primary_key = (primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7263 if (hidden_primary_key) {
7264 primary_key = form->s->keys;
7265 }
7266
7267 /* do some tracing */
7268 trace_create_table_info(form);
7269
7270 /* Create status.tokudb and save relevant metadata */
7271 make_name(newname, newname_len, name, "status");
7272
7273 error = tokudb::metadata::create(db_env, &status_block, newname, txn);
7274 if (error) { goto cleanup; }
7275
7276 version = HA_TOKU_VERSION;
7277 error = write_to_status(
7278 status_block,
7279 hatoku_new_version,
7280 &version,
7281 sizeof(version),
7282 txn);
7283 if (error) {
7284 goto cleanup;
7285 }
7286
7287 capabilities = HA_TOKU_CAP;
7288 error = write_to_status(
7289 status_block,
7290 hatoku_capabilities,
7291 &capabilities,
7292 sizeof(capabilities),
7293 txn);
7294 if (error) {
7295 goto cleanup;
7296 }
7297
7298 error = write_auto_inc_create(
7299 status_block,
7300 create_info->auto_increment_value,
7301 txn);
7302 if (error) {
7303 goto cleanup;
7304 }
7305
7306 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
7307 #if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
7308 if (form->part_info == NULL) {
7309 error = write_frm_data(status_block, txn, form->s->path.str);
7310 if (error) {
7311 goto cleanup;
7312 }
7313 }
7314 #else
7315 error = write_frm_data(status_block, txn, form->s->path.str);
7316 if (error) {
7317 goto cleanup;
7318 }
7319 #endif // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
7320 #endif // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
7321
7322 error = allocate_key_and_col_info(form->s, &kc_info);
7323 if (error) {
7324 goto cleanup;
7325 }
7326
7327 error = initialize_key_and_col_info(
7328 form->s,
7329 form,
7330 &kc_info,
7331 hidden_primary_key,
7332 primary_key);
7333 if (error) {
7334 goto cleanup;
7335 }
7336
7337 error = create_main_dictionary(
7338 name,
7339 form,
7340 txn,
7341 &kc_info,
7342 compression_method);
7343 if (error) {
7344 goto cleanup;
7345 }
7346
7347
7348 for (uint i = 0; i < form->s->keys; i++) {
7349 if (i != primary_key) {
7350 error = create_secondary_dictionary(
7351 name,
7352 form,
7353 &form->key_info[i],
7354 txn,
7355 &kc_info,
7356 i,
7357 false,
7358 compression_method);
7359 if (error) {
7360 goto cleanup;
7361 }
7362
7363 error = write_key_name_to_status(
7364 status_block,
7365 form->s->key_info[i].name,
7366 txn);
7367 if (error) {
7368 goto cleanup;
7369 }
7370 }
7371 }
7372
7373 error = 0;
7374 cleanup:
7375 if (status_block != NULL) {
7376 int r = tokudb::metadata::close(&status_block);
7377 assert_always(r==0);
7378 }
7379 free_key_and_col_info(&kc_info);
7380 if (do_commit && txn) {
7381 if (error) {
7382 abort_txn(txn);
7383 } else {
7384 commit_txn(txn,0);
7385 }
7386 }
7387 tokudb::memory::free(newname);
7388 TOKUDB_HANDLER_DBUG_RETURN(error);
7389 }
7390
discard_or_import_tablespace(TOKUDB_UNUSED (my_bool discard))7391 int ha_tokudb::discard_or_import_tablespace(TOKUDB_UNUSED(my_bool discard)) {
7392 /*
7393 if (discard) {
7394 my_errno=HA_ERR_WRONG_COMMAND;
7395 return my_errno;
7396 }
7397 return add_table_to_metadata(share->table_name);
7398 */
7399 set_my_errno(HA_ERR_WRONG_COMMAND);
7400 return my_errno();
7401 }
7402
7403
7404 //
7405 // deletes from_name or renames from_name to to_name, all using transaction txn.
7406 // is_delete specifies which we are doing
7407 // is_key specifies if it is a secondary index (and hence a "key-" needs to be prepended) or
7408 // if it is not a secondary index
7409 //
delete_or_rename_dictionary(const char * from_name,const char * to_name,const char * secondary_name,bool is_key,DB_TXN * txn,bool is_delete)7410 int ha_tokudb::delete_or_rename_dictionary(
7411 const char* from_name,
7412 const char* to_name,
7413 const char* secondary_name,
7414 bool is_key,
7415 DB_TXN* txn,
7416 bool is_delete) {
7417
7418 int error;
7419 char dict_name[MAX_DICT_NAME_LEN];
7420 char* new_from_name = NULL;
7421 size_t new_from_name_len = 0;
7422 char* new_to_name = NULL;
7423 size_t new_to_name_len = 0;
7424 assert_always(txn);
7425
7426 new_from_name_len = get_max_dict_name_path_length(from_name);
7427 new_from_name = (char*)tokudb::memory::malloc(
7428 new_from_name_len,
7429 MYF(MY_WME));
7430 if (new_from_name == NULL) {
7431 error = ENOMEM;
7432 goto cleanup;
7433 }
7434 if (!is_delete) {
7435 assert_always(to_name);
7436 new_to_name_len = get_max_dict_name_path_length(to_name);
7437 new_to_name = (char*)tokudb::memory::malloc(
7438 new_to_name_len,
7439 MYF(MY_WME));
7440 if (new_to_name == NULL) {
7441 error = ENOMEM;
7442 goto cleanup;
7443 }
7444 }
7445
7446 if (is_key) {
7447 sprintf(dict_name, "key-%s", secondary_name);
7448 make_name(new_from_name, new_from_name_len, from_name, dict_name);
7449 } else {
7450 make_name(new_from_name, new_from_name_len, from_name, secondary_name);
7451 }
7452 if (!is_delete) {
7453 if (is_key) {
7454 sprintf(dict_name, "key-%s", secondary_name);
7455 make_name(new_to_name, new_to_name_len, to_name, dict_name);
7456 } else {
7457 make_name(new_to_name, new_to_name_len, to_name, secondary_name);
7458 }
7459 }
7460
7461 if (is_delete) {
7462 error = db_env->dbremove(db_env, txn, new_from_name, NULL, 0);
7463 } else {
7464 error = db_env->dbrename(
7465 db_env,
7466 txn,
7467 new_from_name,
7468 NULL,
7469 new_to_name,
7470 0);
7471 }
7472 if (error) {
7473 goto cleanup;
7474 }
7475
7476 cleanup:
7477 tokudb::memory::free(new_from_name);
7478 tokudb::memory::free(new_to_name);
7479 return error;
7480 }
7481
7482
//
// deletes or renames a table. if is_delete is true, then we delete, and to_name can be NULL
// if is_delete is false, then to_name must be non-NULL, as we are renaming the table.
//
int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete) {
    THD *thd = ha_thd();
    int error;
    DB* status_db = NULL;
    DBC* status_cursor = NULL;
    DB_TXN* txn = NULL;
    DBT curr_key;
    DBT curr_val;
    memset(&curr_key, 0, sizeof(curr_key));
    memset(&curr_val, 0, sizeof(curr_val));

    // when running inside CREATE TABLE, nest under the statement's
    // transaction so the whole DDL commits or aborts together
    DB_TXN *parent_txn = NULL;
    tokudb_trx_data *trx = NULL;
    trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
        parent_txn = trx->sub_sp_level;
    }

    error = txn_begin(db_env, parent_txn, &txn, 0, thd);
    if (error) { goto cleanup; }

    //
    // open status db,
    // create cursor,
    // for each name read out of there, create a db and delete or rename it
    //
    error = open_status_dictionary(&status_db, from_name, txn);
    if (error) { goto cleanup; }

    error = status_db->cursor(status_db, txn, &status_cursor, 0);
    if (error) { goto cleanup; }
    // let KILL interrupt the status-dictionary scan
    status_cursor->c_set_check_interrupt_callback(status_cursor, tokudb_killed_thd_callback, thd);

    // every hatoku_key_name entry in the status dictionary names a
    // secondary-index dictionary that must be deleted/renamed as well
    while (error != DB_NOTFOUND) {
        error = status_cursor->c_get(status_cursor, &curr_key, &curr_val, DB_NEXT);
        if (error && error != DB_NOTFOUND) {
            error = map_to_handler_error(error);
            goto cleanup;
        }
        if (error == DB_NOTFOUND) {
            break;
        }
        HA_METADATA_KEY mk = *(HA_METADATA_KEY *)curr_key.data;
        if (mk != hatoku_key_name) {
            continue;
        }
        // the key name is stored immediately after the HA_METADATA_KEY tag
        error = delete_or_rename_dictionary(from_name, to_name, (char *)((char *)curr_key.data + sizeof(HA_METADATA_KEY)), true, txn, is_delete);
        if (error) { goto cleanup; }
    }

    //
    // delete or rename main.tokudb
    //
    error = delete_or_rename_dictionary(from_name, to_name, "main", false, txn, is_delete);
    if (error) { goto cleanup; }

    // close the cursor and the status db handle before operating on
    // status.tokudb itself
    error = status_cursor->c_close(status_cursor);
    assert_always(error==0);
    status_cursor = NULL;
    if (error) { goto cleanup; }

    error = status_db->close(status_db, 0);
    assert_always(error == 0);
    status_db = NULL;

    //
    // delete or rename status.tokudb
    //
    error = delete_or_rename_dictionary(from_name, to_name, "status", false, txn, is_delete);
    if (error) { goto cleanup; }

    set_my_errno(error);
cleanup:
    if (status_cursor) {
        int r = status_cursor->c_close(status_cursor);
        assert_always(r==0);
    }
    if (status_db) {
        int r = status_db->close(status_db, 0);
        assert_always(r==0);
    }
    if (txn) {
        // commit on success, abort on any error so the DDL stays atomic
        if (error) {
            abort_txn(txn);
        }
        else {
            commit_txn(txn, 0);
        }
    }
    return error;
}
7578
delete_non_partitioned_table(const char * name)7579 int ha_tokudb::delete_non_partitioned_table(const char* name) {
7580 TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7581 TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(name, NULL, false);
7582 if (share) {
7583 share->unlock();
7584 share->release();
7585 // this should be enough to handle locking as the higher level MDL
7586 // on this table should prevent any new analyze tasks.
7587 share->cancel_background_jobs();
7588 TOKUDB_SHARE::drop_share(share);
7589 }
7590
7591 int error;
7592 error = delete_or_rename_table(name, NULL, true);
7593 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7594 error == DB_LOCK_NOTGRANTED) {
7595 sql_print_error(
7596 "Could not delete table %s because another transaction has "
7597 "accessed the table. To drop the table, make sure no "
7598 "transactions touch the table.",
7599 name);
7600 }
7601 TOKUDB_HANDLER_DBUG_RETURN(error);
7602 }
7603
delete_rename_partitioned_table(const char * from,const char * to,const std::string & partition_info_str)7604 int ha_tokudb::delete_rename_partitioned_table(
7605 const char* from,
7606 const char* to,
7607 const std::string& partition_info_str) {
7608 THD* thd = ha_thd();
7609 assert(thd);
7610 MEM_ROOT* mem_root = thd->mem_root;
7611
7612 partition_info* part_info =
7613 native_part::parse_partition_info(ha_thd(), partition_info_str);
7614 ha_tokupart file(tokudb_hton, nullptr);
7615 if (file.init_partitioning(mem_root))
7616 return HA_ERR_CANNOT_INITIALIZE_PARTITIONING;
7617
7618 file.set_part_info(part_info, false);
7619 if (file.initialize_partition(mem_root))
7620 return HA_ERR_CANNOT_INITIALIZE_PARTITIONING;
7621
7622 if (to)
7623 return file.rename_table(from, to);
7624
7625 return file.delete_table(from);
7626 }
7627
7628 //
7629 // Drops table
7630 // Parameters:
7631 // [in] name - name of table to be deleted
7632 // Returns:
7633 // 0 on success
7634 // error otherwise
7635 //
delete_table(const char * name)7636 int ha_tokudb::delete_table(const char* name) {
7637 assert(name);
7638 std::string partition_info_str;
7639 if (!native_part::get_part_str_for_table(name, partition_info_str))
7640 return HA_ERR_TABLE_CORRUPT;
7641 if (partition_info_str.empty())
7642 return delete_non_partitioned_table(name);
7643 return delete_rename_partitioned_table(name, nullptr, partition_info_str);
7644 }
7645
tokudb_check_db_dir_exist_from_table_name(const char * table_name)7646 static bool tokudb_check_db_dir_exist_from_table_name(const char* table_name) {
7647 assert(table_name);
7648 bool mysql_dir_exists;
7649 char db_name[FN_REFLEN];
7650 const char *db_name_begin = strchr(table_name, FN_LIBCHAR);
7651 const char *db_name_end = strrchr(table_name, FN_LIBCHAR);
7652 assert(db_name_begin);
7653 assert(db_name_end);
7654 assert(db_name_begin != db_name_end);
7655
7656 ++db_name_begin;
7657 size_t db_name_size = db_name_end - db_name_begin;
7658
7659 assert(db_name_size < FN_REFLEN);
7660
7661 memcpy(db_name, db_name_begin, db_name_size);
7662 db_name[db_name_size] = '\0';
7663
7664 // At this point, db_name contains the MySQL formatted database name.
7665 // This is exactly the same format that would come into us through a
7666 // CREATE TABLE. Some charaters (like ':' for example) might be expanded
7667 // into hex (':' would papear as "@003a").
7668 // We need to check that the MySQL destination database directory exists.
7669 mysql_dir_exists = (my_access(db_name, F_OK) == 0);
7670
7671 return mysql_dir_exists;
7672 }
7673
rename_non_partitioned_table(const char * from,const char * to)7674 int ha_tokudb::rename_non_partitioned_table(const char* from, const char* to) {
7675 TOKUDB_HANDLER_DBUG_ENTER("%s %s", from, to);
7676 TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(from, NULL, false);
7677 if (share) {
7678 share->unlock();
7679 share->release();
7680 // this should be enough to handle locking as the higher level MDL
7681 // on this table should prevent any new analyze tasks.
7682 share->cancel_background_jobs();
7683 TOKUDB_SHARE::drop_share(share);
7684 }
7685 int error;
7686 bool to_db_dir_exist = tokudb_check_db_dir_exist_from_table_name(to);
7687 if (!to_db_dir_exist) {
7688 sql_print_error(
7689 "Could not rename table from %s to %s because "
7690 "destination db does not exist",
7691 from,
7692 to);
7693 error = HA_ERR_DEST_SCHEMA_NOT_EXIST;
7694 }
7695 else {
7696 error = delete_or_rename_table(from, to, false);
7697 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7698 error == DB_LOCK_NOTGRANTED) {
7699 sql_print_error(
7700 "Could not rename table from %s to %s because another transaction "
7701 "has accessed the table. To rename the table, make sure no "
7702 "transactions touch the table.",
7703 from,
7704 to);
7705 }
7706 }
7707 TOKUDB_HANDLER_DBUG_RETURN(error);
7708 }
7709
7710 //
7711 // renames table from "from" to "to"
7712 // Parameters:
7713 // [in] name - old name of table
7714 // [in] to - new name of table
7715 // Returns:
7716 // 0 on success
7717 // error otherwise
7718 //
rename_table(const char * from,const char * to)7719 int ha_tokudb::rename_table(const char* from, const char* to) {
7720 assert(from);
7721 assert(to);
7722 std::string partition_info_str;
7723 if (!native_part::get_part_str_for_table(from, partition_info_str))
7724 return DB_NOTFOUND; // TODO: set correct error code here
7725 if (partition_info_str.empty())
7726 return rename_non_partitioned_table(from, to);
7727 return delete_rename_partitioned_table(from, to, partition_info_str);
7728 }
7729
7730 /*
7731 Returns estimate on number of seeks it will take to read through the table
7732 This is to be comparable to the number returned by records_in_range so
7733 that we can decide if we should scan the table or use keys.
7734 */
7735 /// QQQ why divide by 3
scan_time()7736 double ha_tokudb::scan_time() {
7737 TOKUDB_HANDLER_DBUG_ENTER("");
7738 double ret_val = (double)stats.records / 3;
7739 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7740 TOKUDB_DEBUG_RETURN,
7741 "return %" PRIu64 " %f",
7742 (uint64_t)stats.records,
7743 ret_val);
7744 DBUG_RETURN(ret_val);
7745 }
7746
keyread_time(uint index,uint ranges,ha_rows rows)7747 double ha_tokudb::keyread_time(uint index, uint ranges, ha_rows rows)
7748 {
7749 TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7750 double ret_val;
7751 if (index == primary_key || key_is_clustering(&table->key_info[index])) {
7752 ret_val = read_time(index, ranges, rows);
7753 DBUG_RETURN(ret_val);
7754 }
7755 /*
7756 It is assumed that we will read trough the whole key range and that all
7757 key blocks are half full (normally things are much better). It is also
7758 assumed that each time we read the next key from the index, the handler
7759 performs a random seek, thus the cost is proportional to the number of
7760 blocks read. This model does not take into account clustered indexes -
7761 engines that support that (e.g. InnoDB) may want to overwrite this method.
7762 */
7763 double keys_per_block= (stats.block_size/2.0/
7764 (table->key_info[index].key_length +
7765 ref_length) + 1);
7766 ret_val = (rows + keys_per_block - 1)/ keys_per_block;
7767 TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7768 }
7769
7770 //
7771 // Calculate the time it takes to read a set of ranges through an index
7772 // This enables us to optimize reads for clustered indexes.
7773 // Implementation pulled from InnoDB
7774 // Parameters:
7775 // index - index to use
7776 // ranges - number of ranges
7777 // rows - estimated number of rows in the range
7778 // Returns:
7779 // estimated time measured in disk seeks
7780 //
read_time(uint index,uint ranges,ha_rows rows)7781 double ha_tokudb::read_time(
7782 uint index,
7783 uint ranges,
7784 ha_rows rows
7785 )
7786 {
7787 TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7788 double total_scan;
7789 double ret_val;
7790 bool is_primary = (index == primary_key);
7791 bool is_clustering;
7792
7793 //
7794 // in case for hidden primary key, this is called
7795 //
7796 if (index >= table_share->keys) {
7797 ret_val = handler::read_time(index, ranges, rows);
7798 goto cleanup;
7799 }
7800
7801 is_clustering = key_is_clustering(&table->key_info[index]);
7802
7803
7804 //
7805 // if it is not the primary key, and it is not a clustering key, then return handler::read_time
7806 //
7807 if (!(is_primary || is_clustering)) {
7808 ret_val = handler::read_time(index, ranges, rows);
7809 goto cleanup;
7810 }
7811
7812 //
7813 // for primary key and for clustered keys, return a fraction of scan_time()
7814 //
7815 total_scan = scan_time();
7816
7817 if (stats.records < rows) {
7818 ret_val = is_clustering ? total_scan + 0.00001 : total_scan;
7819 goto cleanup;
7820 }
7821
7822 //
7823 // one disk seek per range plus the proportional scan time of the rows
7824 //
7825 ret_val = (ranges + (double) rows / (double) stats.records * total_scan);
7826 ret_val = is_clustering ? ret_val + 0.00001 : ret_val;
7827
7828 cleanup:
7829 TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7830 }
7831
index_only_read_time(uint keynr,double records)7832 double ha_tokudb::index_only_read_time(uint keynr, double records) {
7833 TOKUDB_HANDLER_DBUG_ENTER("%u %f", keynr, records);
7834 double ret_val = keyread_time(keynr, 1, (ha_rows)records);
7835 TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7836 }
7837
//
// Estimates the number of index records in a range. In case of errors, return
//   HA_TOKUDB_RANGE_COUNT instead of HA_POS_ERROR. This was behavior
//   when we got the handlerton from MySQL.
// Parameters:
//              keynr -index to use
//      [in]    start_key - low end of the range
//      [in]    end_key - high end of the range
// Returns:
//      0 - There are no matching keys in the given range
//      number > 0 - There are approximately number matching rows in the range
//      HA_POS_ERROR - Something is wrong with the index tree
//
ha_rows ha_tokudb::records_in_range(uint keynr, key_range* start_key, key_range* end_key) {
    TOKUDB_HANDLER_DBUG_ENTER("%d %p %p", keynr, start_key, end_key);
    DBT *pleft_key, *pright_key;
    DBT left_key, right_key;
    ha_rows ret_val = HA_TOKUDB_RANGE_COUNT;
    DB *kfile = share->key_file[keynr];
    uint64_t rows = 0;
    int error;

    // get start_rows and end_rows values so that we can estimate range
    // when calling key_range64, the only value we can trust is the value for less
    // The reason is that the key being passed in may be a prefix of keys in the DB
    // As a result, equal may be 0 and greater may actually be equal+greater
    // So, we call key_range64 on the key, and the key that is after it.
    if (!start_key && !end_key) {
        // unbounded range: estimate the whole index
        error = estimate_num_rows(share->file, &rows, transaction);
        if (error) {
            ret_val = HA_TOKUDB_RANGE_COUNT;
            goto cleanup;
        }
        ret_val = (rows <= 1) ? 1 : rows;
        goto cleanup;
    }
    if (start_key) {
        // inclusive start keys are packed with negative infinity so prefix
        // matches are counted; otherwise pack with positive infinity
        uchar inf_byte = (start_key->flag == HA_READ_KEY_EXACT) ? COL_NEG_INF : COL_POS_INF;
        pack_key(&left_key, keynr, key_buff, start_key->key, start_key->length, inf_byte);
        pleft_key = &left_key;
    } else {
        pleft_key = NULL;
    }
    if (end_key) {
        // exclusive end keys are packed with negative infinity
        uchar inf_byte = (end_key->flag == HA_READ_BEFORE_KEY) ? COL_NEG_INF : COL_POS_INF;
        pack_key(&right_key, keynr, key_buff2, end_key->key, end_key->length, inf_byte);
        pright_key = &right_key;
    } else {
        pright_key = NULL;
    }
    // keys_range64 can not handle a degenerate range (left_key > right_key), so we filter here
    if (pleft_key && pright_key && tokudb_cmp_dbt_key(kfile, pleft_key, pright_key) > 0) {
        rows = 0;
    } else {
        uint64_t less, equal1, middle, equal2, greater;
        bool is_exact;
        error = kfile->keys_range64(kfile, transaction, pleft_key, pright_key,
                                    &less, &equal1, &middle, &equal2, &greater, &is_exact);
        if (error) {
            ret_val = HA_TOKUDB_RANGE_COUNT;
            goto cleanup;
        }
        // "middle" is the estimated count strictly between the two bounds
        rows = middle;
    }

    // MySQL thinks a return value of 0 means there are exactly 0 rows
    // Therefore, always return non-zero so this assumption is not made
    ret_val = (ha_rows) (rows <= 1 ? 1 : rows);

cleanup:
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_RETURN,
        "return %" PRIu64 " %" PRIu64,
        (uint64_t)ret_val,
        rows);
    DBUG_RETURN(ret_val);
}
7915
7916
//
// Initializes the auto-increment data in the local "share" object to the
// greater of two values: what's stored in the metadata or the last inserted
// auto-increment field (if auto-increment field is the first field of a key).
//
void ha_tokudb::init_auto_increment() {
    int error;
    DB_TXN* txn = NULL;

    error = txn_begin(db_env, 0, &txn, 0, ha_thd());
    if (error) {
        // best-effort: without a transaction we simply start from zero
        share->last_auto_increment = 0;
    } else {
        HA_METADATA_KEY key_val;
        DBT key;
        memset(&key, 0, sizeof(key));
        key.data = &key_val;
        key.size = sizeof(key_val);
        DBT value;
        memset(&value, 0, sizeof(value));
        // DB_DBT_USERMEM: the get() below writes directly into the share's
        // fields via value.data
        value.flags = DB_DBT_USERMEM;

        // Retrieve the initial auto increment value, as specified by create table
        // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
        // then the value 100 should be stored here
        key_val = hatoku_ai_create_value;
        value.ulen = sizeof(share->auto_inc_create_value);
        value.data = &share->auto_inc_create_value;
        error = share->status_block->get(share->status_block, txn, &key, &value, 0);

        // missing or malformed entry => treat as "no AUTO_INCREMENT option"
        if (error || value.size != sizeof(share->auto_inc_create_value)) {
            share->auto_inc_create_value = 0;
        }

        // Retrieve hatoku_max_ai, which is max value used by auto increment
        // column so far, the max value could have been auto generated (e.g. insert (NULL))
        // or it could have been manually inserted by user (e.g. insert (345))
        key_val = hatoku_max_ai;
        value.ulen = sizeof(share->last_auto_increment);
        value.data = &share->last_auto_increment;
        error = share->status_block->get(share->status_block, txn, &key, &value, 0);

        if (error || value.size != sizeof(share->last_auto_increment)) {
            // no rows inserted yet: start just below the CREATE TABLE value
            // so the first generated value equals auto_inc_create_value
            if (share->auto_inc_create_value)
                share->last_auto_increment = share->auto_inc_create_value - 1;
            else
                share->last_auto_increment = 0;
        }

        commit_txn(txn, 0);
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_AUTO_INCREMENT,
        "init auto increment:%lld",
        share->last_auto_increment);
}
7973
//
// Reserves nb_desired_values auto-increment values (stepping by increment)
// for the caller, returning the first reserved value in *first_value and the
// count actually reserved in *nb_reserved_values. The share lock serializes
// reservations across handlers on the same table.
//
void ha_tokudb::get_auto_increment(
    ulonglong offset,
    ulonglong increment,
    ulonglong nb_desired_values,
    ulonglong* first_value,
    ulonglong* nb_reserved_values) {

    TOKUDB_HANDLER_DBUG_ENTER("");
    ulonglong nr;
    bool over;

    share->lock();

    if (share->auto_inc_create_value > share->last_auto_increment) {
        // the table's AUTO_INCREMENT=N option has not been consumed yet;
        // hand out N first
        nr = share->auto_inc_create_value;
        over = false;
        share->last_auto_increment = share->auto_inc_create_value;
    } else {
        nr = share->last_auto_increment + increment;
        // unsigned wraparound check; saturate at ULLONG_MAX on overflow
        over = nr < share->last_auto_increment;
        if (over)
            nr = ULLONG_MAX;
    }
    if (!over) {
        share->last_auto_increment = nr + (nb_desired_values - 1)*increment;
        if (delay_updating_ai_metadata) {
            // defer the status-dictionary write (e.g. during bulk loads);
            // the flag triggers a single update later
            ai_metadata_update_required = true;
        } else {
            update_max_auto_inc(
                share->status_block,
                share->last_auto_increment);
        }
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_AUTO_INCREMENT,
        "get_auto_increment(%lld,%lld,%lld): got:%lld:%lld",
        offset,
        increment,
        nb_desired_values,
        nr,
        nb_desired_values);
    *first_value = nr;
    *nb_reserved_values = nb_desired_values;
    share->unlock();
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
8020
// Report whether OPTIMIZE TABLE must block other operations on this table.
// Always false for TokuDB.
bool ha_tokudb::is_optimize_blocking() {
    return false;
}
8024
// Report whether only a single auto-increment value may be reserved at a
// time. Always false for TokuDB.
bool ha_tokudb::is_auto_inc_singleton(){
    return false;
}
8028
8029
8030 // Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
8031 // With a transaction, drops dictionaries associated with indexes in key_num
8032 //
8033 //
8034 // Adds indexes to the table. Takes the array of KEY passed in key_info, and creates
8035 // DB's that will go at the end of share->key_file. THE IMPLICIT ASSUMPTION HERE is
8036 // that the table will be modified and that these added keys will be appended to the end
8037 // of the array table->key_info
8038 // Parameters:
8039 // [in] table_arg - table that is being modified, seems to be identical to this->table
8040 // [in] key_info - array of KEY's to be added
8041 // num_of_keys - number of keys to be added, number of elements in key_info
8042 // Returns:
8043 // 0 on success, error otherwise
8044 //
int ha_tokudb::tokudb_add_index(
    TABLE* table_arg,
    KEY* key_info,
    uint num_of_keys,
    DB_TXN* txn,
    bool* inc_num_DBs,
    bool* modified_DBs) {

    TOKUDB_HANDLER_DBUG_ENTER("");
    // A transaction is mandatory: dictionary creation and population below
    // must be atomic with respect to the DDL statement.
    assert_always(txn);

    int error;
    uint curr_index = 0;
    DBC* tmp_cursor = NULL;
    int cursor_ret_val = 0;
    DBT curr_pk_key, curr_pk_val;
    THD* thd = ha_thd();
    DB_LOADER* loader = NULL;
    DB_INDEXER* indexer = NULL;
    bool loader_save_space = tokudb::sysvars::load_save_space(thd);
    // a hot (online) build is only attempted when the table lock still
    // allows concurrent writes
    bool use_hot_index = (lock.type == TL_WRITE_ALLOW_WRITE);
    uint32_t loader_flags = loader_save_space ? LOADER_COMPRESS_INTERMEDIATES : 0;
    uint32_t indexer_flags = 0;
    uint32_t mult_db_flags[MAX_KEY + 1] = {0};
    uint32_t mult_put_flags[MAX_KEY + 1];
    uint32_t mult_dbt_flags[MAX_KEY + 1];
    bool creating_hot_index = false;
    struct loader_context lc;
    memset(&lc, 0, sizeof lc);
    lc.thd = thd;
    lc.ha = this;
    // loader_error collects errors reported asynchronously by the
    // loader/indexer callbacks; it is merged with `error` at the end
    loader_error = 0;
    bool rw_lock_taken = false;
    *inc_num_DBs = false;
    *modified_DBs = false;
    invalidate_bulk_fetch();
    unpack_entire_row = true; // for bulk fetching rows
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    //
    // number of DB files we have open currently, before add_index is executed
    //
    uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);

    //
    // get the row type to use for the indexes we're adding
    //
    toku_compression_method compression_method =
        get_compression_method(share->file);

    //
    // status message to be shown in "show process list"
    //
    const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
    // buffer of 200 should be a good upper bound.
    char status_msg[MAX_ALIAS_NAME + 200];
    // variable that stores number of elements inserted thus far
    ulonglong num_processed = 0;
    thd_proc_info(thd, "Adding indexes");

    //
    // in unpack_row, MySQL passes a buffer that is this long,
    // so this length should be good enough for us as well
    //
    memset((void *) &curr_pk_key, 0, sizeof(curr_pk_key));
    memset((void *) &curr_pk_val, 0, sizeof(curr_pk_val));

    //
    // The files for secondary tables are derived from the name of keys
    // If we try to add a key with the same name as an already existing key,
    // We can crash. So here we check if any of the keys added has the same
    // name of an existing key, and if so, we fail gracefully
    //
    for (uint i = 0; i < num_of_keys; i++) {
        for (uint j = 0; j < table_arg->s->keys; j++) {
            if (strcmp(key_info[i].name, table_arg->s->key_info[j].name) == 0) {
                error = HA_ERR_WRONG_COMMAND;
                goto cleanup;
            }
        }
    }

    rwlock_t_lock_write(share->_num_DBs_lock);
    rw_lock_taken = true;
    //
    // open all the DB files and set the appropriate variables in share
    // they go to the end of share->key_file
    //
    // a hot build requires exactly one new, non-unique key
    creating_hot_index =
        use_hot_index && num_of_keys == 1 &&
        (key_info[0].flags & HA_NOSAME) == 0;
    if (use_hot_index && (share->num_DBs > curr_num_DBs)) {
        //
        // already have hot index in progress, get out
        //
        error = HA_ERR_INTERNAL_ERROR;
        goto cleanup;
    }
    curr_index = curr_num_DBs;
    *modified_DBs = true;
    for (uint i = 0; i < num_of_keys; i++, curr_index++) {
        if (key_is_clustering(&key_info[i])) {
            // clustering keys carry row data, so they need key filters for
            // both the new key and the primary key, plus column pack info
            set_key_filter(
                &share->kc_info.key_filters[curr_index],
                &key_info[i],
                table_arg,
                false);
            if (!hidden_primary_key) {
                set_key_filter(
                    &share->kc_info.key_filters[curr_index],
                    &table_arg->key_info[primary_key],
                    table_arg,
                    false);
            }

            error = initialize_col_pack_info(
                &share->kc_info,
                table_arg->s,
                curr_index);
            if (error) {
                goto cleanup;
            }
        }


        error = create_secondary_dictionary(
            share->full_table_name(),
            table_arg,
            &key_info[i],
            txn,
            &share->kc_info,
            curr_index,
            creating_hot_index,
            compression_method);
        if (error) {
            goto cleanup;
        }

        error = open_secondary_dictionary(
            &share->key_file[curr_index],
            &key_info[i],
            share->full_table_name(),
            false,
            txn);
        if (error) {
            goto cleanup;
        }
    }

    if (creating_hot_index) {
        // online path: the indexer builds the new dictionary while
        // concurrent writes continue on the table
        share->num_DBs++;
        *inc_num_DBs = true;
        error = db_env->create_indexer(
            db_env,
            txn,
            &indexer,
            share->file,
            num_of_keys,
            &share->key_file[curr_num_DBs],
            mult_db_flags,
            indexer_flags);
        if (error) {
            goto cleanup;
        }

        error = indexer->set_poll_function(
            indexer, ha_tokudb::tokudb_add_index_poll, &lc);
        if (error) {
            goto cleanup;
        }

        error = indexer->set_error_callback(
            indexer, ha_tokudb::loader_add_index_err, &lc);
        if (error) {
            goto cleanup;
        }

        share->_num_DBs_lock.unlock();
        rw_lock_taken = false;

#ifdef HA_TOKUDB_HAS_THD_PROGRESS
        // initialize a one phase progress report.
        // incremental reports are done in the indexer's callback function.
        thd_progress_init(thd, 1);
#endif

        error = indexer->build(indexer);

        if (error) {
            goto cleanup;
        }

        // indexer close is serialized against writers via _num_DBs_lock
        rwlock_t_lock_write(share->_num_DBs_lock);
        error = indexer->close(indexer);
        share->_num_DBs_lock.unlock();
        if (error) {
            goto cleanup;
        }
        indexer = NULL;
    } else {
        // offline path: scan the whole primary dictionary and feed every
        // row into a bulk loader for the new dictionaries
        assert(table->mdl_ticket->get_type() >= MDL_SHARED_NO_WRITE);
        share->_num_DBs_lock.unlock();
        rw_lock_taken = false;
        prelocked_right_range_size = 0;
        prelocked_left_range_size = 0;
        struct smart_dbt_bf_info bf_info;
        bf_info.ha = this;
        // you need the val if you have a clustering index and key_read is not 0;
        bf_info.direction = 1;
        bf_info.thd = ha_thd();
        bf_info.need_val = true;
        bf_info.key_to_compare = NULL;

        error = db_env->create_loader(
            db_env,
            txn,
            &loader,
            NULL, // no src_db needed
            num_of_keys,
            &share->key_file[curr_num_DBs],
            mult_put_flags,
            mult_dbt_flags,
            loader_flags);
        if (error) {
            goto cleanup;
        }

        error =
            loader->set_poll_function(loader, ha_tokudb::bulk_insert_poll, &lc);
        if (error) {
            goto cleanup;
        }

        error = loader->set_error_callback(
            loader, ha_tokudb::loader_add_index_err, &lc);
        if (error) {
            goto cleanup;
        }
        //
        // scan primary table, create each secondary key, add to each DB
        //
        error = share->file->cursor(
            share->file,
            txn,
            &tmp_cursor,
            DB_SERIALIZABLE);
        if (error) {
            tmp_cursor = NULL; // Safety
            goto cleanup;
        }

        //
        // grab some locks to make this go faster
        // first a global read lock on the main DB, because
        // we intend to scan the entire thing
        //
        error = tmp_cursor->c_set_bounds(
            tmp_cursor,
            share->file->dbt_neg_infty(),
            share->file->dbt_pos_infty(),
            true,
            0);
        if (error) {
            goto cleanup;
        }

        // set the bulk fetch iteration to its max so that adding an
        // index fills the bulk fetch buffer every time. we do not
        // want it to grow exponentially fast.
        rows_fetched_using_bulk_fetch = 0;
        bulk_fetch_iteration = HA_TOKU_BULK_FETCH_ITERATION_MAX;
        cursor_ret_val = tmp_cursor->c_getf_next(
            tmp_cursor,
            DB_PRELOCKED,
            smart_dbt_bf_callback,
            &bf_info);

#ifdef HA_TOKUDB_HAS_THD_PROGRESS
        // initialize a two phase progress report.
        // first phase: putting rows into the loader
        thd_progress_init(thd, 2);
#endif

        // drain the bulk-fetch buffer, refilling it from the cursor as
        // needed, until the entire primary dictionary has been consumed
        while (cursor_ret_val != DB_NOTFOUND ||
               ((bytes_used_in_range_query_buff -
                 curr_range_query_buff_offset) > 0)) {
            if ((bytes_used_in_range_query_buff -
                 curr_range_query_buff_offset) == 0) {
                invalidate_bulk_fetch(); // reset the buffers
                cursor_ret_val = tmp_cursor->c_getf_next(
                    tmp_cursor,
                    DB_PRELOCKED,
                    smart_dbt_bf_callback,
                    &bf_info);
                if (cursor_ret_val != DB_NOTFOUND && cursor_ret_val != 0) {
                    error = cursor_ret_val;
                    goto cleanup;
                }
            }
            // do this check in case the the c_getf_next did not put anything
            // into the buffer because there was no more data
            if ((bytes_used_in_range_query_buff -
                 curr_range_query_buff_offset) == 0) {
                break;
            }
            // at this point, we know the range query buffer has at least one
            // key/val pair
            uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;

            // each buffered entry is laid out as
            // <4-byte key size><key bytes><4-byte val size><val bytes>
            uint32_t key_size = *(uint32_t *)curr_pos;
            curr_pos += sizeof(key_size);
            uchar* curr_key_buff = curr_pos;
            curr_pos += key_size;
            curr_pk_key.data = curr_key_buff;
            curr_pk_key.size = key_size;

            uint32_t val_size = *(uint32_t *)curr_pos;
            curr_pos += sizeof(val_size);
            uchar* curr_val_buff = curr_pos;
            curr_pos += val_size;
            curr_pk_val.data = curr_val_buff;
            curr_pk_val.size = val_size;

            curr_range_query_buff_offset = curr_pos - range_query_buff;

            error = loader->put(loader, &curr_pk_key, &curr_pk_val);
            if (error) {
                goto cleanup;
            }

            num_processed++;

            // periodically refresh the status message and honor KILL
            if ((num_processed % 1000) == 0) {
                sprintf(
                    status_msg,
                    "Adding indexes: Fetched %llu of about %llu rows, loading "
                    "of data still remains.",
                    num_processed,
                    (long long unsigned)share->row_count());
                thd_proc_info(thd, status_msg);

#ifdef HA_TOKUDB_HAS_THD_PROGRESS
                thd_progress_report(
                    thd,
                    num_processed,
                    (long long unsigned)share->rows);
#endif

                if (thd_killed(thd)) {
                    error = ER_ABORTING_CONNECTION;
                    goto cleanup;
                }
            }
        }
        error = tmp_cursor->c_close(tmp_cursor);
        assert_always(error==0);
        tmp_cursor = NULL;

#ifdef HA_TOKUDB_HAS_THD_PROGRESS
        // next progress report phase: closing the loader.
        // incremental reports are done in the loader's callback function.
        thd_progress_next_stage(thd);
#endif

        error = loader->close(loader);
        loader = NULL;

        if (error) goto cleanup;
    }
    curr_index = curr_num_DBs;
    // verify uniqueness constraints for any newly added unique keys
    for (uint i = 0; i < num_of_keys; i++, curr_index++) {
        if (key_info[i].flags & HA_NOSAME) {
            bool is_unique;
            error = is_index_unique(
                &is_unique,
                txn,
                share->key_file[curr_index],
                &key_info[i],
                creating_hot_index ? 0 : DB_PRELOCKED_WRITE);
            if (error)
                goto cleanup;
            if (!is_unique) {
                error = HA_ERR_FOUND_DUPP_KEY;
                last_dup_key = i;
                goto cleanup;
            }
        }
    }

    share->lock();
    //
    // We have an accurate row count, might as well update share->rows
    //
    if(!creating_hot_index) {
        share->set_row_count(num_processed, true);
    }
    //
    // now write stuff to status.tokudb
    //
    for (uint i = 0; i < num_of_keys; i++) {
        write_key_name_to_status(share->status_block, key_info[i].name, txn);
    }
    share->unlock();

    error = 0;
cleanup:
#ifdef HA_TOKUDB_HAS_THD_PROGRESS
    thd_progress_end(thd);
#endif
    if (rw_lock_taken) {
        share->_num_DBs_lock.unlock();
        rw_lock_taken = false;
    }
    if (tmp_cursor) {
        int r = tmp_cursor->c_close(tmp_cursor);
        assert_always(r==0);
        tmp_cursor = NULL;
    }
    if (loader != NULL) {
        sprintf(status_msg, "aborting creation of indexes.");
        thd_proc_info(thd, status_msg);
        loader->abort(loader);
    }
    if (indexer != NULL) {
        sprintf(status_msg, "aborting creation of indexes.");
        thd_proc_info(thd, status_msg);
        // indexer abort is serialized the same way as indexer close
        rwlock_t_lock_write(share->_num_DBs_lock);
        indexer->abort(indexer);
        share->_num_DBs_lock.unlock();
    }
    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not add indexes to table %s because another transaction has "
            "accessed the table. To add indexes, make sure no transactions "
            "touch the table.",
            share->full_table_name());
    }
    thd_proc_info(thd, orig_proc_info);
    TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
}
tokudb_add_index_poll(void * extra,float progress)8488 int ha_tokudb::tokudb_add_index_poll(void* extra, float progress) {
8489 LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
8490 if (thd_killed(context->thd)) {
8491 snprintf(context->write_status_msg,
8492 sizeof(context->write_status_msg),
8493 "The process has been killed, aborting add index.");
8494 return ER_ABORTING_CONNECTION;
8495 }
8496 float percentage = progress * 100;
8497 snprintf(context->write_status_msg,
8498 sizeof(context->write_status_msg),
8499 "Adding of indexes to %s about %.1f%% done",
8500 context->ha->share->full_table_name(),
8501 percentage);
8502 thd_proc_info(context->thd, context->write_status_msg);
8503 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8504 thd_progress_report(context->thd, (unsigned long long)percentage, 100);
8505 #endif
8506 return 0;
8507 }
8508
8509 //
8510 // Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
8511 // Closes added indexes in case of error in error path of add_index and alter_table_phase2
8512 //
void ha_tokudb::restore_add_index(
    TABLE* table_arg,
    uint num_of_keys,
    bool incremented_numDBs,
    bool modified_DBs) {

    // index of the first dictionary slot used by the failed add_index
    uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
    uint curr_index = 0;

    //
    // need to restore num_DBs, and we have to do it before we close the dictionaries
    // so that there is not a window
    //
    if (incremented_numDBs) {
        rwlock_t_lock_write(share->_num_DBs_lock);
        share->num_DBs--;
    }
    if (modified_DBs) {
        // undo the kc_info changes made for the new keys
        curr_index = curr_num_DBs;
        for (uint i = 0; i < num_of_keys; i++, curr_index++) {
            reset_key_and_col_info(&share->kc_info, curr_index);
        }
        // close any dictionaries that were opened for the new keys
        curr_index = curr_num_DBs;
        for (uint i = 0; i < num_of_keys; i++, curr_index++) {
            if (share->key_file[curr_index]) {
                int r = share->key_file[curr_index]->close(
                    share->key_file[curr_index],
                    0);
                assert_always(r==0);
                share->key_file[curr_index] = NULL;
            }
        }
    }
    if (incremented_numDBs) {
        // release the lock taken above when num_DBs was decremented
        share->_num_DBs_lock.unlock();
    }
}
8550
8551 //
8552 // Internal function called by ha_tokudb::prepare_drop_index and ha_tokudb::alter_table_phase2
8553 // With a transaction, drops dictionaries associated with indexes in key_num
8554 //
int ha_tokudb::drop_indexes(uint* key_num,
                            uint num_of_keys,
                            KEY* key_info,
                            DB_TXN* txn) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(txn);

    int error = 0;
    // first acquire fileops locks on every dictionary being dropped, so
    // the drop either proceeds as a whole or fails before anything closes
    for (uint i = 0; i < num_of_keys; i++) {
        uint curr_index = key_num[i];
        error = share->key_file[curr_index]->pre_acquire_fileops_lock(
            share->key_file[curr_index],
            txn);
        if (error != 0) {
            goto cleanup;
        }
    }
    // now close each dictionary, remove its key name from the status
    // dictionary, and delete the underlying dictionary file
    for (uint i = 0; i < num_of_keys; i++) {
        uint curr_index = key_num[i];
        int r = share->key_file[curr_index]->close(share->key_file[curr_index],0);
        assert_always(r==0);
        share->key_file[curr_index] = NULL;

        error = remove_key_name_from_status(
            share->status_block,
            key_info[curr_index].name,
            txn);
        if (error) {
            goto cleanup;
        }

        error = delete_or_rename_dictionary(
            share->full_table_name(),
            NULL,
            key_info[curr_index].name,
            true,
            txn,
            true);
        if (error) {
            goto cleanup;
        }
    }

cleanup:
    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not drop indexes from table %s because another transaction "
            "has accessed the table. To drop indexes, make sure no "
            "transactions touch the table.",
            share->full_table_name());
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
8609
8610 //
8611 // Internal function called by ha_tokudb::prepare_drop_index and
8612 // ha_tokudb::alter_table_phase2
8613 // Restores dropped indexes in case of error in error path of
8614 // prepare_drop_index and alter_table_phase2
8615 //
restore_drop_indexes(uint * key_num,uint num_of_keys)8616 void ha_tokudb::restore_drop_indexes(uint* key_num, uint num_of_keys) {
8617 //
8618 // reopen closed dictionaries
8619 //
8620 for (uint i = 0; i < num_of_keys; i++) {
8621 int r;
8622 uint curr_index = key_num[i];
8623 if (share->key_file[curr_index] == NULL) {
8624 r = open_secondary_dictionary(
8625 &share->key_file[curr_index],
8626 &table_share->key_info[curr_index],
8627 share->full_table_name(),
8628 false,
8629 NULL);
8630 assert_always(!r);
8631 }
8632 }
8633 }
8634
map_to_handler_error(int error)8635 int ha_tokudb::map_to_handler_error(int error) {
8636 switch (error) {
8637 case DB_LOCK_DEADLOCK:
8638 error = HA_ERR_LOCK_DEADLOCK;
8639 break;
8640 case DB_LOCK_NOTGRANTED:
8641 error = HA_ERR_LOCK_WAIT_TIMEOUT;
8642 break;
8643 #if defined(HA_ERR_DISK_FULL)
8644 case ENOSPC:
8645 error = HA_ERR_DISK_FULL;
8646 break;
8647 #endif
8648 case DB_KEYEXIST:
8649 error = HA_ERR_FOUND_DUPP_KEY;
8650 break;
8651 #if defined(HA_ALTER_ERROR)
8652 case HA_ALTER_ERROR:
8653 error = HA_ERR_UNSUPPORTED;
8654 break;
8655 #endif
8656 case TOKUDB_INTERRUPTED:
8657 error = ER_QUERY_INTERRUPTED;
8658 break;
8659 case TOKUDB_OUT_OF_LOCKS:
8660 error = HA_ERR_LOCK_TABLE_FULL;
8661 break;
8662 }
8663 return error;
8664 }
8665
// Map engine-specific error codes to handler error codes before letting
// the default handler implementation format and report them.
void ha_tokudb::print_error(int error, myf errflag) {
    error = map_to_handler_error(error);
    handler::print_error(error, errflag);
}
8670
8671 //
8672 // truncate's dictionary associated with keynr index using transaction txn
8673 // does so by deleting and then recreating the dictionary in the context
8674 // of a transaction
8675 //
truncate_dictionary(uint keynr,DB_TXN * txn)8676 int ha_tokudb::truncate_dictionary(uint keynr, DB_TXN* txn) {
8677 int error;
8678 bool is_pk = (keynr == primary_key);
8679
8680 toku_compression_method compression_method =
8681 get_compression_method(share->key_file[keynr]);
8682 error = share->key_file[keynr]->close(share->key_file[keynr], 0);
8683 assert_always(error == 0);
8684
8685 share->key_file[keynr] = NULL;
8686 if (is_pk) {
8687 share->file = NULL;
8688 }
8689
8690 if (is_pk) {
8691 error = delete_or_rename_dictionary(
8692 share->full_table_name(),
8693 NULL,
8694 "main",
8695 false, //is_key
8696 txn,
8697 true); // is a delete
8698 if (error) {
8699 goto cleanup;
8700 }
8701 } else {
8702 error = delete_or_rename_dictionary(
8703 share->full_table_name(),
8704 NULL,
8705 table_share->key_info[keynr].name,
8706 true, //is_key
8707 txn,
8708 true); // is a delete
8709 if (error) {
8710 goto cleanup;
8711 }
8712 }
8713
8714 if (is_pk) {
8715 error = create_main_dictionary(
8716 share->full_table_name(),
8717 table,
8718 txn,
8719 &share->kc_info,
8720 compression_method);
8721 } else {
8722 error = create_secondary_dictionary(
8723 share->full_table_name(),
8724 table,
8725 &table_share->key_info[keynr],
8726 txn,
8727 &share->kc_info,
8728 keynr,
8729 false,
8730 compression_method);
8731 }
8732 if (error) {
8733 goto cleanup;
8734 }
8735
8736 cleanup:
8737 return error;
8738 }
8739
8740 // for 5.5
// Handler API entry point for TRUNCATE TABLE; delegates to
// delete_all_rows_internal(), which truncates every dictionary under a
// single transaction.
int ha_tokudb::truncate() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = delete_all_rows_internal();
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
8746
8747 // delete all rows from a table
8748 //
8749 // effects: delete all of the rows in the main dictionary and all of the
8750 // indices. this must be atomic, so we use the statement transaction
8751 // for all of the truncate operations.
8752 // locks: if we have an exclusive table write lock, all of the concurrency
8753 // issues go away.
8754 // returns: 0 if success
delete_all_rows()8755 int ha_tokudb::delete_all_rows() {
8756 TOKUDB_HANDLER_DBUG_ENTER("");
8757 int error = 0;
8758 if (thd_sql_command(ha_thd()) != SQLCOM_TRUNCATE) {
8759 share->try_table_lock = true;
8760 error = HA_ERR_WRONG_COMMAND;
8761 }
8762 if (error == 0)
8763 error = delete_all_rows_internal();
8764 TOKUDB_HANDLER_DBUG_RETURN(error);
8765 }
8766
int ha_tokudb::delete_all_rows_internal() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    uint curr_num_DBs = 0;
    DB_TXN* txn = NULL;

    // this should be enough to handle locking as the higher level MDL
    // on this table should prevent any new analyze tasks.
    share->cancel_background_jobs();

    error = txn_begin(db_env, 0, &txn, 0, ha_thd());
    if (error) {
        goto cleanup;
    }

    // take fileops and table locks on every dictionary up front so the
    // truncate is all-or-nothing with respect to concurrent transactions
    curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
    for (uint i = 0; i < curr_num_DBs; i++) {
        error = share->key_file[i]->pre_acquire_fileops_lock(
            share->key_file[i],
            txn);
        if (error) {
            goto cleanup;
        }
        error = share->key_file[i]->pre_acquire_table_lock(
            share->key_file[i],
            txn);
        if (error) {
            goto cleanup;
        }
    }
    // truncate (delete + recreate) each dictionary inside txn
    for (uint i = 0; i < curr_num_DBs; i++) {
        error = truncate_dictionary(i, txn);
        if (error) {
            goto cleanup;
        }
    }

    DEBUG_SYNC(ha_thd(), "tokudb_after_truncate_all_dictionarys");

    // zap the row count
    // (error is always 0 here because of the gotos above; the check is
    // kept as a safeguard)
    if (error == 0) {
        share->set_row_count(0, false);
        // update auto increment
        share->last_auto_increment = 0;
        // calling write_to_status directly because we need to use txn
        write_to_status(
            share->status_block,
            hatoku_max_ai,
            &share->last_auto_increment,
            sizeof(share->last_auto_increment),
            txn);
    }

    share->try_table_lock = true;
cleanup:
    if (txn) {
        if (error) {
            abort_txn(txn);
        } else {
            commit_txn(txn,0);
        }
    }

    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(
            TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not truncate table %s because another transaction has "
            "accessed the table. To truncate the table, make sure no "
            "transactions touch the table.",
            share->full_table_name());
    }
    //
    // regardless of errors, need to reopen the DB's
    //
    for (uint i = 0; i < curr_num_DBs; i++) {
        int r = 0;
        if (share->key_file[i] == NULL) {
            if (i != primary_key) {
                r = open_secondary_dictionary(
                    &share->key_file[i],
                    &table_share->key_info[i],
                    share->full_table_name(),
                    false,
                    NULL);
                assert_always(!r);
            } else {
                r = open_main_dictionary(
                    share->full_table_name(),
                    false,
                    NULL);
                assert_always(!r);
            }
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
8864
// Record an error reported asynchronously by a loader/indexer callback;
// callers (e.g. tokudb_add_index) merge it into their return value.
void ha_tokudb::set_loader_error(int err) {
    loader_error = err;
}
8868
// On a duplicate-key error against the primary key: unpack the conflicting
// key into record[0] so the server can report the duplicated values, and
// remember the primary key as the last duplicate key.
void ha_tokudb::set_dup_value_for_pk(DBT* key) {
    // only meaningful when there is a user-visible primary key
    assert_always(!hidden_primary_key);
    unpack_key(table->record[0],key,primary_key);
    last_dup_key = primary_key;
}
8874
8875 // we cache the information so we can do filtering ourselves,
8876 // but as far as MySQL knows, we are not doing any filtering,
8877 // so if we happen to miss filtering a row that does not match
8878 // idx_cond_arg, MySQL will catch it.
8879 // This allows us the ability to deal with only index_next and index_prev,
8880 // and not need to worry about other index_XXX functions
Item* ha_tokudb::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) {
    // remember which index and condition to filter on during scans
    toku_pushed_idx_cond_keyno = keyno_arg;
    toku_pushed_idx_cond = idx_cond_arg;
    // return the condition unchanged: MySQL re-checks any rows we miss
    return idx_cond_arg;
}
8886
// Discard the cached index-condition-pushdown state before letting the
// base handler clear its own pushed condition.
void ha_tokudb::cancel_pushed_idx_cond() {
    invalidate_icp();
    handler::cancel_pushed_idx_cond();
}
8891
// Called when txn is about to end: if this handler's open cursor belongs
// to that transaction it must be closed first, since a cursor cannot
// outlive its transaction.
void ha_tokudb::cleanup_txn(DB_TXN *txn) {
    if (transaction == txn && cursor) {
        int r = cursor->c_close(cursor);
        assert_always(r == 0);
        cursor = NULL;
    }
}
8899
// Register this handler on the session's TokuDB transaction context so
// the hton can reach every handler participating in the transaction.
void ha_tokudb::add_to_trx_handler_list() {
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    trx->handlers = list_add(trx->handlers, &trx_handler_list);
}
8905
// Unregister this handler from the session's TokuDB transaction context;
// counterpart of add_to_trx_handler_list().
void ha_tokudb::remove_from_trx_handler_list() {
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    trx->handlers = list_delete(trx->handlers, &trx_handler_list);
}
8911
8912 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
// Mark that subsequent writes come from applying a replicated
// Write_rows event.
void ha_tokudb::rpl_before_write_rows() {
    in_rpl_write_rows = true;
}
8916
// Clear the flag set by rpl_before_write_rows().
void ha_tokudb::rpl_after_write_rows() {
    in_rpl_write_rows = false;
}
8920
// Mark that subsequent deletes come from applying a replicated
// Delete_rows event.
void ha_tokudb::rpl_before_delete_rows() {
    in_rpl_delete_rows = true;
}
8924
// Clear the flag set by rpl_before_delete_rows().
void ha_tokudb::rpl_after_delete_rows() {
    in_rpl_delete_rows = false;
}
8928
// Mark that subsequent updates come from applying a replicated
// Update_rows event.
void ha_tokudb::rpl_before_update_rows() {
    in_rpl_update_rows = true;
}
8932
// Clear the flag set by rpl_before_update_rows().
void ha_tokudb::rpl_after_update_rows() {
    in_rpl_update_rows = false;
}
8936
rpl_lookup_rows()8937 bool ha_tokudb::rpl_lookup_rows() {
8938 if (!in_rpl_delete_rows && !in_rpl_update_rows)
8939 return true;
8940 else
8941 return tokudb::sysvars::rpl_lookup_rows(ha_thd());
8942 }
8943 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
8944
8945 // table admin
8946 #include "ha_tokudb_admin.cc"
8947
8948 // update functions
8949 #include "tokudb_update_fun.cc"
8950
8951 // fast updates
8952 #if defined(TOKU_INCLUDE_UPSERT) && TOKU_INCLUDE_UPSERT
8953 #include "ha_tokudb_update.cc"
8954 #endif // defined(TOKU_INCLUDE_UPSERT) && TOKU_INCLUDE_UPSERT
8955
8956 // alter table
8957 #include "ha_tokudb_alter.cc"
8958
8959 // key comparisons
8960 #include "hatoku_cmp.cc"
8961
8962 // mrr
8963 #include "ha_tokudb_mrr_mysql.cc"
8964
8965 // handlerton
8966 #include "ha_tokupart.cc"
8967 #include "hatoku_hton.cc"
8968
8969 // generate template functions
8970 namespace tokudb {
8971 template size_t vlq_encode_ui(uint32_t n, void *p, size_t s);
8972 template size_t vlq_decode_ui(uint32_t *np, void *p, size_t s);
8973 template size_t vlq_encode_ui(uint64_t n, void *p, size_t s);
8974 template size_t vlq_decode_ui(uint64_t *np, void *p, size_t s);
8975 };
8976