/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of TokuDB


Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

TokuDB is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2,
as published by the Free Software Foundation.

TokuDB is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with TokuDB. If not, see <http://www.gnu.org/licenses/>.

======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#include "hatoku_hton.h"
#include "hatoku_cmp.h"
#include "tokudb_buffer.h"
#include "tokudb_status.h"
#include "tokudb_card.h"
#include "ha_tokudb.h"
#include "sql_db.h"

pfs_key_t ha_tokudb_mutex_key;
pfs_key_t num_DBs_lock_key;

std::unordered_map<std::string, TOKUDB_SHARE*> TOKUDB_SHARE::_open_tables;
tokudb::thread::mutex_t TOKUDB_SHARE::_open_tables_mutex;

static const char* ha_tokudb_exts[] = {
    ha_tokudb_ext,
    NullS
};

//
// This offset is calculated starting from AFTER the NULL bytes
//
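// Illustrative sketch (an assumption drawn from the packing code in this
// file, not a normative layout): a packed TokuDB row is arranged roughly as
//
//     [null bytes][fixed fields][variable-field offsets][variable fields][blobs]
//
// so a fixed field's offset here is measured from the start of the fixed
// region, i.e. after the null bytes.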
static inline uint32_t get_fixed_field_size(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {

    uint offset = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (is_fixed_field(kc_info, i) &&
            !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
            offset += kc_info->field_lengths[i];
        }
    }
    return offset;
}


static inline uint32_t get_len_of_offsets(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {

    uint len = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (is_variable_field(kc_info, i) &&
            !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
            len += kc_info->num_offset_bytes;
        }
    }
    return len;
}


static int allocate_key_and_col_info(
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info) {

    int error;
    //
    // initialize all of the bitmaps
    //
    for (uint i = 0; i < MAX_KEY + 1; i++) {
        error =
            bitmap_init(
                &kc_info->key_filters[i],
                NULL,
                table_share->fields,
                false);
        if (error) {
            goto exit;
        }
    }

    //
    // create the field lengths
    //
    kc_info->multi_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME+MY_ZEROFILL),
        &kc_info->field_types, (uint)(table_share->fields * sizeof (uint8_t)),
        &kc_info->field_lengths, (uint)(table_share->fields * sizeof (uint16_t)),
        &kc_info->length_bytes, (uint)(table_share->fields * sizeof (uint8_t)),
        &kc_info->blob_fields, (uint)(table_share->fields * sizeof (uint32_t)),
        NullS);
    if (kc_info->multi_ptr == NULL) {
        error = ENOMEM;
        goto exit;
    }
exit:
    if (error) {
        for (uint i = 0; i < MAX_KEY + 1; i++) {
            bitmap_free(&kc_info->key_filters[i]);
        }
        tokudb::memory::free(kc_info->multi_ptr);
    }
    return error;
}

static void free_key_and_col_info(KEY_AND_COL_INFO* kc_info) {
    for (uint i = 0; i < MAX_KEY + 1; i++) {
        bitmap_free(&kc_info->key_filters[i]);
    }

    for (uint i = 0; i < MAX_KEY + 1; i++) {
        tokudb::memory::free(kc_info->cp_info[i]);
        kc_info->cp_info[i] = NULL; // 3144
    }

    tokudb::memory::free(kc_info->multi_ptr);
    kc_info->field_types = NULL;
    kc_info->field_lengths = NULL;
    kc_info->length_bytes = NULL;
    kc_info->blob_fields = NULL;
}


void TOKUDB_SHARE::static_init() {
    assert_always(_open_tables.size() == 0);
}
void TOKUDB_SHARE::static_destroy() {
    for (auto it = _open_tables.cbegin(); it != _open_tables.cend(); it++) {
        TOKUDB_TRACE("_open_tables %s %p", it->first.c_str(), it->second);
        TOKUDB_SHARE* share = it->second;
        share->destroy();
        delete share;
    }
    _open_tables.clear();
    assert_always(_open_tables.size() == 0);
}
const char* TOKUDB_SHARE::get_state_string(share_state_t state) {
    static const char* state_string[] = {
        "CLOSED",
        "OPENED",
        "ERROR"
    };
    assert_always(state == CLOSED || state == OPENED || state == ERROR);
    return state_string[state];
}
void* TOKUDB_SHARE::operator new(size_t sz) {
    return tokudb::memory::malloc(sz, MYF(MY_WME|MY_ZEROFILL|MY_FAE));
}
void TOKUDB_SHARE::operator delete(void* p) { tokudb::memory::free(p); }
TOKUDB_SHARE::TOKUDB_SHARE()
    : _num_DBs_lock(num_DBs_lock_key), _mutex(ha_tokudb_mutex_key) {}
void TOKUDB_SHARE::init(const char* table_name) {
    _use_count = 0;
    thr_lock_init(&_thr_lock);
    _state = CLOSED;
    _row_delta_activity = 0;
    _allow_auto_analysis = true;

    _full_table_name.append(table_name);

    String tmp_dictionary_name;
    tokudb_split_dname(
        table_name,
        _database_name,
        _table_name,
        tmp_dictionary_name);

    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
                            _full_table_name.ptr(),
                            get_state_string(_state),
                            _use_count);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
void TOKUDB_SHARE::destroy() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
                            _full_table_name.ptr(),
                            get_state_string(_state),
                            _use_count);

    assert_always(_use_count == 0);
    assert_always(
        _state == TOKUDB_SHARE::CLOSED || _state == TOKUDB_SHARE::ERROR);
    thr_lock_delete(&_thr_lock);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
TOKUDB_SHARE* TOKUDB_SHARE::get_share(const char* table_name,
                                      THR_LOCK_DATA* data,
                                      bool create_new) {
    std::string find_table_name(table_name);
    mutex_t_lock(_open_tables_mutex);
    auto it = _open_tables.find(find_table_name);
    TOKUDB_SHARE* share = nullptr;
    if (it != _open_tables.end()) {
        share = it->second;
        assert_always(strcmp(table_name, share->full_table_name()) == 0);
    }
    TOKUDB_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_SHARE,
        "existing share[%s] %s:share[%p]",
        table_name,
        share == NULL ? "not found" : "found",
        share);

    if (!share) {
        if (create_new == false)
            goto exit;
        // create share and fill it with all zeroes
        // hence, all pointers are initialized to NULL
        share = new TOKUDB_SHARE;
        assert_always(share);

        share->init(table_name);

        _open_tables.insert({find_table_name, share});
    }

    share->addref();

    if (data)
        thr_lock_data_init(&(share->_thr_lock), data, NULL);

exit:
    mutex_t_unlock(_open_tables_mutex);
    return share;
}
void TOKUDB_SHARE::drop_share(TOKUDB_SHARE* share) {
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_SHARE,
                           "share[%p]:file[%s]:state[%s]:use_count[%d]",
                           share,
                           share->_full_table_name.ptr(),
                           get_state_string(share->_state),
                           share->_use_count);

    mutex_t_lock(_open_tables_mutex);
    size_t n = _open_tables.erase(std::string(share->full_table_name()));
    assert_always(n == 1);
    share->destroy();
    delete share;
    mutex_t_unlock(_open_tables_mutex);
}
TOKUDB_SHARE::share_state_t TOKUDB_SHARE::addref() {
    TOKUDB_SHARE_TRACE_FOR_FLAGS((TOKUDB_DEBUG_ENTER & TOKUDB_DEBUG_SHARE),
                                 "file[%s]:state[%s]:use_count[%d]",
                                 _full_table_name.ptr(),
                                 get_state_string(_state),
                                 _use_count);

    lock();
    _use_count++;

    return _state;
}
int TOKUDB_SHARE::release() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
                            _full_table_name.ptr(),
                            get_state_string(_state),
                            _use_count);

    int error, result = 0;

    mutex_t_lock(_mutex);
    assert_always(_use_count != 0);
    _use_count--;
    if (_use_count == 0 && _state == TOKUDB_SHARE::OPENED) {
        // number of open DB's may not be equal to number of keys we have
        // because add_index may have added some. So, we loop through entire
        // array and close any non-NULL value. It is imperative that we reset
        // a DB to NULL once we are done with it.
        for (uint i = 0; i < sizeof(key_file)/sizeof(key_file[0]); i++) {
            if (key_file[i]) {
                TOKUDB_TRACE_FOR_FLAGS(
                    TOKUDB_DEBUG_OPEN,
                    "dbclose:%p",
                    key_file[i]);
                error = key_file[i]->close(key_file[i], 0);
                assert_always(error == 0);
                if (error) {
                    result = error;
                }
                if (key_file[i] == file)
                    file = NULL;
                key_file[i] = NULL;
            }
        }

        error = tokudb::metadata::close(&status_block);
        assert_always(error == 0);

        free_key_and_col_info(&kc_info);

        if (_rec_per_key) {
            tokudb::memory::free(_rec_per_key);
            _rec_per_key = NULL;
            _rec_per_keys = 0;
        }

        for (uint i = 0; i < _keys; i++) {
            tokudb::memory::free(_key_descriptors[i]._name);
        }
        tokudb::memory::free(_key_descriptors);
        _keys = _max_key_parts = 0;
        _key_descriptors = NULL;

        _state = TOKUDB_SHARE::CLOSED;
    }
    mutex_t_unlock(_mutex);

    TOKUDB_SHARE_DBUG_RETURN(result);
}
void TOKUDB_SHARE::update_row_count(
    THD* thd,
    uint64_t added,
    uint64_t deleted,
    uint64_t updated) {

    uint64_t delta = added + deleted + updated;
    lock();
    if (deleted > added && _rows < (deleted - added)) {
        _rows = 0;
    } else {
        _rows += added - deleted;
    }
    _row_delta_activity += delta;
    if (_row_delta_activity == (uint64_t)~0)
        _row_delta_activity = 1;

    ulonglong auto_threshold = tokudb::sysvars::auto_analyze(thd);
    if (delta && auto_threshold > 0 && _allow_auto_analysis) {
        ulonglong pct_of_rows_changed_to_trigger;
        pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100);
        if (_row_delta_activity >= pct_of_rows_changed_to_trigger) {
            char msg[200];
            snprintf(msg,
                     sizeof(msg),
                     "TokuDB: Auto %s analysis for %s, delta_activity %llu is "
                     "greater than %llu percent of %llu rows.",
                     tokudb::sysvars::analyze_in_background(thd) > 0
                         ? "scheduling background"
                         : "running foreground",
                     full_table_name(),
                     _row_delta_activity,
                     auto_threshold,
                     (ulonglong)(_rows));

            // analyze_standard will unlock _mutex regardless of success/failure
            int ret = analyze_standard(thd, NULL);
            if (ret == 0) {
                sql_print_information("%s - succeeded.", msg);
            } else {
                sql_print_information(
                    "%s - failed, likely a job already running.",
                    msg);
            }
        }
    }
    unlock();
}
void TOKUDB_SHARE::set_cardinality_counts_in_table(TABLE* table) {
    lock();
    uint32_t next_key_part = 0;
    for (uint32_t i = 0; i < table->s->keys; i++) {
        KEY* key = &table->key_info[i];
        bool is_unique_key =
            (i == table->s->primary_key) || (key->flags & HA_NOSAME);

        for (uint32_t j = 0; j < key->actual_key_parts; j++) {
            if (j >= key->user_defined_key_parts) {
                // MySQL 'hidden' keys, really needs deeper investigation
                // into MySQL hidden keys vs TokuDB hidden keys
                key->rec_per_key[j] = 1;
                continue;
            }

            assert_always(next_key_part < _rec_per_keys);
            ulong val = _rec_per_key[next_key_part++];
            val = (val * tokudb::sysvars::cardinality_scale_percent) / 100;
            if (val == 0 || _rows == 0 ||
                (is_unique_key && j == key->actual_key_parts - 1)) {
                val = 1;
            }
            key->rec_per_key[j] = val;
        }
    }
    unlock();
}

#define HANDLE_INVALID_CURSOR() \
    if (cursor == NULL) { \
        error = last_cursor_error; \
        goto cleanup; \
    }

const char *ha_tokudb::table_type() const {
    return tokudb_hton_name;
}

const char *ha_tokudb::index_type(TOKUDB_UNUSED(uint inx)) {
    return "BTREE";
}

/*
 * returns NULL terminated file extension string
 */
const char **ha_tokudb::bas_ext() const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBUG_RETURN(ha_tokudb_exts);
}

static inline bool is_insert_ignore(THD* thd) {
    //
    // from http://lists.mysql.com/internals/37735
    //
    return thd->lex->ignore && thd->lex->duplicates == DUP_ERROR;
}

static inline bool is_replace_into(THD* thd) {
    return thd->lex->duplicates == DUP_REPLACE;
}

static inline bool do_ignore_flag_optimization(
    THD* thd,
    TABLE* table,
    bool opt_eligible) {

    bool do_opt = false;
    if (opt_eligible &&
        (is_replace_into(thd) || is_insert_ignore(thd)) &&
        tokudb::sysvars::pk_insert_mode(thd) == 1 &&
        !table->triggers &&
        !(mysql_bin_log.is_open() &&
          thd->variables.binlog_format != BINLOG_FORMAT_STMT)) {
        do_opt = true;
    }
    return do_opt;
}
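
// For example (illustrative, not exhaustive): "REPLACE INTO t VALUES (...)"
// with tokudb_pk_insert_mode=1, no triggers on t, and the binlog either
// closed or in statement format qualifies for the optimization; the same
// statement with row-based binlogging open does not.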

#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
static inline uint get_ext_key_parts(const KEY *key) {
#if (50609 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799)
    return key->actual_key_parts;
#elif defined(MARIADB_BASE_VERSION)
    return key->ext_key_parts;
#else
#error
#endif
}
#endif // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS

ulonglong ha_tokudb::table_flags() const {
    return int_table_flags | HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
}

//
// Returns a bit mask of capabilities of the key or its part specified by
// the arguments. The capabilities are defined in sql/handler.h.
//
ulong ha_tokudb::index_flags(uint idx,
                             TOKUDB_UNUSED(uint part),
                             TOKUDB_UNUSED(bool all_parts)) const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(table_share);
    ulong flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
                   HA_KEYREAD_ONLY | HA_READ_RANGE | HA_DO_INDEX_COND_PUSHDOWN);
    if (key_is_clustering(&table_share->key_info[idx])) {
        flags |= HA_CLUSTERED_INDEX;
    }
    DBUG_RETURN(flags);
}


//
// struct that will be used as a context for smart DBT callbacks
// contains parameters needed to complete the smart DBT cursor call
//
typedef struct smart_dbt_info {
    ha_tokudb* ha;  // instance of ha_tokudb needed for reading the row
    uchar* buf;     // output buffer where row will be written
    uint keynr;     // index into share->key_file that represents DB we are currently operating on
} *SMART_DBT_INFO;

typedef struct smart_dbt_bf_info {
    ha_tokudb* ha;
    bool need_val;
    int direction;
    THD* thd;
    uchar* buf;
    DBT* key_to_compare;
} *SMART_DBT_BF_INFO;

typedef struct index_read_info {
    struct smart_dbt_info smart_dbt_info;
    int cmp;
    DBT* orig_key;
} *INDEX_READ_INFO;

//
// smart DBT callback function for optimize
// in optimize, we want to flatten the DB by doing
// a full table scan. Therefore, we don't
// want to actually do anything with the data, hence
// the callback does nothing
//
static int smart_dbt_do_nothing(TOKUDB_UNUSED(DBT const* key),
                                TOKUDB_UNUSED(DBT const* row),
                                TOKUDB_UNUSED(void* context)) {
    return 0;
}

static int
smart_dbt_callback_rowread_ptquery (DBT const *key, DBT const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    return info->ha->read_row_callback(info->buf, info->keynr, row, key);
}

//
// Smart DBT callback function in case where we have a covering index
//
static int smart_dbt_callback_keyread(DBT const* key,
                                      TOKUDB_UNUSED(DBT const* row),
                                      void* context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    info->ha->read_key_only(info->buf, info->keynr, key);
    return 0;
}

//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_rowread(DBT const *key, DBT const *row, void *context) {
    int error = 0;
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    error = info->ha->read_primary_key(info->buf, info->keynr, row, key);
    return error;
}

//
// Smart DBT callback function in case where we have a covering index
//
static int smart_dbt_callback_ir_keyread(DBT const* key,
                                         TOKUDB_UNUSED(DBT const* row),
                                         void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_keyread(key, row, &ir_info->smart_dbt_info);
}

static int smart_dbt_callback_lookup(DBT const* key,
                                     TOKUDB_UNUSED(DBT const* row),
                                     void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    return 0;
}


//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_ir_rowread(DBT const *key, DBT const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_rowread(key, row, &ir_info->smart_dbt_info);
}

//
// macro for Smart DBT callback function,
// so we do not need to put this long line of code in multiple places
//
#define SMART_DBT_CALLBACK(do_key_read) \
    ((do_key_read) ? smart_dbt_callback_keyread : smart_dbt_callback_rowread)
#define SMART_DBT_IR_CALLBACK(do_key_read) \
    ((do_key_read) ? smart_dbt_callback_ir_keyread : smart_dbt_callback_ir_rowread)
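
// Illustrative use (a sketch, not a call site copied from below): inside a
// handler method, fill a context and hand the chosen callback to a cursor
// getf call, e.g.
//
//     struct smart_dbt_info info = {this, buf, tokudb_active_index};
//     error = cursor->c_getf_next(
//         cursor, flags, SMART_DBT_CALLBACK(do_key_read), &info);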

//
// macro that modifies read flag for cursor operations depending on whether
// we have preacquired lock or not
//
#define SET_PRELOCK_FLAG(flg) ((flg) | (range_lock_grabbed ? (use_write_locks ? DB_PRELOCKED_WRITE : DB_PRELOCKED) : 0))
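
// e.g. SET_PRELOCK_FLAG(DB_NEXT) expands to DB_NEXT when no range lock has
// been grabbed, DB_NEXT|DB_PRELOCKED when a read range lock is held, and
// DB_NEXT|DB_PRELOCKED_WRITE when a write range lock is held.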

//
// This method retrieves the value of the auto increment column of a record in MySQL format
// This was basically taken from MyISAM
// Parameters:
//      type - the type of the auto increment column (e.g. int, float, double...)
//      offset - offset into the record where the auto increment column is stored
//      [in] record - MySQL row whose auto increment value we want to extract
// Returns:
//      The value of the auto increment column in record
//
static ulonglong retrieve_auto_increment(uint16 type, uint32 offset, const uchar *record)
{
    const uchar *key;                  /* Key */
    ulonglong unsigned_autoinc = 0;    /* Unsigned auto-increment */
    longlong signed_autoinc = 0;       /* Signed auto-increment */
    enum { unsigned_type, signed_type } autoinc_type;
    float float_tmp;                   /* Temporary variable */
    double double_tmp;                 /* Temporary variable */

    key = ((uchar *) record) + offset;

    /* Set default autoincrement type */
    autoinc_type = unsigned_type;

    switch (type) {
    case HA_KEYTYPE_INT8:
        signed_autoinc = (longlong) *(char*)key;
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_BINARY:
        unsigned_autoinc = (ulonglong) *(uchar*) key;
        break;

    case HA_KEYTYPE_SHORT_INT:
        signed_autoinc = (longlong) sint2korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_USHORT_INT:
        unsigned_autoinc = (ulonglong) uint2korr(key);
        break;

    case HA_KEYTYPE_LONG_INT:
        signed_autoinc = (longlong) sint4korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_ULONG_INT:
        unsigned_autoinc = (ulonglong) uint4korr(key);
        break;

    case HA_KEYTYPE_INT24:
        signed_autoinc = (longlong) sint3korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_UINT24:
        unsigned_autoinc = (ulonglong) tokudb_uint3korr(key);
        break;

    case HA_KEYTYPE_LONGLONG:
        signed_autoinc = sint8korr(key);
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_ULONGLONG:
        unsigned_autoinc = uint8korr(key);
        break;

    /* The remaining two cases should not be used but are included for
       compatibility */
    case HA_KEYTYPE_FLOAT:
        float4get(float_tmp, key);  /* Note: float4get is a macro */
        signed_autoinc = (longlong) float_tmp;
        autoinc_type = signed_type;
        break;

    case HA_KEYTYPE_DOUBLE:
        float8get(double_tmp, key); /* Note: float8get is a macro */
        signed_autoinc = (longlong) double_tmp;
        autoinc_type = signed_type;
        break;

    default:
        assert_unreachable();
    }

    if (signed_autoinc < 0) {
        signed_autoinc = 0;
    }

    return autoinc_type == unsigned_type ?
           unsigned_autoinc : (ulonglong) signed_autoinc;
}

static inline ulong field_offset(Field* field, TABLE* table) {
    return ((ulong) (field->ptr - table->record[0]));
}

static inline HA_TOKU_ISO_LEVEL tx_to_toku_iso(ulong tx_isolation) {
    if (tx_isolation == ISO_READ_UNCOMMITTED) {
        return hatoku_iso_read_uncommitted;
    }
    else if (tx_isolation == ISO_READ_COMMITTED) {
        return hatoku_iso_read_committed;
    }
    else if (tx_isolation == ISO_REPEATABLE_READ) {
        return hatoku_iso_repeatable_read;
    }
    else {
        return hatoku_iso_serializable;
    }
}

static inline uint32_t toku_iso_to_txn_flag(HA_TOKU_ISO_LEVEL lvl) {
    if (lvl == hatoku_iso_read_uncommitted) {
        return DB_READ_UNCOMMITTED;
    }
    else if (lvl == hatoku_iso_read_committed) {
        return DB_READ_COMMITTED;
    }
    else if (lvl == hatoku_iso_repeatable_read) {
        return DB_TXN_SNAPSHOT;
    }
    else {
        return 0;
    }
}

static int filter_key_part_compare(const void* left, const void* right) {
    FILTER_KEY_PART_INFO* left_part = (FILTER_KEY_PART_INFO *)left;
    FILTER_KEY_PART_INFO* right_part = (FILTER_KEY_PART_INFO *)right;
    return left_part->offset - right_part->offset;
}

//
// Be very careful with parameters passed to this function. Who knows
// if key, table have proper info set. I had to verify by checking
// in the debugger.
//
void set_key_filter(
    MY_BITMAP* key_filter,
    KEY* key,
    TABLE* table,
    bool get_offset_from_keypart) {

    FILTER_KEY_PART_INFO parts[MAX_REF_PARTS];
    uint curr_skip_index = 0;

    for (uint i = 0; i < key->user_defined_key_parts; i++) {
        //
        // horrendous hack due to bugs in mysql, basically
        // we cannot always reliably get the offset from the same source
        //
        parts[i].offset =
            get_offset_from_keypart ?
                key->key_part[i].offset :
                field_offset(key->key_part[i].field, table);
        parts[i].part_index = i;
    }
    qsort(
        parts,                        // start of array
        key->user_defined_key_parts,  // num elements
        sizeof(*parts),               // size of each element
        filter_key_part_compare);

    for (uint i = 0; i < table->s->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (curr_skip_index < key->user_defined_key_parts) {
            uint curr_skip_offset = 0;
            curr_skip_offset = parts[curr_skip_index].offset;
            if (curr_skip_offset == curr_field_offset) {
                //
                // we have hit a field that is a portion of the primary key
                //
                uint curr_key_index = parts[curr_skip_index].part_index;
                curr_skip_index++;
                //
                // only choose to skip over the key if the key's length
                // matches the field's length. otherwise, we may have a
                // situation where the column is a varchar(10), the key is
                // only the first 3 characters, and we end up losing the
                // last 7 bytes of the column
                //
                TOKU_TYPE toku_type = mysql_to_toku_type(field);
                switch (toku_type) {
                case toku_type_blob:
                    break;
                case toku_type_varbinary:
                case toku_type_varstring:
                case toku_type_fixbinary:
                case toku_type_fixstring:
                    if (key->key_part[curr_key_index].length ==
                        field->field_length) {
                        bitmap_set_bit(key_filter, i);
                    }
                    break;
                default:
                    bitmap_set_bit(key_filter, i);
                    break;
                }
            }
        }
    }
}

static inline uchar* pack_fixed_field(
    uchar* to_tokudb,
    const uchar* from_mysql,
    uint32_t num_bytes)
{
    switch (num_bytes) {
    case (1):
        memcpy(to_tokudb, from_mysql, 1);
        break;
    case (2):
        memcpy(to_tokudb, from_mysql, 2);
        break;
    case (3):
        memcpy(to_tokudb, from_mysql, 3);
        break;
    case (4):
        memcpy(to_tokudb, from_mysql, 4);
        break;
    case (8):
        memcpy(to_tokudb, from_mysql, 8);
        break;
    default:
        memcpy(to_tokudb, from_mysql, num_bytes);
        break;
    }
    return to_tokudb + num_bytes;
}

static inline const uchar* unpack_fixed_field(
    uchar* to_mysql,
    const uchar* from_tokudb,
    uint32_t num_bytes)
{
    switch (num_bytes) {
    case (1):
        memcpy(to_mysql, from_tokudb, 1);
        break;
    case (2):
        memcpy(to_mysql, from_tokudb, 2);
        break;
    case (3):
        memcpy(to_mysql, from_tokudb, 3);
        break;
    case (4):
        memcpy(to_mysql, from_tokudb, 4);
        break;
    case (8):
        memcpy(to_mysql, from_tokudb, 8);
        break;
    default:
        memcpy(to_mysql, from_tokudb, num_bytes);
        break;
    }
    return from_tokudb + num_bytes;
}

static inline uchar* write_var_field(
    uchar* to_tokudb_offset_ptr,    // location where offset data is going to be written
    uchar* to_tokudb_data,          // location where data is going to be written
    uchar* to_tokudb_offset_start,  // base that offsets are measured from (start of the variable-length data region)
    const uchar* data,              // the data to write
    uint32_t data_length,           // length of data to write
    uint32_t offset_bytes)          // number of offset bytes
{
    memcpy(to_tokudb_data, data, data_length);
    //
    // for the offset, we pack the offset where the data ENDS!
    //
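    // Example (illustrative): with to_tokudb_offset_start pointing at the
    // beginning of the variable-length data region and two fields "ab" then
    // "cdef" packed in order, the stored offsets are 2 and 6 -- each the end
    // position of its field relative to that start.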
    uint32_t offset = to_tokudb_data + data_length - to_tokudb_offset_start;
    switch (offset_bytes) {
    case (1):
        to_tokudb_offset_ptr[0] = (uchar)offset;
        break;
    case (2):
        int2store(to_tokudb_offset_ptr, offset);
        break;
    default:
        assert_unreachable();
        break;
    }
    return to_tokudb_data + data_length;
}

static inline uint32_t get_var_data_length(
    const uchar* from_mysql,
    uint32_t mysql_length_bytes)
{
    uint32_t data_length;
    switch (mysql_length_bytes) {
    case (1):
        data_length = from_mysql[0];
        break;
    case (2):
        data_length = uint2korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }
    return data_length;
}
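
// For example, a VARCHAR whose length prefix occupies two bytes stores that
// length little-endian: prefix bytes {0x05, 0x00} decode to a data_length
// of 5.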

static inline uchar* pack_var_field(
    uchar* to_tokudb_offset_ptr,    // location where offset data is going to be written
    uchar* to_tokudb_data,          // pointer to where tokudb data should be written
    uchar* to_tokudb_offset_start,  // base that offsets are measured from (start of the variable-length data region)
    const uchar* from_mysql,        // mysql data
    uint32_t mysql_length_bytes,    // number of bytes used to store length in from_mysql
    uint32_t offset_bytes)          // number of offset bytes used in the tokudb row
{
    uint data_length = get_var_data_length(from_mysql, mysql_length_bytes);
    return write_var_field(
        to_tokudb_offset_ptr,
        to_tokudb_data,
        to_tokudb_offset_start,
        from_mysql + mysql_length_bytes,
        data_length,
        offset_bytes);
}

static inline void unpack_var_field(
    uchar* to_mysql,
    const uchar* from_tokudb_data,
    uint32_t from_tokudb_data_len,
    uint32_t mysql_length_bytes)
{
    //
    // store the length
    //
    switch (mysql_length_bytes) {
    case (1):
        to_mysql[0] = (uchar)from_tokudb_data_len;
        break;
    case (2):
        int2store(to_mysql, from_tokudb_data_len);
        break;
    default:
        assert_unreachable();
    }
    //
    // store the data
    //
    memcpy(to_mysql + mysql_length_bytes, from_tokudb_data, from_tokudb_data_len);
}

static uchar* pack_toku_field_blob(
    uchar* to_tokudb,
    const uchar* from_mysql,
    Field* field)
{
    uint32_t len_bytes = field->row_pack_length();
    uint32_t length = 0;
    uchar* data_ptr = NULL;
    memcpy(to_tokudb, from_mysql, len_bytes);

    switch (len_bytes) {
    case (1):
        length = (uint32_t)(*from_mysql);
        break;
    case (2):
        length = uint2korr(from_mysql);
        break;
    case (3):
        length = tokudb_uint3korr(from_mysql);
        break;
    case (4):
        length = uint4korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }

    if (length > 0) {
        memcpy((uchar *)(&data_ptr), from_mysql + len_bytes, sizeof(uchar*));
        memcpy(to_tokudb + len_bytes, data_ptr, length);
    }
    return (to_tokudb + len_bytes + length);
}

static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) {
    int error;
    tokudb_trx_data* trx = (tokudb_trx_data *) tokudb::memory::malloc(
        sizeof(*trx),
        MYF(MY_ZEROFILL));
    if (!trx) {
        error = ENOMEM;
        goto cleanup;
    }

    *out_trx = trx;
    error = 0;
cleanup:
    return error;
}

static inline int tokudb_generate_row(DB* dest_db,
                                      TOKUDB_UNUSED(DB* src_db),
                                      DBT* dest_key,
                                      DBT* dest_val,
                                      const DBT* src_key,
                                      const DBT* src_val) {
    int error;

    DB* curr_db = dest_db;
    uchar* row_desc = NULL;
    uint32_t desc_size;
    uchar* buff = NULL;
    uint32_t max_key_len = 0;

    row_desc = (uchar *)curr_db->descriptor->dbt.data;
    row_desc += (*(uint32_t *)row_desc);
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;
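    // Layout assumption (matching the packing code elsewhere in this
    // engine): the descriptor begins with a length-prefixed ordering
    // section, followed by a length-prefixed key-packing section; the code
    // above skips the first section, then steps past the second section's
    // own 4-byte length prefix.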

    if (is_key_pk(row_desc)) {
        if (dest_key->flags == DB_DBT_REALLOC && dest_key->data != NULL) {
            free(dest_key->data);
        }
        if (dest_val != NULL) {
            if (dest_val->flags == DB_DBT_REALLOC && dest_val->data != NULL) {
                free(dest_val->data);
            }
        }
        dest_key->data = src_key->data;
        dest_key->size = src_key->size;
        dest_key->flags = 0;
        if (dest_val != NULL) {
            dest_val->data = src_val->data;
            dest_val->size = src_val->size;
            dest_val->flags = 0;
        }
        error = 0;
        goto cleanup;
    }
    // at this point, we need to create the key/val and set it
    // in the DBTs
    if (dest_key->flags == 0) {
        dest_key->ulen = 0;
        dest_key->size = 0;
        dest_key->data = NULL;
        dest_key->flags = DB_DBT_REALLOC;
    }
    if (dest_key->flags == DB_DBT_REALLOC) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;

        if (max_key_len > dest_key->ulen) {
            void* old_ptr = dest_key->data;
            void* new_ptr = NULL;
            new_ptr = realloc(old_ptr, max_key_len);
            assert_always(new_ptr);
            dest_key->data = new_ptr;
            dest_key->ulen = max_key_len;
        }

        buff = (uchar *)dest_key->data;
        assert_always(buff != nullptr);
        assert_always(max_key_len > 0);
    } else {
        assert_unreachable();
    }

    dest_key->size = pack_key_from_desc(buff, row_desc, desc_size, src_key,
                                        src_val);
    assert_always(dest_key->ulen >= dest_key->size);
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY)) &&
        !max_key_len) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;
    }
    if (max_key_len) {
        assert_always(max_key_len >= dest_key->size);
    }

    row_desc += desc_size;
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;
    if (dest_val != NULL) {
        if (!is_key_clustering(desc_size) || src_val->size == 0) {
            dest_val->size = 0;
        } else {
            uchar* buff = NULL;
            if (dest_val->flags == 0) {
                dest_val->ulen = 0;
                dest_val->size = 0;
                dest_val->data = NULL;
                dest_val->flags = DB_DBT_REALLOC;
            }
            if (dest_val->flags == DB_DBT_REALLOC) {
                if (dest_val->ulen < src_val->size) {
                    void* old_ptr = dest_val->data;
                    void* new_ptr = NULL;
                    new_ptr = realloc(old_ptr, src_val->size);
                    assert_always(new_ptr);
                    dest_val->data = new_ptr;
                    dest_val->ulen = src_val->size;
                }
                buff = (uchar *)dest_val->data;
                assert_always(buff != NULL);
            } else {
                assert_unreachable();
            }
            dest_val->size = pack_clustering_val_from_desc(
                buff,
                row_desc,
                desc_size,
                src_val);
            assert_always(dest_val->ulen >= dest_val->size);
        }
    }
    error = 0;
cleanup:
    return error;
}

static int generate_row_for_del(
    DB *dest_db,
    DB *src_db,
    DBT_ARRAY *dest_key_arrays,
    const DBT *src_key,
    const DBT *src_val)
{
    DBT* dest_key = &dest_key_arrays->dbts[0];
    return tokudb_generate_row(
        dest_db,
        src_db,
        dest_key,
        NULL,
        src_key,
        src_val);
}


static int generate_row_for_put(
    DB *dest_db,
    DB *src_db,
    DBT_ARRAY *dest_key_arrays,
    DBT_ARRAY *dest_val_arrays,
    const DBT *src_key,
    const DBT *src_val)
{
    DBT* dest_key = &dest_key_arrays->dbts[0];
    DBT* dest_val =
        (dest_val_arrays == NULL) ? NULL : &dest_val_arrays->dbts[0];
    return tokudb_generate_row(
        dest_db,
        src_db,
        dest_key,
        dest_val,
        src_key,
        src_val);
}

ha_tokudb::ha_tokudb(handlerton* hton, TABLE_SHARE* table_arg)
    : handler(hton, table_arg) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    share = NULL;
    int_table_flags = HA_REC_NOT_IN_SEQ | HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS
        | HA_PRIMARY_KEY_IN_READ_INDEX | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
        | HA_FILE_BASED | HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX
        | HA_CAN_WRITE_DURING_OPTIMIZE | HA_ONLINE_ANALYZE;
    alloc_ptr = NULL;
    rec_buff = NULL;
    rec_update_buff = NULL;
    transaction = NULL;
    cursor = NULL;
    fixed_cols_for_query = NULL;
    var_cols_for_query = NULL;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;
    unpack_entire_row = true;
    read_blobs = false;
    read_key = false;
    added_rows = 0;
    deleted_rows = 0;
    updated_rows = 0;
    last_dup_key = UINT_MAX;
    using_ignore = false;
    using_ignore_no_key = false;
    last_cursor_error = 0;
    range_lock_grabbed = false;
    blob_buff = NULL;
    num_blob_bytes = 0;
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    memset(mult_key_dbt_array, 0, sizeof(mult_key_dbt_array));
    memset(mult_rec_dbt_array, 0, sizeof(mult_rec_dbt_array));
    for (uint32_t i = 0;
         i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]);
         i++) {
        toku_dbt_array_init(&mult_key_dbt_array[i], 1);
    }
    for (uint32_t i = 0;
         i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]);
         i++) {
        toku_dbt_array_init(&mult_rec_dbt_array[i], 1);
    }
    loader = NULL;
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    lock.type = TL_IGNORE;
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_del_flags[i] = DB_DELETE_ANY;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    use_write_locks = false;
    range_query_buff = NULL;
    size_range_query_buff = 0;
    bytes_used_in_range_query_buff = 0;
    curr_range_query_buff_offset = 0;
    doing_bulk_fetch = false;
    prelocked_left_range_size = 0;
    prelocked_right_range_size = 0;
    tokudb_active_index = MAX_KEY;
    invalidate_icp();
    trx_handler_list.data = this;
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    in_rpl_write_rows = in_rpl_delete_rows = in_rpl_update_rows = false;
#endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}

ha_tokudb::~ha_tokudb() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    for (uint32_t i = 0;
         i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]);
         i++) {
        toku_dbt_array_destroy(&mult_key_dbt_array[i]);
    }
    for (uint32_t i = 0;
         i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]);
         i++) {
        toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}

//
// Determines whether the table has an auto increment column; if so, sets
// *index to the column's position in the table.
// Parameters:
//      [out] index - if an auto increment column exists, set to its position
//                    in the table; otherwise left unchanged
// Returns:
//      true if an auto increment column exists, false otherwise
//
bool ha_tokudb::has_auto_increment_flag(uint* index) {
    //
    // check to see if we have an auto increment field
    //
    bool ai_found = false;
    uint ai_index = 0;
    for (uint i = 0; i < table_share->fields; i++, ai_index++) {
        Field* field = table->field[i];
        if (field->flags & AUTO_INCREMENT_FLAG) {
            ai_found = true;
            *index = ai_index;
            break;
        }
    }
    return ai_found;
}

static int open_status_dictionary(DB** ptr, const char* name, DB_TXN* txn) {
    int error;
    char* newname = NULL;
    size_t newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, "status");
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "open:%s", newname);

    error = tokudb::metadata::open(db_env, ptr, newname, txn);
cleanup:
    tokudb::memory::free(newname);
    return error;
}

int ha_tokudb::open_main_dictionary(
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error;
    char* newname = NULL;
    size_t newname_len = 0;
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;

    assert_always(share->file == NULL);
    assert_always(share->key_file[primary_key] == NULL);
    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(
        newname_len,
        MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto exit;
    }
    make_name(newname, newname_len, name, "main");

    error = db_create(&share->file, db_env, 0);
    if (error) {
        goto exit;
    }
    share->key_file[primary_key] = share->file;

    error =
        share->file->open(
            share->file,
            txn,
            newname,
            NULL,
            DB_BTREE,
            open_flags,
            S_IWUSR);
    if (error) {
        goto exit;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        share->file);

    error = 0;
exit:
    if (error) {
        if (share->file) {
            int r = share->file->close(
                share->file,
                0);
            assert_always(r == 0);
            share->file = NULL;
            share->key_file[primary_key] = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}

//
// Open a secondary table, the key will be a secondary index, the data will
// be a primary key
//
int ha_tokudb::open_secondary_dictionary(
    DB** ptr,
    KEY* key_info,
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error = ENOSYS;
    char dict_name[MAX_DICT_NAME_LEN];
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;
    char* newname = NULL;
    size_t newname_len = 0;

    sprintf(dict_name, "key-%s", key_info->name);
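    // e.g. a secondary index named "idx_city" lives in a dictionary named
    // "key-idx_city"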

    newname_len = get_max_dict_name_path_length(name);
    newname =
        (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, dict_name);


    if ((error = db_create(ptr, db_env, 0))) {
        my_errno = error;
        goto cleanup;
    }


    error = (*ptr)->open(*ptr, txn, newname, NULL, DB_BTREE, open_flags, S_IWUSR);
    if (error) {
        my_errno = error;
        goto cleanup;
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        *ptr);
cleanup:
    if (error) {
        if (*ptr) {
            int r = (*ptr)->close(*ptr, 0);
            assert_always(r == 0);
            *ptr = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}

static int initialize_col_pack_info(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {
    int error = ENOSYS;
    //
    // set up the cp_info
    //
    assert_always(kc_info->cp_info[keynr] == NULL);
    kc_info->cp_info[keynr] = (COL_PACK_INFO*)tokudb::memory::malloc(
        table_share->fields * sizeof(COL_PACK_INFO),
        MYF(MY_WME | MY_ZEROFILL));
    if (kc_info->cp_info[keynr] == NULL) {
        error = ENOMEM;
        goto exit;
    }
    {
        uint32_t curr_fixed_offset = 0;
        uint32_t curr_var_index = 0;
        for (uint j = 0; j < table_share->fields; j++) {
            COL_PACK_INFO* curr = &kc_info->cp_info[keynr][j];
            //
            // need to set the offsets / indexes
            // offsets are calculated AFTER the NULL bytes
            //
            if (!bitmap_is_set(&kc_info->key_filters[keynr], j)) {
                if (is_fixed_field(kc_info, j)) {
                    curr->col_pack_val = curr_fixed_offset;
                    curr_fixed_offset += kc_info->field_lengths[j];
                } else if (is_variable_field(kc_info, j)) {
                    curr->col_pack_val = curr_var_index;
                    curr_var_index++;
                }
            }
        }

        //
        // set up the mcp_info
        //
        kc_info->mcp_info[keynr].fixed_field_size = get_fixed_field_size(
            kc_info,
            table_share,
            keynr);
        kc_info->mcp_info[keynr].len_of_offsets = get_len_of_offsets(
            kc_info,
            table_share,
            keynr);

        error = 0;
    }
exit:
    return error;
}

// reset the kc_info state at keynr
static void reset_key_and_col_info(KEY_AND_COL_INFO* kc_info, uint keynr) {
    bitmap_clear_all(&kc_info->key_filters[keynr]);
    tokudb::memory::free(kc_info->cp_info[keynr]);
    kc_info->cp_info[keynr] = NULL;
    kc_info->mcp_info[keynr] = (MULTI_COL_PACK_INFO) { 0, 0 };
}

static int initialize_key_and_col_info(
    TABLE_SHARE* table_share,
    TABLE* table,
    KEY_AND_COL_INFO* kc_info,
    uint hidden_primary_key,
    uint primary_key) {

    int error = 0;
    uint32_t curr_blob_field_index = 0;
    uint32_t max_var_bytes = 0;
    //
    // fill in the field lengths. 0 means it is a variable sized field length
    // fill in length_bytes, 0 means it is fixed or blob
    //
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table_share->field[i];
        TOKU_TYPE toku_type = mysql_to_toku_type(field);
        uint32 pack_length = 0;
        switch (toku_type) {
        case toku_type_int:
        case toku_type_double:
        case toku_type_float:
        case toku_type_fixbinary:
        case toku_type_fixstring:
            pack_length = field->pack_length();
            assert_always(pack_length < 1<<16);
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD;
            kc_info->field_lengths[i] = (uint16_t)pack_length;
            kc_info->length_bytes[i] = 0;
            break;
        case toku_type_blob:
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] = 0;
            kc_info->blob_fields[curr_blob_field_index] = i;
            curr_blob_field_index++;
            break;
        case toku_type_varstring:
        case toku_type_varbinary:
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] =
                (uchar)((Field_varstring*)field)->length_bytes;
            max_var_bytes += field->field_length;
            break;
        default:
            assert_unreachable();
        }
    }
    kc_info->num_blobs = curr_blob_field_index;

    //
    // initialize share->num_offset_bytes
    // because MAX_REF_LENGTH is 65536, we
    // can safely set num_offset_bytes to 1 or 2
    //
    if (max_var_bytes < 256) {
        kc_info->num_offset_bytes = 1;
    } else {
        kc_info->num_offset_bytes = 2;
    }
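    // e.g. two VARCHAR(100) columns in a single-byte character set can
    // occupy at most 200 variable bytes, so 1-byte offsets suffice; adding
    // a VARCHAR(60) pushes the maximum to 260 bytes and forces 2-byte
    // offsets.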

    for (uint i = 0;
         i < table_share->keys + tokudb_test(hidden_primary_key);
         i++) {
        //
        // do the cluster/primary key filtering calculations
        //
        if (!(i == primary_key && hidden_primary_key)) {
            if (i == primary_key) {
                set_key_filter(
                    &kc_info->key_filters[primary_key],
                    &table_share->key_info[primary_key],
                    table,
                    true);
            } else {
                set_key_filter(
                    &kc_info->key_filters[i],
                    &table_share->key_info[i],
                    table,
                    true);
                if (!hidden_primary_key) {
                    set_key_filter(
                        &kc_info->key_filters[i],
                        &table_share->key_info[primary_key],
                        table,
                        true);
                }
            }
        }
        if (i == primary_key || key_is_clustering(&table_share->key_info[i])) {
            error = initialize_col_pack_info(kc_info, table_share, i);
            if (error) {
                goto exit;
            }
        }
    }
exit:
    return error;
}

bool ha_tokudb::can_replace_into_be_fast(
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info,
    uint pk) {

    uint curr_num_DBs = table_share->keys + tokudb_test(hidden_primary_key);
    bool ret_val;
    if (curr_num_DBs == 1) {
        ret_val = true;
        goto exit;
    }
    ret_val = true;
    for (uint curr_index = 0; curr_index < table_share->keys; curr_index++) {
        if (curr_index == pk) continue;
        KEY* curr_key_info = &table_share->key_info[curr_index];
        for (uint i = 0; i < curr_key_info->user_defined_key_parts; i++) {
            uint16 curr_field_index =
                curr_key_info->key_part[i].field->field_index;
            if (!bitmap_is_set(
                    &kc_info->key_filters[curr_index],
                    curr_field_index)) {
                ret_val = false;
                goto exit;
            }
            if (bitmap_is_set(
                    &kc_info->key_filters[curr_index],
                    curr_field_index) &&
                !bitmap_is_set(
                    &kc_info->key_filters[pk],
                    curr_field_index)) {
                ret_val = false;
                goto exit;
            }

        }
    }
exit:
    return ret_val;
}

int ha_tokudb::initialize_share(const char* name, int mode) {
    int error = 0;
    uint64_t num_rows = 0;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    THD* thd = ha_thd();
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE &&
        trx &&
        trx->sub_sp_level) {
        txn = trx->sub_sp_level;
    } else {
        do_commit = true;
        error = txn_begin(db_env, 0, &txn, 0, thd);
        if (error) {
            goto exit;
        }
    }


    error = get_status(txn);
    if (error) {
        goto exit;
    }
    if (share->version != HA_TOKU_VERSION) {
        error = ENOSYS;
        goto exit;
    }

#if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
#if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
    // verify frm data for non-partitioned tables
    if (TOKU_PARTITION_WRITE_FRM_DATA || table->part_info == NULL) {
        error = verify_frm_data(table->s->path.str, txn);
        if (error)
            goto exit;
    } else {
        // remove the frm data for partitions since we are not maintaining it
        error = remove_frm_data(share->status_block, txn);
        if (error)
            goto exit;
    }
#else
    error = verify_frm_data(table->s->path.str, txn);
    if (error)
        goto exit;
#endif // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
#endif // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA

    error =
        initialize_key_and_col_info(
            table_share,
            table,
            &share->kc_info,
            hidden_primary_key,
            primary_key);
    if (error) {
        goto exit;
    }

    error = open_main_dictionary(name, mode == O_RDONLY, txn);
    if (error) {
        goto exit;
    }

    share->has_unique_keys = false;
    share->_keys = table_share->keys;
    share->_max_key_parts = table_share->key_parts;
    share->_key_descriptors =
        (TOKUDB_SHARE::key_descriptor_t*)tokudb::memory::malloc(
            sizeof(TOKUDB_SHARE::key_descriptor_t) * share->_keys,
            MYF(MY_ZEROFILL));

    /* Open other keys; These are part of the share structure */
    for (uint i = 0; i < table_share->keys; i++) {
        share->_key_descriptors[i]._parts =
            table_share->key_info[i].user_defined_key_parts;
        if (i == primary_key) {
            share->_key_descriptors[i]._is_unique = true;
            share->_key_descriptors[i]._name =
                tokudb::memory::strdup("primary", 0);
        } else {
            share->_key_descriptors[i]._is_unique = false;
            share->_key_descriptors[i]._name =
                tokudb::memory::strdup(table_share->key_info[i].name, 0);
        }

        if (table_share->key_info[i].flags & HA_NOSAME) {
            share->_key_descriptors[i]._is_unique = true;
            share->has_unique_keys = true;
        }
        if (i != primary_key) {
            error =
                open_secondary_dictionary(
                    &share->key_file[i],
                    &table_share->key_info[i],
                    name,
                    mode == O_RDONLY,
                    txn);
            if (error) {
                goto exit;
            }
        }
    }
    share->replace_into_fast =
        can_replace_into_be_fast(
            table_share,
            &share->kc_info,
            primary_key);

    share->pk_has_string = false;
    if (!hidden_primary_key) {
        //
        // We need to set the ref_length to start at 5, to account for
        // the "infinity byte" in keys, and for placing the DBT size in the
        // first four bytes
        //
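        // (a ref therefore starts as 4 bytes of DBT size plus 1 infinity
        // byte; the loop below adds the maximum packed length of each
        // primary key part)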
1725 ref_length = sizeof(uint32_t) + sizeof(uchar);
1726 KEY_PART_INFO* key_part = table->key_info[primary_key].key_part;
1727 KEY_PART_INFO* end =
1728 key_part + table->key_info[primary_key].user_defined_key_parts;
1729 for (; key_part != end; key_part++) {
1730 ref_length += key_part->field->max_packed_col_length(key_part->length);
1731 TOKU_TYPE toku_type = mysql_to_toku_type(key_part->field);
1732 if (toku_type == toku_type_fixstring ||
1733 toku_type == toku_type_varstring ||
1734 toku_type == toku_type_blob
1735 )
1736 {
1737 share->pk_has_string = true;
1738 }
1739 }
1740 share->status |= STATUS_PRIMARY_KEY_INIT;
1741 }
1742 share->ref_length = ref_length;
1743
1744 error = estimate_num_rows(share->file, &num_rows, txn);
1745 //
1746 // estimate_num_rows should not fail under normal conditions
1747 //
1748 if (error == 0) {
1749 share->set_row_count(num_rows, true);
1750 } else {
1751 goto exit;
1752 }
1753 //
1754 // initialize auto increment data
1755 //
1756 share->has_auto_inc = has_auto_increment_flag(&share->ai_field_index);
1757 if (share->has_auto_inc) {
1758 init_auto_increment();
1759 }
1760
1761 if (may_table_be_empty(txn)) {
1762 share->try_table_lock = true;
1763 } else {
1764 share->try_table_lock = false;
1765 }
1766
1767 share->num_DBs = table_share->keys + tokudb_test(hidden_primary_key);
1768
1769 init_hidden_prim_key_info(txn);
1770
1771 // initialize cardinality info from the status dictionary
1772 {
1773 uint32_t rec_per_keys = tokudb::compute_total_key_parts(table_share);
1774 uint64_t* rec_per_key =
1775 (uint64_t*)tokudb::memory::malloc(
1776 rec_per_keys * sizeof(uint64_t),
1777 MYF(MY_FAE));
1778 error =
1779 tokudb::get_card_from_status(
1780 share->status_block,
1781 txn,
1782 rec_per_keys,
1783 rec_per_key);
1784 if (error) {
1785 memset(rec_per_key, 0, sizeof(ulonglong) * rec_per_keys);
1786 }
1787 share->init_cardinality_counts(rec_per_keys, rec_per_key);
1788 }
1789
1790 error = 0;
1791 exit:
1792 if (do_commit && txn) {
1793 commit_txn(txn,0);
1794 }
1795 return error;
1796 }
1797
1798 //
1799 // Creates and opens a handle to a table which already exists in a tokudb
1800 // database.
1801 // Parameters:
1802 // [in] name - table name
1803 // mode - seems to specify if table is read only
1804 // test_if_locked - unused
1805 // Returns:
1806 // 0 on success
1807 // 1 on error
1808 //
open(const char * name,int mode,uint test_if_locked)1809 int ha_tokudb::open(const char *name, int mode, uint test_if_locked) {
1810 TOKUDB_HANDLER_DBUG_ENTER("%s %o %u", name, mode, test_if_locked);
1811 THD* thd = ha_thd();
1812
1813 int error = 0;
1814 int ret_val = 0;
1815
1816 transaction = NULL;
1817 cursor = NULL;
1818
1819
1820 /* Open primary key */
1821 hidden_primary_key = 0;
1822 if ((primary_key = table_share->primary_key) >= MAX_KEY) {
1823 // No primary key
1824 primary_key = table_share->keys;
1825 key_used_on_scan = MAX_KEY;
1826 hidden_primary_key = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
1827 ref_length = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t);
1828 }
1829 else {
1830 key_used_on_scan = primary_key;
1831 }
1832
1833 /* Need some extra memory in case of packed keys */
1834 // the "+ 1" is for the first byte that states +/- infinity
1835 // multiply everything by 2 to account for clustered keys having a key and primary key together
1836 max_key_length = 2*(table_share->max_key_length + MAX_REF_PARTS * 3 + sizeof(uchar));
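    // Worked example (illustrative; assumes MAX_REF_PARTS is 16 on this
    // build): for table_share->max_key_length == 100 this allocates
    //   2 * (100 + 16*3 + 1) == 298 bytes,
    // enough for a clustered key that carries both the secondary key and the
    // primary key, each with its leading infinity byte and per-part overhead.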
1837 alloc_ptr = tokudb::memory::multi_malloc(
1838 MYF(MY_WME),
1839 &key_buff, max_key_length,
1840 &key_buff2, max_key_length,
1841 &key_buff3, max_key_length,
1842 &key_buff4, max_key_length,
1843 &prelocked_left_range, max_key_length,
1844 &prelocked_right_range, max_key_length,
1845 &primary_key_buff, (hidden_primary_key ? 0 : max_key_length),
1846 &fixed_cols_for_query, table_share->fields*sizeof(uint32_t),
1847 &var_cols_for_query, table_share->fields*sizeof(uint32_t),
1848 NullS);
1849 if (alloc_ptr == NULL) {
1850 ret_val = 1;
1851 goto exit;
1852 }
1853
1854 size_range_query_buff = tokudb::sysvars::read_buf_size(thd);
1855 range_query_buff =
1856 (uchar*)tokudb::memory::malloc(size_range_query_buff, MYF(MY_WME));
1857 if (range_query_buff == NULL) {
1858 ret_val = 1;
1859 goto exit;
1860 }
1861
1862 alloced_rec_buff_length = table_share->rec_buff_length +
1863 table_share->fields;
1864 rec_buff = (uchar *) tokudb::memory::malloc(
1865 alloced_rec_buff_length,
1866 MYF(MY_WME));
1867 if (rec_buff == NULL) {
1868 ret_val = 1;
1869 goto exit;
1870 }
1871
1872 alloced_update_rec_buff_length = alloced_rec_buff_length;
1873 rec_update_buff = (uchar*)tokudb::memory::malloc(
1874 alloced_update_rec_buff_length,
1875 MYF(MY_WME));
1876 if (rec_update_buff == NULL) {
1877 ret_val = 1;
1878 goto exit;
1879 }
1880
1881 // lookup or create share
1882 share = TOKUDB_SHARE::get_share(name, &lock, true);
1883 assert_always(share);
1884
1885 if (share->state() != TOKUDB_SHARE::OPENED) {
1886 // means we're responsible for the transition to OPENED, ERROR or CLOSED
1887
1888 ret_val = allocate_key_and_col_info(table_share, &share->kc_info);
1889 if (ret_val == 0) {
1890 ret_val = initialize_share(name, mode);
1891 }
1892
1893 if (ret_val == 0) {
1894 share->set_state(TOKUDB_SHARE::OPENED);
1895 } else {
1896 free_key_and_col_info(&share->kc_info);
1897 share->set_state(TOKUDB_SHARE::ERROR);
1898 }
1899 share->unlock();
1900 } else {
1901 // got an already OPENED instance
1902 share->unlock();
1903 }
1904
1905 if (share->state() == TOKUDB_SHARE::ERROR) {
1906 share->release();
1907 goto exit;
1908 }
1909
1910 assert_always(share->state() == TOKUDB_SHARE::OPENED);
1911
1912 ref_length = share->ref_length; // If second open
1913
1914 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
1915 TOKUDB_DEBUG_OPEN,
1916 "tokudbopen:%p:share=%p:file=%p:table=%p:table->s=%p:%d",
1917 this,
1918 share,
1919 share->file,
1920 table,
1921 table->s,
1922 share->use_count());
1923
1924 key_read = false;
1925 stats.block_size = 1<<20; // QQQ Tokudb DB block size
1926
1927 info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
1928
1929 exit:
1930 if (ret_val) {
1931 tokudb::memory::free(range_query_buff);
1932 range_query_buff = NULL;
1933 tokudb::memory::free(alloc_ptr);
1934 alloc_ptr = NULL;
1935 tokudb::memory::free(rec_buff);
1936 rec_buff = NULL;
1937 tokudb::memory::free(rec_update_buff);
1938 rec_update_buff = NULL;
1939
1940 if (error) {
1941 my_errno = error;
1942 }
1943 }
1944 TOKUDB_HANDLER_DBUG_RETURN(ret_val);
1945 }
1946
1947 //
1948 // estimate the number of rows in a DB
1949 // Parameters:
1950 // [in] db - DB whose number of rows will be estimated
1951 // [out] num_rows - number of estimated rows in db
1952 // Returns:
1953 // 0 on success
1954 // error otherwise
1955 //
1956 int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) {
1957 int error = ENOSYS;
1958 bool do_commit = false;
1959 DB_BTREE_STAT64 dict_stats;
1960 DB_TXN* txn_to_use = NULL;
1961
1962 if (txn == NULL) {
1963 error = txn_begin(db_env, 0, &txn_to_use, DB_READ_UNCOMMITTED, ha_thd());
1964 if (error) goto cleanup;
1965 do_commit = true;
1966 }
1967 else {
1968 txn_to_use = txn;
1969 }
1970
1971 error = db->stat64(db, txn_to_use, &dict_stats);
1972 if (error) { goto cleanup; }
1973
1974 *num_rows = dict_stats.bt_ndata;
1975 error = 0;
1976 cleanup:
1977 if (do_commit) {
1978 commit_txn(txn_to_use, 0);
1979 txn_to_use = NULL;
1980 }
1981 return error;
1982 }
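// Usage sketch for estimate_num_rows (mirrors the call in initialize_share;
// when txn is NULL a DB_READ_UNCOMMITTED transaction is created just for the
// stat64 call and committed before returning):
//
//   uint64_t num_rows = 0;
//   int r = estimate_num_rows(share->file, &num_rows, txn);
//   if (r == 0)
//       share->set_row_count(num_rows, true);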
1983
1984
1985 int ha_tokudb::write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn) {
1986 return write_metadata(db, &curr_key_data, sizeof curr_key_data, data, size, txn);
1987 }
1988
1989 int ha_tokudb::remove_from_status(DB* db, HA_METADATA_KEY curr_key_data, DB_TXN* txn) {
1990 return remove_metadata(db, &curr_key_data, sizeof curr_key_data, txn);
1991 }
1992
1993 int ha_tokudb::remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction) {
1994 int error;
1995 DBT key;
1996 DB_TXN* txn = NULL;
1997 bool do_commit = false;
1998 //
1999 // transaction to be used for putting metadata into status.tokudb
2000 //
2001 if (transaction == NULL) {
2002 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
2003 if (error) {
2004 goto cleanup;
2005 }
2006 do_commit = true;
2007 }
2008 else {
2009 txn = transaction;
2010 }
2011
2012 memset(&key, 0, sizeof(key));
2013 key.data = key_data;
2014 key.size = key_size;
2015 error = db->del(db, txn, &key, DB_DELETE_ANY);
2016 if (error) {
2017 goto cleanup;
2018 }
2019
2020 error = 0;
2021 cleanup:
2022 if (do_commit && txn) {
2023 if (!error) {
2024 commit_txn(txn, DB_TXN_NOSYNC);
2025 }
2026 else {
2027 abort_txn(txn);
2028 }
2029 }
2030 return error;
2031 }
2032
2033 //
2034 // helper function to write a piece of metadata in to status.tokudb
2035 //
2036 int ha_tokudb::write_metadata(DB* db, void* key_data, uint key_size, void* val_data, uint val_size, DB_TXN* transaction) {
2037 int error;
2038 DBT key;
2039 DBT value;
2040 DB_TXN* txn = NULL;
2041 bool do_commit = false;
2042 //
2043 // transaction to be used for putting metadata into status.tokudb
2044 //
2045 if (transaction == NULL) {
2046 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
2047 if (error) {
2048 goto cleanup;
2049 }
2050 do_commit = true;
2051 }
2052 else {
2053 txn = transaction;
2054 }
2055
2056 memset(&key, 0, sizeof(key));
2057 memset(&value, 0, sizeof(value));
2058 key.data = key_data;
2059 key.size = key_size;
2060 value.data = val_data;
2061 value.size = val_size;
2062 error = db->put(db, txn, &key, &value, 0);
2063 if (error) {
2064 goto cleanup;
2065 }
2066
2067 error = 0;
2068 cleanup:
2069 if (do_commit && txn) {
2070 if (!error) {
2071 commit_txn(txn, DB_TXN_NOSYNC);
2072 }
2073 else {
2074 abort_txn(txn);
2075 }
2076 }
2077 return error;
2078 }
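// Usage sketch: the write_to_status/remove_from_status wrappers above store
// fixed-size values in status.tokudb keyed by HA_METADATA_KEY, e.g. the auto
// increment high-water mark (see update_max_auto_inc below):
//
//   ulonglong val = 100;
//   int r = write_to_status(share->status_block, hatoku_max_ai,
//                           &val, sizeof(val), txn);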
2079
2080 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
2081 int ha_tokudb::write_frm_data(DB* db, DB_TXN* txn, const char* frm_name) {
2082 TOKUDB_HANDLER_DBUG_ENTER("%p %p %s", db, txn, frm_name);
2083
2084 uchar* frm_data = NULL;
2085 size_t frm_len = 0;
2086 int error = 0;
2087
2088 #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099
2089 error = table_share->read_frm_image((const uchar**)&frm_data,&frm_len);
2090 if (error) { goto cleanup; }
2091 #else
2092 error = readfrm(frm_name,&frm_data,&frm_len);
2093 if (error) { goto cleanup; }
2094 #endif
2095
2096 error = write_to_status(db,hatoku_frm_data,frm_data,(uint)frm_len, txn);
2097 if (error) { goto cleanup; }
2098
2099 error = 0;
2100 cleanup:
2101 tokudb::memory::free(frm_data);
2102 TOKUDB_HANDLER_DBUG_RETURN(error);
2103 }
2104
2105 int ha_tokudb::remove_frm_data(DB* db, DB_TXN* txn) {
2106 return remove_from_status(db, hatoku_frm_data, txn);
2107 }
2108
2109 static int smart_dbt_callback_verify_frm(TOKUDB_UNUSED(DBT const* key),
2110 DBT const* row,
2111 void* context) {
2112 DBT* stored_frm = (DBT *)context;
2113 stored_frm->size = row->size;
2114 stored_frm->data = (uchar *)tokudb::memory::malloc(row->size, MYF(MY_WME));
2115 assert_always(stored_frm->data);
2116 memcpy(stored_frm->data, row->data, row->size);
2117 return 0;
2118 }
2119
2120 int ha_tokudb::verify_frm_data(const char* frm_name, DB_TXN* txn) {
2121 TOKUDB_HANDLER_DBUG_ENTER("%s", frm_name);
2122 uchar* mysql_frm_data = NULL;
2123 size_t mysql_frm_len = 0;
2124 DBT key = {};
2125 DBT stored_frm = {};
2126 int error = 0;
2127 HA_METADATA_KEY curr_key = hatoku_frm_data;
2128
2129 // get the frm data from MySQL
2130 #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099
2131 error = table_share->read_frm_image((const uchar**)&mysql_frm_data,&mysql_frm_len);
2132 if (error) {
2133 goto cleanup;
2134 }
2135 #else
2136 error = readfrm(frm_name,&mysql_frm_data,&mysql_frm_len);
2137 if (error) {
2138 goto cleanup;
2139 }
2140 #endif
2141
2142 key.data = &curr_key;
2143 key.size = sizeof(curr_key);
2144 error = share->status_block->getf_set(
2145 share->status_block,
2146 txn,
2147 0,
2148 &key,
2149 smart_dbt_callback_verify_frm,
2150 &stored_frm
2151 );
2152 if (error == DB_NOTFOUND) {
2153 // if not found, write it
2154 error = write_frm_data(share->status_block, txn, frm_name);
2155 goto cleanup;
2156 } else if (error) {
2157 goto cleanup;
2158 }
2159
2160 if (stored_frm.size != mysql_frm_len || memcmp(stored_frm.data, mysql_frm_data, stored_frm.size)) {
2161 error = HA_ERR_TABLE_DEF_CHANGED;
2162 goto cleanup;
2163 }
2164
2165 error = 0;
2166 cleanup:
2167 tokudb::memory::free(mysql_frm_data);
2168 tokudb::memory::free(stored_frm.data);
2169 TOKUDB_HANDLER_DBUG_RETURN(error);
2170 }
2171 #endif // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
2172
2173 //
2174 // Updates status.tokudb with a new max value used for the auto increment column
2175 // Parameters:
2176 // [in] db - this will always be status.tokudb
2177 // val - value to store
2178 // Returns:
2179 // 0 on success, error otherwise
2180 //
2181 //
2182 int ha_tokudb::update_max_auto_inc(DB* db, ulonglong val) {
2183 return write_to_status(db,hatoku_max_ai,&val,sizeof(val), NULL);
2184 }
2185
2186 //
2187 // Writes the initial auto increment value, as specified by create table
2188 // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
2189 // then the value 100 will be stored here in val
2190 // Parameters:
2191 // [in] db - this will always be status.tokudb
2192 // val - value to store
2193 // Returns:
2194 // 0 on success, error otherwise
2195 //
2196 //
2197 int ha_tokudb::write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn) {
2198 return write_to_status(db,hatoku_ai_create_value,&val,sizeof(val), txn);
2199 }
2200
2201
2202 //
2203 // Closes a handle to a table.
2204 //
2205 int ha_tokudb::close() {
2206 TOKUDB_HANDLER_DBUG_ENTER("");
2207 int r = __close();
2208 TOKUDB_HANDLER_DBUG_RETURN(r);
2209 }
2210
2211 int ha_tokudb::__close() {
2212 TOKUDB_HANDLER_DBUG_ENTER("");
2213 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "close:%p", this);
2214 tokudb::memory::free(rec_buff);
2215 tokudb::memory::free(rec_update_buff);
2216 tokudb::memory::free(blob_buff);
2217 tokudb::memory::free(alloc_ptr);
2218 tokudb::memory::free(range_query_buff);
2219 for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
2220 toku_dbt_array_destroy(&mult_key_dbt_array[i]);
2221 }
2222 for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
2223 toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
2224 }
2225 rec_buff = NULL;
2226 rec_update_buff = NULL;
2227 alloc_ptr = NULL;
2228 ha_tokudb::reset();
2229 int retval = share->release();
2230 TOKUDB_HANDLER_DBUG_RETURN(retval);
2231 }
2232
2233 //
2234 // Reallocate record buffer (rec_buff) if needed
2235 // If not needed, does nothing
2236 // Parameters:
2237 // length - size of buffer required for rec_buff
2238 //
2239 bool ha_tokudb::fix_rec_buff_for_blob(ulong length) {
2240 if (!rec_buff || (length > alloced_rec_buff_length)) {
2241 uchar* newptr = (uchar*)tokudb::memory::realloc(
2242 (void*)rec_buff,
2243 length,
2244 MYF(MY_ALLOW_ZERO_PTR));
2245 if (!newptr)
2246 return 1;
2247 rec_buff = newptr;
2248 alloced_rec_buff_length = length;
2249 }
2250 return 0;
2251 }
2252
2253 //
2254 // Reallocate record buffer (rec_buff) if needed
2255 // If not needed, does nothing
2256 // Parameters:
2257 // length - size of buffer required for rec_buff
2258 //
2259 bool ha_tokudb::fix_rec_update_buff_for_blob(ulong length) {
2260 if (!rec_update_buff || (length > alloced_update_rec_buff_length)) {
2261 uchar* newptr = (uchar*)tokudb::memory::realloc(
2262 (void*)rec_update_buff,
2263 length,
2264 MYF(MY_ALLOW_ZERO_PTR));
2265 if (!newptr)
2266 return 1;
2267 rec_update_buff= newptr;
2268 alloced_update_rec_buff_length = length;
2269 }
2270 return 0;
2271 }
2272
2273 /* Calculate max length needed for row */
2274 ulong ha_tokudb::max_row_length(const uchar * buf) {
2275 ulong length = table_share->reclength + table_share->fields * 2;
2276 uint *ptr, *end;
2277 for (ptr = table_share->blob_field, end = ptr + table_share->blob_fields; ptr != end; ptr++) {
2278 Field_blob *blob = ((Field_blob *) table->field[*ptr]);
2279 length += blob->get_length((uchar *) (buf + field_offset(blob, table))) + 2;
2280 }
2281 return length;
2282 }
2283
2284 /*
2285 */
2286 //
2287 // take the row passed in as a DBT*, and convert it into a row in MySQL format in record
2288 // Pack a row for storage.
2289 // If the row is of fixed length, just store the row 'as is'.
2290 // If not, we will generate a packed row suitable for storage.
2291 // This will only fail if we don't have enough memory to pack the row,
2292 // which may only happen in rows with blobs, as the default row length is
2293 // pre-allocated.
2294 // Parameters:
2295 // [out] row - row stored in DBT to be converted
2296 // [out] buf - buffer where row is packed
2297 // [in] record - row in MySQL format
2298 //
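// Layout sketch (illustrative, for a hypothetical table with fixed, variable
// and blob columns): the packed row produced below is
//
//   [null bytes][fixed fields][var field end-offsets][var field data][blobs]
//
// Each end-offset is num_offset_bytes wide and stores where the matching
// variable field's data ends, relative to the start of the variable data
// section; blob data is appended last with its own length prefix.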
2299
2300 int ha_tokudb::pack_row_in_buff(
2301 DBT * row,
2302 const uchar* record,
2303 uint index,
2304 uchar* row_buff
2305 )
2306 {
2307 uchar* fixed_field_ptr = NULL;
2308 uchar* var_field_offset_ptr = NULL;
2309 uchar* start_field_data_ptr = NULL;
2310 uchar* var_field_data_ptr = NULL;
2311 int r = ENOSYS;
2312 memset((void *) row, 0, sizeof(*row));
2313
2314 my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
2315
2316 // Copy null bytes
2317 memcpy(row_buff, record, table_share->null_bytes);
2318 fixed_field_ptr = row_buff + table_share->null_bytes;
2319 var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
2320 start_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2321 var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2322
2323 // assert that when the hidden primary key exists, primary_key_offsets is NULL
2324 for (uint i = 0; i < table_share->fields; i++) {
2325 Field* field = table->field[i];
2326 uint curr_field_offset = field_offset(field, table);
2327 if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
2328 continue;
2329 }
2330 if (is_fixed_field(&share->kc_info, i)) {
2331 fixed_field_ptr = pack_fixed_field(
2332 fixed_field_ptr,
2333 record + curr_field_offset,
2334 share->kc_info.field_lengths[i]
2335 );
2336 }
2337 else if (is_variable_field(&share->kc_info, i)) {
2338 var_field_data_ptr = pack_var_field(
2339 var_field_offset_ptr,
2340 var_field_data_ptr,
2341 start_field_data_ptr,
2342 record + curr_field_offset,
2343 share->kc_info.length_bytes[i],
2344 share->kc_info.num_offset_bytes
2345 );
2346 var_field_offset_ptr += share->kc_info.num_offset_bytes;
2347 }
2348 }
2349
2350 for (uint i = 0; i < share->kc_info.num_blobs; i++) {
2351 Field* field = table->field[share->kc_info.blob_fields[i]];
2352 var_field_data_ptr = pack_toku_field_blob(
2353 var_field_data_ptr,
2354 record + field_offset(field, table),
2355 field
2356 );
2357 }
2358
2359 row->data = row_buff;
2360 row->size = (size_t) (var_field_data_ptr - row_buff);
2361 r = 0;
2362
2363 dbug_tmp_restore_column_map(table->write_set, old_map);
2364 return r;
2365 }
2366
2367
2368 int ha_tokudb::pack_row(
2369 DBT * row,
2370 const uchar* record,
2371 uint index
2372 )
2373 {
2374 return pack_row_in_buff(row,record,index,rec_buff);
2375 }
2376
2377 int ha_tokudb::pack_old_row_for_update(
2378 DBT * row,
2379 const uchar* record,
2380 uint index
2381 )
2382 {
2383 return pack_row_in_buff(row,record,index,rec_update_buff);
2384 }
2385
2386
2387 int ha_tokudb::unpack_blobs(
2388 uchar* record,
2389 const uchar* from_tokudb_blob,
2390 uint32_t num_bytes,
2391 bool check_bitmap
2392 )
2393 {
2394 uint error = 0;
2395 uchar* ptr = NULL;
2396 const uchar* buff = NULL;
2397 //
2398     // assert that num_bytes > 0 implies share->kc_info.num_blobs > 0
2399 //
2400 assert_always( !((share->kc_info.num_blobs == 0) && (num_bytes > 0)) );
2401 if (num_bytes > num_blob_bytes) {
2402 ptr = (uchar*)tokudb::memory::realloc(
2403 (void*)blob_buff, num_bytes,
2404 MYF(MY_ALLOW_ZERO_PTR));
2405 if (ptr == NULL) {
2406 error = ENOMEM;
2407 goto exit;
2408 }
2409 blob_buff = ptr;
2410 num_blob_bytes = num_bytes;
2411 }
2412
2413 memcpy(blob_buff, from_tokudb_blob, num_bytes);
2414 buff= blob_buff;
2415 for (uint i = 0; i < share->kc_info.num_blobs; i++) {
2416 uint32_t curr_field_index = share->kc_info.blob_fields[i];
2417 bool skip = check_bitmap ?
2418 !(bitmap_is_set(table->read_set,curr_field_index) ||
2419 bitmap_is_set(table->write_set,curr_field_index)) :
2420 false;
2421 Field* field = table->field[curr_field_index];
2422 uint32_t len_bytes = field->row_pack_length();
2423 const uchar* end_buff = unpack_toku_field_blob(
2424 record + field_offset(field, table),
2425 buff,
2426 len_bytes,
2427 skip
2428 );
2429 // verify that the pointers to the blobs are all contained within the blob_buff
2430 if (!(blob_buff <= buff && end_buff <= blob_buff + num_bytes)) {
2431 error = -3000000;
2432 goto exit;
2433 }
2434 buff = end_buff;
2435 }
2436 // verify that the entire blob buffer was parsed
2437 if (share->kc_info.num_blobs > 0 && !(num_bytes > 0 && buff == blob_buff + num_bytes)) {
2438 error = -4000000;
2439 goto exit;
2440 }
2441
2442 error = 0;
2443 exit:
2444 return error;
2445 }
2446
2447 //
2448 // take the row passed in as a DBT*, and convert it into a row in MySQL format in record
2449 // Parameters:
2450 // [out] record - row in MySQL format
2451 // [in] row - row stored in DBT to be converted
2452 //
2453 int ha_tokudb::unpack_row(
2454 uchar* record,
2455 DBT const *row,
2456 DBT const *key,
2457 uint index
2458 )
2459 {
2460 //
2461 // two cases, fixed length row, and variable length row
2462 // fixed length row is first below
2463 //
2464 /* Copy null bits */
2465 int error = 0;
2466 const uchar* fixed_field_ptr = (const uchar *) row->data;
2467 const uchar* var_field_offset_ptr = NULL;
2468 const uchar* var_field_data_ptr = NULL;
2469 uint32_t data_end_offset = 0;
2470 memcpy(record, fixed_field_ptr, table_share->null_bytes);
2471 fixed_field_ptr += table_share->null_bytes;
2472
2473 var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
2474 var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2475
2476 //
2477 // unpack the key, if necessary
2478 //
2479 if (!(hidden_primary_key && index == primary_key)) {
2480 unpack_key(record,key,index);
2481 }
2482
2483 uint32_t last_offset = 0;
2484 //
2485 // we have two methods of unpacking, one if we need to unpack the entire row
2486 // the second if we unpack a subset of the entire row
2487 // first method here is if we unpack the entire row
2488 //
2489 if (unpack_entire_row) {
2490 //
2491 // fill in parts of record that are not part of the key
2492 //
2493 for (uint i = 0; i < table_share->fields; i++) {
2494 Field* field = table->field[i];
2495 if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
2496 continue;
2497 }
2498
2499 if (is_fixed_field(&share->kc_info, i)) {
2500 fixed_field_ptr = unpack_fixed_field(
2501 record + field_offset(field, table),
2502 fixed_field_ptr,
2503 share->kc_info.field_lengths[i]
2504 );
2505 }
2506 //
2507 // here, we DO modify var_field_data_ptr or var_field_offset_ptr
2508 // as we unpack variable sized fields
2509 //
2510 else if (is_variable_field(&share->kc_info, i)) {
2511 switch (share->kc_info.num_offset_bytes) {
2512 case (1):
2513 data_end_offset = var_field_offset_ptr[0];
2514 break;
2515 case (2):
2516 data_end_offset = uint2korr(var_field_offset_ptr);
2517 break;
2518 default:
2519 assert_unreachable();
2520 }
2521 unpack_var_field(
2522 record + field_offset(field, table),
2523 var_field_data_ptr,
2524 data_end_offset - last_offset,
2525 share->kc_info.length_bytes[i]
2526 );
2527 var_field_offset_ptr += share->kc_info.num_offset_bytes;
2528 var_field_data_ptr += data_end_offset - last_offset;
2529 last_offset = data_end_offset;
2530 }
2531 }
2532 error = unpack_blobs(
2533 record,
2534 var_field_data_ptr,
2535 row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
2536 false
2537 );
2538 if (error) {
2539 goto exit;
2540 }
2541 }
2542 //
2543 // in this case, we unpack only what is specified
2544 // in fixed_cols_for_query and var_cols_for_query
2545 //
2546 else {
2547 //
2548 // first the fixed fields
2549 //
2550 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
2551 uint field_index = fixed_cols_for_query[i];
2552 Field* field = table->field[field_index];
2553 unpack_fixed_field(
2554 record + field_offset(field, table),
2555 fixed_field_ptr + share->kc_info.cp_info[index][field_index].col_pack_val,
2556 share->kc_info.field_lengths[field_index]
2557 );
2558 }
2559
2560 //
2561 // now the var fields
2562 // here, we do NOT modify var_field_data_ptr or var_field_offset_ptr
2563 //
2564 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
2565 uint field_index = var_cols_for_query[i];
2566 Field* field = table->field[field_index];
2567 uint32_t var_field_index = share->kc_info.cp_info[index][field_index].col_pack_val;
2568 uint32_t data_start_offset;
2569 uint32_t field_len;
2570
2571 get_var_field_info(
2572 &field_len,
2573 &data_start_offset,
2574 var_field_index,
2575 var_field_offset_ptr,
2576 share->kc_info.num_offset_bytes
2577 );
2578
2579 unpack_var_field(
2580 record + field_offset(field, table),
2581 var_field_data_ptr + data_start_offset,
2582 field_len,
2583 share->kc_info.length_bytes[field_index]
2584 );
2585 }
2586
2587 if (read_blobs) {
2588 //
2589 // now the blobs
2590 //
2591 get_blob_field_info(
2592 &data_end_offset,
2593 share->kc_info.mcp_info[index].len_of_offsets,
2594 var_field_data_ptr,
2595 share->kc_info.num_offset_bytes
2596 );
2597
2598 var_field_data_ptr += data_end_offset;
2599 error = unpack_blobs(
2600 record,
2601 var_field_data_ptr,
2602 row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
2603 true
2604 );
2605 if (error) {
2606 goto exit;
2607 }
2608 }
2609 }
2610 error = 0;
2611 exit:
2612 return error;
2613 }
2614
2615 uint32_t ha_tokudb::place_key_into_mysql_buff(
2616 KEY* key_info,
2617 uchar* record,
2618 uchar* data) {
2619
2620 KEY_PART_INFO* key_part = key_info->key_part;
2621 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2622 uchar* pos = data;
2623
2624 for (; key_part != end; key_part++) {
2625 if (key_part->field->null_bit) {
2626 uint null_offset = get_null_offset(table, key_part->field);
2627 if (*pos++ == NULL_COL_VAL) { // Null value
2628 //
2629 // We don't need to reset the record data as we will not access it
2630 // if the null data is set
2631 //
2632 record[null_offset] |= key_part->field->null_bit;
2633 continue;
2634 }
2635 record[null_offset] &= ~key_part->field->null_bit;
2636 }
2637 #if !defined(MARIADB_BASE_VERSION)
2638 //
2639 // HOPEFULLY TEMPORARY
2640 //
2641 assert_always(table->s->db_low_byte_first);
2642 #endif
2643 pos = unpack_toku_key_field(
2644 record + field_offset(key_part->field, table),
2645 pos,
2646 key_part->field,
2647 key_part->length
2648 );
2649 }
2650 return pos-data;
2651 }
2652
2653 //
2654 // Store the key and the primary key into the row
2655 // Parameters:
2656 // [out] record - key stored in MySQL format
2657 // [in] key - key stored in DBT to be converted
2658 // index -index into key_file that represents the DB
2659 // unpacking a key of
2660 //
2661 void ha_tokudb::unpack_key(uchar * record, DBT const *key, uint index) {
2662 uint32_t bytes_read;
2663 uchar *pos = (uchar *) key->data + 1;
2664 bytes_read = place_key_into_mysql_buff(
2665 &table->key_info[index],
2666 record,
2667 pos
2668 );
2669 if( (index != primary_key) && !hidden_primary_key) {
2670 //
2671 // also unpack primary key
2672 //
2673 place_key_into_mysql_buff(
2674 &table->key_info[primary_key],
2675 record,
2676 pos+bytes_read
2677 );
2678 }
2679 }
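// Key layout sketch (illustrative): a secondary-index key as unpacked above is
//
//   [infinity byte][SK part 1]..[SK part n][PK part 1]..[PK part m]
//
// where each nullable part is preceded by a NULL_COL_VAL/NONNULL_COL_VAL
// marker byte. `pos` starts just past the infinity byte, and bytes_read is
// how far place_key_into_mysql_buff advanced while unpacking the SK parts.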
2680
2681 uint32_t ha_tokudb::place_key_into_dbt_buff(
2682 KEY* key_info,
2683 uchar* buff,
2684 const uchar* record,
2685 bool* has_null,
2686 int key_length) {
2687
2688 KEY_PART_INFO* key_part = key_info->key_part;
2689 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2690 uchar* curr_buff = buff;
2691 *has_null = false;
2692 for (; key_part != end && key_length > 0; key_part++) {
2693 //
2694         // accessing key_part->field->null_bit instead of key_part->null_bit
2695 // because key_part->null_bit is not set in add_index
2696 // filed ticket 862 to look into this
2697 //
2698 if (key_part->field->null_bit) {
2699 /* Store 0 if the key part is a NULL part */
2700 uint null_offset = get_null_offset(table, key_part->field);
2701 if (record[null_offset] & key_part->field->null_bit) {
2702 *curr_buff++ = NULL_COL_VAL;
2703 *has_null = true;
2704 continue;
2705 }
2706 *curr_buff++ = NONNULL_COL_VAL; // Store NOT NULL marker
2707 }
2708 #if !defined(MARIADB_BASE_VERSION)
2709 //
2710 // HOPEFULLY TEMPORARY
2711 //
2712 assert_always(table->s->db_low_byte_first);
2713 #endif
2714 //
2715         // accessing field_offset(key_part->field) instead of key_part->offset
2716 // because key_part->offset is SET INCORRECTLY in add_index
2717 // filed ticket 862 to look into this
2718 //
2719 curr_buff = pack_toku_key_field(
2720 curr_buff,
2721 (uchar *) (record + field_offset(key_part->field, table)),
2722 key_part->field,
2723 key_part->length
2724 );
2725 key_length -= key_part->length;
2726 }
2727 return curr_buff - buff;
2728 }
2729
2730
2731
2732 //
2733 // Create a packed key from a row. This key will be written as such
2734 // to the index tree. This will never fail as the key buffer is pre-allocated.
2735 // Parameters:
2736 // [out] key - DBT that holds the key
2737 // [in] key_info - holds data about the key, such as it's length and offset into record
2738 // [out] buff - buffer that will hold the data for key (unless
2739 // we have a hidden primary key)
2740 // [in] record - row from which to create the key
2741 // key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2742 // Returns:
2743 // the parameter key
2744 //
2745
2746 DBT* ha_tokudb::create_dbt_key_from_key(
2747 DBT * key,
2748 KEY* key_info,
2749 uchar * buff,
2750 const uchar * record,
2751 bool* has_null,
2752 bool dont_pack_pk,
2753 int key_length,
2754 uint8_t inf_byte
2755 )
2756 {
2757 uint32_t size = 0;
2758 uchar* tmp_buff = buff;
2759 my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
2760
2761 key->data = buff;
2762
2763 //
2764 // first put the "infinity" byte at beginning. States if missing columns are implicitly
2765 // positive infinity or negative infinity or zero. For this, because we are creating key
2766 // from a row, there is no way that columns can be missing, so in practice,
2767 // this will be meaningless. Might as well put in a value
2768 //
2769 *tmp_buff++ = inf_byte;
2770 size++;
2771 size += place_key_into_dbt_buff(
2772 key_info,
2773 tmp_buff,
2774 record,
2775 has_null,
2776 key_length
2777 );
2778 if (!dont_pack_pk) {
2779 tmp_buff = buff + size;
2780 if (hidden_primary_key) {
2781 memcpy(tmp_buff, current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2782 size += TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2783 }
2784 else {
2785 bool tmp_bool = false;
2786 size += place_key_into_dbt_buff(
2787 &table->key_info[primary_key],
2788 tmp_buff,
2789 record,
2790 &tmp_bool,
2791 MAX_KEY_LENGTH //this parameter does not matter
2792 );
2793 }
2794 }
2795
2796 key->size = size;
2797 DBUG_DUMP("key", (uchar *) key->data, key->size);
2798 dbug_tmp_restore_column_map(table->write_set, old_map);
2799 return key;
2800 }
2801
2802
2803 //
2804 // Create a packed key from a row. This key will be written as such
2805 // to the index tree. This will never fail as the key buffer is pre-allocated.
2806 // Parameters:
2807 // [out] key - DBT that holds the key
2808 // keynr - index for which to create the key
2809 // [out] buff - buffer that will hold the data for key (unless
2810 // we have a hidden primary key)
2811 // [in] record - row from which to create the key
2812 // [out] has_null - says if the key has a NULL value for one of its columns
2813 // key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2814 // Returns:
2815 // the parameter key
2816 //
2817 DBT *ha_tokudb::create_dbt_key_from_table(
2818 DBT * key,
2819 uint keynr,
2820 uchar * buff,
2821 const uchar * record,
2822 bool* has_null,
2823 int key_length
2824 )
2825 {
2826 TOKUDB_HANDLER_DBUG_ENTER("");
2827 memset((void *) key, 0, sizeof(*key));
2828 if (hidden_primary_key && keynr == primary_key) {
2829 key->data = buff;
2830         memcpy(buff, &current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2831 key->size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2832 *has_null = false;
2833 DBUG_RETURN(key);
2834 }
2835 DBUG_RETURN(create_dbt_key_from_key(key, &table->key_info[keynr],buff,record, has_null, (keynr == primary_key), key_length, COL_ZERO));
2836 }
2837
2838 DBT* ha_tokudb::create_dbt_key_for_lookup(
2839 DBT * key,
2840 KEY* key_info,
2841 uchar * buff,
2842 const uchar * record,
2843 bool* has_null,
2844 int key_length
2845 )
2846 {
2847 TOKUDB_HANDLER_DBUG_ENTER("");
2848 // override the infinity byte, needed in case the pk is a string
2849 // to make sure that the cursor that uses this key properly positions
2850 // it at the right location. If the table stores "D", but we look up for "d",
2851 // and the infinity byte is 0, then we will skip the "D", because
2852 // in bytes, "d" > "D".
2853 DBT* ret = create_dbt_key_from_key(key, key_info, buff, record, has_null, true, key_length, COL_NEG_INF);
2854 DBUG_RETURN(ret);
2855 }
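// Illustrative example: with a case-insensitive collation the table may store
// "D" while the query looks up "d". Byte-wise 'd' > 'D', so a lookup key with
// infinity byte COL_ZERO could position the cursor past the stored row; using
// COL_NEG_INF positions it at or before any row whose key is collation-equal,
// so the "D" row is still found.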
2856
2857 //
2858 // Create a packed key from a MySQL unpacked key (like the one that is
2859 // sent from index_read()). This key is to be used to read a row
2860 // Parameters:
2861 // [out] key - DBT that holds the key
2862 // keynr - index for which to pack the key
2863 // [out] buff - buffer that will hold the data for key
2864 // [in] key_ptr - MySQL unpacked key
2865 // key_length - length of key_ptr
2866 // Returns:
2867 // the parameter key
2868 //
2869 DBT* ha_tokudb::pack_key(
2870 DBT* key,
2871 uint keynr,
2872 uchar* buff,
2873 const uchar* key_ptr,
2874 uint key_length,
2875 int8_t inf_byte) {
2876
2877 TOKUDB_HANDLER_DBUG_ENTER(
2878 "key %p %u:%2.2x inf=%d",
2879 key_ptr,
2880 key_length,
2881 key_length > 0 ? key_ptr[0] : 0,
2882 inf_byte);
2883 #if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2884 if (keynr != primary_key && !tokudb_test(hidden_primary_key)) {
2885 DBUG_RETURN(pack_ext_key(key, keynr, buff, key_ptr, key_length, inf_byte));
2886 }
2887 #endif // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2888 KEY* key_info = &table->key_info[keynr];
2889 KEY_PART_INFO* key_part = key_info->key_part;
2890 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2891 my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);
2892
2893 memset((void *) key, 0, sizeof(*key));
2894 key->data = buff;
2895
2896 // first put the "infinity" byte at beginning. States if missing columns are implicitly
2897 // positive infinity or negative infinity
2898 *buff++ = (uchar)inf_byte;
2899
2900 for (; key_part != end && (int) key_length > 0; key_part++) {
2901 uint offset = 0;
2902 if (key_part->null_bit) {
2903 if (!(*key_ptr == 0)) {
2904 *buff++ = NULL_COL_VAL;
2905 key_length -= key_part->store_length;
2906 key_ptr += key_part->store_length;
2907 continue;
2908 }
2909 *buff++ = NONNULL_COL_VAL;
2910 offset = 1; // Data is at key_ptr+1
2911 }
2912 #if !defined(MARIADB_BASE_VERSION)
2913 assert_always(table->s->db_low_byte_first);
2914 #endif
2915 buff = pack_key_toku_key_field(
2916 buff,
2917 (uchar *) key_ptr + offset,
2918 key_part->field,
2919 key_part->length
2920 );
2921
2922 key_ptr += key_part->store_length;
2923 key_length -= key_part->store_length;
2924 }
2925
2926 key->size = (buff - (uchar *) key->data);
2927 DBUG_DUMP("key", (uchar *) key->data, key->size);
2928 dbug_tmp_restore_column_map(table->write_set, old_map);
2929 DBUG_RETURN(key);
2930 }
2931
2932 #if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2933 DBT* ha_tokudb::pack_ext_key(
2934 DBT* key,
2935 uint keynr,
2936 uchar* buff,
2937 const uchar* key_ptr,
2938 uint key_length,
2939 int8_t inf_byte) {
2940
2941 TOKUDB_HANDLER_DBUG_ENTER("");
2942
2943 // build a list of PK parts that are in the SK. we will use this list to build the
2944 // extended key if necessary.
2945 KEY* pk_key_info = &table->key_info[primary_key];
2946 uint pk_parts = pk_key_info->user_defined_key_parts;
2947 uint pk_next = 0;
2948 struct {
2949 const uchar *key_ptr;
2950 KEY_PART_INFO *key_part;
2951 } pk_info[pk_parts];
2952
2953 KEY* key_info = &table->key_info[keynr];
2954 KEY_PART_INFO* key_part = key_info->key_part;
2955 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2956 my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);
2957
2958 memset((void *) key, 0, sizeof(*key));
2959 key->data = buff;
2960
2961 // first put the "infinity" byte at beginning. States if missing columns are implicitly
2962 // positive infinity or negative infinity
2963 *buff++ = (uchar)inf_byte;
2964
2965 for (; key_part != end && (int) key_length > 0; key_part++) {
2966 // if the SK part is part of the PK, then append it to the list.
2967 if (key_part->field->part_of_key.is_set(primary_key)) {
2968 assert_always(pk_next < pk_parts);
2969 pk_info[pk_next].key_ptr = key_ptr;
2970 pk_info[pk_next].key_part = key_part;
2971 pk_next++;
2972 }
2973 uint offset = 0;
2974 if (key_part->null_bit) {
2975 if (!(*key_ptr == 0)) {
2976 *buff++ = NULL_COL_VAL;
2977 key_length -= key_part->store_length;
2978 key_ptr += key_part->store_length;
2979 continue;
2980 }
2981 *buff++ = NONNULL_COL_VAL;
2982 offset = 1; // Data is at key_ptr+1
2983 }
2984 #if !defined(MARIADB_BASE_VERSION)
2985 assert_always(table->s->db_low_byte_first);
2986 #endif
2987 buff = pack_key_toku_key_field(
2988 buff,
2989 (uchar *) key_ptr + offset,
2990 key_part->field,
2991 key_part->length
2992 );
2993
2994 key_ptr += key_part->store_length;
2995 key_length -= key_part->store_length;
2996 }
2997
2998 if (key_length > 0) {
2999 assert_always(key_part == end);
3000 end = key_info->key_part + get_ext_key_parts(key_info);
3001
3002 // pack PK in order of PK key parts
3003 for (uint pk_index = 0;
3004 key_part != end && (int) key_length > 0 && pk_index < pk_parts;
3005 pk_index++) {
3006 uint i;
3007 for (i = 0; i < pk_next; i++) {
3008 if (pk_info[i].key_part->fieldnr ==
3009 pk_key_info->key_part[pk_index].fieldnr)
3010 break;
3011 }
3012 if (i < pk_next) {
3013 const uchar *this_key_ptr = pk_info[i].key_ptr;
3014 KEY_PART_INFO *this_key_part = pk_info[i].key_part;
3015 buff = pack_key_toku_key_field(
3016 buff,
3017 (uchar*)this_key_ptr,
3018 this_key_part->field,
3019 this_key_part->length);
3020 } else {
3021 buff = pack_key_toku_key_field(
3022 buff,
3023 (uchar*)key_ptr,
3024 key_part->field,
3025 key_part->length);
3026 key_ptr += key_part->store_length;
3027 key_length -= key_part->store_length;
3028 key_part++;
3029 }
3030 }
3031 }
3032
3033 key->size = (buff - (uchar *) key->data);
3034 DBUG_DUMP("key", (uchar *) key->data, key->size);
3035 dbug_tmp_restore_column_map(table->write_set, old_map);
3036 DBUG_RETURN(key);
3037 }
3038 #endif // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
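// Worked example for pack_ext_key (hypothetical schema): with
// PRIMARY KEY(a, b) and secondary KEY(c, a), MySQL passes extended key bytes
// for (c, a, b). The first loop packs the SK parts c and a, saving a's
// position in pk_info because a is also a PK part. The tail loop then appends
// the full PK in PK order: a is re-packed from the saved pk_info pointer and
// b from the remaining key bytes, yielding
//   [inf byte][c][a][a][b]
// i.e. the SK parts followed by the whole primary key, as stored on disk.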
3039
3040 //
3041 // get max used hidden primary key value
3042 //
3043 void ha_tokudb::init_hidden_prim_key_info(DB_TXN *txn) {
3044 TOKUDB_HANDLER_DBUG_ENTER("");
3045 if (!(share->status & STATUS_PRIMARY_KEY_INIT)) {
3046 int error = 0;
3047 DBC* c = NULL;
3048 error = share->key_file[primary_key]->cursor(
3049 share->key_file[primary_key],
3050 txn,
3051 &c,
3052 0);
3053 assert_always(error == 0);
3054 DBT key,val;
3055 memset(&key, 0, sizeof(key));
3056 memset(&val, 0, sizeof(val));
3057 error = c->c_get(c, &key, &val, DB_LAST);
3058 if (error == 0) {
3059 assert_always(key.size == TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
3060 share->auto_ident = hpk_char_to_num((uchar *)key.data);
3061 }
3062 error = c->c_close(c);
3063 assert_always(error == 0);
3064 share->status |= STATUS_PRIMARY_KEY_INIT;
3065 }
3066 TOKUDB_HANDLER_DBUG_VOID_RETURN;
3067 }
3068
3069
3070
3071 /** @brief
3072 Get metadata info stored in status.tokudb
3073 */
3074 int ha_tokudb::get_status(DB_TXN* txn) {
3075 TOKUDB_HANDLER_DBUG_ENTER("");
3076 DBT key, value;
3077 HA_METADATA_KEY curr_key;
3078 int error;
3079
3080 //
3081 // open status.tokudb
3082 //
3083 if (!share->status_block) {
3084 error =
3085 open_status_dictionary(
3086 &share->status_block,
3087 share->full_table_name(),
3088 txn);
3089 if (error) {
3090 goto cleanup;
3091 }
3092 }
3093
3094 //
3095 // transaction to be used for putting metadata into status.tokudb
3096 //
3097 memset(&key, 0, sizeof(key));
3098 memset(&value, 0, sizeof(value));
3099 key.data = &curr_key;
3100 key.size = sizeof(curr_key);
3101 value.flags = DB_DBT_USERMEM;
3102
3103 assert_always(share->status_block);
3104 //
3105 // get version
3106 //
3107 value.ulen = sizeof(share->version);
3108 value.data = &share->version;
3109 curr_key = hatoku_new_version;
3110 error = share->status_block->get(
3111 share->status_block,
3112 txn,
3113 &key,
3114 &value,
3115 0
3116 );
3117 if (error == DB_NOTFOUND) {
3118 //
3119         // hack to handle the issues of going back and forth
3120         // between 5.0.3 and 5.0.4
3121 // the problem with going back and forth
3122 // is with storing the frm file, 5.0.4 stores it, 5.0.3 does not
3123 // so, if a user goes back and forth and alters the schema
3124 // the frm stored can get out of sync with the schema of the table
3125 // This can cause issues.
3126 // To take care of this, we are doing this versioning work here.
3127 // We change the key that stores the version.
3128 // In 5.0.3, it is hatoku_old_version, in 5.0.4 it is hatoku_new_version
3129 // When we encounter a table that does not have hatoku_new_version
3130 // set, we give it the right one, and overwrite the old one with zero.
3131         // This ensures that 5.0.3 cannot open the table once it has been opened by 5.0.4.
3132 //
3133 uint dummy_version = 0;
3134 share->version = HA_TOKU_ORIG_VERSION;
3135 error = write_to_status(
3136 share->status_block,
3137 hatoku_new_version,
3138 &share->version,
3139 sizeof(share->version),
3140 txn
3141 );
3142 if (error) { goto cleanup; }
3143 error = write_to_status(
3144 share->status_block,
3145 hatoku_old_version,
3146 &dummy_version,
3147 sizeof(dummy_version),
3148 txn
3149 );
3150 if (error) { goto cleanup; }
3151 }
3152 else if (error || value.size != sizeof(share->version)) {
3153 if (error == 0) {
3154 error = HA_ERR_INTERNAL_ERROR;
3155 }
3156 goto cleanup;
3157 }
3158 //
3159 // get capabilities
3160 //
3161 curr_key = hatoku_capabilities;
3162 value.ulen = sizeof(share->capabilities);
3163 value.data = &share->capabilities;
3164 error = share->status_block->get(
3165 share->status_block,
3166 txn,
3167 &key,
3168 &value,
3169 0
3170 );
3171 if (error == DB_NOTFOUND) {
3172 share->capabilities= 0;
3173 }
3174     else if (error || value.size != sizeof(share->capabilities)) {
3175 if (error == 0) {
3176 error = HA_ERR_INTERNAL_ERROR;
3177 }
3178 goto cleanup;
3179 }
3180
3181 error = 0;
3182 cleanup:
3183 TOKUDB_HANDLER_DBUG_RETURN(error);
3184 }
3185
3186 /** @brief
3187     Return an estimate of the number of rows in the table.
3188 Used when sorting to allocate buffers and by the optimizer.
3189 This is used in filesort.cc.
3190 */
3191 ha_rows ha_tokudb::estimate_rows_upper_bound() {
3192 TOKUDB_HANDLER_DBUG_ENTER("");
3193 DBUG_RETURN(share->row_count() + HA_TOKUDB_EXTRA_ROWS);
3194 }
3195
3196 //
3197 // Function that compares two primary keys that were saved as part of rnd_pos
3198 // and ::position
3199 //
3200 int ha_tokudb::cmp_ref(const uchar * ref1, const uchar * ref2) {
3201 int ret_val = 0;
3202 bool read_string = false;
3203 ret_val = tokudb_compare_two_keys(
3204 ref1 + sizeof(uint32_t),
3205 *(uint32_t *)ref1,
3206 ref2 + sizeof(uint32_t),
3207 *(uint32_t *)ref2,
3208 (uchar *)share->file->descriptor->dbt.data + 4,
3209 *(uint32_t *)share->file->descriptor->dbt.data - 4,
3210 false,
3211 &read_string
3212 );
3213 return ret_val;
3214 }
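// Ref layout sketch: each ref produced for position/rnd_pos is
//   [4-byte key size][packed primary key bytes]
// so cmp_ref reads the size from the first sizeof(uint32_t) bytes and
// compares the packed keys with the comparison descriptor stored in the main
// dictionary (skipping that descriptor's own 4-byte header).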
3215
3216 bool ha_tokudb::check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) {
3217 //
3218 // This is a horrendous hack for now, as copied by InnoDB.
3219 // This states that if the auto increment create field has changed,
3220 // via a "alter table foo auto_increment=new_val", that this
3221 // change is incompatible, and to rebuild the entire table
3222 // This will need to be fixed
3223 //
3224 if ((info->used_fields & HA_CREATE_USED_AUTO) &&
3225 info->auto_increment_value != 0) {
3226
3227 return COMPATIBLE_DATA_NO;
3228 }
3229 if (table_changes != IS_EQUAL_YES)
3230 return COMPATIBLE_DATA_NO;
3231 return COMPATIBLE_DATA_YES;
3232 }
3233
3234 //
3235 // Method that is called before the beginning of many calls
3236 // to insert rows (ha_tokudb::write_row). There is no guarantee
3237 // that start_bulk_insert is called, however there is a guarantee
3238 // that if start_bulk_insert is called, then end_bulk_insert may be
3239 // called as well.
3240 // Parameters:
3241 // [in] rows - an estimate of the number of rows that will be inserted
3242 // if number of rows is unknown (such as if doing
3243 //         "insert into foo select * from bar"), then rows
3244 // will be 0
3245 //
3246 //
3247 // This function returns true if the table MAY be empty.
3248 // It is NOT meant to be a 100% check for emptiness.
3249 // This is used for a bulk load optimization.
3250 //
3251 bool ha_tokudb::may_table_be_empty(DB_TXN *txn) {
3252 int error;
3253 bool ret_val = false;
3254 DBC* tmp_cursor = NULL;
3255 DB_TXN* tmp_txn = NULL;
3256
3257 const int empty_scan = tokudb::sysvars::empty_scan(ha_thd());
3258 if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_DISABLED)
3259 goto cleanup;
3260
3261 if (txn == NULL) {
3262 error = txn_begin(db_env, 0, &tmp_txn, 0, ha_thd());
3263 if (error) {
3264 goto cleanup;
3265 }
3266 txn = tmp_txn;
3267 }
3268
3269 error = share->file->cursor(share->file, txn, &tmp_cursor, 0);
3270 if (error)
3271 goto cleanup;
3272 tmp_cursor->c_set_check_interrupt_callback(tmp_cursor, tokudb_killed_thd_callback, ha_thd());
3273 if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_LR)
3274 error = tmp_cursor->c_getf_next(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3275 else
3276 error = tmp_cursor->c_getf_prev(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3277 error = map_to_handler_error(error);
3278 if (error == DB_NOTFOUND)
3279 ret_val = true;
3280 else
3281 ret_val = false;
3282 error = 0;
3283
3284 cleanup:
3285 if (tmp_cursor) {
3286 int r = tmp_cursor->c_close(tmp_cursor);
3287 assert_always(r == 0);
3288 tmp_cursor = NULL;
3289 }
3290 if (tmp_txn) {
3291 commit_txn(tmp_txn, 0);
3292 tmp_txn = NULL;
3293 }
3294 return ret_val;
3295 }
3296
3297 #if MYSQL_VERSION_ID >= 100000
3298 void ha_tokudb::start_bulk_insert(ha_rows rows, uint flags) {
3299 TOKUDB_HANDLER_DBUG_ENTER("%llu %u txn %p", (unsigned long long) rows, flags, transaction);
3300 #else
3301 void ha_tokudb::start_bulk_insert(ha_rows rows) {
3302 TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction);
3303 #endif
3304 THD* thd = ha_thd();
3305 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3306 delay_updating_ai_metadata = true;
3307 ai_metadata_update_required = false;
3308 abort_loader = false;
3309
3310 rwlock_t_lock_read(share->_num_DBs_lock);
3311 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3312 num_DBs_locked_in_bulk = true;
3313 lock_count = 0;
3314
3315 if ((rows == 0 || rows > 1) && share->try_table_lock) {
3316 if (tokudb::sysvars::prelock_empty(thd) &&
3317 may_table_be_empty(transaction) &&
3318 transaction != NULL) {
3319 if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR) {
3320 acquire_table_lock(transaction, lock_write);
3321 } else {
3322 mult_dbt_flags[primary_key] = 0;
3323 if (!thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) && !hidden_primary_key) {
3324 mult_put_flags[primary_key] = DB_NOOVERWRITE;
3325 }
3326 uint32_t loader_flags = (tokudb::sysvars::load_save_space(thd)) ?
3327 LOADER_COMPRESS_INTERMEDIATES : 0;
3328
3329 int error = db_env->create_loader(
3330 db_env,
3331 transaction,
3332 &loader,
3333 NULL, // no src_db needed
3334 curr_num_DBs,
3335 share->key_file,
3336 mult_put_flags,
3337 mult_dbt_flags,
3338 loader_flags
3339 );
3340 if (error) {
3341 assert_always(loader == NULL);
3342 goto exit_try_table_lock;
3343 }
3344
3345 lc.thd = thd;
3346 lc.ha = this;
3347
3348 error = loader->set_poll_function(
3349 loader, ha_tokudb::bulk_insert_poll, &lc);
3350 assert_always(!error);
3351
3352 error = loader->set_error_callback(
3353 loader, ha_tokudb::loader_dup, &lc);
3354 assert_always(!error);
3355
3356 trx->stmt_progress.using_loader = true;
3357 }
3358 }
3359 exit_try_table_lock:
3360 share->lock();
3361 share->try_table_lock = false;
3362 share->unlock();
3363 }
3364 TOKUDB_HANDLER_DBUG_VOID_RETURN;
3365 }
3366 int ha_tokudb::bulk_insert_poll(void* extra, float progress) {
3367 LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
3368 if (thd_killed(context->thd)) {
3369 snprintf(context->write_status_msg,
3370 sizeof(context->write_status_msg),
3371 "The process has been killed, aborting bulk load.");
3372 return ER_ABORTING_CONNECTION;
3373 }
3374 float percentage = progress * 100;
3375 snprintf(context->write_status_msg,
3376 sizeof(context->write_status_msg),
3377 "Loading of data t %s about %.1f%% done",
3378 context->ha->share->full_table_name(),
3379 percentage);
3380 thd_proc_info(context->thd, context->write_status_msg);
3381 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
3382 thd_progress_report(context->thd, (unsigned long long)percentage, 100);
3383 #endif
3384 return 0;
3385 }
3386 void ha_tokudb::loader_add_index_err(TOKUDB_UNUSED(DB* db),
3387 TOKUDB_UNUSED(int i),
3388 TOKUDB_UNUSED(int err),
3389 TOKUDB_UNUSED(DBT* key),
3390 TOKUDB_UNUSED(DBT* val),
3391 void* error_extra) {
3392 LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3393 assert_always(context->ha);
3394 context->ha->set_loader_error(err);
3395 }
3396 void ha_tokudb::loader_dup(TOKUDB_UNUSED(DB* db),
3397 TOKUDB_UNUSED(int i),
3398 int err,
3399 DBT* key,
3400 TOKUDB_UNUSED(DBT* val),
3401 void* error_extra) {
3402 LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3403 assert_always(context->ha);
3404 context->ha->set_loader_error(err);
3405 if (err == DB_KEYEXIST) {
3406 context->ha->set_dup_value_for_pk(key);
3407 }
3408 }
3409
3410 //
3411 // Method that is called at the end of many calls to insert rows
3412 // (ha_tokudb::write_row). If start_bulk_insert is called, then
3413 // this is guaranteed to be called.
3414 //
3415 int ha_tokudb::end_bulk_insert(TOKUDB_UNUSED(bool abort)) {
3416 TOKUDB_HANDLER_DBUG_ENTER("");
3417 int error = 0;
3418 THD* thd = ha_thd();
3419 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3420 bool using_loader = (loader != NULL);
3421 if (ai_metadata_update_required) {
3422 share->lock();
3423 error = update_max_auto_inc(share->status_block, share->last_auto_increment);
3424 share->unlock();
3425 if (error) { goto cleanup; }
3426 }
3427 delay_updating_ai_metadata = false;
3428 ai_metadata_update_required = false;
3429 loader_error = 0;
3430 if (loader) {
3431 if (!abort_loader && !thd_killed(thd)) {
3432 DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", {
3433 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3434 thd_proc_info(thd, "DBUG sleep");
3435 my_sleep(20000000);
3436 thd_proc_info(thd, orig_proc_info);
3437 });
3438 error = loader->close(loader);
3439 loader = NULL;
3440 if (error) {
3441 if (thd_killed(thd)) {
3442 my_error(ER_QUERY_INTERRUPTED, MYF(0));
3443 }
3444 goto cleanup;
3445 }
3446
3447 for (uint i = 0; i < table_share->keys; i++) {
3448 if (table_share->key_info[i].flags & HA_NOSAME) {
3449 bool is_unique;
3450 if (i == primary_key && !share->pk_has_string) {
3451 continue;
3452 }
3453 error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i],
3454 DB_PRELOCKED_WRITE);
3455 if (error) goto cleanup;
3456 if (!is_unique) {
3457 error = HA_ERR_FOUND_DUPP_KEY;
3458 last_dup_key = i;
3459 goto cleanup;
3460 }
3461 }
3462 }
3463 }
3464 else {
3465             sprintf(write_status_msg, "aborting bulk load");
3466 thd_proc_info(thd, write_status_msg);
3467 loader->abort(loader);
3468 loader = NULL;
3469 share->try_table_lock = true;
3470 }
3471 }
3472
3473 cleanup:
3474 if (num_DBs_locked_in_bulk) {
3475 share->_num_DBs_lock.unlock();
3476 }
3477 num_DBs_locked_in_bulk = false;
3478 lock_count = 0;
3479 if (loader) {
3480         sprintf(write_status_msg, "aborting bulk load");
3481 thd_proc_info(thd, write_status_msg);
3482 loader->abort(loader);
3483 loader = NULL;
3484 }
3485 abort_loader = false;
3486 memset(&lc, 0, sizeof(lc));
3487 if (error || loader_error) {
3488 my_errno = error ? error : loader_error;
3489 if (using_loader) {
3490 share->try_table_lock = true;
3491 }
3492 }
3493 trx->stmt_progress.using_loader = false;
3494 thd_proc_info(thd, 0);
3495 TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
3496 }
3497
3498 int ha_tokudb::end_bulk_insert() {
3499 return end_bulk_insert( false );
3500 }
3501
3502 int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) {
3503 int error;
3504 DBC* tmp_cursor1 = NULL;
3505 DBC* tmp_cursor2 = NULL;
3506 DBT key1, key2, val, packed_key1, packed_key2;
3507 uint64_t cnt = 0;
3508 char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound.
3509 THD* thd = ha_thd();
3510 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3511 memset(&key1, 0, sizeof(key1));
3512 memset(&key2, 0, sizeof(key2));
3513 memset(&val, 0, sizeof(val));
3514 memset(&packed_key1, 0, sizeof(packed_key1));
3515 memset(&packed_key2, 0, sizeof(packed_key2));
3516 *is_unique = true;
3517
3518 error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE);
3519 if (error) { goto cleanup; }
3520
3521 error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE);
3522 if (error) { goto cleanup; }
3523
3524 error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
3525 if (error == DB_NOTFOUND) {
3526 *is_unique = true;
3527 error = 0;
3528 goto cleanup;
3529 }
3530 else if (error) { goto cleanup; }
3531 error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3532 if (error) { goto cleanup; }
3533
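    // advance tmp_cursor2 a second time so it stays one row ahead of
    // tmp_cursor1; the loop below then compares adjacent key pairs.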
3534 error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3535 if (error == DB_NOTFOUND) {
3536 *is_unique = true;
3537 error = 0;
3538 goto cleanup;
3539 }
3540 else if (error) { goto cleanup; }
3541
3542 while (error != DB_NOTFOUND) {
3543 bool has_null1;
3544 bool has_null2;
3545 int cmp;
3546 place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1);
3547 place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1);
3548
3549 create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1);
3550 create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2);
3551
3552 if (!has_null1 && !has_null2) {
3553 cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2);
3554 if (cmp == 0) {
3555 memcpy(key_buff, key1.data, key1.size);
3556 place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1);
3557 *is_unique = false;
3558 break;
3559 }
3560 }
3561
3562 error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
3563 if (error) { goto cleanup; }
3564 error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3565 if (error && (error != DB_NOTFOUND)) { goto cleanup; }
3566
3567 cnt++;
3568 if ((cnt % 10000) == 0) {
3569 sprintf(
3570 status_msg,
3571 "Verifying index uniqueness: Checked %llu of %llu rows in key-%s.",
3572 (long long unsigned) cnt,
3573 share->row_count(),
3574 key_info->name);
3575 thd_proc_info(thd, status_msg);
3576 if (thd_killed(thd)) {
3577 my_error(ER_QUERY_INTERRUPTED, MYF(0));
3578 error = ER_QUERY_INTERRUPTED;
3579 goto cleanup;
3580 }
3581 }
3582 }
3583
3584 error = 0;
3585
3586 cleanup:
3587 thd_proc_info(thd, orig_proc_info);
3588 if (tmp_cursor1) {
3589 tmp_cursor1->c_close(tmp_cursor1);
3590 tmp_cursor1 = NULL;
3591 }
3592 if (tmp_cursor2) {
3593 tmp_cursor2->c_close(tmp_cursor2);
3594 tmp_cursor2 = NULL;
3595 }
3596 return error;
3597 }
3598
3599 int ha_tokudb::is_val_unique(bool* is_unique, uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn) {
3600 int error = 0;
3601 bool has_null;
3602 DBC* tmp_cursor = NULL;
3603
3604 DBT key; memset((void *)&key, 0, sizeof(key));
3605 create_dbt_key_from_key(&key, key_info, key_buff2, record, &has_null, true, MAX_KEY_LENGTH, COL_NEG_INF);
3606 if (has_null) {
3607 error = 0;
3608 *is_unique = true;
3609 goto cleanup;
3610 }
3611
3612 error = share->key_file[dict_index]->cursor(share->key_file[dict_index], txn, &tmp_cursor, DB_SERIALIZABLE | DB_RMW);
3613 if (error) {
3614 goto cleanup;
3615 } else {
3616 // prelock (key,-inf),(key,+inf) so that the subsequent key lookup does not overlock
3617 uint flags = 0;
3618 DBT key_right; memset(&key_right, 0, sizeof key_right);
3619 create_dbt_key_from_key(&key_right, key_info, key_buff3, record, &has_null, true, MAX_KEY_LENGTH, COL_POS_INF);
3620 error = tmp_cursor->c_set_bounds(tmp_cursor, &key, &key_right, true, DB_NOTFOUND);
3621 if (error == 0) {
3622 flags = DB_PRELOCKED | DB_PRELOCKED_WRITE;
3623 }
3624
3625 // lookup key and check unique prefix
3626 struct smart_dbt_info info;
3627 info.ha = this;
3628 info.buf = NULL;
3629 info.keynr = dict_index;
3630
3631 struct index_read_info ir_info;
3632 ir_info.orig_key = &key;
3633 ir_info.smart_dbt_info = info;
3634
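// smart_dbt_callback_lookup fills ir_info.cmp with the prefix
// comparison of the found key against orig_key: a nonzero cmp means
// no existing row shares this key prefix, i.e. the value is unique.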
3635 error = tmp_cursor->c_getf_set_range(tmp_cursor, flags, &key, smart_dbt_callback_lookup, &ir_info);
3636 if (error == DB_NOTFOUND) {
3637 *is_unique = true;
3638 error = 0;
3639 goto cleanup;
3640 }
3641 else if (error) {
3642 error = map_to_handler_error(error);
3643 goto cleanup;
3644 }
3645 *is_unique = (ir_info.cmp != 0);
3651 }
3652 error = 0;
3653
3654 cleanup:
3655 if (tmp_cursor) {
3656 int r = tmp_cursor->c_close(tmp_cursor);
3657 assert_always(r==0);
3658 tmp_cursor = NULL;
3659 }
3660 return error;
3661 }
3662
3663 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3664 static void maybe_do_unique_checks_delay_fn(THD *thd) {
3665 if (thd->slave_thread) {
3666 uint64_t delay_ms = tokudb::sysvars::rpl_unique_checks_delay(thd);
3667 if (delay_ms)
3668 usleep(delay_ms * 1000);
3669 }
3670 }
3671
3672 #define maybe_do_unique_checks_delay(__thd) \
3673 (maybe_do_unique_checks_delay_fn(__thd))
3674
3675 #define maybe_do_unique_checks_delay_if_flags_set( \
3676 __thd, __flags_set, __flags_check) \
3677 { if (((__flags_set) & DB_OPFLAGS_MASK) == \
3678 (__flags_check)) maybe_do_unique_checks_delay_fn(__thd); }
3679
3680 static bool need_read_only(THD *thd) {
3681 return opt_readonly || !tokudb::sysvars::rpl_check_readonly(thd);
3682 }
3683
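// Decide whether uniqueness checks should run: on a replica applying a
// row event, they can be skipped when the server is effectively read-only
// and tokudb rpl_unique_checks is disabled; otherwise we honor the
// session's OPTION_RELAXED_UNIQUE_CHECKS setting.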
3684 static bool do_unique_checks_fn(THD *thd, bool do_rpl_event) {
3685 if (do_rpl_event &&
3686 thd->slave_thread &&
3687 need_read_only(thd) &&
3688 !tokudb::sysvars::rpl_unique_checks(thd)) {
3689 return false;
3690 } else {
3691 return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
3692 }
3693 }
3694
3695 #define do_unique_checks(__thd, __flags) \
3696 (do_unique_checks_fn(__thd, __flags))
3697
3698 #else
3699
3700 #define maybe_do_unique_checks_delay(__thd) ((void)0)
3701
3702 #define maybe_do_unique_checks_delay_if_flags_set( \
3703 __thd, __flags_set, __flags_check) \
3704 ((void)0)
3705
3706 static bool do_unique_checks_fn(THD *thd) {
3707 return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
3708 }
3709
3710 #define do_unique_checks(__thd, __flags) \
3711 (do_unique_checks_fn(__thd))
3712
3713 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3714
3715 int ha_tokudb::do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd) {
3716 int error = 0;
3717 //
3718 // first do uniqueness checks
3719 //
3720 if (share->has_unique_keys && do_unique_checks(thd, in_rpl_write_rows)) {
3721 DBUG_EXECUTE_IF("tokudb_crash_if_rpl_does_uniqueness_check",
3722 DBUG_ASSERT(0););
3723 for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3724 bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
3725 bool is_unique = false;
3726 //
3727 // no uniqueness check is needed for a primary key that has no string columns
3728 //
3729 if (keynr == primary_key && !share->pk_has_string) {
3730 continue;
3731 }
3732 if (!is_unique_key) {
3733 continue;
3734 }
3735
3736 maybe_do_unique_checks_delay(thd);
3737
3738 //
3739 // if unique key, check uniqueness constraint
3740 // but we can skip the check if the key contains a null
3741 // or if unique_checks is off
3742 //
3743 error = is_val_unique(&is_unique, record, &table->key_info[keynr], keynr, txn);
3744 if (error) {
3745 goto cleanup;
3746 }
3747 if (!is_unique) {
3748 error = DB_KEYEXIST;
3749 last_dup_key = keynr;
3750 goto cleanup;
3751 }
3752 }
3753 }
3754 cleanup:
3755 return error;
3756 }
3757
3758 void ha_tokudb::test_row_packing(uchar* record, DBT* pk_key, DBT* pk_val) {
3759 int error;
3760 DBT row, key;
3761 //
3762 // variables for testing key packing, only used in some debug modes
3763 //
3764 uchar* tmp_pk_key_data = NULL;
3765 uchar* tmp_pk_val_data = NULL;
3766 DBT tmp_pk_key;
3767 DBT tmp_pk_val;
3768 bool has_null;
3769 int cmp;
3770
3771 memset(&tmp_pk_key, 0, sizeof(DBT));
3772 memset(&tmp_pk_val, 0, sizeof(DBT));
3773
3774 //
3775 // used for testing the packing of keys
3776 //
3777 tmp_pk_key_data = (uchar*)tokudb::memory::malloc(pk_key->size, MYF(MY_WME));
3778 assert_always(tmp_pk_key_data);
3779 tmp_pk_val_data = (uchar*)tokudb::memory::malloc(pk_val->size, MYF(MY_WME));
3780 assert_always(tmp_pk_val_data);
3781 memcpy(tmp_pk_key_data, pk_key->data, pk_key->size);
3782 memcpy(tmp_pk_val_data, pk_val->data, pk_val->size);
3783 tmp_pk_key.data = tmp_pk_key_data;
3784 tmp_pk_key.size = pk_key->size;
3785 tmp_pk_val.data = tmp_pk_val_data;
3786 tmp_pk_val.size = pk_val->size;
3787
3788 for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3789 uint32_t tmp_num_bytes = 0;
3790 uchar* row_desc = NULL;
3791 uint32_t desc_size = 0;
3792
3793 if (keynr == primary_key) {
3794 continue;
3795 }
3796
3797 create_dbt_key_from_table(&key, keynr, key_buff2, record, &has_null);
3798
3799 //
3800 // TEST: verify that a secondary key packed from the row descriptor
3801 // matches the key built directly from the record
3801 //
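// note (sketch of the layout as read here): the descriptor DBT seems to
// hold length-prefixed sections; we skip the first (ordering) section,
// then the next 4 bytes give the key-pack section's size, which
// includes the 4-byte prefix itself.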
3802 row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
3803 row_desc += (*(uint32_t *)row_desc);
3804 desc_size = (*(uint32_t *)row_desc) - 4;
3805 row_desc += 4;
3806 tmp_num_bytes = pack_key_from_desc(
3807 key_buff3,
3808 row_desc,
3809 desc_size,
3810 &tmp_pk_key,
3811 &tmp_pk_val
3812 );
3813 assert_always(tmp_num_bytes == key.size);
3814 cmp = memcmp(key_buff3,key_buff2,tmp_num_bytes);
3815 assert_always(cmp == 0);
3816
3817 //
3818 // test key packing of clustering keys
3819 //
3820 if (key_is_clustering(&table->key_info[keynr])) {
3821 error = pack_row(&row, (const uchar *) record, keynr);
3822 assert_always(error == 0);
3823 uchar* tmp_buff = NULL;
3824 tmp_buff = (uchar*)tokudb::memory::malloc(
3825 alloced_rec_buff_length,
3826 MYF(MY_WME));
3827 assert_always(tmp_buff);
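// skip two length-prefixed sections this time to reach the
// clustering-val descriptor (see the layout note above)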
3828 row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
3829 row_desc += (*(uint32_t *)row_desc);
3830 row_desc += (*(uint32_t *)row_desc);
3831 desc_size = (*(uint32_t *)row_desc) - 4;
3832 row_desc += 4;
3833 tmp_num_bytes = pack_clustering_val_from_desc(
3834 tmp_buff,
3835 row_desc,
3836 desc_size,
3837 &tmp_pk_val
3838 );
3839 assert_always(tmp_num_bytes == row.size);
3840 cmp = memcmp(tmp_buff,rec_buff,tmp_num_bytes);
3841 assert_always(cmp == 0);
3842 tokudb::memory::free(tmp_buff);
3843 }
3844 }
3845
3846 //
3847 // copy stuff back out
3848 //
3849 error = pack_row(pk_val, (const uchar *) record, primary_key);
3850 assert_always(pk_val->size == tmp_pk_val.size);
3851 cmp = memcmp(pk_val->data, tmp_pk_val_data, pk_val->size);
3852 assert_always( cmp == 0);
3853
3854 tokudb::memory::free(tmp_pk_key_data);
3855 tokudb::memory::free(tmp_pk_val_data);
3856 }
3857
3858 // set the put flags for the main dictionary
3859 void ha_tokudb::set_main_dict_put_flags(THD* thd, bool opt_eligible, uint32_t* put_flags) {
3860 uint32_t old_prelock_flags = 0;
3861 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3862 bool in_hot_index = share->num_DBs > curr_num_DBs;
3863 bool using_ignore_flag_opt = do_ignore_flag_optimization(thd, table, share->replace_into_fast && !using_ignore_no_key);
3864 //
3865 // optimization for the "REPLACE INTO..." (and "INSERT IGNORE") commands
3866 // if the command is "REPLACE INTO" and the only dictionary
3867 // is the main one (or all indexes are a subset of the pk),
3868 // then we can simply insert the element
3869 // with DB_YESOVERWRITE. If the element does not exist,
3870 // it will act as a normal insert, and if it does exist, it
3871 // will act as a replace, which is exactly what REPLACE INTO is supposed
3872 // to do. We cannot do this otherwise, because then we would lose
3873 // consistency between indexes
3874 //
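// summary of the branches below: hidden pk, relaxed unique checks, and
// fast REPLACE INTO all leave put_flags as the bare prelock flags
// (plain overwrite); the INSERT IGNORE optimization asks for
// DB_NOOVERWRITE_NO_ERROR; everything else uses DB_NOOVERWRITE so a
// duplicate surfaces as DB_KEYEXIST.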
3875 if (hidden_primary_key)
3876 {
3877 *put_flags = old_prelock_flags;
3878 }
3879 else if (!do_unique_checks(thd, in_rpl_write_rows | in_rpl_update_rows) && !is_replace_into(thd) && !is_insert_ignore(thd))
3880 {
3881 *put_flags = old_prelock_flags;
3882 }
3883 else if (using_ignore_flag_opt && is_replace_into(thd)
3884 && !in_hot_index)
3885 {
3886 *put_flags = old_prelock_flags;
3887 }
3888 else if (opt_eligible && using_ignore_flag_opt && is_insert_ignore(thd)
3889 && !in_hot_index)
3890 {
3891 *put_flags = DB_NOOVERWRITE_NO_ERROR | old_prelock_flags;
3892 }
3893 else
3894 {
3895 *put_flags = DB_NOOVERWRITE | old_prelock_flags;
3896 }
3897 }
3898
3899 int ha_tokudb::insert_row_to_main_dictionary(
3900 DBT* pk_key,
3901 DBT* pk_val,
3902 DB_TXN* txn) {
3903
3904 int error = 0;
3905 uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3906 assert_always(curr_num_DBs == 1);
3907
3908 uint32_t put_flags = mult_put_flags[primary_key];
3909 THD *thd = ha_thd();
3910 set_main_dict_put_flags(thd, true, &put_flags);
3911
3912 // for test, make unique checks have a very long duration
3913 maybe_do_unique_checks_delay_if_flags_set(thd, put_flags, DB_NOOVERWRITE);
3914
3915 error = share->file->put(share->file, txn, pk_key, pk_val, put_flags);
3916 if (error) {
3917 last_dup_key = primary_key;
3918 goto cleanup;
3919 }
3920
3921 cleanup:
3922 return error;
3923 }
3924
3925 int ha_tokudb::insert_rows_to_dictionaries_mult(DBT* pk_key, DBT* pk_val, DB_TXN* txn, THD* thd) {
3926 int error = 0;
3927 uint curr_num_DBs = share->num_DBs;
3928 set_main_dict_put_flags(thd, true, &mult_put_flags[primary_key]);
3929 uint32_t flags = mult_put_flags[primary_key];
3930
3931 // for test, make unique checks have a very long duration
3932 maybe_do_unique_checks_delay_if_flags_set(thd, flags, DB_NOOVERWRITE);
3933
3934 // the insert ignore optimization uses DB_NOOVERWRITE_NO_ERROR,
3935 // which is not allowed with env->put_multiple.
3936 // we have to insert the rows one by one in this case.
3937 if (flags & DB_NOOVERWRITE_NO_ERROR) {
3938 DB * src_db = share->key_file[primary_key];
3939 for (uint32_t i = 0; i < curr_num_DBs; i++) {
3940 DB * db = share->key_file[i];
3941 if (i == primary_key) {
3942 // if it's the primary key, insert the rows
3943 // as they are.
3944 error = db->put(db, txn, pk_key, pk_val, flags);
3945 } else {
3946 // generate a row for secondary keys.
3947 // use our multi put key/rec buffers
3948 // just as the ydb layer would have in
3949 // env->put_multiple(), except that
3950 // we will just do a put() right away.
3951 error = tokudb_generate_row(db, src_db,
3952 &mult_key_dbt_array[i].dbts[0], &mult_rec_dbt_array[i].dbts[0],
3953 pk_key, pk_val);
3954 if (error != 0) {
3955 goto out;
3956 }
3957 error = db->put(db, txn, &mult_key_dbt_array[i].dbts[0],
3958 &mult_rec_dbt_array[i].dbts[0], flags);
3959 }
3960 if (error != 0) {
3961 goto out;
3962 }
3963 }
3964 } else {
3965 // not insert ignore, so we can use put multiple
3966 error = db_env->put_multiple(
3967 db_env,
3968 share->key_file[primary_key],
3969 txn,
3970 pk_key,
3971 pk_val,
3972 curr_num_DBs,
3973 share->key_file,
3974 mult_key_dbt_array,
3975 mult_rec_dbt_array,
3976 mult_put_flags
3977 );
3978 }
3979
3980 out:
3981 //
3982 // on any error, record the primary key as the responsible index;
3983 // the caller maps DB_KEYEXIST to HA_ERR_FOUND_DUPP_KEY
3984 //
3985 if (error) {
3986 last_dup_key = primary_key;
3987 }
3988 return error;
3989 }
3990
3991 //
3992 // Stores a row in the table, called when handling an INSERT query
3993 // Parameters:
3994 // [in] record - a row in MySQL format
3995 // Returns:
3996 // 0 on success
3997 // error otherwise
3998 //
3999 int ha_tokudb::write_row(uchar * record) {
4000 TOKUDB_HANDLER_DBUG_ENTER("%p", record);
4001
4002 DBT row, prim_key;
4003 int error;
4004 THD *thd = ha_thd();
4005 bool has_null;
4006 DB_TXN* sub_trans = NULL;
4007 DB_TXN* txn = NULL;
4008 tokudb_trx_data *trx = NULL;
4009 uint curr_num_DBs;
4010 bool create_sub_trans = false;
4011 bool num_DBs_locked = false;
4012
4013 //
4014 // some crap that needs to be done because MySQL does not properly abstract
4015 // this work away from us, namely filling in auto increment and setting auto timestamp
4016 //
4017 ha_statistic_increment(&SSV::ha_write_count);
4018 #if MYSQL_VERSION_ID < 50600
4019 if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT) {
4020 table->timestamp_field->set_time();
4021 }
4022 #endif
4023 if (table->next_number_field && record == table->record[0]) {
4024 error = update_auto_increment();
4025 if (error)
4026 goto cleanup;
4027 }
4028
4029 //
4030 // check to see if a value bigger than anything seen so far is being
4031 // used for the auto increment column. If so, update the metadata to
4032 // reflect it; the goal is to never hit a dup key error due to a bad
4033 // increment of the auto inc field.
4034 //
4035 if (share->has_auto_inc && record == table->record[0]) {
4036 share->lock();
4037 ulonglong curr_auto_inc = retrieve_auto_increment(
4038 table->field[share->ai_field_index]->key_type(),
4039 field_offset(table->field[share->ai_field_index], table),
4040 record);
4041 if (curr_auto_inc > share->last_auto_increment) {
4042 share->last_auto_increment = curr_auto_inc;
4043 if (delay_updating_ai_metadata) {
4044 ai_metadata_update_required = true;
4045 } else {
4046 update_max_auto_inc(
4047 share->status_block,
4048 share->last_auto_increment);
4049 }
4050 }
4051 share->unlock();
4052 }
4053
4054 //
4055 // grab reader lock on numDBs_lock
4056 //
4057 if (!num_DBs_locked_in_bulk) {
4058 rwlock_t_lock_read(share->_num_DBs_lock);
4059 num_DBs_locked = true;
4060 } else {
4061 lock_count++;
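// in bulk mode the shared lock is held across many rows; drop and
// retake it every 2000 rows, presumably so a pending writer (e.g.
// hot index creation) is not starved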
4062 if (lock_count >= 2000) {
4063 share->_num_DBs_lock.unlock();
4064 rwlock_t_lock_read(share->_num_DBs_lock);
4065 lock_count = 0;
4066 }
4067 }
4068 curr_num_DBs = share->num_DBs;
4069
4070 if (hidden_primary_key) {
4071 get_auto_primary_key(current_ident);
4072 }
4073
4074 if (table_share->blob_fields) {
4075 if (fix_rec_buff_for_blob(max_row_length(record))) {
4076 error = HA_ERR_OUT_OF_MEM;
4077 goto cleanup;
4078 }
4079 }
4080
4081 create_dbt_key_from_table(&prim_key, primary_key, primary_key_buff, record, &has_null);
4082 if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4083 goto cleanup;
4084 }
4085
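// with IGNORE we insert each row under a child transaction so a
// duplicate-key failure can be rolled back without undoing rows
// already written by this statement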
4086 create_sub_trans = (using_ignore && !(do_ignore_flag_optimization(thd,table,share->replace_into_fast && !using_ignore_no_key)));
4087 if (create_sub_trans) {
4088 error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4089 if (error) {
4090 goto cleanup;
4091 }
4092 }
4093 txn = create_sub_trans ? sub_trans : transaction;
4094 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_TXN, "txn %p", txn);
4095 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY))) {
4096 test_row_packing(record,&prim_key,&row);
4097 }
4098 if (loader) {
4099 error = loader->put(loader, &prim_key, &row);
4100 if (error) {
4101 abort_loader = true;
4102 goto cleanup;
4103 }
4104 } else {
4105 error = do_uniqueness_checks(record, txn, thd);
4106 if (error) {
4107 // for #4633
4108 // if we have a duplicate key error, let's check the primary key to see
4109 // if there is a duplicate there. If so, set last_dup_key to the pk
4110 if (error == DB_KEYEXIST && !tokudb_test(hidden_primary_key) && last_dup_key != primary_key) {
4111 int r = share->file->getf_set(share->file, txn, DB_SERIALIZABLE, &prim_key, smart_dbt_do_nothing, NULL);
4112 if (r == 0) {
4113 // if we get no error, that means the row
4114 // was found and this is a duplicate key,
4115 // so we set last_dup_key
4116 last_dup_key = primary_key;
4117 } else if (r != DB_NOTFOUND) {
4118 // if some other error is returned, return that to the user.
4119 error = r;
4120 }
4121 }
4122 goto cleanup;
4123 }
4124 if (curr_num_DBs == 1) {
4125 error = insert_row_to_main_dictionary(&prim_key, &row, txn);
4126 if (error) { goto cleanup; }
4127 } else {
4128 error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
4129 if (error) { goto cleanup; }
4130 }
4131 if (error == 0) {
4132 uint64_t full_row_size = prim_key.size + row.size;
4133 toku_hton_update_primary_key_bytes_inserted(full_row_size);
4134 }
4135 }
4136
4137 trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4138 if (!error) {
4139 added_rows++;
4140 trx->stmt_progress.inserted++;
4141 track_progress(thd);
4142 }
4143 cleanup:
4144 if (num_DBs_locked) {
4145 share->_num_DBs_lock.unlock();
4146 }
4147 if (error == DB_KEYEXIST) {
4148 error = HA_ERR_FOUND_DUPP_KEY;
4149 }
4150 if (sub_trans) {
4151 // no point in recording error value of abort.
4152 // nothing we can do about it anyway and it is not what
4153 // we want to return.
4154 if (error) {
4155 abort_txn(sub_trans);
4156 }
4157 else {
4158 commit_txn(sub_trans, DB_TXN_NOSYNC);
4159 }
4160 }
4161 TOKUDB_HANDLER_DBUG_RETURN(error);
4162 }
4163
4164 /* Compare if a key in a row has changed */
4165 bool ha_tokudb::key_changed(uint keynr, const uchar * old_row, const uchar * new_row) {
4166 DBT old_key;
4167 DBT new_key;
4168 memset((void *) &old_key, 0, sizeof(old_key));
4169 memset((void *) &new_key, 0, sizeof(new_key));
4170
4171 bool has_null;
4172 create_dbt_key_from_table(&new_key, keynr, key_buff2, new_row, &has_null);
4173 create_dbt_key_for_lookup(&old_key,&table->key_info[keynr], key_buff3, old_row, &has_null);
4174 return tokudb_prefix_cmp_dbt_key(share->key_file[keynr], &old_key, &new_key);
4175 }
4176
4177 //
4178 // Updates a row in the table, called when handling an UPDATE query
4179 // Parameters:
4180 // [in] old_row - row to be updated, in MySQL format
4181 // [in] new_row - new row, in MySQL format
4182 // Returns:
4183 // 0 on success
4184 // error otherwise
4185 //
4186 int ha_tokudb::update_row(const uchar * old_row, uchar * new_row) {
4187 TOKUDB_HANDLER_DBUG_ENTER("");
4188 DBT prim_key, old_prim_key, prim_row, old_prim_row;
4189 int error;
4190 bool has_null;
4191 THD* thd = ha_thd();
4192 DB_TXN* sub_trans = NULL;
4193 DB_TXN* txn = NULL;
4194 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4195 uint curr_num_DBs;
4196
4197 LINT_INIT(error);
4198 memset((void *) &prim_key, 0, sizeof(prim_key));
4199 memset((void *) &old_prim_key, 0, sizeof(old_prim_key));
4200 memset((void *) &prim_row, 0, sizeof(prim_row));
4201 memset((void *) &old_prim_row, 0, sizeof(old_prim_row));
4202
4203 ha_statistic_increment(&SSV::ha_update_count);
4204 #if MYSQL_VERSION_ID < 50600
4205 if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) {
4206 table->timestamp_field->set_time();
4207 }
4208 #endif
4209 //
4210 // check to see if a value bigger than anything seen so far is being
4211 // used for the auto increment column. If so, update the metadata to
4212 // reflect it; the goal is to never hit a dup key error due to a bad
4213 // increment of the auto inc field.
4214 //
4215 if (share->has_auto_inc && new_row == table->record[0]) {
4216 share->lock();
4217 ulonglong curr_auto_inc = retrieve_auto_increment(
4218 table->field[share->ai_field_index]->key_type(),
4219 field_offset(table->field[share->ai_field_index], table),
4220 new_row
4221 );
4222 if (curr_auto_inc > share->last_auto_increment) {
4223 error = update_max_auto_inc(share->status_block, curr_auto_inc);
4224 if (!error) {
4225 share->last_auto_increment = curr_auto_inc;
4226 }
4227 }
4228 share->unlock();
4229 }
4230
4231 //
4232 // grab reader lock on numDBs_lock
4233 //
4234 bool num_DBs_locked = false;
4235 if (!num_DBs_locked_in_bulk) {
4236 rwlock_t_lock_read(share->_num_DBs_lock);
4237 num_DBs_locked = true;
4238 }
4239 curr_num_DBs = share->num_DBs;
4240
4241 if (using_ignore) {
4242 error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4243 if (error) {
4244 goto cleanup;
4245 }
4246 }
4247 txn = using_ignore ? sub_trans : transaction;
4248
4249 if (hidden_primary_key) {
4250 memset((void *) &prim_key, 0, sizeof(prim_key));
4251 prim_key.data = (void *) current_ident;
4252 prim_key.size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
4253 old_prim_key = prim_key;
4254 }
4255 else {
4256 create_dbt_key_from_table(&prim_key, primary_key, key_buff, new_row, &has_null);
4257 create_dbt_key_from_table(&old_prim_key, primary_key, primary_key_buff, old_row, &has_null);
4258 }
4259
4260 // do uniqueness checks
4261 if (share->has_unique_keys && do_unique_checks(thd, in_rpl_update_rows)) {
4262 for (uint keynr = 0; keynr < table_share->keys; keynr++) {
4263 bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
4264 if (keynr == primary_key && !share->pk_has_string) {
4265 continue;
4266 }
4267 if (is_unique_key) {
4268 bool key_ch = key_changed(keynr, old_row, new_row);
4269 if (key_ch) {
4270 bool is_unique;
4271 error = is_val_unique(&is_unique, new_row, &table->key_info[keynr], keynr, txn);
4272 if (error) goto cleanup;
4273 if (!is_unique) {
4274 error = DB_KEYEXIST;
4275 last_dup_key = keynr;
4276 goto cleanup;
4277 }
4278 }
4279 }
4280 }
4281 }
4282
4283 if (table_share->blob_fields) {
4284 if (fix_rec_buff_for_blob(max_row_length(new_row))) {
4285 error = HA_ERR_OUT_OF_MEM;
4286 goto cleanup;
4287 }
4288 if (fix_rec_update_buff_for_blob(max_row_length(old_row))) {
4289 error = HA_ERR_OUT_OF_MEM;
4290 goto cleanup;
4291 }
4292 }
4293
4294 error = pack_row(&prim_row, new_row, primary_key);
4295 if (error) { goto cleanup; }
4296
4297 error = pack_old_row_for_update(&old_prim_row, old_row, primary_key);
4298 if (error) { goto cleanup; }
4299
4300 set_main_dict_put_flags(thd, false, &mult_put_flags[primary_key]);
4301
4302 // for test, make unique checks have a very long duration
4303 if ((mult_put_flags[primary_key] & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
4304 maybe_do_unique_checks_delay(thd);
4305
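// update_multiple appears to want two key DBTs per dictionary (one for
// deleting the old row, one for putting the new row) but only one rec
// DBT each, hence the 2*curr_num_DBs / curr_num_DBs sizes below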
4306 error = db_env->update_multiple(
4307 db_env,
4308 share->key_file[primary_key],
4309 txn,
4310 &old_prim_key,
4311 &old_prim_row,
4312 &prim_key,
4313 &prim_row,
4314 curr_num_DBs,
4315 share->key_file,
4316 mult_put_flags,
4317 2*curr_num_DBs,
4318 mult_key_dbt_array,
4319 curr_num_DBs,
4320 mult_rec_dbt_array
4321 );
4322
4323 if (error == DB_KEYEXIST) {
4324 last_dup_key = primary_key;
4325 }
4326 else if (!error) {
4327 updated_rows++;
4328 trx->stmt_progress.updated++;
4329 track_progress(thd);
4330 }
4331
4332
4333 cleanup:
4334 if (num_DBs_locked) {
4335 share->_num_DBs_lock.unlock();
4336 }
4337 if (error == DB_KEYEXIST) {
4338 error = HA_ERR_FOUND_DUPP_KEY;
4339 }
4340 if (sub_trans) {
4341 // no point in recording error value of abort.
4342 // nothing we can do about it anyway and it is not what
4343 // we want to return.
4344 if (error) {
4345 abort_txn(sub_trans);
4346 }
4347 else {
4348 commit_txn(sub_trans, DB_TXN_NOSYNC);
4349 }
4350 }
4351 TOKUDB_HANDLER_DBUG_RETURN(error);
4352 }
4353
4354 //
4355 // Deletes a row in the table, called when handling a DELETE query
4356 // Parameters:
4357 // [in] record - row to be deleted, in MySQL format
4358 // Returns:
4359 // 0 on success
4360 // error otherwise
4361 //
4362 int ha_tokudb::delete_row(const uchar * record) {
4363 TOKUDB_HANDLER_DBUG_ENTER("");
4364 int error = ENOSYS;
4365 DBT row, prim_key;
4366 bool has_null;
4367 THD* thd = ha_thd();
4368 uint curr_num_DBs;
4369 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4370
4371 ha_statistic_increment(&SSV::ha_delete_count);
4372
4373 //
4374 // grab reader lock on numDBs_lock
4375 //
4376 bool num_DBs_locked = false;
4377 if (!num_DBs_locked_in_bulk) {
4378 rwlock_t_lock_read(share->_num_DBs_lock);
4379 num_DBs_locked = true;
4380 }
4381 curr_num_DBs = share->num_DBs;
4382
4383 create_dbt_key_from_table(&prim_key, primary_key, key_buff, record, &has_null);
4384 if (table_share->blob_fields) {
4385 if (fix_rec_buff_for_blob(max_row_length(record))) {
4386 error = HA_ERR_OUT_OF_MEM;
4387 goto cleanup;
4388 }
4389 }
4390 if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4391 goto cleanup;
4392 }
4393
4394 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
4395 TOKUDB_DEBUG_TXN,
4396 "all %p stmt %p sub_sp_level %p transaction %p",
4397 trx->all,
4398 trx->stmt,
4399 trx->sub_sp_level,
4400 transaction);
4401
4402 error =
4403 db_env->del_multiple(
4404 db_env,
4405 share->key_file[primary_key],
4406 transaction,
4407 &prim_key,
4408 &row,
4409 curr_num_DBs,
4410 share->key_file,
4411 mult_key_dbt_array,
4412 mult_del_flags);
4413
4414 if (error) {
4415 DBUG_PRINT("error", ("Got error %d", error));
4416 } else {
4417 deleted_rows++;
4418 trx->stmt_progress.deleted++;
4419 track_progress(thd);
4420 }
4421 cleanup:
4422 if (num_DBs_locked) {
4423 share->_num_DBs_lock.unlock();
4424 }
4425 TOKUDB_HANDLER_DBUG_RETURN(error);
4426 }
4427
4428 //
4429 // takes as input table->read_set and table->write_set
4430 // and puts list of field indexes that need to be read in
4431 // unpack_row in the member variables fixed_cols_for_query
4432 // and var_cols_for_query
4433 //
4434 void ha_tokudb::set_query_columns(uint keynr) {
4435 uint32_t curr_fixed_col_index = 0;
4436 uint32_t curr_var_col_index = 0;
4437 read_key = false;
4438 read_blobs = false;
4439 //
4440 // key_index picks the dictionary whose key filter applies below: fields
// in that filter are packed into that dictionary's key (so read_key is set),
// while any other requested field must be unpacked from the row
4441 //
4442 uint key_index = 0;
4443
4444 if (keynr == primary_key || keynr == MAX_KEY) {
4445 key_index = primary_key;
4446 }
4447 else {
4448 key_index = (key_is_clustering(&table->key_info[keynr]) ? keynr : primary_key);
4449 }
4450 for (uint i = 0; i < table_share->fields; i++) {
4451 if (bitmap_is_set(table->read_set,i) ||
4452 bitmap_is_set(table->write_set,i)
4453 )
4454 {
4455 if (bitmap_is_set(&share->kc_info.key_filters[key_index],i)) {
4456 read_key = true;
4457 }
4458 else {
4459 //
4460 // if fixed field length
4461 //
4462 if (is_fixed_field(&share->kc_info, i)) {
4463 //
4464 // save the offset into the list
4465 //
4466 fixed_cols_for_query[curr_fixed_col_index] = i;
4467 curr_fixed_col_index++;
4468 }
4469 //
4470 // varchar or varbinary
4471 //
4472 else if (is_variable_field(&share->kc_info, i)) {
4473 var_cols_for_query[curr_var_col_index] = i;
4474 curr_var_col_index++;
4475 }
4476 //
4477 // it is a blob
4478 //
4479 else {
4480 read_blobs = true;
4481 }
4482 }
4483 }
4484 }
4485 num_fixed_cols_for_query = curr_fixed_col_index;
4486 num_var_cols_for_query = curr_var_col_index;
4487 }
4488
4489 void ha_tokudb::column_bitmaps_signal() {
4490 //
4491 // if we have max number of indexes, then MAX_KEY == primary_key
4492 //
4493 if (tokudb_active_index != MAX_KEY || tokudb_active_index == primary_key) {
4494 set_query_columns(tokudb_active_index);
4495 }
4496 }
4497
4498 //
4499 // Notification that a scan of the entire secondary table is about
4500 // to take place. Will pre-acquire a table read lock
4501 // Returns:
4502 // 0 on success
4503 // error otherwise
4504 //
4505 int ha_tokudb::prepare_index_scan() {
4506 TOKUDB_HANDLER_DBUG_ENTER("");
4507 int error = 0;
4508 HANDLE_INVALID_CURSOR();
4509 error = prelock_range(NULL, NULL);
4510 if (error) { last_cursor_error = error; goto cleanup; }
4511
4512 range_lock_grabbed = true;
4513 error = 0;
4514 cleanup:
4515 TOKUDB_HANDLER_DBUG_RETURN(error);
4516 }
4517
4518 static bool index_key_is_null(
4519 TABLE* table,
4520 uint keynr,
4521 const uchar* key,
4522 uint key_len) {
4523
4524 bool key_can_be_null = false;
4525 KEY* key_info = &table->key_info[keynr];
4526 KEY_PART_INFO* key_part = key_info->key_part;
4527 KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
4528 for (; key_part != end; key_part++) {
4529 if (key_part->null_bit) {
4530 key_can_be_null = true;
4531 break;
4532 }
4533 }
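// MySQL key-buffer convention: each nullable key part is preceded by a
// null-indicator byte, so a nonzero first byte means the first key
// part is SQL NULL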
4534 return key_can_be_null && key_len > 0 && key[0] != 0;
4535 }
4536
4537 // Return true if bulk fetch can be used
4538 static bool tokudb_do_bulk_fetch(THD *thd) {
4539 switch (thd_sql_command(thd)) {
4540 case SQLCOM_SELECT:
4541 case SQLCOM_CREATE_TABLE:
4542 case SQLCOM_INSERT_SELECT:
4543 case SQLCOM_REPLACE_SELECT:
4544 case SQLCOM_DELETE:
4545 return tokudb::sysvars::bulk_fetch(thd) != 0;
4546 default:
4547 return false;
4548 }
4549 }
4550
4551 //
4552 // Notification that a range query fetching all elements that equal a key
4553 // is about to take place. Will pre-acquire a read lock
4554 // Returns:
4555 // 0 on success
4556 // error otherwise
4557 //
4558 int ha_tokudb::prepare_index_key_scan(const uchar * key, uint key_len) {
4559 TOKUDB_HANDLER_DBUG_ENTER("%p %u", key, key_len);
4560 int error = 0;
4561 DBT start_key, end_key;
4562 THD* thd = ha_thd();
4563 HANDLE_INVALID_CURSOR();
4564 pack_key(&start_key, tokudb_active_index, prelocked_left_range, key, key_len, COL_NEG_INF);
4565 prelocked_left_range_size = start_key.size;
4566 pack_key(&end_key, tokudb_active_index, prelocked_right_range, key, key_len, COL_POS_INF);
4567 prelocked_right_range_size = end_key.size;
4568
4569 error = cursor->c_set_bounds(
4570 cursor,
4571 &start_key,
4572 &end_key,
4573 true,
4574 (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
4575 );
4576
4577 if (error){
4578 goto cleanup;
4579 }
4580
4581 range_lock_grabbed = true;
4582 range_lock_grabbed_null = index_key_is_null(table, tokudb_active_index, key, key_len);
4583 doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
4584 bulk_fetch_iteration = 0;
4585 rows_fetched_using_bulk_fetch = 0;
4586 error = 0;
4587 cleanup:
4588 if (error) {
4589 error = map_to_handler_error(error);
4590 last_cursor_error = error;
4591 //
4592 // cursor should be initialized here, but in case it is not,
4593 // we still check
4594 //
4595 if (cursor) {
4596 int r = cursor->c_close(cursor);
4597 assert_always(r==0);
4598 cursor = NULL;
4599 remove_from_trx_handler_list();
4600 }
4601 }
4602 TOKUDB_HANDLER_DBUG_RETURN(error);
4603 }
4604
4605 void ha_tokudb::invalidate_bulk_fetch() {
4606 bytes_used_in_range_query_buff= 0;
4607 curr_range_query_buff_offset = 0;
4608 icp_went_out_of_range = false;
4609 }
4610
4611 void ha_tokudb::invalidate_icp() {
4612 toku_pushed_idx_cond = NULL;
4613 toku_pushed_idx_cond_keyno = MAX_KEY;
4614 icp_went_out_of_range = false;
4615 }
4616
4617 //
4618 // Initializes local cursor on DB with index keynr
4619 // Parameters:
4620 // keynr - key (index) number
4621 // sorted - 1 if result MUST be sorted according to index
4622 // Returns:
4623 // 0 on success
4624 // error otherwise
4625 //
4626 int ha_tokudb::index_init(uint keynr, bool sorted) {
4627 TOKUDB_HANDLER_DBUG_ENTER("%d %u txn %p", keynr, sorted, transaction);
4628
4629 int error;
4630 THD* thd = ha_thd();
4631 DBUG_PRINT("enter", ("table: '%s' key: %d", table_share->table_name.str, keynr));
4632
4633 /*
4634 Under some very rare conditions (like full joins) we may already have
4635 an active cursor at this point
4636 */
4637 if (cursor) {
4638 DBUG_PRINT("note", ("Closing active cursor"));
4639 int r = cursor->c_close(cursor);
4640 assert_always(r==0);
4641 remove_from_trx_handler_list();
4642 }
4643 active_index = keynr;
4644
4645 if (active_index < MAX_KEY) {
4646 DBUG_ASSERT(keynr <= table->s->keys);
4647 } else {
4648 DBUG_ASSERT(active_index == MAX_KEY);
4649 keynr = primary_key;
4650 }
4651 tokudb_active_index = keynr;
4652
4653 #if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
4654 if (keynr < table->s->keys && table->key_info[keynr].option_struct->clustering)
4655 key_read = false;
4656 #endif // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
4657
4658 last_cursor_error = 0;
4659 range_lock_grabbed = false;
4660 range_lock_grabbed_null = false;
4661 DBUG_ASSERT(share->key_file[keynr]);
4662 cursor_flags = get_cursor_isolation_flags(lock.type, thd);
4663 if (use_write_locks) {
4664 cursor_flags |= DB_RMW;
4665 }
4666 if (tokudb::sysvars::disable_prefetching(thd)) {
4667 cursor_flags |= DBC_DISABLE_PREFETCHING;
4668 }
4669 if (lock.type == TL_READ_WITH_SHARED_LOCKS) {
4670 cursor_flags |= DB_LOCKING_READ;
4671 }
4672 if ((error = share->key_file[keynr]->cursor(share->key_file[keynr],
4673 transaction, &cursor,
4674 cursor_flags))) {
4675 if (error == TOKUDB_MVCC_DICTIONARY_TOO_NEW) {
4676 error = HA_ERR_TABLE_DEF_CHANGED;
4677 my_error(ER_TABLE_DEF_CHANGED, MYF(0));
4678 }
4679 if (error == DB_LOCK_NOTGRANTED) {
4680 error = HA_ERR_LOCK_WAIT_TIMEOUT;
4681 my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
4682 }
4683 table->status = STATUS_NOT_FOUND;
4684 error = map_to_handler_error(error);
4685 last_cursor_error = error;
4686 cursor = NULL; // Safety
4687 goto exit;
4688 }
4689 cursor->c_set_check_interrupt_callback(cursor, tokudb_killed_thd_callback, thd);
4690 memset((void *) &last_key, 0, sizeof(last_key));
4691
4692 add_to_trx_handler_list();
4693
4694 if (thd_sql_command(thd) == SQLCOM_SELECT) {
4695 set_query_columns(keynr);
4696 unpack_entire_row = false;
4697 }
4698 else {
4699 unpack_entire_row = true;
4700 }
4701 invalidate_bulk_fetch();
4702 doing_bulk_fetch = false;
4703 maybe_index_scan = false;
4704 error = 0;
4705 exit:
4706 TOKUDB_HANDLER_DBUG_RETURN(error);
4707 }
4708
4709 //
4710 // closes the local cursor
4711 //
4712 int ha_tokudb::index_end() {
4713 TOKUDB_HANDLER_DBUG_ENTER("");
4714 range_lock_grabbed = false;
4715 range_lock_grabbed_null = false;
4716 if (cursor) {
4717 DBUG_PRINT("enter", ("table: '%s'", table_share->table_name.str));
4718 int r = cursor->c_close(cursor);
4719 assert_always(r==0);
4720 cursor = NULL;
4721 remove_from_trx_handler_list();
4722 last_cursor_error = 0;
4723 }
4724 active_index = tokudb_active_index = MAX_KEY;
4725
4726 //
4727 // reset query variables
4728 //
4729 unpack_entire_row = true;
4730 read_blobs = true;
4731 read_key = true;
4732 num_fixed_cols_for_query = 0;
4733 num_var_cols_for_query = 0;
4734
4735 invalidate_bulk_fetch();
4736 invalidate_icp();
4737 doing_bulk_fetch = false;
4738 close_dsmrr();
4739
4740 TOKUDB_HANDLER_DBUG_RETURN(0);
4741 }
4742
4743 int ha_tokudb::handle_cursor_error(int error, int err_to_return) {
4744 TOKUDB_HANDLER_DBUG_ENTER("");
4745 if (error) {
4746 error = map_to_handler_error(error);
4747 last_cursor_error = error;
4748 table->status = STATUS_NOT_FOUND;
4749 if (error == DB_NOTFOUND) {
4750 error = err_to_return;
4751 }
4752 }
4753 TOKUDB_HANDLER_DBUG_RETURN(error);
4754 }
4755
4756
4757 //
4758 // Helper function for read_row and smart_dbt_callback_xxx functions
4759 // When using a hidden primary key, upon reading a row,
4760 // we set the current_ident field to the primary key we retrieved
4762 //
4763 void ha_tokudb::extract_hidden_primary_key(uint keynr, DBT const *found_key) {
4764 //
4765 // extract hidden primary key to current_ident
4766 //
4767 if (hidden_primary_key) {
4768 if (keynr == primary_key) {
4769 memcpy(current_ident, (char *) found_key->data, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
4770 }
4771 //
4772 // if secondary key, hidden primary key is at end of found_key
4773 //
4774 else {
4775 memcpy(
4776 current_ident,
4777 (char *) found_key->data + found_key->size - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
4778 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
4779 );
4780 }
4781 }
4782 }
4783
4784
4785 int ha_tokudb::read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4786 assert_always(keynr == primary_key);
4787 return unpack_row(buf, row,found_key, keynr);
4788 }
4789
4790 //
4791 // Reads the contents of found_key, a DBT retrieved from the DB associated with keynr, into buf
4792 // This function assumes that we are using a covering index; as a result, if keynr is the primary key,
4793 // we do not need a row to fill buf
4794 // Parameters:
4795 // [out] buf - buffer for the row, in MySQL format
4796 // keynr - index into key_file that represents DB we are currently operating on.
4797 // [in] found_key - key used to retrieve the row
4799 //
4800 void ha_tokudb::read_key_only(uchar * buf, uint keynr, DBT const *found_key) {
4801 TOKUDB_HANDLER_DBUG_ENTER("");
4802 table->status = 0;
4803 //
4804 // only case when we do not unpack the key is if we are dealing with the main dictionary
4805 // of a table with a hidden primary key
4806 //
4807 if (!(hidden_primary_key && keynr == primary_key)) {
4808 unpack_key(buf, found_key, keynr);
4809 }
4810 TOKUDB_HANDLER_DBUG_VOID_RETURN;
4811 }
4812
4813 //
4814 // Helper function used to try to retrieve the entire row
4815 // If keynr is associated with the main table, reads contents of found_key and row into buf, otherwise,
4816 // makes copy of primary key and saves it to last_key. This can later be used to retrieve the entire row
4817 // Parameters:
4818 // [out] buf - buffer for the row, in MySQL format
4819 // keynr - index into key_file that represents DB we are currently operating on.
4820 // [in] row - the row that has been read from the preceding DB call
4821 // [in] found_key - key used to retrieve the row
4822 //
4823 int ha_tokudb::read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4824 TOKUDB_HANDLER_DBUG_ENTER("");
4825 int error = 0;
4826 table->status = 0;
4827 //
4828 // case where we read from secondary table that is not clustered
4829 //
4830 if (keynr != primary_key && !key_is_clustering(&table->key_info[keynr])) {
4831 bool has_null;
4832 //
4833 // create a DBT that has the same data as row, this is inefficient
4834 // extract_hidden_primary_key MUST have been called before this
4835 //
4836 memset((void *) &last_key, 0, sizeof(last_key));
4837 if (!hidden_primary_key) {
4838 unpack_key(buf, found_key, keynr);
4839 }
4840 create_dbt_key_from_table(
4841 &last_key,
4842 primary_key,
4843 key_buff,
4844 buf,
4845 &has_null
4846 );
4847 }
4848 //
4849 // else read from clustered/primary key
4850 //
4851 else {
4852 error = unpack_row(buf, row, found_key, keynr);
4853 if (error) { goto exit; }
4854 }
4855 if (found_key) { DBUG_DUMP("read row key", (uchar *) found_key->data, found_key->size); }
4856 error = 0;
4857 exit:
4858 TOKUDB_HANDLER_DBUG_RETURN(error);
4859 }
4860
4861 //
4862 // This function reads an entire row into buf. This function also assumes that
4863 // the key needed to retrieve the row is stored in the member variable last_key
4864 // Parameters:
4865 // [out] buf - buffer for the row, in MySQL format
4866 // Returns:
4867 // 0 on success, error otherwise
4868 //
4869 int ha_tokudb::read_full_row(uchar * buf) {
4870 TOKUDB_HANDLER_DBUG_ENTER("");
4871 int error = 0;
4872 struct smart_dbt_info info;
4873 info.ha = this;
4874 info.buf = buf;
4875 info.keynr = primary_key;
4876 //
4877 // assumes key is stored in this->last_key
4878 //
4879
4880 error = share->file->getf_set(share->file,
4881 transaction,
4882 cursor_flags,
4883 &last_key,
4884 smart_dbt_callback_rowread_ptquery,
4885 &info);
4886
4887 DBUG_EXECUTE_IF("tokudb_fake_db_notfound_error_in_read_full_row", {
4888 error = DB_NOTFOUND;
4889 });
4890
4891 if (error) {
4892 if (error == DB_LOCK_NOTGRANTED) {
4893 error = HA_ERR_LOCK_WAIT_TIMEOUT;
4894 } else if (error == DB_NOTFOUND) {
4895 error = HA_ERR_CRASHED;
4896 if (tokudb_active_index < share->_keys) {
4897 sql_print_error(
4898 "ha_tokudb::read_full_row on table %s cound not locate "
4899 "record in PK that matches record found in key %s",
4900 share->full_table_name(),
4901 share->_key_descriptors[tokudb_active_index]._name);
4902 } else {
4903 sql_print_error(
4904 "ha_tokudb::read_full_row on table %s cound not locate "
4905 "record in PK that matches record found in key %d",
4906 share->full_table_name(),
4907 tokudb_active_index);
4908 }
4909 }
4910 table->status = STATUS_NOT_FOUND;
4911 }
4912
4913 TOKUDB_HANDLER_DBUG_RETURN(error);
4914 }
4915
4916
4917 //
4918 // Reads the next row matching to the key, on success, advances cursor
4919 // Parameters:
4920 // [out] buf - buffer for the next row, in MySQL format
4921 // [in] key - key value
4922 // keylen - length of key
4923 // Returns:
4924 // 0 on success
4925 // HA_ERR_END_OF_FILE if not found
4926 // error otherwise
4927 //
4928 int ha_tokudb::index_next_same(uchar* buf, const uchar* key, uint keylen) {
4929 TOKUDB_HANDLER_DBUG_ENTER("");
4930 ha_statistic_increment(&SSV::ha_read_next_count);
4931
4932 DBT curr_key;
4933 DBT found_key;
4934 bool has_null;
4935 int cmp;
4936 // create the key that will be used to compare with what is found
4937 // in order to figure out if we should return an error
4938 pack_key(&curr_key, tokudb_active_index, key_buff2, key, keylen, COL_ZERO);
4939 int error = get_next(buf, 1, &curr_key, key_read);
4940 if (error) {
4941 goto cleanup;
4942 }
4943 //
4944 // now do the comparison
4945 //
4946 create_dbt_key_from_table(
4947 &found_key,
4948 tokudb_active_index,
4949 key_buff3,buf,
4950 &has_null);
4951 cmp =
4952 tokudb_prefix_cmp_dbt_key(
4953 share->key_file[tokudb_active_index],
4954 &curr_key,
4955 &found_key);
4956 if (cmp) {
4957 error = HA_ERR_END_OF_FILE;
4958 }
4959
4960 cleanup:
4961 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
4962 TOKUDB_HANDLER_DBUG_RETURN(error);
4963 }
4964
4965
4966 //
4967 // According to InnoDB handlerton: Positions an index cursor to the index
4968 // specified in keynr. Fetches the row if any
4969 // Parameters:
4970 // [out] buf - buffer for the returned row
4971 // [in] key - key value, according to InnoDB, if NULL,
4972 // position cursor at start or end of index,
4973 // not sure if this is done now
4974 // key_len - length of key
4975 // find_flag - according to InnoDB, search flags from my_base.h
4976 // Returns:
4977 // 0 on success
4978 // HA_ERR_KEY_NOT_FOUND if not found (per InnoDB),
4979 // we seem to return HA_ERR_END_OF_FILE if find_flag != HA_READ_KEY_EXACT
4980 // TODO: investigate this for correctness
4981 // error otherwise
4982 //
4983 int ha_tokudb::index_read(
4984 uchar* buf,
4985 const uchar* key,
4986 uint key_len,
4987 enum ha_rkey_function find_flag) {
4988
4989 TOKUDB_HANDLER_DBUG_ENTER(
4990 "key %p %u:%2.2x find=%u",
4991 key,
4992 key_len,
4993 key ? key[0] : 0,
4994 find_flag);
4995 invalidate_bulk_fetch();
4996 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
4997 TOKUDB_DBUG_DUMP("mysql key=", key, key_len);
4998 }
4999 DBT row;
5000 DBT lookup_key;
5001 int error = 0;
5002 uint32_t flags = 0;
5003 THD* thd = ha_thd();
5004 tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
5005 struct smart_dbt_info info;
5006 struct index_read_info ir_info;
5007
5008 HANDLE_INVALID_CURSOR();
5009
5010 // if we locked a non-null key range and we now have a null key, then
5011 // remove the bounds from the cursor
5012 if (range_lock_grabbed &&
5013 !range_lock_grabbed_null &&
5014 index_key_is_null(table, tokudb_active_index, key, key_len)) {
5015 range_lock_grabbed = range_lock_grabbed_null = false;
5016 cursor->c_remove_restriction(cursor);
5017 }
5018
5019 ha_statistic_increment(&SSV::ha_read_key_count);
5020 memset((void *) &row, 0, sizeof(row));
5021
5022 info.ha = this;
5023 info.buf = buf;
5024 info.keynr = tokudb_active_index;
5025
5026 ir_info.smart_dbt_info = info;
5027 ir_info.cmp = 0;
5028
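// SET_PRELOCK_FLAG appears to or in the prelocked flag(s) when a range
// lock has already been grabbed, letting the reads below skip lock
// acquisition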
5029 flags = SET_PRELOCK_FLAG(0);
5030 switch (find_flag) {
5031 case HA_READ_KEY_EXACT: /* Find first record else error */ {
5032 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5033 DBT lookup_bound;
5034 pack_key(&lookup_bound, tokudb_active_index, key_buff4, key, key_len, COL_POS_INF);
5035 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
5036 TOKUDB_DBUG_DUMP("tokudb key=", lookup_key.data, lookup_key.size);
5037 }
5038 ir_info.orig_key = &lookup_key;
5039 error = cursor->c_getf_set_range_with_bound(cursor, flags, &lookup_key, &lookup_bound, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5040 if (ir_info.cmp) {
5041 error = DB_NOTFOUND;
5042 }
5043 break;
5044 }
5045 case HA_READ_AFTER_KEY: /* Find next rec. after key-record */
5046 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5047 error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5048 break;
5049 case HA_READ_BEFORE_KEY: /* Find next rec. before key-record */
5050 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5051 error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5052 break;
5053 case HA_READ_KEY_OR_NEXT: /* Record or next record */
5054 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5055 error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5056 break;
5057 //
5058 // This case does not seem to ever be used, it is ok for it to be slow
5059 //
5060 case HA_READ_KEY_OR_PREV: /* Record or previous */
5061 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5062 ir_info.orig_key = &lookup_key;
5063 error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5064 if (error == DB_NOTFOUND) {
5065 error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5066 }
5067 else if (ir_info.cmp) {
5068 error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5069 }
5070 break;
5071 case HA_READ_PREFIX_LAST_OR_PREV: /* Last or prev key with the same prefix */
5072 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5073 error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5074 break;
5075 case HA_READ_PREFIX_LAST:
5076 pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5077 ir_info.orig_key = &lookup_key;
5078 error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5079 if (ir_info.cmp) {
5080 error = DB_NOTFOUND;
5081 }
5082 break;
5083 default:
5084 TOKUDB_HANDLER_TRACE("unsupported:%d", find_flag);
5085 error = HA_ERR_UNSUPPORTED;
5086 break;
5087 }
5088 error = handle_cursor_error(error, HA_ERR_KEY_NOT_FOUND);
5089 if (!error && !key_read && tokudb_active_index != primary_key && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5090 error = read_full_row(buf);
5091 }
5092
5093 if (TOKUDB_UNLIKELY(error && TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ERROR))) {
5094 TOKUDB_HANDLER_TRACE("error:%d:%d", error, find_flag);
5095 }
5096 trx->stmt_progress.queried++;
5097 track_progress(thd);
5098
5099 cleanup:
5100 TOKUDB_HANDLER_DBUG_RETURN(error);
5101 }
5102
5103
5104 int ha_tokudb::read_data_from_range_query_buff(uchar* buf, bool need_val, bool do_key_read) {
5105 // buffer has the next row, get it from there
5106 int error;
5107 uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
5108 DBT curr_key;
5109 memset((void *) &curr_key, 0, sizeof(curr_key));
5110
5111 // get key info
5112 uint32_t key_size = *(uint32_t *)curr_pos;
5113 curr_pos += sizeof(key_size);
5114 uchar* curr_key_buff = curr_pos;
5115 curr_pos += key_size;
5116
5117 curr_key.data = curr_key_buff;
5118 curr_key.size = key_size;
5119
5120 // if this is a covering index, this is all we need
5121 if (do_key_read) {
5122 assert_always(!need_val);
5123 extract_hidden_primary_key(tokudb_active_index, &curr_key);
5124 read_key_only(buf, tokudb_active_index, &curr_key);
5125 error = 0;
5126 }
5127 // we need to get more data
5128 else {
5129 DBT curr_val;
5130 memset((void *) &curr_val, 0, sizeof(curr_val));
5131 uchar* curr_val_buff = NULL;
5132 uint32_t val_size = 0;
5133 // in this case, we don't have a val, we are simply extracting the pk
5134 if (!need_val) {
5135 curr_val.data = curr_val_buff;
5136 curr_val.size = val_size;
5137 extract_hidden_primary_key(tokudb_active_index, &curr_key);
5138 error = read_primary_key( buf, tokudb_active_index, &curr_val, &curr_key);
5139 }
5140 else {
5141 extract_hidden_primary_key(tokudb_active_index, &curr_key);
5142 // need to extract a val and place it into buf
5143 if (unpack_entire_row) {
5144 // get val info
5145 val_size = *(uint32_t *)curr_pos;
5146 curr_pos += sizeof(val_size);
5147 curr_val_buff = curr_pos;
5148 curr_pos += val_size;
5149 curr_val.data = curr_val_buff;
5150 curr_val.size = val_size;
5151 error = unpack_row(buf,&curr_val, &curr_key, tokudb_active_index);
5152 }
5153 else {
5154 if (!(hidden_primary_key && tokudb_active_index == primary_key)) {
5155 unpack_key(buf,&curr_key,tokudb_active_index);
5156 }
5157 // read rows we care about
5158
5159 // first the null bytes
5160 memcpy(buf, curr_pos, table_share->null_bytes);
5161 curr_pos += table_share->null_bytes;
5162
5163 // now the fixed sized rows
5164 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5165 uint field_index = fixed_cols_for_query[i];
5166 Field* field = table->field[field_index];
5167 unpack_fixed_field(
5168 buf + field_offset(field, table),
5169 curr_pos,
5170 share->kc_info.field_lengths[field_index]
5171 );
5172 curr_pos += share->kc_info.field_lengths[field_index];
5173 }
5174 // now the variable sized rows
5175 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5176 uint field_index = var_cols_for_query[i];
5177 Field* field = table->field[field_index];
5178 uint32_t field_len = *(uint32_t *)curr_pos;
5179 curr_pos += sizeof(field_len);
5180 unpack_var_field(
5181 buf + field_offset(field, table),
5182 curr_pos,
5183 field_len,
5184 share->kc_info.length_bytes[field_index]
5185 );
5186 curr_pos += field_len;
5187 }
5188 // now the blobs
5189 if (read_blobs) {
5190 uint32_t blob_size = *(uint32_t *)curr_pos;
5191 curr_pos += sizeof(blob_size);
5192 error = unpack_blobs(
5193 buf,
5194 curr_pos,
5195 blob_size,
5196 true
5197 );
5198 curr_pos += blob_size;
5199 if (error) {
5200 invalidate_bulk_fetch();
5201 goto exit;
5202 }
5203 }
5204 error = 0;
5205 }
5206 }
5207 }
5208
5209 curr_range_query_buff_offset = curr_pos - range_query_buff;
5210 exit:
5211 return error;
5212 }
5213
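// bulk-fetch callback: forwards each (key, row) pair to
// fill_range_query_buf; returning TOKUDB_CURSOR_CONTINUE from there
// tells the cursor to keep streaming rows into this callback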
5214 static int smart_dbt_bf_callback(
5215 DBT const* key,
5216 DBT const* row,
5217 void* context) {
5218 SMART_DBT_BF_INFO info = (SMART_DBT_BF_INFO)context;
5219 return
5220 info->ha->fill_range_query_buf(
5221 info->need_val,
5222 key,
5223 row,
5224 info->direction,
5225 info->thd,
5226 info->buf,
5227 info->key_to_compare);
5228 }
5229
5230 enum icp_result ha_tokudb::toku_handler_index_cond_check(
5231 Item* pushed_idx_cond) {
5232
5233 enum icp_result res;
5234 if (end_range) {
5235 int cmp;
5236 #ifdef MARIADB_BASE_VERSION
5237 cmp = compare_key2(end_range);
5238 #else
5239 cmp = compare_key_icp(end_range);
5240 #endif
5241 if (cmp > 0) {
5242 return ICP_OUT_OF_RANGE;
5243 }
5244 }
5245 res = pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
5246 return res;
5247 }
5248
5249 // fill in the range query buf for bulk fetch
5250 int ha_tokudb::fill_range_query_buf(
5251 bool need_val,
5252 DBT const* key,
5253 DBT const* row,
5254 int direction,
5255 THD* thd,
5256 uchar* buf,
5257 DBT* key_to_compare) {
5258
5259 int error;
5260 //
5261 // first put the value into range_query_buf
5262 //
5263 uint32_t size_remaining =
5264 size_range_query_buff - bytes_used_in_range_query_buff;
5265 uint32_t size_needed;
5266 uint32_t user_defined_size = tokudb::sysvars::read_buf_size(thd);
5267 uchar* curr_pos = NULL;
5268
5269 if (key_to_compare) {
5270 int cmp = tokudb_prefix_cmp_dbt_key(
5271 share->key_file[tokudb_active_index],
5272 key_to_compare,
5273 key);
5274 if (cmp) {
5275 icp_went_out_of_range = true;
5276 error = 0;
5277 goto cleanup;
5278 }
5279 }
5280
5281 // if we have an index condition pushed down, we check it
5282 if (toku_pushed_idx_cond &&
5283 (tokudb_active_index == toku_pushed_idx_cond_keyno)) {
5284 unpack_key(buf, key, tokudb_active_index);
5285 enum icp_result result =
5286 toku_handler_index_cond_check(toku_pushed_idx_cond);
5287
5288 // If we have reason to stop, we set icp_went_out_of_range and get out
5289 // otherwise, if we simply see that the current key is no match,
5290 // we tell the cursor to continue and don't store
5291 // the key locally
5292 if (result == ICP_OUT_OF_RANGE || thd_killed(thd)) {
5293 icp_went_out_of_range = true;
5294 error = 0;
5295 DEBUG_SYNC(ha_thd(), "tokudb_icp_asc_scan_out_of_range");
5296 goto cleanup;
5297 } else if (result == ICP_NO_MATCH) {
5298 // An optimizer change made for MyRocks also benefits TokuDB:
5299 // opt_range.cc QUICK_SELECT::get_next now sets end_range during
5300 // descending scans. We should never hit this condition, but the code
5301 // stays in to keep a descending scan from running all the way to
5302 // the beginning of the index, and to catch that case in debug
5303 // builds with an assertion
5304 assert_debug(!(!end_range && direction < 0));
5305 if (!end_range &&
5306 direction < 0) {
5307 cancel_pushed_idx_cond();
5308 }
5309 error = TOKUDB_CURSOR_CONTINUE;
5310 goto cleanup;
5311 }
5312 }
5313
5314 // at this point, if ICP is on, we have verified that the key is one
5315 // we are interested in, so we proceed with placing the data
5316 // into the range query buffer
5317
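// each buffer entry is laid out as (mirroring the reads in
// read_data_from_range_query_buff):
//   [uint32 key_size][key bytes]                   -- always
//   [uint32 row_size][row bytes]                   -- if unpack_entire_row
//   [null bytes][fixed cols][uint32 len + var col]...[uint32 blob_size][blobs]
//                                                  -- otherwise, if need_val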
5318 if (need_val) {
5319 if (unpack_entire_row) {
5320 size_needed = 2*sizeof(uint32_t) + key->size + row->size;
5321 } else {
5322 // this is an upper bound
5323 size_needed =
5324 // size of key length
5325 sizeof(uint32_t) +
5326 // key and row
5327 key->size + row->size +
5328 // lengths of varchars stored
5329 num_var_cols_for_query * (sizeof(uint32_t)) +
5330 // length of blobs
5331 sizeof(uint32_t);
5332 }
5333 } else {
5334 size_needed = sizeof(uint32_t) + key->size;
5335 }
5336 if (size_remaining < size_needed) {
5337 range_query_buff =
5338 static_cast<uchar*>(tokudb::memory::realloc(
5339 static_cast<void*>(range_query_buff),
5340 bytes_used_in_range_query_buff + size_needed,
5341 MYF(MY_WME)));
5342 if (range_query_buff == NULL) {
5343 error = ENOMEM;
5344 invalidate_bulk_fetch();
5345 goto cleanup;
5346 }
5347 size_range_query_buff = bytes_used_in_range_query_buff + size_needed;
5348 }
5349 //
5350 // now we know we have the size, let's fill the buffer, starting with the key
5351 //
5352 curr_pos = range_query_buff + bytes_used_in_range_query_buff;
5353
5354 *reinterpret_cast<uint32_t*>(curr_pos) = key->size;
5355 curr_pos += sizeof(uint32_t);
5356 memcpy(curr_pos, key->data, key->size);
5357 curr_pos += key->size;
5358 if (need_val) {
5359 if (unpack_entire_row) {
5360 *reinterpret_cast<uint32_t*>(curr_pos) = row->size;
5361 curr_pos += sizeof(uint32_t);
5362 memcpy(curr_pos, row->data, row->size);
5363 curr_pos += row->size;
5364 } else {
5365 // need to unpack just the data we care about
5366 const uchar* fixed_field_ptr = static_cast<const uchar*>(row->data);
5367 fixed_field_ptr += table_share->null_bytes;
5368
5369 const uchar* var_field_offset_ptr = NULL;
5370 const uchar* var_field_data_ptr = NULL;
5371
5372 var_field_offset_ptr =
5373 fixed_field_ptr +
5374 share->kc_info.mcp_info[tokudb_active_index].fixed_field_size;
5375 var_field_data_ptr =
5376 var_field_offset_ptr +
5377 share->kc_info.mcp_info[tokudb_active_index].len_of_offsets;
5378
5379 // first the null bytes
5380 memcpy(curr_pos, row->data, table_share->null_bytes);
5381 curr_pos += table_share->null_bytes;
5382 // now the fixed fields
5386 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5387 uint field_index = fixed_cols_for_query[i];
5388 memcpy(
5389 curr_pos,
5390 fixed_field_ptr + share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val,
5391 share->kc_info.field_lengths[field_index]);
5392 curr_pos += share->kc_info.field_lengths[field_index];
5393 }
5394
5395 //
5396 // now the var fields
5397 //
5398 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5399 uint field_index = var_cols_for_query[i];
5400 uint32_t var_field_index =
5401 share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val;
5402 uint32_t data_start_offset;
5403 uint32_t field_len;
5404
5405 get_var_field_info(
5406 &field_len,
5407 &data_start_offset,
5408 var_field_index,
5409 var_field_offset_ptr,
5410 share->kc_info.num_offset_bytes);
5411 memcpy(curr_pos, &field_len, sizeof(field_len));
5412 curr_pos += sizeof(field_len);
5413 memcpy(
5414 curr_pos,
5415 var_field_data_ptr + data_start_offset,
5416 field_len);
5417 curr_pos += field_len;
5418 }
5419
5420 if (read_blobs) {
5421 uint32_t blob_offset = 0;
5422 uint32_t data_size = 0;
5423 //
5424 // now the blobs
5425 //
5426 get_blob_field_info(
5427 &blob_offset,
5428 share->kc_info.mcp_info[tokudb_active_index].len_of_offsets,
5429 var_field_data_ptr,
5430 share->kc_info.num_offset_bytes);
5431 data_size =
5432 row->size -
5433 blob_offset -
5434 static_cast<uint32_t>((var_field_data_ptr -
5435 static_cast<const uchar*>(row->data)));
5436 memcpy(curr_pos, &data_size, sizeof(data_size));
5437 curr_pos += sizeof(data_size);
5438 memcpy(curr_pos, var_field_data_ptr + blob_offset, data_size);
5439 curr_pos += data_size;
5440 }
5441 }
5442 }
5443
5444 bytes_used_in_range_query_buff = curr_pos - range_query_buff;
5445 assert_always(bytes_used_in_range_query_buff <= size_range_query_buff);
5446
5447 //
5448 // now determine if we should continue with the bulk fetch
5449 // we want to stop under these conditions:
5450 // - we overran the prelocked range
5451 // - we are close to the end of the buffer
    //  - we have fetched an exponential number of rows with
    //  respect to the bulk fetch iteration, which is initialized
    //  to 0 in index_init() and prelock_range().
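    //
    // A worked example of the doubling (bulk_fetch_iteration starts
    // at 0, as noted above): the bound below is
    // 1 << bulk_fetch_iteration, so iteration 0 stops after 1 row,
    // iteration 1 after 2 rows, iteration 2 after 4, and so on;
    // get_next() increments bulk_fetch_iteration after each buffer
    // fill, up to HA_TOKU_BULK_FETCH_ITERATION_MAX.
    //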
5455
5456 rows_fetched_using_bulk_fetch++;
    // if the iteration is less than the number of possible shifts on
    // a 64-bit integer, check that we haven't exceeded this iteration's
    // row fetch upper bound.
5460 if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5461 uint64_t row_fetch_upper_bound = 1LLU << bulk_fetch_iteration;
5462 assert_always(row_fetch_upper_bound > 0);
5463 if (rows_fetched_using_bulk_fetch >= row_fetch_upper_bound) {
5464 error = 0;
5465 goto cleanup;
5466 }
5467 }
5468
5469 if (bytes_used_in_range_query_buff +
5470 table_share->rec_buff_length >
5471 user_defined_size) {
5472 error = 0;
5473 goto cleanup;
5474 }
5475 if (direction > 0) {
5476 // compare what we got to the right endpoint of prelocked range
5477 // because we are searching keys in ascending order
5478 if (prelocked_right_range_size == 0) {
5479 error = TOKUDB_CURSOR_CONTINUE;
5480 goto cleanup;
5481 }
5482 DBT right_range;
5483 memset(&right_range, 0, sizeof(right_range));
5484 right_range.size = prelocked_right_range_size;
5485 right_range.data = prelocked_right_range;
5486 int cmp = tokudb_cmp_dbt_key(
5487 share->key_file[tokudb_active_index],
5488 key,
5489 &right_range);
5490 error = (cmp > 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5491 } else {
5492 // compare what we got to the left endpoint of prelocked range
5493 // because we are searching keys in descending order
5494 if (prelocked_left_range_size == 0) {
5495 error = TOKUDB_CURSOR_CONTINUE;
5496 goto cleanup;
5497 }
5498 DBT left_range;
5499 memset(&left_range, 0, sizeof(left_range));
5500 left_range.size = prelocked_left_range_size;
5501 left_range.data = prelocked_left_range;
5502 int cmp = tokudb_cmp_dbt_key(
5503 share->key_file[tokudb_active_index],
5504 key,
5505 &left_range);
5506 error = (cmp < 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5507 }
5508 cleanup:
5509 return error;
5510 }
5511
5512 int ha_tokudb::get_next(
5513 uchar* buf,
5514 int direction,
5515 DBT* key_to_compare,
5516 bool do_key_read) {
5517
5518 int error = 0;
5519 HANDLE_INVALID_CURSOR();
5520
5521 if (maybe_index_scan) {
5522 maybe_index_scan = false;
5523 if (!range_lock_grabbed) {
5524 error = prepare_index_scan();
5525 }
5526 }
5527
5528 if (!error) {
5529 uint32_t flags = SET_PRELOCK_FLAG(0);
5530
        // we need to read the val of what we retrieve if
        // we do NOT have a covering index (do_key_read == 0) AND we are on
        // the primary key or a clustering secondary key
5534 bool need_val =
5535 (do_key_read == 0) &&
5536 (tokudb_active_index == primary_key ||
5537 key_is_clustering(&table->key_info[tokudb_active_index]));
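        // For example (a sketch of the cases above): a query through a
        // non-clustering secondary index with do_key_read == 0 leaves
        // need_val false; only the key is buffered, and the full row is
        // fetched afterwards via the read_full_row() call at the bottom
        // of this function.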
5538
5539 if ((bytes_used_in_range_query_buff -
5540 curr_range_query_buff_offset) > 0) {
5541 error = read_data_from_range_query_buff(buf, need_val, do_key_read);
5542 } else if (icp_went_out_of_range) {
5543 icp_went_out_of_range = false;
5544 error = HA_ERR_END_OF_FILE;
5545 } else {
5546 invalidate_bulk_fetch();
5547 if (doing_bulk_fetch) {
5548 struct smart_dbt_bf_info bf_info;
5549 bf_info.ha = this;
            // need_val (computed above) tells the callback whether the
            // row's value must be buffered along with the key
5551 bf_info.direction = direction;
5552 bf_info.thd = ha_thd();
5553 bf_info.need_val = need_val;
5554 bf_info.buf = buf;
5555 bf_info.key_to_compare = key_to_compare;
            //
            // call c_getf_next/c_getf_prev with the purpose of filling
            // in range_query_buff
            //
5559 rows_fetched_using_bulk_fetch = 0;
5560 // it is expected that we can do ICP in the smart_dbt_bf_callback
5561 // as a result, it's possible we don't return any data because
5562 // none of the rows matched the index condition. Therefore, we need
            // this while loop. icp_went_out_of_range will be set if we hit a row that
5564 // the index condition states is out of our range. When that hits,
5565 // we know all the data in the buffer is the last data we will retrieve
5566 while (bytes_used_in_range_query_buff == 0 &&
5567 !icp_went_out_of_range && error == 0) {
5568 if (direction > 0) {
5569 error =
5570 cursor->c_getf_next(
5571 cursor,
5572 flags,
5573 smart_dbt_bf_callback,
5574 &bf_info);
5575 } else {
5576 error =
5577 cursor->c_getf_prev(
5578 cursor,
5579 flags,
5580 smart_dbt_bf_callback,
5581 &bf_info);
5582 }
5583 }
5584 // if there is no data set and we went out of range,
5585 // then there is nothing to return
5586 if (bytes_used_in_range_query_buff == 0 &&
5587 icp_went_out_of_range) {
5588 icp_went_out_of_range = false;
5589 error = HA_ERR_END_OF_FILE;
5590 }
5591 if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5592 bulk_fetch_iteration++;
5593 }
5594
5595 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5596 if (error) {
5597 goto cleanup;
5598 }
5599
5600 //
5601 // now that range_query_buff is filled, read an element
5602 //
5603 error =
5604 read_data_from_range_query_buff(buf, need_val, do_key_read);
5605 } else {
5606 struct smart_dbt_info info;
5607 info.ha = this;
5608 info.buf = buf;
5609 info.keynr = tokudb_active_index;
5610
5611 if (direction > 0) {
5612 error =
5613 cursor->c_getf_next(
5614 cursor,
5615 flags,
5616 SMART_DBT_CALLBACK(do_key_read),
5617 &info);
5618 } else {
5619 error =
5620 cursor->c_getf_prev(
5621 cursor,
5622 flags,
5623 SMART_DBT_CALLBACK(do_key_read),
5624 &info);
5625 }
5626 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5627 }
5628 }
5629 }
5630
5631 //
    // at this point, one of two things has happened:
    // either we have unpacked the data into buf, and we
5634 // are done, or we have unpacked the primary key
5635 // into last_key, and we use the code below to
5636 // read the full row by doing a point query into the
5637 // main table.
5638 //
5639 if (!error &&
5640 !do_key_read &&
5641 (tokudb_active_index != primary_key) &&
5642 !key_is_clustering(&table->key_info[tokudb_active_index])) {
5643 error = read_full_row(buf);
5644 }
5645
5646 if (!error) {
5647 THD *thd = ha_thd();
5648 tokudb_trx_data* trx =
5649 static_cast<tokudb_trx_data*>(thd_get_ha_data(thd, tokudb_hton));
5650 trx->stmt_progress.queried++;
5651 track_progress(thd);
5652 if (thd_killed(thd))
5653 error = ER_ABORTING_CONNECTION;
5654 }
5655 cleanup:
5656 return error;
5657 }
5658
5659
5660 //
5661 // Reads the next row from the active index (cursor) into buf, and advances cursor
5662 // Parameters:
5663 // [out] buf - buffer for the next row, in MySQL format
5664 // Returns:
5665 // 0 on success
5666 // HA_ERR_END_OF_FILE if not found
5667 // error otherwise
5668 //
5669 int ha_tokudb::index_next(uchar * buf) {
5670 TOKUDB_HANDLER_DBUG_ENTER("");
5671 ha_statistic_increment(&SSV::ha_read_next_count);
5672 int error = get_next(buf, 1, NULL, key_read);
5673 TOKUDB_HANDLER_DBUG_RETURN(error);
5674 }
5675
5676
5677 int ha_tokudb::index_read_last(uchar * buf, const uchar * key, uint key_len) {
5678 return(index_read(buf, key, key_len, HA_READ_PREFIX_LAST));
5679 }
5680
5681
5682 //
// Reads the previous row from the active index (cursor) into buf, and moves the cursor back
// Parameters:
//      [out]   buf - buffer for the previous row, in MySQL format
5686 // Returns:
5687 // 0 on success
5688 // HA_ERR_END_OF_FILE if not found
5689 // error otherwise
5690 //
5691 int ha_tokudb::index_prev(uchar * buf) {
5692 TOKUDB_HANDLER_DBUG_ENTER("");
5693 ha_statistic_increment(&SSV::ha_read_prev_count);
5694 int error = get_next(buf, -1, NULL, key_read);
5695 TOKUDB_HANDLER_DBUG_RETURN(error);
5696 }
5697
5698 //
5699 // Reads the first row from the active index (cursor) into buf, and advances cursor
5700 // Parameters:
//      [out]   buf - buffer for the first row, in MySQL format
5702 // Returns:
5703 // 0 on success
5704 // HA_ERR_END_OF_FILE if not found
5705 // error otherwise
5706 //
5707 int ha_tokudb::index_first(uchar * buf) {
5708 TOKUDB_HANDLER_DBUG_ENTER("");
5709 invalidate_bulk_fetch();
5710 int error = 0;
5711 struct smart_dbt_info info;
5712 uint32_t flags = SET_PRELOCK_FLAG(0);
5713 THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5715 HANDLE_INVALID_CURSOR();
5716
5717 ha_statistic_increment(&SSV::ha_read_first_count);
5718
5719 info.ha = this;
5720 info.buf = buf;
5721 info.keynr = tokudb_active_index;
5722
5723 error = cursor->c_getf_first(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5724 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5725
5726 //
5727 // still need to get entire contents of the row if operation done on
5728 // secondary DB and it was NOT a covering index
5729 //
5730 if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5731 error = read_full_row(buf);
5732 }
5733 if (trx) {
5734 trx->stmt_progress.queried++;
5735 }
5736 track_progress(thd);
5737 maybe_index_scan = true;
5738 cleanup:
5739 TOKUDB_HANDLER_DBUG_RETURN(error);
5740 }
5741
5742 //
5743 // Reads the last row from the active index (cursor) into buf, and advances cursor
5744 // Parameters:
//      [out]   buf - buffer for the last row, in MySQL format
5746 // Returns:
5747 // 0 on success
5748 // HA_ERR_END_OF_FILE if not found
5749 // error otherwise
5750 //
5751 int ha_tokudb::index_last(uchar * buf) {
5752 TOKUDB_HANDLER_DBUG_ENTER("");
5753 invalidate_bulk_fetch();
5754 int error = 0;
5755 struct smart_dbt_info info;
5756 uint32_t flags = SET_PRELOCK_FLAG(0);
5757 THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5759 HANDLE_INVALID_CURSOR();
5760
5761 ha_statistic_increment(&SSV::ha_read_last_count);
5762
5763 info.ha = this;
5764 info.buf = buf;
5765 info.keynr = tokudb_active_index;
5766
5767 error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5768 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5769 //
5770 // still need to get entire contents of the row if operation done on
5771 // secondary DB and it was NOT a covering index
5772 //
5773 if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5774 error = read_full_row(buf);
5775 }
5776
5777 if (trx) {
5778 trx->stmt_progress.queried++;
5779 }
5780 track_progress(thd);
5781 maybe_index_scan = true;
5782 cleanup:
5783 TOKUDB_HANDLER_DBUG_RETURN(error);
5784 }
5785
5786 //
// Initialize a scan of the table (index_init is called with MAX_KEY, which
// selects the primary/main dictionary)
5788 // Parameters:
5789 // scan - unused
5790 // Returns:
5791 // 0 on success
5792 // error otherwise
5793 //
5794 int ha_tokudb::rnd_init(bool scan) {
5795 TOKUDB_HANDLER_DBUG_ENTER("");
5796 int error = 0;
5797 range_lock_grabbed = false;
5798 error = index_init(MAX_KEY, 0);
5799 if (error) { goto cleanup;}
5800
5801 if (scan) {
5802 error = prelock_range(NULL, NULL);
5803 if (error) { goto cleanup; }
5804
5805 // only want to set range_lock_grabbed to true after index_init
5806 // successfully executed for two reasons:
5807 // 1) index_init will reset it to false anyway
        // 2) if it fails, we don't want prelocking on
5809 range_lock_grabbed = true;
5810 }
5811
5812 error = 0;
5813 cleanup:
5814 if (error) {
5815 index_end();
5816 last_cursor_error = error;
5817 }
5818 TOKUDB_HANDLER_DBUG_RETURN(error);
5819 }
5820
5821 //
5822 // End a scan of the table
5823 //
5824 int ha_tokudb::rnd_end() {
5825 TOKUDB_HANDLER_DBUG_ENTER("");
5826 range_lock_grabbed = false;
5827 TOKUDB_HANDLER_DBUG_RETURN(index_end());
5828 }
5829
5830
5831 //
5832 // Read the next row in a table scan
5833 // Parameters:
5834 // [out] buf - buffer for the next row, in MySQL format
5835 // Returns:
5836 // 0 on success
5837 // HA_ERR_END_OF_FILE if not found
5838 // error otherwise
5839 //
5840 int ha_tokudb::rnd_next(uchar * buf) {
5841 TOKUDB_HANDLER_DBUG_ENTER("");
5842 ha_statistic_increment(&SSV::ha_read_rnd_next_count);
5843 int error = get_next(buf, 1, NULL, false);
5844 TOKUDB_HANDLER_DBUG_RETURN(error);
5845 }
5846
5847
5848 void ha_tokudb::track_progress(THD* thd) {
5849 tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5850 if (trx) {
5851 ulonglong num_written = trx->stmt_progress.inserted +
5852 trx->stmt_progress.updated +
5853 trx->stmt_progress.deleted;
5854 bool update_status =
5855 (trx->stmt_progress.queried &&
5856 tokudb::sysvars::read_status_frequency &&
5857 (trx->stmt_progress.queried %
5858 tokudb::sysvars::read_status_frequency) == 0) ||
5859 (num_written && tokudb::sysvars::write_status_frequency &&
5860 (num_written % tokudb::sysvars::write_status_frequency) == 0);
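        // A fully assembled message (built below) looks like, e.g.:
        //   "Queried about 10000 rows, Updated about 2500 rows"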
5861 if (update_status) {
5862 char *next_status = write_status_msg;
5863 bool first = true;
5864 int r;
5865 if (trx->stmt_progress.queried) {
5866 r = sprintf(
5867 next_status,
5868 "Queried about %llu row%s",
5869 trx->stmt_progress.queried,
5870 trx->stmt_progress.queried == 1 ? "" : "s");
5871 assert_always(r >= 0);
5872 next_status += r;
5873 first = false;
5874 }
5875 if (trx->stmt_progress.inserted) {
5876 if (trx->stmt_progress.using_loader) {
5877 r = sprintf(
5878 next_status,
5879 "%sFetched about %llu row%s, loading data still remains",
5880 first ? "" : ", ",
5881 trx->stmt_progress.inserted,
5882 trx->stmt_progress.inserted == 1 ? "" : "s");
5883 } else {
5884 r = sprintf(
5885 next_status,
5886 "%sInserted about %llu row%s",
5887 first ? "" : ", ",
5888 trx->stmt_progress.inserted,
5889 trx->stmt_progress.inserted == 1 ? "" : "s");
5890 }
5891 assert_always(r >= 0);
5892 next_status += r;
5893 first = false;
5894 }
5895 if (trx->stmt_progress.updated) {
5896 r = sprintf(
5897 next_status,
5898 "%sUpdated about %llu row%s",
5899 first ? "" : ", ",
5900 trx->stmt_progress.updated,
5901 trx->stmt_progress.updated == 1 ? "" : "s");
5902 assert_always(r >= 0);
5903 next_status += r;
5904 first = false;
5905 }
5906 if (trx->stmt_progress.deleted) {
5907 r = sprintf(
5908 next_status,
5909 "%sDeleted about %llu row%s",
5910 first ? "" : ", ",
5911 trx->stmt_progress.deleted,
5912 trx->stmt_progress.deleted == 1 ? "" : "s");
5913 assert_always(r >= 0);
5914 next_status += r;
5915 first = false;
5916 }
5917 if (!first)
5918 thd_proc_info(thd, write_status_msg);
5919 }
5920 }
5921 }
5922
5923
5924 DBT *ha_tokudb::get_pos(DBT * to, uchar * pos) {
5925 TOKUDB_HANDLER_DBUG_ENTER("");
5926 /* We don't need to set app_data here */
5927 memset((void *) to, 0, sizeof(*to));
5928 to->data = pos + sizeof(uint32_t);
5929 to->size = *(uint32_t *)pos;
5930 DBUG_DUMP("key", (const uchar *) to->data, to->size);
5931 DBUG_RETURN(to);
5932 }
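// A saved position ("pos"/"ref") is a length-prefixed primary key; a
// sketch of the layout that get_pos() above decodes and position()
// further below encodes:
//
//     uint32_t key_size;                              // first 4 bytes
//     memcpy(&key_size, pos, sizeof(uint32_t));
//     const uchar* key_data = pos + sizeof(uint32_t); // key_size bytes
//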
5933
// Retrieves a row based on the primary key saved in pos
5935 // Returns:
5936 // 0 on success
5937 // HA_ERR_KEY_NOT_FOUND if not found
5938 // error otherwise
5939 int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
5940 TOKUDB_HANDLER_DBUG_ENTER("");
5941 DBT db_pos;
5942 int error = 0;
5943 struct smart_dbt_info info;
5944 bool old_unpack_entire_row = unpack_entire_row;
5945 DBT* key = get_pos(&db_pos, pos);
5946
5947 unpack_entire_row = true;
5948 ha_statistic_increment(&SSV::ha_read_rnd_count);
5949 tokudb_active_index = MAX_KEY;
5950
5951 THD *thd = ha_thd();
5952 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5953 // test rpl slave by inducing a delay before the point query
5954 if (thd->slave_thread && (in_rpl_delete_rows || in_rpl_update_rows)) {
5955 DBUG_EXECUTE_IF("tokudb_crash_if_rpl_looks_up_row", DBUG_ASSERT(0););
5956 uint64_t delay_ms = tokudb::sysvars::rpl_lookup_rows_delay(thd);
5957 if (delay_ms)
5958 usleep(delay_ms * 1000);
5959 }
5960 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5961
5962 info.ha = this;
5963 info.buf = buf;
5964 info.keynr = primary_key;
5965
5966 error = share->file->getf_set(share->file, transaction,
5967 get_cursor_isolation_flags(lock.type, thd),
5968 key, smart_dbt_callback_rowread_ptquery, &info);
5969
5970 if (error == DB_NOTFOUND) {
5971 error = HA_ERR_KEY_NOT_FOUND;
5972 goto cleanup;
5973 }
5974 cleanup:
5975 unpack_entire_row = old_unpack_entire_row;
5976 TOKUDB_HANDLER_DBUG_RETURN(error);
5977 }
5978
5979 int ha_tokudb::prelock_range(const key_range *start_key, const key_range *end_key) {
5980 TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
5981 THD* thd = ha_thd();
5982
5983 int error = 0;
5984 DBT start_dbt_key;
5985 DBT end_dbt_key;
5986 uchar* start_key_buff = prelocked_left_range;
5987 uchar* end_key_buff = prelocked_right_range;
5988
5989 memset((void *) &start_dbt_key, 0, sizeof(start_dbt_key));
5990 memset((void *) &end_dbt_key, 0, sizeof(end_dbt_key));
5991
5992 HANDLE_INVALID_CURSOR();
5993 if (start_key) {
5994 switch (start_key->flag) {
5995 case HA_READ_AFTER_KEY:
5996 pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_POS_INF);
5997 break;
5998 default:
5999 pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_NEG_INF);
6000 break;
6001 }
6002 prelocked_left_range_size = start_dbt_key.size;
6003 }
6004 else {
6005 prelocked_left_range_size = 0;
6006 }
6007
6008 if (end_key) {
6009 switch (end_key->flag) {
6010 case HA_READ_BEFORE_KEY:
6011 pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_NEG_INF);
6012 break;
6013 default:
6014 pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_POS_INF);
6015 break;
6016 }
6017 prelocked_right_range_size = end_dbt_key.size;
6018 }
6019 else {
6020 prelocked_right_range_size = 0;
6021 }
6022
6023 error = cursor->c_set_bounds(
6024 cursor,
6025 start_key ? &start_dbt_key : share->key_file[tokudb_active_index]->dbt_neg_infty(),
6026 end_key ? &end_dbt_key : share->key_file[tokudb_active_index]->dbt_pos_infty(),
6027 true,
6028 (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
6029 );
6030 if (error) {
6031 error = map_to_handler_error(error);
6032 last_cursor_error = error;
6033 //
6034 // cursor should be initialized here, but in case it is not, we still check
6035 //
6036 if (cursor) {
6037 int r = cursor->c_close(cursor);
6038 assert_always(r==0);
6039 cursor = NULL;
6040 remove_from_trx_handler_list();
6041 }
6042 goto cleanup;
6043 }
6044
6045 // at this point, determine if we will be doing bulk fetch
6046 doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
6047 bulk_fetch_iteration = 0;
6048 rows_fetched_using_bulk_fetch = 0;
6049
6050 cleanup:
6051 TOKUDB_HANDLER_DBUG_RETURN(error);
6052 }
6053
6054 //
6055 // Prelock range if possible, start_key is leftmost, end_key is rightmost
6056 // whether scanning forward or backward. This function is called by MySQL
6057 // for backward range queries (in QUICK_SELECT_DESC::get_next).
6058 // Forward scans use read_range_first()/read_range_next().
6059 //
6060 int ha_tokudb::prepare_range_scan( const key_range *start_key, const key_range *end_key) {
6061 TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
6062 int error = prelock_range(start_key, end_key);
6063 if (!error) {
6064 range_lock_grabbed = true;
6065 }
6066 TOKUDB_HANDLER_DBUG_RETURN(error);
6067 }
6068
6069 int ha_tokudb::read_range_first(
6070 const key_range *start_key,
6071 const key_range *end_key,
6072 bool eq_range,
6073 bool sorted)
6074 {
6075 TOKUDB_HANDLER_DBUG_ENTER("%p %p %u %u", start_key, end_key, eq_range, sorted);
6076 int error = prelock_range(start_key, end_key);
6077 if (error) { goto cleanup; }
6078 range_lock_grabbed = true;
6079
6080 error = handler::read_range_first(start_key, end_key, eq_range, sorted);
6081 cleanup:
6082 TOKUDB_HANDLER_DBUG_RETURN(error);
6083 }
6084
6085 int ha_tokudb::read_range_next()
6086 {
6087 TOKUDB_HANDLER_DBUG_ENTER("");
6088 int error;
6089 error = handler::read_range_next();
6090 if (error) {
6091 range_lock_grabbed = false;
6092 }
6093 TOKUDB_HANDLER_DBUG_RETURN(error);
6094 }
6095
6096
6097
6098 /*
6099 Set a reference to the current record in (ref,ref_length).
6100
6101 SYNOPSIS
6102 ha_tokudb::position()
6103 record The current record buffer
6104
6105 DESCRIPTION
6106 The BDB handler stores the primary key in (ref,ref_length).
6107 There is either an explicit primary key, or an implicit (hidden)
6108 primary key.
6109 During open(), 'ref_length' is calculated as the maximum primary
6110 key length. When an actual key is shorter than that, the rest of
    the buffer must be cleared out. The row cannot be identified if
    garbage follows the end of the key. There is no length
6113 field for the current key, so that the whole ref_length is used
6114 for comparison.
6115
6116 RETURN
6117 nothing
6118 */
6119 void ha_tokudb::position(const uchar * record) {
6120 TOKUDB_HANDLER_DBUG_ENTER("");
6121 DBT key;
6122 if (hidden_primary_key) {
6123 DBUG_ASSERT(ref_length == (TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t)));
6124 memcpy(ref + sizeof(uint32_t), current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
6125 *(uint32_t *)ref = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
6126 }
6127 else {
6128 bool has_null;
6129 //
6130 // save the data
6131 //
6132 create_dbt_key_from_table(&key, primary_key, ref + sizeof(uint32_t), record, &has_null);
6133 //
6134 // save the size of data in the first four bytes of ref
6135 //
6136 memcpy(ref, &key.size, sizeof(uint32_t));
6137 }
6138 TOKUDB_HANDLER_DBUG_VOID_RETURN;
6139 }
6140
6141 //
6142 // Per InnoDB: Returns statistics information of the table to the MySQL interpreter,
6143 // in various fields of the handle object.
6144 // Return:
//      0 on success, error otherwise
6146 //
6147 int ha_tokudb::info(uint flag) {
6148 TOKUDB_HANDLER_DBUG_ENTER("%d", flag);
6149 int error = 0;
6150 #if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
6151 for (uint i=0; i < table->s->keys; i++)
6152 if (key_is_clustering(&table->key_info[i]))
6153 table->covering_keys.set_bit(i);
6154 #endif // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
6155 DB_TXN* txn = NULL;
6156 if (flag & HA_STATUS_VARIABLE) {
6157 stats.records = share->row_count() + share->rows_from_locked_table;
6158 stats.deleted = 0;
6159 if (!(flag & HA_STATUS_NO_LOCK)) {
6160
6161 error = txn_begin(db_env, NULL, &txn, DB_READ_UNCOMMITTED, ha_thd());
6162 if (error) {
6163 goto cleanup;
6164 }
6165
6166 // we should always have a primary key
6167 assert_always(share->file != NULL);
6168
6169 DB_BTREE_STAT64 dict_stats;
6170 error = share->file->stat64(share->file, txn, &dict_stats);
6171 if (error) {
6172 goto cleanup;
6173 }
6174 share->set_row_count(dict_stats.bt_ndata, false);
6175 stats.records = dict_stats.bt_ndata;
6176 stats.create_time = dict_stats.bt_create_time_sec;
6177 stats.update_time = dict_stats.bt_modify_time_sec;
6178 stats.check_time = dict_stats.bt_verify_time_sec;
6179 stats.data_file_length = dict_stats.bt_dsize;
6180 stats.delete_length = dict_stats.bt_fsize - dict_stats.bt_dsize;
6181 if (hidden_primary_key) {
6182 //
6183 // in this case, we have a hidden primary key, do not
6184 // want to report space taken up by the hidden primary key to the user
6185 //
6186 uint64_t hpk_space =
6187 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH * dict_stats.bt_ndata;
6188 stats.data_file_length =
6189 (hpk_space > stats.data_file_length) ?
6190 0 : stats.data_file_length - hpk_space;
6191 } else {
6192 //
6193 // one infinity byte per key needs to be subtracted
6194 //
6195 uint64_t inf_byte_space = dict_stats.bt_ndata;
6196 stats.data_file_length =
6197 (inf_byte_space > stats.data_file_length) ?
6198 0 : stats.data_file_length - inf_byte_space;
6199 }
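            // e.g. (in the explicit-PK case) with bt_ndata = 1000 rows
            // and bt_dsize = 64000 bytes, data_file_length above becomes
            // 64000 - 1000 = 63000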
6200
6201 stats.mean_rec_length =
6202 stats.records ?
6203 (ulong)(stats.data_file_length/stats.records) : 0;
6204 stats.index_file_length = 0;
6205 // curr_num_DBs is the number of keys we have, according
6206 // to the mysql layer. if drop index is running concurrently
6207 // with info() (it can, because info does not take table locks),
6208 // then it could be the case that one of the dbs was dropped
6209 // and set to NULL before mysql was able to set table->s->keys
6210 // accordingly.
6211 //
6212 // we should just ignore any DB * that is NULL.
6213 //
6214 // this solution is much simpler than trying to maintain an
6215 // accurate number of valid keys at the handlerton layer.
6216 uint curr_num_DBs =
6217 table->s->keys + tokudb_test(hidden_primary_key);
6218 for (uint i = 0; i < curr_num_DBs; i++) {
6219 // skip the primary key, skip dropped indexes
6220 if (i == primary_key || share->key_file[i] == NULL) {
6221 continue;
6222 }
6223 error = share->key_file[i]->stat64(
6224 share->key_file[i], txn, &dict_stats);
6225 if (error) {
6226 goto cleanup;
6227 }
6228 stats.index_file_length += dict_stats.bt_dsize;
6229 stats.delete_length +=
6230 dict_stats.bt_fsize - dict_stats.bt_dsize;
6231 }
6232 }
6233
6234 /*
6235 The following comment and logic has been taken from InnoDB and
      an old hack that forced stats.records to always be > 0 was removed
6237 ---
6238 The MySQL optimizer seems to assume in a left join that n_rows
6239 is an accurate estimate if it is zero. Of course, it is not,
6240 since we do not have any locks on the rows yet at this phase.
6241 Since SHOW TABLE STATUS seems to call this function with the
6242 HA_STATUS_TIME flag set, while the left join optimizer does not
6243 set that flag, we add one to a zero value if the flag is not
6244 set. That way SHOW TABLE STATUS will show the best estimate,
6245 while the optimizer never sees the table empty. */
6246 if (stats.records == 0 && !(flag & HA_STATUS_TIME)) {
6247 stats.records++;
6248 }
6249 }
6250 if ((flag & HA_STATUS_CONST)) {
6251 stats.max_data_file_length = 9223372036854775807ULL;
6252 }
6253 if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST)) {
6254 share->set_cardinality_counts_in_table(table);
6255 }
6256
6257 /* Don't return key if we got an error for the internal primary key */
6258 if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys) {
6259 errkey = last_dup_key;
6260 }
6261
6262 if (flag & HA_STATUS_AUTO && table->found_next_number_field) {
6263 THD* thd = table->in_use;
6264 struct system_variables* variables = &thd->variables;
6265 stats.auto_increment_value =
6266 share->last_auto_increment + variables->auto_increment_increment;
6267 }
6268 error = 0;
6269 cleanup:
6270 if (txn != NULL) {
6271 commit_txn(txn, DB_TXN_NOSYNC);
6272 txn = NULL;
6273 }
6274 TOKUDB_HANDLER_DBUG_RETURN(error);
6275 }
6276
6277 //
6278 // Per InnoDB: Tells something additional to the handler about how to do things.
6279 //
6280 int ha_tokudb::extra(enum ha_extra_function operation) {
6281 TOKUDB_HANDLER_DBUG_ENTER("%d", operation);
6282 switch (operation) {
6283 case HA_EXTRA_RESET_STATE:
6284 reset();
6285 break;
6286 case HA_EXTRA_KEYREAD:
6287 key_read = true; // Query satisfied with key
6288 break;
6289 case HA_EXTRA_NO_KEYREAD:
6290 key_read = false;
6291 break;
6292 case HA_EXTRA_IGNORE_DUP_KEY:
6293 using_ignore = true;
6294 break;
6295 case HA_EXTRA_NO_IGNORE_DUP_KEY:
6296 using_ignore = false;
6297 break;
6298 case HA_EXTRA_IGNORE_NO_KEY:
6299 using_ignore_no_key = true;
6300 break;
6301 case HA_EXTRA_NO_IGNORE_NO_KEY:
6302 using_ignore_no_key = false;
6303 break;
6304 case HA_EXTRA_NOT_USED:
6305 case HA_EXTRA_PREPARE_FOR_RENAME:
6306 break; // must do nothing and return 0
6307 default:
6308 break;
6309 }
6310 TOKUDB_HANDLER_DBUG_RETURN(0);
6311 }
6312
6313 int ha_tokudb::reset() {
6314 TOKUDB_HANDLER_DBUG_ENTER("");
6315 key_read = false;
6316 using_ignore = false;
6317 using_ignore_no_key = false;
6318 reset_dsmrr();
6319 invalidate_icp();
6320 TOKUDB_HANDLER_DBUG_RETURN(0);
6321 }
6322
6323 //
6324 // helper function that iterates through all DB's
6325 // and grabs a lock (either read or write, but not both)
6326 // Parameters:
6327 // [in] trans - transaction to be used to pre acquire the lock
6328 // lt - type of lock to get, either lock_read or lock_write
6329 // Returns:
6330 // 0 on success
6331 // error otherwise
6332 //
6333 int ha_tokudb::acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt) {
6334 TOKUDB_HANDLER_DBUG_ENTER("%p %s", trans, lt == lock_read ? "r" : "w");
6335 int error = ENOSYS;
6336 if (!num_DBs_locked_in_bulk) {
6337 rwlock_t_lock_read(share->_num_DBs_lock);
6338 }
6339 uint curr_num_DBs = share->num_DBs;
6340 if (lt == lock_read) {
6341 error = 0;
6342 goto cleanup;
6343 } else if (lt == lock_write) {
6344 for (uint i = 0; i < curr_num_DBs; i++) {
6345 DB* db = share->key_file[i];
6346 error = db->pre_acquire_table_lock(db, trans);
6347 if (error == EINVAL)
6348 TOKUDB_HANDLER_TRACE("%d db=%p trans=%p", i, db, trans);
6349 if (error) break;
6350 }
6351 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
6352 if (error) goto cleanup;
6353 } else {
6354 error = ENOSYS;
6355 goto cleanup;
6356 }
6357
6358 error = 0;
6359 cleanup:
6360 if (!num_DBs_locked_in_bulk) {
6361 share->_num_DBs_lock.unlock();
6362 }
6363 TOKUDB_HANDLER_DBUG_RETURN(error);
6364 }
6365
6366 int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
6367 int error;
6368 ulong tx_isolation = thd_tx_isolation(thd);
6369 HA_TOKU_ISO_LEVEL toku_iso_level = tx_to_toku_iso(tx_isolation);
6370 bool is_autocommit = !thd_test_options(
6371 thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);
6372
6373 /* First table lock, start transaction */
6374 if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) &&
6375 !trx->all &&
6376 (thd_sql_command(thd) != SQLCOM_CREATE_TABLE) &&
6377 (thd_sql_command(thd) != SQLCOM_DROP_TABLE) &&
6378 (thd_sql_command(thd) != SQLCOM_DROP_INDEX) &&
6379 (thd_sql_command(thd) != SQLCOM_CREATE_INDEX) &&
6380 (thd_sql_command(thd) != SQLCOM_ALTER_TABLE)) {
6381 /* QQQ We have to start a master transaction */
6382 // DBUG_PRINT("trans", ("starting transaction all "));
6383 uint32_t txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
6384 #if 50614 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
6385 if (thd_tx_is_read_only(thd)) {
6386 txn_begin_flags |= DB_TXN_READ_ONLY;
6387 }
6388 #endif
6389 if ((error = txn_begin(db_env, NULL, &trx->all, txn_begin_flags, thd))) {
6390 goto cleanup;
6391 }
6392 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6393 TOKUDB_DEBUG_TXN,
6394 "created master %p",
6395 trx->all);
6396 trx->sp_level = trx->all;
6397 trans_register_ha(thd, true, tokudb_hton);
6398 }
6399 DBUG_PRINT("trans", ("starting transaction stmt"));
6400 if (trx->stmt) {
6401 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6402 TOKUDB_DEBUG_TXN,
6403 "warning:stmt=%p",
6404 trx->stmt);
6405 }
6406 uint32_t txn_begin_flags;
6407 if (trx->all == NULL) {
6408 txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
6409 //
6410 // if the isolation level that the user has set is serializable,
6411 // but autocommit is on and this is just a select,
6412 // then we can go ahead and set the isolation level to
6413 // be a snapshot read, because we can serialize
6414 // the transaction to be the point in time at which the snapshot began.
6415 //
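        // e.g. (per the comment above) under SERIALIZABLE with
        // autocommit on, a plain SELECT has txn_begin_flags remapped
        // from 0 to DB_TXN_SNAPSHOT here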
6416 if (txn_begin_flags == 0 && is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT) {
6417 txn_begin_flags = DB_TXN_SNAPSHOT;
6418 }
6419 if (is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT &&
6420 !thd->in_sub_stmt && lock.type <= TL_READ_NO_INSERT &&
6421 !thd->lex->uses_stored_routines()) {
6422 txn_begin_flags |= DB_TXN_READ_ONLY;
6423 }
6424 } else {
6425 txn_begin_flags = DB_INHERIT_ISOLATION;
6426 }
6427 error = txn_begin(db_env, trx->sp_level, &trx->stmt, txn_begin_flags, thd);
6428 if (error) {
6429 /* We leave the possible master transaction open */
6430 goto cleanup;
6431 }
6432 trx->sub_sp_level = trx->stmt;
6433 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6434 TOKUDB_DEBUG_TXN,
6435 "created stmt %p sp_level %p",
6436 trx->sp_level,
6437 trx->stmt);
6438 reset_stmt_progress(&trx->stmt_progress);
6439 trans_register_ha(thd, false, tokudb_hton);
6440 cleanup:
6441 return error;
6442 }
6443
6444 static const char *lock_type_str(int lock_type) {
6445 if (lock_type == F_RDLCK) return "F_RDLCK";
6446 if (lock_type == F_WRLCK) return "F_WRLCK";
6447 if (lock_type == F_UNLCK) return "F_UNLCK";
6448 return "?";
6449 }
6450
6451 /*
6452 As MySQL will execute an external lock for every new table it uses
6453 we can use this to start the transactions.
6454 If we are in auto_commit mode we just need to start a transaction
6455 for the statement to be able to rollback the statement.
6456 If not, we have to start a master transaction if there doesn't exist
6457 one from before.
6458 */
6459 //
6460 // Parameters:
6461 // [in] thd - handle to the user thread
6462 // lock_type - the type of lock
6463 // Returns:
6464 // 0 on success
6465 // error otherwise
6466 //
6467 int ha_tokudb::external_lock(THD * thd, int lock_type) {
6468 TOKUDB_HANDLER_DBUG_ENTER(
6469 "cmd %d lock %d %s %s",
6470 thd_sql_command(thd),
6471 lock_type,
6472 lock_type_str(lock_type),
6473 share->full_table_name());
6474 if (TOKUDB_UNLIKELY(!TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ENTER) &&
6475 TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_LOCK))) {
6476 TOKUDB_HANDLER_TRACE(
6477 "cmd %d lock %d %s %s",
6478 thd_sql_command(thd),
6479 lock_type,
6480 lock_type_str(lock_type),
6481 share->full_table_name());
6482 }
6483 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());
6484
6485 int error = 0;
6486 tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
6487 if (!trx) {
6488 error = create_tokudb_trx_data_instance(&trx);
6489 if (error) { goto cleanup; }
6490 thd_set_ha_data(thd, tokudb_hton, trx);
6491 }
6492
6493 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6494 TOKUDB_DEBUG_TXN,
6495 "trx %p %p %p %p %u %u",
6496 trx->all,
6497 trx->stmt,
6498 trx->sp_level,
6499 trx->sub_sp_level,
6500 trx->tokudb_lock_count,
6501 trx->create_lock_count);
6502
6503 if (trx->all == NULL) {
6504 trx->sp_level = NULL;
6505 }
6506 if (lock_type != F_UNLCK) {
6507 use_write_locks = false;
6508 if (lock_type == F_WRLCK) {
6509 use_write_locks = true;
6510 }
6511 if (!trx->stmt) {
6512 transaction = NULL; // Safety
6513 error = create_txn(thd, trx);
6514 if (error) {
6515 goto cleanup;
6516 }
6517 trx->create_lock_count = trx->tokudb_lock_count;
6518 }
6519 transaction = trx->sub_sp_level;
6520 trx->tokudb_lock_count++;
6521 } else {
6522 share->update_row_count(thd, added_rows, deleted_rows, updated_rows);
6523 added_rows = 0;
6524 deleted_rows = 0;
6525 updated_rows = 0;
6526 share->rows_from_locked_table = 0;
6527 if (trx->tokudb_lock_count > 0) {
6528 if (--trx->tokudb_lock_count <= trx->create_lock_count) {
6529 trx->create_lock_count = 0;
6530 if (trx->stmt) {
6531 /*
6532 F_UNLCK is done without a transaction commit / rollback.
6533 This happens if the thread didn't update any rows
6534 We must in this case commit the work to keep the row locks
6535 */
                    DBUG_PRINT("trans", ("committing non-updating transaction"));
6537 reset_stmt_progress(&trx->stmt_progress);
6538 commit_txn(trx->stmt, 0);
6539 trx->stmt = NULL;
6540 trx->sub_sp_level = NULL;
6541 }
6542 }
6543 transaction = NULL;
6544 }
6545 }
6546 cleanup:
6547 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
6548 TOKUDB_HANDLER_DBUG_RETURN(error);
6549 }
6550
6551 /*
6552 When using LOCK TABLE's external_lock is only called when the actual
6553 TABLE LOCK is done.
6554 Under LOCK TABLES, each used tables will force a call to start_stmt.
6555 */
6556 int ha_tokudb::start_stmt(THD* thd, thr_lock_type lock_type) {
6557 TOKUDB_HANDLER_DBUG_ENTER(
6558 "cmd %d lock %d %s",
6559 thd_sql_command(thd),
6560 lock_type,
6561 share->full_table_name());
6562
6563 TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());
6564
6565 int error = 0;
6566 tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
6567 if (!trx) {
6568 error = create_tokudb_trx_data_instance(&trx);
6569 if (error) { goto cleanup; }
6570 thd_set_ha_data(thd, tokudb_hton, trx);
6571 }
6572
6573 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6574 TOKUDB_DEBUG_TXN,
6575 "trx %p %p %p %p %u %u",
6576 trx->all,
6577 trx->stmt,
6578 trx->sp_level,
6579 trx->sub_sp_level,
6580 trx->tokudb_lock_count,
6581 trx->create_lock_count);
6582
6583 /*
6584 note that trx->stmt may have been already initialized as start_stmt()
6585 is called for *each table* not for each storage engine,
6586 and there could be many bdb tables referenced in the query
6587 */
6588 if (!trx->stmt) {
6589 error = create_txn(thd, trx);
6590 if (error) {
6591 goto cleanup;
6592 }
6593 trx->create_lock_count = trx->tokudb_lock_count;
6594 } else {
6595 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6596 TOKUDB_DEBUG_TXN,
6597 "trx->stmt %p already existed",
6598 trx->stmt);
6599 }
6600 if (added_rows > deleted_rows) {
6601 share->rows_from_locked_table = added_rows - deleted_rows;
6602 }
6603 transaction = trx->sub_sp_level;
6604 trans_register_ha(thd, false, tokudb_hton);
6605 cleanup:
6606 TOKUDB_HANDLER_DBUG_RETURN(error);
6607 }
6608
6609
6610 uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd) {
6611 uint sql_command = thd_sql_command(thd);
6612 bool in_lock_tables = thd_in_lock_tables(thd);
6613
6614 //
    // following InnoDB's lead and having the checksum command use a snapshot read
6616 //
6617 if (sql_command == SQLCOM_CHECKSUM) {
6618 return 0;
6619 }
6620 else if ((lock_type == TL_READ && in_lock_tables) ||
6621 (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
6622 sql_command != SQLCOM_SELECT ||
6623 (sql_command == SQLCOM_SELECT && lock_type >= TL_WRITE_ALLOW_WRITE)) { // select for update
6624 ulong tx_isolation = thd_tx_isolation(thd);
6625 // pattern matched from InnoDB
6626 if ( (tx_isolation == ISO_READ_COMMITTED || tx_isolation == ISO_READ_UNCOMMITTED) &&
6627 (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) &&
6628 (sql_command == SQLCOM_INSERT_SELECT
6629 || sql_command == SQLCOM_REPLACE_SELECT
6630 || sql_command == SQLCOM_UPDATE
6631 || sql_command == SQLCOM_CREATE_TABLE) )
6632 {
6633 return 0;
6634 }
6635 else {
6636 return DB_SERIALIZABLE;
6637 }
6638 }
6639 else {
6640 return 0;
6641 }
6642 }
6643
6644 /*
6645 The idea with handler::store_lock() is the following:
6646
6647 The statement decided which locks we should need for the table
6648 for updates/deletes/inserts we get WRITE locks, for SELECT... we get
6649 read locks.
6650
6651 Before adding the lock into the table lock handler (see thr_lock.c)
6652 mysqld calls store lock with the requested locks. Store lock can now
6653 modify a write lock to a read lock (or some other lock), ignore the
6654 lock (if we don't want to use MySQL table locks at all) or add locks
6655 for many tables (like we do when we are using a MERGE handler).
6656
  TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which
  signals that we are doing WRITES, but we are still allowing other
  readers and writers).
6660
  When releasing locks, store_lock() is also called. In this case one
6662 usually doesn't have to do anything.
6663
6664 In some exceptional cases MySQL may send a request for a TL_IGNORE;
6665 This means that we are requesting the same lock as last time and this
6666 should also be ignored. (This may happen when someone does a flush
6667 table when we have opened a part of the tables, in which case mysqld
  closes and reopens the tables and tries to get the same locks as last
  time). In the future we will probably try to remove this.
6670 */
6671
6672 THR_LOCK_DATA* *ha_tokudb::store_lock(
6673 THD* thd,
6674 THR_LOCK_DATA** to,
6675 enum thr_lock_type lock_type) {
6676
6677 TOKUDB_HANDLER_DBUG_ENTER(
6678 "lock_type=%d cmd=%d",
6679 lock_type,
6680 thd_sql_command(thd));
6681 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6682 TOKUDB_DEBUG_LOCK,
6683 "lock_type=%d cmd=%d",
6684 lock_type,
6685 thd_sql_command(thd));
6686
6687 if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
6688 enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd);
6689 if (!thd->in_lock_tables) {
6690 if (sql_command == SQLCOM_CREATE_INDEX &&
6691 tokudb::sysvars::create_index_online(thd)) {
6692 // hot indexing
6693 rwlock_t_lock_read(share->_num_DBs_lock);
6694 if (share->num_DBs ==
6695 (table->s->keys + tokudb_test(hidden_primary_key))) {
6696 lock_type = TL_WRITE_ALLOW_WRITE;
6697 }
6698 share->_num_DBs_lock.unlock();
6699 } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
6700 lock_type <= TL_WRITE) &&
6701 sql_command != SQLCOM_TRUNCATE &&
6702 !thd_tablespace_op(thd)) {
6703 // allow concurrent writes
6704 lock_type = TL_WRITE_ALLOW_WRITE;
6705 } else if (sql_command == SQLCOM_OPTIMIZE &&
6706 lock_type == TL_READ_NO_INSERT) {
6707 // hot optimize table
6708 lock_type = TL_READ;
6709 }
6710 }
6711 lock.type = lock_type;
6712 }
6713 *to++ = &lock;
6714 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6715 TOKUDB_DEBUG_LOCK,
6716 "lock_type=%d",
6717 lock_type);
6718 TOKUDB_HANDLER_DBUG_RETURN_PTR(to);
6719 }
6720
6721 static toku_compression_method get_compression_method(DB* file) {
6722 enum toku_compression_method method;
6723 int r = file->get_compression_method(file, &method);
6724 assert_always(r == 0);
6725 return method;
6726 }
6727
6728 #if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
6729 TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6730 enum row_type ha_tokudb::get_row_type() const {
6731 toku_compression_method compression_method = get_compression_method(share->file);
6732 return toku_compression_method_to_row_type(compression_method);
6733 }
6734 #endif // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
6735 // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6736
6737 static int create_sub_table(
6738 const char* table_name,
6739 DBT* row_descriptor,
6740 DB_TXN* txn,
6741 uint32_t block_size,
6742 uint32_t read_block_size,
6743 toku_compression_method compression_method,
6744 bool is_hot_index,
6745 uint32_t fanout) {
6746
6747 TOKUDB_DBUG_ENTER("");
6748 int error;
6749 DB *file = NULL;
6750 uint32_t create_flags;
6751
6752
6753 error = db_create(&file, db_env, 0);
6754 if (error) {
6755 DBUG_PRINT("error", ("Got error: %d when creating table", error));
6756 my_errno = error;
6757 goto exit;
6758 }
6759
6760
6761 if (block_size != 0) {
6762 error = file->set_pagesize(file, block_size);
6763 if (error != 0) {
6764 DBUG_PRINT(
6765 "error",
6766 ("Got error: %d when setting block size %u for table '%s'",
6767 error,
6768 block_size,
6769 table_name));
6770 goto exit;
6771 }
6772 }
6773 if (read_block_size != 0) {
6774 error = file->set_readpagesize(file, read_block_size);
6775 if (error != 0) {
6776 DBUG_PRINT(
6777 "error",
6778 ("Got error: %d when setting read block size %u for table '%s'",
6779 error,
6780 read_block_size,
6781 table_name));
6782 goto exit;
6783 }
6784 }
6785 if (fanout != 0) {
6786 error = file->set_fanout(file, fanout);
6787 if (error != 0) {
6788 DBUG_PRINT(
6789 "error",
6790 ("Got error: %d when setting fanout %u for table '%s'",
6791 error,
6792 fanout,
6793 table_name));
6794 goto exit;
6795 }
6796 }
6797 error = file->set_compression_method(file, compression_method);
6798 if (error != 0) {
6799 DBUG_PRINT(
6800 "error",
6801 ("Got error: %d when setting compression type %u for table '%s'",
6802 error,
6803 compression_method,
6804 table_name));
6805 goto exit;
6806 }
6807
6808 create_flags =
6809 DB_THREAD | DB_CREATE | DB_EXCL | (is_hot_index ? DB_IS_HOT_INDEX : 0);
6810 error =
6811 file->open(
6812 file,
6813 txn,
6814 table_name,
6815 NULL,
6816 DB_BTREE,
6817 create_flags,
6818 my_umask);
6819 if (error) {
6820 DBUG_PRINT(
6821 "error",
6822 ("Got error: %d when opening table '%s'", error, table_name));
6823 goto exit;
6824 }
6825
6826 error =
6827 file->change_descriptor(
6828 file,
6829 txn,
6830 row_descriptor,
6831 (is_hot_index ? DB_IS_HOT_INDEX |
6832 DB_UPDATE_CMP_DESCRIPTOR :
6833 DB_UPDATE_CMP_DESCRIPTOR));
6834 if (error) {
6835 DBUG_PRINT(
6836 "error",
6837 ("Got error: %d when setting row descriptor for table '%s'",
6838 error,
6839 table_name));
6840 goto exit;
6841 }
6842
6843 error = 0;
6844 exit:
6845 if (file) {
6846 int r = file->close(file, 0);
6847 assert_always(r==0);
6848 }
6849 TOKUDB_DBUG_RETURN(error);
6850 }
6851
6852 void ha_tokudb::update_create_info(HA_CREATE_INFO* create_info) {
6853 if (share->has_auto_inc) {
6854 info(HA_STATUS_AUTO);
6855 if (!(create_info->used_fields & HA_CREATE_USED_AUTO) ||
6856 create_info->auto_increment_value < stats.auto_increment_value) {
6857 create_info->auto_increment_value = stats.auto_increment_value;
6858 }
6859 }
6860 #if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
6861 TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6862 if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
6863 // show create table asks us to update this create_info, this makes it
6864 // so we'll always show what compression type we're using
6865 create_info->row_type = get_row_type();
6866 if (create_info->row_type == ROW_TYPE_TOKU_ZLIB &&
6867 tokudb::sysvars::hide_default_row_format(ha_thd()) != 0) {
6868 create_info->row_type = ROW_TYPE_DEFAULT;
6869 }
6870 }
6871 #endif // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
6872 // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6873 }
6874
//
// removes key name from status.tokudb.
// needed when we are dropping indexes, so that during a later drop table
// we do not attempt to remove dictionaries for indexes that were already
// dropped; this is how status.tokudb is kept in sync with the list of indexes.
//
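// The status-row key assembled below is, as a sketch:
//   [HA_METADATA_KEY md_key = hatoku_key_name][key_name bytes][NUL]
// so removing (and, below, writing) an index entry is keyed by the
// metadata tag plus the NUL-terminated index name.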
6881 int ha_tokudb::remove_key_name_from_status(DB* status_block, char* key_name, DB_TXN* txn) {
6882 int error;
6883 uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6884 HA_METADATA_KEY md_key = hatoku_key_name;
6885 memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6886 //
6887 // put index name in status.tokudb
6888 //
6889 memcpy(
6890 status_key_info + sizeof(HA_METADATA_KEY),
6891 key_name,
6892 strlen(key_name) + 1
6893 );
6894 error = remove_metadata(
6895 status_block,
6896 status_key_info,
6897 sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6898 txn
6899 );
6900 return error;
6901 }
6902
6903 //
6904 // writes the key name in status.tokudb, so that we may later delete or rename
6905 // the dictionary associated with key_name
6906 //
6907 int ha_tokudb::write_key_name_to_status(DB* status_block, char* key_name, DB_TXN* txn) {
6908 int error;
6909 uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6910 HA_METADATA_KEY md_key = hatoku_key_name;
6911 memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6912 //
6913 // put index name in status.tokudb
6914 //
6915 memcpy(
6916 status_key_info + sizeof(HA_METADATA_KEY),
6917 key_name,
6918 strlen(key_name) + 1
6919 );
6920 error = write_metadata(
6921 status_block,
6922 status_key_info,
6923 sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6924 NULL,
6925 0,
6926 txn
6927 );
6928 return error;
6929 }
6930
6931 //
6932 // some tracing moved out of ha_tokudb::create, because ::create was
6933 // getting cluttered
6934 //
6935 void ha_tokudb::trace_create_table_info(TABLE* form) {
6936 uint i;
6937 //
6938 // tracing information about what type of table we are creating
6939 //
6940 if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_OPEN))) {
6941 for (i = 0; i < form->s->fields; i++) {
6942 Field *field = form->s->field[i];
6943 TOKUDB_HANDLER_TRACE(
6944 "field:%d:%s:type=%d:flags=%x",
6945 i,
6946 field->field_name,
6947 field->type(),
6948 field->flags);
6949 }
6950 for (i = 0; i < form->s->keys; i++) {
6951 KEY *key = &form->s->key_info[i];
6952 TOKUDB_HANDLER_TRACE(
6953 "key:%d:%s:%d",
6954 i,
6955 key->name,
6956 key->user_defined_key_parts);
6957 uint p;
6958 for (p = 0; p < key->user_defined_key_parts; p++) {
6959 KEY_PART_INFO* key_part = &key->key_part[p];
6960 Field* field = key_part->field;
6961 TOKUDB_HANDLER_TRACE(
6962 "key:%d:%d:length=%d:%s:type=%d:flags=%x",
6963 i,
6964 p,
6965 key_part->length,
6966 field->field_name,
6967 field->type(),
6968 field->flags);
6969 }
6970 }
6971 }
6972 }
6973
6974 static uint32_t get_max_desc_size(KEY_AND_COL_INFO* kc_info, TABLE* form) {
6975 uint32_t max_row_desc_buff_size;
6976 // upper bound of key comparison descriptor
6977 max_row_desc_buff_size = 2*(form->s->fields * 6)+10;
6978 // upper bound for sec. key part
6979 max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info);
6980 // upper bound for clustering val part
6981 max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s);
6982 return max_row_desc_buff_size;
6983 }
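// For example, a table with 10 fields gets a comparison-descriptor bound
// of 2*(10*6)+10 = 130 bytes, plus the secondary-key and clustering-val
// pack bounds computed by the two helpers called above.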
6984
6985 static uint32_t create_secondary_key_descriptor(
6986 uchar* buf,
6987 KEY* key_info,
6988 KEY* prim_key,
6989 uint hpk,
6990 TABLE* form,
6991 uint primary_key,
6992 uint32_t keynr,
6993 KEY_AND_COL_INFO* kc_info) {
6994
6995 uchar* ptr = NULL;
6996
6997 ptr = buf;
6998 ptr += create_toku_key_descriptor(
6999 ptr,
7000 false,
7001 key_info,
7002 hpk,
7003 prim_key
7004 );
7005
7006 ptr += create_toku_secondary_key_pack_descriptor(
7007 ptr,
7008 hpk,
7009 primary_key,
7010 form->s,
7011 form,
7012 kc_info,
7013 key_info,
7014 prim_key
7015 );
7016
7017 ptr += create_toku_clustering_val_pack_descriptor(
7018 ptr,
7019 primary_key,
7020 form->s,
7021 kc_info,
7022 keynr,
7023 key_is_clustering(key_info)
7024 );
7025 return ptr - buf;
7026 }
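// So a secondary-index row descriptor is three concatenated sections:
//   [key comparison descriptor]
//   [secondary key pack descriptor]
//   [clustering val pack descriptor]
// and the return value is the total number of bytes written into buf.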
7027
7028
7029 //
7030 // creates dictionary for secondary index, with key description key_info, all using txn
7031 //
7032 int ha_tokudb::create_secondary_dictionary(
7033 const char* name,
7034 TABLE* form,
7035 KEY* key_info,
7036 DB_TXN* txn,
7037 KEY_AND_COL_INFO* kc_info,
7038 uint32_t keynr,
7039 bool is_hot_index,
7040 toku_compression_method compression_method) {
7041
7042 int error;
7043 DBT row_descriptor;
7044 uchar* row_desc_buff = NULL;
7045 char* newname = NULL;
7046 size_t newname_len = 0;
7047 KEY* prim_key = NULL;
7048 char dict_name[MAX_DICT_NAME_LEN];
7049 uint32_t max_row_desc_buff_size;
7050 uint hpk= (form->s->primary_key >= MAX_KEY) ?
7051 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7052 uint32_t block_size;
7053 uint32_t read_block_size;
7054 uint32_t fanout;
7055 THD* thd = ha_thd();
7056
7057 memset(&row_descriptor, 0, sizeof(row_descriptor));
7058
7059 max_row_desc_buff_size = get_max_desc_size(kc_info,form);
7060
7061 row_desc_buff = (uchar*)tokudb::memory::malloc(
7062 max_row_desc_buff_size,
7063 MYF(MY_WME));
7064 if (row_desc_buff == NULL) {
7065 error = ENOMEM;
7066 goto cleanup;
7067 }
7068
7069 newname_len = get_max_dict_name_path_length(name);
7070 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7071 if (newname == NULL) {
7072 error = ENOMEM;
7073 goto cleanup;
7074 }
7075
7076 sprintf(dict_name, "key-%s", key_info->name);
7077 make_name(newname, newname_len, name, dict_name);
7078
7079 prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
7080
7081 //
7082 // setup the row descriptor
7083 //
7084 row_descriptor.data = row_desc_buff;
7085 //
7086 // save data necessary for key comparisons
7087 //
7088 row_descriptor.size = create_secondary_key_descriptor(
7089 row_desc_buff,
7090 key_info,
7091 prim_key,
7092 hpk,
7093 form,
7094 primary_key,
7095 keynr,
7096 kc_info);
7097 assert_always(row_descriptor.size <= max_row_desc_buff_size);
7098
7099 block_size = tokudb::sysvars::block_size(thd);
7100 read_block_size = tokudb::sysvars::read_block_size(thd);
7101 fanout = tokudb::sysvars::fanout(thd);
7102
7103 error = create_sub_table(
7104 newname,
7105 &row_descriptor,
7106 txn,
7107 block_size,
7108 read_block_size,
7109 compression_method,
7110 is_hot_index,
7111 fanout);
7112 cleanup:
7113 tokudb::memory::free(newname);
7114 tokudb::memory::free(row_desc_buff);
7115 return error;
7116 }
7117
7118
7119 static uint32_t create_main_key_descriptor(
7120 uchar* buf,
7121 KEY* prim_key,
7122 uint hpk,
7123 uint primary_key,
7124 TABLE* form,
7125 KEY_AND_COL_INFO* kc_info) {
7126
7127 uchar* ptr = buf;
7128 ptr += create_toku_key_descriptor(
7129 ptr,
7130 hpk,
7131 prim_key,
7132 false,
7133 NULL);
7134
7135 ptr += create_toku_main_key_pack_descriptor(ptr);
7136
7137 ptr += create_toku_clustering_val_pack_descriptor(
7138 ptr,
7139 primary_key,
7140 form->s,
7141 kc_info,
7142 primary_key,
7143 false);
7144 return ptr - buf;
7145 }
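// The main dictionary's row descriptor has the same three-section shape
// as a secondary one: [key comparison descriptor][main key pack
// descriptor][clustering val pack descriptor], with clustering disabled
// (false is passed) since the main dictionary holds the rows themselves.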
7146
7147 //
// create and close the main dictionary with name of "name" using table form,
// all within transaction txn.
7150 //
7151 int ha_tokudb::create_main_dictionary(
7152 const char* name,
7153 TABLE* form,
7154 DB_TXN* txn,
7155 KEY_AND_COL_INFO* kc_info,
7156 toku_compression_method compression_method) {
7157
7158 int error;
7159 DBT row_descriptor;
7160 uchar* row_desc_buff = NULL;
7161 char* newname = NULL;
7162 size_t newname_len = 0;
7163 KEY* prim_key = NULL;
7164 uint32_t max_row_desc_buff_size;
7165 uint hpk = (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7166 uint32_t block_size;
7167 uint32_t read_block_size;
7168 uint32_t fanout;
7169 THD* thd = ha_thd();
7170
7171 memset(&row_descriptor, 0, sizeof(row_descriptor));
7172 max_row_desc_buff_size = get_max_desc_size(kc_info, form);
7173
7174 row_desc_buff = (uchar*)tokudb::memory::malloc(
7175 max_row_desc_buff_size,
7176 MYF(MY_WME));
7177 if (row_desc_buff == NULL) {
7178 error = ENOMEM;
7179 goto cleanup;
7180 }
7181
7182 newname_len = get_max_dict_name_path_length(name);
7183 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7184 if (newname == NULL) {
7185 error = ENOMEM;
7186 goto cleanup;
7187 }
7188
7189 make_name(newname, newname_len, name, "main");
7190
7191 prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
7192
7193 //
7194 // setup the row descriptor
7195 //
7196 row_descriptor.data = row_desc_buff;
7197 //
7198 // save data necessary for key comparisons
7199 //
7200 row_descriptor.size = create_main_key_descriptor(
7201 row_desc_buff,
7202 prim_key,
7203 hpk,
7204 primary_key,
7205 form,
7206 kc_info);
7207 assert_always(row_descriptor.size <= max_row_desc_buff_size);
7208
7209 block_size = tokudb::sysvars::block_size(thd);
7210 read_block_size = tokudb::sysvars::read_block_size(thd);
7211 fanout = tokudb::sysvars::fanout(thd);
7212
7213 /* Create the main table that will hold the real rows */
7214 error = create_sub_table(
7215 newname,
7216 &row_descriptor,
7217 txn,
7218 block_size,
7219 read_block_size,
7220 compression_method,
7221 false,
7222 fanout);
7223 cleanup:
7224 tokudb::memory::free(newname);
7225 tokudb::memory::free(row_desc_buff);
7226 return error;
7227 }
7228
7229 //
7230 // Creates a new table
7231 // Parameters:
7232 // [in] name - table name
7233 // [in] form - info on table, columns and indexes
7234 // [in] create_info - more info on table, CURRENTLY UNUSED
7235 // Returns:
7236 // 0 on success
7237 // error otherwise
7238 //
7239 int ha_tokudb::create(
7240 const char* name,
7241 TABLE* form,
7242 HA_CREATE_INFO* create_info) {
7243
7244 TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7245
7246 int error;
7247 DB *status_block = NULL;
7248 uint version;
7249 uint capabilities;
7250 DB_TXN* txn = NULL;
7251 bool do_commit = false;
7252 char* newname = NULL;
7253 size_t newname_len = 0;
7254 KEY_AND_COL_INFO kc_info;
7255 tokudb_trx_data *trx = NULL;
7256 THD* thd = ha_thd();
7257
7258 String database_name, table_name, dictionary_name;
7259 tokudb_split_dname(name, database_name, table_name, dictionary_name);
7260 if (database_name.is_empty() || table_name.is_empty()) {
7261 push_warning_printf(thd,
7262 Sql_condition::WARN_LEVEL_WARN,
7263 ER_TABLE_NAME,
7264 "TokuDB: Table Name or Database Name is empty");
7265 DBUG_RETURN(ER_TABLE_NAME);
7266 }
7267
7268 memset(&kc_info, 0, sizeof(kc_info));
7269
7270 #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100999
7271 // TokuDB does not support discover_table_names() and writes no files
7272 // in the database directory, so automatic filename-based
    // discover_table_names() doesn't work either. So, we must force the
    // .frm file to disk.
7275 form->s->write_frm_image();
7276 #endif
7277
7278 #if defined(TOKU_INCLUDE_OPTION_STRUCTS) && TOKU_INCLUDE_OPTION_STRUCTS
7279 const tokudb::sysvars::format_t row_format =
7280 (tokudb::sysvars::row_format_t)form->s->option_struct->row_format;
7281 #else
7282 // TDB-76 : CREATE TABLE ... LIKE ... does not use source row_format on
7283 // target table
7284 // Original code would only use create_info->row_type if
7285 // create_info->used_fields & HA_CREATE_USED_ROW_FORMAT was true. This
7286 // would cause us to skip transferring the row_format for a table created
    // via CREATE TABLE tn LIKE tn. We also take on more InnoDB-like behavior
7288 // and throw a warning if we get a row_format that we can't translate into
7289 // a known TokuDB row_format.
7290 tokudb::sysvars::row_format_t row_format =
7291 tokudb::sysvars::row_format(thd);
7292
7293 if ((create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) ||
7294 create_info->row_type != ROW_TYPE_DEFAULT) {
7295 row_format = row_type_to_row_format(create_info->row_type);
7296 if (row_format == tokudb::sysvars::SRV_ROW_FORMAT_DEFAULT &&
7297 create_info->row_type != ROW_TYPE_DEFAULT) {
7298 push_warning(thd,
7299 Sql_condition::WARN_LEVEL_WARN,
7300 ER_ILLEGAL_HA_CREATE_OPTION,
7301 "TokuDB: invalid ROW_FORMAT specifier.");
7302 }
7303 }
7304 #endif // defined(TOKU_INCLUDE_OPTION_STRUCTS) && TOKU_INCLUDE_OPTION_STRUCTS
7305 const toku_compression_method compression_method =
7306 row_format_to_toku_compression_method(row_format);
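
// Illustrative SQL (hypothetical table names; the row formats are the
// documented tokudb_row_format values, mapped to a compression method by
// row_format_to_toku_compression_method() above):
//
//   CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=TokuDB ROW_FORMAT=TOKUDB_LZMA;
//   CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=TokuDB ROW_FORMAT=TOKUDB_UNCOMPRESSED;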
7307
7308 bool create_from_engine = (create_info->table_options & HA_OPTION_CREATE_FROM_ENGINE);
7309 if (create_from_engine) {
7310 // table already exists, nothing to do
7311 error = 0;
7312 goto cleanup;
7313 }
7314
7315 // validate the fields in the table. If the table has fields we do not
7316 // support because they came from an old version of MySQL, gracefully
7317 // return an error.
7318 for (uint32_t i = 0; i < form->s->fields; i++) {
7319 Field* field = table_share->field[i];
7320 if (!field_valid_for_tokudb_table(field)) {
7321 sql_print_error("Table %s has an invalid field %s, that was created "
7322 "with an old version of MySQL. This field is no longer supported. "
7323 "This is probably due to an alter table engine=TokuDB. To load this "
7324 "table, do a dump and load",
7325 name,
7326 field->field_name
7327 );
7328 error = HA_ERR_UNSUPPORTED;
7329 goto cleanup;
7330 }
7331 }
7332
7333 newname_len = get_max_dict_name_path_length(name);
7334 newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7335 if (newname == NULL) {
7336 error = ENOMEM;
7337 goto cleanup;
7338 }
7339
7340 trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
7341 if (trx && trx->sub_sp_level &&
7342 thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
7343 txn = trx->sub_sp_level;
7344 } else {
7345 do_commit = true;
7346 error = txn_begin(db_env, 0, &txn, 0, thd);
7347 if (error) {
7348 goto cleanup;
7349 }
7350 }
7351
7352 primary_key = form->s->primary_key;
7353 hidden_primary_key = (primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7354 if (hidden_primary_key) {
7355 primary_key = form->s->keys;
7356 }
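
// Illustrative: for a table declared without a PRIMARY KEY,
// form->s->primary_key is MAX_KEY, so the branch above selects a hidden key:
// an internally generated TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH-byte value per row,
// with primary_key set to form->s->keys, one past the user-defined indexes.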
7357
7358 /* do some tracing */
7359 trace_create_table_info(form);
7360
7361 /* Create status.tokudb and save relevant metadata */
7362 make_name(newname, newname_len, name, "status");
7363
7364 error = tokudb::metadata::create(db_env, &status_block, newname, txn);
7365 if (error) { goto cleanup; }
7366
7367 version = HA_TOKU_VERSION;
7368 error = write_to_status(
7369 status_block,
7370 hatoku_new_version,
7371 &version,
7372 sizeof(version),
7373 txn);
7374 if (error) {
7375 goto cleanup;
7376 }
7377
7378 capabilities = HA_TOKU_CAP;
7379 error = write_to_status(
7380 status_block,
7381 hatoku_capabilities,
7382 &capabilities,
7383 sizeof(capabilities),
7384 txn);
7385 if (error) {
7386 goto cleanup;
7387 }
7388
7389 error = write_auto_inc_create(
7390 status_block,
7391 create_info->auto_increment_value,
7392 txn);
7393 if (error) {
7394 goto cleanup;
7395 }
7396
7397 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
7398 #if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
7399 if (TOKU_PARTITION_WRITE_FRM_DATA || form->part_info == NULL) {
7400 error = write_frm_data(status_block, txn, form->s->path.str);
7401 if (error) {
7402 goto cleanup;
7403 }
7404 }
7405 #else
7406 error = write_frm_data(status_block, txn, form->s->path.str);
7407 if (error) {
7408 goto cleanup;
7409 }
7410 #endif // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
7411 #endif // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
7412
7413 error = allocate_key_and_col_info(form->s, &kc_info);
7414 if (error) {
7415 goto cleanup;
7416 }
7417
7418 error = initialize_key_and_col_info(
7419 form->s,
7420 form,
7421 &kc_info,
7422 hidden_primary_key,
7423 primary_key);
7424 if (error) {
7425 goto cleanup;
7426 }
7427
7428 error = create_main_dictionary(
7429 name,
7430 form,
7431 txn,
7432 &kc_info,
7433 compression_method);
7434 if (error) {
7435 goto cleanup;
7436 }
7437
7438
7439 for (uint i = 0; i < form->s->keys; i++) {
7440 if (i != primary_key) {
7441 error = create_secondary_dictionary(
7442 name,
7443 form,
7444 &form->key_info[i],
7445 txn,
7446 &kc_info,
7447 i,
7448 false,
7449 compression_method);
7450 if (error) {
7451 goto cleanup;
7452 }
7453
7454 error = write_key_name_to_status(
7455 status_block,
7456 form->s->key_info[i].name,
7457 txn);
7458 if (error) {
7459 goto cleanup;
7460 }
7461 }
7462 }
7463
7464 error = 0;
7465 cleanup:
7466 if (status_block != NULL) {
7467 int r = tokudb::metadata::close(&status_block);
7468 assert_always(r==0);
7469 }
7470 free_key_and_col_info(&kc_info);
7471 if (do_commit && txn) {
7472 if (error) {
7473 abort_txn(txn);
7474 } else {
7475 commit_txn(txn,0);
7476 }
7477 }
7478 tokudb::memory::free(newname);
7479 TOKUDB_HANDLER_DBUG_RETURN(error);
7480 }
7481
7482 int ha_tokudb::discard_or_import_tablespace(TOKUDB_UNUSED(my_bool discard)) {
7483 /*
7484 if (discard) {
7485 my_errno=HA_ERR_WRONG_COMMAND;
7486 return my_errno;
7487 }
7488 return add_table_to_metadata(share->table_name);
7489 */
7490 my_errno=HA_ERR_WRONG_COMMAND;
7491 return my_errno;
7492 }
7493
7494
7495 //
7496 // deletes from_name or renames from_name to to_name, all using transaction txn.
7497 // is_delete specifies which operation is performed.
7498 // is_key specifies whether the dictionary is a secondary index (in which case
7499 // a "key-" prefix is prepended to secondary_name) or not.
7500 //
7501 int ha_tokudb::delete_or_rename_dictionary(
7502 const char* from_name,
7503 const char* to_name,
7504 const char* secondary_name,
7505 bool is_key,
7506 DB_TXN* txn,
7507 bool is_delete) {
7508
7509 int error;
7510 char dict_name[MAX_DICT_NAME_LEN];
7511 char* new_from_name = NULL;
7512 size_t new_from_name_len = 0;
7513 char* new_to_name = NULL;
7514 size_t new_to_name_len = 0;
7515 assert_always(txn);
7516
7517 new_from_name_len = get_max_dict_name_path_length(from_name);
7518 new_from_name = (char*)tokudb::memory::malloc(
7519 new_from_name_len,
7520 MYF(MY_WME));
7521 if (new_from_name == NULL) {
7522 error = ENOMEM;
7523 goto cleanup;
7524 }
7525 if (!is_delete) {
7526 assert_always(to_name);
7527 new_to_name_len = get_max_dict_name_path_length(to_name);
7528 new_to_name = (char*)tokudb::memory::malloc(
7529 new_to_name_len,
7530 MYF(MY_WME));
7531 if (new_to_name == NULL) {
7532 error = ENOMEM;
7533 goto cleanup;
7534 }
7535 }
7536
7537 if (is_key) {
7538 sprintf(dict_name, "key-%s", secondary_name);
7539 make_name(new_from_name, new_from_name_len, from_name, dict_name);
7540 } else {
7541 make_name(new_from_name, new_from_name_len, from_name, secondary_name);
7542 }
7543 if (!is_delete) {
7544 if (is_key) {
7545 sprintf(dict_name, "key-%s", secondary_name);
7546 make_name(new_to_name, new_to_name_len, to_name, dict_name);
7547 } else {
7548 make_name(new_to_name, new_to_name_len, to_name, secondary_name);
7549 }
7550 }
7551
7552 if (is_delete) {
7553 error = db_env->dbremove(db_env, txn, new_from_name, NULL, 0);
7554 } else {
7555 error = db_env->dbrename(
7556 db_env,
7557 txn,
7558 new_from_name,
7559 NULL,
7560 new_to_name,
7561 0);
7562 }
7563 if (error) {
7564 goto cleanup;
7565 }
7566
7567 cleanup:
7568 tokudb::memory::free(new_from_name);
7569 tokudb::memory::free(new_to_name);
7570 return error;
7571 }
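
// Illustrative call sites (hypothetical table paths, in the "./db/table"
// format MySQL hands to the handler):
//
//   // drop the dictionary backing secondary index "idx_a" of table t1:
//   delete_or_rename_dictionary("./test/t1", NULL, "idx_a", true, txn, true);
//
//   // rename the main dictionary when t1 is renamed to t2:
//   delete_or_rename_dictionary("./test/t1", "./test/t2", "main", false, txn, false);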
7572
7573
7574 //
7575 // deletes or renames a table. If is_delete is true, then we delete, and to_name can be NULL.
7576 // If is_delete is false, then to_name must be non-NULL, as we are renaming the table.
7577 //
7578 int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete) {
7579 THD *thd = ha_thd();
7580 int error;
7581 DB* status_db = NULL;
7582 DBC* status_cursor = NULL;
7583 DB_TXN* txn = NULL;
7584 DBT curr_key;
7585 DBT curr_val;
7586 memset(&curr_key, 0, sizeof(curr_key));
7587 memset(&curr_val, 0, sizeof(curr_val));
7588
7589 DB_TXN *parent_txn = NULL;
7590 tokudb_trx_data *trx = NULL;
7591 trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
7592 if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
7593 parent_txn = trx->sub_sp_level;
7594 }
7595
7596 error = txn_begin(db_env, parent_txn, &txn, 0, thd);
7597 if (error) { goto cleanup; }
7598
7599 //
7600 // open status db,
7601 // create cursor,
7602 // for each name read out of there, create a db and delete or rename it
7603 //
7604 error = open_status_dictionary(&status_db, from_name, txn);
7605 if (error) { goto cleanup; }
7606
7607 error = status_db->cursor(status_db, txn, &status_cursor, 0);
7608 if (error) { goto cleanup; }
7609 status_cursor->c_set_check_interrupt_callback(status_cursor, tokudb_killed_thd_callback, thd);
7610
7611 while (error != DB_NOTFOUND) {
7612 error = status_cursor->c_get(status_cursor, &curr_key, &curr_val, DB_NEXT);
7613 if (error && error != DB_NOTFOUND) {
7614 error = map_to_handler_error(error);
7615 goto cleanup;
7616 }
7617 if (error == DB_NOTFOUND) {
7618 break;
7619 }
7620 HA_METADATA_KEY mk = *(HA_METADATA_KEY *)curr_key.data;
7621 if (mk != hatoku_key_name) {
7622 continue;
7623 }
7624 error = delete_or_rename_dictionary(from_name, to_name, (char *)((char *)curr_key.data + sizeof(HA_METADATA_KEY)), true, txn, is_delete);
7625 if (error) { goto cleanup; }
7626 }
7627
7628 //
7629 // delete or rename main.tokudb
7630 //
7631 error = delete_or_rename_dictionary(from_name, to_name, "main", false, txn, is_delete);
7632 if (error) { goto cleanup; }
7633
7634 error = status_cursor->c_close(status_cursor);
7635 assert_always(error==0);
7636 status_cursor = NULL;
7638
7639 error = status_db->close(status_db, 0);
7640 assert_always(error == 0);
7641 status_db = NULL;
7642
7643 //
7644 // delete or rename status.tokudb
7645 //
7646 error = delete_or_rename_dictionary(from_name, to_name, "status", false, txn, is_delete);
7647 if (error) { goto cleanup; }
7648
7649 my_errno = error;
7650 cleanup:
7651 if (status_cursor) {
7652 int r = status_cursor->c_close(status_cursor);
7653 assert_always(r==0);
7654 }
7655 if (status_db) {
7656 int r = status_db->close(status_db, 0);
7657 assert_always(r==0);
7658 }
7659 if (txn) {
7660 if (error) {
7661 abort_txn(txn);
7662 }
7663 else {
7664 commit_txn(txn, 0);
7665 }
7666 }
7667 return error;
7668 }
7669
7670
7671 //
7672 // Drops table
7673 // Parameters:
7674 // [in] name - name of table to be deleted
7675 // Returns:
7676 // 0 on success
7677 // error otherwise
7678 //
7679 int ha_tokudb::delete_table(const char *name) {
7680 TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7681 TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(name, NULL, false);
7682 if (share) {
7683 share->unlock();
7684 share->release();
7685 // this should be enough to handle locking as the higher level MDL
7686 // on this table should prevent any new analyze tasks.
7687 share->cancel_background_jobs();
7688 TOKUDB_SHARE::drop_share(share);
7689 }
7690
7691 int error;
7692 error = delete_or_rename_table(name, NULL, true);
7693 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7694 error == DB_LOCK_NOTGRANTED) {
7695 sql_print_error(
7696 "Could not delete table %s because another transaction has "
7697 "accessed the table. To drop the table, make sure no "
7698 "transactions touch the table.",
7699 name);
7700 }
7701 TOKUDB_HANDLER_DBUG_RETURN(error);
7702 }
7703
7704 static bool tokudb_check_db_dir_exist_from_table_name(const char *table_name) {
7705 DBUG_ASSERT(table_name);
7706 bool mysql_dir_exists;
7707 char db_name[FN_REFLEN];
7708 const char *db_name_begin = strchr(table_name, FN_LIBCHAR);
7709 const char *db_name_end = strrchr(table_name, FN_LIBCHAR);
7710 DBUG_ASSERT(db_name_begin);
7711 DBUG_ASSERT(db_name_end);
7712 DBUG_ASSERT(db_name_begin != db_name_end);
7713
7714 ++db_name_begin;
7715 size_t db_name_size = db_name_end - db_name_begin;
7716
7717 DBUG_ASSERT(db_name_size < FN_REFLEN);
7718
7719 memcpy(db_name, db_name_begin, db_name_size);
7720 db_name[db_name_size] = '\0';
7721
7722 // At this point, db_name contains the MySQL formatted database name.
7723 // This is exactly the same format that would come into us through a
7724 // CREATE TABLE. Some characters (like ':' for example) might be expanded
7725 // into hex (':' would appear as "@003a").
7726 // We need to check that the MySQL destination database directory exists.
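// Illustrative (hypothetical name): for table_name "./mydb/t1", db_name_begin
// points just past the first FN_LIBCHAR, db_name_end at the last one, and
// db_name becomes "mydb".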
7727 mysql_dir_exists = (my_access(db_name, F_OK) == 0);
7728
7729 return mysql_dir_exists;
7730 }
7731
7732 //
7733 // renames table from "from" to "to"
7734 // Parameters:
7735 // [in] name - old name of table
7736 // [in] to - new name of table
7737 // Returns:
7738 // 0 on success
7739 // error otherwise
7740 //
7741 int ha_tokudb::rename_table(const char *from, const char *to) {
7742 TOKUDB_HANDLER_DBUG_ENTER("%s %s", from, to);
7743 TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(from, NULL, false);
7744 if (share) {
7745 share->unlock();
7746 share->release();
7747 // this should be enough to handle locking as the higher level MDL
7748 // on this table should prevent any new analyze tasks.
7749 share->cancel_background_jobs();
7750 TOKUDB_SHARE::drop_share(share);
7751 }
7752 int error;
7753 bool to_db_dir_exist = tokudb_check_db_dir_exist_from_table_name(to);
7754 if (!to_db_dir_exist) {
7755 sql_print_error(
7756 "Could not rename table from %s to %s because "
7757 "destination db does not exist",
7758 from,
7759 to);
7760 error = HA_ERR_DEST_SCHEMA_NOT_EXIST;
7761 }
7762 else {
7763 error = delete_or_rename_table(from, to, false);
7764 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7765 error == DB_LOCK_NOTGRANTED) {
7766 sql_print_error(
7767 "Could not rename table from %s to %s because another transaction "
7768 "has accessed the table. To rename the table, make sure no "
7769 "transactions touch the table.",
7770 from,
7771 to);
7772 }
7773 }
7774 TOKUDB_HANDLER_DBUG_RETURN(error);
7775 }
7776
7777
7778 /*
7779 Returns estimate on number of seeks it will take to read through the table
7780 This is to be comparable to the number returned by records_in_range so
7781 that we can decide if we should scan the table or use keys.
7782 */
7783 // QQQ: why divide the record count by 3?
7784 double ha_tokudb::scan_time() {
7785 TOKUDB_HANDLER_DBUG_ENTER("");
7786 double ret_val = (double)stats.records / 3;
7787 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7788 TOKUDB_DEBUG_RETURN,
7789 "return %" PRIu64 " %f",
7790 (uint64_t)stats.records,
7791 ret_val);
7792 DBUG_RETURN(ret_val);
7793 }
7794
7795 double ha_tokudb::keyread_time(uint index, uint ranges, ha_rows rows)
7796 {
7797 TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7798 double ret_val;
7799 if (index == primary_key || key_is_clustering(&table->key_info[index])) {
7800 ret_val = read_time(index, ranges, rows);
7801 DBUG_RETURN(ret_val);
7802 }
7803 /*
7804 It is assumed that we will read through the whole key range and that all
7805 key blocks are half full (normally things are much better). It is also
7806 assumed that each time we read the next key from the index, the handler
7807 performs a random seek, thus the cost is proportional to the number of
7808 blocks read. This model does not take into account clustered indexes -
7809 engines that support that (e.g. InnoDB) may want to override this method.
7810 */
7811 double keys_per_block= (stats.block_size/2.0/
7812 (table->key_info[index].key_length +
7813 ref_length) + 1);
7814 ret_val = (rows + keys_per_block - 1)/ keys_per_block;
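
// Worked example for the two formulas above (numbers are hypothetical): with
// stats.block_size = 16384 and key_length + ref_length = 40,
// keys_per_block = 16384/2.0/40 + 1 = 205.8, so rows = 10000 costs about
// (10000 + 205.8 - 1) / 205.8 ~= 49.6 estimated seeks.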
7815 TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7816 }
7817
7818 //
7819 // Calculate the time it takes to read a set of ranges through an index
7820 // This enables us to optimize reads for clustered indexes.
7821 // Implementation pulled from InnoDB
7822 // Parameters:
7823 // index - index to use
7824 // ranges - number of ranges
7825 // rows - estimated number of rows in the range
7826 // Returns:
7827 // estimated time measured in disk seeks
7828 //
7829 double ha_tokudb::read_time(
7830 uint index,
7831 uint ranges,
7832 ha_rows rows
7833 )
7834 {
7835 TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7836 double total_scan;
7837 double ret_val;
7838 bool is_primary = (index == primary_key);
7839 bool is_clustering;
7840
7841 //
7842 // this path is taken in the case of a hidden primary key
7843 //
7844 if (index >= table_share->keys) {
7845 ret_val = handler::read_time(index, ranges, rows);
7846 goto cleanup;
7847 }
7848
7849 is_clustering = key_is_clustering(&table->key_info[index]);
7850
7851
7852 //
7853 // if it is not the primary key, and it is not a clustering key, then return handler::read_time
7854 //
7855 if (!(is_primary || is_clustering)) {
7856 ret_val = handler::read_time(index, ranges, rows);
7857 goto cleanup;
7858 }
7859
7860 //
7861 // for primary key and for clustered keys, return a fraction of scan_time()
7862 //
7863 total_scan = scan_time();
7864
7865 if (stats.records < rows) {
7866 ret_val = is_clustering ? total_scan + 0.00001 : total_scan;
7867 goto cleanup;
7868 }
7869
7870 //
7871 // one disk seek per range plus the proportional scan time of the rows
7872 //
7873 ret_val = (ranges + (double) rows / (double) stats.records * total_scan);
7874 ret_val = is_clustering ? ret_val + 0.00001 : ret_val;
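
// Worked example (hypothetical numbers): with stats.records = 30000, so
// total_scan = 10000, a clustering read of ranges = 2 and rows = 3000 costs
// 2 + (3000.0 / 30000.0) * 10000 + 0.00001 = 1002.00001 estimated seeks;
// cheap compared to a full scan whenever rows << stats.records.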
7875
7876 cleanup:
7877 TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7878 }
7879
7880 double ha_tokudb::index_only_read_time(uint keynr, double records) {
7881 TOKUDB_HANDLER_DBUG_ENTER("%u %f", keynr, records);
7882 double ret_val = keyread_time(keynr, 1, (ha_rows)records);
7883 TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7884 }
7885
7886 //
7887 // Estimates the number of index records in a range. In case of errors, return
7888 // HA_TOKUDB_RANGE_COUNT instead of HA_POS_ERROR. This was the behavior
7889 // when we got the handlerton from MySQL.
7890 // Parameters:
7891 // keynr - index to use
7892 // [in] start_key - low end of the range
7893 // [in] end_key - high end of the range
7894 // Returns:
7895 // 0 - There are no matching keys in the given range
7896 // number > 0 - There are approximately number matching rows in the range
7897 // HA_POS_ERROR - Something is wrong with the index tree
7898 //
7899 ha_rows ha_tokudb::records_in_range(uint keynr, key_range* start_key, key_range* end_key) {
7900 TOKUDB_HANDLER_DBUG_ENTER("%d %p %p", keynr, start_key, end_key);
7901 DBT *pleft_key, *pright_key;
7902 DBT left_key, right_key;
7903 ha_rows ret_val = HA_TOKUDB_RANGE_COUNT;
7904 DB *kfile = share->key_file[keynr];
7905 uint64_t rows = 0;
7906 int error;
7907
7908 // estimate the number of rows in the range.
7909 // when calling keys_range64, the only value we can trust is the value for less.
7910 // The reason is that the key being passed in may be a prefix of keys in the DB.
7911 // As a result, equal may be 0 and greater may actually be equal+greater.
7912 // So, we call keys_range64 on the key, and the key that is after it.
7913 if (!start_key && !end_key) {
7914 error = estimate_num_rows(share->file, &rows, transaction);
7915 if (error) {
7916 ret_val = HA_TOKUDB_RANGE_COUNT;
7917 goto cleanup;
7918 }
7919 ret_val = (rows <= 1) ? 1 : rows;
7920 goto cleanup;
7921 }
7922 if (start_key) {
7923 uchar inf_byte = (start_key->flag == HA_READ_KEY_EXACT) ? COL_NEG_INF : COL_POS_INF;
7924 pack_key(&left_key, keynr, key_buff, start_key->key, start_key->length, inf_byte);
7925 pleft_key = &left_key;
7926 } else {
7927 pleft_key = NULL;
7928 }
7929 if (end_key) {
7930 uchar inf_byte = (end_key->flag == HA_READ_BEFORE_KEY) ? COL_NEG_INF : COL_POS_INF;
7931 pack_key(&right_key, keynr, key_buff2, end_key->key, end_key->length, inf_byte);
7932 pright_key = &right_key;
7933 } else {
7934 pright_key = NULL;
7935 }
7936 // keys_range64 cannot handle a degenerate range (left_key > right_key), so we filter that case here
7937 if (pleft_key && pright_key && tokudb_cmp_dbt_key(kfile, pleft_key, pright_key) > 0) {
7938 rows = 0;
7939 } else {
7940 uint64_t less, equal1, middle, equal2, greater;
7941 bool is_exact;
7942 error = kfile->keys_range64(kfile, transaction, pleft_key, pright_key,
7943 &less, &equal1, &middle, &equal2, &greater, &is_exact);
7944 if (error) {
7945 ret_val = HA_TOKUDB_RANGE_COUNT;
7946 goto cleanup;
7947 }
7948 rows = middle;
7949 }
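
// Illustrative: keys_range64 splits the keyspace around [pleft_key, pright_key]
// into five estimated counts; for a 1000-row index whose range covers roughly
// the middle tenth, it might report (numbers hypothetical):
//
//   less=450, equal1=1, middle=98, equal2=1, greater=450
//
// Only `middle` is used above, because the equal counts cannot be trusted
// when the search key is a prefix of keys in the DB.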
7950
7951 // MySQL thinks a return value of 0 means there are exactly 0 rows
7952 // Therefore, always return non-zero so this assumption is not made
7953 ret_val = (ha_rows) (rows <= 1 ? 1 : rows);
7954
7955 cleanup:
7956 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7957 TOKUDB_DEBUG_RETURN,
7958 "return %" PRIu64 " %" PRIu64,
7959 (uint64_t)ret_val,
7960 rows);
7961 DBUG_RETURN(ret_val);
7962 }
7963
7964
7965 //
7966 // Initializes the auto-increment data in the local "share" object to the
7967 // greater of two values: what's stored in the metadata or the last inserted
7968 // auto-increment field (if auto-increment field is the first field of a key).
7969 //
7970 void ha_tokudb::init_auto_increment() {
7971 int error;
7972 DB_TXN* txn = NULL;
7973
7974 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
7975 if (error) {
7976 share->last_auto_increment = 0;
7977 } else {
7978 HA_METADATA_KEY key_val;
7979 DBT key;
7980 memset(&key, 0, sizeof(key));
7981 key.data = &key_val;
7982 key.size = sizeof(key_val);
7983 DBT value;
7984 memset(&value, 0, sizeof(value));
7985 value.flags = DB_DBT_USERMEM;
7986
7987 // Retrieve the initial auto increment value, as specified by create table
7988 // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
7989 // then the value 100 should be stored here
7990 key_val = hatoku_ai_create_value;
7991 value.ulen = sizeof(share->auto_inc_create_value);
7992 value.data = &share->auto_inc_create_value;
7993 error = share->status_block->get(share->status_block, txn, &key, &value, 0);
7994
7995 if (error || value.size != sizeof(share->auto_inc_create_value)) {
7996 share->auto_inc_create_value = 0;
7997 }
7998
7999 // Retrieve hatoku_max_ai, which is max value used by auto increment
8000 // column so far, the max value could have been auto generated (e.g. insert (NULL))
8001 // or it could have been manually inserted by user (e.g. insert (345))
8002 key_val = hatoku_max_ai;
8003 value.ulen = sizeof(share->last_auto_increment);
8004 value.data = &share->last_auto_increment;
8005 error = share->status_block->get(share->status_block, txn, &key, &value, 0);
8006
8007 if (error || value.size != sizeof(share->last_auto_increment)) {
8008 if (share->auto_inc_create_value)
8009 share->last_auto_increment = share->auto_inc_create_value - 1;
8010 else
8011 share->last_auto_increment = 0;
8012 }
8013
8014 commit_txn(txn, 0);
8015 }
8016 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
8017 TOKUDB_DEBUG_AUTO_INCREMENT,
8018 "init auto increment:%lld",
8019 share->last_auto_increment);
8020 }
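
// Illustrative status rows read by init_auto_increment() above (values are
// examples): after
//   CREATE TABLE t (a INT AUTO_INCREMENT PRIMARY KEY) AUTO_INCREMENT=100;
//   INSERT INTO t VALUES (NULL), (345);
// the status dictionary associates
//   hatoku_ai_create_value -> 100   (from CREATE TABLE)
//   hatoku_max_ai          -> 345   (largest auto-increment value seen)
// so init_auto_increment() leaves share->last_auto_increment at 345.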
8021
8022 void ha_tokudb::get_auto_increment(
8023 ulonglong offset,
8024 ulonglong increment,
8025 ulonglong nb_desired_values,
8026 ulonglong* first_value,
8027 ulonglong* nb_reserved_values) {
8028
8029 TOKUDB_HANDLER_DBUG_ENTER("");
8030 ulonglong nr;
8031 bool over;
8032
8033 share->lock();
8034
8035 if (share->auto_inc_create_value > share->last_auto_increment) {
8036 nr = share->auto_inc_create_value;
8037 over = false;
8038 share->last_auto_increment = share->auto_inc_create_value;
8039 } else {
8040 nr = share->last_auto_increment + increment;
8041 over = nr < share->last_auto_increment;
8042 if (over)
8043 nr = ULONGLONG_MAX;
8044 }
8045 if (!over) {
8046 share->last_auto_increment = nr + (nb_desired_values - 1)*increment;
8047 if (delay_updating_ai_metadata) {
8048 ai_metadata_update_required = true;
8049 } else {
8050 update_max_auto_inc(
8051 share->status_block,
8052 share->last_auto_increment);
8053 }
8054 }
8055 TOKUDB_HANDLER_TRACE_FOR_FLAGS(
8056 TOKUDB_DEBUG_AUTO_INCREMENT,
8057 "get_auto_increment(%lld,%lld,%lld): got:%lld:%lld",
8058 offset,
8059 increment,
8060 nb_desired_values,
8061 nr,
8062 nb_desired_values);
8063 *first_value = nr;
8064 *nb_reserved_values = nb_desired_values;
8065 share->unlock();
8066 TOKUDB_HANDLER_DBUG_VOID_RETURN;
8067 }
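
// Worked example of the reservation math above (hypothetical session with
// auto_increment_increment = 5): given share->last_auto_increment = 20,
// increment = 5 and nb_desired_values = 3, nr becomes 25, the share reserves
// through 25 + (3 - 1) * 5 = 35, and the caller receives *first_value = 25
// with *nb_reserved_values = 3, i.e. the values 25, 30, 35.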
8068
8069 bool ha_tokudb::is_optimize_blocking() {
8070 return false;
8071 }
8072
8073 bool ha_tokudb::is_auto_inc_singleton(){
8074 return false;
8075 }
8076
8077
8078 //
8079 // Internal function called by ha_tokudb::add_index and
8080 // ha_tokudb::alter_table_phase2.
8081 //
8082 // Adds indexes to the table. Takes the array of KEY passed in key_info, and creates
8083 // DB's that will go at the end of share->key_file. THE IMPLICIT ASSUMPTION HERE is
8084 // that the table will be modified and that these added keys will be appended to the end
8085 // of the array table->key_info
8086 // Parameters:
8087 // [in] table_arg - table that is being modified, seems to be identical to this->table
8088 // [in] key_info - array of KEY's to be added
8089 // num_of_keys - number of keys to be added, number of elements in key_info
8090 // Returns:
8091 // 0 on success, error otherwise
8092 //
8093 int ha_tokudb::tokudb_add_index(
8094 TABLE* table_arg,
8095 KEY* key_info,
8096 uint num_of_keys,
8097 DB_TXN* txn,
8098 bool* inc_num_DBs,
8099 bool* modified_DBs) {
8100
8101 TOKUDB_HANDLER_DBUG_ENTER("");
8102 assert_always(txn);
8103
8104 int error;
8105 uint curr_index = 0;
8106 DBC* tmp_cursor = NULL;
8107 int cursor_ret_val = 0;
8108 DBT curr_pk_key, curr_pk_val;
8109 THD* thd = ha_thd();
8110 DB_LOADER* loader = NULL;
8111 DB_INDEXER* indexer = NULL;
8112 bool loader_save_space = tokudb::sysvars::load_save_space(thd);
8113 bool use_hot_index = (lock.type == TL_WRITE_ALLOW_WRITE);
8114 uint32_t loader_flags = loader_save_space ? LOADER_COMPRESS_INTERMEDIATES : 0;
8115 uint32_t indexer_flags = 0;
8116 uint32_t mult_db_flags[MAX_KEY + 1] = {0};
8117 uint32_t mult_put_flags[MAX_KEY + 1];
8118 uint32_t mult_dbt_flags[MAX_KEY + 1];
8119 bool creating_hot_index = false;
8120 struct loader_context lc;
8121 memset(&lc, 0, sizeof lc);
8122 lc.thd = thd;
8123 lc.ha = this;
8124 loader_error = 0;
8125 bool rw_lock_taken = false;
8126 *inc_num_DBs = false;
8127 *modified_DBs = false;
8128 invalidate_bulk_fetch();
8129 unpack_entire_row = true; // for bulk fetching rows
8130 for (uint32_t i = 0; i < MAX_KEY+1; i++) {
8131 mult_put_flags[i] = 0;
8132 mult_dbt_flags[i] = DB_DBT_REALLOC;
8133 }
8134 //
8135 // number of DB files we have open currently, before add_index is executed
8136 //
8137 uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8138
8139 //
8140 // get the row type to use for the indexes we're adding
8141 //
8142 toku_compression_method compression_method =
8143 get_compression_method(share->file);
8144
8145 //
8146 // status message to be shown in "show process list"
8147 //
8148 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
8149 // buffer of 200 should be a good upper bound.
8150 char status_msg[MAX_ALIAS_NAME + 200];
8151 // variable that stores number of elements inserted thus far
8152 ulonglong num_processed = 0;
8153 thd_proc_info(thd, "Adding indexes");
8154
8155 //
8156 // in unpack_row, MySQL passes a buffer that is this long,
8157 // so this length should be good enough for us as well
8158 //
8159 memset((void *) &curr_pk_key, 0, sizeof(curr_pk_key));
8160 memset((void *) &curr_pk_val, 0, sizeof(curr_pk_val));
8161
8162 //
8163 // The files for secondary tables are derived from the names of the keys.
8164 // If we try to add a key with the same name as an already existing key,
8165 // we can crash. So here we check whether any of the added keys has the
8166 // same name as an existing key, and if so, we fail gracefully.
8167 //
8168 for (uint i = 0; i < num_of_keys; i++) {
8169 for (uint j = 0; j < table_arg->s->keys; j++) {
8170 if (strcmp(key_info[i].name, table_arg->s->key_info[j].name) == 0) {
8171 error = HA_ERR_WRONG_COMMAND;
8172 goto cleanup;
8173 }
8174 }
8175 }
8176
8177 rwlock_t_lock_write(share->_num_DBs_lock);
8178 rw_lock_taken = true;
8179 //
8180 // open all the DB files and set the appropriate variables in share
8181 // they go to the end of share->key_file
8182 //
8183 creating_hot_index =
8184 use_hot_index && num_of_keys == 1 &&
8185 (key_info[0].flags & HA_NOSAME) == 0;
8186 if (use_hot_index && (share->num_DBs > curr_num_DBs)) {
8187 //
8188 // already have hot index in progress, get out
8189 //
8190 error = HA_ERR_INTERNAL_ERROR;
8191 goto cleanup;
8192 }
8193 curr_index = curr_num_DBs;
8194 *modified_DBs = true;
8195 for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8196 if (key_is_clustering(&key_info[i])) {
8197 set_key_filter(
8198 &share->kc_info.key_filters[curr_index],
8199 &key_info[i],
8200 table_arg,
8201 false);
8202 if (!hidden_primary_key) {
8203 set_key_filter(
8204 &share->kc_info.key_filters[curr_index],
8205 &table_arg->key_info[primary_key],
8206 table_arg,
8207 false);
8208 }
8209
8210 error = initialize_col_pack_info(
8211 &share->kc_info,
8212 table_arg->s,
8213 curr_index);
8214 if (error) {
8215 goto cleanup;
8216 }
8217 }
8218
8219
8220 error = create_secondary_dictionary(
8221 share->full_table_name(),
8222 table_arg,
8223 &key_info[i],
8224 txn,
8225 &share->kc_info,
8226 curr_index,
8227 creating_hot_index,
8228 compression_method);
8229 if (error) {
8230 goto cleanup;
8231 }
8232
8233 error = open_secondary_dictionary(
8234 &share->key_file[curr_index],
8235 &key_info[i],
8236 share->full_table_name(),
8237 false,
8238 txn);
8239 if (error) {
8240 goto cleanup;
8241 }
8242 }
8243
8244 if (creating_hot_index) {
8245 share->num_DBs++;
8246 *inc_num_DBs = true;
8247 error = db_env->create_indexer(
8248 db_env,
8249 txn,
8250 &indexer,
8251 share->file,
8252 num_of_keys,
8253 &share->key_file[curr_num_DBs],
8254 mult_db_flags,
8255 indexer_flags);
8256 if (error) {
8257 goto cleanup;
8258 }
8259
8260 error = indexer->set_poll_function(
8261 indexer, ha_tokudb::tokudb_add_index_poll, &lc);
8262 if (error) {
8263 goto cleanup;
8264 }
8265
8266 error = indexer->set_error_callback(
8267 indexer, ha_tokudb::loader_add_index_err, &lc);
8268 if (error) {
8269 goto cleanup;
8270 }
8271
8272 share->_num_DBs_lock.unlock();
8273 rw_lock_taken = false;
8274
8275 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8276 // initialize a one phase progress report.
8277 // incremental reports are done in the indexer's callback function.
8278 thd_progress_init(thd, 1);
8279 #endif
8280
8281 error = indexer->build(indexer);
8282
8283 if (error) {
8284 goto cleanup;
8285 }
8286
8287 rwlock_t_lock_write(share->_num_DBs_lock);
8288 error = indexer->close(indexer);
8289 share->_num_DBs_lock.unlock();
8290 if (error) {
8291 goto cleanup;
8292 }
8293 indexer = NULL;
8294 } else {
8295 DBUG_ASSERT(table->mdl_ticket->get_type() >= MDL_SHARED_NO_WRITE);
8296 share->_num_DBs_lock.unlock();
8297 rw_lock_taken = false;
8298 prelocked_right_range_size = 0;
8299 prelocked_left_range_size = 0;
8300 struct smart_dbt_bf_info bf_info;
8301 bf_info.ha = this;
8302 // the val is needed when there is a clustering index and key_read is not 0
8303 bf_info.direction = 1;
8304 bf_info.thd = ha_thd();
8305 bf_info.need_val = true;
8306 bf_info.key_to_compare = NULL;
8307
8308 error = db_env->create_loader(
8309 db_env,
8310 txn,
8311 &loader,
8312 NULL, // no src_db needed
8313 num_of_keys,
8314 &share->key_file[curr_num_DBs],
8315 mult_put_flags,
8316 mult_dbt_flags,
8317 loader_flags);
8318 if (error) {
8319 goto cleanup;
8320 }
8321
8322 error =
8323 loader->set_poll_function(loader, ha_tokudb::bulk_insert_poll, &lc);
8324 if (error) {
8325 goto cleanup;
8326 }
8327
8328 error = loader->set_error_callback(
8329 loader, ha_tokudb::loader_add_index_err, &lc);
8330 if (error) {
8331 goto cleanup;
8332 }
8333 //
8334 // scan primary table, create each secondary key, add to each DB
8335 //
8336 error = share->file->cursor(
8337 share->file,
8338 txn,
8339 &tmp_cursor,
8340 DB_SERIALIZABLE);
8341 if (error) {
8342 tmp_cursor = NULL; // Safety
8343 goto cleanup;
8344 }
8345
8346 //
8347 // grab some locks to make this go faster
8348 // first a global read lock on the main DB, because
8349 // we intend to scan the entire thing
8350 //
8351 error = tmp_cursor->c_set_bounds(
8352 tmp_cursor,
8353 share->file->dbt_neg_infty(),
8354 share->file->dbt_pos_infty(),
8355 true,
8356 0);
8357 if (error) {
8358 goto cleanup;
8359 }
8360
8361 // set the bulk fetch iteration to its max so that adding an
8362 // index fills the bulk fetch buffer every time. we do not
8363 // want it to grow exponentially fast.
8364 rows_fetched_using_bulk_fetch = 0;
8365 bulk_fetch_iteration = HA_TOKU_BULK_FETCH_ITERATION_MAX;
8366 cursor_ret_val = tmp_cursor->c_getf_next(
8367 tmp_cursor,
8368 DB_PRELOCKED,
8369 smart_dbt_bf_callback,
8370 &bf_info);
8371
8372 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8373 // initialize a two phase progress report.
8374 // first phase: putting rows into the loader
8375 thd_progress_init(thd, 2);
8376 #endif
8377
8378 while (cursor_ret_val != DB_NOTFOUND ||
8379 ((bytes_used_in_range_query_buff -
8380 curr_range_query_buff_offset) > 0)) {
8381 if ((bytes_used_in_range_query_buff -
8382 curr_range_query_buff_offset) == 0) {
8383 invalidate_bulk_fetch(); // reset the buffers
8384 cursor_ret_val = tmp_cursor->c_getf_next(
8385 tmp_cursor,
8386 DB_PRELOCKED,
8387 smart_dbt_bf_callback,
8388 &bf_info);
8389 if (cursor_ret_val != DB_NOTFOUND && cursor_ret_val != 0) {
8390 error = cursor_ret_val;
8391 goto cleanup;
8392 }
8393 }
8394 // do this check in case the c_getf_next did not put anything
8395 // into the buffer because there was no more data
8396 if ((bytes_used_in_range_query_buff -
8397 curr_range_query_buff_offset) == 0) {
8398 break;
8399 }
8400 // at this point, we know the range query buffer has at least one
8401 // key/val pair
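// layout of range_query_buff, as filled in by smart_dbt_bf_callback: a packed
// sequence of [uint32 key_size][key bytes][uint32 val_size][val bytes]
// records; curr_range_query_buff_offset tracks the parse position.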
8402 uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
8403
8404 uint32_t key_size = *(uint32_t *)curr_pos;
8405 curr_pos += sizeof(key_size);
8406 uchar* curr_key_buff = curr_pos;
8407 curr_pos += key_size;
8408 curr_pk_key.data = curr_key_buff;
8409 curr_pk_key.size = key_size;
8410
8411 uint32_t val_size = *(uint32_t *)curr_pos;
8412 curr_pos += sizeof(val_size);
8413 uchar* curr_val_buff = curr_pos;
8414 curr_pos += val_size;
8415 curr_pk_val.data = curr_val_buff;
8416 curr_pk_val.size = val_size;
8417
8418 curr_range_query_buff_offset = curr_pos - range_query_buff;
8419
8420 error = loader->put(loader, &curr_pk_key, &curr_pk_val);
8421 if (error) {
8422 goto cleanup;
8423 }
8424
8425 num_processed++;
8426
8427 if ((num_processed % 1000) == 0) {
8428 sprintf(
8429 status_msg,
8430 "Adding indexes: Fetched %llu of about %llu rows, loading "
8431 "of data still remains.",
8432 num_processed,
8433 (long long unsigned)share->row_count());
8434 thd_proc_info(thd, status_msg);
8435
8436 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8437 thd_progress_report(
8438 thd,
8439 num_processed,
(long long unsigned)share->row_count());
8441 #endif
8442
8443 if (thd_killed(thd)) {
8444 error = ER_ABORTING_CONNECTION;
8445 goto cleanup;
8446 }
8447 }
8448 }
8449 error = tmp_cursor->c_close(tmp_cursor);
8450 assert_always(error==0);
8451 tmp_cursor = NULL;
8452
8453 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8454 // next progress report phase: closing the loader.
8455 // incremental reports are done in the loader's callback function.
8456 thd_progress_next_stage(thd);
8457 #endif
8458
8459 error = loader->close(loader);
8460 loader = NULL;
8461
8462 if (error) goto cleanup;
8463 }
8464 curr_index = curr_num_DBs;
8465 for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8466 if (key_info[i].flags & HA_NOSAME) {
8467 bool is_unique;
8468 error = is_index_unique(
8469 &is_unique,
8470 txn,
8471 share->key_file[curr_index],
8472 &key_info[i],
8473 creating_hot_index ? 0 : DB_PRELOCKED_WRITE);
8474 if (error)
8475 goto cleanup;
8476 if (!is_unique) {
8477 error = HA_ERR_FOUND_DUPP_KEY;
8478 last_dup_key = i;
8479 goto cleanup;
8480 }
8481 }
8482 }
8483
8484 share->lock();
8485 //
8486 // We have an accurate row count, might as well update share->rows
8487 //
8488 if (!creating_hot_index) {
8489 share->set_row_count(num_processed, true);
8490 }
8491 //
8492 // now write stuff to status.tokudb
8493 //
8494 for (uint i = 0; i < num_of_keys; i++) {
8495 write_key_name_to_status(share->status_block, key_info[i].name, txn);
8496 }
8497 share->unlock();
8498
8499 error = 0;
8500 cleanup:
8501 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8502 thd_progress_end(thd);
8503 #endif
8504 if (rw_lock_taken) {
8505 share->_num_DBs_lock.unlock();
8506 rw_lock_taken = false;
8507 }
8508 if (tmp_cursor) {
8509 int r = tmp_cursor->c_close(tmp_cursor);
8510 assert_always(r==0);
8511 tmp_cursor = NULL;
8512 }
8513 if (loader != NULL) {
8514 sprintf(status_msg, "aborting creation of indexes.");
8515 thd_proc_info(thd, status_msg);
8516 loader->abort(loader);
8517 }
8518 if (indexer != NULL) {
8519 sprintf(status_msg, "aborting creation of indexes.");
8520 thd_proc_info(thd, status_msg);
8521 rwlock_t_lock_write(share->_num_DBs_lock);
8522 indexer->abort(indexer);
8523 share->_num_DBs_lock.unlock();
8524 }
8525 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8526 error == DB_LOCK_NOTGRANTED) {
8527 sql_print_error(
8528 "Could not add indexes to table %s because another transaction has "
8529 "accessed the table. To add indexes, make sure no transactions "
8530 "touch the table.",
8531 share->full_table_name());
8532 }
8533 thd_proc_info(thd, orig_proc_info);
8534 TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
8535 }
8536 int ha_tokudb::tokudb_add_index_poll(void* extra, float progress) {
8537 LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
8538 if (thd_killed(context->thd)) {
8539 snprintf(context->write_status_msg,
8540 sizeof(context->write_status_msg),
8541 "The process has been killed, aborting add index.");
8542 return ER_ABORTING_CONNECTION;
8543 }
8544 float percentage = progress * 100;
8545 snprintf(context->write_status_msg,
8546 sizeof(context->write_status_msg),
8547 "Adding of indexes to %s about %.1f%% done",
8548 context->ha->share->full_table_name(),
8549 percentage);
8550 thd_proc_info(context->thd, context->write_status_msg);
8551 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8552 thd_progress_report(context->thd, (unsigned long long)percentage, 100);
8553 #endif
8554 return 0;
8555 }
8556
8557 //
8558 // Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
8559 // Closes added indexes in the error path of add_index and alter_table_phase2
8560 //
8561 void ha_tokudb::restore_add_index(
8562 TABLE* table_arg,
8563 uint num_of_keys,
8564 bool incremented_numDBs,
8565 bool modified_DBs) {
8566
8567 uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8568 uint curr_index = 0;
8569
8570 //
8571 // need to restore num_DBs, and we have to do it before we close the dictionaries
8572 // so that there is not a window
8573 //
8574 if (incremented_numDBs) {
8575 rwlock_t_lock_write(share->_num_DBs_lock);
8576 share->num_DBs--;
8577 }
8578 if (modified_DBs) {
8579 curr_index = curr_num_DBs;
8580 for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8581 reset_key_and_col_info(&share->kc_info, curr_index);
8582 }
8583 curr_index = curr_num_DBs;
8584 for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8585 if (share->key_file[curr_index]) {
8586 int r = share->key_file[curr_index]->close(
8587 share->key_file[curr_index],
8588 0);
8589 assert_always(r==0);
8590 share->key_file[curr_index] = NULL;
8591 }
8592 }
8593 }
8594 if (incremented_numDBs) {
8595 share->_num_DBs_lock.unlock();
8596 }
8597 }
8598
8599 //
8600 // Internal function called by ha_tokudb::prepare_drop_index and ha_tokudb::alter_table_phase2
8601 // With a transaction, drops dictionaries associated with indexes in key_num
8602 //
8603 int ha_tokudb::drop_indexes(uint* key_num,
8604 uint num_of_keys,
8605 KEY* key_info,
8606 DB_TXN* txn) {
8607 TOKUDB_HANDLER_DBUG_ENTER("");
8608 assert_always(txn);
8609
8610 int error = 0;
8611 for (uint i = 0; i < num_of_keys; i++) {
8612 uint curr_index = key_num[i];
8613 error = share->key_file[curr_index]->pre_acquire_fileops_lock(
8614 share->key_file[curr_index],
8615 txn);
8616 if (error != 0) {
8617 goto cleanup;
8618 }
8619 }
8620 for (uint i = 0; i < num_of_keys; i++) {
8621 uint curr_index = key_num[i];
8622 int r = share->key_file[curr_index]->close(share->key_file[curr_index],0);
8623 assert_always(r==0);
8624 share->key_file[curr_index] = NULL;
8625
8626 error = remove_key_name_from_status(
8627 share->status_block,
8628 key_info[curr_index].name,
8629 txn);
8630 if (error) {
8631 goto cleanup;
8632 }
8633
8634 error = delete_or_rename_dictionary(
8635 share->full_table_name(),
8636 NULL,
8637 key_info[curr_index].name,
8638 true,
8639 txn,
8640 true);
8641 if (error) {
8642 goto cleanup;
8643 }
8644 }
8645
8646 cleanup:
8647 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8648 error == DB_LOCK_NOTGRANTED) {
8649 sql_print_error(
8650 "Could not drop indexes from table %s because another transaction "
8651 "has accessed the table. To drop indexes, make sure no "
8652 "transactions touch the table.",
8653 share->full_table_name());
8654 }
8655 TOKUDB_HANDLER_DBUG_RETURN(error);
8656 }
8657
8658 //
8659 // Internal function called by ha_tokudb::prepare_drop_index and
8660 // ha_tokudb::alter_table_phase2
8661 // Restores dropped indexes in case of error in error path of
8662 // prepare_drop_index and alter_table_phase2
8663 //
8664 void ha_tokudb::restore_drop_indexes(uint* key_num, uint num_of_keys) {
8665 //
8666 // reopen closed dictionaries
8667 //
8668 for (uint i = 0; i < num_of_keys; i++) {
8669 int r;
8670 uint curr_index = key_num[i];
8671 if (share->key_file[curr_index] == NULL) {
8672 r = open_secondary_dictionary(
8673 &share->key_file[curr_index],
8674 &table_share->key_info[curr_index],
8675 share->full_table_name(),
8676 false,
8677 NULL);
8678 assert_always(!r);
8679 }
8680 }
8681 }
8682
8683 int ha_tokudb::map_to_handler_error(int error) {
8684 switch (error) {
8685 case DB_LOCK_DEADLOCK:
8686 error = HA_ERR_LOCK_DEADLOCK;
8687 break;
8688 case DB_LOCK_NOTGRANTED:
8689 error = HA_ERR_LOCK_WAIT_TIMEOUT;
8690 break;
8691 #if defined(HA_ERR_DISK_FULL)
8692 case ENOSPC:
8693 error = HA_ERR_DISK_FULL;
8694 break;
8695 #endif
8696 case DB_KEYEXIST:
8697 error = HA_ERR_FOUND_DUPP_KEY;
8698 break;
8699 #if defined(HA_ALTER_ERROR)
8700 case HA_ALTER_ERROR:
8701 error = HA_ERR_UNSUPPORTED;
8702 break;
8703 #endif
8704 case TOKUDB_INTERRUPTED:
8705 error = ER_QUERY_INTERRUPTED;
8706 break;
8707 case TOKUDB_OUT_OF_LOCKS:
8708 error = HA_ERR_LOCK_TABLE_FULL;
8709 break;
8710 }
8711 return error;
8712 }
8713
8714 void ha_tokudb::print_error(int error, myf errflag) {
8715 error = map_to_handler_error(error);
8716 handler::print_error(error, errflag);
8717 }
8718
8719 //
8720 // truncates the dictionary associated with index keynr using transaction txn.
8721 // It does so by deleting and then recreating the dictionary in the context
8722 // of the transaction.
8723 //
8724 int ha_tokudb::truncate_dictionary(uint keynr, DB_TXN* txn) {
8725 int error;
8726 bool is_pk = (keynr == primary_key);
8727
8728 toku_compression_method compression_method =
8729 get_compression_method(share->key_file[keynr]);
8730 error = share->key_file[keynr]->close(share->key_file[keynr], 0);
8731 assert_always(error == 0);
8732
8733 share->key_file[keynr] = NULL;
8734 if (is_pk) {
8735 share->file = NULL;
8736 }
8737
8738 if (is_pk) {
8739 error = delete_or_rename_dictionary(
8740 share->full_table_name(),
8741 NULL,
8742 "main",
8743 false, //is_key
8744 txn,
8745 true); // is a delete
8746 if (error) {
8747 goto cleanup;
8748 }
8749 } else {
8750 error = delete_or_rename_dictionary(
8751 share->full_table_name(),
8752 NULL,
8753 table_share->key_info[keynr].name,
8754 true, //is_key
8755 txn,
8756 true); // is a delete
8757 if (error) {
8758 goto cleanup;
8759 }
8760 }
8761
8762 if (is_pk) {
8763 error = create_main_dictionary(
8764 share->full_table_name(),
8765 table,
8766 txn,
8767 &share->kc_info,
8768 compression_method);
8769 } else {
8770 error = create_secondary_dictionary(
8771 share->full_table_name(),
8772 table,
8773 &table_share->key_info[keynr],
8774 txn,
8775 &share->kc_info,
8776 keynr,
8777 false,
8778 compression_method);
8779 }
8780 if (error) {
8781 goto cleanup;
8782 }
8783
8784 cleanup:
8785 return error;
8786 }
8787
8788 // for 5.5
8789 int ha_tokudb::truncate() {
8790 TOKUDB_HANDLER_DBUG_ENTER("");
8791 int error = delete_all_rows_internal();
8792 TOKUDB_HANDLER_DBUG_RETURN(error);
8793 }
8794
8795 // delete all rows from a table
8796 //
8797 // effects: delete all of the rows in the main dictionary and all of the
8798 // indices. this must be atomic, so we use the statement transaction
8799 // for all of the truncate operations.
8800 // locks: if we have an exclusive table write lock, all of the concurrency
8801 // issues go away.
8802 // returns: 0 if success
8803 int ha_tokudb::delete_all_rows() {
8804 TOKUDB_HANDLER_DBUG_ENTER("");
8805 int error = 0;
8806 if (thd_sql_command(ha_thd()) != SQLCOM_TRUNCATE) {
8807 share->try_table_lock = true;
8808 error = HA_ERR_WRONG_COMMAND;
8809 }
8810 if (error == 0)
8811 error = delete_all_rows_internal();
8812 TOKUDB_HANDLER_DBUG_RETURN(error);
8813 }
8814
8815 int ha_tokudb::delete_all_rows_internal() {
8816 TOKUDB_HANDLER_DBUG_ENTER("");
8817 int error = 0;
8818 uint curr_num_DBs = 0;
8819 DB_TXN* txn = NULL;
8820
8821 // this should be enough to handle locking as the higher level MDL
8822 // on this table should prevent any new analyze tasks.
8823 share->cancel_background_jobs();
8824
8825 error = txn_begin(db_env, 0, &txn, 0, ha_thd());
8826 if (error) {
8827 goto cleanup;
8828 }
8829
8830 curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
8831 for (uint i = 0; i < curr_num_DBs; i++) {
8832 error = share->key_file[i]->pre_acquire_fileops_lock(
8833 share->key_file[i],
8834 txn);
8835 if (error) {
8836 goto cleanup;
8837 }
8838 error = share->key_file[i]->pre_acquire_table_lock(
8839 share->key_file[i],
8840 txn);
8841 if (error) {
8842 goto cleanup;
8843 }
8844 }
8845 for (uint i = 0; i < curr_num_DBs; i++) {
8846 error = truncate_dictionary(i, txn);
8847 if (error) {
8848 goto cleanup;
8849 }
8850 }
8851
8852 DEBUG_SYNC(ha_thd(), "tokudb_after_truncate_all_dictionarys");
8853
8854 // zap the row count
8855 if (error == 0) {
8856 share->set_row_count(0, false);
8857 // update auto increment
8858 share->last_auto_increment = 0;
8859 // calling write_to_status directly because we need to use txn
8860 write_to_status(
8861 share->status_block,
8862 hatoku_max_ai,
8863 &share->last_auto_increment,
8864 sizeof(share->last_auto_increment),
8865 txn);
8866 }
8867
8868 share->try_table_lock = true;
8869 cleanup:
8870 if (txn) {
8871 if (error) {
8872 abort_txn(txn);
8873 } else {
8874 commit_txn(txn,0);
8875 }
8876 }
8877
8878 if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(
8879 TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8880 error == DB_LOCK_NOTGRANTED) {
8881 sql_print_error(
8882 "Could not truncate table %s because another transaction has "
8883 "accessed the table. To truncate the table, make sure no "
8884 "transactions touch the table.",
8885 share->full_table_name());
8886 }
8887 //
8888 // regardless of errors, need to reopen the DB's
8889 //
8890 for (uint i = 0; i < curr_num_DBs; i++) {
8891 int r = 0;
8892 if (share->key_file[i] == NULL) {
8893 if (i != primary_key) {
8894 r = open_secondary_dictionary(
8895 &share->key_file[i],
8896 &table_share->key_info[i],
8897 share->full_table_name(),
8898 false,
8899 NULL);
8900 assert_always(!r);
8901 } else {
8902 r = open_main_dictionary(
8903 share->full_table_name(),
8904 false,
8905 NULL);
8906 assert_always(!r);
8907 }
8908 }
8909 }
8910 TOKUDB_HANDLER_DBUG_RETURN(error);
8911 }
8912
8913 void ha_tokudb::set_loader_error(int err) {
8914 loader_error = err;
8915 }
8916
8917 void ha_tokudb::set_dup_value_for_pk(DBT* key) {
8918 assert_always(!hidden_primary_key);
8919 unpack_key(table->record[0],key,primary_key);
8920 last_dup_key = primary_key;
8921 }
8922
8923 void ha_tokudb::close_dsmrr() {
8924 #ifdef MARIADB_BASE_VERSION
8925 ds_mrr.dsmrr_close();
8926 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
8927 ds_mrr.dsmrr_close();
8928 #endif
8929 }
8930
8931 void ha_tokudb::reset_dsmrr() {
8932 #ifdef MARIADB_BASE_VERSION
8933 ds_mrr.dsmrr_close();
8934 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
8935 ds_mrr.reset();
8936 #endif
8937 }
8938
8939 // we cache the information so we can do filtering ourselves,
8940 // but as far as MySQL knows, we are not doing any filtering,
8941 // so if we happen to miss filtering a row that does not match
8942 // idx_cond_arg, MySQL will catch it.
8943 // This allows us to deal with only index_next and index_prev,
8944 // without needing to worry about the other index_XXX functions.
8945 Item* ha_tokudb::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) {
8946 toku_pushed_idx_cond_keyno = keyno_arg;
8947 toku_pushed_idx_cond = idx_cond_arg;
8948 return idx_cond_arg;
8949 }
8950
8951 void ha_tokudb::cancel_pushed_idx_cond() {
8952 invalidate_icp();
8953 handler::cancel_pushed_idx_cond();
8954 }
8955
8956 void ha_tokudb::cleanup_txn(DB_TXN *txn) {
8957 if (transaction == txn && cursor) {
8958 int r = cursor->c_close(cursor);
8959 assert_always(r == 0);
8960 cursor = NULL;
8961 }
8962 }
8963
8964 void ha_tokudb::add_to_trx_handler_list() {
8965 tokudb_trx_data* trx =
8966 (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
8967 trx->handlers = list_add(trx->handlers, &trx_handler_list);
8968 }
8969
8970 void ha_tokudb::remove_from_trx_handler_list() {
8971 tokudb_trx_data* trx =
8972 (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
8973 trx->handlers = list_delete(trx->handlers, &trx_handler_list);
8974 }
8975
8976 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
8977 void ha_tokudb::rpl_before_write_rows() {
8978 in_rpl_write_rows = true;
8979 }
8980
8981 void ha_tokudb::rpl_after_write_rows() {
8982 in_rpl_write_rows = false;
8983 }
8984
8985 void ha_tokudb::rpl_before_delete_rows() {
8986 in_rpl_delete_rows = true;
8987 }
8988
8989 void ha_tokudb::rpl_after_delete_rows() {
8990 in_rpl_delete_rows = false;
8991 }
8992
8993 void ha_tokudb::rpl_before_update_rows() {
8994 in_rpl_update_rows = true;
8995 }
8996
8997 void ha_tokudb::rpl_after_update_rows() {
8998 in_rpl_update_rows = false;
8999 }
9000
9001 bool ha_tokudb::rpl_lookup_rows() {
9002 if (!in_rpl_delete_rows && !in_rpl_update_rows)
9003 return true;
9004 else
9005 return tokudb::sysvars::rpl_lookup_rows(ha_thd());
9006 }
9007 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
9008
9009 // table admin
9010 #include "ha_tokudb_admin.cc"
9011
9012 // update functions
9013 #include "tokudb_update_fun.cc"
9014
9015 // fast updates
9016 #include "ha_tokudb_update.cc"
9017
9018 // alter table code for various mysql distros
9019 #include "ha_tokudb_alter_55.cc"
9020 #include "ha_tokudb_alter_56.cc"
9021
9022 // mrr
9023 #ifdef MARIADB_BASE_VERSION
9024 #include "ha_tokudb_mrr_maria.cc"
9025 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
9026 #include "ha_tokudb_mrr_mysql.cc"
9027 #endif
9028
9029 // key comparisons
9030 #include "hatoku_cmp.cc"
9031
9032 // handlerton
9033 #include "hatoku_hton.cc"
9034
9035 // generate template functions
9036 namespace tokudb {
9037 template size_t vlq_encode_ui(uint32_t n, void *p, size_t s);
9038 template size_t vlq_decode_ui(uint32_t *np, void *p, size_t s);
9039 template size_t vlq_encode_ui(uint64_t n, void *p, size_t s);
9040 template size_t vlq_decode_ui(uint64_t *np, void *p, size_t s);
9041 }  // namespace tokudb
9042