/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of TokuDB


Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    TokuDB is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    TokuDB is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with TokuDB.  If not, see <http://www.gnu.org/licenses/>.

======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#include "hatoku_hton.h"
#include "hatoku_cmp.h"
#include "tokudb_buffer.h"
#include "tokudb_status.h"
#include "tokudb_card.h"
#include "ha_tokudb.h"
#include "sql_db.h"

pfs_key_t ha_tokudb_mutex_key;
pfs_key_t num_DBs_lock_key;

#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
static inline uint get_ext_key_parts(const KEY *key) {
#if (50609 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799)
    return key->actual_key_parts;
#elif defined(MARIADB_BASE_VERSION)
    return key->ext_key_parts;
#else
#error "no extended-key support for this MySQL/MariaDB version"
#endif
}
#endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS

std::unordered_map<std::string, TOKUDB_SHARE*> TOKUDB_SHARE::_open_tables;
tokudb::thread::mutex_t* TOKUDB_SHARE::_open_tables_mutex;

static const char* ha_tokudb_exts[] = {
    ha_tokudb_ext,
    NullS
};

//
// This offset is calculated starting from AFTER the NULL bytes
//
static inline uint32_t get_fixed_field_size(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {

    uint offset = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (is_fixed_field(kc_info, i) &&
            !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
            offset += kc_info->field_lengths[i];
        }
    }
    return offset;
}


static inline uint32_t get_len_of_offsets(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {

    uint len = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (is_variable_field(kc_info, i) &&
            !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
            len += kc_info->num_offset_bytes;
        }
    }
    return len;
}

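//
// Initialize the key filter bitmaps and the per-field metadata arrays
// (field_types, field_lengths, length_bytes, blob_fields) for a table
// share. On failure, frees the bitmaps and the multi-allocation before
// returning the error.
//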
static int allocate_key_and_col_info(
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info) {

    int error;
    //
    // initialize all of the bitmaps
    //
    for (uint i = 0; i < MAX_KEY + 1; i++) {
        error =
            bitmap_init(
                &kc_info->key_filters[i],
                NULL,
                table_share->fields,
                false);
        if (error) {
            goto exit;
        }
    }

    //
    // create the field lengths
    //
    kc_info->multi_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME+MY_ZEROFILL),
        &kc_info->field_types, (uint)(table_share->fields * sizeof (uint8_t)),
        &kc_info->field_lengths, (uint)(table_share->fields * sizeof (uint16_t)),
        &kc_info->length_bytes, (uint)(table_share->fields * sizeof (uint8_t)),
        &kc_info->blob_fields, (uint)(table_share->fields * sizeof (uint32_t)),
        NullS);
    if (kc_info->multi_ptr == NULL) {
        error = ENOMEM;
        goto exit;
    }
exit:
    if (error) {
        for (uint i = 0; i < MAX_KEY + 1; i++) {
            bitmap_free(&kc_info->key_filters[i]);
        }
        tokudb::memory::free(kc_info->multi_ptr);
    }
    return error;
}

static void free_key_and_col_info (KEY_AND_COL_INFO* kc_info) {
    for (uint i = 0; i < MAX_KEY+1; i++) {
        bitmap_free(&kc_info->key_filters[i]);
    }

    for (uint i = 0; i < MAX_KEY+1; i++) {
        tokudb::memory::free(kc_info->cp_info[i]);
        kc_info->cp_info[i] = NULL; // 3144
    }

    tokudb::memory::free(kc_info->multi_ptr);
    kc_info->field_types = NULL;
    kc_info->field_lengths = NULL;
    kc_info->length_bytes = NULL;
    kc_info->blob_fields = NULL;
}


void TOKUDB_SHARE::static_init() {
    assert_always(_open_tables.size() == 0);
    _open_tables_mutex = new tokudb::thread::mutex_t();
}
void TOKUDB_SHARE::static_destroy() {
    for (auto it = _open_tables.cbegin(); it != _open_tables.cend(); it++) {
        TOKUDB_TRACE("_open_tables %s %p", it->first.c_str(), it->second);
        TOKUDB_SHARE* share = it->second;
        share->destroy();
        delete share;
    }
    _open_tables.clear();
    assert_always(_open_tables.size() == 0);
    delete _open_tables_mutex;
}
const char* TOKUDB_SHARE::get_state_string(share_state_t state) {
    static const char* state_string[] = {
        "CLOSED",
        "OPENED",
        "ERROR"
    };
    assert_always(state == CLOSED || state == OPENED || state == ERROR);
    return state_string[state];
}
void* TOKUDB_SHARE::operator new(size_t sz) {
    return tokudb::memory::malloc(sz, MYF(MY_WME|MY_ZEROFILL|MY_FAE));
}
void TOKUDB_SHARE::operator delete(void* p) { tokudb::memory::free(p); }
TOKUDB_SHARE::TOKUDB_SHARE()
    : _num_DBs_lock(num_DBs_lock_key), _mutex(ha_tokudb_mutex_key) {}
void TOKUDB_SHARE::init(const char* table_name) {
    _use_count = 0;
    thr_lock_init(&_thr_lock);
    _state = CLOSED;
    _row_delta_activity = 0;
    _allow_auto_analysis = true;

    _full_table_name.append(table_name);

    String tmp_dictionary_name;
    tokudb_split_dname(
        table_name,
        _database_name,
        _table_name,
        tmp_dictionary_name);

    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
void TOKUDB_SHARE::destroy() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    assert_always(_use_count == 0);
    assert_always(
        _state == TOKUDB_SHARE::CLOSED || _state == TOKUDB_SHARE::ERROR);
    thr_lock_delete(&_thr_lock);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
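//
// Look up the share for table_name in the global _open_tables map, holding
// _open_tables_mutex. If no share exists and create_new is set, allocate a
// zero-filled share, init() it, and register it. A successful lookup bumps
// the reference count via addref().
//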
TOKUDB_SHARE* TOKUDB_SHARE::get_share(const char* table_name,
                                      THR_LOCK_DATA* data,
                                      bool create_new) {
    std::string find_table_name(table_name);
    mutex_t_lock(*_open_tables_mutex);
    auto it = _open_tables.find(find_table_name);
    TOKUDB_SHARE *share = nullptr;
    if (it != _open_tables.end()) {
        share = it->second;
        assert_always(strcmp(table_name, share->full_table_name()) == 0);
    }
    TOKUDB_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_SHARE,
        "existing share[%s] %s:share[%p]",
        table_name,
        share == NULL ? "not found" : "found",
        share);

    if (!share) {
        if (create_new == false)
            goto exit;
        // create the share zero-filled (operator new uses MY_ZEROFILL),
        // so all of its pointers start out NULL
        share = new TOKUDB_SHARE;
        assert_always(share);

        share->init(table_name);

        _open_tables.insert({find_table_name, share});
    }

    share->addref();

    if (data)
        thr_lock_data_init(&(share->_thr_lock), data, NULL);

exit:
    mutex_t_unlock(*_open_tables_mutex);
    return share;
}
void TOKUDB_SHARE::drop_share(TOKUDB_SHARE* share) {
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_SHARE,
                           "share[%p]:file[%s]:state[%s]:use_count[%d]",
                           share,
                           share->_full_table_name.ptr(),
                           get_state_string(share->_state),
                           share->_use_count);

    mutex_t_lock(*_open_tables_mutex);
    size_t n = _open_tables.erase(std::string(share->full_table_name()));
    assert_always(n == 1);
    share->destroy();
    delete share;
    mutex_t_unlock(*_open_tables_mutex);
}
TOKUDB_SHARE::share_state_t TOKUDB_SHARE::addref() {
    TOKUDB_SHARE_TRACE_FOR_FLAGS((TOKUDB_DEBUG_ENTER & TOKUDB_DEBUG_SHARE),
                                 "file[%s]:state[%s]:use_count[%d]",
                                 _full_table_name.ptr(),
                                 get_state_string(_state),
                                 _use_count);

    lock();
    _use_count++;

    return _state;
}
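//
// Drop one reference. When the count reaches zero on an OPENED share,
// close every non-NULL per-key DB handle, close the status dictionary,
// and free the key/column info, cardinality data, and key descriptors
// before marking the share CLOSED.
//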
int TOKUDB_SHARE::release() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    int error, result = 0;

    mutex_t_lock(_mutex);
    assert_always(_use_count != 0);
    _use_count--;
    if (_use_count == 0 && _state == TOKUDB_SHARE::OPENED) {
        // the number of open DBs may not equal the number of keys we have
        // because add_index may have added some. So, we loop through the
        // entire array and close any non-NULL value. It is imperative that
        // we reset a DB to NULL once we are done with it.
        for (uint i = 0; i < sizeof(key_file)/sizeof(key_file[0]); i++) {
            if (key_file[i]) {
                TOKUDB_TRACE_FOR_FLAGS(
                    TOKUDB_DEBUG_OPEN,
                    "dbclose:%p",
                    key_file[i]);
                error = key_file[i]->close(key_file[i], 0);
                assert_always(error == 0);
                if (error) {
                    result = error;
                }
                if (key_file[i] == file)
                    file = NULL;
                key_file[i] = NULL;
            }
        }

        error = tokudb::metadata::close(&status_block);
        assert_always(error == 0);

        free_key_and_col_info(&kc_info);

        if (_rec_per_key) {
            tokudb::memory::free(_rec_per_key);
            _rec_per_key = NULL;
            _rec_per_keys = 0;
        }

        for (uint i = 0; i < _keys; i++) {
            tokudb::memory::free(_key_descriptors[i]._name);
        }
        tokudb::memory::free(_key_descriptors);
        _keys = _max_key_parts = 0; _key_descriptors = NULL;

        _state = TOKUDB_SHARE::CLOSED;
    }
    mutex_t_unlock(_mutex);

    TOKUDB_SHARE_DBUG_RETURN(result);
}
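//
// Fold added/deleted/updated counts into the cached row count and decide
// whether to trigger an automatic analysis. The trigger threshold is
// (_rows * tokudb::sysvars::auto_analyze(thd)) / 100; for example, with
// auto_analyze = 10 and _rows = 1000, analysis fires once
// _row_delta_activity reaches 100 changed rows.
//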
void TOKUDB_SHARE::update_row_count(
    THD* thd,
    uint64_t added,
    uint64_t deleted,
    uint64_t updated) {

    uint64_t delta = added + deleted + updated;
    lock();
    if (deleted > added && _rows < (deleted - added)) {
        _rows = 0;
    } else {
        _rows += added - deleted;
    }
    _row_delta_activity += delta;
    if (_row_delta_activity == (uint64_t)~0)
        _row_delta_activity = 1;

    ulonglong auto_threshold = tokudb::sysvars::auto_analyze(thd);
    if (delta && auto_threshold > 0 && _allow_auto_analysis) {
        ulonglong pct_of_rows_changed_to_trigger;
        pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100);
        if (_row_delta_activity >= pct_of_rows_changed_to_trigger) {
            char msg[200];
            snprintf(msg,
                     sizeof(msg),
                     "TokuDB: Auto %s analysis for %s, delta_activity %llu is "
                     "greater than %llu percent of %llu rows.",
                     tokudb::sysvars::analyze_in_background(thd) > 0
                         ? "scheduling background"
                         : "running foreground",
                     full_table_name(),
                     _row_delta_activity,
                     auto_threshold,
                     (ulonglong)(_rows));

            // analyze_standard will unlock _mutex regardless of success/failure
            int ret = analyze_standard(thd, NULL);
            if (ret == 0) {
                sql_print_information("%s - succeeded.", msg);
            } else {
                sql_print_information(
                    "%s - failed, likely a job already running.",
                    msg);
            }
        }
    }
    unlock();
}
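//
// Copy the cached cardinality estimates into rec_per_key for every index
// part, scaled by tokudb::sysvars::cardinality_scale_percent. Extended key
// parts beyond user_defined_key_parts and the last part of a unique key
// are forced to a cardinality of 1.
//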
void TOKUDB_SHARE::set_cardinality_counts_in_table(TABLE* table) {
    lock();
    uint32_t next_key_part = 0;
    for (uint32_t i = 0; i < table->s->keys; i++) {
        KEY* key = &table->key_info[i];
        bool is_unique_key =
            (i == table->s->primary_key) || (key->flags & HA_NOSAME);

        for (uint32_t j = 0; j < get_ext_key_parts(key); j++) {
            if (j >= key->user_defined_key_parts) {
                // MySQL 'hidden' keys, really needs deeper investigation
                // into MySQL hidden keys vs TokuDB hidden keys
                key->rec_per_key[j] = 1;
                continue;
            }

            assert_always(next_key_part < _rec_per_keys);
            ulong val = _rec_per_key[next_key_part++];
            val = (val * tokudb::sysvars::cardinality_scale_percent) / 100;
            if (val == 0 || _rows == 0 ||
                (is_unique_key && j == get_ext_key_parts(key) - 1)) {
                val = 1;
            }
            key->rec_per_key[j] = val;
        }
    }
    unlock();
}

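//
// Bail out of a cursor-based handler method when the cursor has been
// invalidated. Assumes the enclosing function has the `cursor` and
// `last_cursor_error` members in scope, a local `error`, and a `cleanup`
// label.
//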
#define HANDLE_INVALID_CURSOR() \
    if (cursor == NULL) { \
        error = last_cursor_error; \
        goto cleanup; \
    }

const char *ha_tokudb::table_type() const {
    return tokudb_hton_name;
}

const char *ha_tokudb::index_type(TOKUDB_UNUSED(uint inx)) {
    return "BTREE";
}

/*
 *  returns NULL terminated file extension string
 */
const char **ha_tokudb::bas_ext() const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBUG_RETURN(ha_tokudb_exts);
}

static inline bool is_insert_ignore (THD* thd) {
    //
    // from http://lists.mysql.com/internals/37735
    //
    return thd->lex->ignore && thd->lex->duplicates == DUP_ERROR;
}

static inline bool is_replace_into(THD* thd) {
    return thd->lex->duplicates == DUP_REPLACE;
}

static inline bool do_ignore_flag_optimization(
    THD* thd,
    TABLE* table,
    bool opt_eligible) {

    bool do_opt = false;
    if (opt_eligible &&
        (is_replace_into(thd) || is_insert_ignore(thd)) &&
        tokudb::sysvars::pk_insert_mode(thd) == 1 &&
        !table->triggers &&
        !(mysql_bin_log.is_open() &&
         thd->variables.binlog_format != BINLOG_FORMAT_STMT)) {
        do_opt = true;
    }
    return do_opt;
}

ulonglong ha_tokudb::table_flags() const {
    return int_table_flags | HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
}

//
// Returns a bit mask of capabilities of the key or its part specified by
// the arguments. The capabilities are defined in sql/handler.h.
//
ulong ha_tokudb::index_flags(uint idx,
                             TOKUDB_UNUSED(uint part),
                             TOKUDB_UNUSED(bool all_parts)) const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(table_share);
    ulong flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
        HA_KEYREAD_ONLY | HA_READ_RANGE | HA_DO_INDEX_COND_PUSHDOWN);
    if (key_is_clustering(&table_share->key_info[idx])) {
        flags |= HA_CLUSTERED_INDEX;
    }
    DBUG_RETURN(flags);
}


//
// struct that will be used as a context for smart DBT callbacks
// contains parameters needed to complete the smart DBT cursor call
//
typedef struct smart_dbt_info {
    ha_tokudb* ha; // instance of ha_tokudb needed for reading the row
    uchar* buf; // output buffer where row will be written
    uint keynr; // index into share->key_file that represents DB we are currently operating on
} *SMART_DBT_INFO;

typedef struct smart_dbt_bf_info {
    ha_tokudb* ha;
    bool need_val;
    int direction;
    THD* thd;
    uchar* buf;
    DBT* key_to_compare;
} *SMART_DBT_BF_INFO;

typedef struct index_read_info {
    struct smart_dbt_info smart_dbt_info;
    int cmp;
    DBT* orig_key;
} *INDEX_READ_INFO;

//
// smart DBT callback function for optimize
// in optimize, we want to flatten DB by doing
// a full table scan. Therefore, we don't
// want to actually do anything with the data, hence
// callback does nothing
//
static int smart_dbt_do_nothing(TOKUDB_UNUSED(DBT const* key),
                                TOKUDB_UNUSED(DBT const* row),
                                TOKUDB_UNUSED(void* context)) {
    return 0;
}

static int
smart_dbt_callback_rowread_ptquery (DBT const *key, DBT  const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    return info->ha->read_row_callback(info->buf,info->keynr,row,key);
}

//
// Smart DBT callback function in case where we have a covering index
//
static int smart_dbt_callback_keyread(DBT const* key,
                                      DBT TOKUDB_UNUSED(const* row),
                                      void* context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    info->ha->read_key_only(info->buf,info->keynr,key);
    return 0;
}

//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_rowread(DBT const *key, DBT  const *row, void *context) {
    int error = 0;
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    error = info->ha->read_primary_key(info->buf,info->keynr,row,key);
    return error;
}

//
// Smart DBT callback function in case where we have a covering index
//
static int smart_dbt_callback_ir_keyread(DBT const* key,
                                         TOKUDB_UNUSED(DBT const* row),
                                         void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_keyread(key, row, &ir_info->smart_dbt_info);
}

static int smart_dbt_callback_lookup(DBT const* key,
                                     TOKUDB_UNUSED(DBT const* row),
                                     void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    return 0;
}


//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_ir_rowread(DBT const *key, DBT  const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_rowread(key, row, &ir_info->smart_dbt_info);
}

//
// macro for Smart DBT callback function,
// so we do not need to put this long line of code in multiple places
//
#define SMART_DBT_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_keyread : smart_dbt_callback_rowread )
#define SMART_DBT_IR_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_ir_keyread : smart_dbt_callback_ir_rowread )

//
// macro that modifies read flag for cursor operations depending on whether
// we have preacquired lock or not
//
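// (assumes the enclosing ha_tokudb method has range_lock_grabbed and
// use_write_locks in scope)
//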
#define SET_PRELOCK_FLAG(flg) ((flg) | (range_lock_grabbed ? (use_write_locks ? DB_PRELOCKED_WRITE : DB_PRELOCKED) : 0))

//
// This method retrieves the value of the auto increment column of a record in MySQL format
// This was basically taken from MyISAM
// Parameters:
//              type - the type of the auto increment column (e.g. int, float, double...)
//              offset - offset into the record where the auto increment column is stored
//      [in]    record - MySQL row whose auto increment value we want to extract
// Returns:
//      The value of the auto increment column in record
//
static ulonglong retrieve_auto_increment(uint16 type, uint32 offset, const uchar *record)
{
    const uchar *key;     /* Key */
    ulonglong   unsigned_autoinc = 0;  /* Unsigned auto-increment */
    longlong      signed_autoinc = 0;  /* Signed auto-increment */
    enum { unsigned_type, signed_type } autoinc_type;
    float float_tmp;   /* Temporary variable */
    double double_tmp; /* Temporary variable */

    key = ((uchar *) record) + offset;

    /* Set default autoincrement type */
    autoinc_type = unsigned_type;

    switch (type) {
    case HA_KEYTYPE_INT8:
        signed_autoinc   = (longlong) *(char*)key;
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_BINARY:
        unsigned_autoinc = (ulonglong) *(uchar*) key;
        break;

    case HA_KEYTYPE_SHORT_INT:
        signed_autoinc   = (longlong) sint2korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_USHORT_INT:
        unsigned_autoinc = (ulonglong) uint2korr(key);
        break;

    case HA_KEYTYPE_LONG_INT:
        signed_autoinc   = (longlong) sint4korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_ULONG_INT:
        unsigned_autoinc = (ulonglong) uint4korr(key);
        break;

    case HA_KEYTYPE_INT24:
        signed_autoinc   = (longlong) sint3korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_UINT24:
        unsigned_autoinc = (ulonglong) tokudb_uint3korr(key);
        break;

    case HA_KEYTYPE_LONGLONG:
        signed_autoinc   = sint8korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_ULONGLONG:
        unsigned_autoinc = uint8korr(key);
        break;

    /* The remaining two cases should not be used but are included for
       compatibility */
    case HA_KEYTYPE_FLOAT:
        float4get(float_tmp, key);  /* Note: float4get is a macro */
        signed_autoinc   = (longlong) float_tmp;
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_DOUBLE:
        float8get(double_tmp, key); /* Note: float8get is a macro */
        signed_autoinc   = (longlong) double_tmp;
        autoinc_type     = signed_type;
        break;

    default:
        assert_unreachable();
    }

    if (signed_autoinc < 0) {
        signed_autoinc = 0;
    }

    return autoinc_type == unsigned_type ?
           unsigned_autoinc : (ulonglong) signed_autoinc;
}

static inline ulong field_offset(Field* field, TABLE* table) {
    return((ulong) (field->ptr - table->record[0]));
}

static inline HA_TOKU_ISO_LEVEL tx_to_toku_iso(ulong tx_isolation) {
    if (tx_isolation == ISO_READ_UNCOMMITTED) {
        return hatoku_iso_read_uncommitted;
    }
    else if (tx_isolation == ISO_READ_COMMITTED) {
        return hatoku_iso_read_committed;
    }
    else if (tx_isolation == ISO_REPEATABLE_READ) {
        return hatoku_iso_repeatable_read;
    }
    else {
        return hatoku_iso_serializable;
    }
}

static inline uint32_t toku_iso_to_txn_flag (HA_TOKU_ISO_LEVEL lvl) {
    if (lvl == hatoku_iso_read_uncommitted) {
        return DB_READ_UNCOMMITTED;
    }
    else if (lvl == hatoku_iso_read_committed) {
        return DB_READ_COMMITTED;
    }
    else if (lvl == hatoku_iso_repeatable_read) {
        return DB_TXN_SNAPSHOT;
    }
    else {
        return 0;
    }
}

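// qsort comparator: orders FILTER_KEY_PART_INFO entries by ascending
// record offset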
static int filter_key_part_compare (const void* left, const void* right) {
    FILTER_KEY_PART_INFO* left_part= (FILTER_KEY_PART_INFO *)left;
    FILTER_KEY_PART_INFO* right_part = (FILTER_KEY_PART_INFO *)right;
    return left_part->offset - right_part->offset;
}

//
// Be very careful with parameters passed to this function. Who knows
// if key, table have proper info set. I had to verify by checking
// in the debugger.
//
void set_key_filter(
    MY_BITMAP* key_filter,
    KEY* key,
    TABLE* table,
    bool get_offset_from_keypart) {

    FILTER_KEY_PART_INFO parts[MAX_REF_PARTS];
    uint curr_skip_index = 0;

    for (uint i = 0; i < key->user_defined_key_parts; i++) {
        //
        // horrendous hack due to bugs in mysql, basically
        // we cannot always reliably get the offset from the same source
        //
        parts[i].offset =
            get_offset_from_keypart ?
                key->key_part[i].offset :
                field_offset(key->key_part[i].field, table);
        parts[i].part_index = i;
    }
    qsort(
        parts, // start of array
        key->user_defined_key_parts, //num elements
        sizeof(*parts), //size of each element
        filter_key_part_compare);

    for (uint i = 0; i < table->s->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (curr_skip_index < key->user_defined_key_parts) {
            uint curr_skip_offset = 0;
            curr_skip_offset = parts[curr_skip_index].offset;
            if (curr_skip_offset == curr_field_offset) {
                //
                // we have hit a field that is a portion of the primary key
                //
                uint curr_key_index = parts[curr_skip_index].part_index;
                curr_skip_index++;
                //
                // only choose to continue over the key if the key's length matches the field's length
                // otherwise, we may have a situation where the column is a varchar(10), the
                // key is only the first 3 characters, and we end up losing the last 7 bytes of the
                // column
                //
                TOKU_TYPE toku_type = mysql_to_toku_type(field);
                switch (toku_type) {
                case toku_type_blob:
                    break;
                case toku_type_varbinary:
                case toku_type_varstring:
                case toku_type_fixbinary:
                case toku_type_fixstring:
                    if (key->key_part[curr_key_index].length == field->field_length) {
                        bitmap_set_bit(key_filter,i);
                    }
                    break;
                default:
                    bitmap_set_bit(key_filter,i);
                    break;
                }
            }
        }
    }
}

static inline uchar* pack_fixed_field(
    uchar* to_tokudb,
    const uchar* from_mysql,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_tokudb, from_mysql, 1);
        break;
    case (2):
        memcpy(to_tokudb, from_mysql, 2);
        break;
    case (3):
        memcpy(to_tokudb, from_mysql, 3);
        break;
    case (4):
        memcpy(to_tokudb, from_mysql, 4);
        break;
    case (8):
        memcpy(to_tokudb, from_mysql, 8);
        break;
    default:
        memcpy(to_tokudb, from_mysql, num_bytes);
        break;
    }
    return to_tokudb+num_bytes;
}

static inline const uchar* unpack_fixed_field(
    uchar* to_mysql,
    const uchar* from_tokudb,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_mysql, from_tokudb, 1);
        break;
    case (2):
        memcpy(to_mysql, from_tokudb, 2);
        break;
    case (3):
        memcpy(to_mysql, from_tokudb, 3);
        break;
    case (4):
        memcpy(to_mysql, from_tokudb, 4);
        break;
    case (8):
        memcpy(to_mysql, from_tokudb, 8);
        break;
    default:
        memcpy(to_mysql, from_tokudb, num_bytes);
        break;
    }
    return from_tokudb+num_bytes;
}

static inline uchar* write_var_field(
    uchar* to_tokudb_offset_ptr, // location where the end-offset is written
    uchar* to_tokudb_data, // location where the data is written
    uchar* to_tokudb_offset_start, // start of the variable-length data section; end offsets are stored relative to this
    const uchar* data, // the data to write
    uint32_t data_length, // length of data to write
    uint32_t offset_bytes // number of offset bytes
    )
{
    memcpy(to_tokudb_data, data, data_length);
    //
    // for offset, we pack the offset where the data ENDS!
    //
    uint32_t offset = to_tokudb_data + data_length - to_tokudb_offset_start;
    switch(offset_bytes) {
    case (1):
        to_tokudb_offset_ptr[0] = (uchar)offset;
        break;
    case (2):
        int2store(to_tokudb_offset_ptr,offset);
        break;
    default:
        assert_unreachable();
        break;
    }
    return to_tokudb_data + data_length;
}

static inline uint32_t get_var_data_length(
    const uchar * from_mysql,
    uint32_t mysql_length_bytes
    )
{
    uint32_t data_length;
    switch(mysql_length_bytes) {
    case(1):
        data_length = from_mysql[0];
        break;
    case(2):
        data_length = uint2korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }
    return data_length;
}

static inline uchar* pack_var_field(
    uchar* to_tokudb_offset_ptr, // location where the end-offset is written
    uchar* to_tokudb_data, // pointer to where tokudb data should be written
    uchar* to_tokudb_offset_start, // start of the variable-length data section; end offsets are relative to this
    const uchar* from_mysql, // mysql data
    uint32_t mysql_length_bytes, // number of bytes used to store length in from_mysql
    uint32_t offset_bytes // number of offset bytes used in tokudb row
    )
{
    uint data_length = get_var_data_length(from_mysql, mysql_length_bytes);
    return write_var_field(
        to_tokudb_offset_ptr,
        to_tokudb_data,
        to_tokudb_offset_start,
        from_mysql + mysql_length_bytes,
        data_length,
        offset_bytes
        );
}

static inline void unpack_var_field(
    uchar* to_mysql,
    const uchar* from_tokudb_data,
    uint32_t from_tokudb_data_len,
    uint32_t mysql_length_bytes
    )
{
    //
    // store the length
    //
    switch (mysql_length_bytes) {
    case(1):
        to_mysql[0] = (uchar)from_tokudb_data_len;
        break;
    case(2):
        int2store(to_mysql, from_tokudb_data_len);
        break;
    default:
        assert_unreachable();
    }
    //
    // store the data
    //
    memcpy(to_mysql+mysql_length_bytes, from_tokudb_data, from_tokudb_data_len);
}
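
//
// Worked example of the var-field round trip (assuming a VARCHAR(10)
// column holding "abc", one MySQL length byte, and one TokuDB offset
// byte): MySQL stores [0x03]['a']['b']['c']; pack_var_field() copies the
// three data bytes into the data section and stores the offset where the
// data ends (relative to to_tokudb_offset_start) in the offset section;
// unpack_var_field() rebuilds the [length][data] form from those bytes.
//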

static uchar* pack_toku_field_blob(
    uchar* to_tokudb,
    const uchar* from_mysql,
    Field* field
    )
{
    uint32_t len_bytes = field->row_pack_length();
    uint32_t length = 0;
    uchar* data_ptr = NULL;
    memcpy(to_tokudb, from_mysql, len_bytes);

    switch (len_bytes) {
    case (1):
        length = (uint32_t)(*from_mysql);
        break;
    case (2):
        length = uint2korr(from_mysql);
        break;
    case (3):
        length = tokudb_uint3korr(from_mysql);
        break;
    case (4):
        length = uint4korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }

    if (length > 0) {
        memcpy((uchar *)(&data_ptr), from_mysql + len_bytes, sizeof(uchar*));
        memcpy(to_tokudb + len_bytes, data_ptr, length);
    }
    return (to_tokudb + len_bytes + length);
}

static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) {
    int error;
    tokudb_trx_data* trx = (tokudb_trx_data *) tokudb::memory::malloc(
        sizeof(*trx),
        MYF(MY_ZEROFILL));
    if (!trx) {
        error = ENOMEM;
        goto cleanup;
    }

    *out_trx = trx;
    error = 0;
cleanup:
    return error;
}

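//
// Build the destination key (and, for clustering keys, the value) for
// dest_db from a primary row. The descriptor layout assumed by the parsing
// below: the DB's descriptor DBT begins with a 4-byte offset to the key
// descriptor, and each descriptor section is a 4-byte size followed by its
// body. When dest_db is the primary key, the source DBTs are passed
// through without copying.
//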
static inline int tokudb_generate_row(DB* dest_db,
                                      TOKUDB_UNUSED(DB* src_db),
                                      DBT* dest_key,
                                      DBT* dest_val,
                                      const DBT* src_key,
                                      const DBT* src_val) {
    int error;

    DB* curr_db = dest_db;
    uchar* row_desc = NULL;
    uint32_t desc_size;
    uchar* buff = NULL;
    uint32_t max_key_len = 0;

    row_desc = (uchar *)curr_db->descriptor->dbt.data;
    row_desc += (*(uint32_t *)row_desc);
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;

    if (is_key_pk(row_desc)) {
        if (dest_key->flags == DB_DBT_REALLOC && dest_key->data != NULL) {
            free(dest_key->data);
        }
        if (dest_val != NULL) {
            if (dest_val->flags == DB_DBT_REALLOC && dest_val->data != NULL) {
                free(dest_val->data);
            }
        }
        dest_key->data = src_key->data;
        dest_key->size = src_key->size;
        dest_key->flags = 0;
        if (dest_val != NULL) {
            dest_val->data = src_val->data;
            dest_val->size = src_val->size;
            dest_val->flags = 0;
        }
        error = 0;
        goto cleanup;
    }
    // at this point, we need to create the key/val and set it
    // in the DBTs
    if (dest_key->flags == 0) {
        dest_key->ulen = 0;
        dest_key->size = 0;
        dest_key->data = NULL;
        dest_key->flags = DB_DBT_REALLOC;
    }
    if (dest_key->flags == DB_DBT_REALLOC) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;

        if (max_key_len > dest_key->ulen) {
            void* old_ptr = dest_key->data;
            void* new_ptr = NULL;
            new_ptr = realloc(old_ptr, max_key_len);
            assert_always(new_ptr);
            dest_key->data = new_ptr;
            dest_key->ulen = max_key_len;
        }

        buff = (uchar *)dest_key->data;
        assert_always(buff != nullptr);
        assert_always(max_key_len > 0);
    } else {
        assert_unreachable();
    }

    dest_key->size = pack_key_from_desc(buff, row_desc, desc_size, src_key,
                                        src_val);
    assert_always(dest_key->ulen >= dest_key->size);
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY)) &&
        !max_key_len) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;
    }
    if (max_key_len) {
        assert_always(max_key_len >= dest_key->size);
    }

    row_desc += desc_size;
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;
    if (dest_val != NULL) {
        if (!is_key_clustering(desc_size) || src_val->size == 0) {
            dest_val->size = 0;
        } else {
            uchar* buff = NULL;
            if (dest_val->flags == 0) {
                dest_val->ulen = 0;
                dest_val->size = 0;
                dest_val->data = NULL;
                dest_val->flags = DB_DBT_REALLOC;
            }
            if (dest_val->flags == DB_DBT_REALLOC){
                if (dest_val->ulen < src_val->size) {
                    void* old_ptr = dest_val->data;
                    void* new_ptr = NULL;
                    new_ptr = realloc(old_ptr, src_val->size);
                    assert_always(new_ptr);
                    dest_val->data = new_ptr;
                    dest_val->ulen = src_val->size;
                }
                buff = (uchar *)dest_val->data;
                assert_always(buff != NULL);
            } else {
                assert_unreachable();
            }
            dest_val->size = pack_clustering_val_from_desc(
                buff,
                row_desc,
                desc_size,
                src_val);
            assert_always(dest_val->ulen >= dest_val->size);
        }
    }
    error = 0;
cleanup:
    return error;
}

static int generate_row_for_del(
    DB *dest_db,
    DB *src_db,
    DBT_ARRAY *dest_key_arrays,
    const DBT *src_key,
    const DBT *src_val
    )
{
    DBT* dest_key = &dest_key_arrays->dbts[0];
    return tokudb_generate_row(
        dest_db,
        src_db,
        dest_key,
        NULL,
        src_key,
        src_val
        );
}


static int generate_row_for_put(
    DB *dest_db,
    DB *src_db,
    DBT_ARRAY *dest_key_arrays,
    DBT_ARRAY *dest_val_arrays,
    const DBT *src_key,
    const DBT *src_val
    )
{
    DBT* dest_key = &dest_key_arrays->dbts[0];
    DBT *dest_val = (dest_val_arrays == NULL) ? NULL : &dest_val_arrays->dbts[0];
    return tokudb_generate_row(
        dest_db,
        src_db,
        dest_key,
        dest_val,
        src_key,
        src_val
        );
}

ha_tokudb::ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg):handler(hton, table_arg) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    share = NULL;
    int_table_flags = HA_REC_NOT_IN_SEQ  | HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS
        | HA_PRIMARY_KEY_IN_READ_INDEX | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
        | HA_FILE_BASED | HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX
        | HA_CAN_WRITE_DURING_OPTIMIZE | HA_ONLINE_ANALYZE;
    alloc_ptr = NULL;
    rec_buff = NULL;
    rec_update_buff = NULL;
    transaction = NULL;
    cursor = NULL;
    fixed_cols_for_query = NULL;
    var_cols_for_query = NULL;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;
    unpack_entire_row = true;
    read_blobs = false;
    read_key = false;
    added_rows = 0;
    deleted_rows = 0;
    updated_rows = 0;
    last_dup_key = UINT_MAX;
    using_ignore = false;
    using_ignore_no_key = false;
    last_cursor_error = 0;
    range_lock_grabbed = false;
    blob_buff = NULL;
    num_blob_bytes = 0;
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    memset(mult_key_dbt_array, 0, sizeof(mult_key_dbt_array));
    memset(mult_rec_dbt_array, 0, sizeof(mult_rec_dbt_array));
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_key_dbt_array[i], 1);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_rec_dbt_array[i], 1);
    }
    loader = NULL;
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    lock.type = TL_IGNORE;
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_del_flags[i] = DB_DELETE_ANY;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    use_write_locks = false;
    range_query_buff = NULL;
    size_range_query_buff = 0;
    bytes_used_in_range_query_buff = 0;
    curr_range_query_buff_offset = 0;
    doing_bulk_fetch = false;
    prelocked_left_range_size = 0;
    prelocked_right_range_size = 0;
    tokudb_active_index = MAX_KEY;
    invalidate_icp();
    trx_handler_list.data = this;
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    in_rpl_write_rows = in_rpl_delete_rows = in_rpl_update_rows = false;
#endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}

ha_tokudb::~ha_tokudb() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_key_dbt_array[i]);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}

//
// States whether the table has an auto increment column and, if so, sets
// *index to that column's position in the table.
// Parameters:
//      [out]   index - if an auto inc column exists, set to its position in
//              the table; otherwise left unchanged
// Returns:
//      true if an auto inc column exists, false otherwise
//
bool ha_tokudb::has_auto_increment_flag(uint* index) {
    //
    // check to see if we have auto increment field
    //
    bool ai_found = false;
    uint ai_index = 0;
    for (uint i = 0; i < table_share->fields; i++, ai_index++) {
        Field* field = table->field[i];
        if (field->flags & AUTO_INCREMENT_FLAG) {
            ai_found = true;
            *index = ai_index;
            break;
        }
    }
    return ai_found;
}

static int open_status_dictionary(DB** ptr, const char* name, DB_TXN* txn) {
    int error;
    char* newname = NULL;
    size_t newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, "status");
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "open:%s", newname);

    error = tokudb::metadata::open(db_env, ptr, newname, txn);
cleanup:
    tokudb::memory::free(newname);
    return error;
}

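//
// Open the "main" dictionary for this table and alias it as
// share->key_file[primary_key]. On failure, the handle is closed and both
// pointers are reset to NULL.
//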
int ha_tokudb::open_main_dictionary(
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error;
    char* newname = NULL;
    size_t newname_len = 0;
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;

    assert_always(share->file == NULL);
    assert_always(share->key_file[primary_key] == NULL);
    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(
        newname_len,
        MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto exit;
    }
    make_name(newname, newname_len, name, "main");

    error = db_create(&share->file, db_env, 0);
    if (error) {
        goto exit;
    }
    share->key_file[primary_key] = share->file;

    error =
        share->file->open(
            share->file,
            txn,
            newname,
            NULL,
            DB_BTREE,
            open_flags,
            S_IWUSR);
    if (error) {
        goto exit;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        share->file);

    error = 0;
exit:
    if (error) {
        if (share->file) {
            int r = share->file->close(
                share->file,
                0
                );
            assert_always(r==0);
            share->file = NULL;
            share->key_file[primary_key] = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}

//
// Open a secondary table, the key will be a secondary index, the data will
// be a primary key
//
int ha_tokudb::open_secondary_dictionary(
    DB** ptr,
    KEY* key_info,
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error = ENOSYS;
    char dict_name[MAX_DICT_NAME_LEN];
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;
    char* newname = NULL;
    size_t newname_len = 0;

    sprintf(dict_name, "key-%s", key_info->name.str);

    newname_len = get_max_dict_name_path_length(name);
    newname =
        (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, dict_name);


    if ((error = db_create(ptr, db_env, 0))) {
        my_errno = error;
        goto cleanup;
    }


    error = (*ptr)->open(*ptr, txn, newname, NULL, DB_BTREE, open_flags, S_IWUSR);
    if (error) {
        my_errno = error;
        goto cleanup;
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        *ptr);
cleanup:
    if (error) {
        if (*ptr) {
            int r = (*ptr)->close(*ptr, 0);
            assert_always(r==0);
            *ptr = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}

static int initialize_col_pack_info(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    int error = ENOSYS;
    //
    // set up the cp_info
    //
    assert_always(kc_info->cp_info[keynr] == NULL);
    kc_info->cp_info[keynr] = (COL_PACK_INFO*)tokudb::memory::malloc(
        table_share->fields * sizeof(COL_PACK_INFO),
        MYF(MY_WME | MY_ZEROFILL));
    if (kc_info->cp_info[keynr] == NULL) {
        error = ENOMEM;
        goto exit;
    }
    {
    uint32_t curr_fixed_offset = 0;
    uint32_t curr_var_index = 0;
    for (uint j = 0; j < table_share->fields; j++) {
        COL_PACK_INFO* curr = &kc_info->cp_info[keynr][j];
        //
        // need to set the offsets / indexes
        // offsets are calculated AFTER the NULL bytes
        //
        if (!bitmap_is_set(&kc_info->key_filters[keynr],j)) {
            if (is_fixed_field(kc_info, j)) {
                curr->col_pack_val = curr_fixed_offset;
                curr_fixed_offset += kc_info->field_lengths[j];
            }
            else if (is_variable_field(kc_info, j)) {
                curr->col_pack_val = curr_var_index;
                curr_var_index++;
            }
        }
    }

    //
    // set up the mcp_info
    //
    kc_info->mcp_info[keynr].fixed_field_size = get_fixed_field_size(
        kc_info,
        table_share,
        keynr
        );
    kc_info->mcp_info[keynr].len_of_offsets = get_len_of_offsets(
        kc_info,
        table_share,
        keynr
        );

    error = 0;
    }
exit:
    return error;
}

// reset the kc_info state at keynr
static void reset_key_and_col_info(KEY_AND_COL_INFO *kc_info, uint keynr) {
    bitmap_clear_all(&kc_info->key_filters[keynr]);
    tokudb::memory::free(kc_info->cp_info[keynr]);
    kc_info->cp_info[keynr] = NULL;
    kc_info->mcp_info[keynr] = (MULTI_COL_PACK_INFO) { 0, 0 };
}

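//
// Classify each field as fixed, variable, or blob and record its length
// metadata; choose 1 or 2 offset bytes based on the total variable-length
// payload; then build the key filters and, for the primary key and
// clustering keys, the column pack info.
//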
initialize_key_and_col_info(TABLE_SHARE * table_share,TABLE * table,KEY_AND_COL_INFO * kc_info,uint hidden_primary_key,uint primary_key)1483 static int initialize_key_and_col_info(
1484     TABLE_SHARE* table_share,
1485     TABLE* table,
1486     KEY_AND_COL_INFO* kc_info,
1487     uint hidden_primary_key,
1488     uint primary_key) {
1489 
1490     int error = 0;
1491     uint32_t curr_blob_field_index = 0;
1492     uint32_t max_var_bytes = 0;
1493     //
1494     // fill in the field lengths. 0 means it is a variable sized field length
1495     // fill in length_bytes, 0 means it is fixed or blob
1496     //
1497     for (uint i = 0; i < table_share->fields; i++) {
1498         Field* field = table_share->field[i];
1499         TOKU_TYPE toku_type = mysql_to_toku_type(field);
1500         uint32 pack_length = 0;
1501         switch (toku_type) {
1502         case toku_type_int:
1503         case toku_type_double:
1504         case toku_type_float:
1505         case toku_type_fixbinary:
1506         case toku_type_fixstring:
1507             pack_length = field->pack_length();
1508             assert_always(pack_length < 1<<16);
1509             kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD;
1510             kc_info->field_lengths[i] = (uint16_t)pack_length;
1511             kc_info->length_bytes[i] = 0;
1512             break;
1513         case toku_type_blob:
1514             kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD;
1515             kc_info->field_lengths[i] = 0;
1516             kc_info->length_bytes[i] = 0;
1517             kc_info->blob_fields[curr_blob_field_index] = i;
1518             curr_blob_field_index++;
1519             break;
1520         case toku_type_varstring:
1521         case toku_type_varbinary:
1522             kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD;
1523             kc_info->field_lengths[i] = 0;
1524             kc_info->length_bytes[i] =
1525                 (uchar)((Field_varstring*)field)->length_bytes;
1526             max_var_bytes += field->field_length;
1527             break;
1528         default:
1529             assert_unreachable();
1530         }
1531     }
1532     kc_info->num_blobs = curr_blob_field_index;
1533 
1534     //
1535     // initialize kc_info->num_offset_bytes
1536     // because MAX_REF_LENGTH is 65536, we
1537     // can safely set num_offset_bytes to 1 or 2
1538     //
1539     if (max_var_bytes < 256) {
1540         kc_info->num_offset_bytes = 1;
1541     } else {
1542         kc_info->num_offset_bytes = 2;
1543     }
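    // illustrative example (assuming a single-byte character set): two
    // VARCHAR(200) columns give max_var_bytes = 400, so each stored
    // end-of-field offset needs 2 bytes; a lone VARCHAR(100) gives
    // max_var_bytes = 100, and a single offset byte suffices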
1544 
1545     for (uint i = 0;
1546          i < table_share->keys + tokudb_test(hidden_primary_key);
1547          i++) {
1548         //
1549         // do the cluster/primary key filtering calculations
1550         //
1551         if (!(i==primary_key && hidden_primary_key)) {
1552             if (i == primary_key) {
1553                 set_key_filter(
1554                     &kc_info->key_filters[primary_key],
1555                     &table_share->key_info[primary_key],
1556                     table,
1557                     true);
1558             } else {
1559                 set_key_filter(
1560                     &kc_info->key_filters[i],
1561                     &table_share->key_info[i],
1562                     table,
1563                     true);
1564                 if (!hidden_primary_key) {
1565                     set_key_filter(
1566                         &kc_info->key_filters[i],
1567                         &table_share->key_info[primary_key],
1568                         table,
1569                         true);
1570                 }
1571             }
1572         }
1573         if (i == primary_key || key_is_clustering(&table_share->key_info[i])) {
1574             error = initialize_col_pack_info(kc_info, table_share, i);
1575             if (error) {
1576                 goto exit;
1577             }
1578         }
1579     }
1580 exit:
1581     return error;
1582 }
1583 
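//
// Determine whether REPLACE INTO can take the fast path for this table:
// with a single dictionary it trivially can; otherwise every user-defined
// key part of every secondary index must be covered by the key filters of
// both that index and the primary key.
//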
1584 bool ha_tokudb::can_replace_into_be_fast(
1585     TABLE_SHARE* table_share,
1586     KEY_AND_COL_INFO* kc_info,
1587     uint pk) {
1588 
1589     uint curr_num_DBs = table_share->keys + tokudb_test(hidden_primary_key);
1590     bool ret_val;
1591     if (curr_num_DBs == 1) {
1592         ret_val = true;
1593         goto exit;
1594     }
1595     ret_val = true;
1596     for (uint curr_index = 0; curr_index < table_share->keys; curr_index++) {
1597         if (curr_index == pk) continue;
1598         KEY* curr_key_info = &table_share->key_info[curr_index];
1599         for (uint i = 0; i < curr_key_info->user_defined_key_parts; i++) {
1600             uint16 curr_field_index = curr_key_info->key_part[i].field->field_index;
1601             if (!bitmap_is_set(&kc_info->key_filters[curr_index],curr_field_index)) {
1602                 ret_val = false;
1603                 goto exit;
1604             }
1605             if (bitmap_is_set(&kc_info->key_filters[curr_index], curr_field_index) &&
1606                 !bitmap_is_set(&kc_info->key_filters[pk], curr_field_index)) {
1607                 ret_val = false;
1608                 goto exit;
1609             }
1610 
1611         }
1612     }
1613 exit:
1614     return ret_val;
1615 }
1616 
1617 int ha_tokudb::initialize_share(const char* name, int mode) {
1618     int error = 0;
1619     uint64_t num_rows = 0;
1620     DB_TXN* txn = NULL;
1621     bool do_commit = false;
1622     THD* thd = ha_thd();
1623     tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
1624     if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
1625         txn = trx->sub_sp_level;
1626     }
1627     else {
1628         do_commit = true;
1629         error = txn_begin(db_env, 0, &txn, 0, thd);
1630         if (error) { goto exit; }
1631     }
1632 
1633 
1634     error = get_status(txn);
1635     if (error) {
1636         goto exit;
1637     }
1638     if (share->version != HA_TOKU_VERSION) {
1639         error = ENOSYS;
1640         goto exit;
1641     }
1642 
1643 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
1644 #if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
1645     // verify frm data for non-partitioned tables
1646     if (TOKU_PARTITION_WRITE_FRM_DATA || table->part_info == NULL) {
1647         error = verify_frm_data(table->s->path.str, txn);
1648         if (error)
1649             goto exit;
1650     } else {
1651         // remove the frm data for partitions since we are not maintaining it
1652         error = remove_frm_data(share->status_block, txn);
1653         if (error)
1654             goto exit;
1655     }
1656 #else
1657     error = verify_frm_data(table->s->path.str, txn);
1658     if (error)
1659         goto exit;
1660 #endif  // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
1661 #endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
1662 
1663     error =
1664         initialize_key_and_col_info(
1665             table_share,
1666             table,
1667             &share->kc_info,
1668             hidden_primary_key,
1669             primary_key);
1670     if (error) { goto exit; }
1671 
1672     error = open_main_dictionary(name, mode == O_RDONLY, txn);
1673     if (error) {
1674         goto exit;
1675     }
1676 
1677     share->has_unique_keys = false;
1678     share->_keys = table_share->keys;
1679     share->_max_key_parts = table_share->key_parts;
1680     share->_key_descriptors =
1681         (TOKUDB_SHARE::key_descriptor_t*)tokudb::memory::malloc(
1682             sizeof(TOKUDB_SHARE::key_descriptor_t) * share->_keys,
1683             MYF(MY_ZEROFILL));
1684 
1685     /* Open other keys;  These are part of the share structure */
1686     for (uint i = 0; i < table_share->keys; i++) {
1687         share->_key_descriptors[i]._parts =
1688             table_share->key_info[i].user_defined_key_parts;
1689         if (i == primary_key) {
1690             share->_key_descriptors[i]._is_unique = true;
1691             share->_key_descriptors[i]._name = tokudb::memory::strdup("primary", 0);
1692         } else {
1693             share->_key_descriptors[i]._is_unique = false;
1694             share->_key_descriptors[i]._name =
1695                 tokudb::memory::strdup(table_share->key_info[i].name.str, 0);
1696         }
1697 
1698         if (table_share->key_info[i].flags & HA_NOSAME) {
1699             share->_key_descriptors[i]._is_unique = true;
1700             share->has_unique_keys = true;
1701         }
1702         if (i != primary_key) {
1703             error =
1704                 open_secondary_dictionary(
1705                     &share->key_file[i],
1706                     &table_share->key_info[i],
1707                     name,
1708                     mode == O_RDONLY,
1709                     txn);
1710             if (error) {
1711                 goto exit;
1712             }
1713         }
1714     }
1715     share->replace_into_fast =
1716         can_replace_into_be_fast(
1717             table_share,
1718             &share->kc_info,
1719             primary_key);
1720 
1721     share->pk_has_string = false;
1722     if (!hidden_primary_key) {
1723         //
1724         // We need to set the ref_length to start at 5, to account for
1725         // the "infinity byte" in keys, and for placing the DBT size in the first four bytes
1726         //
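        // e.g. (illustrative): a primary key on a single NOT NULL INT
        // column packs to at most 4 bytes, giving
        // ref_length = 4 (DBT size) + 1 (infinity byte) + 4 = 9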
1727         ref_length = sizeof(uint32_t) + sizeof(uchar);
1728         KEY_PART_INFO* key_part = table->key_info[primary_key].key_part;
1729         KEY_PART_INFO* end =
1730             key_part + table->key_info[primary_key].user_defined_key_parts;
1731         for (; key_part != end; key_part++) {
1732             ref_length += key_part->field->max_packed_col_length(key_part->length);
1733             TOKU_TYPE toku_type = mysql_to_toku_type(key_part->field);
1734             if (toku_type == toku_type_fixstring ||
1735                 toku_type == toku_type_varstring ||
1736                 toku_type == toku_type_blob
1737                 )
1738             {
1739                 share->pk_has_string = true;
1740             }
1741         }
1742         share->status |= STATUS_PRIMARY_KEY_INIT;
1743     }
1744     share->ref_length = ref_length;
1745 
1746     error = estimate_num_rows(share->file, &num_rows, txn);
1747     //
1748     // estimate_num_rows should not fail under normal conditions
1749     //
1750     if (error == 0) {
1751         share->set_row_count(num_rows, true);
1752     } else {
1753         goto exit;
1754     }
1755     //
1756     // initialize auto increment data
1757     //
1758     share->has_auto_inc = has_auto_increment_flag(&share->ai_field_index);
1759     if (share->has_auto_inc) {
1760         init_auto_increment();
1761     }
1762 
1763     if (may_table_be_empty(txn)) {
1764         share->try_table_lock = true;
1765     } else {
1766         share->try_table_lock = false;
1767     }
1768 
1769     share->num_DBs = table_share->keys + tokudb_test(hidden_primary_key);
1770 
1771     init_hidden_prim_key_info(txn);
1772 
1773     // initialize cardinality info from the status dictionary
1774     {
1775         uint32_t rec_per_keys = tokudb::compute_total_key_parts(table_share);
1776         uint64_t* rec_per_key =
1777             (uint64_t*)tokudb::memory::malloc(
1778                 rec_per_keys * sizeof(uint64_t),
1779                 MYF(MY_FAE));
1780         error =
1781             tokudb::get_card_from_status(
1782                 share->status_block,
1783                 txn,
1784                 rec_per_keys,
1785                 rec_per_key);
1786         if (error) {
1787             memset(rec_per_key, 0, sizeof(ulonglong) * rec_per_keys);
1788         }
1789         share->init_cardinality_counts(rec_per_keys, rec_per_key);
1790     }
1791 
1792     error = 0;
1793 exit:
1794     if (do_commit && txn) {
1795         commit_txn(txn,0);
1796     }
1797     return error;
1798 }
1799 
1800 //
1801 // Creates and opens a handle to a table which already exists in a tokudb
1802 // database.
1803 // Parameters:
1804 //      [in]   name - table name
1805 //             mode - seems to specify if table is read only
1806 //             test_if_locked - unused
1807 // Returns:
1808 //      0 on success
1809 //      1 on error
1810 //
1811 int ha_tokudb::open(const char *name, int mode, uint test_if_locked) {
1812     TOKUDB_HANDLER_DBUG_ENTER("%s %o %u", name, mode, test_if_locked);
1813     THD* thd = ha_thd();
1814 
1815     int error = 0;
1816     int ret_val = 0;
1817 
1818     transaction = NULL;
1819     cursor = NULL;
1820 
1821 
1822     /* Open primary key */
1823     hidden_primary_key = 0;
1824     if ((primary_key = table_share->primary_key) >= MAX_KEY) {
1825         // No primary key
1826         primary_key = table_share->keys;
1827         key_used_on_scan = MAX_KEY;
1828         hidden_primary_key = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
1829         ref_length = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t);
1830     }
1831     else {
1832         key_used_on_scan = primary_key;
1833     }
1834 
1835     /* Need some extra memory in case of packed keys */
1836     // the "+ 1" is for the first byte that states +/- infinity
1837     // multiply everything by 2 to account for clustered keys having a key and primary key together
1838     max_key_length = 2*(table_share->max_key_length + MAX_REF_PARTS * 3 + sizeof(uchar));
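    // e.g. (illustrative): with table_share->max_key_length = 100 and
    // MAX_REF_PARTS = 16, this allocates 2*(100 + 16*3 + 1) = 298 bytes
    // for each key buffer below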
1839     alloc_ptr = tokudb::memory::multi_malloc(
1840         MYF(MY_WME),
1841         &key_buff, max_key_length,
1842         &key_buff2, max_key_length,
1843         &key_buff3, max_key_length,
1844         &key_buff4, max_key_length,
1845         &prelocked_left_range, max_key_length,
1846         &prelocked_right_range, max_key_length,
1847         &primary_key_buff, (hidden_primary_key ? 0 : max_key_length),
1848         &fixed_cols_for_query, table_share->fields*sizeof(uint32_t),
1849         &var_cols_for_query, table_share->fields*sizeof(uint32_t),
1850         NullS);
1851     if (alloc_ptr == NULL) {
1852         ret_val = 1;
1853         goto exit;
1854     }
1855 
1856     size_range_query_buff = tokudb::sysvars::read_buf_size(thd);
1857     range_query_buff =
1858         (uchar*)tokudb::memory::malloc(size_range_query_buff, MYF(MY_WME));
1859     if (range_query_buff == NULL) {
1860         ret_val = 1;
1861         goto exit;
1862     }
1863 
1864     alloced_rec_buff_length = table_share->rec_buff_length +
1865         table_share->fields;
1866     rec_buff = (uchar *) tokudb::memory::malloc(
1867         alloced_rec_buff_length,
1868         MYF(MY_WME));
1869     if (rec_buff == NULL) {
1870         ret_val = 1;
1871         goto exit;
1872     }
1873 
1874     alloced_update_rec_buff_length = alloced_rec_buff_length;
1875     rec_update_buff = (uchar*)tokudb::memory::malloc(
1876         alloced_update_rec_buff_length,
1877         MYF(MY_WME));
1878     if (rec_update_buff == NULL) {
1879         ret_val = 1;
1880         goto exit;
1881     }
1882 
1883     // lookup or create share
1884     share = TOKUDB_SHARE::get_share(name, &lock, true);
1885     assert_always(share);
1886 
1887     if (share->state() != TOKUDB_SHARE::OPENED) {
1888         // means we're responsible for the transition to OPENED, ERROR or CLOSED
1889 
1890         ret_val = allocate_key_and_col_info(table_share, &share->kc_info);
1891         if (ret_val == 0) {
1892             ret_val = initialize_share(name, mode);
1893         }
1894 
1895         if (ret_val == 0) {
1896             share->set_state(TOKUDB_SHARE::OPENED);
1897         } else {
1898             free_key_and_col_info(&share->kc_info);
1899             share->set_state(TOKUDB_SHARE::ERROR);
1900         }
1901         share->unlock();
1902     } else {
1903         // got an already OPENED instance
1904         share->unlock();
1905     }
1906 
1907     if (share->state() == TOKUDB_SHARE::ERROR) {
1908         share->release();
1909         goto exit;
1910     }
1911 
1912     assert_always(share->state() == TOKUDB_SHARE::OPENED);
1913 
1914     ref_length = share->ref_length;     // If second open
1915 
1916     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
1917         TOKUDB_DEBUG_OPEN,
1918         "tokudbopen:%p:share=%p:file=%p:table=%p:table->s=%p:%d",
1919         this,
1920         share,
1921         share->file,
1922         table,
1923         table->s,
1924         share->use_count());
1925 
1926     key_read = false;
1927     stats.block_size = 1<<20;    // QQQ Tokudb DB block size
1928 
1929     info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
1930 
1931 exit:
1932     if (ret_val) {
1933         tokudb::memory::free(range_query_buff);
1934         range_query_buff = NULL;
1935         tokudb::memory::free(alloc_ptr);
1936         alloc_ptr = NULL;
1937         tokudb::memory::free(rec_buff);
1938         rec_buff = NULL;
1939         tokudb::memory::free(rec_update_buff);
1940         rec_update_buff = NULL;
1941 
1942         if (error) {
1943             my_errno = error;
1944         }
1945     }
1946     TOKUDB_HANDLER_DBUG_RETURN(ret_val);
1947 }
1948 
1949 //
1950 // estimate the number of rows in a DB
1951 // Parameters:
1952 //      [in]    db - DB whose number of rows will be estimated
1953 //      [out]   num_rows - number of estimated rows in db
1954 // Returns:
1955 //      0 on success
1956 //      error otherwise
1957 //
1958 int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) {
1959     int error = ENOSYS;
1960     bool do_commit = false;
1961     DB_BTREE_STAT64 dict_stats;
1962     DB_TXN* txn_to_use = NULL;
1963 
1964     if (txn == NULL) {
1965         error = txn_begin(db_env, 0, &txn_to_use, DB_READ_UNCOMMITTED, ha_thd());
1966         if (error) goto cleanup;
1967         do_commit = true;
1968     }
1969     else {
1970         txn_to_use = txn;
1971     }
1972 
1973     error = db->stat64(db, txn_to_use, &dict_stats);
1974     if (error) { goto cleanup; }
1975 
1976     *num_rows = dict_stats.bt_ndata;
1977     error = 0;
1978 cleanup:
1979     if (do_commit) {
1980         commit_txn(txn_to_use, 0);
1981         txn_to_use = NULL;
1982     }
1983     return error;
1984 }
1985 
1986 
1987 int ha_tokudb::write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn) {
1988     return write_metadata(db, &curr_key_data, sizeof curr_key_data, data, size, txn);
1989 }
1990 
1991 int ha_tokudb::remove_from_status(DB *db, HA_METADATA_KEY curr_key_data, DB_TXN *txn) {
1992     return remove_metadata(db, &curr_key_data, sizeof curr_key_data, txn);
1993 }
1994 
1995 int ha_tokudb::remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction) {
1996     int error;
1997     DBT key;
1998     DB_TXN* txn = NULL;
1999     bool do_commit = false;
2000     //
2001     // transaction to be used for removing metadata from status.tokudb
2002     //
2003     if (transaction == NULL) {
2004         error = txn_begin(db_env, 0, &txn, 0, ha_thd());
2005         if (error) {
2006             goto cleanup;
2007         }
2008         do_commit = true;
2009     }
2010     else {
2011         txn = transaction;
2012     }
2013 
2014     memset(&key, 0, sizeof(key));
2015     key.data = key_data;
2016     key.size = key_size;
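    // DB_DELETE_ANY asks the storage layer to delete without first checking
    // that the key exists, so removing an absent metadata key is not an error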
2017     error = db->del(db, txn, &key, DB_DELETE_ANY);
2018     if (error) {
2019         goto cleanup;
2020     }
2021 
2022     error = 0;
2023 cleanup:
2024     if (do_commit && txn) {
2025         if (!error) {
2026             commit_txn(txn, DB_TXN_NOSYNC);
2027         }
2028         else {
2029             abort_txn(txn);
2030         }
2031     }
2032     return error;
2033 }
2034 
2035 //
2036 // helper function to write a piece of metadata in to status.tokudb
2037 //
2038 int ha_tokudb::write_metadata(DB* db, void* key_data, uint key_size, void* val_data, uint val_size, DB_TXN* transaction) {
2039     int error;
2040     DBT key;
2041     DBT value;
2042     DB_TXN* txn = NULL;
2043     bool do_commit = false;
2044     //
2045     // transaction to be used for putting metadata into status.tokudb
2046     //
2047     if (transaction == NULL) {
2048         error = txn_begin(db_env, 0, &txn, 0, ha_thd());
2049         if (error) {
2050             goto cleanup;
2051         }
2052         do_commit = true;
2053     }
2054     else {
2055         txn = transaction;
2056     }
2057 
2058     memset(&key, 0, sizeof(key));
2059     memset(&value, 0, sizeof(value));
2060     key.data = key_data;
2061     key.size = key_size;
2062     value.data = val_data;
2063     value.size = val_size;
2064     error = db->put(db, txn, &key, &value, 0);
2065     if (error) {
2066         goto cleanup;
2067     }
2068 
2069     error = 0;
2070 cleanup:
2071     if (do_commit && txn) {
2072         if (!error) {
2073             commit_txn(txn, DB_TXN_NOSYNC);
2074         }
2075         else {
2076             abort_txn(txn);
2077         }
2078     }
2079     return error;
2080 }
2081 
2082 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
2083 int ha_tokudb::write_frm_data(DB* db, DB_TXN* txn, const char* frm_name) {
2084     TOKUDB_HANDLER_DBUG_ENTER("%p %p %s", db, txn, frm_name);
2085 
2086     uchar* frm_data = NULL;
2087     size_t frm_len = 0;
2088     int error = 0;
2089 
2090 #if 100000 <= MYSQL_VERSION_ID
2091     error = table_share->read_frm_image((const uchar**)&frm_data,&frm_len);
2092     if (error) { goto cleanup; }
2093 #else
2094     error = readfrm(frm_name,&frm_data,&frm_len);
2095     if (error) { goto cleanup; }
2096 #endif
2097 
2098     error = write_to_status(db,hatoku_frm_data,frm_data,(uint)frm_len, txn);
2099     if (error) { goto cleanup; }
2100 
2101     error = 0;
2102 cleanup:
2103     tokudb::memory::free(frm_data);
2104     TOKUDB_HANDLER_DBUG_RETURN(error);
2105 }
2106 
2107 int ha_tokudb::remove_frm_data(DB *db, DB_TXN *txn) {
2108     return remove_from_status(db, hatoku_frm_data, txn);
2109 }
2110 
2111 static int smart_dbt_callback_verify_frm(TOKUDB_UNUSED(DBT const* key),
2112                                          DBT const* row,
2113                                          void* context) {
2114     DBT* stored_frm = (DBT *)context;
2115     stored_frm->size = row->size;
2116     stored_frm->data = (uchar *)tokudb::memory::malloc(row->size, MYF(MY_WME));
2117     assert_always(stored_frm->data);
2118     memcpy(stored_frm->data, row->data, row->size);
2119     return 0;
2120 }
2121 
2122 int ha_tokudb::verify_frm_data(const char* frm_name, DB_TXN* txn) {
2123     TOKUDB_HANDLER_DBUG_ENTER("%s", frm_name);
2124     uchar* mysql_frm_data = NULL;
2125     size_t mysql_frm_len = 0;
2126     DBT key = {};
2127     DBT stored_frm = {};
2128     int error = 0;
2129     HA_METADATA_KEY curr_key = hatoku_frm_data;
2130 
2131     // get the frm data from MySQL
2132 #if 100000 <= MYSQL_VERSION_ID
2133     error = table_share->read_frm_image((const uchar**)&mysql_frm_data,&mysql_frm_len);
2134     if (error) {
2135         goto cleanup;
2136     }
2137 #else
2138     error = readfrm(frm_name,&mysql_frm_data,&mysql_frm_len);
2139     if (error) {
2140         goto cleanup;
2141     }
2142 #endif
2143 
2144     key.data = &curr_key;
2145     key.size = sizeof(curr_key);
2146     error = share->status_block->getf_set(
2147         share->status_block,
2148         txn,
2149         0,
2150         &key,
2151         smart_dbt_callback_verify_frm,
2152         &stored_frm
2153         );
2154     if (error == DB_NOTFOUND) {
2155         // if not found, write it
2156         error = write_frm_data(share->status_block, txn, frm_name);
2157         goto cleanup;
2158     } else if (error) {
2159         goto cleanup;
2160     }
2161 
2162     if (stored_frm.size != mysql_frm_len || memcmp(stored_frm.data, mysql_frm_data, stored_frm.size)) {
2163         error = HA_ERR_TABLE_DEF_CHANGED;
2164         goto cleanup;
2165     }
2166 
2167     error = 0;
2168 cleanup:
2169     tokudb::memory::free(mysql_frm_data);
2170     tokudb::memory::free(stored_frm.data);
2171     TOKUDB_HANDLER_DBUG_RETURN(error);
2172 }
2173 #endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
2174 
2175 //
2176 // Updates status.tokudb with a new max value used for the auto increment column
2177 // Parameters:
2178 //      [in]    db - this will always be status.tokudb
2179 //              val - value to store
2180 //  Returns:
2181 //      0 on success, error otherwise
2182 //
2183 //
2184 int ha_tokudb::update_max_auto_inc(DB* db, ulonglong val) {
2185     return write_to_status(db,hatoku_max_ai,&val,sizeof(val), NULL);
2186 }
2187 
2188 //
2189 // Writes the initial auto increment value, as specified by create table
2190 // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
2191 // then the value 100 will be stored here in val
2192 // Parameters:
2193 //      [in]    db - this will always be status.tokudb
2194 //              val - value to store
2195 //  Returns:
2196 //      0 on success, error otherwise
2197 //
2198 //
2199 int ha_tokudb::write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn) {
2200     return write_to_status(db,hatoku_ai_create_value,&val,sizeof(val), txn);
2201 }
2202 
2203 
2204 //
2205 // Closes a handle to a table.
2206 //
2207 int ha_tokudb::close() {
2208     TOKUDB_HANDLER_DBUG_ENTER("");
2209     int r = __close();
2210     TOKUDB_HANDLER_DBUG_RETURN(r);
2211 }
2212 
2213 int ha_tokudb::__close() {
2214     TOKUDB_HANDLER_DBUG_ENTER("");
2215     TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "close:%p", this);
2216     tokudb::memory::free(rec_buff);
2217     tokudb::memory::free(rec_update_buff);
2218     tokudb::memory::free(blob_buff);
2219     tokudb::memory::free(alloc_ptr);
2220     tokudb::memory::free(range_query_buff);
2221     for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
2222         toku_dbt_array_destroy(&mult_key_dbt_array[i]);
2223     }
2224     for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
2225         toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
2226     }
2227     rec_buff = NULL;
2228     rec_update_buff = NULL;
2229     alloc_ptr = NULL;
2230     ha_tokudb::reset();
2231     int retval = share->release();
2232     TOKUDB_HANDLER_DBUG_RETURN(retval);
2233 }
2234 
2235 //
2236 // Reallocate record buffer (rec_buff) if needed
2237 // If not needed, does nothing
2238 // Parameters:
2239 //          length - size of buffer required for rec_buff
2240 //
2241 bool ha_tokudb::fix_rec_buff_for_blob(ulong length) {
2242     if (!rec_buff || (length > alloced_rec_buff_length)) {
2243         uchar* newptr = (uchar*)tokudb::memory::realloc(
2244             (void*)rec_buff,
2245             length,
2246             MYF(MY_ALLOW_ZERO_PTR));
2247         if (!newptr)
2248             return 1;
2249         rec_buff = newptr;
2250         alloced_rec_buff_length = length;
2251     }
2252     return 0;
2253 }
2254 
2255 //
2256 // Reallocate record buffer (rec_buff) if needed
2257 // If not needed, does nothing
2258 // Parameters:
2259 //          length - size of buffer required for rec_buff
2260 //
2261 bool ha_tokudb::fix_rec_update_buff_for_blob(ulong length) {
2262     if (!rec_update_buff || (length > alloced_update_rec_buff_length)) {
2263         uchar* newptr = (uchar*)tokudb::memory::realloc(
2264             (void*)rec_update_buff,
2265             length,
2266             MYF(MY_ALLOW_ZERO_PTR));
2267         if (!newptr)
2268             return 1;
2269         rec_update_buff= newptr;
2270         alloced_update_rec_buff_length = length;
2271     }
2272     return 0;
2273 }
2274 
2275 /* Calculate max length needed for row */
2276 ulong ha_tokudb::max_row_length(const uchar * buf) {
2277     ulong length = table_share->reclength + table_share->fields * 2;
2278     uint *ptr, *end;
2279     for (ptr = table_share->blob_field, end = ptr + table_share->blob_fields; ptr != end; ptr++) {
2280         Field_blob *blob = ((Field_blob *) table->field[*ptr]);
2281         length += blob->get_length((uchar *) (buf + field_offset(blob, table))) + 2;
2282     }
2283     return length;
2284 }
2285 
2286 //
2287 // Pack a row for storage: take the row in MySQL format in record and
2288 // convert it into a packed row stored in the DBT 'row'.
2289 // If the row is of fixed length, just store the row 'as is'.
2290 // If not, we will generate a packed row suitable for storage.
2291 // This will only fail if we don't have enough memory to pack the row,
2292 // which may only happen in rows with blobs, as the default row length is
2293 // pre-allocated.
2294 // Parameters:
2295 //      [out]   row - DBT that will hold the packed row
2296 //      [in]    record - row in MySQL format
2297 //      [out]   row_buff - buffer into which the row is packed
2298 //
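// The packed row layout produced here is, in order (a sketch of the format
// as implemented below):
//   [null bytes][fixed fields][var-field end-offsets][var-field data][blobs]
// where each end-offset is num_offset_bytes wide and is cumulative from the
// start of the var-field data region.
//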
2301 
2302 int ha_tokudb::pack_row_in_buff(
2303     DBT * row,
2304     const uchar* record,
2305     uint index,
2306     uchar* row_buff
2307     )
2308 {
2309     uchar* fixed_field_ptr = NULL;
2310     uchar* var_field_offset_ptr = NULL;
2311     uchar* start_field_data_ptr = NULL;
2312     uchar* var_field_data_ptr = NULL;
2313     int r = ENOSYS;
2314     memset((void *) row, 0, sizeof(*row));
2315 
2316     MY_BITMAP *old_map = dbug_tmp_use_all_columns(table, &table->write_set);
2317 
2318     // Copy null bytes
2319     memcpy(row_buff, record, table_share->null_bytes);
2320     fixed_field_ptr = row_buff + table_share->null_bytes;
2321     var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
2322     start_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2323     var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2324 
2325     // assert that when the hidden primary key exists, primary_key_offsets is NULL
2326     for (uint i = 0; i < table_share->fields; i++) {
2327         Field* field = table->field[i];
2328         uint curr_field_offset = field_offset(field, table);
2329         if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
2330             continue;
2331         }
2332         if (is_fixed_field(&share->kc_info, i)) {
2333             fixed_field_ptr = pack_fixed_field(
2334                 fixed_field_ptr,
2335                 record + curr_field_offset,
2336                 share->kc_info.field_lengths[i]
2337                 );
2338         }
2339         else if (is_variable_field(&share->kc_info, i)) {
2340             var_field_data_ptr = pack_var_field(
2341                 var_field_offset_ptr,
2342                 var_field_data_ptr,
2343                 start_field_data_ptr,
2344                 record + curr_field_offset,
2345                 share->kc_info.length_bytes[i],
2346                 share->kc_info.num_offset_bytes
2347                 );
2348             var_field_offset_ptr += share->kc_info.num_offset_bytes;
2349         }
2350     }
2351 
2352     for (uint i = 0; i < share->kc_info.num_blobs; i++) {
2353         Field* field = table->field[share->kc_info.blob_fields[i]];
2354         var_field_data_ptr = pack_toku_field_blob(
2355             var_field_data_ptr,
2356             record + field_offset(field, table),
2357             field
2358             );
2359     }
2360 
2361     row->data = row_buff;
2362     row->size = (size_t) (var_field_data_ptr - row_buff);
2363     r = 0;
2364 
2365     dbug_tmp_restore_column_map(&table->write_set, old_map);
2366     return r;
2367 }
2368 
2369 
2370 int ha_tokudb::pack_row(
2371     DBT * row,
2372     const uchar* record,
2373     uint index
2374     )
2375 {
2376     return pack_row_in_buff(row,record,index,rec_buff);
2377 }
2378 
2379 int ha_tokudb::pack_old_row_for_update(
2380     DBT * row,
2381     const uchar* record,
2382     uint index
2383     )
2384 {
2385     return pack_row_in_buff(row,record,index,rec_update_buff);
2386 }
2387 
2388 
2389 int ha_tokudb::unpack_blobs(
2390     uchar* record,
2391     const uchar* from_tokudb_blob,
2392     uint32_t num_bytes,
2393     bool check_bitmap
2394     )
2395 {
2396     uint error = 0;
2397     uchar* ptr = NULL;
2398     const uchar* buff = NULL;
2399     //
2400     // assert that num_bytes > 0 implies share->kc_info.num_blobs > 0
2401     //
2402     assert_always( !((share->kc_info.num_blobs == 0) && (num_bytes > 0)) );
2403     if (num_bytes > num_blob_bytes) {
2404         ptr = (uchar*)tokudb::memory::realloc(
2405             (void*)blob_buff, num_bytes,
2406             MYF(MY_ALLOW_ZERO_PTR));
2407         if (ptr == NULL) {
2408             error = ENOMEM;
2409             goto exit;
2410         }
2411         blob_buff = ptr;
2412         num_blob_bytes = num_bytes;
2413     }
2414 
2415     memcpy(blob_buff, from_tokudb_blob, num_bytes);
2416     buff= blob_buff;
2417     for (uint i = 0; i < share->kc_info.num_blobs; i++) {
2418         uint32_t curr_field_index = share->kc_info.blob_fields[i];
2419         bool skip = check_bitmap ?
2420             !(bitmap_is_set(table->read_set,curr_field_index) ||
2421                 bitmap_is_set(table->write_set,curr_field_index)) :
2422             false;
2423         Field* field = table->field[curr_field_index];
2424         uint32_t len_bytes = field->row_pack_length();
2425         const uchar* end_buff = unpack_toku_field_blob(
2426             record + field_offset(field, table),
2427             buff,
2428             len_bytes,
2429             skip
2430             );
2431         // verify that the pointers to the blobs are all contained within the blob_buff
2432         if (!(blob_buff <= buff && end_buff <= blob_buff + num_bytes)) {
2433             error = -3000000;
2434             goto exit;
2435         }
2436         buff = end_buff;
2437     }
2438     // verify that the entire blob buffer was parsed
2439     if (share->kc_info.num_blobs > 0 && !(num_bytes > 0 && buff == blob_buff + num_bytes)) {
2440         error = -4000000;
2441         goto exit;
2442     }
2443 
2444     error = 0;
2445 exit:
2446     return error;
2447 }
2448 
2449 //
2450 // take the row passed in as a DBT*, and convert it into a row in MySQL format in record
2451 // Parameters:
2452 //      [out]   record - row in MySQL format
2453 //      [in]    row - row stored in DBT to be converted
2454 //
2455 int ha_tokudb::unpack_row(
2456     uchar* record,
2457     DBT const *row,
2458     DBT const *key,
2459     uint index
2460     )
2461 {
2462     //
2463     // two cases, fixed length row, and variable length row
2464     // fixed length row is first below
2465     //
2466     /* Copy null bits */
2467     int error = 0;
2468     const uchar* fixed_field_ptr = (const uchar *) row->data;
2469     const uchar* var_field_offset_ptr = NULL;
2470     const uchar* var_field_data_ptr = NULL;
2471     uint32_t data_end_offset = 0;
2472     memcpy(record, fixed_field_ptr, table_share->null_bytes);
2473     fixed_field_ptr += table_share->null_bytes;
2474 
2475     var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
2476     var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
2477 
2478     //
2479     // unpack the key, if necessary
2480     //
2481     if (!(hidden_primary_key && index == primary_key)) {
2482         unpack_key(record,key,index);
2483     }
2484 
2485     uint32_t last_offset = 0;
2486     //
2487     // we have two methods of unpacking, one if we need to unpack the entire row
2488     // the second if we unpack a subset of the entire row
2489     // first method here is if we unpack the entire row
2490     //
2491     if (unpack_entire_row) {
2492         //
2493         // fill in parts of record that are not part of the key
2494         //
2495         for (uint i = 0; i < table_share->fields; i++) {
2496             Field* field = table->field[i];
2497             if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
2498                 continue;
2499             }
2500 
2501             if (is_fixed_field(&share->kc_info, i)) {
2502                 fixed_field_ptr = unpack_fixed_field(
2503                     record + field_offset(field, table),
2504                     fixed_field_ptr,
2505                     share->kc_info.field_lengths[i]
2506                     );
2507             }
2508             //
2509             // here, we DO modify var_field_data_ptr or var_field_offset_ptr
2510             // as we unpack variable sized fields
2511             //
2512             else if (is_variable_field(&share->kc_info, i)) {
2513                 switch (share->kc_info.num_offset_bytes) {
2514                 case (1):
2515                     data_end_offset = var_field_offset_ptr[0];
2516                     break;
2517                 case (2):
2518                     data_end_offset = uint2korr(var_field_offset_ptr);
2519                     break;
2520                 default:
2521                     assert_unreachable();
2522                 }
2523                 unpack_var_field(
2524                     record + field_offset(field, table),
2525                     var_field_data_ptr,
2526                     data_end_offset - last_offset,
2527                     share->kc_info.length_bytes[i]
2528                     );
2529                 var_field_offset_ptr += share->kc_info.num_offset_bytes;
2530                 var_field_data_ptr += data_end_offset - last_offset;
2531                 last_offset = data_end_offset;
2532             }
2533         }
2534         error = unpack_blobs(
2535             record,
2536             var_field_data_ptr,
2537             row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
2538             false
2539             );
2540         if (error) {
2541             goto exit;
2542         }
2543     }
2544     //
2545     // in this case, we unpack only what is specified
2546     // in fixed_cols_for_query and var_cols_for_query
2547     //
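    // (for fixed fields, cp_info[index][i].col_pack_val is the byte offset
    // of the field within the fixed-field region; for variable fields it is
    // the field's index into the offset array -- see initialize_col_pack_info)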
2548     else {
2549         //
2550         // first the fixed fields
2551         //
2552         for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
2553             uint field_index = fixed_cols_for_query[i];
2554             Field* field = table->field[field_index];
2555             unpack_fixed_field(
2556                 record + field_offset(field, table),
2557                 fixed_field_ptr + share->kc_info.cp_info[index][field_index].col_pack_val,
2558                 share->kc_info.field_lengths[field_index]
2559                 );
2560         }
2561 
2562         //
2563         // now the var fields
2564         // here, we do NOT modify var_field_data_ptr or var_field_offset_ptr
2565         //
2566         for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
2567             uint field_index = var_cols_for_query[i];
2568             Field* field = table->field[field_index];
2569             uint32_t var_field_index = share->kc_info.cp_info[index][field_index].col_pack_val;
2570             uint32_t data_start_offset;
2571             uint32_t field_len;
2572 
2573             get_var_field_info(
2574                 &field_len,
2575                 &data_start_offset,
2576                 var_field_index,
2577                 var_field_offset_ptr,
2578                 share->kc_info.num_offset_bytes
2579                 );
2580 
2581             unpack_var_field(
2582                 record + field_offset(field, table),
2583                 var_field_data_ptr + data_start_offset,
2584                 field_len,
2585                 share->kc_info.length_bytes[field_index]
2586                 );
2587         }
2588 
2589         if (read_blobs) {
2590             //
2591             // now the blobs
2592             //
2593             get_blob_field_info(
2594                 &data_end_offset,
2595                 share->kc_info.mcp_info[index].len_of_offsets,
2596                 var_field_data_ptr,
2597                 share->kc_info.num_offset_bytes
2598                 );
2599 
2600             var_field_data_ptr += data_end_offset;
2601             error = unpack_blobs(
2602                 record,
2603                 var_field_data_ptr,
2604                 row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
2605                 true
2606                 );
2607             if (error) {
2608                 goto exit;
2609             }
2610         }
2611     }
2612     error = 0;
2613 exit:
2614     return error;
2615 }
2616 
2617 uint32_t ha_tokudb::place_key_into_mysql_buff(
2618     KEY* key_info,
2619     uchar* record,
2620     uchar* data) {
2621 
2622     KEY_PART_INFO* key_part = key_info->key_part;
2623     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2624     uchar* pos = data;
2625 
2626     for (; key_part != end; key_part++) {
2627         if (key_part->field->null_bit) {
2628             uint null_offset = get_null_offset(table, key_part->field);
2629             if (*pos++ == NULL_COL_VAL) { // Null value
2630                 //
2631                 // We don't need to reset the record data as we will not access it
2632                 // if the null data is set
2633                 //
2634                 record[null_offset] |= key_part->field->null_bit;
2635                 continue;
2636             }
2637             record[null_offset] &= ~key_part->field->null_bit;
2638         }
2639 #if !defined(MARIADB_BASE_VERSION)
2640         //
2641         // HOPEFULLY TEMPORARY
2642         //
2643         assert_always(table->s->db_low_byte_first);
2644 #endif
2645         pos = unpack_toku_key_field(
2646             record + field_offset(key_part->field, table),
2647             pos,
2648             key_part->field,
2649             key_part->length
2650             );
2651     }
2652     return pos-data;
2653 }
2654 
2655 //
2656 // Store the key and the primary key into the row
2657 // Parameters:
2658 //      [out]   record - key stored in MySQL format
2659 //      [in]    key - key stored in DBT to be converted
2660 //              index - index into key_file that represents the DB
2661 //                  whose key is being unpacked
2662 //
2663 void ha_tokudb::unpack_key(uchar * record, DBT const *key, uint index) {
2664     uint32_t bytes_read;
2665     uchar *pos = (uchar *) key->data + 1;
2666     bytes_read = place_key_into_mysql_buff(
2667         &table->key_info[index],
2668         record,
2669         pos
2670         );
2671     if( (index != primary_key) && !hidden_primary_key) {
2672         //
2673         // also unpack primary key
2674         //
2675         place_key_into_mysql_buff(
2676             &table->key_info[primary_key],
2677             record,
2678             pos+bytes_read
2679             );
2680     }
2681 }
2682 
2683 uint32_t ha_tokudb::place_key_into_dbt_buff(
2684     KEY* key_info,
2685     uchar* buff,
2686     const uchar* record,
2687     bool* has_null,
2688     int key_length) {
2689 
2690     KEY_PART_INFO* key_part = key_info->key_part;
2691     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2692     uchar* curr_buff = buff;
2693     *has_null = false;
2694     for (; key_part != end && key_length > 0; key_part++) {
2695         //
2696         // accessing key_part->field->null_bit instead of key_part->null_bit
2697         // because key_part->null_bit is not set in add_index
2698         // filed ticket 862 to look into this
2699         //
2700         if (key_part->field->null_bit) {
2701             /* Store 0 if the key part is a NULL part */
2702             uint null_offset = get_null_offset(table, key_part->field);
2703             if (record[null_offset] & key_part->field->null_bit) {
2704                 *curr_buff++ = NULL_COL_VAL;
2705                 *has_null = true;
2706                 continue;
2707             }
2708             *curr_buff++ = NONNULL_COL_VAL;        // Store NOT NULL marker
2709         }
2710 #if !defined(MARIADB_BASE_VERSION)
2711         //
2712         // HOPEFULLY TEMPORARY
2713         //
2714         assert_always(table->s->db_low_byte_first);
2715 #endif
2716         //
2717         // accessing field_offset(key_part->field) instead of key_part->offset
2718         // because key_part->offset is SET INCORRECTLY in add_index
2719         // filed ticket 862 to look into this
2720         //
2721         curr_buff = pack_toku_key_field(
2722             curr_buff,
2723             (uchar *) (record + field_offset(key_part->field, table)),
2724             key_part->field,
2725             key_part->length
2726             );
2727         key_length -= key_part->length;
2728     }
2729     return curr_buff - buff;
2730 }
2731 
2732 
2733 
2734 //
2735 // Create a packed key from a row. This key will be written as such
2736 // to the index tree.  This will never fail as the key buffer is pre-allocated.
2737 // Parameters:
2738 //      [out]   key - DBT that holds the key
2739 //      [in]    key_info - holds data about the key, such as its length and offset into the record
2740 //      [out]   buff - buffer that will hold the data for key (unless
2741 //                  we have a hidden primary key)
2742 //      [in]    record - row from which to create the key
2743 //              key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2744 // Returns:
2745 //      the parameter key
2746 //
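// A key packed by this function has the layout:
//   [infinity byte][packed key columns][packed PK columns or hidden PK]
// with each NULLable column preceded by a NULL/NOT-NULL marker byte
// (see place_key_into_dbt_buff).
//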
2747 
2748 DBT* ha_tokudb::create_dbt_key_from_key(
2749     DBT * key,
2750     KEY* key_info,
2751     uchar * buff,
2752     const uchar * record,
2753     bool* has_null,
2754     bool dont_pack_pk,
2755     int key_length,
2756     uint8_t inf_byte
2757     )
2758 {
2759     uint32_t size = 0;
2760     uchar* tmp_buff = buff;
2761     MY_BITMAP *old_map = dbug_tmp_use_all_columns(table, &table->write_set);
2762 
2763     key->data = buff;
2764 
2765     //
2766     // first put the "infinity" byte at beginning. States if missing columns are implicitly
2767     // positive infinity or negative infinity or zero. For this, because we are creating key
2768     // from a row, there is no way that columns can be missing, so in practice,
2769     // this will be meaningless. Might as well put in a value
2770     //
2771     *tmp_buff++ = inf_byte;
2772     size++;
2773     size += place_key_into_dbt_buff(
2774         key_info,
2775         tmp_buff,
2776         record,
2777         has_null,
2778         key_length
2779         );
2780     if (!dont_pack_pk) {
2781         tmp_buff = buff + size;
2782         if (hidden_primary_key) {
2783             memcpy(tmp_buff, current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2784             size += TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2785         }
2786         else {
2787             bool tmp_bool = false;
2788             size += place_key_into_dbt_buff(
2789                 &table->key_info[primary_key],
2790                 tmp_buff,
2791                 record,
2792                 &tmp_bool,
2793                 MAX_KEY_LENGTH //this parameter does not matter
2794                 );
2795         }
2796     }
2797 
2798     key->size = size;
2799     DBUG_DUMP("key", (uchar *) key->data, key->size);
2800     dbug_tmp_restore_column_map(&table->write_set, old_map);
2801     return key;
2802 }
2803 
2804 
2805 //
2806 // Create a packed key from a row. This key will be written as such
2807 // to the index tree.  This will never fail as the key buffer is pre-allocated.
2808 // Parameters:
2809 //      [out]   key - DBT that holds the key
2810 //              keynr - index for which to create the key
2811 //      [out]   buff - buffer that will hold the data for key (unless
2812 //                  we have a hidden primary key)
2813 //      [in]    record - row from which to create the key
2814 //      [out]   has_null - says if the key has a NULL value for one of its columns
2815 //              key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2816 // Returns:
2817 //      the parameter key
2818 //
2819 DBT *ha_tokudb::create_dbt_key_from_table(
2820     DBT * key,
2821     uint keynr,
2822     uchar * buff,
2823     const uchar * record,
2824     bool* has_null,
2825     int key_length
2826     )
2827 {
2828     TOKUDB_HANDLER_DBUG_ENTER("");
2829     memset((void *) key, 0, sizeof(*key));
2830     if (hidden_primary_key && keynr == primary_key) {
2831         key->data = buff;
2832         memcpy(buff, &current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2833         key->size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2834         *has_null = false;
2835         DBUG_RETURN(key);
2836     }
2837     DBUG_RETURN(create_dbt_key_from_key(key, &table->key_info[keynr],buff,record, has_null, (keynr == primary_key), key_length, COL_ZERO));
2838 }
2839 
2840 DBT* ha_tokudb::create_dbt_key_for_lookup(
2841     DBT * key,
2842     KEY* key_info,
2843     uchar * buff,
2844     const uchar * record,
2845     bool* has_null,
2846     int key_length
2847     )
2848 {
2849     TOKUDB_HANDLER_DBUG_ENTER("");
2850     // override the infinity byte, needed in case the pk is a string
2851     // to make sure that the cursor that uses this key properly positions
2852 // it at the right location. If the table stores "D", but we look up "d",
2853     // and the infinity byte is 0, then we will skip the "D", because
2854     // in bytes, "d" > "D".
2855     DBT* ret = create_dbt_key_from_key(key, key_info, buff, record, has_null, true, key_length, COL_NEG_INF);
2856     DBUG_RETURN(ret);
2857 }
2858 
2859 //
2860 // Create a packed key from a MySQL unpacked key (like the one that is
2861 // sent from index_read()). This key is to be used to read a row.
2862 // Parameters:
2863 //      [out]   key - DBT that holds the key
2864 //              keynr - index for which to pack the key
2865 //      [out]   buff - buffer that will hold the data for key
2866 //      [in]    key_ptr - MySQL unpacked key
2867 //              key_length - length of key_ptr
2868 // Returns:
2869 //      the parameter key
2870 //
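// Note on the input format: in a MySQL unpacked key, each nullable key part
// is preceded by a one-byte null indicator and each part occupies
// key_part->store_length bytes, which is why the loop below checks
// key_part->null_bit and advances key_ptr by store_length.
//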
2871 DBT* ha_tokudb::pack_key(
2872     DBT* key,
2873     uint keynr,
2874     uchar* buff,
2875     const uchar* key_ptr,
2876     uint key_length,
2877     int8_t inf_byte) {
2878 
2879     TOKUDB_HANDLER_DBUG_ENTER(
2880         "key %p %u:%2.2x inf=%d",
2881         key_ptr,
2882         key_length,
2883         key_length > 0 ? key_ptr[0] : 0,
2884         inf_byte);
2885 #if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2886     if (keynr != primary_key && !tokudb_test(hidden_primary_key)) {
2887         DBUG_RETURN(pack_ext_key(key, keynr, buff, key_ptr, key_length, inf_byte));
2888     }
2889 #endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2890     KEY* key_info = &table->key_info[keynr];
2891     KEY_PART_INFO* key_part = key_info->key_part;
2892     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2893     MY_BITMAP* old_map = dbug_tmp_use_all_columns(table, &table->write_set);
2894 
2895     memset((void *) key, 0, sizeof(*key));
2896     key->data = buff;
2897 
2898     // first put the "infinity" byte at beginning. States if missing columns are implicitly
2899     // positive infinity or negative infinity
2900     *buff++ = (uchar)inf_byte;
2901 
2902     for (; key_part != end && (int) key_length > 0; key_part++) {
2903         uint offset = 0;
2904         if (key_part->null_bit) {
2905             if (!(*key_ptr == 0)) {
2906                 *buff++ = NULL_COL_VAL;
2907                 key_length -= key_part->store_length;
2908                 key_ptr += key_part->store_length;
2909                 continue;
2910             }
2911             *buff++ = NONNULL_COL_VAL;
2912             offset = 1;         // Data is at key_ptr+1
2913         }
2914 #if !defined(MARIADB_BASE_VERSION)
2915         assert_always(table->s->db_low_byte_first);
2916 #endif
2917         buff = pack_key_toku_key_field(
2918             buff,
2919             (uchar *) key_ptr + offset,
2920             key_part->field,
2921             key_part->length
2922             );
2923 
2924         key_ptr += key_part->store_length;
2925         key_length -= key_part->store_length;
2926     }
2927 
2928     key->size = (buff - (uchar *) key->data);
2929     DBUG_DUMP("key", (uchar *) key->data, key->size);
2930     dbug_tmp_restore_column_map(&table->write_set, old_map);
2931     DBUG_RETURN(key);
2932 }
2933 
2934 #if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2935 DBT* ha_tokudb::pack_ext_key(
2936     DBT* key,
2937     uint keynr,
2938     uchar* buff,
2939     const uchar* key_ptr,
2940     uint key_length,
2941     int8_t inf_byte) {
2942 
2943     TOKUDB_HANDLER_DBUG_ENTER("");
2944 
2945     // build a list of PK parts that are in the SK.  we will use this list to build the
2946     // extended key if necessary.
2947     KEY* pk_key_info = &table->key_info[primary_key];
2948     uint pk_parts = pk_key_info->user_defined_key_parts;
2949     uint pk_next = 0;
2950     struct {
2951         const uchar *key_ptr;
2952         KEY_PART_INFO *key_part;
2953     } pk_info[pk_parts];
2954 
2955     KEY* key_info = &table->key_info[keynr];
2956     KEY_PART_INFO* key_part = key_info->key_part;
2957     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2958     MY_BITMAP* old_map = dbug_tmp_use_all_columns(table, &table->write_set);
2959 
2960     memset((void *) key, 0, sizeof(*key));
2961     key->data = buff;
2962 
2963     // first put the "infinity" byte at beginning. States if missing columns are implicitly
2964     // positive infinity or negative infinity
2965     *buff++ = (uchar)inf_byte;
2966 
2967     for (; key_part != end && (int) key_length > 0; key_part++) {
2968         // if the SK part is part of the PK, then append it to the list.
2969         if (key_part->field->part_of_key.is_set(primary_key)) {
2970             assert_always(pk_next < pk_parts);
2971             pk_info[pk_next].key_ptr = key_ptr;
2972             pk_info[pk_next].key_part = key_part;
2973             pk_next++;
2974         }
2975         uint offset = 0;
2976         if (key_part->null_bit) {
2977             if (!(*key_ptr == 0)) {
2978                 *buff++ = NULL_COL_VAL;
2979                 key_length -= key_part->store_length;
2980                 key_ptr += key_part->store_length;
2981                 continue;
2982             }
2983             *buff++ = NONNULL_COL_VAL;
2984             offset = 1;         // Data is at key_ptr+1
2985         }
2986 #if !defined(MARIADB_BASE_VERSION)
2987         assert_always(table->s->db_low_byte_first);
2988 #endif
2989         buff = pack_key_toku_key_field(
2990             buff,
2991             (uchar *) key_ptr + offset,
2992             key_part->field,
2993             key_part->length
2994             );
2995 
2996         key_ptr += key_part->store_length;
2997         key_length -= key_part->store_length;
2998     }
2999 
3000     if (key_length > 0) {
3001         assert_always(key_part == end);
3002         end = key_info->key_part + get_ext_key_parts(key_info);
3003 
3004         // pack PK in order of PK key parts
3005         for (uint pk_index = 0;
3006              key_part != end && (int) key_length > 0 && pk_index < pk_parts;
3007              pk_index++) {
3008             uint i;
3009             for (i = 0; i < pk_next; i++) {
3010                 if (pk_info[i].key_part->fieldnr ==
3011                     pk_key_info->key_part[pk_index].fieldnr)
3012                     break;
3013             }
3014             if (i < pk_next) {
3015                 const uchar *this_key_ptr = pk_info[i].key_ptr;
3016                 KEY_PART_INFO *this_key_part = pk_info[i].key_part;
3017                 buff = pack_key_toku_key_field(
3018                     buff,
3019                     (uchar*)this_key_ptr,
3020                     this_key_part->field,
3021                     this_key_part->length);
3022             } else {
3023                 buff = pack_key_toku_key_field(
3024                     buff,
3025                     (uchar*)key_ptr,
3026                     key_part->field,
3027                     key_part->length);
3028                 key_ptr += key_part->store_length;
3029                 key_length -= key_part->store_length;
3030                 key_part++;
3031             }
3032         }
3033     }
3034 
3035     key->size = (buff - (uchar *) key->data);
3036     DBUG_DUMP("key", (uchar *) key->data, key->size);
3037     dbug_tmp_restore_column_map(&table->write_set, old_map);
3038     DBUG_RETURN(key);
3039 }
3040 #endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
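//
// Illustrative sketch of the packed key layout produced by the function
// above (the column names and types are hypothetical, not from this file):
// for a nullable secondary key on (b VARCHAR) with primary key (a INT),
// a search key is packed roughly as
//
//     [inf byte][null indicator for b][packed b ...][packed a ...]
//
// where the inf byte is COL_NEG_INF/COL_POS_INF for missing suffix columns,
// NULL_COL_VAL means the field bytes for b are omitted entirely, and the
// pk_info[] bookkeeping above ensures PK columns already present in the SK
// are not packed a second time when the PK suffix is appended.
//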
3041 
3042 //
3043 // get max used hidden primary key value
3044 //
3045 void ha_tokudb::init_hidden_prim_key_info(DB_TXN *txn) {
3046     TOKUDB_HANDLER_DBUG_ENTER("");
3047     if (!(share->status & STATUS_PRIMARY_KEY_INIT)) {
3048         int error = 0;
3049         DBC* c = NULL;
3050         error = share->key_file[primary_key]->cursor(
3051             share->key_file[primary_key],
3052             txn,
3053             &c,
3054             0);
3055         assert_always(error == 0);
3056         DBT key,val;
3057         memset(&key, 0, sizeof(key));
3058         memset(&val, 0, sizeof(val));
3059         error = c->c_get(c, &key, &val, DB_LAST);
3060         if (error == 0) {
3061             assert_always(key.size == TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
3062             share->auto_ident = hpk_char_to_num((uchar *)key.data);
3063         }
3064         error = c->c_close(c);
3065         assert_always(error == 0);
3066         share->status |= STATUS_PRIMARY_KEY_INIT;
3067     }
3068     TOKUDB_HANDLER_DBUG_VOID_RETURN;
3069 }
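//
// Note (assumed from the hpk helpers used here): the hidden primary key is
// a fixed-width, memcmp-ordered counter of TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
// bytes, which is why a single DB_LAST cursor probe suffices to recover the
// maximum value handed out so far, e.g.:
//
//     uchar hpk[TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH];
//     hpk_num_to_char(hpk, share->auto_ident + 1);  // next hidden pk value
//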
3070 
3071 
3072 
3073 /** @brief
3074     Get metadata info stored in status.tokudb
3075     */
3076 int ha_tokudb::get_status(DB_TXN* txn) {
3077     TOKUDB_HANDLER_DBUG_ENTER("");
3078     DBT key, value;
3079     HA_METADATA_KEY curr_key;
3080     int error;
3081 
3082     //
3083     // open status.tokudb
3084     //
3085     if (!share->status_block) {
3086         error =
3087             open_status_dictionary(
3088                 &share->status_block,
3089                 share->full_table_name(),
3090                 txn);
3091         if (error) {
3092             goto cleanup;
3093         }
3094     }
3095 
3096     //
3097     // set up the key/value DBTs used to read metadata from status.tokudb
3098     //
3099     memset(&key, 0, sizeof(key));
3100     memset(&value, 0, sizeof(value));
3101     key.data = &curr_key;
3102     key.size = sizeof(curr_key);
3103     value.flags = DB_DBT_USERMEM;
3104 
3105     assert_always(share->status_block);
3106     //
3107     // get version
3108     //
3109     value.ulen = sizeof(share->version);
3110     value.data = &share->version;
3111     curr_key = hatoku_new_version;
3112     error = share->status_block->get(
3113         share->status_block,
3114         txn,
3115         &key,
3116         &value,
3117         0
3118         );
3119     if (error == DB_NOTFOUND) {
3120         //
3121         // hack to handle the issue of going back and forth
3122         // between 5.0.3 and 5.0.4.
3123         // The problem with going back and forth
3124         // is with storing the frm file: 5.0.4 stores it, 5.0.3 does not,
3125         // so if a user goes back and forth and alters the schema,
3126         // the frm stored can get out of sync with the schema of the table.
3127         // This can cause issues.
3128         // To take care of this, we do this versioning work here.
3129         // We change the key that stores the version:
3130         // in 5.0.3 it is hatoku_old_version, in 5.0.4 it is hatoku_new_version.
3131         // When we encounter a table that does not have hatoku_new_version
3132         // set, we give it the right one and overwrite the old one with zero,
3133         // which ensures that 5.0.3 cannot open the table once 5.0.4 has opened it.
3134         //
3135         uint dummy_version = 0;
3136         share->version = HA_TOKU_ORIG_VERSION;
3137         error = write_to_status(
3138             share->status_block,
3139             hatoku_new_version,
3140             &share->version,
3141             sizeof(share->version),
3142             txn
3143             );
3144         if (error) { goto cleanup; }
3145         error = write_to_status(
3146             share->status_block,
3147             hatoku_old_version,
3148             &dummy_version,
3149             sizeof(dummy_version),
3150             txn
3151             );
3152         if (error) { goto cleanup; }
3153     }
3154     else if (error || value.size != sizeof(share->version)) {
3155         if (error == 0) {
3156             error = HA_ERR_INTERNAL_ERROR;
3157         }
3158         goto cleanup;
3159     }
3160     //
3161     // get capabilities
3162     //
3163     curr_key = hatoku_capabilities;
3164     value.ulen = sizeof(share->capabilities);
3165     value.data = &share->capabilities;
3166     error = share->status_block->get(
3167         share->status_block,
3168         txn,
3169         &key,
3170         &value,
3171         0
3172         );
3173     if (error == DB_NOTFOUND) {
3174         share->capabilities= 0;
3175     }
3176     else if (error || value.size != sizeof(share->capabilities)) {
3177         if (error == 0) {
3178             error = HA_ERR_INTERNAL_ERROR;
3179         }
3180         goto cleanup;
3181     }
3182 
3183     error = 0;
3184 cleanup:
3185     TOKUDB_HANDLER_DBUG_RETURN(error);
3186 }
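//
// A minimal sketch of the status-dictionary read pattern used above,
// assuming a fixed-size value (status_db/value_buf are hypothetical names):
//
//     HA_METADATA_KEY curr_key = hatoku_new_version;
//     uint value_buf = 0;
//     DBT key, value;
//     memset(&key, 0, sizeof(key));
//     memset(&value, 0, sizeof(value));
//     key.data = &curr_key;           // the metadata enum itself is the key
//     key.size = sizeof(curr_key);
//     value.data = &value_buf;        // DB_DBT_USERMEM: read into our buffer
//     value.ulen = sizeof(value_buf);
//     value.flags = DB_DBT_USERMEM;
//     int r = status_db->get(status_db, txn, &key, &value, 0);
//     // r == DB_NOTFOUND means this metadata entry was never written
//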
3187 
3188 /** @brief
3189     Return an estimate of the number of rows in the table.
3190     Used when sorting to allocate buffers and by the optimizer.
3191     This is used in filesort.cc.
3192 */
3193 ha_rows ha_tokudb::estimate_rows_upper_bound() {
3194     TOKUDB_HANDLER_DBUG_ENTER("");
3195     DBUG_RETURN(share->row_count() + HA_TOKUDB_EXTRA_ROWS);
3196 }
3197 
3198 //
3199 // Function that compares two primary keys that were saved as part of rnd_pos
3200 // and ::position
3201 //
3202 int ha_tokudb::cmp_ref(const uchar * ref1, const uchar * ref2) {
3203     int ret_val = 0;
3204     bool read_string = false;
3205     ret_val = tokudb_compare_two_keys(
3206         ref1 + sizeof(uint32_t),
3207         *(uint32_t *)ref1,
3208         ref2 + sizeof(uint32_t),
3209         *(uint32_t *)ref2,
3210         (uchar *)share->file->descriptor->dbt.data + 4,
3211         *(uint32_t *)share->file->descriptor->dbt.data - 4,
3212         false,
3213         &read_string
3214         );
3215     return ret_val;
3216 }
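//
// Layout assumed by cmp_ref: each ref saved by ::position is a 4-byte
// native-endian length followed by the packed primary key, and the first
// 4 bytes of the descriptor DBT hold the descriptor's total size:
//
//     ref:        [uint32_t key size][packed pk bytes ...]
//     descriptor: [uint32_t size][key comparison metadata ...]
//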
3217 
3218 bool ha_tokudb::check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) {
3219   //
3220   // This is a horrendous hack for now, copied from InnoDB.
3221   // It states that if the auto increment value has been changed,
3222   // via an "alter table foo auto_increment=new_val", then this
3223   // change is incompatible and the entire table must be rebuilt.
3224   // This will need to be fixed.
3225   //
3226   if ((info->used_fields & HA_CREATE_USED_AUTO) &&
3227       info->auto_increment_value != 0) {
3228 
3229     return COMPATIBLE_DATA_NO;
3230   }
3231   if (table_changes != IS_EQUAL_YES)
3232     return COMPATIBLE_DATA_NO;
3233   return COMPATIBLE_DATA_YES;
3234 }
3235 
3236 //
3237 // Method that is called before the beginning of many calls
3238 // to insert rows (ha_tokudb::write_row). There is no guarantee
3239 // that start_bulk_insert is called, however there is a guarantee
3240 // that if start_bulk_insert is called, then end_bulk_insert will be
3241 // called as well.
3242 // Parameters:
3243 //      [in]    rows - an estimate of the number of rows that will be inserted
3244 //                     if number of rows is unknown (such as if doing
3245 //                     "insert into foo select * from bar), then rows
3246 //                     will be 0
3247 //
3248 //
3249 // This function returns true if the table MAY be empty.
3250 // It is NOT meant to be a 100% check for emptiness.
3251 // This is used for a bulk load optimization.
3252 //
3253 bool ha_tokudb::may_table_be_empty(DB_TXN *txn) {
3254     int error;
3255     bool ret_val = false;
3256     DBC* tmp_cursor = NULL;
3257     DB_TXN* tmp_txn = NULL;
3258 
3259     const int empty_scan = tokudb::sysvars::empty_scan(ha_thd());
3260     if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_DISABLED)
3261         goto cleanup;
3262 
3263     if (txn == NULL) {
3264         error = txn_begin(db_env, 0, &tmp_txn, 0, ha_thd());
3265         if (error) {
3266             goto cleanup;
3267         }
3268         txn = tmp_txn;
3269     }
3270 
3271     error = share->file->cursor(share->file, txn, &tmp_cursor, 0);
3272     if (error)
3273         goto cleanup;
3274     tmp_cursor->c_set_check_interrupt_callback(tmp_cursor, tokudb_killed_thd_callback, ha_thd());
3275     if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_LR)
3276         error = tmp_cursor->c_getf_next(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3277     else
3278         error = tmp_cursor->c_getf_prev(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3279     error = map_to_handler_error(error);
3280     if (error == DB_NOTFOUND)
3281         ret_val = true;
3282     else
3283         ret_val = false;
3284     error = 0;
3285 
3286 cleanup:
3287     if (tmp_cursor) {
3288         int r = tmp_cursor->c_close(tmp_cursor);
3289         assert_always(r == 0);
3290         tmp_cursor = NULL;
3291     }
3292     if (tmp_txn) {
3293         commit_txn(tmp_txn, 0);
3294         tmp_txn = NULL;
3295     }
3296     return ret_val;
3297 }
3298 
3299 #if MYSQL_VERSION_ID >= 100000
3300 void ha_tokudb::start_bulk_insert(ha_rows rows, uint flags) {
3301     TOKUDB_HANDLER_DBUG_ENTER("%llu %u txn %p", (unsigned long long) rows, flags, transaction);
3302 #else
3303 void ha_tokudb::start_bulk_insert(ha_rows rows) {
3304     TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction);
3305 #endif
3306     THD* thd = ha_thd();
3307     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3308     delay_updating_ai_metadata = true;
3309     ai_metadata_update_required = false;
3310     abort_loader = false;
3311 
3312     rwlock_t_lock_read(share->_num_DBs_lock);
3313     uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3314     num_DBs_locked_in_bulk = true;
3315     lock_count = 0;
3316 
3317     if ((rows == 0 || rows > 1) && share->try_table_lock) {
3318         if (tokudb::sysvars::prelock_empty(thd) &&
3319             may_table_be_empty(transaction) &&
3320             transaction != NULL) {
3321             if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR
3322                 || table->s->next_number_key_offset) {
3323                 acquire_table_lock(transaction, lock_write);
3324             } else {
3325                 mult_dbt_flags[primary_key] = 0;
3326                 if (!thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) && !hidden_primary_key) {
3327                     mult_put_flags[primary_key] = DB_NOOVERWRITE;
3328                 }
3329                 uint32_t loader_flags = (tokudb::sysvars::load_save_space(thd)) ?
3330                     LOADER_COMPRESS_INTERMEDIATES : 0;
3331 
3332                 int error = db_env->create_loader(
3333                     db_env,
3334                     transaction,
3335                     &loader,
3336                     NULL, // no src_db needed
3337                     curr_num_DBs,
3338                     share->key_file,
3339                     mult_put_flags,
3340                     mult_dbt_flags,
3341                     loader_flags
3342                     );
3343                 if (error) {
3344                     assert_always(loader == NULL);
3345                     goto exit_try_table_lock;
3346                 }
3347 
3348                 lc.thd = thd;
3349                 lc.ha = this;
3350 
3351                 error = loader->set_poll_function(
3352                     loader, ha_tokudb::bulk_insert_poll, &lc);
3353                 assert_always(!error);
3354 
3355                 error = loader->set_error_callback(
3356                     loader, ha_tokudb::loader_dup, &lc);
3357                 assert_always(!error);
3358 
3359                 trx->stmt_progress.using_loader = true;
3360             }
3361         }
3362     exit_try_table_lock:
3363         share->lock();
3364         share->try_table_lock = false;
3365         share->unlock();
3366     }
3367     TOKUDB_HANDLER_DBUG_VOID_RETURN;
3368 }
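//
// Condensed sketch of the loader lifecycle wired up above (error handling
// elided; see write_row and end_bulk_insert for the real put/close flow):
//
//     DB_LOADER* loader;
//     db_env->create_loader(db_env, txn, &loader, NULL /* no src_db */,
//                           num_dbs, dbs, put_flags, dbt_flags, loader_flags);
//     loader->set_poll_function(loader, ha_tokudb::bulk_insert_poll, &lc);
//     loader->set_error_callback(loader, ha_tokudb::loader_dup, &lc);
//     // per row: loader->put(loader, &pk_key, &pk_val);
//     loader->close(loader);          // or loader->abort(loader) on failure
//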
3369 int ha_tokudb::bulk_insert_poll(void* extra, float progress) {
3370     LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
3371     if (thd_killed(context->thd)) {
3372         snprintf(context->write_status_msg,
3373                  sizeof(context->write_status_msg),
3374                  "The process has been killed, aborting bulk load.");
3375         return ER_ABORTING_CONNECTION;
3376     }
3377     float percentage = progress * 100;
3378     snprintf(context->write_status_msg,
3379              sizeof(context->write_status_msg),
3380              "Loading of data t %s about %.1f%% done",
3381              context->ha->share->full_table_name(),
3382              percentage);
3383     thd_proc_info(context->thd, context->write_status_msg);
3384 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
3385     thd_progress_report(context->thd, (unsigned long long)percentage, 100);
3386 #endif
3387     return 0;
3388 }
3389 void ha_tokudb::loader_add_index_err(TOKUDB_UNUSED(DB* db),
3390                                      TOKUDB_UNUSED(int i),
3391                                      TOKUDB_UNUSED(int err),
3392                                      TOKUDB_UNUSED(DBT* key),
3393                                      TOKUDB_UNUSED(DBT* val),
3394                                      void* error_extra) {
3395     LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3396     assert_always(context->ha);
3397     context->ha->set_loader_error(err);
3398 }
3399 void ha_tokudb::loader_dup(TOKUDB_UNUSED(DB* db),
3400                            TOKUDB_UNUSED(int i),
3401                            int err,
3402                            DBT* key,
3403                            TOKUDB_UNUSED(DBT* val),
3404                            void* error_extra) {
3405     LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3406     assert_always(context->ha);
3407     context->ha->set_loader_error(err);
3408     if (err == DB_KEYEXIST) {
3409         context->ha->set_dup_value_for_pk(key);
3410     }
3411 }
3412 
3413 //
3414 // Method that is called at the end of many calls to insert rows
3415 // (ha_tokudb::write_row). If start_bulk_insert is called, then
3416 // this is guaranteed to be called.
3417 //
3418 int ha_tokudb::end_bulk_insert(TOKUDB_UNUSED(bool abort)) {
3419     TOKUDB_HANDLER_DBUG_ENTER("");
3420     int error = 0;
3421     THD* thd = ha_thd();
3422     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3423     bool using_loader = (loader != NULL);
3424     if (ai_metadata_update_required) {
3425         share->lock();
3426         error = update_max_auto_inc(share->status_block, share->last_auto_increment);
3427         share->unlock();
3428         if (error) { goto cleanup; }
3429     }
3430     delay_updating_ai_metadata = false;
3431     ai_metadata_update_required = false;
3432     loader_error = 0;
3433     if (loader) {
3434         if (!abort_loader && !thd_kill_level(thd)) {
3435             DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", {
3436                 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3437                 thd_proc_info(thd, "DBUG sleep");
3438                 my_sleep(20000000);
3439                 thd_proc_info(thd, orig_proc_info);
3440             });
3441             error = loader->close(loader);
3442             loader = NULL;
3443             if (error) {
3444                 if (thd_kill_level(thd)) {
3445                     my_error(ER_QUERY_INTERRUPTED, MYF(0));
3446                 }
3447                 goto cleanup;
3448             }
3449 
3450             for (uint i = 0; i < table_share->keys; i++) {
3451                 if (table_share->key_info[i].flags & HA_NOSAME) {
3452                     bool is_unique;
3453                     if (i == primary_key && !share->pk_has_string) {
3454                         continue;
3455                     }
3456                     error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i],
3457                                             DB_PRELOCKED_WRITE);
3458                     if (error) goto cleanup;
3459                     if (!is_unique) {
3460                         error = HA_ERR_FOUND_DUPP_KEY;
3461                         last_dup_key = i;
3462                         goto cleanup;
3463                     }
3464                 }
3465             }
3466         }
3467         else {
3468             error = sprintf(write_status_msg, "aborting bulk load");
3469             thd_proc_info(thd, write_status_msg);
3470             loader->abort(loader);
3471             loader = NULL;
3472             share->try_table_lock = true;
3473         }
3474     }
3475 
3476 cleanup:
3477     if (num_DBs_locked_in_bulk) {
3478         share->_num_DBs_lock.unlock();
3479     }
3480     num_DBs_locked_in_bulk = false;
3481     lock_count = 0;
3482     if (loader) {
3483         error = sprintf(write_status_msg, "aborting bulk load");
3484         thd_proc_info(thd, write_status_msg);
3485         loader->abort(loader);
3486         loader = NULL;
3487     }
3488     abort_loader = false;
3489     memset(&lc, 0, sizeof(lc));
3490     if (error || loader_error) {
3491         my_errno = error ? error : loader_error;
3492         if (using_loader) {
3493             share->try_table_lock = true;
3494         }
3495     }
3496     trx->stmt_progress.using_loader = false;
3497     thd_proc_info(thd, 0);
3498     TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
3499 }
3500 
3501 int ha_tokudb::end_bulk_insert() {
3502     return end_bulk_insert( false );
3503 }
3504 
3505 int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) {
3506     int error;
3507     DBC* tmp_cursor1 = NULL;
3508     DBC* tmp_cursor2 = NULL;
3509     DBT key1, key2, val, packed_key1, packed_key2;
3510     uint64_t cnt = 0;
3511     char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound.
3512     THD* thd = ha_thd();
3513     const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3514     memset(&key1, 0, sizeof(key1));
3515     memset(&key2, 0, sizeof(key2));
3516     memset(&val, 0, sizeof(val));
3517     memset(&packed_key1, 0, sizeof(packed_key1));
3518     memset(&packed_key2, 0, sizeof(packed_key2));
3519     *is_unique = true;
3520 
3521     error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE);
3522     if (error) { goto cleanup; }
3523 
3524     error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE);
3525     if (error) { goto cleanup; }
3526 
3527     error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
3528     if (error == DB_NOTFOUND) {
3529         *is_unique = true;
3530         error = 0;
3531         goto cleanup;
3532     }
3533     else if (error) { goto cleanup; }
3534     error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3535     if (error) { goto cleanup; }
3536 
3537     error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3538     if (error == DB_NOTFOUND) {
3539         *is_unique = true;
3540         error = 0;
3541         goto cleanup;
3542     }
3543     else if (error) { goto cleanup; }
3544 
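    // At this point tmp_cursor1 is on row N and tmp_cursor2 is on row N+1.
    // Each iteration of the loop below compares these two adjacent keys
    // (rows containing NULLs never violate uniqueness and are skipped) and
    // advances both cursors, so one forward pass finds any duplicate pair.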
3545     while (error != DB_NOTFOUND) {
3546         bool has_null1;
3547         bool has_null2;
3548         int cmp;
3549         place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1);
3550         place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1);
3551 
3552         create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1);
3553         create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2);
3554 
3555         if (!has_null1 && !has_null2) {
3556             cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2);
3557             if (cmp == 0) {
3558                 memcpy(key_buff, key1.data, key1.size);
3559                 place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1);
3560                 *is_unique = false;
3561                 break;
3562             }
3563         }
3564 
3565         error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
3566         if (error) { goto cleanup; }
3567         error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3568         if (error && (error != DB_NOTFOUND)) { goto cleanup; }
3569 
3570         cnt++;
3571         if ((cnt % 10000) == 0) {
3572             sprintf(
3573                 status_msg,
3574                 "Verifying index uniqueness: Checked %llu of %llu rows in key-%s.",
3575                 (long long unsigned) cnt,
3576                 share->row_count(),
3577                 key_info->name.str);
3578             thd_proc_info(thd, status_msg);
3579             if (thd_kill_level(thd)) {
3580                 my_error(ER_QUERY_INTERRUPTED, MYF(0));
3581                 error = ER_QUERY_INTERRUPTED;
3582                 goto cleanup;
3583             }
3584         }
3585     }
3586 
3587     error = 0;
3588 
3589 cleanup:
3590     thd_proc_info(thd, orig_proc_info);
3591     if (tmp_cursor1) {
3592         tmp_cursor1->c_close(tmp_cursor1);
3593         tmp_cursor1 = NULL;
3594     }
3595     if (tmp_cursor2) {
3596         tmp_cursor2->c_close(tmp_cursor2);
3597         tmp_cursor2 = NULL;
3598     }
3599     return error;
3600 }
3601 
3602 int ha_tokudb::is_val_unique(bool* is_unique, const uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn) {
3603     int error = 0;
3604     bool has_null;
3605     DBC* tmp_cursor = NULL;
3606 
3607     DBT key; memset((void *)&key, 0, sizeof(key));
3608     create_dbt_key_from_key(&key, key_info, key_buff2, record, &has_null, true, MAX_KEY_LENGTH, COL_NEG_INF);
3609     if (has_null) {
3610         error = 0;
3611         *is_unique = true;
3612         goto cleanup;
3613     }
3614 
3615     error = share->key_file[dict_index]->cursor(share->key_file[dict_index], txn, &tmp_cursor, DB_SERIALIZABLE | DB_RMW);
3616     if (error) {
3617         goto cleanup;
3618     } else {
3619         // prelock (key,-inf),(key,+inf) so that the subsequent key lookup does not overlock
3620         uint flags = 0;
3621         DBT key_right; memset(&key_right, 0, sizeof key_right);
3622         create_dbt_key_from_key(&key_right, key_info, key_buff3, record, &has_null, true, MAX_KEY_LENGTH, COL_POS_INF);
3623         error = tmp_cursor->c_set_bounds(tmp_cursor, &key, &key_right, true, DB_NOTFOUND);
3624         if (error == 0) {
3625             flags = DB_PRELOCKED | DB_PRELOCKED_WRITE;
3626         }
3627 
3628         // lookup key and check unique prefix
3629         struct smart_dbt_info info;
3630         info.ha = this;
3631         info.buf = NULL;
3632         info.keynr = dict_index;
3633 
3634         struct index_read_info ir_info;
3635         ir_info.orig_key = &key;
3636         ir_info.smart_dbt_info = info;
3637 
3638         error = tmp_cursor->c_getf_set_range(tmp_cursor, flags, &key, smart_dbt_callback_lookup, &ir_info);
3639         if (error == DB_NOTFOUND) {
3640             *is_unique = true;
3641             error = 0;
3642             goto cleanup;
3643         }
3644         else if (error) {
3645             error = map_to_handler_error(error);
3646             goto cleanup;
3647         }
3648         if (ir_info.cmp) {
3649             *is_unique = true;
3650         }
3651         else {
3652             *is_unique = false;
3653         }
3654     }
3655     error = 0;
3656 
3657 cleanup:
3658     if (tmp_cursor) {
3659         int r = tmp_cursor->c_close(tmp_cursor);
3660         assert_always(r==0);
3661         tmp_cursor = NULL;
3662     }
3663     return error;
3664 }
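//
// Design note on the c_set_bounds call above: by prelocking the range
// [(key,-inf), (key,+inf)] up front, the subsequent point lookup can pass
// DB_PRELOCKED | DB_PRELOCKED_WRITE and avoid acquiring a broader lock than
// the single key range it actually needs.
//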
3665 
3666 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3667 static void maybe_do_unique_checks_delay_fn(THD *thd) {
3668     if (thd->slave_thread) {
3669         uint64_t delay_ms = tokudb::sysvars::rpl_unique_checks_delay(thd);
3670         if (delay_ms)
3671             usleep(delay_ms * 1000);
3672     }
3673 }
3674 
3675 #define maybe_do_unique_checks_delay(__thd) \
3676     (maybe_do_unique_checks_delay_fn(__thd))
3677 
3678 #define maybe_do_unique_checks_delay_if_flags_set( \
3679     __thd, __flags_set, __flags_check)             \
3680     { if (((__flags_set) & DB_OPFLAGS_MASK) ==     \
3681          (__flags_check)) maybe_do_unique_checks_delay_fn(__thd); }
3682 
3683 static bool need_read_only(THD *thd) {
3684     return opt_readonly || !tokudb::sysvars::rpl_check_readonly(thd);
3685 }
3686 
3687 static bool do_unique_checks_fn(THD *thd, bool do_rpl_event) {
3688     if (do_rpl_event &&
3689         thd->slave_thread &&
3690         need_read_only(thd) &&
3691         !tokudb::sysvars::rpl_unique_checks(thd)) {
3692         return false;
3693     } else {
3694         return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
3695     }
3696 }
3697 
3698 #define do_unique_checks(__thd, __flags) \
3699     (do_unique_checks_fn(__thd, __flags))
3700 
3701 #else
3702 
3703 #define maybe_do_unique_checks_delay(__thd) ((void)0)
3704 
3705 #define maybe_do_unique_checks_delay_if_flags_set( \
3706     __thd, __flags_set, __flags_check)             \
3707     ((void)0)
3708 
3709 static bool do_unique_checks_fn(THD *thd) {
3710     return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
3711 }
3712 
3713 #define do_unique_checks(__thd, _flags) \
3714     (do_unique_checks_fn(__thd))
3715 
3716 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3717 
3718 int ha_tokudb::do_uniqueness_checks(const uchar* record, DB_TXN* txn, THD* thd) {
3719     int error = 0;
3720     //
3721     // first do uniqueness checks
3722     //
3723     if (share->has_unique_keys && do_unique_checks(thd, in_rpl_write_rows)) {
3724         DBUG_EXECUTE_IF("tokudb_crash_if_rpl_does_uniqueness_check",
3725                         DBUG_ASSERT(0););
3726         for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3727             bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
3728             bool is_unique = false;
3729             //
3730             // no need to check a primary key that has no string columns
3731             //
3732             if (keynr == primary_key && !share->pk_has_string) {
3733                 continue;
3734             }
3735             if (!is_unique_key) {
3736                 continue;
3737             }
3738 
3739             maybe_do_unique_checks_delay(thd);
3740 
3741             //
3742             // if unique key, check uniqueness constraint
3743             // but, we do not need to check it if the key has a null
3744             // and we do not need to check it if unique_checks is off
3745             //
3746             error = is_val_unique(&is_unique, record, &table->key_info[keynr], keynr, txn);
3747             if (error) {
3748                 goto cleanup;
3749             }
3750             if (!is_unique) {
3751                 error = DB_KEYEXIST;
3752                 last_dup_key = keynr;
3753                 goto cleanup;
3754             }
3755         }
3756     }
3757 cleanup:
3758     return error;
3759 }
3760 
3761 void ha_tokudb::test_row_packing(const uchar* record, DBT* pk_key, DBT* pk_val) {
3762     int error;
3763     DBT row, key;
3764     //
3765     // variables for testing key packing, only used in some debug modes
3766     //
3767     uchar* tmp_pk_key_data = NULL;
3768     uchar* tmp_pk_val_data = NULL;
3769     DBT tmp_pk_key;
3770     DBT tmp_pk_val;
3771     bool has_null;
3772     int cmp;
3773 
3774     memset(&tmp_pk_key, 0, sizeof(DBT));
3775     memset(&tmp_pk_val, 0, sizeof(DBT));
3776 
3777     //
3778     //use for testing the packing of keys
3779     //
3780     tmp_pk_key_data = (uchar*)tokudb::memory::malloc(pk_key->size, MYF(MY_WME));
3781     assert_always(tmp_pk_key_data);
3782     tmp_pk_val_data = (uchar*)tokudb::memory::malloc(pk_val->size, MYF(MY_WME));
3783     assert_always(tmp_pk_val_data);
3784     memcpy(tmp_pk_key_data, pk_key->data, pk_key->size);
3785     memcpy(tmp_pk_val_data, pk_val->data, pk_val->size);
3786     tmp_pk_key.data = tmp_pk_key_data;
3787     tmp_pk_key.size = pk_key->size;
3788     tmp_pk_val.data = tmp_pk_val_data;
3789     tmp_pk_val.size = pk_val->size;
3790 
3791     for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3792         uint32_t tmp_num_bytes = 0;
3793         uchar* row_desc = NULL;
3794         uint32_t desc_size = 0;
3795 
3796         if (keynr == primary_key) {
3797             continue;
3798         }
3799 
3800         create_dbt_key_from_table(&key, keynr, key_buff2, record, &has_null);
3801 
3802         //
3803         // TEST
3804         //
3805         row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
3806         row_desc += (*(uint32_t *)row_desc);
3807         desc_size = (*(uint32_t *)row_desc) - 4;
3808         row_desc += 4;
3809         tmp_num_bytes = pack_key_from_desc(
3810             key_buff3,
3811             row_desc,
3812             desc_size,
3813             &tmp_pk_key,
3814             &tmp_pk_val
3815             );
3816         assert_always(tmp_num_bytes == key.size);
3817         cmp = memcmp(key_buff3,key_buff2,tmp_num_bytes);
3818         assert_always(cmp == 0);
3819 
3820         //
3821         // test key packing of clustering keys
3822         //
3823         if (key_is_clustering(&table->key_info[keynr])) {
3824             error = pack_row(&row, (const uchar *) record, keynr);
3825             assert_always(error == 0);
3826             uchar* tmp_buff = NULL;
3827             tmp_buff = (uchar*)tokudb::memory::malloc(
3828                 alloced_rec_buff_length,
3829                 MYF(MY_WME));
3830             assert_always(tmp_buff);
3831             row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
3832             row_desc += (*(uint32_t *)row_desc);
3833             row_desc += (*(uint32_t *)row_desc);
3834             desc_size = (*(uint32_t *)row_desc) - 4;
3835             row_desc += 4;
3836             tmp_num_bytes = pack_clustering_val_from_desc(
3837                 tmp_buff,
3838                 row_desc,
3839                 desc_size,
3840                 &tmp_pk_val
3841                 );
3842             assert_always(tmp_num_bytes == row.size);
3843             cmp = memcmp(tmp_buff,rec_buff,tmp_num_bytes);
3844             assert_always(cmp == 0);
3845             tokudb::memory::free(tmp_buff);
3846         }
3847     }
3848 
3849     //
3850     // copy stuff back out
3851     //
3852     error = pack_row(pk_val, (const uchar *) record, primary_key);
3853     assert_always(pk_val->size == tmp_pk_val.size);
3854     cmp = memcmp(pk_val->data, tmp_pk_val_data, pk_val->size);
3855     assert_always( cmp == 0);
3856 
3857     tokudb::memory::free(tmp_pk_key_data);
3858     tokudb::memory::free(tmp_pk_val_data);
3859 }
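//
// Descriptor layout assumed by the pointer arithmetic above, where each
// section begins with a uint32_t size that counts itself:
//
//     dbt.data: [size0][pk comparison descriptor ...]   <- skipped once
//               [size1][sk key-packing descriptor ...]  <- pack_key_from_desc
//               [size2][clustering val descriptor ...]  <- pack_clustering_val_from_desc
//
// hence desc_size = *(uint32_t*)row_desc - 4 is the payload length after
// stepping over the 4-byte size field.
//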
3860 
3861 // set the put flags for the main dictionary
3862 void ha_tokudb::set_main_dict_put_flags(THD* thd, bool opt_eligible, uint32_t* put_flags) {
3863     uint32_t old_prelock_flags = 0;
3864     uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3865     bool in_hot_index = share->num_DBs > curr_num_DBs;
3866     bool using_ignore_flag_opt = do_ignore_flag_optimization(thd, table, share->replace_into_fast && !using_ignore_no_key);
3867     //
3868     // optimization for the "REPLACE INTO..." (and "INSERT IGNORE") commands:
3869     // if the command is "REPLACE INTO" and the only dictionary
3870     // is the main table (or all indexes are a subset of the pk),
3871     // then we can simply insert the element
3872     // with DB_YESOVERWRITE. If the element does not exist,
3873     // it will act as a normal insert, and if it does exist, it
3874     // will act as a replace, which is exactly what REPLACE INTO is supposed
3875     // to do. We cannot do this otherwise, because we would lose
3876     // consistency between indexes.
3877     //
3878     if (hidden_primary_key)
3879     {
3880         *put_flags = old_prelock_flags;
3881     }
3882     else if (!do_unique_checks(thd, in_rpl_write_rows | in_rpl_update_rows) && !is_replace_into(thd) && !is_insert_ignore(thd))
3883     {
3884         *put_flags = old_prelock_flags;
3885     }
3886     else if (using_ignore_flag_opt && is_replace_into(thd)
3887             && !in_hot_index)
3888     {
3889         *put_flags = old_prelock_flags;
3890     }
3891     else if (opt_eligible && using_ignore_flag_opt && is_insert_ignore(thd)
3892             && !in_hot_index)
3893     {
3894         *put_flags = DB_NOOVERWRITE_NO_ERROR | old_prelock_flags;
3895     }
3896     else
3897     {
3898         *put_flags = DB_NOOVERWRITE | old_prelock_flags;
3899     }
3900 }
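//
// Illustrative outcomes of the rules above (the statements are hypothetical):
//
//     REPLACE INTO t ...  (pk-only table, no hot index)  -> put_flags = 0,
//         a blind overwrite that acts as insert-or-replace
//     INSERT IGNORE INTO t ...  (optimization eligible)  -> DB_NOOVERWRITE_NO_ERROR,
//         duplicate puts are silently dropped
//     plain INSERT with unique checks on                 -> DB_NOOVERWRITE,
//         duplicate puts fail with DB_KEYEXIST
//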
3901 
3902 int ha_tokudb::insert_row_to_main_dictionary(
3903     DBT* pk_key,
3904     DBT* pk_val,
3905     DB_TXN* txn) {
3906 
3907     int error = 0;
3908     uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3909     assert_always(curr_num_DBs == 1);
3910 
3911     uint32_t put_flags = mult_put_flags[primary_key];
3912     THD *thd = ha_thd();
3913     set_main_dict_put_flags(thd, true, &put_flags);
3914 
3915     // for test, make unique checks have a very long duration
3916     maybe_do_unique_checks_delay_if_flags_set(thd, put_flags, DB_NOOVERWRITE);
3917 
3918     error = share->file->put(share->file, txn, pk_key, pk_val, put_flags);
3919     if (error) {
3920         last_dup_key = primary_key;
3921         goto cleanup;
3922     }
3923 
3924 cleanup:
3925     return error;
3926 }
3927 
3928 int ha_tokudb::insert_rows_to_dictionaries_mult(DBT* pk_key, DBT* pk_val, DB_TXN* txn, THD* thd) {
3929     int error = 0;
3930     uint curr_num_DBs = share->num_DBs;
3931     set_main_dict_put_flags(thd, true, &mult_put_flags[primary_key]);
3932     uint32_t flags = mult_put_flags[primary_key];
3933 
3934     // for test, make unique checks have a very long duration
3935     maybe_do_unique_checks_delay_if_flags_set(thd, flags, DB_NOOVERWRITE);
3936 
3937     // the insert ignore optimization uses DB_NOOVERWRITE_NO_ERROR,
3938     // which is not allowed with env->put_multiple.
3939     // we have to insert the rows one by one in this case.
3940     if (flags & DB_NOOVERWRITE_NO_ERROR) {
3941         DB * src_db = share->key_file[primary_key];
3942         for (uint32_t i = 0; i < curr_num_DBs; i++) {
3943             DB * db = share->key_file[i];
3944             if (i == primary_key) {
3945                 // if it's the primary key, insert the rows
3946                 // as they are.
3947                 error = db->put(db, txn, pk_key, pk_val, flags);
3948             } else {
3949                 // generate a row for secondary keys.
3950                 // use our multi put key/rec buffers
3951                 // just as the ydb layer would have in
3952                 // env->put_multiple(), except that
3953                 // we will just do a put() right away.
3954                 error = tokudb_generate_row(db, src_db,
3955                         &mult_key_dbt_array[i].dbts[0], &mult_rec_dbt_array[i].dbts[0],
3956                         pk_key, pk_val);
3957                 if (error != 0) {
3958                     goto out;
3959                 }
3960                 error = db->put(db, txn, &mult_key_dbt_array[i].dbts[0],
3961                         &mult_rec_dbt_array[i].dbts[0], flags);
3962             }
3963             if (error != 0) {
3964                 goto out;
3965             }
3966         }
3967     } else {
3968         // not insert ignore, so we can use put multiple
3969         error = db_env->put_multiple(
3970             db_env,
3971             share->key_file[primary_key],
3972             txn,
3973             pk_key,
3974             pk_val,
3975             curr_num_DBs,
3976             share->key_file,
3977             mult_key_dbt_array,
3978             mult_rec_dbt_array,
3979             mult_put_flags
3980             );
3981     }
3982 
3983 out:
3984     //
3985     // if we hit an error here, it may be a dup key error; record the
3986     // primary key as the offending key so the caller can report it
3987     //
3988     if (error) {
3989         last_dup_key = primary_key;
3990     }
3991     return error;
3992 }
3993 
3994 //
3995 // Stores a row in the table, called when handling an INSERT query
3996 // Parameters:
3997 //      [in]    record - a row in MySQL format
3998 // Returns:
3999 //      0 on success
4000 //      error otherwise
4001 //
4002 int ha_tokudb::write_row(const uchar * record) {
4003     TOKUDB_HANDLER_DBUG_ENTER("%p", record);
4004 
4005     DBT row, prim_key;
4006     int error;
4007     THD *thd = ha_thd();
4008     bool has_null;
4009     DB_TXN* sub_trans = NULL;
4010     DB_TXN* txn = NULL;
4011     tokudb_trx_data *trx = NULL;
4012     uint curr_num_DBs;
4013     bool create_sub_trans = false;
4014     bool num_DBs_locked = false;
4015 
4016     //
4017     // some crap that needs to be done because MySQL does not properly abstract
4018     // this work away from us, namely filling in auto increment and setting auto timestamp
4019     //
4020 #if MYSQL_VERSION_ID < 50600
4021     if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT) {
4022         table->timestamp_field->set_time();
4023     }
4024 #endif
4025     if (table->next_number_field && record == table->record[0]) {
4026         error = update_auto_increment();
4027         if (error)
4028             goto cleanup;
4029     }
4030 
4031     //
4032     // check to see if some value for the auto increment column that is bigger
4033     // than anything else so far is being used. If so, update the metadata to
4034     // reflect it. The goal here is that we never want to have a dup key error
4035     // due to a bad increment of the auto inc field.
4036     //
4037     if (share->has_auto_inc && record == table->record[0]) {
4038         share->lock();
4039         ulonglong curr_auto_inc = retrieve_auto_increment(
4040             table->field[share->ai_field_index]->key_type(),
4041             field_offset(table->field[share->ai_field_index], table),
4042             record);
4043         if (curr_auto_inc > share->last_auto_increment) {
4044             share->last_auto_increment = curr_auto_inc;
4045             if (delay_updating_ai_metadata) {
4046                 ai_metadata_update_required = true;
4047             } else {
4048                 update_max_auto_inc(
4049                     share->status_block,
4050                     share->last_auto_increment);
4051             }
4052         }
4053         share->unlock();
4054     }
4055 
4056     //
4057     // grab reader lock on numDBs_lock
4058     //
4059     if (!num_DBs_locked_in_bulk) {
4060         rwlock_t_lock_read(share->_num_DBs_lock);
4061         num_DBs_locked = true;
4062     } else {
4063         lock_count++;
4064         if (lock_count >= 2000) {
4065             share->_num_DBs_lock.unlock();
4066             rwlock_t_lock_read(share->_num_DBs_lock);
4067             lock_count = 0;
4068         }
4069     }
4070     curr_num_DBs = share->num_DBs;
4071 
4072     if (hidden_primary_key) {
4073         get_auto_primary_key(current_ident);
4074     }
4075 
4076     if (table_share->blob_fields) {
4077         if (fix_rec_buff_for_blob(max_row_length(record))) {
4078             error = HA_ERR_OUT_OF_MEM;
4079             goto cleanup;
4080         }
4081     }
4082 
4083     create_dbt_key_from_table(&prim_key, primary_key, primary_key_buff, record, &has_null);
4084     if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4085         goto cleanup;
4086     }
4087 
4088     create_sub_trans = (using_ignore && !(do_ignore_flag_optimization(thd,table,share->replace_into_fast && !using_ignore_no_key)));
4089     if (create_sub_trans) {
4090         error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4091         if (error) {
4092             goto cleanup;
4093         }
4094     }
4095     txn = create_sub_trans ? sub_trans : transaction;
4096     TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_TXN, "txn %p", txn);
4097     if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY))) {
4098         test_row_packing(record,&prim_key,&row);
4099     }
4100     if (loader) {
4101         error = loader->put(loader, &prim_key, &row);
4102         if (error) {
4103             abort_loader = true;
4104             goto cleanup;
4105         }
4106     } else {
4107         error = do_uniqueness_checks(record, txn, thd);
4108         if (error) {
4109             // for #4633
4110             // if we have a duplicate key error, let's check the primary key to see
4111             // if there is a duplicate there. If so, set last_dup_key to the pk
4112             if (error == DB_KEYEXIST && !tokudb_test(hidden_primary_key) && last_dup_key != primary_key) {
4113                 int r = share->file->getf_set(share->file, txn, DB_SERIALIZABLE, &prim_key, smart_dbt_do_nothing, NULL);
4114                 if (r == 0) {
4115                     // if we get no error, that means the row
4116                     // was found and this is a duplicate key,
4117                     // so we set last_dup_key
4118                     last_dup_key = primary_key;
4119                 } else if (r != DB_NOTFOUND) {
4120                     // if some other error is returned, return that to the user.
4121                     error = r;
4122                 }
4123             }
4124             goto cleanup;
4125         }
4126         if (curr_num_DBs == 1) {
4127             error = insert_row_to_main_dictionary(&prim_key, &row, txn);
4128             if (error) { goto cleanup; }
4129         } else {
4130             error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
4131             if (error) { goto cleanup; }
4132         }
4133         if (error == 0) {
4134             uint64_t full_row_size = prim_key.size + row.size;
4135             toku_hton_update_primary_key_bytes_inserted(full_row_size);
4136         }
4137     }
4138 
4139     trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4140     if (!error) {
4141         added_rows++;
4142         trx->stmt_progress.inserted++;
4143         track_progress(thd);
4144     }
4145 cleanup:
4146     if (num_DBs_locked) {
4147        share->_num_DBs_lock.unlock();
4148     }
4149     if (error == DB_KEYEXIST) {
4150         error = HA_ERR_FOUND_DUPP_KEY;
4151     }
4152     if (sub_trans) {
4153         // no point in recording error value of abort.
4154         // nothing we can do about it anyway and it is not what
4155         // we want to return.
4156         if (error) {
4157             abort_txn(sub_trans);
4158         }
4159         else {
4160             commit_txn(sub_trans, DB_TXN_NOSYNC);
4161         }
4162     }
4163     TOKUDB_HANDLER_DBUG_RETURN(error);
4164 }
4165 
4166 /* Compare if a key in a row has changed */
4167 bool ha_tokudb::key_changed(uint keynr, const uchar * old_row, const uchar * new_row) {
4168     DBT old_key;
4169     DBT new_key;
4170     memset((void *) &old_key, 0, sizeof(old_key));
4171     memset((void *) &new_key, 0, sizeof(new_key));
4172 
4173     bool has_null;
4174     create_dbt_key_from_table(&new_key, keynr, key_buff2, new_row, &has_null);
4175     create_dbt_key_for_lookup(&old_key,&table->key_info[keynr], key_buff3, old_row, &has_null);
4176     return tokudb_prefix_cmp_dbt_key(share->key_file[keynr], &old_key, &new_key);
4177 }
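// Usage sketch: update_row calls key_changed(keynr, old_row, new_row) for
// each unique key and only re-runs the uniqueness probe when the packed key
// bytes actually differ, since an unchanged key cannot introduce a new
// duplicate.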
4178 
4179 //
4180 // Updates a row in the table, called when handling an UPDATE query
4181 // Parameters:
4182 //      [in]    old_row - row to be updated, in MySQL format
4183 //      [in]    new_row - new row, in MySQL format
4184 // Returns:
4185 //      0 on success
4186 //      error otherwise
4187 //
4188 int ha_tokudb::update_row(const uchar * old_row, const uchar * new_row) {
4189     TOKUDB_HANDLER_DBUG_ENTER("");
4190     DBT prim_key, old_prim_key, prim_row, old_prim_row;
4191     int UNINIT_VAR(error);
4192     bool has_null;
4193     THD* thd = ha_thd();
4194     DB_TXN* sub_trans = NULL;
4195     DB_TXN* txn = NULL;
4196     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4197     uint curr_num_DBs;
4198 
4199     memset((void *) &prim_key, 0, sizeof(prim_key));
4200     memset((void *) &old_prim_key, 0, sizeof(old_prim_key));
4201     memset((void *) &prim_row, 0, sizeof(prim_row));
4202     memset((void *) &old_prim_row, 0, sizeof(old_prim_row));
4203 
4204 #if MYSQL_VERSION_ID < 50600
4205     if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) {
4206         table->timestamp_field->set_time();
4207     }
4208 #endif
4209     //
4210     // check to see if some value for the auto increment column that is bigger
4211     // than anything else so far is being used. If so, update the metadata to
4212     // reflect it. The goal here is that we never want to have a dup key error
4213     // due to a bad increment of the auto inc field.
4214     //
4215     if (share->has_auto_inc && new_row == table->record[0]) {
4216         share->lock();
4217         ulonglong curr_auto_inc = retrieve_auto_increment(
4218             table->field[share->ai_field_index]->key_type(),
4219             field_offset(table->field[share->ai_field_index], table),
4220             new_row
4221             );
4222         if (curr_auto_inc > share->last_auto_increment) {
4223             error = update_max_auto_inc(share->status_block, curr_auto_inc);
4224             if (!error) {
4225                 share->last_auto_increment = curr_auto_inc;
4226             }
4227         }
4228         share->unlock();
4229     }
4230 
4231     //
4232     // grab reader lock on numDBs_lock
4233     //
4234     bool num_DBs_locked = false;
4235     if (!num_DBs_locked_in_bulk) {
4236         rwlock_t_lock_read(share->_num_DBs_lock);
4237         num_DBs_locked = true;
4238     }
4239     curr_num_DBs = share->num_DBs;
4240 
4241     if (using_ignore) {
4242         error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4243         if (error) {
4244             goto cleanup;
4245         }
4246     }
4247     txn = using_ignore ? sub_trans : transaction;
4248 
4249     if (hidden_primary_key) {
4250         memset((void *) &prim_key, 0, sizeof(prim_key));
4251         prim_key.data = (void *) current_ident;
4252         prim_key.size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
4253         old_prim_key = prim_key;
4254     }
4255     else {
4256         create_dbt_key_from_table(&prim_key, primary_key, key_buff, new_row, &has_null);
4257         create_dbt_key_from_table(&old_prim_key, primary_key, primary_key_buff, old_row, &has_null);
4258     }
4259 
4260     // do uniqueness checks
4261     if (share->has_unique_keys && do_unique_checks(thd, in_rpl_update_rows)) {
4262         for (uint keynr = 0; keynr < table_share->keys; keynr++) {
4263             bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
4264             if (keynr == primary_key && !share->pk_has_string) {
4265                 continue;
4266             }
4267             if (is_unique_key) {
4268                 bool key_ch = key_changed(keynr, old_row, new_row);
4269                 if (key_ch) {
4270                     bool is_unique;
4271                     error = is_val_unique(&is_unique, new_row, &table->key_info[keynr], keynr, txn);
4272                     if (error) goto cleanup;
4273                     if (!is_unique) {
4274                         error = DB_KEYEXIST;
4275                         last_dup_key = keynr;
4276                         goto cleanup;
4277                     }
4278                 }
4279             }
4280         }
4281     }
4282 
4283     if (table_share->blob_fields) {
4284         if (fix_rec_buff_for_blob(max_row_length(new_row))) {
4285             error = HA_ERR_OUT_OF_MEM;
4286             goto cleanup;
4287         }
4288         if (fix_rec_update_buff_for_blob(max_row_length(old_row))) {
4289             error = HA_ERR_OUT_OF_MEM;
4290             goto cleanup;
4291         }
4292     }
4293 
4294     error = pack_row(&prim_row, new_row, primary_key);
4295     if (error) { goto cleanup; }
4296 
4297     error = pack_old_row_for_update(&old_prim_row, old_row, primary_key);
4298     if (error) { goto cleanup; }
4299 
4300     set_main_dict_put_flags(thd, false, &mult_put_flags[primary_key]);
4301 
4302     // for test, make unique checks have a very long duration
4303     if ((mult_put_flags[primary_key] & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
4304         maybe_do_unique_checks_delay(thd);
4305 
4306     error = db_env->update_multiple(
4307         db_env,
4308         share->key_file[primary_key],
4309         txn,
4310         &old_prim_key,
4311         &old_prim_row,
4312         &prim_key,
4313         &prim_row,
4314         curr_num_DBs,
4315         share->key_file,
4316         mult_put_flags,
4317         2*curr_num_DBs,
4318         mult_key_dbt_array,
4319         curr_num_DBs,
4320         mult_rec_dbt_array
4321         );
4322 
4323     if (error == DB_KEYEXIST) {
4324         last_dup_key = primary_key;
4325     }
4326     else if (!error) {
4327         updated_rows++;
4328         trx->stmt_progress.updated++;
4329         track_progress(thd);
4330     }
4331 
4332 
4333 cleanup:
4334     if (num_DBs_locked) {
4335         share->_num_DBs_lock.unlock();
4336     }
4337     if (error == DB_KEYEXIST) {
4338         error = HA_ERR_FOUND_DUPP_KEY;
4339     }
4340     if (sub_trans) {
4341         // no point in recording error value of abort.
4342         // nothing we can do about it anyway and it is not what
4343         // we want to return.
4344         if (error) {
4345             abort_txn(sub_trans);
4346         }
4347         else {
4348             commit_txn(sub_trans, DB_TXN_NOSYNC);
4349         }
4350     }
4351     TOKUDB_HANDLER_DBUG_RETURN(error);
4352 }
4353 
4354 //
4355 // Deletes a row in the table, called when handling a DELETE query
4356 // Parameters:
4357 //      [in]    record - row to be deleted, in MySQL format
4358 // Returns:
4359 //      0 on success
4360 //      error otherwise
4361 //
4362 int ha_tokudb::delete_row(const uchar * record) {
4363     TOKUDB_HANDLER_DBUG_ENTER("");
4364     int error = ENOSYS;
4365     DBT row, prim_key;
4366     bool has_null;
4367     THD* thd = ha_thd();
4368     uint curr_num_DBs;
4369     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4370 
4371     //
4372     // grab reader lock on numDBs_lock
4373     //
4374     bool num_DBs_locked = false;
4375     if (!num_DBs_locked_in_bulk) {
4376         rwlock_t_lock_read(share->_num_DBs_lock);
4377         num_DBs_locked = true;
4378     }
4379     curr_num_DBs = share->num_DBs;
4380 
4381     create_dbt_key_from_table(&prim_key, primary_key, key_buff, record, &has_null);
4382     if (table_share->blob_fields) {
4383         if (fix_rec_buff_for_blob(max_row_length(record))) {
4384             error = HA_ERR_OUT_OF_MEM;
4385             goto cleanup;
4386         }
4387     }
4388     if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4389         goto cleanup;
4390     }
4391 
4392     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
4393         TOKUDB_DEBUG_TXN,
4394         "all %p stmt %p sub_sp_level %p transaction %p",
4395         trx->all,
4396         trx->stmt,
4397         trx->sub_sp_level,
4398         transaction);
4399 
4400     error =
4401         db_env->del_multiple(
4402             db_env,
4403             share->key_file[primary_key],
4404             transaction,
4405             &prim_key,
4406             &row,
4407             curr_num_DBs,
4408             share->key_file,
4409             mult_key_dbt_array,
4410             mult_del_flags);
4411 
4412     if (error) {
4413         DBUG_PRINT("error", ("Got error %d", error));
4414     } else {
4415         deleted_rows++;
4416         trx->stmt_progress.deleted++;
4417         track_progress(thd);
4418     }
4419 cleanup:
4420     if (num_DBs_locked) {
4421         share->_num_DBs_lock.unlock();
4422     }
4423     TOKUDB_HANDLER_DBUG_RETURN(error);
4424 }
4425 
4426 //
4427 // takes as input table->read_set and table->write_set
4428 // and puts list of field indexes that need to be read in
4429 // unpack_row in the member variables fixed_cols_for_query
4430 // and var_cols_for_query
4431 //
4432 void ha_tokudb::set_query_columns(uint keynr) {
4433     uint32_t curr_fixed_col_index = 0;
4434     uint32_t curr_var_col_index = 0;
4435     read_key = false;
4436     read_blobs = false;
4437     //
4438     // figure out which dictionary the row will be read from, so we know which key filter applies
4439     //
4440     uint key_index = 0;
4441 
4442     if (keynr == primary_key || keynr == MAX_KEY) {
4443         key_index = primary_key;
4444     }
4445     else {
4446         key_index = (key_is_clustering(&table->key_info[keynr]) ? keynr : primary_key);
4447     }
4448     for (uint i = 0; i < table_share->fields; i++) {
4449         if (bitmap_is_set(table->read_set,i) ||
4450             bitmap_is_set(table->write_set,i)
4451             )
4452         {
4453             if (bitmap_is_set(&share->kc_info.key_filters[key_index],i)) {
4454                 read_key = true;
4455             }
4456             else {
4457                 //
4458                 // if fixed field length
4459                 //
4460                 if (is_fixed_field(&share->kc_info, i)) {
4461                     //
4462                     // save the offset into the list
4463                     //
4464                     fixed_cols_for_query[curr_fixed_col_index] = i;
4465                     curr_fixed_col_index++;
4466                 }
4467                 //
4468                 // varchar or varbinary
4469                 //
4470                 else if (is_variable_field(&share->kc_info, i)) {
4471                     var_cols_for_query[curr_var_col_index] = i;
4472                     curr_var_col_index++;
4473                 }
4474                 //
4475                 // it is a blob
4476                 //
4477                 else {
4478                     read_blobs = true;
4479                 }
4480             }
4481         }
4482     }
4483     num_fixed_cols_for_query = curr_fixed_col_index;
4484     num_var_cols_for_query = curr_var_col_index;
4485 }
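//
// Example with a hypothetical schema: for "SELECT v FROM t" on
// t(a INT, v VARCHAR(10), b BLOB, KEY(a)) read through the primary
// dictionary, the loop above records v in var_cols_for_query, leaves
// fixed_cols_for_query empty (a is in neither read_set nor write_set), and
// leaves read_blobs false, so unpack_row can skip a and b entirely.
//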
4486 
4487 void ha_tokudb::column_bitmaps_signal() {
4488     //
4489     // if we have max number of indexes, then MAX_KEY == primary_key
4490     //
4491     if (tokudb_active_index != MAX_KEY || tokudb_active_index == primary_key) {
4492         set_query_columns(tokudb_active_index);
4493     }
4494 }
4495 
4496 //
4497 // Notification that a scan of the entire index is about
4498 // to take place. Will pre-acquire a table read lock.
4499 // Returns:
4500 //      0 on success
4501 //      error otherwise
4502 //
4503 int ha_tokudb::prepare_index_scan() {
4504     TOKUDB_HANDLER_DBUG_ENTER("");
4505     int error = 0;
4506     HANDLE_INVALID_CURSOR();
4507     error = prelock_range(NULL, NULL);
4508     if (error) { last_cursor_error = error; goto cleanup; }
4509 
4510     range_lock_grabbed = true;
4511     error = 0;
4512 cleanup:
4513     TOKUDB_HANDLER_DBUG_RETURN(error);
4514 }
4515 
4516 static bool index_key_is_null(
4517     TABLE* table,
4518     uint keynr,
4519     const uchar* key,
4520     uint key_len) {
4521 
4522     bool key_can_be_null = false;
4523     KEY* key_info = &table->key_info[keynr];
4524     KEY_PART_INFO* key_part = key_info->key_part;
4525     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
4526     for (; key_part != end; key_part++) {
4527         if (key_part->null_bit) {
4528             key_can_be_null = true;
4529             break;
4530         }
4531     }
4532     return key_can_be_null && key_len > 0 && key[0] != 0;
4533 }
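// Note: the test above relies on the MySQL key-buffer convention that a
// nullable key part is preceded by a one-byte null indicator (nonzero
// meaning NULL); checking key[0] covers the case where the leading key part
// is the nullable one.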
4534 
4535 // Return true if bulk fetch can be used
4536 static bool tokudb_do_bulk_fetch(THD *thd) {
4537     switch (thd_sql_command(thd)) {
4538     case SQLCOM_SELECT:
4539     case SQLCOM_CREATE_TABLE:
4540     case SQLCOM_INSERT_SELECT:
4541     case SQLCOM_REPLACE_SELECT:
4542     case SQLCOM_DELETE:
4543         return tokudb::sysvars::bulk_fetch(thd) != 0;
4544     default:
4545         return false;
4546     }
4547 }
4548 
4549 //
// Notification that a range query retrieving all elements that equal a key
// is about to take place. Pre-acquires a read lock on that range.
4552 // Returns:
4553 //      0 on success
4554 //      error otherwise
4555 //
4556 int ha_tokudb::prepare_index_key_scan(const uchar * key, uint key_len) {
4557     TOKUDB_HANDLER_DBUG_ENTER("%p %u", key, key_len);
4558     int error = 0;
4559     DBT start_key, end_key;
4560     THD* thd = ha_thd();
4561     HANDLE_INVALID_CURSOR();
4562     pack_key(&start_key, tokudb_active_index, prelocked_left_range, key, key_len, COL_NEG_INF);
4563     prelocked_left_range_size = start_key.size;
4564     pack_key(&end_key, tokudb_active_index, prelocked_right_range, key, key_len, COL_POS_INF);
4565     prelocked_right_range_size = end_key.size;
4566 
4567     error = cursor->c_set_bounds(
4568         cursor,
4569         &start_key,
4570         &end_key,
4571         true,
4572         (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
4573         );
4574 
4575     if (error){
4576         goto cleanup;
4577     }
4578 
4579     range_lock_grabbed = true;
4580     range_lock_grabbed_null = index_key_is_null(table, tokudb_active_index, key, key_len);
4581     doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
4582     bulk_fetch_iteration = 0;
4583     rows_fetched_using_bulk_fetch = 0;
4584     error = 0;
4585 cleanup:
4586     if (error) {
4587         error = map_to_handler_error(error);
4588         last_cursor_error = error;
4589         //
4590         // cursor should be initialized here, but in case it is not,
4591         // we still check
4592         //
4593         if (cursor) {
4594             int r = cursor->c_close(cursor);
4595             assert_always(r==0);
4596             cursor = NULL;
4597             remove_from_trx_handler_list();
4598         }
4599     }
4600     TOKUDB_HANDLER_DBUG_RETURN(error);
4601 }
4602 
4603 void ha_tokudb::invalidate_bulk_fetch() {
4604     bytes_used_in_range_query_buff= 0;
4605     curr_range_query_buff_offset = 0;
4606     icp_went_out_of_range = false;
4607 }
4608 
4609 void ha_tokudb::invalidate_icp() {
4610     toku_pushed_idx_cond = NULL;
4611     toku_pushed_idx_cond_keyno = MAX_KEY;
4612     icp_went_out_of_range = false;
4613 }
4614 
4615 //
4616 // Initializes local cursor on DB with index keynr
4617 // Parameters:
4618 //          keynr - key (index) number
4619 //          sorted - 1 if result MUST be sorted according to index
4620 // Returns:
4621 //      0 on success
4622 //      error otherwise
4623 //
4624 int ha_tokudb::index_init(uint keynr, bool sorted) {
4625     TOKUDB_HANDLER_DBUG_ENTER("%d %u txn %p", keynr, sorted, transaction);
4626 
4627     int error;
4628     THD* thd = ha_thd();
4629     DBUG_PRINT("enter", ("table: '%s'  key: %d", table_share->table_name.str, keynr));
4630 
4631     /*
4632        Under some very rare conditions (like full joins) we may already have
4633        an active cursor at this point
4634      */
4635     if (cursor) {
4636         DBUG_PRINT("note", ("Closing active cursor"));
4637         int r = cursor->c_close(cursor);
4638         assert_always(r==0);
4639         remove_from_trx_handler_list();
4640     }
4641     active_index = keynr;
4642 
4643     if (active_index < MAX_KEY) {
4644         DBUG_ASSERT(keynr <= table->s->keys);
4645     } else {
4646         DBUG_ASSERT(active_index == MAX_KEY);
4647         keynr = primary_key;
4648     }
4649     tokudb_active_index = keynr;
4650 
4651 #if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
4652     if (keynr < table->s->keys && table->key_info[keynr].option_struct->clustering)
4653         key_read = false;
4654 #endif  // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
4655 
4656     last_cursor_error = 0;
4657     range_lock_grabbed = false;
4658     range_lock_grabbed_null = false;
4659     DBUG_ASSERT(share->key_file[keynr]);
4660     cursor_flags = get_cursor_isolation_flags(lock.type, thd);
4661     if (use_write_locks) {
4662         cursor_flags |= DB_RMW;
4663     }
4664     if (tokudb::sysvars::disable_prefetching(thd)) {
4665         cursor_flags |= DBC_DISABLE_PREFETCHING;
4666     }
4667     if (lock.type == TL_READ_WITH_SHARED_LOCKS) {
4668        cursor_flags |= DB_LOCKING_READ;
4669     }
4670     if ((error = share->key_file[keynr]->cursor(share->key_file[keynr],
4671                                                 transaction, &cursor,
4672                                                 cursor_flags))) {
4673         if (error == TOKUDB_MVCC_DICTIONARY_TOO_NEW) {
4674             error = HA_ERR_TABLE_DEF_CHANGED;
4675             my_error(ER_TABLE_DEF_CHANGED, MYF(0));
4676         }
4677         if (error == DB_LOCK_NOTGRANTED) {
4678             error = HA_ERR_LOCK_WAIT_TIMEOUT;
4679             my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
4680         }
4681         table->status = STATUS_NOT_FOUND;
4682         error = map_to_handler_error(error);
4683         last_cursor_error = error;
4684         cursor = NULL;             // Safety
4685         goto exit;
4686     }
4687     cursor->c_set_check_interrupt_callback(cursor, tokudb_killed_thd_callback, thd);
4688     memset((void *) &last_key, 0, sizeof(last_key));
4689 
4690     add_to_trx_handler_list();
4691 
4692     if (thd_sql_command(thd) == SQLCOM_SELECT) {
4693         set_query_columns(keynr);
4694         unpack_entire_row = false;
4695     }
4696     else {
4697         unpack_entire_row = true;
4698     }
4699     invalidate_bulk_fetch();
4700     doing_bulk_fetch = false;
4701     maybe_index_scan = false;
4702     error = 0;
4703 exit:
4704     TOKUDB_HANDLER_DBUG_RETURN(error);
4705 }
4706 
4707 //
4708 // closes the local cursor
4709 //
4710 int ha_tokudb::index_end() {
4711     TOKUDB_HANDLER_DBUG_ENTER("");
4712     range_lock_grabbed = false;
4713     range_lock_grabbed_null = false;
4714     if (cursor) {
4715         DBUG_PRINT("enter", ("table: '%s'", table_share->table_name.str));
4716         int r = cursor->c_close(cursor);
4717         assert_always(r==0);
4718         cursor = NULL;
4719         remove_from_trx_handler_list();
4720         last_cursor_error = 0;
4721     }
4722     active_index = tokudb_active_index = MAX_KEY;
4723 
4724     //
4725     // reset query variables
4726     //
4727     unpack_entire_row = true;
4728     read_blobs = true;
4729     read_key = true;
4730     num_fixed_cols_for_query = 0;
4731     num_var_cols_for_query = 0;
4732 
4733     invalidate_bulk_fetch();
4734     invalidate_icp();
4735     doing_bulk_fetch = false;
4736     close_dsmrr();
4737 
4738     TOKUDB_HANDLER_DBUG_RETURN(0);
4739 }
4740 
4741 int ha_tokudb::handle_cursor_error(int error, int err_to_return) {
4742     TOKUDB_HANDLER_DBUG_ENTER("");
4743     if (error) {
4744         error = map_to_handler_error(error);
4745         last_cursor_error = error;
4746         table->status = STATUS_NOT_FOUND;
4747         if (error == DB_NOTFOUND) {
4748             error = err_to_return;
4749         }
4750     }
4751     TOKUDB_HANDLER_DBUG_RETURN(error);
4752 }
4753 
4754 
4755 //
4756 // Helper function for read_row and smart_dbt_callback_xxx functions
// When using a hidden primary key, upon reading a row we copy the
// retrieved primary key into current_ident
4760 //
4761 void ha_tokudb::extract_hidden_primary_key(uint keynr, DBT const *found_key) {
4762     //
4763     // extract hidden primary key to current_ident
4764     //
4765     if (hidden_primary_key) {
4766         if (keynr == primary_key) {
4767             memcpy(current_ident, (char *) found_key->data, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
4768         }
4769         //
4770         // if secondary key, hidden primary key is at end of found_key
4771         //
4772         else {
4773             memcpy(
4774                 current_ident,
4775                 (char *) found_key->data + found_key->size - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
4776                 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
4777                 );
4778         }
4779     }
4780 }
4781 
4782 
4783 int ha_tokudb::read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4784     assert_always(keynr == primary_key);
4785     return unpack_row(buf, row,found_key, keynr);
4786 }
4787 
4788 //
4789 // Reads the contents of row and found_key, DBT's retrieved from the DB associated to keynr, into buf
4790 // This function assumes that we are using a covering index, as a result, if keynr is the primary key,
4791 // we do not read row into buf
4792 // Parameters:
4793 //      [out]   buf - buffer for the row, in MySQL format
4794 //              keynr - index into key_file that represents DB we are currently operating on.
4795 //      [in]    row - the row that has been read from the preceding DB call
4796 //      [in]    found_key - key used to retrieve the row
4797 //
4798 void ha_tokudb::read_key_only(uchar * buf, uint keynr, DBT const *found_key) {
4799     TOKUDB_HANDLER_DBUG_ENTER("");
4800     table->status = 0;
4801     //
    // the only case where we do not unpack the key is when we are dealing with
    // the main dictionary of a table with a hidden primary key
4804     //
4805     if (!(hidden_primary_key && keynr == primary_key)) {
4806         unpack_key(buf, found_key, keynr);
4807     }
4808     TOKUDB_HANDLER_DBUG_VOID_RETURN;
4809 }
4810 
4811 //
4812 // Helper function used to try to retrieve the entire row
4813 // If keynr is associated with the main table, reads contents of found_key and row into buf, otherwise,
4814 // makes copy of primary key and saves it to last_key. This can later be used to retrieve the entire row
4815 // Parameters:
4816 //      [out]   buf - buffer for the row, in MySQL format
4817 //              keynr - index into key_file that represents DB we are currently operating on.
4818 //      [in]    row - the row that has been read from the preceding DB call
4819 //      [in]    found_key - key used to retrieve the row
4820 //
4821 int ha_tokudb::read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4822     TOKUDB_HANDLER_DBUG_ENTER("");
4823     int error = 0;
4824     table->status = 0;
4825     //
4826     // case where we read from secondary table that is not clustered
4827     //
4828     if (keynr != primary_key && !key_is_clustering(&table->key_info[keynr])) {
4829         bool has_null;
4830         //
        // create a DBT that has the same data as row; this is inefficient
4832         // extract_hidden_primary_key MUST have been called before this
4833         //
4834         memset((void *) &last_key, 0, sizeof(last_key));
4835         if (!hidden_primary_key) {
4836             unpack_key(buf, found_key, keynr);
4837         }
4838         create_dbt_key_from_table(
4839             &last_key,
4840             primary_key,
4841             key_buff,
4842             buf,
4843             &has_null
4844             );
4845     }
4846     //
4847     // else read from clustered/primary key
4848     //
4849     else {
4850         error = unpack_row(buf, row, found_key, keynr);
4851         if (error) { goto exit; }
4852     }
4853     if (found_key) { DBUG_DUMP("read row key", (uchar *) found_key->data, found_key->size); }
4854     error = 0;
4855 exit:
4856     TOKUDB_HANDLER_DBUG_RETURN(error);
4857 }
4858 
4859 //
4860 // This function reads an entire row into buf. This function also assumes that
4861 // the key needed to retrieve the row is stored in the member variable last_key
4862 // Parameters:
4863 //      [out]   buf - buffer for the row, in MySQL format
4864 // Returns:
4865 //      0 on success, error otherwise
4866 //
4867 int ha_tokudb::read_full_row(uchar * buf) {
4868     TOKUDB_HANDLER_DBUG_ENTER("");
4869     int error = 0;
4870     struct smart_dbt_info info;
4871     info.ha = this;
4872     info.buf = buf;
4873     info.keynr = primary_key;
4874     //
4875     // assumes key is stored in this->last_key
4876     //
4877 
4878     error = share->file->getf_set(share->file,
4879                                   transaction,
4880                                   cursor_flags,
4881                                   &last_key,
4882                                   smart_dbt_callback_rowread_ptquery,
4883                                   &info);
4884 
4885     DBUG_EXECUTE_IF("tokudb_fake_db_notfound_error_in_read_full_row", {
4886         error = DB_NOTFOUND;
4887     });
4888 
4889     if (error) {
4890         if (error == DB_LOCK_NOTGRANTED) {
4891             error = HA_ERR_LOCK_WAIT_TIMEOUT;
4892         } else if (error == DB_NOTFOUND) {
4893             error = HA_ERR_CRASHED;
4894             if (tokudb_active_index < share->_keys) {
4895                 sql_print_error(
4896                     "ha_tokudb::read_full_row on table %s cound not locate "
4897                     "record in PK that matches record found in key %s",
4898                     share->full_table_name(),
4899                     share->_key_descriptors[tokudb_active_index]._name);
4900             } else {
4901                 sql_print_error(
4902                     "ha_tokudb::read_full_row on table %s cound not locate "
4903                     "record in PK that matches record found in key %d",
4904                     share->full_table_name(),
4905                     tokudb_active_index);
4906             }
4907         }
4908         table->status = STATUS_NOT_FOUND;
4909     }
4910 
4911     TOKUDB_HANDLER_DBUG_RETURN(error);
4912 }
4913 
4914 
4915 //
// Reads the next row matching the key; on success, advances the cursor
4917 // Parameters:
4918 //      [out]   buf - buffer for the next row, in MySQL format
4919 //      [in]     key - key value
4920 //                keylen - length of key
4921 // Returns:
4922 //      0 on success
4923 //      HA_ERR_END_OF_FILE if not found
4924 //      error otherwise
4925 //
4926 int ha_tokudb::index_next_same(uchar* buf, const uchar* key, uint keylen) {
4927     TOKUDB_HANDLER_DBUG_ENTER("");
4928 
4929     DBT curr_key;
4930     DBT found_key;
4931     bool has_null;
4932     int cmp;
4933     // create the key that will be used to compare with what is found
4934     // in order to figure out if we should return an error
4935     pack_key(&curr_key, tokudb_active_index, key_buff2, key, keylen, COL_ZERO);
4936     int error = get_next(buf, 1, &curr_key, key_read);
4937     if (error) {
4938         goto cleanup;
4939     }
4940     //
4941     // now do the comparison
4942     //
4943     create_dbt_key_from_table(
4944         &found_key,
4945         tokudb_active_index,
4946         key_buff3,buf,
4947         &has_null);
4948     cmp =
4949         tokudb_prefix_cmp_dbt_key(
4950             share->key_file[tokudb_active_index],
4951             &curr_key,
4952             &found_key);
4953     if (cmp) {
4954         error = HA_ERR_END_OF_FILE;
4955     }
4956 
4957 cleanup:
4958     error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
4959     TOKUDB_HANDLER_DBUG_RETURN(error);
4960 }
4961 
4962 
4963 //
4964 // According to InnoDB handlerton: Positions an index cursor to the index
4965 // specified in keynr. Fetches the row if any
4966 // Parameters:
4967 //      [out]       buf - buffer for the  returned row
//      [in]         key - key value; according to InnoDB, if NULL,
//                              position cursor at start or end of index
//                              (not sure if this is done now)
4971 //                    key_len - length of key
4972 //                    find_flag - according to InnoDB, search flags from my_base.h
4973 // Returns:
4974 //      0 on success
4975 //      HA_ERR_KEY_NOT_FOUND if not found (per InnoDB),
4976 //          we seem to return HA_ERR_END_OF_FILE if find_flag != HA_READ_KEY_EXACT
4977 //          TODO: investigate this for correctness
4978 //      error otherwise
4979 //
4980 int ha_tokudb::index_read(
4981     uchar* buf,
4982     const uchar* key,
4983     uint key_len,
4984     enum ha_rkey_function find_flag) {
4985 
4986     TOKUDB_HANDLER_DBUG_ENTER(
4987         "key %p %u:%2.2x find=%u",
4988         key,
4989         key_len,
4990         key ? key[0] : 0,
4991         find_flag);
4992     invalidate_bulk_fetch();
4993     if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
4994         TOKUDB_DBUG_DUMP("mysql key=", key, key_len);
4995     }
4996     DBT row;
4997     DBT lookup_key;
4998     int error = 0;
4999     uint32_t flags = 0;
5000     THD* thd = ha_thd();
5001     tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
5002     struct smart_dbt_info info;
5003     struct index_read_info ir_info;
5004 
5005     HANDLE_INVALID_CURSOR();
5006 
5007     // if we locked a non-null key range and we now have a null key, then
5008     // remove the bounds from the cursor
5009     if (range_lock_grabbed &&
5010         !range_lock_grabbed_null &&
5011         index_key_is_null(table, tokudb_active_index, key, key_len)) {
5012         range_lock_grabbed = range_lock_grabbed_null = false;
5013         cursor->c_remove_restriction(cursor);
5014     }
5015 
5016     memset((void *) &row, 0, sizeof(row));
5017 
5018     info.ha = this;
5019     info.buf = buf;
5020     info.keynr = tokudb_active_index;
5021 
5022     ir_info.smart_dbt_info = info;
5023     ir_info.cmp = 0;
5024 
5025     flags = SET_PRELOCK_FLAG(0);
5026     switch (find_flag) {
5027     case HA_READ_KEY_EXACT: /* Find first record else error */ {
5028         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5029         DBT lookup_bound;
5030         pack_key(&lookup_bound, tokudb_active_index, key_buff4, key, key_len, COL_POS_INF);
5031         if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
5032             TOKUDB_DBUG_DUMP("tokudb key=", lookup_key.data, lookup_key.size);
5033         }
5034         ir_info.orig_key = &lookup_key;
5035         error = cursor->c_getf_set_range_with_bound(cursor, flags, &lookup_key, &lookup_bound, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5036         if (ir_info.cmp) {
5037             error = DB_NOTFOUND;
5038         }
5039         break;
5040     }
5041     case HA_READ_AFTER_KEY: /* Find next rec. after key-record */
5042         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5043         error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5044         break;
5045     case HA_READ_BEFORE_KEY: /* Find next rec. before key-record */
5046         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5047         error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5048         break;
5049     case HA_READ_KEY_OR_NEXT: /* Record or next record */
5050         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5051         error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5052         break;
5053     //
    // This case does not seem to ever be used; it is OK for it to be slow
5055     //
5056     case HA_READ_KEY_OR_PREV: /* Record or previous */
5057         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5058         ir_info.orig_key = &lookup_key;
5059         error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5060         if (error == DB_NOTFOUND) {
5061             error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5062         }
5063         else if (ir_info.cmp) {
5064             error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5065         }
5066         break;
5067     case HA_READ_PREFIX_LAST_OR_PREV: /* Last or prev key with the same prefix */
5068         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5069         error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5070         break;
5071     case HA_READ_PREFIX_LAST:
5072         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5073         ir_info.orig_key = &lookup_key;
5074         error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5075         if (ir_info.cmp) {
5076             error = DB_NOTFOUND;
5077         }
5078         break;
5079     default:
5080         TOKUDB_HANDLER_TRACE("unsupported:%d", find_flag);
5081         error = HA_ERR_UNSUPPORTED;
5082         break;
5083     }
5084     error = handle_cursor_error(error, HA_ERR_KEY_NOT_FOUND);
5085     if (!error && !key_read && tokudb_active_index != primary_key && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5086         error = read_full_row(buf);
5087     }
5088 
5089     if (TOKUDB_UNLIKELY(error && TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ERROR))) {
5090         TOKUDB_HANDLER_TRACE("error:%d:%d", error, find_flag);
5091     }
5092     trx->stmt_progress.queried++;
5093     track_progress(thd);
5094 
5095 cleanup:
5096     TOKUDB_HANDLER_DBUG_RETURN(error);
5097 }
5098 
5099 
5100 int ha_tokudb::read_data_from_range_query_buff(uchar* buf, bool need_val, bool do_key_read) {
    // the buffer has the next row; get it from there
5102     int error;
5103     uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
5104     DBT curr_key;
5105     memset((void *) &curr_key, 0, sizeof(curr_key));
5106 
5107     // get key info
5108     uint32_t key_size = *(uint32_t *)curr_pos;
5109     curr_pos += sizeof(key_size);
5110     uchar* curr_key_buff = curr_pos;
5111     curr_pos += key_size;
5112 
5113     curr_key.data = curr_key_buff;
5114     curr_key.size = key_size;
5115 
5116     // if this is a covering index, this is all we need
5117     if (do_key_read) {
5118         assert_always(!need_val);
5119         extract_hidden_primary_key(tokudb_active_index, &curr_key);
5120         read_key_only(buf, tokudb_active_index, &curr_key);
5121         error = 0;
5122     }
5123     // we need to get more data
5124     else {
5125         DBT curr_val;
5126         memset((void *) &curr_val, 0, sizeof(curr_val));
5127         uchar* curr_val_buff = NULL;
5128         uint32_t val_size = 0;
        // in this case we don't have a val; we are simply extracting the pk
5130         if (!need_val) {
5131             curr_val.data = curr_val_buff;
5132             curr_val.size = val_size;
5133             extract_hidden_primary_key(tokudb_active_index, &curr_key);
5134             error = read_primary_key( buf, tokudb_active_index, &curr_val, &curr_key);
5135         }
5136         else {
5137             extract_hidden_primary_key(tokudb_active_index, &curr_key);
5138             // need to extract a val and place it into buf
5139             if (unpack_entire_row) {
5140                 // get val info
5141                 val_size = *(uint32_t *)curr_pos;
5142                 curr_pos += sizeof(val_size);
5143                 curr_val_buff = curr_pos;
5144                 curr_pos += val_size;
5145                 curr_val.data = curr_val_buff;
5146                 curr_val.size = val_size;
5147                 error = unpack_row(buf,&curr_val, &curr_key, tokudb_active_index);
5148             }
5149             else {
5150                 if (!(hidden_primary_key && tokudb_active_index == primary_key)) {
5151                     unpack_key(buf,&curr_key,tokudb_active_index);
5152                 }
                // read the columns we care about

                // first the null bytes
5156                 memcpy(buf, curr_pos, table_share->null_bytes);
5157                 curr_pos += table_share->null_bytes;
5158 
                // now the fixed-size fields
5160                 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5161                     uint field_index = fixed_cols_for_query[i];
5162                     Field* field = table->field[field_index];
5163                     unpack_fixed_field(
5164                         buf + field_offset(field, table),
5165                         curr_pos,
5166                         share->kc_info.field_lengths[field_index]
5167                         );
5168                     curr_pos += share->kc_info.field_lengths[field_index];
5169                 }
                // now the variable-size fields
5171                 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5172                     uint field_index = var_cols_for_query[i];
5173                     Field* field = table->field[field_index];
5174                     uint32_t field_len = *(uint32_t *)curr_pos;
5175                     curr_pos += sizeof(field_len);
5176                     unpack_var_field(
5177                         buf + field_offset(field, table),
5178                         curr_pos,
5179                         field_len,
5180                         share->kc_info.length_bytes[field_index]
5181                         );
5182                     curr_pos += field_len;
5183                 }
5184                 // now the blobs
5185                 if (read_blobs) {
5186                     uint32_t blob_size = *(uint32_t *)curr_pos;
5187                     curr_pos += sizeof(blob_size);
5188                     error = unpack_blobs(
5189                         buf,
5190                         curr_pos,
5191                         blob_size,
5192                         true
5193                         );
5194                     curr_pos += blob_size;
5195                     if (error) {
5196                         invalidate_bulk_fetch();
5197                         goto exit;
5198                     }
5199                 }
5200                 error = 0;
5201             }
5202         }
5203     }
5204 
5205     curr_range_query_buff_offset = curr_pos - range_query_buff;
5206 exit:
5207     return error;
5208 }
5209 
5210 static int smart_dbt_bf_callback(
5211     DBT const* key,
5212     DBT const* row,
5213     void* context) {
5214     SMART_DBT_BF_INFO info = (SMART_DBT_BF_INFO)context;
5215     return
5216         info->ha->fill_range_query_buf(
5217             info->need_val,
5218             key,
5219             row,
5220             info->direction,
5221             info->thd,
5222             info->buf,
5223             info->key_to_compare);
5224 }
5225 
5226 check_result_t ha_tokudb::toku_handler_index_cond_check(
5227     Item* pushed_idx_cond) {
5228 
5229     check_result_t res;
5230     if (end_range) {
5231         int cmp;
5232 #ifdef MARIADB_BASE_VERSION
5233         cmp = compare_key2(end_range);
5234 #else
5235         cmp = compare_key_icp(end_range);
5236 #endif
5237         if (cmp > 0) {
5238             return CHECK_OUT_OF_RANGE;
5239         }
5240     }
5241     res = pushed_idx_cond->val_int() ? CHECK_POS : CHECK_NEG;
5242     return res;
5243 }
5244 
5245 // fill in the range query buf for bulk fetch
5246 int ha_tokudb::fill_range_query_buf(
5247     bool need_val,
5248     DBT const* key,
5249     DBT const* row,
5250     int direction,
5251     THD* thd,
5252     uchar* buf,
5253     DBT* key_to_compare) {
5254 
5255     int error;
5256     //
5257     // first put the value into range_query_buf
5258     //
5259     uint32_t size_remaining =
5260         size_range_query_buff - bytes_used_in_range_query_buff;
5261     uint32_t size_needed;
5262     uint32_t user_defined_size = tokudb::sysvars::read_buf_size(thd);
5263     uchar* curr_pos = NULL;
5264 
5265     if (key_to_compare) {
5266         int cmp = tokudb_prefix_cmp_dbt_key(
5267             share->key_file[tokudb_active_index],
5268             key_to_compare,
5269             key);
5270         if (cmp) {
5271             icp_went_out_of_range = true;
5272             error = 0;
5273             goto cleanup;
5274         }
5275     }
5276 
5277     // if we have an index condition pushed down, we check it
5278     if (toku_pushed_idx_cond &&
5279         (tokudb_active_index == toku_pushed_idx_cond_keyno)) {
5280         unpack_key(buf, key, tokudb_active_index);
5281         check_result_t result =
5282             toku_handler_index_cond_check(toku_pushed_idx_cond);
5283 
5284         // If we have reason to stop, we set icp_went_out_of_range and get out
5285         // otherwise, if we simply see that the current key is no match,
5286         // we tell the cursor to continue and don't store
5287         // the key locally
5288         if (result == CHECK_OUT_OF_RANGE || thd_kill_level(thd)) {
5289             icp_went_out_of_range = true;
5290             error = 0;
5291             DEBUG_SYNC(ha_thd(), "tokudb_icp_asc_scan_out_of_range");
5292             goto cleanup;
5293         } else if (result == CHECK_NEG) {
            // An optimizer change made for MyRocks also benefits TokuDB:
            // opt_range.cc QUICK_SELECT::get_next now sets end_range during a
            // descending scan. We should never hit this condition, but the
            // code stays in to keep a descending scan from running all the
            // way to the beginning of the index, and the assertion catches
            // the case in debug builds.
5300             assert_debug(!(!end_range && direction < 0));
5301             if (!end_range &&
5302                 direction < 0) {
5303                 cancel_pushed_idx_cond();
5304             }
5305             error = TOKUDB_CURSOR_CONTINUE;
5306             goto cleanup;
5307         }
5308     }
5309 
5310     // at this point, if ICP is on, we have verified that the key is one
5311     // we are interested in, so we proceed with placing the data
5312     // into the range query buffer
5313 
5314     if (need_val) {
5315         if (unpack_entire_row) {
5316             size_needed = 2*sizeof(uint32_t) + key->size + row->size;
5317         } else {
5318             // this is an upper bound
5319             size_needed =
5320                 // size of key length
5321                 sizeof(uint32_t) +
5322                 // key and row
5323                 key->size + row->size +
5324                 // lengths of varchars stored
5325                 num_var_cols_for_query * (sizeof(uint32_t)) +
5326                 // length of blobs
5327                 sizeof(uint32_t);
5328         }
5329     } else {
5330         size_needed = sizeof(uint32_t) + key->size;
5331     }
5332     if (size_remaining < size_needed) {
5333         range_query_buff =
5334             static_cast<uchar*>(tokudb::memory::realloc(
5335                 static_cast<void*>(range_query_buff),
5336                 bytes_used_in_range_query_buff + size_needed,
5337                 MYF(MY_WME)));
5338         if (range_query_buff == NULL) {
5339             error = ENOMEM;
5340             invalidate_bulk_fetch();
5341             goto cleanup;
5342         }
5343         size_range_query_buff = bytes_used_in_range_query_buff + size_needed;
5344     }
5345     //
5346     // now we know we have the size, let's fill the buffer, starting with the key
5347     //
5348     curr_pos = range_query_buff + bytes_used_in_range_query_buff;
5349 
5350     *reinterpret_cast<uint32_t*>(curr_pos) = key->size;
5351     curr_pos += sizeof(uint32_t);
5352     memcpy(curr_pos, key->data, key->size);
5353     curr_pos += key->size;
5354     if (need_val) {
5355         if (unpack_entire_row) {
5356             *reinterpret_cast<uint32_t*>(curr_pos) = row->size;
5357             curr_pos += sizeof(uint32_t);
5358             memcpy(curr_pos, row->data, row->size);
5359             curr_pos += row->size;
5360         } else {
5361             // need to unpack just the data we care about
5362             const uchar* fixed_field_ptr = static_cast<const uchar*>(row->data);
5363             fixed_field_ptr += table_share->null_bytes;
5364 
5365             const uchar* var_field_offset_ptr = NULL;
5366             const uchar* var_field_data_ptr = NULL;
5367 
5368             var_field_offset_ptr =
5369                 fixed_field_ptr +
5370                 share->kc_info.mcp_info[tokudb_active_index].fixed_field_size;
5371             var_field_data_ptr =
5372                 var_field_offset_ptr +
5373                 share->kc_info.mcp_info[tokudb_active_index].len_of_offsets;
5374 
5375             // first the null bytes
5376             memcpy(curr_pos, row->data, table_share->null_bytes);
5377             curr_pos += table_share->null_bytes;
            //
            // now the fixed fields
            //
5382             for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5383                 uint field_index = fixed_cols_for_query[i];
5384                 memcpy(
5385                     curr_pos,
5386                     fixed_field_ptr + share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val,
5387                     share->kc_info.field_lengths[field_index]);
5388                 curr_pos += share->kc_info.field_lengths[field_index];
5389             }
5390 
5391             //
5392             // now the var fields
5393             //
5394             for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5395                 uint field_index = var_cols_for_query[i];
5396                 uint32_t var_field_index =
5397                     share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val;
5398                 uint32_t data_start_offset;
5399                 uint32_t field_len;
5400 
5401                 get_var_field_info(
5402                     &field_len,
5403                     &data_start_offset,
5404                     var_field_index,
5405                     var_field_offset_ptr,
5406                     share->kc_info.num_offset_bytes);
5407                 memcpy(curr_pos, &field_len, sizeof(field_len));
5408                 curr_pos += sizeof(field_len);
5409                 memcpy(
5410                     curr_pos,
5411                     var_field_data_ptr + data_start_offset,
5412                     field_len);
5413                 curr_pos += field_len;
5414             }
5415 
5416             if (read_blobs) {
5417                 uint32_t blob_offset = 0;
5418                 uint32_t data_size = 0;
5419                 //
5420                 // now the blobs
5421                 //
5422                 get_blob_field_info(
5423                     &blob_offset,
5424                     share->kc_info.mcp_info[tokudb_active_index].len_of_offsets,
5425                     var_field_data_ptr,
5426                     share->kc_info.num_offset_bytes);
5427                 data_size =
5428                     row->size -
5429                     blob_offset -
5430                     static_cast<uint32_t>((var_field_data_ptr -
5431                         static_cast<const uchar*>(row->data)));
5432                 memcpy(curr_pos, &data_size, sizeof(data_size));
5433                 curr_pos += sizeof(data_size);
5434                 memcpy(curr_pos, var_field_data_ptr + blob_offset, data_size);
5435                 curr_pos += data_size;
5436             }
5437         }
5438     }
5439 
5440     bytes_used_in_range_query_buff = curr_pos - range_query_buff;
5441     assert_always(bytes_used_in_range_query_buff <= size_range_query_buff);
5442 
5443     //
5444     // now determine if we should continue with the bulk fetch
5445     // we want to stop under these conditions:
5446     //  - we overran the prelocked range
5447     //  - we are close to the end of the buffer
5448     //  - we have fetched an exponential amount of rows with
5449     //  respect to the bulk fetch iteration, which is initialized
5450     //  to 0 in index_init() and prelock_range().
5451 
5452     rows_fetched_using_bulk_fetch++;
    // if the iteration is less than the number of possible shifts on
    // a 64 bit integer, check that we haven't exceeded this iteration's
    // row fetch upper bound.
5456     if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5457         uint64_t row_fetch_upper_bound = 1LLU << bulk_fetch_iteration;
5458         assert_always(row_fetch_upper_bound > 0);
5459         if (rows_fetched_using_bulk_fetch >= row_fetch_upper_bound) {
5460             error = 0;
5461             goto cleanup;
5462         }
5463     }
5464 
5465     if (bytes_used_in_range_query_buff +
5466         table_share->rec_buff_length >
5467         user_defined_size) {
5468         error = 0;
5469         goto cleanup;
5470     }
5471     if (direction > 0) {
5472         // compare what we got to the right endpoint of prelocked range
5473         // because we are searching keys in ascending order
5474         if (prelocked_right_range_size == 0) {
5475             error = TOKUDB_CURSOR_CONTINUE;
5476             goto cleanup;
5477         }
5478         DBT right_range;
5479         memset(&right_range, 0, sizeof(right_range));
5480         right_range.size = prelocked_right_range_size;
5481         right_range.data = prelocked_right_range;
5482         int cmp = tokudb_cmp_dbt_key(
5483             share->key_file[tokudb_active_index],
5484             key,
5485             &right_range);
5486         error = (cmp > 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5487     } else {
5488         // compare what we got to the left endpoint of prelocked range
5489         // because we are searching keys in descending order
5490         if (prelocked_left_range_size == 0) {
5491             error = TOKUDB_CURSOR_CONTINUE;
5492             goto cleanup;
5493         }
5494         DBT left_range;
5495         memset(&left_range, 0, sizeof(left_range));
5496         left_range.size = prelocked_left_range_size;
5497         left_range.data = prelocked_left_range;
5498         int cmp = tokudb_cmp_dbt_key(
5499             share->key_file[tokudb_active_index],
5500             key,
5501             &left_range);
5502         error = (cmp < 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5503     }
5504 cleanup:
5505     return error;
5506 }
5507 
5508 int ha_tokudb::get_next(
5509     uchar* buf,
5510     int direction,
5511     DBT* key_to_compare,
5512     bool do_key_read) {
5513 
5514     int error = 0;
5515     HANDLE_INVALID_CURSOR();
5516 
5517     if (maybe_index_scan) {
5518         maybe_index_scan = false;
5519         if (!range_lock_grabbed) {
5520             error = prepare_index_scan();
5521         }
5522     }
5523 
5524     if (!error) {
5525         uint32_t flags = SET_PRELOCK_FLAG(0);
5526 
        // we need to read the val of what we retrieve if
        // we do NOT have a covering index AND we are reading from the
        // primary key or a clustering secondary key
5530         bool need_val =
5531             (do_key_read == 0) &&
5532             (tokudb_active_index == primary_key ||
5533              key_is_clustering(&table->key_info[tokudb_active_index]));
5534 
5535         if ((bytes_used_in_range_query_buff -
5536              curr_range_query_buff_offset) > 0) {
5537             error = read_data_from_range_query_buff(buf, need_val, do_key_read);
5538         } else if (icp_went_out_of_range) {
5539             icp_went_out_of_range = false;
5540             error = HA_ERR_END_OF_FILE;
5541         } else {
5542             invalidate_bulk_fetch();
5543             if (doing_bulk_fetch) {
5544                 struct smart_dbt_bf_info bf_info;
5545                 bf_info.ha = this;
                // need_val, computed above, already captures whether the val must be read
5547                 bf_info.direction = direction;
5548                 bf_info.thd = ha_thd();
5549                 bf_info.need_val = need_val;
5550                 bf_info.buf = buf;
5551                 bf_info.key_to_compare = key_to_compare;
5552                 //
5553                 // call c_getf_next with purpose of filling in range_query_buff
5554                 //
5555                 rows_fetched_using_bulk_fetch = 0;
                // it is expected that we can do ICP in the smart_dbt_bf_callback;
                // as a result, it's possible we don't return any data because
                // none of the rows matched the index condition, so we need
                // this while loop. icp_went_out_of_range is set when we hit a row
                // that the index condition states is out of our range; when that
                // happens, the data already in the buffer is the last we will retrieve
5562                 while (bytes_used_in_range_query_buff == 0 &&
5563                        !icp_went_out_of_range && error == 0) {
5564                     if (direction > 0) {
5565                         error =
5566                             cursor->c_getf_next(
5567                                 cursor,
5568                                 flags,
5569                                 smart_dbt_bf_callback,
5570                                 &bf_info);
5571                     } else {
5572                         error =
5573                             cursor->c_getf_prev(
5574                                 cursor,
5575                                 flags,
5576                                 smart_dbt_bf_callback,
5577                                 &bf_info);
5578                     }
5579                 }
5580                 // if there is no data set and we went out of range,
5581                 // then there is nothing to return
5582                 if (bytes_used_in_range_query_buff == 0 &&
5583                     icp_went_out_of_range) {
5584                     icp_went_out_of_range = false;
5585                     error = HA_ERR_END_OF_FILE;
5586                 }
5587                 if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5588                     bulk_fetch_iteration++;
5589                 }
5590 
5591                 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5592                 if (error) {
5593                     goto cleanup;
5594                 }
5595 
5596                 //
5597                 // now that range_query_buff is filled, read an element
5598                 //
5599                 error =
5600                     read_data_from_range_query_buff(buf, need_val, do_key_read);
5601             } else {
5602                 struct smart_dbt_info info;
5603                 info.ha = this;
5604                 info.buf = buf;
5605                 info.keynr = tokudb_active_index;
5606 
5607                 if (direction > 0) {
5608                     error =
5609                         cursor->c_getf_next(
5610                             cursor,
5611                             flags,
5612                             SMART_DBT_CALLBACK(do_key_read),
5613                             &info);
5614                 } else {
5615                     error =
5616                         cursor->c_getf_prev(
5617                             cursor,
5618                             flags,
5619                             SMART_DBT_CALLBACK(do_key_read),
5620                             &info);
5621                 }
5622                 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5623             }
5624         }
5625     }
5626 
5627     //
5628     // at this point, one of two things has happened
5629     // either we have unpacked the data into buf, and we
5630     // are done, or we have unpacked the primary key
5631     // into last_key, and we use the code below to
5632     // read the full row by doing a point query into the
5633     // main table.
5634     //
5635     if (!error &&
5636         !do_key_read &&
5637         (tokudb_active_index != primary_key) &&
5638         !key_is_clustering(&table->key_info[tokudb_active_index])) {
5639         error = read_full_row(buf);
5640     }
5641 
5642     if (!error) {
5643         THD *thd = ha_thd();
5644         tokudb_trx_data* trx =
5645             static_cast<tokudb_trx_data*>(thd_get_ha_data(thd, tokudb_hton));
5646         trx->stmt_progress.queried++;
5647         track_progress(thd);
5648         if (thd_kill_level(thd))
5649             error = ER_ABORTING_CONNECTION;
5650     }
5651 cleanup:
5652     return error;
5653 }
5654 
5655 
5656 //
5657 // Reads the next row from the active index (cursor) into buf, and advances cursor
5658 // Parameters:
5659 //      [out]   buf - buffer for the next row, in MySQL format
5660 // Returns:
5661 //      0 on success
5662 //      HA_ERR_END_OF_FILE if not found
5663 //      error otherwise
5664 //
5665 int ha_tokudb::index_next(uchar * buf) {
5666     TOKUDB_HANDLER_DBUG_ENTER("");
5667     int error = get_next(buf, 1, NULL, key_read);
5668     TOKUDB_HANDLER_DBUG_RETURN(error);
5669 }
5670 
5671 
5672 int ha_tokudb::index_read_last(uchar * buf, const uchar * key, uint key_len) {
5673     return(index_read(buf, key, key_len, HA_READ_PREFIX_LAST));
5674 }
5675 
5676 
5677 //
5678 // Reads the previous row from the active index (cursor) into buf, and advances cursor
5679 // Parameters:
5680 //      [out]   buf - buffer for the next row, in MySQL format
5681 // Returns:
5682 //      0 on success
5683 //      HA_ERR_END_OF_FILE if not found
5684 //      error otherwise
5685 //
5686 int ha_tokudb::index_prev(uchar * buf) {
5687     TOKUDB_HANDLER_DBUG_ENTER("");
5688     int error = get_next(buf, -1, NULL, key_read);
5689     TOKUDB_HANDLER_DBUG_RETURN(error);
5690 }
5691 
5692 //
5693 // Reads the first row from the active index (cursor) into buf, and advances cursor
5694 // Parameters:
5695 //      [out]   buf - buffer for the next row, in MySQL format
5696 // Returns:
5697 //      0 on success
5698 //      HA_ERR_END_OF_FILE if not found
5699 //      error otherwise
5700 //
5701 int ha_tokudb::index_first(uchar * buf) {
5702     TOKUDB_HANDLER_DBUG_ENTER("");
5703     invalidate_bulk_fetch();
5704     int error = 0;
5705     struct smart_dbt_info info;
5706     uint32_t flags = SET_PRELOCK_FLAG(0);
5707     THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5709     HANDLE_INVALID_CURSOR();
5710 
5711     info.ha = this;
5712     info.buf = buf;
5713     info.keynr = tokudb_active_index;
5714 
5715     error = cursor->c_getf_first(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5716     error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5717 
5718     //
5719     // still need to get entire contents of the row if operation done on
5720     // secondary DB and it was NOT a covering index
5721     //
5722     if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5723         error = read_full_row(buf);
5724     }
5725     if (trx) {
5726         trx->stmt_progress.queried++;
5727     }
5728     track_progress(thd);
5729     maybe_index_scan = true;
5730 cleanup:
5731     TOKUDB_HANDLER_DBUG_RETURN(error);
5732 }
5733 
5734 //
5735 // Reads the last row from the active index (cursor) into buf, and advances cursor
5736 // Parameters:
5737 //      [out]   buf - buffer for the next row, in MySQL format
5738 // Returns:
5739 //      0 on success
5740 //      HA_ERR_END_OF_FILE if not found
5741 //      error otherwise
5742 //
5743 int ha_tokudb::index_last(uchar * buf) {
5744     TOKUDB_HANDLER_DBUG_ENTER("");
5745     invalidate_bulk_fetch();
5746     int error = 0;
5747     struct smart_dbt_info info;
5748     uint32_t flags = SET_PRELOCK_FLAG(0);
5749     THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5751     HANDLE_INVALID_CURSOR();
5752 
5753     info.ha = this;
5754     info.buf = buf;
5755     info.keynr = tokudb_active_index;
5756 
5757     error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5758     error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5759     //
5760     // still need to get entire contents of the row if operation done on
5761     // secondary DB and it was NOT a covering index
5762     //
5763     if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5764         error = read_full_row(buf);
5765     }
5766 
5767     if (trx) {
5768         trx->stmt_progress.queried++;
5769     }
5770     track_progress(thd);
5771     maybe_index_scan = true;
5772 cleanup:
5773     TOKUDB_HANDLER_DBUG_RETURN(error);
5774 }
5775 
5776 //
5777 // Initialize a scan of the table (which is why index_init is called on primary_key)
5778 // Parameters:
5779 //          scan - unused
5780 // Returns:
5781 //      0 on success
5782 //      error otherwise
5783 //
5784 int ha_tokudb::rnd_init(bool scan) {
5785     TOKUDB_HANDLER_DBUG_ENTER("");
5786     int error = 0;
5787     range_lock_grabbed = false;
5788     error = index_init(MAX_KEY, 0);
5789     if (error) { goto cleanup;}
5790 
5791     if (scan) {
5792         error = prelock_range(NULL, NULL);
5793         if (error) { goto cleanup; }
5794 
        // only want to set range_lock_grabbed to true after index_init
        // successfully executed, for two reasons:
        // 1) index_init will reset it to false anyway
        // 2) if it fails, we don't want prelocking on
5799         range_lock_grabbed = true;
5800     }
5801 
5802     error = 0;
5803 cleanup:
5804     if (error) {
5805         index_end();
5806         last_cursor_error = error;
5807     }
5808     TOKUDB_HANDLER_DBUG_RETURN(error);
5809 }
5810 
5811 //
5812 // End a scan of the table
5813 //
5814 int ha_tokudb::rnd_end() {
5815     TOKUDB_HANDLER_DBUG_ENTER("");
5816     range_lock_grabbed = false;
5817     TOKUDB_HANDLER_DBUG_RETURN(index_end());
5818 }
5819 
5820 
5821 //
5822 // Read the next row in a table scan
5823 // Parameters:
5824 //      [out]   buf - buffer for the next row, in MySQL format
5825 // Returns:
5826 //      0 on success
5827 //      HA_ERR_END_OF_FILE if not found
5828 //      error otherwise
5829 //
5830 int ha_tokudb::rnd_next(uchar * buf) {
5831     TOKUDB_HANDLER_DBUG_ENTER("");
5832     int error = get_next(buf, 1, NULL, false);
5833     TOKUDB_HANDLER_DBUG_RETURN(error);
5834 }
5835 
5836 
5837 void ha_tokudb::track_progress(THD* thd) {
5838     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5839     if (trx) {
5840         ulonglong num_written = trx->stmt_progress.inserted +
5841             trx->stmt_progress.updated +
5842             trx->stmt_progress.deleted;
5843         bool update_status =
5844             (trx->stmt_progress.queried &&
5845              tokudb::sysvars::read_status_frequency &&
5846              (trx->stmt_progress.queried %
5847                 tokudb::sysvars::read_status_frequency) == 0) ||
5848              (num_written && tokudb::sysvars::write_status_frequency &&
5849               (num_written % tokudb::sysvars::write_status_frequency) == 0);
5850         if (update_status) {
5851             char *next_status = write_status_msg;
5852             bool first = true;
5853             int r;
5854             if (trx->stmt_progress.queried) {
5855                 r = sprintf(
5856                     next_status,
5857                     "Queried about %llu row%s",
5858                     trx->stmt_progress.queried,
5859                     trx->stmt_progress.queried == 1 ? "" : "s");
5860                 assert_always(r >= 0);
5861                 next_status += r;
5862                 first = false;
5863             }
5864             if (trx->stmt_progress.inserted) {
5865                 if (trx->stmt_progress.using_loader) {
5866                     r = sprintf(
5867                         next_status,
5868                         "%sFetched about %llu row%s, loading data still remains",
5869                         first ? "" : ", ",
5870                         trx->stmt_progress.inserted,
5871                         trx->stmt_progress.inserted == 1 ? "" : "s");
5872                 } else {
5873                     r = sprintf(
5874                         next_status,
5875                         "%sInserted about %llu row%s",
5876                         first ? "" : ", ",
5877                         trx->stmt_progress.inserted,
5878                         trx->stmt_progress.inserted == 1 ? "" : "s");
5879                 }
5880                 assert_always(r >= 0);
5881                 next_status += r;
5882                 first = false;
5883             }
5884             if (trx->stmt_progress.updated) {
5885                 r = sprintf(
5886                     next_status,
5887                     "%sUpdated about %llu row%s",
5888                     first ? "" : ", ",
5889                     trx->stmt_progress.updated,
5890                     trx->stmt_progress.updated == 1 ? "" : "s");
5891                 assert_always(r >= 0);
5892                 next_status += r;
5893                 first = false;
5894             }
5895             if (trx->stmt_progress.deleted) {
5896                 r = sprintf(
5897                     next_status,
5898                     "%sDeleted about %llu row%s",
5899                     first ? "" : ", ",
5900                     trx->stmt_progress.deleted,
5901                     trx->stmt_progress.deleted == 1 ? "" : "s");
5902                 assert_always(r >= 0);
5903                 next_status += r;
5904                 first = false;
5905             }
5906             if (!first)
5907                 thd_proc_info(thd, write_status_msg);
5908         }
5909     }
5910 }
5911 
5912 
5913 DBT *ha_tokudb::get_pos(DBT * to, uchar * pos) {
5914     TOKUDB_HANDLER_DBUG_ENTER("");
5915     /* We don't need to set app_data here */
5916     memset((void *) to, 0, sizeof(*to));
5917     to->data = pos + sizeof(uint32_t);
5918     to->size = *(uint32_t *)pos;
5919     DBUG_DUMP("key", (const uchar *) to->data, to->size);
5920     DBUG_RETURN(to);
5921 }
5922 
// Retrieves a row based on the primary key saved in pos
5924 // Returns:
5925 //      0 on success
5926 //      HA_ERR_KEY_NOT_FOUND if not found
5927 //      error otherwise
5928 int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
5929     TOKUDB_HANDLER_DBUG_ENTER("");
5930     DBT db_pos;
5931     int error = 0;
5932     struct smart_dbt_info info;
5933     bool old_unpack_entire_row = unpack_entire_row;
5934     DBT* key = get_pos(&db_pos, pos);
5935 
5936     unpack_entire_row = true;
5937     tokudb_active_index = MAX_KEY;
5938 
5939     THD *thd = ha_thd();
5940 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5941     // test rpl slave by inducing a delay before the point query
5942     if (thd->slave_thread && (in_rpl_delete_rows || in_rpl_update_rows)) {
5943         DBUG_EXECUTE_IF("tokudb_crash_if_rpl_looks_up_row", DBUG_ASSERT(0););
5944         uint64_t delay_ms = tokudb::sysvars::rpl_lookup_rows_delay(thd);
5945         if (delay_ms)
5946             usleep(delay_ms * 1000);
5947     }
5948 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5949 
5950     info.ha = this;
5951     info.buf = buf;
5952     info.keynr = primary_key;
5953 
5954     error = share->file->getf_set(share->file, transaction,
5955             get_cursor_isolation_flags(lock.type, thd),
5956             key, smart_dbt_callback_rowread_ptquery, &info);
5957 
5958     if (error == DB_NOTFOUND) {
5959         error = HA_ERR_KEY_NOT_FOUND;
5960         goto cleanup;
5961     }
5962 cleanup:
5963     unpack_entire_row = old_unpack_entire_row;
5964     TOKUDB_HANDLER_DBUG_RETURN(error);
5965 }

int ha_tokudb::prelock_range(const key_range *start_key, const key_range *end_key) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
    THD* thd = ha_thd();

    int error = 0;
    DBT start_dbt_key;
    DBT end_dbt_key;
    uchar* start_key_buff = prelocked_left_range;
    uchar* end_key_buff = prelocked_right_range;

    memset((void *) &start_dbt_key, 0, sizeof(start_dbt_key));
    memset((void *) &end_dbt_key, 0, sizeof(end_dbt_key));

    HANDLE_INVALID_CURSOR();
    if (start_key) {
        switch (start_key->flag) {
        case HA_READ_AFTER_KEY:
            pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_POS_INF);
            break;
        default:
            pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_NEG_INF);
            break;
        }
        prelocked_left_range_size = start_dbt_key.size;
    }
    else {
        prelocked_left_range_size = 0;
    }

    if (end_key) {
        switch (end_key->flag) {
        case HA_READ_BEFORE_KEY:
            pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_NEG_INF);
            break;
        default:
            pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_POS_INF);
            break;
        }
        prelocked_right_range_size = end_dbt_key.size;
    }
    else {
        prelocked_right_range_size = 0;
    }

    error = cursor->c_set_bounds(
        cursor,
        start_key ? &start_dbt_key : share->key_file[tokudb_active_index]->dbt_neg_infty(),
        end_key ? &end_dbt_key : share->key_file[tokudb_active_index]->dbt_pos_infty(),
        true,
        (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
        );
    if (error) {
        error = map_to_handler_error(error);
        last_cursor_error = error;
        //
        // the cursor should already be open here, but guard against NULL anyway
        //
        if (cursor) {
            int r = cursor->c_close(cursor);
            assert_always(r == 0);
            cursor = NULL;
            remove_from_trx_handler_list();
        }
        goto cleanup;
    }

    // at this point, determine if we will be doing bulk fetch
    doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
    bulk_fetch_iteration = 0;
    rows_fetched_using_bulk_fetch = 0;

cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

//
// Prelock range if possible, start_key is leftmost, end_key is rightmost
// whether scanning forward or backward.  This function is called by MySQL
// for backward range queries (in QUICK_SELECT_DESC::get_next).
// Forward scans use read_range_first()/read_range_next().
//
int ha_tokudb::prepare_range_scan( const key_range *start_key, const key_range *end_key) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
    int error = prelock_range(start_key, end_key);
    if (!error) {
        range_lock_grabbed = true;
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

int ha_tokudb::read_range_first(
    const key_range *start_key,
    const key_range *end_key,
    bool eq_range,
    bool sorted)
{
    TOKUDB_HANDLER_DBUG_ENTER("%p %p %u %u", start_key, end_key, eq_range, sorted);
    int error = prelock_range(start_key, end_key);
    if (error) { goto cleanup; }
    range_lock_grabbed = true;

    error = handler::read_range_first(start_key, end_key, eq_range, sorted);
cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

int ha_tokudb::read_range_next()
{
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error;
    error = handler::read_range_next();
    if (error) {
        range_lock_grabbed = false;
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}



/*
  Set a reference to the current record in (ref,ref_length).

  SYNOPSIS
  ha_tokudb::position()
  record                      The current record buffer

  DESCRIPTION
  The handler stores the primary key in (ref,ref_length).
  There is either an explicit primary key, or an implicit (hidden)
  primary key.
  During open(), 'ref_length' is calculated as the maximum primary
  key length. When an actual key is shorter than that, the rest of
  the buffer must be cleared out. The row cannot be identified if
  garbage follows the end of the key. There is no length field for
  the current key, so the whole ref_length is used for comparison.

  RETURN
  nothing
*/
void ha_tokudb::position(const uchar * record) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBT key;
    if (hidden_primary_key) {
        DBUG_ASSERT(ref_length == (TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t)));
        memcpy(ref + sizeof(uint32_t), current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
        *(uint32_t *)ref = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
    }
    else {
        bool has_null;
        //
        // save the data
        //
        create_dbt_key_from_table(&key, primary_key, ref + sizeof(uint32_t), record, &has_null);
        //
        // save the size of data in the first four bytes of ref
        //
        memcpy(ref, &key.size, sizeof(uint32_t));
    }
    /*
      TokuDB doesn't always write the last byte. Don't let that cause
      problems with MariaDB.
    */
    MEM_MAKE_DEFINED(ref, ref_length);
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
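
//
// Illustrative sketch (not compiled): the row reference layout produced by
// position() above and consumed by get_pos(). A 4-byte key length is stored
// first, followed by the packed primary key (or the hidden primary key).
// example_ref_roundtrip is a hypothetical helper for illustration only.
//
#if 0
static void example_ref_roundtrip(uchar* ref, const DBT* key) {
    // position(): length prefix, then the key bytes
    memcpy(ref, &key->size, sizeof(uint32_t));
    memcpy(ref + sizeof(uint32_t), key->data, key->size);

    // get_pos() reverses it without copying
    DBT pos;
    memset(&pos, 0, sizeof(pos));
    pos.size = *(uint32_t*)ref;
    pos.data = ref + sizeof(uint32_t);
}
#endif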

//
// Per InnoDB: Returns statistics information of the table to the MySQL
// interpreter, in various fields of the handle object.
// Return:
//      0 on success
//      error otherwise
//
int ha_tokudb::info(uint flag) {
    TOKUDB_HANDLER_DBUG_ENTER("%d", flag);
    int error = 0;
#if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    for (uint i = 0; i < table->s->keys; i++)
        if (key_is_clustering(&table->key_info[i]))
            table->covering_keys.set_bit(i);
#endif  // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    DB_TXN* txn = NULL;
    if (flag & HA_STATUS_VARIABLE) {
        stats.records = share->row_count() + share->rows_from_locked_table;
        stats.deleted = 0;
        if (!(flag & HA_STATUS_NO_LOCK)) {

            error = txn_begin(db_env, NULL, &txn, DB_READ_UNCOMMITTED, ha_thd());
            if (error) {
                goto cleanup;
            }

            // we should always have a primary key
            assert_always(share->file != NULL);

            DB_BTREE_STAT64 dict_stats;
            error = share->file->stat64(share->file, txn, &dict_stats);
            if (error) {
                goto cleanup;
            }
            share->set_row_count(dict_stats.bt_ndata, false);
            stats.records = dict_stats.bt_ndata;
            stats.create_time = dict_stats.bt_create_time_sec;
            stats.update_time = dict_stats.bt_modify_time_sec;
            stats.check_time = dict_stats.bt_verify_time_sec;
            stats.data_file_length = dict_stats.bt_dsize;
            stats.delete_length = dict_stats.bt_fsize - dict_stats.bt_dsize;
            if (hidden_primary_key) {
                //
                // we have a hidden primary key and do not want to report
                // the space it takes up to the user
                //
                uint64_t hpk_space =
                    TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH * dict_stats.bt_ndata;
                stats.data_file_length =
                    (hpk_space > stats.data_file_length) ?
                        0 : stats.data_file_length - hpk_space;
            } else {
                //
                // one infinity byte per key needs to be subtracted
                //
                uint64_t inf_byte_space = dict_stats.bt_ndata;
                stats.data_file_length =
                    (inf_byte_space > stats.data_file_length) ?
                        0 : stats.data_file_length - inf_byte_space;
            }

            stats.mean_rec_length =
                stats.records ?
                    (ulong)(stats.data_file_length/stats.records) : 0;
            stats.index_file_length = 0;
            // curr_num_DBs is the number of keys we have, according
            // to the mysql layer. if drop index is running concurrently
            // with info() (it can, because info does not take table locks),
            // then it could be the case that one of the dbs was dropped
            // and set to NULL before mysql was able to set table->s->keys
            // accordingly.
            //
            // we should just ignore any DB * that is NULL.
            //
            // this solution is much simpler than trying to maintain an
            // accurate number of valid keys at the handlerton layer.
            uint curr_num_DBs =
                table->s->keys + tokudb_test(hidden_primary_key);
            for (uint i = 0; i < curr_num_DBs; i++) {
                // skip the primary key, skip dropped indexes
                if (i == primary_key || share->key_file[i] == NULL) {
                    continue;
                }
                error = share->key_file[i]->stat64(
                    share->key_file[i], txn, &dict_stats);
                if (error) {
                    goto cleanup;
                }
                stats.index_file_length += dict_stats.bt_dsize;
                stats.delete_length +=
                    dict_stats.bt_fsize - dict_stats.bt_dsize;
            }
        }

        /*
        The following comment and logic were taken from InnoDB; an old
        hack that always forced stats.records > 0 has been removed.
        ---
        The MySQL optimizer seems to assume in a left join that n_rows
        is an accurate estimate if it is zero. Of course, it is not,
        since we do not have any locks on the rows yet at this phase.
        Since SHOW TABLE STATUS seems to call this function with the
        HA_STATUS_TIME flag set, while the left join optimizer does not
        set that flag, we add one to a zero value if the flag is not
        set. That way SHOW TABLE STATUS will show the best estimate,
        while the optimizer never sees the table empty. */
        if (stats.records == 0 && !(flag & HA_STATUS_TIME)) {
            stats.records++;
        }
    }
    if ((flag & HA_STATUS_CONST)) {
        stats.max_data_file_length = 9223372036854775807ULL;  // INT64_MAX
    }
    if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST)) {
        share->set_cardinality_counts_in_table(table);
    }

    /* Don't return key if we got an error for the internal primary key */
    if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys) {
        errkey = last_dup_key;
    }

    if (flag & HA_STATUS_AUTO && table->found_next_number_field) {
        THD* thd = table->in_use;
        struct system_variables* variables = &thd->variables;
        stats.auto_increment_value =
            share->last_auto_increment + variables->auto_increment_increment;
    }
    error = 0;
cleanup:
    if (txn != NULL) {
        commit_txn(txn, DB_TXN_NOSYNC);
        txn = NULL;
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
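
//
// Worked example (illustrative) of the hidden-primary-key adjustment made in
// info() above: assuming TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH is 8 bytes and
// stat64 reports bt_ndata == 1000 rows, hpk_space == 8000 bytes is subtracted
// from stats.data_file_length so the hidden key's storage is never reported
// to the user (clamped at 0 to avoid underflow).
//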

//
//  Per InnoDB: Tells something additional to the handler about how to do things.
//
int ha_tokudb::extra(enum ha_extra_function operation) {
    TOKUDB_HANDLER_DBUG_ENTER("%d", operation);
    switch (operation) {
    case HA_EXTRA_RESET_STATE:
        reset();
        break;
    case HA_EXTRA_KEYREAD:
        key_read = true;           // Query satisfied with key
        break;
    case HA_EXTRA_NO_KEYREAD:
        key_read = false;
        break;
    case HA_EXTRA_IGNORE_DUP_KEY:
        using_ignore = true;
        break;
    case HA_EXTRA_NO_IGNORE_DUP_KEY:
        using_ignore = false;
        break;
    case HA_EXTRA_IGNORE_NO_KEY:
        using_ignore_no_key = true;
        break;
    case HA_EXTRA_NO_IGNORE_NO_KEY:
        using_ignore_no_key = false;
        break;
    case HA_EXTRA_NOT_USED:
    case HA_EXTRA_PREPARE_FOR_RENAME:
        break; // must do nothing and return 0
    default:
        break;
    }
    TOKUDB_HANDLER_DBUG_RETURN(0);
}

int ha_tokudb::reset() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    key_read = false;
    using_ignore = false;
    using_ignore_no_key = false;
    reset_dsmrr();
    invalidate_icp();
    TOKUDB_HANDLER_DBUG_RETURN(0);
}

//
// helper function that iterates through all DB's
// and grabs a lock (either read or write, but not both)
// Parameters:
//      [in]    trans - transaction to be used to pre-acquire the lock
//              lt - type of lock to get, either lock_read or lock_write
//  Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %s", trans, lt == lock_read ? "r" : "w");
    int error = ENOSYS;
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
    }
    uint curr_num_DBs = share->num_DBs;
    if (lt == lock_read) {
        error = 0;
        goto cleanup;
    } else if (lt == lock_write) {
        for (uint i = 0; i < curr_num_DBs; i++) {
            DB* db = share->key_file[i];
            error = db->pre_acquire_table_lock(db, trans);
            if (error == EINVAL)
                TOKUDB_HANDLER_TRACE("%d db=%p trans=%p", i, db, trans);
            if (error) break;
        }
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
        if (error) goto cleanup;
    } else {
        error = ENOSYS;
        goto cleanup;
    }

    error = 0;
cleanup:
    if (!num_DBs_locked_in_bulk) {
        share->_num_DBs_lock.unlock();
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
    int error;
    ulong tx_isolation = thd_tx_isolation(thd);
    HA_TOKU_ISO_LEVEL toku_iso_level = tx_to_toku_iso(tx_isolation);
    bool is_autocommit = !thd_test_options(
            thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);

    /* First table lock, start transaction */
    if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) &&
         !trx->all &&
         (thd_sql_command(thd) != SQLCOM_CREATE_TABLE) &&
         (thd_sql_command(thd) != SQLCOM_DROP_TABLE) &&
         (thd_sql_command(thd) != SQLCOM_DROP_INDEX) &&
         (thd_sql_command(thd) != SQLCOM_CREATE_INDEX) &&
         (thd_sql_command(thd) != SQLCOM_ALTER_TABLE)) {
        /* QQQ We have to start a master transaction */
        // DBUG_PRINT("trans", ("starting transaction all "));
        uint32_t txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
#if 50614 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
        if (thd_tx_is_read_only(thd)) {
            txn_begin_flags |= DB_TXN_READ_ONLY;
        }
#endif
        if ((error = txn_begin(db_env, NULL, &trx->all, txn_begin_flags, thd))) {
            goto cleanup;
        }
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "created master %p",
            trx->all);
        trx->sp_level = trx->all;
        trans_register_ha(thd, true, tokudb_hton, 0);
    }
    DBUG_PRINT("trans", ("starting transaction stmt"));
    if (trx->stmt) {
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "warning:stmt=%p",
            trx->stmt);
    }
    uint32_t txn_begin_flags;
    if (trx->all == NULL) {
        txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
        //
        // if the isolation level that the user has set is serializable,
        // but autocommit is on and this is just a select,
        // then we can go ahead and set the isolation level to
        // be a snapshot read, because we can serialize
        // the transaction to be the point in time at which the snapshot began.
        //
        if (txn_begin_flags == 0 && is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT) {
            txn_begin_flags = DB_TXN_SNAPSHOT;
        }
        if (is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT &&
            !thd->in_sub_stmt && lock.type <= TL_READ_NO_INSERT &&
            !thd->lex->uses_stored_routines()) {
            txn_begin_flags |= DB_TXN_READ_ONLY;
        }
    } else {
        txn_begin_flags = DB_INHERIT_ISOLATION;
    }
    error = txn_begin(db_env, trx->sp_level, &trx->stmt, txn_begin_flags, thd);
    if (error) {
        /* We leave the possible master transaction open */
        goto cleanup;
    }
    trx->sub_sp_level = trx->stmt;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "created stmt %p sp_level %p",
        trx->sp_level,
        trx->stmt);
    reset_stmt_progress(&trx->stmt_progress);
    trans_register_ha(thd, false, tokudb_hton, 0);
cleanup:
    return error;
}
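
//
// Illustrative sketch (not compiled): the transaction nesting that
// create_txn() builds for a multi-statement transaction. The function and
// variable names are hypothetical; the flags mirror the calls above.
//
#if 0
static void example_txn_nesting(DB_ENV* env, THD* thd, HA_TOKU_ISO_LEVEL level) {
    DB_TXN* all = NULL;   // trx->all: master, spans the whole transaction
    DB_TXN* stmt = NULL;  // trx->stmt: per statement, child of the master
    txn_begin(env, NULL, &all, toku_iso_to_txn_flag(level), thd);
    txn_begin(env, all, &stmt, DB_INHERIT_ISOLATION, thd);
    // on statement success only the child commits; the master stays open
    commit_txn(stmt, 0);
}
#endif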

static const char *lock_type_str(int lock_type) {
    if (lock_type == F_RDLCK) return "F_RDLCK";
    if (lock_type == F_WRLCK) return "F_WRLCK";
    if (lock_type == F_UNLCK) return "F_UNLCK";
    return "?";
}

/*
  Since MySQL executes an external lock for every new table it uses,
  we can use this to start transactions.
  If we are in autocommit mode, we just need to start a statement
  transaction to be able to roll back the statement.
  If not, we have to start a master transaction if one does not
  already exist.
*/
//
// Parameters:
//      [in]    thd - handle to the user thread
//              lock_type - the type of lock
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::external_lock(THD * thd, int lock_type) {
    TOKUDB_HANDLER_DBUG_ENTER(
        "cmd %d lock %d %s %s",
        thd_sql_command(thd),
        lock_type,
        lock_type_str(lock_type),
        share->full_table_name());
    if (TOKUDB_UNLIKELY(!TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ENTER) &&
        TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_LOCK))) {
        TOKUDB_HANDLER_TRACE(
            "cmd %d lock %d %s %s",
            thd_sql_command(thd),
            lock_type,
            lock_type_str(lock_type),
            share->full_table_name());
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());

    int error = 0;
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    if (!trx) {
        error = create_tokudb_trx_data_instance(&trx);
        if (error) { goto cleanup; }
        thd_set_ha_data(thd, tokudb_hton, trx);
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "trx %p %p %p %p %u %u",
        trx->all,
        trx->stmt,
        trx->sp_level,
        trx->sub_sp_level,
        trx->tokudb_lock_count,
        trx->create_lock_count);

    if (trx->all == NULL) {
        trx->sp_level = NULL;
    }
    if (lock_type != F_UNLCK) {
        use_write_locks = false;
        if (lock_type == F_WRLCK) {
            use_write_locks = true;
        }
        if (!trx->stmt) {
            transaction = NULL;    // Safety
            error = create_txn(thd, trx);
            if (error) {
                goto cleanup;
            }
            trx->create_lock_count = trx->tokudb_lock_count;
        }
        transaction = trx->sub_sp_level;
        trx->tokudb_lock_count++;
    } else {
        share->update_row_count(thd, added_rows, deleted_rows, updated_rows);
        added_rows = 0;
        deleted_rows = 0;
        updated_rows = 0;
        share->rows_from_locked_table = 0;
        if (trx->tokudb_lock_count > 0) {
            if (--trx->tokudb_lock_count <= trx->create_lock_count) {
                trx->create_lock_count = 0;
                if (trx->stmt) {
                    /*
                      F_UNLCK is done without a transaction commit / rollback.
                      This happens if the thread didn't update any rows.
                      We must commit the work in this case to keep the row locks.
                    */
                    DBUG_PRINT("trans", ("committing non-updating transaction"));
                    reset_stmt_progress(&trx->stmt_progress);
                    commit_txn(trx->stmt, 0);
                    trx->stmt = NULL;
                    trx->sub_sp_level = NULL;
                }
            }
            transaction = NULL;
        }
    }
cleanup:
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

/*
  When using LOCK TABLES, external_lock() is only called when the actual
  table lock is taken.
  Under LOCK TABLES, each used table will force a call to start_stmt.
*/
int ha_tokudb::start_stmt(THD* thd, thr_lock_type lock_type) {
    TOKUDB_HANDLER_DBUG_ENTER(
        "cmd %d lock %d %s",
        thd_sql_command(thd),
        lock_type,
        share->full_table_name());

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());

    int error = 0;
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    if (!trx) {
        error = create_tokudb_trx_data_instance(&trx);
        if (error) { goto cleanup; }
        thd_set_ha_data(thd, tokudb_hton, trx);
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "trx %p %p %p %p %u %u",
        trx->all,
        trx->stmt,
        trx->sp_level,
        trx->sub_sp_level,
        trx->tokudb_lock_count,
        trx->create_lock_count);

    /*
       note that trx->stmt may already have been initialized, as start_stmt()
       is called for *each table*, not for each storage engine, and there
       could be many TokuDB tables referenced in the query
     */
    if (!trx->stmt) {
        error = create_txn(thd, trx);
        if (error) {
            goto cleanup;
        }
        trx->create_lock_count = trx->tokudb_lock_count;
    } else {
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "trx->stmt %p already existed",
            trx->stmt);
    }
    if (added_rows > deleted_rows) {
        share->rows_from_locked_table = added_rows - deleted_rows;
    }
    transaction = trx->sub_sp_level;
    trans_register_ha(thd, false, tokudb_hton, 0);
cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}


uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd) {
    uint sql_command = thd_sql_command(thd);
    bool in_lock_tables = thd_in_lock_tables(thd);

    //
    // following InnoDB's lead: the CHECKSUM command uses a snapshot read
    //
    if (sql_command == SQLCOM_CHECKSUM) {
        return 0;
    }
    else if ((lock_type == TL_READ && in_lock_tables) ||
             (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
             sql_command != SQLCOM_SELECT ||
             (sql_command == SQLCOM_SELECT && lock_type >= TL_WRITE_ALLOW_WRITE)) { // select for update
        ulong tx_isolation = thd_tx_isolation(thd);
        // pattern matched from InnoDB
        if ((tx_isolation == ISO_READ_COMMITTED || tx_isolation == ISO_READ_UNCOMMITTED) &&
            (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) &&
            (sql_command == SQLCOM_INSERT_SELECT ||
             sql_command == SQLCOM_REPLACE_SELECT ||
             sql_command == SQLCOM_UPDATE ||
             sql_command == SQLCOM_CREATE_TABLE)) {
            return 0;
        }
        else {
            return DB_SERIALIZABLE;
        }
    }
    else {
        return 0;
    }
}
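
//
// Summary (for reference) of the isolation decision above:
//
//   CHECKSUM                                   -> 0 (snapshot read)
//   plain SELECT outside LOCK TABLES           -> 0 (snapshot read)
//   writes, SELECT ... FOR UPDATE, and reads under LOCK TABLES:
//     READ COMMITTED/UNCOMMITTED with TL_READ/TL_READ_NO_INSERT on
//     INSERT..SELECT, REPLACE..SELECT, UPDATE, or CREATE TABLE  -> 0
//     everything else                                           -> DB_SERIALIZABLE
//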

/*
  The idea with handler::store_lock() is the following:

  The statement decides which locks we should need for the table;
  for updates/deletes/inserts we get WRITE locks, for SELECT... we get
  read locks.

  Before adding the lock into the table lock handler (see thr_lock.c)
  mysqld calls store_lock with the requested locks.  Store lock can now
  modify a write lock to a read lock (or some other lock), ignore the
  lock (if we don't want to use MySQL table locks at all) or add locks
  for many tables (like we do when we are using a MERGE handler).

  TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which
  signals that we are doing WRITES, but we are still allowing other
  readers and writers).

  When releasing locks, store_lock() is also called. In this case one
  usually doesn't have to do anything.

  In some exceptional cases MySQL may send a request for a TL_IGNORE.
  This means that we are requesting the same lock as last time and this
  should also be ignored. (This may happen when someone does a flush
  table when we have opened a part of the tables, in which case mysqld
  closes and reopens the tables and tries to get the same locks as last
  time.) In the future we will probably try to remove this.
*/

THR_LOCK_DATA* *ha_tokudb::store_lock(
    THD* thd,
    THR_LOCK_DATA** to,
    enum thr_lock_type lock_type) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "lock_type=%d cmd=%d",
        lock_type,
        thd_sql_command(thd));
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_LOCK,
        "lock_type=%d cmd=%d",
        lock_type,
        thd_sql_command(thd));

    if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
        enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd);
        if (!thd->in_lock_tables) {
            if (sql_command == SQLCOM_CREATE_INDEX &&
                tokudb::sysvars::create_index_online(thd)) {
                // hot indexing
                rwlock_t_lock_read(share->_num_DBs_lock);
                if (share->num_DBs ==
                    (table->s->keys + tokudb_test(hidden_primary_key))) {
                    lock_type = TL_WRITE_ALLOW_WRITE;
                }
                share->_num_DBs_lock.unlock();
            } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
                        lock_type <= TL_WRITE) &&
                        sql_command != SQLCOM_TRUNCATE &&
                        !thd_tablespace_op(thd)) {
                // allow concurrent writes
                lock_type = TL_WRITE_ALLOW_WRITE;
            } else if (sql_command == SQLCOM_OPTIMIZE &&
                       lock_type == TL_READ_NO_INSERT) {
                // hot optimize table
                lock_type = TL_READ;
            }
        }
        lock.type = lock_type;
    }
    *to++ = &lock;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_LOCK,
        "lock_type=%d",
        lock_type);
    TOKUDB_HANDLER_DBUG_RETURN_PTR(to);
}

static toku_compression_method get_compression_method(DB* file) {
    enum toku_compression_method method;
    int r = file->get_compression_method(file, &method);
    assert_always(r == 0);
    return method;
}

#if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
    TOKU_INCLUDE_ROW_TYPE_COMPRESSION
enum row_type ha_tokudb::get_row_type() const {
    toku_compression_method compression_method = get_compression_method(share->file);
    return toku_compression_method_to_row_type(compression_method);
}
#endif  // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
        // TOKU_INCLUDE_ROW_TYPE_COMPRESSION

static int create_sub_table(
    const char* table_name,
    DBT* row_descriptor,
    DB_TXN* txn,
    uint32_t block_size,
    uint32_t read_block_size,
    toku_compression_method compression_method,
    bool is_hot_index,
    uint32_t fanout) {

    TOKUDB_DBUG_ENTER("");
    int error;
    DB *file = NULL;
    uint32_t create_flags;


    error = db_create(&file, db_env, 0);
    if (error) {
        DBUG_PRINT("error", ("Got error: %d when creating table", error));
        my_errno = error;
        goto exit;
    }


    if (block_size != 0) {
        error = file->set_pagesize(file, block_size);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting block size %u for table '%s'",
                    error,
                    block_size,
                    table_name));
            goto exit;
        }
    }
    if (read_block_size != 0) {
        error = file->set_readpagesize(file, read_block_size);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting read block size %u for table '%s'",
                    error,
                    read_block_size,
                    table_name));
            goto exit;
        }
    }
    if (fanout != 0) {
        error = file->set_fanout(file, fanout);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting fanout %u for table '%s'",
                    error,
                    fanout,
                    table_name));
            goto exit;
        }
    }
    error = file->set_compression_method(file, compression_method);
    if (error != 0) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when setting compression type %u for table '%s'",
                error,
                compression_method,
                table_name));
        goto exit;
    }

    create_flags =
        DB_THREAD | DB_CREATE | DB_EXCL | (is_hot_index ? DB_IS_HOT_INDEX : 0);
    error =
        file->open(
            file,
            txn,
            table_name,
            NULL,
            DB_BTREE,
            create_flags,
            my_umask);
    if (error) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when opening table '%s'", error, table_name));
        goto exit;
    }

    error =
        file->change_descriptor(
            file,
            txn,
            row_descriptor,
            (is_hot_index ? DB_IS_HOT_INDEX |
                DB_UPDATE_CMP_DESCRIPTOR :
                DB_UPDATE_CMP_DESCRIPTOR));
    if (error) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when setting row descriptor for table '%s'",
                error,
                table_name));
        goto exit;
    }

    error = 0;
exit:
    if (file) {
        int r = file->close(file, 0);
        assert_always(r == 0);
    }
    TOKUDB_DBUG_RETURN(error);
}

void ha_tokudb::update_create_info(HA_CREATE_INFO* create_info) {
    if (share->has_auto_inc) {
        info(HA_STATUS_AUTO);
        if (!(create_info->used_fields & HA_CREATE_USED_AUTO) ||
            create_info->auto_increment_value < stats.auto_increment_value) {
            create_info->auto_increment_value = stats.auto_increment_value;
        }
    }
#if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
    TOKU_INCLUDE_ROW_TYPE_COMPRESSION
    if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
        // SHOW CREATE TABLE asks us to update this create_info; this way
        // we always show which compression type we're using
        create_info->row_type = get_row_type();
        if (create_info->row_type == ROW_TYPE_TOKU_ZLIB &&
            tokudb::sysvars::hide_default_row_format(ha_thd()) != 0) {
            create_info->row_type = ROW_TYPE_DEFAULT;
        }
    }
#endif  // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
        // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
}

//
// Removes a key name from status.tokudb. Needed when dropping an index,
// so that a later drop table does not attempt to remove an already-dropped
// index; this keeps status.tokudb in sync with the table's list of indexes.
//
int ha_tokudb::remove_key_name_from_status(DB* status_block, const char* key_name, DB_TXN* txn) {
    int error;
    uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
    HA_METADATA_KEY md_key = hatoku_key_name;
    memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
    //
    // put index name in status.tokudb
    //
    memcpy(
        status_key_info + sizeof(HA_METADATA_KEY),
        key_name,
        strlen(key_name) + 1
        );
    error = remove_metadata(
        status_block,
        status_key_info,
        sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
        txn
        );
    return error;
}

//
// writes the key name in status.tokudb, so that we may later delete or rename
// the dictionary associated with key_name
//
int ha_tokudb::write_key_name_to_status(
    DB* status_block,
    const char* key_name,
    DB_TXN* txn) {

    int error;
    uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
    HA_METADATA_KEY md_key = hatoku_key_name;
    memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
    //
    // put index name in status.tokudb
    //
    memcpy(
        status_key_info + sizeof(HA_METADATA_KEY),
        key_name,
        strlen(key_name) + 1
        );
    error = write_metadata(
        status_block,
        status_key_info,
        sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
        NULL,
        0,
        txn
        );
    return error;
}
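
//
// Illustrative layout of the record that the two functions above write to and
// remove from status.tokudb:
//
//   key:   [HA_METADATA_KEY = hatoku_key_name][key_name bytes][\0]
//   value: empty (0 bytes)
//
// delete_or_rename_table() later scans these records to find every
// secondary-index dictionary belonging to the table.
//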

//
// some tracing moved out of ha_tokudb::create, because ::create was
// getting cluttered
//
void ha_tokudb::trace_create_table_info(TABLE* form) {
    uint i;
    //
    // tracing information about what type of table we are creating
    //
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_OPEN))) {
        for (i = 0; i < form->s->fields; i++) {
            Field *field = form->s->field[i];
            TOKUDB_HANDLER_TRACE(
                "field:%d:%s:type=%d:flags=%x",
                i,
                field->field_name.str,
                field->type(),
                field->flags);
        }
        for (i = 0; i < form->s->keys; i++) {
            KEY *key = &form->key_info[i];
            TOKUDB_HANDLER_TRACE(
                "key:%d:%s:%d",
                i,
                key->name.str,
                key->user_defined_key_parts);
            uint p;
            for (p = 0; p < key->user_defined_key_parts; p++) {
                KEY_PART_INFO* key_part = &key->key_part[p];
                Field* field = key_part->field;
                TOKUDB_HANDLER_TRACE(
                    "key:%d:%d:length=%d:%s:type=%d:flags=%x",
                    i,
                    p,
                    key_part->length,
                    field->field_name.str,
                    field->type(),
                    field->flags);
            }
        }
    }
}

static uint32_t get_max_desc_size(KEY_AND_COL_INFO* kc_info, TABLE* form) {
    uint32_t max_row_desc_buff_size;
    // upper bound of key comparison descriptor
    max_row_desc_buff_size = 2*(form->s->fields * 6)+10;
    // upper bound for sec. key part
    max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info);
    // upper bound for clustering val part
    max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s);
    return max_row_desc_buff_size;
}

static uint32_t create_secondary_key_descriptor(
    uchar* buf,
    KEY* key_info,
    KEY* prim_key,
    uint hpk,
    TABLE* form,
    uint primary_key,
    uint32_t keynr,
    KEY_AND_COL_INFO* kc_info) {

    uchar* ptr = NULL;

    ptr = buf;
    ptr += create_toku_key_descriptor(
        ptr,
        false,
        key_info,
        hpk,
        prim_key
        );

    ptr += create_toku_secondary_key_pack_descriptor(
        ptr,
        hpk,
        primary_key,
        form->s,
        form,
        kc_info,
        key_info,
        prim_key
        );

    ptr += create_toku_clustering_val_pack_descriptor(
        ptr,
        primary_key,
        form->s,
        kc_info,
        keynr,
        key_is_clustering(key_info)
        );
    return ptr - buf;
}
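
//
// Illustrative layout of the secondary-index row descriptor assembled above,
// in order:
//
//   [key comparison descriptor]        create_toku_key_descriptor()
//   [secondary key pack descriptor]    create_toku_secondary_key_pack_descriptor()
//   [clustering val pack descriptor]   create_toku_clustering_val_pack_descriptor()
//
// get_max_desc_size() computes an upper bound on the total size of these parts.
//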


//
// creates dictionary for secondary index, with key description key_info, all using txn
//
int ha_tokudb::create_secondary_dictionary(
    const char* name,
    TABLE* form,
    KEY* key_info,
    DB_TXN* txn,
    KEY_AND_COL_INFO* kc_info,
    uint32_t keynr,
    bool is_hot_index,
    toku_compression_method compression_method) {

    int error;
    DBT row_descriptor;
    uchar* row_desc_buff = NULL;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY* prim_key = NULL;
    char dict_name[MAX_DICT_NAME_LEN];
    uint32_t max_row_desc_buff_size;
    uint hpk = (form->s->primary_key >= MAX_KEY) ?
        TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    uint32_t block_size;
    uint32_t read_block_size;
    uint32_t fanout;
    THD* thd = ha_thd();

    memset(&row_descriptor, 0, sizeof(row_descriptor));

    max_row_desc_buff_size = get_max_desc_size(kc_info, form);

    row_desc_buff = (uchar*)tokudb::memory::malloc(
        max_row_desc_buff_size,
        MYF(MY_WME));
    if (row_desc_buff == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    sprintf(dict_name, "key-%s", key_info->name.str);
    make_name(newname, newname_len, name, dict_name);

    prim_key = (hpk) ? NULL : &form->key_info[primary_key];

    //
    // setup the row descriptor
    //
    row_descriptor.data = row_desc_buff;
    //
    // save data necessary for key comparisons
    //
    row_descriptor.size = create_secondary_key_descriptor(
        row_desc_buff,
        key_info,
        prim_key,
        hpk,
        form,
        primary_key,
        keynr,
        kc_info);
    assert_always(row_descriptor.size <= max_row_desc_buff_size);

    block_size = tokudb::sysvars::block_size(thd);
    read_block_size = tokudb::sysvars::read_block_size(thd);
    fanout = tokudb::sysvars::fanout(thd);

    error = create_sub_table(
        newname,
        &row_descriptor,
        txn,
        block_size,
        read_block_size,
        compression_method,
        is_hot_index,
        fanout);
cleanup:
    tokudb::memory::free(newname);
    tokudb::memory::free(row_desc_buff);
    return error;
}


static uint32_t create_main_key_descriptor(
    uchar* buf,
    KEY* prim_key,
    uint hpk,
    uint primary_key,
    TABLE* form,
    KEY_AND_COL_INFO* kc_info) {

    uchar* ptr = buf;
    ptr += create_toku_key_descriptor(
        ptr,
        hpk,
        prim_key,
        false,
        NULL);

    ptr += create_toku_main_key_pack_descriptor(ptr);

    ptr += create_toku_clustering_val_pack_descriptor(
        ptr,
        primary_key,
        form->s,
        kc_info,
        primary_key,
        false);
    return ptr - buf;
}

//
// creates and closes the main dictionary, named "name", using table form,
// all within transaction txn.
//
int ha_tokudb::create_main_dictionary(
    const char* name,
    TABLE* form,
    DB_TXN* txn,
    KEY_AND_COL_INFO* kc_info,
    toku_compression_method compression_method) {

    int error;
    DBT row_descriptor;
    uchar* row_desc_buff = NULL;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY* prim_key = NULL;
    uint32_t max_row_desc_buff_size;
    uint hpk = (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    uint32_t block_size;
    uint32_t read_block_size;
    uint32_t fanout;
    THD* thd = ha_thd();

    memset(&row_descriptor, 0, sizeof(row_descriptor));
    max_row_desc_buff_size = get_max_desc_size(kc_info, form);

    row_desc_buff = (uchar*)tokudb::memory::malloc(
        max_row_desc_buff_size,
        MYF(MY_WME));
    if (row_desc_buff == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    make_name(newname, newname_len, name, "main");

    prim_key = (hpk) ? NULL : &form->key_info[primary_key];

    //
    // setup the row descriptor
    //
    row_descriptor.data = row_desc_buff;
    //
    // save data necessary for key comparisons
    //
    row_descriptor.size = create_main_key_descriptor(
        row_desc_buff,
        prim_key,
        hpk,
        primary_key,
        form,
        kc_info);
    assert_always(row_descriptor.size <= max_row_desc_buff_size);

    block_size = tokudb::sysvars::block_size(thd);
    read_block_size = tokudb::sysvars::read_block_size(thd);
    fanout = tokudb::sysvars::fanout(thd);

    /* Create the main table that will hold the real rows */
    error = create_sub_table(
        newname,
        &row_descriptor,
        txn,
        block_size,
        read_block_size,
        compression_method,
        false,
        fanout);
cleanup:
    tokudb::memory::free(newname);
    tokudb::memory::free(row_desc_buff);
    return error;
}

//
// Creates a new table
// Parameters:
//      [in]    name - table name
//      [in]    form - info on table, columns and indexes
//      [in]    create_info - more info on table, CURRENTLY UNUSED
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::create(
    const char* name,
    TABLE* form,
    HA_CREATE_INFO* create_info) {

    TOKUDB_HANDLER_DBUG_ENTER("%s", name);

    // initialized to 0: the write_frm_image() call below is conditionally
    // compiled, and error is tested unconditionally afterwards
    int error = 0;
    DB *status_block = NULL;
    uint version;
    uint capabilities;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY_AND_COL_INFO kc_info;
    tokudb_trx_data *trx = NULL;
    THD* thd = ha_thd();

    String database_name, table_name, dictionary_name;
    tokudb_split_dname(name, database_name, table_name, dictionary_name);
    if (database_name.is_empty() || table_name.is_empty()) {
        push_warning_printf(thd,
                            Sql_condition::WARN_LEVEL_WARN,
                            ER_TABLE_NAME,
                            "TokuDB: Table Name or Database Name is empty");
        DBUG_RETURN(ER_TABLE_NAME);
    }

    memset(&kc_info, 0, sizeof(kc_info));

#if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100999
    // TokuDB does not support discover_table_names() and writes no files
    // in the database directory, so automatic filename-based
    // discover_table_names() doesn't work either. So we must force the
    // .frm file to disk.
    error = form->s->write_frm_image();
#endif

#if defined(TOKU_INCLUDE_OPTION_STRUCTS) && TOKU_INCLUDE_OPTION_STRUCTS
    const tokudb::sysvars::row_format_t row_format =
        (tokudb::sysvars::row_format_t)form->s->option_struct->row_format;
#else
    // TDB-76 : CREATE TABLE ... LIKE ... does not use source row_format on
    //          target table
    // Original code would only use create_info->row_type if
    // create_info->used_fields & HA_CREATE_USED_ROW_FORMAT was true. This
    // would cause us to skip transferring the row_format for a table created
    // via CREATE TABLE tn LIKE tn. We also take on more InnoDB-like behavior
    // and throw a warning if we get a row_format that we can't translate into
    // a known TokuDB row_format.
    tokudb::sysvars::row_format_t row_format =
        tokudb::sysvars::row_format(thd);

    if ((create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) ||
        create_info->row_type != ROW_TYPE_DEFAULT) {
        row_format = row_type_to_row_format(create_info->row_type);
        if (row_format == tokudb::sysvars::SRV_ROW_FORMAT_DEFAULT &&
            create_info->row_type != ROW_TYPE_DEFAULT) {
            push_warning(thd,
                         Sql_condition::WARN_LEVEL_WARN,
                         ER_ILLEGAL_HA_CREATE_OPTION,
                         "TokuDB: invalid ROW_FORMAT specifier.");
        }
    }
#endif  // defined(TOKU_INCLUDE_OPTION_STRUCTS) && TOKU_INCLUDE_OPTION_STRUCTS
    const toku_compression_method compression_method =
        row_format_to_toku_compression_method(row_format);
    bool create_from_engine = (create_info->table_options & HA_OPTION_CREATE_FROM_ENGINE);
    if (error) { goto cleanup; }
    if (create_from_engine) {
        // table already exists, nothing to do
        error = 0;
        goto cleanup;
    }

    // validate the fields in the table. If the table has fields
    // we do not support that came from an old version of MySQL,
    // gracefully return an error
    for (uint32_t i = 0; i < form->s->fields; i++) {
        Field* field = table_share->field[i];
        if (!field_valid_for_tokudb_table(field)) {
            sql_print_error("Table %s has an invalid field %s that was created "
                "with an old version of MySQL. This field is no longer supported. "
                "This is probably due to an ALTER TABLE ... ENGINE=TokuDB. To load "
                "this table, do a dump and load",
                name,
                field->field_name.str
                );
            error = HA_ERR_UNSUPPORTED;
            goto cleanup;
        }
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
    if (trx && trx->sub_sp_level &&
        thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
        txn = trx->sub_sp_level;
    } else {
        do_commit = true;
        error = txn_begin(db_env, 0, &txn, 0, thd);
        if (error) {
            goto cleanup;
        }
    }

    primary_key = form->s->primary_key;
    hidden_primary_key = (primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    if (hidden_primary_key) {
        primary_key = form->s->keys;
    }

    /* do some tracing */
    trace_create_table_info(form);

    /* Create status.tokudb and save relevant metadata */
    make_name(newname, newname_len, name, "status");

    error = tokudb::metadata::create(db_env, &status_block, newname, txn);
    if (error) { goto cleanup; }

    version = HA_TOKU_VERSION;
    error = write_to_status(
        status_block,
        hatoku_new_version,
        &version,
        sizeof(version),
        txn);
    if (error) {
        goto cleanup;
    }

    capabilities = HA_TOKU_CAP;
    error = write_to_status(
        status_block,
        hatoku_capabilities,
        &capabilities,
        sizeof(capabilities),
        txn);
    if (error) {
        goto cleanup;
    }

    error = write_auto_inc_create(
        status_block,
        create_info->auto_increment_value,
        txn);
    if (error) {
        goto cleanup;
    }

#if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
#if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
    if (TOKU_PARTITION_WRITE_FRM_DATA || form->part_info == NULL) {
        error = write_frm_data(status_block, txn, form->s->path.str);
        if (error) {
            goto cleanup;
        }
    }
#else
    error = write_frm_data(status_block, txn, form->s->path.str);
    if (error) {
        goto cleanup;
    }
#endif  // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
#endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA

    error = allocate_key_and_col_info(form->s, &kc_info);
    if (error) {
        goto cleanup;
    }

    error = initialize_key_and_col_info(
        form->s,
        form,
        &kc_info,
        hidden_primary_key,
        primary_key);
    if (error) {
        goto cleanup;
    }

    error = create_main_dictionary(
        name,
        form,
        txn,
        &kc_info,
        compression_method);
    if (error) {
        goto cleanup;
    }


    for (uint i = 0; i < form->s->keys; i++) {
        if (i != primary_key) {
            error = create_secondary_dictionary(
                name,
                form,
                &form->key_info[i],
                txn,
                &kc_info,
                i,
                false,
                compression_method);
            if (error) {
                goto cleanup;
            }

            error = write_key_name_to_status(
                status_block,
                form->key_info[i].name.str,
                txn);
            if (error) {
                goto cleanup;
            }
        }
    }

    error = 0;
cleanup:
    if (status_block != NULL) {
        int r = tokudb::metadata::close(&status_block);
        assert_always(r == 0);
    }
    free_key_and_col_info(&kc_info);
    if (do_commit && txn) {
        if (error) {
            abort_txn(txn);
        } else {
            commit_txn(txn, 0);
        }
    }
    tokudb::memory::free(newname);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

int ha_tokudb::discard_or_import_tablespace(TOKUDB_UNUSED(my_bool discard)) {
    /*
    if (discard) {
        my_errno = HA_ERR_WRONG_COMMAND;
        return my_errno;
    }
    return add_table_to_metadata(share->table_name);
    */
    my_errno = HA_ERR_WRONG_COMMAND;
    return my_errno;
}


//
// deletes from_name or renames from_name to to_name, all using transaction
// txn. is_delete specifies which of the two we are doing. is_key specifies
// whether it is a secondary index (and hence a "key-" prefix must be
// prepended to secondary_name) or not.
//
int ha_tokudb::delete_or_rename_dictionary(
    const char* from_name,
    const char* to_name,
    const char* secondary_name,
    bool is_key,
    DB_TXN* txn,
    bool is_delete) {

    int error;
    char dict_name[MAX_DICT_NAME_LEN];
    char* new_from_name = NULL;
    size_t new_from_name_len = 0;
    char* new_to_name = NULL;
    size_t new_to_name_len = 0;
    assert_always(txn);

    new_from_name_len = get_max_dict_name_path_length(from_name);
    new_from_name = (char*)tokudb::memory::malloc(
        new_from_name_len,
        MYF(MY_WME));
    if (new_from_name == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    if (!is_delete) {
        assert_always(to_name);
        new_to_name_len = get_max_dict_name_path_length(to_name);
        new_to_name = (char*)tokudb::memory::malloc(
            new_to_name_len,
            MYF(MY_WME));
        if (new_to_name == NULL) {
            error = ENOMEM;
            goto cleanup;
        }
    }

    if (is_key) {
        sprintf(dict_name, "key-%s", secondary_name);
        make_name(new_from_name, new_from_name_len, from_name, dict_name);
    } else {
        make_name(new_from_name, new_from_name_len, from_name, secondary_name);
    }
    if (!is_delete) {
        if (is_key) {
            sprintf(dict_name, "key-%s", secondary_name);
            make_name(new_to_name, new_to_name_len, to_name, dict_name);
        } else {
            make_name(new_to_name, new_to_name_len, to_name, secondary_name);
        }
    }

    if (is_delete) {
        error = db_env->dbremove(db_env, txn, new_from_name, NULL, 0);
    } else {
        error = db_env->dbrename(
            db_env,
            txn,
            new_from_name,
            NULL,
            new_to_name,
            0);
    }

cleanup:
    tokudb::memory::free(new_from_name);
    tokudb::memory::free(new_to_name);
    return error;
}
7566 
7567 
7568 //
7569 // deletes or renames a table. if is_delete is true, then we delete, and to_name can be NULL
7570 // if is_delete is false, then to_name must be non-NULL, as we are renaming the table.
7571 //
7572 int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete) {
7573     THD *thd = ha_thd();
7574     int error;
7575     DB* status_db = NULL;
7576     DBC* status_cursor = NULL;
7577     DB_TXN* txn = NULL;
7578     DBT curr_key;
7579     DBT curr_val;
7580     memset(&curr_key, 0, sizeof(curr_key));
7581     memset(&curr_val, 0, sizeof(curr_val));
7582 
7583     DB_TXN *parent_txn = NULL;
7584     tokudb_trx_data *trx = NULL;
7585     trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
7586     if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
7587         parent_txn = trx->sub_sp_level;
7588     }
7589 
7590     error = txn_begin(db_env, parent_txn, &txn, 0, thd);
7591     if (error) { goto cleanup; }
7592 
7593     //
7594     // open status db,
7595     // create cursor,
7596     // for each name read out of there, create a db and delete or rename it
7597     //
7598     error = open_status_dictionary(&status_db, from_name, txn);
7599     if (error) { goto cleanup; }
7600 
7601     error = status_db->cursor(status_db, txn, &status_cursor, 0);
7602     if (error) { goto cleanup; }
7603     status_cursor->c_set_check_interrupt_callback(status_cursor, tokudb_killed_thd_callback, thd);
7604 
7605     while (error != DB_NOTFOUND) {
7606         error = status_cursor->c_get(status_cursor, &curr_key, &curr_val, DB_NEXT);
7607         if (error && error != DB_NOTFOUND) {
7608             error = map_to_handler_error(error);
7609             goto cleanup;
7610         }
7611         if (error == DB_NOTFOUND) {
7612             break;
7613         }
7614         HA_METADATA_KEY mk = *(HA_METADATA_KEY *)curr_key.data;
7615         if (mk != hatoku_key_name) {
7616             continue;
7617         }
7618         error = delete_or_rename_dictionary(from_name, to_name, (char *)((char *)curr_key.data + sizeof(HA_METADATA_KEY)), true, txn, is_delete);
7619         if (error) { goto cleanup; }
7620     }
7621 
7622     //
7623     // delete or rename main.tokudb
7624     //
7625     error = delete_or_rename_dictionary(from_name, to_name, "main", false, txn, is_delete);
7626     if (error) { goto cleanup; }
7627 
    error = status_cursor->c_close(status_cursor);
    assert_always(error == 0);
    status_cursor = NULL;
7632 
7633     error = status_db->close(status_db, 0);
7634     assert_always(error == 0);
7635     status_db = NULL;
7636 
7637     //
7638     // delete or rename status.tokudb
7639     //
7640     error = delete_or_rename_dictionary(from_name, to_name, "status", false, txn, is_delete);
7641     if (error) { goto cleanup; }
7642 
7643     my_errno = error;
7644 cleanup:
7645     if (status_cursor) {
7646         int r = status_cursor->c_close(status_cursor);
7647         assert_always(r==0);
7648     }
7649     if (status_db) {
7650         int r = status_db->close(status_db, 0);
7651         assert_always(r==0);
7652     }
7653     if (txn) {
7654         if (error) {
7655             abort_txn(txn);
7656         }
7657         else {
7658             commit_txn(txn, 0);
7659         }
7660     }
7661     return error;
7662 }
7663 
7664 
7665 //
7666 // Drops table
7667 // Parameters:
7668 //      [in]    name - name of table to be deleted
7669 // Returns:
7670 //      0 on success
7671 //      error otherwise
7672 //
7673 int ha_tokudb::delete_table(const char *name) {
7674     TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7675     TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(name, NULL, false);
7676     if (share) {
7677         share->unlock();
7678         share->release();
7679         // this should be enough to handle locking as the higher level MDL
7680         // on this table should prevent any new analyze tasks.
7681         share->cancel_background_jobs();
7682         TOKUDB_SHARE::drop_share(share);
7683     }
7684 
7685     int error;
7686     error = delete_or_rename_table(name, NULL, true);
7687     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7688         error == DB_LOCK_NOTGRANTED) {
7689         sql_print_error(
7690             "Could not delete table %s because another transaction has "
7691             "accessed the table. To drop the table, make sure no "
7692             "transactions touch the table.",
7693             name);
7694     }
7695     TOKUDB_HANDLER_DBUG_RETURN(error);
7696 }
7697 
7698 static bool tokudb_check_db_dir_exist_from_table_name(const char *table_name) {
7699     DBUG_ASSERT(table_name);
7700     bool mysql_dir_exists;
7701     char db_name[FN_REFLEN];
7702     const char *db_name_begin = strchr(table_name, FN_LIBCHAR);
7703     const char *db_name_end = strrchr(table_name, FN_LIBCHAR);
7704     DBUG_ASSERT(db_name_begin);
7705     DBUG_ASSERT(db_name_end);
7706     DBUG_ASSERT(db_name_begin != db_name_end);
7707 
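    // e.g. for a table_name like "./db1/t1" (with FN_LIBCHAR as the path
    // separator), db_name_begin ends up just past the first separator and
    // db_name_end at the last one, so db_name becomes "db1"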
7708     ++db_name_begin;
7709     size_t db_name_size = db_name_end - db_name_begin;
7710 
7711     DBUG_ASSERT(db_name_size < FN_REFLEN);
7712 
7713     memcpy(db_name, db_name_begin, db_name_size);
7714     db_name[db_name_size] = '\0';
7715 
    // At this point, db_name contains the MySQL formatted database name.
    // This is exactly the same format that would come into us through a
    // CREATE TABLE. Some characters (like ':' for example) might be expanded
    // into hex (':' would appear as "@003a").
7720     // We need to check that the MySQL destination database directory exists.
7721     mysql_dir_exists = (my_access(db_name, F_OK) == 0);
7722 
7723     return mysql_dir_exists;
7724 }
7725 
7726 //
7727 // renames table from "from" to "to"
7728 // Parameters:
//      [in]    from - old name of table
7730 //      [in]    to - new name of table
7731 // Returns:
7732 //      0 on success
7733 //      error otherwise
7734 //
7735 int ha_tokudb::rename_table(const char *from, const char *to) {
7736     TOKUDB_HANDLER_DBUG_ENTER("%s %s", from, to);
7737     TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(from, NULL, false);
7738     if (share) {
7739         share->unlock();
7740         share->release();
7741         // this should be enough to handle locking as the higher level MDL
7742         // on this table should prevent any new analyze tasks.
7743         share->cancel_background_jobs();
7744         TOKUDB_SHARE::drop_share(share);
7745     }
7746     int error;
7747     bool to_db_dir_exist = tokudb_check_db_dir_exist_from_table_name(to);
7748     if (!to_db_dir_exist) {
7749         sql_print_error(
7750             "Could not rename table from %s to %s because "
7751             "destination db does not exist",
7752             from,
7753             to);
7754 #ifndef __WIN__
7755         /* Small hack. tokudb_check_db_dir_exist_from_table_name calls
7756          * my_access, which sets my_errno on Windows, but doesn't on
7757          * unix. Set it for unix too.
7758          */
7759         my_errno= errno;
7760 #endif
7761         error= my_errno;
7762     }
7763     else {
7764         error = delete_or_rename_table(from, to, false);
7765         if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7766             error == DB_LOCK_NOTGRANTED) {
7767             sql_print_error(
7768                 "Could not rename table from %s to %s because another transaction "
7769                 "has accessed the table. To rename the table, make sure no "
7770                 "transactions touch the table.",
7771                 from,
7772                 to);
7773         }
7774     }
7775     TOKUDB_HANDLER_DBUG_RETURN(error);
7776 }
7777 
7778 
7779 /*
7780   Returns estimate on number of seeks it will take to read through the table
7781   This is to be comparable to the number returned by records_in_range so
7782   that we can decide if we should scan the table or use keys.
7783 */
7784 /// QQQ why divide by 3
7785 double ha_tokudb::scan_time() {
7786     TOKUDB_HANDLER_DBUG_ENTER("");
7787     double ret_val = (double)stats.records / 3;
7788     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7789         TOKUDB_DEBUG_RETURN,
7790         "return %" PRIu64 " %f",
7791         (uint64_t)stats.records,
7792         ret_val);
7793     DBUG_RETURN(ret_val);
7794 }
7795 
7796 double ha_tokudb::keyread_time(uint index, uint ranges, ha_rows rows)
7797 {
7798     TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7799     double cost;
7800     if (index == primary_key || is_clustering_key(index)) {
7801         cost = read_time(index, ranges, rows);
7802         DBUG_RETURN(cost);
7803     }
    /*
      It is assumed that we will read through the whole key range and that all
      key blocks are half full (normally things are much better). It is also
      assumed that each time we read the next key from the index, the handler
      performs a random seek, thus the cost is proportional to the number of
      blocks read. This model does not take into account clustered indexes -
      engines that support that (e.g. InnoDB) may want to override this method.
    */
7812     cost= handler::keyread_time(index, ranges, rows);
7813     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(cost);
7814 }
7815 
7816 //
7817 // Calculate the time it takes to read a set of ranges through an index
7818 // This enables us to optimize reads for clustered indexes.
7819 // Implementation pulled from InnoDB
7820 // Parameters:
7821 //          index - index to use
7822 //          ranges - number of ranges
7823 //          rows - estimated number of rows in the range
7824 // Returns:
7825 //      estimated time measured in disk seeks
7826 //
7827 double ha_tokudb::read_time(
7828     uint    index,
7829     uint    ranges,
7830     ha_rows rows
7831     )
7832 {
7833     TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7834     double total_scan;
7835     double ret_val;
7836     bool is_primary = (index == primary_key);
7837     bool is_clustering;
7838 
7839     //
7840     // in case for hidden primary key, this is called
7841     //
7842     if (index >= table_share->keys) {
7843         ret_val = handler::read_time(index, ranges, rows);
7844         goto cleanup;
7845     }
7846 
7847     is_clustering = key_is_clustering(&table->key_info[index]);
7848 
7849 
7850     //
7851     // if it is not the primary key, and it is not a clustering key, then return handler::read_time
7852     //
7853     if (!(is_primary || is_clustering)) {
7854         ret_val = handler::read_time(index, ranges, rows);
7855         goto cleanup;
7856     }
7857 
7858     //
7859     // for primary key and for clustered keys, return a fraction of scan_time()
7860     //
7861     total_scan = scan_time();
7862 
7863     if (stats.records <= rows) {
7864         ret_val = is_clustering ? total_scan + 0.00001 : total_scan;
7865         goto cleanup;
7866     }
7867 
7868     //
7869     // one disk seek per range plus the proportional scan time of the rows
7870     //
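    // (e.g. one range covering about half the table's rows costs roughly
    // 1 + scan_time() / 2 "seeks")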
7871     ret_val = (ranges + (double) rows / (double) stats.records * total_scan);
7872     ret_val = is_clustering ? ret_val + 0.00001 : ret_val;
7873 
7874 cleanup:
7875     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7876 }
7877 
7878 double ha_tokudb::index_only_read_time(uint keynr, double records) {
7879     TOKUDB_HANDLER_DBUG_ENTER("%u %f", keynr, records);
7880     double ret_val = keyread_time(keynr, 1, (ha_rows)records);
7881     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7882 }
7883 
7884 //
// Estimates the number of index records in a range. In case of errors, returns
//   HA_TOKUDB_RANGE_COUNT instead of HA_POS_ERROR. This was the behavior
//   when we inherited the handlerton from MySQL.
// Parameters:
//              keynr - index to use
7890 //      [in]    start_key - low end of the range
7891 //      [in]    end_key - high end of the range
7892 // Returns:
7893 //      0 - There are no matching keys in the given range
7894 //      number > 0 - There are approximately number matching rows in the range
7895 //      HA_POS_ERROR - Something is wrong with the index tree
7896 //
7897 ha_rows ha_tokudb::records_in_range(uint keynr, const key_range* start_key,
7898                                     const key_range* end_key,
7899                                     page_range *pages) {
    TOKUDB_HANDLER_DBUG_ENTER("%u %p %p", keynr, start_key, end_key);
7901     DBT *pleft_key, *pright_key;
7902     DBT left_key, right_key;
7903     ha_rows ret_val = HA_TOKUDB_RANGE_COUNT;
7904     DB *kfile = share->key_file[keynr];
7905     uint64_t rows = 0;
7906     int error;
7907 
    // Estimate the range size with keys_range64. The keys passed in may be
    // prefixes of keys in the DB, so the "equal" counts cannot be trusted:
    // equal may be 0 while greater actually holds equal+greater. To
    // compensate, the bounds are packed with explicit infinity bytes and the
    // "middle" count between them is used as the estimate.
7913     if (!start_key && !end_key) {
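        // no bound on either side: estimate the row count of the whole table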
7914         error = estimate_num_rows(share->file, &rows, transaction);
7915         if (error) {
7916             ret_val = HA_TOKUDB_RANGE_COUNT;
7917             goto cleanup;
7918         }
7919         ret_val = (rows <= 1) ? 1 : rows;
7920         goto cleanup;
7921     }
7922     if (start_key) {
7923         uchar inf_byte = (start_key->flag == HA_READ_KEY_EXACT) ? COL_NEG_INF : COL_POS_INF;
7924         pack_key(&left_key, keynr, key_buff, start_key->key, start_key->length, inf_byte);
7925         pleft_key = &left_key;
7926     } else {
7927         pleft_key = NULL;
7928     }
7929     if (end_key) {
7930         uchar inf_byte = (end_key->flag == HA_READ_BEFORE_KEY) ? COL_NEG_INF : COL_POS_INF;
7931         pack_key(&right_key, keynr, key_buff2, end_key->key, end_key->length, inf_byte);
7932         pright_key = &right_key;
7933     } else {
7934         pright_key = NULL;
7935     }
    // keys_range64 cannot handle a degenerate range (left_key > right_key), so we filter that case here
7937     if (pleft_key && pright_key && tokudb_cmp_dbt_key(kfile, pleft_key, pright_key) > 0) {
7938         rows = 0;
7939     } else {
7940         uint64_t less, equal1, middle, equal2, greater;
7941         bool is_exact;
7942         error = kfile->keys_range64(kfile, transaction, pleft_key, pright_key,
7943                                     &less, &equal1, &middle, &equal2, &greater, &is_exact);
7944         if (error) {
7945             ret_val = HA_TOKUDB_RANGE_COUNT;
7946             goto cleanup;
7947         }
7948         rows = middle;
7949     }
7950 
7951     // MySQL thinks a return value of 0 means there are exactly 0 rows
7952     // Therefore, always return non-zero so this assumption is not made
7953     ret_val = (ha_rows) (rows <= 1 ? 1 : rows);
7954 
7955 cleanup:
7956     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7957         TOKUDB_DEBUG_RETURN,
7958         "return %" PRIu64 " %" PRIu64,
7959         (uint64_t)ret_val,
7960         rows);
7961     DBUG_RETURN(ret_val);
7962 }
7963 
7964 
7965 //
7966 // Initializes the auto-increment data in the local "share" object to the
7967 // greater of two values: what's stored in the metadata or the last inserted
7968 // auto-increment field (if auto-increment field is the first field of a key).
7969 //
7970 void ha_tokudb::init_auto_increment() {
7971     int error;
7972     DB_TXN* txn = NULL;
7973 
7974     error = txn_begin(db_env, 0, &txn, 0, ha_thd());
7975     if (error) {
7976         share->last_auto_increment = 0;
7977     } else {
7978         HA_METADATA_KEY key_val;
7979         DBT key;
7980         memset(&key, 0, sizeof(key));
7981         key.data = &key_val;
7982         key.size = sizeof(key_val);
7983         DBT value;
7984         memset(&value, 0, sizeof(value));
7985         value.flags = DB_DBT_USERMEM;
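        // DB_DBT_USERMEM: the get() calls below write their results directly
        // into the caller-supplied buffers described by value.data/value.ulen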
7986 
        // Retrieve the initial auto increment value, as specified by CREATE TABLE.
        // For example, after "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
        // the value 100 is stored here.
7990         key_val = hatoku_ai_create_value;
7991         value.ulen = sizeof(share->auto_inc_create_value);
7992         value.data = &share->auto_inc_create_value;
7993         error = share->status_block->get(share->status_block, txn, &key, &value, 0);
7994 
7995         if (error || value.size != sizeof(share->auto_inc_create_value)) {
7996             share->auto_inc_create_value = 0;
7997         }
7998 
        // Retrieve hatoku_max_ai, the maximum value used by the auto increment
        // column so far. That value could have been auto generated (e.g. insert (NULL))
        // or manually inserted by the user (e.g. insert (345)).
8002         key_val = hatoku_max_ai;
8003         value.ulen = sizeof(share->last_auto_increment);
8004         value.data = &share->last_auto_increment;
8005         error = share->status_block->get(share->status_block, txn, &key, &value, 0);
8006 
8007         if (error || value.size != sizeof(share->last_auto_increment)) {
8008             if (share->auto_inc_create_value)
8009                 share->last_auto_increment = share->auto_inc_create_value - 1;
8010             else
8011                 share->last_auto_increment = 0;
8012         }
8013 
8014         commit_txn(txn, 0);
8015     }
8016     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
8017         TOKUDB_DEBUG_AUTO_INCREMENT,
8018         "init auto increment:%lld",
8019         share->last_auto_increment);
8020 }
8021 
8022 void ha_tokudb::get_auto_increment(
8023     ulonglong offset,
8024     ulonglong increment,
8025     ulonglong nb_desired_values,
8026     ulonglong* first_value,
8027     ulonglong* nb_reserved_values) {
8028 
8029     TOKUDB_HANDLER_DBUG_ENTER("");
8030     ulonglong nr;
8031     bool over;
8032 
    if (table->s->next_number_key_offset) {
        handler::get_auto_increment(
            offset,
            increment,
            nb_desired_values,
            first_value,
            nb_reserved_values);
        DBUG_VOID_RETURN;
    }
8038 
8039     share->lock();
8040 
8041     if (share->auto_inc_create_value > share->last_auto_increment) {
8042         nr = share->auto_inc_create_value;
8043         over = false;
8044         share->last_auto_increment = share->auto_inc_create_value;
8045     } else {
8046         nr = share->last_auto_increment + increment;
8047         over = nr < share->last_auto_increment;
8048         if (over)
8049             nr = ULONGLONG_MAX;
8050     }
8051     if (!over) {
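        // reserve nb_desired_values values starting at nr; e.g. with
        // increment=5 and nb_desired_values=3, the values nr, nr+5 and nr+10
        // are handed out and last_auto_increment advances to nr+10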
8052         share->last_auto_increment = nr + (nb_desired_values - 1)*increment;
8053         if (delay_updating_ai_metadata) {
8054             ai_metadata_update_required = true;
8055         } else {
8056             update_max_auto_inc(
8057                 share->status_block,
8058                 share->last_auto_increment);
8059         }
8060     }
8061     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
8062         TOKUDB_DEBUG_AUTO_INCREMENT,
8063         "get_auto_increment(%lld,%lld,%lld): got:%lld:%lld",
8064         offset,
8065         increment,
8066         nb_desired_values,
8067         nr,
8068         nb_desired_values);
8069     *first_value = nr;
8070     *nb_reserved_values = nb_desired_values;
8071     share->unlock();
8072     TOKUDB_HANDLER_DBUG_VOID_RETURN;
8073 }
8074 
8075 bool ha_tokudb::is_optimize_blocking() {
8076     return false;
8077 }
8078 
8079 bool ha_tokudb::is_auto_inc_singleton(){
8080     return false;
8081 }
8082 
8083 
//
8088 // Adds indexes to the table. Takes the array of KEY passed in key_info, and creates
8089 // DB's that will go at the end of share->key_file. THE IMPLICIT ASSUMPTION HERE is
8090 // that the table will be modified and that these added keys will be appended to the end
8091 // of the array table->key_info
8092 // Parameters:
8093 //      [in]    table_arg - table that is being modified, seems to be identical to this->table
8094 //      [in]    key_info - array of KEY's to be added
8095 //              num_of_keys - number of keys to be added, number of elements in key_info
8096 //  Returns:
8097 //      0 on success, error otherwise
8098 //
8099 int ha_tokudb::tokudb_add_index(
8100     TABLE* table_arg,
8101     KEY* key_info,
8102     uint num_of_keys,
8103     DB_TXN* txn,
8104     bool* inc_num_DBs,
8105     bool* modified_DBs) {
8106 
8107     TOKUDB_HANDLER_DBUG_ENTER("");
8108     assert_always(txn);
8109 
8110     int error;
8111     uint curr_index = 0;
8112     DBC* tmp_cursor = NULL;
8113     int cursor_ret_val = 0;
8114     DBT curr_pk_key, curr_pk_val;
8115     THD* thd = ha_thd();
8116     DB_LOADER* loader = NULL;
8117     DB_INDEXER* indexer = NULL;
8118     bool loader_save_space = tokudb::sysvars::load_save_space(thd);
8119     bool use_hot_index = (lock.type == TL_WRITE_ALLOW_WRITE);
8120     uint32_t loader_flags = loader_save_space ? LOADER_COMPRESS_INTERMEDIATES : 0;
8121     uint32_t indexer_flags = 0;
8122     uint32_t mult_db_flags[MAX_KEY + 1] = {0};
8123     uint32_t mult_put_flags[MAX_KEY + 1];
8124     uint32_t mult_dbt_flags[MAX_KEY + 1];
8125     bool creating_hot_index = false;
8126     struct loader_context lc;
8127     memset(&lc, 0, sizeof lc);
8128     lc.thd = thd;
8129     lc.ha = this;
8130     loader_error = 0;
8131     bool rw_lock_taken = false;
8132     *inc_num_DBs = false;
8133     *modified_DBs = false;
8134     invalidate_bulk_fetch();
8135     unpack_entire_row = true; // for bulk fetching rows
8136     for (uint32_t i = 0; i < MAX_KEY+1; i++) {
8137         mult_put_flags[i] = 0;
8138         mult_dbt_flags[i] = DB_DBT_REALLOC;
8139     }
8140     //
8141     // number of DB files we have open currently, before add_index is executed
8142     //
8143     uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8144 
8145     //
8146     // get the row type to use for the indexes we're adding
8147     //
8148     toku_compression_method compression_method =
8149         get_compression_method(share->file);
8150 
8151     //
8152     // status message to be shown in "show process list"
8153     //
8154     const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
    // a buffer of MAX_ALIAS_NAME + 200 should be a good upper bound.
8156     char status_msg[MAX_ALIAS_NAME + 200];
8157     // variable that stores number of elements inserted thus far
8158     ulonglong num_processed = 0;
8159     thd_proc_info(thd, "Adding indexes");
8160 
8161     //
8162     // in unpack_row, MySQL passes a buffer that is this long,
8163     // so this length should be good enough for us as well
8164     //
8165     memset((void *) &curr_pk_key, 0, sizeof(curr_pk_key));
8166     memset((void *) &curr_pk_val, 0, sizeof(curr_pk_val));
8167 
    //
    // The files for secondary tables are derived from the names of keys.
    // If we try to add a key with the same name as an existing key, we can
    // crash. So here we check whether any of the keys being added has the
    // same name as an existing key, and if so, we fail gracefully.
    //
8174     for (uint i = 0; i < num_of_keys; i++) {
8175         for (uint j = 0; j < table_arg->s->keys; j++) {
8176             if (strcmp(key_info[i].name.str,
8177                        table_arg->key_info[j].name.str) == 0) {
8178                 error = HA_ERR_WRONG_COMMAND;
8179                 goto cleanup;
8180             }
8181         }
8182     }
8183 
8184     rwlock_t_lock_write(share->_num_DBs_lock);
8185     rw_lock_taken = true;
8186     //
8187     // open all the DB files and set the appropriate variables in share
8188     // they go to the end of share->key_file
8189     //
8190     creating_hot_index =
8191         use_hot_index && num_of_keys == 1 &&
8192         (key_info[0].flags & HA_NOSAME) == 0;
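    // a hot (online) build is only attempted for a single non-unique index
    // while the lock type still allows concurrent writes; everything else
    // takes the bulk loader path below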
8193     if (use_hot_index && (share->num_DBs > curr_num_DBs)) {
8194         //
8195         // already have hot index in progress, get out
8196         //
8197         error = HA_ERR_INTERNAL_ERROR;
8198         goto cleanup;
8199     }
8200     curr_index = curr_num_DBs;
8201     *modified_DBs = true;
8202     for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8203         if (key_is_clustering(&key_info[i])) {
8204             set_key_filter(
8205                 &share->kc_info.key_filters[curr_index],
8206                 &key_info[i],
8207                 table_arg,
8208                 false);
8209             if (!hidden_primary_key) {
8210                 set_key_filter(
8211                     &share->kc_info.key_filters[curr_index],
8212                     &table_arg->key_info[primary_key],
8213                     table_arg,
8214                     false);
8215             }
8216 
8217             error = initialize_col_pack_info(
8218                 &share->kc_info,
8219                 table_arg->s,
8220                 curr_index);
8221             if (error) {
8222                 goto cleanup;
8223             }
8224         }
8225 
8226 
8227         error = create_secondary_dictionary(
8228             share->full_table_name(),
8229             table_arg,
8230             &key_info[i],
8231             txn,
8232             &share->kc_info,
8233             curr_index,
8234             creating_hot_index,
8235             compression_method);
8236         if (error) {
8237             goto cleanup;
8238         }
8239 
8240         error = open_secondary_dictionary(
8241             &share->key_file[curr_index],
8242             &key_info[i],
8243             share->full_table_name(),
8244             false,
8245             txn);
8246         if (error) {
8247             goto cleanup;
8248         }
8249     }
8250 
8251     if (creating_hot_index) {
8252         share->num_DBs++;
8253         *inc_num_DBs = true;
8254         error = db_env->create_indexer(
8255             db_env,
8256             txn,
8257             &indexer,
8258             share->file,
8259             num_of_keys,
8260             &share->key_file[curr_num_DBs],
8261             mult_db_flags,
8262             indexer_flags);
8263         if (error) {
8264             goto cleanup;
8265         }
8266 
8267         error = indexer->set_poll_function(
8268             indexer, ha_tokudb::tokudb_add_index_poll, &lc);
8269         if (error) {
8270             goto cleanup;
8271         }
8272 
8273         error = indexer->set_error_callback(
8274             indexer, ha_tokudb::loader_add_index_err, &lc);
8275         if (error) {
8276             goto cleanup;
8277         }
8278 
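        // release the num_DBs lock for the duration of the (potentially long)
        // build; it is re-acquired around indexer->close() below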
8279         share->_num_DBs_lock.unlock();
8280         rw_lock_taken = false;
8281 
8282 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8283         // initialize a one phase progress report.
8284         // incremental reports are done in the indexer's callback function.
8285         thd_progress_init(thd, 1);
8286 #endif
8287 
8288         error = indexer->build(indexer);
8289 
8290         if (error) {
8291             goto cleanup;
8292         }
8293 
8294         rwlock_t_lock_write(share->_num_DBs_lock);
8295         error = indexer->close(indexer);
8296         share->_num_DBs_lock.unlock();
8297         if (error) {
8298             goto cleanup;
8299         }
8300         indexer = NULL;
8301     } else {
8302         DBUG_ASSERT(table->mdl_ticket->get_type() >= MDL_SHARED_NO_WRITE);
8303         share->_num_DBs_lock.unlock();
8304         rw_lock_taken = false;
8305         prelocked_right_range_size = 0;
8306         prelocked_left_range_size = 0;
        struct smart_dbt_bf_info bf_info;
        bf_info.ha = this;
        bf_info.direction = 1;
        bf_info.thd = ha_thd();
        // the val is needed if there is a clustering index and key_read is not 0
        bf_info.need_val = true;
        bf_info.key_to_compare = NULL;
8314 
8315         error = db_env->create_loader(
8316             db_env,
8317             txn,
8318             &loader,
8319             NULL, // no src_db needed
8320             num_of_keys,
8321             &share->key_file[curr_num_DBs],
8322             mult_put_flags,
8323             mult_dbt_flags,
8324             loader_flags);
8325         if (error) {
8326             goto cleanup;
8327         }
8328 
8329         error =
8330             loader->set_poll_function(loader, ha_tokudb::bulk_insert_poll, &lc);
8331         if (error) {
8332             goto cleanup;
8333         }
8334 
8335         error = loader->set_error_callback(
8336             loader, ha_tokudb::loader_add_index_err, &lc);
8337         if (error) {
8338             goto cleanup;
8339         }
8340         //
8341         // scan primary table, create each secondary key, add to each DB
8342         //
8343         error = share->file->cursor(
8344             share->file,
8345             txn,
8346             &tmp_cursor,
8347             DB_SERIALIZABLE);
8348         if (error) {
8349             tmp_cursor = NULL;             // Safety
8350             goto cleanup;
8351         }
8352 
8353         //
8354         // grab some locks to make this go faster
8355         // first a global read lock on the main DB, because
8356         // we intend to scan the entire thing
8357         //
8358         error = tmp_cursor->c_set_bounds(
8359             tmp_cursor,
8360             share->file->dbt_neg_infty(),
8361             share->file->dbt_pos_infty(),
8362             true,
8363             0);
8364         if (error) {
8365             goto cleanup;
8366         }
8367 
8368         // set the bulk fetch iteration to its max so that adding an
8369         // index fills the bulk fetch buffer every time. we do not
8370         // want it to grow exponentially fast.
8371         rows_fetched_using_bulk_fetch = 0;
8372         bulk_fetch_iteration = HA_TOKU_BULK_FETCH_ITERATION_MAX;
8373         cursor_ret_val = tmp_cursor->c_getf_next(
8374             tmp_cursor,
8375             DB_PRELOCKED,
8376             smart_dbt_bf_callback,
8377             &bf_info);
8378 
8379 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8380         // initialize a two phase progress report.
8381         // first phase: putting rows into the loader
8382         thd_progress_init(thd, 2);
8383 #endif
8384 
8385         while (cursor_ret_val != DB_NOTFOUND ||
8386                ((bytes_used_in_range_query_buff -
8387                  curr_range_query_buff_offset) > 0)) {
8388             if ((bytes_used_in_range_query_buff -
8389                  curr_range_query_buff_offset) == 0) {
8390                 invalidate_bulk_fetch(); // reset the buffers
8391                 cursor_ret_val = tmp_cursor->c_getf_next(
8392                     tmp_cursor,
8393                     DB_PRELOCKED,
8394                     smart_dbt_bf_callback,
8395                     &bf_info);
8396                 if (cursor_ret_val != DB_NOTFOUND && cursor_ret_val != 0) {
8397                     error = cursor_ret_val;
8398                     goto cleanup;
8399                 }
8400             }
            // do this check in case the c_getf_next did not put anything
            // into the buffer because there was no more data
8403             if ((bytes_used_in_range_query_buff -
8404                  curr_range_query_buff_offset) == 0) {
8405                 break;
8406             }
8407             // at this point, we know the range query buffer has at least one
8408             // key/val pair
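            // each buffered pair is laid out as
            // [4-byte key_size][key bytes][4-byte val_size][val bytes]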
8409             uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
8410 
8411             uint32_t key_size = *(uint32_t *)curr_pos;
8412             curr_pos += sizeof(key_size);
8413             uchar* curr_key_buff = curr_pos;
8414             curr_pos += key_size;
8415             curr_pk_key.data = curr_key_buff;
8416             curr_pk_key.size = key_size;
8417 
8418             uint32_t val_size = *(uint32_t *)curr_pos;
8419             curr_pos += sizeof(val_size);
8420             uchar* curr_val_buff = curr_pos;
8421             curr_pos += val_size;
8422             curr_pk_val.data = curr_val_buff;
8423             curr_pk_val.size = val_size;
8424 
8425             curr_range_query_buff_offset = curr_pos - range_query_buff;
8426 
8427             error = loader->put(loader, &curr_pk_key, &curr_pk_val);
8428             if (error) {
8429                 goto cleanup;
8430             }
8431 
8432             num_processed++;
8433 
8434             if ((num_processed % 1000) == 0) {
8435                 sprintf(
8436                     status_msg,
8437                     "Adding indexes: Fetched %llu of about %llu rows, loading "
8438                     "of data still remains.",
8439                     num_processed,
8440                     (long long unsigned)share->row_count());
8441                 thd_proc_info(thd, status_msg);
8442 
8443 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8444                 thd_progress_report(
8445                     thd,
8446                     num_processed,
8447                     (long long unsigned)share->row_count());
8448 #endif
8449 
8450                 if (thd_kill_level(thd)) {
8451                     error = ER_ABORTING_CONNECTION;
8452                     goto cleanup;
8453                 }
8454             }
8455         }
8456         error = tmp_cursor->c_close(tmp_cursor);
8457         assert_always(error==0);
8458         tmp_cursor = NULL;
8459 
8460 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8461         // next progress report phase: closing the loader.
8462         // incremental reports are done in the loader's callback function.
8463         thd_progress_next_stage(thd);
8464 #endif
8465 
8466         error = loader->close(loader);
8467         loader = NULL;
8468 
8469         if (error) goto cleanup;
8470     }
8471     curr_index = curr_num_DBs;
8472     for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8473         if (key_info[i].flags & HA_NOSAME) {
8474             bool is_unique;
8475             error = is_index_unique(
8476                 &is_unique,
8477                 txn,
8478                 share->key_file[curr_index],
8479                 &key_info[i],
8480                 creating_hot_index ? 0 : DB_PRELOCKED_WRITE);
8481             if (error)
8482                 goto cleanup;
8483             if (!is_unique) {
8484                 error = HA_ERR_FOUND_DUPP_KEY;
8485                 last_dup_key = i;
8486                 goto cleanup;
8487             }
8488         }
8489     }
8490 
8491     share->lock();
8492     //
8493     // We have an accurate row count, might as well update share->rows
8494     //
    if (!creating_hot_index) {
8496         share->set_row_count(num_processed, true);
8497     }
8498     //
8499     // now write stuff to status.tokudb
8500     //
8501     for (uint i = 0; i < num_of_keys; i++) {
8502         write_key_name_to_status(share->status_block, key_info[i].name.str, txn);
8503     }
8504     share->unlock();
8505 
8506     error = 0;
8507 cleanup:
8508 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8509     thd_progress_end(thd);
8510 #endif
8511     if (rw_lock_taken) {
8512         share->_num_DBs_lock.unlock();
8513         rw_lock_taken = false;
8514     }
8515     if (tmp_cursor) {
8516         int r = tmp_cursor->c_close(tmp_cursor);
8517         assert_always(r==0);
8518         tmp_cursor = NULL;
8519     }
8520     if (loader != NULL) {
8521         sprintf(status_msg, "aborting creation of indexes.");
8522         thd_proc_info(thd, status_msg);
8523         loader->abort(loader);
8524     }
8525     if (indexer != NULL) {
8526         sprintf(status_msg, "aborting creation of indexes.");
8527         thd_proc_info(thd, status_msg);
8528         rwlock_t_lock_write(share->_num_DBs_lock);
8529         indexer->abort(indexer);
8530         share->_num_DBs_lock.unlock();
8531     }
8532     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8533         error == DB_LOCK_NOTGRANTED) {
8534         sql_print_error(
8535             "Could not add indexes to table %s because another transaction has "
8536             "accessed the table. To add indexes, make sure no transactions "
8537             "touch the table.",
8538             share->full_table_name());
8539     }
8540     thd_proc_info(thd, orig_proc_info);
8541     TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
8542 }

int ha_tokudb::tokudb_add_index_poll(void* extra, float progress) {
8544     LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
8545     if (thd_killed(context->thd)) {
8546         snprintf(context->write_status_msg,
8547                  sizeof(context->write_status_msg),
8548                  "The process has been killed, aborting add index.");
8549         return ER_ABORTING_CONNECTION;
8550     }
8551     float percentage = progress * 100;
8552     snprintf(context->write_status_msg,
8553              sizeof(context->write_status_msg),
8554              "Adding of indexes to %s about %.1f%% done",
8555              context->ha->share->full_table_name(),
8556              percentage);
8557     thd_proc_info(context->thd, context->write_status_msg);
8558 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8559     thd_progress_report(context->thd, (unsigned long long)percentage, 100);
8560 #endif
8561     return 0;
8562 }
8563 
8564 //
8565 // Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
// Closes the added indexes in the error path of add_index and alter_table_phase2
8567 //
8568 void ha_tokudb::restore_add_index(
8569     TABLE* table_arg,
8570     uint num_of_keys,
8571     bool incremented_numDBs,
8572     bool modified_DBs) {
8573 
8574     uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8575     uint curr_index = 0;
8576 
    //
    // need to restore num_DBs, and we have to do it before we close the
    // dictionaries so that there is no window in which num_DBs disagrees with
    // the set of open dictionaries
    //
8581     if (incremented_numDBs) {
8582         rwlock_t_lock_write(share->_num_DBs_lock);
8583         share->num_DBs--;
8584     }
8585     if (modified_DBs) {
8586         curr_index = curr_num_DBs;
8587         for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8588             reset_key_and_col_info(&share->kc_info, curr_index);
8589         }
8590         curr_index = curr_num_DBs;
8591         for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8592             if (share->key_file[curr_index]) {
8593                 int r = share->key_file[curr_index]->close(
8594                     share->key_file[curr_index],
8595                     0);
8596                 assert_always(r==0);
8597                 share->key_file[curr_index] = NULL;
8598             }
8599         }
8600     }
8601     if (incremented_numDBs) {
8602         share->_num_DBs_lock.unlock();
8603     }
8604 }
8605 
8606 //
8607 // Internal function called by ha_tokudb::prepare_drop_index and ha_tokudb::alter_table_phase2
8608 // With a transaction, drops dictionaries associated with indexes in key_num
8609 //
8610 int ha_tokudb::drop_indexes(uint* key_num,
8611                             uint num_of_keys,
8612                             KEY* key_info,
8613                             DB_TXN* txn) {
8614     TOKUDB_HANDLER_DBUG_ENTER("");
8615     assert_always(txn);
8616 
8617     int error = 0;
8618     for (uint i = 0; i < num_of_keys; i++) {
8619         uint curr_index = key_num[i];
8620         error = share->key_file[curr_index]->pre_acquire_fileops_lock(
8621             share->key_file[curr_index],
8622             txn);
8623         if (error != 0) {
8624             goto cleanup;
8625         }
8626     }
8627     for (uint i = 0; i < num_of_keys; i++) {
8628         uint curr_index = key_num[i];
8629         int r = share->key_file[curr_index]->close(share->key_file[curr_index],0);
8630         assert_always(r==0);
8631         share->key_file[curr_index] = NULL;
8632 
8633         error = remove_key_name_from_status(
8634             share->status_block,
8635             key_info[curr_index].name.str,
8636             txn);
8637         if (error) {
8638             goto cleanup;
8639         }
8640 
8641         error = delete_or_rename_dictionary(
8642             share->full_table_name(),
8643             NULL,
8644             key_info[curr_index].name.str,
8645             true,
8646             txn,
8647             true);
8648         if (error) {
8649             goto cleanup;
8650         }
8651     }
8652 
8653 cleanup:
8654     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8655         error == DB_LOCK_NOTGRANTED) {
8656         sql_print_error(
8657             "Could not drop indexes from table %s because another transaction "
8658             "has accessed the table. To drop indexes, make sure no "
8659             "transactions touch the table.",
8660             share->full_table_name());
8661     }
8662     TOKUDB_HANDLER_DBUG_RETURN(error);
8663 }
8664 
8665 //
8666 // Internal function called by ha_tokudb::prepare_drop_index and
8667 // ha_tokudb::alter_table_phase2
8668 // Restores dropped indexes in case of error in error path of
8669 // prepare_drop_index and alter_table_phase2
8670 //
8671 void ha_tokudb::restore_drop_indexes(uint* key_num, uint num_of_keys) {
8672     //
8673     // reopen closed dictionaries
8674     //
8675     for (uint i = 0; i < num_of_keys; i++) {
8676         int r;
8677         uint curr_index = key_num[i];
8678         if (share->key_file[curr_index] == NULL) {
8679             r = open_secondary_dictionary(
8680                 &share->key_file[curr_index],
8681                 &table_share->key_info[curr_index],
8682                 share->full_table_name(),
8683                 false,
8684                 NULL);
8685             assert_always(!r);
8686         }
8687     }
8688 }
8689 
8690 int ha_tokudb::map_to_handler_error(int error) {
8691     switch (error) {
8692     case DB_LOCK_DEADLOCK:
8693         error = HA_ERR_LOCK_DEADLOCK;
8694         break;
8695     case DB_LOCK_NOTGRANTED:
8696         error = HA_ERR_LOCK_WAIT_TIMEOUT;
8697         break;
8698 #if defined(HA_ERR_DISK_FULL)
8699     case ENOSPC:
8700         error = HA_ERR_DISK_FULL;
8701         break;
8702 #endif
8703     case DB_KEYEXIST:
8704         error = HA_ERR_FOUND_DUPP_KEY;
8705         break;
8706 #if defined(HA_ALTER_ERROR)
8707     case HA_ALTER_ERROR:
8708         error = HA_ERR_UNSUPPORTED;
8709         break;
8710 #endif
8711     case TOKUDB_INTERRUPTED:
8712         error = ER_QUERY_INTERRUPTED;
8713         break;
8714     case TOKUDB_OUT_OF_LOCKS:
8715         error = HA_ERR_LOCK_TABLE_FULL;
8716         break;
8717     }
8718     return error;
8719 }
8720 
8721 void ha_tokudb::print_error(int error, myf errflag) {
8722     error = map_to_handler_error(error);
8723     handler::print_error(error, errflag);
8724 }
8725 
//
// Truncates the dictionary associated with the keynr index, using transaction
// txn. Does so by deleting and then recreating the dictionary within the
// transaction.
//
8731 int ha_tokudb::truncate_dictionary(uint keynr, DB_TXN* txn) {
8732     int error;
8733     bool is_pk = (keynr == primary_key);
8734 
8735     toku_compression_method compression_method =
8736         get_compression_method(share->key_file[keynr]);
8737     error = share->key_file[keynr]->close(share->key_file[keynr], 0);
8738     assert_always(error == 0);
8739 
    share->key_file[keynr] = NULL;

    if (is_pk) {
        share->file = NULL;
        error = delete_or_rename_dictionary(
            share->full_table_name(),
            NULL,
            "main",
            false, // is_key
            txn,
            true); // is a delete
    } else {
        error = delete_or_rename_dictionary(
            share->full_table_name(),
            NULL,
            table_share->key_info[keynr].name.str,
            true, // is_key
            txn,
            true); // is a delete
    }
    if (error) {
        goto cleanup;
    }
8768 
8769     if (is_pk) {
8770         error = create_main_dictionary(
8771             share->full_table_name(),
8772             table,
8773             txn,
8774             &share->kc_info,
8775             compression_method);
8776     } else {
8777         error = create_secondary_dictionary(
8778             share->full_table_name(),
8779             table,
8780             &table_share->key_info[keynr],
8781             txn,
8782             &share->kc_info,
8783             keynr,
8784             false,
8785             compression_method);
8786     }
8787     if (error) {
8788         goto cleanup;
8789     }
8790 
8791 cleanup:
8792     return error;
8793 }
8794 
8795 // for 5.5
8796 int ha_tokudb::truncate() {
8797     TOKUDB_HANDLER_DBUG_ENTER("");
8798     int error = delete_all_rows_internal();
8799     TOKUDB_HANDLER_DBUG_RETURN(error);
8800 }
8801 
8802 // delete all rows from a table
8803 //
8804 // effects: delete all of the rows in the main dictionary and all of the
8805 // indices.  this must be atomic, so we use the statement transaction
8806 // for all of the truncate operations.
8807 // locks:  if we have an exclusive table write lock, all of the concurrency
8808 // issues go away.
8809 // returns: 0 if success
8810 int ha_tokudb::delete_all_rows() {
8811     TOKUDB_HANDLER_DBUG_ENTER("");
8812     int error = 0;
8813     if (thd_sql_command(ha_thd()) != SQLCOM_TRUNCATE) {
8814         share->try_table_lock = true;
8815         error = HA_ERR_WRONG_COMMAND;
8816     }
8817     if (error == 0)
8818         error = delete_all_rows_internal();
8819     TOKUDB_HANDLER_DBUG_RETURN(error);
8820 }
8821 
8822 int ha_tokudb::delete_all_rows_internal() {
8823     TOKUDB_HANDLER_DBUG_ENTER("");
8824     int error = 0;
8825     uint curr_num_DBs = 0;
8826     DB_TXN* txn = NULL;
8827 
8828     // this should be enough to handle locking as the higher level MDL
8829     // on this table should prevent any new analyze tasks.
8830     share->cancel_background_jobs();
8831 
8832     error = txn_begin(db_env, 0, &txn, 0, ha_thd());
8833     if (error) {
8834         goto cleanup;
8835     }
8836 
8837     curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
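    // grab the fileops lock and then a full table lock on every dictionary up
    // front, so the drop-and-recreate done by truncate_dictionary below cannot
    // race with concurrent DDL or writers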
8838     for (uint i = 0; i < curr_num_DBs; i++) {
8839         error = share->key_file[i]->pre_acquire_fileops_lock(
8840             share->key_file[i],
8841             txn);
8842         if (error) {
8843             goto cleanup;
8844         }
8845         error = share->key_file[i]->pre_acquire_table_lock(
8846             share->key_file[i],
8847             txn);
8848         if (error) {
8849             goto cleanup;
8850         }
8851     }
8852     for (uint i = 0; i < curr_num_DBs; i++) {
8853         error = truncate_dictionary(i, txn);
8854         if (error) {
8855             goto cleanup;
8856         }
8857     }
8858 
8859     DEBUG_SYNC(ha_thd(), "tokudb_after_truncate_all_dictionarys");
8860 
8861     // zap the row count
8862     if (error == 0) {
8863         share->set_row_count(0, false);
8864         // update auto increment
8865         share->last_auto_increment = 0;
8866         // calling write_to_status directly because we need to use txn
8867         write_to_status(
8868             share->status_block,
8869             hatoku_max_ai,
8870             &share->last_auto_increment,
8871             sizeof(share->last_auto_increment),
8872             txn);
8873     }
8874 
8875     share->try_table_lock = true;
8876 cleanup:
8877     if (txn) {
8878         if (error) {
8879             abort_txn(txn);
8880         } else {
8881             commit_txn(txn,0);
8882         }
8883     }
8884 
8885     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(
8886         TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8887         error == DB_LOCK_NOTGRANTED) {
8888         sql_print_error(
8889             "Could not truncate table %s because another transaction has "
8890             "accessed the table. To truncate the table, make sure no "
8891             "transactions touch the table.",
8892             share->full_table_name());
8893     }
8894     //
8895     // regardless of errors, need to reopen the DB's
8896     //
8897     for (uint i = 0; i < curr_num_DBs; i++) {
8898         int r = 0;
8899         if (share->key_file[i] == NULL) {
8900             if (i != primary_key) {
8901                 r = open_secondary_dictionary(
8902                         &share->key_file[i],
8903                         &table_share->key_info[i],
8904                         share->full_table_name(),
8905                         false,
8906                         NULL);
8907                 assert_always(!r);
8908             } else {
                r = open_main_dictionary(
                        share->full_table_name(),
                        false,
                        NULL);
8913                 assert_always(!r);
8914             }
8915         }
8916     }
8917     TOKUDB_HANDLER_DBUG_RETURN(error);
8918 }
8919 
8920 void ha_tokudb::set_loader_error(int err) {
8921     loader_error = err;
8922 }
8923 
8924 void ha_tokudb::set_dup_value_for_pk(DBT* key) {
8925     assert_always(!hidden_primary_key);
8926     unpack_key(table->record[0],key,primary_key);
8927     last_dup_key = primary_key;
8928 }
8929 
8930 void ha_tokudb::close_dsmrr() {
8931 #ifdef MARIADB_BASE_VERSION
8932     ds_mrr.dsmrr_close();
8933 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
8934     ds_mrr.dsmrr_close();
8935 #endif
8936 }
8937 
8938 void ha_tokudb::reset_dsmrr() {
8939 #ifdef MARIADB_BASE_VERSION
8940     ds_mrr.dsmrr_close();
8941 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
8942     ds_mrr.reset();
8943 #endif
8944 }
8945 
8946 // we cache the information so we can do filtering ourselves,
8947 // but as far as MySQL knows, we are not doing any filtering,
8948 // so if we happen to miss filtering a row that does not match
8949 // idx_cond_arg, MySQL will catch it.
// This allows us to handle only index_next and index_prev, without needing
// to worry about the other index_XXX functions.
8952 Item* ha_tokudb::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) {
8953     toku_pushed_idx_cond_keyno = keyno_arg;
8954     toku_pushed_idx_cond = idx_cond_arg;
8955     return idx_cond_arg;
8956 }
8957 
8958 void ha_tokudb::cancel_pushed_idx_cond() {
8959     invalidate_icp();
8960     handler::cancel_pushed_idx_cond();
8961 }
8962 
8963 void ha_tokudb::cleanup_txn(DB_TXN *txn) {
8964     if (transaction == txn && cursor) {
8965         int r = cursor->c_close(cursor);
8966         assert_always(r == 0);
8967         cursor = NULL;
8968     }
8969 }
8970 
8971 void ha_tokudb::add_to_trx_handler_list() {
8972     tokudb_trx_data* trx =
8973         (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
8974     trx->handlers = list_add(trx->handlers, &trx_handler_list);
8975 }
8976 
8977 void ha_tokudb::remove_from_trx_handler_list() {
8978     tokudb_trx_data* trx =
8979         (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
8980     trx->handlers = list_delete(trx->handlers, &trx_handler_list);
8981 }
8982 
8983 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
8984 void ha_tokudb::rpl_before_write_rows() {
8985     in_rpl_write_rows = true;
8986 }
8987 
8988 void ha_tokudb::rpl_after_write_rows() {
8989     in_rpl_write_rows = false;
8990 }
8991 
8992 void ha_tokudb::rpl_before_delete_rows() {
8993     in_rpl_delete_rows = true;
8994 }
8995 
8996 void ha_tokudb::rpl_after_delete_rows() {
8997     in_rpl_delete_rows = false;
8998 }
8999 
9000 void ha_tokudb::rpl_before_update_rows() {
9001     in_rpl_update_rows = true;
9002 }
9003 
9004 void ha_tokudb::rpl_after_update_rows() {
9005     in_rpl_update_rows = false;
9006 }
9007 
9008 bool ha_tokudb::rpl_lookup_rows() {
9009     if (!in_rpl_delete_rows && !in_rpl_update_rows)
9010         return true;
9011     else
9012         return tokudb::sysvars::rpl_lookup_rows(ha_thd());
9013 }
9014 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
9015 
9016 // table admin
9017 #include "ha_tokudb_admin.cc"
9018 
9019 // update functions
9020 #include "tokudb_update_fun.cc"
9021 
9022 // fast updates
9023 #include "ha_tokudb_update.cc"
9024 
9025 // alter table code for various mysql distros
9026 #include "ha_tokudb_alter_55.cc"
9027 #include "ha_tokudb_alter_56.cc"
9028 
9029 // mrr
9030 #ifdef MARIADB_BASE_VERSION
9031 #include  "ha_tokudb_mrr_maria.cc"
9032 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
9033 #include  "ha_tokudb_mrr_mysql.cc"
9034 #endif
9035 
9036 // key comparisons
9037 #include "hatoku_cmp.cc"
9038 
9039 // handlerton
9040 #include "hatoku_hton.cc"
9041 
9042 // generate template functions
9043 namespace tokudb {
9044     template size_t vlq_encode_ui(uint32_t n, void *p, size_t s);
9045     template size_t vlq_decode_ui(uint32_t *np, void *p, size_t s);
9046     template size_t vlq_encode_ui(uint64_t n, void *p, size_t s);
9047     template size_t vlq_decode_ui(uint64_t *np, void *p, size_t s);
}  // namespace tokudb
9049