/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of TokuDB


Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    TokuDB is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    TokuDB is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with TokuDB.  If not, see <http://www.gnu.org/licenses/>.

======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#include "hatoku_hton.h"
#include "hatoku_cmp.h"
#include "tokudb_buffer.h"
#include "tokudb_status.h"
#include "tokudb_card.h"
#include "ha_tokudb.h"
#include "sql_db.h"

pfs_key_t ha_tokudb_mutex_key;
pfs_key_t num_DBs_lock_key;

std::unordered_map<std::string, TOKUDB_SHARE*> TOKUDB_SHARE::_open_tables;
tokudb::thread::mutex_t TOKUDB_SHARE::_open_tables_mutex;

static const char* ha_tokudb_exts[] = {
    ha_tokudb_ext,
    NullS
};

//
// This offset is calculated starting from AFTER the NULL bytes
//
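// Sketch of the packed row layout these helpers assume (inferred from
// their use): [null bytes][fixed fields][var-field offsets][var data],
// so a fixed field's position is its cumulative length past the null
// bytes, skipping any field already stored in the key.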
static inline uint32_t get_fixed_field_size(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {

    uint offset = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (is_fixed_field(kc_info, i) &&
            !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
            offset += kc_info->field_lengths[i];
        }
    }
    return offset;
}


static inline uint32_t get_len_of_offsets(
    KEY_AND_COL_INFO* kc_info,
    TABLE_SHARE* table_share,
    uint keynr) {

    uint len = 0;
    for (uint i = 0; i < table_share->fields; i++) {
        if (is_variable_field(kc_info, i) &&
            !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
            len += kc_info->num_offset_bytes;
        }
    }
    return len;
}


static int allocate_key_and_col_info(
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info) {

    int error;
    //
    // initialize all of the bitmaps
    //
    for (uint i = 0; i < MAX_KEY + 1; i++) {
        error =
            bitmap_init(
                &kc_info->key_filters[i],
                NULL,
                table_share->fields,
                false);
        if (error) {
            goto exit;
        }
    }

    //
    // create the field lengths
    //
    kc_info->multi_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME+MY_ZEROFILL),
        &kc_info->field_types, (uint)(table_share->fields * sizeof (uint8_t)),
        &kc_info->field_lengths, (uint)(table_share->fields * sizeof (uint16_t)),
        &kc_info->length_bytes, (uint)(table_share->fields * sizeof (uint8_t)),
        &kc_info->blob_fields, (uint)(table_share->fields * sizeof (uint32_t)),
        NullS);
    if (kc_info->multi_ptr == NULL) {
        error = ENOMEM;
        goto exit;
    }
exit:
    if (error) {
        for (uint i = 0; i < MAX_KEY + 1; i++) {
            bitmap_free(&kc_info->key_filters[i]);
        }
        tokudb::memory::free(kc_info->multi_ptr);
    }
    return error;
}
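
// Note: assuming tokudb::memory::multi_malloc follows the my_multi_malloc
// convention, a single contiguous block is carved up among field_types,
// field_lengths, length_bytes, and blob_fields, so freeing multi_ptr
// releases all four arrays at once.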

static void free_key_and_col_info (KEY_AND_COL_INFO* kc_info) {
    for (uint i = 0; i < MAX_KEY+1; i++) {
        bitmap_free(&kc_info->key_filters[i]);
    }

    for (uint i = 0; i < MAX_KEY+1; i++) {
        tokudb::memory::free(kc_info->cp_info[i]);
        kc_info->cp_info[i] = NULL; // 3144
    }

    tokudb::memory::free(kc_info->multi_ptr);
    kc_info->field_types = NULL;
    kc_info->field_lengths = NULL;
    kc_info->length_bytes = NULL;
    kc_info->blob_fields = NULL;
}


void TOKUDB_SHARE::static_init() {
    assert_always(_open_tables.size() == 0);
}
void TOKUDB_SHARE::static_destroy() {
    for (auto it = _open_tables.cbegin(); it != _open_tables.cend(); it++) {
        TOKUDB_TRACE("_open_tables %s %p", it->first.c_str(), it->second);
        TOKUDB_SHARE* share = it->second;
        share->destroy();
        delete share;
    }
    _open_tables.clear();
    assert_always(_open_tables.size() == 0);
}
const char* TOKUDB_SHARE::get_state_string(share_state_t state) {
    static const char* state_string[] = {
        "CLOSED",
        "OPENED",
        "ERROR"
    };
    assert_always(state == CLOSED || state == OPENED || state == ERROR);
    return state_string[state];
}
void* TOKUDB_SHARE::operator new(size_t sz) {
    return tokudb::memory::malloc(sz, MYF(MY_WME|MY_ZEROFILL|MY_FAE));
}
void TOKUDB_SHARE::operator delete(void* p) { tokudb::memory::free(p); }
TOKUDB_SHARE::TOKUDB_SHARE()
    : _num_DBs_lock(num_DBs_lock_key), _mutex(ha_tokudb_mutex_key) {}
void TOKUDB_SHARE::init(const char* table_name) {
    _use_count = 0;
    thr_lock_init(&_thr_lock);
    _state = CLOSED;
    _row_delta_activity = 0;
    _allow_auto_analysis = true;

    _full_table_name.append(table_name);

    String tmp_dictionary_name;
    tokudb_split_dname(
        table_name,
        _database_name,
        _table_name,
        tmp_dictionary_name);

    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
void TOKUDB_SHARE::destroy() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    assert_always(_use_count == 0);
    assert_always(
        _state == TOKUDB_SHARE::CLOSED || _state == TOKUDB_SHARE::ERROR);
    thr_lock_delete(&_thr_lock);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
TOKUDB_SHARE* TOKUDB_SHARE::get_share(const char* table_name,
                                      THR_LOCK_DATA* data,
                                      bool create_new) {
    std::string find_table_name(table_name);
    mutex_t_lock(_open_tables_mutex);
    auto it = _open_tables.find(find_table_name);
    TOKUDB_SHARE *share = nullptr;
    if (it != _open_tables.end()) {
        share = it->second;
        assert_always(strcmp(table_name, share->full_table_name()) == 0);
    }
    TOKUDB_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_SHARE,
        "existing share[%s] %s:share[%p]",
        table_name,
        share == NULL ? "not found" : "found",
        share);

    if (!share) {
        if (create_new == false)
            goto exit;
        // create share and fill it with all zeroes
        // hence, all pointers are initialized to NULL
        share = new TOKUDB_SHARE;
        assert_always(share);

        share->init(table_name);

        _open_tables.insert({find_table_name, share});
    }

    share->addref();

    if (data)
        thr_lock_data_init(&(share->_thr_lock), data, NULL);

exit:
    mutex_t_unlock(_open_tables_mutex);
    return share;
}
void TOKUDB_SHARE::drop_share(TOKUDB_SHARE* share) {
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_SHARE,
                           "share[%p]:file[%s]:state[%s]:use_count[%d]",
                           share,
                           share->_full_table_name.ptr(),
                           get_state_string(share->_state),
                           share->_use_count);

    mutex_t_lock(_open_tables_mutex);
    size_t n = _open_tables.erase(std::string(share->full_table_name()));
    assert_always(n == 1);
    share->destroy();
    delete share;
    mutex_t_unlock(_open_tables_mutex);
}
TOKUDB_SHARE::share_state_t TOKUDB_SHARE::addref() {
    TOKUDB_SHARE_TRACE_FOR_FLAGS((TOKUDB_DEBUG_ENTER & TOKUDB_DEBUG_SHARE),
                                 "file[%s]:state[%s]:use_count[%d]",
                                 _full_table_name.ptr(),
                                 get_state_string(_state),
                                 _use_count);

    lock();
    _use_count++;

    return _state;
}
int TOKUDB_SHARE::release() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    int error, result = 0;

    mutex_t_lock(_mutex);
    assert_always(_use_count != 0);
    _use_count--;
    if (_use_count == 0 && _state == TOKUDB_SHARE::OPENED) {
        // number of open DB's may not be equal to number of keys we have
        // because add_index may have added some. So, we loop through entire
        // array and close any non-NULL value.  It is imperative that we reset
        // a DB to NULL once we are done with it.
        for (uint i = 0; i < sizeof(key_file)/sizeof(key_file[0]); i++) {
            if (key_file[i]) {
                TOKUDB_TRACE_FOR_FLAGS(
                    TOKUDB_DEBUG_OPEN,
                    "dbclose:%p",
                    key_file[i]);
                error = key_file[i]->close(key_file[i], 0);
                assert_always(error == 0);
                if (error) {
                    result = error;
                }
                if (key_file[i] == file)
                    file = NULL;
                key_file[i] = NULL;
            }
        }

        error = tokudb::metadata::close(&status_block);
        assert_always(error == 0);

        free_key_and_col_info(&kc_info);

        if (_rec_per_key) {
            tokudb::memory::free(_rec_per_key);
            _rec_per_key = NULL;
            _rec_per_keys = 0;
        }

        for (uint i = 0; i < _keys; i++) {
            tokudb::memory::free(_key_descriptors[i]._name);
        }
        tokudb::memory::free(_key_descriptors);
        _keys = _max_key_parts = 0; _key_descriptors = NULL;

        _state = TOKUDB_SHARE::CLOSED;
    }
    mutex_t_unlock(_mutex);

    TOKUDB_SHARE_DBUG_RETURN(result);
}
void TOKUDB_SHARE::update_row_count(
    THD* thd,
    uint64_t added,
    uint64_t deleted,
    uint64_t updated) {

    uint64_t delta = added + deleted + updated;
    lock();
    if (deleted > added && _rows < (deleted - added)) {
        _rows = 0;
    } else {
        _rows += added - deleted;
    }
    _row_delta_activity += delta;
    if (_row_delta_activity == (uint64_t)~0)
        _row_delta_activity = 1;

    ulonglong auto_threshold = tokudb::sysvars::auto_analyze(thd);
    if (delta && auto_threshold > 0 && _allow_auto_analysis) {
        ulonglong pct_of_rows_changed_to_trigger;
        pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100);
        if (_row_delta_activity >= pct_of_rows_changed_to_trigger) {
            char msg[200];
            snprintf(msg,
                     sizeof(msg),
                     "TokuDB: Auto %s analysis for %s, delta_activity %llu is "
                     "greater than %llu percent of %llu rows.",
                     tokudb::sysvars::analyze_in_background(thd) > 0
                         ? "scheduling background"
                         : "running foreground",
                     full_table_name(),
                     _row_delta_activity,
                     auto_threshold,
                     (ulonglong)(_rows));

            // analyze_standard will unlock _mutex regardless of success/failure
            int ret = analyze_standard(thd, NULL);
            if (ret == 0) {
                sql_print_information("%s - succeeded.", msg);
            } else {
                sql_print_information(
                    "%s - failed, likely a job already running.",
                    msg);
            }
        }
    }
    unlock();
}
void TOKUDB_SHARE::set_cardinality_counts_in_table(TABLE* table) {
    lock();
    uint32_t next_key_part = 0;
    for (uint32_t i = 0; i < table->s->keys; i++) {
        KEY* key = &table->key_info[i];
        bool is_unique_key =
            (i == table->s->primary_key) || (key->flags & HA_NOSAME);

        for (uint32_t j = 0; j < key->actual_key_parts; j++) {
            if (j >= key->user_defined_key_parts) {
                // MySQL 'hidden' keys, really needs deeper investigation
                // into MySQL hidden keys vs TokuDB hidden keys
                key->rec_per_key[j] = 1;
                continue;
            }

            assert_always(next_key_part < _rec_per_keys);
            ulong val = _rec_per_key[next_key_part++];
            val = (val * tokudb::sysvars::cardinality_scale_percent) / 100;
            if (val == 0 || _rows == 0 ||
                (is_unique_key && j == key->actual_key_parts - 1)) {
                val = 1;
            }
            key->rec_per_key[j] = val;
        }
    }
    unlock();
}

#define HANDLE_INVALID_CURSOR() \
    if (cursor == NULL) { \
        error = last_cursor_error; \
        goto cleanup; \
    }
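
// Usage sketch (assumed caller context, not a fixed API): the macro
// expects `cursor`, `last_cursor_error`, an `int error`, and a
// `cleanup:` label in the enclosing scope, e.g.
//     HANDLE_INVALID_CURSOR();
//     ... use cursor ...
// cleanup:
//     return error;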

const char *ha_tokudb::table_type() const {
    return tokudb_hton_name;
}

const char *ha_tokudb::index_type(TOKUDB_UNUSED(uint inx)) {
    return "BTREE";
}

/*
 *  returns NULL terminated file extension string
 */
const char **ha_tokudb::bas_ext() const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBUG_RETURN(ha_tokudb_exts);
}

static inline bool is_insert_ignore (THD* thd) {
    //
    // from http://lists.mysql.com/internals/37735
    //
    return thd->lex->ignore && thd->lex->duplicates == DUP_ERROR;
}

static inline bool is_replace_into(THD* thd) {
    return thd->lex->duplicates == DUP_REPLACE;
}

static inline bool do_ignore_flag_optimization(
    THD* thd,
    TABLE* table,
    bool opt_eligible) {

    bool do_opt = false;
    if (opt_eligible &&
        (is_replace_into(thd) || is_insert_ignore(thd)) &&
        tokudb::sysvars::pk_insert_mode(thd) == 1 &&
        !table->triggers &&
        !(mysql_bin_log.is_open() &&
         thd->variables.binlog_format != BINLOG_FORMAT_STMT)) {
        do_opt = true;
    }
    return do_opt;
}

#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
static inline uint get_ext_key_parts(const KEY *key) {
#if (50609 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699) || \
    (50700 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50799)
    return key->actual_key_parts;
#elif defined(MARIADB_BASE_VERSION)
    return key->ext_key_parts;
#else
#error
#endif
}
#endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS

ulonglong ha_tokudb::table_flags() const {
    return int_table_flags | HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
}

//
// Returns a bit mask of capabilities of the key or its part specified by
// the arguments. The capabilities are defined in sql/handler.h.
//
ulong ha_tokudb::index_flags(uint idx,
                             TOKUDB_UNUSED(uint part),
                             TOKUDB_UNUSED(bool all_parts)) const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    assert_always(table_share);
    ulong flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
        HA_KEYREAD_ONLY | HA_READ_RANGE | HA_DO_INDEX_COND_PUSHDOWN);
    if (key_is_clustering(&table_share->key_info[idx])) {
        flags |= HA_CLUSTERED_INDEX;
    }
    DBUG_RETURN(flags);
}


//
// struct that will be used as a context for smart DBT callbacks
// contains parameters needed to complete the smart DBT cursor call
//
typedef struct smart_dbt_info {
    ha_tokudb* ha; //instance of ha_tokudb needed for reading the row
    uchar* buf; // output buffer where row will be written
    uint keynr; // index into share->key_file that represents DB we are currently operating on
} *SMART_DBT_INFO;

typedef struct smart_dbt_bf_info {
    ha_tokudb* ha;
    bool need_val;
    int direction;
    THD* thd;
    uchar* buf;
    DBT* key_to_compare;
} *SMART_DBT_BF_INFO;

typedef struct index_read_info {
    struct smart_dbt_info smart_dbt_info;
    int cmp;
    DBT* orig_key;
} *INDEX_READ_INFO;

//
// smart DBT callback function for optimize
// in optimize, we want to flatten DB by doing
// a full table scan. Therefore, we don't
// want to actually do anything with the data, hence
// callback does nothing
//
static int smart_dbt_do_nothing(TOKUDB_UNUSED(DBT const* key),
                                TOKUDB_UNUSED(DBT const* row),
                                TOKUDB_UNUSED(void* context)) {
    return 0;
}

static int
smart_dbt_callback_rowread_ptquery (DBT const *key, DBT  const *row, void *context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    return info->ha->read_row_callback(info->buf,info->keynr,row,key);
}

//
// Smart DBT callback function in case where we have a covering index
//
static int smart_dbt_callback_keyread(DBT const* key,
                                      DBT TOKUDB_UNUSED(const* row),
                                      void* context) {
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    info->ha->read_key_only(info->buf,info->keynr,key);
    return 0;
}

//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_rowread(DBT const *key, DBT  const *row, void *context) {
    int error = 0;
    SMART_DBT_INFO info = (SMART_DBT_INFO)context;
    info->ha->extract_hidden_primary_key(info->keynr, key);
    error = info->ha->read_primary_key(info->buf,info->keynr,row,key);
    return error;
}

//
// Smart DBT callback function in case where we have a covering index
//
static int smart_dbt_callback_ir_keyread(DBT const* key,
                                         TOKUDB_UNUSED(DBT const* row),
                                         void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_keyread(key, row, &ir_info->smart_dbt_info);
}

static int smart_dbt_callback_lookup(DBT const* key,
                                     TOKUDB_UNUSED(DBT const* row),
                                     void* context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(
        ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    return 0;
}


//
// Smart DBT callback function in case where we do NOT have a covering index
//
static int
smart_dbt_callback_ir_rowread(DBT const *key, DBT  const *row, void *context) {
    INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
    ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
    if (ir_info->cmp) {
        return 0;
    }
    return smart_dbt_callback_rowread(key, row, &ir_info->smart_dbt_info);
}

//
// macro for Smart DBT callback function,
// so we do not need to put this long line of code in multiple places
//
#define SMART_DBT_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_keyread : smart_dbt_callback_rowread )
#define SMART_DBT_IR_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_ir_keyread : smart_dbt_callback_ir_rowread )

//
// macro that modifies read flag for cursor operations depending on whether
// we have preacquired lock or not
//
#define SET_PRELOCK_FLAG(flg) ((flg) | (range_lock_grabbed ? (use_write_locks ? DB_PRELOCKED_WRITE : DB_PRELOCKED) : 0))
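
// For example, with range_lock_grabbed == true and use_write_locks ==
// false, SET_PRELOCK_FLAG(DB_NEXT) evaluates to (DB_NEXT | DB_PRELOCKED),
// telling the cursor operation that the range lock is already held.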

//
// This method retrieves the value of the auto increment column of a record in MySQL format
// This was basically taken from MyISAM
// Parameters:
//              type - the type of the auto increment column (e.g. int, float, double...)
//              offset - offset into the record where the auto increment column is stored
//      [in]    record - MySQL row whose auto increment value we want to extract
// Returns:
//      The value of the auto increment column in record
//
static ulonglong retrieve_auto_increment(uint16 type, uint32 offset,const uchar *record)
{
    const uchar *key;     /* Key */
    ulonglong   unsigned_autoinc = 0;  /* Unsigned auto-increment */
    longlong      signed_autoinc = 0;  /* Signed auto-increment */
    enum { unsigned_type, signed_type } autoinc_type;
    float float_tmp;   /* Temporary variable */
    double double_tmp; /* Temporary variable */

    key = ((uchar *) record) + offset;

    /* Set default autoincrement type */
    autoinc_type = unsigned_type;

    switch (type) {
    case HA_KEYTYPE_INT8:
        signed_autoinc   = (longlong) *(char*)key;
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_BINARY:
        unsigned_autoinc = (ulonglong) *(uchar*) key;
        break;

    case HA_KEYTYPE_SHORT_INT:
        signed_autoinc   = (longlong) sint2korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_USHORT_INT:
        unsigned_autoinc = (ulonglong) uint2korr(key);
        break;

    case HA_KEYTYPE_LONG_INT:
        signed_autoinc   = (longlong) sint4korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_ULONG_INT:
        unsigned_autoinc = (ulonglong) uint4korr(key);
        break;

    case HA_KEYTYPE_INT24:
        signed_autoinc   = (longlong) sint3korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_UINT24:
        unsigned_autoinc = (ulonglong) tokudb_uint3korr(key);
        break;

    case HA_KEYTYPE_LONGLONG:
        signed_autoinc   = sint8korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_ULONGLONG:
        unsigned_autoinc = uint8korr(key);
        break;

    /* The remaining two cases should not be used but are included for
       compatibility */
    case HA_KEYTYPE_FLOAT:
        float4get(float_tmp, key);  /* Note: float4get is a macro */
        signed_autoinc   = (longlong) float_tmp;
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_DOUBLE:
        float8get(double_tmp, key); /* Note: float8get is a macro */
        signed_autoinc   = (longlong) double_tmp;
        autoinc_type     = signed_type;
        break;

    default:
        assert_unreachable();
    }

    if (signed_autoinc < 0) {
        signed_autoinc = 0;
    }

    return autoinc_type == unsigned_type ?
           unsigned_autoinc : (ulonglong) signed_autoinc;
}
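
// Usage sketch (hypothetical values): for a column declared
// `a INT AUTO_INCREMENT` currently holding 42, a caller passes
// type == HA_KEYTYPE_LONG_INT plus the field's offset in record and
// gets back (ulonglong)42; negative stored values clamp to 0 above.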

static inline ulong field_offset(Field* field, TABLE* table) {
    return((ulong) (field->ptr - table->record[0]));
}

static inline HA_TOKU_ISO_LEVEL tx_to_toku_iso(ulong tx_isolation) {
    if (tx_isolation == ISO_READ_UNCOMMITTED) {
        return hatoku_iso_read_uncommitted;
    }
    else if (tx_isolation == ISO_READ_COMMITTED) {
        return hatoku_iso_read_committed;
    }
    else if (tx_isolation == ISO_REPEATABLE_READ) {
        return hatoku_iso_repeatable_read;
    }
    else {
        return hatoku_iso_serializable;
    }
}

static inline uint32_t toku_iso_to_txn_flag (HA_TOKU_ISO_LEVEL lvl) {
    if (lvl == hatoku_iso_read_uncommitted) {
        return DB_READ_UNCOMMITTED;
    }
    else if (lvl == hatoku_iso_read_committed) {
        return DB_READ_COMMITTED;
    }
    else if (lvl == hatoku_iso_repeatable_read) {
        return DB_TXN_SNAPSHOT;
    }
    else {
        return 0;
    }
}

static int filter_key_part_compare (const void* left, const void* right) {
    FILTER_KEY_PART_INFO* left_part= (FILTER_KEY_PART_INFO *)left;
    FILTER_KEY_PART_INFO* right_part = (FILTER_KEY_PART_INFO *)right;
    return left_part->offset - right_part->offset;
}

//
// Be very careful with parameters passed to this function. Who knows
// if key, table have proper info set. I had to verify by checking
// in the debugger.
//
void set_key_filter(
    MY_BITMAP* key_filter,
    KEY* key,
    TABLE* table,
    bool get_offset_from_keypart) {

    FILTER_KEY_PART_INFO parts[MAX_REF_PARTS];
    uint curr_skip_index = 0;

    for (uint i = 0; i < key->user_defined_key_parts; i++) {
        //
        // horrendous hack due to bugs in mysql, basically
        // we cannot always reliably get the offset from the same source
        //
        parts[i].offset =
            get_offset_from_keypart ?
                key->key_part[i].offset :
                field_offset(key->key_part[i].field, table);
        parts[i].part_index = i;
    }
    qsort(
        parts, // start of array
        key->user_defined_key_parts, //num elements
        sizeof(*parts), //size of each element
        filter_key_part_compare);

    for (uint i = 0; i < table->s->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (curr_skip_index < key->user_defined_key_parts) {
            uint curr_skip_offset = 0;
            curr_skip_offset = parts[curr_skip_index].offset;
            if (curr_skip_offset == curr_field_offset) {
                //
                // we have hit a field that is a portion of the primary key
                //
                uint curr_key_index = parts[curr_skip_index].part_index;
                curr_skip_index++;
                //
                // only choose to continue over the key if the key's length matches the field's length
                // otherwise, we may have a situation where the column is a varchar(10), the
                // key is only the first 3 characters, and we end up losing the last 7 bytes of the
                // column
                //
                TOKU_TYPE toku_type = mysql_to_toku_type(field);
                switch (toku_type) {
                case toku_type_blob:
                    break;
                case toku_type_varbinary:
                case toku_type_varstring:
                case toku_type_fixbinary:
                case toku_type_fixstring:
                    if (key->key_part[curr_key_index].length == field->field_length) {
                        bitmap_set_bit(key_filter,i);
                    }
                    break;
                default:
                    bitmap_set_bit(key_filter,i);
                    break;
                }
            }
        }
    }
}
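
// Example of the length rule above: for `c VARCHAR(10)` with KEY (c(3)),
// the key stores only a 3-character prefix, so the bit for c stays clear
// and the full column value is kept in the row; with KEY (c) the lengths
// match and c can be filtered from the row.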

static inline uchar* pack_fixed_field(
    uchar* to_tokudb,
    const uchar* from_mysql,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_tokudb, from_mysql, 1);
        break;
    case (2):
        memcpy(to_tokudb, from_mysql, 2);
        break;
    case (3):
        memcpy(to_tokudb, from_mysql, 3);
        break;
    case (4):
        memcpy(to_tokudb, from_mysql, 4);
        break;
    case (8):
        memcpy(to_tokudb, from_mysql, 8);
        break;
    default:
        memcpy(to_tokudb, from_mysql, num_bytes);
        break;
    }
    return to_tokudb+num_bytes;
}

static inline const uchar* unpack_fixed_field(
    uchar* to_mysql,
    const uchar* from_tokudb,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_mysql, from_tokudb, 1);
        break;
    case (2):
        memcpy(to_mysql, from_tokudb, 2);
        break;
    case (3):
        memcpy(to_mysql, from_tokudb, 3);
        break;
    case (4):
        memcpy(to_mysql, from_tokudb, 4);
        break;
    case (8):
        memcpy(to_mysql, from_tokudb, 8);
        break;
    default:
        memcpy(to_mysql, from_tokudb, num_bytes);
        break;
    }
    return from_tokudb+num_bytes;
}

static inline uchar* write_var_field(
    uchar* to_tokudb_offset_ptr, //location where offset data is going to be written
    uchar* to_tokudb_data, // location where data is going to be written
    uchar* to_tokudb_offset_start, //location where offset starts, IS THIS A BAD NAME????
    const uchar * data, // the data to write
    uint32_t data_length, // length of data to write
    uint32_t offset_bytes // number of offset bytes
    )
{
    memcpy(to_tokudb_data, data, data_length);
    //
    // for offset, we pack the offset where the data ENDS!
    //
    uint32_t offset = to_tokudb_data + data_length - to_tokudb_offset_start;
    switch(offset_bytes) {
    case (1):
        to_tokudb_offset_ptr[0] = (uchar)offset;
        break;
    case (2):
        int2store(to_tokudb_offset_ptr,offset);
        break;
    default:
        assert_unreachable();
        break;
    }
    return to_tokudb_data + data_length;
}
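
// Worked example (a sketch, assuming the variable-length data region
// begins at to_tokudb_offset_start): packing "ab" then "cde" with
// offset_bytes == 1 writes data "abcde" and stores end-offsets 2 and 5,
// since each offset records where that field's data ends.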

static inline uint32_t get_var_data_length(
    const uchar * from_mysql,
    uint32_t mysql_length_bytes
    )
{
    uint32_t data_length;
    switch(mysql_length_bytes) {
    case(1):
        data_length = from_mysql[0];
        break;
    case(2):
        data_length = uint2korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }
    return data_length;
}

static inline uchar* pack_var_field(
    uchar* to_tokudb_offset_ptr, //location where offset data is going to be written
    uchar* to_tokudb_data, // pointer to where tokudb data should be written
    uchar* to_tokudb_offset_start, //location where data starts, IS THIS A BAD NAME????
    const uchar * from_mysql, // mysql data
    uint32_t mysql_length_bytes, //number of bytes used to store length in from_mysql
    uint32_t offset_bytes //number of offset_bytes used in tokudb row
    )
{
    uint data_length = get_var_data_length(from_mysql, mysql_length_bytes);
    return write_var_field(
        to_tokudb_offset_ptr,
        to_tokudb_data,
        to_tokudb_offset_start,
        from_mysql + mysql_length_bytes,
        data_length,
        offset_bytes
        );
}

static inline void unpack_var_field(
    uchar* to_mysql,
    const uchar* from_tokudb_data,
    uint32_t from_tokudb_data_len,
    uint32_t mysql_length_bytes
    )
{
    //
    // store the length
    //
    switch (mysql_length_bytes) {
    case(1):
        to_mysql[0] = (uchar)from_tokudb_data_len;
        break;
    case(2):
        int2store(to_mysql, from_tokudb_data_len);
        break;
    default:
        assert_unreachable();
    }
    //
    // store the data
    //
    memcpy(to_mysql+mysql_length_bytes, from_tokudb_data, from_tokudb_data_len);
}

static uchar* pack_toku_field_blob(
    uchar* to_tokudb,
    const uchar* from_mysql,
    Field* field
    )
{
    uint32_t len_bytes = field->row_pack_length();
    uint32_t length = 0;
    uchar* data_ptr = NULL;
    memcpy(to_tokudb, from_mysql, len_bytes);

    switch (len_bytes) {
    case (1):
        length = (uint32_t)(*from_mysql);
        break;
    case (2):
        length = uint2korr(from_mysql);
        break;
    case (3):
        length = tokudb_uint3korr(from_mysql);
        break;
    case (4):
        length = uint4korr(from_mysql);
        break;
    default:
        assert_unreachable();
    }

    if (length > 0) {
        memcpy((uchar *)(&data_ptr), from_mysql + len_bytes, sizeof(uchar*));
        memcpy(to_tokudb + len_bytes, data_ptr, length);
    }
    return (to_tokudb + len_bytes + length);
}
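
// Layout note (as the code above relies on): a MySQL blob field holds
// 1-4 length bytes followed by an in-memory pointer to the data, so we
// copy the length bytes, read the embedded pointer, and then inline the
// blob bytes right after the length in the TokuDB row.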

static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) {
    int error;
    tokudb_trx_data* trx = (tokudb_trx_data *) tokudb::memory::malloc(
        sizeof(*trx),
        MYF(MY_ZEROFILL));
    if (!trx) {
        error = ENOMEM;
        goto cleanup;
    }

    *out_trx = trx;
    error = 0;
cleanup:
    return error;
}

static inline int tokudb_generate_row(DB* dest_db,
                                      TOKUDB_UNUSED(DB* src_db),
                                      DBT* dest_key,
                                      DBT* dest_val,
                                      const DBT* src_key,
                                      const DBT* src_val) {
    int error;

    DB* curr_db = dest_db;
    uchar* row_desc = NULL;
    uint32_t desc_size;
    uchar* buff = NULL;
    uint32_t max_key_len = 0;

    row_desc = (uchar *)curr_db->descriptor->dbt.data;
    row_desc += (*(uint32_t *)row_desc);
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;

    if (is_key_pk(row_desc)) {
        if (dest_key->flags == DB_DBT_REALLOC && dest_key->data != NULL) {
            free(dest_key->data);
        }
        if (dest_val != NULL) {
            if (dest_val->flags == DB_DBT_REALLOC && dest_val->data != NULL) {
                free(dest_val->data);
            }
        }
        dest_key->data = src_key->data;
        dest_key->size = src_key->size;
        dest_key->flags = 0;
        if (dest_val != NULL) {
            dest_val->data = src_val->data;
            dest_val->size = src_val->size;
            dest_val->flags = 0;
        }
        error = 0;
        goto cleanup;
    }
    // at this point, we need to create the key/val and set it
    // in the DBTs
    if (dest_key->flags == 0) {
        dest_key->ulen = 0;
        dest_key->size = 0;
        dest_key->data = NULL;
        dest_key->flags = DB_DBT_REALLOC;
    }
    if (dest_key->flags == DB_DBT_REALLOC) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;

        if (max_key_len > dest_key->ulen) {
            void* old_ptr = dest_key->data;
            void* new_ptr = NULL;
            new_ptr = realloc(old_ptr, max_key_len);
            assert_always(new_ptr);
            dest_key->data = new_ptr;
            dest_key->ulen = max_key_len;
        }

        buff = (uchar *)dest_key->data;
        assert_always(buff != nullptr);
        assert_always(max_key_len > 0);
    } else {
        assert_unreachable();
    }

    dest_key->size = pack_key_from_desc(buff, row_desc, desc_size, src_key,
                                        src_val);
    assert_always(dest_key->ulen >= dest_key->size);
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY)) &&
        !max_key_len) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;
    }
    if (max_key_len) {
        assert_always(max_key_len >= dest_key->size);
    }

    row_desc += desc_size;
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;
    if (dest_val != NULL) {
        if (!is_key_clustering(desc_size) || src_val->size == 0) {
            dest_val->size = 0;
        } else {
            uchar* buff = NULL;
            if (dest_val->flags == 0) {
                dest_val->ulen = 0;
                dest_val->size = 0;
                dest_val->data = NULL;
                dest_val->flags = DB_DBT_REALLOC;
            }
            if (dest_val->flags == DB_DBT_REALLOC){
                if (dest_val->ulen < src_val->size) {
                    void* old_ptr = dest_val->data;
                    void* new_ptr = NULL;
                    new_ptr = realloc(old_ptr, src_val->size);
                    assert_always(new_ptr);
                    dest_val->data = new_ptr;
                    dest_val->ulen = src_val->size;
                }
                buff = (uchar *)dest_val->data;
                assert_always(buff != NULL);
            } else {
                assert_unreachable();
            }
            dest_val->size = pack_clustering_val_from_desc(
                buff,
                row_desc,
                desc_size,
                src_val);
            assert_always(dest_val->ulen >= dest_val->size);
        }
    }
    error = 0;
cleanup:
    return error;
}
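
// Descriptor layout assumed by tokudb_generate_row, inferred from the
// pointer arithmetic above: the DB descriptor begins with a uint32
// offset to the key description; the key description starts with a
// uint32 size (inclusive of the 4 size bytes) followed by its packed
// body, and a clustering-value description of the same shape follows it.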

static int generate_row_for_del(
    DB *dest_db,
    DB *src_db,
    DBT_ARRAY *dest_key_arrays,
    const DBT *src_key,
    const DBT *src_val
    )
{
    DBT* dest_key = &dest_key_arrays->dbts[0];
    return tokudb_generate_row(
        dest_db,
        src_db,
        dest_key,
        NULL,
        src_key,
        src_val
        );
}


static int generate_row_for_put(
    DB *dest_db,
    DB *src_db,
    DBT_ARRAY *dest_key_arrays,
    DBT_ARRAY *dest_val_arrays,
    const DBT *src_key,
    const DBT *src_val
    )
{
    DBT* dest_key = &dest_key_arrays->dbts[0];
    DBT *dest_val = (dest_val_arrays == NULL) ? NULL : &dest_val_arrays->dbts[0];
    return tokudb_generate_row(
        dest_db,
        src_db,
        dest_key,
        dest_val,
        src_key,
        src_val
        );
}

ha_tokudb::ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg):handler(hton, table_arg) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    share = NULL;
    int_table_flags = HA_REC_NOT_IN_SEQ  | HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS
        | HA_PRIMARY_KEY_IN_READ_INDEX | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
        | HA_FILE_BASED | HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX
        | HA_CAN_WRITE_DURING_OPTIMIZE | HA_ONLINE_ANALYZE;
    alloc_ptr = NULL;
    rec_buff = NULL;
    rec_update_buff = NULL;
    transaction = NULL;
    cursor = NULL;
    fixed_cols_for_query = NULL;
    var_cols_for_query = NULL;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;
    unpack_entire_row = true;
    read_blobs = false;
    read_key = false;
    added_rows = 0;
    deleted_rows = 0;
    updated_rows = 0;
    last_dup_key = UINT_MAX;
    using_ignore = false;
    using_ignore_no_key = false;
    last_cursor_error = 0;
    range_lock_grabbed = false;
    blob_buff = NULL;
    num_blob_bytes = 0;
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    memset(mult_key_dbt_array, 0, sizeof(mult_key_dbt_array));
    memset(mult_rec_dbt_array, 0, sizeof(mult_rec_dbt_array));
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_key_dbt_array[i], 1);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_rec_dbt_array[i], 1);
    }
    loader = NULL;
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    lock.type = TL_IGNORE;
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_del_flags[i] = DB_DELETE_ANY;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    use_write_locks = false;
    range_query_buff = NULL;
    size_range_query_buff = 0;
    bytes_used_in_range_query_buff = 0;
    curr_range_query_buff_offset = 0;
    doing_bulk_fetch = false;
    prelocked_left_range_size = 0;
    prelocked_right_range_size = 0;
    tokudb_active_index = MAX_KEY;
    invalidate_icp();
    trx_handler_list.data = this;
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    in_rpl_write_rows = in_rpl_delete_rows = in_rpl_update_rows = false;
#endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}

ha_tokudb::~ha_tokudb() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_key_dbt_array[i]);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}

//
// states whether the table has an auto increment column; if so, sets *index
// to that column's position in the table
// Parameters:
//      [out]   index - if an auto inc column exists, set to its position in
//              the table; otherwise unchanged
// Returns:
//      true if auto inc column exists, false otherwise
//
bool ha_tokudb::has_auto_increment_flag(uint* index) {
    //
    // check to see if we have auto increment field
    //
    bool ai_found = false;
    uint ai_index = 0;
    for (uint i = 0; i < table_share->fields; i++, ai_index++) {
        Field* field = table->field[i];
        if (field->flags & AUTO_INCREMENT_FLAG) {
            ai_found = true;
            *index = ai_index;
            break;
        }
    }
    return ai_found;
}

static int open_status_dictionary(DB** ptr, const char* name, DB_TXN* txn) {
    int error;
    char* newname = NULL;
    size_t newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, "status");
    TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "open:%s", newname);

    error = tokudb::metadata::open(db_env, ptr, newname, txn);
cleanup:
    tokudb::memory::free(newname);
    return error;
}

int ha_tokudb::open_main_dictionary(
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error;
    char* newname = NULL;
    size_t newname_len = 0;
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;

    assert_always(share->file == NULL);
    assert_always(share->key_file[primary_key] == NULL);
    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(
        newname_len,
        MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto exit;
    }
    make_name(newname, newname_len, name, "main");

    error = db_create(&share->file, db_env, 0);
    if (error) {
        goto exit;
    }
    share->key_file[primary_key] = share->file;

    error =
        share->file->open(
            share->file,
            txn,
            newname,
            NULL,
            DB_BTREE,
            open_flags,
            S_IWUSR);
    if (error) {
        goto exit;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        share->file);

    error = 0;
exit:
    if (error) {
        if (share->file) {
            int r = share->file->close(
                share->file,
                0
                );
            assert_always(r==0);
            share->file = NULL;
            share->key_file[primary_key] = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}

//
// Open a secondary table; the key will be a secondary index, the data will
// be a primary key
//
int ha_tokudb::open_secondary_dictionary(
    DB** ptr,
    KEY* key_info,
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error = ENOSYS;
    char dict_name[MAX_DICT_NAME_LEN];
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;
    char* newname = NULL;
    size_t newname_len = 0;

    sprintf(dict_name, "key-%s", key_info->name);

    newname_len = get_max_dict_name_path_length(name);
    newname =
        (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    make_name(newname, newname_len, name, dict_name);


    if ((error = db_create(ptr, db_env, 0))) {
        my_errno = error;
        goto cleanup;
    }


    error = (*ptr)->open(*ptr, txn, newname, NULL, DB_BTREE, open_flags, S_IWUSR);
    if (error) {
        my_errno = error;
        goto cleanup;
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        *ptr);
cleanup:
    if (error) {
        if (*ptr) {
            int r = (*ptr)->close(*ptr, 0);
            assert_always(r==0);
            *ptr = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}

static int initialize_col_pack_info(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    int error = ENOSYS;
    //
    // set up the cp_info
    //
    assert_always(kc_info->cp_info[keynr] == NULL);
    kc_info->cp_info[keynr] = (COL_PACK_INFO*)tokudb::memory::malloc(
        table_share->fields * sizeof(COL_PACK_INFO),
        MYF(MY_WME | MY_ZEROFILL));
    if (kc_info->cp_info[keynr] == NULL) {
        error = ENOMEM;
        goto exit;
    }
    {
    uint32_t curr_fixed_offset = 0;
    uint32_t curr_var_index = 0;
    for (uint j = 0; j < table_share->fields; j++) {
        COL_PACK_INFO* curr = &kc_info->cp_info[keynr][j];
        //
        // need to set the offsets / indexes
        // offsets are calculated AFTER the NULL bytes
        //
        if (!bitmap_is_set(&kc_info->key_filters[keynr],j)) {
            if (is_fixed_field(kc_info, j)) {
                curr->col_pack_val = curr_fixed_offset;
                curr_fixed_offset += kc_info->field_lengths[j];
            }
            else if (is_variable_field(kc_info, j)) {
                curr->col_pack_val = curr_var_index;
                curr_var_index++;
            }
        }
    }

    //
    // set up the mcp_info
    //
    kc_info->mcp_info[keynr].fixed_field_size = get_fixed_field_size(
        kc_info,
        table_share,
        keynr
        );
    kc_info->mcp_info[keynr].len_of_offsets = get_len_of_offsets(
        kc_info,
        table_share,
        keynr
        );

    error = 0;
    }
exit:
    return error;
}
1472 
1473 // reset the kc_info state at keynr
reset_key_and_col_info(KEY_AND_COL_INFO * kc_info,uint keynr)1474 static void reset_key_and_col_info(KEY_AND_COL_INFO *kc_info, uint keynr) {
1475     bitmap_clear_all(&kc_info->key_filters[keynr]);
1476     tokudb::memory::free(kc_info->cp_info[keynr]);
1477     kc_info->cp_info[keynr] = NULL;
1478     kc_info->mcp_info[keynr] = (MULTI_COL_PACK_INFO) { 0, 0 };
1479 }
1480 
initialize_key_and_col_info(TABLE_SHARE * table_share,TABLE * table,KEY_AND_COL_INFO * kc_info,uint hidden_primary_key,uint primary_key)1481 static int initialize_key_and_col_info(
1482     TABLE_SHARE* table_share,
1483     TABLE* table,
1484     KEY_AND_COL_INFO* kc_info,
1485     uint hidden_primary_key,
1486     uint primary_key) {
1487 
1488     int error = 0;
1489     uint32_t curr_blob_field_index = 0;
1490     uint32_t max_var_bytes = 0;
1491     //
1492     // fill in the field lengths. 0 means it is a variable sized field length
1493     // fill in length_bytes, 0 means it is fixed or blob
1494     //
1495     for (uint i = 0; i < table_share->fields; i++) {
1496         Field* field = table_share->field[i];
1497         TOKU_TYPE toku_type = mysql_to_toku_type(field);
1498         uint32 pack_length = 0;
1499         switch (toku_type) {
1500         case toku_type_int:
1501         case toku_type_double:
1502         case toku_type_float:
1503         case toku_type_fixbinary:
1504         case toku_type_fixstring:
1505             pack_length = field->pack_length();
1506             assert_always(pack_length < 1<<16);
1507             kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD;
1508             kc_info->field_lengths[i] = (uint16_t)pack_length;
1509             kc_info->length_bytes[i] = 0;
1510             break;
1511         case toku_type_blob:
1512             kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD;
1513             kc_info->field_lengths[i] = 0;
1514             kc_info->length_bytes[i] = 0;
1515             kc_info->blob_fields[curr_blob_field_index] = i;
1516             curr_blob_field_index++;
1517             break;
1518         case toku_type_varstring:
1519         case toku_type_varbinary:
1520             kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD;
1521             kc_info->field_lengths[i] = 0;
1522             kc_info->length_bytes[i] =
1523                 (uchar)((Field_varstring*)field)->length_bytes;
1524             max_var_bytes += field->field_length;
1525             break;
1526         default:
1527             assert_unreachable();
1528         }
1529     }
1530     kc_info->num_blobs = curr_blob_field_index;
1531 
1532     //
1533     // initialize share->num_offset_bytes
1534     // because MAX_REF_LENGTH is 65536, we
1535     // can safely set num_offset_bytes to 1 or 2
1536     //
1537     if (max_var_bytes < 256) {
1538         kc_info->num_offset_bytes = 1;
1539     } else {
1540         kc_info->num_offset_bytes = 2;
1541     }
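    // Illustrative example (added commentary, not in the original source):
    // two VARCHAR(100) latin1 columns sum to max_var_bytes = 200 < 256, so
    // every variable-field end offset fits in one byte; two VARCHAR(200)
    // columns sum to 400, forcing two-byte offsets for the whole row format.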

    for (uint i = 0;
         i < table_share->keys + tokudb_test(hidden_primary_key);
         i++) {
        //
        // do the cluster/primary key filtering calculations
        //
        if (!(i==primary_key && hidden_primary_key)) {
            if (i == primary_key) {
                set_key_filter(
                    &kc_info->key_filters[primary_key],
                    &table_share->key_info[primary_key],
                    table,
                    true);
            } else {
                set_key_filter(
                    &kc_info->key_filters[i],
                    &table_share->key_info[i],
                    table,
                    true);
                if (!hidden_primary_key) {
                    set_key_filter(
                        &kc_info->key_filters[i],
                        &table_share->key_info[primary_key],
                        table,
                        true);
                }
            }
        }
        if (i == primary_key || key_is_clustering(&table_share->key_info[i])) {
            error = initialize_col_pack_info(kc_info, table_share, i);
            if (error) {
                goto exit;
            }
        }
    }
exit:
    return error;
}

bool ha_tokudb::can_replace_into_be_fast(
    TABLE_SHARE* table_share,
    KEY_AND_COL_INFO* kc_info,
    uint pk) {

    uint curr_num_DBs = table_share->keys + tokudb_test(hidden_primary_key);
    bool ret_val;
    if (curr_num_DBs == 1) {
        ret_val = true;
        goto exit;
    }
    ret_val = true;
    for (uint curr_index = 0; curr_index < table_share->keys; curr_index++) {
        if (curr_index == pk) continue;
        KEY* curr_key_info = &table_share->key_info[curr_index];
        for (uint i = 0; i < curr_key_info->user_defined_key_parts; i++) {
            uint16 curr_field_index = curr_key_info->key_part[i].field->field_index;
            if (!bitmap_is_set(&kc_info->key_filters[curr_index],curr_field_index)) {
                ret_val = false;
                goto exit;
            }
            if (bitmap_is_set(&kc_info->key_filters[curr_index], curr_field_index) &&
                !bitmap_is_set(&kc_info->key_filters[pk], curr_field_index)) {
                ret_val = false;
                goto exit;
            }
        }
    }
exit:
    return ret_val;
}
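// Added note (interpretation, not original commentary): the loop above
// declares REPLACE INTO "fast" only when, for every secondary index, each
// key column is filtered out of the secondary row AND is also covered by
// the primary key's filter, so secondary index entries can be regenerated
// without first reading the old row; a single-dictionary table is
// trivially fast.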

int ha_tokudb::initialize_share(const char* name, int mode) {
    int error = 0;
    uint64_t num_rows = 0;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    THD* thd = ha_thd();
    tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
    if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
        txn = trx->sub_sp_level;
    }
    else {
        do_commit = true;
        error = txn_begin(db_env, 0, &txn, 0, thd);
        if (error) { goto exit; }
    }

    error = get_status(txn);
    if (error) {
        goto exit;
    }
    if (share->version != HA_TOKU_VERSION) {
        error = ENOSYS;
        goto exit;
    }

#if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
#if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
    // verify frm data for non-partitioned tables
    if (TOKU_PARTITION_WRITE_FRM_DATA || table->part_info == NULL) {
        error = verify_frm_data(table->s->path.str, txn);
        if (error)
            goto exit;
    } else {
        // remove the frm data for partitions since we are not maintaining it
        error = remove_frm_data(share->status_block, txn);
        if (error)
            goto exit;
    }
#else
    error = verify_frm_data(table->s->path.str, txn);
    if (error)
        goto exit;
#endif  // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
#endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA

    error =
        initialize_key_and_col_info(
            table_share,
            table,
            &share->kc_info,
            hidden_primary_key,
            primary_key);
    if (error) { goto exit; }

    error = open_main_dictionary(name, mode == O_RDONLY, txn);
    if (error) {
        goto exit;
    }

    share->has_unique_keys = false;
    share->_keys = table_share->keys;
    share->_max_key_parts = table_share->key_parts;
    share->_key_descriptors =
        (TOKUDB_SHARE::key_descriptor_t*)tokudb::memory::malloc(
            sizeof(TOKUDB_SHARE::key_descriptor_t) * share->_keys,
            MYF(MY_ZEROFILL));

    /* Open other keys; these are part of the share structure */
    for (uint i = 0; i < table_share->keys; i++) {
        share->_key_descriptors[i]._parts =
            table_share->key_info[i].user_defined_key_parts;
        if (i == primary_key) {
            share->_key_descriptors[i]._is_unique = true;
            share->_key_descriptors[i]._name = tokudb::memory::strdup("primary", 0);
        } else {
            share->_key_descriptors[i]._is_unique = false;
            share->_key_descriptors[i]._name =
                tokudb::memory::strdup(table_share->key_info[i].name, 0);
        }

        if (table_share->key_info[i].flags & HA_NOSAME) {
            share->_key_descriptors[i]._is_unique = true;
            share->has_unique_keys = true;
        }
        if (i != primary_key) {
            error =
                open_secondary_dictionary(
                    &share->key_file[i],
                    &table_share->key_info[i],
                    name,
                    mode == O_RDONLY,
                    txn);
            if (error) {
                goto exit;
            }
        }
    }
    share->replace_into_fast =
        can_replace_into_be_fast(
            table_share,
            &share->kc_info,
            primary_key);

    share->pk_has_string = false;
    if (!hidden_primary_key) {
        //
        // We need to set the ref_length to start at 5, to account for
        // the "infinity byte" in keys, and for placing the DBT size in the first four bytes
        //
        ref_length = sizeof(uint32_t) + sizeof(uchar);
        KEY_PART_INFO* key_part = table->key_info[primary_key].key_part;
        KEY_PART_INFO* end =
            key_part + table->key_info[primary_key].user_defined_key_parts;
        for (; key_part != end; key_part++) {
            ref_length += key_part->field->max_packed_col_length(key_part->length);
            TOKU_TYPE toku_type = mysql_to_toku_type(key_part->field);
            if (toku_type == toku_type_fixstring ||
                toku_type == toku_type_varstring ||
                toku_type == toku_type_blob
                )
            {
                share->pk_has_string = true;
            }
        }
        share->status |= STATUS_PRIMARY_KEY_INIT;
    }
    share->ref_length = ref_length;
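    // Illustrative example (added, not in the original source): for a
    // primary key (a INT, b VARCHAR(10)), ref_length above works out to
    // 4 (DBT size) + 1 (infinity byte) + max_packed_col_length(a)
    // + max_packed_col_length(b).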

    error = estimate_num_rows(share->file, &num_rows, txn);
    //
    // estimate_num_rows should not fail under normal conditions
    //
    if (error == 0) {
        share->set_row_count(num_rows, true);
    } else {
        goto exit;
    }
    //
    // initialize auto increment data
    //
    share->has_auto_inc = has_auto_increment_flag(&share->ai_field_index);
    if (share->has_auto_inc) {
        init_auto_increment();
    }

    if (may_table_be_empty(txn)) {
        share->try_table_lock = true;
    } else {
        share->try_table_lock = false;
    }

    share->num_DBs = table_share->keys + tokudb_test(hidden_primary_key);

    init_hidden_prim_key_info(txn);

    // initialize cardinality info from the status dictionary
    {
        uint32_t rec_per_keys = tokudb::compute_total_key_parts(table_share);
        uint64_t* rec_per_key =
            (uint64_t*)tokudb::memory::malloc(
                rec_per_keys * sizeof(uint64_t),
                MYF(MY_FAE));
        error =
            tokudb::get_card_from_status(
                share->status_block,
                txn,
                rec_per_keys,
                rec_per_key);
        if (error) {
            memset(rec_per_key, 0, sizeof(ulonglong) * rec_per_keys);
        }
        share->init_cardinality_counts(rec_per_keys, rec_per_key);
    }

    error = 0;
exit:
    if (do_commit && txn) {
        commit_txn(txn,0);
    }
    return error;
}

//
// Creates and opens a handle to a table which already exists in a tokudb
// database.
// Parameters:
//      [in]   name - table name
//             mode - seems to specify if table is read only
//             test_if_locked - unused
// Returns:
//      0 on success
//      1 on error
//
int ha_tokudb::open(const char *name, int mode, uint test_if_locked) {
    TOKUDB_HANDLER_DBUG_ENTER("%s %o %u", name, mode, test_if_locked);
    THD* thd = ha_thd();

    int error = 0;
    int ret_val = 0;

    transaction = NULL;
    cursor = NULL;

    /* Open primary key */
    hidden_primary_key = 0;
    if ((primary_key = table_share->primary_key) >= MAX_KEY) {
        // No primary key
        primary_key = table_share->keys;
        key_used_on_scan = MAX_KEY;
        hidden_primary_key = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        ref_length = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t);
    }
    else {
        key_used_on_scan = primary_key;
    }

    /* Need some extra memory in case of packed keys */
    // the "+ 1" is for the first byte that states +/- infinity
    // multiply everything by 2 to account for clustered keys having a key and primary key together
    max_key_length = 2*(table_share->max_key_length + MAX_REF_PARTS * 3 + sizeof(uchar));
    alloc_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME),
        &key_buff, max_key_length,
        &key_buff2, max_key_length,
        &key_buff3, max_key_length,
        &key_buff4, max_key_length,
        &prelocked_left_range, max_key_length,
        &prelocked_right_range, max_key_length,
        &primary_key_buff, (hidden_primary_key ? 0 : max_key_length),
        &fixed_cols_for_query, table_share->fields*sizeof(uint32_t),
        &var_cols_for_query, table_share->fields*sizeof(uint32_t),
        NullS);
    if (alloc_ptr == NULL) {
        ret_val = 1;
        goto exit;
    }

    size_range_query_buff = tokudb::sysvars::read_buf_size(thd);
    range_query_buff =
        (uchar*)tokudb::memory::malloc(size_range_query_buff, MYF(MY_WME));
    if (range_query_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_rec_buff_length = table_share->rec_buff_length +
        table_share->fields;
    rec_buff = (uchar *) tokudb::memory::malloc(
        alloced_rec_buff_length,
        MYF(MY_WME));
    if (rec_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_update_rec_buff_length = alloced_rec_buff_length;
    rec_update_buff = (uchar*)tokudb::memory::malloc(
        alloced_update_rec_buff_length,
        MYF(MY_WME));
    if (rec_update_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    // lookup or create share
    share = TOKUDB_SHARE::get_share(name, &lock, true);
    assert_always(share);

    if (share->state() != TOKUDB_SHARE::OPENED) {
        // means we're responsible for the transition to OPENED, ERROR or CLOSED

        ret_val = allocate_key_and_col_info(table_share, &share->kc_info);
        if (ret_val == 0) {
            ret_val = initialize_share(name, mode);
        }

        if (ret_val == 0) {
            share->set_state(TOKUDB_SHARE::OPENED);
        } else {
            free_key_and_col_info(&share->kc_info);
            share->set_state(TOKUDB_SHARE::ERROR);
        }
        share->unlock();
    } else {
        // got an already OPENED instance
        share->unlock();
    }

    if (share->state() == TOKUDB_SHARE::ERROR) {
        share->release();
        goto exit;
    }

    assert_always(share->state() == TOKUDB_SHARE::OPENED);

    ref_length = share->ref_length;     // If second open

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "tokudbopen:%p:share=%p:file=%p:table=%p:table->s=%p:%d",
        this,
        share,
        share->file,
        table,
        table->s,
        share->use_count());

    key_read = false;
    stats.block_size = 1<<20;    // QQQ Tokudb DB block size

    info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

exit:
    if (ret_val) {
        tokudb::memory::free(range_query_buff);
        range_query_buff = NULL;
        tokudb::memory::free(alloc_ptr);
        alloc_ptr = NULL;
        tokudb::memory::free(rec_buff);
        rec_buff = NULL;
        tokudb::memory::free(rec_update_buff);
        rec_update_buff = NULL;

        if (error) {
            my_errno = error;
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(ret_val);
}

//
// estimate the number of rows in a DB
// Parameters:
//      [in]    db - DB whose number of rows will be estimated
//      [out]   num_rows - number of estimated rows in db
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) {
    int error = ENOSYS;
    bool do_commit = false;
    DB_BTREE_STAT64 dict_stats;
    DB_TXN* txn_to_use = NULL;

    if (txn == NULL) {
        error = txn_begin(db_env, 0, &txn_to_use, DB_READ_UNCOMMITTED, ha_thd());
        if (error) goto cleanup;
        do_commit = true;
    }
    else {
        txn_to_use = txn;
    }

    error = db->stat64(db, txn_to_use, &dict_stats);
    if (error) { goto cleanup; }

    *num_rows = dict_stats.bt_ndata;
    error = 0;
cleanup:
    if (do_commit) {
        commit_txn(txn_to_use, 0);
        txn_to_use = NULL;
    }
    return error;
}
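// Added note (not in the original source): bt_ndata from stat64() is a
// dictionary-wide figure maintained by the fractal tree rather than an
// exact count, which is why this helper is only an estimate and why a
// DB_READ_UNCOMMITTED transaction suffices when the caller passes none.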


int ha_tokudb::write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn) {
    return write_metadata(db, &curr_key_data, sizeof curr_key_data, data, size, txn);
}

int ha_tokudb::remove_from_status(DB *db, HA_METADATA_KEY curr_key_data, DB_TXN *txn) {
    return remove_metadata(db, &curr_key_data, sizeof curr_key_data, txn);
}

int ha_tokudb::remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction) {
    int error;
    DBT key;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    //
    // transaction to be used for removing metadata from status.tokudb
    //
    if (transaction == NULL) {
        error = txn_begin(db_env, 0, &txn, 0, ha_thd());
        if (error) {
            goto cleanup;
        }
        do_commit = true;
    }
    else {
        txn = transaction;
    }

    memset(&key, 0, sizeof(key));
    key.data = key_data;
    key.size = key_size;
    error = db->del(db, txn, &key, DB_DELETE_ANY);
    if (error) {
        goto cleanup;
    }

    error = 0;
cleanup:
    if (do_commit && txn) {
        if (!error) {
            commit_txn(txn, DB_TXN_NOSYNC);
        }
        else {
            abort_txn(txn);
        }
    }
    return error;
}

//
// helper function to write a piece of metadata into status.tokudb
//
int ha_tokudb::write_metadata(DB* db, void* key_data, uint key_size, void* val_data, uint val_size, DB_TXN* transaction) {
    int error;
    DBT key;
    DBT value;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    //
    // transaction to be used for putting metadata into status.tokudb
    //
    if (transaction == NULL) {
        error = txn_begin(db_env, 0, &txn, 0, ha_thd());
        if (error) {
            goto cleanup;
        }
        do_commit = true;
    }
    else {
        txn = transaction;
    }

    memset(&key, 0, sizeof(key));
    memset(&value, 0, sizeof(value));
    key.data = key_data;
    key.size = key_size;
    value.data = val_data;
    value.size = val_size;
    error = db->put(db, txn, &key, &value, 0);
    if (error) {
        goto cleanup;
    }

    error = 0;
cleanup:
    if (do_commit && txn) {
        if (!error) {
            commit_txn(txn, DB_TXN_NOSYNC);
        }
        else {
            abort_txn(txn);
        }
    }
    return error;
}
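// Illustrative usage (added; mirrors update_max_auto_inc() below): to
// persist a new auto-increment high-water mark in status.tokudb --
//     ulonglong val = 42;
//     write_to_status(share->status_block, hatoku_max_ai,
//                     &val, sizeof(val), txn);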

#if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
int ha_tokudb::write_frm_data(DB* db, DB_TXN* txn, const char* frm_name) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %p %s", db, txn, frm_name);

    uchar* frm_data = NULL;
    size_t frm_len = 0;
    int error = 0;

#if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099
    error = table_share->read_frm_image((const uchar**)&frm_data,&frm_len);
    if (error) { goto cleanup; }
#else
    error = readfrm(frm_name,&frm_data,&frm_len);
    if (error) { goto cleanup; }
#endif

    error = write_to_status(db,hatoku_frm_data,frm_data,(uint)frm_len, txn);
    if (error) { goto cleanup; }

    error = 0;
cleanup:
    tokudb::memory::free(frm_data);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}

int ha_tokudb::remove_frm_data(DB *db, DB_TXN *txn) {
    return remove_from_status(db, hatoku_frm_data, txn);
}

static int smart_dbt_callback_verify_frm(TOKUDB_UNUSED(DBT const* key),
                                         DBT const* row,
                                         void* context) {
    DBT* stored_frm = (DBT *)context;
    stored_frm->size = row->size;
    stored_frm->data = (uchar *)tokudb::memory::malloc(row->size, MYF(MY_WME));
    assert_always(stored_frm->data);
    memcpy(stored_frm->data, row->data, row->size);
    return 0;
}

int ha_tokudb::verify_frm_data(const char* frm_name, DB_TXN* txn) {
    TOKUDB_HANDLER_DBUG_ENTER("%s", frm_name);
    uchar* mysql_frm_data = NULL;
    size_t mysql_frm_len = 0;
    DBT key = {};
    DBT stored_frm = {};
    int error = 0;
    HA_METADATA_KEY curr_key = hatoku_frm_data;

    // get the frm data from MySQL
#if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100099
    error = table_share->read_frm_image((const uchar**)&mysql_frm_data,&mysql_frm_len);
    if (error) {
        goto cleanup;
    }
#else
    error = readfrm(frm_name,&mysql_frm_data,&mysql_frm_len);
    if (error) {
        goto cleanup;
    }
#endif

    key.data = &curr_key;
    key.size = sizeof(curr_key);
    error = share->status_block->getf_set(
        share->status_block,
        txn,
        0,
        &key,
        smart_dbt_callback_verify_frm,
        &stored_frm
        );
    if (error == DB_NOTFOUND) {
        // if not found, write it
        error = write_frm_data(share->status_block, txn, frm_name);
        goto cleanup;
    } else if (error) {
        goto cleanup;
    }

    if (stored_frm.size != mysql_frm_len || memcmp(stored_frm.data, mysql_frm_data, stored_frm.size)) {
        error = HA_ERR_TABLE_DEF_CHANGED;
        goto cleanup;
    }

    error = 0;
cleanup:
    tokudb::memory::free(mysql_frm_data);
    tokudb::memory::free(stored_frm.data);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
#endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA

//
// Updates status.tokudb with a new max value used for the auto increment column
// Parameters:
//      [in]    db - this will always be status.tokudb
//              val - value to store
//  Returns:
//      0 on success, error otherwise
//
int ha_tokudb::update_max_auto_inc(DB* db, ulonglong val) {
    return write_to_status(db,hatoku_max_ai,&val,sizeof(val), NULL);
}

//
// Writes the initial auto increment value, as specified by create table
// so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
// then the value 100 will be stored here in val
// Parameters:
//      [in]    db - this will always be status.tokudb
//              val - value to store
//  Returns:
//      0 on success, error otherwise
//
int ha_tokudb::write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn) {
    return write_to_status(db,hatoku_ai_create_value,&val,sizeof(val), txn);
}


//
// Closes a handle to a table.
//
int ha_tokudb::close() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int r = __close();
    TOKUDB_HANDLER_DBUG_RETURN(r);
}

int ha_tokudb::__close() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "close:%p", this);
    tokudb::memory::free(rec_buff);
    tokudb::memory::free(rec_update_buff);
    tokudb::memory::free(blob_buff);
    tokudb::memory::free(alloc_ptr);
    tokudb::memory::free(range_query_buff);
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_key_dbt_array[i]);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
    }
    rec_buff = NULL;
    rec_update_buff = NULL;
    alloc_ptr = NULL;
    ha_tokudb::reset();
    int retval = share->release();
    TOKUDB_HANDLER_DBUG_RETURN(retval);
}

//
// Reallocate record buffer (rec_buff) if needed
// If not needed, does nothing
// Parameters:
//          length - size of buffer required for rec_buff
//
bool ha_tokudb::fix_rec_buff_for_blob(ulong length) {
    if (!rec_buff || (length > alloced_rec_buff_length)) {
        uchar* newptr = (uchar*)tokudb::memory::realloc(
            (void*)rec_buff,
            length,
            MYF(MY_ALLOW_ZERO_PTR));
        if (!newptr)
            return 1;
        rec_buff = newptr;
        alloced_rec_buff_length = length;
    }
    return 0;
}

//
// Reallocate record buffer (rec_update_buff) if needed
// If not needed, does nothing
// Parameters:
//          length - size of buffer required for rec_update_buff
//
bool ha_tokudb::fix_rec_update_buff_for_blob(ulong length) {
    if (!rec_update_buff || (length > alloced_update_rec_buff_length)) {
        uchar* newptr = (uchar*)tokudb::memory::realloc(
            (void*)rec_update_buff,
            length,
            MYF(MY_ALLOW_ZERO_PTR));
        if (!newptr)
            return 1;
        rec_update_buff = newptr;
        alloced_update_rec_buff_length = length;
    }
    return 0;
}

/* Calculate max length needed for row */
ulong ha_tokudb::max_row_length(const uchar * buf) {
    ulong length = table_share->reclength + table_share->fields * 2;
    uint *ptr, *end;
    for (ptr = table_share->blob_field, end = ptr + table_share->blob_fields; ptr != end; ptr++) {
        Field_blob *blob = ((Field_blob *) table->field[*ptr]);
        length += blob->get_length((uchar *) (buf + field_offset(blob, table))) + 2;
    }
    return length;
}
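// Worked example (added, not in the original source): with reclength = 20,
// 3 fields and one blob currently holding 1000 bytes, max_row_length()
// returns 20 + 3*2 + (1000 + 2) = 1028.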

//
// Pack a row for storage.
// If the row is of fixed length, just store the row 'as is'.
// If not, we will generate a packed row suitable for storage.
// This will only fail if we don't have enough memory to pack the row,
// which may only happen in rows with blobs, as the default row length is
// pre-allocated.
// Parameters:
//      [out]   row - DBT that will point at the packed row
//      [in]    record - row in MySQL format
//              index - index (dictionary) for which the row is being packed
//      [out]   row_buff - buffer where the row is packed
//

int ha_tokudb::pack_row_in_buff(
    DBT * row,
    const uchar* record,
    uint index,
    uchar* row_buff
    )
{
    uchar* fixed_field_ptr = NULL;
    uchar* var_field_offset_ptr = NULL;
    uchar* start_field_data_ptr = NULL;
    uchar* var_field_data_ptr = NULL;
    int r = ENOSYS;
    memset((void *) row, 0, sizeof(*row));

    my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);

    // Copy null bytes
    memcpy(row_buff, record, table_share->null_bytes);
    fixed_field_ptr = row_buff + table_share->null_bytes;
    var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
    start_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
    var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;

    // assert that when the hidden primary key exists, primary_key_offsets is NULL
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
            continue;
        }
        if (is_fixed_field(&share->kc_info, i)) {
            fixed_field_ptr = pack_fixed_field(
                fixed_field_ptr,
                record + curr_field_offset,
                share->kc_info.field_lengths[i]
                );
        }
        else if (is_variable_field(&share->kc_info, i)) {
            var_field_data_ptr = pack_var_field(
                var_field_offset_ptr,
                var_field_data_ptr,
                start_field_data_ptr,
                record + curr_field_offset,
                share->kc_info.length_bytes[i],
                share->kc_info.num_offset_bytes
                );
            var_field_offset_ptr += share->kc_info.num_offset_bytes;
        }
    }

    for (uint i = 0; i < share->kc_info.num_blobs; i++) {
        Field* field = table->field[share->kc_info.blob_fields[i]];
        var_field_data_ptr = pack_toku_field_blob(
            var_field_data_ptr,
            record + field_offset(field, table),
            field
            );
    }

    row->data = row_buff;
    row->size = (size_t) (var_field_data_ptr - row_buff);
    r = 0;

    dbug_tmp_restore_column_map(table->write_set, old_map);
    return r;
}
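// Added summary of the packed row format produced above (derived from the
// code, not original commentary):
//   [null bytes][fixed fields][var-field end offsets][var-field data][blobs]
// Each end offset is 1 or 2 bytes (kc_info.num_offset_bytes) and is measured
// from the start of the var-field data region.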


int ha_tokudb::pack_row(
    DBT * row,
    const uchar* record,
    uint index
    )
{
    return pack_row_in_buff(row,record,index,rec_buff);
}

int ha_tokudb::pack_old_row_for_update(
    DBT * row,
    const uchar* record,
    uint index
    )
{
    return pack_row_in_buff(row,record,index,rec_update_buff);
}


int ha_tokudb::unpack_blobs(
    uchar* record,
    const uchar* from_tokudb_blob,
    uint32_t num_bytes,
    bool check_bitmap
    )
{
    uint error = 0;
    uchar* ptr = NULL;
    const uchar* buff = NULL;
    //
    // assert that we never have blob bytes (num_bytes > 0) without blob fields
    //
    assert_always( !((share->kc_info.num_blobs == 0) && (num_bytes > 0)) );
    if (num_bytes > num_blob_bytes) {
        ptr = (uchar*)tokudb::memory::realloc(
            (void*)blob_buff, num_bytes,
            MYF(MY_ALLOW_ZERO_PTR));
        if (ptr == NULL) {
            error = ENOMEM;
            goto exit;
        }
        blob_buff = ptr;
        num_blob_bytes = num_bytes;
    }

    memcpy(blob_buff, from_tokudb_blob, num_bytes);
    buff = blob_buff;
    for (uint i = 0; i < share->kc_info.num_blobs; i++) {
        uint32_t curr_field_index = share->kc_info.blob_fields[i];
        bool skip = check_bitmap ?
            !(bitmap_is_set(table->read_set,curr_field_index) ||
                bitmap_is_set(table->write_set,curr_field_index)) :
            false;
        Field* field = table->field[curr_field_index];
        uint32_t len_bytes = field->row_pack_length();
        const uchar* end_buff = unpack_toku_field_blob(
            record + field_offset(field, table),
            buff,
            len_bytes,
            skip
            );
        // verify that the pointers to the blobs are all contained within the blob_buff
        if (!(blob_buff <= buff && end_buff <= blob_buff + num_bytes)) {
            error = -3000000;
            goto exit;
        }
        buff = end_buff;
    }
    // verify that the entire blob buffer was parsed
    if (share->kc_info.num_blobs > 0 && !(num_bytes > 0 && buff == blob_buff + num_bytes)) {
        error = -4000000;
        goto exit;
    }

    error = 0;
exit:
    return error;
}

//
// take the row passed in as a DBT*, and convert it into a row in MySQL format in record
// Parameters:
//      [out]   record - row in MySQL format
//      [in]    row - row stored in DBT to be converted
//
int ha_tokudb::unpack_row(
    uchar* record,
    DBT const *row,
    DBT const *key,
    uint index
    )
{
    //
    // two cases, fixed length row, and variable length row
    // fixed length row is first below
    //
    /* Copy null bits */
    int error = 0;
    const uchar* fixed_field_ptr = (const uchar *) row->data;
    const uchar* var_field_offset_ptr = NULL;
    const uchar* var_field_data_ptr = NULL;
    uint32_t data_end_offset = 0;
    memcpy(record, fixed_field_ptr, table_share->null_bytes);
    fixed_field_ptr += table_share->null_bytes;

    var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
    var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;

    //
    // unpack the key, if necessary
    //
    if (!(hidden_primary_key && index == primary_key)) {
        unpack_key(record,key,index);
    }

    uint32_t last_offset = 0;
    //
    // we have two methods of unpacking, one if we need to unpack the entire row
    // the second if we unpack a subset of the entire row
    // first method here is if we unpack the entire row
    //
    if (unpack_entire_row) {
        //
        // fill in parts of record that are not part of the key
        //
        for (uint i = 0; i < table_share->fields; i++) {
            Field* field = table->field[i];
            if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
                continue;
            }

            if (is_fixed_field(&share->kc_info, i)) {
                fixed_field_ptr = unpack_fixed_field(
                    record + field_offset(field, table),
                    fixed_field_ptr,
                    share->kc_info.field_lengths[i]
                    );
            }
            //
            // here, we DO modify var_field_data_ptr or var_field_offset_ptr
            // as we unpack variable sized fields
            //
            else if (is_variable_field(&share->kc_info, i)) {
                switch (share->kc_info.num_offset_bytes) {
                case (1):
                    data_end_offset = var_field_offset_ptr[0];
                    break;
                case (2):
                    data_end_offset = uint2korr(var_field_offset_ptr);
                    break;
                default:
                    assert_unreachable();
                }
                unpack_var_field(
                    record + field_offset(field, table),
                    var_field_data_ptr,
                    data_end_offset - last_offset,
                    share->kc_info.length_bytes[i]
                    );
                var_field_offset_ptr += share->kc_info.num_offset_bytes;
                var_field_data_ptr += data_end_offset - last_offset;
                last_offset = data_end_offset;
            }
        }
        error = unpack_blobs(
            record,
            var_field_data_ptr,
            row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
            false
            );
        if (error) {
            goto exit;
        }
    }
    //
    // in this case, we unpack only what is specified
    // in fixed_cols_for_query and var_cols_for_query
    //
    else {
        //
        // first the fixed fields
        //
        for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
            uint field_index = fixed_cols_for_query[i];
            Field* field = table->field[field_index];
            unpack_fixed_field(
                record + field_offset(field, table),
                fixed_field_ptr + share->kc_info.cp_info[index][field_index].col_pack_val,
                share->kc_info.field_lengths[field_index]
                );
        }

        //
        // now the var fields
        // here, we do NOT modify var_field_data_ptr or var_field_offset_ptr
        //
        for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
            uint field_index = var_cols_for_query[i];
            Field* field = table->field[field_index];
            uint32_t var_field_index = share->kc_info.cp_info[index][field_index].col_pack_val;
            uint32_t data_start_offset;
            uint32_t field_len;

            get_var_field_info(
                &field_len,
                &data_start_offset,
                var_field_index,
                var_field_offset_ptr,
                share->kc_info.num_offset_bytes
                );

            unpack_var_field(
                record + field_offset(field, table),
                var_field_data_ptr + data_start_offset,
                field_len,
                share->kc_info.length_bytes[field_index]
                );
        }

        if (read_blobs) {
            //
            // now the blobs
            //
            get_blob_field_info(
                &data_end_offset,
                share->kc_info.mcp_info[index].len_of_offsets,
                var_field_data_ptr,
                share->kc_info.num_offset_bytes
                );

            var_field_data_ptr += data_end_offset;
            error = unpack_blobs(
                record,
                var_field_data_ptr,
                row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
                true
                );
            if (error) {
                goto exit;
            }
        }
    }
    error = 0;
exit:
    return error;
}
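// Added worked example (not in the original source): with two variable
// columns whose packed lengths are 3 and 5, the stored end offsets are
// [3, 8]; the second field's length is recovered above as
// data_end_offset - last_offset = 8 - 3 = 5.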

uint32_t ha_tokudb::place_key_into_mysql_buff(
    KEY* key_info,
    uchar* record,
    uchar* data) {

    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    uchar* pos = data;

    for (; key_part != end; key_part++) {
        if (key_part->field->null_bit) {
            uint null_offset = get_null_offset(table, key_part->field);
            if (*pos++ == NULL_COL_VAL) { // Null value
                //
                // We don't need to reset the record data as we will not access it
                // if the null data is set
                //
                record[null_offset] |= key_part->field->null_bit;
                continue;
            }
            record[null_offset] &= ~key_part->field->null_bit;
        }
#if !defined(MARIADB_BASE_VERSION)
        //
        // HOPEFULLY TEMPORARY
        //
        assert_always(table->s->db_low_byte_first);
#endif
        pos = unpack_toku_key_field(
            record + field_offset(key_part->field, table),
            pos,
            key_part->field,
            key_part->length
            );
    }
    return pos - data;
}

//
// Store the key and the primary key into the row
// Parameters:
//      [out]   record - key stored in MySQL format
//      [in]    key - key stored in DBT to be converted
//              index - index into key_file that represents the DB we are
//                  unpacking a key of
//
void ha_tokudb::unpack_key(uchar * record, DBT const *key, uint index) {
    uint32_t bytes_read;
    uchar *pos = (uchar *) key->data + 1;
    bytes_read = place_key_into_mysql_buff(
        &table->key_info[index],
        record,
        pos
        );
    if ((index != primary_key) && !hidden_primary_key) {
        //
        // also unpack primary key
        //
        place_key_into_mysql_buff(
            &table->key_info[primary_key],
            record,
            pos + bytes_read
            );
    }
}
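// Added summary (derived from the code above): a stored secondary key is
// laid out as [infinity byte][secondary key fields][primary key fields],
// which is why unpack_key() skips the first byte and, for non-primary
// indexes, continues unpacking the primary key at pos + bytes_read.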

uint32_t ha_tokudb::place_key_into_dbt_buff(
    KEY* key_info,
    uchar* buff,
    const uchar* record,
    bool* has_null,
    int key_length) {

    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    uchar* curr_buff = buff;
    *has_null = false;
    for (; key_part != end && key_length > 0; key_part++) {
        //
        // accessing key_part->field->null_bit instead of key_part->null_bit
        // because key_part->null_bit is not set in add_index
        // filed ticket 862 to look into this
        //
        if (key_part->field->null_bit) {
            /* Store 0 if the key part is a NULL part */
            uint null_offset = get_null_offset(table, key_part->field);
            if (record[null_offset] & key_part->field->null_bit) {
                *curr_buff++ = NULL_COL_VAL;
                *has_null = true;
                continue;
            }
            *curr_buff++ = NONNULL_COL_VAL;        // Store NOT NULL marker
        }
#if !defined(MARIADB_BASE_VERSION)
        //
        // HOPEFULLY TEMPORARY
        //
        assert_always(table->s->db_low_byte_first);
#endif
        //
        // accessing field_offset(key_part->field) instead of key_part->offset
        // because key_part->offset is SET INCORRECTLY in add_index
        // filed ticket 862 to look into this
        //
        curr_buff = pack_toku_key_field(
            curr_buff,
            (uchar *) (record + field_offset(key_part->field, table)),
            key_part->field,
            key_part->length
            );
        key_length -= key_part->length;
    }
    return curr_buff - buff;
}


//
// Create a packed key from a row. This key will be written as such
// to the index tree.  This will never fail as the key buffer is pre-allocated.
// Parameters:
//      [out]   key - DBT that holds the key
//      [in]    key_info - holds data about the key, such as its length and offset into record
//      [out]   buff - buffer that will hold the data for key (unless
//                  we have a hidden primary key)
//      [in]    record - row from which to create the key
//              key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
// Returns:
//      the parameter key
//

DBT* ha_tokudb::create_dbt_key_from_key(
    DBT * key,
    KEY* key_info,
    uchar * buff,
    const uchar * record,
    bool* has_null,
    bool dont_pack_pk,
    int key_length,
    uint8_t inf_byte
    )
{
    uint32_t size = 0;
    uchar* tmp_buff = buff;
    my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);

    key->data = buff;

    //
    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity or zero. For this, because we are creating key
    // from a row, there is no way that columns can be missing, so in practice,
    // this will be meaningless. Might as well put in a value
    //
    *tmp_buff++ = inf_byte;
    size++;
    size += place_key_into_dbt_buff(
        key_info,
        tmp_buff,
        record,
        has_null,
        key_length
        );
    if (!dont_pack_pk) {
        tmp_buff = buff + size;
        if (hidden_primary_key) {
            memcpy(tmp_buff, current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
            size += TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        }
        else {
            bool tmp_bool = false;
            size += place_key_into_dbt_buff(
                &table->key_info[primary_key],
                tmp_buff,
                record,
                &tmp_bool,
                MAX_KEY_LENGTH //this parameter does not matter
                );
        }
    }

    key->size = size;
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(table->write_set, old_map);
    return key;
}


//
// Create a packed key from a row. This key will be written as such
// to the index tree.  This will never fail as the key buffer is pre-allocated.
// Parameters:
//      [out]   key - DBT that holds the key
//              keynr - index for which to create the key
//      [out]   buff - buffer that will hold the data for key (unless
//                  we have a hidden primary key)
//      [in]    record - row from which to create the key
//      [out]   has_null - says if the key has a NULL value for one of its columns
//              key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
// Returns:
//      the parameter key
//
DBT *ha_tokudb::create_dbt_key_from_table(
    DBT * key,
    uint keynr,
    uchar * buff,
    const uchar * record,
    bool* has_null,
    int key_length
    )
{
    TOKUDB_HANDLER_DBUG_ENTER("");
    memset((void *) key, 0, sizeof(*key));
    if (hidden_primary_key && keynr == primary_key) {
        key->data = buff;
        memcpy(buff, &current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
        key->size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        *has_null = false;
        DBUG_RETURN(key);
    }
    DBUG_RETURN(create_dbt_key_from_key(key, &table->key_info[keynr],buff,record, has_null, (keynr == primary_key), key_length, COL_ZERO));
}

DBT* ha_tokudb::create_dbt_key_for_lookup(
    DBT * key,
    KEY* key_info,
    uchar * buff,
    const uchar * record,
    bool* has_null,
    int key_length
    )
{
    TOKUDB_HANDLER_DBUG_ENTER("");
    // override the infinity byte, needed in case the pk is a string
    // to make sure that the cursor that uses this key properly positions
    // it at the right location. If the table stores "D", but we look up for "d",
    // and the infinity byte is 0, then we will skip the "D", because
    // in bytes, "d" > "D".
    DBT* ret = create_dbt_key_from_key(key, key_info, buff, record, has_null, true, key_length, COL_NEG_INF);
    DBUG_RETURN(ret);
}

//
// Create a packed key from a MySQL unpacked key (like the one that is
// sent from index_read()). This key is to be used to read a row
// Parameters:
//      [out]   key - DBT that holds the key
//              keynr - index for which to pack the key
//      [out]   buff - buffer that will hold the data for key
//      [in]    key_ptr - MySQL unpacked key
//              key_length - length of key_ptr
// Returns:
//      the parameter key
//
DBT* ha_tokudb::pack_key(
    DBT* key,
    uint keynr,
    uchar* buff,
    const uchar* key_ptr,
    uint key_length,
    int8_t inf_byte) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "key %p %u:%2.2x inf=%d",
        key_ptr,
        key_length,
        key_length > 0 ? key_ptr[0] : 0,
        inf_byte);
#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
    if (keynr != primary_key && !tokudb_test(hidden_primary_key)) {
        DBUG_RETURN(pack_ext_key(key, keynr, buff, key_ptr, key_length, inf_byte));
    }
#endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
    KEY* key_info = &table->key_info[keynr];
    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);

    memset((void *) key, 0, sizeof(*key));
    key->data = buff;

    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity
    *buff++ = (uchar)inf_byte;

    for (; key_part != end && (int) key_length > 0; key_part++) {
        uint offset = 0;
        if (key_part->null_bit) {
            if (!(*key_ptr == 0)) {
                *buff++ = NULL_COL_VAL;
                key_length -= key_part->store_length;
                key_ptr += key_part->store_length;
                continue;
            }
            *buff++ = NONNULL_COL_VAL;
            offset = 1;         // Data is at key_ptr+1
        }
#if !defined(MARIADB_BASE_VERSION)
        assert_always(table->s->db_low_byte_first);
#endif
        buff = pack_key_toku_key_field(
            buff,
            (uchar *) key_ptr + offset,
            key_part->field,
            key_part->length
            );

        key_ptr += key_part->store_length;
        key_length -= key_part->store_length;
    }

    key->size = (buff - (uchar *) key->data);
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(table->write_set, old_map);
    DBUG_RETURN(key);
}
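// Added note (interpretation, not original commentary): a packed search key
// is [infinity byte][per part: optional NULL/NOT-NULL flag + packed value];
// the inf_byte (e.g. COL_NEG_INF vs COL_POS_INF) tells the comparator how to
// order this key relative to stored keys when trailing columns are absent.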

#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
DBT* ha_tokudb::pack_ext_key(
    DBT* key,
    uint keynr,
    uchar* buff,
    const uchar* key_ptr,
    uint key_length,
    int8_t inf_byte) {

    TOKUDB_HANDLER_DBUG_ENTER("");

    // build a list of PK parts that are in the SK.  we will use this list to build the
    // extended key if necessary.
    KEY* pk_key_info = &table->key_info[primary_key];
    uint pk_parts = pk_key_info->user_defined_key_parts;
    uint pk_next = 0;
    struct {
        const uchar *key_ptr;
        KEY_PART_INFO *key_part;
    } pk_info[pk_parts];

    KEY* key_info = &table->key_info[keynr];
    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);

    memset((void *) key, 0, sizeof(*key));
    key->data = buff;

    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity
    *buff++ = (uchar)inf_byte;

    for (; key_part != end && (int) key_length > 0; key_part++) {
        // if the SK part is part of the PK, then append it to the list.
        if (key_part->field->part_of_key.is_set(primary_key)) {
            assert_always(pk_next < pk_parts);
            pk_info[pk_next].key_ptr = key_ptr;
            pk_info[pk_next].key_part = key_part;
            pk_next++;
        }
        uint offset = 0;
        if (key_part->null_bit) {
            if (!(*key_ptr == 0)) {
                *buff++ = NULL_COL_VAL;
                key_length -= key_part->store_length;
                key_ptr += key_part->store_length;
                continue;
            }
            *buff++ = NONNULL_COL_VAL;
            offset = 1;         // Data is at key_ptr+1
        }
#if !defined(MARIADB_BASE_VERSION)
        assert_always(table->s->db_low_byte_first);
#endif
        buff = pack_key_toku_key_field(
            buff,
            (uchar *) key_ptr + offset,
            key_part->field,
            key_part->length
            );

        key_ptr += key_part->store_length;
        key_length -= key_part->store_length;
    }

    if (key_length > 0) {
        assert_always(key_part == end);
        end = key_info->key_part + get_ext_key_parts(key_info);

        // pack PK in order of PK key parts
        for (uint pk_index = 0;
             key_part != end && (int) key_length > 0 && pk_index < pk_parts;
             pk_index++) {
            uint i;
            for (i = 0; i < pk_next; i++) {
                if (pk_info[i].key_part->fieldnr ==
                    pk_key_info->key_part[pk_index].fieldnr)
                    break;
            }
            if (i < pk_next) {
                const uchar *this_key_ptr = pk_info[i].key_ptr;
                KEY_PART_INFO *this_key_part = pk_info[i].key_part;
                buff = pack_key_toku_key_field(
                    buff,
                    (uchar*)this_key_ptr,
                    this_key_part->field,
                    this_key_part->length);
            } else {
                buff = pack_key_toku_key_field(
                    buff,
                    (uchar*)key_ptr,
                    key_part->field,
                    key_part->length);
                key_ptr += key_part->store_length;
                key_length -= key_part->store_length;
                key_part++;
            }
        }
    }

    key->size = (buff - (uchar *) key->data);
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(table->write_set, old_map);
    DBUG_RETURN(key);
}
#endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
3039 
3040 //
3041 // get max used hidden primary key value
3042 //
3043 void ha_tokudb::init_hidden_prim_key_info(DB_TXN *txn) {
3044     TOKUDB_HANDLER_DBUG_ENTER("");
3045     if (!(share->status & STATUS_PRIMARY_KEY_INIT)) {
3046         int error = 0;
3047         DBC* c = NULL;
3048         error = share->key_file[primary_key]->cursor(
3049             share->key_file[primary_key],
3050             txn,
3051             &c,
3052             0);
3053         assert_always(error == 0);
3054         DBT key,val;
3055         memset(&key, 0, sizeof(key));
3056         memset(&val, 0, sizeof(val));
3057         error = c->c_get(c, &key, &val, DB_LAST);
3058         if (error == 0) {
3059             assert_always(key.size == TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
3060             share->auto_ident = hpk_char_to_num((uchar *)key.data);
3061         }
3062         error = c->c_close(c);
3063         assert_always(error == 0);
3064         share->status |= STATUS_PRIMARY_KEY_INIT;
3065     }
3066     TOKUDB_HANDLER_DBUG_VOID_RETURN;
3067 }
3068 
3069 
3070 
3071 /** @brief
3072     Get metadata info stored in status.tokudb
3073     */
3074 int ha_tokudb::get_status(DB_TXN* txn) {
3075     TOKUDB_HANDLER_DBUG_ENTER("");
3076     DBT key, value;
3077     HA_METADATA_KEY curr_key;
3078     int error;
3079 
3080     //
3081     // open status.tokudb
3082     //
3083     if (!share->status_block) {
3084         error =
3085             open_status_dictionary(
3086                 &share->status_block,
3087                 share->full_table_name(),
3088                 txn);
3089         if (error) {
3090             goto cleanup;
3091         }
3092     }
3093 
3094     //
3095     // set up the key/value DBTs used to read metadata out of status.tokudb
3096     //
3097     memset(&key, 0, sizeof(key));
3098     memset(&value, 0, sizeof(value));
3099     key.data = &curr_key;
3100     key.size = sizeof(curr_key);
3101     value.flags = DB_DBT_USERMEM;
3102 
3103     assert_always(share->status_block);
3104     //
3105     // get version
3106     //
3107     value.ulen = sizeof(share->version);
3108     value.data = &share->version;
3109     curr_key = hatoku_new_version;
3110     error = share->status_block->get(
3111         share->status_block,
3112         txn,
3113         &key,
3114         &value,
3115         0
3116         );
3117     if (error == DB_NOTFOUND) {
3118         //
3119         // hack to handle the issues of going back and forth
3120         // between 5.0.3 and 5.0.4
3121         // the problem with going back and forth
3122         // is with storing the frm file: 5.0.4 stores it, 5.0.3 does not,
3123         // so, if a user goes back and forth and alters the schema,
3124         // the stored frm can get out of sync with the schema of the table.
3125         // This can cause issues.
3126         // To take care of this, we are doing this versioning work here.
3127         // We change the key that stores the version.
3128         // In 5.0.3 it is hatoku_old_version; in 5.0.4 it is hatoku_new_version.
3129         // When we encounter a table that does not have hatoku_new_version
3130         // set, we give it the right one, and overwrite the old one with zero.
3131         // This ensures that 5.0.3 cannot open the table once 5.0.4 has opened it.
3132         //
3133         uint dummy_version = 0;
3134         share->version = HA_TOKU_ORIG_VERSION;
3135         error = write_to_status(
3136             share->status_block,
3137             hatoku_new_version,
3138             &share->version,
3139             sizeof(share->version),
3140             txn
3141             );
3142         if (error) { goto cleanup; }
3143         error = write_to_status(
3144             share->status_block,
3145             hatoku_old_version,
3146             &dummy_version,
3147             sizeof(dummy_version),
3148             txn
3149             );
3150         if (error) { goto cleanup; }
3151     }
3152     else if (error || value.size != sizeof(share->version)) {
3153         if (error == 0) {
3154             error = HA_ERR_INTERNAL_ERROR;
3155         }
3156         goto cleanup;
3157     }
3158     //
3159     // get capabilities
3160     //
3161     curr_key = hatoku_capabilities;
3162     value.ulen = sizeof(share->capabilities);
3163     value.data = &share->capabilities;
3164     error = share->status_block->get(
3165         share->status_block,
3166         txn,
3167         &key,
3168         &value,
3169         0
3170         );
3171     if (error == DB_NOTFOUND) {
3172         share->capabilities = 0;
3173     }
3174     else if (error || value.size != sizeof(share->capabilities)) {
3175         if (error == 0) {
3176             error = HA_ERR_INTERNAL_ERROR;
3177         }
3178         goto cleanup;
3179     }
3180 
3181     error = 0;
3182 cleanup:
3183     TOKUDB_HANDLER_DBUG_RETURN(error);
3184 }
3185 
3186 /** @brief
3187     Return an estimate of the number of rows in the table.
3188     Used when sorting to allocate buffers and by the optimizer.
3189     This is used in filesort.cc.
3190 */
3191 ha_rows ha_tokudb::estimate_rows_upper_bound() {
3192     TOKUDB_HANDLER_DBUG_ENTER("");
3193     DBUG_RETURN(share->row_count() + HA_TOKUDB_EXTRA_ROWS);
3194 }
3195 
3196 //
3197 // Function that compares two primary keys that were saved as part of rnd_pos
3198 // and ::position
3199 //
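// Each ref is laid out as a uint32_t length followed by the packed key bytes;
// the key comparison descriptor starts 4 bytes into the dictionary's
// descriptor DBT (its first 4 bytes hold that section's length).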
3200 int ha_tokudb::cmp_ref(const uchar * ref1, const uchar * ref2) {
3201     int ret_val = 0;
3202     bool read_string = false;
3203     ret_val = tokudb_compare_two_keys(
3204         ref1 + sizeof(uint32_t),
3205         *(uint32_t *)ref1,
3206         ref2 + sizeof(uint32_t),
3207         *(uint32_t *)ref2,
3208         (uchar *)share->file->descriptor->dbt.data + 4,
3209         *(uint32_t *)share->file->descriptor->dbt.data - 4,
3210         false,
3211         &read_string
3212         );
3213     return ret_val;
3214 }
3215 
3216 bool ha_tokudb::check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) {
3217   //
3218   // This is a horrendous hack for now, as copied from InnoDB.
3219   // It states that if the auto increment create field has changed,
3220   // via an "alter table foo auto_increment=new_val", the
3221   // change is incompatible, and the entire table must be rebuilt.
3222   // This will need to be fixed.
3223   //
3224   if ((info->used_fields & HA_CREATE_USED_AUTO) &&
3225       info->auto_increment_value != 0) {
3226 
3227     return COMPATIBLE_DATA_NO;
3228   }
3229   if (table_changes != IS_EQUAL_YES)
3230     return COMPATIBLE_DATA_NO;
3231   return COMPATIBLE_DATA_YES;
3232 }
3233 
3234 //
3235 // Method that is called before the beginning of many calls
3236 // to insert rows (ha_tokudb::write_row). There is no guarantee
3237 // that start_bulk_insert is called; however, there is a guarantee
3238 // that if start_bulk_insert is called, then end_bulk_insert will be
3239 // called as well.
3240 // Parameters:
3241 //      [in]    rows - an estimate of the number of rows that will be inserted;
3242 //                     if the number of rows is unknown (such as if doing
3243 //                     "insert into foo select * from bar"), then rows
3244 //                     will be 0
3245 //
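// For example, both "LOAD DATA INFILE ..." and "INSERT INTO foo SELECT * FROM
// bar" trigger start_bulk_insert; the latter passes rows == 0 because the
// number of rows produced by the SELECT is not known in advance.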
3246 //
3247 // This function returns true if the table MAY be empty.
3248 // It is NOT meant to be a 100% check for emptiness.
3249 // This is used for a bulk load optimization.
3250 //
3251 bool ha_tokudb::may_table_be_empty(DB_TXN *txn) {
3252     int error;
3253     bool ret_val = false;
3254     DBC* tmp_cursor = NULL;
3255     DB_TXN* tmp_txn = NULL;
3256 
3257     const int empty_scan = tokudb::sysvars::empty_scan(ha_thd());
3258     if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_DISABLED)
3259         goto cleanup;
3260 
3261     if (txn == NULL) {
3262         error = txn_begin(db_env, 0, &tmp_txn, 0, ha_thd());
3263         if (error) {
3264             goto cleanup;
3265         }
3266         txn = tmp_txn;
3267     }
3268 
3269     error = share->file->cursor(share->file, txn, &tmp_cursor, 0);
3270     if (error)
3271         goto cleanup;
3272     tmp_cursor->c_set_check_interrupt_callback(tmp_cursor, tokudb_killed_thd_callback, ha_thd());
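    // probe a single row from the chosen end of the dictionary; if even that
    // probe returns DB_NOTFOUND, the table may be empty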
3273     if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_LR)
3274         error = tmp_cursor->c_getf_next(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3275     else
3276         error = tmp_cursor->c_getf_prev(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3277     error = map_to_handler_error(error);
3278     if (error == DB_NOTFOUND)
3279         ret_val = true;
3280     else
3281         ret_val = false;
3282     error = 0;
3283 
3284 cleanup:
3285     if (tmp_cursor) {
3286         int r = tmp_cursor->c_close(tmp_cursor);
3287         assert_always(r == 0);
3288         tmp_cursor = NULL;
3289     }
3290     if (tmp_txn) {
3291         commit_txn(tmp_txn, 0);
3292         tmp_txn = NULL;
3293     }
3294     return ret_val;
3295 }
3296 
3297 #if MYSQL_VERSION_ID >= 100000
3298 void ha_tokudb::start_bulk_insert(ha_rows rows, uint flags) {
3299     TOKUDB_HANDLER_DBUG_ENTER("%llu %u txn %p", (unsigned long long) rows, flags, transaction);
3300 #else
3301 void ha_tokudb::start_bulk_insert(ha_rows rows) {
3302     TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction);
3303 #endif
3304     THD* thd = ha_thd();
3305     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3306     delay_updating_ai_metadata = true;
3307     ai_metadata_update_required = false;
3308     abort_loader = false;
3309 
3310     rwlock_t_lock_read(share->_num_DBs_lock);
3311     uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3312     num_DBs_locked_in_bulk = true;
3313     lock_count = 0;
3314 
3315     if ((rows == 0 || rows > 1) && share->try_table_lock) {
3316         if (tokudb::sysvars::prelock_empty(thd) &&
3317             may_table_be_empty(transaction) &&
3318             transaction != NULL) {
3319             if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR) {
3320                 acquire_table_lock(transaction, lock_write);
3321             } else {
3322                 mult_dbt_flags[primary_key] = 0;
3323                 if (!thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) && !hidden_primary_key) {
3324                     mult_put_flags[primary_key] = DB_NOOVERWRITE;
3325                 }
3326                 uint32_t loader_flags = (tokudb::sysvars::load_save_space(thd)) ?
3327                     LOADER_COMPRESS_INTERMEDIATES : 0;
3328 
3329                 int error = db_env->create_loader(
3330                     db_env,
3331                     transaction,
3332                     &loader,
3333                     NULL, // no src_db needed
3334                     curr_num_DBs,
3335                     share->key_file,
3336                     mult_put_flags,
3337                     mult_dbt_flags,
3338                     loader_flags
3339                     );
3340                 if (error) {
3341                     assert_always(loader == NULL);
3342                     goto exit_try_table_lock;
3343                 }
3344 
3345                 lc.thd = thd;
3346                 lc.ha = this;
3347 
3348                 error = loader->set_poll_function(
3349                     loader, ha_tokudb::bulk_insert_poll, &lc);
3350                 assert_always(!error);
3351 
3352                 error = loader->set_error_callback(
3353                     loader, ha_tokudb::loader_dup, &lc);
3354                 assert_always(!error);
3355 
3356                 trx->stmt_progress.using_loader = true;
3357             }
3358         }
3359     exit_try_table_lock:
3360         share->lock();
3361         share->try_table_lock = false;
3362         share->unlock();
3363     }
3364     TOKUDB_HANDLER_DBUG_VOID_RETURN;
3365 }
3366 int ha_tokudb::bulk_insert_poll(void* extra, float progress) {
3367     LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
3368     if (thd_killed(context->thd)) {
3369         snprintf(context->write_status_msg,
3370                  sizeof(context->write_status_msg),
3371                  "The process has been killed, aborting bulk load.");
3372         return ER_ABORTING_CONNECTION;
3373     }
3374     float percentage = progress * 100;
3375     snprintf(context->write_status_msg,
3376              sizeof(context->write_status_msg),
3377              "Loading of data t %s about %.1f%% done",
3378              context->ha->share->full_table_name(),
3379              percentage);
3380     thd_proc_info(context->thd, context->write_status_msg);
3381 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
3382     thd_progress_report(context->thd, (unsigned long long)percentage, 100);
3383 #endif
3384     return 0;
3385 }
3386 void ha_tokudb::loader_add_index_err(TOKUDB_UNUSED(DB* db),
3387                                      TOKUDB_UNUSED(int i),
3388                                      TOKUDB_UNUSED(int err),
3389                                      TOKUDB_UNUSED(DBT* key),
3390                                      TOKUDB_UNUSED(DBT* val),
3391                                      void* error_extra) {
3392     LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3393     assert_always(context->ha);
3394     context->ha->set_loader_error(err);
3395 }
3396 void ha_tokudb::loader_dup(TOKUDB_UNUSED(DB* db),
3397                            TOKUDB_UNUSED(int i),
3398                            int err,
3399                            DBT* key,
3400                            TOKUDB_UNUSED(DBT* val),
3401                            void* error_extra) {
3402     LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3403     assert_always(context->ha);
3404     context->ha->set_loader_error(err);
3405     if (err == DB_KEYEXIST) {
3406         context->ha->set_dup_value_for_pk(key);
3407     }
3408 }
3409 
3410 //
3411 // Method that is called at the end of many calls to insert rows
3412 // (ha_tokudb::write_row). If start_bulk_insert is called, then
3413 // this is guaranteed to be called.
3414 //
3415 int ha_tokudb::end_bulk_insert(TOKUDB_UNUSED(bool abort)) {
3416     TOKUDB_HANDLER_DBUG_ENTER("");
3417     int error = 0;
3418     THD* thd = ha_thd();
3419     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
3420     bool using_loader = (loader != NULL);
3421     if (ai_metadata_update_required) {
3422         share->lock();
3423         error = update_max_auto_inc(share->status_block, share->last_auto_increment);
3424         share->unlock();
3425         if (error) { goto cleanup; }
3426     }
3427     delay_updating_ai_metadata = false;
3428     ai_metadata_update_required = false;
3429     loader_error = 0;
3430     if (loader) {
3431         if (!abort_loader && !thd_killed(thd)) {
3432             DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", {
3433                 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3434                 thd_proc_info(thd, "DBUG sleep");
3435                 my_sleep(20000000);
3436                 thd_proc_info(thd, orig_proc_info);
3437             });
3438             error = loader->close(loader);
3439             loader = NULL;
3440             if (error) {
3441                 if (thd_killed(thd)) {
3442                     my_error(ER_QUERY_INTERRUPTED, MYF(0));
3443                 }
3444                 goto cleanup;
3445             }
3446 
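            // the bulk loader only detects duplicates on the primary key
            // (via DB_NOOVERWRITE), so unique secondary indexes are verified
            // here with a cursor scan after the load completes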
3447             for (uint i = 0; i < table_share->keys; i++) {
3448                 if (table_share->key_info[i].flags & HA_NOSAME) {
3449                     bool is_unique;
3450                     if (i == primary_key && !share->pk_has_string) {
3451                         continue;
3452                     }
3453                     error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i],
3454                                             DB_PRELOCKED_WRITE);
3455                     if (error) goto cleanup;
3456                     if (!is_unique) {
3457                         error = HA_ERR_FOUND_DUPP_KEY;
3458                         last_dup_key = i;
3459                         goto cleanup;
3460                     }
3461                 }
3462             }
3463         }
3464         else {
3465             sprintf(write_status_msg, "aborting bulk load");  // sprintf returns a char count, not an error
3466             thd_proc_info(thd, write_status_msg);
3467             loader->abort(loader);
3468             loader = NULL;
3469             share->try_table_lock = true;
3470         }
3471     }
3472 
3473 cleanup:
3474     if (num_DBs_locked_in_bulk) {
3475         share->_num_DBs_lock.unlock();
3476     }
3477     num_DBs_locked_in_bulk = false;
3478     lock_count = 0;
3479     if (loader) {
3480         sprintf(write_status_msg, "aborting bulk load");  // don't clobber the real 'error' with sprintf's return value
3481         thd_proc_info(thd, write_status_msg);
3482         loader->abort(loader);
3483         loader = NULL;
3484     }
3485     abort_loader = false;
3486     memset(&lc, 0, sizeof(lc));
3487     if (error || loader_error) {
3488         my_errno = error ? error : loader_error;
3489         if (using_loader) {
3490             share->try_table_lock = true;
3491         }
3492     }
3493     trx->stmt_progress.using_loader = false;
3494     thd_proc_info(thd, 0);
3495     TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
3496 }
3497 
3498 int ha_tokudb::end_bulk_insert() {
3499     return end_bulk_insert( false );
3500 }
3501 
3502 int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) {
3503     int error;
3504     DBC* tmp_cursor1 = NULL;
3505     DBC* tmp_cursor2 = NULL;
3506     DBT key1, key2, val, packed_key1, packed_key2;
3507     uint64_t cnt = 0;
3508     char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound.
3509     THD* thd = ha_thd();
3510     const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
3511     memset(&key1, 0, sizeof(key1));
3512     memset(&key2, 0, sizeof(key2));
3513     memset(&val, 0, sizeof(val));
3514     memset(&packed_key1, 0, sizeof(packed_key1));
3515     memset(&packed_key2, 0, sizeof(packed_key2));
3516     *is_unique = true;
3517 
3518     error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE);
3519     if (error) { goto cleanup; }
3520 
3521     error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE);
3522     if (error) { goto cleanup; }
3523 
3524     error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
3525     if (error == DB_NOTFOUND) {
3526         *is_unique = true;
3527         error = 0;
3528         goto cleanup;
3529     }
3530     else if (error) { goto cleanup; }
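    // advance tmp_cursor2 twice so that it ends up one row ahead of
    // tmp_cursor1; the loop below then compares each row with its successor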
3531     error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3532     if (error) { goto cleanup; }
3533 
3534     error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3535     if (error == DB_NOTFOUND) {
3536         *is_unique = true;
3537         error = 0;
3538         goto cleanup;
3539     }
3540     else if (error) { goto cleanup; }
3541 
3542     while (error != DB_NOTFOUND) {
3543         bool has_null1;
3544         bool has_null2;
3545         int cmp;
3546         place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1);
3547         place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1);
3548 
3549         create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1);
3550         create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2);
3551 
3552         if (!has_null1 && !has_null2) {
3553             cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2);
3554             if (cmp == 0) {
3555                 memcpy(key_buff, key1.data, key1.size);
3556                 place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1);
3557                 *is_unique = false;
3558                 break;
3559             }
3560         }
3561 
3562         error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
3563         if (error) { goto cleanup; }
3564         error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
3565         if (error && (error != DB_NOTFOUND)) { goto cleanup; }
3566 
3567         cnt++;
3568         if ((cnt % 10000) == 0) {
3569             sprintf(
3570                 status_msg,
3571                 "Verifying index uniqueness: Checked %llu of %llu rows in key-%s.",
3572                 (long long unsigned) cnt,
3573                 (long long unsigned) share->row_count(),
3574                 key_info->name);
3575             thd_proc_info(thd, status_msg);
3576             if (thd_killed(thd)) {
3577                 my_error(ER_QUERY_INTERRUPTED, MYF(0));
3578                 error = ER_QUERY_INTERRUPTED;
3579                 goto cleanup;
3580             }
3581         }
3582     }
3583 
3584     error = 0;
3585 
3586 cleanup:
3587     thd_proc_info(thd, orig_proc_info);
3588     if (tmp_cursor1) {
3589         tmp_cursor1->c_close(tmp_cursor1);
3590         tmp_cursor1 = NULL;
3591     }
3592     if (tmp_cursor2) {
3593         tmp_cursor2->c_close(tmp_cursor2);
3594         tmp_cursor2 = NULL;
3595     }
3596     return error;
3597 }
3598 
3599 int ha_tokudb::is_val_unique(bool* is_unique, uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn) {
3600     int error = 0;
3601     bool has_null;
3602     DBC* tmp_cursor = NULL;
3603 
3604     DBT key; memset((void *)&key, 0, sizeof(key));
3605     create_dbt_key_from_key(&key, key_info, key_buff2, record, &has_null, true, MAX_KEY_LENGTH, COL_NEG_INF);
3606     if (has_null) {
3607         error = 0;
3608         *is_unique = true;
3609         goto cleanup;
3610     }
3611 
3612     error = share->key_file[dict_index]->cursor(share->key_file[dict_index], txn, &tmp_cursor, DB_SERIALIZABLE | DB_RMW);
3613     if (error) {
3614         goto cleanup;
3615     } else {
3616         // prelock (key,-inf),(key,+inf) so that the subsequent key lookup does not overlock
3617         uint flags = 0;
3618         DBT key_right; memset(&key_right, 0, sizeof key_right);
3619         create_dbt_key_from_key(&key_right, key_info, key_buff3, record, &has_null, true, MAX_KEY_LENGTH, COL_POS_INF);
3620         error = tmp_cursor->c_set_bounds(tmp_cursor, &key, &key_right, true, DB_NOTFOUND);
3621         if (error == 0) {
3622             flags = DB_PRELOCKED | DB_PRELOCKED_WRITE;
3623         }
3624 
3625         // lookup key and check unique prefix
3626         struct smart_dbt_info info;
3627         info.ha = this;
3628         info.buf = NULL;
3629         info.keynr = dict_index;
3630 
3631         struct index_read_info ir_info;
3632         ir_info.orig_key = &key;
3633         ir_info.smart_dbt_info = info;
3634 
3635         error = tmp_cursor->c_getf_set_range(tmp_cursor, flags, &key, smart_dbt_callback_lookup, &ir_info);
3636         if (error == DB_NOTFOUND) {
3637             *is_unique = true;
3638             error = 0;
3639             goto cleanup;
3640         }
3641         else if (error) {
3642             error = map_to_handler_error(error);
3643             goto cleanup;
3644         }
3645         if (ir_info.cmp) {
3646             *is_unique = true;
3647         }
3648         else {
3649             *is_unique = false;
3650         }
3651     }
3652     error = 0;
3653 
3654 cleanup:
3655     if (tmp_cursor) {
3656         int r = tmp_cursor->c_close(tmp_cursor);
3657         assert_always(r==0);
3658         tmp_cursor = NULL;
3659     }
3660     return error;
3661 }
3662 
3663 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3664 static void maybe_do_unique_checks_delay_fn(THD *thd) {
3665     if (thd->slave_thread) {
3666         uint64_t delay_ms = tokudb::sysvars::rpl_unique_checks_delay(thd);
3667         if (delay_ms)
3668             usleep(delay_ms * 1000);
3669     }
3670 }
3671 
3672 #define maybe_do_unique_checks_delay(__thd) \
3673     (maybe_do_unique_checks_delay_fn(__thd))
3674 
3675 #define maybe_do_unique_checks_delay_if_flags_set( \
3676     __thd, __flags_set, __flags_check)             \
3677     { if (((__flags_set) & DB_OPFLAGS_MASK) ==     \
3678          (__flags_check)) maybe_do_unique_checks_delay_fn(__thd); }
3679 
3680 static bool need_read_only(THD *thd) {
3681     return opt_readonly || !tokudb::sysvars::rpl_check_readonly(thd);
3682 }
3683 
3684 static bool do_unique_checks_fn(THD *thd, bool do_rpl_event) {
3685     if (do_rpl_event &&
3686         thd->slave_thread &&
3687         need_read_only(thd) &&
3688         !tokudb::sysvars::rpl_unique_checks(thd)) {
3689         return false;
3690     } else {
3691         return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
3692     }
3693 }
3694 
3695 #define do_unique_checks(__thd, __flags) \
3696     (do_unique_checks_fn(__thd, __flags))
3697 
3698 #else
3699 
3700 #define maybe_do_unique_checks_delay(__thd) ((void)0)
3701 
3702 #define maybe_do_unique_checks_delay_if_flags_set( \
3703     __thd, __flags_set, __flags_check)             \
3704     ((void)0)
3705 
3706 static bool do_unique_checks_fn(THD *thd) {
3707     return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
3708 }
3709 
3710 #define do_unique_checks(__thd, _flags) \
3711     (do_unique_checks_fn(__thd))
3712 
3713 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3714 
3715 int ha_tokudb::do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd) {
3716     int error = 0;
3717     //
3718     // first do uniqueness checks
3719     //
3720     if (share->has_unique_keys && do_unique_checks(thd, in_rpl_write_rows)) {
3721         DBUG_EXECUTE_IF("tokudb_crash_if_rpl_does_uniqueness_check",
3722                         DBUG_ASSERT(0););
3723         for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3724             bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
3725             bool is_unique = false;
3726             //
3727             // don't need to do the check for a primary key that doesn't contain strings
3728             //
3729             if (keynr == primary_key && !share->pk_has_string) {
3730                 continue;
3731             }
3732             if (!is_unique_key) {
3733                 continue;
3734             }
3735 
3736             maybe_do_unique_checks_delay(thd);
3737 
3738             //
3739             // if unique key, check uniqueness constraint
3740             // but, we do not need to check it if the key has a null
3741             // and we do not need to check it if unique_checks is off
3742             //
3743             error = is_val_unique(&is_unique, record, &table->key_info[keynr], keynr, txn);
3744             if (error) {
3745                 goto cleanup;
3746             }
3747             if (!is_unique) {
3748                 error = DB_KEYEXIST;
3749                 last_dup_key = keynr;
3750                 goto cleanup;
3751             }
3752         }
3753     }
3754 cleanup:
3755     return error;
3756 }
3757 
3758 void ha_tokudb::test_row_packing(uchar* record, DBT* pk_key, DBT* pk_val) {
3759     int error;
3760     DBT row, key;
3761     //
3762     // variables for testing key packing, only used in some debug modes
3763     //
3764     uchar* tmp_pk_key_data = NULL;
3765     uchar* tmp_pk_val_data = NULL;
3766     DBT tmp_pk_key;
3767     DBT tmp_pk_val;
3768     bool has_null;
3769     int cmp;
3770 
3771     memset(&tmp_pk_key, 0, sizeof(DBT));
3772     memset(&tmp_pk_val, 0, sizeof(DBT));
3773 
3774     //
3775     // used for testing the packing of keys
3776     //
3777     tmp_pk_key_data = (uchar*)tokudb::memory::malloc(pk_key->size, MYF(MY_WME));
3778     assert_always(tmp_pk_key_data);
3779     tmp_pk_val_data = (uchar*)tokudb::memory::malloc(pk_val->size, MYF(MY_WME));
3780     assert_always(tmp_pk_val_data);
3781     memcpy(tmp_pk_key_data, pk_key->data, pk_key->size);
3782     memcpy(tmp_pk_val_data, pk_val->data, pk_val->size);
3783     tmp_pk_key.data = tmp_pk_key_data;
3784     tmp_pk_key.size = pk_key->size;
3785     tmp_pk_val.data = tmp_pk_val_data;
3786     tmp_pk_val.size = pk_val->size;
3787 
3788     for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3789         uint32_t tmp_num_bytes = 0;
3790         uchar* row_desc = NULL;
3791         uint32_t desc_size = 0;
3792 
3793         if (keynr == primary_key) {
3794             continue;
3795         }
3796 
3797         create_dbt_key_from_table(&key, keynr, key_buff2, record, &has_null);
3798 
3799         //
3800         // TEST
3801         //
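        // the descriptor DBT is a series of length-prefixed sections: skip
        // the first one (the key comparison info), then read this section's
        // length and step past its 4-byte prefix to reach the key packing
        // description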
3802         row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
3803         row_desc += (*(uint32_t *)row_desc);
3804         desc_size = (*(uint32_t *)row_desc) - 4;
3805         row_desc += 4;
3806         tmp_num_bytes = pack_key_from_desc(
3807             key_buff3,
3808             row_desc,
3809             desc_size,
3810             &tmp_pk_key,
3811             &tmp_pk_val
3812             );
3813         assert_always(tmp_num_bytes == key.size);
3814         cmp = memcmp(key_buff3,key_buff2,tmp_num_bytes);
3815         assert_always(cmp == 0);
3816 
3817         //
3818         // test key packing of clustering keys
3819         //
3820         if (key_is_clustering(&table->key_info[keynr])) {
3821             error = pack_row(&row, (const uchar *) record, keynr);
3822             assert_always(error == 0);
3823             uchar* tmp_buff = NULL;
3824             tmp_buff = (uchar*)tokudb::memory::malloc(
3825                 alloced_rec_buff_length,
3826                 MYF(MY_WME));
3827             assert_always(tmp_buff);
3828             row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
3829             row_desc += (*(uint32_t *)row_desc);
3830             row_desc += (*(uint32_t *)row_desc);
3831             desc_size = (*(uint32_t *)row_desc) - 4;
3832             row_desc += 4;
3833             tmp_num_bytes = pack_clustering_val_from_desc(
3834                 tmp_buff,
3835                 row_desc,
3836                 desc_size,
3837                 &tmp_pk_val
3838                 );
3839             assert_always(tmp_num_bytes == row.size);
3840             cmp = memcmp(tmp_buff,rec_buff,tmp_num_bytes);
3841             assert_always(cmp == 0);
3842             tokudb::memory::free(tmp_buff);
3843         }
3844     }
3845 
3846     //
3847     // copy stuff back out
3848     //
3849     error = pack_row(pk_val, (const uchar *) record, primary_key);
3850     assert_always(pk_val->size == tmp_pk_val.size);
3851     cmp = memcmp(pk_val->data, tmp_pk_val_data, pk_val->size);
3852     assert_always( cmp == 0);
3853 
3854     tokudb::memory::free(tmp_pk_key_data);
3855     tokudb::memory::free(tmp_pk_val_data);
3856 }
3857 
3858 // set the put flags for the main dictionary
3859 void ha_tokudb::set_main_dict_put_flags(THD* thd, bool opt_eligible, uint32_t* put_flags) {
3860     uint32_t old_prelock_flags = 0;
3861     uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3862     bool in_hot_index = share->num_DBs > curr_num_DBs;
3863     bool using_ignore_flag_opt = do_ignore_flag_optimization(thd, table, share->replace_into_fast && !using_ignore_no_key);
3864     //
3865     // optimization for the "REPLACE INTO..." (and "INSERT IGNORE") commands:
3866     // if the command is "REPLACE INTO" and the only dictionary
3867     // is the main one (or all indexes are a subset of the pk),
3868     // then we can simply insert the element
3869     // with DB_YESOVERWRITE. If the element does not exist,
3870     // it will act as a normal insert, and if it does exist, it
3871     // will act as a replace, which is exactly what REPLACE INTO is supposed
3872     // to do. We cannot do this otherwise, because we would lose
3873     // consistency between indexes.
3874     //
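    // For example: a plain INSERT ends up with DB_NOOVERWRITE so duplicates
    // are detected, the REPLACE INTO fast path gets 0 (blind overwrite), and
    // the INSERT IGNORE optimization gets DB_NOOVERWRITE_NO_ERROR, which
    // silently skips duplicates instead of raising an error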
3875     if (hidden_primary_key)
3876     {
3877         *put_flags = old_prelock_flags;
3878     }
3879     else if (!do_unique_checks(thd, in_rpl_write_rows | in_rpl_update_rows) && !is_replace_into(thd) && !is_insert_ignore(thd))
3880     {
3881         *put_flags = old_prelock_flags;
3882     }
3883     else if (using_ignore_flag_opt && is_replace_into(thd)
3884             && !in_hot_index)
3885     {
3886         *put_flags = old_prelock_flags;
3887     }
3888     else if (opt_eligible && using_ignore_flag_opt && is_insert_ignore(thd)
3889             && !in_hot_index)
3890     {
3891         *put_flags = DB_NOOVERWRITE_NO_ERROR | old_prelock_flags;
3892     }
3893     else
3894     {
3895         *put_flags = DB_NOOVERWRITE | old_prelock_flags;
3896     }
3897 }
3898 
3899 int ha_tokudb::insert_row_to_main_dictionary(
3900     DBT* pk_key,
3901     DBT* pk_val,
3902     DB_TXN* txn) {
3903 
3904     int error = 0;
3905     uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3906     assert_always(curr_num_DBs == 1);
3907 
3908     uint32_t put_flags = mult_put_flags[primary_key];
3909     THD *thd = ha_thd();
3910     set_main_dict_put_flags(thd, true, &put_flags);
3911 
3912     // for test, make unique checks have a very long duration
3913     maybe_do_unique_checks_delay_if_flags_set(thd, put_flags, DB_NOOVERWRITE);
3914 
3915     error = share->file->put(share->file, txn, pk_key, pk_val, put_flags);
3916     if (error) {
3917         last_dup_key = primary_key;
3918         goto cleanup;
3919     }
3920 
3921 cleanup:
3922     return error;
3923 }
3924 
3925 int ha_tokudb::insert_rows_to_dictionaries_mult(DBT* pk_key, DBT* pk_val, DB_TXN* txn, THD* thd) {
3926     int error = 0;
3927     uint curr_num_DBs = share->num_DBs;
3928     set_main_dict_put_flags(thd, true, &mult_put_flags[primary_key]);
3929     uint32_t flags = mult_put_flags[primary_key];
3930 
3931     // for test, make unique checks have a very long duration
3932     maybe_do_unique_checks_delay_if_flags_set(thd, flags, DB_NOOVERWRITE);
3933 
3934     // the insert ignore optimization uses DB_NOOVERWRITE_NO_ERROR,
3935     // which is not allowed with env->put_multiple.
3936     // we have to insert the rows one by one in this case.
3937     if (flags & DB_NOOVERWRITE_NO_ERROR) {
3938         DB * src_db = share->key_file[primary_key];
3939         for (uint32_t i = 0; i < curr_num_DBs; i++) {
3940             DB * db = share->key_file[i];
3941             if (i == primary_key) {
3942                 // if it's the primary key, insert the rows
3943                 // as they are.
3944                 error = db->put(db, txn, pk_key, pk_val, flags);
3945             } else {
3946                 // generate a row for secondary keys.
3947                 // use our multi put key/rec buffers
3948                 // just as the ydb layer would have in
3949                 // env->put_multiple(), except that
3950                 // we will just do a put() right away.
3951                 error = tokudb_generate_row(db, src_db,
3952                         &mult_key_dbt_array[i].dbts[0], &mult_rec_dbt_array[i].dbts[0],
3953                         pk_key, pk_val);
3954                 if (error != 0) {
3955                     goto out;
3956                 }
3957                 error = db->put(db, txn, &mult_key_dbt_array[i].dbts[0],
3958                         &mult_rec_dbt_array[i].dbts[0], flags);
3959             }
3960             if (error != 0) {
3961                 goto out;
3962             }
3963         }
3964     } else {
3965         // not insert ignore, so we can use put multiple
3966         error = db_env->put_multiple(
3967             db_env,
3968             share->key_file[primary_key],
3969             txn,
3970             pk_key,
3971             pk_val,
3972             curr_num_DBs,
3973             share->key_file,
3974             mult_key_dbt_array,
3975             mult_rec_dbt_array,
3976             mult_put_flags
3977             );
3978     }
3979 
3980 out:
3981     //
3982     // on any error, record the primary key as the duplicate-key candidate
3983     // so that a dup key error can be reported against it
3984     //
3985     if (error) {
3986         last_dup_key = primary_key;
3987     }
3988     return error;
3989 }
3990 
3991 //
3992 // Stores a row in the table, called when handling an INSERT query
3993 // Parameters:
3994 //      [in]    record - a row in MySQL format
3995 // Returns:
3996 //      0 on success
3997 //      error otherwise
3998 //
3999 int ha_tokudb::write_row(uchar * record) {
4000     TOKUDB_HANDLER_DBUG_ENTER("%p", record);
4001 
4002     DBT row, prim_key;
4003     int error;
4004     THD *thd = ha_thd();
4005     bool has_null;
4006     DB_TXN* sub_trans = NULL;
4007     DB_TXN* txn = NULL;
4008     tokudb_trx_data *trx = NULL;
4009     uint curr_num_DBs;
4010     bool create_sub_trans = false;
4011     bool num_DBs_locked = false;
4012 
4013     //
4014     // some crap that needs to be done because MySQL does not properly abstract
4015     // this work away from us, namely filling in auto increment and setting auto timestamp
4016     //
4017     ha_statistic_increment(&SSV::ha_write_count);
4018 #if MYSQL_VERSION_ID < 50600
4019     if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT) {
4020         table->timestamp_field->set_time();
4021     }
4022 #endif
4023     if (table->next_number_field && record == table->record[0]) {
4024         error = update_auto_increment();
4025         if (error)
4026             goto cleanup;
4027     }
4028 
4029     //
4030     // check to see if some value for the auto increment column that is bigger
4031     // than anything seen so far is being used. If so, update the metadata to
4032     // reflect it; the goal here is that we never want to have a dup key error
4033     // due to a bad increment of the auto inc field.
4034     //
4035     if (share->has_auto_inc && record == table->record[0]) {
4036         share->lock();
4037         ulonglong curr_auto_inc = retrieve_auto_increment(
4038             table->field[share->ai_field_index]->key_type(),
4039             field_offset(table->field[share->ai_field_index], table),
4040             record);
4041         if (curr_auto_inc > share->last_auto_increment) {
4042             share->last_auto_increment = curr_auto_inc;
4043             if (delay_updating_ai_metadata) {
4044                 ai_metadata_update_required = true;
4045             } else {
4046                 update_max_auto_inc(
4047                     share->status_block,
4048                     share->last_auto_increment);
4049             }
4050         }
4051         share->unlock();
4052     }
4053 
4054     //
4055     // grab reader lock on numDBs_lock
4056     //
4057     if (!num_DBs_locked_in_bulk) {
4058         rwlock_t_lock_read(share->_num_DBs_lock);
4059         num_DBs_locked = true;
4060     } else {
4061         lock_count++;
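        // periodically release and re-acquire the read lock, presumably so a
        // pending writer (e.g. hot index creation, which needs the write
        // lock) is not starved during a long run of writes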
4062         if (lock_count >= 2000) {
4063             share->_num_DBs_lock.unlock();
4064             rwlock_t_lock_read(share->_num_DBs_lock);
4065             lock_count = 0;
4066         }
4067     }
4068     curr_num_DBs = share->num_DBs;
4069 
4070     if (hidden_primary_key) {
4071         get_auto_primary_key(current_ident);
4072     }
4073 
4074     if (table_share->blob_fields) {
4075         if (fix_rec_buff_for_blob(max_row_length(record))) {
4076             error = HA_ERR_OUT_OF_MEM;
4077             goto cleanup;
4078         }
4079     }
4080 
4081     create_dbt_key_from_table(&prim_key, primary_key, primary_key_buff, record, &has_null);
4082     if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4083         goto cleanup;
4084     }
4085 
4086     create_sub_trans = (using_ignore && !(do_ignore_flag_optimization(thd,table,share->replace_into_fast && !using_ignore_no_key)));
4087     if (create_sub_trans) {
4088         error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4089         if (error) {
4090             goto cleanup;
4091         }
4092     }
4093     txn = create_sub_trans ? sub_trans : transaction;
4094     TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_TXN, "txn %p", txn);
4095     if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY))) {
4096         test_row_packing(record,&prim_key,&row);
4097     }
4098     if (loader) {
4099         error = loader->put(loader, &prim_key, &row);
4100         if (error) {
4101             abort_loader = true;
4102             goto cleanup;
4103         }
4104     } else {
4105         error = do_uniqueness_checks(record, txn, thd);
4106         if (error) {
4107             // for #4633
4108             // if we have a duplicate key error, let's check the primary key to see
4109             // if there is a duplicate there. If so, set last_dup_key to the pk
4110             if (error == DB_KEYEXIST && !tokudb_test(hidden_primary_key) && last_dup_key != primary_key) {
4111                 int r = share->file->getf_set(share->file, txn, DB_SERIALIZABLE, &prim_key, smart_dbt_do_nothing, NULL);
4112                 if (r == 0) {
4113                     // if we get no error, that means the row
4114                     // was found and this is a duplicate key,
4115                     // so we set last_dup_key
4116                     last_dup_key = primary_key;
4117                 } else if (r != DB_NOTFOUND) {
4118                     // if some other error is returned, return that to the user.
4119                     error = r;
4120                 }
4121             }
4122             goto cleanup;
4123         }
4124         if (curr_num_DBs == 1) {
4125             error = insert_row_to_main_dictionary(&prim_key, &row, txn);
4126             if (error) { goto cleanup; }
4127         } else {
4128             error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
4129             if (error) { goto cleanup; }
4130         }
4131         if (error == 0) {
4132             uint64_t full_row_size = prim_key.size + row.size;
4133             toku_hton_update_primary_key_bytes_inserted(full_row_size);
4134         }
4135     }
4136 
4137     trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4138     if (!error) {
4139         added_rows++;
4140         trx->stmt_progress.inserted++;
4141         track_progress(thd);
4142     }
4143 cleanup:
4144     if (num_DBs_locked) {
4145        share->_num_DBs_lock.unlock();
4146     }
4147     if (error == DB_KEYEXIST) {
4148         error = HA_ERR_FOUND_DUPP_KEY;
4149     }
4150     if (sub_trans) {
4151         // no point in recording error value of abort.
4152         // nothing we can do about it anyway and it is not what
4153         // we want to return.
4154         if (error) {
4155             abort_txn(sub_trans);
4156         }
4157         else {
4158             commit_txn(sub_trans, DB_TXN_NOSYNC);
4159         }
4160     }
4161     TOKUDB_HANDLER_DBUG_RETURN(error);
4162 }
4163 
4164 /* Compare if a key in a row has changed */
4165 bool ha_tokudb::key_changed(uint keynr, const uchar * old_row, const uchar * new_row) {
4166     DBT old_key;
4167     DBT new_key;
4168     memset((void *) &old_key, 0, sizeof(old_key));
4169     memset((void *) &new_key, 0, sizeof(new_key));
4170 
4171     bool has_null;
4172     create_dbt_key_from_table(&new_key, keynr, key_buff2, new_row, &has_null);
4173     create_dbt_key_for_lookup(&old_key,&table->key_info[keynr], key_buff3, old_row, &has_null);
4174     return tokudb_prefix_cmp_dbt_key(share->key_file[keynr], &old_key, &new_key);
4175 }
4176 
4177 //
4178 // Updates a row in the table, called when handling an UPDATE query
4179 // Parameters:
4180 //      [in]    old_row - row to be updated, in MySQL format
4181 //      [in]    new_row - new row, in MySQL format
4182 // Returns:
4183 //      0 on success
4184 //      error otherwise
4185 //
4186 int ha_tokudb::update_row(const uchar * old_row, uchar * new_row) {
4187     TOKUDB_HANDLER_DBUG_ENTER("");
4188     DBT prim_key, old_prim_key, prim_row, old_prim_row;
4189     int error;
4190     bool has_null;
4191     THD* thd = ha_thd();
4192     DB_TXN* sub_trans = NULL;
4193     DB_TXN* txn = NULL;
4194     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4195     uint curr_num_DBs;
4196 
4197     LINT_INIT(error);
4198     memset((void *) &prim_key, 0, sizeof(prim_key));
4199     memset((void *) &old_prim_key, 0, sizeof(old_prim_key));
4200     memset((void *) &prim_row, 0, sizeof(prim_row));
4201     memset((void *) &old_prim_row, 0, sizeof(old_prim_row));
4202 
4203     ha_statistic_increment(&SSV::ha_update_count);
4204 #if MYSQL_VERSION_ID < 50600
4205     if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) {
4206         table->timestamp_field->set_time();
4207     }
4208 #endif
4209     //
4210     // check to see if some value for the auto increment column that is bigger
4211     // than anything seen so far is being used. If so, update the metadata to
4212     // reflect it; the goal here is that we never want to have a dup key error
4213     // due to a bad increment of the auto inc field.
4214     //
4215     if (share->has_auto_inc && new_row == table->record[0]) {
4216         share->lock();
4217         ulonglong curr_auto_inc = retrieve_auto_increment(
4218             table->field[share->ai_field_index]->key_type(),
4219             field_offset(table->field[share->ai_field_index], table),
4220             new_row
4221             );
4222         if (curr_auto_inc > share->last_auto_increment) {
4223             error = update_max_auto_inc(share->status_block, curr_auto_inc);
4224             if (!error) {
4225                 share->last_auto_increment = curr_auto_inc;
4226             }
4227         }
4228         share->unlock();
4229     }
4230 
4231     //
4232     // grab reader lock on numDBs_lock
4233     //
4234     bool num_DBs_locked = false;
4235     if (!num_DBs_locked_in_bulk) {
4236         rwlock_t_lock_read(share->_num_DBs_lock);
4237         num_DBs_locked = true;
4238     }
4239     curr_num_DBs = share->num_DBs;
4240 
4241     if (using_ignore) {
4242         error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
4243         if (error) {
4244             goto cleanup;
4245         }
4246     }
4247     txn = using_ignore ? sub_trans : transaction;
4248 
4249     if (hidden_primary_key) {
4250         memset((void *) &prim_key, 0, sizeof(prim_key));
4251         prim_key.data = (void *) current_ident;
4252         prim_key.size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
4253         old_prim_key = prim_key;
4254     }
4255     else {
4256         create_dbt_key_from_table(&prim_key, primary_key, key_buff, new_row, &has_null);
4257         create_dbt_key_from_table(&old_prim_key, primary_key, primary_key_buff, old_row, &has_null);
4258     }
4259 
4260     // do uniqueness checks
4261     if (share->has_unique_keys && do_unique_checks(thd, in_rpl_update_rows)) {
4262         for (uint keynr = 0; keynr < table_share->keys; keynr++) {
4263             bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
4264             if (keynr == primary_key && !share->pk_has_string) {
4265                 continue;
4266             }
4267             if (is_unique_key) {
4268                 bool key_ch = key_changed(keynr, old_row, new_row);
4269                 if (key_ch) {
4270                     bool is_unique;
4271                     error = is_val_unique(&is_unique, new_row, &table->key_info[keynr], keynr, txn);
4272                     if (error) goto cleanup;
4273                     if (!is_unique) {
4274                         error = DB_KEYEXIST;
4275                         last_dup_key = keynr;
4276                         goto cleanup;
4277                     }
4278                 }
4279             }
4280         }
4281     }
4282 
4283     if (table_share->blob_fields) {
4284         if (fix_rec_buff_for_blob(max_row_length(new_row))) {
4285             error = HA_ERR_OUT_OF_MEM;
4286             goto cleanup;
4287         }
4288         if (fix_rec_update_buff_for_blob(max_row_length(old_row))) {
4289             error = HA_ERR_OUT_OF_MEM;
4290             goto cleanup;
4291         }
4292     }
4293 
4294     error = pack_row(&prim_row, new_row, primary_key);
4295     if (error) { goto cleanup; }
4296 
4297     error = pack_old_row_for_update(&old_prim_row, old_row, primary_key);
4298     if (error) { goto cleanup; }
4299 
4300     set_main_dict_put_flags(thd, false, &mult_put_flags[primary_key]);
4301 
4302     // for test, make unique checks have a very long duration
4303     if ((mult_put_flags[primary_key] & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
4304         maybe_do_unique_checks_delay(thd);
4305 
4306     error = db_env->update_multiple(
4307         db_env,
4308         share->key_file[primary_key],
4309         txn,
4310         &old_prim_key,
4311         &old_prim_row,
4312         &prim_key,
4313         &prim_row,
4314         curr_num_DBs,
4315         share->key_file,
4316         mult_put_flags,
4317         2*curr_num_DBs,
4318         mult_key_dbt_array,
4319         curr_num_DBs,
4320         mult_rec_dbt_array
4321         );
4322 
4323     if (error == DB_KEYEXIST) {
4324         last_dup_key = primary_key;
4325     }
4326     else if (!error) {
4327         updated_rows++;
4328         trx->stmt_progress.updated++;
4329         track_progress(thd);
4330     }
4331 
4332 
4333 cleanup:
4334     if (num_DBs_locked) {
4335         share->_num_DBs_lock.unlock();
4336     }
4337     if (error == DB_KEYEXIST) {
4338         error = HA_ERR_FOUND_DUPP_KEY;
4339     }
4340     if (sub_trans) {
4341         // no point in recording error value of abort.
4342         // nothing we can do about it anyway and it is not what
4343         // we want to return.
4344         if (error) {
4345             abort_txn(sub_trans);
4346         }
4347         else {
4348             commit_txn(sub_trans, DB_TXN_NOSYNC);
4349         }
4350     }
4351     TOKUDB_HANDLER_DBUG_RETURN(error);
4352 }
4353 
4354 //
4355 // Deletes a row in the table, called when handling a DELETE query
4356 // Parameters:
4357 //      [in]    record - row to be deleted, in MySQL format
4358 // Returns:
4359 //      0 on success
4360 //      error otherwise
4361 //
4362 int ha_tokudb::delete_row(const uchar * record) {
4363     TOKUDB_HANDLER_DBUG_ENTER("");
4364     int error = ENOSYS;
4365     DBT row, prim_key;
4366     bool has_null;
4367     THD* thd = ha_thd();
4368     uint curr_num_DBs;
4369     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
4370 
4371     ha_statistic_increment(&SSV::ha_delete_count);
4372 
4373     //
4374     // grab reader lock on numDBs_lock
4375     //
4376     bool num_DBs_locked = false;
4377     if (!num_DBs_locked_in_bulk) {
4378         rwlock_t_lock_read(share->_num_DBs_lock);
4379         num_DBs_locked = true;
4380     }
4381     curr_num_DBs = share->num_DBs;
4382 
4383     create_dbt_key_from_table(&prim_key, primary_key, key_buff, record, &has_null);
4384     if (table_share->blob_fields) {
4385         if (fix_rec_buff_for_blob(max_row_length(record))) {
4386             error = HA_ERR_OUT_OF_MEM;
4387             goto cleanup;
4388         }
4389     }
4390     if ((error = pack_row(&row, (const uchar *) record, primary_key))){
4391         goto cleanup;
4392     }
4393 
4394     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
4395         TOKUDB_DEBUG_TXN,
4396         "all %p stmt %p sub_sp_level %p transaction %p",
4397         trx->all,
4398         trx->stmt,
4399         trx->sub_sp_level,
4400         transaction);
4401 
4402     error =
4403         db_env->del_multiple(
4404             db_env,
4405             share->key_file[primary_key],
4406             transaction,
4407             &prim_key,
4408             &row,
4409             curr_num_DBs,
4410             share->key_file,
4411             mult_key_dbt_array,
4412             mult_del_flags);
4413 
4414     if (error) {
4415         DBUG_PRINT("error", ("Got error %d", error));
4416     } else {
4417         deleted_rows++;
4418         trx->stmt_progress.deleted++;
4419         track_progress(thd);
4420     }
4421 cleanup:
4422     if (num_DBs_locked) {
4423         share->_num_DBs_lock.unlock();
4424     }
4425     TOKUDB_HANDLER_DBUG_RETURN(error);
4426 }
4427 
4428 //
4429 // takes as input table->read_set and table->write_set
4430 // and puts list of field indexes that need to be read in
4431 // unpack_row in the member variables fixed_cols_for_query
4432 // and var_cols_for_query
4433 //
4434 void ha_tokudb::set_query_columns(uint keynr) {
4435     uint32_t curr_fixed_col_index = 0;
4436     uint32_t curr_var_col_index = 0;
4437     read_key = false;
4438     read_blobs = false;
4439     //
4440     // I know this is probably confusing and will need to be explained better
4441     //
4442     uint key_index = 0;
4443 
4444     if (keynr == primary_key || keynr == MAX_KEY) {
4445         key_index = primary_key;
4446     }
4447     else {
4448         key_index = (key_is_clustering(&table->key_info[keynr]) ? keynr : primary_key);
4449     }
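    // key_index is the dictionary the row is actually read from: a clustering
    // key stores the full row, so its own key filters apply; for any other
    // secondary key the remaining columns come from the primary dictionary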
4450     for (uint i = 0; i < table_share->fields; i++) {
4451         if (bitmap_is_set(table->read_set,i) ||
4452             bitmap_is_set(table->write_set,i)
4453             )
4454         {
4455             if (bitmap_is_set(&share->kc_info.key_filters[key_index],i)) {
4456                 read_key = true;
4457             }
4458             else {
4459                 //
4460                 // if fixed field length
4461                 //
4462                 if (is_fixed_field(&share->kc_info, i)) {
4463                     //
4464                     // save the offset into the list
4465                     //
4466                     fixed_cols_for_query[curr_fixed_col_index] = i;
4467                     curr_fixed_col_index++;
4468                 }
4469                 //
4470                 // varchar or varbinary
4471                 //
4472                 else if (is_variable_field(&share->kc_info, i)) {
4473                     var_cols_for_query[curr_var_col_index] = i;
4474                     curr_var_col_index++;
4475                 }
4476                 //
4477                 // it is a blob
4478                 //
4479                 else {
4480                     read_blobs = true;
4481                 }
4482             }
4483         }
4484     }
4485     num_fixed_cols_for_query = curr_fixed_col_index;
4486     num_var_cols_for_query = curr_var_col_index;
4487 }
4488 
4489 void ha_tokudb::column_bitmaps_signal() {
4490     //
4491     // if we have max number of indexes, then MAX_KEY == primary_key
4492     //
4493     if (tokudb_active_index != MAX_KEY || tokudb_active_index == primary_key) {
4494         set_query_columns(tokudb_active_index);
4495     }
4496 }
4497 
4498 //
4499 // Notification that a scan of the entire secondary table is about
4500 // to take place. Will pre-acquire a table read lock.
4501 // Returns:
4502 //      0 on success
4503 //      error otherwise
4504 //
4505 int ha_tokudb::prepare_index_scan() {
4506     TOKUDB_HANDLER_DBUG_ENTER("");
4507     int error = 0;
4508     HANDLE_INVALID_CURSOR();
4509     error = prelock_range(NULL, NULL);
4510     if (error) { last_cursor_error = error; goto cleanup; }
4511 
4512     range_lock_grabbed = true;
4513     error = 0;
4514 cleanup:
4515     TOKUDB_HANDLER_DBUG_RETURN(error);
4516 }
4517 
4518 static bool index_key_is_null(
4519     TABLE* table,
4520     uint keynr,
4521     const uchar* key,
4522     uint key_len) {
4523 
4524     bool key_can_be_null = false;
4525     KEY* key_info = &table->key_info[keynr];
4526     KEY_PART_INFO* key_part = key_info->key_part;
4527     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
4528     for (; key_part != end; key_part++) {
4529         if (key_part->null_bit) {
4530             key_can_be_null = true;
4531             break;
4532         }
4533     }
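    // in the MySQL search-key buffer the first byte of a nullable key part is
    // a null indicator, so a non-zero leading byte means the first part is NULL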
4534     return key_can_be_null && key_len > 0 && key[0] != 0;
4535 }
4536 
4537 // Return true if bulk fetch can be used
4538 static bool tokudb_do_bulk_fetch(THD *thd) {
4539     switch (thd_sql_command(thd)) {
4540     case SQLCOM_SELECT:
4541     case SQLCOM_CREATE_TABLE:
4542     case SQLCOM_INSERT_SELECT:
4543     case SQLCOM_REPLACE_SELECT:
4544     case SQLCOM_DELETE:
4545         return tokudb::sysvars::bulk_fetch(thd) != 0;
4546     default:
4547         return false;
4548     }
4549 }
4550 
4551 //
4552 // Notification that a range query getting all elements that equal a key
4553 //  to take place. Will pre acquire read lock
4554 // Returns:
4555 //      0 on success
4556 //      error otherwise
4557 //
4558 int ha_tokudb::prepare_index_key_scan(const uchar * key, uint key_len) {
4559     TOKUDB_HANDLER_DBUG_ENTER("%p %u", key, key_len);
4560     int error = 0;
4561     DBT start_key, end_key;
4562     THD* thd = ha_thd();
4563     HANDLE_INVALID_CURSOR();
4564     pack_key(&start_key, tokudb_active_index, prelocked_left_range, key, key_len, COL_NEG_INF);
4565     prelocked_left_range_size = start_key.size;
4566     pack_key(&end_key, tokudb_active_index, prelocked_right_range, key, key_len, COL_POS_INF);
4567     prelocked_right_range_size = end_key.size;
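    // start_key and end_key now bracket every row whose key equals the
    // given prefix: the same bytes packed with COL_NEG_INF and COL_POS_INF
    // yield the smallest and largest full keys carrying that prefix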
4568 
4569     error = cursor->c_set_bounds(
4570         cursor,
4571         &start_key,
4572         &end_key,
4573         true,
4574         (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
4575         );
4576 
4577     if (error){
4578         goto cleanup;
4579     }
4580 
4581     range_lock_grabbed = true;
4582     range_lock_grabbed_null = index_key_is_null(table, tokudb_active_index, key, key_len);
4583     doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
4584     bulk_fetch_iteration = 0;
4585     rows_fetched_using_bulk_fetch = 0;
4586     error = 0;
4587 cleanup:
4588     if (error) {
4589         error = map_to_handler_error(error);
4590         last_cursor_error = error;
4591         //
4592         // cursor should be initialized here, but in case it is not,
4593         // we still check
4594         //
4595         if (cursor) {
4596             int r = cursor->c_close(cursor);
4597             assert_always(r==0);
4598             cursor = NULL;
4599             remove_from_trx_handler_list();
4600         }
4601     }
4602     TOKUDB_HANDLER_DBUG_RETURN(error);
4603 }
4604 
4605 void ha_tokudb::invalidate_bulk_fetch() {
4606     bytes_used_in_range_query_buff= 0;
4607     curr_range_query_buff_offset = 0;
4608     icp_went_out_of_range = false;
4609 }
4610 
4611 void ha_tokudb::invalidate_icp() {
4612     toku_pushed_idx_cond = NULL;
4613     toku_pushed_idx_cond_keyno = MAX_KEY;
4614     icp_went_out_of_range = false;
4615 }
4616 
4617 //
4618 // Initializes local cursor on DB with index keynr
4619 // Parameters:
4620 //          keynr - key (index) number
4621 //          sorted - 1 if result MUST be sorted according to index
4622 // Returns:
4623 //      0 on success
4624 //      error otherwise
4625 //
4626 int ha_tokudb::index_init(uint keynr, bool sorted) {
4627     TOKUDB_HANDLER_DBUG_ENTER("%d %u txn %p", keynr, sorted, transaction);
4628 
4629     int error;
4630     THD* thd = ha_thd();
4631     DBUG_PRINT("enter", ("table: '%s'  key: %d", table_share->table_name.str, keynr));
4632 
4633     /*
4634        Under some very rare conditions (like full joins) we may already have
4635        an active cursor at this point
4636      */
4637     if (cursor) {
4638         DBUG_PRINT("note", ("Closing active cursor"));
4639         int r = cursor->c_close(cursor);
4640         assert_always(r==0);
4641         remove_from_trx_handler_list();
4642     }
4643     active_index = keynr;
4644 
4645     if (active_index < MAX_KEY) {
4646         DBUG_ASSERT(keynr <= table->s->keys);
4647     } else {
4648         DBUG_ASSERT(active_index == MAX_KEY);
4649         keynr = primary_key;
4650     }
4651     tokudb_active_index = keynr;
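    // active_index keeps the handler-level value (MAX_KEY for a full table
    // scan) while tokudb_active_index always names a real dictionary,
    // since MAX_KEY was mapped to primary_key above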
4652 
4653 #if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
4654     if (keynr < table->s->keys && table->key_info[keynr].option_struct->clustering)
4655         key_read = false;
4656 #endif  // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
4657 
4658     last_cursor_error = 0;
4659     range_lock_grabbed = false;
4660     range_lock_grabbed_null = false;
4661     DBUG_ASSERT(share->key_file[keynr]);
4662     cursor_flags = get_cursor_isolation_flags(lock.type, thd);
4663     if (use_write_locks) {
4664         cursor_flags |= DB_RMW;
4665     }
4666     if (tokudb::sysvars::disable_prefetching(thd)) {
4667         cursor_flags |= DBC_DISABLE_PREFETCHING;
4668     }
4669     if (lock.type == TL_READ_WITH_SHARED_LOCKS) {
4670        cursor_flags |= DB_LOCKING_READ;
4671     }
4672     if ((error = share->key_file[keynr]->cursor(share->key_file[keynr],
4673                                                 transaction, &cursor,
4674                                                 cursor_flags))) {
4675         if (error == TOKUDB_MVCC_DICTIONARY_TOO_NEW) {
4676             error = HA_ERR_TABLE_DEF_CHANGED;
4677             my_error(ER_TABLE_DEF_CHANGED, MYF(0));
4678         }
4679         if (error == DB_LOCK_NOTGRANTED) {
4680             error = HA_ERR_LOCK_WAIT_TIMEOUT;
4681             my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
4682         }
4683         table->status = STATUS_NOT_FOUND;
4684         error = map_to_handler_error(error);
4685         last_cursor_error = error;
4686         cursor = NULL;             // Safety
4687         goto exit;
4688     }
4689     cursor->c_set_check_interrupt_callback(cursor, tokudb_killed_thd_callback, thd);
4690     memset((void *) &last_key, 0, sizeof(last_key));
4691 
4692     add_to_trx_handler_list();
4693 
4694     if (thd_sql_command(thd) == SQLCOM_SELECT) {
4695         set_query_columns(keynr);
4696         unpack_entire_row = false;
4697     }
4698     else {
4699         unpack_entire_row = true;
4700     }
4701     invalidate_bulk_fetch();
4702     doing_bulk_fetch = false;
4703     maybe_index_scan = false;
4704     error = 0;
4705 exit:
4706     TOKUDB_HANDLER_DBUG_RETURN(error);
4707 }
4708 
4709 //
4710 // closes the local cursor
4711 //
4712 int ha_tokudb::index_end() {
4713     TOKUDB_HANDLER_DBUG_ENTER("");
4714     range_lock_grabbed = false;
4715     range_lock_grabbed_null = false;
4716     if (cursor) {
4717         DBUG_PRINT("enter", ("table: '%s'", table_share->table_name.str));
4718         int r = cursor->c_close(cursor);
4719         assert_always(r==0);
4720         cursor = NULL;
4721         remove_from_trx_handler_list();
4722         last_cursor_error = 0;
4723     }
4724     active_index = tokudb_active_index = MAX_KEY;
4725 
4726     //
4727     // reset query variables
4728     //
4729     unpack_entire_row = true;
4730     read_blobs = true;
4731     read_key = true;
4732     num_fixed_cols_for_query = 0;
4733     num_var_cols_for_query = 0;
4734 
4735     invalidate_bulk_fetch();
4736     invalidate_icp();
4737     doing_bulk_fetch = false;
4738     close_dsmrr();
4739 
4740     TOKUDB_HANDLER_DBUG_RETURN(0);
4741 }
4742 
4743 int ha_tokudb::handle_cursor_error(int error, int err_to_return) {
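    // maps a cursor error to a handler error and remembers it in
    // last_cursor_error; DB_NOTFOUND is translated to err_to_return
    // (HA_ERR_END_OF_FILE or HA_ERR_KEY_NOT_FOUND at the call sites)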
4744     TOKUDB_HANDLER_DBUG_ENTER("");
4745     if (error) {
4746         error = map_to_handler_error(error);
4747         last_cursor_error = error;
4748         table->status = STATUS_NOT_FOUND;
4749         if (error == DB_NOTFOUND) {
4750             error = err_to_return;
4751         }
4752     }
4753     TOKUDB_HANDLER_DBUG_RETURN(error);
4754 }
4755 
4756 
4757 //
4758 // Helper function for read_row and smart_dbt_callback_xxx functions
4759 // When using a hidden primary key, upon reading a row,
4760 // we set the current_ident field to whatever the primary key we retrieved
4761 // was
4762 //
4763 void ha_tokudb::extract_hidden_primary_key(uint keynr, DBT const *found_key) {
4764     //
4765     // extract hidden primary key to current_ident
4766     //
4767     if (hidden_primary_key) {
4768         if (keynr == primary_key) {
4769             memcpy(current_ident, (char *) found_key->data, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
4770         }
4771         //
4772         // if secondary key, hidden primary key is at end of found_key
4773         //
4774         else {
4775             memcpy(
4776                 current_ident,
4777                 (char *) found_key->data + found_key->size - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
4778                 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
4779                 );
4780         }
4781     }
4782 }
4783 
4784 
4785 int ha_tokudb::read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4786     assert_always(keynr == primary_key);
4787     return unpack_row(buf, row,found_key, keynr);
4788 }
4789 
4790 //
4791 // Reads the contents of row and found_key, DBT's retrieved from the DB associated to keynr, into buf
4792 // This function assumes that we are using a covering index, as a result, if keynr is the primary key,
4793 // we do not read row into buf
4794 // Parameters:
4795 //      [out]   buf - buffer for the row, in MySQL format
4796 //              keynr - index into key_file that represents DB we are currently operating on.
4797 //      [in]    row - the row that has been read from the preceding DB call
4798 //      [in]    found_key - key used to retrieve the row
4799 //
4800 void ha_tokudb::read_key_only(uchar * buf, uint keynr, DBT const *found_key) {
4801     TOKUDB_HANDLER_DBUG_ENTER("");
4802     table->status = 0;
4803     //
4804     // only case when we do not unpack the key is if we are dealing with the main dictionary
4805     // of a table with a hidden primary key
4806     //
4807     if (!(hidden_primary_key && keynr == primary_key)) {
4808         unpack_key(buf, found_key, keynr);
4809     }
4810     TOKUDB_HANDLER_DBUG_VOID_RETURN;
4811 }
4812 
4813 //
4814 // Helper function used to try to retrieve the entire row
4815 // If keynr is associated with the main table, reads contents of found_key and row into buf, otherwise,
4816 // makes copy of primary key and saves it to last_key. This can later be used to retrieve the entire row
4817 // Parameters:
4818 //      [out]   buf - buffer for the row, in MySQL format
4819 //              keynr - index into key_file that represents DB we are currently operating on.
4820 //      [in]    row - the row that has been read from the preceding DB call
4821 //      [in]    found_key - key used to retrieve the row
4822 //
4823 int ha_tokudb::read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4824     TOKUDB_HANDLER_DBUG_ENTER("");
4825     int error = 0;
4826     table->status = 0;
4827     //
4828     // case where we read from secondary table that is not clustered
4829     //
4830     if (keynr != primary_key && !key_is_clustering(&table->key_info[keynr])) {
4831         bool has_null;
4832         //
4833         // create a DBT that has the same data as row, this is inefficient
4834         // extract_hidden_primary_key MUST have been called before this
4835         //
4836         memset((void *) &last_key, 0, sizeof(last_key));
4837         if (!hidden_primary_key) {
4838             unpack_key(buf, found_key, keynr);
4839         }
4840         create_dbt_key_from_table(
4841             &last_key,
4842             primary_key,
4843             key_buff,
4844             buf,
4845             &has_null
4846             );
4847     }
4848     //
4849     // else read from clustered/primary key
4850     //
4851     else {
4852         error = unpack_row(buf, row, found_key, keynr);
4853         if (error) { goto exit; }
4854     }
4855     if (found_key) { DBUG_DUMP("read row key", (uchar *) found_key->data, found_key->size); }
4856     error = 0;
4857 exit:
4858     TOKUDB_HANDLER_DBUG_RETURN(error);
4859 }
4860 
4861 //
4862 // This function reads an entire row into buf. This function also assumes that
4863 // the key needed to retrieve the row is stored in the member variable last_key
4864 // Parameters:
4865 //      [out]   buf - buffer for the row, in MySQL format
4866 // Returns:
4867 //      0 on success, error otherwise
4868 //
4869 int ha_tokudb::read_full_row(uchar * buf) {
4870     TOKUDB_HANDLER_DBUG_ENTER("");
4871     int error = 0;
4872     struct smart_dbt_info info;
4873     info.ha = this;
4874     info.buf = buf;
4875     info.keynr = primary_key;
4876     //
4877     // assumes key is stored in this->last_key
4878     //
4879 
4880     error = share->file->getf_set(share->file,
4881                                   transaction,
4882                                   cursor_flags,
4883                                   &last_key,
4884                                   smart_dbt_callback_rowread_ptquery,
4885                                   &info);
4886 
4887     DBUG_EXECUTE_IF("tokudb_fake_db_notfound_error_in_read_full_row", {
4888         error = DB_NOTFOUND;
4889     });
4890 
4891     if (error) {
4892         if (error == DB_LOCK_NOTGRANTED) {
4893             error = HA_ERR_LOCK_WAIT_TIMEOUT;
4894         } else if (error == DB_NOTFOUND) {
4895             error = HA_ERR_CRASHED;
4896             if (tokudb_active_index < share->_keys) {
4897                 sql_print_error(
4898                     "ha_tokudb::read_full_row on table %s cound not locate "
4899                     "record in PK that matches record found in key %s",
4900                     share->full_table_name(),
4901                     share->_key_descriptors[tokudb_active_index]._name);
4902             } else {
4903                 sql_print_error(
4904                     "ha_tokudb::read_full_row on table %s cound not locate "
4905                     "record in PK that matches record found in key %d",
4906                     share->full_table_name(),
4907                     tokudb_active_index);
4908             }
4909         }
4910         table->status = STATUS_NOT_FOUND;
4911     }
4912 
4913     TOKUDB_HANDLER_DBUG_RETURN(error);
4914 }
4915 
4916 
4917 //
4918 // Reads the next row matching to the key, on success, advances cursor
4919 // Parameters:
4920 //      [out]   buf - buffer for the next row, in MySQL format
4921 //      [in]     key - key value
4922 //                keylen - length of key
4923 // Returns:
4924 //      0 on success
4925 //      HA_ERR_END_OF_FILE if not found
4926 //      error otherwise
4927 //
4928 int ha_tokudb::index_next_same(uchar* buf, const uchar* key, uint keylen) {
4929     TOKUDB_HANDLER_DBUG_ENTER("");
4930     ha_statistic_increment(&SSV::ha_read_next_count);
4931 
4932     DBT curr_key;
4933     DBT found_key;
4934     bool has_null;
4935     int cmp;
4936     // create the key that will be used to compare with what is found
4937     // in order to figure out if we should return an error
4938     pack_key(&curr_key, tokudb_active_index, key_buff2, key, keylen, COL_ZERO);
4939     int error = get_next(buf, 1, &curr_key, key_read);
4940     if (error) {
4941         goto cleanup;
4942     }
4943     //
4944     // now do the comparison
4945     //
4946     create_dbt_key_from_table(
4947         &found_key,
4948         tokudb_active_index,
4949         key_buff3,buf,
4950         &has_null);
4951     cmp =
4952         tokudb_prefix_cmp_dbt_key(
4953             share->key_file[tokudb_active_index],
4954             &curr_key,
4955             &found_key);
4956     if (cmp) {
4957         error = HA_ERR_END_OF_FILE;
4958     }
4959 
4960 cleanup:
4961     error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
4962     TOKUDB_HANDLER_DBUG_RETURN(error);
4963 }
4964 
4965 
4966 //
4967 // According to InnoDB handlerton: Positions an index cursor to the index
4968 // specified in keynr. Fetches the row if any
4969 // Parameters:
4970 //      [out]       buf - buffer for the  returned row
4971 //      [in]         key - key value, according to InnoDB, if NULL,
4972 //                              position cursor at start or end of index,
4973 //                              not sure if this is done now
4974 //                    key_len - length of key
4975 //                    find_flag - according to InnoDB, search flags from my_base.h
4976 // Returns:
4977 //      0 on success
4978 //      HA_ERR_KEY_NOT_FOUND if not found (per InnoDB),
4979 //          we seem to return HA_ERR_END_OF_FILE if find_flag != HA_READ_KEY_EXACT
4980 //          TODO: investigate this for correctness
4981 //      error otherwise
4982 //
4983 int ha_tokudb::index_read(
4984     uchar* buf,
4985     const uchar* key,
4986     uint key_len,
4987     enum ha_rkey_function find_flag) {
4988 
4989     TOKUDB_HANDLER_DBUG_ENTER(
4990         "key %p %u:%2.2x find=%u",
4991         key,
4992         key_len,
4993         key ? key[0] : 0,
4994         find_flag);
4995     invalidate_bulk_fetch();
4996     if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
4997         TOKUDB_DBUG_DUMP("mysql key=", key, key_len);
4998     }
4999     DBT row;
5000     DBT lookup_key;
5001     int error = 0;
5002     uint32_t flags = 0;
5003     THD* thd = ha_thd();
5004     tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
5005     struct smart_dbt_info info;
5006     struct index_read_info ir_info;
5007 
5008     HANDLE_INVALID_CURSOR();
5009 
5010     // if we locked a non-null key range and we now have a null key, then
5011     // remove the bounds from the cursor
5012     if (range_lock_grabbed &&
5013         !range_lock_grabbed_null &&
5014         index_key_is_null(table, tokudb_active_index, key, key_len)) {
5015         range_lock_grabbed = range_lock_grabbed_null = false;
5016         cursor->c_remove_restriction(cursor);
5017     }
5018 
5019     ha_statistic_increment(&SSV::ha_read_key_count);
5020     memset((void *) &row, 0, sizeof(row));
5021 
5022     info.ha = this;
5023     info.buf = buf;
5024     info.keynr = tokudb_active_index;
5025 
5026     ir_info.smart_dbt_info = info;
5027     ir_info.cmp = 0;
5028 
5029     flags = SET_PRELOCK_FLAG(0);
5030     switch (find_flag) {
5031     case HA_READ_KEY_EXACT: /* Find first record else error */ {
5032         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5033         DBT lookup_bound;
5034         pack_key(&lookup_bound, tokudb_active_index, key_buff4, key, key_len, COL_POS_INF);
5035         if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
5036             TOKUDB_DBUG_DUMP("tokudb key=", lookup_key.data, lookup_key.size);
5037         }
5038         ir_info.orig_key = &lookup_key;
5039         error = cursor->c_getf_set_range_with_bound(cursor, flags, &lookup_key, &lookup_bound, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5040         if (ir_info.cmp) {
5041             error = DB_NOTFOUND;
5042         }
5043         break;
5044     }
5045     case HA_READ_AFTER_KEY: /* Find next rec. after key-record */
5046         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5047         error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5048         break;
5049     case HA_READ_BEFORE_KEY: /* Find next rec. before key-record */
5050         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5051         error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5052         break;
5053     case HA_READ_KEY_OR_NEXT: /* Record or next record */
5054         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5055         error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5056         break;
5057     //
5058     // This case does not seem to ever be used, it is ok for it to be slow
5059     //
5060     case HA_READ_KEY_OR_PREV: /* Record or previous */
5061         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
5062         ir_info.orig_key = &lookup_key;
5063         error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5064         if (error == DB_NOTFOUND) {
5065             error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5066         }
5067         else if (ir_info.cmp) {
5068             error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5069         }
5070         break;
5071     case HA_READ_PREFIX_LAST_OR_PREV: /* Last or prev key with the same prefix */
5072         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5073         error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
5074         break;
5075     case HA_READ_PREFIX_LAST:
5076         pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
5077         ir_info.orig_key = &lookup_key;
5078         error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
5079         if (ir_info.cmp) {
5080             error = DB_NOTFOUND;
5081         }
5082         break;
5083     default:
5084         TOKUDB_HANDLER_TRACE("unsupported:%d", find_flag);
5085         error = HA_ERR_UNSUPPORTED;
5086         break;
5087     }
5088     error = handle_cursor_error(error, HA_ERR_KEY_NOT_FOUND);
5089     if (!error && !key_read && tokudb_active_index != primary_key && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5090         error = read_full_row(buf);
5091     }
5092 
5093     if (TOKUDB_UNLIKELY(error && TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ERROR))) {
5094         TOKUDB_HANDLER_TRACE("error:%d:%d", error, find_flag);
5095     }
5096     trx->stmt_progress.queried++;
5097     track_progress(thd);
5098 
5099 cleanup:
5100     TOKUDB_HANDLER_DBUG_RETURN(error);
5101 }
5102 
5103 
5104 int ha_tokudb::read_data_from_range_query_buff(uchar* buf, bool need_val, bool do_key_read) {
5105     // buffer has the next row, get it from there
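    //
    // each entry in range_query_buff was written by fill_range_query_buf()
    // with this layout:
    //   uint32_t key_size; uchar key[key_size];
    //   then, only when the val is needed:
    //     if unpack_entire_row: uint32_t val_size; uchar val[val_size];
    //     otherwise, just the queried columns: null bytes, the fixed-size
    //     fields, one { uint32_t len; uchar data[len]; } pair per variable
    //     field, and uint32_t blob_size + blob data when blobs are read
    //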
5106     int error;
5107     uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
5108     DBT curr_key;
5109     memset((void *) &curr_key, 0, sizeof(curr_key));
5110 
5111     // get key info
5112     uint32_t key_size = *(uint32_t *)curr_pos;
5113     curr_pos += sizeof(key_size);
5114     uchar* curr_key_buff = curr_pos;
5115     curr_pos += key_size;
5116 
5117     curr_key.data = curr_key_buff;
5118     curr_key.size = key_size;
5119 
5120     // if this is a covering index, this is all we need
5121     if (do_key_read) {
5122         assert_always(!need_val);
5123         extract_hidden_primary_key(tokudb_active_index, &curr_key);
5124         read_key_only(buf, tokudb_active_index, &curr_key);
5125         error = 0;
5126     }
5127     // we need to get more data
5128     else {
5129         DBT curr_val;
5130         memset((void *) &curr_val, 0, sizeof(curr_val));
5131         uchar* curr_val_buff = NULL;
5132         uint32_t val_size = 0;
5133         // in this case, we don't have a val, we are simply extracting the pk
5134         if (!need_val) {
5135             curr_val.data = curr_val_buff;
5136             curr_val.size = val_size;
5137             extract_hidden_primary_key(tokudb_active_index, &curr_key);
5138             error = read_primary_key( buf, tokudb_active_index, &curr_val, &curr_key);
5139         }
5140         else {
5141             extract_hidden_primary_key(tokudb_active_index, &curr_key);
5142             // need to extract a val and place it into buf
5143             if (unpack_entire_row) {
5144                 // get val info
5145                 val_size = *(uint32_t *)curr_pos;
5146                 curr_pos += sizeof(val_size);
5147                 curr_val_buff = curr_pos;
5148                 curr_pos += val_size;
5149                 curr_val.data = curr_val_buff;
5150                 curr_val.size = val_size;
5151                 error = unpack_row(buf,&curr_val, &curr_key, tokudb_active_index);
5152             }
5153             else {
5154                 if (!(hidden_primary_key && tokudb_active_index == primary_key)) {
5155                     unpack_key(buf,&curr_key,tokudb_active_index);
5156                 }
                // read the fields we care about

                // first the null bytes
                memcpy(buf, curr_pos, table_share->null_bytes);
                curr_pos += table_share->null_bytes;

                // now the fixed-size fields
5164                 for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5165                     uint field_index = fixed_cols_for_query[i];
5166                     Field* field = table->field[field_index];
5167                     unpack_fixed_field(
5168                         buf + field_offset(field, table),
5169                         curr_pos,
5170                         share->kc_info.field_lengths[field_index]
5171                         );
5172                     curr_pos += share->kc_info.field_lengths[field_index];
5173                 }
                // now the variable-size fields
5175                 for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5176                     uint field_index = var_cols_for_query[i];
5177                     Field* field = table->field[field_index];
5178                     uint32_t field_len = *(uint32_t *)curr_pos;
5179                     curr_pos += sizeof(field_len);
5180                     unpack_var_field(
5181                         buf + field_offset(field, table),
5182                         curr_pos,
5183                         field_len,
5184                         share->kc_info.length_bytes[field_index]
5185                         );
5186                     curr_pos += field_len;
5187                 }
5188                 // now the blobs
5189                 if (read_blobs) {
5190                     uint32_t blob_size = *(uint32_t *)curr_pos;
5191                     curr_pos += sizeof(blob_size);
5192                     error = unpack_blobs(
5193                         buf,
5194                         curr_pos,
5195                         blob_size,
5196                         true
5197                         );
5198                     curr_pos += blob_size;
5199                     if (error) {
5200                         invalidate_bulk_fetch();
5201                         goto exit;
5202                     }
5203                 }
5204                 error = 0;
5205             }
5206         }
5207     }
5208 
5209     curr_range_query_buff_offset = curr_pos - range_query_buff;
5210 exit:
5211     return error;
5212 }
5213 
5214 static int smart_dbt_bf_callback(
5215     DBT const* key,
5216     DBT const* row,
5217     void* context) {
5218     SMART_DBT_BF_INFO info = (SMART_DBT_BF_INFO)context;
5219     return
5220         info->ha->fill_range_query_buf(
5221             info->need_val,
5222             key,
5223             row,
5224             info->direction,
5225             info->thd,
5226             info->buf,
5227             info->key_to_compare);
5228 }
5229 
5230 enum icp_result ha_tokudb::toku_handler_index_cond_check(
5231     Item* pushed_idx_cond) {
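    // evaluates the pushed index condition against the key columns already
    // unpacked into the row buffer; returns ICP_OUT_OF_RANGE once the
    // current key passes end_range so the scan can stop early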
5232 
5233     enum icp_result res;
5234     if (end_range) {
5235         int cmp;
5236 #ifdef MARIADB_BASE_VERSION
5237         cmp = compare_key2(end_range);
5238 #else
5239         cmp = compare_key_icp(end_range);
5240 #endif
5241         if (cmp > 0) {
5242             return ICP_OUT_OF_RANGE;
5243         }
5244     }
5245     res = pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
5246     return res;
5247 }
5248 
5249 // fill in the range query buf for bulk fetch
5250 int ha_tokudb::fill_range_query_buf(
5251     bool need_val,
5252     DBT const* key,
5253     DBT const* row,
5254     int direction,
5255     THD* thd,
5256     uchar* buf,
5257     DBT* key_to_compare) {
5258 
5259     int error;
5260     //
5261     // first put the value into range_query_buf
5262     //
5263     uint32_t size_remaining =
5264         size_range_query_buff - bytes_used_in_range_query_buff;
5265     uint32_t size_needed;
5266     uint32_t user_defined_size = tokudb::sysvars::read_buf_size(thd);
5267     uchar* curr_pos = NULL;
5268 
5269     if (key_to_compare) {
5270         int cmp = tokudb_prefix_cmp_dbt_key(
5271             share->key_file[tokudb_active_index],
5272             key_to_compare,
5273             key);
5274         if (cmp) {
5275             icp_went_out_of_range = true;
5276             error = 0;
5277             goto cleanup;
5278         }
5279     }
5280 
5281     // if we have an index condition pushed down, we check it
5282     if (toku_pushed_idx_cond &&
5283         (tokudb_active_index == toku_pushed_idx_cond_keyno)) {
5284         unpack_key(buf, key, tokudb_active_index);
5285         enum icp_result result =
5286             toku_handler_index_cond_check(toku_pushed_idx_cond);
5287 
5288         // If we have reason to stop, we set icp_went_out_of_range and get out
5289         // otherwise, if we simply see that the current key is no match,
5290         // we tell the cursor to continue and don't store
5291         // the key locally
5292         if (result == ICP_OUT_OF_RANGE || thd_killed(thd)) {
5293             icp_went_out_of_range = true;
5294             error = 0;
5295             DEBUG_SYNC(ha_thd(), "tokudb_icp_asc_scan_out_of_range");
5296             goto cleanup;
5297         } else if (result == ICP_NO_MATCH) {
            // An optimizer change made for MyRocks also benefits TokuDB:
            // opt_range.cc QUICK_SELECT::get_next now sets end_range during
            // a descending scan. We should never hit this condition, but
            // this code is left in to prevent a descending scan from running
            // to the beginning of the index, and the assertion catches that
            // case in debug builds
            assert_debug(!(!end_range && direction < 0));
5305             if (!end_range &&
5306                 direction < 0) {
5307                 cancel_pushed_idx_cond();
5308             }
5309             error = TOKUDB_CURSOR_CONTINUE;
5310             goto cleanup;
5311         }
5312     }
5313 
5314     // at this point, if ICP is on, we have verified that the key is one
5315     // we are interested in, so we proceed with placing the data
5316     // into the range query buffer
5317 
5318     if (need_val) {
5319         if (unpack_entire_row) {
5320             size_needed = 2*sizeof(uint32_t) + key->size + row->size;
5321         } else {
5322             // this is an upper bound
5323             size_needed =
5324                 // size of key length
5325                 sizeof(uint32_t) +
5326                 // key and row
5327                 key->size + row->size +
5328                 // lengths of varchars stored
5329                 num_var_cols_for_query * (sizeof(uint32_t)) +
5330                 // length of blobs
5331                 sizeof(uint32_t);
5332         }
5333     } else {
5334         size_needed = sizeof(uint32_t) + key->size;
5335     }
5336     if (size_remaining < size_needed) {
5337         range_query_buff =
5338             static_cast<uchar*>(tokudb::memory::realloc(
5339                 static_cast<void*>(range_query_buff),
5340                 bytes_used_in_range_query_buff + size_needed,
5341                 MYF(MY_WME)));
5342         if (range_query_buff == NULL) {
5343             error = ENOMEM;
5344             invalidate_bulk_fetch();
5345             goto cleanup;
5346         }
5347         size_range_query_buff = bytes_used_in_range_query_buff + size_needed;
5348     }
5349     //
5350     // now we know we have the size, let's fill the buffer, starting with the key
5351     //
5352     curr_pos = range_query_buff + bytes_used_in_range_query_buff;
5353 
5354     *reinterpret_cast<uint32_t*>(curr_pos) = key->size;
5355     curr_pos += sizeof(uint32_t);
5356     memcpy(curr_pos, key->data, key->size);
5357     curr_pos += key->size;
5358     if (need_val) {
5359         if (unpack_entire_row) {
5360             *reinterpret_cast<uint32_t*>(curr_pos) = row->size;
5361             curr_pos += sizeof(uint32_t);
5362             memcpy(curr_pos, row->data, row->size);
5363             curr_pos += row->size;
5364         } else {
5365             // need to unpack just the data we care about
5366             const uchar* fixed_field_ptr = static_cast<const uchar*>(row->data);
5367             fixed_field_ptr += table_share->null_bytes;
5368 
5369             const uchar* var_field_offset_ptr = NULL;
5370             const uchar* var_field_data_ptr = NULL;
5371 
5372             var_field_offset_ptr =
5373                 fixed_field_ptr +
5374                 share->kc_info.mcp_info[tokudb_active_index].fixed_field_size;
5375             var_field_data_ptr =
5376                 var_field_offset_ptr +
5377                 share->kc_info.mcp_info[tokudb_active_index].len_of_offsets;
5378 
5379             // first the null bytes
5380             memcpy(curr_pos, row->data, table_share->null_bytes);
5381             curr_pos += table_share->null_bytes;
            //
            // now the fixed fields
            //
5386             for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
5387                 uint field_index = fixed_cols_for_query[i];
5388                 memcpy(
5389                     curr_pos,
5390                     fixed_field_ptr + share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val,
5391                     share->kc_info.field_lengths[field_index]);
5392                 curr_pos += share->kc_info.field_lengths[field_index];
5393             }
5394 
5395             //
5396             // now the var fields
5397             //
5398             for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
5399                 uint field_index = var_cols_for_query[i];
5400                 uint32_t var_field_index =
5401                     share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val;
5402                 uint32_t data_start_offset;
5403                 uint32_t field_len;
5404 
5405                 get_var_field_info(
5406                     &field_len,
5407                     &data_start_offset,
5408                     var_field_index,
5409                     var_field_offset_ptr,
5410                     share->kc_info.num_offset_bytes);
5411                 memcpy(curr_pos, &field_len, sizeof(field_len));
5412                 curr_pos += sizeof(field_len);
5413                 memcpy(
5414                     curr_pos,
5415                     var_field_data_ptr + data_start_offset,
5416                     field_len);
5417                 curr_pos += field_len;
5418             }
5419 
5420             if (read_blobs) {
5421                 uint32_t blob_offset = 0;
5422                 uint32_t data_size = 0;
5423                 //
5424                 // now the blobs
5425                 //
5426                 get_blob_field_info(
5427                     &blob_offset,
5428                     share->kc_info.mcp_info[tokudb_active_index].len_of_offsets,
5429                     var_field_data_ptr,
5430                     share->kc_info.num_offset_bytes);
5431                 data_size =
5432                     row->size -
5433                     blob_offset -
5434                     static_cast<uint32_t>((var_field_data_ptr -
5435                         static_cast<const uchar*>(row->data)));
5436                 memcpy(curr_pos, &data_size, sizeof(data_size));
5437                 curr_pos += sizeof(data_size);
5438                 memcpy(curr_pos, var_field_data_ptr + blob_offset, data_size);
5439                 curr_pos += data_size;
5440             }
5441         }
5442     }
5443 
5444     bytes_used_in_range_query_buff = curr_pos - range_query_buff;
5445     assert_always(bytes_used_in_range_query_buff <= size_range_query_buff);
5446 
5447     //
5448     // now determine if we should continue with the bulk fetch
5449     // we want to stop under these conditions:
5450     //  - we overran the prelocked range
5451     //  - we are close to the end of the buffer
5452     //  - we have fetched an exponential amount of rows with
5453     //  respect to the bulk fetch iteration, which is initialized
5454     //  to 0 in index_init() and prelock_range().
5455 
5456     rows_fetched_using_bulk_fetch++;
    // if the iteration is less than the number of possible shifts on
    // a 64 bit integer, check that we haven't exceeded this iteration's
    // row fetch upper bound.
5460     if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5461         uint64_t row_fetch_upper_bound = 1LLU << bulk_fetch_iteration;
5462         assert_always(row_fetch_upper_bound > 0);
5463         if (rows_fetched_using_bulk_fetch >= row_fetch_upper_bound) {
5464             error = 0;
5465             goto cleanup;
5466         }
5467     }
5468 
5469     if (bytes_used_in_range_query_buff +
5470         table_share->rec_buff_length >
5471         user_defined_size) {
5472         error = 0;
5473         goto cleanup;
5474     }
5475     if (direction > 0) {
5476         // compare what we got to the right endpoint of prelocked range
5477         // because we are searching keys in ascending order
5478         if (prelocked_right_range_size == 0) {
5479             error = TOKUDB_CURSOR_CONTINUE;
5480             goto cleanup;
5481         }
5482         DBT right_range;
5483         memset(&right_range, 0, sizeof(right_range));
5484         right_range.size = prelocked_right_range_size;
5485         right_range.data = prelocked_right_range;
5486         int cmp = tokudb_cmp_dbt_key(
5487             share->key_file[tokudb_active_index],
5488             key,
5489             &right_range);
5490         error = (cmp > 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5491     } else {
5492         // compare what we got to the left endpoint of prelocked range
5493         // because we are searching keys in descending order
5494         if (prelocked_left_range_size == 0) {
5495             error = TOKUDB_CURSOR_CONTINUE;
5496             goto cleanup;
5497         }
5498         DBT left_range;
5499         memset(&left_range, 0, sizeof(left_range));
5500         left_range.size = prelocked_left_range_size;
5501         left_range.data = prelocked_left_range;
5502         int cmp = tokudb_cmp_dbt_key(
5503             share->key_file[tokudb_active_index],
5504             key,
5505             &left_range);
5506         error = (cmp < 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
5507     }
5508 cleanup:
5509     return error;
5510 }
5511 
5512 int ha_tokudb::get_next(
5513     uchar* buf,
5514     int direction,
5515     DBT* key_to_compare,
5516     bool do_key_read) {
5517 
5518     int error = 0;
5519     HANDLE_INVALID_CURSOR();
5520 
5521     if (maybe_index_scan) {
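        // index_first()/index_last() set this flag without prelocking; if
        // no range lock was taken there, prelock the whole index now so
        // the scan can use the bulk fetch path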
5522         maybe_index_scan = false;
5523         if (!range_lock_grabbed) {
5524             error = prepare_index_scan();
5525         }
5526     }
5527 
5528     if (!error) {
5529         uint32_t flags = SET_PRELOCK_FLAG(0);
5530 
        // we need to read the val of what we retrieve if
        // we do NOT have a covering index AND we are reading the primary
        // key or a clustering secondary key
5534         bool need_val =
5535             (do_key_read == 0) &&
5536             (tokudb_active_index == primary_key ||
5537              key_is_clustering(&table->key_info[tokudb_active_index]));
5538 
5539         if ((bytes_used_in_range_query_buff -
5540              curr_range_query_buff_offset) > 0) {
5541             error = read_data_from_range_query_buff(buf, need_val, do_key_read);
5542         } else if (icp_went_out_of_range) {
5543             icp_went_out_of_range = false;
5544             error = HA_ERR_END_OF_FILE;
5545         } else {
5546             invalidate_bulk_fetch();
5547             if (doing_bulk_fetch) {
5548                 struct smart_dbt_bf_info bf_info;
5549                 bf_info.ha = this;
                // need_val (computed above) tells the callback whether the
                // val must be captured along with the key
5551                 bf_info.direction = direction;
5552                 bf_info.thd = ha_thd();
5553                 bf_info.need_val = need_val;
5554                 bf_info.buf = buf;
5555                 bf_info.key_to_compare = key_to_compare;
5556                 //
5557                 // call c_getf_next with purpose of filling in range_query_buff
5558                 //
5559                 rows_fetched_using_bulk_fetch = 0;
                // ICP may be evaluated inside smart_dbt_bf_callback, so a
                // single cursor call can return no data because none of the
                // rows matched the index condition; hence the while loop.
                // icp_went_out_of_range is set when we hit a row that the
                // index condition says is out of range, which means the
                // buffered data is the last we will retrieve
5566                 while (bytes_used_in_range_query_buff == 0 &&
5567                        !icp_went_out_of_range && error == 0) {
5568                     if (direction > 0) {
5569                         error =
5570                             cursor->c_getf_next(
5571                                 cursor,
5572                                 flags,
5573                                 smart_dbt_bf_callback,
5574                                 &bf_info);
5575                     } else {
5576                         error =
5577                             cursor->c_getf_prev(
5578                                 cursor,
5579                                 flags,
5580                                 smart_dbt_bf_callback,
5581                                 &bf_info);
5582                     }
5583                 }
5584                 // if there is no data set and we went out of range,
5585                 // then there is nothing to return
5586                 if (bytes_used_in_range_query_buff == 0 &&
5587                     icp_went_out_of_range) {
5588                     icp_went_out_of_range = false;
5589                     error = HA_ERR_END_OF_FILE;
5590                 }
5591                 if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
5592                     bulk_fetch_iteration++;
5593                 }
5594 
5595                 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5596                 if (error) {
5597                     goto cleanup;
5598                 }
5599 
5600                 //
5601                 // now that range_query_buff is filled, read an element
5602                 //
5603                 error =
5604                     read_data_from_range_query_buff(buf, need_val, do_key_read);
5605             } else {
5606                 struct smart_dbt_info info;
5607                 info.ha = this;
5608                 info.buf = buf;
5609                 info.keynr = tokudb_active_index;
5610 
5611                 if (direction > 0) {
5612                     error =
5613                         cursor->c_getf_next(
5614                             cursor,
5615                             flags,
5616                             SMART_DBT_CALLBACK(do_key_read),
5617                             &info);
5618                 } else {
5619                     error =
5620                         cursor->c_getf_prev(
5621                             cursor,
5622                             flags,
5623                             SMART_DBT_CALLBACK(do_key_read),
5624                             &info);
5625                 }
5626                 error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5627             }
5628         }
5629     }
5630 
5631     //
5632     // at this point, one of two things has happened
5633     // either we have unpacked the data into buf, and we
5634     // are done, or we have unpacked the primary key
5635     // into last_key, and we use the code below to
5636     // read the full row by doing a point query into the
5637     // main table.
5638     //
5639     if (!error &&
5640         !do_key_read &&
5641         (tokudb_active_index != primary_key) &&
5642         !key_is_clustering(&table->key_info[tokudb_active_index])) {
5643         error = read_full_row(buf);
5644     }
5645 
5646     if (!error) {
5647         THD *thd = ha_thd();
5648         tokudb_trx_data* trx =
5649             static_cast<tokudb_trx_data*>(thd_get_ha_data(thd, tokudb_hton));
5650         trx->stmt_progress.queried++;
5651         track_progress(thd);
5652         if (thd_killed(thd))
5653             error = ER_ABORTING_CONNECTION;
5654     }
5655 cleanup:
5656     return error;
5657 }
5658 
5659 
5660 //
5661 // Reads the next row from the active index (cursor) into buf, and advances cursor
5662 // Parameters:
5663 //      [out]   buf - buffer for the next row, in MySQL format
5664 // Returns:
5665 //      0 on success
5666 //      HA_ERR_END_OF_FILE if not found
5667 //      error otherwise
5668 //
5669 int ha_tokudb::index_next(uchar * buf) {
5670     TOKUDB_HANDLER_DBUG_ENTER("");
5671     ha_statistic_increment(&SSV::ha_read_next_count);
5672     int error = get_next(buf, 1, NULL, key_read);
5673     TOKUDB_HANDLER_DBUG_RETURN(error);
5674 }
5675 
5676 
5677 int ha_tokudb::index_read_last(uchar * buf, const uchar * key, uint key_len) {
5678     return(index_read(buf, key, key_len, HA_READ_PREFIX_LAST));
5679 }
5680 
5681 
5682 //
5683 // Reads the previous row from the active index (cursor) into buf, and advances cursor
5684 // Parameters:
5685 //      [out]   buf - buffer for the next row, in MySQL format
5686 // Returns:
5687 //      0 on success
5688 //      HA_ERR_END_OF_FILE if not found
5689 //      error otherwise
5690 //
5691 int ha_tokudb::index_prev(uchar * buf) {
5692     TOKUDB_HANDLER_DBUG_ENTER("");
5693     ha_statistic_increment(&SSV::ha_read_prev_count);
5694     int error = get_next(buf, -1, NULL, key_read);
5695     TOKUDB_HANDLER_DBUG_RETURN(error);
5696 }
5697 
5698 //
5699 // Reads the first row from the active index (cursor) into buf, and advances cursor
5700 // Parameters:
5701 //      [out]   buf - buffer for the next row, in MySQL format
5702 // Returns:
5703 //      0 on success
5704 //      HA_ERR_END_OF_FILE if not found
5705 //      error otherwise
5706 //
5707 int ha_tokudb::index_first(uchar * buf) {
5708     TOKUDB_HANDLER_DBUG_ENTER("");
5709     invalidate_bulk_fetch();
5710     int error = 0;
5711     struct smart_dbt_info info;
5712     uint32_t flags = SET_PRELOCK_FLAG(0);
5713     THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5715     HANDLE_INVALID_CURSOR();
5716 
5717     ha_statistic_increment(&SSV::ha_read_first_count);
5718 
5719     info.ha = this;
5720     info.buf = buf;
5721     info.keynr = tokudb_active_index;
5722 
5723     error = cursor->c_getf_first(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5724     error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5725 
5726     //
5727     // still need to get entire contents of the row if operation done on
5728     // secondary DB and it was NOT a covering index
5729     //
5730     if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5731         error = read_full_row(buf);
5732     }
5733     if (trx) {
5734         trx->stmt_progress.queried++;
5735     }
5736     track_progress(thd);
5737     maybe_index_scan = true;
5738 cleanup:
5739     TOKUDB_HANDLER_DBUG_RETURN(error);
5740 }
5741 
5742 //
5743 // Reads the last row from the active index (cursor) into buf, and advances cursor
5744 // Parameters:
5745 //      [out]   buf - buffer for the next row, in MySQL format
5746 // Returns:
5747 //      0 on success
5748 //      HA_ERR_END_OF_FILE if not found
5749 //      error otherwise
5750 //
5751 int ha_tokudb::index_last(uchar * buf) {
5752     TOKUDB_HANDLER_DBUG_ENTER("");
5753     invalidate_bulk_fetch();
5754     int error = 0;
5755     struct smart_dbt_info info;
5756     uint32_t flags = SET_PRELOCK_FLAG(0);
5757     THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5759     HANDLE_INVALID_CURSOR();
5760 
5761     ha_statistic_increment(&SSV::ha_read_last_count);
5762 
5763     info.ha = this;
5764     info.buf = buf;
5765     info.keynr = tokudb_active_index;
5766 
5767     error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5768     error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5769     //
5770     // still need to get entire contents of the row if operation done on
5771     // secondary DB and it was NOT a covering index
5772     //
5773     if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5774         error = read_full_row(buf);
5775     }
5776 
5777     if (trx) {
5778         trx->stmt_progress.queried++;
5779     }
5780     track_progress(thd);
5781     maybe_index_scan = true;
5782 cleanup:
5783     TOKUDB_HANDLER_DBUG_RETURN(error);
5784 }
5785 
5786 //
// Initialize a scan of the table (index_init is called with MAX_KEY, which maps to the primary key)
5788 // Parameters:
5789 //          scan - unused
5790 // Returns:
5791 //      0 on success
5792 //      error otherwise
5793 //
5794 int ha_tokudb::rnd_init(bool scan) {
5795     TOKUDB_HANDLER_DBUG_ENTER("");
5796     int error = 0;
5797     range_lock_grabbed = false;
5798     error = index_init(MAX_KEY, 0);
5799     if (error) { goto cleanup;}
5800 
5801     if (scan) {
5802         error = prelock_range(NULL, NULL);
5803         if (error) { goto cleanup; }
5804 
        // only set range_lock_grabbed to true after prelock_range has
        // succeeded, for two reasons:
        // 1) index_init above resets it to false anyway
        // 2) if prelocking fails, we must not act as if the lock is held
5809         range_lock_grabbed = true;
5810     }
5811 
5812     error = 0;
5813 cleanup:
5814     if (error) {
5815         index_end();
5816         last_cursor_error = error;
5817     }
5818     TOKUDB_HANDLER_DBUG_RETURN(error);
5819 }
5820 
5821 //
5822 // End a scan of the table
5823 //
5824 int ha_tokudb::rnd_end() {
5825     TOKUDB_HANDLER_DBUG_ENTER("");
5826     range_lock_grabbed = false;
5827     TOKUDB_HANDLER_DBUG_RETURN(index_end());
5828 }
5829 
5830 
5831 //
5832 // Read the next row in a table scan
5833 // Parameters:
5834 //      [out]   buf - buffer for the next row, in MySQL format
5835 // Returns:
5836 //      0 on success
5837 //      HA_ERR_END_OF_FILE if not found
5838 //      error otherwise
5839 //
5840 int ha_tokudb::rnd_next(uchar * buf) {
5841     TOKUDB_HANDLER_DBUG_ENTER("");
5842     ha_statistic_increment(&SSV::ha_read_rnd_next_count);
5843     int error = get_next(buf, 1, NULL, false);
5844     TOKUDB_HANDLER_DBUG_RETURN(error);
5845 }
5846 
5847 
5848 void ha_tokudb::track_progress(THD* thd) {
5849     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5850     if (trx) {
5851         ulonglong num_written = trx->stmt_progress.inserted +
5852             trx->stmt_progress.updated +
5853             trx->stmt_progress.deleted;
5854         bool update_status =
5855             (trx->stmt_progress.queried &&
5856              tokudb::sysvars::read_status_frequency &&
5857              (trx->stmt_progress.queried %
5858                 tokudb::sysvars::read_status_frequency) == 0) ||
5859              (num_written && tokudb::sysvars::write_status_frequency &&
5860               (num_written % tokudb::sysvars::write_status_frequency) == 0);
5861         if (update_status) {
5862             char *next_status = write_status_msg;
5863             bool first = true;
5864             int r;
5865             if (trx->stmt_progress.queried) {
5866                 r = sprintf(
5867                     next_status,
5868                     "Queried about %llu row%s",
5869                     trx->stmt_progress.queried,
5870                     trx->stmt_progress.queried == 1 ? "" : "s");
5871                 assert_always(r >= 0);
5872                 next_status += r;
5873                 first = false;
5874             }
5875             if (trx->stmt_progress.inserted) {
5876                 if (trx->stmt_progress.using_loader) {
5877                     r = sprintf(
5878                         next_status,
5879                         "%sFetched about %llu row%s, loading data still remains",
5880                         first ? "" : ", ",
5881                         trx->stmt_progress.inserted,
5882                         trx->stmt_progress.inserted == 1 ? "" : "s");
5883                 } else {
5884                     r = sprintf(
5885                         next_status,
5886                         "%sInserted about %llu row%s",
5887                         first ? "" : ", ",
5888                         trx->stmt_progress.inserted,
5889                         trx->stmt_progress.inserted == 1 ? "" : "s");
5890                 }
5891                 assert_always(r >= 0);
5892                 next_status += r;
5893                 first = false;
5894             }
5895             if (trx->stmt_progress.updated) {
5896                 r = sprintf(
5897                     next_status,
5898                     "%sUpdated about %llu row%s",
5899                     first ? "" : ", ",
5900                     trx->stmt_progress.updated,
5901                     trx->stmt_progress.updated == 1 ? "" : "s");
5902                 assert_always(r >= 0);
5903                 next_status += r;
5904                 first = false;
5905             }
5906             if (trx->stmt_progress.deleted) {
5907                 r = sprintf(
5908                     next_status,
5909                     "%sDeleted about %llu row%s",
5910                     first ? "" : ", ",
5911                     trx->stmt_progress.deleted,
5912                     trx->stmt_progress.deleted == 1 ? "" : "s");
5913                 assert_always(r >= 0);
5914                 next_status += r;
5915                 first = false;
5916             }
5917             if (!first)
5918                 thd_proc_info(thd, write_status_msg);
5919         }
5920     }
5921 }
5922 
5923 
5924 DBT *ha_tokudb::get_pos(DBT * to, uchar * pos) {
5925     TOKUDB_HANDLER_DBUG_ENTER("");
5926     /* We don't need to set app_data here */
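    // pos was saved earlier as a 4-byte length followed by the key bytes;
    // rebuild a DBT that points into it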
5927     memset((void *) to, 0, sizeof(*to));
5928     to->data = pos + sizeof(uint32_t);
5929     to->size = *(uint32_t *)pos;
5930     DBUG_DUMP("key", (const uchar *) to->data, to->size);
5931     DBUG_RETURN(to);
5932 }
5933 
// Retrieves a row based on the primary key saved in pos
5935 // Returns:
5936 //      0 on success
5937 //      HA_ERR_KEY_NOT_FOUND if not found
5938 //      error otherwise
5939 int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
5940     TOKUDB_HANDLER_DBUG_ENTER("");
5941     DBT db_pos;
5942     int error = 0;
5943     struct smart_dbt_info info;
5944     bool old_unpack_entire_row = unpack_entire_row;
5945     DBT* key = get_pos(&db_pos, pos);
5946 
5947     unpack_entire_row = true;
5948     ha_statistic_increment(&SSV::ha_read_rnd_count);
5949     tokudb_active_index = MAX_KEY;
5950 
5951     THD *thd = ha_thd();
5952 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5953     // test rpl slave by inducing a delay before the point query
5954     if (thd->slave_thread && (in_rpl_delete_rows || in_rpl_update_rows)) {
5955         DBUG_EXECUTE_IF("tokudb_crash_if_rpl_looks_up_row", DBUG_ASSERT(0););
5956         uint64_t delay_ms = tokudb::sysvars::rpl_lookup_rows_delay(thd);
5957         if (delay_ms)
5958             usleep(delay_ms * 1000);
5959     }
5960 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5961 
5962     info.ha = this;
5963     info.buf = buf;
5964     info.keynr = primary_key;
5965 
5966     error = share->file->getf_set(share->file, transaction,
5967             get_cursor_isolation_flags(lock.type, thd),
5968             key, smart_dbt_callback_rowread_ptquery, &info);
5969 
5970     if (error == DB_NOTFOUND) {
5971         error = HA_ERR_KEY_NOT_FOUND;
5972         goto cleanup;
5973     }
5974 cleanup:
5975     unpack_entire_row = old_unpack_entire_row;
5976     TOKUDB_HANDLER_DBUG_RETURN(error);
5977 }
5978 
5979 int ha_tokudb::prelock_range(const key_range *start_key, const key_range *end_key) {
5980     TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
5981     THD* thd = ha_thd();
5982 
5983     int error = 0;
5984     DBT start_dbt_key;
5985     DBT end_dbt_key;
5986     uchar* start_key_buff  = prelocked_left_range;
5987     uchar* end_key_buff = prelocked_right_range;
5988 
5989     memset((void *) &start_dbt_key, 0, sizeof(start_dbt_key));
5990     memset((void *) &end_dbt_key, 0, sizeof(end_dbt_key));
5991 
5992     HANDLE_INVALID_CURSOR();
5993     if (start_key) {
5994         switch (start_key->flag) {
5995         case HA_READ_AFTER_KEY:
5996             pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_POS_INF);
5997             break;
5998         default:
5999             pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_NEG_INF);
6000             break;
6001         }
6002         prelocked_left_range_size = start_dbt_key.size;
6003     }
6004     else {
6005         prelocked_left_range_size = 0;
6006     }
6007 
6008     if (end_key) {
6009         switch (end_key->flag) {
6010         case HA_READ_BEFORE_KEY:
6011             pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_NEG_INF);
6012             break;
6013         default:
6014             pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_POS_INF);
6015             break;
6016         }
6017         prelocked_right_range_size = end_dbt_key.size;
6018     }
6019     else {
6020         prelocked_right_range_size = 0;
6021     }
6022 
6023     error = cursor->c_set_bounds(
6024         cursor,
6025         start_key ? &start_dbt_key : share->key_file[tokudb_active_index]->dbt_neg_infty(),
6026         end_key ? &end_dbt_key : share->key_file[tokudb_active_index]->dbt_pos_infty(),
6027         true,
6028         (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
6029         );
6030     if (error) {
6031         error = map_to_handler_error(error);
6032         last_cursor_error = error;
6033         //
6034         // cursor should be initialized here, but in case it is not, we still check
6035         //
6036         if (cursor) {
6037             int r = cursor->c_close(cursor);
6038             assert_always(r==0);
6039             cursor = NULL;
6040             remove_from_trx_handler_list();
6041         }
6042         goto cleanup;
6043     }
6044 
6045     // at this point, determine if we will be doing bulk fetch
6046     doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
6047     bulk_fetch_iteration = 0;
6048     rows_fetched_using_bulk_fetch = 0;
6049 
6050 cleanup:
6051     TOKUDB_HANDLER_DBUG_RETURN(error);
6052 }
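
//
// Note on the bound packing above (a summary of the code, not new behavior):
// an exclusive start bound (HA_READ_AFTER_KEY) is packed with COL_POS_INF so
// it sorts after every row sharing the key prefix, while an exclusive end
// bound (HA_READ_BEFORE_KEY) is packed with COL_NEG_INF so it sorts before
// them; every other flag yields an inclusive bound for its side of the range.
//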
6053 
6054 //
6055 // Prelock range if possible, start_key is leftmost, end_key is rightmost
6056 // whether scanning forward or backward.  This function is called by MySQL
6057 // for backward range queries (in QUICK_SELECT_DESC::get_next).
6058 // Forward scans use read_range_first()/read_range_next().
6059 //
6060 int ha_tokudb::prepare_range_scan( const key_range *start_key, const key_range *end_key) {
6061     TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
6062     int error = prelock_range(start_key, end_key);
6063     if (!error) {
6064         range_lock_grabbed = true;
6065     }
6066     TOKUDB_HANDLER_DBUG_RETURN(error);
6067 }
6068 
6069 int ha_tokudb::read_range_first(
6070     const key_range *start_key,
6071     const key_range *end_key,
6072     bool eq_range,
6073     bool sorted)
6074 {
6075     TOKUDB_HANDLER_DBUG_ENTER("%p %p %u %u", start_key, end_key, eq_range, sorted);
6076     int error = prelock_range(start_key, end_key);
6077     if (error) { goto cleanup; }
6078     range_lock_grabbed = true;
6079 
6080     error = handler::read_range_first(start_key, end_key, eq_range, sorted);
6081 cleanup:
6082     TOKUDB_HANDLER_DBUG_RETURN(error);
6083 }
6084 
6085 int ha_tokudb::read_range_next()
6086 {
6087     TOKUDB_HANDLER_DBUG_ENTER("");
6088     int error;
6089     error = handler::read_range_next();
6090     if (error) {
6091         range_lock_grabbed = false;
6092     }
6093     TOKUDB_HANDLER_DBUG_RETURN(error);
6094 }
6095 
6096 
6097 
6098 /*
6099   Set a reference to the current record in (ref,ref_length).
6100 
6101   SYNOPSIS
6102   ha_tokudb::position()
6103   record                      The current record buffer
6104 
6105   DESCRIPTION
6106   The BDB handler stores the primary key in (ref,ref_length).
6107   There is either an explicit primary key, or an implicit (hidden)
6108   primary key.
6109   During open(), 'ref_length' is calculated as the maximum primary
6110   key length. When an actual key is shorter than that, the rest of
  the buffer must be cleared out. The row cannot be identified if
  garbage follows the end of the key. There is no length
  field for the current key, so the whole ref_length is used
6114   for comparison.
6115 
6116   RETURN
6117   nothing
6118 */
6119 void ha_tokudb::position(const uchar * record) {
6120     TOKUDB_HANDLER_DBUG_ENTER("");
6121     DBT key;
6122     if (hidden_primary_key) {
6123         DBUG_ASSERT(ref_length == (TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t)));
6124         memcpy(ref + sizeof(uint32_t), current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
6125         *(uint32_t *)ref = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
6126     }
6127     else {
6128         bool has_null;
6129         //
6130         // save the data
6131         //
6132         create_dbt_key_from_table(&key, primary_key, ref + sizeof(uint32_t), record, &has_null);
6133         //
6134         // save the size of data in the first four bytes of ref
6135         //
6136         memcpy(ref, &key.size, sizeof(uint32_t));
6137     }
6138     TOKUDB_HANDLER_DBUG_VOID_RETURN;
6139 }
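
//
// Sketch of the intended round trip (hypothetical caller, standard handler
// API usage): the server saves a position during a scan and later re-reads
// the same row through it:
//
//   h->position(record);            // fills h->ref with [size][packed pk]
//   ...
//   h->rnd_pos(buf, h->ref);        // point query on that primary key
//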
6140 
6141 //
6142 // Per InnoDB: Returns statistics information of the table to the MySQL interpreter,
6143 // in various fields of the handle object.
6144 // Return:
//      0 on success, error otherwise
6146 //
6147 int ha_tokudb::info(uint flag) {
6148     TOKUDB_HANDLER_DBUG_ENTER("%d", flag);
6149     int error = 0;
6150 #if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
6151     for (uint i=0; i < table->s->keys; i++)
6152         if (key_is_clustering(&table->key_info[i]))
6153             table->covering_keys.set_bit(i);
6154 #endif  // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
6155     DB_TXN* txn = NULL;
6156     if (flag & HA_STATUS_VARIABLE) {
6157         stats.records = share->row_count() + share->rows_from_locked_table;
6158         stats.deleted = 0;
6159         if (!(flag & HA_STATUS_NO_LOCK)) {
6160 
6161             error = txn_begin(db_env, NULL, &txn, DB_READ_UNCOMMITTED, ha_thd());
6162             if (error) {
6163                 goto cleanup;
6164             }
6165 
6166             // we should always have a primary key
6167             assert_always(share->file != NULL);
6168 
6169             DB_BTREE_STAT64 dict_stats;
6170             error = share->file->stat64(share->file, txn, &dict_stats);
6171             if (error) {
6172                 goto cleanup;
6173             }
6174             share->set_row_count(dict_stats.bt_ndata, false);
6175             stats.records = dict_stats.bt_ndata;
6176             stats.create_time = dict_stats.bt_create_time_sec;
6177             stats.update_time = dict_stats.bt_modify_time_sec;
6178             stats.check_time = dict_stats.bt_verify_time_sec;
6179             stats.data_file_length = dict_stats.bt_dsize;
6180             stats.delete_length = dict_stats.bt_fsize - dict_stats.bt_dsize;
6181             if (hidden_primary_key) {
6182                 //
                // in this case, we have a hidden primary key, and we do not
                // want to report space taken up by it to the user
6185                 //
6186                 uint64_t hpk_space =
6187                     TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH * dict_stats.bt_ndata;
6188                 stats.data_file_length =
6189                     (hpk_space > stats.data_file_length) ?
6190                         0 : stats.data_file_length - hpk_space;
6191             } else {
6192                 //
6193                 // one infinity byte per key needs to be subtracted
6194                 //
6195                 uint64_t inf_byte_space = dict_stats.bt_ndata;
6196                 stats.data_file_length =
6197                     (inf_byte_space > stats.data_file_length) ?
6198                         0 : stats.data_file_length - inf_byte_space;
6199             }
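
            // Worked example (illustrative numbers, assuming an 8-byte
            // hidden primary key): with bt_ndata = 1000 rows,
            // hpk_space = 8 * 1000 = 8000 bytes is subtracted; with an
            // explicit key, only the 1000 infinity bytes (one per row) are.
            // Either way the ternary above clamps the result at 0 so the
            // unsigned subtraction cannot underflow.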
6200 
6201             stats.mean_rec_length =
6202                 stats.records ?
6203                     (ulong)(stats.data_file_length/stats.records) : 0;
6204             stats.index_file_length = 0;
6205             // curr_num_DBs is the number of keys we have, according
6206             // to the mysql layer. if drop index is running concurrently
6207             // with info() (it can, because info does not take table locks),
6208             // then it could be the case that one of the dbs was dropped
6209             // and set to NULL before mysql was able to set table->s->keys
6210             // accordingly.
6211             //
6212             // we should just ignore any DB * that is NULL.
6213             //
6214             // this solution is much simpler than trying to maintain an
6215             // accurate number of valid keys at the handlerton layer.
6216             uint curr_num_DBs =
6217                 table->s->keys + tokudb_test(hidden_primary_key);
6218             for (uint i = 0; i < curr_num_DBs; i++) {
6219                 // skip the primary key, skip dropped indexes
6220                 if (i == primary_key || share->key_file[i] == NULL) {
6221                     continue;
6222                 }
6223                 error = share->key_file[i]->stat64(
6224                     share->key_file[i], txn, &dict_stats);
6225                 if (error) {
6226                     goto cleanup;
6227                 }
6228                 stats.index_file_length += dict_stats.bt_dsize;
6229                 stats.delete_length +=
6230                     dict_stats.bt_fsize - dict_stats.bt_dsize;
6231             }
6232         }
6233 
6234         /*
6235         The following comment and logic has been taken from InnoDB and
6236         an old hack was removed that forced to always set stats.records > 0
6237         ---
6238         The MySQL optimizer seems to assume in a left join that n_rows
6239         is an accurate estimate if it is zero. Of course, it is not,
6240         since we do not have any locks on the rows yet at this phase.
6241         Since SHOW TABLE STATUS seems to call this function with the
6242         HA_STATUS_TIME flag set, while the left join optimizer does not
6243         set that flag, we add one to a zero value if the flag is not
6244         set. That way SHOW TABLE STATUS will show the best estimate,
6245         while the optimizer never sees the table empty. */
6246         if (stats.records == 0 && !(flag & HA_STATUS_TIME)) {
6247             stats.records++;
6248         }
6249     }
6250     if ((flag & HA_STATUS_CONST)) {
6251         stats.max_data_file_length = 9223372036854775807ULL;
6252     }
6253     if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST)) {
6254         share->set_cardinality_counts_in_table(table);
6255     }
6256 
6257     /* Don't return key if we got an error for the internal primary key */
6258     if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys) {
6259         errkey = last_dup_key;
6260     }
6261 
6262     if (flag & HA_STATUS_AUTO && table->found_next_number_field) {
6263         THD* thd = table->in_use;
6264         struct system_variables* variables = &thd->variables;
6265         stats.auto_increment_value =
6266             share->last_auto_increment + variables->auto_increment_increment;
6267     }
6268     error = 0;
6269 cleanup:
6270     if (txn != NULL) {
6271         commit_txn(txn, DB_TXN_NOSYNC);
6272         txn = NULL;
6273     }
6274     TOKUDB_HANDLER_DBUG_RETURN(error);
6275 }
6276 
6277 //
6278 //  Per InnoDB: Tells something additional to the handler about how to do things.
6279 //
6280 int ha_tokudb::extra(enum ha_extra_function operation) {
6281     TOKUDB_HANDLER_DBUG_ENTER("%d", operation);
6282     switch (operation) {
6283     case HA_EXTRA_RESET_STATE:
6284         reset();
6285         break;
6286     case HA_EXTRA_KEYREAD:
6287         key_read = true;           // Query satisfied with key
6288         break;
6289     case HA_EXTRA_NO_KEYREAD:
6290         key_read = false;
6291         break;
6292     case HA_EXTRA_IGNORE_DUP_KEY:
6293         using_ignore = true;
6294         break;
6295     case HA_EXTRA_NO_IGNORE_DUP_KEY:
6296         using_ignore = false;
6297         break;
6298     case HA_EXTRA_IGNORE_NO_KEY:
6299         using_ignore_no_key = true;
6300         break;
6301     case HA_EXTRA_NO_IGNORE_NO_KEY:
6302         using_ignore_no_key = false;
6303         break;
6304     case HA_EXTRA_NOT_USED:
6305     case HA_EXTRA_PREPARE_FOR_RENAME:
6306         break; // must do nothing and return 0
6307     default:
6308         break;
6309     }
6310     TOKUDB_HANDLER_DBUG_RETURN(0);
6311 }
6312 
6313 int ha_tokudb::reset() {
6314     TOKUDB_HANDLER_DBUG_ENTER("");
6315     key_read = false;
6316     using_ignore = false;
6317     using_ignore_no_key = false;
6318     reset_dsmrr();
6319     invalidate_icp();
6320     TOKUDB_HANDLER_DBUG_RETURN(0);
6321 }
6322 
6323 //
6324 // helper function that iterates through all DB's
6325 // and grabs a lock (either read or write, but not both)
6326 // Parameters:
//      [in]    trans - transaction to be used to pre-acquire the lock
6328 //              lt - type of lock to get, either lock_read or lock_write
6329 //  Returns:
6330 //      0 on success
6331 //      error otherwise
6332 //
6333 int ha_tokudb::acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt) {
6334     TOKUDB_HANDLER_DBUG_ENTER("%p %s", trans, lt == lock_read ? "r" : "w");
6335     int error = ENOSYS;
6336     if (!num_DBs_locked_in_bulk) {
6337         rwlock_t_lock_read(share->_num_DBs_lock);
6338     }
6339     uint curr_num_DBs = share->num_DBs;
6340     if (lt == lock_read) {
6341         error = 0;
6342         goto cleanup;
6343     } else if (lt == lock_write) {
6344         for (uint i = 0; i < curr_num_DBs; i++) {
6345             DB* db = share->key_file[i];
6346             error = db->pre_acquire_table_lock(db, trans);
6347             if (error == EINVAL)
6348                 TOKUDB_HANDLER_TRACE("%d db=%p trans=%p", i, db, trans);
6349             if (error) break;
6350         }
6351         TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
6352         if (error) goto cleanup;
6353     } else {
6354         error = ENOSYS;
6355         goto cleanup;
6356     }
6357 
6358     error = 0;
6359 cleanup:
6360     if (!num_DBs_locked_in_bulk) {
6361         share->_num_DBs_lock.unlock();
6362     }
6363     TOKUDB_HANDLER_DBUG_RETURN(error);
6364 }
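
//
// Sketch of intended use (hypothetical caller): a bulk write path can grab
// the table locks once up front instead of escalating row locks later:
//
//   error = acquire_table_lock(transaction, lock_write);
//   if (error == 0) {
//       // proceed with the bulk writes against every DB
//   }
//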
6365 
6366 int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
6367     int error;
6368     ulong tx_isolation = thd_tx_isolation(thd);
6369     HA_TOKU_ISO_LEVEL toku_iso_level = tx_to_toku_iso(tx_isolation);
6370     bool is_autocommit = !thd_test_options(
6371             thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);
6372 
6373     /* First table lock, start transaction */
6374     if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) &&
6375          !trx->all &&
6376          (thd_sql_command(thd) != SQLCOM_CREATE_TABLE) &&
6377          (thd_sql_command(thd) != SQLCOM_DROP_TABLE) &&
6378          (thd_sql_command(thd) != SQLCOM_DROP_INDEX) &&
6379          (thd_sql_command(thd) != SQLCOM_CREATE_INDEX) &&
6380          (thd_sql_command(thd) != SQLCOM_ALTER_TABLE)) {
6381         /* QQQ We have to start a master transaction */
6382         // DBUG_PRINT("trans", ("starting transaction all "));
6383         uint32_t txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
6384 #if 50614 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
6385         if (thd_tx_is_read_only(thd)) {
6386             txn_begin_flags |= DB_TXN_READ_ONLY;
6387         }
6388 #endif
6389         if ((error = txn_begin(db_env, NULL, &trx->all, txn_begin_flags, thd))) {
6390             goto cleanup;
6391         }
6392         TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6393             TOKUDB_DEBUG_TXN,
6394             "created master %p",
6395             trx->all);
6396         trx->sp_level = trx->all;
6397         trans_register_ha(thd, true, tokudb_hton);
6398     }
6399     DBUG_PRINT("trans", ("starting transaction stmt"));
6400     if (trx->stmt) {
6401         TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6402             TOKUDB_DEBUG_TXN,
6403             "warning:stmt=%p",
6404             trx->stmt);
6405     }
6406     uint32_t txn_begin_flags;
6407     if (trx->all == NULL) {
6408         txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
6409         //
6410         // if the isolation level that the user has set is serializable,
6411         // but autocommit is on and this is just a select,
6412         // then we can go ahead and set the isolation level to
6413         // be a snapshot read, because we can serialize
6414         // the transaction to be the point in time at which the snapshot began.
6415         //
6416         if (txn_begin_flags == 0 && is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT) {
6417             txn_begin_flags = DB_TXN_SNAPSHOT;
6418         }
6419         if (is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT &&
6420             !thd->in_sub_stmt && lock.type <= TL_READ_NO_INSERT &&
6421             !thd->lex->uses_stored_routines()) {
6422             txn_begin_flags |= DB_TXN_READ_ONLY;
6423         }
6424     } else {
6425         txn_begin_flags = DB_INHERIT_ISOLATION;
6426     }
6427     error = txn_begin(db_env, trx->sp_level, &trx->stmt, txn_begin_flags, thd);
6428     if (error) {
6429         /* We leave the possible master transaction open */
6430         goto cleanup;
6431     }
6432     trx->sub_sp_level = trx->stmt;
6433     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6434         TOKUDB_DEBUG_TXN,
6435         "created stmt %p sp_level %p",
6436         trx->sp_level,
6437         trx->stmt);
6438     reset_stmt_progress(&trx->stmt_progress);
6439     trans_register_ha(thd, false, tokudb_hton);
6440 cleanup:
6441     return error;
6442 }
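
//
// Transaction hierarchy built by create_txn() (sketch): trx->all is the
// master txn for multi-statement transactions, trx->stmt is the
// per-statement child, and trx->sub_sp_level tracks the innermost statement
// level. Under autocommit, trx->all stays NULL and trx->stmt is the root:
//
//   trx->all (master; NULL when autocommit)
//     -> trx->sp_level (savepoint level; == trx->all right after creation)
//          -> trx->stmt == trx->sub_sp_level (statement txn)
//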
6443 
6444 static const char *lock_type_str(int lock_type) {
6445     if (lock_type == F_RDLCK) return "F_RDLCK";
6446     if (lock_type == F_WRLCK) return "F_WRLCK";
6447     if (lock_type == F_UNLCK) return "F_UNLCK";
6448     return "?";
6449 }
6450 
6451 /*
  As MySQL will execute an external lock for every new table it uses,
  we can use this to start the transactions.
  If we are in auto_commit mode we just need to start a transaction
  for the statement to be able to roll back the statement.
  If not, we have to start a master transaction if one does not
  already exist.
6458 */
6459 //
6460 // Parameters:
6461 //      [in]    thd - handle to the user thread
6462 //              lock_type - the type of lock
6463 // Returns:
6464 //      0 on success
6465 //      error otherwise
6466 //
6467 int ha_tokudb::external_lock(THD * thd, int lock_type) {
6468     TOKUDB_HANDLER_DBUG_ENTER(
6469         "cmd %d lock %d %s %s",
6470         thd_sql_command(thd),
6471         lock_type,
6472         lock_type_str(lock_type),
6473         share->full_table_name());
6474     if (TOKUDB_UNLIKELY(!TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ENTER) &&
6475         TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_LOCK))) {
6476         TOKUDB_HANDLER_TRACE(
6477             "cmd %d lock %d %s %s",
6478             thd_sql_command(thd),
6479             lock_type,
6480             lock_type_str(lock_type),
6481             share->full_table_name());
6482     }
6483     TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());
6484 
6485     int error = 0;
6486     tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
6487     if (!trx) {
6488         error = create_tokudb_trx_data_instance(&trx);
6489         if (error) { goto cleanup; }
6490         thd_set_ha_data(thd, tokudb_hton, trx);
6491     }
6492 
6493     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6494         TOKUDB_DEBUG_TXN,
6495         "trx %p %p %p %p %u %u",
6496         trx->all,
6497         trx->stmt,
6498         trx->sp_level,
6499         trx->sub_sp_level,
6500         trx->tokudb_lock_count,
6501         trx->create_lock_count);
6502 
6503     if (trx->all == NULL) {
6504         trx->sp_level = NULL;
6505     }
6506     if (lock_type != F_UNLCK) {
6507         use_write_locks = false;
6508         if (lock_type == F_WRLCK) {
6509             use_write_locks = true;
6510         }
6511         if (!trx->stmt) {
6512             transaction = NULL;    // Safety
6513             error = create_txn(thd, trx);
6514             if (error) {
6515                 goto cleanup;
6516             }
6517             trx->create_lock_count = trx->tokudb_lock_count;
6518         }
6519         transaction = trx->sub_sp_level;
6520         trx->tokudb_lock_count++;
6521     } else {
6522         share->update_row_count(thd, added_rows, deleted_rows, updated_rows);
6523         added_rows = 0;
6524         deleted_rows = 0;
6525         updated_rows = 0;
6526         share->rows_from_locked_table = 0;
6527         if (trx->tokudb_lock_count > 0) {
6528             if (--trx->tokudb_lock_count <= trx->create_lock_count) {
6529                 trx->create_lock_count = 0;
6530                 if (trx->stmt) {
6531                     /*
6532                       F_UNLCK is done without a transaction commit / rollback.
                      This happens if the thread didn't update any rows.
                      We must in this case commit the work to keep the row locks.
6535                     */
                    DBUG_PRINT("trans", ("committing non-updating transaction"));
6537                     reset_stmt_progress(&trx->stmt_progress);
6538                     commit_txn(trx->stmt, 0);
6539                     trx->stmt = NULL;
6540                     trx->sub_sp_level = NULL;
6541                 }
6542             }
6543             transaction = NULL;
6544         }
6545     }
6546 cleanup:
6547     TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
6548     TOKUDB_HANDLER_DBUG_RETURN(error);
6549 }
6550 
6551 /*
  When using LOCK TABLES, external_lock() is only called when the actual
  TABLE LOCK is done.
  Under LOCK TABLES, each used table will force a call to start_stmt.
6555 */
6556 int ha_tokudb::start_stmt(THD* thd, thr_lock_type lock_type) {
6557     TOKUDB_HANDLER_DBUG_ENTER(
6558         "cmd %d lock %d %s",
6559         thd_sql_command(thd),
6560         lock_type,
6561         share->full_table_name());
6562 
6563     TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s", thd->query());
6564 
6565     int error = 0;
6566     tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
6567     if (!trx) {
6568         error = create_tokudb_trx_data_instance(&trx);
6569         if (error) { goto cleanup; }
6570         thd_set_ha_data(thd, tokudb_hton, trx);
6571     }
6572 
6573     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6574         TOKUDB_DEBUG_TXN,
6575         "trx %p %p %p %p %u %u",
6576         trx->all,
6577         trx->stmt,
6578         trx->sp_level,
6579         trx->sub_sp_level,
6580         trx->tokudb_lock_count,
6581         trx->create_lock_count);
6582 
6583     /*
       note that trx->stmt may already have been initialized, as start_stmt()
       is called for *each table*, not for each storage engine,
       and there could be many TokuDB tables referenced in the query
6587      */
6588     if (!trx->stmt) {
6589         error = create_txn(thd, trx);
6590         if (error) {
6591             goto cleanup;
6592         }
6593         trx->create_lock_count = trx->tokudb_lock_count;
6594     } else {
6595         TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6596             TOKUDB_DEBUG_TXN,
6597             "trx->stmt %p already existed",
6598             trx->stmt);
6599     }
6600     if (added_rows > deleted_rows) {
6601         share->rows_from_locked_table = added_rows - deleted_rows;
6602     }
6603     transaction = trx->sub_sp_level;
6604     trans_register_ha(thd, false, tokudb_hton);
6605 cleanup:
6606     TOKUDB_HANDLER_DBUG_RETURN(error);
6607 }
6608 
6609 
6610 uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd) {
6611     uint sql_command = thd_sql_command(thd);
6612     bool in_lock_tables = thd_in_lock_tables(thd);
6613 
6614     //
    // following InnoDB's lead and having the CHECKSUM command use a snapshot read
6616     //
6617     if (sql_command == SQLCOM_CHECKSUM) {
6618         return 0;
6619     }
6620     else if ((lock_type == TL_READ && in_lock_tables) ||
6621              (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
6622              sql_command != SQLCOM_SELECT ||
6623              (sql_command == SQLCOM_SELECT && lock_type >= TL_WRITE_ALLOW_WRITE)) { // select for update
        ulong tx_isolation = thd_tx_isolation(thd);
        // pattern matched from InnoDB
        if ((tx_isolation == ISO_READ_COMMITTED ||
             tx_isolation == ISO_READ_UNCOMMITTED) &&
            (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) &&
            (sql_command == SQLCOM_INSERT_SELECT ||
             sql_command == SQLCOM_REPLACE_SELECT ||
             sql_command == SQLCOM_UPDATE ||
             sql_command == SQLCOM_CREATE_TABLE)) {
            return 0;
        } else {
            return DB_SERIALIZABLE;
        }
6638     }
6639     else {
6640         return 0;
6641     }
6642 }
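
//
// Summary of the decision above (restating the code, not adding behavior):
// CHECKSUM TABLE and plain SELECTs read at the transaction's own isolation
// level (return 0); LOCK TABLES reads, non-SELECT statements, and
// SELECT ... FOR UPDATE get DB_SERIALIZABLE cursors, except that under
// READ COMMITTED / READ UNCOMMITTED the INSERT ... SELECT family falls back
// to a non-serializable read, mirroring the InnoDB pattern noted above.
//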
6643 
6644 /*
6645   The idea with handler::store_lock() is the following:
6646 
6647   The statement decided which locks we should need for the table
6648   for updates/deletes/inserts we get WRITE locks, for SELECT... we get
6649   read locks.
6650 
6651   Before adding the lock into the table lock handler (see thr_lock.c)
6652   mysqld calls store lock with the requested locks.  Store lock can now
6653   modify a write lock to a read lock (or some other lock), ignore the
6654   lock (if we don't want to use MySQL table locks at all) or add locks
6655   for many tables (like we do when we are using a MERGE handler).
6656 
  TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE, which
  signals that we are doing WRITES, but we are still allowing other
  readers and writers.
6660 
  When releasing locks, store_lock() is also called. In this case one
6662   usually doesn't have to do anything.
6663 
6664   In some exceptional cases MySQL may send a request for a TL_IGNORE;
6665   This means that we are requesting the same lock as last time and this
6666   should also be ignored. (This may happen when someone does a flush
6667   table when we have opened a part of the tables, in which case mysqld
  closes and reopens the tables and tries to get the same locks as last
6669   time).  In the future we will probably try to remove this.
6670 */
6671 
6672 THR_LOCK_DATA* *ha_tokudb::store_lock(
6673     THD* thd,
6674     THR_LOCK_DATA** to,
6675     enum thr_lock_type lock_type) {
6676 
6677     TOKUDB_HANDLER_DBUG_ENTER(
6678         "lock_type=%d cmd=%d",
6679         lock_type,
6680         thd_sql_command(thd));
6681     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6682         TOKUDB_DEBUG_LOCK,
6683         "lock_type=%d cmd=%d",
6684         lock_type,
6685         thd_sql_command(thd));
6686 
6687     if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
6688         enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd);
6689         if (!thd->in_lock_tables) {
6690             if (sql_command == SQLCOM_CREATE_INDEX &&
6691                 tokudb::sysvars::create_index_online(thd)) {
6692                 // hot indexing
6693                 rwlock_t_lock_read(share->_num_DBs_lock);
6694                 if (share->num_DBs ==
6695                     (table->s->keys + tokudb_test(hidden_primary_key))) {
6696                     lock_type = TL_WRITE_ALLOW_WRITE;
6697                 }
6698                 share->_num_DBs_lock.unlock();
6699             } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
6700                         lock_type <= TL_WRITE) &&
6701                         sql_command != SQLCOM_TRUNCATE &&
6702                         !thd_tablespace_op(thd)) {
6703                 // allow concurrent writes
6704                 lock_type = TL_WRITE_ALLOW_WRITE;
6705             } else if (sql_command == SQLCOM_OPTIMIZE &&
6706                        lock_type == TL_READ_NO_INSERT) {
6707                 // hot optimize table
6708                 lock_type = TL_READ;
6709             }
6710         }
6711         lock.type = lock_type;
6712     }
6713     *to++ = &lock;
6714     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
6715         TOKUDB_DEBUG_LOCK,
6716         "lock_type=%d",
6717         lock_type);
6718     TOKUDB_HANDLER_DBUG_RETURN_PTR(to);
6719 }
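
//
// Example of the conversion above (illustrative): an INSERT normally asks
// for TL_WRITE. Outside LOCK TABLES and TRUNCATE, store_lock() hands back
// TL_WRITE_ALLOW_WRITE instead, so two sessions inserting into the same
// TokuDB table do not serialize on the MySQL table lock:
//
//   session 1: store_lock(thd, to, TL_WRITE) -> lock.type = TL_WRITE_ALLOW_WRITE
//   session 2: store_lock(thd, to, TL_WRITE) -> lock.type = TL_WRITE_ALLOW_WRITE
//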
6720 
6721 static toku_compression_method get_compression_method(DB* file) {
6722     enum toku_compression_method method;
6723     int r = file->get_compression_method(file, &method);
6724     assert_always(r == 0);
6725     return method;
6726 }
6727 
6728 #if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
6729     TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6730 enum row_type ha_tokudb::get_row_type() const {
6731     toku_compression_method compression_method = get_compression_method(share->file);
6732     return toku_compression_method_to_row_type(compression_method);
6733 }
6734 #endif  // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
6735         // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6736 
6737 static int create_sub_table(
6738     const char* table_name,
6739     DBT* row_descriptor,
6740     DB_TXN* txn,
6741     uint32_t block_size,
6742     uint32_t read_block_size,
6743     toku_compression_method compression_method,
6744     bool is_hot_index,
6745     uint32_t fanout) {
6746 
6747     TOKUDB_DBUG_ENTER("");
6748     int error;
6749     DB *file = NULL;
6750     uint32_t create_flags;
6751 
6752 
6753     error = db_create(&file, db_env, 0);
6754     if (error) {
6755         DBUG_PRINT("error", ("Got error: %d when creating table", error));
6756         my_errno = error;
6757         goto exit;
6758     }
6759 
6760 
6761     if (block_size != 0) {
6762         error = file->set_pagesize(file, block_size);
6763         if (error != 0) {
6764             DBUG_PRINT(
6765                 "error",
6766                 ("Got error: %d when setting block size %u for table '%s'",
6767                     error,
6768                     block_size,
6769                     table_name));
6770             goto exit;
6771         }
6772     }
6773     if (read_block_size != 0) {
6774         error = file->set_readpagesize(file, read_block_size);
6775         if (error != 0) {
6776             DBUG_PRINT(
6777                 "error",
6778                 ("Got error: %d when setting read block size %u for table '%s'",
6779                     error,
6780                     read_block_size,
6781                     table_name));
6782             goto exit;
6783         }
6784     }
6785     if (fanout != 0) {
6786         error = file->set_fanout(file, fanout);
6787         if (error != 0) {
6788             DBUG_PRINT(
6789                 "error",
6790                 ("Got error: %d when setting fanout %u for table '%s'",
6791                     error,
6792                     fanout,
6793                     table_name));
6794             goto exit;
6795         }
6796     }
6797     error = file->set_compression_method(file, compression_method);
6798     if (error != 0) {
6799         DBUG_PRINT(
6800             "error",
6801             ("Got error: %d when setting compression type %u for table '%s'",
6802                 error,
6803                 compression_method,
6804                 table_name));
6805         goto exit;
6806     }
6807 
6808     create_flags =
6809         DB_THREAD | DB_CREATE | DB_EXCL | (is_hot_index ? DB_IS_HOT_INDEX : 0);
6810     error =
6811         file->open(
6812             file,
6813             txn,
6814             table_name,
6815             NULL,
6816             DB_BTREE,
6817             create_flags,
6818             my_umask);
6819     if (error) {
6820         DBUG_PRINT(
6821             "error",
6822             ("Got error: %d when opening table '%s'", error, table_name));
6823         goto exit;
6824     }
6825 
6826     error =
6827         file->change_descriptor(
6828             file,
6829             txn,
6830             row_descriptor,
6831             (is_hot_index ? DB_IS_HOT_INDEX |
6832                 DB_UPDATE_CMP_DESCRIPTOR :
6833                 DB_UPDATE_CMP_DESCRIPTOR));
6834     if (error) {
6835         DBUG_PRINT(
6836             "error",
6837             ("Got error: %d when setting row descriptor for table '%s'",
6838                 error,
6839                 table_name));
6840         goto exit;
6841     }
6842 
6843     error = 0;
6844 exit:
6845     if (file) {
6846         int r = file->close(file, 0);
6847         assert_always(r==0);
6848     }
6849     TOKUDB_DBUG_RETURN(error);
6850 }
6851 
6852 void ha_tokudb::update_create_info(HA_CREATE_INFO* create_info) {
6853     if (share->has_auto_inc) {
6854         info(HA_STATUS_AUTO);
6855         if (!(create_info->used_fields & HA_CREATE_USED_AUTO) ||
6856             create_info->auto_increment_value < stats.auto_increment_value) {
6857             create_info->auto_increment_value = stats.auto_increment_value;
6858         }
6859     }
6860 #if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
6861     TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6862     if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
        // SHOW CREATE TABLE asks us to update this create_info; this makes it
        // so we'll always show what compression type we're using
6865         create_info->row_type = get_row_type();
6866         if (create_info->row_type == ROW_TYPE_TOKU_ZLIB &&
6867             tokudb::sysvars::hide_default_row_format(ha_thd()) != 0) {
6868             create_info->row_type = ROW_TYPE_DEFAULT;
6869         }
6870     }
6871 #endif  // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
6872         // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6873 }
6874 
6875 //
6876 // removes key name from status.tokudb.
// needed when we are dropping indexes, so that
// during drop table we do not attempt to remove already-dropped
// indexes; this keeps status.tokudb in sync with the list of indexes.
6880 //
6881 int ha_tokudb::remove_key_name_from_status(DB* status_block, char* key_name, DB_TXN* txn) {
6882     int error;
6883     uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6884     HA_METADATA_KEY md_key = hatoku_key_name;
6885     memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
    //
    // build the key for the index name entry in status.tokudb
    //
6889     memcpy(
6890         status_key_info + sizeof(HA_METADATA_KEY),
6891         key_name,
6892         strlen(key_name) + 1
6893         );
6894     error = remove_metadata(
6895         status_block,
6896         status_key_info,
6897         sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6898         txn
6899         );
6900     return error;
6901 }
6902 
6903 //
6904 // writes the key name in status.tokudb, so that we may later delete or rename
6905 // the dictionary associated with key_name
6906 //
6907 int ha_tokudb::write_key_name_to_status(DB* status_block, char* key_name, DB_TXN* txn) {
6908     int error;
6909     uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6910     HA_METADATA_KEY md_key = hatoku_key_name;
6911     memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6912     //
6913     // put index name in status.tokudb
6914     //
6915     memcpy(
6916         status_key_info + sizeof(HA_METADATA_KEY),
6917         key_name,
6918         strlen(key_name) + 1
6919         );
6920     error = write_metadata(
6921         status_block,
6922         status_key_info,
6923         sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6924         NULL,
6925         0,
6926         txn
6927         );
6928     return error;
6929 }
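
//
// Layout of the status.tokudb key built by the two functions above (sketch):
//
//   [ HA_METADATA_KEY = hatoku_key_name ][ key_name bytes ][ '\0' ]
//
// write_key_name_to_status() and remove_key_name_from_status() construct the
// identical buffer, so a later DROP INDEX or DROP TABLE can locate and
// delete exactly the entry that was written here.
//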
6930 
6931 //
6932 // some tracing moved out of ha_tokudb::create, because ::create was
6933 // getting cluttered
6934 //
6935 void ha_tokudb::trace_create_table_info(TABLE* form) {
6936     uint i;
6937     //
6938     // tracing information about what type of table we are creating
6939     //
6940     if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_OPEN))) {
6941         for (i = 0; i < form->s->fields; i++) {
6942             Field *field = form->s->field[i];
6943             TOKUDB_HANDLER_TRACE(
6944                 "field:%d:%s:type=%d:flags=%x",
6945                 i,
6946                 field->field_name,
6947                 field->type(),
6948                 field->flags);
6949         }
6950         for (i = 0; i < form->s->keys; i++) {
6951             KEY *key = &form->s->key_info[i];
6952             TOKUDB_HANDLER_TRACE(
6953                 "key:%d:%s:%d",
6954                 i,
6955                 key->name,
6956                 key->user_defined_key_parts);
6957             uint p;
6958             for (p = 0; p < key->user_defined_key_parts; p++) {
6959                 KEY_PART_INFO* key_part = &key->key_part[p];
6960                 Field* field = key_part->field;
6961                 TOKUDB_HANDLER_TRACE(
6962                     "key:%d:%d:length=%d:%s:type=%d:flags=%x",
6963                     i,
6964                     p,
6965                     key_part->length,
6966                     field->field_name,
6967                     field->type(),
6968                     field->flags);
6969             }
6970         }
6971     }
6972 }
6973 
6974 static uint32_t get_max_desc_size(KEY_AND_COL_INFO* kc_info, TABLE* form) {
6975     uint32_t max_row_desc_buff_size;
6976     // upper bound of key comparison descriptor
6977     max_row_desc_buff_size = 2*(form->s->fields * 6)+10;
6978     // upper bound for sec. key part
6979     max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info);
6980     // upper bound for clustering val part
6981     max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s);
6982     return max_row_desc_buff_size;
6983 }
6984 
6985 static uint32_t create_secondary_key_descriptor(
6986     uchar* buf,
6987     KEY* key_info,
6988     KEY* prim_key,
6989     uint hpk,
6990     TABLE* form,
6991     uint primary_key,
6992     uint32_t keynr,
6993     KEY_AND_COL_INFO* kc_info) {
6994 
6995     uchar* ptr = NULL;
6996 
6997     ptr = buf;
6998     ptr += create_toku_key_descriptor(
6999         ptr,
7000         false,
7001         key_info,
7002         hpk,
7003         prim_key
7004         );
7005 
7006     ptr += create_toku_secondary_key_pack_descriptor(
7007         ptr,
7008         hpk,
7009         primary_key,
7010         form->s,
7011         form,
7012         kc_info,
7013         key_info,
7014         prim_key
7015         );
7016 
7017     ptr += create_toku_clustering_val_pack_descriptor(
7018         ptr,
7019         primary_key,
7020         form->s,
7021         kc_info,
7022         keynr,
7023         key_is_clustering(key_info)
7024         );
7025     return ptr - buf;
7026 }
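
//
// Resulting descriptor layout (sketch): three variable-length sections are
// appended to 'buf', and the returned byte count becomes row_descriptor.size
// in create_secondary_dictionary() below:
//
//   [ key comparison desc ][ secondary key pack desc ][ clustering val pack desc ]
//
// create_main_key_descriptor() later in this file follows the same pattern,
// with a main-key pack descriptor as the middle section.
//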
7027 
7028 
7029 //
7030 // creates dictionary for secondary index, with key description key_info, all using txn
7031 //
7032 int ha_tokudb::create_secondary_dictionary(
7033     const char* name,
7034     TABLE* form,
7035     KEY* key_info,
7036     DB_TXN* txn,
7037     KEY_AND_COL_INFO* kc_info,
7038     uint32_t keynr,
7039     bool is_hot_index,
7040     toku_compression_method compression_method) {
7041 
7042     int error;
7043     DBT row_descriptor;
7044     uchar* row_desc_buff = NULL;
7045     char* newname = NULL;
7046     size_t newname_len = 0;
7047     KEY* prim_key = NULL;
7048     char dict_name[MAX_DICT_NAME_LEN];
7049     uint32_t max_row_desc_buff_size;
7050     uint hpk= (form->s->primary_key >= MAX_KEY) ?
7051         TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7052     uint32_t block_size;
7053     uint32_t read_block_size;
7054     uint32_t fanout;
7055     THD* thd = ha_thd();
7056 
7057     memset(&row_descriptor, 0, sizeof(row_descriptor));
7058 
7059     max_row_desc_buff_size = get_max_desc_size(kc_info,form);
7060 
7061     row_desc_buff = (uchar*)tokudb::memory::malloc(
7062         max_row_desc_buff_size,
7063         MYF(MY_WME));
7064     if (row_desc_buff == NULL) {
7065         error = ENOMEM;
7066         goto cleanup;
7067     }
7068 
7069     newname_len = get_max_dict_name_path_length(name);
7070     newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7071     if (newname == NULL) {
7072         error = ENOMEM;
7073         goto cleanup;
7074     }
7075 
7076     sprintf(dict_name, "key-%s", key_info->name);
7077     make_name(newname, newname_len, name, dict_name);
7078 
7079     prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
7080 
7081     //
7082     // setup the row descriptor
7083     //
7084     row_descriptor.data = row_desc_buff;
7085     //
7086     // save data necessary for key comparisons
7087     //
7088     row_descriptor.size = create_secondary_key_descriptor(
7089         row_desc_buff,
7090         key_info,
7091         prim_key,
7092         hpk,
7093         form,
7094         primary_key,
7095         keynr,
7096         kc_info);
7097     assert_always(row_descriptor.size <= max_row_desc_buff_size);
7098 
7099     block_size = tokudb::sysvars::block_size(thd);
7100     read_block_size = tokudb::sysvars::read_block_size(thd);
7101     fanout = tokudb::sysvars::fanout(thd);
7102 
7103     error = create_sub_table(
7104         newname,
7105         &row_descriptor,
7106         txn,
7107         block_size,
7108         read_block_size,
7109         compression_method,
7110         is_hot_index,
7111         fanout);
7112 cleanup:
7113     tokudb::memory::free(newname);
7114     tokudb::memory::free(row_desc_buff);
7115     return error;
7116 }
7117 
7118 
7119 static uint32_t create_main_key_descriptor(
7120     uchar* buf,
7121     KEY* prim_key,
7122     uint hpk,
7123     uint primary_key,
7124     TABLE* form,
7125     KEY_AND_COL_INFO* kc_info) {
7126 
7127     uchar* ptr = buf;
7128     ptr += create_toku_key_descriptor(
7129         ptr,
7130         hpk,
7131         prim_key,
7132         false,
7133         NULL);
7134 
7135     ptr += create_toku_main_key_pack_descriptor(ptr);
7136 
7137     ptr += create_toku_clustering_val_pack_descriptor(
7138         ptr,
7139         primary_key,
7140         form->s,
7141         kc_info,
7142         primary_key,
7143         false);
7144     return ptr - buf;
7145 }
7146 
7147 //
// create and close the main dictionary with name of "name" using table form, all within
7149 // transaction txn.
7150 //
7151 int ha_tokudb::create_main_dictionary(
7152     const char* name,
7153     TABLE* form,
7154     DB_TXN* txn,
7155     KEY_AND_COL_INFO* kc_info,
7156     toku_compression_method compression_method) {
7157 
7158     int error;
7159     DBT row_descriptor;
7160     uchar* row_desc_buff = NULL;
7161     char* newname = NULL;
7162     size_t newname_len = 0;
7163     KEY* prim_key = NULL;
7164     uint32_t max_row_desc_buff_size;
7165     uint hpk = (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7166     uint32_t block_size;
7167     uint32_t read_block_size;
7168     uint32_t fanout;
7169     THD* thd = ha_thd();
7170 
7171     memset(&row_descriptor, 0, sizeof(row_descriptor));
7172     max_row_desc_buff_size = get_max_desc_size(kc_info, form);
7173 
7174     row_desc_buff = (uchar*)tokudb::memory::malloc(
7175         max_row_desc_buff_size,
7176         MYF(MY_WME));
7177     if (row_desc_buff == NULL) {
7178         error = ENOMEM;
7179         goto cleanup;
7180     }
7181 
7182     newname_len = get_max_dict_name_path_length(name);
7183     newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7184     if (newname == NULL) {
7185         error = ENOMEM;
7186         goto cleanup;
7187     }
7188 
7189     make_name(newname, newname_len, name, "main");
7190 
7191     prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];
7192 
7193     //
7194     // setup the row descriptor
7195     //
7196     row_descriptor.data = row_desc_buff;
7197     //
7198     // save data necessary for key comparisons
7199     //
7200     row_descriptor.size = create_main_key_descriptor(
7201         row_desc_buff,
7202         prim_key,
7203         hpk,
7204         primary_key,
7205         form,
7206         kc_info);
7207     assert_always(row_descriptor.size <= max_row_desc_buff_size);
7208 
7209     block_size = tokudb::sysvars::block_size(thd);
7210     read_block_size = tokudb::sysvars::read_block_size(thd);
7211     fanout = tokudb::sysvars::fanout(thd);
7212 
7213     /* Create the main table that will hold the real rows */
7214     error = create_sub_table(
7215         newname,
7216         &row_descriptor,
7217         txn,
7218         block_size,
7219         read_block_size,
7220         compression_method,
7221         false,
7222         fanout);
7223 cleanup:
7224     tokudb::memory::free(newname);
7225     tokudb::memory::free(row_desc_buff);
7226     return error;
7227 }
7228 
7229 //
7230 // Creates a new table
7231 // Parameters:
7232 //      [in]    name - table name
7233 //      [in]    form - info on table, columns and indexes
//      [in]    create_info - more info on table (row format, auto-increment value, etc.)
7235 // Returns:
7236 //      0 on success
7237 //      error otherwise
7238 //
7239 int ha_tokudb::create(
7240     const char* name,
7241     TABLE* form,
7242     HA_CREATE_INFO* create_info) {
7243 
7244     TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7245 
7246     int error;
7247     DB *status_block = NULL;
7248     uint version;
7249     uint capabilities;
7250     DB_TXN* txn = NULL;
7251     bool do_commit = false;
7252     char* newname = NULL;
7253     size_t newname_len = 0;
7254     KEY_AND_COL_INFO kc_info;
7255     tokudb_trx_data *trx = NULL;
7256     THD* thd = ha_thd();
7257 
7258     String database_name, table_name, dictionary_name;
7259     tokudb_split_dname(name, database_name, table_name, dictionary_name);
7260     if (database_name.is_empty() || table_name.is_empty()) {
7261         push_warning_printf(thd,
7262                             Sql_condition::WARN_LEVEL_WARN,
7263                             ER_TABLE_NAME,
7264                             "TokuDB: Table Name or Database Name is empty");
7265         DBUG_RETURN(ER_TABLE_NAME);
7266     }
7267 
7268     memset(&kc_info, 0, sizeof(kc_info));
7269 
7270 #if 100000 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 100999
7271     // TokuDB does not support discover_table_names() and writes no files
7272     // in the database directory, so automatic filename-based
    // discover_table_names() doesn't work either. So, we must force the .frm
    // file to disk.
7275     form->s->write_frm_image();
7276 #endif
7277 
7278 #if defined(TOKU_INCLUDE_OPTION_STRUCTS) && TOKU_INCLUDE_OPTION_STRUCTS
7279     const tokudb::sysvars::format_t row_format =
7280         (tokudb::sysvars::row_format_t)form->s->option_struct->row_format;
7281 #else
7282     // TDB-76 : CREATE TABLE ... LIKE ... does not use source row_format on
7283     //          target table
7284     // Original code would only use create_info->row_type if
7285     // create_info->used_fields & HA_CREATE_USED_ROW_FORMAT was true. This
7286     // would cause us to skip transferring the row_format for a table created
7287     // via CREATE TABLE tn LIKE tn. We also take on more InnoDB like behavior
7288     // and throw a warning if we get a row_format that we can't translate into
7289     // a known TokuDB row_format.
7290     tokudb::sysvars::row_format_t row_format =
7291         tokudb::sysvars::row_format(thd);
7292 
7293     if ((create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) ||
7294         create_info->row_type != ROW_TYPE_DEFAULT) {
7295         row_format = row_type_to_row_format(create_info->row_type);
7296         if (row_format == tokudb::sysvars::SRV_ROW_FORMAT_DEFAULT &&
7297             create_info->row_type != ROW_TYPE_DEFAULT) {
7298             push_warning(thd,
7299                          Sql_condition::WARN_LEVEL_WARN,
7300                          ER_ILLEGAL_HA_CREATE_OPTION,
7301                          "TokuDB: invalid ROW_FORMAT specifier.");
7302         }
7303     }
7304 #endif  // defined(TOKU_INCLUDE_OPTION_STRUCTS) && TOKU_INCLUDE_OPTION_STRUCTS
7305     const toku_compression_method compression_method =
7306         row_format_to_toku_compression_method(row_format);
7307 
7308     bool create_from_engine = (create_info->table_options & HA_OPTION_CREATE_FROM_ENGINE);
7309     if (create_from_engine) {
7310         // table already exists, nothing to do
7311         error = 0;
7312         goto cleanup;
7313     }
7314 
7315     // validate the fields in the table. If the table has fields
7316     // we do not support that came from an old version of MySQL,
7317     // gracefully return an error
7318     for (uint32_t i = 0; i < form->s->fields; i++) {
7319         Field* field = table_share->field[i];
7320         if (!field_valid_for_tokudb_table(field)) {
7321             sql_print_error("Table %s has an invalid field %s, that was created "
7322                 "with an old version of MySQL. This field is no longer supported. "
7323                 "This is probably due to an alter table engine=TokuDB. To load this "
7324                 "table, do a dump and load",
7325                 name,
7326                 field->field_name
7327                 );
7328             error = HA_ERR_UNSUPPORTED;
7329             goto cleanup;
7330         }
7331     }
7332 
7333     newname_len = get_max_dict_name_path_length(name);
7334     newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
7335     if (newname == NULL) {
7336         error = ENOMEM;
7337         goto cleanup;
7338     }
7339 
7340     trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
7341     if (trx && trx->sub_sp_level &&
7342         thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
7343         txn = trx->sub_sp_level;
7344     } else {
7345         do_commit = true;
7346         error = txn_begin(db_env, 0, &txn, 0, thd);
7347         if (error) {
7348             goto cleanup;
7349         }
7350     }
7351 
7352     primary_key = form->s->primary_key;
7353     hidden_primary_key = (primary_key  >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
7354     if (hidden_primary_key) {
7355         primary_key = form->s->keys;
7356     }
7357 
7358     /* do some tracing */
7359     trace_create_table_info(form);
7360 
7361     /* Create status.tokudb and save relevant metadata */
7362     make_name(newname, newname_len, name, "status");
7363 
7364     error = tokudb::metadata::create(db_env, &status_block, newname, txn);
7365     if (error) { goto cleanup; }
7366 
7367     version = HA_TOKU_VERSION;
7368     error = write_to_status(
7369         status_block,
7370         hatoku_new_version,
7371         &version,
7372         sizeof(version),
7373         txn);
7374     if (error) {
7375         goto cleanup;
7376     }
7377 
7378     capabilities = HA_TOKU_CAP;
7379     error = write_to_status(
7380         status_block,
7381         hatoku_capabilities,
7382         &capabilities,
7383         sizeof(capabilities),
7384         txn);
7385     if (error) {
7386         goto cleanup;
7387     }
7388 
7389     error = write_auto_inc_create(
7390         status_block,
7391         create_info->auto_increment_value,
7392         txn);
7393     if (error) {
7394         goto cleanup;
7395     }
7396 
7397 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
7398 #if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
7399     if (TOKU_PARTITION_WRITE_FRM_DATA || form->part_info == NULL) {
7400         error = write_frm_data(status_block, txn, form->s->path.str);
7401         if (error) {
7402             goto cleanup;
7403         }
7404     }
7405 #else
7406     error = write_frm_data(status_block, txn, form->s->path.str);
7407     if (error) {
7408         goto cleanup;
7409     }
7410 #endif  // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
7411 #endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
7412 
7413     error = allocate_key_and_col_info(form->s, &kc_info);
7414     if (error) {
7415         goto cleanup;
7416     }
7417 
7418     error = initialize_key_and_col_info(
7419         form->s,
7420         form,
7421         &kc_info,
7422         hidden_primary_key,
7423         primary_key);
7424     if (error) {
7425         goto cleanup;
7426     }
7427 
7428     error = create_main_dictionary(
7429         name,
7430         form,
7431         txn,
7432         &kc_info,
7433         compression_method);
7434     if (error) {
7435         goto cleanup;
7436     }
7437 
7438 
7439     for (uint i = 0; i < form->s->keys; i++) {
7440         if (i != primary_key) {
7441             error = create_secondary_dictionary(
7442                 name,
7443                 form,
7444                 &form->key_info[i],
7445                 txn,
7446                 &kc_info,
7447                 i,
7448                 false,
7449                 compression_method);
7450             if (error) {
7451                 goto cleanup;
7452             }
7453 
7454             error = write_key_name_to_status(
7455                 status_block,
7456                 form->s->key_info[i].name,
7457                 txn);
7458             if (error) {
7459                 goto cleanup;
7460             }
7461         }
7462     }
7463 
7464     error = 0;
7465 cleanup:
7466     if (status_block != NULL) {
7467         int r = tokudb::metadata::close(&status_block);
7468         assert_always(r==0);
7469     }
7470     free_key_and_col_info(&kc_info);
7471     if (do_commit && txn) {
7472         if (error) {
7473             abort_txn(txn);
7474         } else {
7475             commit_txn(txn,0);
7476         }
7477     }
7478     tokudb::memory::free(newname);
7479     TOKUDB_HANDLER_DBUG_RETURN(error);
7480 }
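
//
// Dictionaries created by ha_tokudb::create() for one table (summary): a
// table with k secondary indexes ends up with k + 2 fractal-tree
// dictionaries in the environment:
//
//   status   - metadata (version, capabilities, auto_inc, frm data, key names)
//   main     - primary key -> full row
//   key-<n>  - one per secondary index <n>
//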
7481 
7482 int ha_tokudb::discard_or_import_tablespace(TOKUDB_UNUSED(my_bool discard)) {
7483     /*
7484     if (discard) {
7485         my_errno=HA_ERR_WRONG_COMMAND;
7486         return my_errno;
7487     }
7488     return add_table_to_metadata(share->table_name);
7489     */
7490     my_errno=HA_ERR_WRONG_COMMAND;
7491     return my_errno;
7492 }
7493 
7494 
7495 //
7496 // deletes from_name or renames from_name to to_name, all using transaction txn.
7497 // is_delete specifies which we are doing
// is_key specifies whether it is a secondary index (and hence a "key-" prefix
// needs to be prepended to the dictionary name) or not
7500 //
7501 int ha_tokudb::delete_or_rename_dictionary(
7502     const char* from_name,
7503     const char* to_name,
7504     const char* secondary_name,
7505     bool is_key,
7506     DB_TXN* txn,
7507     bool is_delete) {
7508 
7509     int error;
7510     char dict_name[MAX_DICT_NAME_LEN];
7511     char* new_from_name = NULL;
7512     size_t new_from_name_len = 0;
7513     char* new_to_name = NULL;
7514     size_t new_to_name_len = 0;
7515     assert_always(txn);
7516 
7517     new_from_name_len = get_max_dict_name_path_length(from_name);
7518     new_from_name = (char*)tokudb::memory::malloc(
7519         new_from_name_len,
7520         MYF(MY_WME));
7521     if (new_from_name == NULL) {
7522         error = ENOMEM;
7523         goto cleanup;
7524     }
7525     if (!is_delete) {
7526         assert_always(to_name);
7527         new_to_name_len = get_max_dict_name_path_length(to_name);
7528         new_to_name = (char*)tokudb::memory::malloc(
7529             new_to_name_len,
7530             MYF(MY_WME));
7531         if (new_to_name == NULL) {
7532             error = ENOMEM;
7533             goto cleanup;
7534         }
7535     }
7536 
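    // Build the full dictionary path(s). Secondary indexes live in
    // dictionaries named "key-<index name>"; the main and status
    // dictionaries use the literal secondary_name passed in ("main",
    // "status").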
7537     if (is_key) {
7538         sprintf(dict_name, "key-%s", secondary_name);
7539         make_name(new_from_name, new_from_name_len, from_name, dict_name);
7540     } else {
7541         make_name(new_from_name, new_from_name_len, from_name, secondary_name);
7542     }
7543     if (!is_delete) {
7544         if (is_key) {
7545             sprintf(dict_name, "key-%s", secondary_name);
7546             make_name(new_to_name, new_to_name_len, to_name, dict_name);
7547         } else {
7548             make_name(new_to_name, new_to_name_len, to_name, secondary_name);
7549         }
7550     }
7551 
7552     if (is_delete) {
7553         error = db_env->dbremove(db_env, txn, new_from_name, NULL, 0);
7554     } else {
7555         error = db_env->dbrename(
7556             db_env,
7557             txn,
7558             new_from_name,
7559             NULL,
7560             new_to_name,
7561             0);
7562     }
7563     if (error) {
7564         goto cleanup;
7565     }
7566 
7567 cleanup:
7568     tokudb::memory::free(new_from_name);
7569     tokudb::memory::free(new_to_name);
7570     return error;
7571 }
7572 
7573 
//
// Deletes or renames a table. If is_delete is true, we delete the table and
// to_name may be NULL; if is_delete is false, to_name must be non-NULL, as
// we are renaming the table.
//
7578 int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete) {
7579     THD *thd = ha_thd();
7580     int error;
7581     DB* status_db = NULL;
7582     DBC* status_cursor = NULL;
7583     DB_TXN* txn = NULL;
7584     DBT curr_key;
7585     DBT curr_val;
7586     memset(&curr_key, 0, sizeof(curr_key));
7587     memset(&curr_val, 0, sizeof(curr_val));
7588 
7589     DB_TXN *parent_txn = NULL;
7590     tokudb_trx_data *trx = NULL;
7591     trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
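    // when this runs as part of a CREATE TABLE statement, nest under the
    // statement's existing transaction instead of starting an independent one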
7592     if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
7593         parent_txn = trx->sub_sp_level;
7594     }
7595 
7596     error = txn_begin(db_env, parent_txn, &txn, 0, thd);
7597     if (error) { goto cleanup; }
7598 
7599     //
7600     // open status db,
7601     // create cursor,
7602     // for each name read out of there, create a db and delete or rename it
7603     //
7604     error = open_status_dictionary(&status_db, from_name, txn);
7605     if (error) { goto cleanup; }
7606 
7607     error = status_db->cursor(status_db, txn, &status_cursor, 0);
7608     if (error) { goto cleanup; }
7609     status_cursor->c_set_check_interrupt_callback(status_cursor, tokudb_killed_thd_callback, thd);
7610 
7611     while (error != DB_NOTFOUND) {
7612         error = status_cursor->c_get(status_cursor, &curr_key, &curr_val, DB_NEXT);
7613         if (error && error != DB_NOTFOUND) {
7614             error = map_to_handler_error(error);
7615             goto cleanup;
7616         }
7617         if (error == DB_NOTFOUND) {
7618             break;
7619         }
7620         HA_METADATA_KEY mk = *(HA_METADATA_KEY *)curr_key.data;
7621         if (mk != hatoku_key_name) {
7622             continue;
7623         }
7624         error = delete_or_rename_dictionary(from_name, to_name, (char *)((char *)curr_key.data + sizeof(HA_METADATA_KEY)), true, txn, is_delete);
7625         if (error) { goto cleanup; }
7626     }
7627 
7628     //
7629     // delete or rename main.tokudb
7630     //
7631     error = delete_or_rename_dictionary(from_name, to_name, "main", false, txn, is_delete);
7632     if (error) { goto cleanup; }
7633 
7634     error = status_cursor->c_close(status_cursor);
7635     assert_always(error==0);
7636     status_cursor = NULL;
7637     if (error) { goto cleanup; }
7638 
7639     error = status_db->close(status_db, 0);
7640     assert_always(error == 0);
7641     status_db = NULL;
7642 
7643     //
7644     // delete or rename status.tokudb
7645     //
7646     error = delete_or_rename_dictionary(from_name, to_name, "status", false, txn, is_delete);
7647     if (error) { goto cleanup; }
7648 
7649     my_errno = error;
7650 cleanup:
7651     if (status_cursor) {
7652         int r = status_cursor->c_close(status_cursor);
7653         assert_always(r==0);
7654     }
7655     if (status_db) {
7656         int r = status_db->close(status_db, 0);
7657         assert_always(r==0);
7658     }
7659     if (txn) {
7660         if (error) {
7661             abort_txn(txn);
7662         }
7663         else {
7664             commit_txn(txn, 0);
7665         }
7666     }
7667     return error;
7668 }
7669 
7670 
7671 //
7672 // Drops table
7673 // Parameters:
7674 //      [in]    name - name of table to be deleted
7675 // Returns:
7676 //      0 on success
7677 //      error otherwise
7678 //
7679 int ha_tokudb::delete_table(const char *name) {
7680     TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7681     TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(name, NULL, false);
7682     if (share) {
7683         share->unlock();
7684         share->release();
7685         // this should be enough to handle locking as the higher level MDL
7686         // on this table should prevent any new analyze tasks.
7687         share->cancel_background_jobs();
7688         TOKUDB_SHARE::drop_share(share);
7689     }
7690 
7691     int error;
7692     error = delete_or_rename_table(name, NULL, true);
7693     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7694         error == DB_LOCK_NOTGRANTED) {
7695         sql_print_error(
7696             "Could not delete table %s because another transaction has "
7697             "accessed the table. To drop the table, make sure no "
7698             "transactions touch the table.",
7699             name);
7700     }
7701     TOKUDB_HANDLER_DBUG_RETURN(error);
7702 }
7703 
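//
// Extracts the database name from a table path of the form
// "./<db_name>/<table_name>" (the component between the first and last
// path separators) and returns true if that database directory exists.
//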
7704 static bool tokudb_check_db_dir_exist_from_table_name(const char *table_name) {
7705     DBUG_ASSERT(table_name);
7706     bool mysql_dir_exists;
7707     char db_name[FN_REFLEN];
7708     const char *db_name_begin = strchr(table_name, FN_LIBCHAR);
7709     const char *db_name_end = strrchr(table_name, FN_LIBCHAR);
7710     DBUG_ASSERT(db_name_begin);
7711     DBUG_ASSERT(db_name_end);
7712     DBUG_ASSERT(db_name_begin != db_name_end);
7713 
7714     ++db_name_begin;
7715     size_t db_name_size = db_name_end - db_name_begin;
7716 
7717     DBUG_ASSERT(db_name_size < FN_REFLEN);
7718 
7719     memcpy(db_name, db_name_begin, db_name_size);
7720     db_name[db_name_size] = '\0';
7721 
7722     // At this point, db_name contains the MySQL formatted database name.
7723     // This is exactly the same format that would come into us through a
    // CREATE TABLE. Some characters (':', for example) might be expanded
    // into hex (':' would appear as "@003a").
7726     // We need to check that the MySQL destination database directory exists.
7727     mysql_dir_exists = (my_access(db_name, F_OK) == 0);
7728 
7729     return mysql_dir_exists;
7730 }
7731 
7732 //
7733 // renames table from "from" to "to"
7734 // Parameters:
//      [in]    from - old name of table
7736 //      [in]    to - new name of table
7737 // Returns:
7738 //      0 on success
7739 //      error otherwise
7740 //
7741 int ha_tokudb::rename_table(const char *from, const char *to) {
7742     TOKUDB_HANDLER_DBUG_ENTER("%s %s", from, to);
7743     TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(from, NULL, false);
7744     if (share) {
7745         share->unlock();
7746         share->release();
7747         // this should be enough to handle locking as the higher level MDL
7748         // on this table should prevent any new analyze tasks.
7749         share->cancel_background_jobs();
7750         TOKUDB_SHARE::drop_share(share);
7751     }
7752     int error;
7753     bool to_db_dir_exist = tokudb_check_db_dir_exist_from_table_name(to);
7754     if (!to_db_dir_exist) {
7755         sql_print_error(
7756             "Could not rename table from %s to %s because "
7757             "destination db does not exist",
7758             from,
7759             to);
7760         error = HA_ERR_DEST_SCHEMA_NOT_EXIST;
7761     }
7762     else {
7763         error = delete_or_rename_table(from, to, false);
7764         if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7765             error == DB_LOCK_NOTGRANTED) {
7766             sql_print_error(
7767                 "Could not rename table from %s to %s because another transaction "
7768                 "has accessed the table. To rename the table, make sure no "
7769                 "transactions touch the table.",
7770                 from,
7771                 to);
7772         }
7773     }
7774     TOKUDB_HANDLER_DBUG_RETURN(error);
7775 }
7776 
7777 
7778 /*
7779   Returns estimate on number of seeks it will take to read through the table
7780   This is to be comparable to the number returned by records_in_range so
7781   that we can decide if we should scan the table or use keys.
7782 */
// QQQ: why divide by 3?
7784 double ha_tokudb::scan_time() {
7785     TOKUDB_HANDLER_DBUG_ENTER("");
7786     double ret_val = (double)stats.records / 3;
7787     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7788         TOKUDB_DEBUG_RETURN,
7789         "return %" PRIu64 " %f",
7790         (uint64_t)stats.records,
7791         ret_val);
7792     DBUG_RETURN(ret_val);
7793 }
7794 
7795 double ha_tokudb::keyread_time(uint index, uint ranges, ha_rows rows)
7796 {
7797     TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7798     double ret_val;
7799     if (index == primary_key || key_is_clustering(&table->key_info[index])) {
7800         ret_val = read_time(index, ranges, rows);
7801         DBUG_RETURN(ret_val);
7802     }
7803     /*
      It is assumed that we will read through the whole key range and that all
7805       key blocks are half full (normally things are much better). It is also
7806       assumed that each time we read the next key from the index, the handler
7807       performs a random seek, thus the cost is proportional to the number of
7808       blocks read. This model does not take into account clustered indexes -
      engines that support that (e.g. InnoDB) may want to override this method.
7810     */
7811     double keys_per_block= (stats.block_size/2.0/
7812                             (table->key_info[index].key_length +
7813                              ref_length) + 1);
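    // Illustrative numbers (not from the source): with a 16 KiB block size
    // and key_length + ref_length == 100 bytes, keys_per_block is about
    // 16384/2/100 + 1 ~= 82.9, so reading 1000 keys is estimated at
    // (1000 + 82.9 - 1)/82.9 ~= 13 block reads.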
7814     ret_val = (rows + keys_per_block - 1)/ keys_per_block;
7815     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7816 }
7817 
7818 //
7819 // Calculate the time it takes to read a set of ranges through an index
7820 // This enables us to optimize reads for clustered indexes.
7821 // Implementation pulled from InnoDB
7822 // Parameters:
7823 //          index - index to use
7824 //          ranges - number of ranges
7825 //          rows - estimated number of rows in the range
7826 // Returns:
7827 //      estimated time measured in disk seeks
7828 //
7829 double ha_tokudb::read_time(
7830     uint    index,
7831     uint    ranges,
7832     ha_rows rows
7833     )
7834 {
7835     TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7836     double total_scan;
7837     double ret_val;
7838     bool is_primary = (index == primary_key);
7839     bool is_clustering;
7840 
7841     //
    // this is called with index >= table_share->keys for the hidden primary key
7843     //
7844     if (index >= table_share->keys) {
7845         ret_val = handler::read_time(index, ranges, rows);
7846         goto cleanup;
7847     }
7848 
7849     is_clustering = key_is_clustering(&table->key_info[index]);
7850 
7851 
7852     //
7853     // if it is not the primary key, and it is not a clustering key, then return handler::read_time
7854     //
7855     if (!(is_primary || is_clustering)) {
7856         ret_val = handler::read_time(index, ranges, rows);
7857         goto cleanup;
7858     }
7859 
7860     //
7861     // for primary key and for clustered keys, return a fraction of scan_time()
7862     //
7863     total_scan = scan_time();
7864 
7865     if (stats.records < rows) {
7866         ret_val = is_clustering ? total_scan + 0.00001 : total_scan;
7867         goto cleanup;
7868     }
7869 
7870     //
7871     // one disk seek per range plus the proportional scan time of the rows
7872     //
7873     ret_val = (ranges + (double) rows / (double) stats.records * total_scan);
7874     ret_val = is_clustering ? ret_val + 0.00001 : ret_val;
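    // For example (illustrative): 4 ranges covering 10% of the rows cost
    // 4 + 0.1 * total_scan, plus the 0.00001 tie-breaker for clustering keys.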
7875 
7876 cleanup:
7877     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7878 }
7879 
7880 double ha_tokudb::index_only_read_time(uint keynr, double records) {
7881     TOKUDB_HANDLER_DBUG_ENTER("%u %f", keynr, records);
7882     double ret_val = keyread_time(keynr, 1, (ha_rows)records);
7883     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7884 }
7885 
7886 //
// Estimates the number of index records in a range. In case of errors, we
//   return HA_TOKUDB_RANGE_COUNT instead of HA_POS_ERROR; this was the
//   behavior when we got the handlerton from MySQL.
// Parameters:
//              keynr - index to use
7892 //      [in]    start_key - low end of the range
7893 //      [in]    end_key - high end of the range
7894 // Returns:
7895 //      0 - There are no matching keys in the given range
7896 //      number > 0 - There are approximately number matching rows in the range
7897 //      HA_POS_ERROR - Something is wrong with the index tree
7898 //
7899 ha_rows ha_tokudb::records_in_range(uint keynr, key_range* start_key, key_range* end_key) {
    TOKUDB_HANDLER_DBUG_ENTER("%u %p %p", keynr, start_key, end_key);
7901     DBT *pleft_key, *pright_key;
7902     DBT left_key, right_key;
7903     ha_rows ret_val = HA_TOKUDB_RANGE_COUNT;
7904     DB *kfile = share->key_file[keynr];
7905     uint64_t rows = 0;
7906     int error;
7907 
    // Estimate the number of rows in the range. If no bounds are given we
    // fall back to the overall row estimate; otherwise both bounds are
    // packed into DBTs and keys_range64 reports the estimated number of
    // rows between them in its `middle` output.
7913     if (!start_key && !end_key) {
7914         error = estimate_num_rows(share->file, &rows, transaction);
7915         if (error) {
7916             ret_val = HA_TOKUDB_RANGE_COUNT;
7917             goto cleanup;
7918         }
7919         ret_val = (rows <= 1) ? 1 : rows;
7920         goto cleanup;
7921     }
7922     if (start_key) {
7923         uchar inf_byte = (start_key->flag == HA_READ_KEY_EXACT) ? COL_NEG_INF : COL_POS_INF;
7924         pack_key(&left_key, keynr, key_buff, start_key->key, start_key->length, inf_byte);
7925         pleft_key = &left_key;
7926     } else {
7927         pleft_key = NULL;
7928     }
7929     if (end_key) {
7930         uchar inf_byte = (end_key->flag == HA_READ_BEFORE_KEY) ? COL_NEG_INF : COL_POS_INF;
7931         pack_key(&right_key, keynr, key_buff2, end_key->key, end_key->length, inf_byte);
7932         pright_key = &right_key;
7933     } else {
7934         pright_key = NULL;
7935     }
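    // The infinity bytes disambiguate prefix bounds: an inclusive start
    // (HA_READ_KEY_EXACT) is padded with COL_NEG_INF so rows matching the
    // prefix fall inside the range, while an exclusive end bound
    // (HA_READ_BEFORE_KEY) is padded with COL_NEG_INF to keep matching
    // rows out.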
    // keys_range64 cannot handle a degenerate range (left_key > right_key), so we filter that case here
7937     if (pleft_key && pright_key && tokudb_cmp_dbt_key(kfile, pleft_key, pright_key) > 0) {
7938         rows = 0;
7939     } else {
7940         uint64_t less, equal1, middle, equal2, greater;
7941         bool is_exact;
7942         error = kfile->keys_range64(kfile, transaction, pleft_key, pright_key,
7943                                     &less, &equal1, &middle, &equal2, &greater, &is_exact);
7944         if (error) {
7945             ret_val = HA_TOKUDB_RANGE_COUNT;
7946             goto cleanup;
7947         }
7948         rows = middle;
7949     }
7950 
7951     // MySQL thinks a return value of 0 means there are exactly 0 rows
7952     // Therefore, always return non-zero so this assumption is not made
7953     ret_val = (ha_rows) (rows <= 1 ? 1 : rows);
7954 
7955 cleanup:
7956     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7957         TOKUDB_DEBUG_RETURN,
7958         "return %" PRIu64 " %" PRIu64,
7959         (uint64_t)ret_val,
7960         rows);
7961     DBUG_RETURN(ret_val);
7962 }
7963 
7964 
7965 //
7966 // Initializes the auto-increment data in the local "share" object to the
7967 // greater of two values: what's stored in the metadata or the last inserted
7968 // auto-increment field (if auto-increment field is the first field of a key).
7969 //
7970 void ha_tokudb::init_auto_increment() {
7971     int error;
7972     DB_TXN* txn = NULL;
7973 
7974     error = txn_begin(db_env, 0, &txn, 0, ha_thd());
7975     if (error) {
7976         share->last_auto_increment = 0;
7977     } else {
7978         HA_METADATA_KEY key_val;
7979         DBT key;
7980         memset(&key, 0, sizeof(key));
7981         key.data = &key_val;
7982         key.size = sizeof(key_val);
7983         DBT value;
7984         memset(&value, 0, sizeof(value));
7985         value.flags = DB_DBT_USERMEM;
7986 
7987         // Retrieve the initial auto increment value, as specified by create table
7988         // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
7989         // then the value 100 should be stored here
7990         key_val = hatoku_ai_create_value;
7991         value.ulen = sizeof(share->auto_inc_create_value);
7992         value.data = &share->auto_inc_create_value;
7993         error = share->status_block->get(share->status_block, txn, &key, &value, 0);
7994 
7995         if (error || value.size != sizeof(share->auto_inc_create_value)) {
7996             share->auto_inc_create_value = 0;
7997         }
7998 
7999         // Retrieve hatoku_max_ai, which is max value used by auto increment
8000         // column so far, the max value could have been auto generated (e.g. insert (NULL))
8001         // or it could have been manually inserted by user (e.g. insert (345))
8002         key_val = hatoku_max_ai;
8003         value.ulen = sizeof(share->last_auto_increment);
8004         value.data = &share->last_auto_increment;
8005         error = share->status_block->get(share->status_block, txn, &key, &value, 0);
8006 
8007         if (error || value.size != sizeof(share->last_auto_increment)) {
8008             if (share->auto_inc_create_value)
8009                 share->last_auto_increment = share->auto_inc_create_value - 1;
8010             else
8011                 share->last_auto_increment = 0;
8012         }
8013 
8014         commit_txn(txn, 0);
8015     }
8016     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
8017         TOKUDB_DEBUG_AUTO_INCREMENT,
8018         "init auto increment:%lld",
8019         share->last_auto_increment);
8020 }
8021 
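//
// Reserves nb_desired_values auto-increment values, spaced `increment`
// apart, starting from the larger of the CREATE TABLE auto_increment
// value and the next value after the last one handed out. On unsigned
// overflow the first value saturates at ULONGLONG_MAX and
// last_auto_increment is not advanced.
//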
8022 void ha_tokudb::get_auto_increment(
8023     ulonglong offset,
8024     ulonglong increment,
8025     ulonglong nb_desired_values,
8026     ulonglong* first_value,
8027     ulonglong* nb_reserved_values) {
8028 
8029     TOKUDB_HANDLER_DBUG_ENTER("");
8030     ulonglong nr;
8031     bool over;
8032 
8033     share->lock();
8034 
8035     if (share->auto_inc_create_value > share->last_auto_increment) {
8036         nr = share->auto_inc_create_value;
8037         over = false;
8038         share->last_auto_increment = share->auto_inc_create_value;
8039     } else {
8040         nr = share->last_auto_increment + increment;
8041         over = nr < share->last_auto_increment;
8042         if (over)
8043             nr = ULONGLONG_MAX;
8044     }
8045     if (!over) {
8046         share->last_auto_increment = nr + (nb_desired_values - 1)*increment;
8047         if (delay_updating_ai_metadata) {
8048             ai_metadata_update_required = true;
8049         } else {
8050             update_max_auto_inc(
8051                 share->status_block,
8052                 share->last_auto_increment);
8053         }
8054     }
8055     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
8056         TOKUDB_DEBUG_AUTO_INCREMENT,
8057         "get_auto_increment(%lld,%lld,%lld): got:%lld:%lld",
8058         offset,
8059         increment,
8060         nb_desired_values,
8061         nr,
8062         nb_desired_values);
8063     *first_value = nr;
8064     *nb_reserved_values = nb_desired_values;
8065     share->unlock();
8066     TOKUDB_HANDLER_DBUG_VOID_RETURN;
8067 }
8068 
8069 bool ha_tokudb::is_optimize_blocking() {
8070     return false;
8071 }
8072 
8073 bool ha_tokudb::is_auto_inc_singleton(){
8074     return false;
8075 }
8076 
8077 
// Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
//
8082 // Adds indexes to the table. Takes the array of KEY passed in key_info, and creates
8083 // DB's that will go at the end of share->key_file. THE IMPLICIT ASSUMPTION HERE is
8084 // that the table will be modified and that these added keys will be appended to the end
8085 // of the array table->key_info
8086 // Parameters:
8087 //      [in]    table_arg - table that is being modified, seems to be identical to this->table
8088 //      [in]    key_info - array of KEY's to be added
8089 //              num_of_keys - number of keys to be added, number of elements in key_info
8090 //  Returns:
8091 //      0 on success, error otherwise
8092 //
8093 int ha_tokudb::tokudb_add_index(
8094     TABLE* table_arg,
8095     KEY* key_info,
8096     uint num_of_keys,
8097     DB_TXN* txn,
8098     bool* inc_num_DBs,
8099     bool* modified_DBs) {
8100 
8101     TOKUDB_HANDLER_DBUG_ENTER("");
8102     assert_always(txn);
8103 
8104     int error;
8105     uint curr_index = 0;
8106     DBC* tmp_cursor = NULL;
8107     int cursor_ret_val = 0;
8108     DBT curr_pk_key, curr_pk_val;
8109     THD* thd = ha_thd();
8110     DB_LOADER* loader = NULL;
8111     DB_INDEXER* indexer = NULL;
8112     bool loader_save_space = tokudb::sysvars::load_save_space(thd);
8113     bool use_hot_index = (lock.type == TL_WRITE_ALLOW_WRITE);
8114     uint32_t loader_flags = loader_save_space ? LOADER_COMPRESS_INTERMEDIATES : 0;
8115     uint32_t indexer_flags = 0;
8116     uint32_t mult_db_flags[MAX_KEY + 1] = {0};
8117     uint32_t mult_put_flags[MAX_KEY + 1];
8118     uint32_t mult_dbt_flags[MAX_KEY + 1];
8119     bool creating_hot_index = false;
8120     struct loader_context lc;
8121     memset(&lc, 0, sizeof lc);
8122     lc.thd = thd;
8123     lc.ha = this;
8124     loader_error = 0;
8125     bool rw_lock_taken = false;
8126     *inc_num_DBs = false;
8127     *modified_DBs = false;
8128     invalidate_bulk_fetch();
8129     unpack_entire_row = true; // for bulk fetching rows
8130     for (uint32_t i = 0; i < MAX_KEY+1; i++) {
8131         mult_put_flags[i] = 0;
8132         mult_dbt_flags[i] = DB_DBT_REALLOC;
8133     }
8134     //
8135     // number of DB files we have open currently, before add_index is executed
8136     //
8137     uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8138 
8139     //
8140     // get the row type to use for the indexes we're adding
8141     //
8142     toku_compression_method compression_method =
8143         get_compression_method(share->file);
8144 
8145     //
8146     // status message to be shown in "show process list"
8147     //
8148     const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
    // MAX_ALIAS_NAME + 200 should be a good upper bound.
8150     char status_msg[MAX_ALIAS_NAME + 200];
8151     // variable that stores number of elements inserted thus far
8152     ulonglong num_processed = 0;
8153     thd_proc_info(thd, "Adding indexes");
8154 
    //
    // zero the DBTs used to pass primary key/value pairs to the loader
    //
8159     memset((void *) &curr_pk_key, 0, sizeof(curr_pk_key));
8160     memset((void *) &curr_pk_val, 0, sizeof(curr_pk_val));
8161 
    //
    // The files for secondary indexes are derived from the key names.
    // If we try to add a key with the same name as an existing key, we
    // can crash. So here we check whether any of the added keys has the
    // same name as an existing key, and if so, we fail gracefully.
    //
8168     for (uint i = 0; i < num_of_keys; i++) {
8169         for (uint j = 0; j < table_arg->s->keys; j++) {
8170             if (strcmp(key_info[i].name, table_arg->s->key_info[j].name) == 0) {
8171                 error = HA_ERR_WRONG_COMMAND;
8172                 goto cleanup;
8173             }
8174         }
8175     }
8176 
8177     rwlock_t_lock_write(share->_num_DBs_lock);
8178     rw_lock_taken = true;
8179     //
8180     // open all the DB files and set the appropriate variables in share
8181     // they go to the end of share->key_file
8182     //
8183     creating_hot_index =
8184         use_hot_index && num_of_keys == 1 &&
8185         (key_info[0].flags & HA_NOSAME) == 0;
8186     if (use_hot_index && (share->num_DBs > curr_num_DBs)) {
8187         //
8188         // already have hot index in progress, get out
8189         //
8190         error = HA_ERR_INTERNAL_ERROR;
8191         goto cleanup;
8192     }
8193     curr_index = curr_num_DBs;
8194     *modified_DBs = true;
8195     for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8196         if (key_is_clustering(&key_info[i])) {
8197             set_key_filter(
8198                 &share->kc_info.key_filters[curr_index],
8199                 &key_info[i],
8200                 table_arg,
8201                 false);
8202             if (!hidden_primary_key) {
8203                 set_key_filter(
8204                     &share->kc_info.key_filters[curr_index],
8205                     &table_arg->key_info[primary_key],
8206                     table_arg,
8207                     false);
8208             }
8209 
8210             error = initialize_col_pack_info(
8211                 &share->kc_info,
8212                 table_arg->s,
8213                 curr_index);
8214             if (error) {
8215                 goto cleanup;
8216             }
8217         }
8218 
8219 
8220         error = create_secondary_dictionary(
8221             share->full_table_name(),
8222             table_arg,
8223             &key_info[i],
8224             txn,
8225             &share->kc_info,
8226             curr_index,
8227             creating_hot_index,
8228             compression_method);
8229         if (error) {
8230             goto cleanup;
8231         }
8232 
8233         error = open_secondary_dictionary(
8234             &share->key_file[curr_index],
8235             &key_info[i],
8236             share->full_table_name(),
8237             false,
8238             txn);
8239         if (error) {
8240             goto cleanup;
8241         }
8242     }
8243 
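    // Two build strategies follow: a hot indexer that lets writes continue
    // (a single non-unique index under a TL_WRITE_ALLOW_WRITE lock), or a
    // bulk loader fed by a full scan of the primary dictionary.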
8244     if (creating_hot_index) {
8245         share->num_DBs++;
8246         *inc_num_DBs = true;
8247         error = db_env->create_indexer(
8248             db_env,
8249             txn,
8250             &indexer,
8251             share->file,
8252             num_of_keys,
8253             &share->key_file[curr_num_DBs],
8254             mult_db_flags,
8255             indexer_flags);
8256         if (error) {
8257             goto cleanup;
8258         }
8259 
8260         error = indexer->set_poll_function(
8261             indexer, ha_tokudb::tokudb_add_index_poll, &lc);
8262         if (error) {
8263             goto cleanup;
8264         }
8265 
8266         error = indexer->set_error_callback(
8267             indexer, ha_tokudb::loader_add_index_err, &lc);
8268         if (error) {
8269             goto cleanup;
8270         }
8271 
8272         share->_num_DBs_lock.unlock();
8273         rw_lock_taken = false;
8274 
8275 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8276         // initialize a one phase progress report.
8277         // incremental reports are done in the indexer's callback function.
8278         thd_progress_init(thd, 1);
8279 #endif
8280 
8281         error = indexer->build(indexer);
8282 
8283         if (error) {
8284             goto cleanup;
8285         }
8286 
8287         rwlock_t_lock_write(share->_num_DBs_lock);
8288         error = indexer->close(indexer);
8289         share->_num_DBs_lock.unlock();
8290         if (error) {
8291             goto cleanup;
8292         }
8293         indexer = NULL;
8294     } else {
8295         DBUG_ASSERT(table->mdl_ticket->get_type() >= MDL_SHARED_NO_WRITE);
8296         share->_num_DBs_lock.unlock();
8297         rw_lock_taken = false;
8298         prelocked_right_range_size = 0;
8299         prelocked_left_range_size = 0;
        struct smart_dbt_bf_info bf_info;
        bf_info.ha = this;
        bf_info.direction = 1;
        bf_info.thd = ha_thd();
        // the val is needed if there is a clustering index and key_read is not 0
        bf_info.need_val = true;
        bf_info.key_to_compare = NULL;
8307 
8308         error = db_env->create_loader(
8309             db_env,
8310             txn,
8311             &loader,
8312             NULL, // no src_db needed
8313             num_of_keys,
8314             &share->key_file[curr_num_DBs],
8315             mult_put_flags,
8316             mult_dbt_flags,
8317             loader_flags);
8318         if (error) {
8319             goto cleanup;
8320         }
8321 
8322         error =
8323             loader->set_poll_function(loader, ha_tokudb::bulk_insert_poll, &lc);
8324         if (error) {
8325             goto cleanup;
8326         }
8327 
8328         error = loader->set_error_callback(
8329             loader, ha_tokudb::loader_add_index_err, &lc);
8330         if (error) {
8331             goto cleanup;
8332         }
8333         //
8334         // scan primary table, create each secondary key, add to each DB
8335         //
8336         error = share->file->cursor(
8337             share->file,
8338             txn,
8339             &tmp_cursor,
8340             DB_SERIALIZABLE);
8341         if (error) {
8342             tmp_cursor = NULL;             // Safety
8343             goto cleanup;
8344         }
8345 
        //
        // grab a global read lock on the main DB to make this go faster,
        // since we intend to scan the entire thing
        //
8351         error = tmp_cursor->c_set_bounds(
8352             tmp_cursor,
8353             share->file->dbt_neg_infty(),
8354             share->file->dbt_pos_infty(),
8355             true,
8356             0);
8357         if (error) {
8358             goto cleanup;
8359         }
8360 
        // set the bulk fetch iteration to its max so that adding an
        // index fills the bulk fetch buffer every time; we do not
        // want to wait for the buffer to grow exponentially.
8364         rows_fetched_using_bulk_fetch = 0;
8365         bulk_fetch_iteration = HA_TOKU_BULK_FETCH_ITERATION_MAX;
8366         cursor_ret_val = tmp_cursor->c_getf_next(
8367             tmp_cursor,
8368             DB_PRELOCKED,
8369             smart_dbt_bf_callback,
8370             &bf_info);
8371 
8372 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8373         // initialize a two phase progress report.
8374         // first phase: putting rows into the loader
8375         thd_progress_init(thd, 2);
8376 #endif
8377 
8378         while (cursor_ret_val != DB_NOTFOUND ||
8379                ((bytes_used_in_range_query_buff -
8380                  curr_range_query_buff_offset) > 0)) {
8381             if ((bytes_used_in_range_query_buff -
8382                  curr_range_query_buff_offset) == 0) {
8383                 invalidate_bulk_fetch(); // reset the buffers
8384                 cursor_ret_val = tmp_cursor->c_getf_next(
8385                     tmp_cursor,
8386                     DB_PRELOCKED,
8387                     smart_dbt_bf_callback,
8388                     &bf_info);
8389                 if (cursor_ret_val != DB_NOTFOUND && cursor_ret_val != 0) {
8390                     error = cursor_ret_val;
8391                     goto cleanup;
8392                 }
8393             }
            // do this check in case c_getf_next did not put anything
8395             // into the buffer because there was no more data
8396             if ((bytes_used_in_range_query_buff -
8397                  curr_range_query_buff_offset) == 0) {
8398                 break;
8399             }
8400             // at this point, we know the range query buffer has at least one
8401             // key/val pair
8402             uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
8403 
8404             uint32_t key_size = *(uint32_t *)curr_pos;
8405             curr_pos += sizeof(key_size);
8406             uchar* curr_key_buff = curr_pos;
8407             curr_pos += key_size;
8408             curr_pk_key.data = curr_key_buff;
8409             curr_pk_key.size = key_size;
8410 
8411             uint32_t val_size = *(uint32_t *)curr_pos;
8412             curr_pos += sizeof(val_size);
8413             uchar* curr_val_buff = curr_pos;
8414             curr_pos += val_size;
8415             curr_pk_val.data = curr_val_buff;
8416             curr_pk_val.size = val_size;
8417 
8418             curr_range_query_buff_offset = curr_pos - range_query_buff;
8419 
8420             error = loader->put(loader, &curr_pk_key, &curr_pk_val);
8421             if (error) {
8422                 goto cleanup;
8423             }
8424 
8425             num_processed++;
8426 
8427             if ((num_processed % 1000) == 0) {
8428                 sprintf(
8429                     status_msg,
8430                     "Adding indexes: Fetched %llu of about %llu rows, loading "
8431                     "of data still remains.",
8432                     num_processed,
8433                     (long long unsigned)share->row_count());
8434                 thd_proc_info(thd, status_msg);
8435 
8436 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8437                 thd_progress_report(
8438                     thd,
8439                     num_processed,
8440                     (long long unsigned)share->rows);
8441 #endif
8442 
8443                 if (thd_killed(thd)) {
8444                     error = ER_ABORTING_CONNECTION;
8445                     goto cleanup;
8446                 }
8447             }
8448         }
8449         error = tmp_cursor->c_close(tmp_cursor);
8450         assert_always(error==0);
8451         tmp_cursor = NULL;
8452 
8453 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8454         // next progress report phase: closing the loader.
8455         // incremental reports are done in the loader's callback function.
8456         thd_progress_next_stage(thd);
8457 #endif
8458 
8459         error = loader->close(loader);
8460         loader = NULL;
8461 
8462         if (error) goto cleanup;
8463     }
8464     curr_index = curr_num_DBs;
8465     for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8466         if (key_info[i].flags & HA_NOSAME) {
8467             bool is_unique;
8468             error = is_index_unique(
8469                 &is_unique,
8470                 txn,
8471                 share->key_file[curr_index],
8472                 &key_info[i],
8473                 creating_hot_index ? 0 : DB_PRELOCKED_WRITE);
8474             if (error)
8475                 goto cleanup;
8476             if (!is_unique) {
8477                 error = HA_ERR_FOUND_DUPP_KEY;
8478                 last_dup_key = i;
8479                 goto cleanup;
8480             }
8481         }
8482     }
8483 
8484     share->lock();
8485     //
8486     // We have an accurate row count, might as well update share->rows
8487     //
    if (!creating_hot_index) {
8489         share->set_row_count(num_processed, true);
8490     }
8491     //
8492     // now write stuff to status.tokudb
8493     //
8494     for (uint i = 0; i < num_of_keys; i++) {
8495         write_key_name_to_status(share->status_block, key_info[i].name, txn);
8496     }
8497     share->unlock();
8498 
8499     error = 0;
8500 cleanup:
8501 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8502     thd_progress_end(thd);
8503 #endif
8504     if (rw_lock_taken) {
8505         share->_num_DBs_lock.unlock();
8506         rw_lock_taken = false;
8507     }
8508     if (tmp_cursor) {
8509         int r = tmp_cursor->c_close(tmp_cursor);
8510         assert_always(r==0);
8511         tmp_cursor = NULL;
8512     }
8513     if (loader != NULL) {
8514         sprintf(status_msg, "aborting creation of indexes.");
8515         thd_proc_info(thd, status_msg);
8516         loader->abort(loader);
8517     }
8518     if (indexer != NULL) {
8519         sprintf(status_msg, "aborting creation of indexes.");
8520         thd_proc_info(thd, status_msg);
8521         rwlock_t_lock_write(share->_num_DBs_lock);
8522         indexer->abort(indexer);
8523         share->_num_DBs_lock.unlock();
8524     }
8525     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8526         error == DB_LOCK_NOTGRANTED) {
8527         sql_print_error(
8528             "Could not add indexes to table %s because another transaction has "
8529             "accessed the table. To add indexes, make sure no transactions "
8530             "touch the table.",
8531             share->full_table_name());
8532     }
8533     thd_proc_info(thd, orig_proc_info);
8534     TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
8535 }
8536 int ha_tokudb::tokudb_add_index_poll(void* extra, float progress) {
8537     LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
8538     if (thd_killed(context->thd)) {
8539         snprintf(context->write_status_msg,
8540                  sizeof(context->write_status_msg),
8541                  "The process has been killed, aborting add index.");
8542         return ER_ABORTING_CONNECTION;
8543     }
8544     float percentage = progress * 100;
8545     snprintf(context->write_status_msg,
8546              sizeof(context->write_status_msg),
8547              "Adding of indexes to %s about %.1f%% done",
8548              context->ha->share->full_table_name(),
8549              percentage);
8550     thd_proc_info(context->thd, context->write_status_msg);
8551 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8552     thd_progress_report(context->thd, (unsigned long long)percentage, 100);
8553 #endif
8554     return 0;
8555 }
8556 
//
// Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
// Error-path helper: closes the indexes that add_index or alter_table_phase2
// added before failing partway through.
//
8561 void ha_tokudb::restore_add_index(
8562     TABLE* table_arg,
8563     uint num_of_keys,
8564     bool incremented_numDBs,
8565     bool modified_DBs) {
8566 
8567     uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8568     uint curr_index = 0;
8569 
8570     //
8571     // need to restore num_DBs, and we have to do it before we close the dictionaries
8572     // so that there is not a window
8573     //
8574     if (incremented_numDBs) {
8575         rwlock_t_lock_write(share->_num_DBs_lock);
8576         share->num_DBs--;
8577     }
8578     if (modified_DBs) {
8579         curr_index = curr_num_DBs;
8580         for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8581             reset_key_and_col_info(&share->kc_info, curr_index);
8582         }
8583         curr_index = curr_num_DBs;
8584         for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8585             if (share->key_file[curr_index]) {
8586                 int r = share->key_file[curr_index]->close(
8587                     share->key_file[curr_index],
8588                     0);
8589                 assert_always(r==0);
8590                 share->key_file[curr_index] = NULL;
8591             }
8592         }
8593     }
8594     if (incremented_numDBs) {
8595         share->_num_DBs_lock.unlock();
8596     }
8597 }
8598 
8599 //
8600 // Internal function called by ha_tokudb::prepare_drop_index and ha_tokudb::alter_table_phase2
8601 // With a transaction, drops dictionaries associated with indexes in key_num
8602 //
8603 int ha_tokudb::drop_indexes(uint* key_num,
8604                             uint num_of_keys,
8605                             KEY* key_info,
8606                             DB_TXN* txn) {
8607     TOKUDB_HANDLER_DBUG_ENTER("");
8608     assert_always(txn);
8609 
8610     int error = 0;
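    // Phase 1: take a file-ops lock on every dictionary being dropped
    // before mutating anything, so a failure here leaves all indexes intact.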
8611     for (uint i = 0; i < num_of_keys; i++) {
8612         uint curr_index = key_num[i];
8613         error = share->key_file[curr_index]->pre_acquire_fileops_lock(
8614             share->key_file[curr_index],
8615             txn);
8616         if (error != 0) {
8617             goto cleanup;
8618         }
8619     }
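    // Phase 2: close each dictionary, remove its key name from the status
    // dictionary, and delete the underlying dictionary within txn.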
8620     for (uint i = 0; i < num_of_keys; i++) {
8621         uint curr_index = key_num[i];
8622         int r = share->key_file[curr_index]->close(share->key_file[curr_index],0);
8623         assert_always(r==0);
8624         share->key_file[curr_index] = NULL;
8625 
8626         error = remove_key_name_from_status(
8627             share->status_block,
8628             key_info[curr_index].name,
8629             txn);
8630         if (error) {
8631             goto cleanup;
8632         }
8633 
8634         error = delete_or_rename_dictionary(
8635             share->full_table_name(),
8636             NULL,
8637             key_info[curr_index].name,
8638             true,
8639             txn,
8640             true);
8641         if (error) {
8642             goto cleanup;
8643         }
8644     }
8645 
8646 cleanup:
8647     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8648         error == DB_LOCK_NOTGRANTED) {
8649         sql_print_error(
8650             "Could not drop indexes from table %s because another transaction "
8651             "has accessed the table. To drop indexes, make sure no "
8652             "transactions touch the table.",
8653             share->full_table_name());
8654     }
8655     TOKUDB_HANDLER_DBUG_RETURN(error);
8656 }
8657 
8658 //
8659 // Internal function called by ha_tokudb::prepare_drop_index and
8660 // ha_tokudb::alter_table_phase2
8661 // Restores dropped indexes in case of error in error path of
8662 // prepare_drop_index and alter_table_phase2
8663 //
8664 void ha_tokudb::restore_drop_indexes(uint* key_num, uint num_of_keys) {
8665     //
8666     // reopen closed dictionaries
8667     //
8668     for (uint i = 0; i < num_of_keys; i++) {
8669         int r;
8670         uint curr_index = key_num[i];
8671         if (share->key_file[curr_index] == NULL) {
8672             r = open_secondary_dictionary(
8673                 &share->key_file[curr_index],
8674                 &table_share->key_info[curr_index],
8675                 share->full_table_name(),
8676                 false,
8677                 NULL);
8678             assert_always(!r);
8679         }
8680     }
8681 }
8682 
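// Maps storage-engine error codes (DB_* / TOKUDB_*) to the handler-level
// HA_ERR_* / ER_* codes that MySQL expects; unknown codes pass through.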
8683 int ha_tokudb::map_to_handler_error(int error) {
8684     switch (error) {
8685     case DB_LOCK_DEADLOCK:
8686         error = HA_ERR_LOCK_DEADLOCK;
8687         break;
8688     case DB_LOCK_NOTGRANTED:
8689         error = HA_ERR_LOCK_WAIT_TIMEOUT;
8690         break;
8691 #if defined(HA_ERR_DISK_FULL)
8692     case ENOSPC:
8693         error = HA_ERR_DISK_FULL;
8694         break;
8695 #endif
8696     case DB_KEYEXIST:
8697         error = HA_ERR_FOUND_DUPP_KEY;
8698         break;
8699 #if defined(HA_ALTER_ERROR)
8700     case HA_ALTER_ERROR:
8701         error = HA_ERR_UNSUPPORTED;
8702         break;
8703 #endif
8704     case TOKUDB_INTERRUPTED:
8705         error = ER_QUERY_INTERRUPTED;
8706         break;
8707     case TOKUDB_OUT_OF_LOCKS:
8708         error = HA_ERR_LOCK_TABLE_FULL;
8709         break;
8710     }
8711     return error;
8712 }
8713 
8714 void ha_tokudb::print_error(int error, myf errflag) {
8715     error = map_to_handler_error(error);
8716     handler::print_error(error, errflag);
8717 }
8718 
8719 //
8720 // truncate's dictionary associated with keynr index using transaction txn
8721 // does so by deleting and then recreating the dictionary in the context
8722 // of a transaction
8723 //
8724 int ha_tokudb::truncate_dictionary(uint keynr, DB_TXN* txn) {
8725     int error;
8726     bool is_pk = (keynr == primary_key);
8727 
8728     toku_compression_method compression_method =
8729         get_compression_method(share->key_file[keynr]);
8730     error = share->key_file[keynr]->close(share->key_file[keynr], 0);
8731     assert_always(error == 0);
8732 
8733     share->key_file[keynr] = NULL;
8734     if (is_pk) {
8735         share->file = NULL;
8736     }
8737 
8738     if (is_pk) {
8739         error = delete_or_rename_dictionary(
8740             share->full_table_name(),
8741             NULL,
8742             "main",
8743             false, //is_key
8744             txn,
8745             true); // is a delete
8746         if (error) {
8747             goto cleanup;
8748         }
8749     } else {
8750         error = delete_or_rename_dictionary(
8751             share->full_table_name(),
8752             NULL,
8753             table_share->key_info[keynr].name,
8754             true, //is_key
8755             txn,
8756             true); // is a delete
8757         if (error) {
8758             goto cleanup;
8759         }
8760     }
8761 
8762     if (is_pk) {
8763         error = create_main_dictionary(
8764             share->full_table_name(),
8765             table,
8766             txn,
8767             &share->kc_info,
8768             compression_method);
8769     } else {
8770         error = create_secondary_dictionary(
8771             share->full_table_name(),
8772             table,
8773             &table_share->key_info[keynr],
8774             txn,
8775             &share->kc_info,
8776             keynr,
8777             false,
8778             compression_method);
8779     }
8780     if (error) {
8781         goto cleanup;
8782     }
8783 
8784 cleanup:
8785     return error;
8786 }
8787 
8788 // for 5.5
8789 int ha_tokudb::truncate() {
8790     TOKUDB_HANDLER_DBUG_ENTER("");
8791     int error = delete_all_rows_internal();
8792     TOKUDB_HANDLER_DBUG_RETURN(error);
8793 }
8794 
8795 // delete all rows from a table
8796 //
8797 // effects: delete all of the rows in the main dictionary and all of the
8798 // indices.  this must be atomic, so we use the statement transaction
8799 // for all of the truncate operations.
8800 // locks:  if we have an exclusive table write lock, all of the concurrency
8801 // issues go away.
8802 // returns: 0 if success
8803 int ha_tokudb::delete_all_rows() {
8804     TOKUDB_HANDLER_DBUG_ENTER("");
8805     int error = 0;
8806     if (thd_sql_command(ha_thd()) != SQLCOM_TRUNCATE) {
8807         share->try_table_lock = true;
8808         error = HA_ERR_WRONG_COMMAND;
8809     }
8810     if (error == 0)
8811         error = delete_all_rows_internal();
8812     TOKUDB_HANDLER_DBUG_RETURN(error);
8813 }
8814 
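// Shared implementation for truncate() and delete_all_rows(): inside a
// single transaction, takes file-ops and table locks on every dictionary,
// truncates each one, and resets the cached row count and auto-increment
// state. All dictionaries are reopened afterwards regardless of outcome.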
8815 int ha_tokudb::delete_all_rows_internal() {
8816     TOKUDB_HANDLER_DBUG_ENTER("");
8817     int error = 0;
8818     uint curr_num_DBs = 0;
8819     DB_TXN* txn = NULL;
8820 
8821     // this should be enough to handle locking as the higher level MDL
8822     // on this table should prevent any new analyze tasks.
8823     share->cancel_background_jobs();
8824 
8825     error = txn_begin(db_env, 0, &txn, 0, ha_thd());
8826     if (error) {
8827         goto cleanup;
8828     }
8829 
8830     curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
8831     for (uint i = 0; i < curr_num_DBs; i++) {
8832         error = share->key_file[i]->pre_acquire_fileops_lock(
8833             share->key_file[i],
8834             txn);
8835         if (error) {
8836             goto cleanup;
8837         }
8838         error = share->key_file[i]->pre_acquire_table_lock(
8839             share->key_file[i],
8840             txn);
8841         if (error) {
8842             goto cleanup;
8843         }
8844     }
8845     for (uint i = 0; i < curr_num_DBs; i++) {
8846         error = truncate_dictionary(i, txn);
8847         if (error) {
8848             goto cleanup;
8849         }
8850     }
8851 
8852     DEBUG_SYNC(ha_thd(), "tokudb_after_truncate_all_dictionarys");
8853 
8854     // zap the row count
8855     if (error == 0) {
8856         share->set_row_count(0, false);
8857         // update auto increment
8858         share->last_auto_increment = 0;
8859         // calling write_to_status directly because we need to use txn
8860         write_to_status(
8861             share->status_block,
8862             hatoku_max_ai,
8863             &share->last_auto_increment,
8864             sizeof(share->last_auto_increment),
8865             txn);
8866     }
8867 
8868     share->try_table_lock = true;
8869 cleanup:
8870     if (txn) {
8871         if (error) {
8872             abort_txn(txn);
8873         } else {
8874             commit_txn(txn,0);
8875         }
8876     }
8877 
8878     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(
8879         TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8880         error == DB_LOCK_NOTGRANTED) {
8881         sql_print_error(
8882             "Could not truncate table %s because another transaction has "
8883             "accessed the table. To truncate the table, make sure no "
8884             "transactions touch the table.",
8885             share->full_table_name());
8886     }
    //
    // regardless of errors, we need to reopen the DBs
    //
8890     for (uint i = 0; i < curr_num_DBs; i++) {
8891         int r = 0;
8892         if (share->key_file[i] == NULL) {
8893             if (i != primary_key) {
8894                 r = open_secondary_dictionary(
8895                         &share->key_file[i],
8896                         &table_share->key_info[i],
8897                         share->full_table_name(),
8898                         false,
8899                         NULL);
8900                 assert_always(!r);
8901             } else {
                r = open_main_dictionary(
                        share->full_table_name(),
                        false,
                        NULL);
8906                 assert_always(!r);
8907             }
8908         }
8909     }
8910     TOKUDB_HANDLER_DBUG_RETURN(error);
8911 }
8912 
8913 void ha_tokudb::set_loader_error(int err) {
8914     loader_error = err;
8915 }
8916 
8917 void ha_tokudb::set_dup_value_for_pk(DBT* key) {
8918     assert_always(!hidden_primary_key);
8919     unpack_key(table->record[0],key,primary_key);
8920     last_dup_key = primary_key;
8921 }
8922 
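// The DS-MRR implementation is only present on MariaDB and MySQL 5.6.x,
// so these wrappers compile to no-ops on other server versions.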
8923 void ha_tokudb::close_dsmrr() {
8924 #ifdef MARIADB_BASE_VERSION
8925     ds_mrr.dsmrr_close();
8926 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
8927     ds_mrr.dsmrr_close();
8928 #endif
8929 }
8930 
8931 void ha_tokudb::reset_dsmrr() {
8932 #ifdef MARIADB_BASE_VERSION
8933     ds_mrr.dsmrr_close();
8934 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
8935     ds_mrr.reset();
8936 #endif
8937 }
8938 
8939 // we cache the information so we can do filtering ourselves,
8940 // but as far as MySQL knows, we are not doing any filtering,
8941 // so if we happen to miss filtering a row that does not match
8942 // idx_cond_arg, MySQL will catch it.
// This lets us handle only index_next and index_prev, without needing to
// worry about the other index_XXX functions.
8945 Item* ha_tokudb::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) {
8946     toku_pushed_idx_cond_keyno = keyno_arg;
8947     toku_pushed_idx_cond = idx_cond_arg;
8948     return idx_cond_arg;
8949 }
8950 
8951 void ha_tokudb::cancel_pushed_idx_cond() {
8952     invalidate_icp();
8953     handler::cancel_pushed_idx_cond();
8954 }
8955 
8956 void ha_tokudb::cleanup_txn(DB_TXN *txn) {
8957     if (transaction == txn && cursor) {
8958         int r = cursor->c_close(cursor);
8959         assert_always(r == 0);
8960         cursor = NULL;
8961     }
8962 }
8963 
8964 void ha_tokudb::add_to_trx_handler_list() {
8965     tokudb_trx_data* trx =
8966         (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
8967     trx->handlers = list_add(trx->handlers, &trx_handler_list);
8968 }
8969 
8970 void ha_tokudb::remove_from_trx_handler_list() {
8971     tokudb_trx_data* trx =
8972         (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
8973     trx->handlers = list_delete(trx->handlers, &trx_handler_list);
8974 }
8975 
8976 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
8977 void ha_tokudb::rpl_before_write_rows() {
8978     in_rpl_write_rows = true;
8979 }
8980 
8981 void ha_tokudb::rpl_after_write_rows() {
8982     in_rpl_write_rows = false;
8983 }
8984 
8985 void ha_tokudb::rpl_before_delete_rows() {
8986     in_rpl_delete_rows = true;
8987 }
8988 
8989 void ha_tokudb::rpl_after_delete_rows() {
8990     in_rpl_delete_rows = false;
8991 }
8992 
8993 void ha_tokudb::rpl_before_update_rows() {
8994     in_rpl_update_rows = true;
8995 }
8996 
8997 void ha_tokudb::rpl_after_update_rows() {
8998     in_rpl_update_rows = false;
8999 }
9000 
9001 bool ha_tokudb::rpl_lookup_rows() {
9002     if (!in_rpl_delete_rows && !in_rpl_update_rows)
9003         return true;
9004     else
9005         return tokudb::sysvars::rpl_lookup_rows(ha_thd());
9006 }
9007 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
9008 
9009 // table admin
9010 #include "ha_tokudb_admin.cc"
9011 
9012 // update functions
9013 #include "tokudb_update_fun.cc"
9014 
9015 // fast updates
9016 #include "ha_tokudb_update.cc"
9017 
9018 // alter table code for various mysql distros
9019 #include "ha_tokudb_alter_55.cc"
9020 #include "ha_tokudb_alter_56.cc"
9021 
9022 // mrr
9023 #ifdef MARIADB_BASE_VERSION
9024 #include  "ha_tokudb_mrr_maria.cc"
9025 #elif 50600 <= MYSQL_VERSION_ID && MYSQL_VERSION_ID <= 50699
9026 #include  "ha_tokudb_mrr_mysql.cc"
9027 #endif
9028 
9029 // key comparisons
9030 #include "hatoku_cmp.cc"
9031 
9032 // handlerton
9033 #include "hatoku_hton.cc"
9034 
// explicit template instantiations
namespace tokudb {
    template size_t vlq_encode_ui(uint32_t n, void *p, size_t s);
    template size_t vlq_decode_ui(uint32_t *np, void *p, size_t s);
    template size_t vlq_encode_ui(uint64_t n, void *p, size_t s);
    template size_t vlq_decode_ui(uint64_t *np, void *p, size_t s);
}  // namespace tokudb
9042