1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of TokuDB
6 
7 
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9 
    TokuDB is free software: you can redistribute it and/or modify
11     it under the terms of the GNU General Public License, version 2,
12     as published by the Free Software Foundation.
13 
14     TokuDB is distributed in the hope that it will be useful,
15     but WITHOUT ANY WARRANTY; without even the implied warranty of
16     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17     GNU General Public License for more details.
18 
19     You should have received a copy of the GNU General Public License
20     along with TokuDB.  If not, see <http://www.gnu.org/licenses/>.
21 
22 ======= */
23 
24 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
25 
26 #include "hatoku_hton.h"
27 #include "tokudb_buffer.h"
28 #include "tokudb_status.h"
29 #include "ha_tokudb.h"
30 #include "ha_tokupart.h"
31 #include "hatoku_cmp.h"
32 #include "partition_info.h"
33 #include "partitioning/partition_base.h"
34 #include "sql_db.h"
35 #include "sql_parse.h"
36 #include "sql_table.h"
37 #include "table.h"
38 #include "tokudb_card.h"
39 
40 #include "mysql/psi/mysql_file.h"
41 
// Performance-schema instrumentation keys for the share's internal mutex
// and the open-DBs rwlock.
pfs_key_t ha_tokudb_mutex_key;
pfs_key_t num_DBs_lock_key;

// Registry of every open TOKUDB_SHARE, keyed by full table name.
// All access is guarded by _open_tables_mutex.
std::unordered_map<std::string, TOKUDB_SHARE*> TOKUDB_SHARE::_open_tables;
tokudb::thread::mutex_t TOKUDB_SHARE::_open_tables_mutex;

// NULL-terminated list of file extensions owned by this engine,
// returned from ha_tokudb::bas_ext().
static const char* ha_tokudb_exts[] = {
    ha_tokudb_ext,
    NullS
};
52 
53 //
54 // This offset is calculated starting from AFTER the NULL bytes
55 //
get_fixed_field_size(KEY_AND_COL_INFO * kc_info,TABLE_SHARE * table_share,uint keynr)56 static inline uint32_t get_fixed_field_size(
57     KEY_AND_COL_INFO* kc_info,
58     TABLE_SHARE* table_share,
59     uint keynr) {
60 
61     uint offset = 0;
62     for (uint i = 0; i < table_share->fields; i++) {
63         if (is_fixed_field(kc_info, i) &&
64             !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
65             offset += kc_info->field_lengths[i];
66         }
67     }
68     return offset;
69 }
70 
71 
get_len_of_offsets(KEY_AND_COL_INFO * kc_info,TABLE_SHARE * table_share,uint keynr)72 static inline uint32_t get_len_of_offsets(
73     KEY_AND_COL_INFO* kc_info,
74     TABLE_SHARE* table_share,
75     uint keynr) {
76 
77     uint len = 0;
78     for (uint i = 0; i < table_share->fields; i++) {
79         if (is_variable_field(kc_info, i) &&
80             !bitmap_is_set(&kc_info->key_filters[keynr], i)) {
81             len += kc_info->num_offset_bytes;
82         }
83     }
84     return len;
85 }
86 
87 
allocate_key_and_col_info(TABLE_SHARE * table_share,KEY_AND_COL_INFO * kc_info)88 static int allocate_key_and_col_info(
89     TABLE_SHARE* table_share,
90     KEY_AND_COL_INFO* kc_info) {
91 
92     int error;
93     //
94     // initialize all of the bitmaps
95     //
96     for (uint i = 0; i < MAX_KEY + 1; i++) {
97         error =
98             bitmap_init(
99                 &kc_info->key_filters[i],
100                 NULL,
101                 table_share->fields,
102                 false);
103         if (error) {
104             goto exit;
105         }
106     }
107 
108     //
109     // create the field lengths
110     //
111     kc_info->multi_ptr = tokudb::memory::multi_malloc(
112         MYF(MY_WME+MY_ZEROFILL),
113         &kc_info->field_types, (uint)(table_share->fields * sizeof (uint8_t)),
114         &kc_info->field_lengths, (uint)(table_share->fields * sizeof (uint16_t)),
115         &kc_info->length_bytes, (uint)(table_share->fields * sizeof (uint8_t)),
116         &kc_info->blob_fields, (uint)(table_share->fields * sizeof (uint32_t)),
117         NullS);
118     if (kc_info->multi_ptr == NULL) {
119         error = ENOMEM;
120         goto exit;
121     }
122 exit:
123     if (error) {
124         for (uint i = 0; MAX_KEY + 1; i++) {
125             bitmap_free(&kc_info->key_filters[i]);
126         }
127         tokudb::memory::free(kc_info->multi_ptr);
128     }
129     return error;
130 }
131 
free_key_and_col_info(KEY_AND_COL_INFO * kc_info)132 static void free_key_and_col_info (KEY_AND_COL_INFO* kc_info) {
133     for (uint i = 0; i < MAX_KEY+1; i++) {
134         bitmap_free(&kc_info->key_filters[i]);
135     }
136 
137     for (uint i = 0; i < MAX_KEY+1; i++) {
138         tokudb::memory::free(kc_info->cp_info[i]);
139         kc_info->cp_info[i] = NULL; // 3144
140     }
141 
142     tokudb::memory::free(kc_info->multi_ptr);
143     kc_info->field_types = NULL;
144     kc_info->field_lengths = NULL;
145     kc_info->length_bytes = NULL;
146     kc_info->blob_fields = NULL;
147 }
148 
149 
static_init()150 void TOKUDB_SHARE::static_init() {
151     assert_always(_open_tables.size() == 0);
152 }
static_destroy()153 void TOKUDB_SHARE::static_destroy() {
154     for (auto it = _open_tables.cbegin(); it != _open_tables.cend(); it++) {
155         TOKUDB_TRACE("_open_tables %s %p", it->first.c_str(), it->second);
156         TOKUDB_SHARE* share = it->second;
157         share->destroy();
158         delete share;
159     }
160     _open_tables.clear();
161     assert_always(_open_tables.size() == 0);
162 }
get_state_string(share_state_t state)163 const char* TOKUDB_SHARE::get_state_string(share_state_t state) {
164     static const char* state_string[] = {
165         "CLOSED",
166         "OPENED",
167         "ERROR"
168     };
169     assert_always(state == CLOSED || state == OPENED || state == ERROR);
170     return state_string[state];
171 }
operator new(size_t sz)172 void* TOKUDB_SHARE::operator new(size_t sz) {
173     return tokudb::memory::malloc(sz, MYF(MY_WME|MY_ZEROFILL|MY_FAE));
174 }
operator delete(void * p)175 void TOKUDB_SHARE::operator delete(void* p) { tokudb::memory::free(p); }
// Only the instrumented locks need explicit construction; every other
// member is zero-filled by the class's operator new (MY_ZEROFILL).
TOKUDB_SHARE::TOKUDB_SHARE()
    : _num_DBs_lock(num_DBs_lock_key), _mutex(ha_tokudb_mutex_key) {}
// Prepare a freshly-allocated share for use. 'table_name' is the full
// canonical name; tokudb_split_dname() splits it into the database and
// table components (the dictionary part is discarded here).
void TOKUDB_SHARE::init(const char* table_name) {
    _use_count = 0;
    thr_lock_init(&_thr_lock);
    _state = CLOSED;
    _row_delta_activity = 0;
    _allow_auto_analysis = true;

    _full_table_name.append(table_name);

    String tmp_dictionary_name;
    tokudb_split_dname(
        table_name,
        _database_name,
        _table_name,
        tmp_dictionary_name);

    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
// Final teardown before the share is deleted. All references must already
// be gone and the share must be CLOSED (or ERROR) — i.e. release() has
// already closed every DB handle.
void TOKUDB_SHARE::destroy() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    assert_always(_use_count == 0);
    assert_always(
        _state == TOKUDB_SHARE::CLOSED || _state == TOKUDB_SHARE::ERROR);
    thr_lock_delete(&_thr_lock);
    TOKUDB_SHARE_DBUG_VOID_RETURN();
}
// Look up (and, when create_new is set, create) the share for 'table_name'
// in the global registry, take a reference on it, and initialize the
// caller's THR_LOCK_DATA against the share's table lock.
// Returns NULL only when the share is absent and create_new is false.
TOKUDB_SHARE* TOKUDB_SHARE::get_share(const char* table_name,
                                      THR_LOCK_DATA* data,
                                      bool create_new) {
    std::string find_table_name(table_name);
    mutex_t_lock(_open_tables_mutex);
    auto it = _open_tables.find(find_table_name);
    TOKUDB_SHARE *share = nullptr;
    if (it != _open_tables.end()) {
        share = it->second;
        // The registry key and the share's own recorded name must agree.
        assert_always(strcmp(table_name, share->full_table_name()) == 0);
    }
    TOKUDB_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_SHARE,
        "existing share[%s] %s:share[%p]",
        table_name,
        share == NULL ? "not found" : "found",
        share);

    if (!share) {
        if (create_new == false)
            goto exit;
        // create share and fill it with all zeroes
        // hence, all pointers are initialized to NULL
        share = new TOKUDB_SHARE;
        assert_always(share);

        share->init(table_name);

        _open_tables.insert({find_table_name, share});
    }

    // NOTE(review): addref() acquires the share's lock() and does not
    // release it — the caller appears to own that lock on return.
    share->addref();

    if (data)
        thr_lock_data_init(&(share->_thr_lock), data, NULL);

exit:
    mutex_t_unlock(_open_tables_mutex);
    return share;
}
drop_share(TOKUDB_SHARE * share)252 void TOKUDB_SHARE::drop_share(TOKUDB_SHARE* share) {
253     TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_SHARE,
254                            "share[%p]:file[%s]:state[%s]:use_count[%d]",
255                            share,
256                            share->_full_table_name.ptr(),
257                            get_state_string(share->_state),
258                            share->_use_count);
259 
260     mutex_t_lock(_open_tables_mutex);
261     size_t n = _open_tables.erase(std::string(share->full_table_name()));
262     assert_always(n == 1);
263     share->destroy();
264     delete share;
265     mutex_t_unlock(_open_tables_mutex);
266 }
// Bump the share's reference count and return its current state.
// NOTE(review): lock() is taken here and intentionally NOT released —
// the caller appears responsible for calling unlock() after acting on
// the returned state; confirm against the call sites (e.g. get_share).
TOKUDB_SHARE::share_state_t TOKUDB_SHARE::addref() {
    TOKUDB_SHARE_TRACE_FOR_FLAGS((TOKUDB_DEBUG_ENTER & TOKUDB_DEBUG_SHARE),
                                 "file[%s]:state[%s]:use_count[%d]",
                                 _full_table_name.ptr(),
                                 get_state_string(_state),
                                 _use_count);

    lock();
    _use_count++;

    return _state;
}
// Drop one reference to this share. When the last reference is released
// and the share is OPENED, close every dictionary handle plus the status
// dictionary, free all cached key/column metadata and cardinality stats,
// and move the share to CLOSED. Returns 0, or the last close error.
int TOKUDB_SHARE::release() {
    TOKUDB_SHARE_DBUG_ENTER("file[%s]:state[%s]:use_count[%d]",
        _full_table_name.ptr(),
        get_state_string(_state),
        _use_count);

    int error, result = 0;

    mutex_t_lock(_mutex);
    assert_always(_use_count != 0);
    _use_count--;
    if (_use_count == 0 && _state == TOKUDB_SHARE::OPENED) {
        // number of open DB's may not be equal to number of keys we have
        // because add_index may have added some. So, we loop through entire
        // array and close any non-NULL value.  It is imperative that we reset
        // a DB to NULL once we are done with it.
        for (uint i = 0; i < sizeof(key_file)/sizeof(key_file[0]); i++) {
            if (key_file[i]) {
                TOKUDB_TRACE_FOR_FLAGS(
                    TOKUDB_DEBUG_OPEN,
                    "dbclose:%p",
                    key_file[i]);
                error = key_file[i]->close(key_file[i], 0);
                assert_always(error == 0);
                // NOTE(review): assert_always above aborts on error, so this
                // branch only matters in builds where it is compiled out.
                if (error) {
                    result = error;
                }
                if (key_file[i] == file)
                    file = NULL;
                key_file[i] = NULL;
            }
        }

        // Close the per-table status (metadata) dictionary.
        error = tokudb::metadata::close(&status_block);
        assert_always(error == 0);

        free_key_and_col_info(&kc_info);

        // Drop cached cardinality statistics.
        if (_rec_per_key) {
            tokudb::memory::free(_rec_per_key);
            _rec_per_key = NULL;
            _rec_per_keys = 0;
        }

        // Drop cached key descriptors (each owns its name string).
        for (uint i = 0; i < _keys; i++) {
           tokudb::memory::free(_key_descriptors[i]._name);
        }
        tokudb::memory::free(_key_descriptors);
        _keys = _max_key_parts = 0; _key_descriptors = NULL;

        _state = TOKUDB_SHARE::CLOSED;
    }
    mutex_t_unlock(_mutex);

    TOKUDB_SHARE_DBUG_RETURN(result);
}
// Fold a statement's added/deleted/updated row counts into the share's
// cached row count and delta-activity counter, and trigger an automatic
// ANALYZE when accumulated change crosses the tokudb_auto_analyze
// threshold (expressed as a percent of the current row estimate).
void TOKUDB_SHARE::update_row_count(
    THD* thd,
    uint64_t added,
    uint64_t deleted,
    uint64_t updated) {

    uint64_t delta = added + deleted + updated;
    lock();
    // Clamp at zero rather than letting the unsigned row count wrap.
    if (deleted > added && _rows < (deleted - added)) {
        _rows = 0;
    } else {
        _rows += added - deleted;
    }
    _row_delta_activity += delta;
    // ~0 is reserved; skip it when the activity counter wraps around.
    if (_row_delta_activity == (uint64_t)~0)
        _row_delta_activity = 1;

    ulonglong auto_threshold = tokudb::sysvars::auto_analyze(thd);
    if (delta && auto_threshold > 0 && _allow_auto_analysis) {
        ulonglong pct_of_rows_changed_to_trigger;
        pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100);
        if (TOKUDB_UNLIKELY(_row_delta_activity >= pct_of_rows_changed_to_trigger)) {
            // Build the log message up front; analyze may run in background.
            char msg[200];
            snprintf(msg,
                     sizeof(msg),
                     "TokuDB: Auto %s analysis for %s, delta_activity %llu is "
                     "greater than %llu percent of %llu rows.",
                     tokudb::sysvars::analyze_in_background(thd) > 0
                         ? "scheduling background"
                         : "running foreground",
                     full_table_name(),
                     _row_delta_activity,
                     auto_threshold,
                     (ulonglong)(_rows));

            // analyze_standard will unlock _mutex regardless of success/failure
            // NOTE(review): unlock() is still executed at the end of this
            // function on this path — confirm that analyze_standard()
            // re-acquires the lock (or that lock()/unlock() here do not
            // target _mutex) to rule out a double-unlock.
            int ret = analyze_standard(thd, NULL);
            if (TOKUDB_UNLIKELY(ret == 0 && tokudb::sysvars::debug > 0)) {
                sql_print_information("%s - succeeded.", msg);
            } else if (TOKUDB_UNLIKELY(ret != 0)) {
                sql_print_information(
                    "%s - failed, likely a job already running.",
                    msg);
            }
        }
    }
    unlock();
}
// Copy the share's cached records-per-key statistics into 'table''s KEY
// metadata, applying tokudb_cardinality_scale_percent. _rec_per_key is
// consumed linearly across all indexes via next_key_part, so the stored
// layout must match the order of the table's user-defined key parts.
void TOKUDB_SHARE::set_cardinality_counts_in_table(TABLE* table) {
    lock();
    uint32_t next_key_part = 0;
    for (uint32_t i = 0; i < table->s->keys; i++) {
        KEY* key = &table->key_info[i];
        bool is_unique_key =
            (i == table->s->primary_key) || (key->flags & HA_NOSAME);

        /* Check if this index supports index statistics. */
        if (!key->supports_records_per_key()) {
            continue;
        }

        for (uint32_t j = 0; j < key->actual_key_parts; j++) {
            if (j >= key->user_defined_key_parts) {
                // MySQL 'hidden' keys, really needs deeper investigation
                // into MySQL hidden keys vs TokuDB hidden keys
                key->set_records_per_key(j, 1.0);
                key->rec_per_key[j] = 1;
                continue;
            }

            assert_always(next_key_part < _rec_per_keys);
            ulong val = _rec_per_key[next_key_part++];
            val = (val * tokudb::sysvars::cardinality_scale_percent) / 100;
            // Floor at 1; the full unique key is exact by definition.
            if (val == 0 || _rows == 0 ||
                (is_unique_key && j == key->actual_key_parts - 1)) {
                val = 1;
            }
            key->set_records_per_key(
                j,
                static_cast<rec_per_key_t>(val));
            key->rec_per_key[j] = val;
        }
    }
    unlock();
}
420 
// Bail out of the enclosing handler method when the cursor has been
// invalidated: propagate last_cursor_error through the local 'error'
// variable and jump to the method's 'cleanup' label.
#define HANDLE_INVALID_CURSOR() \
    if (cursor == NULL) { \
        error = last_cursor_error; \
        goto cleanup; \
    }
426 
table_type() const427 const char *ha_tokudb::table_type() const {
428     return tokudb_hton_name;
429 }
430 
index_type(TOKUDB_UNUSED (uint inx))431 const char *ha_tokudb::index_type(TOKUDB_UNUSED(uint inx)) {
432     return "BTREE";
433 }
434 
435 /*
436  *  returns NULL terminated file extension string
437  */
// Returns the NULL-terminated list of file extensions owned by the
// engine (ha_tokudb_exts, defined near the top of this file).
const char **ha_tokudb::bas_ext() const {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBUG_RETURN(ha_tokudb_exts);
}
442 
is_insert_ignore(THD * thd)443 static inline bool is_insert_ignore (THD* thd) {
444     //
445     // from http://lists.mysql.com/internals/37735
446     //
447     return thd->lex->is_ignore() && thd->lex->duplicates == DUP_ERROR;
448 }
449 
is_replace_into(THD * thd)450 static inline bool is_replace_into(THD* thd) {
451     return thd->lex->duplicates == DUP_REPLACE;
452 }
453 
table_flags() const454 ulonglong ha_tokudb::table_flags() const {
455     return int_table_flags | HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
456 }
457 
458 //
459 // Returns a bit mask of capabilities of the key or its part specified by
460 // the arguments. The capabilities are defined in sql/handler.h.
461 //
index_flags(uint idx,TOKUDB_UNUSED (uint part),TOKUDB_UNUSED (bool all_parts)) const462 ulong ha_tokudb::index_flags(uint idx,
463                              TOKUDB_UNUSED(uint part),
464                              TOKUDB_UNUSED(bool all_parts)) const {
465     TOKUDB_HANDLER_DBUG_ENTER("");
466     assert_always(table_share);
467     ulong flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER |
468         HA_KEYREAD_ONLY | HA_READ_RANGE | HA_DO_INDEX_COND_PUSHDOWN);
469     if (key_is_clustering(&table_share->key_info[idx])) {
470         flags |= HA_CLUSTERED_INDEX;
471     }
472     DBUG_RETURN(flags);
473 }
474 
475 
476 //
477 // struct that will be used as a context for smart DBT callbacks
478 // contains parameters needed to complete the smart DBT cursor call
479 //
// Context for "smart DBT" cursor callbacks: identifies the handler
// instance, the destination row buffer, and which index is being read.
typedef struct smart_dbt_info {
    ha_tokudb* ha; //instance to ha_tokudb needed for reading the row
    uchar* buf; // output buffer where row will be written
    uint keynr; // index into share->key_file that represents DB we are currently operating on
} *SMART_DBT_INFO;

// Context for 'bf' callbacks (presumably bulk fetch — TODO confirm):
// adds scan direction, whether the row value is needed, and the key used
// for prefix-mismatch detection.
typedef struct smart_dbt_bf_info {
    ha_tokudb* ha;
    bool need_val;
    int direction;
    THD* thd;
    uchar* buf;
    DBT* key_to_compare;
} *SMART_DBT_BF_INFO;

// Context for index_read-style lookups: wraps smart_dbt_info with the
// original search key and the prefix-comparison result ('cmp').
typedef struct index_read_info {
    struct smart_dbt_info smart_dbt_info;
    int cmp;
    DBT* orig_key;
} *INDEX_READ_INFO;
500 
501 //
502 // smart DBT callback function for optimize
503 // in optimize, we want to flatten DB by doing
504 // a full table scan. Therefore, we don't
505 // want to actually do anything with the data, hence
506 // callback does nothing
507 //
static int smart_dbt_do_nothing(TOKUDB_UNUSED(DBT const* key),
                                TOKUDB_UNUSED(DBT const* row),
                                TOKUDB_UNUSED(void* context)) {
    // Intentionally a no-op: during optimize, the cursor walk itself is
    // the work (it flattens the dictionary), so the data is discarded.
    return 0;
}
513 
514 static int
smart_dbt_callback_rowread_ptquery(DBT const * key,DBT const * row,void * context)515 smart_dbt_callback_rowread_ptquery (DBT const *key, DBT  const *row, void *context) {
516     SMART_DBT_INFO info = (SMART_DBT_INFO)context;
517     info->ha->extract_hidden_primary_key(info->keynr, key);
518     return info->ha->read_row_callback(info->buf,info->keynr,row,key);
519 }
520 
521 //
522 // Smart DBT callback function in case where we have a covering index
523 //
smart_dbt_callback_keyread(DBT const * key,DBT TOKUDB_UNUSED (const * row),void * context)524 static int smart_dbt_callback_keyread(DBT const* key,
525                                       DBT TOKUDB_UNUSED(const* row),
526                                       void* context) {
527     SMART_DBT_INFO info = (SMART_DBT_INFO)context;
528     info->ha->extract_hidden_primary_key(info->keynr, key);
529     info->ha->read_key_only(info->buf,info->keynr,key);
530     return 0;
531 }
532 
533 //
534 // Smart DBT callback function in case where we do NOT have a covering index
535 //
536 static int
smart_dbt_callback_rowread(DBT const * key,DBT const * row,void * context)537 smart_dbt_callback_rowread(DBT const *key, DBT  const *row, void *context) {
538     int error = 0;
539     SMART_DBT_INFO info = (SMART_DBT_INFO)context;
540     info->ha->extract_hidden_primary_key(info->keynr, key);
541     error = info->ha->read_primary_key(info->buf,info->keynr,row,key);
542     return error;
543 }
544 
545 //
546 // Smart DBT callback function in case where we have a covering index
547 //
smart_dbt_callback_ir_keyread(DBT const * key,TOKUDB_UNUSED (DBT const * row),void * context)548 static int smart_dbt_callback_ir_keyread(DBT const* key,
549                                          TOKUDB_UNUSED(DBT const* row),
550                                          void* context) {
551     INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
552     ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
553     if (ir_info->cmp) {
554         return 0;
555     }
556     return smart_dbt_callback_keyread(key, row, &ir_info->smart_dbt_info);
557 }
558 
smart_dbt_callback_lookup(DBT const * key,TOKUDB_UNUSED (DBT const * row),void * context)559 static int smart_dbt_callback_lookup(DBT const* key,
560                                      TOKUDB_UNUSED(DBT const* row),
561                                      void* context) {
562     INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
563     ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
564     return 0;
565 }
566 
567 
568 //
569 // Smart DBT callback function in case where we do NOT have a covering index
570 //
571 static int
smart_dbt_callback_ir_rowread(DBT const * key,DBT const * row,void * context)572 smart_dbt_callback_ir_rowread(DBT const *key, DBT  const *row, void *context) {
573     INDEX_READ_INFO ir_info = (INDEX_READ_INFO)context;
574     ir_info->cmp = ir_info->smart_dbt_info.ha->prefix_cmp_dbts(ir_info->smart_dbt_info.keynr, ir_info->orig_key, key);
575     if (ir_info->cmp) {
576         return 0;
577     }
578     return smart_dbt_callback_rowread(key, row, &ir_info->smart_dbt_info);
579 }
580 
//
// Select the smart DBT callback: keyread when a covering index satisfies
// the query, otherwise the full rowread variant. The IR forms are the
// index_read equivalents that also check the search-prefix match.
//
#define SMART_DBT_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_keyread : smart_dbt_callback_rowread )
#define SMART_DBT_IR_CALLBACK(do_key_read) ((do_key_read) ? smart_dbt_callback_ir_keyread : smart_dbt_callback_ir_rowread )

//
// Adjust a cursor read flag when a range lock was pre-acquired: add
// DB_PRELOCKED_WRITE when holding write locks, else DB_PRELOCKED.
//
#define SET_PRELOCK_FLAG(flg) ((flg) | (range_lock_grabbed ? (use_write_locks ? DB_PRELOCKED_WRITE : DB_PRELOCKED) : 0))
593 
594 //
595 // This method retrieves the value of the auto increment column of a record in MySQL format
596 // This was basically taken from MyISAM
597 // Parameters:
598 //              type - the type of the auto increment column (e.g. int, float, double...)
599 //              offset - offset into the record where the auto increment column is stored
600 //      [in]    record - MySQL row whose auto increment value we want to extract
601 // Returns:
602 //      The value of the auto increment column in record
603 //
// See the block comment above: extracts the auto-increment column value
// (stored at 'offset' within 'record') for any supported key type,
// clamping negative signed values to zero.
static ulonglong retrieve_auto_increment(uint16 type, uint32 offset,const uchar *record)
{
    const uchar *key;     /* Key */
    ulonglong   unsigned_autoinc = 0;  /* Unsigned auto-increment */
    longlong      signed_autoinc = 0;  /* Signed auto-increment */
    enum { unsigned_type, signed_type } autoinc_type;
    float float_tmp;   /* Temporary variable */
    double double_tmp; /* Temporary variable */

    key = ((uchar *) record) + offset;

    /* Set default autoincrement type */
    autoinc_type = unsigned_type;

    /* Decode the raw bytes according to the column's key type; the
       sintNkorr/uintNkorr macros read little-endian MySQL storage. */
    switch (type) {
    case HA_KEYTYPE_INT8:
        signed_autoinc   = (longlong) *(char*)key;
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_BINARY:
        unsigned_autoinc = (ulonglong) *(uchar*) key;
        break;

    case HA_KEYTYPE_SHORT_INT:
        signed_autoinc   = (longlong) sint2korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_USHORT_INT:
        unsigned_autoinc = (ulonglong) uint2korr(key);
        break;

    case HA_KEYTYPE_LONG_INT:
        signed_autoinc   = (longlong) sint4korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_ULONG_INT:
        unsigned_autoinc = (ulonglong) uint4korr(key);
        break;

    case HA_KEYTYPE_INT24:
        signed_autoinc   = (longlong) sint3korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_UINT24:
        unsigned_autoinc = (ulonglong) tokudb_uint3korr(key);
        break;

    case HA_KEYTYPE_LONGLONG:
        signed_autoinc   = sint8korr(key);
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_ULONGLONG:
        unsigned_autoinc = uint8korr(key);
        break;

    /* The remaining two cases should not be used but are included for
       compatibility */
    case HA_KEYTYPE_FLOAT:
        float4get(&float_tmp, key);  /* Note: float4get is a macro */
        signed_autoinc   = (longlong) float_tmp;
        autoinc_type     = signed_type;
        break;

    case HA_KEYTYPE_DOUBLE:
        float8get(&double_tmp, key); /* Note: float8get is a macro */
        signed_autoinc   = (longlong) double_tmp;
        autoinc_type     = signed_type;
        break;

    default:
        assert_unreachable();
    }

    /* Auto-increment values are never negative; clamp to zero. */
    if (signed_autoinc < 0) {
        signed_autoinc = 0;
    }

    return autoinc_type == unsigned_type ?
           unsigned_autoinc : (ulonglong) signed_autoinc;
}
689 
field_offset(Field * field,TABLE * table)690 static inline ulong field_offset(Field* field, TABLE* table) {
691     return((ulong) (field->ptr - table->record[0]));
692 }
693 
tx_to_toku_iso(ulong tx_isolation)694 static inline HA_TOKU_ISO_LEVEL tx_to_toku_iso(ulong tx_isolation) {
695     if (tx_isolation == ISO_READ_UNCOMMITTED) {
696         return hatoku_iso_read_uncommitted;
697     }
698     else if (tx_isolation == ISO_READ_COMMITTED) {
699         return hatoku_iso_read_committed;
700     }
701     else if (tx_isolation == ISO_REPEATABLE_READ) {
702         return hatoku_iso_repeatable_read;
703     }
704     else {
705         return hatoku_iso_serializable;
706     }
707 }
708 
toku_iso_to_txn_flag(HA_TOKU_ISO_LEVEL lvl)709 static inline uint32_t toku_iso_to_txn_flag (HA_TOKU_ISO_LEVEL lvl) {
710     if (lvl == hatoku_iso_read_uncommitted) {
711         return DB_READ_UNCOMMITTED;
712     }
713     else if (lvl == hatoku_iso_read_committed) {
714         return DB_READ_COMMITTED;
715     }
716     else if (lvl == hatoku_iso_repeatable_read) {
717         return DB_TXN_SNAPSHOT;
718     }
719     else {
720         return 0;
721     }
722 }
723 
filter_key_part_compare(const void * left,const void * right)724 static int filter_key_part_compare (const void* left, const void* right) {
725     FILTER_KEY_PART_INFO* left_part= (FILTER_KEY_PART_INFO *)left;
726     FILTER_KEY_PART_INFO* right_part = (FILTER_KEY_PART_INFO *)right;
727     return left_part->offset - right_part->offset;
728 }
729 
730 //
731 // Be very careful with parameters passed to this function. Who knows
732 // if key, table have proper info set. I had to verify by checking
733 // in the debugger.
734 //
// Mark in 'key_filter' every table column that is fully covered by 'key'
// (so it need not be stored again in the row payload). A column counts as
// covered only when the key part holds its entire value — prefix-length
// parts of var/fix string columns are excluded; blobs are never covered.
// See the caution in the block comment above about where offsets come from.
void set_key_filter(
    MY_BITMAP* key_filter,
    KEY* key,
    TABLE* table,
    bool get_offset_from_keypart) {

    FILTER_KEY_PART_INFO parts[MAX_REF_PARTS];
    uint curr_skip_index = 0;

    // Collect each key part's record offset (from whichever source is
    // reliable for this caller) together with its position in the key.
    for (uint i = 0; i < key->user_defined_key_parts; i++) {
        //
        // horrendous hack due to bugs in mysql, basically
        // we cannot always reliably get the offset from the same source
        //
        parts[i].offset =
            get_offset_from_keypart ?
                key->key_part[i].offset :
                field_offset(key->key_part[i].field, table);
        parts[i].part_index = i;
    }
    // Sort parts by offset so they can be merged against the fields,
    // which are scanned below in record-offset order.
    qsort(
        parts, // start of array
        key->user_defined_key_parts, //num elements
        sizeof(*parts), //size of each element
        filter_key_part_compare);

    for (uint i = 0; i < table->s->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (curr_skip_index < key->user_defined_key_parts) {
            uint curr_skip_offset = 0;
            curr_skip_offset = parts[curr_skip_index].offset;
            if (curr_skip_offset == curr_field_offset) {
                //
                // we have hit a field that is a portion of the primary key
                //
                uint curr_key_index = parts[curr_skip_index].part_index;
                curr_skip_index++;
                //
                // only choose to continue over the key if the key's length matches the field's length
                // otherwise, we may have a situation where the column is a varchar(10), the
                // key is only the first 3 characters, and we end up losing the last 7 bytes of the
                // column
                //
                TOKU_TYPE toku_type = mysql_to_toku_type(field);
                switch (toku_type) {
                case toku_type_blob:
                    // Blobs are never filtered out of the row payload.
                    break;
                case toku_type_varbinary:
                case toku_type_varstring:
                case toku_type_fixbinary:
                case toku_type_fixstring:
                    if (key->key_part[curr_key_index].length == field->field_length) {
                        bitmap_set_bit(key_filter,i);
                    }
                    break;
                default:
                    // All other types are fully contained in the key part.
                    bitmap_set_bit(key_filter,i);
                    break;
                }
            }
        }
    }
}
799 
// Copy a fixed-width column of 'num_bytes' bytes from the MySQL row buffer
// into the packed TokuDB row. Returns the destination pointer advanced
// past the copied bytes. The explicit 1/2/3/4/8 cases are behaviorally
// identical to the default — presumably they exist so the compiler emits
// constant-size copies for the common widths (TODO confirm intent).
static inline uchar* pack_fixed_field(
    uchar* to_tokudb,
    const uchar* from_mysql,
    uint32_t num_bytes
    )
{
    switch (num_bytes) {
    case (1):
        memcpy(to_tokudb, from_mysql, 1);
        break;
    case (2):
        memcpy(to_tokudb, from_mysql, 2);
        break;
    case (3):
        memcpy(to_tokudb, from_mysql, 3);
        break;
    case (4):
        memcpy(to_tokudb, from_mysql, 4);
        break;
    case (8):
        memcpy(to_tokudb, from_mysql, 8);
        break;
    default:
        memcpy(to_tokudb, from_mysql, num_bytes);
        break;
    }
    return to_tokudb+num_bytes;
}
828 
unpack_fixed_field(uchar * to_mysql,const uchar * from_tokudb,uint32_t num_bytes)829 static inline const uchar* unpack_fixed_field(
830     uchar* to_mysql,
831     const uchar* from_tokudb,
832     uint32_t num_bytes
833     )
834 {
835     switch (num_bytes) {
836     case (1):
837         memcpy(to_mysql, from_tokudb, 1);
838         break;
839     case (2):
840         memcpy(to_mysql, from_tokudb, 2);
841         break;
842     case (3):
843         memcpy(to_mysql, from_tokudb, 3);
844         break;
845     case (4):
846         memcpy(to_mysql, from_tokudb, 4);
847         break;
848     case (8):
849         memcpy(to_mysql, from_tokudb, 8);
850         break;
851     default:
852         memcpy(to_mysql, from_tokudb, num_bytes);
853         break;
854     }
855     return from_tokudb+num_bytes;
856 }
857 
write_var_field(uchar * to_tokudb_offset_ptr,uchar * to_tokudb_data,uchar * to_tokudb_offset_start,const uchar * data,uint32_t data_length,uint32_t offset_bytes)858 static inline uchar* write_var_field(
859     uchar* to_tokudb_offset_ptr, //location where offset data is going to be written
860     uchar* to_tokudb_data, // location where data is going to be written
861     uchar* to_tokudb_offset_start, //location where offset starts, IS THIS A BAD NAME????
862     const uchar * data, // the data to write
863     uint32_t data_length, // length of data to write
864     uint32_t offset_bytes // number of offset bytes
865     )
866 {
867     memcpy(to_tokudb_data, data, data_length);
868     //
869     // for offset, we pack the offset where the data ENDS!
870     //
871     uint32_t offset = to_tokudb_data + data_length - to_tokudb_offset_start;
872     switch(offset_bytes) {
873     case (1):
874         to_tokudb_offset_ptr[0] = (uchar)offset;
875         break;
876     case (2):
877         int2store(to_tokudb_offset_ptr,offset);
878         break;
879     default:
880         assert_unreachable();
881         break;
882     }
883     return to_tokudb_data + data_length;
884 }
885 
get_var_data_length(const uchar * from_mysql,uint32_t mysql_length_bytes)886 static inline uint32_t get_var_data_length(
887     const uchar * from_mysql,
888     uint32_t mysql_length_bytes
889     )
890 {
891     uint32_t data_length;
892     switch(mysql_length_bytes) {
893     case(1):
894         data_length = from_mysql[0];
895         break;
896     case(2):
897         data_length = uint2korr(from_mysql);
898         break;
899     default:
900         assert_unreachable();
901     }
902     return data_length;
903 }
904 
pack_var_field(uchar * to_tokudb_offset_ptr,uchar * to_tokudb_data,uchar * to_tokudb_offset_start,const uchar * from_mysql,uint32_t mysql_length_bytes,uint32_t offset_bytes)905 static inline uchar* pack_var_field(
906     uchar* to_tokudb_offset_ptr, //location where offset data is going to be written
907     uchar* to_tokudb_data, // pointer to where tokudb data should be written
908     uchar* to_tokudb_offset_start, //location where data starts, IS THIS A BAD NAME????
909     const uchar * from_mysql, // mysql data
910     uint32_t mysql_length_bytes, //number of bytes used to store length in from_mysql
911     uint32_t offset_bytes //number of offset_bytes used in tokudb row
912     )
913 {
914     uint data_length = get_var_data_length(from_mysql, mysql_length_bytes);
915     return write_var_field(
916         to_tokudb_offset_ptr,
917         to_tokudb_data,
918         to_tokudb_offset_start,
919         from_mysql + mysql_length_bytes,
920         data_length,
921         offset_bytes
922         );
923 }
924 
unpack_var_field(uchar * to_mysql,const uchar * from_tokudb_data,uint32_t from_tokudb_data_len,uint32_t mysql_length_bytes)925 static inline void unpack_var_field(
926     uchar* to_mysql,
927     const uchar* from_tokudb_data,
928     uint32_t from_tokudb_data_len,
929     uint32_t mysql_length_bytes
930     )
931 {
932     //
933     // store the length
934     //
935     switch (mysql_length_bytes) {
936     case(1):
937         to_mysql[0] = (uchar)from_tokudb_data_len;
938         break;
939     case(2):
940         int2store(to_mysql, from_tokudb_data_len);
941         break;
942     default:
943         assert_unreachable();
944     }
945     //
946     // store the data
947     //
948     memcpy(to_mysql+mysql_length_bytes, from_tokudb_data, from_tokudb_data_len);
949 }
950 
pack_toku_field_blob(uchar * to_tokudb,const uchar * from_mysql,Field * field)951 static uchar* pack_toku_field_blob(
952     uchar* to_tokudb,
953     const uchar* from_mysql,
954     Field* field
955     )
956 {
957     uint32_t len_bytes = field->row_pack_length();
958     uint32_t length = 0;
959     uchar* data_ptr = NULL;
960     memcpy(to_tokudb, from_mysql, len_bytes);
961 
962     switch (len_bytes) {
963     case (1):
964         length = (uint32_t)(*from_mysql);
965         break;
966     case (2):
967         length = uint2korr(from_mysql);
968         break;
969     case (3):
970         length = tokudb_uint3korr(from_mysql);
971         break;
972     case (4):
973         length = uint4korr(from_mysql);
974         break;
975     default:
976         assert_unreachable();
977     }
978 
979     if (length > 0) {
980         memcpy((uchar *)(&data_ptr), from_mysql + len_bytes, sizeof(uchar*));
981         memcpy(to_tokudb + len_bytes, data_ptr, length);
982     }
983     return (to_tokudb + len_bytes + length);
984 }
985 
create_tokudb_trx_data_instance(tokudb_trx_data ** out_trx)986 static int create_tokudb_trx_data_instance(tokudb_trx_data** out_trx) {
987     int error;
988     tokudb_trx_data* trx = (tokudb_trx_data *) tokudb::memory::malloc(
989         sizeof(*trx),
990         MYF(MY_ZEROFILL));
991     if (!trx) {
992         error = ENOMEM;
993         goto cleanup;
994     }
995 
996     *out_trx = trx;
997     error = 0;
998 cleanup:
999     return error;
1000 }
1001 
// Generate the destination (key,val) pair for dictionary dest_db from a
// primary row (src_key, src_val), driven by the packing descriptor stored
// in dest_db's DB descriptor.  This is the workhorse behind the generate-
// row callbacks used for puts and deletes.
// Parameters:
//      [in]    dest_db - destination dictionary; its descriptor describes
//                        how to pack the key (and clustering val)
//              src_db - unused
//      [out]   dest_key - packed destination key (buffer managed via
//                         DB_DBT_REALLOC and reused across calls)
//      [out]   dest_val - packed destination val; NULL for deletes
//      [in]    src_key, src_val - the source primary row
// Returns:
//      0 (packing problems are handled with internal assertions)
static inline int tokudb_generate_row(DB* dest_db,
                                      TOKUDB_UNUSED(DB* src_db),
                                      DBT* dest_key,
                                      DBT* dest_val,
                                      const DBT* src_key,
                                      const DBT* src_val) {
    int error;

    DB* curr_db = dest_db;
    uchar* row_desc = NULL;
    uint32_t desc_size;
    uchar* buff = NULL;
    uint32_t max_key_len = 0;

    // The descriptor blob begins with a 4-byte size of its first section;
    // skip that section to reach the key-packing descriptor.  Each section
    // is <4-byte total size><payload>, so subtract the size prefix itself.
    row_desc = (uchar *)curr_db->descriptor->dbt.data;
    row_desc += (*(uint32_t *)row_desc);
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;

    if (is_key_pk(row_desc)) {
        // Destination is the primary dictionary: the row passes through
        // unchanged, so alias the source buffers instead of copying.  Any
        // buffer we previously owned via DB_DBT_REALLOC must be freed
        // first or it would leak.
        if (dest_key->flags == DB_DBT_REALLOC && dest_key->data != NULL) {
            free(dest_key->data);
        }
        if (dest_val != NULL) {
            if (dest_val->flags == DB_DBT_REALLOC && dest_val->data != NULL) {
                free(dest_val->data);
            }
        }
        // flags==0 marks the DBT as borrowing memory it does not own
        dest_key->data = src_key->data;
        dest_key->size = src_key->size;
        dest_key->flags = 0;
        if (dest_val != NULL) {
            dest_val->data = src_val->data;
            dest_val->size = src_val->size;
            dest_val->flags = 0;
        }
        error = 0;
        goto cleanup;
    }
    // at this point, we need to create the key/val and set it
    // in the DBTs
    // a zeroed DBT is promoted to DB_DBT_REALLOC so its buffer can be
    // grown here and reused on subsequent calls
    if (dest_key->flags == 0) {
        dest_key->ulen = 0;
        dest_key->size = 0;
        dest_key->data = NULL;
        dest_key->flags = DB_DBT_REALLOC;
    }
    if (dest_key->flags == DB_DBT_REALLOC) {
        // worst-case key size: descriptor-derived bound plus the whole
        // source key (a secondary key may embed the primary key)
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;

        if (max_key_len > dest_key->ulen) {
            void* old_ptr = dest_key->data;
            void* new_ptr = NULL;
            new_ptr = realloc(old_ptr, max_key_len);
            assert_always(new_ptr);
            dest_key->data = new_ptr;
            dest_key->ulen = max_key_len;
        }

        buff = (uchar *)dest_key->data;
        assert_always(buff != nullptr);
        assert_always(max_key_len > 0);
    } else {
        assert_unreachable();
    }

    dest_key->size = pack_key_from_desc(buff, row_desc, desc_size, src_key,
                                        src_val);
    assert_always(dest_key->ulen >= dest_key->size);
    // under TOKUDB_DEBUG_CHECK_KEY, recompute the bound even when the
    // buffer did not need to grow, so the sanity check below always fires
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY)) &&
        !max_key_len) {
        max_key_len = max_key_size_from_desc(row_desc, desc_size);
        max_key_len += src_key->size;
    }
    if (max_key_len) {
        assert_always(max_key_len >= dest_key->size);
    }

    // advance to the next descriptor section, which describes the val
    row_desc += desc_size;
    desc_size = (*(uint32_t *)row_desc) - 4;
    row_desc += 4;
    if (dest_val != NULL) {
        // NOTE(review): is_key_clustering is passed the descriptor SIZE
        // here, not the descriptor pointer -- apparently an empty val
        // descriptor means "not clustering"; confirm against its
        // definition in hatoku_cmp
        if (!is_key_clustering(desc_size) || src_val->size == 0) {
            dest_val->size = 0;
        } else {
            uchar* buff = NULL;
            if (dest_val->flags == 0) {
                dest_val->ulen = 0;
                dest_val->size = 0;
                dest_val->data = NULL;
                dest_val->flags = DB_DBT_REALLOC;
            }
            if (dest_val->flags == DB_DBT_REALLOC){
                // a clustering val never exceeds the source val's size,
                // so src_val->size is a sufficient buffer bound
                if (dest_val->ulen < src_val->size) {
                    void* old_ptr = dest_val->data;
                    void* new_ptr = NULL;
                    new_ptr = realloc(old_ptr, src_val->size);
                    assert_always(new_ptr);
                    dest_val->data = new_ptr;
                    dest_val->ulen = src_val->size;
                }
                buff = (uchar *)dest_val->data;
                assert_always(buff != NULL);
            } else {
                assert_unreachable();
            }
            dest_val->size = pack_clustering_val_from_desc(
                buff,
                row_desc,
                desc_size,
                src_val);
            assert_always(dest_val->ulen >= dest_val->size);
        }
    }
    error = 0;
cleanup:
    return error;
}
1121 
generate_row_for_del(DB * dest_db,DB * src_db,DBT_ARRAY * dest_key_arrays,const DBT * src_key,const DBT * src_val)1122 static int generate_row_for_del(
1123     DB *dest_db,
1124     DB *src_db,
1125     DBT_ARRAY *dest_key_arrays,
1126     const DBT *src_key,
1127     const DBT *src_val
1128     )
1129 {
1130     DBT* dest_key = &dest_key_arrays->dbts[0];
1131     return tokudb_generate_row(
1132         dest_db,
1133         src_db,
1134         dest_key,
1135         NULL,
1136         src_key,
1137         src_val
1138         );
1139 }
1140 
1141 
generate_row_for_put(DB * dest_db,DB * src_db,DBT_ARRAY * dest_key_arrays,DBT_ARRAY * dest_val_arrays,const DBT * src_key,const DBT * src_val)1142 static int generate_row_for_put(
1143     DB *dest_db,
1144     DB *src_db,
1145     DBT_ARRAY *dest_key_arrays,
1146     DBT_ARRAY *dest_val_arrays,
1147     const DBT *src_key,
1148     const DBT *src_val
1149     )
1150 {
1151     DBT* dest_key = &dest_key_arrays->dbts[0];
1152     DBT *dest_val = (dest_val_arrays == NULL) ? NULL : &dest_val_arrays->dbts[0];
1153     return tokudb_generate_row(
1154         dest_db,
1155         src_db,
1156         dest_key,
1157         dest_val,
1158         src_key,
1159         src_val
1160         );
1161 }
1162 
// Handler constructor: puts every member into its "no table open" state.
// Real per-table setup happens later in open()/initialize_share().
ha_tokudb::ha_tokudb(handlerton * hton, TABLE_SHARE * table_arg):handler(hton, table_arg) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    share = NULL;
    // capability flags advertised to the MySQL server layer
    int_table_flags = HA_REC_NOT_IN_SEQ  | HA_NULL_IN_KEY | HA_CAN_INDEX_BLOBS
        | HA_PRIMARY_KEY_IN_READ_INDEX | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION
        | HA_FILE_BASED | HA_AUTO_PART_KEY | HA_TABLE_SCAN_ON_INDEX
        | HA_CAN_WRITE_DURING_OPTIMIZE | HA_ONLINE_ANALYZE;
    // row buffers / transaction / cursor state
    alloc_ptr = NULL;
    rec_buff = NULL;
    rec_update_buff = NULL;
    transaction = NULL;
    cursor = NULL;
    // per-query column-read bookkeeping (which columns must be unpacked)
    fixed_cols_for_query = NULL;
    var_cols_for_query = NULL;
    num_fixed_cols_for_query = 0;
    num_var_cols_for_query = 0;
    unpack_entire_row = true;
    read_blobs = false;
    read_key = false;
    // statement row counters and error/lock state
    added_rows = 0;
    deleted_rows = 0;
    updated_rows = 0;
    last_dup_key = UINT_MAX;
    using_ignore = false;
    using_ignore_no_key = false;
    last_cursor_error = 0;
    range_lock_grabbed = false;
    blob_buff = NULL;
    num_blob_bytes = 0;
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    // DBT arrays used for multi-dictionary put/del operations; zero them
    // before init so destruction of a partially built handler is safe
    memset(mult_key_dbt_array, 0, sizeof(mult_key_dbt_array));
    memset(mult_rec_dbt_array, 0, sizeof(mult_rec_dbt_array));
    for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_key_dbt_array[i], 1);
    }
    for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
        toku_dbt_array_init(&mult_rec_dbt_array[i], 1);
    }
    loader = NULL;
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    lock.type = TL_IGNORE;
    // per-dictionary put/del flags: one slot per possible key plus one
    // for the hidden primary key
    for (uint32_t i = 0; i < MAX_KEY+1; i++) {
        mult_put_flags[i] = 0;
        mult_del_flags[i] = DB_DELETE_ANY;
        mult_dbt_flags[i] = DB_DBT_REALLOC;
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    use_write_locks = false;
    // bulk-fetch (range query) buffer state
    range_query_buff = NULL;
    size_range_query_buff = 0;
    bytes_used_in_range_query_buff = 0;
    curr_range_query_buff_offset = 0;
    doing_bulk_fetch = false;
    prelocked_left_range_size = 0;
    prelocked_right_range_size = 0;
    tokudb_active_index = MAX_KEY;
    invalidate_icp();
    trx_handler_list.data = this;
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    in_rpl_write_rows = in_rpl_delete_rows = in_rpl_update_rows = false;
#endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
1229 
~ha_tokudb()1230 ha_tokudb::~ha_tokudb() {
1231     TOKUDB_HANDLER_DBUG_ENTER("");
1232     for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
1233         toku_dbt_array_destroy(&mult_key_dbt_array[i]);
1234     }
1235     for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
1236         toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
1237     }
1238     TOKUDB_HANDLER_DBUG_VOID_RETURN;
1239 }
1240 
1241 //
1242 // states if table has an auto increment column, if so, sets index where auto inc column is to index
1243 // Parameters:
1244 //      [out]   index - if auto inc exists, then this param is set to where it exists in table, if not, then unchanged
1245 // Returns:
1246 //      true if auto inc column exists, false otherwise
1247 //
has_auto_increment_flag(uint * index)1248 bool ha_tokudb::has_auto_increment_flag(uint* index) {
1249     //
1250     // check to see if we have auto increment field
1251     //
1252     bool ai_found = false;
1253     uint ai_index = 0;
1254     for (uint i = 0; i < table_share->fields; i++, ai_index++) {
1255         Field* field = table->field[i];
1256         if (field->flags & AUTO_INCREMENT_FLAG) {
1257             ai_found = true;
1258             *index = ai_index;
1259             break;
1260         }
1261     }
1262     return ai_found;
1263 }
1264 
open_status_dictionary(DB ** ptr,const char * name,DB_TXN * txn)1265 static int open_status_dictionary(DB** ptr, const char* name, DB_TXN* txn) {
1266     int error;
1267     char* newname = NULL;
1268     size_t newname_len = get_max_dict_name_path_length(name);
1269     newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
1270     if (newname == NULL) {
1271         error = ENOMEM;
1272         goto cleanup;
1273     }
1274     make_name(newname, newname_len, name, "status");
1275     TOKUDB_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "open:%s", newname);
1276 
1277     error = tokudb::metadata::open(db_env, ptr, newname, txn);
1278 cleanup:
1279     tokudb::memory::free(newname);
1280     return error;
1281 }
1282 
//
// Create and open the DB handle for the table's "main" (primary row)
// dictionary, storing it in share->file and share->key_file[primary_key].
// Parameters:
//      [in]    name - canonical table name used to build the dictionary path
//              is_read_only - open the dictionary read-only when true
//      [in]    txn - transaction under which the open runs
// Returns:
//      0 on success; ENOMEM or an engine error otherwise.  On failure any
//      partially opened handle is closed and the share pointers are reset.
//
int ha_tokudb::open_main_dictionary(
    const char* name,
    bool is_read_only,
    DB_TXN* txn) {

    int error;
    char* newname = NULL;
    size_t newname_len = 0;
    uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;

    // the main dictionary must not already be open on this share
    assert_always(share->file == NULL);
    assert_always(share->key_file[primary_key] == NULL);
    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(
        newname_len,
        MYF(MY_WME|MY_ZEROFILL));
    if (newname == NULL) {
        error = ENOMEM;
        goto exit;
    }
    make_name(newname, newname_len, name, "main");

    error = db_create(&share->file, db_env, 0);
    if (error) {
        goto exit;
    }
    // the primary key's key_file aliases the main dictionary handle
    share->key_file[primary_key] = share->file;

    error =
        share->file->open(
            share->file,
            txn,
            newname,
            NULL,
            DB_BTREE,
            open_flags,
            is_read_only ? 0 : S_IWUSR);
    if (error) {
        goto exit;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "open:%s:file=%p",
        newname,
        share->file);

    error = 0;
exit:
    if (error) {
        // undo a partial open: close the handle and clear both aliases
        if (share->file) {
            int r = share->file->close(
                share->file,
                0
                );
            assert_always(r==0);
            share->file = NULL;
            share->key_file[primary_key] = NULL;
        }
    }
    tokudb::memory::free(newname);
    return error;
}
1346 
1347 //
1348 // Open a secondary table, the key will be a secondary index, the data will
1349 // be a primary key
1350 //
open_secondary_dictionary(DB ** ptr,KEY * key_info,const char * name,bool is_read_only,DB_TXN * txn)1351 int ha_tokudb::open_secondary_dictionary(
1352     DB** ptr,
1353     KEY* key_info,
1354     const char* name,
1355     bool is_read_only,
1356     DB_TXN* txn) {
1357 
1358     int error = ENOSYS;
1359     char dict_name[MAX_DICT_NAME_LEN];
1360     uint open_flags = (is_read_only ? DB_RDONLY : 0) | DB_THREAD;
1361     char* newname = NULL;
1362     size_t newname_len = 0;
1363 
1364     sprintf(dict_name, "key-%s", key_info->name);
1365 
1366     newname_len = get_max_dict_name_path_length(name);
1367     newname =
1368         (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME|MY_ZEROFILL));
1369     if (newname == NULL) {
1370         error = ENOMEM;
1371         goto cleanup;
1372     }
1373     make_name(newname, newname_len, name, dict_name);
1374 
1375 
1376     if ((error = db_create(ptr, db_env, 0))) {
1377         set_my_errno(error);
1378         goto cleanup;
1379     }
1380 
1381 
1382     error = (*ptr)->open(*ptr, txn, newname, NULL, DB_BTREE, open_flags, is_read_only ? 0 : S_IWUSR);
1383     if (error) {
1384         set_my_errno(error);
1385         goto cleanup;
1386     }
1387     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
1388         TOKUDB_DEBUG_OPEN,
1389         "open:%s:file=%p",
1390         newname,
1391         *ptr);
1392 cleanup:
1393     if (error) {
1394         if (*ptr) {
1395             int r = (*ptr)->close(*ptr, 0);
1396             assert_always(r==0);
1397             *ptr = NULL;
1398         }
1399     }
1400     tokudb::memory::free(newname);
1401     return error;
1402 }
1403 
// Build the per-column packing info (cp_info) for dictionary keynr plus its
// multi-column summary (mcp_info).  For every column NOT filtered out of
// this dictionary's row, record either its byte offset among the fixed
// fields (offsets count from AFTER the null bytes) or its ordinal among the
// variable-length fields.
// Returns 0 on success, ENOMEM if the cp_info allocation fails.
static int initialize_col_pack_info(KEY_AND_COL_INFO* kc_info, TABLE_SHARE* table_share, uint keynr) {
    int error = ENOSYS;
    //
    // set up the cp_info
    //
    assert_always(kc_info->cp_info[keynr] == NULL);
    kc_info->cp_info[keynr] = (COL_PACK_INFO*)tokudb::memory::malloc(
        table_share->fields * sizeof(COL_PACK_INFO),
        MYF(MY_WME | MY_ZEROFILL));
    if (kc_info->cp_info[keynr] == NULL) {
        error = ENOMEM;
        goto exit;
    }
    // extra brace scope so the goto above does not jump over the
    // initializations below
    {
    uint32_t curr_fixed_offset = 0;
    uint32_t curr_var_index = 0;
    for (uint j = 0; j < table_share->fields; j++) {
        COL_PACK_INFO* curr = &kc_info->cp_info[keynr][j];
        //
        // need to set the offsets / indexes
        // offsets are calculated AFTER the NULL bytes
        //
        // columns set in the key filter live in the key itself, so they
        // get no slot in the packed row
        if (!bitmap_is_set(&kc_info->key_filters[keynr],j)) {
            if (is_fixed_field(kc_info, j)) {
                curr->col_pack_val = curr_fixed_offset;
                curr_fixed_offset += kc_info->field_lengths[j];
            }
            else if (is_variable_field(kc_info, j)) {
                curr->col_pack_val = curr_var_index;
                curr_var_index++;
            }
        }
    }

    //
    // set up the mcp_info
    //
    kc_info->mcp_info[keynr].fixed_field_size = get_fixed_field_size(
        kc_info,
        table_share,
        keynr
        );
    kc_info->mcp_info[keynr].len_of_offsets = get_len_of_offsets(
        kc_info,
        table_share,
        keynr
        );

    error = 0;
    }
exit:
    return error;
}
1457 
1458 // reset the kc_info state at keynr
reset_key_and_col_info(KEY_AND_COL_INFO * kc_info,uint keynr)1459 static void reset_key_and_col_info(KEY_AND_COL_INFO *kc_info, uint keynr) {
1460     bitmap_clear_all(&kc_info->key_filters[keynr]);
1461     tokudb::memory::free(kc_info->cp_info[keynr]);
1462     kc_info->cp_info[keynr] = NULL;
1463     kc_info->mcp_info[keynr] = (MULTI_COL_PACK_INFO) { 0, 0 };
1464 }
1465 
// Populate kc_info from the table definition: classify every column as
// fixed / variable / blob, record per-column lengths and length-prefix
// sizes, pick the offset width used by the packed row format, then build
// the key filters and column pack info for the primary key and every
// clustering key.
// Parameters:
//      [in]    table_share, table - the table being opened
//      [in/out] kc_info - receives the computed packing metadata
//      [in]    hidden_primary_key - nonzero when the table has no declared PK
//              primary_key - index number of the primary key
// Returns 0 on success or an error from initialize_col_pack_info.
static int initialize_key_and_col_info(
    TABLE_SHARE* table_share,
    TABLE* table,
    KEY_AND_COL_INFO* kc_info,
    uint hidden_primary_key,
    uint primary_key) {

    int error = 0;
    uint32_t curr_blob_field_index = 0;
    uint32_t max_var_bytes = 0;
    //
    // fill in the field lengths. 0 means it is a variable sized field length
    // fill in length_bytes, 0 means it is fixed or blob
    //
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table_share->field[i];
        TOKU_TYPE toku_type = mysql_to_toku_type(field);
        uint32 pack_length = 0;
        switch (toku_type) {
        case toku_type_int:
        case toku_type_double:
        case toku_type_float:
        case toku_type_fixbinary:
        case toku_type_fixstring:
            // fixed-width column: store its exact packed length
            pack_length = field->pack_length();
            assert_always(pack_length < 1<<16);
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_FIXED_FIELD;
            kc_info->field_lengths[i] = (uint16_t)pack_length;
            kc_info->length_bytes[i] = 0;
            break;
        case toku_type_blob:
            // blobs are tracked separately via blob_fields[]
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_BLOB_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] = 0;
            kc_info->blob_fields[curr_blob_field_index] = i;
            curr_blob_field_index++;
            break;
        case toku_type_varstring:
        case toku_type_varbinary:
            // var-width column: remember its MySQL length-prefix size and
            // accumulate the worst-case payload for offset-width selection
            kc_info->field_types[i] = KEY_AND_COL_INFO::TOKUDB_VARIABLE_FIELD;
            kc_info->field_lengths[i] = 0;
            kc_info->length_bytes[i] =
                (uchar)((Field_varstring*)field)->length_bytes;
            max_var_bytes += field->field_length;
            break;
        default:
            assert_unreachable();
        }
    }
    kc_info->num_blobs = curr_blob_field_index;

    //
    // initialize share->num_offset_bytes
    // because MAX_REF_LENGTH is 65536, we
    // can safely set num_offset_bytes to 1 or 2
    //
    if (max_var_bytes < 256) {
        kc_info->num_offset_bytes = 1;
    } else {
        kc_info->num_offset_bytes = 2;
    }

    // one extra dictionary exists when the PK is hidden
    for (uint i = 0;
         i < table_share->keys + tokudb_test(hidden_primary_key);
         i++) {
        //
        // do the cluster/primary key filtering calculations
        //
        // a hidden PK has no declared columns, so nothing to filter there
        if (!(i==primary_key && hidden_primary_key)) {
            if (i == primary_key) {
                set_key_filter(
                    &kc_info->key_filters[primary_key],
                    &table_share->key_info[primary_key],
                    table,
                    true);
            } else {
                // secondary keys also filter out the PK columns, since the
                // PK is appended to every secondary key
                set_key_filter(
                    &kc_info->key_filters[i],
                    &table_share->key_info[i],
                    table,
                    true);
                if (!hidden_primary_key) {
                    set_key_filter(
                        &kc_info->key_filters[i],
                        &table_share->key_info[primary_key],
                        table,
                        true);
                }
            }
        }
        // only dictionaries that store row data need column pack info
        if (i == primary_key || key_is_clustering(&table_share->key_info[i])) {
            error = initialize_col_pack_info(kc_info, table_share, i);
            if (error) {
                goto exit;
            }
        }
    }
exit:
    return error;
}
1566 
//
// One-time initialization of the TOKUDB_SHARE for this table: verifies the
// on-disk version and frm data, builds packing metadata, opens the main and
// secondary dictionaries, computes ref_length, and loads row-count,
// auto-increment and cardinality state.
// Parameters:
//      [in]    name - canonical table name
//              mode - open mode; O_RDONLY opens dictionaries read-only
// Returns:
//      0 on success, ENOSYS on version mismatch, or another engine error
//
int ha_tokudb::initialize_share(const char* name, int mode) {

    int error = 0;
    uint64_t num_rows = 0;
    DB_TXN* txn = NULL;
    bool do_commit = false;
    THD* thd = ha_thd();
    // during CREATE TABLE, reuse the statement's sub transaction;
    // otherwise run the whole initialization under a private txn
    tokudb_trx_data *trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
    if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
        txn = trx->sub_sp_level;
    }
    else {
        do_commit = true;
        error = txn_begin(db_env, 0, &txn, 0, thd);
        if (error) { goto exit; }
    }


    error = get_status(txn);
    if (error) {
        goto exit;
    }
    // refuse to open tables written by an incompatible engine version
    if (share->version != HA_TOKU_VERSION) {
        error = ENOSYS;
        goto exit;
    }

#if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
#if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
    // verify frm data for non-partitioned tables
    if (table->part_info == NULL) {
        error = verify_frm_data(table->s->path.str, txn);
        if (error)
            goto exit;
    } else {
        // remove the frm data for partitions since we are not maintaining it
        error = remove_frm_data(share->status_block, txn);
        if (error)
            goto exit;
    }
#else
    error = verify_frm_data(table->s->path.str, txn);
    if (error)
        goto exit;
#endif  // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
#endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA

    // build the column/key packing metadata used by the row format
    error =
        initialize_key_and_col_info(
            table_share,
            table,
            &share->kc_info,
            hidden_primary_key,
            primary_key);
    if (error) { goto exit; }


    error = open_main_dictionary(name, mode == O_RDONLY, txn);
    if (error) {
        goto exit;
    }

    // record per-key descriptors (name, uniqueness, part count) and open
    // every secondary dictionary
    share->has_unique_keys = false;
    share->_keys = table_share->keys;
    share->_max_key_parts = table_share->key_parts;
    share->_key_descriptors =
        (TOKUDB_SHARE::key_descriptor_t*)tokudb::memory::malloc(
            sizeof(TOKUDB_SHARE::key_descriptor_t) * share->_keys,
            MYF(MY_ZEROFILL));

    /* Open other keys;  These are part of the share structure */
    for (uint i = 0; i < table_share->keys; i++) {
        share->_key_descriptors[i]._parts =
            table_share->key_info[i].user_defined_key_parts;
        if (i == primary_key) {
            share->_key_descriptors[i]._is_unique = true;
            share->_key_descriptors[i]._name =
                tokudb::memory::strdup("primary", 0);
        } else {
            share->_key_descriptors[i]._is_unique = false;
            share->_key_descriptors[i]._name =
                tokudb::memory::strdup(table_share->key_info[i].name, 0);
        }

        if (table_share->key_info[i].flags & HA_NOSAME) {
            share->_key_descriptors[i]._is_unique = true;
            share->has_unique_keys = true;
        }
        if (i != primary_key) {
            error =
                open_secondary_dictionary(
                    &share->key_file[i],
                    &table_share->key_info[i],
                    name,
                    mode == O_RDONLY,
                    txn);
            if (error) {
                goto exit;
            }
        }
    }

    share->pk_has_string = false;
    if (!hidden_primary_key) {
        //
        // We need to set the ref_length to start at 5, to account for
        // the "infinity byte" in keys, and for placing the DBT size in the
        // first four bytes
        //
        ref_length = sizeof(uint32_t) + sizeof(uchar);
        KEY_PART_INFO* key_part = table->key_info[primary_key].key_part;
        KEY_PART_INFO* end =
            key_part + table->key_info[primary_key].user_defined_key_parts;
        for (; key_part != end; key_part++) {
            // each PK part contributes its packed length plus a 1- or
            // 2-byte length prefix
            uint field_length = key_part->field->pack_length();
            field_length += (field_length > 255 ? 2 : 1);
            ref_length += field_length;
            TOKU_TYPE toku_type = mysql_to_toku_type(key_part->field);
            if (toku_type == toku_type_fixstring ||
                toku_type == toku_type_varstring ||
                toku_type == toku_type_blob
                )
            {
                share->pk_has_string = true;
            }
        }
        share->status |= STATUS_PRIMARY_KEY_INIT;
    }
    share->ref_length = ref_length;

    error = estimate_num_rows(share->file, &num_rows, txn);
    //
    // estimate_num_rows should not fail under normal conditions
    //
    if (error == 0) {
        share->set_row_count(num_rows, true);
    } else {
        goto exit;
    }
    //
    // initialize auto increment data
    //
    share->has_auto_inc = has_auto_increment_flag(&share->ai_field_index);
    if (share->has_auto_inc) {
        init_auto_increment();
    }

    // an empty table is eligible for the bulk-load table lock optimization
    if (may_table_be_empty(txn)) {
        share->try_table_lock = true;
    } else {
        share->try_table_lock = false;
    }

    share->num_DBs = table_share->keys + tokudb_test(hidden_primary_key);

    init_hidden_prim_key_info(txn);

    // initialize cardinality info from the status dictionary
    {
        uint32_t rec_per_keys = tokudb::compute_total_key_parts(table_share);
        uint64_t* rec_per_key =
            (uint64_t*)tokudb::memory::malloc(
                rec_per_keys * sizeof(uint64_t),
                MYF(MY_FAE));
        error =
            tokudb::get_card_from_status(
                share->status_block,
                txn,
                rec_per_keys,
                rec_per_key);
        if (error) {
            // missing/unreadable cardinality data is not fatal; fall back
            // to all-zero counts
            memset(rec_per_key, 0, sizeof(ulonglong) * rec_per_keys);
        }
        share->init_cardinality_counts(rec_per_keys, rec_per_key);
    }

    error = 0;
exit:
    if (do_commit && txn) {
        commit_txn(txn,0);
    }
    return error;
}
1750 
1751 //
1752 // Creates and opens a handle to a table which already exists in a tokudb
1753 // database.
1754 // Parameters:
1755 //      [in]   name - table name
1756 //             mode - seems to specify if table is read only
1757 //             test_if_locked - unused
1758 // Returns:
1759 //      0 on success
1760 //      1 on error
1761 //
int ha_tokudb::open(const char *name, int mode, uint test_if_locked) {
    TOKUDB_HANDLER_DBUG_ENTER("%s %o %u", name, mode, test_if_locked);
    THD* thd = ha_thd();

    int error = 0;
    int ret_val = 0;

    transaction = NULL;
    cursor = NULL;

    /* Open primary key */
    hidden_primary_key = 0;
    if ((primary_key = table_share->primary_key) >= MAX_KEY) {
        // No primary key declared: fall back to a hidden auto-generated
        // primary key. ref_length covers the hidden key plus the 4-byte
        // DBT size stored ahead of it.
        primary_key = table_share->keys;
        key_used_on_scan = MAX_KEY;
        hidden_primary_key = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        ref_length = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t);
    }
    else {
        key_used_on_scan = primary_key;
    }

    /* Need some extra memory in case of packed keys */
    // the "+ 1" is for the first byte that states +/- infinity
    // multiply everything by 2 to account for clustered keys having a key and primary key together
    max_key_length = 2*(table_share->max_key_length + MAX_REF_PARTS * 3 + sizeof(uchar));
    // One multi_malloc for all per-handler key/scan buffers; the whole
    // group is freed as a unit through alloc_ptr.
    alloc_ptr = tokudb::memory::multi_malloc(
        MYF(MY_WME),
        &key_buff, max_key_length,
        &key_buff2, max_key_length,
        &key_buff3, max_key_length,
        &key_buff4, max_key_length,
        &prelocked_left_range, max_key_length,
        &prelocked_right_range, max_key_length,
        &primary_key_buff, (hidden_primary_key ? 0 : max_key_length),
        &fixed_cols_for_query, table_share->fields*sizeof(uint32_t),
        &var_cols_for_query, table_share->fields*sizeof(uint32_t),
        NullS);
    if (alloc_ptr == NULL) {
        ret_val = 1;
        goto exit;
    }

    // Buffer for bulk-fetch range queries; size comes from the session
    // variable tokudb_read_buf_size.
    size_range_query_buff = tokudb::sysvars::read_buf_size(thd);
    range_query_buff =
        (uchar*)tokudb::memory::malloc(size_range_query_buff, MYF(MY_WME));
    if (range_query_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_rec_buff_length = table_share->rec_buff_length +
        table_share->fields;
    rec_buff = (uchar *) tokudb::memory::malloc(
        alloced_rec_buff_length,
        MYF(MY_WME));
    if (rec_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    alloced_update_rec_buff_length = alloced_rec_buff_length;
    rec_update_buff = (uchar*)tokudb::memory::malloc(
        alloced_update_rec_buff_length,
        MYF(MY_WME));
    if (rec_update_buff == NULL) {
        ret_val = 1;
        goto exit;
    }

    // lookup or create share
    share = TOKUDB_SHARE::get_share(name, &lock, true);
    assert_always(share);

    if (share->state() != TOKUDB_SHARE::OPENED) {
        // means we're responsible for the transition to OPENED, ERROR or CLOSED

        ret_val = allocate_key_and_col_info(table_share, &share->kc_info);
        if (ret_val == 0) {
            ret_val = initialize_share(name, mode);
        }

        if (ret_val == 0) {
            share->set_state(TOKUDB_SHARE::OPENED);
        } else {
            free_key_and_col_info(&share->kc_info);
            share->set_state(TOKUDB_SHARE::ERROR);
        }
        share->unlock();
    } else {
        // got an already OPENED instance
        share->unlock();
    }

    if (share->state() == TOKUDB_SHARE::ERROR) {
        // Initialization failed (above, or previously by another opener);
        // drop our reference and report the failure through ret_val.
        share->release();
        goto exit;
    }

    assert_always(share->state() == TOKUDB_SHARE::OPENED);

    ref_length = share->ref_length;     // If second open

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_OPEN,
        "tokudbopen:%p:share=%p:file=%p:table=%p:table->s=%p:%d",
        this,
        share,
        share->file,
        table,
        table->s,
        share->use_count());

    key_read = false;
    stats.block_size = 1<<20;    // QQQ Tokudb DB block size

    // Prime handler statistics (row counts, constants) from the share.
    info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

exit:
    if (ret_val) {
        // Failure path: release everything allocated above. The free
        // helpers accept NULL, so partially-initialized state is fine.
        tokudb::memory::free(range_query_buff);
        range_query_buff = NULL;
        tokudb::memory::free(alloc_ptr);
        alloc_ptr = NULL;
        tokudb::memory::free(rec_buff);
        rec_buff = NULL;
        tokudb::memory::free(rec_update_buff);
        rec_update_buff = NULL;

        if (error) {
            set_my_errno(error);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(ret_val);
}
1898 
1899 //
1900 // estimate the number of rows in a DB
1901 // Parameters:
1902 //      [in]    db - DB whose number of rows will be estimated
1903 //      [out]   num_rows - number of estimated rows in db
1904 // Returns:
1905 //      0 on success
1906 //      error otherwise
1907 //
estimate_num_rows(DB * db,uint64_t * num_rows,DB_TXN * txn)1908 int ha_tokudb::estimate_num_rows(DB* db, uint64_t* num_rows, DB_TXN* txn) {
1909     int error = ENOSYS;
1910     bool do_commit = false;
1911     DB_BTREE_STAT64 dict_stats;
1912     DB_TXN* txn_to_use = NULL;
1913 
1914     if (txn == NULL) {
1915         error = txn_begin(db_env, 0, &txn_to_use, DB_READ_UNCOMMITTED, ha_thd());
1916         if (error) goto cleanup;
1917         do_commit = true;
1918     }
1919     else {
1920         txn_to_use = txn;
1921     }
1922 
1923     error = db->stat64(db, txn_to_use, &dict_stats);
1924     if (error) { goto cleanup; }
1925 
1926     *num_rows = dict_stats.bt_ndata;
1927     error = 0;
1928 cleanup:
1929     if (do_commit) {
1930         commit_txn(txn_to_use, 0);
1931         txn_to_use = NULL;
1932     }
1933     return error;
1934 }
1935 
1936 
write_to_status(DB * db,HA_METADATA_KEY curr_key_data,void * data,uint size,DB_TXN * txn)1937 int ha_tokudb::write_to_status(DB* db, HA_METADATA_KEY curr_key_data, void* data, uint size, DB_TXN* txn ){
1938     return write_metadata(db, &curr_key_data, sizeof curr_key_data, data, size, txn);
1939 }
1940 
remove_from_status(DB * db,HA_METADATA_KEY curr_key_data,DB_TXN * txn)1941 int ha_tokudb::remove_from_status(DB *db, HA_METADATA_KEY curr_key_data, DB_TXN *txn) {
1942     return remove_metadata(db, &curr_key_data, sizeof curr_key_data, txn);
1943 }
1944 
remove_metadata(DB * db,void * key_data,uint key_size,DB_TXN * transaction)1945 int ha_tokudb::remove_metadata(DB* db, void* key_data, uint key_size, DB_TXN* transaction){
1946     int error;
1947     DBT key;
1948     DB_TXN* txn = NULL;
1949     bool do_commit = false;
1950     //
1951     // transaction to be used for putting metadata into status.tokudb
1952     //
1953     if (transaction == NULL) {
1954         error = txn_begin(db_env, 0, &txn, 0, ha_thd());
1955         if (error) {
1956             goto cleanup;
1957         }
1958         do_commit = true;
1959     }
1960     else {
1961         txn = transaction;
1962     }
1963 
1964     memset(&key, 0, sizeof(key));
1965     key.data = key_data;
1966     key.size = key_size;
1967     error = db->del(db, txn, &key, DB_DELETE_ANY);
1968     if (error) {
1969         goto cleanup;
1970     }
1971 
1972     error = 0;
1973 cleanup:
1974     if (do_commit && txn) {
1975         if (!error) {
1976             commit_txn(txn, DB_TXN_NOSYNC);
1977         }
1978         else {
1979             abort_txn(txn);
1980         }
1981     }
1982     return error;
1983 }
1984 
1985 //
1986 // helper function to write a piece of metadata in to status.tokudb
1987 //
write_metadata(DB * db,void * key_data,uint key_size,void * val_data,uint val_size,DB_TXN * transaction)1988 int ha_tokudb::write_metadata(DB* db, void* key_data, uint key_size, void* val_data, uint val_size, DB_TXN* transaction ){
1989     int error;
1990     DBT key;
1991     DBT value;
1992     DB_TXN* txn = NULL;
1993     bool do_commit = false;
1994     //
1995     // transaction to be used for putting metadata into status.tokudb
1996     //
1997     if (transaction == NULL) {
1998         error = txn_begin(db_env, 0, &txn, 0, ha_thd());
1999         if (error) {
2000             goto cleanup;
2001         }
2002         do_commit = true;
2003     }
2004     else {
2005         txn = transaction;
2006     }
2007 
2008     memset(&key, 0, sizeof(key));
2009     memset(&value, 0, sizeof(value));
2010     key.data = key_data;
2011     key.size = key_size;
2012     value.data = val_data;
2013     value.size = val_size;
2014     error = db->put(db, txn, &key, &value, 0);
2015     if (error) {
2016         goto cleanup;
2017     }
2018 
2019     error = 0;
2020 cleanup:
2021     if (do_commit && txn) {
2022         if (!error) {
2023             commit_txn(txn, DB_TXN_NOSYNC);
2024         }
2025         else {
2026             abort_txn(txn);
2027         }
2028     }
2029     return error;
2030 }
2031 
2032 #if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
write_frm_data(DB * db,DB_TXN * txn,const char * frm_name)2033 int ha_tokudb::write_frm_data(DB* db, DB_TXN* txn, const char* frm_name) {
2034     TOKUDB_HANDLER_DBUG_ENTER("%p %p %s", db, txn, frm_name);
2035 
2036     uchar* frm_data = NULL;
2037     size_t frm_len = 0;
2038     int error = 0;
2039 
2040     error = readfrm(frm_name,&frm_data,&frm_len);
2041     if (error) { goto cleanup; }
2042 
2043     error = write_to_status(db,hatoku_frm_data,frm_data,(uint)frm_len, txn);
2044     if (error) { goto cleanup; }
2045 
2046     error = 0;
2047 cleanup:
2048     tokudb::memory::free(frm_data);
2049     TOKUDB_HANDLER_DBUG_RETURN(error);
2050 }
2051 
remove_frm_data(DB * db,DB_TXN * txn)2052 int ha_tokudb::remove_frm_data(DB *db, DB_TXN *txn) {
2053     return remove_from_status(db, hatoku_frm_data, txn);
2054 }
2055 
smart_dbt_callback_verify_frm(TOKUDB_UNUSED (DBT const * key),DBT const * row,void * context)2056 static int smart_dbt_callback_verify_frm(TOKUDB_UNUSED(DBT const* key),
2057                                          DBT const* row,
2058                                          void* context) {
2059     DBT* stored_frm = (DBT *)context;
2060     stored_frm->size = row->size;
2061     stored_frm->data = (uchar *)tokudb::memory::malloc(row->size, MYF(MY_WME));
2062     assert_always(stored_frm->data);
2063     memcpy(stored_frm->data, row->data, row->size);
2064     return 0;
2065 }
2066 
// Compare the .frm file on disk with the copy cached in the status
// dictionary. Returns 0 when they match, HA_ERR_TABLE_DEF_CHANGED when
// they differ, or another engine/OS error. If no copy is cached yet, the
// on-disk frm is written to the status dictionary instead of compared.
int ha_tokudb::verify_frm_data(const char* frm_name, DB_TXN* txn) {
    TOKUDB_HANDLER_DBUG_ENTER("%s", frm_name);
    uchar* mysql_frm_data = NULL;
    size_t mysql_frm_len = 0;
    DBT key = {};
    DBT stored_frm = {};
    int error = 0;
    HA_METADATA_KEY curr_key = hatoku_frm_data;

    // get the frm data from MySQL
    error = readfrm(frm_name,&mysql_frm_data,&mysql_frm_len);
    if (error) {
        goto cleanup;
    }

    // Fetch the cached frm image; the callback allocates stored_frm.data,
    // which is freed in cleanup.
    key.data = &curr_key;
    key.size = sizeof(curr_key);
    error = share->status_block->getf_set(
        share->status_block,
        txn,
        0,
        &key,
        smart_dbt_callback_verify_frm,
        &stored_frm
        );
    if (error == DB_NOTFOUND) {
        // if not found, write it
        error = write_frm_data(share->status_block, txn, frm_name);
        goto cleanup;
    } else if (error) {
        goto cleanup;
    }

    // Any size or byte difference means the table definition changed
    // behind the engine's back.
    if (stored_frm.size != mysql_frm_len || memcmp(stored_frm.data, mysql_frm_data, stored_frm.size)) {
        error = HA_ERR_TABLE_DEF_CHANGED;
        goto cleanup;
    }

    error = 0;
cleanup:
    // Both free helpers accept NULL, so partially-taken paths are safe.
    tokudb::memory::free(mysql_frm_data);
    tokudb::memory::free(stored_frm.data);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
2111 #endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
2112 
2113 //
2114 // Updates status.tokudb with a new max value used for the auto increment column
2115 // Parameters:
2116 //      [in]    db - this will always be status.tokudb
2117 //              val - value to store
2118 //  Returns:
2119 //      0 on success, error otherwise
2120 //
2121 //
update_max_auto_inc(DB * db,ulonglong val)2122 int ha_tokudb::update_max_auto_inc(DB* db, ulonglong val){
2123     return write_to_status(db,hatoku_max_ai,&val,sizeof(val), NULL);
2124 }
2125 
2126 //
2127 // Writes the initial auto increment value, as specified by create table
2128 // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
2129 // then the value 100 will be stored here in val
2130 // Parameters:
2131 //      [in]    db - this will always be status.tokudb
2132 //              val - value to store
2133 //  Returns:
2134 //      0 on success, error otherwise
2135 //
2136 //
write_auto_inc_create(DB * db,ulonglong val,DB_TXN * txn)2137 int ha_tokudb::write_auto_inc_create(DB* db, ulonglong val, DB_TXN* txn){
2138     return write_to_status(db,hatoku_ai_create_value,&val,sizeof(val), txn);
2139 }
2140 
2141 
2142 //
2143 // Closes a handle to a table.
2144 //
close()2145 int ha_tokudb::close() {
2146     TOKUDB_HANDLER_DBUG_ENTER("");
2147     int r = __close();
2148     TOKUDB_HANDLER_DBUG_RETURN(r);
2149 }
2150 
__close()2151 int ha_tokudb::__close() {
2152     TOKUDB_HANDLER_DBUG_ENTER("");
2153     TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_OPEN, "close:%p", this);
2154     tokudb::memory::free(rec_buff);
2155     tokudb::memory::free(rec_update_buff);
2156     tokudb::memory::free(blob_buff);
2157     tokudb::memory::free(alloc_ptr);
2158     tokudb::memory::free(range_query_buff);
2159     for (uint32_t i = 0; i < sizeof(mult_key_dbt_array)/sizeof(mult_key_dbt_array[0]); i++) {
2160         toku_dbt_array_destroy(&mult_key_dbt_array[i]);
2161     }
2162     for (uint32_t i = 0; i < sizeof(mult_rec_dbt_array)/sizeof(mult_rec_dbt_array[0]); i++) {
2163         toku_dbt_array_destroy(&mult_rec_dbt_array[i]);
2164     }
2165     rec_buff = NULL;
2166     rec_update_buff = NULL;
2167     alloc_ptr = NULL;
2168     ha_tokudb::reset();
2169     int retval = share->release();
2170     TOKUDB_HANDLER_DBUG_RETURN(retval);
2171 }
2172 
2173 //
2174 // Reallocate record buffer (rec_buff) if needed
2175 // If not needed, does nothing
2176 // Parameters:
2177 //          length - size of buffer required for rec_buff
2178 //
fix_rec_buff_for_blob(ulong length)2179 bool ha_tokudb::fix_rec_buff_for_blob(ulong length) {
2180     if (!rec_buff || (length > alloced_rec_buff_length)) {
2181         uchar* newptr = (uchar*)tokudb::memory::realloc(
2182             (void*)rec_buff,
2183             length,
2184             MYF(MY_ALLOW_ZERO_PTR));
2185         if (!newptr)
2186             return 1;
2187         rec_buff = newptr;
2188         alloced_rec_buff_length = length;
2189     }
2190     return 0;
2191 }
2192 
2193 //
2194 // Reallocate record buffer (rec_buff) if needed
2195 // If not needed, does nothing
2196 // Parameters:
2197 //          length - size of buffer required for rec_buff
2198 //
fix_rec_update_buff_for_blob(ulong length)2199 bool ha_tokudb::fix_rec_update_buff_for_blob(ulong length) {
2200     if (!rec_update_buff || (length > alloced_update_rec_buff_length)) {
2201         uchar* newptr = (uchar*)tokudb::memory::realloc(
2202             (void*)rec_update_buff,
2203             length,
2204             MYF(MY_ALLOW_ZERO_PTR));
2205         if (!newptr)
2206             return 1;
2207         rec_update_buff= newptr;
2208         alloced_update_rec_buff_length = length;
2209     }
2210     return 0;
2211 }
2212 
2213 /* Calculate max length needed for row */
max_row_length(const uchar * buf)2214 ulong ha_tokudb::max_row_length(const uchar * buf) {
2215     ulong length = table_share->reclength + table_share->fields * 2;
2216     uint *ptr, *end;
2217     for (ptr = table_share->blob_field, end = ptr + table_share->blob_fields; ptr != end; ptr++) {
2218         Field_blob *blob = ((Field_blob *) table->field[*ptr]);
2219         length += blob->get_length((uchar *) (buf + field_offset(blob, table))) + 2;
2220     }
2221     return length;
2222 }
2223 
//
// Pack a MySQL-format row (record) into the storage format held in a DBT.
// If the row is of fixed length, just store the row 'as is'.
// If not, we will generate a packed row suitable for storage.
// This will only fail if we don't have enough memory to pack the row,
// which may only happen in rows with blobs, as the default row length is
// pre-allocated.
// Parameters:
//      [out]   row - DBT that will point at the packed row
//      [in]    record - row in MySQL format
//              index - index into key_file whose packing layout to use
//      [in]    row_buff - destination buffer for the packed bytes
//
2239 
int ha_tokudb::pack_row_in_buff(
    DBT * row,
    const uchar* record,
    uint index,
    uchar* row_buff
    )
{
    uchar* fixed_field_ptr = NULL;
    uchar* var_field_offset_ptr = NULL;
    uchar* start_field_data_ptr = NULL;
    uchar* var_field_data_ptr = NULL;
    int r = ENOSYS;
    memset((void *) row, 0, sizeof(*row));

    // Temporarily mark all columns readable so Field accessors don't trip
    // debug assertions; the map is restored before returning.
    my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);

    // Copy null bytes
    memcpy(row_buff, record, table_share->null_bytes);
    // Packed layout: [null bytes][fixed fields][var-field offset table]
    //                [var-field data][blob data].
    fixed_field_ptr = row_buff + table_share->null_bytes;
    var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
    start_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;
    var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;

    // Walk every column; columns filtered for this key are stored in the
    // key itself and therefore skipped here.
    for (uint i = 0; i < table_share->fields; i++) {
        Field* field = table->field[i];
        uint curr_field_offset = field_offset(field, table);
        if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
            continue;
        }
        if (is_fixed_field(&share->kc_info, i)) {
            fixed_field_ptr = pack_fixed_field(
                fixed_field_ptr,
                record + curr_field_offset,
                share->kc_info.field_lengths[i]
                );
        }
        else if (is_variable_field(&share->kc_info, i)) {
            // Variable fields write both an end-offset entry and the data;
            // the offset table and data region advance independently.
            var_field_data_ptr = pack_var_field(
                var_field_offset_ptr,
                var_field_data_ptr,
                start_field_data_ptr,
                record + curr_field_offset,
                share->kc_info.length_bytes[i],
                share->kc_info.num_offset_bytes
                );
            var_field_offset_ptr += share->kc_info.num_offset_bytes;
        }
    }

    // Blobs are appended after all variable-length data.
    for (uint i = 0; i < share->kc_info.num_blobs; i++) {
        Field* field = table->field[share->kc_info.blob_fields[i]];
        var_field_data_ptr = pack_toku_field_blob(
            var_field_data_ptr,
            record + field_offset(field, table),
            field
            );
    }

    row->data = row_buff;
    row->size = (size_t) (var_field_data_ptr - row_buff);
    r = 0;

    dbug_tmp_restore_column_map(table->write_set, old_map);
    return r;
}
2306 
2307 
pack_row(DBT * row,const uchar * record,uint index)2308 int ha_tokudb::pack_row(
2309     DBT * row,
2310     const uchar* record,
2311     uint index
2312     )
2313 {
2314     return pack_row_in_buff(row,record,index,rec_buff);
2315 }
2316 
pack_old_row_for_update(DBT * row,const uchar * record,uint index)2317 int ha_tokudb::pack_old_row_for_update(
2318     DBT * row,
2319     const uchar* record,
2320     uint index
2321     )
2322 {
2323     return pack_row_in_buff(row,record,index,rec_update_buff);
2324 }
2325 
2326 
// Unpack the blob section of a stored row into 'record'. The blob bytes
// are copied into the handler-owned blob_buff (which the Field_blob
// pointers in 'record' will reference), then each blob field is decoded
// in order. Returns 0 on success, ENOMEM on allocation failure, or a
// negative sentinel on corruption.
int ha_tokudb::unpack_blobs(
    uchar* record,
    const uchar* from_tokudb_blob,
    uint32_t num_bytes,
    bool check_bitmap
    )
{
    uint error = 0;
    uchar* ptr = NULL;
    const uchar* buff = NULL;
    //
    // assert that num_bytes > 0 iff share->num_blobs > 0
    //
    assert_always( !((share->kc_info.num_blobs == 0) && (num_bytes > 0)) );
    // Grow-only blob buffer: reallocate when this row's blob section is
    // larger than anything seen so far.
    if (num_bytes > num_blob_bytes) {
        ptr = (uchar*)tokudb::memory::realloc(
            (void*)blob_buff, num_bytes,
            MYF(MY_ALLOW_ZERO_PTR));
        if (ptr == NULL) {
            error = ENOMEM;
            goto exit;
        }
        blob_buff = ptr;
        num_blob_bytes = num_bytes;
    }

    // Copy the blob bytes into storage that outlives this call, since the
    // unpacked record will point into it.
    memcpy(blob_buff, from_tokudb_blob, num_bytes);
    buff= blob_buff;
    for (uint i = 0; i < share->kc_info.num_blobs; i++) {
        uint32_t curr_field_index = share->kc_info.blob_fields[i];
        // When check_bitmap is set, blobs the query neither reads nor
        // writes are skipped (their bytes still advance 'buff').
        bool skip = check_bitmap ?
            !(bitmap_is_set(table->read_set,curr_field_index) ||
                bitmap_is_set(table->write_set,curr_field_index)) :
            false;
        Field* field = table->field[curr_field_index];
        uint32_t len_bytes = field->row_pack_length();
        const uchar* end_buff = unpack_toku_field_blob(
            record + field_offset(field, table),
            buff,
            len_bytes,
            skip
            );
        // verify that the pointers to the blobs are all contained within the blob_buff
        if (!(blob_buff <= buff && end_buff <= blob_buff + num_bytes)) {
            error = -3000000;
            goto exit;
        }
        buff = end_buff;
    }
    // verify that the entire blob buffer was parsed
    if (share->kc_info.num_blobs > 0 && !(num_bytes > 0 && buff == blob_buff + num_bytes)) {
        error = -4000000;
        goto exit;
    }

    error = 0;
exit:
    return error;
}
2386 
2387 //
2388 // take the row passed in as a DBT*, and convert it into a row in MySQL format in record
2389 // Parameters:
2390 //      [out]   record - row in MySQL format
2391 //      [in]    row - row stored in DBT to be converted
2392 //
int ha_tokudb::unpack_row(
    uchar* record,
    DBT const *row,
    DBT const *key,
    uint index
    )
{
    //
    // two cases, fixed length row, and variable length row
    // fixed length row is first below
    //
    /* Copy null bits */
    int error = 0;
    const uchar* fixed_field_ptr = (const uchar *) row->data;
    const uchar* var_field_offset_ptr = NULL;
    const uchar* var_field_data_ptr = NULL;
    uint32_t data_end_offset = 0;
    memcpy(record, fixed_field_ptr, table_share->null_bytes);
    fixed_field_ptr += table_share->null_bytes;

    // Stored layout mirrors pack_row_in_buff:
    // [null bytes][fixed fields][var-field offset table][var data][blobs].
    var_field_offset_ptr = fixed_field_ptr + share->kc_info.mcp_info[index].fixed_field_size;
    var_field_data_ptr = var_field_offset_ptr + share->kc_info.mcp_info[index].len_of_offsets;

    //
    // unpack the key, if necessary
    //
    // Columns that live in the key are not in the row, so they must be
    // decoded from 'key' (unless the key is the hidden PK, which holds
    // no user columns).
    if (!(hidden_primary_key && index == primary_key)) {
        unpack_key(record,key,index);
    }

    uint32_t last_offset = 0;
    //
    // we have two methods of unpacking, one if we need to unpack the entire row
    // the second if we unpack a subset of the entire row
    // first method here is if we unpack the entire row
    //
    if (unpack_entire_row) {
        //
        // fill in parts of record that are not part of the key
        //
        for (uint i = 0; i < table_share->fields; i++) {
            Field* field = table->field[i];
            if (bitmap_is_set(&share->kc_info.key_filters[index],i)) {
                continue;
            }

            if (is_fixed_field(&share->kc_info, i)) {
                fixed_field_ptr = unpack_fixed_field(
                    record + field_offset(field, table),
                    fixed_field_ptr,
                    share->kc_info.field_lengths[i]
                    );
            }
            //
            // here, we DO modify var_field_data_ptr or var_field_offset_ptr
            // as we unpack variable sized fields
            //
            else if (is_variable_field(&share->kc_info, i)) {
                // The offset table stores each field's end offset in 1 or
                // 2 bytes; length = end offset - previous end offset.
                switch (share->kc_info.num_offset_bytes) {
                case (1):
                    data_end_offset = var_field_offset_ptr[0];
                    break;
                case (2):
                    data_end_offset = uint2korr(var_field_offset_ptr);
                    break;
                default:
                    assert_unreachable();
                }
                unpack_var_field(
                    record + field_offset(field, table),
                    var_field_data_ptr,
                    data_end_offset - last_offset,
                    share->kc_info.length_bytes[i]
                    );
                var_field_offset_ptr += share->kc_info.num_offset_bytes;
                var_field_data_ptr += data_end_offset - last_offset;
                last_offset = data_end_offset;
            }
        }
        // Whatever remains after the variable-length data is the blob
        // section.
        error = unpack_blobs(
            record,
            var_field_data_ptr,
            row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
            false
            );
        if (error) {
            goto exit;
        }
    }
    //
    // in this case, we unpack only what is specified
    // in fixed_cols_for_query and var_cols_for_query
    //
    else {
        //
        // first the fixed fields
        //
        // cp_info holds each field's precomputed offset within the fixed
        // region, so needed fields can be read directly.
        for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
            uint field_index = fixed_cols_for_query[i];
            Field* field = table->field[field_index];
            unpack_fixed_field(
                record + field_offset(field, table),
                fixed_field_ptr + share->kc_info.cp_info[index][field_index].col_pack_val,
                share->kc_info.field_lengths[field_index]
                );
        }

        //
        // now the var fields
        // here, we do NOT modify var_field_data_ptr or var_field_offset_ptr
        //
        for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
            uint field_index = var_cols_for_query[i];
            Field* field = table->field[field_index];
            uint32_t var_field_index = share->kc_info.cp_info[index][field_index].col_pack_val;
            uint32_t data_start_offset;
            uint32_t field_len;

            // Derive this field's start offset and length from the offset
            // table without advancing any pointers.
            get_var_field_info(
                &field_len,
                &data_start_offset,
                var_field_index,
                var_field_offset_ptr,
                share->kc_info.num_offset_bytes
                );

            unpack_var_field(
                record + field_offset(field, table),
                var_field_data_ptr + data_start_offset,
                field_len,
                share->kc_info.length_bytes[field_index]
                );
        }

        if (read_blobs) {
            //
            // now the blobs
            //
            // The last entry of the offset table gives the total size of
            // the variable-length data, i.e. where the blob section starts.
            get_blob_field_info(
                &data_end_offset,
                share->kc_info.mcp_info[index].len_of_offsets,
                var_field_data_ptr,
                share->kc_info.num_offset_bytes
                );

            var_field_data_ptr += data_end_offset;
            error = unpack_blobs(
                record,
                var_field_data_ptr,
                row->size - (uint32_t)(var_field_data_ptr - (const uchar *)row->data),
                true
                );
            if (error) {
                goto exit;
            }
        }
    }
    error = 0;
exit:
    return error;
}
2554 
// Decode one packed key ('data') into the MySQL row buffer 'record',
// setting/clearing null bits as it goes. Returns the number of key bytes
// consumed, so a caller can continue decoding a trailing (primary) key.
uint32_t ha_tokudb::place_key_into_mysql_buff(
    KEY* key_info,
    uchar* record,
    uchar* data) {

    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    uchar* pos = data;

    for (; key_part != end; key_part++) {
        // Nullable key parts are prefixed by one marker byte.
        if (key_part->field->null_bit) {
            uint null_offset = get_null_offset(table, key_part->field);
            if (*pos++ == NULL_COL_VAL) { // Null value
                //
                // We don't need to reset the record data as we will not access it
                // if the null data is set
                //
                record[null_offset] |= key_part->field->null_bit;
                continue;
            }
            record[null_offset] &= ~key_part->field->null_bit;
        }
        //
        // HOPEFULLY TEMPORARY
        //
        // The decoder only handles little-endian ("low byte first") field
        // storage; assert rather than silently mis-decode.
        assert_always(table->s->db_low_byte_first);
        pos = unpack_toku_key_field(
            record + field_offset(key_part->field, table),
            pos,
            key_part->field,
            key_part->length
            );
    }
    return pos-data;
}
2590 
2591 //
2592 // Store the key and the primary key into the row
2593 // Parameters:
2594 //      [out]   record - key stored in MySQL format
2595 //      [in]    key - key stored in DBT to be converted
2596 //              index -index into key_file that represents the DB
2597 //                  unpacking a key of
2598 //
unpack_key(uchar * record,DBT const * key,uint index)2599 void ha_tokudb::unpack_key(uchar * record, DBT const *key, uint index) {
2600     uint32_t bytes_read;
2601     uchar *pos = (uchar *) key->data + 1;
2602     bytes_read = place_key_into_mysql_buff(
2603         &table->key_info[index],
2604         record,
2605         pos
2606         );
2607     if( (index != primary_key) && !hidden_primary_key) {
2608         //
2609         // also unpack primary key
2610         //
2611         place_key_into_mysql_buff(
2612             &table->key_info[primary_key],
2613             record,
2614             pos+bytes_read
2615             );
2616     }
2617 }
2618 
place_key_into_dbt_buff(KEY * key_info,uchar * buff,const uchar * record,bool * has_null,int key_length)2619 uint32_t ha_tokudb::place_key_into_dbt_buff(
2620     KEY* key_info,
2621     uchar* buff,
2622     const uchar* record,
2623     bool* has_null,
2624     int key_length) {
2625 
2626     KEY_PART_INFO* key_part = key_info->key_part;
2627     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2628     uchar* curr_buff = buff;
2629     *has_null = false;
2630     for (; key_part != end && key_length > 0; key_part++) {
2631         //
2632         // accessing key_part->field->null_bit instead off key_part->null_bit
2633         // because key_part->null_bit is not set in add_index
2634         // filed ticket 862 to look into this
2635         //
2636         if (key_part->field->null_bit) {
2637             /* Store 0 if the key part is a NULL part */
2638             uint null_offset = get_null_offset(table, key_part->field);
2639             if (record[null_offset] & key_part->field->null_bit) {
2640                 *curr_buff++ = NULL_COL_VAL;
2641                 *has_null = true;
2642                 continue;
2643             }
2644             *curr_buff++ = NONNULL_COL_VAL;        // Store NOT NULL marker
2645         }
2646         //
2647         // HOPEFULLY TEMPORARY
2648         //
2649         assert_always(table->s->db_low_byte_first);
2650         //
2651         // accessing field_offset(key_part->field) instead off key_part->offset
2652         // because key_part->offset is SET INCORRECTLY in add_index
2653         // filed ticket 862 to look into this
2654         //
2655         curr_buff = pack_toku_key_field(
2656             curr_buff,
2657             (uchar *) (record + field_offset(key_part->field, table)),
2658             key_part->field,
2659             key_part->length
2660             );
2661         key_length -= key_part->length;
2662     }
2663     return curr_buff - buff;
2664 }
2665 
2666 
2667 
2668 //
2669 // Create a packed key from a row. This key will be written as such
2670 // to the index tree.  This will never fail as the key buffer is pre-allocated.
2671 // Parameters:
2672 //      [out]   key - DBT that holds the key
2673 //      [in]    key_info - holds data about the key, such as it's length and offset into record
2674 //      [out]   buff - buffer that will hold the data for key (unless
2675 //                  we have a hidden primary key)
2676 //      [in]    record - row from which to create the key
2677 //              key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2678 // Returns:
2679 //      the parameter key
2680 //
2681 
create_dbt_key_from_key(DBT * key,KEY * key_info,uchar * buff,const uchar * record,bool * has_null,bool dont_pack_pk,int key_length,uint8_t inf_byte)2682 DBT* ha_tokudb::create_dbt_key_from_key(
2683     DBT * key,
2684     KEY* key_info,
2685     uchar * buff,
2686     const uchar * record,
2687     bool* has_null,
2688     bool dont_pack_pk,
2689     int key_length,
2690     uint8_t inf_byte
2691     )
2692 {
2693     uint32_t size = 0;
2694     uchar* tmp_buff = buff;
2695     my_bitmap_map *old_map = dbug_tmp_use_all_columns(table, table->write_set);
2696 
2697     key->data = buff;
2698 
2699     //
2700     // first put the "infinity" byte at beginning. States if missing columns are implicitly
2701     // positive infinity or negative infinity or zero. For this, because we are creating key
2702     // from a row, there is no way that columns can be missing, so in practice,
2703     // this will be meaningless. Might as well put in a value
2704     //
2705     *tmp_buff++ = inf_byte;
2706     size++;
2707     size += place_key_into_dbt_buff(
2708         key_info,
2709         tmp_buff,
2710         record,
2711         has_null,
2712         key_length
2713         );
2714     if (!dont_pack_pk) {
2715         tmp_buff = buff + size;
2716         if (hidden_primary_key) {
2717             memcpy(tmp_buff, current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2718             size += TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2719         }
2720         else {
2721             bool tmp_bool = false;
2722             size += place_key_into_dbt_buff(
2723                 &table->key_info[primary_key],
2724                 tmp_buff,
2725                 record,
2726                 &tmp_bool,
2727                 MAX_KEY_LENGTH //this parameter does not matter
2728                 );
2729         }
2730     }
2731 
2732     key->size = size;
2733     DBUG_DUMP("key", (uchar *) key->data, key->size);
2734     dbug_tmp_restore_column_map(table->write_set, old_map);
2735     return key;
2736 }
2737 
2738 
2739 //
2740 // Create a packed key from a row. This key will be written as such
2741 // to the index tree.  This will never fail as the key buffer is pre-allocated.
2742 // Parameters:
2743 //      [out]   key - DBT that holds the key
2744 //              keynr - index for which to create the key
2745 //      [out]   buff - buffer that will hold the data for key (unless
2746 //                  we have a hidden primary key)
2747 //      [in]    record - row from which to create the key
2748 //      [out]   has_null - says if the key has a NULL value for one of its columns
2749 //              key_length - currently set to MAX_KEY_LENGTH, is it size of buff?
2750 // Returns:
2751 //      the parameter key
2752 //
create_dbt_key_from_table(DBT * key,uint keynr,uchar * buff,const uchar * record,bool * has_null,int key_length)2753 DBT *ha_tokudb::create_dbt_key_from_table(
2754     DBT * key,
2755     uint keynr,
2756     uchar * buff,
2757     const uchar * record,
2758     bool* has_null,
2759     int key_length
2760     )
2761 {
2762     TOKUDB_HANDLER_DBUG_ENTER("");
2763     memset((void *) key, 0, sizeof(*key));
2764     if (hidden_primary_key && keynr == primary_key) {
2765         key->data = buff;
2766         memcpy(buff, &current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
2767         key->size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
2768         *has_null = false;
2769         DBUG_RETURN(key);
2770     }
2771     DBUG_RETURN(create_dbt_key_from_key(key, &table->key_info[keynr],buff,record, has_null, (keynr == primary_key), key_length, COL_ZERO));
2772 }
2773 
create_dbt_key_for_lookup(DBT * key,KEY * key_info,uchar * buff,const uchar * record,bool * has_null,int key_length)2774 DBT* ha_tokudb::create_dbt_key_for_lookup(
2775     DBT * key,
2776     KEY* key_info,
2777     uchar * buff,
2778     const uchar * record,
2779     bool* has_null,
2780     int key_length
2781     )
2782 {
2783     TOKUDB_HANDLER_DBUG_ENTER("");
2784     // override the infinity byte, needed in case the pk is a string
2785     // to make sure that the cursor that uses this key properly positions
2786     // it at the right location. If the table stores "D", but we look up for "d",
2787     // and the infinity byte is 0, then we will skip the "D", because
2788     // in bytes, "d" > "D".
2789     DBT* ret = create_dbt_key_from_key(key, key_info, buff, record, has_null, true, key_length, COL_NEG_INF);
2790     DBUG_RETURN(ret);
2791 }
2792 
2793 //
2794 // Create a packed key from from a MySQL unpacked key (like the one that is
2795 // sent from the index_read() This key is to be used to read a row
2796 // Parameters:
2797 //      [out]   key - DBT that holds the key
2798 //              keynr - index for which to pack the key
2799 //      [out]   buff - buffer that will hold the data for key
2800 //      [in]    key_ptr - MySQL unpacked key
2801 //              key_length - length of key_ptr
2802 // Returns:
2803 //      the parameter key
2804 //
pack_key(DBT * key,uint keynr,uchar * buff,const uchar * key_ptr,uint key_length,int8_t inf_byte)2805 DBT* ha_tokudb::pack_key(
2806     DBT* key,
2807     uint keynr,
2808     uchar* buff,
2809     const uchar* key_ptr,
2810     uint key_length,
2811     int8_t inf_byte) {
2812 
2813     TOKUDB_HANDLER_DBUG_ENTER(
2814         "key %p %u:%2.2x inf=%d",
2815         key_ptr,
2816         key_length,
2817         key_length > 0 ? key_ptr[0] : 0,
2818         inf_byte);
2819 #if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2820     if (keynr != primary_key && !tokudb_test(hidden_primary_key)) {
2821         DBUG_RETURN(pack_ext_key(
2822             key,
2823             keynr,
2824             buff,
2825             key_ptr,
2826             key_length,
2827             inf_byte));
2828     }
2829 #endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2830     KEY* key_info = &table->key_info[keynr];
2831     KEY_PART_INFO* key_part = key_info->key_part;
2832     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
2833     my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);
2834 
2835     memset(key, 0, sizeof(*key));
2836     key->data = buff;
2837 
2838     // first put the "infinity" byte at beginning. States if missing columns are implicitly
2839     // positive infinity or negative infinity
2840     *buff++ = (uchar)inf_byte;
2841 
2842     for (; key_part != end && (int) key_length > 0; key_part++) {
2843         uint offset = 0;
2844         if (key_part->null_bit) {
2845             if (!(*key_ptr == 0)) {
2846                 *buff++ = NULL_COL_VAL;
2847                 key_length -= key_part->store_length;
2848                 key_ptr += key_part->store_length;
2849                 continue;
2850             }
2851             *buff++ = NONNULL_COL_VAL;
2852             offset = 1;         // Data is at key_ptr+1
2853         }
2854         assert_always(table->s->db_low_byte_first);
2855         buff = pack_key_toku_key_field(
2856             buff,
2857             (uchar *) key_ptr + offset,
2858             key_part->field,
2859             key_part->length
2860             );
2861 
2862         key_ptr += key_part->store_length;
2863         key_length -= key_part->store_length;
2864     }
2865 
2866     key->size = (buff - (uchar *) key->data);
2867     DBUG_DUMP("key", (uchar *) key->data, key->size);
2868     dbug_tmp_restore_column_map(table->write_set, old_map);
2869     DBUG_RETURN(key);
2870 }
2871 
2872 #if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
// Pack a MySQL search key for a secondary index into TokuDB's extended-key
// format: the user-defined SK parts come first, then any pk parts needed to
// reach the requested key_length are appended in pk order.  SK parts that
// are also pk parts are remembered so their bytes can be reused rather than
// read again from key_ptr.
//
// Parameters:
//      [out]   key - DBT that will point at the packed key in 'buff'
//              keynr - index number of the secondary key being packed
//      [out]   buff - destination buffer for the packed bytes
//      [in]    key_ptr - MySQL-format search key
//              key_length - length of key_ptr in bytes
//              inf_byte - leading byte; controls how missing columns compare
// Returns:
//      the parameter key
DBT* ha_tokudb::pack_ext_key(
    DBT* key,
    uint keynr,
    uchar* buff,
    const uchar* key_ptr,
    uint key_length,
    int8_t inf_byte) {

    TOKUDB_HANDLER_DBUG_ENTER("");

    // build a list of PK parts that are in the SK.  we will use this list to
    // build the extended key if necessary.
    KEY* pk_key_info = &table->key_info[primary_key];
    uint pk_parts = pk_key_info->user_defined_key_parts;
    uint pk_next = 0;
    // pk_info[i] remembers where in key_ptr an SK part that belongs to the
    // pk starts, plus its KEY_PART_INFO.  (Variable-length array: gcc/clang
    // extension, sized by the pk's part count.)
    struct {
        const uchar *key_ptr;
        KEY_PART_INFO *key_part;
    } pk_info[pk_parts];

    KEY* key_info = &table->key_info[keynr];
    KEY_PART_INFO* key_part = key_info->key_part;
    KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
    my_bitmap_map* old_map = dbug_tmp_use_all_columns(table, table->write_set);

    memset((void *) key, 0, sizeof(*key));
    key->data = buff;

    // first put the "infinity" byte at beginning. States if missing columns are implicitly
    // positive infinity or negative infinity
    *buff++ = (uchar)inf_byte;

    // Pass 1: pack the user-defined SK parts, recording which of them are
    // also pk parts.
    for (; key_part != end && (int) key_length > 0; key_part++) {
        // if the SK part is part of the PK, then append it to the list.
        if (key_part->field->part_of_key.is_set(primary_key)) {
            assert_always(pk_next < pk_parts);
            pk_info[pk_next].key_ptr = key_ptr;
            pk_info[pk_next].key_part = key_part;
            pk_next++;
        }
        uint offset = 0;
        if (key_part->null_bit) {
            // Nullable part: first byte of the MySQL key is the null
            // indicator (non-zero means NULL).
            if (!(*key_ptr == 0)) {
                *buff++ = NULL_COL_VAL;
                key_length -= key_part->store_length;
                key_ptr += key_part->store_length;
                continue;
            }
            *buff++ = NONNULL_COL_VAL;
            offset = 1;         // Data is at key_ptr+1
        }
        assert_always(table->s->db_low_byte_first);
        buff = pack_key_toku_key_field(
            buff,
            (uchar *) key_ptr + offset,
            key_part->field,
            key_part->length
            );

        key_ptr += key_part->store_length;
        key_length -= key_part->store_length;
    }

    // Pass 2: key bytes remain beyond the user-defined parts, so the caller
    // supplied pk columns as well -- append them in pk part order.
    if (key_length > 0) {
        assert_always(key_part == end);
#if defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
        end = key_info->key_part + key_info->actual_key_parts;
#else
        end = key_info->key_part;
#endif // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS

        // pack PK in order of PK key parts
        for (uint pk_index = 0;
             key_part != end && (int) key_length > 0 && pk_index < pk_parts;
             pk_index++) {
            // If this pk part already appeared among the SK parts, reuse the
            // bytes recorded in pass 1; otherwise consume from key_ptr.
            uint i;
            for (i = 0; i < pk_next; i++) {
                if (pk_info[i].key_part->fieldnr ==
                    pk_key_info->key_part[pk_index].fieldnr)
                    break;
            }
            if (i < pk_next) {
                const uchar *this_key_ptr = pk_info[i].key_ptr;
                KEY_PART_INFO *this_key_part = pk_info[i].key_part;
                buff = pack_key_toku_key_field(
                    buff,
                    (uchar*)this_key_ptr,
                    this_key_part->field,
                    this_key_part->length);
            } else {
                buff = pack_key_toku_key_field(
                    buff,
                    (uchar*)key_ptr,
                    key_part->field,
                    key_part->length);
                key_ptr += key_part->store_length;
                key_length -= key_part->store_length;
                key_part++;
            }
        }
    }

    key->size = (buff - (uchar *) key->data);
    DBUG_DUMP("key", (uchar *) key->data, key->size);
    dbug_tmp_restore_column_map(table->write_set, old_map);
    DBUG_RETURN(key);
}
2980 #endif  // defined(TOKU_INCLUDE_EXTENDED_KEYS) && TOKU_INCLUDE_EXTENDED_KEYS
2981 
2982 //
2983 // get max used hidden primary key value
2984 //
init_hidden_prim_key_info(DB_TXN * txn)2985 void ha_tokudb::init_hidden_prim_key_info(DB_TXN *txn) {
2986     TOKUDB_HANDLER_DBUG_ENTER("");
2987     if (!(share->status & STATUS_PRIMARY_KEY_INIT)) {
2988         int error = 0;
2989         DBC* c = NULL;
2990         error = share->key_file[primary_key]->cursor(
2991             share->key_file[primary_key],
2992             txn,
2993             &c,
2994             0);
2995         assert_always(error == 0);
2996         DBT key,val;
2997         memset(&key, 0, sizeof(key));
2998         memset(&val, 0, sizeof(val));
2999         error = c->c_get(c, &key, &val, DB_LAST);
3000         if (error == 0) {
3001             assert_always(key.size == TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
3002             share->auto_ident = hpk_char_to_num((uchar *)key.data);
3003         }
3004         error = c->c_close(c);
3005         assert_always(error == 0);
3006         share->status |= STATUS_PRIMARY_KEY_INIT;
3007     }
3008     TOKUDB_HANDLER_DBUG_VOID_RETURN;
3009 }
3010 
3011 
3012 
3013 /** @brief
3014     Get metadata info stored in status.tokudb
3015     */
get_status(DB_TXN * txn)3016 int ha_tokudb::get_status(DB_TXN* txn) {
3017     TOKUDB_HANDLER_DBUG_ENTER("");
3018     DBT key, value;
3019     HA_METADATA_KEY curr_key;
3020     int error;
3021 
3022     //
3023     // open status.tokudb
3024     //
3025     if (!share->status_block) {
3026         error =
3027             open_status_dictionary(
3028                 &share->status_block,
3029                 share->full_table_name(),
3030                 txn);
3031         if (error) {
3032             goto cleanup;
3033         }
3034     }
3035 
3036     //
3037     // transaction to be used for putting metadata into status.tokudb
3038     //
3039     memset(&key, 0, sizeof(key));
3040     memset(&value, 0, sizeof(value));
3041     key.data = &curr_key;
3042     key.size = sizeof(curr_key);
3043     value.flags = DB_DBT_USERMEM;
3044 
3045     assert_always(share->status_block);
3046     //
3047     // get version
3048     //
3049     value.ulen = sizeof(share->version);
3050     value.data = &share->version;
3051     curr_key = hatoku_new_version;
3052     error = share->status_block->get(
3053         share->status_block,
3054         txn,
3055         &key,
3056         &value,
3057         0
3058         );
3059     if (error == DB_NOTFOUND) {
3060         //
3061         // hack to keep handle the issues of going back and forth
3062         // between 5.0.3 to 5.0.4
3063         // the problem with going back and forth
3064         // is with storing the frm file, 5.0.4 stores it, 5.0.3 does not
3065         // so, if a user goes back and forth and alters the schema
3066         // the frm stored can get out of sync with the schema of the table
3067         // This can cause issues.
3068         // To take care of this, we are doing this versioning work here.
3069         // We change the key that stores the version.
3070         // In 5.0.3, it is hatoku_old_version, in 5.0.4 it is hatoku_new_version
3071         // When we encounter a table that does not have hatoku_new_version
3072         // set, we give it the right one, and overwrite the old one with zero.
3073         // This ensures that 5.0.3 cannot open the table. Once it has been opened by 5.0.4
3074         //
3075         uint dummy_version = 0;
3076         share->version = HA_TOKU_ORIG_VERSION;
3077         error = write_to_status(
3078             share->status_block,
3079             hatoku_new_version,
3080             &share->version,
3081             sizeof(share->version),
3082             txn
3083             );
3084         if (error) { goto cleanup; }
3085         error = write_to_status(
3086             share->status_block,
3087             hatoku_old_version,
3088             &dummy_version,
3089             sizeof(dummy_version),
3090             txn
3091             );
3092         if (error) { goto cleanup; }
3093     }
3094     else if (error || value.size != sizeof(share->version)) {
3095         if (error == 0) {
3096             error = HA_ERR_INTERNAL_ERROR;
3097         }
3098         goto cleanup;
3099     }
3100     //
3101     // get capabilities
3102     //
3103     curr_key = hatoku_capabilities;
3104     value.ulen = sizeof(share->capabilities);
3105     value.data = &share->capabilities;
3106     error = share->status_block->get(
3107         share->status_block,
3108         txn,
3109         &key,
3110         &value,
3111         0
3112         );
3113     if (error == DB_NOTFOUND) {
3114         share->capabilities= 0;
3115     }
3116     else if (error || value.size != sizeof(share->version)) {
3117         if (error == 0) {
3118             error = HA_ERR_INTERNAL_ERROR;
3119         }
3120         goto cleanup;
3121     }
3122 
3123     error = 0;
3124 cleanup:
3125     TOKUDB_HANDLER_DBUG_RETURN(error);
3126 }
3127 
3128 /** @brief
3129     Return an estimated of the number of rows in the table.
3130     Used when sorting to allocate buffers and by the optimizer.
3131     This is used in filesort.cc.
3132 */
estimate_rows_upper_bound()3133 ha_rows ha_tokudb::estimate_rows_upper_bound() {
3134     TOKUDB_HANDLER_DBUG_ENTER("");
3135     DBUG_RETURN(share->row_count() + HA_TOKUDB_EXTRA_ROWS);
3136 }
3137 
3138 //
3139 // Function that compares two primary keys that were saved as part of rnd_pos
3140 // and ::position
3141 //
int ha_tokudb::cmp_ref(const uchar * ref1, const uchar * ref2) {
    int ret_val = 0;
    bool read_string = false;
    // Each ref is a length-prefixed packed pk: a uint32_t size followed by
    // the packed key bytes (saved by ::position / rnd_pos).
    // NOTE(review): the +4/-4 on the pk dictionary's descriptor assumes its
    // first 4 bytes are a header to skip -- confirm against the descriptor
    // layout used by tokudb_compare_two_keys.
    ret_val = tokudb_compare_two_keys(
        ref1 + sizeof(uint32_t),
        *(uint32_t *)ref1,
        ref2 + sizeof(uint32_t),
        *(uint32_t *)ref2,
        (uchar *)share->file->descriptor->dbt.data + 4,
        *(uint32_t *)share->file->descriptor->dbt.data - 4,
        false,
        &read_string
        );
    return ret_val;
}
3157 
check_if_incompatible_data(HA_CREATE_INFO * info,uint table_changes)3158 bool ha_tokudb::check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes) {
3159   //
3160   // This is a horrendous hack for now, as copied by InnoDB.
3161   // This states that if the auto increment create field has changed,
3162   // via a "alter table foo auto_increment=new_val", that this
3163   // change is incompatible, and to rebuild the entire table
3164   // This will need to be fixed
3165   //
3166   if ((info->used_fields & HA_CREATE_USED_AUTO) &&
3167       info->auto_increment_value != 0) {
3168 
3169     return COMPATIBLE_DATA_NO;
3170   }
3171   if (table_changes != IS_EQUAL_YES)
3172     return COMPATIBLE_DATA_NO;
3173   return COMPATIBLE_DATA_YES;
3174 }
3175 
3176 //
3177 // Method that is called before the beginning of many calls
3178 // to insert rows (ha_tokudb::write_row). There is no guarantee
3179 // that start_bulk_insert is called, however there is a guarantee
3180 // that if start_bulk_insert is called, then end_bulk_insert may be
3181 // called as well.
3182 // Parameters:
3183 //      [in]    rows - an estimate of the number of rows that will be inserted
3184 //                     if number of rows is unknown (such as if doing
3185 //                     "insert into foo select * from bar), then rows
3186 //                     will be 0
3187 //
3188 //
3189 // This function returns true if the table MAY be empty.
3190 // It is NOT meant to be a 100% check for emptiness.
3191 // This is used for a bulk load optimization.
3192 //
may_table_be_empty(DB_TXN * txn)3193 bool ha_tokudb::may_table_be_empty(DB_TXN *txn) {
3194     int error;
3195     bool ret_val = false;
3196     DBC* tmp_cursor = NULL;
3197     DB_TXN* tmp_txn = NULL;
3198 
3199     const int empty_scan = tokudb::sysvars::empty_scan(ha_thd());
3200     if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_DISABLED)
3201         goto cleanup;
3202 
3203     if (txn == NULL) {
3204         error = txn_begin(db_env, 0, &tmp_txn, 0, ha_thd());
3205         if (error) {
3206             goto cleanup;
3207         }
3208         txn = tmp_txn;
3209     }
3210 
3211     error = share->file->cursor(share->file, txn, &tmp_cursor, 0);
3212     if (error)
3213         goto cleanup;
3214     tmp_cursor->c_set_check_interrupt_callback(tmp_cursor, tokudb_killed_thd_callback, ha_thd());
3215     if (empty_scan == tokudb::sysvars::TOKUDB_EMPTY_SCAN_LR)
3216         error = tmp_cursor->c_getf_next(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3217     else
3218         error = tmp_cursor->c_getf_prev(tmp_cursor, 0, smart_dbt_do_nothing, NULL);
3219     error = map_to_handler_error(error);
3220     if (error == DB_NOTFOUND)
3221         ret_val = true;
3222     else
3223         ret_val = false;
3224     error = 0;
3225 
3226 cleanup:
3227     if (tmp_cursor) {
3228         int r = tmp_cursor->c_close(tmp_cursor);
3229         assert_always(r == 0);
3230         tmp_cursor = NULL;
3231     }
3232     if (tmp_txn) {
3233         commit_txn(tmp_txn, 0);
3234         tmp_txn = NULL;
3235     }
3236     return ret_val;
3237 }
3238 
void ha_tokudb::start_bulk_insert(ha_rows rows) {
    TOKUDB_HANDLER_DBUG_ENTER("%llu txn %p", (unsigned long long) rows, transaction);
    THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    // Batch auto-increment metadata updates until end_bulk_insert.
    delay_updating_ai_metadata = true;
    ai_metadata_update_required = false;
    abort_loader = false;

    // Take the num-DBs read lock for the duration of the bulk load;
    // num_DBs_locked_in_bulk tells end_bulk_insert to release it.
    rwlock_t_lock_read(share->_num_DBs_lock);
    uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
    num_DBs_locked_in_bulk = true;
    lock_count = 0;

    // Attempt the loader / table-lock optimization only on the first bulk
    // insert into this share (try_table_lock) and only for multi-row or
    // unknown-size (rows == 0) inserts.
    if ((rows == 0 || rows > 1) && share->try_table_lock) {
        if (tokudb::sysvars::prelock_empty(thd) &&
            may_table_be_empty(transaction) &&
            transaction != NULL) {
            // IGNORE/REPLACE-style inserts cannot go through the loader,
            // so just pre-lock the table for writing instead.
            if (using_ignore || is_insert_ignore(thd) || thd->lex->duplicates != DUP_ERROR) {
                acquire_table_lock(transaction, lock_write);
            } else {
                mult_dbt_flags[primary_key] = 0;
                // Have the loader flag pk duplicates unless unique checks
                // are relaxed or the pk is hidden (then always unique).
                if (!thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS) && !hidden_primary_key) {
                    mult_put_flags[primary_key] = DB_NOOVERWRITE;
                }
                uint32_t loader_flags = (tokudb::sysvars::load_save_space(thd)) ?
                    LOADER_COMPRESS_INTERMEDIATES : 0;

                int error = db_env->create_loader(
                    db_env,
                    transaction,
                    &loader,
                    NULL, // no src_db needed
                    curr_num_DBs,
                    share->key_file,
                    mult_put_flags,
                    mult_dbt_flags,
                    loader_flags
                    );
                if (error) {
                    assert_always(loader == NULL);
                    goto exit_try_table_lock;
                }

                // lc is the context handed to the loader's progress and
                // error callbacks below.
                lc.thd = thd;
                lc.ha = this;

                error = loader->set_poll_function(
                    loader, ha_tokudb::bulk_insert_poll, &lc);
                assert_always(!error);

                error = loader->set_error_callback(
                    loader, ha_tokudb::loader_dup, &lc);
                assert_always(!error);

                trx->stmt_progress.using_loader = true;
            }
        }
    exit_try_table_lock:
        // One-shot: never retry the optimization on this share.
        share->lock();
        share->try_table_lock = false;
        share->unlock();
    }
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
bulk_insert_poll(void * extra,float progress)3303 int ha_tokudb::bulk_insert_poll(void* extra, float progress) {
3304     LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
3305     if (thd_killed(context->thd)) {
3306         snprintf(context->write_status_msg,
3307                  sizeof(context->write_status_msg),
3308                  "The process has been killed, aborting bulk load.");
3309         return ER_ABORTING_CONNECTION;
3310     }
3311     float percentage = progress * 100;
3312     snprintf(context->write_status_msg,
3313              sizeof(context->write_status_msg),
3314              "Loading of data t %s about %.1f%% done",
3315              context->ha->share->full_table_name(),
3316              percentage);
3317     thd_proc_info(context->thd, context->write_status_msg);
3318 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
3319     thd_progress_report(context->thd, (unsigned long long)percentage, 100);
3320 #endif
3321     return 0;
3322 }
loader_add_index_err(TOKUDB_UNUSED (DB * db),TOKUDB_UNUSED (int i),int err,TOKUDB_UNUSED (DBT * key),TOKUDB_UNUSED (DBT * val),void * error_extra)3323 void ha_tokudb::loader_add_index_err(TOKUDB_UNUSED(DB* db),
3324                                      TOKUDB_UNUSED(int i),
3325                                      int err,
3326                                      TOKUDB_UNUSED(DBT* key),
3327                                      TOKUDB_UNUSED(DBT* val),
3328                                      void* error_extra) {
3329     LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3330     assert_always(context->ha);
3331     context->ha->set_loader_error(err);
3332 }
loader_dup(TOKUDB_UNUSED (DB * db),TOKUDB_UNUSED (int i),int err,DBT * key,TOKUDB_UNUSED (DBT * val),void * error_extra)3333 void ha_tokudb::loader_dup(TOKUDB_UNUSED(DB* db),
3334                            TOKUDB_UNUSED(int i),
3335                            int err,
3336                            DBT* key,
3337                            TOKUDB_UNUSED(DBT* val),
3338                            void* error_extra) {
3339     LOADER_CONTEXT context = (LOADER_CONTEXT)error_extra;
3340     assert_always(context->ha);
3341     context->ha->set_loader_error(err);
3342     if (err == DB_KEYEXIST) {
3343         context->ha->set_dup_value_for_pk(key);
3344     }
3345 }
3346 
3347 //
3348 // Method that is called at the end of many calls to insert rows
3349 // (ha_tokudb::write_row). If start_bulk_insert is called, then
3350 // this is guaranteed to be called.
3351 //
int ha_tokudb::end_bulk_insert(TOKUDB_UNUSED(bool abort)) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    bool using_loader = (loader != NULL);
    // Flush the auto-increment metadata whose update was delayed by
    // start_bulk_insert.
    if (ai_metadata_update_required) {
        share->lock();
        error = update_max_auto_inc(share->status_block, share->last_auto_increment);
        share->unlock();
        if (error) { goto cleanup; }
    }
    delay_updating_ai_metadata = false;
    ai_metadata_update_required = false;
    loader_error = 0;
    if (loader) {
        if (!abort_loader && !thd_killed(thd)) {
            DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", {
                const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
                thd_proc_info(thd, "DBUG sleep");
                my_sleep(20000000);
                thd_proc_info(thd, orig_proc_info);
            });
            // Closing the loader merges the loaded rows into the
            // dictionaries; this is where load errors surface.
            error = loader->close(loader);
            loader = NULL;
            if (error) {
                if (thd_killed(thd)) {
                    my_error(ER_QUERY_INTERRUPTED, MYF(0));
                }
                goto cleanup;
            }

            // The loader only flagged pk duplicates (DB_NOOVERWRITE), so
            // verify every unique secondary index explicitly.
            for (uint i = 0; i < table_share->keys; i++) {
                if (table_share->key_info[i].flags & HA_NOSAME) {
                    bool is_unique;
                    // NOTE(review): the pk is skipped unless it contains
                    // string columns (pk_has_string) -- presumably only
                    // string pks need the extra check; confirm rationale.
                    if (i == primary_key && !share->pk_has_string) {
                        continue;
                    }
                    error = is_index_unique(&is_unique, transaction, share->key_file[i], &table->key_info[i],
                                            DB_PRELOCKED_WRITE);
                    if (error) goto cleanup;
                    if (!is_unique) {
                        error = HA_ERR_FOUND_DUPP_KEY;
                        last_dup_key = i;
                        goto cleanup;
                    }
                }
            }
        }
        else {
            // Load was aborted explicitly or the connection was killed.
            error = sprintf(write_status_msg, "aborting bulk load");
            thd_proc_info(thd, write_status_msg);
            loader->abort(loader);
            loader = NULL;
            share->try_table_lock = true;
        }
    }

cleanup:
    // Release the num-DBs read lock taken in start_bulk_insert.
    if (num_DBs_locked_in_bulk) {
        share->_num_DBs_lock.unlock();
    }
    num_DBs_locked_in_bulk = false;
    lock_count = 0;
    // An error path above may have left the loader open; abort it here.
    if (loader) {
        error = sprintf(write_status_msg, "aborting bulk load");
        thd_proc_info(thd, write_status_msg);
        loader->abort(loader);
        loader = NULL;
    }
    abort_loader = false;
    memset(&lc, 0, sizeof(lc));
    if (error || loader_error) {
        set_my_errno(error ? error : loader_error);
        if (using_loader) {
            // Allow the loader optimization to be retried after a failure.
            share->try_table_lock = true;
        }
    }
    trx->stmt_progress.using_loader = false;
    thd_proc_info(thd, 0);
    TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
}
3434 
end_bulk_insert()3435 int ha_tokudb::end_bulk_insert() {
3436     return end_bulk_insert( false );
3437 }
3438 
// Determines whether the index stored in dictionary 'db' contains any
// duplicate (NULL-free) key values. Two cursors walk the index one row
// apart and every adjacent pair of keys is compared.
// Parameters:
//      [out]   is_unique - true iff no adjacent pair shares the same
//                          key prefix
//      [in]    txn - transaction under which the scan runs
//      [in]    db - the index dictionary to scan
//      [in]    key_info - MySQL key descriptor for this index
//      [in]    lock_flags - extra cursor flags (e.g. DB_PRELOCKED_WRITE)
// Returns 0 on success (including when a duplicate is found), an error
// code otherwise.
int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_info, int lock_flags) {
    int error;
    DBC* tmp_cursor1 = NULL;
    DBC* tmp_cursor2 = NULL;
    DBT key1, key2, val, packed_key1, packed_key2;
    uint64_t cnt = 0;
    char status_msg[MAX_ALIAS_NAME + 200]; //buffer of 200 should be a good upper bound.
    THD* thd = ha_thd();
    const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
    memset(&key1, 0, sizeof(key1));
    memset(&key2, 0, sizeof(key2));
    memset(&val, 0, sizeof(val));
    memset(&packed_key1, 0, sizeof(packed_key1));
    memset(&packed_key2, 0, sizeof(packed_key2));
    *is_unique = true;

    error = db->cursor(db, txn, &tmp_cursor1, DB_SERIALIZABLE);
    if (error) { goto cleanup; }

    error = db->cursor(db, txn, &tmp_cursor2, DB_SERIALIZABLE);
    if (error) { goto cleanup; }

    // Position cursor1 on the first row; an empty index is trivially
    // unique.
    error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
    if (error == DB_NOTFOUND) {
        *is_unique = true;
        error = 0;
        goto cleanup;
    }
    else if (error) { goto cleanup; }
    // Advance cursor2 twice so it leads cursor1 by exactly one row:
    // first call lands on row 1, second on row 2.
    error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
    if (error) { goto cleanup; }

    error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
    if (error == DB_NOTFOUND) {
        // Only one row in the index: unique by definition.
        *is_unique = true;
        error = 0;
        goto cleanup;
    }
    else if (error) { goto cleanup; }

    // Walk both cursors forward together, comparing row i (cursor1)
    // against row i+1 (cursor2) each iteration.
    while (error != DB_NOTFOUND) {
        bool has_null1;
        bool has_null2;
        int cmp;
        // Skip the leading flag byte of the stored key (+ 1) when
        // unpacking into the MySQL record buffers — presumably the
        // COL_*_INF infinity byte used by key packing; see
        // create_dbt_key_from_key usage below in this file.
        place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key1.data + 1);
        place_key_into_mysql_buff(key_info, table->record[1], (uchar *) key2.data + 1);

        create_dbt_key_for_lookup(&packed_key1, key_info, key_buff, table->record[0], &has_null1);
        create_dbt_key_for_lookup(&packed_key2, key_info, key_buff2, table->record[1], &has_null2);

        // SQL semantics: NULLs never collide, so compare only when
        // both keys are NULL-free.
        if (!has_null1 && !has_null2) {
            cmp = tokudb_prefix_cmp_dbt_key(db, &packed_key1, &packed_key2);
            if (cmp == 0) {
                // Duplicate found; leave the offending key unpacked in
                // record[0] for the caller's error reporting.
                memcpy(key_buff, key1.data, key1.size);
                place_key_into_mysql_buff(key_info, table->record[0], (uchar *) key_buff + 1);
                *is_unique = false;
                break;
            }
        }

        error = tmp_cursor1->c_get(tmp_cursor1, &key1, &val, DB_NEXT + lock_flags);
        if (error) { goto cleanup; }
        error = tmp_cursor2->c_get(tmp_cursor2, &key2, &val, DB_NEXT + lock_flags);
        if (error && (error != DB_NOTFOUND)) { goto cleanup; }

        cnt++;
        // Periodically publish progress and honor a kill request.
        if ((cnt % 10000) == 0) {
            sprintf(
                status_msg,
                "Verifying index uniqueness: Checked %llu of %llu rows in key-%s.",
                (long long unsigned) cnt,
                share->row_count(),
                key_info->name);
            thd_proc_info(thd, status_msg);
            if (thd_killed(thd)) {
                my_error(ER_QUERY_INTERRUPTED, MYF(0));
                error = ER_QUERY_INTERRUPTED;
                goto cleanup;
            }
        }
    }

    error = 0;

cleanup:
    // Restore the session's proc info and close both cursors.
    thd_proc_info(thd, orig_proc_info);
    if (tmp_cursor1) {
        tmp_cursor1->c_close(tmp_cursor1);
        tmp_cursor1 = NULL;
    }
    if (tmp_cursor2) {
        tmp_cursor2->c_close(tmp_cursor2);
        tmp_cursor2 = NULL;
    }
    return error;
}
3535 
// Checks whether the key value that 'record' produces for index
// 'key_info' (dictionary number 'dict_index') already exists in that
// dictionary, i.e. whether inserting the record would violate
// uniqueness.
// Parameters:
//      [out]   is_unique - true if no row with the same (NULL-free)
//                          key prefix exists
//      [in]    record - row in MySQL format
//      [in]    key_info - MySQL key descriptor
//      [in]    dict_index - index into share->key_file
//      [in]    txn - transaction for the lookup
// Returns 0 on success, a mapped handler error code otherwise.
int ha_tokudb::is_val_unique(bool* is_unique, uchar* record, KEY* key_info, uint dict_index, DB_TXN* txn) {
    int error = 0;
    bool has_null;
    DBC* tmp_cursor = NULL;

    DBT key; memset((void *)&key, 0, sizeof(key));
    create_dbt_key_from_key(&key, key_info, key_buff2, record, &has_null, true, MAX_KEY_LENGTH, COL_NEG_INF);
    // SQL semantics: a key containing NULL never conflicts, so it is
    // always considered unique.
    if (has_null) {
        error = 0;
        *is_unique = true;
        goto cleanup;
    }

    error = share->key_file[dict_index]->cursor(share->key_file[dict_index], txn, &tmp_cursor, DB_SERIALIZABLE | DB_RMW);
    if (error) {
        goto cleanup;
    } else {
        // prelock (key,-inf),(key,+inf) so that the subsequent key lookup does not overlock
        uint flags = 0;
        DBT key_right; memset(&key_right, 0, sizeof key_right);
        create_dbt_key_from_key(&key_right, key_info, key_buff3, record, &has_null, true, MAX_KEY_LENGTH, COL_POS_INF);
        error = tmp_cursor->c_set_bounds(tmp_cursor, &key, &key_right, true, DB_NOTFOUND);
        if (error == 0) {
            // Bounds acquired: the range is already write-locked, so
            // the lookup itself may skip lock acquisition.
            flags = DB_PRELOCKED | DB_PRELOCKED_WRITE;
        }

        // lookup key and check unique prefix
        struct smart_dbt_info info;
        info.ha = this;
        info.buf = NULL;
        info.keynr = dict_index;

        struct index_read_info ir_info;
        ir_info.orig_key = &key;
        ir_info.smart_dbt_info = info;

        error = tmp_cursor->c_getf_set_range(tmp_cursor, flags, &key, smart_dbt_callback_lookup, &ir_info);
        if (error == DB_NOTFOUND) {
            // Nothing at or after the probe key: unique.
            *is_unique = true;
            error = 0;
            goto cleanup;
        }
        else if (error) {
            error = map_to_handler_error(error);
            goto cleanup;
        }
        // ir_info.cmp — presumably set by smart_dbt_callback_lookup to
        // the comparison of the found key against orig_key: nonzero
        // means the nearest row has a different key, so we are unique.
        if (ir_info.cmp) {
            *is_unique = true;
        }
        else {
            *is_unique = false;
        }
    }
    error = 0;

cleanup:
    if (tmp_cursor) {
        int r = tmp_cursor->c_close(tmp_cursor);
        assert_always(r==0);
        tmp_cursor = NULL;
    }
    return error;
}
3599 
#if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
// Test hook: on a replication slave thread, optionally sleep before a
// unique check (duration from tokudb::sysvars::rpl_unique_checks_delay)
// so lock/uniqueness races can be exercised.
static void maybe_do_unique_checks_delay_fn(THD *thd) {
    if (thd->slave_thread) {
        uint64_t delay_ms = tokudb::sysvars::rpl_unique_checks_delay(thd);
        if (delay_ms)
            usleep(delay_ms * 1000);
    }
}

#define maybe_do_unique_checks_delay(__thd) \
    (maybe_do_unique_checks_delay_fn(__thd))

// Same delay hook, but only when the op-flag bits of __flags_set equal
// __flags_check exactly (e.g. DB_NOOVERWRITE).
#define maybe_do_unique_checks_delay_if_flags_set( \
    __thd, __flags_set, __flags_check)             \
    { if (((__flags_set) & DB_OPFLAGS_MASK) ==     \
         (__flags_check)) maybe_do_unique_checks_delay_fn(__thd); }

// True when the server is read-only, or when the
// tokudb rpl_check_readonly sysvar says we need not confirm read-only
// mode before relaxing replication-side checks.
static bool need_read_only(THD *thd) {
    return opt_readonly || !tokudb::sysvars::rpl_check_readonly(thd);
}

// Decide whether unique checks should run. For a replication event
// (do_rpl_event) on a slave thread, checks can be skipped when the
// rpl_unique_checks sysvar is disabled (and read-only conditions
// allow); otherwise defer to the session's
// OPTION_RELAXED_UNIQUE_CHECKS setting.
static bool do_unique_checks_fn(THD *thd, bool do_rpl_event) {
    if (do_rpl_event &&
        thd->slave_thread &&
        need_read_only(thd) &&
        !tokudb::sysvars::rpl_unique_checks(thd)) {
        return false;
    } else {
        return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
    }
}

#define do_unique_checks(__thd, __flags) \
    (do_unique_checks_fn(__thd, __flags))

#else

// Without read-free-replication support the delay hooks are no-ops.
#define maybe_do_unique_checks_delay(__thd) ((void)0)

#define maybe_do_unique_checks_delay_if_flags_set( \
    __thd, __flags_set, __flags_check)             \
    ((void)0)

// Unique checks are governed solely by the session option.
static bool do_unique_checks_fn(THD *thd) {
    return !thd_test_options(thd, OPTION_RELAXED_UNIQUE_CHECKS);
}

#define do_unique_checks(__thd, _flags) \
    (do_unique_checks_fn(__thd))

#endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
3651 
do_uniqueness_checks(uchar * record,DB_TXN * txn,THD * thd)3652 int ha_tokudb::do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd) {
3653     int error = 0;
3654     //
3655     // first do uniqueness checks
3656     //
3657     if (share->has_unique_keys && do_unique_checks(thd, in_rpl_write_rows)) {
3658         DBUG_EXECUTE_IF("tokudb_crash_if_rpl_does_uniqueness_check",
3659                         assert(0););
3660         for (uint keynr = 0; keynr < table_share->keys; keynr++) {
3661             bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
3662             bool is_unique = false;
3663             //
3664             // don't need to do check for primary key that don't have strings
3665             //
3666             if (keynr == primary_key && !share->pk_has_string) {
3667                 continue;
3668             }
3669             if (!is_unique_key) {
3670                 continue;
3671             }
3672 
3673             maybe_do_unique_checks_delay(thd);
3674 
3675             //
3676             // if unique key, check uniqueness constraint
3677             // but, we do not need to check it if the key has a null
3678             // and we do not need to check it if unique_checks is off
3679             //
3680             error = is_val_unique(&is_unique, record, &table->key_info[keynr], keynr, txn);
3681             if (error) {
3682                 goto cleanup;
3683             }
3684             if (!is_unique) {
3685                 error = DB_KEYEXIST;
3686                 last_dup_key = keynr;
3687                 goto cleanup;
3688             }
3689         }
3690     }
3691 cleanup:
3692     return error;
3693 }
3694 
// Debug-only (TOKUDB_DEBUG_CHECK_KEY) sanity check: verifies that the
// descriptor-driven packing routines (pack_key_from_desc /
// pack_clustering_val_from_desc), which operate directly on the packed
// pk key/value pair, produce byte-identical output to the
// MySQL-record-based packing used elsewhere. Any mismatch asserts.
// Parameters:
//      [in]    record - row in MySQL format
//      [in]    pk_key - packed primary key for the row
//      [in]    pk_val - packed primary value (row) for the row
void ha_tokudb::test_row_packing(uchar* record, DBT* pk_key, DBT* pk_val) {
    int error;
    DBT row, key;
    //
    // variables for testing key packing, only used in some debug modes
    //
    uchar* tmp_pk_key_data = NULL;
    uchar* tmp_pk_val_data = NULL;
    DBT tmp_pk_key;
    DBT tmp_pk_val;
    bool has_null;
    int cmp;

    memset(&tmp_pk_key, 0, sizeof(DBT));
    memset(&tmp_pk_val, 0, sizeof(DBT));

    //
    // use private copies of the pk key/val for the descriptor-based
    // packing below, so shared buffer reuse cannot skew the comparison
    //
    tmp_pk_key_data = (uchar*)tokudb::memory::malloc(pk_key->size, MYF(MY_WME));
    assert_always(tmp_pk_key_data);
    tmp_pk_val_data = (uchar*)tokudb::memory::malloc(pk_val->size, MYF(MY_WME));
    assert_always(tmp_pk_val_data);
    memcpy(tmp_pk_key_data, pk_key->data, pk_key->size);
    memcpy(tmp_pk_val_data, pk_val->data, pk_val->size);
    tmp_pk_key.data = tmp_pk_key_data;
    tmp_pk_key.size = pk_key->size;
    tmp_pk_val.data = tmp_pk_val_data;
    tmp_pk_val.size = pk_val->size;

    for (uint keynr = 0; keynr < table_share->keys; keynr++) {
        uint32_t tmp_num_bytes = 0;
        uchar* row_desc = NULL;
        uint32_t desc_size = 0;

        // only secondary keys are generated from the pk pair
        if (keynr == primary_key) {
            continue;
        }

        // reference: pack the secondary key from the MySQL record
        create_dbt_key_from_table(&key, keynr, key_buff2, record, &has_null);

        //
        // TEST: pack the same key via the dictionary's descriptor. The
        // descriptor DBT appears to hold uint32-length-prefixed
        // sections: skip the first section, then strip the 4-byte
        // length prefix of the key section.
        //
        row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
        row_desc += (*(uint32_t *)row_desc);
        desc_size = (*(uint32_t *)row_desc) - 4;
        row_desc += 4;
        tmp_num_bytes = pack_key_from_desc(
            key_buff3,
            row_desc,
            desc_size,
            &tmp_pk_key,
            &tmp_pk_val
            );
        // both packings must agree in size and content
        assert_always(tmp_num_bytes == key.size);
        cmp = memcmp(key_buff3,key_buff2,tmp_num_bytes);
        assert_always(cmp == 0);

        //
        // test key packing of clustering keys
        //
        if (key_is_clustering(&table->key_info[keynr])) {
            // reference: pack the clustering value from the MySQL record
            error = pack_row(&row, (const uchar *) record, keynr);
            assert_always(error == 0);
            uchar* tmp_buff = NULL;
            tmp_buff = (uchar*)tokudb::memory::malloc(
                alloced_rec_buff_length,
                MYF(MY_WME));
            assert_always(tmp_buff);
            // the clustering-val section is the third length-prefixed
            // section of the descriptor (skip two, strip the prefix)
            row_desc = (uchar *)share->key_file[keynr]->descriptor->dbt.data;
            row_desc += (*(uint32_t *)row_desc);
            row_desc += (*(uint32_t *)row_desc);
            desc_size = (*(uint32_t *)row_desc) - 4;
            row_desc += 4;
            tmp_num_bytes = pack_clustering_val_from_desc(
                tmp_buff,
                row_desc,
                desc_size,
                &tmp_pk_val
                );
            assert_always(tmp_num_bytes == row.size);
            cmp = memcmp(tmp_buff,rec_buff,tmp_num_bytes);
            assert_always(cmp == 0);
            tokudb::memory::free(tmp_buff);
        }
    }

    //
    // copy stuff back out: the pack_row calls above reused the shared
    // row buffer, so re-pack the pk value and confirm it still matches
    // the saved copy
    //
    error = pack_row(pk_val, (const uchar *) record, primary_key);
    assert_always(pk_val->size == tmp_pk_val.size);
    cmp = memcmp(pk_val->data, tmp_pk_val_data, pk_val->size);
    assert_always( cmp == 0);

    tokudb::memory::free(tmp_pk_key_data);
    tokudb::memory::free(tmp_pk_val_data);
}
3794 
3795 // set the put flags for the main dictionary
set_main_dict_put_flags(THD * thd,uint32_t * put_flags)3796 void ha_tokudb::set_main_dict_put_flags(THD* thd,
3797                                         uint32_t* put_flags) {
3798     uint32_t old_prelock_flags = 0;
3799 
3800     if (hidden_primary_key ||
3801         (!do_unique_checks(thd, in_rpl_write_rows | in_rpl_update_rows) &&
3802          !is_replace_into(thd) && !is_insert_ignore(thd))) {
3803         *put_flags = old_prelock_flags;
3804     } else {
3805         *put_flags = DB_NOOVERWRITE | old_prelock_flags;
3806     }
3807 }
3808 
insert_row_to_main_dictionary(DBT * pk_key,DBT * pk_val,DB_TXN * txn)3809 int ha_tokudb::insert_row_to_main_dictionary(
3810     DBT* pk_key,
3811     DBT* pk_val,
3812     DB_TXN* txn) {
3813 
3814     int error = 0;
3815     uint curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
3816     assert_always(curr_num_DBs == 1);
3817 
3818     uint32_t put_flags = mult_put_flags[primary_key];
3819     THD *thd = ha_thd();
3820     set_main_dict_put_flags(thd, &put_flags);
3821 
3822     // for test, make unique checks have a very long duration
3823     maybe_do_unique_checks_delay_if_flags_set(thd, put_flags, DB_NOOVERWRITE);
3824 
3825     error = share->file->put(share->file, txn, pk_key, pk_val, put_flags);
3826     if (error) {
3827         last_dup_key = primary_key;
3828         goto cleanup;
3829     }
3830 
3831 cleanup:
3832     return error;
3833 }
3834 
// Inserts one row (already packed as pk_key/pk_val) into the primary
// dictionary and all secondary dictionaries. Normally a single
// env->put_multiple() call covers every dictionary; with the INSERT
// IGNORE optimization (DB_NOOVERWRITE_NO_ERROR) each dictionary gets an
// individual put() instead, because put_multiple rejects that flag.
// Returns 0 on success; on any failure last_dup_key is set to
// primary_key for the caller's duplicate-key handling.
int ha_tokudb::insert_rows_to_dictionaries_mult(
    DBT* pk_key,
    DBT* pk_val,
    DB_TXN* txn,
    THD* thd) {

    int error = 0;
    uint curr_num_DBs = share->num_DBs;
    set_main_dict_put_flags(thd, &mult_put_flags[primary_key]);
    uint32_t flags = mult_put_flags[primary_key];

    // for test, make unique checks have a very long duration
    maybe_do_unique_checks_delay_if_flags_set(thd, flags, DB_NOOVERWRITE);

    // the insert ignore optimization uses DB_NOOVERWRITE_NO_ERROR,
    // which is not allowed with env->put_multiple.
    // we have to insert the rows one by one in this case.
    if (flags & DB_NOOVERWRITE_NO_ERROR) {
        DB * src_db = share->key_file[primary_key];
        for (uint32_t i = 0; i < curr_num_DBs; i++) {
            DB * db = share->key_file[i];
            if (i == primary_key) {
                // if it's the primary key, insert the rows
                // as they are.
                error = db->put(db, txn, pk_key, pk_val, flags);
            } else {
                // generate a row for secondary keys.
                // use our multi put key/rec buffers
                // just as the ydb layer would have in
                // env->put_multiple(), except that
                // we will just do a put() right away.
                error =
                    tokudb_generate_row(
                        db,
                        src_db,
                        &mult_key_dbt_array[i].dbts[0],
                        &mult_rec_dbt_array[i].dbts[0],
                        pk_key,
                        pk_val);
                if (error != 0) {
                    goto out;
                }
                error =
                    db->put(
                        db,
                        txn,
                        &mult_key_dbt_array[i].dbts[0],
                        &mult_rec_dbt_array[i].dbts[0],
                        flags);
            }
            if (error != 0) {
                goto out;
            }
        }
    } else {
        // not insert ignore, so we can use put multiple
        error =
            db_env->put_multiple(
                db_env,
                share->key_file[primary_key],
                txn,
                pk_key,
                pk_val,
                curr_num_DBs,
                share->key_file,
                mult_key_dbt_array,
                mult_rec_dbt_array,
                mult_put_flags);
    }

out:
    //
    // On any error, record the pk as the duplicate key; callers that
    // were told by MySQL to ignore duplicate key errors rely on
    // last_dup_key being set.
    //
    if (error) {
        last_dup_key = primary_key;
    }
    return error;
}
3915 
//
// Stores a row in the table, called when handling an INSERT query
// Parameters:
//      [in]    record - a row in MySQL format
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::write_row(uchar * record) {
    TOKUDB_HANDLER_DBUG_ENTER("%p", record);

    DBT row, prim_key;
    int error;
    THD *thd = ha_thd();
    bool has_null;
    DB_TXN* sub_trans = nullptr;
    DB_TXN* txn = nullptr;
    tokudb_trx_data* trx = nullptr;
    uint curr_num_DBs;
    bool num_DBs_locked = false;

    //
    // some crap that needs to be done because MySQL does not properly abstract
    // this work away from us, namely filling in auto increment and setting
    // auto timestamp
    //
    ha_statistic_increment(&SSV::ha_write_count);
    if (table->next_number_field && record == table->record[0]) {
        error = update_auto_increment();
        if (error)
            goto cleanup;
    }

    //
    // check to see if some value for the auto increment column that is bigger
    // than anything else til now is being used. If so, update the metadata to
    // reflect it the goal here is we never want to have a dup key error due to
    // a bad increment of the auto inc field.
    //
    if (share->has_auto_inc && record == table->record[0]) {
        share->lock();
        ulonglong curr_auto_inc = retrieve_auto_increment(
            table->field[share->ai_field_index]->key_type(),
            field_offset(table->field[share->ai_field_index], table),
            record);
        if (curr_auto_inc > share->last_auto_increment) {
            share->last_auto_increment = curr_auto_inc;
            // during a bulk insert the status-dictionary write is
            // deferred to end_bulk_insert (ai_metadata_update_required)
            if (delay_updating_ai_metadata) {
                ai_metadata_update_required = true;
            } else {
                update_max_auto_inc(
                    share->status_block,
                    share->last_auto_increment);
            }
        }
        share->unlock();
    }

    //
    // grab reader lock on numDBs_lock
    //
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
        num_DBs_locked = true;
    } else {
        // bulk insert already holds the read lock; every 2000 rows
        // drop and re-take it so writers are not starved
        lock_count++;
        if (lock_count >= 2000) {
            share->_num_DBs_lock.unlock();
            rwlock_t_lock_read(share->_num_DBs_lock);
            lock_count = 0;
        }
    }
    curr_num_DBs = share->num_DBs;

    // generate the next hidden pk value into current_ident
    if (hidden_primary_key) {
        get_auto_primary_key(current_ident);
    }

    // make sure the row buffer can hold variable-size blob data
    if (table_share->blob_fields) {
        if (fix_rec_buff_for_blob(max_row_length(record))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
    }

    // pack the primary key and the row value from the MySQL record
    create_dbt_key_from_table(
        &prim_key,
        primary_key,
        primary_key_buff,
        record,
        &has_null);
    if ((error = pack_row(&row, (const uchar*)record, primary_key))) {
        goto cleanup;
    }

    // IGNORE clause: run the insert in a child transaction so a
    // failure can be rolled back without disturbing work already done
    // in the statement transaction
    if (using_ignore) {
        error = txn_begin(
            db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
        if (error) {
            goto cleanup;
        }
    }

    txn = using_ignore ? sub_trans : transaction;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_TXN, "txn %p", txn);
    // debug mode: cross-check descriptor-based packing (asserts inside)
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_CHECK_KEY))) {
        test_row_packing(record,&prim_key,&row);
    }
    if (loader) {
        // bulk path: hand the row to the loader; uniqueness is
        // verified later in end_bulk_insert
        error = loader->put(loader, &prim_key, &row);
        if (error) {
            abort_loader = true;
            goto cleanup;
        }
    } else {
        error = do_uniqueness_checks(record, txn, thd);
        if (error) {
            // for #4633
            // if we have a duplicate key error, let's check the primary key to
            // see if there is a duplicate there. If so, set last_dup_key to the
            // pk
            if (error == DB_KEYEXIST &&
                !tokudb_test(hidden_primary_key) &&
                last_dup_key != primary_key) {
                int r =
                    share->file->getf_set(
                        share->file,
                        txn,
                        DB_SERIALIZABLE,
                        &prim_key,
                        smart_dbt_do_nothing,
                        NULL);
                if (r == 0) {
                    // if we get no error, that means the row
                    // was found and this is a duplicate key,
                    // so we set last_dup_key
                    last_dup_key = primary_key;
                } else if (r != DB_NOTFOUND) {
                    // if some other error is returned, return that to the user.
                    error = r;
                }
            }
            goto cleanup;
        }
        // single-dictionary tables take the simple put path; otherwise
        // all dictionaries are updated together
        if (curr_num_DBs == 1) {
            error = insert_row_to_main_dictionary(&prim_key, &row, txn);
            if (error) { goto cleanup; }
        } else {
            error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
            if (error) { goto cleanup; }
        }
        if (error == 0) {
            uint64_t full_row_size = prim_key.size + row.size;
            toku_hton_update_primary_key_bytes_inserted(full_row_size);
        }
    }

    trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    if (!error) {
        added_rows++;
        trx->stmt_progress.inserted++;
        track_progress(thd);
    }
cleanup:
    if (num_DBs_locked) {
       share->_num_DBs_lock.unlock();
    }
    if (error == DB_KEYEXIST) {
        error = HA_ERR_FOUND_DUPP_KEY;
    }
    if (sub_trans) {
        // no point in recording error value of abort.
        // nothing we can do about it anyway and it is not what
        // we want to return.
        if (error) {
            abort_txn(sub_trans);
        } else {
            commit_txn(sub_trans, DB_TXN_NOSYNC);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4098 
4099 /* Compare if a key in a row has changed */
key_changed(uint keynr,const uchar * old_row,const uchar * new_row)4100 bool ha_tokudb::key_changed(uint keynr, const uchar * old_row, const uchar * new_row) {
4101     DBT old_key;
4102     DBT new_key;
4103     memset((void *) &old_key, 0, sizeof(old_key));
4104     memset((void *) &new_key, 0, sizeof(new_key));
4105 
4106     bool has_null;
4107     create_dbt_key_from_table(&new_key, keynr, key_buff2, new_row, &has_null);
4108     create_dbt_key_for_lookup(&old_key,&table->key_info[keynr], key_buff3, old_row, &has_null);
4109     return tokudb_prefix_cmp_dbt_key(share->key_file[keynr], &old_key, &new_key);
4110 }
4111 
//
// Updates a row in the table, called when handling an UPDATE query
// Parameters:
//      [in]    old_row - row to be updated, in MySQL format
//      [in]    new_row - new row, in MySQL format
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::update_row(const uchar * old_row, uchar * new_row) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    DBT prim_key, old_prim_key, prim_row, old_prim_row;
    int error = 0;
    bool has_null;
    THD* thd = ha_thd();
    DB_TXN* sub_trans = NULL;
    DB_TXN* txn = NULL;
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
    uint curr_num_DBs;

    memset((void *) &prim_key, 0, sizeof(prim_key));
    memset((void *) &old_prim_key, 0, sizeof(old_prim_key));
    memset((void *) &prim_row, 0, sizeof(prim_row));
    memset((void *) &old_prim_row, 0, sizeof(old_prim_row));

    ha_statistic_increment(&SSV::ha_update_count);
    //
    // check to see if some value for the auto increment column that is bigger
    // than anything else til now is being used. If so, update the metadata to reflect it
    // the goal here is we never want to have a dup key error due to a bad increment
    // of the auto inc field.
    //
    if (share->has_auto_inc && new_row == table->record[0]) {
        share->lock();
        ulonglong curr_auto_inc = retrieve_auto_increment(
            table->field[share->ai_field_index]->key_type(),
            field_offset(table->field[share->ai_field_index], table),
            new_row
            );
        if (curr_auto_inc > share->last_auto_increment) {
            error = update_max_auto_inc(share->status_block, curr_auto_inc);
            if (!error) {
                share->last_auto_increment = curr_auto_inc;
            }
        }
        share->unlock();
    }

    //
    // grab reader lock on numDBs_lock
    //
    bool num_DBs_locked = false;
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
        num_DBs_locked = true;
    }
    curr_num_DBs = share->num_DBs;

    // IGNORE clause: run the update in a child transaction so a
    // failure can be rolled back without disturbing the statement
    // transaction
    if (using_ignore) {
        error = txn_begin(db_env, transaction, &sub_trans, DB_INHERIT_ISOLATION, thd);
        if (error) {
            goto cleanup;
        }
    }
    txn = using_ignore ? sub_trans : transaction;

    // a hidden pk is immutable, so old and new key are the same; a
    // real pk is packed from each row image
    if (hidden_primary_key) {
        memset((void *) &prim_key, 0, sizeof(prim_key));
        prim_key.data = (void *) current_ident;
        prim_key.size = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
        old_prim_key = prim_key;
    }
    else {
        create_dbt_key_from_table(&prim_key, primary_key, key_buff, new_row, &has_null);
        create_dbt_key_from_table(&old_prim_key, primary_key, primary_key_buff, old_row, &has_null);
    }

    // do uniqueness checks: only keys whose value actually changed
    // between the old and new row image need to be re-verified
    if (share->has_unique_keys && do_unique_checks(thd, in_rpl_update_rows)) {
        for (uint keynr = 0; keynr < table_share->keys; keynr++) {
            bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
            // a pk without string columns cannot produce false duplicates
            if (keynr == primary_key && !share->pk_has_string) {
                continue;
            }
            if (is_unique_key) {
                bool key_ch = key_changed(keynr, old_row, new_row);
                if (key_ch) {
                    bool is_unique;
                    error = is_val_unique(&is_unique, new_row, &table->key_info[keynr], keynr, txn);
                    if (error) goto cleanup;
                    if (!is_unique) {
                        error = DB_KEYEXIST;
                        last_dup_key = keynr;
                        goto cleanup;
                    }
                }
            }
        }
    }

    // make sure both row buffers can hold variable-size blob data
    if (table_share->blob_fields) {
        if (fix_rec_buff_for_blob(max_row_length(new_row))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
        if (fix_rec_update_buff_for_blob(max_row_length(old_row))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
    }

    // pack both row images for the multi-dictionary update below
    error = pack_row(&prim_row, new_row, primary_key);
    if (error) { goto cleanup; }

    error = pack_old_row_for_update(&old_prim_row, old_row, primary_key);
    if (error) { goto cleanup; }

    set_main_dict_put_flags(thd, &mult_put_flags[primary_key]);

    // for test, make unique checks have a very long duration
    if ((mult_put_flags[primary_key] & DB_OPFLAGS_MASK) == DB_NOOVERWRITE)
        maybe_do_unique_checks_delay(thd);

    // apply the delete-old/insert-new pair to the primary and all
    // secondary dictionaries in one engine call
    error = db_env->update_multiple(
        db_env,
        share->key_file[primary_key],
        txn,
        &old_prim_key,
        &old_prim_row,
        &prim_key,
        &prim_row,
        curr_num_DBs,
        share->key_file,
        mult_put_flags,
        2*curr_num_DBs,
        mult_key_dbt_array,
        curr_num_DBs,
        mult_rec_dbt_array
        );

    if (error == DB_KEYEXIST) {
        last_dup_key = primary_key;
    }
    else if (!error) {
        updated_rows++;
        trx->stmt_progress.updated++;
        track_progress(thd);
    }


cleanup:
    if (num_DBs_locked) {
        share->_num_DBs_lock.unlock();
    }
    if (error == DB_KEYEXIST) {
        error = HA_ERR_FOUND_DUPP_KEY;
    }
    if (sub_trans) {
        // no point in recording error value of abort.
        // nothing we can do about it anyway and it is not what
        // we want to return.
        if (error) {
            abort_txn(sub_trans);
        }
        else {
            commit_txn(sub_trans, DB_TXN_NOSYNC);
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4282 
4283 //
4284 // Deletes a row in the table, called when handling a DELETE query
4285 // Parameters:
4286 //      [in]    record - row to be deleted, in MySQL format
4287 // Returns:
4288 //      0 on success
4289 //      error otherwise
4290 //
int ha_tokudb::delete_row(const uchar * record) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = ENOSYS;
    DBT row, prim_key;
    bool has_null;
    THD* thd = ha_thd();
    uint curr_num_DBs;
    tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);

    ha_statistic_increment(&SSV::ha_delete_count);

    //
    // grab reader lock on numDBs_lock so the number of dictionaries cannot
    // change while we issue the multi-dictionary delete; bulk operations
    // already hold the lock (num_DBs_locked_in_bulk), so skip it then
    //
    bool num_DBs_locked = false;
    if (!num_DBs_locked_in_bulk) {
        rwlock_t_lock_read(share->_num_DBs_lock);
        num_DBs_locked = true;
    }
    curr_num_DBs = share->num_DBs;

    // build the primary-key DBT from the MySQL-format row
    create_dbt_key_from_table(&prim_key, primary_key, key_buff, record, &has_null);
    if (table_share->blob_fields) {
        // rows containing blobs may exceed the preallocated record buffer
        if (fix_rec_buff_for_blob(max_row_length(record))) {
            error = HA_ERR_OUT_OF_MEM;
            goto cleanup;
        }
    }
    // pack the full row so secondary-index keys can be derived from it
    if ((error = pack_row(&row, (const uchar *) record, primary_key))){
        goto cleanup;
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "all %p stmt %p sub_sp_level %p transaction %p",
        trx->all,
        trx->stmt,
        trx->sub_sp_level,
        transaction);

    // delete from the primary dictionary and all secondary dictionaries
    // in a single engine call
    error =
        db_env->del_multiple(
            db_env,
            share->key_file[primary_key],
            transaction,
            &prim_key,
            &row,
            curr_num_DBs,
            share->key_file,
            mult_key_dbt_array,
            mult_del_flags);

    if (error) {
        DBUG_PRINT("error", ("Got error %d", error));
    } else {
        deleted_rows++;
        trx->stmt_progress.deleted++;
        track_progress(thd);
    }
cleanup:
    if (num_DBs_locked) {
        share->_num_DBs_lock.unlock();
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4356 
4357 //
4358 // takes as input table->read_set and table->write_set
4359 // and puts list of field indexes that need to be read in
4360 // unpack_row in the member variables fixed_cols_for_query
4361 // and var_cols_for_query
4362 //
set_query_columns(uint keynr)4363 void ha_tokudb::set_query_columns(uint keynr) {
4364     uint32_t curr_fixed_col_index = 0;
4365     uint32_t curr_var_col_index = 0;
4366     read_key = false;
4367     read_blobs = false;
4368     //
4369     // i know this is probably confusing and will need to be explained better
4370     //
4371     uint key_index = 0;
4372 
4373     if (keynr == primary_key || keynr == MAX_KEY) {
4374         key_index = primary_key;
4375     }
4376     else {
4377         key_index = (key_is_clustering(&table->key_info[keynr]) ? keynr : primary_key);
4378     }
4379     for (uint i = 0; i < table_share->fields; i++) {
4380         if (bitmap_is_set(table->read_set,i) ||
4381             bitmap_is_set(table->write_set,i)
4382             )
4383         {
4384             if (bitmap_is_set(&share->kc_info.key_filters[key_index],i)) {
4385                 read_key = true;
4386             }
4387             else {
4388                 //
4389                 // if fixed field length
4390                 //
4391                 if (is_fixed_field(&share->kc_info, i)) {
4392                     //
4393                     // save the offset into the list
4394                     //
4395                     fixed_cols_for_query[curr_fixed_col_index] = i;
4396                     curr_fixed_col_index++;
4397                 }
4398                 //
4399                 // varchar or varbinary
4400                 //
4401                 else if (is_variable_field(&share->kc_info, i)) {
4402                     var_cols_for_query[curr_var_col_index] = i;
4403                     curr_var_col_index++;
4404                 }
4405                 //
4406                 // it is a blob
4407                 //
4408                 else {
4409                     read_blobs = true;
4410                 }
4411             }
4412         }
4413     }
4414     num_fixed_cols_for_query = curr_fixed_col_index;
4415     num_var_cols_for_query = curr_var_col_index;
4416 }
4417 
column_bitmaps_signal()4418 void ha_tokudb::column_bitmaps_signal() {
4419     //
4420     // if we have max number of indexes, then MAX_KEY == primary_key
4421     //
4422     if (tokudb_active_index != MAX_KEY || tokudb_active_index == primary_key) {
4423         set_query_columns(tokudb_active_index);
4424     }
4425 }
4426 
4427 //
4428 // Notification that a scan of entire secondary table is about
4429 // to take place. Will pre acquire table read lock
4430 // Returns:
4431 //      0 on success
4432 //      error otherwise
4433 //
int ha_tokudb::prepare_index_scan() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    HANDLE_INVALID_CURSOR();
    // prelock the whole key range (NULL..NULL) so the full-index scan does
    // not acquire row locks one at a time
    error = prelock_range(NULL, NULL);
    if (error) { last_cursor_error = error; goto cleanup; }

    range_lock_grabbed = true;
    error = 0;
cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4446 
index_key_is_null(TABLE * table,uint keynr,const uchar * key,uint key_len)4447 static bool index_key_is_null(
4448     TABLE* table,
4449     uint keynr,
4450     const uchar* key,
4451     uint key_len) {
4452 
4453     bool key_can_be_null = false;
4454     KEY* key_info = &table->key_info[keynr];
4455     KEY_PART_INFO* key_part = key_info->key_part;
4456     KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts;
4457     for (; key_part != end; key_part++) {
4458         if (key_part->null_bit) {
4459             key_can_be_null = true;
4460             break;
4461         }
4462     }
4463     return key_can_be_null && key_len > 0 && key[0] != 0;
4464 }
4465 
4466 // Return true if bulk fetch can be used
tokudb_do_bulk_fetch(THD * thd)4467 static bool tokudb_do_bulk_fetch(THD *thd) {
4468     switch (thd_sql_command(thd)) {
4469     case SQLCOM_SELECT:
4470     case SQLCOM_CREATE_TABLE:
4471     case SQLCOM_INSERT_SELECT:
4472     case SQLCOM_REPLACE_SELECT:
4473     case SQLCOM_DELETE:
4474         return tokudb::sysvars::bulk_fetch(thd) != 0;
4475     default:
4476         return false;
4477     }
4478 }
4479 
4480 //
4481 // Notification that a range query getting all elements that equal a key
4482 //  to take place. Will pre acquire read lock
4483 // Returns:
4484 //      0 on success
4485 //      error otherwise
4486 //
int ha_tokudb::prepare_index_key_scan(const uchar * key, uint key_len) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %u", key, key_len);
    int error = 0;
    DBT start_key, end_key;
    THD* thd = ha_thd();
    HANDLE_INVALID_CURSOR();
    // build [key with -inf suffix, key with +inf suffix] so the bounds cover
    // every row equal to the key prefix; keep copies of the packed bounds
    // (prelocked_left/right_range) for later range checks
    pack_key(&start_key, tokudb_active_index, prelocked_left_range, key, key_len, COL_NEG_INF);
    prelocked_left_range_size = start_key.size;
    pack_key(&end_key, tokudb_active_index, prelocked_right_range, key, key_len, COL_POS_INF);
    prelocked_right_range_size = end_key.size;

    error = cursor->c_set_bounds(
        cursor,
        &start_key,
        &end_key,
        true,
        (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
        );

    if (error){
        goto cleanup;
    }

    range_lock_grabbed = true;
    // remember whether the locked range was for a NULL key; index_read uses
    // this to drop the bounds if a later probe switches to a non-NULL key
    range_lock_grabbed_null = index_key_is_null(table, tokudb_active_index, key, key_len);
    doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
    bulk_fetch_iteration = 0;
    rows_fetched_using_bulk_fetch = 0;
    error = 0;
cleanup:
    if (error) {
        error = map_to_handler_error(error);
        last_cursor_error = error;
        //
        // cursor should be initialized here, but in case it is not,
        // we still check
        //
        if (cursor) {
            // a failed bounds setup leaves the cursor unusable; close it
            int r = cursor->c_close(cursor);
            assert_always(r==0);
            cursor = NULL;
            remove_from_trx_handler_list();
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4533 
invalidate_bulk_fetch()4534 void ha_tokudb::invalidate_bulk_fetch() {
4535     bytes_used_in_range_query_buff= 0;
4536     curr_range_query_buff_offset = 0;
4537     icp_went_out_of_range = false;
4538 }
4539 
invalidate_icp()4540 void ha_tokudb::invalidate_icp() {
4541     toku_pushed_idx_cond = NULL;
4542     toku_pushed_idx_cond_keyno = MAX_KEY;
4543     icp_went_out_of_range = false;
4544 }
4545 
4546 //
4547 // Initializes local cursor on DB with index keynr
4548 // Parameters:
4549 //          keynr - key (index) number
4550 //          sorted - 1 if result MUST be sorted according to index
4551 // Returns:
4552 //      0 on success
4553 //      error otherwise
4554 //
int ha_tokudb::index_init(uint keynr, bool sorted) {
    TOKUDB_HANDLER_DBUG_ENTER("%d %u txn %p", keynr, sorted, transaction);

    int error;
    THD* thd = ha_thd();
    DBUG_PRINT("enter", ("table: '%s'  key: %d", table_share->table_name.str, keynr));

    /*
       Under some very rare conditions (like full joins) we may already have
       an active cursor at this point
     */
    if (cursor) {
        DBUG_PRINT("note", ("Closing active cursor"));
        int r = cursor->c_close(cursor);
        assert_always(r==0);
        remove_from_trx_handler_list();
    }
    active_index = keynr;

    // MAX_KEY means "scan the primary/main dictionary"
    if (active_index < MAX_KEY) {
        assert(keynr <= table->s->keys);
    } else {
        assert(active_index == MAX_KEY);
        keynr = primary_key;
    }
    tokudb_active_index = keynr;

#if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    // a clustering index stores the full row, so a key-only read is
    // unnecessary there
    if (keynr < table->s->keys && table->key_info[keynr].option_struct->clustering)
        key_read = false;
#endif  // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING

    last_cursor_error = 0;
    range_lock_grabbed = false;
    range_lock_grabbed_null = false;
    assert(share->key_file[keynr]);
    // choose cursor flags from the lock type / session settings
    cursor_flags = get_cursor_isolation_flags(lock.type, thd);
    if (use_write_locks) {
        cursor_flags |= DB_RMW;
    }
    if (tokudb::sysvars::disable_prefetching(thd)) {
        cursor_flags |= DBC_DISABLE_PREFETCHING;
    }
    if (lock.type == TL_READ_WITH_SHARED_LOCKS) {
       cursor_flags |= DB_LOCKING_READ;
    }
    if ((error = share->key_file[keynr]->cursor(share->key_file[keynr],
                                                transaction, &cursor,
                                                cursor_flags))) {
        // map engine errors that have handler-level equivalents and raise
        // the matching client error now
        if (error == TOKUDB_MVCC_DICTIONARY_TOO_NEW) {
            error = HA_ERR_TABLE_DEF_CHANGED;
            my_error(ER_TABLE_DEF_CHANGED, MYF(0));
        }
        if (error == DB_LOCK_NOTGRANTED) {
            error = HA_ERR_LOCK_WAIT_TIMEOUT;
            my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
        }
        table->status = STATUS_NOT_FOUND;
        error = map_to_handler_error(error);
        last_cursor_error = error;
        cursor = NULL;             // Safety
        goto exit;
    }
    cursor->c_set_check_interrupt_callback(cursor, tokudb_killed_thd_callback, thd);
    memset((void *) &last_key, 0, sizeof(last_key));

    add_to_trx_handler_list();

    // plain SELECTs only need the columns the query touches; every other
    // statement must see the full row
    if (thd_sql_command(thd) == SQLCOM_SELECT) {
        set_query_columns(keynr);
        unpack_entire_row = false;
    }
    else {
        unpack_entire_row = true;
    }
    invalidate_bulk_fetch();
    doing_bulk_fetch = false;
    maybe_index_scan = false;
    error = 0;
exit:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4637 
4638 //
4639 // closes the local cursor
4640 //
index_end()4641 int ha_tokudb::index_end() {
4642     TOKUDB_HANDLER_DBUG_ENTER("");
4643     range_lock_grabbed = false;
4644     range_lock_grabbed_null = false;
4645     if (cursor) {
4646         DBUG_PRINT("enter", ("table: '%s'", table_share->table_name.str));
4647         int r = cursor->c_close(cursor);
4648         assert_always(r==0);
4649         cursor = NULL;
4650         remove_from_trx_handler_list();
4651         last_cursor_error = 0;
4652     }
4653     active_index = tokudb_active_index = MAX_KEY;
4654 
4655     //
4656     // reset query variables
4657     //
4658     unpack_entire_row = true;
4659     read_blobs = true;
4660     read_key = true;
4661     num_fixed_cols_for_query = 0;
4662     num_var_cols_for_query = 0;
4663 
4664     invalidate_bulk_fetch();
4665     invalidate_icp();
4666     doing_bulk_fetch = false;
4667     ds_mrr.dsmrr_close();
4668 
4669     TOKUDB_HANDLER_DBUG_RETURN(0);
4670 }
4671 
handle_cursor_error(int error,int err_to_return)4672 int ha_tokudb::handle_cursor_error(int error, int err_to_return) {
4673     TOKUDB_HANDLER_DBUG_ENTER("");
4674     if (error) {
4675         error = map_to_handler_error(error);
4676         last_cursor_error = error;
4677         table->status = STATUS_NOT_FOUND;
4678         if (error == DB_NOTFOUND) {
4679             error = err_to_return;
4680         }
4681     }
4682     TOKUDB_HANDLER_DBUG_RETURN(error);
4683 }
4684 
4685 
4686 //
4687 // Helper function for read_row and smart_dbt_callback_xxx functions
4688 // When using a hidden primary key, upon reading a row,
4689 // we set the current_ident field to whatever the primary key we retrieved
4690 // was
4691 //
extract_hidden_primary_key(uint keynr,DBT const * found_key)4692 void ha_tokudb::extract_hidden_primary_key(uint keynr, DBT const *found_key) {
4693     //
4694     // extract hidden primary key to current_ident
4695     //
4696     if (hidden_primary_key) {
4697         if (keynr == primary_key) {
4698             memcpy(current_ident, (char *) found_key->data, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
4699         }
4700         //
4701         // if secondary key, hidden primary key is at end of found_key
4702         //
4703         else {
4704             memcpy(
4705                 current_ident,
4706                 (char *) found_key->data + found_key->size - TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH,
4707                 TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH
4708                 );
4709         }
4710     }
4711 }
4712 
4713 
read_row_callback(uchar * buf,uint keynr,DBT const * row,DBT const * found_key)4714 int ha_tokudb::read_row_callback (uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
4715     assert_always(keynr == primary_key);
4716     return unpack_row(buf, row,found_key, keynr);
4717 }
4718 
4719 //
4720 // Reads the contents of row and found_key, DBT's retrieved from the DB associated to keynr, into buf
4721 // This function assumes that we are using a covering index, as a result, if keynr is the primary key,
4722 // we do not read row into buf
4723 // Parameters:
4724 //      [out]   buf - buffer for the row, in MySQL format
4725 //              keynr - index into key_file that represents DB we are currently operating on.
4726 //      [in]    row - the row that has been read from the preceding DB call
4727 //      [in]    found_key - key used to retrieve the row
4728 //
read_key_only(uchar * buf,uint keynr,DBT const * found_key)4729 void ha_tokudb::read_key_only(uchar * buf, uint keynr, DBT const *found_key) {
4730     TOKUDB_HANDLER_DBUG_ENTER("");
4731     table->status = 0;
4732     //
4733     // only case when we do not unpack the key is if we are dealing with the main dictionary
4734     // of a table with a hidden primary key
4735     //
4736     if (!(hidden_primary_key && keynr == primary_key)) {
4737         unpack_key(buf, found_key, keynr);
4738     }
4739     TOKUDB_HANDLER_DBUG_VOID_RETURN;
4740 }
4741 
4742 //
4743 // Helper function used to try to retrieve the entire row
4744 // If keynr is associated with the main table, reads contents of found_key and row into buf, otherwise,
4745 // makes copy of primary key and saves it to last_key. This can later be used to retrieve the entire row
4746 // Parameters:
4747 //      [out]   buf - buffer for the row, in MySQL format
4748 //              keynr - index into key_file that represents DB we are currently operating on.
4749 //      [in]    row - the row that has been read from the preceding DB call
4750 //      [in]    found_key - key used to retrieve the row
4751 //
int ha_tokudb::read_primary_key(uchar * buf, uint keynr, DBT const *row, DBT const *found_key) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    table->status = 0;
    //
    // case where we read from secondary table that is not clustered
    //
    if (keynr != primary_key && !key_is_clustering(&table->key_info[keynr])) {
        bool has_null;
        //
        // create a DBT that has the same data as row, this is inefficient
        // extract_hidden_primary_key MUST have been called before this
        // (it fills current_ident, which create_dbt_key_from_table relies
        // on when the PK is hidden)
        //
        memset((void *) &last_key, 0, sizeof(last_key));
        if (!hidden_primary_key) {
            unpack_key(buf, found_key, keynr);
        }
        // save the primary key into last_key so read_full_row can fetch
        // the whole row later
        create_dbt_key_from_table(
            &last_key,
            primary_key,
            key_buff,
            buf,
            &has_null
            );
    }
    //
    // else read from clustered/primary key
    //
    else {
        error = unpack_row(buf, row, found_key, keynr);
        if (error) { goto exit; }
    }
    if (found_key) { DBUG_DUMP("read row key", (uchar *) found_key->data, found_key->size); }
    error = 0;
exit:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4789 
4790 //
4791 // This function reads an entire row into buf. This function also assumes that
4792 // the key needed to retrieve the row is stored in the member variable last_key
4793 // Parameters:
4794 //      [out]   buf - buffer for the row, in MySQL format
4795 // Returns:
4796 //      0 on success, error otherwise
4797 //
read_full_row(uchar * buf)4798 int ha_tokudb::read_full_row(uchar * buf) {
4799     TOKUDB_HANDLER_DBUG_ENTER("");
4800     int error = 0;
4801     struct smart_dbt_info info;
4802     info.ha = this;
4803     info.buf = buf;
4804     info.keynr = primary_key;
4805     //
4806     // assumes key is stored in this->last_key
4807     //
4808 
4809     error = share->file->getf_set(share->file,
4810                                   transaction,
4811                                   cursor_flags,
4812                                   &last_key,
4813                                   smart_dbt_callback_rowread_ptquery,
4814                                   &info);
4815 
4816     DBUG_EXECUTE_IF("tokudb_fake_db_notfound_error_in_read_full_row", {
4817         error = DB_NOTFOUND;
4818     });
4819 
4820     if (error) {
4821         if (error == DB_LOCK_NOTGRANTED) {
4822             error = HA_ERR_LOCK_WAIT_TIMEOUT;
4823         } else if (error == DB_NOTFOUND) {
4824             error = HA_ERR_CRASHED;
4825             if (tokudb_active_index < share->_keys) {
4826                 sql_print_error(
4827                     "ha_tokudb::read_full_row on table %s cound not locate "
4828                     "record in PK that matches record found in key %s",
4829                     share->full_table_name(),
4830                     share->_key_descriptors[tokudb_active_index]._name);
4831             } else {
4832                 sql_print_error(
4833                     "ha_tokudb::read_full_row on table %s cound not locate "
4834                     "record in PK that matches record found in key %d",
4835                     share->full_table_name(),
4836                     tokudb_active_index);
4837             }
4838         }
4839         table->status = STATUS_NOT_FOUND;
4840     }
4841 
4842     TOKUDB_HANDLER_DBUG_RETURN(error);
4843 }
4844 
4845 
4846 //
4847 // Reads the next row matching to the key, on success, advances cursor
4848 // Parameters:
4849 //      [out]   buf - buffer for the next row, in MySQL format
4850 //      [in]     key - key value
4851 //                keylen - length of key
4852 // Returns:
4853 //      0 on success
4854 //      HA_ERR_END_OF_FILE if not found
4855 //      error otherwise
4856 //
int ha_tokudb::index_next_same(uchar* buf, const uchar* key, uint keylen) {
    TOKUDB_HANDLER_DBUG_ENTER("");
    ha_statistic_increment(&SSV::ha_read_next_count);

    DBT curr_key;
    DBT found_key;
    bool has_null;
    int cmp;
    // create the key that will be used to compare with what is found
    // in order to figure out if we should return an error
    pack_key(&curr_key, tokudb_active_index, key_buff2, key, keylen, COL_ZERO);
    int error = get_next(buf, 1, &curr_key, key_read);
    if (error) {
        goto cleanup;
    }
    //
    // now do the comparison: rebuild the key from the fetched row and
    // prefix-compare it against the search key; a mismatch means we have
    // moved past the group of equal keys
    //
    create_dbt_key_from_table(
        &found_key,
        tokudb_active_index,
        key_buff3,buf,
        &has_null);
    cmp =
        tokudb_prefix_cmp_dbt_key(
            share->key_file[tokudb_active_index],
            &curr_key,
            &found_key);
    if (cmp) {
        error = HA_ERR_END_OF_FILE;
    }

cleanup:
    // funnel every outcome through handle_cursor_error so DB_NOTFOUND is
    // mapped to HA_ERR_END_OF_FILE and cursor state is recorded
    error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
4893 
4894 
4895 //
4896 // According to InnoDB handlerton: Positions an index cursor to the index
4897 // specified in keynr. Fetches the row if any
4898 // Parameters:
4899 //      [out]       buf - buffer for the  returned row
4900 //      [in]         key - key value, according to InnoDB, if NULL,
4901 //                              position cursor at start or end of index,
4902 //                              not sure if this is done now
4903 //                    key_len - length of key
4904 //                    find_flag - according to InnoDB, search flags from my_base.h
4905 // Returns:
4906 //      0 on success
4907 //      HA_ERR_KEY_NOT_FOUND if not found (per InnoDB),
4908 //          we seem to return HA_ERR_END_OF_FILE if find_flag != HA_READ_KEY_EXACT
4909 //          TODO: investigate this for correctness
4910 //      error otherwise
4911 //
int ha_tokudb::index_read(
    uchar* buf,
    const uchar* key,
    uint key_len,
    enum ha_rkey_function find_flag) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "key %p %u:%2.2x find=%u",
        key,
        key_len,
        key ? key[0] : 0,
        find_flag);
    invalidate_bulk_fetch();
    if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
        TOKUDB_DBUG_DUMP("mysql key=", key, key_len);
    }
    DBT row;
    DBT lookup_key;
    int error = 0;
    uint32_t flags = 0;
    THD* thd = ha_thd();
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    struct smart_dbt_info info;
    struct index_read_info ir_info;

    HANDLE_INVALID_CURSOR();

    // if we locked a non-null key range and we now have a null key, then
    // remove the bounds from the cursor
    if (range_lock_grabbed &&
        !range_lock_grabbed_null &&
        index_key_is_null(table, tokudb_active_index, key, key_len)) {
        range_lock_grabbed = range_lock_grabbed_null = false;
        cursor->c_remove_restriction(cursor);
    }

    ha_statistic_increment(&SSV::ha_read_key_count);
    memset((void *) &row, 0, sizeof(row));

    info.ha = this;
    info.buf = buf;
    info.keynr = tokudb_active_index;

    // ir_info wraps info and additionally records whether the found key
    // matched the search key exactly (ir_info.cmp)
    ir_info.smart_dbt_info = info;
    ir_info.cmp = 0;

    flags = SET_PRELOCK_FLAG(0);
    switch (find_flag) {
    case HA_READ_KEY_EXACT: /* Find first record else error */ {
        // position at >= key (padded with -inf), bounded above by the key
        // padded with +inf; a prefix mismatch (ir_info.cmp) means no match
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        DBT lookup_bound;
        pack_key(&lookup_bound, tokudb_active_index, key_buff4, key, key_len, COL_POS_INF);
        if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_INDEX_KEY))) {
            TOKUDB_DBUG_DUMP("tokudb key=", lookup_key.data, lookup_key.size);
        }
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range_with_bound(cursor, flags, &lookup_key, &lookup_bound, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        if (ir_info.cmp) {
            error = DB_NOTFOUND;
        }
        break;
    }
    case HA_READ_AFTER_KEY: /* Find next rec. after key-record */
        // first row strictly greater than the key (+inf padding)
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_BEFORE_KEY: /* Find next rec. before key-record */
        // last row strictly less than the key (-inf padding, reverse scan)
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_KEY_OR_NEXT: /* Record or next record */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    //
    // This case does not seem to ever be used, it is ok for it to be slow
    //
    case HA_READ_KEY_OR_PREV: /* Record or previous */
        // try >= key first; fall back to the last row (empty tail) or step
        // back one row when the found key does not match
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_NEG_INF);
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        if (error == DB_NOTFOUND) {
            error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
        }
        else if (ir_info.cmp) {
            error = cursor->c_getf_prev(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
        }
        break;
    case HA_READ_PREFIX_LAST_OR_PREV: /* Last or prev key with the same prefix */
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_CALLBACK(key_read), &info);
        break;
    case HA_READ_PREFIX_LAST:
        // like the previous case but the prefix must match exactly
        pack_key(&lookup_key, tokudb_active_index, key_buff3, key, key_len, COL_POS_INF);
        ir_info.orig_key = &lookup_key;
        error = cursor->c_getf_set_range_reverse(cursor, flags, &lookup_key, SMART_DBT_IR_CALLBACK(key_read), &ir_info);
        if (ir_info.cmp) {
            error = DB_NOTFOUND;
        }
        break;
    default:
        TOKUDB_HANDLER_TRACE("unsupported:%d", find_flag);
        error = HA_ERR_UNSUPPORTED;
        break;
    }
    error = handle_cursor_error(error, HA_ERR_KEY_NOT_FOUND);
    // a non-covering secondary index only yielded the PK; fetch the rest
    // of the row from the primary dictionary
    if (!error && !key_read && tokudb_active_index != primary_key && !key_is_clustering(&table->key_info[tokudb_active_index])) {
        error = read_full_row(buf);
    }

    if (TOKUDB_UNLIKELY(error && TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ERROR))) {
        TOKUDB_HANDLER_TRACE("error:%d:%d", error, find_flag);
    }
    trx->stmt_progress.queried++;
    track_progress(thd);

cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
5031 
5032 
// Unpack the next buffered row out of range_query_buff (filled earlier by
// fill_range_query_buf() during a bulk fetch) into buf, and advance
// curr_range_query_buff_offset past the consumed entry.
//
// Entry layout in the buffer:
//   [uint32 key_size][key bytes]
// followed, when need_val is true, by either
//   [uint32 val_size][val bytes]                  (unpack_entire_row case)
// or the pre-extracted columns:
//   [null bytes][fixed fields][per-var-field: uint32 len + data]
//   [uint32 blob_size + blob data, only if read_blobs]
//
// Parameters:
//   buf         - [out] destination row, in MySQL row format
//   need_val    - the entry carries value data beyond the key
//   do_key_read - covering-index read: the key alone fills buf
// Returns 0 on success, error otherwise.
int ha_tokudb::read_data_from_range_query_buff(uchar* buf, bool need_val, bool do_key_read) {
    // buffer has the next row, get it from there
    int error;
    uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
    DBT curr_key;
    memset((void *) &curr_key, 0, sizeof(curr_key));

    // get key info
    uint32_t key_size = *(uint32_t *)curr_pos;
    curr_pos += sizeof(key_size);
    uchar* curr_key_buff = curr_pos;
    curr_pos += key_size;

    curr_key.data = curr_key_buff;
    curr_key.size = key_size;

    // if this is a covering index, this is all we need
    if (do_key_read) {
        assert_always(!need_val);
        extract_hidden_primary_key(tokudb_active_index, &curr_key);
        read_key_only(buf, tokudb_active_index, &curr_key);
        error = 0;
    }
    // we need to get more data
    else {
        DBT curr_val;
        memset((void *) &curr_val, 0, sizeof(curr_val));
        uchar* curr_val_buff = NULL;
        uint32_t val_size = 0;
        // in this case, we don't have a val, we are simply extracting the pk
        if (!need_val) {
            curr_val.data = curr_val_buff;
            curr_val.size = val_size;
            extract_hidden_primary_key(tokudb_active_index, &curr_key);
            error = read_primary_key( buf, tokudb_active_index, &curr_val, &curr_key);
        }
        else {
            extract_hidden_primary_key(tokudb_active_index, &curr_key);
            // need to extract a val and place it into buf
            if (unpack_entire_row) {
                // get val info
                val_size = *(uint32_t *)curr_pos;
                curr_pos += sizeof(val_size);
                curr_val_buff = curr_pos;
                curr_pos += val_size;
                curr_val.data = curr_val_buff;
                curr_val.size = val_size;
                error = unpack_row(buf,&curr_val, &curr_key, tokudb_active_index);
            }
            else {
                // only the queried columns were buffered; first unpack the
                // key part (a hidden primary key holds no user columns, so
                // it is skipped)
                if (!(hidden_primary_key && tokudb_active_index == primary_key)) {
                    unpack_key(buf,&curr_key,tokudb_active_index);
                }
                // read rows we care about

                // first the null bytes;
                memcpy(buf, curr_pos, table_share->null_bytes);
                curr_pos += table_share->null_bytes;

                // now the fixed sized rows
                for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
                    uint field_index = fixed_cols_for_query[i];
                    Field* field = table->field[field_index];
                    unpack_fixed_field(
                        buf + field_offset(field, table),
                        curr_pos,
                        share->kc_info.field_lengths[field_index]
                        );
                    curr_pos += share->kc_info.field_lengths[field_index];
                }
                // now the variable sized rows; each was buffered as
                // [uint32 length][data]
                for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
                    uint field_index = var_cols_for_query[i];
                    Field* field = table->field[field_index];
                    uint32_t field_len = *(uint32_t *)curr_pos;
                    curr_pos += sizeof(field_len);
                    unpack_var_field(
                        buf + field_offset(field, table),
                        curr_pos,
                        field_len,
                        share->kc_info.length_bytes[field_index]
                        );
                    curr_pos += field_len;
                }
                // now the blobs
                if (read_blobs) {
                    uint32_t blob_size = *(uint32_t *)curr_pos;
                    curr_pos += sizeof(blob_size);
                    error = unpack_blobs(
                        buf,
                        curr_pos,
                        blob_size,
                        true
                        );
                    curr_pos += blob_size;
                    if (error) {
                        // buffer contents can no longer be trusted; abandon
                        // the bulk fetch (note: the offset is NOT advanced
                        // on this path)
                        invalidate_bulk_fetch();
                        goto exit;
                    }
                }
                error = 0;
            }
        }
    }

    // remember how far we have consumed for the next call
    curr_range_query_buff_offset = curr_pos - range_query_buff;
exit:
    return error;
}
5142 
smart_dbt_bf_callback(DBT const * key,DBT const * row,void * context)5143 static int smart_dbt_bf_callback(
5144     DBT const* key,
5145     DBT const* row,
5146     void* context) {
5147     SMART_DBT_BF_INFO info = (SMART_DBT_BF_INFO)context;
5148     return
5149         info->ha->fill_range_query_buf(
5150             info->need_val,
5151             key,
5152             row,
5153             info->direction,
5154             info->thd,
5155             info->buf,
5156             info->key_to_compare);
5157 }
5158 
toku_handler_index_cond_check(Item * pushed_idx_cond)5159 enum icp_result ha_tokudb::toku_handler_index_cond_check(
5160     Item* pushed_idx_cond) {
5161 
5162     enum icp_result res;
5163     if (end_range) {
5164         int cmp;
5165         cmp = compare_key_icp(end_range);
5166         if (cmp > 0) {
5167             return ICP_OUT_OF_RANGE;
5168         }
5169     }
5170     res = pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
5171     return res;
5172 }
5173 
// Fill in the range query buf for bulk fetch.
//
// Called from smart_dbt_bf_callback() for each row the cursor produces:
// appends the current key (and, when need_val, value data) to
// range_query_buff in the format consumed by
// read_data_from_range_query_buff(), then decides whether the cursor
// should keep fetching.
//
// Parameters:
//   need_val       - store value data, not just the key
//   key/row        - current cursor position
//   direction      - > 0 ascending scan, < 0 descending
//   thd            - current session (kill checks, read_buf_size sysvar)
//   buf            - MySQL record buffer, used to evaluate the pushed
//                    index condition (ICP)
//   key_to_compare - prefix key bounding the scan, or NULL
//
// Returns:
//   0                      - stop the bulk fetch (buffer full enough,
//                            prelocked range overrun, or row-count bound
//                            for this iteration reached)
//   TOKUDB_CURSOR_CONTINUE - ask the cursor for the next row
//   other                  - error (e.g. ENOMEM)
int ha_tokudb::fill_range_query_buf(
    bool need_val,
    DBT const* key,
    DBT const* row,
    int direction,
    THD* thd,
    uchar* buf,
    DBT* key_to_compare) {

    int error;
    //
    // first put the value into range_query_buf
    //
    uint32_t size_remaining =
        size_range_query_buff - bytes_used_in_range_query_buff;
    uint32_t size_needed;
    uint32_t user_defined_size = tokudb::sysvars::read_buf_size(thd);
    uchar* curr_pos = NULL;

    // stop as soon as the key no longer matches the bounding prefix
    if (key_to_compare) {
        int cmp = tokudb_prefix_cmp_dbt_key(
            share->key_file[tokudb_active_index],
            key_to_compare,
            key);
        if (cmp) {
            icp_went_out_of_range = true;
            error = 0;
            goto cleanup;
        }
    }

    // if we have an index condition pushed down, we check it
    if (toku_pushed_idx_cond &&
        (tokudb_active_index == toku_pushed_idx_cond_keyno)) {
        unpack_key(buf, key, tokudb_active_index);
        enum icp_result result =
            toku_handler_index_cond_check(toku_pushed_idx_cond);

        // If we have reason to stop, we set icp_went_out_of_range and get out
        // otherwise, if we simply see that the current key is no match,
        // we tell the cursor to continue and don't store
        // the key locally
        if (result == ICP_OUT_OF_RANGE || thd_killed(thd)) {
            icp_went_out_of_range = true;
            error = 0;
            DEBUG_SYNC(ha_thd(), "tokudb_icp_asc_scan_out_of_range");
            goto cleanup;
        } else if (result == ICP_NO_MATCH) {
            // Optimizer change for MyRocks also benefits us here in TokuDB as
            // opt_range.cc QUICK_SELECT::get_next now sets end_range during
            // descending scan. We should not ever hit this condition, but
            // leaving this code in to prevent any possibility of a descending
            // scan to the beginning of an index and catch any possibility
            // in debug builds with an assertion
            assert_debug(!(!end_range && direction < 0));
            if (!end_range &&
                direction < 0) {
                cancel_pushed_idx_cond();
            }
            error = TOKUDB_CURSOR_CONTINUE;
            goto cleanup;
        }
    }

    // at this point, if ICP is on, we have verified that the key is one
    // we are interested in, so we proceed with placing the data
    // into the range query buffer

    // compute (an upper bound on) the bytes this entry will occupy so the
    // buffer can be grown up front
    if (need_val) {
        if (unpack_entire_row) {
            size_needed = 2*sizeof(uint32_t) + key->size + row->size;
        } else {
            // this is an upper bound
            size_needed =
                // size of key length
                sizeof(uint32_t) +
                // key and row
                key->size + row->size +
                // lengths of varchars stored
                num_var_cols_for_query * (sizeof(uint32_t)) +
                // length of blobs
                sizeof(uint32_t);
        }
    } else {
        size_needed = sizeof(uint32_t) + key->size;
    }
    if (size_remaining < size_needed) {
        range_query_buff =
            static_cast<uchar*>(tokudb::memory::realloc(
                static_cast<void*>(range_query_buff),
                bytes_used_in_range_query_buff + size_needed,
                MYF(MY_WME)));
        if (range_query_buff == NULL) {
            error = ENOMEM;
            invalidate_bulk_fetch();
            goto cleanup;
        }
        size_range_query_buff = bytes_used_in_range_query_buff + size_needed;
    }
    //
    // now we know we have the size, let's fill the buffer, starting with the key
    //
    curr_pos = range_query_buff + bytes_used_in_range_query_buff;

    // entry starts with [uint32 key_size][key bytes]
    *reinterpret_cast<uint32_t*>(curr_pos) = key->size;
    curr_pos += sizeof(uint32_t);
    memcpy(curr_pos, key->data, key->size);
    curr_pos += key->size;
    if (need_val) {
        if (unpack_entire_row) {
            // whole row stored as [uint32 row_size][row bytes]
            *reinterpret_cast<uint32_t*>(curr_pos) = row->size;
            curr_pos += sizeof(uint32_t);
            memcpy(curr_pos, row->data, row->size);
            curr_pos += row->size;
        } else {
            // need to unpack just the data we care about
            const uchar* fixed_field_ptr = static_cast<const uchar*>(row->data);
            fixed_field_ptr += table_share->null_bytes;

            const uchar* var_field_offset_ptr = NULL;
            const uchar* var_field_data_ptr = NULL;

            // the packed row layout is: null bytes, fixed fields, var
            // field offsets, var field data, then blobs
            var_field_offset_ptr =
                fixed_field_ptr +
                share->kc_info.mcp_info[tokudb_active_index].fixed_field_size;
            var_field_data_ptr =
                var_field_offset_ptr +
                share->kc_info.mcp_info[tokudb_active_index].len_of_offsets;

            // first the null bytes
            memcpy(curr_pos, row->data, table_share->null_bytes);
            curr_pos += table_share->null_bytes;
            // now the fixed fields
            //
            // first the fixed fields
            //
            for (uint32_t i = 0; i < num_fixed_cols_for_query; i++) {
                uint field_index = fixed_cols_for_query[i];
                memcpy(
                    curr_pos,
                    fixed_field_ptr + share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val,
                    share->kc_info.field_lengths[field_index]);
                curr_pos += share->kc_info.field_lengths[field_index];
            }

            //
            // now the var fields
            // each stored as [uint32 length][data]
            //
            for (uint32_t i = 0; i < num_var_cols_for_query; i++) {
                uint field_index = var_cols_for_query[i];
                uint32_t var_field_index =
                    share->kc_info.cp_info[tokudb_active_index][field_index].col_pack_val;
                uint32_t data_start_offset;
                uint32_t field_len;

                get_var_field_info(
                    &field_len,
                    &data_start_offset,
                    var_field_index,
                    var_field_offset_ptr,
                    share->kc_info.num_offset_bytes);
                memcpy(curr_pos, &field_len, sizeof(field_len));
                curr_pos += sizeof(field_len);
                memcpy(
                    curr_pos,
                    var_field_data_ptr + data_start_offset,
                    field_len);
                curr_pos += field_len;
            }

            if (read_blobs) {
                uint32_t blob_offset = 0;
                uint32_t data_size = 0;
                //
                // now the blobs: everything from the blob offset to the
                // end of the row, stored as [uint32 size][data]
                //
                get_blob_field_info(
                    &blob_offset,
                    share->kc_info.mcp_info[tokudb_active_index].len_of_offsets,
                    var_field_data_ptr,
                    share->kc_info.num_offset_bytes);
                data_size =
                    row->size -
                    blob_offset -
                    static_cast<uint32_t>((var_field_data_ptr -
                        static_cast<const uchar*>(row->data)));
                memcpy(curr_pos, &data_size, sizeof(data_size));
                curr_pos += sizeof(data_size);
                memcpy(curr_pos, var_field_data_ptr + blob_offset, data_size);
                curr_pos += data_size;
            }
        }
    }

    bytes_used_in_range_query_buff = curr_pos - range_query_buff;
    assert_always(bytes_used_in_range_query_buff <= size_range_query_buff);

    //
    // now determine if we should continue with the bulk fetch
    // we want to stop under these conditions:
    //  - we overran the prelocked range
    //  - we are close to the end of the buffer
    //  - we have fetched an exponential amount of rows with
    //  respect to the bulk fetch iteration, which is initialized
    //  to 0 in index_init() and prelock_range().
    //

    rows_fetched_using_bulk_fetch++;
    // if the iteration is less than the number of possible shifts on
    // a 64 bit integer, check that we haven't exceeded this iterations
    // row fetch upper bound.
    if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
        uint64_t row_fetch_upper_bound = 1LLU << bulk_fetch_iteration;
        assert_always(row_fetch_upper_bound > 0);
        if (rows_fetched_using_bulk_fetch >= row_fetch_upper_bound) {
            error = 0;
            goto cleanup;
        }
    }

    // stop before the next row could overflow the user-configured buffer
    if (bytes_used_in_range_query_buff +
        table_share->rec_buff_length >
        user_defined_size) {
        error = 0;
        goto cleanup;
    }
    if (direction > 0) {
        // compare what we got to the right endpoint of prelocked range
        // because we are searching keys in ascending order
        if (prelocked_right_range_size == 0) {
            error = TOKUDB_CURSOR_CONTINUE;
            goto cleanup;
        }
        DBT right_range;
        memset(&right_range, 0, sizeof(right_range));
        right_range.size = prelocked_right_range_size;
        right_range.data = prelocked_right_range;
        int cmp = tokudb_cmp_dbt_key(
            share->key_file[tokudb_active_index],
            key,
            &right_range);
        error = (cmp > 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
    } else {
        // compare what we got to the left endpoint of prelocked range
        // because we are searching keys in descending order
        if (prelocked_left_range_size == 0) {
            error = TOKUDB_CURSOR_CONTINUE;
            goto cleanup;
        }
        DBT left_range;
        memset(&left_range, 0, sizeof(left_range));
        left_range.size = prelocked_left_range_size;
        left_range.data = prelocked_left_range;
        int cmp = tokudb_cmp_dbt_key(
            share->key_file[tokudb_active_index],
            key,
            &left_range);
        error = (cmp < 0) ? 0 : TOKUDB_CURSOR_CONTINUE;
    }
cleanup:
    return error;
}
5436 
// Common implementation behind index_next/index_prev/rnd_next: return the
// next row of the active index in the given direction, using bulk fetch
// (range_query_buff) when enabled.
//
// Parameters:
//   buf            - [out] destination row, in MySQL row format
//   direction      - > 0 forward scan, < 0 backward
//   key_to_compare - prefix key bounding the scan (for prefix-last style
//                    reads), or NULL
//   do_key_read    - covering-index read: fill buf from the key alone
// Returns 0 on success, HA_ERR_END_OF_FILE at end of scan, error otherwise.
int ha_tokudb::get_next(
    uchar* buf,
    int direction,
    DBT* key_to_compare,
    bool do_key_read) {

    int error = 0;
    // NOTE: macro bails out (to cleanup) when there is no valid cursor
    HANDLE_INVALID_CURSOR();

    // a scan that began with index_first/index_last grabs its range lock
    // lazily, on the first advance
    if (maybe_index_scan) {
        maybe_index_scan = false;
        if (!range_lock_grabbed) {
            error = prepare_index_scan();
        }
    }

    if (!error) {
        uint32_t flags = SET_PRELOCK_FLAG(0);

        // we need to read the val of what we retrieve if
        // we do NOT have a covering index AND we are using a clustering secondary
        // key
        bool need_val =
            (do_key_read == 0) &&
            (tokudb_active_index == primary_key ||
             key_is_clustering(&table->key_info[tokudb_active_index]));

        // serve from the bulk-fetch buffer if it still holds unread rows
        if ((bytes_used_in_range_query_buff -
             curr_range_query_buff_offset) > 0) {
            error = read_data_from_range_query_buff(buf, need_val, do_key_read);
        } else if (icp_went_out_of_range) {
            // previous fill hit the end of the ICP range and the buffer is
            // drained: the scan is over
            icp_went_out_of_range = false;
            error = HA_ERR_END_OF_FILE;
        } else {
            invalidate_bulk_fetch();
            if (doing_bulk_fetch) {
                struct smart_dbt_info bf_info_unused; // (see bf_info below)
                struct smart_dbt_bf_info bf_info;
                bf_info.ha = this;
                // you need the val if you have a clustering index and key_read is not 0;
                bf_info.direction = direction;
                bf_info.thd = ha_thd();
                bf_info.need_val = need_val;
                bf_info.buf = buf;
                bf_info.key_to_compare = key_to_compare;
                //
                // call c_getf_next with purpose of filling in range_query_buff
                //
                rows_fetched_using_bulk_fetch = 0;
                // it is expected that we can do ICP in the smart_dbt_bf_callback
                // as a result, it's possible we don't return any data because
                // none of the rows matched the index condition. Therefore, we need
                // this while loop. icp_out_of_range will be set if we hit a row that
                // the index condition states is out of our range. When that hits,
                // we know all the data in the buffer is the last data we will retrieve
                while (bytes_used_in_range_query_buff == 0 &&
                       !icp_went_out_of_range && error == 0) {
                    if (direction > 0) {
                        error =
                            cursor->c_getf_next(
                                cursor,
                                flags,
                                smart_dbt_bf_callback,
                                &bf_info);
                    } else {
                        error =
                            cursor->c_getf_prev(
                                cursor,
                                flags,
                                smart_dbt_bf_callback,
                                &bf_info);
                    }
                }
                // if there is no data set and we went out of range,
                // then there is nothing to return
                if (bytes_used_in_range_query_buff == 0 &&
                    icp_went_out_of_range) {
                    icp_went_out_of_range = false;
                    error = HA_ERR_END_OF_FILE;
                }
                // grow the per-iteration row bound (exponential ramp-up)
                if (bulk_fetch_iteration < HA_TOKU_BULK_FETCH_ITERATION_MAX) {
                    bulk_fetch_iteration++;
                }

                error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
                if (error) {
                    goto cleanup;
                }

                //
                // now that range_query_buff is filled, read an element
                //
                error =
                    read_data_from_range_query_buff(buf, need_val, do_key_read);
            } else {
                // non-bulk path: single-row cursor advance
                struct smart_dbt_info info;
                info.ha = this;
                info.buf = buf;
                info.keynr = tokudb_active_index;

                if (direction > 0) {
                    error =
                        cursor->c_getf_next(
                            cursor,
                            flags,
                            SMART_DBT_CALLBACK(do_key_read),
                            &info);
                } else {
                    error =
                        cursor->c_getf_prev(
                            cursor,
                            flags,
                            SMART_DBT_CALLBACK(do_key_read),
                            &info);
                }
                error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
            }
        }
    }

    //
    // at this point, one of two things has happened
    // either we have unpacked the data into buf, and we
    // are done, or we have unpacked the primary key
    // into last_key, and we use the code below to
    // read the full row by doing a point query into the
    // main table.
    //
    if (!error &&
        !do_key_read &&
        (tokudb_active_index != primary_key) &&
        !key_is_clustering(&table->key_info[tokudb_active_index])) {
        error = read_full_row(buf);
    }

    if (!error) {
        THD *thd = ha_thd();
        tokudb_trx_data* trx =
            static_cast<tokudb_trx_data*>(thd_get_ha_data(thd, tokudb_hton));
        trx->stmt_progress.queried++;
        track_progress(thd);
        if (thd_killed(thd))
            error = ER_ABORTING_CONNECTION;
    }
cleanup:
    return error;
}
5583 
5584 
5585 //
5586 // Reads the next row from the active index (cursor) into buf, and advances cursor
5587 // Parameters:
5588 //      [out]   buf - buffer for the next row, in MySQL format
5589 // Returns:
5590 //      0 on success
5591 //      HA_ERR_END_OF_FILE if not found
5592 //      error otherwise
5593 //
index_next(uchar * buf)5594 int ha_tokudb::index_next(uchar * buf) {
5595     TOKUDB_HANDLER_DBUG_ENTER("");
5596     ha_statistic_increment(&SSV::ha_read_next_count);
5597     int error = get_next(buf, 1, NULL, key_read);
5598     TOKUDB_HANDLER_DBUG_RETURN(error);
5599 }
5600 
5601 
index_read_last(uchar * buf,const uchar * key,uint key_len)5602 int ha_tokudb::index_read_last(uchar * buf, const uchar * key, uint key_len) {
5603     return(index_read(buf, key, key_len, HA_READ_PREFIX_LAST));
5604 }
5605 
5606 
5607 //
5608 // Reads the previous row from the active index (cursor) into buf, and advances cursor
5609 // Parameters:
5610 //      [out]   buf - buffer for the next row, in MySQL format
5611 // Returns:
5612 //      0 on success
5613 //      HA_ERR_END_OF_FILE if not found
5614 //      error otherwise
5615 //
index_prev(uchar * buf)5616 int ha_tokudb::index_prev(uchar * buf) {
5617     TOKUDB_HANDLER_DBUG_ENTER("");
5618     ha_statistic_increment(&SSV::ha_read_prev_count);
5619     int error = get_next(buf, -1, NULL, key_read);
5620     TOKUDB_HANDLER_DBUG_RETURN(error);
5621 }
5622 
5623 //
5624 // Reads the first row from the active index (cursor) into buf, and advances cursor
5625 // Parameters:
5626 //      [out]   buf - buffer for the next row, in MySQL format
5627 // Returns:
5628 //      0 on success
5629 //      HA_ERR_END_OF_FILE if not found
5630 //      error otherwise
5631 //
index_first(uchar * buf)5632 int ha_tokudb::index_first(uchar * buf) {
5633     TOKUDB_HANDLER_DBUG_ENTER("");
5634     invalidate_bulk_fetch();
5635     int error = 0;
5636     struct smart_dbt_info info;
5637     uint32_t flags = SET_PRELOCK_FLAG(0);
5638     THD* thd = ha_thd();
5639     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);;
5640     HANDLE_INVALID_CURSOR();
5641 
5642     ha_statistic_increment(&SSV::ha_read_first_count);
5643 
5644     info.ha = this;
5645     info.buf = buf;
5646     info.keynr = tokudb_active_index;
5647 
5648     error = cursor->c_getf_first(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5649     error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5650 
5651     //
5652     // still need to get entire contents of the row if operation done on
5653     // secondary DB and it was NOT a covering index
5654     //
5655     if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5656         error = read_full_row(buf);
5657     }
5658     if (trx) {
5659         trx->stmt_progress.queried++;
5660     }
5661     track_progress(thd);
5662     maybe_index_scan = true;
5663 cleanup:
5664     TOKUDB_HANDLER_DBUG_RETURN(error);
5665 }
5666 
5667 //
5668 // Reads the last row from the active index (cursor) into buf, and advances cursor
5669 // Parameters:
5670 //      [out]   buf - buffer for the next row, in MySQL format
5671 // Returns:
5672 //      0 on success
5673 //      HA_ERR_END_OF_FILE if not found
5674 //      error otherwise
5675 //
index_last(uchar * buf)5676 int ha_tokudb::index_last(uchar * buf) {
5677     TOKUDB_HANDLER_DBUG_ENTER("");
5678     invalidate_bulk_fetch();
5679     int error = 0;
5680     struct smart_dbt_info info;
5681     uint32_t flags = SET_PRELOCK_FLAG(0);
5682     THD* thd = ha_thd();
5683     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);;
5684     HANDLE_INVALID_CURSOR();
5685 
5686     ha_statistic_increment(&SSV::ha_read_last_count);
5687 
5688     info.ha = this;
5689     info.buf = buf;
5690     info.keynr = tokudb_active_index;
5691 
5692     error = cursor->c_getf_last(cursor, flags, SMART_DBT_CALLBACK(key_read), &info);
5693     error = handle_cursor_error(error, HA_ERR_END_OF_FILE);
5694     //
5695     // still need to get entire contents of the row if operation done on
5696     // secondary DB and it was NOT a covering index
5697     //
5698     if (!error && !key_read && (tokudb_active_index != primary_key) && !key_is_clustering(&table->key_info[tokudb_active_index])) {
5699         error = read_full_row(buf);
5700     }
5701 
5702     if (trx) {
5703         trx->stmt_progress.queried++;
5704     }
5705     track_progress(thd);
5706     maybe_index_scan = true;
5707 cleanup:
5708     TOKUDB_HANDLER_DBUG_RETURN(error);
5709 }
5710 
5711 //
5712 // Initialize a scan of the table (which is why index_init is called on primary_key)
5713 // Parameters:
5714 //          scan - unused
5715 // Returns:
5716 //      0 on success
5717 //      error otherwise
5718 //
rnd_init(bool scan)5719 int ha_tokudb::rnd_init(bool scan) {
5720     TOKUDB_HANDLER_DBUG_ENTER("");
5721     int error = 0;
5722     range_lock_grabbed = false;
5723     error = index_init(MAX_KEY, 0);
5724     if (error) { goto cleanup;}
5725 
5726     if (scan) {
5727         error = prelock_range(NULL, NULL);
5728         if (error) { goto cleanup; }
5729 
5730         // only want to set range_lock_grabbed to true after index_init
5731         // successfully executed for two reasons:
5732         // 1) index_init will reset it to false anyway
5733         // 2) if it fails, we don't want prelocking on,
5734         range_lock_grabbed = true;
5735     }
5736 
5737     error = 0;
5738 cleanup:
5739     if (error) {
5740         index_end();
5741         last_cursor_error = error;
5742     }
5743     TOKUDB_HANDLER_DBUG_RETURN(error);
5744 }
5745 
5746 //
5747 // End a scan of the table
5748 //
rnd_end()5749 int ha_tokudb::rnd_end() {
5750     TOKUDB_HANDLER_DBUG_ENTER("");
5751     range_lock_grabbed = false;
5752     TOKUDB_HANDLER_DBUG_RETURN(index_end());
5753 }
5754 
5755 
5756 //
5757 // Read the next row in a table scan
5758 // Parameters:
5759 //      [out]   buf - buffer for the next row, in MySQL format
5760 // Returns:
5761 //      0 on success
5762 //      HA_ERR_END_OF_FILE if not found
5763 //      error otherwise
5764 //
rnd_next(uchar * buf)5765 int ha_tokudb::rnd_next(uchar * buf) {
5766     TOKUDB_HANDLER_DBUG_ENTER("");
5767     ha_statistic_increment(&SSV::ha_read_rnd_next_count);
5768     int error = get_next(buf, 1, NULL, false);
5769     TOKUDB_HANDLER_DBUG_RETURN(error);
5770 }
5771 
5772 
track_progress(THD * thd)5773 void ha_tokudb::track_progress(THD* thd) {
5774     tokudb_trx_data* trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
5775     if (trx) {
5776         ulonglong num_written = trx->stmt_progress.inserted +
5777             trx->stmt_progress.updated +
5778             trx->stmt_progress.deleted;
5779         bool update_status =
5780             (trx->stmt_progress.queried &&
5781              tokudb::sysvars::read_status_frequency &&
5782              (trx->stmt_progress.queried %
5783                 tokudb::sysvars::read_status_frequency) == 0) ||
5784              (num_written && tokudb::sysvars::write_status_frequency &&
5785               (num_written % tokudb::sysvars::write_status_frequency) == 0);
5786         if (update_status) {
5787             char *next_status = write_status_msg;
5788             bool first = true;
5789             int r;
5790             if (trx->stmt_progress.queried) {
5791                 r = sprintf(
5792                     next_status,
5793                     "Queried about %llu row%s",
5794                     trx->stmt_progress.queried,
5795                     trx->stmt_progress.queried == 1 ? "" : "s");
5796                 assert_always(r >= 0);
5797                 next_status += r;
5798                 first = false;
5799             }
5800             if (trx->stmt_progress.inserted) {
5801                 if (trx->stmt_progress.using_loader) {
5802                     r = sprintf(
5803                         next_status,
5804                         "%sFetched about %llu row%s, loading data still remains",
5805                         first ? "" : ", ",
5806                         trx->stmt_progress.inserted,
5807                         trx->stmt_progress.inserted == 1 ? "" : "s");
5808                 } else {
5809                     r = sprintf(
5810                         next_status,
5811                         "%sInserted about %llu row%s",
5812                         first ? "" : ", ",
5813                         trx->stmt_progress.inserted,
5814                         trx->stmt_progress.inserted == 1 ? "" : "s");
5815                 }
5816                 assert_always(r >= 0);
5817                 next_status += r;
5818                 first = false;
5819             }
5820             if (trx->stmt_progress.updated) {
5821                 r = sprintf(
5822                     next_status,
5823                     "%sUpdated about %llu row%s",
5824                     first ? "" : ", ",
5825                     trx->stmt_progress.updated,
5826                     trx->stmt_progress.updated == 1 ? "" : "s");
5827                 assert_always(r >= 0);
5828                 next_status += r;
5829                 first = false;
5830             }
5831             if (trx->stmt_progress.deleted) {
5832                 r = sprintf(
5833                     next_status,
5834                     "%sDeleted about %llu row%s",
5835                     first ? "" : ", ",
5836                     trx->stmt_progress.deleted,
5837                     trx->stmt_progress.deleted == 1 ? "" : "s");
5838                 assert_always(r >= 0);
5839                 next_status += r;
5840                 first = false;
5841             }
5842             if (!first)
5843                 thd_proc_info(thd, write_status_msg);
5844         }
5845     }
5846 }
5847 
5848 
get_pos(DBT * to,uchar * pos)5849 DBT *ha_tokudb::get_pos(DBT * to, uchar * pos) {
5850     TOKUDB_HANDLER_DBUG_ENTER("");
5851     /* We don't need to set app_data here */
5852     memset((void *) to, 0, sizeof(*to));
5853     to->data = pos + sizeof(uint32_t);
5854     to->size = *(uint32_t *)pos;
5855     DBUG_DUMP("key", (const uchar *) to->data, to->size);
5856     DBUG_RETURN(to);
5857 }
5858 
// Retrieves a row based on the primary key saved in pos
5860 // Returns:
5861 //      0 on success
5862 //      HA_ERR_KEY_NOT_FOUND if not found
5863 //      error otherwise
rnd_pos(uchar * buf,uchar * pos)5864 int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
5865     TOKUDB_HANDLER_DBUG_ENTER("");
5866     DBT db_pos;
5867     int error = 0;
5868     struct smart_dbt_info info;
5869     bool old_unpack_entire_row = unpack_entire_row;
5870     DBT* key = get_pos(&db_pos, pos);
5871 
5872     unpack_entire_row = true;
5873     ha_statistic_increment(&SSV::ha_read_rnd_count);
5874     tokudb_active_index = MAX_KEY;
5875 
5876     THD *thd = ha_thd();
5877 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5878     // test rpl slave by inducing a delay before the point query
5879     if (thd->slave_thread && (in_rpl_delete_rows || in_rpl_update_rows)) {
5880         DBUG_EXECUTE_IF("tokudb_crash_if_rpl_looks_up_row", assert(0););
5881         uint64_t delay_ms = tokudb::sysvars::rpl_lookup_rows_delay(thd);
5882         if (delay_ms)
5883             usleep(delay_ms * 1000);
5884     }
5885 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
5886 
5887     info.ha = this;
5888     info.buf = buf;
5889     info.keynr = primary_key;
5890 
5891     error = share->file->getf_set(share->file, transaction,
5892             get_cursor_isolation_flags(lock.type, thd),
5893             key, smart_dbt_callback_rowread_ptquery, &info);
5894 
5895     if (error == DB_NOTFOUND) {
5896         error = HA_ERR_KEY_NOT_FOUND;
5897         goto cleanup;
5898     }
5899 cleanup:
5900     unpack_entire_row = old_unpack_entire_row;
5901     TOKUDB_HANDLER_DBUG_RETURN(error);
5902 }
5903 
// Pre-acquire a range lock on the active index covering [start_key, end_key]
// and set the storage cursor's bounds so the engine can prefetch within the
// range. A NULL start_key/end_key means unbounded on that side. Returns 0 on
// success; on failure the cursor is closed and deregistered from the txn.
int ha_tokudb::prelock_range(const key_range *start_key, const key_range *end_key) {
    TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
    THD* thd = ha_thd();

    int error = 0;
    DBT start_dbt_key;
    DBT end_dbt_key;
    // Pack the endpoint keys into the handler-owned prelock buffers so the
    // packed images stay valid for the lifetime of the prelocked range.
    uchar* start_key_buff  = prelocked_left_range;
    uchar* end_key_buff = prelocked_right_range;

    memset((void *) &start_dbt_key, 0, sizeof(start_dbt_key));
    memset((void *) &end_dbt_key, 0, sizeof(end_dbt_key));

    // NOTE(review): macro presumably bails out early when the cursor is
    // invalid -- confirm against its definition.
    HANDLE_INVALID_CURSOR();
    if (start_key) {
        // HA_READ_AFTER_KEY starts strictly after the key, so pad the packed
        // key with positive infinity; any other flag includes the key, so
        // pad with negative infinity.
        switch (start_key->flag) {
        case HA_READ_AFTER_KEY:
            pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_POS_INF);
            break;
        default:
            pack_key(&start_dbt_key, tokudb_active_index, start_key_buff, start_key->key, start_key->length, COL_NEG_INF);
            break;
        }
        prelocked_left_range_size = start_dbt_key.size;
    }
    else {
        prelocked_left_range_size = 0;
    }

    if (end_key) {
        // Mirror of the start-key padding: HA_READ_BEFORE_KEY ends strictly
        // before the key (negative-infinity pad), anything else includes it.
        switch (end_key->flag) {
        case HA_READ_BEFORE_KEY:
            pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_NEG_INF);
            break;
        default:
            pack_key(&end_dbt_key, tokudb_active_index, end_key_buff, end_key->key, end_key->length, COL_POS_INF);
            break;
        }
        prelocked_right_range_size = end_dbt_key.size;
    }
    else {
        prelocked_right_range_size = 0;
    }

    // Missing endpoints fall back to the dictionary's +/- infinity bounds.
    error = cursor->c_set_bounds(
        cursor,
        start_key ? &start_dbt_key : share->key_file[tokudb_active_index]->dbt_neg_infty(),
        end_key ? &end_dbt_key : share->key_file[tokudb_active_index]->dbt_pos_infty(),
        true,
        (cursor_flags & DB_SERIALIZABLE) != 0 ? DB_NOTFOUND : 0
        );
    if (error) {
        error = map_to_handler_error(error);
        last_cursor_error = error;
        //
        // cursor should be initialized here, but in case it is not, we still check
        //
        if (cursor) {
            int r = cursor->c_close(cursor);
            assert_always(r==0);
            cursor = NULL;
            remove_from_trx_handler_list();
        }
        goto cleanup;
    }

    // at this point, determine if we will be doing bulk fetch
    doing_bulk_fetch = tokudb_do_bulk_fetch(thd);
    bulk_fetch_iteration = 0;
    rows_fetched_using_bulk_fetch = 0;

cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
5978 
5979 //
5980 // Prelock range if possible, start_key is leftmost, end_key is rightmost
5981 // whether scanning forward or backward.  This function is called by MySQL
5982 // for backward range queries (in QUICK_SELECT_DESC::get_next).
5983 // Forward scans use read_range_first()/read_range_next().
5984 //
prepare_range_scan(const key_range * start_key,const key_range * end_key)5985 int ha_tokudb::prepare_range_scan( const key_range *start_key, const key_range *end_key) {
5986     TOKUDB_HANDLER_DBUG_ENTER("%p %p", start_key, end_key);
5987     int error = prelock_range(start_key, end_key);
5988     if (!error) {
5989         range_lock_grabbed = true;
5990     }
5991     TOKUDB_HANDLER_DBUG_RETURN(error);
5992 }
5993 
read_range_first(const key_range * start_key,const key_range * end_key,bool eq_range,bool sorted)5994 int ha_tokudb::read_range_first(
5995     const key_range *start_key,
5996     const key_range *end_key,
5997     bool eq_range,
5998     bool sorted)
5999 {
6000     TOKUDB_HANDLER_DBUG_ENTER("%p %p %u %u", start_key, end_key, eq_range, sorted);
6001     int error = prelock_range(start_key, end_key);
6002     if (error) { goto cleanup; }
6003     range_lock_grabbed = true;
6004 
6005     error = handler::read_range_first(start_key, end_key, eq_range, sorted);
6006 cleanup:
6007     TOKUDB_HANDLER_DBUG_RETURN(error);
6008 }
6009 
read_range_next()6010 int ha_tokudb::read_range_next()
6011 {
6012     TOKUDB_HANDLER_DBUG_ENTER("");
6013     int error;
6014     error = handler::read_range_next();
6015     if (error) {
6016         range_lock_grabbed = false;
6017     }
6018     TOKUDB_HANDLER_DBUG_RETURN(error);
6019 }
6020 
6021 
6022 
6023 /*
6024   Set a reference to the current record in (ref,ref_length).
6025 
6026   SYNOPSIS
6027   ha_tokudb::position()
6028   record                      The current record buffer
6029 
6030   DESCRIPTION
6031   The BDB handler stores the primary key in (ref,ref_length).
6032   There is either an explicit primary key, or an implicit (hidden)
6033   primary key.
6034   During open(), 'ref_length' is calculated as the maximum primary
6035   key length. When an actual key is shorter than that, the rest of
6036   the buffer must be cleared out. The row cannot be identified, if
6037   garbage follows behind the end of the key. There is no length
6038   field for the current key, so that the whole ref_length is used
6039   for comparison.
6040 
6041   RETURN
6042   nothing
6043 */
position(const uchar * record)6044 void ha_tokudb::position(const uchar * record) {
6045     TOKUDB_HANDLER_DBUG_ENTER("");
6046     DBT key;
6047     if (hidden_primary_key) {
6048         assert(ref_length == (TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH + sizeof(uint32_t)));
6049         memcpy(ref + sizeof(uint32_t), current_ident, TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH);
6050         *(uint32_t *)ref = TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH;
6051     }
6052     else {
6053         bool has_null;
6054         //
6055         // save the data
6056         //
6057         create_dbt_key_from_table(&key, primary_key, ref + sizeof(uint32_t), record, &has_null);
6058         //
6059         // save the size of data in the first four bytes of ref
6060         //
6061         memcpy(ref, &key.size, sizeof(uint32_t));
6062     }
6063     TOKUDB_HANDLER_DBUG_VOID_RETURN;
6064 }
6065 
6066 //
6067 // Per InnoDB: Returns statistics information of the table to the MySQL interpreter,
6068 // in various fields of the handle object.
6069 // Return:
6070 //      0, always success
6071 //
int ha_tokudb::info(uint flag) {
    TOKUDB_HANDLER_DBUG_ENTER("%d", flag);
    int error = 0;
#if defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    // Clustering keys store the whole row, so mark them as covering.
    for (uint i=0; i < table->s->keys; i++)
        if (key_is_clustering(&table->key_info[i]))
            table->covering_keys.set_bit(i);
#endif  // defined(TOKU_CLUSTERING_IS_COVERING) && TOKU_CLUSTERING_IS_COVERING
    DB_TXN* txn = NULL;
    if (flag & HA_STATUS_VARIABLE) {
        // Start from the cached count; refined below when locking is allowed.
        stats.records = share->row_count() + share->rows_from_locked_table;
        stats.deleted = 0;
        if (!(flag & HA_STATUS_NO_LOCK)) {
            // Statistics are approximate, so use a read-uncommitted txn and
            // avoid waiting on row locks.
            error = txn_begin(db_env, NULL, &txn, DB_READ_UNCOMMITTED, ha_thd());
            if (error) {
                goto cleanup;
            }

            // we should always have a primary key
            assert_always(share->file != NULL);

            DB_BTREE_STAT64 dict_stats;
            error = share->file->stat64(share->file, txn, &dict_stats);
            if (error) {
                goto cleanup;
            }
            // Refresh the cached row count from the dictionary stats and
            // copy the dictionary-level numbers into handler stats.
            share->set_row_count(dict_stats.bt_ndata, false);
            stats.records = dict_stats.bt_ndata;
            stats.create_time = dict_stats.bt_create_time_sec;
            stats.update_time = dict_stats.bt_modify_time_sec;
            stats.check_time = dict_stats.bt_verify_time_sec;
            stats.data_file_length = dict_stats.bt_dsize;
            stats.delete_length = dict_stats.bt_fsize - dict_stats.bt_dsize;
            if (hidden_primary_key) {
                //
                // in this case, we have a hidden primary key, do not
                // want to report space taken up by the hidden primary key to the user
                //
                uint64_t hpk_space =
                    TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH * dict_stats.bt_ndata;
                stats.data_file_length =
                    (hpk_space > stats.data_file_length) ?
                        0 : stats.data_file_length - hpk_space;
            } else {
                //
                // one infinity byte per key needs to be subtracted
                //
                uint64_t inf_byte_space = dict_stats.bt_ndata;
                stats.data_file_length =
                    (inf_byte_space > stats.data_file_length) ?
                        0 : stats.data_file_length - inf_byte_space;
            }

            stats.mean_rec_length =
                stats.records ?
                    (ulong)(stats.data_file_length/stats.records) : 0;
            stats.index_file_length = 0;
            // curr_num_DBs is the number of keys we have, according
            // to the mysql layer. if drop index is running concurrently
            // with info() (it can, because info does not take table locks),
            // then it could be the case that one of the dbs was dropped
            // and set to NULL before mysql was able to set table->s->keys
            // accordingly.
            //
            // we should just ignore any DB * that is NULL.
            //
            // this solution is much simpler than trying to maintain an
            // accurate number of valid keys at the handlerton layer.
            uint curr_num_DBs =
                table->s->keys + tokudb_test(hidden_primary_key);
            // Accumulate secondary-index sizes into index_file_length.
            for (uint i = 0; i < curr_num_DBs; i++) {
                // skip the primary key, skip dropped indexes
                if (i == primary_key || share->key_file[i] == NULL) {
                    continue;
                }
                error = share->key_file[i]->stat64(
                    share->key_file[i], txn, &dict_stats);
                if (error) {
                    goto cleanup;
                }
                stats.index_file_length += dict_stats.bt_dsize;
                stats.delete_length +=
                    dict_stats.bt_fsize - dict_stats.bt_dsize;
            }
        }

        /*
        The following comment and logic has been taken from InnoDB and
        an old hack was removed that forced to always set stats.records > 0
        ---
        The MySQL optimizer seems to assume in a left join that n_rows
        is an accurate estimate if it is zero. Of course, it is not,
        since we do not have any locks on the rows yet at this phase.
        Since SHOW TABLE STATUS seems to call this function with the
        HA_STATUS_TIME flag set, while the left join optimizer does not
        set that flag, we add one to a zero value if the flag is not
        set. That way SHOW TABLE STATUS will show the best estimate,
        while the optimizer never sees the table empty. */
        if (stats.records == 0 && !(flag & HA_STATUS_TIME)) {
            stats.records++;
        }
    }
    if ((flag & HA_STATUS_CONST)) {
        stats.max_data_file_length = 9223372036854775807ULL;
        share->set_cardinality_counts_in_table(table);
    }

    /* Don't return key if we got an error for the internal primary key */
    if (flag & HA_STATUS_ERRKEY && last_dup_key < table_share->keys) {
        errkey = last_dup_key;
    }

    // Report the next auto-increment value the server should hand out.
    if (flag & HA_STATUS_AUTO && table->found_next_number_field) {
        THD* thd = table->in_use;
        struct system_variables* variables = &thd->variables;
        stats.auto_increment_value =
            share->last_auto_increment + variables->auto_increment_increment;
    }
    error = 0;
cleanup:
    if (txn != NULL) {
        // NOSYNC: nothing durable was written by this read-only txn.
        commit_txn(txn, DB_TXN_NOSYNC);
        txn = NULL;
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6198 
6199 //
6200 //  Per InnoDB: Tells something additional to the handler about how to do things.
6201 //
extra(enum ha_extra_function operation)6202 int ha_tokudb::extra(enum ha_extra_function operation) {
6203     TOKUDB_HANDLER_DBUG_ENTER("%d", operation);
6204     switch (operation) {
6205     case HA_EXTRA_RESET_STATE:
6206         reset();
6207         break;
6208     case HA_EXTRA_KEYREAD:
6209         key_read = true;           // Query satisfied with key
6210         break;
6211     case HA_EXTRA_NO_KEYREAD:
6212         key_read = false;
6213         break;
6214     case HA_EXTRA_IGNORE_DUP_KEY:
6215         using_ignore = true;
6216         break;
6217     case HA_EXTRA_NO_IGNORE_DUP_KEY:
6218         using_ignore = false;
6219         break;
6220     case HA_EXTRA_IGNORE_NO_KEY:
6221         using_ignore_no_key = true;
6222         break;
6223     case HA_EXTRA_NO_IGNORE_NO_KEY:
6224         using_ignore_no_key = false;
6225         break;
6226     case HA_EXTRA_NOT_USED:
6227     case HA_EXTRA_PREPARE_FOR_RENAME:
6228         break; // must do nothing and return 0
6229     default:
6230         break;
6231     }
6232     TOKUDB_HANDLER_DBUG_RETURN(0);
6233 }
6234 
reset()6235 int ha_tokudb::reset() {
6236     TOKUDB_HANDLER_DBUG_ENTER("");
6237     key_read = false;
6238     using_ignore = false;
6239     using_ignore_no_key = false;
6240     ds_mrr.reset();
6241     invalidate_icp();
6242     TOKUDB_HANDLER_DBUG_RETURN(0);
6243 }
6244 
6245 //
6246 // helper function that iterates through all DB's
6247 // and grabs a lock (either read or write, but not both)
6248 // Parameters:
6249 //      [in]    trans - transaction to be used to pre acquire the lock
6250 //              lt - type of lock to get, either lock_read or lock_write
6251 //  Returns:
6252 //      0 on success
6253 //      error otherwise
6254 //
acquire_table_lock(DB_TXN * trans,TABLE_LOCK_TYPE lt)6255 int ha_tokudb::acquire_table_lock (DB_TXN* trans, TABLE_LOCK_TYPE lt) {
6256     TOKUDB_HANDLER_DBUG_ENTER("%p %s", trans, lt == lock_read ? "r" : "w");
6257     int error = ENOSYS;
6258     if (!num_DBs_locked_in_bulk) {
6259         rwlock_t_lock_read(share->_num_DBs_lock);
6260     }
6261     uint curr_num_DBs = share->num_DBs;
6262     if (lt == lock_read) {
6263         error = 0;
6264         goto cleanup;
6265     } else if (lt == lock_write) {
6266         for (uint i = 0; i < curr_num_DBs; i++) {
6267             DB* db = share->key_file[i];
6268             error = db->pre_acquire_table_lock(db, trans);
6269             if (error == EINVAL)
6270                 TOKUDB_HANDLER_TRACE("%d db=%p trans=%p", i, db, trans);
6271             if (error) break;
6272         }
6273         TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
6274         if (error) goto cleanup;
6275     } else {
6276         error = ENOSYS;
6277         goto cleanup;
6278     }
6279 
6280     error = 0;
6281 cleanup:
6282     if (!num_DBs_locked_in_bulk) {
6283         share->_num_DBs_lock.unlock();
6284     }
6285     TOKUDB_HANDLER_DBUG_RETURN(error);
6286 }
6287 
// Create the transaction(s) this statement needs: a master txn (trx->all)
// when running inside an explicit multi-statement transaction and none
// exists yet, plus always a statement txn (trx->stmt). Registers the
// handlerton with the server for two-phase commit bookkeeping.
// Returns 0 on success; on error any already-created master txn stays open.
int ha_tokudb::create_txn(THD* thd, tokudb_trx_data* trx) {
    int error;
    ulong tx_isolation = thd_tx_isolation(thd);
    HA_TOKU_ISO_LEVEL toku_iso_level = tx_to_toku_iso(tx_isolation);
    // Autocommit == neither OPTION_NOT_AUTOCOMMIT nor OPTION_BEGIN is set.
    bool is_autocommit = !thd_test_options(
            thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN);

    /* First table lock, start transaction */
    // DDL commands are excluded: they manage their own implicit commits.
    if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) &&
         !trx->all &&
         (thd_sql_command(thd) != SQLCOM_CREATE_TABLE) &&
         (thd_sql_command(thd) != SQLCOM_DROP_TABLE) &&
         (thd_sql_command(thd) != SQLCOM_DROP_INDEX) &&
         (thd_sql_command(thd) != SQLCOM_CREATE_INDEX) &&
         (thd_sql_command(thd) != SQLCOM_ALTER_TABLE)) {
        /* QQQ We have to start a master transaction */
        // DBUG_PRINT("trans", ("starting transaction all "));
        uint32_t txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
        if (thd_tx_is_read_only(thd)) {
            txn_begin_flags |= DB_TXN_READ_ONLY;
        }
        if ((error = txn_begin(db_env, NULL, &trx->all, txn_begin_flags, thd))) {
            goto cleanup;
        }
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "created master %p",
            trx->all);
        // The master txn is also the current savepoint level.
        trx->sp_level = trx->all;
        trans_register_ha(thd, true, tokudb_hton, NULL);
    }
    DBUG_PRINT("trans", ("starting transaction stmt"));
    if (trx->stmt) {
        // A statement txn should not already exist here; trace it if it does.
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "warning:stmt=%p",
            trx->stmt);
    }
    uint32_t txn_begin_flags;
    if (trx->all == NULL) {
        txn_begin_flags = toku_iso_to_txn_flag(toku_iso_level);
        //
        // if the isolation level that the user has set is serializable,
        // but autocommit is on and this is just a select,
        // then we can go ahead and set the isolation level to
        // be a snapshot read, because we can serialize
        // the transaction to be the point in time at which the snapshot began.
        //
        if (txn_begin_flags == 0 && is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT) {
            txn_begin_flags = DB_TXN_SNAPSHOT;
        }
        // Plain autocommit SELECTs with no stored routines can run read-only.
        if (is_autocommit && thd_sql_command(thd) == SQLCOM_SELECT &&
            !thd->in_sub_stmt && lock.type <= TL_READ_NO_INSERT &&
            !thd->lex->uses_stored_routines()) {
            txn_begin_flags |= DB_TXN_READ_ONLY;
        }
    } else {
        // Nested under the master txn: inherit its isolation level.
        txn_begin_flags = DB_INHERIT_ISOLATION;
    }
    error = txn_begin(db_env, trx->sp_level, &trx->stmt, txn_begin_flags, thd);
    if (error) {
        /* We leave the possible master transaction open */
        goto cleanup;
    }
    trx->sub_sp_level = trx->stmt;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "created stmt %p sp_level %p",
        trx->sp_level,
        trx->stmt);
    reset_stmt_progress(&trx->stmt_progress);
    trans_register_ha(thd, false, tokudb_hton, NULL);
cleanup:
    return error;
}
6363 
// Human-readable name of an fcntl-style lock type, for trace output only.
static const char *lock_type_str(int lock_type) {
    switch (lock_type) {
    case F_RDLCK: return "F_RDLCK";
    case F_WRLCK: return "F_WRLCK";
    case F_UNLCK: return "F_UNLCK";
    default:      return "?";
    }
}
6370 
6371 /*
6372   As MySQL will execute an external lock for every new table it uses
6373   we can use this to start the transactions.
6374   If we are in auto_commit mode we just need to start a transaction
6375   for the statement to be able to rollback the statement.
6376   If not, we have to start a master transaction if there doesn't exist
6377   one from before.
6378 */
6379 //
6380 // Parameters:
6381 //      [in]    thd - handle to the user thread
6382 //              lock_type - the type of lock
6383 // Returns:
6384 //      0 on success
6385 //      error otherwise
6386 //
int ha_tokudb::external_lock(THD * thd, int lock_type) {
    TOKUDB_HANDLER_DBUG_ENTER(
        "cmd %d lock %d %s %s",
        thd_sql_command(thd),
        lock_type,
        lock_type_str(lock_type),
        share->full_table_name());
    // Emit the same trace for the LOCK debug flag, but only when ENTER
    // tracing has not already printed it.
    if (TOKUDB_UNLIKELY(!TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_ENTER) &&
        TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_LOCK))) {
        TOKUDB_HANDLER_TRACE(
            "cmd %d lock %d %s %s",
            thd_sql_command(thd),
            lock_type,
            lock_type_str(lock_type),
            share->full_table_name());
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s",
                                   thd->query().str);

    int error = 0;
    // Lazily create the per-connection TokuDB transaction context.
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    if (!trx) {
        error = create_tokudb_trx_data_instance(&trx);
        if (error) { goto cleanup; }
        thd_set_ha_data(thd, tokudb_hton, trx);
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "trx %p %p %p %p %u %u",
        trx->all,
        trx->stmt,
        trx->sp_level,
        trx->sub_sp_level,
        trx->tokudb_lock_count,
        trx->create_lock_count);

    // No master txn means no savepoint level either.
    if (trx->all == NULL) {
        trx->sp_level = NULL;
    }
    if (lock_type != F_UNLCK) {
        // Locking: record write intent and make sure a statement txn exists.
        use_write_locks = false;
        if (lock_type == F_WRLCK) {
            use_write_locks = true;
        }
        if (!trx->stmt) {
            transaction = NULL;    // Safety
            error = create_txn(thd, trx);
            if (error) {
                goto cleanup;
            }
            // Remember how many locks existed when the stmt txn was created,
            // so unlock can tell when the statement is fully released.
            trx->create_lock_count = trx->tokudb_lock_count;
        }
        transaction = trx->sub_sp_level;
        trx->tokudb_lock_count++;
    } else {
        // Unlocking: fold this handler's row-count deltas into the share.
        share->update_row_count(thd, added_rows, deleted_rows, updated_rows);
        added_rows = 0;
        deleted_rows = 0;
        updated_rows = 0;
        share->rows_from_locked_table = 0;
        if (trx->tokudb_lock_count > 0) {
            if (--trx->tokudb_lock_count <= trx->create_lock_count) {
                trx->create_lock_count = 0;
                if (trx->stmt) {
                    /*
                      F_UNLCK is done without a transaction commit / rollback.
                      This happens if the thread didn't update any rows
                      We must in this case commit the work to keep the row locks
                    */
                    DBUG_PRINT("trans", ("committing non-updating transaction"));
                    reset_stmt_progress(&trx->stmt_progress);
                    commit_txn(trx->stmt, 0);
                    trx->stmt = NULL;
                    trx->sub_sp_level = NULL;
                }
            }
            transaction = NULL;
        }
    }
cleanup:
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "error=%d", error);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6471 
6472 /*
6473   When using LOCK TABLE's external_lock is only called when the actual
6474   TABLE LOCK is done.
6475   Under LOCK TABLES, each used tables will force a call to start_stmt.
6476 */
int ha_tokudb::start_stmt(THD* thd, thr_lock_type lock_type) {
    TOKUDB_HANDLER_DBUG_ENTER(
        "cmd %d lock %d %s",
        thd_sql_command(thd),
        lock_type,
        share->full_table_name());

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(TOKUDB_DEBUG_LOCK, "q %s",
                                   thd->query().str);

    int error = 0;
    // Lazily create the per-connection TokuDB transaction context.
    tokudb_trx_data* trx = (tokudb_trx_data*)thd_get_ha_data(thd, tokudb_hton);
    if (!trx) {
        error = create_tokudb_trx_data_instance(&trx);
        if (error) { goto cleanup; }
        thd_set_ha_data(thd, tokudb_hton, trx);
    }

    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_TXN,
        "trx %p %p %p %p %u %u",
        trx->all,
        trx->stmt,
        trx->sp_level,
        trx->sub_sp_level,
        trx->tokudb_lock_count,
        trx->create_lock_count);

    /*
       note that trx->stmt may have been already initialized as start_stmt()
       is called for *each table* not for each storage engine,
       and there could be many bdb tables referenced in the query
     */
    if (!trx->stmt) {
        error = create_txn(thd, trx);
        if (error) {
            goto cleanup;
        }
        // Snapshot the lock count at stmt-txn creation for external_lock().
        trx->create_lock_count = trx->tokudb_lock_count;
    } else {
        TOKUDB_HANDLER_TRACE_FOR_FLAGS(
            TOKUDB_DEBUG_TXN,
            "trx->stmt %p already existed",
            trx->stmt);
    }
    // Expose the net rows added under the current table lock.
    if (added_rows > deleted_rows) {
        share->rows_from_locked_table = added_rows - deleted_rows;
    }
    transaction = trx->sub_sp_level;
    // Register the statement txn with the server for commit/rollback.
    trans_register_ha(thd, false, tokudb_hton, NULL);
cleanup:
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
6530 
6531 
get_cursor_isolation_flags(enum thr_lock_type lock_type,THD * thd)6532 uint32_t ha_tokudb::get_cursor_isolation_flags(enum thr_lock_type lock_type, THD* thd) {
6533     uint sql_command = thd_sql_command(thd);
6534     bool in_lock_tables = thd_in_lock_tables(thd);
6535 
6536     //
6537     // following InnoDB's lead and having checksum command use a snapshot read if told
6538     //
6539     if (sql_command == SQLCOM_CHECKSUM) {
6540         return 0;
6541     }
6542     else if ((lock_type == TL_READ && in_lock_tables) ||
6543              (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
6544              sql_command != SQLCOM_SELECT ||
6545              (sql_command == SQLCOM_SELECT && lock_type >= TL_WRITE_ALLOW_WRITE)) { // select for update
6546       ulong tx_isolation = thd_tx_isolation(thd);
6547       // pattern matched from InnoDB
6548       if ( (tx_isolation == ISO_READ_COMMITTED || tx_isolation == ISO_READ_UNCOMMITTED) &&
6549 	   (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT) &&
6550 	   (sql_command == SQLCOM_INSERT_SELECT
6551               || sql_command == SQLCOM_REPLACE_SELECT
6552               || sql_command == SQLCOM_UPDATE
6553 	    || sql_command == SQLCOM_CREATE_TABLE) )
6554         {
6555 	  return 0;
6556         }
6557       else {
6558 	return DB_SERIALIZABLE;
6559       }
6560     }
6561     else {
6562         return 0;
6563     }
6564 }
6565 
6566 /*
6567   The idea with handler::store_lock() is the following:
6568 
6569   The statement decided which locks we should need for the table
6570   for updates/deletes/inserts we get WRITE locks, for SELECT... we get
6571   read locks.
6572 
6573   Before adding the lock into the table lock handler (see thr_lock.c)
6574   mysqld calls store lock with the requested locks.  Store lock can now
6575   modify a write lock to a read lock (or some other lock), ignore the
6576   lock (if we don't want to use MySQL table locks at all) or add locks
6577   for many tables (like we do when we are using a MERGE handler).
6578 
  TokuDB changes all WRITE locks to TL_WRITE_ALLOW_WRITE (which
  signals that we are doing WRITES, but we are still allowing other
  readers and writers).
6582 
6583   When releasing locks, store_lock() are also called. In this case one
6584   usually doesn't have to do anything.
6585 
6586   In some exceptional cases MySQL may send a request for a TL_IGNORE;
6587   This means that we are requesting the same lock as last time and this
6588   should also be ignored. (This may happen when someone does a flush
6589   table when we have opened a part of the tables, in which case mysqld
6590   closes and reopens the tables and tries to get the same locks at last
6591   time).  In the future we will probably try to remove this.
6592 */
6593 
THR_LOCK_DATA* *ha_tokudb::store_lock(
    THD* thd,
    THR_LOCK_DATA** to,
    enum thr_lock_type lock_type) {

    TOKUDB_HANDLER_DBUG_ENTER(
        "lock_type=%d cmd=%d",
        lock_type,
        thd_sql_command(thd));
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_LOCK,
        "lock_type=%d cmd=%d",
        lock_type,
        thd_sql_command(thd));

    // TL_IGNORE repeats the previous request; TL_UNLOCK on lock.type means
    // this handler does not yet hold a lock, so the request may be adjusted.
    if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
        enum_sql_command sql_command = (enum_sql_command) thd_sql_command(thd);
        if (!thd->in_lock_tables) {
            if (sql_command == SQLCOM_CREATE_INDEX &&
                tokudb::sysvars::create_index_online(thd)) {
                // hot indexing
                // Only downgrade to an allow-write lock when no index build
                // is already in flight (num_DBs matches the table's keys).
                rwlock_t_lock_read(share->_num_DBs_lock);
                if (share->num_DBs ==
                    (table->s->keys + tokudb_test(hidden_primary_key))) {
                    lock_type = TL_WRITE_ALLOW_WRITE;
                }
                share->_num_DBs_lock.unlock();
            } else if ((lock_type >= TL_WRITE_CONCURRENT_INSERT &&
                        lock_type <= TL_WRITE) &&
                        sql_command != SQLCOM_TRUNCATE &&
                        !thd_tablespace_op(thd)) {
                // allow concurrent writes
                lock_type = TL_WRITE_ALLOW_WRITE;
            } else if (sql_command == SQLCOM_OPTIMIZE &&
                       lock_type == TL_READ_NO_INSERT) {
                // hot optimize table
                lock_type = TL_READ;
            }
        }
        lock.type = lock_type;
    }
    // Hand the (possibly adjusted) lock back to the server's lock manager.
    *to++ = &lock;
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_LOCK,
        "lock_type=%d",
        lock_type);
    TOKUDB_HANDLER_DBUG_RETURN_PTR(to);
}
6642 
get_compression_method(DB * file)6643 static toku_compression_method get_compression_method(DB* file) {
6644     enum toku_compression_method method;
6645     int r = file->get_compression_method(file, &method);
6646     assert_always(r == 0);
6647     return method;
6648 }
6649 
#if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
    TOKU_INCLUDE_ROW_TYPE_COMPRESSION
// Report this table's row format by translating the compression method
// currently set on the main dictionary.
enum row_type ha_tokudb::get_row_type() const {
    return toku_compression_method_to_row_type(
        get_compression_method(share->file));
}
#endif  // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
        // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
6658 
//
// Creates, tunes, opens and closes one fractal-tree dictionary named
// 'table_name' inside transaction 'txn', installing 'row_descriptor' as
// its comparison/pack descriptor.
//   table_name         - full dictionary path name to create
//   row_descriptor     - serialized descriptor to install via change_descriptor
//   txn                - transaction covering create/open/change_descriptor
//   block_size         - node size; 0 keeps the ft-layer default
//   read_block_size    - read (basement) block size; 0 keeps the default
//   compression_method - compression to set before the dictionary is opened
//   is_hot_index       - true when created as part of online (hot) indexing
//   fanout             - tree fanout; 0 keeps the default
// Returns 0 on success, an ft/handler error code otherwise.
//
static int create_sub_table(
    const char* table_name,
    DBT* row_descriptor,
    DB_TXN* txn,
    uint32_t block_size,
    uint32_t read_block_size,
    toku_compression_method compression_method,
    bool is_hot_index,
    uint32_t fanout) {

    TOKUDB_DBUG_ENTER("");
    int error;
    DB *file = NULL;
    uint32_t create_flags;


    error = db_create(&file, db_env, 0);
    if (error) {
        DBUG_PRINT("error", ("Got error: %d when creating table", error));
        set_my_errno(error);
        goto exit;
    }


    // All sizing/compression knobs must be applied before file->open();
    // a value of 0 leaves the ft-layer default in place.
    if (block_size != 0) {
        error = file->set_pagesize(file, block_size);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting block size %u for table '%s'",
                    error,
                    block_size,
                    table_name));
            goto exit;
        }
    }
    if (read_block_size != 0) {
        error = file->set_readpagesize(file, read_block_size);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting read block size %u for table '%s'",
                    error,
                    read_block_size,
                    table_name));
            goto exit;
        }
    }
    if (fanout != 0) {
        error = file->set_fanout(file, fanout);
        if (error != 0) {
            DBUG_PRINT(
                "error",
                ("Got error: %d when setting fanout %u for table '%s'",
                    error,
                    fanout,
                    table_name));
            goto exit;
        }
    }
    error = file->set_compression_method(file, compression_method);
    if (error != 0) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when setting compression type %u for table '%s'",
                error,
                compression_method,
                table_name));
        goto exit;
    }

    // DB_EXCL: creation must fail if the dictionary already exists.
    create_flags =
        DB_THREAD | DB_CREATE | DB_EXCL | (is_hot_index ? DB_IS_HOT_INDEX : 0);
    error =
        file->open(
            file,
            txn,
            table_name,
            NULL,
            DB_BTREE,
            create_flags,
            my_umask);
    if (error) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when opening table '%s'", error, table_name));
        goto exit;
    }

    // Install the row descriptor on the freshly created dictionary.
    error =
        file->change_descriptor(
            file,
            txn,
            row_descriptor,
            (is_hot_index ? DB_IS_HOT_INDEX |
                DB_UPDATE_CMP_DESCRIPTOR :
                DB_UPDATE_CMP_DESCRIPTOR));
    if (error) {
        DBUG_PRINT(
            "error",
            ("Got error: %d when setting row descriptor for table '%s'",
                error,
                table_name));
        goto exit;
    }

    error = 0;
exit:
    // The handle is always closed here; normal open paths reopen the
    // dictionary later when the table is actually used.
    if (file) {
        int r = file->close(file, 0);
        assert_always(r==0);
    }
    TOKUDB_DBUG_RETURN(error);
}
6773 
// Fill in the engine-derived parts of 'create_info' (used by
// SHOW CREATE TABLE and CREATE TABLE ... LIKE): the current
// auto_increment value and, when compiled in, the row format.
void ha_tokudb::update_create_info(HA_CREATE_INFO* create_info) {
    if (share->has_auto_inc) {
        info(HA_STATUS_AUTO);  // refreshes stats.auto_increment_value
        // Only raise the reported value; never report one lower than a
        // value the user explicitly specified.
        if (!(create_info->used_fields & HA_CREATE_USED_AUTO) ||
            create_info->auto_increment_value < stats.auto_increment_value) {
            create_info->auto_increment_value = stats.auto_increment_value;
        }
    }
#if defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) && \
    TOKU_INCLUDE_ROW_TYPE_COMPRESSION
    if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
        // show create table asks us to update this create_info, this makes it
        // so we'll always show what compression type we're using
        create_info->row_type = get_row_type();
        // Optionally hide the default (zlib) row format from SHOW CREATE
        // TABLE output, controlled by the hide_default_row_format sysvar.
        if (create_info->row_type == ROW_TYPE_TOKU_ZLIB &&
            tokudb::sysvars::hide_default_row_format(ha_thd()) != 0) {
            create_info->row_type = ROW_TYPE_DEFAULT;
        }
    }
#endif  // defined(TOKU_INCLUDE_ROW_TYPE_COMPRESSION) &&
        // TOKU_INCLUDE_ROW_TYPE_COMPRESSION
}
6796 
6797 //
6798 // removes key name from status.tokudb.
6799 // needed for when we are dropping indexes, so that
6800 // during drop table, we do not attempt to remove already dropped
6801 // indexes because we did not keep status.tokudb in sync with list of indexes.
6802 //
remove_key_name_from_status(DB * status_block,char * key_name,DB_TXN * txn)6803 int ha_tokudb::remove_key_name_from_status(DB* status_block, char* key_name, DB_TXN* txn) {
6804     int error;
6805     uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6806     HA_METADATA_KEY md_key = hatoku_key_name;
6807     memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6808     //
6809     // put index name in status.tokudb
6810     //
6811     memcpy(
6812         status_key_info + sizeof(HA_METADATA_KEY),
6813         key_name,
6814         strlen(key_name) + 1
6815         );
6816     error = remove_metadata(
6817         status_block,
6818         status_key_info,
6819         sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6820         txn
6821         );
6822     return error;
6823 }
6824 
6825 //
6826 // writes the key name in status.tokudb, so that we may later delete or rename
6827 // the dictionary associated with key_name
6828 //
write_key_name_to_status(DB * status_block,char * key_name,DB_TXN * txn)6829 int ha_tokudb::write_key_name_to_status(DB* status_block, char* key_name, DB_TXN* txn) {
6830     int error;
6831     uchar status_key_info[FN_REFLEN + sizeof(HA_METADATA_KEY)];
6832     HA_METADATA_KEY md_key = hatoku_key_name;
6833     memcpy(status_key_info, &md_key, sizeof(HA_METADATA_KEY));
6834     //
6835     // put index name in status.tokudb
6836     //
6837     memcpy(
6838         status_key_info + sizeof(HA_METADATA_KEY),
6839         key_name,
6840         strlen(key_name) + 1
6841         );
6842     error = write_metadata(
6843         status_block,
6844         status_key_info,
6845         sizeof(HA_METADATA_KEY) + strlen(key_name) + 1,
6846         NULL,
6847         0,
6848         txn
6849         );
6850     return error;
6851 }
6852 
6853 //
6854 // some tracing moved out of ha_tokudb::create, because ::create was
6855 // getting cluttered
6856 //
trace_create_table_info(TABLE * form)6857 void ha_tokudb::trace_create_table_info(TABLE* form) {
6858     uint i;
6859     //
6860     // tracing information about what type of table we are creating
6861     //
6862     if (TOKUDB_UNLIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_OPEN))) {
6863         for (i = 0; i < form->s->fields; i++) {
6864             Field *field = form->s->field[i];
6865             TOKUDB_HANDLER_TRACE(
6866                 "field:%d:%s:type=%d:flags=%x",
6867                 i,
6868                 field->field_name,
6869                 field->type(),
6870                 field->flags);
6871         }
6872         for (i = 0; i < form->s->keys; i++) {
6873             KEY *key = &form->s->key_info[i];
6874             TOKUDB_HANDLER_TRACE(
6875                 "key:%d:%s:%d",
6876                 i,
6877                 key->name,
6878                 key->user_defined_key_parts);
6879             uint p;
6880             for (p = 0; p < key->user_defined_key_parts; p++) {
6881                 KEY_PART_INFO* key_part = &key->key_part[p];
6882                 Field* field = key_part->field;
6883                 TOKUDB_HANDLER_TRACE(
6884                     "key:%d:%d:length=%d:%s:type=%d:flags=%x",
6885                     i,
6886                     p,
6887                     key_part->length,
6888                     field->field_name,
6889                     field->type(),
6890                     field->flags);
6891             }
6892         }
6893     }
6894 }
6895 
get_max_desc_size(KEY_AND_COL_INFO * kc_info,TABLE * form)6896 static uint32_t get_max_desc_size(KEY_AND_COL_INFO* kc_info, TABLE* form) {
6897     uint32_t max_row_desc_buff_size;
6898     // upper bound of key comparison descriptor
6899     max_row_desc_buff_size = 2*(form->s->fields * 6)+10;
6900     // upper bound for sec. key part
6901     max_row_desc_buff_size += get_max_secondary_key_pack_desc_size(kc_info);
6902     // upper bound for clustering val part
6903     max_row_desc_buff_size += get_max_clustering_val_pack_desc_size(form->s);
6904     return max_row_desc_buff_size;
6905 }
6906 
create_secondary_key_descriptor(uchar * buf,KEY * key_info,KEY * prim_key,uint hpk,TABLE * form,uint primary_key,uint32_t keynr,KEY_AND_COL_INFO * kc_info)6907 static uint32_t create_secondary_key_descriptor(
6908     uchar* buf,
6909     KEY* key_info,
6910     KEY* prim_key,
6911     uint hpk,
6912     TABLE* form,
6913     uint primary_key,
6914     uint32_t keynr,
6915     KEY_AND_COL_INFO* kc_info) {
6916 
6917     uchar* ptr = NULL;
6918 
6919     ptr = buf;
6920     ptr += create_toku_key_descriptor(
6921         ptr,
6922         false,
6923         key_info,
6924         hpk,
6925         prim_key
6926         );
6927 
6928     ptr += create_toku_secondary_key_pack_descriptor(
6929         ptr,
6930         hpk,
6931         primary_key,
6932         form->s,
6933         form,
6934         kc_info,
6935         key_info,
6936         prim_key
6937         );
6938 
6939     ptr += create_toku_clustering_val_pack_descriptor(
6940         ptr,
6941         primary_key,
6942         form->s,
6943         kc_info,
6944         keynr,
6945         key_is_clustering(key_info)
6946         );
6947     return ptr - buf;
6948 }
6949 
6950 
6951 //
6952 // creates dictionary for secondary index, with key description key_info, all using txn
6953 //
int ha_tokudb::create_secondary_dictionary(
    const char* name,
    TABLE* form,
    KEY* key_info,
    DB_TXN* txn,
    KEY_AND_COL_INFO* kc_info,
    uint32_t keynr,
    bool is_hot_index,
    toku_compression_method compression_method) {

    // Builds the row descriptor for secondary index 'keynr' and creates its
    // "key-<name>" dictionary via create_sub_table(), all under 'txn'.
    // Returns 0 on success, ENOMEM or an ft/handler error code otherwise.
    int error;
    DBT row_descriptor;
    uchar* row_desc_buff = NULL;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY* prim_key = NULL;
    char dict_name[MAX_DICT_NAME_LEN];
    // Nonzero when the table has no user-declared primary key and TokuDB
    // maintains a hidden one instead.
    uint hpk= (form->s->primary_key >= MAX_KEY) ?
        TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    uint32_t block_size;
    uint32_t read_block_size;
    uint32_t fanout;
    THD* thd = ha_thd();

    memset(&row_descriptor, 0, sizeof(row_descriptor));

    max_row_desc_buff_size = get_max_desc_size(kc_info,form);

    row_desc_buff = (uchar*)tokudb::memory::malloc(
        max_row_desc_buff_size,
        MYF(MY_WME));
    if (row_desc_buff == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    // NOTE(review): dict_name is a fixed MAX_DICT_NAME_LEN buffer;
    // presumably MySQL's identifier length limit keeps "key-<name>" within
    // bounds — verify against NAME_LEN/MAX_DICT_NAME_LEN.
    sprintf(dict_name, "key-%s", key_info->name);
    make_name(newname, newname_len, name, dict_name);

    prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];

    //
    // setup the row descriptor
    //
    row_descriptor.data = row_desc_buff;
    //
    // save data necessary for key comparisons
    //
    row_descriptor.size = create_secondary_key_descriptor(
        row_desc_buff,
        key_info,
        prim_key,
        hpk,
        form,
        primary_key,
        keynr,
        kc_info);
    assert_always(row_descriptor.size <= max_row_desc_buff_size);

    block_size = tokudb::sysvars::block_size(thd);
    read_block_size = tokudb::sysvars::read_block_size(thd);
    fanout = tokudb::sysvars::fanout(thd);

    error = create_sub_table(
        newname,
        &row_descriptor,
        txn,
        block_size,
        read_block_size,
        compression_method,
        is_hot_index,
        fanout);
cleanup:
    // tokudb::memory::free is a no-op on NULL, so unconditional frees are safe.
    tokudb::memory::free(newname);
    tokudb::memory::free(row_desc_buff);
    return error;
}
7039 
7040 
create_main_key_descriptor(uchar * buf,KEY * prim_key,uint hpk,uint primary_key,TABLE * form,KEY_AND_COL_INFO * kc_info)7041 static uint32_t create_main_key_descriptor(
7042     uchar* buf,
7043     KEY* prim_key,
7044     uint hpk,
7045     uint primary_key,
7046     TABLE* form,
7047     KEY_AND_COL_INFO* kc_info) {
7048 
7049     uchar* ptr = buf;
7050     ptr += create_toku_key_descriptor(
7051         ptr,
7052         hpk,
7053         prim_key,
7054         false,
7055         NULL);
7056 
7057     ptr += create_toku_main_key_pack_descriptor(ptr);
7058 
7059     ptr += create_toku_clustering_val_pack_descriptor(
7060         ptr,
7061         primary_key,
7062         form->s,
7063         kc_info,
7064         primary_key,
7065         false);
7066     return ptr - buf;
7067 }
7068 
//
// create and close the main dictionary with name of "name" using table form, all within
// transaction txn.
//
int ha_tokudb::create_main_dictionary(
    const char* name,
    TABLE* form,
    DB_TXN* txn,
    KEY_AND_COL_INFO* kc_info,
    toku_compression_method compression_method) {

    // Builds the primary-row descriptor and creates the "main" dictionary
    // that stores the actual rows, all under 'txn'.
    // Returns 0 on success, ENOMEM or an ft/handler error code otherwise.
    int error;
    DBT row_descriptor;
    uchar* row_desc_buff = NULL;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY* prim_key = NULL;
    uint32_t max_row_desc_buff_size;
    // Nonzero when there is no user-declared primary key and TokuDB
    // maintains a hidden one instead.
    uint hpk = (form->s->primary_key >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    uint32_t block_size;
    uint32_t read_block_size;
    uint32_t fanout;
    THD* thd = ha_thd();

    memset(&row_descriptor, 0, sizeof(row_descriptor));
    max_row_desc_buff_size = get_max_desc_size(kc_info, form);

    row_desc_buff = (uchar*)tokudb::memory::malloc(
        max_row_desc_buff_size,
        MYF(MY_WME));
    if (row_desc_buff == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    make_name(newname, newname_len, name, "main");

    prim_key = (hpk) ? NULL : &form->s->key_info[primary_key];

    //
    // setup the row descriptor
    //
    row_descriptor.data = row_desc_buff;
    //
    // save data necessary for key comparisons
    //
    row_descriptor.size = create_main_key_descriptor(
        row_desc_buff,
        prim_key,
        hpk,
        primary_key,
        form,
        kc_info);
    assert_always(row_descriptor.size <= max_row_desc_buff_size);

    block_size = tokudb::sysvars::block_size(thd);
    read_block_size = tokudb::sysvars::read_block_size(thd);
    fanout = tokudb::sysvars::fanout(thd);

    /* Create the main table that will hold the real rows */
    error = create_sub_table(
        newname,
        &row_descriptor,
        txn,
        block_size,
        read_block_size,
        compression_method,
        false,
        fanout);
cleanup:
    // tokudb::memory::free is a no-op on NULL, so unconditional frees are safe.
    tokudb::memory::free(newname);
    tokudb::memory::free(row_desc_buff);
    return error;
}
7150 
//
// Creates a new table
// Parameters:
//      [in]    name - table name
//      [in]    form - info on table, columns and indexes
//      [in]    create_info - more info on table: row format, auto_increment
//              value, and table options are consulted below
// Returns:
//      0 on success
//      error otherwise
//
int ha_tokudb::create(
    const char* name,
    TABLE* form,
    HA_CREATE_INFO* create_info) {

    TOKUDB_HANDLER_DBUG_ENTER("%s", name);

    int error;
    DB *status_block = NULL;
    uint version;
    uint capabilities;
    DB_TXN* txn = NULL;
    // true when we started our own transaction (and must commit/abort it);
    // false when we piggyback on an enclosing CREATE TABLE transaction.
    bool do_commit = false;
    char* newname = NULL;
    size_t newname_len = 0;
    KEY_AND_COL_INFO kc_info;
    tokudb_trx_data *trx = NULL;
    THD* thd = ha_thd();

    // Reject names that do not split into a database and a table part.
    String database_name, table_name, dictionary_name;
    tokudb_split_dname(name, database_name, table_name, dictionary_name);
    if (database_name.is_empty() || table_name.is_empty()) {
        push_warning(thd,
                     Sql_condition::SL_WARNING,
                     ER_TABLE_NAME,
                     "TokuDB: Table Name or Database Name is empty");
        DBUG_RETURN(ER_TABLE_NAME);
    }

    memset(&kc_info, 0, sizeof(kc_info));

    // TDB-76 : CREATE TABLE ... LIKE ... does not use source row_format on
    //          target table
    // Original code would only use create_info->row_type if
    // create_info->used_fields & HA_CREATE_USED_ROW_FORMAT was true. This
    // would cause us to skip transferring the row_format for a table created
    // via CREATE TABLE tn LIKE tn. We also take on more InnoDB like behavior
    // and throw a warning if we get a row_format that we can't translate into
    // a known TokuDB row_format.
    tokudb::sysvars::row_format_t row_format =
        tokudb::sysvars::row_format(thd);

    if ((create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) ||
        create_info->row_type != ROW_TYPE_DEFAULT) {
        row_format = row_type_to_row_format(create_info->row_type);
        if (row_format == tokudb::sysvars::SRV_ROW_FORMAT_DEFAULT &&
            create_info->row_type != ROW_TYPE_DEFAULT) {
            push_warning(thd,
                         Sql_condition::SL_WARNING,
                         ER_ILLEGAL_HA_CREATE_OPTION,
                         "TokuDB: invalid ROW_FORMAT specifier.");
        }
    }
    const toku_compression_method compression_method =
        row_format_to_toku_compression_method(row_format);

    bool create_from_engine = (create_info->table_options & HA_OPTION_CREATE_FROM_ENGINE);
    if (create_from_engine) {
        // table already exists, nothing to do
        error = 0;
        goto cleanup;
    }

    // validate the fields in the table. If the table has fields
    // we do not support that came from an old version of MySQL,
    // gracefully return an error
    for (uint32_t i = 0; i < form->s->fields; i++) {
        Field* field = table_share->field[i];
        if (!field_valid_for_tokudb_table(field)) {
            sql_print_error("Table %s has an invalid field %s, that was created "
                "with an old version of MySQL. This field is no longer supported. "
                "This is probably due to an alter table engine=TokuDB. To load this "
                "table, do a dump and load",
                name,
                field->field_name
                );
            error = HA_ERR_UNSUPPORTED;
            goto cleanup;
        }
    }

    newname_len = get_max_dict_name_path_length(name);
    newname = (char*)tokudb::memory::malloc(newname_len, MYF(MY_WME));
    if (newname == NULL) {
        error = ENOMEM;
        goto cleanup;
    }

    // Reuse the statement's sub-transaction for CREATE TABLE so the whole
    // statement commits or aborts atomically; otherwise run our own txn.
    trx = (tokudb_trx_data *) thd_get_ha_data(ha_thd(), tokudb_hton);
    if (trx && trx->sub_sp_level &&
        thd_sql_command(thd) == SQLCOM_CREATE_TABLE) {
        txn = trx->sub_sp_level;
    } else {
        do_commit = true;
        error = txn_begin(db_env, 0, &txn, 0, thd);
        if (error) {
            goto cleanup;
        }
    }

    primary_key = form->s->primary_key;
    hidden_primary_key = (primary_key  >= MAX_KEY) ? TOKUDB_HIDDEN_PRIMARY_KEY_LENGTH : 0;
    if (hidden_primary_key) {
        // The hidden primary key is addressed as one index past the
        // user-declared keys.
        primary_key = form->s->keys;
    }

    /* do some tracing */
    trace_create_table_info(form);

    /* Create status.tokudb and save relevant metadata */
    make_name(newname, newname_len, name, "status");

    error = tokudb::metadata::create(db_env, &status_block, newname, txn);
    if (error) { goto cleanup; }

    version = HA_TOKU_VERSION;
    error = write_to_status(
        status_block,
        hatoku_new_version,
        &version,
        sizeof(version),
        txn);
    if (error) {
        goto cleanup;
    }

    capabilities = HA_TOKU_CAP;
    error = write_to_status(
        status_block,
        hatoku_capabilities,
        &capabilities,
        sizeof(capabilities),
        txn);
    if (error) {
        goto cleanup;
    }

    error = write_auto_inc_create(
        status_block,
        create_info->auto_increment_value,
        txn);
    if (error) {
        goto cleanup;
    }

#if defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA
#if defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
    // For partitioned tables the partitioning engine owns the .frm data,
    // so only store it here for non-partitioned tables.
    if (form->part_info == NULL) {
        error = write_frm_data(status_block, txn, form->s->path.str);
        if (error) {
            goto cleanup;
        }
    }
#else
    error = write_frm_data(status_block, txn, form->s->path.str);
    if (error) {
        goto cleanup;
    }
#endif  // defined(WITH_PARTITION_STORAGE_ENGINE) && WITH_PARTITION_STORAGE_ENGINE
#endif  // defined(TOKU_INCLUDE_WRITE_FRM_DATA) && TOKU_INCLUDE_WRITE_FRM_DATA

    error = allocate_key_and_col_info(form->s, &kc_info);
    if (error) {
        goto cleanup;
    }

    error = initialize_key_and_col_info(
        form->s,
        form,
        &kc_info,
        hidden_primary_key,
        primary_key);
    if (error) {
        goto cleanup;
    }

    // Create the "main" dictionary holding the rows, then one "key-<name>"
    // dictionary per secondary index.
    error = create_main_dictionary(
        name,
        form,
        txn,
        &kc_info,
        compression_method);
    if (error) {
        goto cleanup;
    }


    for (uint i = 0; i < form->s->keys; i++) {
        if (i != primary_key) {
            error = create_secondary_dictionary(
                name,
                form,
                &form->key_info[i],
                txn,
                &kc_info,
                i,
                false,
                compression_method);
            if (error) {
                goto cleanup;
            }

            // Register the index in status.tokudb so drop/rename can find
            // its dictionary later.
            error = write_key_name_to_status(
                status_block,
                form->s->key_info[i].name,
                txn);
            if (error) {
                goto cleanup;
            }
        }
    }

    error = 0;
cleanup:
    if (status_block != NULL) {
        int r = tokudb::metadata::close(&status_block);
        assert_always(r==0);
    }
    free_key_and_col_info(&kc_info);
    // Only resolve the transaction if we created it ourselves; an enclosing
    // CREATE TABLE transaction is resolved by its owner.
    if (do_commit && txn) {
        if (error) {
            abort_txn(txn);
        } else {
            commit_txn(txn,0);
        }
    }
    tokudb::memory::free(newname);
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
7390 
discard_or_import_tablespace(TOKUDB_UNUSED (my_bool discard))7391 int ha_tokudb::discard_or_import_tablespace(TOKUDB_UNUSED(my_bool discard)) {
7392     /*
7393     if (discard) {
7394         my_errno=HA_ERR_WRONG_COMMAND;
7395         return my_errno;
7396     }
7397     return add_table_to_metadata(share->table_name);
7398     */
7399     set_my_errno(HA_ERR_WRONG_COMMAND);
7400     return my_errno();
7401 }
7402 
7403 
7404 //
7405 // deletes from_name or renames from_name to to_name, all using transaction txn.
7406 // is_delete specifies which we are doing
7407 // is_key specifies if it is a secondary index (and hence a "key-" needs to be prepended) or
7408 // if it is not a secondary index
7409 //
int ha_tokudb::delete_or_rename_dictionary(
    const char* from_name,
    const char* to_name,
    const char* secondary_name,
    bool is_key,
    DB_TXN* txn,
    bool is_delete) {

    // Deletes (is_delete) or renames one dictionary belonging to a table.
    //   from_name/to_name - table-level names; to_name may be NULL on delete
    //   secondary_name    - dictionary suffix ("main", "status", or a key
    //                       name; "key-" is prepended when is_key is true)
    // Returns 0 on success, ENOMEM or an ft error code otherwise.
    int error;
    char dict_name[MAX_DICT_NAME_LEN];
    char* new_from_name = NULL;
    size_t new_from_name_len = 0;
    char* new_to_name = NULL;
    size_t new_to_name_len = 0;
    assert_always(txn);

    new_from_name_len = get_max_dict_name_path_length(from_name);
    new_from_name = (char*)tokudb::memory::malloc(
        new_from_name_len,
        MYF(MY_WME));
    if (new_from_name == NULL) {
        error = ENOMEM;
        goto cleanup;
    }
    // The destination name is only needed for a rename.
    if (!is_delete) {
        assert_always(to_name);
        new_to_name_len = get_max_dict_name_path_length(to_name);
        new_to_name = (char*)tokudb::memory::malloc(
            new_to_name_len,
            MYF(MY_WME));
        if (new_to_name == NULL) {
            error = ENOMEM;
            goto cleanup;
        }
    }

    // Build the full source (and, for rename, destination) dictionary path,
    // prefixing secondary-index names with "key-".
    if (is_key) {
        sprintf(dict_name, "key-%s", secondary_name);
        make_name(new_from_name, new_from_name_len, from_name, dict_name);
    } else {
        make_name(new_from_name, new_from_name_len, from_name, secondary_name);
    }
    if (!is_delete) {
        if (is_key) {
            sprintf(dict_name, "key-%s", secondary_name);
            make_name(new_to_name, new_to_name_len, to_name, dict_name);
        } else {
            make_name(new_to_name, new_to_name_len, to_name, secondary_name);
        }
    }

    if (is_delete) {
        error = db_env->dbremove(db_env, txn, new_from_name, NULL, 0);
    } else {
        error = db_env->dbrename(
            db_env,
            txn,
            new_from_name,
            NULL,
            new_to_name,
            0);
    }
    // NOTE(review): this goto is redundant — control falls through to
    // cleanup either way.
    if (error) {
        goto cleanup;
    }

cleanup:
    // tokudb::memory::free is a no-op on NULL, so unconditional frees are safe.
    tokudb::memory::free(new_from_name);
    tokudb::memory::free(new_to_name);
    return error;
}
7481 
7482 
7483 //
7484 // deletes or renames a table. if is_delete is true, then we delete, and to_name can be NULL
7485 // if is_delete is false, then to_name must be non-NULL, as we are renaming the table.
7486 //
delete_or_rename_table(const char * from_name,const char * to_name,bool is_delete)7487 int ha_tokudb::delete_or_rename_table (const char* from_name, const char* to_name, bool is_delete) {
7488     THD *thd = ha_thd();
7489     int error;
7490     DB* status_db = NULL;
7491     DBC* status_cursor = NULL;
7492     DB_TXN* txn = NULL;
7493     DBT curr_key;
7494     DBT curr_val;
7495     memset(&curr_key, 0, sizeof(curr_key));
7496     memset(&curr_val, 0, sizeof(curr_val));
7497 
7498     DB_TXN *parent_txn = NULL;
7499     tokudb_trx_data *trx = NULL;
7500     trx = (tokudb_trx_data *) thd_get_ha_data(thd, tokudb_hton);
7501     if (thd_sql_command(ha_thd()) == SQLCOM_CREATE_TABLE && trx && trx->sub_sp_level) {
7502         parent_txn = trx->sub_sp_level;
7503     }
7504 
7505     error = txn_begin(db_env, parent_txn, &txn, 0, thd);
7506     if (error) { goto cleanup; }
7507 
7508     //
7509     // open status db,
7510     // create cursor,
7511     // for each name read out of there, create a db and delete or rename it
7512     //
7513     error = open_status_dictionary(&status_db, from_name, txn);
7514     if (error) { goto cleanup; }
7515 
7516     error = status_db->cursor(status_db, txn, &status_cursor, 0);
7517     if (error) { goto cleanup; }
7518     status_cursor->c_set_check_interrupt_callback(status_cursor, tokudb_killed_thd_callback, thd);
7519 
7520     while (error != DB_NOTFOUND) {
7521         error = status_cursor->c_get(status_cursor, &curr_key, &curr_val, DB_NEXT);
7522         if (error && error != DB_NOTFOUND) {
7523             error = map_to_handler_error(error);
7524             goto cleanup;
7525         }
7526         if (error == DB_NOTFOUND) {
7527             break;
7528         }
7529         HA_METADATA_KEY mk = *(HA_METADATA_KEY *)curr_key.data;
7530         if (mk != hatoku_key_name) {
7531             continue;
7532         }
7533         error = delete_or_rename_dictionary(from_name, to_name, (char *)((char *)curr_key.data + sizeof(HA_METADATA_KEY)), true, txn, is_delete);
7534         if (error) { goto cleanup; }
7535     }
7536 
7537     //
7538     // delete or rename main.tokudb
7539     //
7540     error = delete_or_rename_dictionary(from_name, to_name, "main", false, txn, is_delete);
7541     if (error) { goto cleanup; }
7542 
7543     error = status_cursor->c_close(status_cursor);
7544     assert_always(error==0);
7545     status_cursor = NULL;
7546     if (error) { goto cleanup; }
7547 
7548     error = status_db->close(status_db, 0);
7549     assert_always(error == 0);
7550     status_db = NULL;
7551 
7552     //
7553     // delete or rename status.tokudb
7554     //
7555     error = delete_or_rename_dictionary(from_name, to_name, "status", false, txn, is_delete);
7556     if (error) { goto cleanup; }
7557 
7558     set_my_errno(error);
7559 cleanup:
7560     if (status_cursor) {
7561         int r = status_cursor->c_close(status_cursor);
7562         assert_always(r==0);
7563     }
7564     if (status_db) {
7565         int r = status_db->close(status_db, 0);
7566         assert_always(r==0);
7567     }
7568     if (txn) {
7569         if (error) {
7570             abort_txn(txn);
7571         }
7572         else {
7573             commit_txn(txn, 0);
7574         }
7575     }
7576     return error;
7577 }
7578 
delete_non_partitioned_table(const char * name)7579 int ha_tokudb::delete_non_partitioned_table(const char* name) {
7580     TOKUDB_HANDLER_DBUG_ENTER("%s", name);
7581     TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(name, NULL, false);
7582     if (share) {
7583         share->unlock();
7584         share->release();
7585         // this should be enough to handle locking as the higher level MDL
7586         // on this table should prevent any new analyze tasks.
7587         share->cancel_background_jobs();
7588         TOKUDB_SHARE::drop_share(share);
7589     }
7590 
7591     int error;
7592     error = delete_or_rename_table(name, NULL, true);
7593     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7594         error == DB_LOCK_NOTGRANTED) {
7595         sql_print_error(
7596             "Could not delete table %s because another transaction has "
7597             "accessed the table. To drop the table, make sure no "
7598             "transactions touch the table.",
7599             name);
7600     }
7601     TOKUDB_HANDLER_DBUG_RETURN(error);
7602 }
7603 
delete_rename_partitioned_table(const char * from,const char * to,const std::string & partition_info_str)7604 int ha_tokudb::delete_rename_partitioned_table(
7605     const char* from,
7606     const char* to,
7607     const std::string& partition_info_str) {
7608     THD* thd = ha_thd();
7609     assert(thd);
7610     MEM_ROOT* mem_root = thd->mem_root;
7611 
7612     partition_info* part_info =
7613         native_part::parse_partition_info(ha_thd(), partition_info_str);
7614     ha_tokupart file(tokudb_hton, nullptr);
7615     if (file.init_partitioning(mem_root))
7616         return HA_ERR_CANNOT_INITIALIZE_PARTITIONING;
7617 
7618     file.set_part_info(part_info, false);
7619     if (file.initialize_partition(mem_root))
7620         return HA_ERR_CANNOT_INITIALIZE_PARTITIONING;
7621 
7622     if (to)
7623         return file.rename_table(from, to);
7624 
7625     return file.delete_table(from);
7626 }
7627 
7628 //
7629 // Drops table
7630 // Parameters:
7631 //      [in]    name - name of table to be deleted
7632 // Returns:
7633 //      0 on success
7634 //      error otherwise
7635 //
delete_table(const char * name)7636 int ha_tokudb::delete_table(const char* name) {
7637     assert(name);
7638     std::string partition_info_str;
7639     if (!native_part::get_part_str_for_table(name, partition_info_str))
7640         return HA_ERR_TABLE_CORRUPT;
7641     if (partition_info_str.empty())
7642         return delete_non_partitioned_table(name);
7643     return delete_rename_partitioned_table(name, nullptr, partition_info_str);
7644 }
7645 
tokudb_check_db_dir_exist_from_table_name(const char * table_name)7646 static bool tokudb_check_db_dir_exist_from_table_name(const char* table_name) {
7647     assert(table_name);
7648     bool mysql_dir_exists;
7649     char db_name[FN_REFLEN];
7650     const char *db_name_begin = strchr(table_name, FN_LIBCHAR);
7651     const char *db_name_end = strrchr(table_name, FN_LIBCHAR);
7652     assert(db_name_begin);
7653     assert(db_name_end);
7654     assert(db_name_begin != db_name_end);
7655 
7656     ++db_name_begin;
7657     size_t db_name_size = db_name_end - db_name_begin;
7658 
7659     assert(db_name_size < FN_REFLEN);
7660 
7661     memcpy(db_name, db_name_begin, db_name_size);
7662     db_name[db_name_size] = '\0';
7663 
7664     // At this point, db_name contains the MySQL formatted database name.
7665     // This is exactly the same format that would come into us through a
7666     // CREATE TABLE. Some charaters (like ':' for example) might be expanded
7667     // into hex (':' would papear as "@003a").
7668     // We need to check that the MySQL destination database directory exists.
7669     mysql_dir_exists = (my_access(db_name, F_OK) == 0);
7670 
7671     return mysql_dir_exists;
7672 }
7673 
rename_non_partitioned_table(const char * from,const char * to)7674 int ha_tokudb::rename_non_partitioned_table(const char* from, const char* to) {
7675     TOKUDB_HANDLER_DBUG_ENTER("%s %s", from, to);
7676     TOKUDB_SHARE* share = TOKUDB_SHARE::get_share(from, NULL, false);
7677     if (share) {
7678         share->unlock();
7679         share->release();
7680         // this should be enough to handle locking as the higher level MDL
7681         // on this table should prevent any new analyze tasks.
7682         share->cancel_background_jobs();
7683         TOKUDB_SHARE::drop_share(share);
7684     }
7685     int error;
7686     bool to_db_dir_exist = tokudb_check_db_dir_exist_from_table_name(to);
7687     if (!to_db_dir_exist) {
7688         sql_print_error(
7689             "Could not rename table from %s to %s because "
7690             "destination db does not exist",
7691             from,
7692             to);
7693         error = HA_ERR_DEST_SCHEMA_NOT_EXIST;
7694     }
7695     else {
7696         error = delete_or_rename_table(from, to, false);
7697         if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
7698             error == DB_LOCK_NOTGRANTED) {
7699             sql_print_error(
7700                 "Could not rename table from %s to %s because another transaction "
7701                 "has accessed the table. To rename the table, make sure no "
7702                 "transactions touch the table.",
7703                 from,
7704                 to);
7705         }
7706     }
7707     TOKUDB_HANDLER_DBUG_RETURN(error);
7708 }
7709 
7710 //
7711 // renames table from "from" to "to"
7712 // Parameters:
7713 //      [in]    name - old name of table
7714 //      [in]    to - new name of table
7715 // Returns:
7716 //      0 on success
7717 //      error otherwise
7718 //
rename_table(const char * from,const char * to)7719 int ha_tokudb::rename_table(const char* from, const char* to) {
7720     assert(from);
7721     assert(to);
7722     std::string partition_info_str;
7723     if (!native_part::get_part_str_for_table(from, partition_info_str))
7724         return DB_NOTFOUND;  // TODO: set correct error code here
7725     if (partition_info_str.empty())
7726         return rename_non_partitioned_table(from, to);
7727     return delete_rename_partitioned_table(from, to, partition_info_str);
7728 }
7729 
7730 /*
7731   Returns estimate on number of seeks it will take to read through the table
7732   This is to be comparable to the number returned by records_in_range so
7733   that we can decide if we should scan the table or use keys.
7734 */
7735 /// QQQ why divide by 3
scan_time()7736 double ha_tokudb::scan_time() {
7737     TOKUDB_HANDLER_DBUG_ENTER("");
7738     double ret_val = (double)stats.records / 3;
7739     TOKUDB_HANDLER_TRACE_FOR_FLAGS(
7740         TOKUDB_DEBUG_RETURN,
7741         "return %" PRIu64 " %f",
7742         (uint64_t)stats.records,
7743         ret_val);
7744     DBUG_RETURN(ret_val);
7745 }
7746 
keyread_time(uint index,uint ranges,ha_rows rows)7747 double ha_tokudb::keyread_time(uint index, uint ranges, ha_rows rows)
7748 {
7749     TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7750     double ret_val;
7751     if (index == primary_key || key_is_clustering(&table->key_info[index])) {
7752         ret_val = read_time(index, ranges, rows);
7753         DBUG_RETURN(ret_val);
7754     }
7755     /*
7756       It is assumed that we will read trough the whole key range and that all
7757       key blocks are half full (normally things are much better). It is also
7758       assumed that each time we read the next key from the index, the handler
7759       performs a random seek, thus the cost is proportional to the number of
7760       blocks read. This model does not take into account clustered indexes -
7761       engines that support that (e.g. InnoDB) may want to overwrite this method.
7762     */
7763     double keys_per_block= (stats.block_size/2.0/
7764                             (table->key_info[index].key_length +
7765                              ref_length) + 1);
7766     ret_val = (rows + keys_per_block - 1)/ keys_per_block;
7767     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7768 }
7769 
7770 //
7771 // Calculate the time it takes to read a set of ranges through an index
7772 // This enables us to optimize reads for clustered indexes.
7773 // Implementation pulled from InnoDB
7774 // Parameters:
7775 //          index - index to use
7776 //          ranges - number of ranges
7777 //          rows - estimated number of rows in the range
7778 // Returns:
7779 //      estimated time measured in disk seeks
7780 //
read_time(uint index,uint ranges,ha_rows rows)7781 double ha_tokudb::read_time(
7782     uint    index,
7783     uint    ranges,
7784     ha_rows rows
7785     )
7786 {
7787     TOKUDB_HANDLER_DBUG_ENTER("%u %u %" PRIu64, index, ranges, (uint64_t) rows);
7788     double total_scan;
7789     double ret_val;
7790     bool is_primary = (index == primary_key);
7791     bool is_clustering;
7792 
7793     //
7794     // in case for hidden primary key, this is called
7795     //
7796     if (index >= table_share->keys) {
7797         ret_val = handler::read_time(index, ranges, rows);
7798         goto cleanup;
7799     }
7800 
7801     is_clustering = key_is_clustering(&table->key_info[index]);
7802 
7803 
7804     //
7805     // if it is not the primary key, and it is not a clustering key, then return handler::read_time
7806     //
7807     if (!(is_primary || is_clustering)) {
7808         ret_val = handler::read_time(index, ranges, rows);
7809         goto cleanup;
7810     }
7811 
7812     //
7813     // for primary key and for clustered keys, return a fraction of scan_time()
7814     //
7815     total_scan = scan_time();
7816 
7817     if (stats.records < rows) {
7818         ret_val = is_clustering ? total_scan + 0.00001 : total_scan;
7819         goto cleanup;
7820     }
7821 
7822     //
7823     // one disk seek per range plus the proportional scan time of the rows
7824     //
7825     ret_val = (ranges + (double) rows / (double) stats.records * total_scan);
7826     ret_val = is_clustering ? ret_val + 0.00001 : ret_val;
7827 
7828 cleanup:
7829     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7830 }
7831 
index_only_read_time(uint keynr,double records)7832 double ha_tokudb::index_only_read_time(uint keynr, double records) {
7833     TOKUDB_HANDLER_DBUG_ENTER("%u %f", keynr, records);
7834     double ret_val = keyread_time(keynr, 1, (ha_rows)records);
7835     TOKUDB_HANDLER_DBUG_RETURN_DOUBLE(ret_val);
7836 }
7837 
7838 //
7839 // Estimates the number of index records in a range. In case of errors, return
7840 //   HA_TOKUDB_RANGE_COUNT instead of HA_POS_ERROR. This was behavior
7841 //   when we got the handlerton from MySQL.
7842 // Parameters:
7843 //              keynr -index to use
7844 //      [in]    start_key - low end of the range
7845 //      [in]    end_key - high end of the range
7846 // Returns:
7847 //      0 - There are no matching keys in the given range
7848 //      number > 0 - There are approximately number matching rows in the range
7849 //      HA_POS_ERROR - Something is wrong with the index tree
7850 //
ha_rows ha_tokudb::records_in_range(uint keynr, key_range* start_key, key_range* end_key) {
    TOKUDB_HANDLER_DBUG_ENTER("%d %p %p", keynr, start_key, end_key);
    DBT *pleft_key, *pright_key;
    DBT left_key, right_key;
    ha_rows ret_val = HA_TOKUDB_RANGE_COUNT;
    DB *kfile = share->key_file[keynr];
    uint64_t rows = 0;
    int error;

    // get start_rows and end_rows values so that we can estimate range
    // when calling key_range64, the only value we can trust is the value for less
    // The reason is that the key being passed in may be a prefix of keys in the DB
    // As a result, equal may be 0 and greater may actually be equal+greater
    // So, we call key_range64 on the key, and the key that is after it.
    //
    // No bounds at all: estimate the full table instead of a range.
    if (!start_key && !end_key) {
        error = estimate_num_rows(share->file, &rows, transaction);
        if (error) {
            ret_val = HA_TOKUDB_RANGE_COUNT;
            goto cleanup;
        }
        ret_val = (rows <= 1) ? 1 : rows;
        goto cleanup;
    }
    // Pack the SQL-layer bounds into engine DBTs. The infinity byte makes
    // a key prefix compare below (COL_NEG_INF) or above (COL_POS_INF) any
    // full key sharing that prefix, so inclusivity of each bound follows
    // from the caller's read flag.
    if (start_key) {
        uchar inf_byte = (start_key->flag == HA_READ_KEY_EXACT) ? COL_NEG_INF : COL_POS_INF;
        pack_key(&left_key, keynr, key_buff, start_key->key, start_key->length, inf_byte);
        pleft_key = &left_key;
    } else {
        pleft_key = NULL;
    }
    if (end_key) {
        uchar inf_byte = (end_key->flag == HA_READ_BEFORE_KEY) ? COL_NEG_INF : COL_POS_INF;
        pack_key(&right_key, keynr, key_buff2, end_key->key, end_key->length, inf_byte);
        pright_key = &right_key;
    } else {
        pright_key = NULL;
    }
    // keys_range64 can not handle a degenerate range (left_key > right_key), so we filter here
    if (pleft_key && pright_key && tokudb_cmp_dbt_key(kfile, pleft_key, pright_key) > 0) {
        rows = 0;
    } else {
        uint64_t less, equal1, middle, equal2, greater;
        bool is_exact;
        // `middle` is the estimated row count strictly between the two
        // bounds, which is what we want given the infinity-byte packing.
        error = kfile->keys_range64(kfile, transaction, pleft_key, pright_key,
                                    &less, &equal1, &middle, &equal2, &greater, &is_exact);
        if (error) {
            ret_val = HA_TOKUDB_RANGE_COUNT;
            goto cleanup;
        }
        rows = middle;
    }

    // MySQL thinks a return value of 0 means there are exactly 0 rows
    // Therefore, always return non-zero so this assumption is not made
    ret_val = (ha_rows) (rows <= 1 ? 1 : rows);

cleanup:
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_RETURN,
        "return %" PRIu64 " %" PRIu64,
        (uint64_t)ret_val,
        rows);
    DBUG_RETURN(ret_val);
}
7915 
7916 
7917 //
7918 // Initializes the auto-increment data in the local "share" object to the
7919 // greater of two values: what's stored in the metadata or the last inserted
7920 // auto-increment field (if auto-increment field is the first field of a key).
7921 //
void ha_tokudb::init_auto_increment() {
    int error;
    DB_TXN* txn = NULL;

    error = txn_begin(db_env, 0, &txn, 0, ha_thd());
    if (error) {
        // Could not start a transaction to read the status dictionary;
        // fall back to a zero counter.
        share->last_auto_increment = 0;
    } else {
        HA_METADATA_KEY key_val;
        DBT key;
        memset(&key, 0, sizeof(key));
        key.data = &key_val;
        key.size = sizeof(key_val);
        DBT value;
        memset(&value, 0, sizeof(value));
        // USERMEM: both get() calls below write straight into the share
        // fields pointed to by value.data.
        value.flags = DB_DBT_USERMEM;

        // Retrieve the initial auto increment value, as specified by create table
        // so if a user does "create table t1 (a int auto_increment, primary key (a)) auto_increment=100",
        // then the value 100 should be stored here
        key_val = hatoku_ai_create_value;
        value.ulen = sizeof(share->auto_inc_create_value);
        value.data = &share->auto_inc_create_value;
        error = share->status_block->get(share->status_block, txn, &key, &value, 0);

        // Missing or wrong-sized entry: treat as "no AUTO_INCREMENT=N clause".
        if (error || value.size != sizeof(share->auto_inc_create_value)) {
            share->auto_inc_create_value = 0;
        }

        // Retrieve hatoku_max_ai, which is max value used by auto increment
        // column so far, the max value could have been auto generated (e.g. insert (NULL))
        // or it could have been manually inserted by user (e.g. insert (345))
        key_val = hatoku_max_ai;
        value.ulen = sizeof(share->last_auto_increment);
        value.data = &share->last_auto_increment;
        error = share->status_block->get(share->status_block, txn, &key, &value, 0);

        // No stored max yet: seed the counter so the next generated value
        // is the CREATE TABLE starting value (or 1 when none was given).
        if (error || value.size != sizeof(share->last_auto_increment)) {
            if (share->auto_inc_create_value)
                share->last_auto_increment = share->auto_inc_create_value - 1;
            else
                share->last_auto_increment = 0;
        }

        // Read-only transaction; commit unconditionally.
        commit_txn(txn, 0);
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_AUTO_INCREMENT,
        "init auto increment:%lld",
        share->last_auto_increment);
}
7973 
// Reserves a contiguous block of nb_desired_values auto-increment values
// for the caller, starting at *first_value. Runs under the share lock so
// concurrent inserters get disjoint ranges.
void ha_tokudb::get_auto_increment(
    ulonglong offset,
    ulonglong increment,
    ulonglong nb_desired_values,
    ulonglong* first_value,
    ulonglong* nb_reserved_values) {

    TOKUDB_HANDLER_DBUG_ENTER("");
    ulonglong nr;
    bool over;

    share->lock();

    if (share->auto_inc_create_value > share->last_auto_increment) {
        // The AUTO_INCREMENT=N value from CREATE/ALTER TABLE has not been
        // consumed yet; hand it out first.
        nr = share->auto_inc_create_value;
        over = false;
        share->last_auto_increment = share->auto_inc_create_value;
    } else {
        nr = share->last_auto_increment + increment;
        // Unsigned wraparound check: the counter is exhausted.
        over = nr < share->last_auto_increment;
        if (over)
            nr = ULLONG_MAX;
    }
    if (!over) {
        // Bump the in-memory counter past the whole reserved block, and
        // persist it now unless a bulk operation asked to defer the
        // metadata write.
        share->last_auto_increment = nr + (nb_desired_values - 1)*increment;
        if (delay_updating_ai_metadata) {
            ai_metadata_update_required = true;
        } else {
            update_max_auto_inc(
                share->status_block,
                share->last_auto_increment);
        }
    }
    TOKUDB_HANDLER_TRACE_FOR_FLAGS(
        TOKUDB_DEBUG_AUTO_INCREMENT,
        "get_auto_increment(%lld,%lld,%lld): got:%lld:%lld",
        offset,
        increment,
        nb_desired_values,
        nr,
        nb_desired_values);
    *first_value = nr;
    *nb_reserved_values = nb_desired_values;
    share->unlock();
    TOKUDB_HANDLER_DBUG_VOID_RETURN;
}
8020 
// Reports whether OPTIMIZE TABLE must block other operations on this
// handler; always false for TokuDB.
bool ha_tokudb::is_optimize_blocking() {
    return false;
}
8024 
// Always false: auto-increment values are reserved in ranges per handler
// (see get_auto_increment), not as a single shared value.
bool ha_tokudb::is_auto_inc_singleton(){
    return false;
}
8028 
8029 
8030 // Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
8031 // With a transaction, drops dictionaries associated with indexes in key_num
8032 //
8033 //
8034 // Adds indexes to the table. Takes the array of KEY passed in key_info, and creates
8035 // DB's that will go at the end of share->key_file. THE IMPLICIT ASSUMPTION HERE is
8036 // that the table will be modified and that these added keys will be appended to the end
8037 // of the array table->key_info
8038 // Parameters:
8039 //      [in]    table_arg - table that is being modified, seems to be identical to this->table
8040 //      [in]    key_info - array of KEY's to be added
8041 //              num_of_keys - number of keys to be added, number of elements in key_info
8042 //  Returns:
8043 //      0 on success, error otherwise
8044 //
tokudb_add_index(TABLE * table_arg,KEY * key_info,uint num_of_keys,DB_TXN * txn,bool * inc_num_DBs,bool * modified_DBs)8045 int ha_tokudb::tokudb_add_index(
8046     TABLE* table_arg,
8047     KEY* key_info,
8048     uint num_of_keys,
8049     DB_TXN* txn,
8050     bool* inc_num_DBs,
8051     bool* modified_DBs) {
8052 
8053     TOKUDB_HANDLER_DBUG_ENTER("");
8054     assert_always(txn);
8055 
8056     int error;
8057     uint curr_index = 0;
8058     DBC* tmp_cursor = NULL;
8059     int cursor_ret_val = 0;
8060     DBT curr_pk_key, curr_pk_val;
8061     THD* thd = ha_thd();
8062     DB_LOADER* loader = NULL;
8063     DB_INDEXER* indexer = NULL;
8064     bool loader_save_space = tokudb::sysvars::load_save_space(thd);
8065     bool use_hot_index = (lock.type == TL_WRITE_ALLOW_WRITE);
8066     uint32_t loader_flags = loader_save_space ? LOADER_COMPRESS_INTERMEDIATES : 0;
8067     uint32_t indexer_flags = 0;
8068     uint32_t mult_db_flags[MAX_KEY + 1] = {0};
8069     uint32_t mult_put_flags[MAX_KEY + 1];
8070     uint32_t mult_dbt_flags[MAX_KEY + 1];
8071     bool creating_hot_index = false;
8072     struct loader_context lc;
8073     memset(&lc, 0, sizeof lc);
8074     lc.thd = thd;
8075     lc.ha = this;
8076     loader_error = 0;
8077     bool rw_lock_taken = false;
8078     *inc_num_DBs = false;
8079     *modified_DBs = false;
8080     invalidate_bulk_fetch();
8081     unpack_entire_row = true; // for bulk fetching rows
8082     for (uint32_t i = 0; i < MAX_KEY+1; i++) {
8083         mult_put_flags[i] = 0;
8084         mult_dbt_flags[i] = DB_DBT_REALLOC;
8085     }
8086     //
8087     // number of DB files we have open currently, before add_index is executed
8088     //
8089     uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8090 
8091     //
8092     // get the row type to use for the indexes we're adding
8093     //
8094     toku_compression_method compression_method =
8095         get_compression_method(share->file);
8096 
8097     //
8098     // status message to be shown in "show process list"
8099     //
8100     const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
8101     // buffer of 200 should be a good upper bound.
8102     char status_msg[MAX_ALIAS_NAME + 200];
8103     // variable that stores number of elements inserted thus far
8104     ulonglong num_processed = 0;
8105     thd_proc_info(thd, "Adding indexes");
8106 
8107     //
8108     // in unpack_row, MySQL passes a buffer that is this long,
8109     // so this length should be good enough for us as well
8110     //
8111     memset((void *) &curr_pk_key, 0, sizeof(curr_pk_key));
8112     memset((void *) &curr_pk_val, 0, sizeof(curr_pk_val));
8113 
8114     //
8115     // The files for secondary tables are derived from the name of keys
8116     // If we try to add a key with the same name as an already existing key,
8117     // We can crash. So here we check if any of the keys added has the same
8118     // name of an existing key, and if so, we fail gracefully
8119     //
8120     for (uint i = 0; i < num_of_keys; i++) {
8121         for (uint j = 0; j < table_arg->s->keys; j++) {
8122             if (strcmp(key_info[i].name, table_arg->s->key_info[j].name) == 0) {
8123                 error = HA_ERR_WRONG_COMMAND;
8124                 goto cleanup;
8125             }
8126         }
8127     }
8128 
8129     rwlock_t_lock_write(share->_num_DBs_lock);
8130     rw_lock_taken = true;
8131     //
8132     // open all the DB files and set the appropriate variables in share
8133     // they go to the end of share->key_file
8134     //
8135     creating_hot_index =
8136         use_hot_index && num_of_keys == 1 &&
8137         (key_info[0].flags & HA_NOSAME) == 0;
8138     if (use_hot_index && (share->num_DBs > curr_num_DBs)) {
8139         //
8140         // already have hot index in progress, get out
8141         //
8142         error = HA_ERR_INTERNAL_ERROR;
8143         goto cleanup;
8144     }
8145     curr_index = curr_num_DBs;
8146     *modified_DBs = true;
8147     for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8148         if (key_is_clustering(&key_info[i])) {
8149             set_key_filter(
8150                 &share->kc_info.key_filters[curr_index],
8151                 &key_info[i],
8152                 table_arg,
8153                 false);
8154             if (!hidden_primary_key) {
8155                 set_key_filter(
8156                     &share->kc_info.key_filters[curr_index],
8157                     &table_arg->key_info[primary_key],
8158                     table_arg,
8159                     false);
8160             }
8161 
8162             error = initialize_col_pack_info(
8163                 &share->kc_info,
8164                 table_arg->s,
8165                 curr_index);
8166             if (error) {
8167                 goto cleanup;
8168             }
8169         }
8170 
8171 
8172         error = create_secondary_dictionary(
8173             share->full_table_name(),
8174             table_arg,
8175             &key_info[i],
8176             txn,
8177             &share->kc_info,
8178             curr_index,
8179             creating_hot_index,
8180             compression_method);
8181         if (error) {
8182             goto cleanup;
8183         }
8184 
8185         error = open_secondary_dictionary(
8186             &share->key_file[curr_index],
8187             &key_info[i],
8188             share->full_table_name(),
8189             false,
8190             txn);
8191         if (error) {
8192             goto cleanup;
8193         }
8194     }
8195 
8196     if (creating_hot_index) {
8197         share->num_DBs++;
8198         *inc_num_DBs = true;
8199         error = db_env->create_indexer(
8200             db_env,
8201             txn,
8202             &indexer,
8203             share->file,
8204             num_of_keys,
8205             &share->key_file[curr_num_DBs],
8206             mult_db_flags,
8207             indexer_flags);
8208         if (error) {
8209             goto cleanup;
8210         }
8211 
8212         error = indexer->set_poll_function(
8213             indexer, ha_tokudb::tokudb_add_index_poll, &lc);
8214         if (error) {
8215             goto cleanup;
8216         }
8217 
8218         error = indexer->set_error_callback(
8219             indexer, ha_tokudb::loader_add_index_err, &lc);
8220         if (error) {
8221             goto cleanup;
8222         }
8223 
8224         share->_num_DBs_lock.unlock();
8225         rw_lock_taken = false;
8226 
8227 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8228         // initialize a one phase progress report.
8229         // incremental reports are done in the indexer's callback function.
8230         thd_progress_init(thd, 1);
8231 #endif
8232 
8233         error = indexer->build(indexer);
8234 
8235         if (error) {
8236             goto cleanup;
8237         }
8238 
8239         rwlock_t_lock_write(share->_num_DBs_lock);
8240         error = indexer->close(indexer);
8241         share->_num_DBs_lock.unlock();
8242         if (error) {
8243             goto cleanup;
8244         }
8245         indexer = NULL;
8246     } else {
8247         assert(table->mdl_ticket->get_type() >= MDL_SHARED_NO_WRITE);
8248         share->_num_DBs_lock.unlock();
8249         rw_lock_taken = false;
8250         prelocked_right_range_size = 0;
8251         prelocked_left_range_size = 0;
8252         struct smart_dbt_bf_info bf_info;
8253         bf_info.ha = this;
8254         // you need the val if you have a clustering index and key_read is not 0;
8255         bf_info.direction = 1;
8256         bf_info.thd = ha_thd();
8257         bf_info.need_val = true;
8258         bf_info.key_to_compare = NULL;
8259 
8260         error = db_env->create_loader(
8261             db_env,
8262             txn,
8263             &loader,
8264             NULL, // no src_db needed
8265             num_of_keys,
8266             &share->key_file[curr_num_DBs],
8267             mult_put_flags,
8268             mult_dbt_flags,
8269             loader_flags);
8270         if (error) {
8271             goto cleanup;
8272         }
8273 
8274         error =
8275             loader->set_poll_function(loader, ha_tokudb::bulk_insert_poll, &lc);
8276         if (error) {
8277             goto cleanup;
8278         }
8279 
8280         error = loader->set_error_callback(
8281             loader, ha_tokudb::loader_add_index_err, &lc);
8282         if (error) {
8283             goto cleanup;
8284         }
8285         //
8286         // scan primary table, create each secondary key, add to each DB
8287         //
8288         error = share->file->cursor(
8289             share->file,
8290             txn,
8291             &tmp_cursor,
8292             DB_SERIALIZABLE);
8293         if (error) {
8294             tmp_cursor = NULL;             // Safety
8295             goto cleanup;
8296         }
8297 
8298         //
8299         // grab some locks to make this go faster
8300         // first a global read lock on the main DB, because
8301         // we intend to scan the entire thing
8302         //
8303         error = tmp_cursor->c_set_bounds(
8304             tmp_cursor,
8305             share->file->dbt_neg_infty(),
8306             share->file->dbt_pos_infty(),
8307             true,
8308             0);
8309         if (error) {
8310             goto cleanup;
8311         }
8312 
8313         // set the bulk fetch iteration to its max so that adding an
8314         // index fills the bulk fetch buffer every time. we do not
8315         // want it to grow exponentially fast.
8316         rows_fetched_using_bulk_fetch = 0;
8317         bulk_fetch_iteration = HA_TOKU_BULK_FETCH_ITERATION_MAX;
8318         cursor_ret_val = tmp_cursor->c_getf_next(
8319             tmp_cursor,
8320             DB_PRELOCKED,
8321             smart_dbt_bf_callback,
8322             &bf_info);
8323 
8324 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8325         // initialize a two phase progress report.
8326         // first phase: putting rows into the loader
8327         thd_progress_init(thd, 2);
8328 #endif
8329 
8330         while (cursor_ret_val != DB_NOTFOUND ||
8331                ((bytes_used_in_range_query_buff -
8332                  curr_range_query_buff_offset) > 0)) {
8333             if ((bytes_used_in_range_query_buff -
8334                  curr_range_query_buff_offset) == 0) {
8335                 invalidate_bulk_fetch(); // reset the buffers
8336                 cursor_ret_val = tmp_cursor->c_getf_next(
8337                     tmp_cursor,
8338                     DB_PRELOCKED,
8339                     smart_dbt_bf_callback,
8340                     &bf_info);
8341                 if (cursor_ret_val != DB_NOTFOUND && cursor_ret_val != 0) {
8342                     error = cursor_ret_val;
8343                     goto cleanup;
8344                 }
8345             }
8346             // do this check in case the the c_getf_next did not put anything
8347             // into the buffer because there was no more data
8348             if ((bytes_used_in_range_query_buff -
8349                  curr_range_query_buff_offset) == 0) {
8350                 break;
8351             }
8352             // at this point, we know the range query buffer has at least one
8353             // key/val pair
8354             uchar* curr_pos = range_query_buff+curr_range_query_buff_offset;
8355 
8356             uint32_t key_size = *(uint32_t *)curr_pos;
8357             curr_pos += sizeof(key_size);
8358             uchar* curr_key_buff = curr_pos;
8359             curr_pos += key_size;
8360             curr_pk_key.data = curr_key_buff;
8361             curr_pk_key.size = key_size;
8362 
8363             uint32_t val_size = *(uint32_t *)curr_pos;
8364             curr_pos += sizeof(val_size);
8365             uchar* curr_val_buff = curr_pos;
8366             curr_pos += val_size;
8367             curr_pk_val.data = curr_val_buff;
8368             curr_pk_val.size = val_size;
8369 
8370             curr_range_query_buff_offset = curr_pos - range_query_buff;
8371 
8372             error = loader->put(loader, &curr_pk_key, &curr_pk_val);
8373             if (error) {
8374                 goto cleanup;
8375             }
8376 
8377             num_processed++;
8378 
8379             if ((num_processed % 1000) == 0) {
8380                 sprintf(
8381                     status_msg,
8382                     "Adding indexes: Fetched %llu of about %llu rows, loading "
8383                     "of data still remains.",
8384                     num_processed,
8385                     (long long unsigned)share->row_count());
8386                 thd_proc_info(thd, status_msg);
8387 
8388 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8389                 thd_progress_report(
8390                     thd,
8391                     num_processed,
8392                     (long long unsigned)share->rows);
8393 #endif
8394 
8395                 if (thd_killed(thd)) {
8396                     error = ER_ABORTING_CONNECTION;
8397                     goto cleanup;
8398                 }
8399             }
8400         }
8401         error = tmp_cursor->c_close(tmp_cursor);
8402         assert_always(error==0);
8403         tmp_cursor = NULL;
8404 
8405 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8406         // next progress report phase: closing the loader.
8407         // incremental reports are done in the loader's callback function.
8408         thd_progress_next_stage(thd);
8409 #endif
8410 
8411         error = loader->close(loader);
8412         loader = NULL;
8413 
8414         if (error) goto cleanup;
8415     }
8416     curr_index = curr_num_DBs;
8417     for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8418         if (key_info[i].flags & HA_NOSAME) {
8419             bool is_unique;
8420             error = is_index_unique(
8421                 &is_unique,
8422                 txn,
8423                 share->key_file[curr_index],
8424                 &key_info[i],
8425                 creating_hot_index ? 0 : DB_PRELOCKED_WRITE);
8426             if (error)
8427                 goto cleanup;
8428             if (!is_unique) {
8429                 error = HA_ERR_FOUND_DUPP_KEY;
8430                 last_dup_key = i;
8431                 goto cleanup;
8432             }
8433         }
8434     }
8435 
8436     share->lock();
8437     //
8438     // We have an accurate row count, might as well update share->rows
8439     //
8440     if(!creating_hot_index) {
8441         share->set_row_count(num_processed, true);
8442     }
8443     //
8444     // now write stuff to status.tokudb
8445     //
8446     for (uint i = 0; i < num_of_keys; i++) {
8447         write_key_name_to_status(share->status_block, key_info[i].name, txn);
8448     }
8449     share->unlock();
8450 
8451     error = 0;
8452 cleanup:
8453 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8454     thd_progress_end(thd);
8455 #endif
8456     if (rw_lock_taken) {
8457         share->_num_DBs_lock.unlock();
8458         rw_lock_taken = false;
8459     }
8460     if (tmp_cursor) {
8461         int r = tmp_cursor->c_close(tmp_cursor);
8462         assert_always(r==0);
8463         tmp_cursor = NULL;
8464     }
8465     if (loader != NULL) {
8466         sprintf(status_msg, "aborting creation of indexes.");
8467         thd_proc_info(thd, status_msg);
8468         loader->abort(loader);
8469     }
8470     if (indexer != NULL) {
8471         sprintf(status_msg, "aborting creation of indexes.");
8472         thd_proc_info(thd, status_msg);
8473         rwlock_t_lock_write(share->_num_DBs_lock);
8474         indexer->abort(indexer);
8475         share->_num_DBs_lock.unlock();
8476     }
8477     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8478         error == DB_LOCK_NOTGRANTED) {
8479         sql_print_error(
8480             "Could not add indexes to table %s because another transaction has "
8481             "accessed the table. To add indexes, make sure no transactions "
8482             "touch the table.",
8483             share->full_table_name());
8484     }
8485     thd_proc_info(thd, orig_proc_info);
8486     TOKUDB_HANDLER_DBUG_RETURN(error ? error : loader_error);
8487 }
tokudb_add_index_poll(void * extra,float progress)8488 int ha_tokudb::tokudb_add_index_poll(void* extra, float progress) {
8489     LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
8490     if (thd_killed(context->thd)) {
8491         snprintf(context->write_status_msg,
8492                  sizeof(context->write_status_msg),
8493                  "The process has been killed, aborting add index.");
8494         return ER_ABORTING_CONNECTION;
8495     }
8496     float percentage = progress * 100;
8497     snprintf(context->write_status_msg,
8498              sizeof(context->write_status_msg),
8499              "Adding of indexes to %s about %.1f%% done",
8500              context->ha->share->full_table_name(),
8501              percentage);
8502     thd_proc_info(context->thd, context->write_status_msg);
8503 #ifdef HA_TOKUDB_HAS_THD_PROGRESS
8504     thd_progress_report(context->thd, (unsigned long long)percentage, 100);
8505 #endif
8506     return 0;
8507 }
8508 
8509 //
8510 // Internal function called by ha_tokudb::add_index and ha_tokudb::alter_table_phase2
8511 // Closes added indexes in case of error in error path of add_index and alter_table_phase2
8512 //
restore_add_index(TABLE * table_arg,uint num_of_keys,bool incremented_numDBs,bool modified_DBs)8513 void ha_tokudb::restore_add_index(
8514     TABLE* table_arg,
8515     uint num_of_keys,
8516     bool incremented_numDBs,
8517     bool modified_DBs) {
8518 
8519     uint curr_num_DBs = table_arg->s->keys + tokudb_test(hidden_primary_key);
8520     uint curr_index = 0;
8521 
8522     //
8523     // need to restore num_DBs, and we have to do it before we close the dictionaries
8524     // so that there is not a window
8525     //
8526     if (incremented_numDBs) {
8527         rwlock_t_lock_write(share->_num_DBs_lock);
8528         share->num_DBs--;
8529     }
8530     if (modified_DBs) {
8531         curr_index = curr_num_DBs;
8532         for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8533             reset_key_and_col_info(&share->kc_info, curr_index);
8534         }
8535         curr_index = curr_num_DBs;
8536         for (uint i = 0; i < num_of_keys; i++, curr_index++) {
8537             if (share->key_file[curr_index]) {
8538                 int r = share->key_file[curr_index]->close(
8539                     share->key_file[curr_index],
8540                     0);
8541                 assert_always(r==0);
8542                 share->key_file[curr_index] = NULL;
8543             }
8544         }
8545     }
8546     if (incremented_numDBs) {
8547         share->_num_DBs_lock.unlock();
8548     }
8549 }
8550 
8551 //
8552 // Internal function called by ha_tokudb::prepare_drop_index and ha_tokudb::alter_table_phase2
8553 // With a transaction, drops dictionaries associated with indexes in key_num
8554 //
drop_indexes(uint * key_num,uint num_of_keys,KEY * key_info,DB_TXN * txn)8555 int ha_tokudb::drop_indexes(uint* key_num,
8556                             uint num_of_keys,
8557                             KEY* key_info,
8558                             DB_TXN* txn) {
8559     TOKUDB_HANDLER_DBUG_ENTER("");
8560     assert_always(txn);
8561 
8562     int error = 0;
8563     for (uint i = 0; i < num_of_keys; i++) {
8564         uint curr_index = key_num[i];
8565         error = share->key_file[curr_index]->pre_acquire_fileops_lock(
8566             share->key_file[curr_index],
8567             txn);
8568         if (error != 0) {
8569             goto cleanup;
8570         }
8571     }
8572     for (uint i = 0; i < num_of_keys; i++) {
8573         uint curr_index = key_num[i];
8574         int r = share->key_file[curr_index]->close(share->key_file[curr_index],0);
8575         assert_always(r==0);
8576         share->key_file[curr_index] = NULL;
8577 
8578         error = remove_key_name_from_status(
8579             share->status_block,
8580             key_info[curr_index].name,
8581             txn);
8582         if (error) {
8583             goto cleanup;
8584         }
8585 
8586         error = delete_or_rename_dictionary(
8587             share->full_table_name(),
8588             NULL,
8589             key_info[curr_index].name,
8590             true,
8591             txn,
8592             true);
8593         if (error) {
8594             goto cleanup;
8595         }
8596     }
8597 
8598 cleanup:
8599     if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
8600         error == DB_LOCK_NOTGRANTED) {
8601         sql_print_error(
8602             "Could not drop indexes from table %s because another transaction "
8603             "has accessed the table. To drop indexes, make sure no "
8604             "transactions touch the table.",
8605             share->full_table_name());
8606     }
8607     TOKUDB_HANDLER_DBUG_RETURN(error);
8608 }
8609 
8610 //
8611 // Internal function called by ha_tokudb::prepare_drop_index and
8612 // ha_tokudb::alter_table_phase2
8613 // Restores dropped indexes in case of error in error path of
8614 // prepare_drop_index and alter_table_phase2
8615 //
restore_drop_indexes(uint * key_num,uint num_of_keys)8616 void ha_tokudb::restore_drop_indexes(uint* key_num, uint num_of_keys) {
8617     //
8618     // reopen closed dictionaries
8619     //
8620     for (uint i = 0; i < num_of_keys; i++) {
8621         int r;
8622         uint curr_index = key_num[i];
8623         if (share->key_file[curr_index] == NULL) {
8624             r = open_secondary_dictionary(
8625                 &share->key_file[curr_index],
8626                 &table_share->key_info[curr_index],
8627                 share->full_table_name(),
8628                 false,
8629                 NULL);
8630             assert_always(!r);
8631         }
8632     }
8633 }
8634 
map_to_handler_error(int error)8635 int ha_tokudb::map_to_handler_error(int error) {
8636     switch (error) {
8637     case DB_LOCK_DEADLOCK:
8638         error = HA_ERR_LOCK_DEADLOCK;
8639         break;
8640     case DB_LOCK_NOTGRANTED:
8641         error = HA_ERR_LOCK_WAIT_TIMEOUT;
8642         break;
8643 #if defined(HA_ERR_DISK_FULL)
8644     case ENOSPC:
8645         error = HA_ERR_DISK_FULL;
8646         break;
8647 #endif
8648     case DB_KEYEXIST:
8649         error = HA_ERR_FOUND_DUPP_KEY;
8650         break;
8651 #if defined(HA_ALTER_ERROR)
8652     case HA_ALTER_ERROR:
8653         error = HA_ERR_UNSUPPORTED;
8654         break;
8655 #endif
8656     case TOKUDB_INTERRUPTED:
8657         error = ER_QUERY_INTERRUPTED;
8658         break;
8659     case TOKUDB_OUT_OF_LOCKS:
8660         error = HA_ERR_LOCK_TABLE_FULL;
8661         break;
8662     }
8663     return error;
8664 }
8665 
print_error(int error,myf errflag)8666 void ha_tokudb::print_error(int error, myf errflag) {
8667     error = map_to_handler_error(error);
8668     handler::print_error(error, errflag);
8669 }
8670 
8671 //
8672 // truncate's dictionary associated with keynr index using transaction txn
8673 // does so by deleting and then recreating the dictionary in the context
8674 // of a transaction
8675 //
truncate_dictionary(uint keynr,DB_TXN * txn)8676 int ha_tokudb::truncate_dictionary(uint keynr, DB_TXN* txn) {
8677     int error;
8678     bool is_pk = (keynr == primary_key);
8679 
8680     toku_compression_method compression_method =
8681         get_compression_method(share->key_file[keynr]);
8682     error = share->key_file[keynr]->close(share->key_file[keynr], 0);
8683     assert_always(error == 0);
8684 
8685     share->key_file[keynr] = NULL;
8686     if (is_pk) {
8687         share->file = NULL;
8688     }
8689 
8690     if (is_pk) {
8691         error = delete_or_rename_dictionary(
8692             share->full_table_name(),
8693             NULL,
8694             "main",
8695             false, //is_key
8696             txn,
8697             true); // is a delete
8698         if (error) {
8699             goto cleanup;
8700         }
8701     } else {
8702         error = delete_or_rename_dictionary(
8703             share->full_table_name(),
8704             NULL,
8705             table_share->key_info[keynr].name,
8706             true, //is_key
8707             txn,
8708             true); // is a delete
8709         if (error) {
8710             goto cleanup;
8711         }
8712     }
8713 
8714     if (is_pk) {
8715         error = create_main_dictionary(
8716             share->full_table_name(),
8717             table,
8718             txn,
8719             &share->kc_info,
8720             compression_method);
8721     } else {
8722         error = create_secondary_dictionary(
8723             share->full_table_name(),
8724             table,
8725             &table_share->key_info[keynr],
8726             txn,
8727             &share->kc_info,
8728             keynr,
8729             false,
8730             compression_method);
8731     }
8732     if (error) {
8733         goto cleanup;
8734     }
8735 
8736 cleanup:
8737     return error;
8738 }
8739 
8740 // for 5.5
truncate()8741 int ha_tokudb::truncate() {
8742     TOKUDB_HANDLER_DBUG_ENTER("");
8743     int error = delete_all_rows_internal();
8744     TOKUDB_HANDLER_DBUG_RETURN(error);
8745 }
8746 
8747 // delete all rows from a table
8748 //
8749 // effects: delete all of the rows in the main dictionary and all of the
8750 // indices.  this must be atomic, so we use the statement transaction
8751 // for all of the truncate operations.
8752 // locks:  if we have an exclusive table write lock, all of the concurrency
8753 // issues go away.
8754 // returns: 0 if success
delete_all_rows()8755 int ha_tokudb::delete_all_rows() {
8756     TOKUDB_HANDLER_DBUG_ENTER("");
8757     int error = 0;
8758     if (thd_sql_command(ha_thd()) != SQLCOM_TRUNCATE) {
8759         share->try_table_lock = true;
8760         error = HA_ERR_WRONG_COMMAND;
8761     }
8762     if (error == 0)
8763         error = delete_all_rows_internal();
8764     TOKUDB_HANDLER_DBUG_RETURN(error);
8765 }
8766 
// Drop and recreate every dictionary (main + secondaries) of this table
// inside a single transaction, then reset the row count and auto-increment
// counter. Used by TRUNCATE TABLE. Returns 0 on success, a TokuFT/handler
// error code otherwise.
int ha_tokudb::delete_all_rows_internal() {
    TOKUDB_HANDLER_DBUG_ENTER("");
    int error = 0;
    uint curr_num_DBs = 0;
    DB_TXN* txn = NULL;

    // this should be enough to handle locking as the higher level MDL
    // on this table should prevent any new analyze tasks.
    share->cancel_background_jobs();

    error = txn_begin(db_env, 0, &txn, 0, ha_thd());
    if (error) {
        goto cleanup;
    }

    // Take fileops and table locks on every dictionary up front, so the
    // truncate either proceeds on all of them or fails before any is
    // modified.
    curr_num_DBs = table->s->keys + tokudb_test(hidden_primary_key);
    for (uint i = 0; i < curr_num_DBs; i++) {
        error = share->key_file[i]->pre_acquire_fileops_lock(
            share->key_file[i],
            txn);
        if (error) {
            goto cleanup;
        }
        error = share->key_file[i]->pre_acquire_table_lock(
            share->key_file[i],
            txn);
        if (error) {
            goto cleanup;
        }
    }
    // Delete + recreate each dictionary under txn; on failure the abort
    // in cleanup rolls back everything done so far.
    for (uint i = 0; i < curr_num_DBs; i++) {
        error = truncate_dictionary(i, txn);
        if (error) {
            goto cleanup;
        }
    }

    DEBUG_SYNC(ha_thd(), "tokudb_after_truncate_all_dictionarys");

    // zap the row count
    if (error == 0) {
        share->set_row_count(0, false);
        // update auto increment
        share->last_auto_increment = 0;
        // calling write_to_status directly because we need to use txn
        write_to_status(
            share->status_block,
            hatoku_max_ai,
            &share->last_auto_increment,
            sizeof(share->last_auto_increment),
            txn);
    }

    share->try_table_lock = true;
cleanup:
    if (txn) {
        if (error) {
            abort_txn(txn);
        } else {
            commit_txn(txn,0);
        }
    }

    if (TOKUDB_LIKELY(TOKUDB_DEBUG_FLAGS(
        TOKUDB_DEBUG_HIDE_DDL_LOCK_ERRORS) == 0) &&
        error == DB_LOCK_NOTGRANTED) {
        sql_print_error(
            "Could not truncate table %s because another transaction has "
            "accessed the table. To truncate the table, make sure no "
            "transactions touch the table.",
            share->full_table_name());
    }
    //
    // regardless of errors, need to reopen the DB's
    //
    // truncate_dictionary() leaves share->key_file[i] NULL after closing
    // it, so any NULL slot here is a dictionary that must be reopened,
    // even on the error path.
    for (uint i = 0; i < curr_num_DBs; i++) {
        int r = 0;
        if (share->key_file[i] == NULL) {
            if (i != primary_key) {
                r = open_secondary_dictionary(
                        &share->key_file[i],
                        &table_share->key_info[i],
                        share->full_table_name(),
                        false,
                        NULL);
                assert_always(!r);
            } else {
                r = open_main_dictionary(
                       share->full_table_name(),
                        false,
                        NULL);
                assert_always(!r);
            }
        }
    }
    TOKUDB_HANDLER_DBUG_RETURN(error);
}
8864 
// Record an error raised asynchronously by the bulk loader so it can be
// reported to the caller later (see the loader_error use at the end of
// the add-index path).
void ha_tokudb::set_loader_error(int err) {
    loader_error = err;
}
8868 
// On a duplicate-key conflict on the primary key, unpack the conflicting
// key into record[0] and remember the PK index as the duplicate key.
// Only valid when the table has an explicit (non-hidden) primary key.
void ha_tokudb::set_dup_value_for_pk(DBT* key) {
    assert_always(!hidden_primary_key);
    unpack_key(table->record[0],key,primary_key);
    last_dup_key = primary_key;
}
8874 
8875 // we cache the information so we can do filtering ourselves,
8876 // but as far as MySQL knows, we are not doing any filtering,
8877 // so if we happen to miss filtering a row that does not match
8878 // idx_cond_arg, MySQL will catch it.
8879 // This allows us the ability to deal with only index_next and index_prev,
8880 // and not need to worry about other index_XXX functions
Item* ha_tokudb::idx_cond_push(uint keyno_arg, Item* idx_cond_arg) {
    // Cache the pushed index condition and its key number for our own
    // filtering; returning the condition unchanged tells the server to
    // keep evaluating it too (see the comment above this function).
    toku_pushed_idx_cond_keyno = keyno_arg;
    toku_pushed_idx_cond = idx_cond_arg;
    return idx_cond_arg;
}
8886 
void ha_tokudb::cancel_pushed_idx_cond() {
    // Clear our cached ICP state, then let the base handler clear its own.
    invalidate_icp();
    handler::cancel_pushed_idx_cond();
}
8891 
cleanup_txn(DB_TXN * txn)8892 void ha_tokudb::cleanup_txn(DB_TXN *txn) {
8893     if (transaction == txn && cursor) {
8894         int r = cursor->c_close(cursor);
8895         assert_always(r == 0);
8896         cursor = NULL;
8897     }
8898 }
8899 
// Link this handler into the per-connection tokudb transaction's list of
// open handlers.
void ha_tokudb::add_to_trx_handler_list() {
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    trx->handlers = list_add(trx->handlers, &trx_handler_list);
}
8905 
// Unlink this handler from the per-connection tokudb transaction's list
// of open handlers.
void ha_tokudb::remove_from_trx_handler_list() {
    tokudb_trx_data* trx =
        (tokudb_trx_data*)thd_get_ha_data(ha_thd(), tokudb_hton);
    trx->handlers = list_delete(trx->handlers, &trx_handler_list);
}
8911 
8912 #if defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
// Flag that we are inside a replication Write_rows event.
void ha_tokudb::rpl_before_write_rows() {
    in_rpl_write_rows = true;
}
8916 
// Flag that the replication Write_rows event has finished.
void ha_tokudb::rpl_after_write_rows() {
    in_rpl_write_rows = false;
}
8920 
// Flag that we are inside a replication Delete_rows event.
void ha_tokudb::rpl_before_delete_rows() {
    in_rpl_delete_rows = true;
}
8924 
// Flag that the replication Delete_rows event has finished.
void ha_tokudb::rpl_after_delete_rows() {
    in_rpl_delete_rows = false;
}
8928 
// Flag that we are inside a replication Update_rows event.
void ha_tokudb::rpl_before_update_rows() {
    in_rpl_update_rows = true;
}
8932 
// Flag that the replication Update_rows event has finished.
void ha_tokudb::rpl_after_update_rows() {
    in_rpl_update_rows = false;
}
8936 
rpl_lookup_rows()8937 bool ha_tokudb::rpl_lookup_rows() {
8938     if (!in_rpl_delete_rows && !in_rpl_update_rows)
8939         return true;
8940     else
8941         return tokudb::sysvars::rpl_lookup_rows(ha_thd());
8942 }
8943 #endif // defined(TOKU_INCLUDE_RFR) && TOKU_INCLUDE_RFR
8944 
8945 // table admin
8946 #include "ha_tokudb_admin.cc"
8947 
8948 // update functions
8949 #include "tokudb_update_fun.cc"
8950 
8951 // fast updates
8952 #if defined(TOKU_INCLUDE_UPSERT) && TOKU_INCLUDE_UPSERT
8953 #include "ha_tokudb_update.cc"
8954 #endif  // defined(TOKU_INCLUDE_UPSERT) && TOKU_INCLUDE_UPSERT
8955 
8956 // alter table
8957 #include "ha_tokudb_alter.cc"
8958 
8959 // key comparisons
8960 #include "hatoku_cmp.cc"
8961 
8962 // mrr
8963 #include "ha_tokudb_mrr_mysql.cc"
8964 
8965 // handlerton
8966 #include "ha_tokupart.cc"
8967 #include "hatoku_hton.cc"
8968 
8969 // generate template functions
// Explicit instantiations of the 32- and 64-bit variants of the VLQ
// (variable-length quantity) encode/decode function templates, so their
// definitions are emitted in this translation unit.
namespace tokudb {
    template size_t vlq_encode_ui(uint32_t n, void *p, size_t s);
    template size_t vlq_decode_ui(uint32_t *np, void *p, size_t s);
    template size_t vlq_encode_ui(uint64_t n, void *p, size_t s);
    template size_t vlq_decode_ui(uint64_t *np, void *p, size_t s);
};
8976