/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.

Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License, version 3,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
======= */

#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."

#pragma once

#include "cachetable/background_job_manager.h"
#include <util/frwlock.h>
#include <util/kibbutz.h>
#include <util/minicron.h>
#include <util/nb_mutex.h>
#include <util/partitioned_counter.h>

//////////////////////////////////////////////////////////////////////////////
//
// This file contains the classes and structs that make up the cachetable.
// The structs are:
//  - cachefile
//  - ctpair
//  - pair_list
//  - cachefile_list
//  - checkpointer
//  - evictor
//  - cleaner
//
// The rest of this comment assumes familiarity with the locks used in these
// classes/structs and what the locks protect. Nevertheless, here is
// a list of the locks that we have:
//  - pair_list->list_lock
//  - pair_list->pending_lock_expensive
//  - pair_list->pending_lock_cheap
//  - cachefile_list->lock
//  - PAIR->mutex
//  - PAIR->value_rwlock
//  - PAIR->disk_nb_mutex
//
// Here are the rules for how the locks interact:
//  - To grab any of the pair_list's locks, or the cachefile_list's lock,
//    the cachetable must be in existence.
//  - To grab the PAIR mutex, we must know the PAIR will not disappear:
//    - the PAIR must be pinned (value_rwlock or disk_nb_mutex is held), or
//    - the pair_list's list lock is held.
//  - As a result, to remove a PAIR from the pair_list, we must hold
//    both the pair_list's list_lock and the PAIR's mutex.
//  - To grab PAIR->value_rwlock, we must hold the PAIR's mutex.
//  - To grab PAIR->disk_nb_mutex, we must hold the PAIR's mutex
//    and hold PAIR->value_rwlock.
//
// Now let's talk about ordering. Here is the order from outer to inner
// (top locks must be grabbed first):
//  - pair_list->pending_lock_expensive
//  - pair_list->list_lock
//  - cachefile_list->lock
//  - PAIR->mutex
//  - pair_list->pending_lock_cheap <-- after grabbing this lock,
//                                      NO other locks
//                                      should be grabbed.
//  - when grabbing PAIR->value_rwlock or PAIR->disk_nb_mutex,
//    if the acquisition will not block, then it does not matter what other
//    locks are held, BUT if the acquisition will block, then NO other locks
//    may be held besides PAIR->mutex.
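//
// For illustration only, here is a sketch (not code that appears in this
// file) of a client-thread acquisition sequence that respects this ordering,
// written against the pair_list methods declared further down; pl, cf, key
// and fullhash are placeholders, and the elided steps are summarized:
//
//     pl->read_list_lock();                   // pair_list->list_lock
//     PAIR p = pl->find_pair(cf, key, fullhash);
//     pl->pair_lock_by_fullhash(fullhash);    // PAIR->mutex
//     pl->read_list_unlock();
//     // ... pin p (PAIR->value_rwlock) ...
//     pl->read_pending_cheap_lock();          // pending_lock_cheap, grabbed last
//     // ... copy and clear p->checkpoint_pending ...
//     pl->read_pending_cheap_unlock();
//     // (release of PAIR->mutex and the unpin path are elided)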
//
// HERE ARE TWO EXAMPLES:
// To pin a PAIR on a client thread, the following must be done:
//  - first grab the list lock and find the PAIR
//  - with the list lock grabbed, grab PAIR->mutex
//  - with PAIR->mutex held:
//     - release the list lock
//     - pin the PAIR
//     - with the PAIR pinned, grab pending_lock_cheap,
//     - copy and clear PAIR->checkpoint_pending,
//     - resolve checkpointing if necessary
//     - return to the user.
// The list lock may be held while pinning the PAIR if
// the PAIR has no contention. Otherwise, we may get
// a deadlock with another thread that has the PAIR pinned,
// tries to pin some other PAIR, and in doing so, grabs the list lock.
//
// To unpin a PAIR on a client thread:
//  - because the PAIR is pinned, we don't need the pair_list's list_lock
//  - so, simply acquire PAIR->mutex
//  - unpin the PAIR
//  - return
//
//////////////////////////////////////////////////////////////////////////////

class evictor;
class pair_list;

///////////////////////////////////////////////////////////////////////////////
//
// Maps to a file on disk.
//
struct cachefile {
    // These next two fields are protected by the cachetable's list lock.
    // They are managed whenever we add or remove a pair from
    // the cachetable. As of Riddler, this linked list is only used to
    // make cachetable_flush_cachefile more efficient.
    PAIR cf_head;       // doubly linked list that is NOT circular
    uint32_t num_pairs; // count of the number of pairs in the cachetable that belong to this cachefile

    bool for_checkpoint; // true if part of the in-progress checkpoint

    // If set and the cachefile closes, the file will be removed.
    // Clients must not operate on the cachefile after setting this,
    // nor attempt to open any cachefile with the same fname (dname)
    // until this cachefile has been fully closed and unlinked.
    bool unlink_on_close;
    // If set then fclose will not be logged in the recovery log.
    bool skip_log_recover_on_close;

    int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only.  If it is opened read-write, then subsequent writers can write to it too. */
    CACHETABLE cachetable;
    struct fileid fileid;
    // the filenum is used as an identifier of the cachefile
    // for logging and recovery
    FILENUM filenum;
    // number used to generate hashes for blocks in the cachefile
    // (used in toku_cachetable_hash)
    // this used to be the filenum.fileid, but now it is separate
    uint32_t hash_id;
    char *fname_in_env; /* Used for logging */

    void *userdata;
    void (*log_fassociate_during_checkpoint)(CACHEFILE cf, void *userdata); // When starting a checkpoint we must log all open files.
    void (*close_userdata)(CACHEFILE cf, int fd, void *userdata, bool lsnvalid, LSN); // when closing the last reference to a cachefile, first call this function.
    void (*free_userdata)(CACHEFILE cf, void *userdata); // when closing the last reference to a cachefile, first call this function.
    void (*begin_checkpoint_userdata)(LSN lsn_of_checkpoint, void *userdata); // before checkpointing cachefiles call this function.
    void (*checkpoint_userdata)(CACHEFILE cf, int fd, void *userdata); // when checkpointing a cachefile, call this function.
    void (*end_checkpoint_userdata)(CACHEFILE cf, int fd, void *userdata); // after checkpointing cachefiles call this function.
    void (*note_pin_by_checkpoint)(CACHEFILE cf, void *userdata); // add a reference to the userdata to prevent it from being removed from memory
    void (*note_unpin_by_checkpoint)(CACHEFILE cf, void *userdata); // release the reference to the userdata taken by note_pin_by_checkpoint
    BACKGROUND_JOB_MANAGER bjm;
};

///////////////////////////////////////////////////////////////////////////////
//
// The pair represents the data stored in the cachetable.
//
struct ctpair {
    // these fields are essentially constants. They do not change.
    CACHEFILE cachefile;
    CACHEKEY key;
    uint32_t fullhash;
    CACHETABLE_FLUSH_CALLBACK flush_callback;
    CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK pe_est_callback;
    CACHETABLE_PARTIAL_EVICTION_CALLBACK pe_callback;
    CACHETABLE_CLEANER_CALLBACK cleaner_callback;
    CACHETABLE_CLONE_CALLBACK clone_callback;
    CACHETABLE_CHECKPOINT_COMPLETE_CALLBACK checkpoint_complete_callback;
    void *write_extraargs;

    // access to these fields is protected by disk_nb_mutex
    void* cloned_value_data; // cloned copy of value_data used for checkpointing
    long cloned_value_size;  // size of cloned_value_data, used for accounting of size_current
    void* disk_data;         // data used to fetch/flush value_data to and from disk.

    // access to these fields is protected by value_rwlock
    void* value_data; // data used by client threads, FTNODEs and ROLLBACK_LOG_NODEs
    PAIR_ATTR attr;
    enum cachetable_dirty dirty;

    // protected by PAIR->mutex
    uint32_t count;    // clock count
    uint32_t refcount; // if > 0, then this PAIR is referenced by
                       // callers to the cachetable, and therefore cannot
                       // be evicted
    uint32_t num_waiting_on_refs; // number of threads waiting on refcount to go to zero
    toku_cond_t refcount_wait;    // cond used to wait for refcount to go to zero

    // locks
    toku::frwlock value_rwlock;
    struct nb_mutex disk_nb_mutex; // single writer, protects disk_data, is used for writing cloned nodes for checkpoint
    toku_mutex_t* mutex;           // gotten from the pair list

    // Access to checkpoint_pending is protected by two mechanisms,
    // the value_rwlock and the pair_list's pending locks (expensive and cheap).
    // checkpoint_pending may be true or false.
    // Here are the rules for reading/modifying this bit:
    //  - To transition this field from false to true during begin_checkpoint,
    //    we must be holding both of the pair_list's pending locks.
    //  - To transition this field from true to false during end_checkpoint,
    //    we must be holding the value_rwlock.
    //  - For a non-checkpoint thread to read the value, we must hold both the
    //    value_rwlock and one of the pair_list's pending locks.
    //  - For the checkpoint thread to read the value, we must
    //    hold the value_rwlock.
    //
    bool checkpoint_pending; // If this is on, then we have got to resolve checkpointing before modifying it.

    // These variables are only used to transfer information to background
    // threads. We cache them here to avoid a malloc. In the future, we should
    // investigate whether this is necessary, as having these fields here is
    // not technically necessary.
    long size_evicting_estimate;
    evictor* ev;
    pair_list* list;

    // A PAIR is stored in a pair_list (which happens to be PAIR->list).
    // These variables are protected by the list lock in the pair_list.
    //
    // clock_next, clock_prev represent a circular doubly-linked list.
    PAIR clock_next, clock_prev; // In clock.
    PAIR hash_chain;

    // pending_next, pending_prev represent a non-circular doubly-linked list.
    PAIR pending_next;
    PAIR pending_prev;

    // cf_next, cf_prev represent a non-circular doubly-linked list.
    // These are the entries in the linked list of PAIRs in a cachefile; they
    // are protected by the list lock of the PAIR's pair_list. They are used
    // to make cachetable_flush_cachefile cheaper so that we don't need to
    // search the entire cachetable to find a particular cachefile's PAIRs.
    PAIR cf_next;
    PAIR cf_prev;
};

//
// This initializes the fields and members of the pair.
//
void pair_init(PAIR p,
    CACHEFILE cachefile,
    CACHEKEY key,
    void *value,
    PAIR_ATTR attr,
    enum cachetable_dirty dirty,
    uint32_t fullhash,
    CACHETABLE_WRITE_CALLBACK write_callback,
    evictor *ev,
    pair_list *list);

///////////////////////////////////////////////////////////////////////////////
//
// The pair list maintains the set of PAIRs that make up
// the cachetable.
//
class pair_list {
public:
    //
    // the following fields are protected by the list lock
    //
    uint32_t m_n_in_table; // number of pairs in the hash table
    uint32_t m_table_size; // number of buckets in the hash table
    uint32_t m_num_locks;
    PAIR *m_table;         // hash table
    toku_mutex_aligned_t *m_mutexes;
    //
    // The following fields are the heads of various linked lists.
    // They are also protected by the list lock, but their
    // usage is not as straightforward. For each of them,
    // only ONE thread is allowed to iterate over them with
    // a read lock on the list lock. All other threads
    // that want to modify elements in the lists or iterate over
    // the lists must hold the write list lock. Here is the
    // association between each list and the thread that may hold a read
    // lock on the list lock while iterating over it:
    //  - clock_head -> eviction thread (evictor)
    //  - cleaner_head -> cleaner thread (cleaner)
    //  - pending_head -> checkpoint thread (checkpointer)
    //
    PAIR m_clock_head;      // head of the clock; the head is the next thing to be up for decrement.
    PAIR m_cleaner_head;    // for the cleaner thread; the head is the next thing to look at for possible cleaning.
    PAIR m_checkpoint_head; // for begin_checkpoint to iterate over PAIRs and mark them checkpoint_pending
    PAIR m_pending_head;    // list of pairs marked with checkpoint_pending

    // this field is public so we are still POD
    // usage of this lock is described above
    toku_pthread_rwlock_t m_list_lock;
    //
    // These locks are the "pending locks" referenced
    // in comments about PAIR->checkpoint_pending. There
    // are two of them, but both serve the same purpose, which
    // is to protect the transition of a PAIR's checkpoint_pending
    // value from false to true during begin_checkpoint.
    // We use two locks because threads that want to read the
    // checkpoint_pending value may hold a lock for varying periods of time.
    // Threads running eviction may need to protect checkpoint_pending
    // while writing a node to disk, which is an expensive operation,
    // so eviction uses pending_lock_expensive. Client threads that
    // want to pin PAIRs need to protect checkpoint_pending
    // just long enough to read the value and wipe it out. This is
    // a cheap operation, and as a result, uses pending_lock_cheap.
    //
    // By having two locks, and making begin_checkpoint first
    // grab pending_lock_expensive and then pending_lock_cheap,
    // we ensure that threads that want to pin nodes can grab
    // only pending_lock_cheap, and never block behind threads
    // holding pending_lock_expensive and writing a node out to disk.
    //
    toku_pthread_rwlock_t m_pending_lock_expensive;
    toku_pthread_rwlock_t m_pending_lock_cheap;

    void init();
    void destroy();
    void evict_completely(PAIR pair);
    void evict_from_cachetable(PAIR pair);
    void evict_from_cachefile(PAIR pair);
    void add_to_cachetable_only(PAIR p);
    void put(PAIR pair);
    PAIR find_pair(CACHEFILE file, CACHEKEY key, uint32_t hash);
    void pending_pairs_remove (PAIR p);
    void verify();
    void get_state(int *num_entries, int *hash_size);
    void read_list_lock();
    void read_list_unlock();
    void write_list_lock();
    void write_list_unlock();
    void read_pending_exp_lock();
    void read_pending_exp_unlock();
    void write_pending_exp_lock();
    void write_pending_exp_unlock();
    void read_pending_cheap_lock();
    void read_pending_cheap_unlock();
    void write_pending_cheap_lock();
    void write_pending_cheap_unlock();
    toku_mutex_t* get_mutex_for_pair(uint32_t fullhash);
    void pair_lock_by_fullhash(uint32_t fullhash);
    void pair_unlock_by_fullhash(uint32_t fullhash);

private:
    void pair_remove (PAIR p);
    void remove_from_hash_chain(PAIR p);
    void add_to_cf_list (PAIR p);
    void add_to_clock (PAIR p);
    void add_to_hash_chain(PAIR p);
};

///////////////////////////////////////////////////////////////////////////////
//
// Wrapper for the head of our cachefile list.
//
class cachefile_list {
public:
    void init();
    void destroy();
    void read_lock();
    void read_unlock();
    void write_lock();
    void write_unlock();
    int cachefile_of_iname_in_env(const char *iname_in_env, CACHEFILE *cf);
    int cachefile_of_filenum(FILENUM filenum, CACHEFILE *cf);
    void add_cf_unlocked(CACHEFILE newcf);
    void add_stale_cf(CACHEFILE newcf);
    void remove_cf(CACHEFILE cf);
    void remove_stale_cf_unlocked(CACHEFILE cf);
    FILENUM reserve_filenum();
    uint32_t get_new_hash_id_unlocked();
    CACHEFILE find_cachefile_unlocked(struct fileid* fileid);
    CACHEFILE find_stale_cachefile_unlocked(struct fileid* fileid);
    void verify_unused_filenum(FILENUM filenum);
    bool evict_some_stale_pair(evictor* ev);
    void free_stale_data(evictor* ev);
    // access to these fields is protected by the lock
    FILENUM m_next_filenum_to_use;
    uint32_t m_next_hash_id_to_use;
    toku_pthread_rwlock_t m_lock; // this field is public so we are still POD
    toku::omt<CACHEFILE> m_active_filenum;
    toku::omt<CACHEFILE> m_active_fileid;
    toku::omt<CACHEFILE> m_stale_fileid;
private:
    CACHEFILE find_cachefile_in_list_unlocked(CACHEFILE start, struct fileid* fileid);
};

///////////////////////////////////////////////////////////////////////////////
//
// The checkpointer handles starting and finishing checkpoints of the
// cachetable's data.
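//
// For illustration only (a sketch, not the code that drives checkpoints):
// a full checkpoint is a begin_checkpoint() call, which marks the PAIRs
// checkpoint_pending, followed by an end_checkpoint() call, which writes
// the pending PAIRs out and completes the checkpoint; the trailing
// arguments of end_checkpoint() are a test hook, assumed here to be
// omittable by passing null:
//
//     cp->begin_checkpoint();
//     cp->end_checkpoint(nullptr, nullptr);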
//
class checkpointer {
public:
    int init(pair_list *_pl, TOKULOGGER _logger, evictor *_ev, cachefile_list *files);
    void destroy();
    void set_checkpoint_period(uint32_t new_period);
    uint32_t get_checkpoint_period();
    int shutdown();
    bool has_been_shutdown();
    void begin_checkpoint();
    void add_background_job();
    void remove_background_job();
    void end_checkpoint(void (*testcallback_f)(void*), void* testextra);
    TOKULOGGER get_logger();
    // used during begin_checkpoint
    void increment_num_txns();
private:
    uint32_t m_checkpoint_num_txns;  // how many transactions are in the checkpoint
    TOKULOGGER m_logger;
    LSN m_lsn_of_checkpoint_in_progress;
    uint32_t m_checkpoint_num_files; // how many cachefiles are in the checkpoint
    struct minicron m_checkpointer_cron; // the periodic checkpointing thread
    cachefile_list *m_cf_list;
    pair_list *m_list;
    evictor *m_ev;
    bool m_checkpointer_cron_init;
    bool m_checkpointer_init;

    // variable used by the checkpoint thread to know
    // when all work induced by cloning on client threads is done
    BACKGROUND_JOB_MANAGER m_checkpoint_clones_bjm;

    // private methods for begin_checkpoint
    void update_cachefiles();
    void log_begin_checkpoint();
    void turn_on_pending_bits();

    // private methods for end_checkpoint
    void fill_checkpoint_cfs(CACHEFILE* checkpoint_cfs);
    void checkpoint_pending_pairs();
    void checkpoint_userdata(CACHEFILE* checkpoint_cfs);
    void log_end_checkpoint();
    void end_checkpoint_userdata(CACHEFILE* checkpoint_cfs);
    void remove_cachefiles(CACHEFILE* checkpoint_cfs);

    // Unit test struct needs access to private members.
    friend struct checkpointer_test;
};

//
// This is how often we want the eviction thread
// to run, in seconds.
//
const int EVICTION_PERIOD = 1;

///////////////////////////////////////////////////////////////////////////////
//
// The evictor handles the removal of pairs from the pair list/cachetable.
//
class evictor {
public:
    int init(long _size_limit, pair_list* _pl, cachefile_list* _cf_list, KIBBUTZ _kibbutz, uint32_t eviction_period);
    void destroy();
    void add_pair_attr(PAIR_ATTR attr);
    void remove_pair_attr(PAIR_ATTR attr);
    void change_pair_attr(PAIR_ATTR old_attr, PAIR_ATTR new_attr);
    void add_cloned_data_size(long size);
    void remove_cloned_data_size(long size);
    uint64_t reserve_memory(double fraction, uint64_t upper_bound);
    void release_reserved_memory(uint64_t reserved_memory);
    void run_eviction_thread();
    void do_partial_eviction(PAIR p);
    void evict_pair(PAIR p, bool checkpoint_pending);
    void wait_for_cache_pressure_to_subside();
    void signal_eviction_thread();
    void signal_eviction_thread_locked();
    bool should_client_thread_sleep();
    bool should_client_wake_eviction_thread();
    // function needed for testing
    void get_state(long *size_current_ptr, long *size_limit_ptr);
    void fill_engine_status();
    void set_enable_partial_eviction(bool enabled);
    bool get_enable_partial_eviction(void) const;
private:
    void add_to_size_current(long size);
    void remove_from_size_current(long size);
    void run_eviction();
    bool run_eviction_on_pair(PAIR p);
    void try_evict_pair(PAIR p);
    void decrease_size_evicting(long size_evicting_estimate);
    bool should_sleeping_clients_wakeup();
    bool eviction_needed();

    // We have some intentional races with these variables because we're ok
    // with reading something a little bit old.
    // Provide some hooks for reading variables in an unsafe way so that
    // there are function names we can stick in a valgrind suppression.
    int64_t unsafe_read_size_current(void) const;
    int64_t unsafe_read_size_evicting(void) const;

    pair_list* m_pl;
    cachefile_list* m_cf_list;
    int64_t m_size_current;     // the sum of the sizes of the pairs in the cachetable
    int64_t m_size_cloned_data; // stores the amount of cloned data we have, only used for engine status
    // changes to these two values are protected
    // by ev_thread_lock
    int64_t m_size_reserved;    // How much memory is reserved (e.g., by the loader)
    int64_t m_size_evicting;    // the sum of the sizes of the pairs being written

    // these are constants
    int64_t m_low_size_watermark;   // target max size of the cachetable that the eviction thread aims for
    int64_t m_low_size_hysteresis;  // if the cachetable grows to this size, client threads wake up the eviction thread upon adding data
    int64_t m_high_size_watermark;  // if the cachetable grows to this size, client threads sleep upon adding data
    int64_t m_high_size_hysteresis; // if > cachetable size, then sleeping client threads may wake up

    bool m_enable_partial_eviction; // true if partial evictions are permitted

    // used to calculate random numbers
    struct random_data m_random_data;
    char m_random_statebuf[64];

    // mutex that protects the fields listed immediately below
    toku_mutex_t m_ev_thread_lock;
    // the eviction thread
    toku_pthread_t m_ev_thread;
    // condition variable that controls the sleeping period
    // of the eviction thread
    toku_cond_t m_ev_thread_cond;
    // number of client threads that are currently sleeping
    // due to an over-subscribed cachetable
    uint32_t m_num_sleepers;
    // states if the eviction thread should run. Set to true
    // in init, set to false during destroy.
    bool m_run_thread;
    // bool that states if the eviction thread is currently running
    bool m_ev_thread_is_running;
    // period for which the eviction thread sleeps
    uint32_t m_period_in_seconds;
    // condition variable on which client threads wait when sleeping
    // due to an over-subscribed cachetable
    toku_cond_t m_flow_control_cond;

    // variables for engine status
    PARTITIONED_COUNTER m_size_nonleaf;
    PARTITIONED_COUNTER m_size_leaf;
    PARTITIONED_COUNTER m_size_rollback;
    PARTITIONED_COUNTER m_size_cachepressure;
    PARTITIONED_COUNTER m_wait_pressure_count;
    PARTITIONED_COUNTER m_wait_pressure_time;
    PARTITIONED_COUNTER m_long_wait_pressure_count;
    PARTITIONED_COUNTER m_long_wait_pressure_time;

    KIBBUTZ m_kibbutz;

    // this variable is ONLY used for testing purposes
    uint64_t m_num_eviction_thread_runs;

    bool m_ev_thread_init;
    bool m_evictor_init;

    friend class evictor_test_helpers;
    friend class evictor_unit_test;
};

///////////////////////////////////////////////////////////////////////////////
//
// Iterates over the pair list starting at the cleaner head, calling the
// cleaner callback on each node in that list.
//
class cleaner {
public:
    int init(uint32_t cleaner_iterations, pair_list* _pl, CACHETABLE _ct);
    void destroy(void);
    uint32_t get_iterations(void);
    void set_iterations(uint32_t new_iterations);
    uint32_t get_period_unlocked(void);
    void set_period(uint32_t new_period);
    int run_cleaner(void);

private:
    pair_list* m_pl;
    CACHETABLE m_ct;
    struct minicron m_cleaner_cron; // the periodic cleaner thread
    uint32_t m_cleaner_iterations;  // how many times to run the cleaner per
                                    // cleaner period (minicron has a
                                    // minimum period of 1s, so if you want
                                    // more frequent cleaner runs you must
                                    // use this)
    bool m_cleaner_cron_init;
    bool m_cleaner_init;
};

///////////////////////////////////////////////////////////////////////////////
//
// The cachetable is as close to an ENV as we get.
//
struct cachetable {
    pair_list list;
    cleaner cl;
    evictor ev;
    checkpointer cp;
    cachefile_list cf_list;

    KIBBUTZ client_kibbutz;        // pool of worker threads and jobs to do asynchronously for the client
    KIBBUTZ ct_kibbutz;            // pool of worker threads and jobs to do asynchronously for the cachetable
    KIBBUTZ checkpointing_kibbutz; // small pool for checkpointing cloned pairs

    char *env_dir;
};
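
//
// For orientation only, here is a sketch (not the actual cachetable
// creation code) of how a cachetable's members might be brought up against
// the init() signatures declared above; size_limit, logger and
// cleaner_iterations are placeholder values:
//
//     ct->list.init();
//     ct->cf_list.init();
//     ct->ev.init(size_limit, &ct->list, &ct->cf_list, ct->ct_kibbutz, EVICTION_PERIOD);
//     ct->cp.init(&ct->list, logger, &ct->ev, &ct->cf_list);
//     ct->cl.init(cleaner_iterations, &ct->list, ct);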