/*++
Copyright (c) 2006 Microsoft Corporation

Module Name:

    dl_table.h

Abstract:

    Sparse table implementation for the Datalog engine: a table plugin,
    hash-consed entry storage, and a bit-packed sparse table.

Author:

    Krystof Hoder (t-khoder) 2010-09-01.

Revision History:

--*/
19 
20 #pragma once
21 
22 #include<iostream>
23 #include<list>
24 #include<utility>
25 
26 #include "ast/ast.h"
27 #include "util/bit_vector.h"
28 #include "util/buffer.h"
29 #include "util/hashtable.h"
30 #include "util/map.h"
31 #include "util/ref_vector.h"
32 #include "util/vector.h"
33 
34 #include "muz/rel/dl_base.h"
35 
36 
37 namespace datalog {
38     class sparse_table;
39 
    /**
       \brief Table plugin producing \c sparse_table instances.

       Discarded tables are kept in a signature-indexed pool (\c m_pool) and can be
       reused instead of being reallocated (see \c recycle and \c garbage_collect).
    */
    class sparse_table_plugin : public table_plugin {
        friend class sparse_table;
    protected:
        // Functors implementing the relational operations; defined in the .cpp file.
        class join_project_fn;
        class union_fn;
        class transformer_fn;
        class rename_fn;
        class project_fn;
        class negation_filter_fn;
        class select_equal_and_project_fn;
        class negated_join_fn;

        typedef ptr_vector<sparse_table> sp_table_vector;
        typedef map<table_signature, sp_table_vector *,
            table_signature::hash, table_signature::eq > table_pool;

        // Pool of recycled tables, grouped by signature.
        table_pool m_pool;

        // Return table \c t to the pool for later reuse (see \c deallocate in sparse_table).
        void recycle(sparse_table * t);

        // Deallocate all pooled tables.
        void garbage_collect();

        void reset();

        // NOTE(review): implemented in the .cpp; presumably tests whether any of the
        // joined column pairs involves a functional column of s1/s2 -- confirm there.
        static bool join_involves_functional(const table_signature & s1, const table_signature & s2,
            unsigned col_cnt, const unsigned * cols1, const unsigned * cols2);

    public:
        typedef sparse_table table;

        sparse_table_plugin(relation_manager & manager);
        ~sparse_table_plugin() override;

        // Sparse tables can represent any non-empty signature.
        bool can_handle_signature(const table_signature & s) override
        { return !s.empty(); }

        table_base * mk_empty(const table_signature & s) override;
        sparse_table * mk_clone(const sparse_table & t);

    protected:
        // Factories for relational-operation functors; see table_plugin for the contracts.
        table_join_fn * mk_join_fn(const table_base & t1, const table_base & t2,
            unsigned col_cnt, const unsigned * cols1, const unsigned * cols2) override;
        table_join_fn * mk_join_project_fn(const table_base & t1, const table_base & t2,
            unsigned col_cnt, const unsigned * cols1, const unsigned * cols2, unsigned removed_col_cnt,
            const unsigned * removed_cols) override;
        table_union_fn * mk_union_fn(const table_base & tgt, const table_base & src,
            const table_base * delta) override;
        table_transformer_fn * mk_project_fn(const table_base & t, unsigned col_cnt,
            const unsigned * removed_cols) override;
        table_transformer_fn * mk_rename_fn(const table_base & t, unsigned permutation_cycle_len,
            const unsigned * permutation_cycle) override;
        table_transformer_fn * mk_select_equal_and_project_fn(const table_base & t,
            const table_element & value, unsigned col) override;
        table_intersection_filter_fn * mk_filter_by_negation_fn(const table_base & t,
                const table_base & negated_obj, unsigned joined_col_cnt,
                const unsigned * t_cols, const unsigned * negated_cols) override;
        table_intersection_join_filter_fn* mk_filter_by_negated_join_fn(
            const table_base & t,
            const table_base & src1,
            const table_base & src2,
            unsigned_vector const& t_cols,
            unsigned_vector const& src_cols,
            unsigned_vector const& src1_cols,
            unsigned_vector const& src2_cols) override;

        // Casts between table_base and sparse_table (defined in the .cpp).
        static sparse_table const& get(table_base const&);
        static sparse_table& get(table_base&);
        static sparse_table const* get(table_base const*);
        static sparse_table* get(table_base*);

    };
111 
112     class entry_storage {
113     public:
114         typedef size_t store_offset;
115     private:
116         typedef svector<char, size_t> storage;
117 
118         class offset_hash_proc {
119             storage & m_storage;
120             unsigned m_unique_entry_size;
121         public:
offset_hash_proc(storage & s,unsigned unique_entry_sz)122             offset_hash_proc(storage & s, unsigned unique_entry_sz)
123                 : m_storage(s), m_unique_entry_size(unique_entry_sz) {}
operator()124             unsigned operator()(store_offset ofs) const {
125                 return string_hash(m_storage.c_ptr()+ofs, m_unique_entry_size, 0);
126             }
127         };
128 
129         class offset_eq_proc {
130             storage & m_storage;
131             unsigned m_unique_entry_size;
132         public:
offset_eq_proc(storage & s,unsigned unique_entry_sz)133             offset_eq_proc(storage & s, unsigned unique_entry_sz)
134                 : m_storage(s), m_unique_entry_size(unique_entry_sz) {}
operator()135             bool operator()(store_offset o1, store_offset o2) const {
136                 const char * base = m_storage.c_ptr();
137                 return memcmp(base+o1, base+o2, m_unique_entry_size)==0;
138             }
139         };
140 
141         typedef hashtable<store_offset, offset_hash_proc, offset_eq_proc> storage_indexer;
142 
143         static const store_offset NO_RESERVE = UINT_MAX;
144 
145         unsigned m_entry_size;
146         unsigned m_unique_part_size;
147         size_t m_data_size;
148         /**
149            Invariant: Every or all but one blocks of length \c m_entry_size in the \c m_data vector
150            are unique sequences of bytes and have their offset stored in the \c m_data_indexer hashtable.
151            If the offset of the last block is not stored in the hashtable, it is stored in the \c m_reserve
152            variable. Otherwise \c m_reserve==NO_RESERVE.
153 
154            The size of m_data is actually 8 bytes larger than stated in m_data_size, so that we may
155            deref an uint64_t pointer at the end of the array.
156         */
157         storage m_data;
158         storage_indexer m_data_indexer;
159         store_offset m_reserve;
160     public:
161         entry_storage(unsigned entry_size, unsigned functional_size = 0, unsigned init_size = 0)
m_entry_size(entry_size)162             :   m_entry_size(entry_size),
163                 m_unique_part_size(entry_size-functional_size),
164                 m_data_indexer(next_power_of_two(std::max(8u,init_size)),
165                     offset_hash_proc(m_data, m_unique_part_size), offset_eq_proc(m_data, m_unique_part_size)),
166                 m_reserve(NO_RESERVE) {
167             SASSERT(entry_size>0);
168             SASSERT(functional_size<=entry_size);
169             resize_data(init_size);
170             resize_data(0);
171         }
entry_storage(const entry_storage & s)172         entry_storage(const entry_storage &s)
173             :   m_entry_size(s.m_entry_size),
174                 m_unique_part_size(s.m_unique_part_size),
175                 m_data_size(s.m_data_size),
176                 m_data(s.m_data),
177                 m_data_indexer(next_power_of_two(std::max(8u,s.entry_count())),
178                     offset_hash_proc(m_data, m_unique_part_size), offset_eq_proc(m_data, m_unique_part_size)),
179                 m_reserve(s.m_reserve) {
180             store_offset after_last=after_last_offset();
181             for(store_offset i=0; i<after_last; i+=m_entry_size) {
182                 m_data_indexer.insert(i);
183             }
184         }
185 
186         entry_storage & operator=(const entry_storage & o) {
187             m_data_indexer.reset();
188             m_entry_size = o.m_entry_size;
189             m_unique_part_size = o.m_unique_part_size;
190             m_data_size = o.m_data_size;
191             m_data = o.m_data;
192             m_reserve = o.m_reserve;
193             store_offset after_last=after_last_offset();
194             for(store_offset i=0; i<after_last; i+=m_entry_size) {
195                 m_data_indexer.insert(i);
196             }
197             return *this;
198         }
199 
reset()200         void reset() {
201             resize_data(0);
202             m_data_indexer.reset();
203             m_reserve = NO_RESERVE;
204         }
205 
entry_size()206         unsigned entry_size() const { return m_entry_size; }
207         unsigned get_size_estimate_bytes() const;
get(store_offset ofs)208         char * get(store_offset ofs) { return m_data.begin()+ofs; }
get(store_offset ofs)209         const char * get(store_offset ofs) const
210         { return const_cast<entry_storage *>(this)->get(ofs); }
211 
entry_count()212         unsigned entry_count() const { return m_data_indexer.size(); }
213 
after_last_offset()214         store_offset after_last_offset() const {
215             return (m_reserve==NO_RESERVE) ? m_data_size : m_reserve;
216         }
217 
begin()218         char * begin() { return get(0); }
begin()219         const char * begin() const { return get(0); }
after_last()220         const char * after_last() const { return get(after_last_offset()); }
221 
222 
has_reserve()223         bool has_reserve() const { return m_reserve!=NO_RESERVE; }
reserve()224         store_offset reserve() const { SASSERT(has_reserve()); return m_reserve; }
225 
ensure_reserve()226         void ensure_reserve() {
227             if(has_reserve()) {
228                 SASSERT(m_reserve==m_data_size-m_entry_size);
229                 return;
230             }
231             m_reserve = m_data_size;
232             resize_data(m_data_size+m_entry_size);
233         }
234 
235         /**
236            \brief Return pointer to the reserve.
237 
238            The reserve must exist when the function is called.
239         */
get_reserve_ptr()240         char * get_reserve_ptr() {
241             SASSERT(has_reserve());
242             return &m_data.get(reserve());
243         }
244 
reserve_content_already_present()245         bool reserve_content_already_present() const {
246             SASSERT(has_reserve());
247             return m_data_indexer.contains(reserve());
248         }
249 
find_reserve_content(store_offset & result)250         bool find_reserve_content(store_offset & result) const {
251             SASSERT(has_reserve());
252             storage_indexer::entry * indexer_entry = m_data_indexer.find_core(reserve());
253             if(!indexer_entry) {
254                 return false;
255             }
256             result = indexer_entry->get_data();
257             return true;
258         }
259 
260         /**
261            \brief Write fact \c f into the reserve at the end of the \c m_data storage.
262 
263            If the reserve does not exist, this function creates it.
264         */
write_into_reserve(const char * data)265         void write_into_reserve(const char * data) {
266             ensure_reserve();
267             memcpy(get_reserve_ptr(), data, m_entry_size);
268         }
269 
270         /**
271            \brief If the fact in reserve is not in the table, insert it there and return true;
272            otherwise return false.
273 
274            When a fact is inserted into the table, the reserve becomes part of the table and
275            is no longer a reserve.
276         */
277         bool insert_reserve_content();
278         store_offset insert_or_get_reserve_content();
279         bool remove_reserve_content();
280         /**
281            Remove data at the offset \c ofs.
282 
283            Data with offset lower than \c ofs are not be modified by this function, data with
284            higher offset may be moved.
285         */
286         void remove_offset(store_offset ofs);
287 
288 
289         //the following two operations allow breaking of the object invariant!
resize_data(size_t sz)290         void resize_data(size_t sz) {
291             m_data_size = sz;
292             if (sz + sizeof(uint64_t) < sz) {
293                 throw default_exception("overflow resizing data section for sparse table");
294             }
295             m_data.resize(sz + sizeof(uint64_t));
296         }
297 
insert_offset(store_offset ofs)298         bool insert_offset(store_offset ofs) {
299             return m_data_indexer.insert_if_not_there(ofs)==ofs;
300         }
301     };
302 
    /**
       \brief Table that stores each fact as a bit-packed row inside an
       \c entry_storage and answers join queries through lazily built key indexes.
    */
    class sparse_table : public table_base {
        friend class sparse_table_plugin;
        friend class sparse_table_plugin::join_project_fn;
        friend class sparse_table_plugin::union_fn;
        friend class sparse_table_plugin::transformer_fn;
        friend class sparse_table_plugin::rename_fn;
        friend class sparse_table_plugin::project_fn;
        friend class sparse_table_plugin::negation_filter_fn;
        friend class sparse_table_plugin::select_equal_and_project_fn;

        class our_iterator_core;
        class key_indexer;
        class general_key_indexer;
        class full_signature_key_indexer;
        typedef entry_storage::store_offset store_offset;


        /**
           \brief Access to one bit-packed column inside a row.

           Precomputes byte offset, in-byte shift and read/write masks so that
           \c get and \c set are branch-free.
        */
        class column_info {
            unsigned m_big_offset;   //!< column offset in whole bytes
            unsigned m_small_offset; //!< remaining offset in bits (0..7)
            uint64_t m_mask;         //!< m_length low bits set
            uint64_t m_write_mask;   //!< clears the column's bits within the containing uint64_t
        public:
            unsigned m_offset; //!< in bits
            unsigned m_length; //!< in bits

            column_info(unsigned offset, unsigned length)
                : m_big_offset(offset / 8),
                  m_small_offset(offset % 8),
                  m_mask( length == 64 ? ULLONG_MAX : (static_cast<uint64_t>(1)<<length)-1 ),
                  m_write_mask( ~(m_mask << m_small_offset) ),
                  m_offset(offset),
                  m_length(length) {
                SASSERT(length <= 64);
                SASSERT(length + m_small_offset <= 64);
            }
            table_element get(const char * rec) const {
                // Reads a full uint64_t at the column's byte offset; safe because
                // entry_storage keeps 8 bytes of slack past the end of its data.
                uint64_t res;
                memcpy(&res, rec + m_big_offset, sizeof(res));
                res >>= m_small_offset;
                res &= m_mask;
                return res;
            }
            void set(char * rec, table_element val) const {
                SASSERT( (val&~m_mask)==0 ); //the value fits into the column
                uint64_t cell;
                memcpy(&cell, rec + m_big_offset, sizeof(cell));
                cell &= m_write_mask;          // clear the column's previous bits
                cell |= val << m_small_offset; // splice in the new value
                memcpy(rec + m_big_offset, &cell, sizeof(cell));
            }
            // First bit after this column; used when laying out the next column.
            unsigned next_ofs() const { return m_offset+m_length; }
        };
        /**
           \brief Bit layout of one row: one \c column_info per signature column.
        */
        class column_layout : public svector<column_info> {

            void make_byte_aligned_end(unsigned col_index);
        public:

            unsigned m_entry_size; //!< row size in bytes
            /**
                Number of last bytes which correspond to functional columns in the signature.
            */
            unsigned m_functional_part_size;
            unsigned m_functional_col_cnt;

            column_layout(const table_signature & sig);

            table_element get(const char * rec, unsigned col) const {
                return (*this)[col].get(rec);
            }
            void set(char * rec, unsigned col, table_element val) const {
                return (*this)[col].set(rec, val);
            }
        };


        typedef svector<unsigned> key_spec;        //sequence of columns in a key
        typedef svector<table_element> key_value;  //values of key columns
        typedef map<key_spec, key_indexer*, svector_hash_proc<unsigned_hash>,
            vector_eq_proc<key_spec> > key_index_map;

        // NOTE(review): appears unused in this header; entry_storage declares its own
        // NO_RESERVE -- confirm against the .cpp before removing.
        static const store_offset NO_RESERVE = UINT_MAX;

        column_layout m_column_layout;
        unsigned m_fact_size; // row size in bytes -- presumably equals m_column_layout.m_entry_size; confirm in the .cpp
        entry_storage m_data; // hash-consed bit-packed row storage
        mutable key_index_map m_key_indexes; // lazily built indexes, keyed by column sequence


        const char * get_at_offset(store_offset i) const {
            return m_data.get(i);
        }

        table_element get_cell(store_offset ofs, unsigned column) const {
            return m_column_layout.get(m_data.get(ofs), column);
        }

        void set_cell(store_offset ofs, unsigned column, table_element val) {
            m_column_layout.set(m_data.get(ofs), column, val);
        }

        // Stage fact \c f (bit-packed via m_column_layout) in m_data's reserve.
        void write_into_reserve(const table_element* f);

        /**
           \brief Return reference to an indexer over columns in \c key_cols.

           An indexer can retrieve a sequence of offsets that with \c key_cols columns equal to
           the specified key. Indexers are populated lazily -- they remember the position of the
           last fact they contain, and when an indexer is retrieved by the \c get_key_indexer function,
           all the new facts are added into the indexer.

           When a fact is removed from the table, all indexers are destroyed. This is not an extra
           expense in the current use scenario, because we first perform all fact removals and do the
           joins only after that (joins are the only operations that lead to index construction).
        */
        key_indexer& get_key_indexer(unsigned key_len, const unsigned * key_cols) const;

        // Destroy all key indexers (see the removal note above).
        void reset_indexes();

        static void copy_columns(const column_layout & src_layout, const column_layout & dest_layout,
            unsigned start_index, unsigned after_last, const char * src, char * dest,
            unsigned & dest_idx, unsigned & pre_projection_idx, const unsigned * & next_removed);

        /**
            \c array \c removed_cols contains column indexes to be removed in ascending order and
            is terminated by a number greater than the highest column index of a join the two tables.
            This is to simplify the traversal of the array when building facts.
         */
        static void concatenate_rows(const column_layout & layout1, const column_layout & layout2,
            const column_layout & layout_res, const char * ptr1, const char * ptr2, char * res,
            const unsigned * removed_cols);

        /**
           \brief Perform join-project between t1 and t2 iterating through t1 and retrieving relevant
           columns from t2 using indexing.

           \c array \c removed_cols contains column indexes to be removed in ascending order and
           is terminated by a number greater than the highest column index of a join the two tables.
           This is to simplify the traversal of the array when building facts.

           \c tables_swapped value means that the resulting facts should contain facts from t2 first,
           instead of the default behavior that would concatenate the two facts as \c (t1,t2).

           \remark The function is called \c self_agnostic_join since, unlike the virtual method
           \c join, it is static and therefore allows to easily swap the roles of the two joined
           tables (the indexed and iterated one) in a way that is expected to give better performance.
        */
        static void self_agnostic_join_project(const sparse_table & t1, const sparse_table & t2,
            unsigned joined_col_cnt, const unsigned * t1_joined_cols, const unsigned * t2_joined_cols,
            const unsigned * removed_cols, bool tables_swapped, sparse_table & result);


        /**
           If the fact at \c data (in table's native representation) is not in the table,
           add it and return true. Otherwise return false.
        */
        bool add_fact(const char * data);

        bool add_reserve_content();

        void garbage_collect();

        // Construction is restricted to sparse_table_plugin (a friend); tables are
        // obtained through the plugin and returned to it via \c deallocate.
        sparse_table(sparse_table_plugin & p, const table_signature & sig, unsigned init_capacity=0);
        sparse_table(const sparse_table & t);
        ~sparse_table() override;
    public:

        // Returns this table to the plugin's pool instead of destroying it.
        void deallocate() override {
            get_plugin().recycle(this);
        }

        unsigned row_count() const { return m_data.entry_count(); }

        sparse_table_plugin & get_plugin() const
        { return static_cast<sparse_table_plugin &>(table_base::get_plugin()); }

        bool empty() const override { return row_count()==0; }
        void add_fact(const table_fact & f) override;
        bool contains_fact(const table_fact & f) const override;
        bool fetch_fact(table_fact & f) const override;
        void ensure_fact(const table_fact & f) override;
        void remove_fact(const table_element* fact) override;
        void reset() override;

        table_base * clone() const override;

        table_base::iterator begin() const override;
        table_base::iterator end() const override;

        unsigned get_size_estimate_rows() const override { return row_count(); }
        unsigned get_size_estimate_bytes() const override;
        // The row count is exact (backed by the entry hashtable), not an estimate.
        bool knows_exact_size() const override { return true; }
    };
497 
498  };
499 
500