1 #ifndef BMSTRSPARSEVEC__H__INCLUDED__
2 #define BMSTRSPARSEVEC__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10     http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit:  http://bitmagic.io
19 */
20 
21 /*! \file bmstrsparsevec.h
22     \brief string sparse vector based on bit-transposed matrix
23 */
24 
25 #include <stddef.h>
26 #include "bmconst.h"
27 
28 #ifndef BM_NO_STL
29 #include <stdexcept>
30 #endif
31 
32 #ifndef BM__H__INCLUDED__
33 // BitMagic utility headers do not include main "bm.h" declaration
34 // #include "bm.h" or "bm64.h" explicitly
35 # error missing include (bm.h or bm64.h)
36 #endif
37 
38 #include "bmtrans.h"
39 #include "bmalgo.h"
40 #include "bmbuffer.h"
41 #include "bmbmatrix.h"
42 #include "bmdef.h"
43 
44 namespace bm
45 {
46 
47 /*!
48    \brief succinct sparse vector for strings with compression using bit transposition method
49 
50    Initial string is bit-transposed into bit-planes so collection may use less
51    memory due to prefix sum (GAP) compression in bit-planes.
52 
53    @ingroup sv
54 */
55 template<typename CharType, typename BV, unsigned MAX_STR_SIZE>
56 class str_sparse_vector : public base_sparse_vector<CharType, BV, MAX_STR_SIZE>
57 {
58 public:
59     typedef BV                                       bvector_type;
60     typedef bvector_type*                            bvector_type_ptr;
61     typedef const bvector_type*                      bvector_type_const_ptr;
62     typedef CharType                                 value_type;
63     typedef CharType*                                value_type_prt;
64     typedef typename bvector_type::size_type         size_type;
65     typedef typename BV::allocator_type              allocator_type;
66     typedef typename bvector_type::allocation_policy allocation_policy_type;
67     typedef typename bvector_type::enumerator        bvector_enumerator_type;
68     typedef typename allocator_type::allocator_pool_type allocator_pool_type;
69     typedef bm::basic_bmatrix<BV>                    bmatrix_type;
70     typedef base_sparse_vector<CharType, BV, MAX_STR_SIZE> parent_type;
71 
72     /*! Statistical information about  memory allocation details. */
73     struct statistics : public bv_statistics
74     {};
75 
76     enum octet_planes
77     {
78         sv_octet_planes = MAX_STR_SIZE
79     };
80 
81     /**
82         Matrix of character remappings
83         @internal
84      */
85     typedef
86     bm::heap_matrix<unsigned char,
87                     MAX_STR_SIZE, /* ROWS */
88                     256,          /* COLS = number of chars in the ASCII set */
89                     typename bvector_type::allocator_type>
90                                                     plane_octet_matrix_type;
91     typedef plane_octet_matrix_type remap_matrix_type;
92     /**
93         Matrix of character frequencies (for optimal code remap)
94         @internal
95      */
96     typedef
97     bm::heap_matrix<size_t,
98                     MAX_STR_SIZE, /* ROWS */
99                     256,          /* COLS = number of chars in the ASCII set */
100                     typename bvector_type::allocator_type>
101                                                     octet_freq_matrix_type;
102 
103     struct is_remap_support { enum trait { value = true }; };
104     struct is_rsc_support { enum trait { value = false }; };
105 
106     /**
107          Reference class to access elements via common [] operator
108          @ingroup sv
109     */
110     class const_reference
111     {
112     public:
const_reference(const str_sparse_vector<CharType,BV,MAX_STR_SIZE> & str_sv,size_type idx)113         const_reference(
114                 const str_sparse_vector<CharType, BV, MAX_STR_SIZE>& str_sv,
115                 size_type idx) BMNOEXCEPT
116         : str_sv_(str_sv), idx_(idx)
117         {}
118 
119         operator const value_type*() const BMNOEXCEPT
120         {
121             return get();
122         }
123 
get()124         const value_type* get() const BMNOEXCEPT
125         {
126             str_sv_.get(idx_, buf_, MAX_STR_SIZE);
127             return &(buf_[0]);
128         }
129 
130         bool operator==(const const_reference& ref) const BMNOEXCEPT
131                                 { return bool(*this) == bool(ref); }
is_null()132         bool is_null() const BMNOEXCEPT { return str_sv_.is_null(idx_); }
133     private:
134         const str_sparse_vector<CharType, BV, MAX_STR_SIZE>& str_sv_;
135         size_type                                            idx_;
136         mutable CharType                                    buf_[MAX_STR_SIZE];
137     };
138 
139     /**
140          Reference class to access elements via common [] operator
141          @ingroup sv
142     */
143     class reference
144     {
145     public:
reference(str_sparse_vector<CharType,BV,MAX_STR_SIZE> & str_sv,size_type idx)146         reference(str_sparse_vector<CharType, BV, MAX_STR_SIZE>& str_sv,
147                   size_type idx) BMNOEXCEPT
148         : str_sv_(str_sv), idx_(idx)
149         {}
150 
151         operator const value_type*() const BMNOEXCEPT
152         {
153             return get();
154         }
155 
get()156         const value_type* get() const BMNOEXCEPT
157         {
158             str_sv_.get(idx_, buf_, MAX_STR_SIZE);
159             return &(buf_[0]);
160         }
161 
162         reference& operator=(const reference& ref)
163         {
164             // TO DO: implement element copy bit by bit
165             str_sv_.set(idx_, (const value_type*)ref);
166             return *this;
167         }
168 
169         reference& operator=(const value_type* str)
170         {
171             str_sv_.set(idx_, str);
172             return *this;
173         }
174         bool operator==(const reference& ref) const BMNOEXCEPT
175                                 { return bool(*this) == bool(ref); }
is_null()176         bool is_null() const BMNOEXCEPT { return str_sv_.is_null(idx_); }
177     private:
178         str_sparse_vector<CharType, BV, MAX_STR_SIZE>& str_sv_;
179         size_type                                      idx_;
180         mutable CharType                               buf_[MAX_STR_SIZE];
181     };
182 
183     /**
184         Const iterator to do quick traverse of the sparse vector.
185 
186         Implementation uses buffer for decoding so, competing changes
187         to the original vector may not match the iterator returned values.
188 
189         This iterator keeps an operational buffer of transposed elements,
190         so memory footprint is not negligable.
191 
192         @ingroup sv
193     */
194     class const_iterator
195     {
196     public:
197     friend class str_sparse_vector;
198 #ifndef BM_NO_STL
199         typedef std::input_iterator_tag  iterator_category;
200 #endif
201         typedef str_sparse_vector<CharType, BV, MAX_STR_SIZE> str_sparse_vector_type;
202         typedef str_sparse_vector_type*                       str_sparse_vector_type_ptr;
203         typedef typename str_sparse_vector_type::value_type   value_type;
204         typedef typename str_sparse_vector_type::size_type    size_type;
205         typedef typename str_sparse_vector_type::bvector_type bvector_type;
206         typedef typename bvector_type::allocator_type         allocator_type;
207         typedef typename allocator_type::allocator_pool_type  allocator_pool_type;
208 
209         typedef long long                   difference_type;
210         typedef CharType*                   pointer;
211         typedef CharType*&                  reference;
212     public:
213         /**
214             Construct iterator (not attached to any particular vector)
215          */
216         const_iterator() BMNOEXCEPT;
217         /**
218             Construct iterator (attached to sparse vector)
219             @param sv - pointer to sparse vector
220          */
221         const_iterator(const str_sparse_vector_type* sv) BMNOEXCEPT;
222         /**
223             Construct iterator (attached to sparse vector) and positioned
224             @param sv - reference to sparse vector
225             @param pos - position in the vector to start
226          */
227         const_iterator(const str_sparse_vector_type* sv, size_type pos) BMNOEXCEPT;
228 
229         const_iterator(const const_iterator& it) BMNOEXCEPT;
230 
231         /**
232             setup iterator to retrieve a sub-string of a string
233             @param from - Position of the first character to be copied
234             @param len - length of a substring
235          */
236         void set_substr(unsigned from, unsigned len = MAX_STR_SIZE) BMNOEXCEPT;
237 
238 
239         bool operator==(const const_iterator& it) const BMNOEXCEPT
240                                 { return (pos_ == it.pos_) && (sv_ == it.sv_); }
241         bool operator!=(const const_iterator& it) const BMNOEXCEPT
242                                 { return ! operator==(it); }
243         bool operator < (const const_iterator& it) const BMNOEXCEPT
244                                 { return pos_ < it.pos_; }
245         bool operator <= (const const_iterator& it) const BMNOEXCEPT
246                                 { return pos_ <= it.pos_; }
247         bool operator > (const const_iterator& it) const BMNOEXCEPT
248                                 { return pos_ > it.pos_; }
249         bool operator >= (const const_iterator& it) const BMNOEXCEPT
250                                 { return pos_ >= it.pos_; }
251 
252         /// \brief Get current position (value)
253         const value_type* operator*() const BMNOEXCEPT { return this->value(); }
254 
255         /// \brief Advance to the next available value
256         const_iterator& operator++() BMNOEXCEPT
257             { this->advance(); return *this; }
258 
259         /// \brief Advance to the next available value
260         const_iterator& operator++(int) BMNOEXCEPT
261             { const_iterator tmp(*this);this->advance(); return tmp; }
262 
263 
264         /// \brief Get current position (value)
265         const value_type* value() const BMNOEXCEPT;
266 
267         /// \brief Get NULL status
is_null()268         bool is_null() const BMNOEXCEPT { return sv_->is_null(this->pos_); }
269 
270         /// Returns true if iterator is at a valid position
valid()271         bool valid() const BMNOEXCEPT { return pos_ != bm::id_max; }
272 
273         /// Invalidate current iterator
invalidate()274         void invalidate() BMNOEXCEPT { pos_ = bm::id_max; }
275 
276         /// Current position (index) in the vector
pos()277         size_type pos() const BMNOEXCEPT { return pos_; }
278 
279         /// re-position to a specified position
280         void go_to(size_type pos) BMNOEXCEPT;
281 
282         /// advance iterator forward by one
283         void advance() BMNOEXCEPT;
284 
285     protected:
286         enum buf_size_e
287         {
288             n_rows = 1024
289         };
290         typedef dynamic_heap_matrix<CharType, allocator_type> buffer_matrix_type;
291 
292     private:
293         const str_sparse_vector_type*     sv_;      ///!< ptr to parent
294         unsigned                          substr_from_; ///!< substring from
295         unsigned                          substr_to_;   ///!< substring to
296         mutable size_type                 pos_;         ///!< Position
297         mutable buffer_matrix_type        buf_matrix_;  ///!< decode value buffer
298         mutable size_type                 pos_in_buf_;  ///!< buffer position
299     };
300 
301 
302     /**
303         Back insert iterator implements buffered insert, faster than generic
304         access assignment.
305 
306         Limitations for buffered inserter:
307         1. Do not use more than one inserter (into one vector) at the same time
308         2. Use method flush() at the end to send the rest of accumulated buffer
309         flush is happening automatically on destruction, but if flush produces an
310         exception (for whatever reason) it will be an exception in destructor.
311         As such, explicit flush() is safer way to finilize the sparse vector load.
312 
313         @ingroup sv
314     */
315     class back_insert_iterator
316     {
317     public:
318 #ifndef BM_NO_STL
319         typedef std::output_iterator_tag  iterator_category;
320 #endif
321         typedef str_sparse_vector<CharType, BV, MAX_STR_SIZE>  str_sparse_vector_type;
322         typedef str_sparse_vector_type*                        str_sparse_vector_type_ptr;
323         typedef typename str_sparse_vector_type::value_type    value_type;
324         typedef typename str_sparse_vector_type::size_type     size_type;
325         typedef typename str_sparse_vector_type::bvector_type  bvector_type;
326         typedef typename bvector_type::allocator_type          allocator_type;
327         typedef typename allocator_type::allocator_pool_type   allocator_pool_type;
328 
329         typedef void difference_type;
330         typedef void pointer;
331         typedef void reference;
332 
333     public:
334         back_insert_iterator() BMNOEXCEPT;
335         back_insert_iterator(str_sparse_vector_type* sv) BMNOEXCEPT;
336         back_insert_iterator(const back_insert_iterator& bi) BMNOEXCEPT;
337 
338         back_insert_iterator& operator=(const back_insert_iterator& bi)
339         {
340             BM_ASSERT(bi.empty());
341             this->flush(); sv_ = bi.sv_;
342             return *this;
343         }
344 
345         ~back_insert_iterator();
346 
347         /** push value to the vector */
348         back_insert_iterator& operator=(const value_type* v)
349             { this->add(v); return *this; }
350 
351 
352         /** push value to the vector */
353         template<typename StrType>
354         back_insert_iterator& operator=(const StrType& v)
355         {
356             this->add(v.c_str()); return *this; // TODO: avoid c_str()
357         }
358 
359         /** noop */
360         back_insert_iterator& operator*() { return *this; }
361         /** noop */
362         back_insert_iterator& operator++() { return *this; }
363         /** noop */
364         back_insert_iterator& operator++( int ) { return *this; }
365 
366         /** add value to the container*/
367         void add(const value_type* v);
368 
369         /** add NULL (no-value) to the container */
370         void add_null();
371 
372         /** add a series of consequitve NULLs (no-value) to the container */
373         void add_null(size_type count);
374 
375         /** flush the accumulated buffer */
376         void flush();
377     protected:
378         /** return true if insertion buffer is empty */
379         bool empty() const BMNOEXCEPT;
380 
381         typedef typename bvector_type::block_idx_type     block_idx_type;
382 
383         /** add value to the buffer without changing the NULL vector
384             @param v - value to push back
385             @internal
386         */
387         void add_value(const value_type* v);
388 
389     private:
390         enum buf_size_e
391         {
392             n_buf_size = 1024 * 8
393         };
394 
395         typedef bm::heap_matrix<CharType,
396                         n_buf_size,   // ROWS: number of strings in one batch
397                         MAX_STR_SIZE, // COLS
398                         allocator_type> buffer_matrix_type;
399 
400     private:
401         str_sparse_vector_type*  sv_;         ///!< pointer on the parent vector
402         bvector_type*            bv_null_;    ///!< not NULL vector pointer
403         buffer_matrix_type       buf_matrix_; ///!< value buffer
404         size_type                pos_in_buf_; ///!< buffer position
405         block_idx_type           prev_nb_;    ///!< previous block added
406     };
407 
408 
409 public:
410 
411     /*!
412         \brief Sparse vector constructor
413 
414         \param null_able - defines if vector supports NULL values flag
415             by default it is OFF, use bm::use_null to enable it
416         \param ap - allocation strategy for underlying bit-vectors
417         Default allocation policy uses BM_BIT setting (fastest access)
418         \param bv_max_size - maximum possible size of underlying bit-vectors
419         Please note, this is NOT size of svector itself, it is dynamic upper limit
420         which should be used very carefully if we surely know the ultimate size
421         \param alloc - allocator for bit-vectors
422 
423         \sa bvector<>
424         \sa bm::bvector<>::allocation_policy
425         \sa bm::startegy
426     */
427     str_sparse_vector(bm::null_support null_able = bm::no_null,
428                       allocation_policy_type ap = allocation_policy_type(),
429                       size_type bv_max_size = bm::id_max,
430                       const allocator_type&   alloc  = allocator_type());
431 
432     /*! copy-ctor */
433     str_sparse_vector(const str_sparse_vector& str_sv);
434 
435     /*! copy assignmment operator */
436     str_sparse_vector<CharType, BV, MAX_STR_SIZE>& operator = (
437                 const str_sparse_vector<CharType, BV, MAX_STR_SIZE>& str_sv)
438     {
439         if (this != &str_sv)
440             parent_type::copy_from(str_sv);
441         remap_flags_ = str_sv.remap_flags_;
442         remap_matrix1_ = str_sv.remap_matrix1_;
443         remap_matrix2_ = str_sv.remap_matrix2_;
444         return *this;
445     }
446 #ifndef BM_NO_CXX11
447     /*! move-ctor */
str_sparse_vector(str_sparse_vector<CharType,BV,MAX_STR_SIZE> && str_sv)448     str_sparse_vector(str_sparse_vector<CharType, BV, MAX_STR_SIZE>&& str_sv) BMNOEXCEPT
449     {
450         parent_type::swap(str_sv);
451         remap_flags_ = str_sv.remap_flags_;
452         remap_matrix1_.swap(str_sv.remap_matrix1_);
453         remap_matrix2_.swap(str_sv.remap_matrix2_);
454     }
455 
456     /*! move assignmment operator */
457     str_sparse_vector<CharType, BV, MAX_STR_SIZE>& operator =
458             (str_sparse_vector<CharType, BV, MAX_STR_SIZE>&& str_sv) BMNOEXCEPT
459     {
460         if (this != &str_sv)
461         {
462             this->swap(str_sv);
463         }
464         return *this;
465     }
466 #endif
467 
468 public:
469 
470     // ------------------------------------------------------------
471     /*! @name String element access */
472     ///@{
473 
474     /** \brief Operator to get read access to an element  */
475     const const_reference operator[](size_type idx) const
476                                 { return const_reference(*this, idx); }
477 
478     /** \brief Operator to get write access to an element  */
479     reference operator[](size_type idx) { return reference(*this, idx); }
480 
481     /*!
482         \brief set specified element with bounds checking and automatic resize
483         \param idx  - element index (vector auto-resized if needs to)
484         \param str  - string to set (zero terminated)
485     */
486     void set(size_type idx, const value_type* str);
487 
488     /*!
489         \brief set NULL status for the specified element
490         Vector is resized automatically
491         \param idx  - element index (vector auto-resized if needs to)
492     */
493     void set_null(size_type idx);
494 
495 
496     /*!
497         \brief insert the specified element
498         \param idx  - element index (vector auto-resized if needs to)
499         \param str  - string to set (zero terminated)
500     */
501     void insert(size_type idx, const value_type* str);
502 
503 
504     /*!
505         \brief insert STL string
506         \param idx  - element index (vector auto-resized if needs to)
507         \param str  - STL string to set
508     */
509     template<typename StrType>
insert(size_type idx,const StrType & str)510     void insert(size_type idx, const StrType& str)
511     {
512         this->insert(idx, str.c_str()); // TODO: avoid c_str()
513     }
514 
515     /*!
516         \brief erase the specified element
517         \param idx  - element index
518     */
519     void erase(size_type idx);
520 
521     /*!
522         \brief get specified element
523 
524         \param idx  - element index
525         \param str  - string buffer
526         \param buf_size - string buffer size
527 
528         @return string length
529     */
530     size_type get(size_type idx,
531                  value_type* str, size_type buf_size) const BMNOEXCEPT;
532 
533     /*!
534         \brief set specified element with bounds checking and automatic resize
535 
536         This is an equivalent of set() method, but templetized to be
537         more compatible with the STL std::string and the likes
538 
539         \param idx  - element index (vector auto-resized if needs to)
540         \param str  - input string
541                       expected an STL class with size() support,
542                       like basic_string<> or vector<char>
543     */
544     template<typename StrType>
assign(size_type idx,const StrType & str)545     void assign(size_type idx, const StrType& str)
546     {
547         if (idx >= this->size())
548             this->size_ = idx+1;
549 
550         size_type str_size = size_type(str.size());
551         size_type sz = size_type((str_size < MAX_STR_SIZE) ? str_size : MAX_STR_SIZE-1);
552         if (!sz)
553         {
554             this->clear_value_planes_from(0, idx);
555             return;
556         }
557         unsigned i = 0;
558         for (; i < sz; ++i)
559         {
560             CharType ch = str[i];
561             if (remap_flags_) // compressional re-mapping is in effect
562             {
563                 unsigned char remap_value = remap_matrix2_.get(i, unsigned(ch));
564                 BM_ASSERT(remap_value);
565                 ch = CharType(remap_value);
566             }
567             this->bmatr_.set_octet(idx, i, (unsigned char)ch);
568             if (!ch)
569                 break;
570         } // for i
571         if (idx > sz)
572             return;
573         this->bmatr_.set_octet(idx, sz, 0);
574         this->clear_value_planes_from(unsigned(sz*8+1), idx);
575     }
576 
577     /*!
578         \brief push back a string
579         \param str  - string to set
580                     (STL class with size() support, like basic_string)
581     */
582     template<typename StrType>
push_back(const StrType & str)583     void push_back(const StrType& str) { assign(this->size_, str); }
584 
585     /*!
586         \brief push back a string (zero terminated)
587         \param str  - string to set
588     */
push_back(const value_type * str)589     void push_back(const value_type* str) { set(this->size_, str); }
590 
591 
592     /*!
593         \brief get specified string element
594         Template method expects an STL-compatible type basic_string<>
595         \param idx  - element index (vector auto-resized if needs to)
596         \param str  - string to get [out]
597     */
598     template<typename StrType>
get(size_type idx,StrType & str)599     void get(size_type idx, StrType& str) const
600     {
601         str.clear();
602         for (unsigned i = 0; i < MAX_STR_SIZE; ++i)
603         {
604             CharType ch = CharType(this->bmatr_.get_octet(idx, i));
605             if (ch == 0)
606                 break;
607             if (remap_flags_)
608             {
609                 const unsigned char* remap_row = remap_matrix1_.row(i);
610                 unsigned char remap_value = remap_row[unsigned(ch)];
611                 BM_ASSERT(remap_value);
612                 if (!remap_value) // unknown dictionary element
613                 {
614                     throw_bad_value(0);
615                     break;
616                 }
617                 ch = CharType(remap_value);
618             }
619             str.push_back(ch);
620         } // for i
621     }
622 
623     /*! Swap content */
624     void swap(str_sparse_vector& str_sv) BMNOEXCEPT;
625 
626     ///@}
627 
628     // ------------------------------------------------------------
629     /*! @name Element comparison functions       */
630     ///@{
631 
632     /**
633         \brief Compare vector element with argument lexicographically
634 
635         NOTE: for a re-mapped container, input string may have no correct
636         remapping, in this case we have an ambiguity
637         (we know it is not equal (0) but LT or GT?).
638         Behavior is undefined.
639 
640         \param idx - vactor element index
641         \param str - argument to compare with
642 
643         \return 0 - equal, < 0 - vect[i] < str, >0 otherwise
644     */
645     int compare(size_type idx, const value_type* str) const BMNOEXCEPT;
646 
647 
648     /**
649         \brief Find size of common prefix between two vector elements in octets
650         \return size of common prefix
651     */
652     unsigned common_prefix_length(size_type idx1, size_type idx2) const BMNOEXCEPT;
653 
654     ///@}
655 
656 
657     // ------------------------------------------------------------
658     /*! @name Clear                                              */
659     ///@{
660 
661     /*! \brief resize to zero, free memory */
662     void clear_all(bool free_mem) BMNOEXCEPT;
663 
664     /*! \brief resize to zero, free memory */
clear()665     void clear() BMNOEXCEPT { clear_all(true); }
666 
667     /*!
668         \brief clear range (assign bit 0 for all planes)
669         \param left  - interval start
670         \param right - interval end (closed interval)
671         \param set_null - set cleared values to unassigned (NULL)
672     */
673     str_sparse_vector<CharType, BV, MAX_STR_SIZE>&
674         clear_range(size_type left, size_type right, bool set_null = false)
675     {
676         parent_type::clear_range(left, right, set_null);
677         return *this;
678     }
679 
680 
681     ///@}
682 
683 
684     // ------------------------------------------------------------
685     /*! @name Size, etc       */
686     ///@{
687 
688     /*! \brief return size of the vector
689         \return size of sparse vector
690     */
size()691     size_type size() const { return this->size_; }
692 
693     /*! \brief return true if vector is empty
694         \return true if empty
695     */
empty()696     bool empty() const { return (size() == 0); }
697 
698     /*! \brief resize vector
699         \param sz - new size
700     */
resize(size_type sz)701     void resize(size_type sz) { parent_type::resize(sz); }
702 
703     /*! \brief get maximum string length capacity
704         \return maximum string length sparse vector can take
705     */
max_str()706     static size_type max_str() { return sv_octet_planes; }
707 
708     /*! \brief get effective string length used in vector
709         Calculate and returns efficiency, how close are we
710         to the reserved maximum.
711         \return current string length maximum
712     */
713     size_type effective_max_str() const BMNOEXCEPT;
714 
715     /*! \brief get effective string length used in vector
716         \return current string length maximum
717     */
effective_vector_max()718     size_type effective_vector_max() const { return effective_max_str(); }
719 
720     /**
721         \brief recalculate size to exclude tail NULL elements
722         After this call size() will return the true size of the vector
723      */
724     void sync_size() BMNOEXCEPT;
725 
726     ///@}
727 
728 
729     // ------------------------------------------------------------
730     /*! @name Memory optimization/compression                    */
731     ///@{
732 
733     /*!
734         \brief run memory optimization for all vector planes
735         \param temp_block - pre-allocated memory block to avoid unnecessary re-allocs
736         \param opt_mode - requested compression depth
737         \param stat - memory allocation statistics after optimization
738     */
739     void optimize(
740        bm::word_t* temp_block = 0,
741        typename bvector_type::optmode opt_mode = bvector_type::opt_compress,
742        typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::statistics* stat = 0);
743 
744     /*!
745         @brief Calculates memory statistics.
746 
747         Function fills statistics structure containing information about how
748         this vector uses memory and estimation of max. amount of memory
749         bvector needs to serialize itself.
750 
751         @param st - pointer on statistics structure to be filled in.
752 
753         @sa statistics
754     */
755     void calc_stat(
756         struct str_sparse_vector<CharType, BV, MAX_STR_SIZE>::statistics* st
757         ) const BMNOEXCEPT;
758 
759 
760     ///@}
761 
762     // ------------------------------------------------------------
763     /*! @name Iterator access */
764     ///@{
765 
766     /** Provide const iterator access to container content  */
767     const_iterator begin() const BMNOEXCEPT;
768 
769     /** Provide const iterator access to the end    */
end()770     const_iterator end() const BMNOEXCEPT { return const_iterator(this, bm::id_max); }
771 
772     /** Get const_itertor re-positioned to specific element
773     @param idx - position in the sparse vector
774     */
get_const_iterator(size_type idx)775     const_iterator get_const_iterator(size_type idx) const BMNOEXCEPT
776         { return const_iterator(this, idx); }
777 
778      /** Provide back insert iterator
779     Back insert iterator implements buffered insertion, which is faster, than random access
780     or push_back
781     */
get_back_inserter()782     back_insert_iterator get_back_inserter()
783         { return back_insert_iterator(this); }
784 
785     ///@}
786 
787 
788 
789     // ------------------------------------------------------------
790     /*! @name Various traits                                     */
791     ///@{
792 
793     /** \brief trait if sparse vector is "compressed" (false)
794     */
795     static
is_compressed()796     bool is_compressed() BMNOEXCEPT { return false; }
797 
798     ///@}
799 
800     // ------------------------------------------------------------
801     /*! @name Char remapping, succinct utilities
802 
803         Remapping runs character usage analysis (frequency analysis)
804         based on that implements reduction of dit-depth thus improves
805         search performance and memory usage (both RAM and serialized).
806 
807         Remapping limits farther modifications of sparse vector.
808         (Use remapped vector as read-only).
809     */
810 
811     ///@{
812 
813     /**
814         Get character remapping status (true|false)
815     */
is_remap()816     bool is_remap() const BMNOEXCEPT { return remap_flags_ != 0; }
817 
818     /**
819         Build remapping profile and load content from another sparse vector
820         Remapped vector likely saves memory (both RAM and disk) but
821         should not be modified (should be read-only).
822 
823         \param str_sv - source sparse vector (assumed it is not remapped)
824     */
825     void remap_from(const str_sparse_vector& str_sv);
826 
827     /**
828         Build remapping profile and re-load content to save memory
829     */
830     void remap();
831 
832     /*!
833         Calculate flags which octets are present on each byte-plane.
834         @internal
835     */
836     void calc_octet_stat(octet_freq_matrix_type& octet_matrix) const BMNOEXCEPT;
837 
838     /*!
839         Compute optimal remap codes
840         @internal
841     */
842     static
843     void build_octet_remap(
844                 plane_octet_matrix_type& octet_remap_matrix1,
845                 plane_octet_matrix_type& octet_remap_matrix2,
846                 octet_freq_matrix_type& octet_occupancy_matrix);
847     /*!
848         remap string from external (ASCII) system to matrix internal code
849         @return true if remapping was ok, false if found incorrect value
850                 for the plane
851         @internal
852     */
853     static
854     bool remap_tosv(value_type*  BMRESTRICT      sv_str,
855                     size_type                    buf_size,
856                     const value_type* BMRESTRICT str,
857                     const plane_octet_matrix_type& BMRESTRICT octet_remap_matrix2
858                     ) BMNOEXCEPT;
859 
860     /*!
861         remap string from external (ASCII) system to matrix internal code
862         @internal
863     */
remap_tosv(value_type * sv_str,size_type buf_size,const value_type * str)864     bool remap_tosv(value_type*       sv_str,
865                     size_type         buf_size,
866                     const value_type* str) const BMNOEXCEPT
867     {
868         return remap_tosv(sv_str, buf_size, str, remap_matrix2_);
869     }
870     /*!
871         remap string from internal code to external (ASCII) system
872         @return true if remapping was ok, false if found incorrect value
873                 for the plane
874         @internal
875     */
876     static
877     bool remap_fromsv(
878             value_type*   BMRESTRICT     str,
879             size_type                    buf_size,
880             const value_type* BMRESTRICT sv_str,
881             const plane_octet_matrix_type& BMRESTRICT octet_remap_matrix1
882             ) BMNOEXCEPT;
883     /*!
884         re-calculate remap matrix2 based on matrix1
885         @internal
886     */
887     void recalc_remap_matrix2();
888 
889     ///@}
890 
891     // ------------------------------------------------------------
892     /*! @name Export content to C-style                          */
893     ///@{
894 
895     /**
896         \brief Bulk export strings to a C-style matrix of chars
897 
898         \param cmatr  - dest matrix (bm::heap_matrix)
899         \param idx_from - index in the sparse vector to export from
900         \param dec_size - decoding size (matrix column allocation should match)
901         \param zero_mem - set to false if target array is pre-initialized
902                           with 0s to avoid performance penalty
903 
904         \return number of actually exported elements (can be less than requested)
905     */
906     template<typename CharMatrix>
907     size_type decode(CharMatrix& cmatr,
908                      size_type   idx_from,
909                      size_type   dec_size,
910                      bool        zero_mem = true) const
911     {
912         return decode_substr(cmatr, idx_from, dec_size,
913                              0, MAX_STR_SIZE-1, zero_mem);
914     }
915 
916     /**
917         \brief Bulk export strings to a C-style matrix of chars
918 
919         \param cmatr  - dest matrix (bm::heap_matrix)
920         \param idx_from - index in the sparse vector to export from
921         \param dec_size - decoding size (matrix column allocation should match)
922         \param substr_from - sub-string position from
923         \param substr_to - sub-string position to
924         \param zero_mem - set to false if target array is pre-initialized
925                           with 0s to avoid performance penalty
926 
927         \return number of actually exported elements (can be less than requested)
928     */
929     template<typename CharMatrix>
930     size_type decode_substr(CharMatrix& cmatr,
931                      size_type   idx_from,
932                      size_type   dec_size,
933                      unsigned    substr_from,
934                      unsigned    substr_to,
935                      bool        zero_mem = true) const
936     {
937 
938     /// Decoder functor
939     /// @internal
940     ///
941     struct sv_decode_visitor_func
942     {
sv_decode_visitor_funcsv_decode_visitor_func943         sv_decode_visitor_func(CharMatrix& cmatr) BMNOEXCEPT2
944             : cmatr_(cmatr), mask_(0), sv_off_(0)
945         {}
946 
add_bitssv_decode_visitor_func947         void add_bits(size_type bv_offset,
948                       const unsigned char* BMRESTRICT bits, unsigned bits_size) BMNOEXCEPT
949         {
950             // can be negative (-1) when bv base offset = 0 and sv = 1,2..
951             size_type base = bv_offset - sv_off_;
952             value_type m = mask_;
953             const unsigned i = substr_i_;
954             for (unsigned j = 0; j < bits_size; ++j)
955             {
956                 size_type idx = bits[j] + base;
957                 value_type* BMRESTRICT str = cmatr_.row(idx);
958                 str[i] |= m;
959             } // for i
960         }
add_rangesv_decode_visitor_func961         void add_range(size_type bv_offset, size_type sz) BMNOEXCEPT
962         {
963             auto base = bv_offset - sv_off_;
964             value_type m = mask_;
965             const unsigned i = substr_i_;
966             for (size_type j = 0; j < sz; ++j)
967             {
968                 size_type idx = j + base;
969                 value_type* BMRESTRICT str = cmatr_.row(idx);
970                 str[i] |= m;
971             } // for i
972         }
973 
974         CharMatrix&            cmatr_;     ///< target array for reverse transpose
975         value_type             mask_;      ///< bit-plane mask
976         unsigned               substr_i_;  ///< i
977         size_type              sv_off_;    ///< SV read offset
978     };
979 
980 
981         BM_ASSERT(substr_from <= substr_to);
982         BM_ASSERT(substr_from <= MAX_STR_SIZE);
983         BM_ASSERT(cmatr.is_init());
984 
985         if (substr_to >= MAX_STR_SIZE)
986             substr_to = MAX_STR_SIZE-1;
987 
988         if (zero_mem)
989             cmatr.set_zero(); // TODO: set zero based on requested capacity
990 
991         size_type rows = size_type(cmatr.rows());
992         size_type max_sz = this->size() - idx_from;
993         if (max_sz < dec_size)
994             dec_size = max_sz;
995         if (rows < dec_size)
996             dec_size = rows;
997         if (!dec_size)
998             return dec_size;
999 
1000         sv_decode_visitor_func func(cmatr);
1001 
1002         for (unsigned i = substr_from; i <= substr_to; ++i)
1003         {
1004             unsigned bi = 0;
1005             func.substr_i_ = i - substr_from; // to put substr at the str[0]
1006 
1007             for (unsigned k = i * 8; k < (i * 8) + 8; ++k, ++bi)
1008             {
1009                 const bvector_type* bv = this->bmatr_.get_row(k);
1010                 if (!bv)
1011                     continue;
1012 
1013                 func.mask_ = value_type(1u << bi);
1014                 func.sv_off_ = idx_from;
1015 
1016                 size_type end = idx_from + dec_size;
1017                 bm::for_each_bit_range_no_check(*bv, idx_from, end-1, func);
1018 
1019             } // for k
1020         } // for i
1021 
1022         if (remap_flags_)
1023         {
1024             for (unsigned i = 0; i < dec_size; ++i)
1025             {
1026                 typename CharMatrix::value_type* str = cmatr.row(i);
1027                 remap_matrix1_.remapz(str);
1028             } // for i
1029         }
1030         return dec_size;
1031     }
1032 
1033     /**
1034         \brief Bulk import of strings from a C-style matrix of chars
1035 
1036         \param cmatr  - source matrix (bm::heap_matrix)
1037                         [in/out] parameter gets modified(corrupted)
1038                         in the process
1039         \param idx_from - destination index in the sparse vector
1040         \param imp_size - import size (number or rows to import)
1041     */
1042     template<typename CharMatrix>
import(CharMatrix & cmatr,size_type idx_from,size_type imp_size)1043     void import(CharMatrix& cmatr, size_type idx_from, size_type imp_size)
1044     {
1045         if (!imp_size)
1046             return;
1047         if (idx_from < this->size_) // in case it touches existing elements
1048         {
1049             // clear all planes in the range to provide corrrect import of 0 values
1050             this->clear_range(idx_from, idx_from + imp_size - 1);
1051         }
1052         import_no_check(cmatr, idx_from, imp_size);
1053     }
1054 
1055     /**
1056         \brief Bulk push-back import of strings from a C-style matrix of chars
1057 
1058         \param cmatr  - source matrix (bm::heap_matrix)
1059                         [in/out] parameter gets modified(corrupted)
1060                         in the process
1061         \param imp_size - import size (number or rows to import)
1062     */
1063     template<typename CharMatrix>
import_back(CharMatrix & cmatr,size_type imp_size)1064     void import_back(CharMatrix& cmatr, size_type imp_size)
1065     {
1066         if (!imp_size)
1067             return;
1068         import_no_check(cmatr, this->size(), imp_size);
1069     }
1070 
1071 
1072     ///@}
1073 
1074     // ------------------------------------------------------------
1075     /*! @name Merge, split, partition data                        */
1076     ///@{
1077 
1078     /**
1079         @brief copy range of values from another sparse vector
1080 
1081         Copy [left..right] values from the source vector,
1082         clear everything outside the range.
1083 
1084         \param sv - source vector
1085         \param left  - index from in losed diapason of [left..right]
1086         \param right - index to in losed diapason of [left..right]
1087         \param splice_null - "use_null" copy range for NULL vector or
1088                              do not copy it
1089     */
1090     void copy_range(const str_sparse_vector<CharType, BV, MAX_STR_SIZE>& sv,
1091                     size_type left, size_type right,
1092                     bm::null_support splice_null = bm::use_null);
1093 
1094     /**
1095         \brief merge with another sparse vector using OR operation
1096         Merge is different from join(), because it borrows data from the source
1097         vector, so it gets modified (destructive join)
1098 
1099         \param tr_sv - [in, out]argument vector to join with (vector mutates)
1100 
1101         \return self reference
1102      */
1103      str_sparse_vector<CharType, BV, MAX_STR_SIZE>&
1104      merge(str_sparse_vector<CharType, BV, MAX_STR_SIZE>& str_sv);
1105 
1106     /**
1107         Keep only specified interval in the sparse vector, clear all other
1108         elements.
1109 
1110         \param left  - interval start
1111         \param right - interval end (closed interval)
1112         \param splice_null - "use_null" copy range for NULL vector or not
1113      */
1114      void keep_range(size_type left, size_type right,
1115                     bm::null_support splice_null = bm::use_null);
1116 
1117     ///@}
1118 
1119     // ------------------------------------------------------------
1120 
1121     /*! \brief syncronize internal structures */
1122     void sync(bool force);
1123 
1124     /*!
1125         \brief check if another sparse vector has the same content and size
1126 
1127         \param sv        - sparse vector for comparison
1128         \param null_able - flag to consider NULL vector in comparison (default)
1129                            or compare only value content planes
1130 
1131         \return true, if it is the same
1132     */
1133     bool equal(const str_sparse_vector<CharType, BV, MAX_STR_SIZE>& sv,
1134                bm::null_support null_able = bm::use_null) const BMNOEXCEPT;
1135 
1136     /**
1137         \brief find position of compressed element by its rank
1138     */
1139     static
1140     bool find_rank(size_type rank, size_type& pos) BMNOEXCEPT;
1141 
1142     /**
1143         \brief size of sparse vector (may be different for RSC)
1144     */
effective_size()1145     size_type effective_size() const BMNOEXCEPT { return size(); }
1146 
1147 protected:
1148 
1149     /// @internal
1150     template<typename CharMatrix>
1151     void import_no_check(CharMatrix& cmatr,
1152                          size_type idx_from, size_type imp_size,
1153                          bool set_not_null = true)
1154     {
1155         BM_ASSERT (cmatr.is_init());
1156 
1157         unsigned max_str_size = 0;
1158         {
1159             for (unsigned j = 0; j < imp_size; ++j)
1160             {
1161                 typename CharMatrix::value_type* str = cmatr.row(j);
1162                 unsigned i;
1163                 for (i = 0; i < MAX_STR_SIZE; ++i)
1164                 {
1165                     value_type ch = str[i];
1166                     if (!ch)
1167                     {
1168                         max_str_size = (i > max_str_size) ? i : max_str_size;
1169                         break;
1170                     }
1171                     if (remap_flags_) // re-mapping is in effect
1172                     {
1173                         unsigned char remap_value =
1174                             remap_matrix2_.get(i, (unsigned char)(ch));
1175                         BM_ASSERT(remap_value);
1176                         if (!remap_value) // unknown dictionary element
1177                             throw_bad_value(0);
1178                         str[i] = CharType(remap_value);
1179                     }
1180                 } // for i
1181                 if (i == MAX_STR_SIZE)
1182                     max_str_size = i;
1183             } // for j
1184         }
1185 
1186         size_type bit_list[CharMatrix::n_rows];
1187         for (unsigned i = 0; i < max_str_size; ++i)
1188         {
1189             for (unsigned bi = 0; bi < 8; ++bi)
1190             {
1191                 unsigned n_bits = 0;
1192                 value_type mask = value_type(1u << bi);
1193                 for (size_type j = 0; j < imp_size; ++j)
1194                 {
1195                     typename CharMatrix::value_type* str = cmatr.row(j);
1196                     value_type ch = str[i];
1197                     if (!ch)
1198                         continue;
1199                     if (ch & mask)
1200                     {
1201                         bit_list[n_bits++] = idx_from + j;
1202                         str[i] ^= mask;
1203                     }
1204                 } // for j
1205                 if (n_bits) // set transposed bits to the target plane
1206                 {
1207                     unsigned plane = i*8 + bi;
1208                     bvector_type* bv = this->bmatr_.get_row(plane);
1209                     if (!bv)
1210                     {
1211                         bv = this->bmatr_.construct_row(plane);
1212                         bv->init();
1213                     }
1214                     bv->set(&bit_list[0], n_bits, BM_SORTED);
1215                 }
1216             } // for k
1217         } // for i
1218 
1219         size_type idx_to = idx_from + imp_size - 1;
1220         if (set_not_null)
1221         {
1222             bvector_type* bv_null = this->get_null_bvect();
1223             if (bv_null)
1224                 bv_null->set_range(idx_from, idx_to);
1225         }
1226         if (idx_to >= this->size())
1227             this->size_ = idx_to+1;
1228 
1229     }
1230 
1231     // ------------------------------------------------------------
1232     /*! @name Errors and exceptions                              */
1233     ///@{
1234 
1235     /**
1236         \brief throw range error
1237         \internal
1238     */
1239     static
1240     void throw_range_error(const char* err_msg);
1241 
1242     /**
1243         \brief throw domain error
1244         \internal
1245     */
1246     static
1247     void throw_bad_value(const char* err_msg);
1248 
1249     ///@}
1250 
1251     /*! \brief set value without checking boundaries */
1252     void set_value(size_type idx, const value_type* str);
1253 
1254     /*! \brief set value without checking boundaries or support of NULL */
1255     void set_value_no_null(size_type idx, const value_type* str);
1256 
1257     /*! \brief insert value without checking boundaries */
1258     void insert_value(size_type idx, const value_type* str);
1259 
1260     /*! \brief insert value without checking boundaries or support of NULL */
1261     void insert_value_no_null(size_type idx, const value_type* str);
1262 
1263 
size_internal()1264     size_type size_internal() const { return size(); }
resize_internal(size_type sz)1265     void resize_internal(size_type sz) { resize(sz); }
1266 
remap_size()1267     size_t remap_size() const { return remap_matrix1_.get_buffer().size(); }
get_remap_buffer()1268     const unsigned char* get_remap_buffer() const
1269                 { return remap_matrix1_.get_buffer().buf(); }
init_remap_buffer()1270     unsigned char* init_remap_buffer()
1271     {
1272         remap_matrix1_.init(true);
1273         return remap_matrix1_.get_buffer().data();
1274     }
set_remap()1275     void set_remap() { remap_flags_ = 1; }
1276 
1277 protected:
1278 
resolve_range(size_type from,size_type to,size_type * idx_from,size_type * idx_to)1279     bool resolve_range(size_type from, size_type to,
1280                        size_type* idx_from, size_type* idx_to) const
1281     {
1282         *idx_from = from; *idx_to = to; return true;
1283     }
1284 
get_remap_matrix()1285     const remap_matrix_type* get_remap_matrix() const
1286         { return &remap_matrix1_; }
get_remap_matrix()1287     remap_matrix_type* get_remap_matrix()
1288         { return &remap_matrix1_; }
1289 
1290 protected:
1291     template<class SVect> friend class sparse_vector_serializer;
1292     template<class SVect> friend class sparse_vector_deserializer;
1293 
1294 protected:
1295     unsigned                 remap_flags_;   ///< remapping status
1296     plane_octet_matrix_type  remap_matrix1_; ///< octet remap table 1
1297     plane_octet_matrix_type  remap_matrix2_; ///< octet remap table 2
1298 };
1299 
1300 //---------------------------------------------------------------------
1301 //---------------------------------------------------------------------
1302 
1303 
1304 template<class CharType, class BV, unsigned MAX_STR_SIZE>
str_sparse_vector(bm::null_support null_able,allocation_policy_type ap,size_type bv_max_size,const allocator_type & alloc)1305 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::str_sparse_vector(
1306         bm::null_support null_able,
1307         allocation_policy_type  ap,
1308         size_type               bv_max_size,
1309         const allocator_type&   alloc)
1310 : parent_type(null_able, ap, bv_max_size, alloc),
1311   remap_flags_(0)
1312 {}
1313 
1314 
1315 //---------------------------------------------------------------------
1316 
1317 template<class CharType, class BV, unsigned MAX_STR_SIZE>
str_sparse_vector(const str_sparse_vector & str_sv)1318 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::str_sparse_vector(
1319                                         const str_sparse_vector& str_sv)
1320 : parent_type(str_sv),
1321   remap_flags_(str_sv.remap_flags_),
1322   remap_matrix1_(str_sv.remap_matrix1_),
1323   remap_matrix2_(str_sv.remap_matrix2_)
1324 {}
1325 
1326 //---------------------------------------------------------------------
1327 
1328 template<class CharType, class BV, unsigned MAX_STR_SIZE>
swap(str_sparse_vector & str_sv)1329 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::swap(
1330                                 str_sparse_vector& str_sv) BMNOEXCEPT
1331 {
1332     parent_type::swap(str_sv);
1333     bm::xor_swap(remap_flags_, str_sv.remap_flags_);
1334     remap_matrix1_.swap(str_sv.remap_matrix1_);
1335     remap_matrix2_.swap(str_sv.remap_matrix2_);
1336 }
1337 
1338 //---------------------------------------------------------------------
1339 
1340 template<class CharType, class BV, unsigned MAX_STR_SIZE>
set(size_type idx,const value_type * str)1341 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::set(
1342                                 size_type idx, const value_type* str)
1343 {
1344     if (idx >= this->size())
1345         this->size_ = idx+1;
1346     set_value(idx, str);
1347 }
1348 
1349 //---------------------------------------------------------------------
1350 
1351 template<class CharType, class BV, unsigned MAX_STR_SIZE>
insert(size_type idx,const value_type * str)1352 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::insert(
1353                                 size_type idx, const value_type* str)
1354 {
1355     if (idx >= this->size())
1356     {
1357         this->size_ = idx+1;
1358         set_value(idx, str);
1359         return;
1360     }
1361     insert_value(idx, str);
1362     this->size_++;
1363 }
1364 
1365 //---------------------------------------------------------------------
1366 
1367 template<class CharType, class BV, unsigned MAX_STR_SIZE>
erase(size_type idx)1368 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::erase(size_type idx)
1369 {
1370     BM_ASSERT(idx < this->size_);
1371     if (idx >= this->size_)
1372         return;
1373     this->erase_column(idx);
1374     this->size_--;
1375 }
1376 
1377 //---------------------------------------------------------------------
1378 
1379 template<class CharType, class BV, unsigned MAX_STR_SIZE>
set_null(size_type idx)1380 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::set_null(size_type idx)
1381 {
1382     bvector_type* bv_null = this->get_null_bvect();
1383     if (bv_null)
1384         bv_null->clear_bit_no_check(idx);
1385     if (idx >= this->size_)
1386     {
1387         this->size_ = idx + 1;
1388     }
1389 }
1390 
1391 //---------------------------------------------------------------------
1392 
1393 template<class CharType, class BV, unsigned MAX_STR_SIZE>
set_value(size_type idx,const value_type * str)1394 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::set_value(
1395                                 size_type idx, const value_type* str)
1396 {
1397     set_value_no_null(idx, str);
1398     bvector_type* bv_null = this->get_null_bvect();
1399     if (bv_null)
1400         bv_null->set_bit_no_check(idx);
1401 }
1402 
1403 //---------------------------------------------------------------------
1404 
1405 template<class CharType, class BV, unsigned MAX_STR_SIZE>
set_value_no_null(size_type idx,const value_type * str)1406 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::set_value_no_null(
1407                                 size_type idx, const value_type* str)
1408 {
1409     for (unsigned i = 0; i < MAX_STR_SIZE; ++i)
1410     {
1411         CharType ch = str[i];
1412         if (!ch)
1413         {
1414             this->clear_value_planes_from(i*8, idx);
1415             return;
1416         }
1417 
1418         if (remap_flags_) // compressional re-mapping is in effect
1419         {
1420             unsigned char remap_value = remap_matrix2_.get(i, unsigned(ch));
1421             BM_ASSERT(remap_value);
1422             if (!remap_value) // unknown dictionary element
1423             {
1424                 this->clear_value_planes_from(i*8, idx);
1425                 return;
1426             }
1427             ch = CharType(remap_value);
1428         }
1429         this->bmatr_.set_octet(idx, i, (unsigned char)ch);
1430     } // for i
1431 }
1432 
1433 //---------------------------------------------------------------------
1434 
1435 template<class CharType, class BV, unsigned MAX_STR_SIZE>
insert_value(size_type idx,const value_type * str)1436 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::insert_value(
1437                                     size_type idx, const value_type* str)
1438 {
1439     insert_value_no_null(idx, str);
1440     this->insert_null(idx, true);
1441 }
1442 
1443 //---------------------------------------------------------------------
1444 
1445 template<class CharType, class BV, unsigned MAX_STR_SIZE>
insert_value_no_null(size_type idx,const value_type * str)1446 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::insert_value_no_null(
1447                                         size_type idx, const value_type* str)
1448 {
1449     for (unsigned i = 0; i < MAX_STR_SIZE; ++i)
1450     {
1451         CharType ch = str[i];
1452         if (!ch)
1453         {
1454             this->insert_clear_value_planes_from(i*8, idx);
1455             return;
1456         }
1457 
1458         if (remap_flags_) // compressional re-mapping is in effect
1459         {
1460             unsigned char remap_value = remap_matrix2_.get(i, unsigned(ch));
1461             BM_ASSERT(remap_value);
1462             if (!remap_value) // unknown dictionary element
1463             {
1464                 this->insert_clear_value_planes_from(i*8, idx);
1465                 return;
1466             }
1467             ch = CharType(remap_value);
1468         }
1469         this->bmatr_.insert_octet(idx, i, (unsigned char)ch);
1470     } // for i
1471 }
1472 
1473 
1474 //---------------------------------------------------------------------
1475 
1476 template<class CharType, class BV, unsigned MAX_STR_SIZE>
1477 typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::size_type
get(size_type idx,value_type * str,size_type buf_size)1478 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::get(
1479             size_type idx, value_type* str, size_type buf_size) const BMNOEXCEPT
1480 {
1481     size_type i = 0;
1482     for (; i < MAX_STR_SIZE; ++i)
1483     {
1484         if (i < buf_size)
1485             str[i] = 0;
1486         else
1487             break;
1488         CharType ch = CharType(this->bmatr_.get_octet(idx, i));
1489         if (ch == 0)
1490         {
1491             str[i] = 0;
1492             break;
1493         }
1494         str[i] = ch;
1495     } // for i
1496     if (remap_flags_)
1497     {
1498         remap_matrix1_.remap(str, i);
1499     }
1500     return i;
1501 }
1502 
1503 //---------------------------------------------------------------------
1504 
1505 template<class CharType, class BV, unsigned MAX_STR_SIZE>
optimize(bm::word_t * temp_block,typename bvector_type::optmode opt_mode,typename str_sparse_vector<CharType,BV,MAX_STR_SIZE>::statistics * st)1506 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::optimize(
1507       bm::word_t* temp_block,
1508       typename bvector_type::optmode opt_mode,
1509       typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::statistics* st)
1510 {
1511     typename bvector_type::statistics stbv;
1512     parent_type::optimize(temp_block, opt_mode, &stbv);
1513 
1514     if (st)
1515         st->add(stbv);
1516 }
1517 
1518 //---------------------------------------------------------------------
1519 
1520 template<class CharType, class BV, unsigned MAX_STR_SIZE>
calc_stat(struct str_sparse_vector<CharType,BV,MAX_STR_SIZE>::statistics * st)1521 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::calc_stat(
1522     struct str_sparse_vector<CharType, BV, MAX_STR_SIZE>::statistics* st
1523     ) const BMNOEXCEPT
1524 {
1525     BM_ASSERT(st);
1526     typename bvector_type::statistics stbv;
1527     parent_type::calc_stat(&stbv);
1528 
1529     st->reset();
1530 
1531     st->bit_blocks += stbv.bit_blocks;
1532     st->gap_blocks += stbv.gap_blocks;
1533     st->ptr_sub_blocks += stbv.ptr_sub_blocks;
1534     st->bv_count += stbv.bv_count;
1535     st->max_serialize_mem += stbv.max_serialize_mem + 8;
1536     st->memory_used += stbv.memory_used;
1537     st->gap_cap_overhead += stbv.gap_cap_overhead;
1538 
1539     size_t remap_mem_usage = sizeof(remap_flags_);
1540     remap_mem_usage += remap_matrix1_.get_buffer().mem_usage();
1541     remap_mem_usage += remap_matrix2_.get_buffer().mem_usage();
1542 
1543     st->memory_used += remap_mem_usage;
1544     if (remap_flags_) // use of remapping requires some extra storage
1545     {
1546         st->max_serialize_mem += (remap_mem_usage * 2);
1547     }
1548 }
1549 
1550 //---------------------------------------------------------------------
1551 
1552 template<class CharType, class BV, unsigned MAX_STR_SIZE>
compare(size_type idx,const value_type * str)1553 int str_sparse_vector<CharType, BV, MAX_STR_SIZE>::compare(
1554                      size_type idx,
1555                      const value_type* str) const BMNOEXCEPT
1556 {
1557     BM_ASSERT(str);
1558     int res = 0;
1559     if (remap_flags_)
1560     {
1561         for (unsigned i = 0; i < MAX_STR_SIZE; ++i)
1562         {
1563             CharType octet = str[i];
1564             CharType sv_octet = (CharType)this->bmatr_.get_octet(idx, i);
1565             if (!sv_octet)
1566             {
1567                 res = -octet; // -1 || 0
1568                 break;
1569             }
1570             const unsigned char* remap_row = remap_matrix1_.row(i);
1571             unsigned char remap_value = remap_row[unsigned(sv_octet)];
1572             BM_ASSERT(remap_value);
1573             res = (remap_value > octet) - (remap_value < octet);
1574             if (res || !octet)
1575                 break;
1576         } // for i
1577     }
1578     else
1579     {
1580         for (unsigned i = 0; i < MAX_STR_SIZE; ++i)
1581         {
1582             CharType octet = str[i];
1583             CharType sv_octet = (CharType)this->bmatr_.get_octet(idx, i);
1584             if (!sv_octet)
1585             {
1586                 res = -octet; // -1 || 0
1587                 break;
1588             }
1589             res = (sv_octet > octet) - (sv_octet < octet);
1590             if (res || !octet)
1591                 break;
1592         } // for i
1593     }
1594     return res;
1595 }
1596 
1597 //---------------------------------------------------------------------
1598 
1599 template<class CharType, class BV, unsigned MAX_STR_SIZE>
common_prefix_length(size_type idx1,size_type idx2)1600 unsigned str_sparse_vector<CharType, BV, MAX_STR_SIZE>::common_prefix_length(
1601                                 size_type idx1, size_type idx2) const BMNOEXCEPT
1602 {
1603     unsigned i = 0;
1604     for (; i < MAX_STR_SIZE; ++i)
1605     {
1606         CharType ch1 = CharType(this->bmatr_.get_octet(idx1, i));
1607         CharType ch2 = CharType(this->bmatr_.get_octet(idx2, i));
1608         if (!ch1 || !ch2)
1609         {
1610             if (i)
1611                 --i;
1612             break;
1613         }
1614         if (ch1 != ch2)
1615         {
1616             break;
1617         }
1618     } // for
1619 
1620     return i;
1621 }
1622 
1623 //---------------------------------------------------------------------
1624 
1625 template<class CharType, class BV, unsigned MAX_STR_SIZE>
1626 bool
find_rank(size_type rank,size_type & pos)1627 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::find_rank(
1628                                                 size_type rank,
1629                                                 size_type& pos) BMNOEXCEPT
1630 {
1631     BM_ASSERT(rank);
1632     pos = rank - 1;
1633     return true;
1634 }
1635 
1636 //---------------------------------------------------------------------
1637 
1638 template<class CharType, class BV, unsigned MAX_STR_SIZE>
1639 typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::size_type
effective_max_str()1640 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::effective_max_str()
1641                                                         const BMNOEXCEPT
1642 {
1643     for (int i = MAX_STR_SIZE-1; i >= 0; --i)
1644     {
1645         unsigned octet_plane = unsigned(i) * unsigned(sizeof(CharType)) * 8;
1646         for (unsigned j = 0; j < sizeof(CharType) * 8; ++j)
1647         {
1648             if (this->bmatr_.row(octet_plane+j))
1649                 return unsigned(i)+1;
1650         } // for j
1651     } // for i
1652     return 0;
1653 }
1654 
1655 //---------------------------------------------------------------------
1656 
1657 template<class CharType, class BV, unsigned MAX_STR_SIZE>
calc_octet_stat(octet_freq_matrix_type & octet_matrix)1658 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::calc_octet_stat(
1659                     octet_freq_matrix_type& octet_matrix) const BMNOEXCEPT
1660 {
1661     octet_matrix.init(true); // init and set zero
1662 
1663     const_iterator it(this);
1664     for(; it.valid(); ++it)
1665     {
1666         const value_type* s = *it; // get asciiz char*
1667         for (unsigned i = 0; true; ++i) // for each char in str
1668         {
1669             value_type ch = s[i];
1670             if (!ch)
1671                 break;
1672             typename octet_freq_matrix_type::value_type* row =
1673                                                     octet_matrix.row(i);
1674             unsigned ch_idx = (unsigned char)ch;
1675             row[ch_idx] += 1;
1676         } // for i
1677     } // for it
1678 
1679 }
1680 
1681 //---------------------------------------------------------------------
1682 
1683 template<class CharType, class BV, unsigned MAX_STR_SIZE>
build_octet_remap(plane_octet_matrix_type & octet_remap_matrix1,plane_octet_matrix_type & octet_remap_matrix2,octet_freq_matrix_type & octet_occupancy_matrix)1684 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::build_octet_remap(
1685             plane_octet_matrix_type& octet_remap_matrix1,
1686             plane_octet_matrix_type& octet_remap_matrix2,
1687             octet_freq_matrix_type& octet_occupancy_matrix)
1688 {
1689     octet_remap_matrix1.init(true); // init and set-zero
1690     octet_remap_matrix2.init(true);
1691 
1692     for (unsigned i = 0; i < octet_occupancy_matrix.rows(); ++i)
1693     {
1694         typename octet_freq_matrix_type::value_type* frq_row =
1695                                                 octet_occupancy_matrix.row(i);
1696 
1697         unsigned char* remap_row1 = octet_remap_matrix1.row(i);
1698         unsigned char* remap_row2 = octet_remap_matrix2.row(i);
1699 
1700         const typename plane_octet_matrix_type::size_type row_size =
1701                                              octet_occupancy_matrix.cols();
1702         for (unsigned remap_code = 1; true; ++remap_code)
1703         {
1704             typename octet_freq_matrix_type::size_type char_idx;
1705             bool found = bm::find_max_nz(frq_row, row_size, &char_idx);
1706             #if 0
1707             bool found = bm::find_first_nz(frq_row, row_size, &char_idx);
1708             #endif
1709             if (!found)
1710                 break;
1711             BM_ASSERT(char_idx);
1712             unsigned char ch = (unsigned char)char_idx;
1713             remap_row1[remap_code] = ch;
1714             remap_row2[ch] = (unsigned char)remap_code;
1715             frq_row[char_idx] = 0; // clear the picked char
1716         } // for
1717     } // for i
1718 }
1719 
1720 //---------------------------------------------------------------------
1721 
1722 template<class CharType, class BV, unsigned MAX_STR_SIZE>
recalc_remap_matrix2()1723 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::recalc_remap_matrix2()
1724 {
1725     BM_ASSERT(remap_flags_);
1726 
1727     remap_matrix2_.init(true);
1728 
1729     for (unsigned i = 0; i < remap_matrix1_.rows(); ++i)
1730     {
1731         const unsigned char* remap_row1 = remap_matrix1_.row(i);
1732               unsigned char* remap_row2 = remap_matrix2_.row(i);
1733         for (unsigned j = 1; j < remap_matrix1_.cols(); ++j)
1734         {
1735             if (remap_row1[j])
1736             {
1737                 unsigned ch_code = remap_row1[j];
1738                 remap_row2[ch_code] = (unsigned char)j;
1739                 BM_ASSERT(ch_code < 256);
1740             }
1741         } // for j
1742     } // for i
1743 }
1744 
1745 //---------------------------------------------------------------------
1746 
1747 template<class CharType, class BV, unsigned MAX_STR_SIZE>
remap_tosv(value_type * BMRESTRICT sv_str,size_type buf_size,const value_type * BMRESTRICT str,const plane_octet_matrix_type & BMRESTRICT octet_remap_matrix2)1748 bool str_sparse_vector<CharType, BV, MAX_STR_SIZE>::remap_tosv(
1749        value_type*   BMRESTRICT     sv_str,
1750        size_type                    buf_size,
1751        const value_type* BMRESTRICT str,
1752        const plane_octet_matrix_type& BMRESTRICT octet_remap_matrix2) BMNOEXCEPT
1753 {
1754     for (unsigned i = 0; i < buf_size; ++i)
1755     {
1756         CharType ch = str[i];
1757         if (!ch)
1758         {
1759             sv_str[i] = ch;
1760             break;
1761         }
1762         const unsigned char* remap_row = octet_remap_matrix2.row(i);
1763         unsigned char remap_value = remap_row[unsigned(ch)];
1764         if (!remap_value) // unknown dictionary element
1765         {
1766             return false;
1767         }
1768         sv_str[i] = CharType(remap_value);
1769     } // for i
1770     return true;
1771 }
1772 
1773 //---------------------------------------------------------------------
1774 
1775 template<class CharType, class BV, unsigned MAX_STR_SIZE>
remap_fromsv(value_type * BMRESTRICT str,size_type buf_size,const value_type * BMRESTRICT sv_str,const plane_octet_matrix_type & BMRESTRICT octet_remap_matrix1)1776 bool str_sparse_vector<CharType, BV, MAX_STR_SIZE>::remap_fromsv(
1777          value_type* BMRESTRICT str,
1778          size_type         buf_size,
1779          const value_type* BMRESTRICT sv_str,
1780          const plane_octet_matrix_type& BMRESTRICT octet_remap_matrix1
1781          ) BMNOEXCEPT
1782 {
1783     for (unsigned i = 0; i < buf_size; ++i)
1784     {
1785         CharType ch = sv_str[i];
1786         if (!ch)
1787         {
1788             str[i] = ch;
1789             break;
1790         }
1791         const unsigned char* remap_row = octet_remap_matrix1.row(i);
1792         unsigned char remap_value = remap_row[unsigned(ch)];
1793         if (!remap_value) // unknown dictionary element
1794         {
1795             return false;
1796         }
1797         str[i] = CharType(remap_value);
1798     } // for i
1799     return true;
1800 }
1801 
1802 //---------------------------------------------------------------------
1803 
1804 template<class CharType, class BV, unsigned MAX_STR_SIZE>
remap()1805 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::remap()
1806 {
1807     // TODO: get rid of tmp, implement a move-remapping
1808     str_sparse_vector<CharType, BV, MAX_STR_SIZE>
1809                                 sv_tmp(this->get_null_support());
1810     sv_tmp.remap_from(*this);
1811     sv_tmp.swap(*this);
1812 }
1813 
1814 //---------------------------------------------------------------------
1815 
1816 template<class CharType, class BV, unsigned MAX_STR_SIZE>
1817 void
remap_from(const str_sparse_vector & str_sv)1818 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::remap_from(
1819                                             const str_sparse_vector& str_sv)
1820 {
1821     if (str_sv.is_remap())
1822     {
1823         *this = str_sv;
1824         return;
1825     }
1826     this->clear_all(true);
1827     if (str_sv.empty()) // no content to remap
1828     {
1829         return;
1830     }
1831 
1832     octet_freq_matrix_type omatrix; // occupancy map
1833     str_sv.calc_octet_stat(omatrix);
1834 
1835     str_sv.build_octet_remap(remap_matrix1_, remap_matrix2_, omatrix);
1836     remap_flags_ = 1; // turn ON remapped mode
1837 
1838     const unsigned buffer_size = 1024 * 8;
1839 
1840     typedef bm::heap_matrix<CharType,
1841                     buffer_size,         // ROWS: number of strings in one batch
1842                     MAX_STR_SIZE,        // COLS
1843                     allocator_type> remap_buffer_type;
1844 
1845     remap_buffer_type cmatr(true);
1846     for (size_type i = 0; true; )
1847     {
1848         size_type dsize = str_sv.decode(cmatr, i, buffer_size, true);
1849         if (!dsize)
1850             break;
1851         this->import(cmatr, i, dsize);
1852         i += dsize;
1853     } // for i
1854 
1855     bvector_type* bv_null = this->get_null_bvect();
1856     if (bv_null)
1857     {
1858         const bvector_type* bv_null_arg = str_sv.get_null_bvector();
1859         if (bv_null_arg)
1860             *bv_null = *bv_null_arg;
1861         else
1862         {
1863             // TODO: exception? assert? maybe it is OK...
1864         }
1865     }
1866 
1867 }
1868 
1869 //---------------------------------------------------------------------
1870 
1871 template<class CharType, class BV, unsigned MAX_STR_SIZE>
sync(bool)1872 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::sync(bool /*force*/)
1873 {
1874     if (remap_flags_)
1875     {
1876         recalc_remap_matrix2();
1877     }
1878 }
1879 
1880 //---------------------------------------------------------------------
1881 
1882 template<class CharType, class BV, unsigned MAX_STR_SIZE>
equal(const str_sparse_vector<CharType,BV,MAX_STR_SIZE> & sv,bm::null_support null_able)1883 bool str_sparse_vector<CharType, BV, MAX_STR_SIZE>::equal(
1884                 const str_sparse_vector<CharType, BV, MAX_STR_SIZE>& sv,
1885                 bm::null_support null_able) const BMNOEXCEPT
1886 {
1887     // at this point both vectors should have the same remap settings
1888     // to be considered "equal".
1889     // Strictly speaking this is incorrect, because re-map and non-remap
1890     // vectors may have the same content
1891 
1892     if (remap_flags_ != sv.remap_flags_)
1893         return false;
1894     if (remap_flags_)
1895     {
1896         bool b;
1897         b = remap_matrix1_.get_buffer().equal(sv.remap_matrix1_.get_buffer());
1898         if (!b)
1899             return b;
1900         b = remap_matrix2_.get_buffer().equal(sv.remap_matrix2_.get_buffer());
1901         if (!b)
1902             return b;
1903     }
1904     return parent_type::equal(sv, null_able);
1905 }
1906 
1907 //---------------------------------------------------------------------
1908 
1909 template<class CharType, class BV, unsigned MAX_STR_SIZE>
copy_range(const str_sparse_vector<CharType,BV,MAX_STR_SIZE> & sv,size_type left,size_type right,bm::null_support splice_null)1910 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::copy_range(
1911                 const str_sparse_vector<CharType, BV, MAX_STR_SIZE>& sv,
1912                 size_type left, size_type right,
1913                 bm::null_support splice_null)
1914 {
1915     if (left > right)
1916         bm::xor_swap(left, right);
1917     this->clear_all(true);
1918 
1919     remap_flags_ = sv.remap_flags_;
1920     remap_matrix1_ = sv.remap_matrix1_;
1921     remap_matrix2_ = sv.remap_matrix2_;
1922 
1923     this->copy_range_planes(sv, left, right, splice_null);
1924     this->resize(sv.size());
1925 }
1926 
1927 //---------------------------------------------------------------------
1928 
1929 template<class CharType, class BV, unsigned MAX_STR_SIZE>
1930 str_sparse_vector<CharType, BV, MAX_STR_SIZE>&
merge(str_sparse_vector<CharType,BV,MAX_STR_SIZE> & str_sv)1931 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::merge(str_sparse_vector<CharType, BV, MAX_STR_SIZE>& str_sv)
1932 {
1933     size_type arg_size = str_sv.size();
1934     if (this->size_ < arg_size)
1935         resize(arg_size);
1936 
1937     // there is an assumption here that we only need to copy remap flags once
1938     // because we merge matrices with the same remaps
1939     // otherwise - undefined behavior
1940     //
1941     if (remap_flags_ != str_sv.remap_flags_)
1942     {
1943         remap_flags_ = str_sv.remap_flags_;
1944         remap_matrix1_ = str_sv.remap_matrix1_;
1945         remap_matrix2_ = str_sv.remap_matrix2_;
1946     }
1947 
1948     bvector_type* bv_null = this->get_null_bvect();
1949     unsigned planes = bv_null ? this->stored_planes() : this->planes();
1950 
1951     this->merge_matr(str_sv.bmatr_, planes);
1952 
1953     // our vector is NULL-able but argument is not (assumed all values are real)
1954     if (bv_null && !str_sv.is_nullable())
1955         bv_null->set_range(0, arg_size-1);
1956 
1957     return *this;
1958 
1959 }
1960 
1961 //---------------------------------------------------------------------
1962 
1963 template<class CharType, class BV, unsigned MAX_STR_SIZE>
keep_range(size_type left,size_type right,bm::null_support splice_null)1964 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::keep_range(
1965                 size_type left, size_type right,
1966                 bm::null_support splice_null)
1967 {
1968     if (right < left)
1969         bm::xor_swap(left, right);
1970     this->keep_range_no_check(left, right, splice_null);
1971 }
1972 
1973 //---------------------------------------------------------------------
1974 
1975 template<class CharType, class BV, unsigned MAX_STR_SIZE>
1976 typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator
begin()1977 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::begin() const BMNOEXCEPT
1978 {
1979     typedef typename
1980         str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator it_type;
1981     return it_type(this);
1982 }
1983 
1984 //---------------------------------------------------------------------
1985 
1986 template<class CharType, class BV, unsigned MAX_STR_SIZE>
clear_all(bool free_mem)1987 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::clear_all(bool free_mem) BMNOEXCEPT
1988 {
1989     parent_type::clear_all(free_mem);
1990 }
1991 
1992 //---------------------------------------------------------------------
1993 
1994 template<class CharType, class BV, unsigned MAX_STR_SIZE>
throw_range_error(const char * err_msg)1995 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::throw_range_error(
1996                                                            const char* err_msg)
1997 {
1998 #ifndef BM_NO_STL
1999     throw std::range_error(err_msg);
2000 #else
2001     BM_ASSERT_THROW(false, BM_ERR_RANGE);
2002 #endif
2003 }
2004 
2005 //---------------------------------------------------------------------
2006 
2007 template<class CharType, class BV, unsigned MAX_STR_SIZE>
throw_bad_value(const char * err_msg)2008 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::throw_bad_value(
2009                                                            const char* err_msg)
2010 {
2011 #ifndef BM_NO_STL
2012     if (!err_msg)
2013         err_msg = "Unknown/incomparable dictionary character";
2014     throw std::domain_error(err_msg);
2015 #else
2016     BM_ASSERT_THROW(false, BM_BAD_VALUE);
2017 #endif
2018 }
2019 
2020 
2021 //---------------------------------------------------------------------
2022 //
2023 //---------------------------------------------------------------------
2024 
2025 
2026 template<class CharType, class BV, unsigned MAX_STR_SIZE>
const_iterator()2027 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator::const_iterator() BMNOEXCEPT
2028 : sv_(0), substr_from_(0), substr_to_(MAX_STR_SIZE),
2029   pos_(bm::id_max), pos_in_buf_(~size_type(0))
2030 {
2031 }
2032 
2033 //---------------------------------------------------------------------
2034 
2035 template<class CharType, class BV, unsigned MAX_STR_SIZE>
const_iterator(const str_sparse_vector<CharType,BV,MAX_STR_SIZE>::const_iterator & it)2036 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator::const_iterator(
2037    const str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator& it) BMNOEXCEPT
2038 : sv_(it.sv_),
2039   substr_from_(it.substr_from_), substr_to_(it.substr_to_),
2040   pos_(it.pos_), pos_in_buf_(~size_type(0))
2041 {
2042 }
2043 
2044 //---------------------------------------------------------------------
2045 
2046 template<class CharType, class BV, unsigned MAX_STR_SIZE>
const_iterator(const str_sparse_vector<CharType,BV,MAX_STR_SIZE> * sv)2047 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator::const_iterator(
2048     const str_sparse_vector<CharType, BV, MAX_STR_SIZE>* sv) BMNOEXCEPT
2049 : sv_(sv), pos_(sv->empty() ? bm::id_max : 0), pos_in_buf_(~size_type(0))
2050 {
2051     substr_from_ = 0;
2052     substr_to_ = (unsigned) sv_->effective_max_str();
2053     buf_matrix_.resize(n_rows, substr_to_+1);
2054 
2055 }
2056 
2057 //---------------------------------------------------------------------
2058 
2059 template<class CharType, class BV, unsigned MAX_STR_SIZE>
const_iterator(const str_sparse_vector<CharType,BV,MAX_STR_SIZE> * sv,typename str_sparse_vector<CharType,BV,MAX_STR_SIZE>::size_type pos)2060 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator::const_iterator(
2061     const str_sparse_vector<CharType, BV, MAX_STR_SIZE>* sv,
2062     typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::size_type pos) BMNOEXCEPT
2063 : sv_(sv), pos_(pos >= sv->size() ? bm::id_max : pos), pos_in_buf_(~size_type(0))
2064 {
2065     substr_from_ = 0;
2066     substr_to_ = (unsigned) sv_->effective_max_str();
2067     buf_matrix_.resize(n_rows, substr_to_+1);
2068 }
2069 
2070 //---------------------------------------------------------------------
2071 
2072 template<class CharType, class BV, unsigned MAX_STR_SIZE>
set_substr(unsigned from,unsigned len)2073 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator::set_substr(
2074                     unsigned from, unsigned len) BMNOEXCEPT
2075 {
2076     BM_ASSERT(from < MAX_STR_SIZE);
2077     BM_ASSERT(len);
2078 
2079     unsigned max_str = sv_->effective_max_str();
2080 
2081     substr_from_ = from;
2082     substr_to_ = substr_from_ + len - 1;
2083     if (max_str < substr_to_)
2084         substr_to_ = max_str;
2085     buf_matrix_.resize(n_rows, len+1);
2086 }
2087 
2088 //---------------------------------------------------------------------
2089 
2090 template<class CharType, class BV, unsigned MAX_STR_SIZE>
2091 const typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::value_type*
value()2092 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator::value() const BMNOEXCEPT
2093 {
2094     BM_ASSERT(sv_);
2095     BM_ASSERT(this->valid());
2096     if (pos_in_buf_ == ~size_type(0))
2097     {
2098         if (!buf_matrix_.is_init())
2099             buf_matrix_.init();
2100         pos_in_buf_ = 0;
2101         size_type d = sv_->decode_substr(buf_matrix_, pos_, n_rows,
2102                                          substr_from_, substr_to_);
2103         if (!d)
2104         {
2105             pos_ = bm::id_max;
2106             return 0;
2107         }
2108     }
2109     return buf_matrix_.row(pos_in_buf_);
2110 }
2111 
2112 //---------------------------------------------------------------------
2113 
2114 template<class CharType, class BV, unsigned MAX_STR_SIZE>
2115 void
go_to(typename str_sparse_vector<CharType,BV,MAX_STR_SIZE>::size_type pos)2116 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator::go_to(
2117    typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::size_type pos
2118    ) BMNOEXCEPT
2119 {
2120     pos_ = (!sv_ || pos >= sv_->size()) ? bm::id_max : pos;
2121     pos_in_buf_ = ~size_type(0);
2122 }
2123 
2124 //---------------------------------------------------------------------
2125 
2126 template<class CharType, class BV, unsigned MAX_STR_SIZE>
2127 void
advance()2128 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::const_iterator::advance() BMNOEXCEPT
2129 {
2130     if (pos_ == bm::id_max) // nothing to do, we are at the end
2131         return;
2132     ++pos_;
2133 
2134     if (pos_ >= sv_->size())
2135         this->invalidate();
2136     else
2137     {
2138         if (pos_in_buf_ != ~size_type(0))
2139         {
2140             ++pos_in_buf_;
2141             if (pos_in_buf_ >= n_rows)
2142                 pos_in_buf_ = ~size_type(0);
2143         }
2144     }
2145 }
2146 
2147 //---------------------------------------------------------------------
2148 //
2149 //---------------------------------------------------------------------
2150 
2151 template<class CharType, class BV, unsigned MAX_STR_SIZE>
back_insert_iterator()2152 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::back_insert_iterator() BMNOEXCEPT
2153 : sv_(0), bv_null_(0), pos_in_buf_(~size_type(0)), prev_nb_(0)
2154 {}
2155 
2156 //---------------------------------------------------------------------
2157 
2158 template<class CharType, class BV, unsigned MAX_STR_SIZE>
back_insert_iterator(str_sparse_vector<CharType,BV,MAX_STR_SIZE> * sv)2159 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::back_insert_iterator(
2160            str_sparse_vector<CharType, BV, MAX_STR_SIZE>* sv) BMNOEXCEPT
2161 : sv_(sv), pos_in_buf_(~size_type(0))
2162 {
2163     if (sv)
2164     {
2165         prev_nb_ = sv_->size() >> bm::set_block_shift;
2166         bv_null_ = sv_->get_null_bvect();
2167     }
2168     else
2169     {
2170         bv_null_ = 0; prev_nb_ = 0;
2171     }
2172 }
2173 
2174 //---------------------------------------------------------------------
2175 
2176 template<class CharType, class BV, unsigned MAX_STR_SIZE>
back_insert_iterator(const str_sparse_vector<CharType,BV,MAX_STR_SIZE>::back_insert_iterator & bi)2177 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::back_insert_iterator(
2178 const str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator& bi) BMNOEXCEPT
2179 : sv_(bi.sv_), bv_null_(bi.bv_null_), pos_in_buf_(~size_type(0)), prev_nb_(bi.prev_nb_)
2180 {
2181     BM_ASSERT(bi.empty());
2182 }
2183 
2184 //---------------------------------------------------------------------
2185 
2186 template<class CharType, class BV, unsigned MAX_STR_SIZE>
~back_insert_iterator()2187 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::~back_insert_iterator()
2188 {
2189     this->flush();
2190 }
2191 
2192 //---------------------------------------------------------------------
2193 
2194 template<class CharType, class BV, unsigned MAX_STR_SIZE>
2195 bool
empty()2196 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::empty()
2197                                                                 const BMNOEXCEPT
2198 {
2199     return (pos_in_buf_ == ~size_type(0) || !sv_);
2200 }
2201 
2202 //---------------------------------------------------------------------
2203 
2204 template<class CharType, class BV, unsigned MAX_STR_SIZE>
flush()2205 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::flush()
2206 {
2207     if (this->empty())
2208         return;
2209 
2210     size_type imp_idx = sv_->size();
2211     sv_->import_no_check(buf_matrix_, imp_idx, pos_in_buf_+1, false);
2212     pos_in_buf_ = ~size_type(0);
2213     block_idx_type nb = sv_->size() >> bm::set_block_shift;
2214     if (nb != prev_nb_)
2215     {
2216         // optimize all previous blocks in all planes
2217         sv_->optimize_block(prev_nb_);
2218         prev_nb_ = nb;
2219     }
2220 }
2221 
2222 //---------------------------------------------------------------------
2223 
2224 template<class CharType, class BV, unsigned MAX_STR_SIZE>
add(const typename str_sparse_vector<CharType,BV,MAX_STR_SIZE>::back_insert_iterator::value_type * v)2225 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::add(
2226 const typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::value_type* v)
2227 {
2228     if (!v)
2229     {
2230         this->add_null();
2231         return;
2232     }
2233     size_type buf_idx = this->pos_in_buf_; // offset in
2234     size_type sz = sv_->size();
2235 
2236     this->add_value(v);
2237     if (bv_null_)
2238     {
2239         bv_null_->set_bit_no_check(sz + buf_idx + 1);
2240     }
2241 }
2242 
2243 //---------------------------------------------------------------------
2244 
2245 template<class CharType, class BV, unsigned MAX_STR_SIZE>
add_null()2246 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::add_null()
2247 {
2248     /*size_type buf_idx = */this->add_value("");
2249 }
2250 
2251 //---------------------------------------------------------------------
2252 
2253 template<class CharType, class BV, unsigned MAX_STR_SIZE>
add_null(typename str_sparse_vector<CharType,BV,MAX_STR_SIZE>::back_insert_iterator::size_type count)2254 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::add_null(
2255 typename str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::size_type count)
2256 {
2257     for (size_type i = 0; i < count; ++i) // TODO: optimization
2258         this->add_value("");
2259 }
2260 
2261 
2262 //---------------------------------------------------------------------
2263 
2264 template<class CharType, class BV, unsigned MAX_STR_SIZE>
2265 void
add_value(const str_sparse_vector<CharType,BV,MAX_STR_SIZE>::back_insert_iterator::value_type * v)2266 str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::add_value(
2267 const str_sparse_vector<CharType, BV, MAX_STR_SIZE>::back_insert_iterator::value_type* v)
2268 {
2269     BM_ASSERT(sv_);
2270     BM_ASSERT(v);
2271     if (pos_in_buf_ == ~size_type(0))
2272     {
2273         if (!buf_matrix_.is_init())
2274             buf_matrix_.init();
2275         pos_in_buf_ = 0; buf_matrix_.set_zero();
2276     }
2277     else
2278     if (pos_in_buf_ >= buffer_matrix_type::n_rows-1)
2279     {
2280         this->flush();
2281         pos_in_buf_ = 0; buf_matrix_.set_zero();
2282     }
2283     else
2284     {
2285         ++pos_in_buf_;
2286     }
2287 
2288     value_type* r = buf_matrix_.row(pos_in_buf_);
2289     for (unsigned i = 0; i < buffer_matrix_type::n_columns; ++i)
2290     {
2291         r[i] = v[i];
2292         if (!r[i])
2293             break;
2294     } // for i
2295     //return pos_in_buf_;
2296 }
2297 
2298 //---------------------------------------------------------------------
2299 
2300 template<class CharType, class BV, unsigned MAX_STR_SIZE>
sync_size()2301 void str_sparse_vector<CharType, BV, MAX_STR_SIZE>::sync_size() BMNOEXCEPT
2302 {
2303     const bvector_type* bv_null = this->get_null_bvector();
2304     if (!bv_null)
2305         return;
2306     bool found = bv_null->find_reverse(this->size_);
2307     this->size_ += found;
2308 }
2309 
2310 //---------------------------------------------------------------------
2311 
2312 } // namespace
2313 
2314 #endif
2315