1 #ifndef BMDBG__H__INCLUDED__
2 #define BMDBG__H__INCLUDED__
3 /*
4 Copyright(c) 2002-2017 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10     http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 For more information please visit:  http://bitmagic.io
19 */
20 
21 /*! \file bmdbg.h
22     \brief Debugging functions (internal)
23 */
24 
25 
#include <cstdio>
#include <stdlib.h>
#include <cassert>
#include <memory>
#include <time.h>

#include <iostream>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <string>
#include <vector>

#include "bmalgo_similarity.h"
#include "bmsparsevec_serial.h"
#include "bmdef.h"
41 
42 
43 
44 #ifdef _MSC_VER
45 #pragma warning( push )
46 #pragma warning( disable : 4311 4312 4127)
47 #endif
48 
49 namespace bm
50 {
51 
52 inline
PrintGap(const bm::gap_word_t * gap_buf)53 void PrintGap(const bm::gap_word_t* gap_buf)
54 {
55     unsigned len = (*gap_buf >> 3);
56     std::cout << "[" << *gap_buf << " len=" << len << "] ";
57     for (unsigned i = 0; i < len; ++i)
58     {
59         ++gap_buf;
60         std::cout << *gap_buf << "; ";
61     }
62     std::cout << std::endl;
63 }
64 
65 inline
66 void PrintDGap(const bm::gap_word_t* gap_buf, unsigned gap_len=0)
67 {
68 
69     unsigned len = gap_len ? gap_len : (*gap_buf >> 3);
70     std::cout << "[" " len=" << len << "] ";
71     unsigned i = gap_len ? 0 : 1;
72     for (; i < len; ++i)
73     {
74         std::cout << gap_buf[i] << "; ";
75     }
76     std::cout << std::endl;
77 }
78 
/// Integer logarithm base 2 (position of the highest set bit).
///
/// @return floor(log2(value)); 0 for value 0 and value 1
inline unsigned int iLog2(unsigned int value)
{
    unsigned int pos = 0;
    for (unsigned int rest = value; rest > 1; rest >>= 1)
        ++pos;
    return pos;
}
85 
86 inline
PrintGammaCode(unsigned value)87 unsigned PrintGammaCode(unsigned value)
88 {
89     unsigned bits = 0;
90     // Elias gamma encode
91     {
92         unsigned l = iLog2(value);
93         //cout << "log2=" << l << endl;
94         for (unsigned i = 0; i < l; ++i)
95         {
96             std::cout << 0;
97             ++bits;
98         }
99         std::cout << 1; ++bits;
100         for (unsigned i = 0; i < l; ++i)
101         {
102             if (value & 1 << i)
103                 std::cout << 1;
104             else
105                 std::cout << 0;
106             ++bits;
107         }
108     }
109     return bits;
110 }
111 
112 inline
113 void PrintDGapGamma(const bm::gap_word_t* gap_buf, unsigned gap_len=0)
114 {
115     unsigned total = 0;
116     unsigned len = gap_len ? gap_len : (*gap_buf >> 3);
117     std::cout << "[" " len=" << len << "] ";
118     unsigned i = gap_len ? 0 : 1;
119     for (; i < len; ++i)
120     {
121         unsigned v = gap_buf[i];
122 
123         unsigned bits = PrintGammaCode(v+1);
124         std::cout << "; ";
125         total += bits;
126     }
127     std::cout << "  gamma_bits=" << total << " src_bits =" << len * 16;
128     std::cout << std::endl;
129 
130 }
131 
/// Read dump file into an STL container (vector of some basic type)
///
/// Only whole elements are loaded: when the file size is not a multiple of
/// sizeof(VT::value_type) the trailing partial element is ignored, so the
/// read never runs past the container storage.
///
/// @param fname  name of the file to read (opened in binary mode)
/// @param data   destination container, resized to the number of elements
///
/// @return  0 - if reading went well (including an empty input)
///         -1 - file cannot be opened (data is left untouched)
///         -2 - file size cannot be determined or the read failed
///              (data is emptied)
///
template<class VT>
int read_dump_file(const std::string& fname, VT& data)
{
    typedef typename VT::value_type  value_type;

    std::ifstream fin(fname.c_str(), std::ios::in | std::ios::binary);
    if (!fin.good())
    {
        return -1;
    }
    fin.seekg(0, std::ios::end);
    const std::streamoff fpos = fin.tellg();
    if (fpos < 0) // tellg() reports failure as -1
    {
        data.resize(0);
        return -2;
    }
    const size_t fsize = size_t(fpos);
    const size_t elements = fsize / sizeof(value_type);

    data.resize(elements);

    if (!elements)
    {
        return 0; // empty input (or less than one whole element)
    }
    fin.seekg(0, std::ios::beg);
    // read exactly the bytes the container can hold
    fin.read((char*) &data[0], std::streamsize(elements * sizeof(value_type)));
    if (!fin.good())
    {
        data.resize(0);
        return -2;
    }
    return 0;
}
165 
166 template<class TBV>
167 void LoadBVector(const char* fname, TBV& bvector, unsigned* file_size=0)
168 {
169     std::ifstream bv_file (fname, std::ios::in | std::ios::binary);
170     if (!bv_file.good())
171     {
172         std::cout << "Cannot open file: " << fname << std::endl;
173         exit(1);
174     }
175     bv_file.seekg(0, std::ios_base::end);
176     unsigned length = (unsigned)bv_file.tellg();
177     if (length == 0)
178     {
179         std::cout << "Empty file:" << fname << std::endl;
180         exit(1);
181     }
182     if (file_size)
183         *file_size = length;
184 
185     bv_file.seekg(0, std::ios::beg);
186 
187     char* buffer = new char[length];
188 
189     bv_file.read(buffer, length);
190 
191     bm::deserialize(bvector, (unsigned char*)buffer);
192 
193     delete [] buffer;
194 }
195 
196 template<class TBV>
SaveBVector(const char * fname,const TBV & bvector)197 void SaveBVector(const char* fname, const TBV& bvector)
198 {
199     std::ofstream bfile (fname, std::ios::out | std::ios::binary);
200     if (!bfile.good())
201     {
202         std::cout << "Cannot open file: " << fname << std::endl;
203         exit(1);
204     }
205     typename TBV::statistics st1;
206     bvector.calc_stat(&st1);
207 
208     unsigned char* blob = new unsigned char[st1.max_serialize_mem];
209     size_t blob_size = bm::serialize(bvector, blob);
210 
211 
212     bfile.write((char*)blob, std::streamsize(blob_size));
213     bfile.close();
214 
215     delete [] blob;
216 }
217 
/// Write a memory blob to the file "<name_prefix>-<num><ext>".
///
/// If the file cannot be opened a message is printed to stdout and the
/// process is terminated with exit(1).
///
/// @param name_prefix  leading part of the file name
/// @param num          number appended after a '-' separator
/// @param ext          trailing part of the file name (appended as is)
/// @param blob         bytes to write
/// @param blob_size    number of bytes to write
inline
void SaveBlob(const char* name_prefix, unsigned num, const char* ext,
              const unsigned char* blob, size_t blob_size)
{
    std::stringstream name_builder;
    name_builder << name_prefix << "-" << num << ext;
    const std::string path = name_builder.str();

    std::ofstream out(path.c_str(), std::ios::out | std::ios::binary);
    if (!out.good())
    {
        std::cout << "Cannot open file: " << path.c_str() << std::endl;
        exit(1);
    }
    out.write((char*)blob, std::streamsize(blob_size));
    out.close();
}
236 
237 
/// Print all bits of an integral value to stdout, lowest bit first.
///
/// For types wider than 16 bits a '-' separator is printed after the
/// first 16 bits.
template<typename V>
void PrintBinary(V val)
{
    const unsigned total_bits = unsigned(sizeof(V) * 8);
    const bool use_separator = total_bits > 16;
    for (unsigned pos = 0; pos != total_bits; ++pos)
    {
        const unsigned bit = (unsigned)((val >> pos) & 1);
        std::cout << bit;
        if (use_separator && pos == 15)
            std::cout << "-";
    }
}
248 
249 inline
PrintBits32(unsigned val)250 void PrintBits32(unsigned val)
251 {
252     PrintBinary(val);
253 }
254 
255 inline
PrintDistanceMatrix(const unsigned distance[bm::set_block_plane_cnt][bm::set_block_plane_cnt])256 void PrintDistanceMatrix(
257    const unsigned distance[bm::set_block_plane_cnt][bm::set_block_plane_cnt])
258 {
259     for (unsigned i = 0; i < bm::set_block_plane_cnt; ++i)
260     {
261         const unsigned* row = distance[i];
262         std::cout << i << ": ";
263         for (unsigned j = i; j < bm::set_block_plane_cnt; ++j)
264         {
265             std::cout << std::setw(4) << std::setfill('0') << row[j] << " ";
266         }
267         std::cout << std::endl;
268     }
269 }
270 
/// Print a matrix to stdout, one row per line prefixed with the row index.
///
/// @param tmatrix  matrix providing rows(), cols(), row(i) and value_type
/// @param cols     number of columns to print; 0 means tmatrix.cols()
/// @param binary   when true each cell is printed with PrintBinary(),
///                 otherwise as a number zero-padded to 4 digits
template<typename TM>
void PrintTMatrix(const TM& tmatrix, unsigned cols=0, bool binary = false)
{
    typedef typename TM::value_type cell_type;

    unsigned columns = cols ? cols : tmatrix.cols();
    for (unsigned r = 0; r < tmatrix.rows(); ++r)
    {
        const cell_type* cells = tmatrix.row(r);
        std::cout << r << ": ";
        if (r < 10) // extra space after one-digit row indexes
            std::cout << " ";
        for (unsigned c = 0; c < columns; ++c)
        {
            if (binary)
            {
                PrintBinary(cells[c]);
                continue;
            }
            std::cout << std::setw(4) << std::setfill('0') << cells[c] << " ";
        }
        std::cout << std::endl;
    }
}
294 
/// Binary code string converted to number
/// Bits are expected left to right: the first character of the string is
/// bit 0 of the result, the second is bit 1 and so on.
///
/// Parsing stops at the end of the string or after as many bits as an
/// unsigned holds. Any character other than '0' and '1' triggers assert(0)
/// (when asserts are compiled out it is skipped without consuming a bit
/// position).
///
/// @param str  zero terminated string of '0' and '1' characters
/// @return     value assembled from the string
///
inline
unsigned BinStrLR(const char* str)
{
    unsigned value = 0;
    unsigned bit_idx = 0;
    for (; *str; ++str)
    {
        switch(*str)
        {
        case '0':
            ++bit_idx;
            break;
        case '1':
            // unsigned literal: shifting into the top bit must not go
            // through a signed int
            value |= (1u << bit_idx);
            ++bit_idx;
            break;
        default:
            assert(0);
        }
        if (bit_idx == sizeof(unsigned) * 8)
            break;
    }
    return value;
}
322 
/// Print per-block counters of a bit-vector to stdout, 10 values per line,
/// each line followed by the running sum, then the grand total.
///
/// The counters are obtained from bv.count_blocks(), which fills an array of
/// 128000 zero-initialized entries and returns the index of the last block.
///
/// @param bv  bit-vector providing count_blocks(unsigned*)
template<class BV>
void print_blocks_count(const BV& bv)
{
    const unsigned sz = 128000;
    // vector owns the storage: released even if something below throws
    std::vector<unsigned> bc_arr(sz, 0u);

    unsigned last_block = bv.count_blocks(&bc_arr[0]);
    if (last_block >= sz) // never read past the counters array
        last_block = sz - 1;
    unsigned sum = 0;

    for (unsigned i = 0; i <= last_block; ++i)
    {
        std::cout << i << ":";

        // inner loop shares the index with the outer one and prints one
        // row of (at most) 10 counters
        unsigned j = 0;
        for (; i <= last_block; ++i)
        {
            std::cout << std::setw(5) << std::setfill('0') << bc_arr[i] << " ";
            sum += bc_arr[i];
            if (++j == 10) break;
        }
        std::cout << " | " << sum << std::endl;
    }
    std::cout << "Total=" << sum << std::endl;
}
/// Print one block counter as part of a running table (10 values per row,
/// each complete row followed by the running sum).
///
/// The function keeps its position in function-local static variables, so
/// it is meant to be called for increasing block indexes; a call with
/// i == 0 starts a new table and resets all of that state.
///
/// @param i      block index
/// @param count  counter value to print for the block
inline
void print_bc(unsigned i, unsigned count)
{
    static unsigned sum = 0;
    static unsigned row_idx = 0;
    static unsigned prev = 0;

    if (i == 0)
    {
        // new table: forget everything from a previous sequence,
        // including the last seen index used for gap detection
        sum = row_idx = 0;
        prev = 0;
    }
    else
    {
        // index jumped: emit a zero entry for the block after the last one
        // NOTE(review): only one filler entry is printed per jump, wider
        // gaps are not fully padded - confirm this is intended
        if (prev +1 < i)
            print_bc(prev+1, 0);
        prev = i;
    }

    if (row_idx == 0)
    {
        std::cout << i << ":";
    }

    std::cout << std::setw(5) << std::setfill('0') << count << " ";
    sum += count;

    ++row_idx;
    if (row_idx == 10)
    {
        row_idx = 0;
        std::cout << " | " << sum << std::endl;
    }
}
384 
385 template<class BV>
print_bvector_stat(const BV & bvect)386 size_t print_bvector_stat(const BV& bvect)
387 {
388     typename BV::statistics st;
389     bvect.calc_stat(&st);
390 
391     typename serializer<BV>::buffer buf;
392     bm::serializer<BV> ser;
393     ser.serialize(bvect, buf, &st);
394     auto ssize = buf.size();
395 
396     std::cout << " - Blocks: [ "
397               << "B:"     << st.bit_blocks
398               << ", G:"   << st.gap_blocks << "] "
399               << " count() = " << bvect.count()
400               << ", mem = " << st.memory_used << " " << (st.memory_used / (1024 * 1024)) << "MB "
401               << ", max smem:" << st.max_serialize_mem << " " << (st.max_serialize_mem / (1024 * 1024)) << "MB "
402               << " compressed = " << ssize << " " << (ssize / (1024 * 1024)) << "MB "
403               << std::endl;
404     return ssize;
405 }
406 
407 
/// Print a block-by-block map of a bit-vector to stdout.
///
/// Runs of FULL blocks are printed as {F.first:last}, GAP blocks as
/// [GAP ...] with bit count, level and length, bit blocks as (BIT ...) with
/// bit count and number of zero words. The total "gap_efficiency" is
/// printed at the end.
///
/// @param bv      bit-vector to inspect (through its blocks manager)
/// @param blocks  number of blocks to scan; 0 means bm::set_total_blocks
template<class BV>
void print_stat(const BV& bv, typename BV::block_idx_type blocks = 0)
{
    const typename BV::blocks_manager_type& bman = bv.get_blocks_manager();

    // accumulated below but not reported by this function
    bm::id_t count = 0;
    // number of entries printed since the last line break
    int printed = 0;

    int total_gap_eff = 0;

    if (!blocks)
    {
        blocks = bm::set_total_blocks;
    }

    typename BV::block_idx_type nb;
    typename BV::block_idx_type nb_prev = 0;
    for (nb = 0; nb < blocks; ++nb)
    {
        unsigned i0, j0;
        bm::get_block_coord(nb, i0, j0);
        const bm::word_t* blk = bman.get_block(i0, j0);

        // not allocated: nothing to print (nb_prev is not advanced either)
        if (!blk)
           continue;

        if (IS_FULL_BLOCK(blk))
        {
           if (BM_IS_GAP(blk)) // gap block
           {
               std::cout << "[Alert!" << nb << "]";
               assert(0);
           }

           // Scan the run of FULL blocks. The block that is examined is nb
           // (advanced together with i); i only bounds the scan by
           // bm::set_total_blocks, the `blocks` argument is not applied here.
           // On exit nb is stepped back to the last block found FULL.
           typename BV::block_idx_type start = nb;
           for(auto i = nb+1; i < bm::set_total_blocks; ++i, ++nb)
           {
               bm::get_block_coord(nb, i0, j0);
               blk = bman.get_block(i0, j0);
               if (IS_FULL_BLOCK(blk))
               {
                 if (BM_IS_GAP(blk)) // gap block
                 {
                     std::cout << "[Alert!" << nb << "]";
                     assert(0);
                     --nb;
                     break;
                 }

               }
               else
               {
                  --nb;
                  break;
               }
           }

           std::cout << "{F." << start << ":" << nb << "}";
           ++printed;
        }
        else
        {
            // distance marker when this block does not directly follow the
            // previously printed one
            // NOTE(review): for nb == 0 the expression (nb-1) presumably
            // wraps around (unsigned index type), which prints "..0.." -
            // confirm whether that is intended
            if ((nb-1) != nb_prev)
            {
                std::cout << ".." << (size_t)nb-nb_prev << "..";
            }

            if (BM_IS_GAP(blk))
            {
                unsigned bc = bm::gap_bit_count(BMGAP_PTR(blk));
                /*unsigned sum = */bm::gap_control_sum(BMGAP_PTR(blk));
                unsigned level = bm::gap_level(BMGAP_PTR(blk));
                count += bc;
               unsigned len = bm::gap_length(BMGAP_PTR(blk))-1;
               // 2 bytes per set bit vs 2 bytes per GAP word
               unsigned raw_size=bc*2;
               unsigned cmr_len=len*2;
               // NOTE(review): unsigned subtraction, wraps when
               // cmr_len > raw_size - confirm
               size_t mem_eff = raw_size - cmr_len;
               total_gap_eff += unsigned(mem_eff);

               unsigned i,j;
               bm::get_block_coord(nb, i, j);
               std::cout << " [GAP " << nb << "(" << i << "," << j << ")"
                         << "=" << bc << ":" << level << "-L" << len << "(" << mem_eff << ")]";
                ++printed;
            }
            else // bitset
            {
                unsigned bc = bm::bit_block_count(blk);

                // count the words of the block that are all zero
                unsigned zw = 0;
                for (unsigned i = 0; i < bm::set_block_size; ++i)
                {
                    zw += (blk[i] == 0);
                }

                count += bc;
                std::cout << " (BIT " << nb << "=" << bc << "[" << zw << "])";
                ++printed;
            }
        }
        // line break after every 10 printed entries
        if (printed == 10)
        {
            printed = 0;
            printf("\n");
        }
        nb_prev = nb;
    } // for nb
    std::cout << std::endl << "gap_efficiency=" << total_gap_eff << std::endl;

}
518 
519 template<class BV>
compute_serialization_size(const BV & bv)520 size_t compute_serialization_size(const BV& bv)
521 {
522     BM_DECLARE_TEMP_BLOCK(tb)
523     unsigned char*  buf = 0;
524     typename BV::size_type blob_size = 0;
525     try
526     {
527         bm::serializer<BV> bvs(typename BV::allocator_type(), tb);
528         //bvs.set_compression_level(4);
529 
530         typename BV::statistics st;
531         bv.calc_stat(&st);
532 
533         buf = new unsigned char[st.max_serialize_mem];
534         blob_size = (unsigned)bvs.serialize(bv, (unsigned char*)buf, st.max_serialize_mem);
535     }
536     catch (...)
537     {
538         delete [] buf;
539         throw;
540     }
541 
542     delete [] buf;
543     return blob_size;
544 }
545 
#if 0
/// (Disabled code) For every bit-block of every plane of a sparse vector,
/// look through the following planes for a block which works as a XOR
/// filter, i.e. gives a better gap-count / bit-count metric after XOR.
///
/// @param sv  sparse vector to analyse
template<class SV>
void print_svector_xor_stat(const SV& sv)
{
    BM_DECLARE_TEMP_BLOCK(tb)
    typename SV::size_type sz = sv.size();
    if (!sz)
        return;
    typename SV::size_type nb_max = (sz >>  bm::set_block_shift);

    for (typename SV::size_type nb = 0; nb < nb_max; ++nb)
    {
        std::cout << "nb = " << nb << std::endl;

        unsigned i0 = unsigned(nb >> bm::set_array_shift);
        unsigned j0 = unsigned(nb &  bm::set_array_mask);

        auto planes = sv.planes();
        for (unsigned i = 0; i < planes; ++i)
        {
            const typename SV::bvector_type* bv = sv.get_plane(i);
            if (!bv)
                continue;
            const typename SV::bvector_type::blocks_manager_type& bman = bv->get_blocks_manager();
            const bm::word_t* block = bman.get_block_ptr(i0, j0);
            // only real bit-blocks are analysed
            if (!IS_VALID_ADDR(block) || BM_IS_GAP(block))
                continue;

            // compute block complexity
            bm::block_waves_xor_descr  x_descr;
            bm::compute_complexity_descr(block, x_descr);
            unsigned gc, bc;
            bm::bit_block_change_bc32(block, &gc, &bc);
            unsigned best_metric, block_metric;
            block_metric = best_metric = gc < bc ? gc : bc;

            bool kb_found = false;
            bm::id64_t d64 = 0;
            for (unsigned k = i + 1; k < planes; ++k)
            {
                // candidate key block comes from plane k (fetching plane i
                // here would compare the block with itself)
                const typename SV::bvector_type* bv_x = sv.get_plane(k);
                if (!bv_x)
                    continue;
                const typename SV::bvector_type::blocks_manager_type& bman_x = bv_x->get_blocks_manager();
                const bm::word_t* block_x = bman_x.get_block_ptr(i0, j0);
                if (!IS_VALID_ADDR(block_x) || BM_IS_GAP(block_x))
                    continue;

                // evaluate potential key block as XOR filter
                bm::id64_t kb_d64 =
                    bm::compute_xor_complexity_descr(block, block_x, x_descr);
                if (kb_d64) // candidate XOR filter found
                {
                    bm::bit_block_xor_product(tb, block, block_x, kb_d64);
                    unsigned kb_bc, kb_gc;
                    bm::bit_block_change_bc32(tb, &kb_gc, &kb_bc);
                    if (kb_gc < best_metric && kb_gc < bm::bie_cut_off)
                    {
                        d64 = kb_d64;
                        best_metric = kb_gc;
                        kb_found = true;
                        //*kb_j = j0;
                    }
                    if (kb_bc < best_metric && kb_bc < bm::bie_cut_off)
                    {
                        d64 = kb_d64;
                        best_metric = kb_bc;
                        kb_found = true;
                        //*kb_j = j0;
                    }

                }

            } // for k

            if (kb_found)
            {
                std::cout << "XOR match " << "metric gain = " << std::endl;
            }


            std::cout << std::endl;

        } // for i

    } // for nb
}
#endif
634 
635 template<class SV>
636 void print_svector_stat(const SV& svect, bool print_sim = false)
637 {
638     typedef typename SV::bvector_type bvector_type;
639     /// Functor to compute jaccard similarity
640     /// \internal
641     struct Jaccard_Func
642     {
operatorJaccard_Func643         unsigned operator () (distance_metric_descriptor* dmit,
644                               distance_metric_descriptor* /*dmit_end*/)
645         {
646             double d;
647             BM_ASSERT(dmit->metric == COUNT_AND);
648             typename bvector_type::size_type cnt_and = dmit->result;
649             ++dmit;
650             BM_ASSERT(dmit->metric == COUNT_OR);
651             typename bvector_type::size_type cnt_or = dmit->result;
652             if (cnt_and == 0 || cnt_or == 0)
653             {
654                 d = 0.0;
655             }
656             else
657             {
658                 d = double(cnt_and) / double(cnt_or);
659             }
660             unsigned res = unsigned(d * 100);
661             if (res > 100) res = 100;
662             return res;
663         }
664     };
665 
666     typedef  bm::similarity_descriptor<bvector_type, 2, unsigned, unsigned, Jaccard_Func> similarity_descriptor_type;
667     typedef bm::similarity_batch<similarity_descriptor_type> similarity_batch_type;
668 
669     similarity_batch_type sbatch;
670 
671     bm::build_jaccard_similarity_batch(sbatch, svect);
672 
673     sbatch.calculate();
674     sbatch.sort();
675 
676     typename similarity_batch_type::vector_type& sim_vec = sbatch.descr_vect_;
677     if (print_sim)
678     {
679         for (size_t k = 0; k < sim_vec.size(); ++k)
680         {
681             unsigned sim = sim_vec[k].similarity();
682             if (sim > 10)
683             {
684                 const typename SV::bvector_type* bv1 = sim_vec[k].get_first();
685                 const typename SV::bvector_type* bv2 = sim_vec[k].get_second();
686 
687                 auto bv_size2 = compute_serialization_size(*bv2);
688 
689                 typename SV::bvector_type bvx(*bv2);
690                 bvx ^= *bv1;
691 
692                 auto bv_size_x = compute_serialization_size(bvx);
693                 if (bv_size_x < bv_size2) // true savings
694                 {
695                     size_t diff = bv_size2 - bv_size_x;
696 
697                     // compute 10% cut-off
698                     size_t sz10p = bv_size2 / 10;
699                     if (diff > sz10p)
700                     {
701                         std:: cout << "["  << sim_vec[k].get_first_idx()
702                                    << ", " << sim_vec[k].get_second_idx()
703                                    << "] = "  << sim
704                                    << " size(" << sim_vec[k].get_second_idx() << ")="
705                                    << bv_size2
706                                    << " size(x)=" << bv_size_x
707                                    << " diff=" << diff
708                                    << std:: endl;
709                     }
710                 }
711             }
712         } // for k
713     }
714 
715 
716     typename SV::statistics st;
717     svect.calc_stat(&st);
718 
719     std::cout << "size = " << svect.size() << std::endl;
720     std::cout << "Bit blocks:       " << st.bit_blocks << std::endl;
721     std::cout << "Gap blocks:       " << st.gap_blocks << std::endl;
722     std::cout << "Max serialize mem:" << st.max_serialize_mem << " "
723               << (st.max_serialize_mem / (1024 * 1024)) << "MB" << std::endl;
724     std::cout << "Memory used:      " << st.memory_used << " "
725               << (st.memory_used / (1024 * 1024))       << "MB" << std::endl;
726 
727     auto eff_max_element = svect.effective_vector_max();
728     size_t std_vect_size = sizeof(typename SV::value_type) * svect.size() * eff_max_element;
729     std::cout << "Projected mem usage for vector<value_type>:"
730               << std_vect_size << " "
731               << std_vect_size / (1024 * 1024) << "MB"
732               << std::endl;
733     if (sizeof(typename SV::value_type) > 4 && (eff_max_element == 1))
734     {
735         std::cout << "Projected mem usage for vector<long long>:"
736                   << sizeof(long long) * svect.size() << std::endl;
737     }
738 
739     std::cout << "\nplanes:" << std::endl;
740 
741     size_t ssize(0), octet_ssize(0);
742 
743     typename SV::bvector_type bv_join; // global OR of all planes
744     auto planes = svect.planes();
745 
746     unsigned octet_cnt(0), octet(0);
747     for (unsigned i = 0; i < planes; ++i)
748     {
749         const typename SV::bvector_type* bv_plane = svect.get_plane(i);
750         std::cout << i << "-" << octet_cnt << ":";
751         if (bv_plane == 0)
752         {
753             std::cout << "NULL\n";
754             bool any_else = false;
755             for (unsigned j = i+1; j < planes; ++j) // look ahead
756             {
757                 if (svect.get_plane(j))
758                 {
759                     any_else = true;
760                     break;
761                 }
762             }
763             if (!any_else)
764             {
765                 break;
766             }
767         }
768         else
769         {
770             bv_join |= *bv_plane;
771             auto pssize = bm::print_bvector_stat(*bv_plane);
772             ssize += pssize;
773             octet_ssize += pssize;
774         }
775         if (octet_cnt == 7)
776         {
777             std::cout << "--------------------" << std::endl;
778             std::cout << "octet N = " << octet <<
779                     "  compressed = " << octet_ssize <<
780                     " " << octet_ssize/(1024*1024) << "MB" << std::endl;
781             octet_cnt = 0; octet_ssize = 0;
782             octet++;
783             std::cout << std::endl;
784         }
785         else
786         {
787             octet_cnt++;
788         }
789     } // for i
790 
791     const typename SV::bvector_type* bv_null = svect.get_null_bvector();
792     if (bv_null)
793     {
794         std::cout << "(not) NULL plane:\n";
795         ssize += print_bvector_stat(*bv_null);
796         typename SV::size_type not_null_cnt = bv_null->count();
797         std::cout << " - Bitcount: " << not_null_cnt << std::endl;
798 
799         std::cout << "Projected mem usage for std::vector<pair<unsigned, value_type> >:"
800             << ((sizeof(typename SV::value_type) + sizeof(unsigned)) * not_null_cnt) << " "
801             << ((sizeof(typename SV::value_type) + sizeof(unsigned)) * not_null_cnt) / (1024 * 1024) << "MB"
802             << std::endl;
803     }
804 
805     std::cout << " Total serialized size (planes): " << ssize
806               << std::endl
807               << " " << ssize / (1024 * 1024) << " MB" << std::endl;
808 
809     if (svect.size())
810     {
811         bm::id64_t bv_join_cnt = bv_join.count();
812         double fr = double(bv_join_cnt) / double (svect.size());
813         std::cout << "Non-zero elements: " << bv_join_cnt << " "
814                   << "ratio=" << fr
815                   << std::endl;
816         size_t non_zero_mem = size_t(bv_join_cnt) * sizeof(typename SV::value_type);
817         std::cout << "Projected mem usage for non-zero elements: " << non_zero_mem << " "
818                   << non_zero_mem / (1024*1024) << " MB"
819                   << std::endl;
820     }
821 }
822 
823 
/// Print the octet statistics of a string sparse vector to stdout.
///
/// For every row of the matrix filled by calc_octet_stat() that has at
/// least one non-zero cell, one line is printed: the row index, the
/// characters whose column is non-zero and the number of such characters.
/// Rows without any non-zero cell are skipped.
///
/// @param str_svect  vector providing plane_octet_matrix_type and
///                   calc_octet_stat()
template<class SV>
void print_str_svector_stat(const SV& str_svect)
{
    typename SV::plane_octet_matrix_type octet_stat_matr;

    str_svect.calc_octet_stat(octet_stat_matr);

    for (unsigned i = 0; i < octet_stat_matr.rows(); ++i)
    {
        const typename SV::plane_octet_matrix_type::value_type* row
                                                = octet_stat_matr.row(i);
        // single pass: collect the letters which are present in this row
        std::string letters;
        for (unsigned j = 0; j < octet_stat_matr.cols(); ++j)
        {
            if (row[j]) // letter is present
            {
                letters += char(j);
            }
        } // for j
        if (letters.empty())
            continue;

        std::cout << i << " : " << letters
                  << "\t total= " << letters.size()
                  << std::endl;
    } // for i
}
868 
869 
870 
871 // save compressed collection to disk
872 //
873 template<class CBC>
874 int file_save_compressed_collection(const CBC& cbc, const std::string& fname, size_t* blob_size = 0)
875 {
876     bm::compressed_collection_serializer<CBC > cbcs;
877     typename CBC::buffer_type sbuf;
878 
879     cbcs.serialize(cbc, sbuf);
880 
881     std::ofstream fout(fname.c_str(), std::ios::binary);
882     if (!fout.good())
883     {
884         return -1;
885     }
886     const char* buf = (char*)sbuf.buf();
887     fout.write(buf, sbuf.size());
888     if (!fout.good())
889     {
890         return -1;
891     }
892 
893     fout.close();
894 
895     if (blob_size)
896     {
897         *blob_size = sbuf.size();
898     }
899     return 0;
900 }
901 
902 // load compressed collection from disk
903 //
904 template<class CBC>
file_load_compressed_collection(CBC & cbc,const std::string & fname)905 int file_load_compressed_collection(CBC& cbc, const std::string& fname)
906 {
907     std::vector<unsigned char> buffer;
908 
909     // read the input buffer, validate errors
910     auto ret = bm::read_dump_file(fname, buffer);
911     if (ret != 0)
912     {
913         return -2;
914     }
915     if (buffer.size() == 0)
916     {
917         return -3;
918     }
919 
920     const unsigned char* buf = &buffer[0];
921 
922     compressed_collection_deserializer<CBC> cbcd;
923     cbcd.deserialize(cbc, buf);
924 
925     return 0;
926 }
927 
928 
929 
930 // save sparse_vector dump to disk
931 //
932 template<class SV>
933 int file_save_svector(const SV& sv, const std::string& fname,
934                       size_t* sv_blob_size=0, bool use_xor = true)
935 {
936     BM_ASSERT(!fname.empty());
937 
938     bm::sparse_vector_serial_layout<SV> sv_lay;
939 
940     bm::sparse_vector_serializer<SV> sv_serializer;
941     sv_serializer.set_xor_ref(use_xor);
942 
943     sv_serializer.serialize(sv, sv_lay);
944     std::ofstream fout(fname.c_str(), std::ios::binary);
945     if (!fout.good())
946     {
947         return -1;
948     }
949     const char* buf = (char*)sv_lay.buf();
950     fout.write(buf, std::streamsize(sv_lay.size()));
951     if (!fout.good())
952     {
953         return -1;
954     }
955 
956     fout.close();
957 
958     if (sv_blob_size)
959     {
960         *sv_blob_size = sv_lay.size();
961     }
962     return 0;
963 }
964 
965 template<class SV>
file_load_svector(SV & sv,const std::string & fname)966 int file_load_svector(SV& sv, const std::string& fname)
967 {
968     std::vector<unsigned char> buffer;
969 
970     // read the input buffer, validate errors
971     auto ret = bm::read_dump_file(fname, buffer);
972     if (ret != 0)
973     {
974         return -2;
975     }
976     if (buffer.size() == 0)
977     {
978         return -3;
979     }
980 
981     const unsigned char* buf = &buffer[0];
982     BM_DECLARE_TEMP_BLOCK(tb)
983     auto res = bm::sparse_vector_deserialize(sv, buf, tb);
984     if (res != 0)
985     {
986         return -4;
987     }
988     return 0;
989 }
990 
991 
// compare-check if sparse vector exactly corresponds to vector
//
// Elements are compared in their own types (no narrowing to unsigned), so
// vectors of 64-bit values are checked over the full value range.
//
// returns 0 - if equal
//         1 - no size match
//         2 - element match fails
template<class SV, class V>
int svector_check(const SV& sv, const V& vect)
{
    if (sv.size() != vect.size())
    {
        return 1;
    }
    const size_t total = vect.size();
    for (size_t i = 0; i < total; ++i)
    {
        // index in the native index type of the sparse vector
        const auto v1 = sv[static_cast<typename SV::size_type>(i)];
        const auto v2 = vect[i];
        if (v1 != v2)
            return 2;
    } // for i
    return 0;
}
1013 
1014 
/// Append the value of every position enumerated from a bit-vector to a
/// sparse vector, using the back inserter of the sparse vector (flushed at
/// the end).
///
/// @param sv  target sparse vector
/// @param bv  source bit-vector
template<class SV, class BV>
void convert_bv2sv(SV& sv, const BV& bv)
{
    typename SV::back_insert_iterator sink = sv.get_back_inserter();
    typename BV::enumerator cur = bv.first();
    while (cur.valid())
    {
        auto bit_idx = cur.value();
        sink = bit_idx;
        ++cur;
    }
    sink.flush();
}
1027 
1028 
1029 } // namespace
1030 
1031 
1032 
1033 #ifdef _MSC_VER
1034 #pragma warning( pop )
1035 #endif
1036 
1037 #endif
1038