1 #ifndef DARTS_H_
2 #define DARTS_H_
3 
4 #include <cstdio>
5 #include <exception>
6 #include <new>
7 
8 #define DARTS_VERSION "0.32"
9 
10 // DARTS_THROW() throws a <Darts::Exception> whose message starts with the
11 // file name and the line number. For example, DARTS_THROW("error message") at
12 // line 123 of "darts.h" throws a <Darts::Exception> which has a pointer to
13 // "darts.h:123: exception: error message". The message is available by using
14 // what() as well as that of <std::exception>.
15 #define DARTS_INT_TO_STR(value) #value
16 #define DARTS_LINE_TO_STR(line) DARTS_INT_TO_STR(line)
17 #define DARTS_LINE_STR DARTS_LINE_TO_STR(__LINE__)
18 #define DARTS_THROW(msg) throw Darts::Details::Exception( \
19   __FILE__ ":" DARTS_LINE_STR ": exception: " msg)
20 
21 namespace Darts {
22 
23 // The following namespace hides the internal types and classes.
24 namespace Details {
25 
26 // This header assumes that <int> and <unsigned int> are 32-bit integer types.
27 //
28 // Darts-clone keeps values associated with keys. The type of the values is
29 // <value_type>. Note that the values must be positive integers because the
30 // most significant bit (MSB) of each value is used to represent whether the
31 // corresponding unit is a leaf or not. Also, the keys are represented by
32 // sequences of <char_type>s. <uchar_type> is the unsigned type of <char_type>.
33 typedef char char_type;
34 typedef unsigned char uchar_type;
35 typedef int value_type;
36 
37 // The main structure of Darts-clone is an array of <DoubleArrayUnit>s, and the
38 // unit type is actually a wrapper of <id_type>.
39 typedef unsigned int id_type;
40 
41 // <progress_func_type> is the type of callback functions for reporting the
42 // progress of building a dictionary. See also build() of <DoubleArray>.
43 // The 1st argument receives the progress value and the 2nd argument receives
44 // the maximum progress value. A usage example is to show the progress
45 // percentage, 100.0 * (the 1st argument) / (the 2nd argument).
46 typedef int (*progress_func_type)(std::size_t, std::size_t);
47 
48 // <DoubleArrayUnit> is the type of double-array units and it is a wrapper of
49 // <id_type> in practice.
50 class DoubleArrayUnit {
51  public:
DoubleArrayUnit()52   DoubleArrayUnit() : unit_() {}
53 
54   // has_leaf() returns whether a leaf unit is immediately derived from the
55   // unit (true) or not (false).
has_leaf()56   bool has_leaf() const {
57     return ((unit_ >> 8) & 1) == 1;
58   }
59   // value() returns the value stored in the unit, and thus value() is
60   // available when and only when the unit is a leaf unit.
value()61   value_type value() const {
62     return static_cast<value_type>(unit_ & ((1U << 31) - 1));
63   }
64 
65   // label() returns the label associted with the unit. Note that a leaf unit
66   // always returns an invalid label. For this feature, leaf unit's label()
67   // returns an <id_type> that has the MSB of 1.
label()68   id_type label() const {
69     return unit_ & ((1U << 31) | 0xFF);
70   }
71   // offset() returns the offset from the unit to its derived units.
offset()72   id_type offset() const {
73     return (unit_ >> 10) << ((unit_ & (1U << 9)) >> 6);
74   }
75 
76  private:
77   id_type unit_;
78 
79   // Copyable.
80 };
81 
82 // Darts-clone throws an <Exception> for memory allocation failure, invalid
83 // arguments or a too large offset. The last case means that there are too many
84 // keys in the given set of keys. Note that the `msg' of <Exception> must be a
85 // constant or static string because an <Exception> keeps only a pointer to
86 // that string.
87 class Exception : public std::exception {
88  public:
throw()89   explicit Exception(const char *msg = NULL) throw() : msg_(msg) {}
throw()90   Exception(const Exception &rhs) throw() : msg_(rhs.msg_) {}
~Exception()91   virtual ~Exception() throw() {}
92 
93   // <Exception> overrides what() of <std::exception>.
what()94   virtual const char *what() const throw() {
95     return (msg_ != NULL) ? msg_ : "";
96   }
97 
98  private:
99   const char *msg_;
100 
101   // Disallows operator=.
102   Exception &operator=(const Exception &);
103 };
104 
105 }  // namespace Details
106 
107 // <DoubleArrayImpl> is the interface of Darts-clone. Note that other
108 // classes should not be accessed from outside.
109 //
110 // <DoubleArrayImpl> has 4 template arguments but only the 3rd one is used as
111 // the type of values. Note that the given <T> is used only from outside, and
112 // the internal value type is not changed from <Darts::Details::value_type>.
113 // In build(), given values are casted from <T> to <Darts::Details::value_type>
114 // by using static_cast. On the other hand, values are casted from
115 // <Darts::Details::value_type> to <T> in searching dictionaries.
116 template <typename, typename, typename T, typename>
117 class DoubleArrayImpl {
118  public:
119   // Even if this <value_type> is changed, the internal value type is still
120   // <Darts::Details::value_type>. Other types, such as 64-bit integer types
121   // and floating-point number types, should not be used.
122   typedef T value_type;
123   // A key is reprenseted by a sequence of <key_type>s. For example,
124   // exactMatchSearch() takes a <const key_type *>.
125   typedef Details::char_type key_type;
126   // In searching dictionaries, the values associated with the matched keys are
127   // stored into or returned as <result_type>s.
128   typedef value_type result_type;
129 
130   // <result_pair_type> enables applications to get the lengths of the matched
131   // keys in addition to the values.
132   struct result_pair_type {
133     value_type value;
134     std::size_t length;
135   };
136 
137   // The constructor initializes member variables with 0 and NULLs.
DoubleArrayImpl()138   DoubleArrayImpl() : size_(0), array_(NULL), buf_(NULL) {}
139   // The destructor frees memory allocated for units and then initializes
140   // member variables with 0 and NULLs.
~DoubleArrayImpl()141   virtual ~DoubleArrayImpl() {
142     clear();
143   }
144 
145   // <DoubleArrayImpl> has 2 kinds of set_result()s. The 1st set_result() is to
146   // set a value to a <value_type>. The 2nd set_result() is to set a value and
147   // a length to a <result_pair_type>. By using set_result()s, search methods
148   // can return the 2 kinds of results in the same way.
149   // Why the set_result()s are non-static? It is for compatibility.
150   //
151   // The 1st set_result() takes a length as the 3rd argument but it is not
152   // used. If a compiler does a good job, codes for getting the length may be
153   // removed.
set_result(value_type * result,value_type value,std::size_t)154   void set_result(value_type *result, value_type value, std::size_t) const {
155     *result = value;
156   }
157   // The 2nd set_result() uses both `value' and `length'.
set_result(result_pair_type * result,value_type value,std::size_t length)158   void set_result(result_pair_type *result,
159       value_type value, std::size_t length) const {
160     result->value = value;
161     result->length = length;
162   }
163 
164   // set_array() calls clear() in order to free memory allocated to the old
165   // array and then sets a new array. This function is useful to set a memory-
166   // mapped array. Note that the array set by set_array() is not freed in
167   // clear() and the destructor of <DoubleArrayImpl>.
168   // set_array() can also set the size of the new array but the size is not
169   // used in search methods. So it works well even if the 2nd argument is 0 or
170   // omitted. Remember that size() and total_size() returns 0 in such a case.
171   void set_array(const void *ptr, std::size_t size = 0) {
172     clear();
173     array_ = static_cast<const unit_type *>(ptr);
174     size_ = size;
175   }
176   // array() returns a pointer to the array of units.
array()177   const void *array() const {
178     return array_;
179   }
180 
181   // clear() frees memory allocated to units and then initializes member
182   // variables with 0 and NULLs. Note that clear() does not free memory if the
183   // array of units was set by set_array(). In such a case, `array_' is not
184   // NULL and `buf_' is NULL.
clear()185   void clear() {
186     size_ = 0;
187     array_ = NULL;
188     if (buf_ != NULL) {
189       delete[] buf_;
190       buf_ = NULL;
191     }
192   }
193 
194   // unit_size() returns the size of each unit. The size must be 4 bytes.
unit_size()195   std::size_t unit_size() const {
196     return sizeof(unit_type);
197   }
198   // size() returns the number of units. It can be 0 if set_array() is used.
size()199   std::size_t size() const {
200     return size_;
201   }
202   // total_size() returns the number of bytes allocated to the array of units.
203   // It can be 0 if set_array() is used.
total_size()204   std::size_t total_size() const {
205     return unit_size() * size();
206   }
207   // nonzero_size() exists for compatibility. It always returns the number of
208   // units because it takes long time to count the number of non-zero units.
nonzero_size()209   std::size_t nonzero_size() const {
210     return size();
211   }
212 
213   // build() constructs a dictionary from given key-value pairs. If `lengths'
214   // is NULL, `keys' is handled as an array of zero-terminated strings. If
215   // `values' is NULL, the index in `keys' is associated with each key, i.e.
216   // the ith key has (i - 1) as its value.
217   // Note that the key-value pairs must be arranged in key order and the values
218   // must not be negative. Also, if there are duplicate keys, only the first
219   // pair will be stored in the resultant dictionary.
220   // `progress_func' is a pointer to a callback function. If it is not NULL,
221   // it will be called in build() so that the caller can check the progress of
222   // dictionary construction. For details, please see the definition of
223   // <Darts::Details::progress_func_type>.
224   // The return value of build() is 0, and it indicates the success of the
225   // operation. Otherwise, build() throws a <Darts::Exception>, which is a
226   // derived class of <std::exception>.
227   // build() uses another construction algorithm if `values' is not NULL. In
228   // this case, Darts-clone uses a Directed Acyclic Word Graph (DAWG) instead
229   // of a trie because a DAWG is likely to be more compact than a trie.
230   int build(std::size_t num_keys, const key_type * const *keys,
231       const std::size_t *lengths = NULL, const value_type *values = NULL,
232       Details::progress_func_type progress_func = NULL);
233 
234   // open() reads an array of units from the specified file. And if it goes
235   // well, the old array will be freed and replaced with the new array read
236   // from the file. `offset' specifies the number of bytes to be skipped before
237   // reading an array. `size' specifies the number of bytes to be read from the
238   // file. If the `size' is 0, the whole file will be read.
239   // open() returns 0 iff the operation succeeds. Otherwise, it returns a
240   // non-zero value or throws a <Darts::Exception>. The exception is thrown
241   // when and only when a memory allocation fails.
242   int open(const char *file_name, const char *mode = "rb",
243       std::size_t offset = 0, std::size_t size = 0);
244   // save() writes the array of units into the specified file. `offset'
245   // specifies the number of bytes to be skipped before writing the array.
246   // open() returns 0 iff the operation succeeds. Otherwise, it returns a
247   // non-zero value.
248   int save(const char *file_name, const char *mode = "wb",
249       std::size_t offset = 0) const;
250 
251   // The 1st exactMatchSearch() tests whether the given key exists or not, and
252   // if it exists, its value and length are set to `result'. Otherwise, the
253   // value and the length of `result' are set to -1 and 0 respectively.
254   // Note that if `length' is 0, `key' is handled as a zero-terminated string.
255   // `node_pos' specifies the start position of matching. This argument enables
256   // the combination of exactMatchSearch() and traverse(). For example, if you
257   // want to test "xyzA", "xyzBC", and "xyzDE", you can use traverse() to get
258   // the node position corresponding to "xyz" and then you can use
259   // exactMatchSearch() to test "A", "BC", and "DE" from that position.
260   // Note that the length of `result' indicates the length from the `node_pos'.
261   // In the above example, the lengths are { 1, 2, 2 }, not { 4, 5, 5 }.
262   template <class U>
263   void exactMatchSearch(const key_type *key, U &result,
264       std::size_t length = 0, std::size_t node_pos = 0) const {
265     result = exactMatchSearch<U>(key, length, node_pos);
266   }
267   // The 2nd exactMatchSearch() returns a result instead of updating the 2nd
268   // argument. So, the following exactMatchSearch() has only 3 arguments.
269   template <class U>
270   inline U exactMatchSearch(const key_type *key, std::size_t length = 0,
271       std::size_t node_pos = 0) const;
272 
273   // commonPrefixSearch() searches for keys which match a prefix of the given
274   // string. If `length' is 0, `key' is handled as a zero-terminated string.
275   // The values and the lengths of at most `max_num_results' matched keys are
276   // stored in `results'. commonPrefixSearch() returns the number of matched
277   // keys. Note that the return value can be larger than `max_num_results' if
278   // there are more than `max_num_results' matches. If you want to get all the
279   // results, allocate more spaces and call commonPrefixSearch() again.
280   // `node_pos' works as well as in exactMatchSearch().
281   template <class U>
282   inline std::size_t commonPrefixSearch(const key_type *key, U *results,
283       std::size_t max_num_results, std::size_t length = 0,
284       std::size_t node_pos = 0) const;
285 
286   // In Darts-clone, a dictionary is a deterministic finite-state automaton
287   // (DFA) and traverse() tests transitions on the DFA. The initial state is
288   // `node_pos' and traverse() chooses transitions labeled key[key_pos],
289   // key[key_pos + 1], ... in order. If there is not a transition labeled
290   // key[key_pos + i], traverse() terminates the transitions at that state and
291   // returns -2. Otherwise, traverse() ends without a termination and returns
292   // -1 or a nonnegative value, -1 indicates that the final state was not an
293   // accept state. When a nonnegative value is returned, it is the value
294   // associated with the final accept state. That is, traverse() returns the
295   // value associated with the given key if it exists. Note that traverse()
296   // updates `node_pos' and `key_pos' after each transition.
297   inline value_type traverse(const key_type *key, std::size_t &node_pos,
298       std::size_t &key_pos, std::size_t length = 0) const;
299 
300  private:
301   typedef Details::uchar_type uchar_type;
302   typedef Details::id_type id_type;
303   typedef Details::DoubleArrayUnit unit_type;
304 
305   std::size_t size_;
306   const unit_type *array_;
307   unit_type *buf_;
308 
309   // Disallows copy and assignment.
310   DoubleArrayImpl(const DoubleArrayImpl &);
311   DoubleArrayImpl &operator=(const DoubleArrayImpl &);
312 };
313 
314 // <DoubleArray> is the typical instance of <DoubleArrayImpl>. It uses <int>
315 // as the type of values and it is suitable for most cases.
316 typedef DoubleArrayImpl<void, void, int, void> DoubleArray;
317 
318 // The interface section ends here. For using Darts-clone, there is no need
319 // to read the remaining section, which gives the implementation of
320 // Darts-clone.
321 
322 //
323 // Member functions of DoubleArrayImpl (except build()).
324 //
325 
326 template <typename A, typename B, typename T, typename C>
open(const char * file_name,const char * mode,std::size_t offset,std::size_t size)327 int DoubleArrayImpl<A, B, T, C>::open(const char *file_name,
328     const char *mode, std::size_t offset, std::size_t size) {
329 #ifdef _MSC_VER
330   std::FILE *file;
331   if (::fopen_s(&file, file_name, mode) != 0) {
332     return -1;
333   }
334 #else
335   std::FILE *file = std::fopen(file_name, mode);
336   if (file == NULL) {
337     return -1;
338   }
339 #endif
340 
341   if (size == 0) {
342     if (std::fseek(file, 0, SEEK_END) != 0) {
343       std::fclose(file);
344       return -1;
345     }
346     size = std::ftell(file) - offset;
347   }
348 
349   if (std::fseek(file, offset, SEEK_SET) != 0) {
350     std::fclose(file);
351     return -1;
352   }
353 
354   size /= unit_size();
355   unit_type *buf;
356   try {
357     buf = new unit_type[size];
358   } catch (const std::bad_alloc &) {
359     std::fclose(file);
360     DARTS_THROW("failed to open double-array: std::bad_alloc");
361   }
362 
363   if (std::fread(buf, unit_size(), size, file) != size) {
364     std::fclose(file);
365     delete[] buf;
366     return -1;
367   }
368   std::fclose(file);
369 
370   clear();
371 
372   size_ = size;
373   array_ = buf;
374   buf_ = buf;
375   return 0;
376 }
377 
378 template <typename A, typename B, typename T, typename C>
save(const char * file_name,const char * mode,std::size_t)379 int DoubleArrayImpl<A, B, T, C>::save(const char *file_name,
380     const char *mode, std::size_t) const {
381   if (size() == 0) {
382     return -1;
383   }
384 
385 #ifdef _MSC_VER
386   std::FILE *file;
387   if (::fopen_s(&file, file_name, mode) != 0) {
388     return -1;
389   }
390 #else
391   std::FILE *file = std::fopen(file_name, mode);
392   if (file == NULL) {
393     return -1;
394   }
395 #endif
396 
397   if (std::fwrite(array_, unit_size(), size(), file) != size()) {
398     std::fclose(file);
399     return -1;
400   }
401   std::fclose(file);
402   return 0;
403 }
404 
405 template <typename A, typename B, typename T, typename C>
406 template <typename U>
exactMatchSearch(const key_type * key,std::size_t length,std::size_t node_pos)407 inline U DoubleArrayImpl<A, B, T, C>::exactMatchSearch(const key_type *key,
408     std::size_t length, std::size_t node_pos) const {
409   U result;
410   set_result(&result, static_cast<value_type>(-1), 0);
411 
412   unit_type unit = array_[node_pos];
413   if (length != 0) {
414     for (std::size_t i = 0; i < length; ++i) {
415       node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[i]);
416       unit = array_[node_pos];
417       if (unit.label() != static_cast<uchar_type>(key[i])) {
418         return result;
419       }
420     }
421   } else {
422     for ( ; key[length] != '\0'; ++length) {
423       node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[length]);
424       unit = array_[node_pos];
425       if (unit.label() != static_cast<uchar_type>(key[length])) {
426         return result;
427       }
428     }
429   }
430 
431   if (!unit.has_leaf()) {
432     return result;
433   }
434   unit = array_[node_pos ^ unit.offset()];
435   set_result(&result, static_cast<value_type>(unit.value()), length);
436   return result;
437 }
438 
439 template <typename A, typename B, typename T, typename C>
440 template <typename U>
commonPrefixSearch(const key_type * key,U * results,std::size_t max_num_results,std::size_t length,std::size_t node_pos)441 inline std::size_t DoubleArrayImpl<A, B, T, C>::commonPrefixSearch(
442     const key_type *key, U *results, std::size_t max_num_results,
443     std::size_t length, std::size_t node_pos) const {
444   std::size_t num_results = 0;
445 
446   unit_type unit = array_[node_pos];
447   node_pos ^= unit.offset();
448   if (length != 0) {
449     for (std::size_t i = 0; i < length; ++i) {
450       node_pos ^= static_cast<uchar_type>(key[i]);
451       unit = array_[node_pos];
452       if (unit.label() != static_cast<uchar_type>(key[i])) {
453         return num_results;
454       }
455 
456       node_pos ^= unit.offset();
457       if (unit.has_leaf()) {
458         if (num_results < max_num_results) {
459           set_result(&results[num_results], static_cast<value_type>(
460               array_[node_pos].value()), i + 1);
461         }
462         ++num_results;
463       }
464     }
465   } else {
466     for ( ; key[length] != '\0'; ++length) {
467       node_pos ^= static_cast<uchar_type>(key[length]);
468       unit = array_[node_pos];
469       if (unit.label() != static_cast<uchar_type>(key[length])) {
470         return num_results;
471       }
472 
473       node_pos ^= unit.offset();
474       if (unit.has_leaf()) {
475         if (num_results < max_num_results) {
476           set_result(&results[num_results], static_cast<value_type>(
477               array_[node_pos].value()), length + 1);
478         }
479         ++num_results;
480       }
481     }
482   }
483 
484   return num_results;
485 }
486 
487 template <typename A, typename B, typename T, typename C>
488 inline typename DoubleArrayImpl<A, B, T, C>::value_type
traverse(const key_type * key,std::size_t & node_pos,std::size_t & key_pos,std::size_t length)489 DoubleArrayImpl<A, B, T, C>::traverse(const key_type *key,
490     std::size_t &node_pos, std::size_t &key_pos, std::size_t length) const {
491   id_type id = static_cast<id_type>(node_pos);
492   unit_type unit = array_[id];
493 
494   if (length != 0) {
495     for ( ; key_pos < length; ++key_pos) {
496       id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]);
497       unit = array_[id];
498       if (unit.label() != static_cast<uchar_type>(key[key_pos])) {
499         return static_cast<value_type>(-2);
500       }
501       node_pos = id;
502     }
503   } else {
504     for ( ; key[key_pos] != '\0'; ++key_pos) {
505       id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]);
506       unit = array_[id];
507       if (unit.label() != static_cast<uchar_type>(key[key_pos])) {
508         return static_cast<value_type>(-2);
509       }
510       node_pos = id;
511     }
512   }
513 
514   if (!unit.has_leaf()) {
515     return static_cast<value_type>(-1);
516   }
517   unit = array_[id ^ unit.offset()];
518   return static_cast<value_type>(unit.value());
519 }
520 
521 namespace Details {
522 
523 //
524 // Memory management of array.
525 //
526 
527 template <typename T>
528 class AutoArray {
529  public:
array_(array)530   explicit AutoArray(T *array = NULL) : array_(array) {}
~AutoArray()531   ~AutoArray() {
532     clear();
533   }
534 
535   const T &operator[](std::size_t id) const {
536     return array_[id];
537   }
538   T &operator[](std::size_t id) {
539     return array_[id];
540   }
541 
empty()542   bool empty() const {
543     return array_ == NULL;
544   }
545 
clear()546   void clear() {
547     if (array_ != NULL) {
548       delete[] array_;
549       array_ = NULL;
550     }
551   }
swap(AutoArray * array)552   void swap(AutoArray *array) {
553     T *temp = array_;
554     array_ = array->array_;
555     array->array_ = temp;
556   }
557   void reset(T *array = NULL) {
558     AutoArray(array).swap(this);
559   }
560 
561  private:
562   T *array_;
563 
564   // Disallows copy and assignment.
565   AutoArray(const AutoArray &);
566   AutoArray &operator=(const AutoArray &);
567 };
568 
569 //
570 // Memory management of resizable array.
571 //
572 
573 template <typename T>
574 class AutoPool {
575  public:
AutoPool()576   AutoPool() : buf_(), size_(0), capacity_(0) {}
~AutoPool()577   ~AutoPool() { clear(); }
578 
579   const T &operator[](std::size_t id) const {
580     return *(reinterpret_cast<const T *>(&buf_[0]) + id);
581   }
582   T &operator[](std::size_t id) {
583     return *(reinterpret_cast<T *>(&buf_[0]) + id);
584   }
585 
empty()586   bool empty() const {
587     return size_ == 0;
588   }
size()589   std::size_t size() const {
590     return size_;
591   }
592 
clear()593   void clear() {
594     resize(0);
595     buf_.clear();
596     size_ = 0;
597     capacity_ = 0;
598   }
599 
push_back(const T & value)600   void push_back(const T &value) {
601     append(value);
602   }
pop_back()603   void pop_back() {
604     (*this)[--size_].~T();
605   }
606 
append()607   void append() {
608     if (size_ == capacity_)
609       resize_buf(size_ + 1);
610     new(&(*this)[size_++]) T;
611   }
append(const T & value)612   void append(const T &value) {
613     if (size_ == capacity_)
614       resize_buf(size_ + 1);
615     new(&(*this)[size_++]) T(value);
616   }
617 
resize(std::size_t size)618   void resize(std::size_t size) {
619     while (size_ > size) {
620       (*this)[--size_].~T();
621     }
622     if (size > capacity_) {
623       resize_buf(size);
624     }
625     while (size_ < size) {
626       new(&(*this)[size_++]) T;
627     }
628   }
resize(std::size_t size,const T & value)629   void resize(std::size_t size, const T &value) {
630     while (size_ > size) {
631       (*this)[--size_].~T();
632     }
633     if (size > capacity_) {
634       resize_buf(size);
635     }
636     while (size_ < size) {
637       new(&(*this)[size_++]) T(value);
638     }
639   }
640 
reserve(std::size_t size)641   void reserve(std::size_t size) {
642     if (size > capacity_) {
643       resize_buf(size);
644     }
645   }
646 
647  private:
648   AutoArray<char> buf_;
649   std::size_t size_;
650   std::size_t capacity_;
651 
652   // Disallows copy and assignment.
653   AutoPool(const AutoPool &);
654   AutoPool &operator=(const AutoPool &);
655 
656   void resize_buf(std::size_t size);
657 };
658 
659 template <typename T>
resize_buf(std::size_t size)660 void AutoPool<T>::resize_buf(std::size_t size) {
661   std::size_t capacity;
662   if (size >= capacity_ * 2) {
663     capacity = size;
664   } else {
665     capacity = 1;
666     while (capacity < size) {
667       capacity <<= 1;
668     }
669   }
670 
671   AutoArray<char> buf;
672   try {
673     buf.reset(new char[sizeof(T) * capacity]);
674   } catch (const std::bad_alloc &) {
675     DARTS_THROW("failed to resize pool: std::bad_alloc");
676   }
677 
678   if (size_ > 0) {
679     T *src = reinterpret_cast<T *>(&buf_[0]);
680     T *dest = reinterpret_cast<T *>(&buf[0]);
681     for (std::size_t i = 0; i < size_; ++i) {
682       new(&dest[i]) T(src[i]);
683       src[i].~T();
684     }
685   }
686 
687   buf_.swap(&buf);
688   capacity_ = capacity;
689 }
690 
691 //
692 // Memory management of stack.
693 //
694 
695 template <typename T>
696 class AutoStack {
697  public:
AutoStack()698   AutoStack() : pool_() {}
~AutoStack()699   ~AutoStack() {
700     clear();
701   }
702 
top()703   const T &top() const {
704     return pool_[size() - 1];
705   }
top()706   T &top() {
707     return pool_[size() - 1];
708   }
709 
empty()710   bool empty() const {
711     return pool_.empty();
712   }
size()713   std::size_t size() const {
714     return pool_.size();
715   }
716 
push(const T & value)717   void push(const T &value) {
718     pool_.push_back(value);
719   }
pop()720   void pop() {
721     pool_.pop_back();
722   }
723 
clear()724   void clear() {
725     pool_.clear();
726   }
727 
728  private:
729   AutoPool<T> pool_;
730 
731   // Disallows copy and assignment.
732   AutoStack(const AutoStack &);
733   AutoStack &operator=(const AutoStack &);
734 };
735 
736 //
737 // Succinct bit vector.
738 //
739 
740 class BitVector {
741  public:
BitVector()742   BitVector() : units_(), ranks_(), num_ones_(0), size_(0) {}
~BitVector()743   ~BitVector() {
744     clear();
745   }
746 
747   bool operator[](std::size_t id) const {
748     return (units_[id / UNIT_SIZE] >> (id % UNIT_SIZE) & 1) == 1;
749   }
750 
rank(std::size_t id)751   id_type rank(std::size_t id) const {
752     std::size_t unit_id = id / UNIT_SIZE;
753     return ranks_[unit_id] + pop_count(units_[unit_id]
754         & (~0U >> (UNIT_SIZE - (id % UNIT_SIZE) - 1)));
755   }
756 
set(std::size_t id,bool bit)757   void set(std::size_t id, bool bit) {
758     if (bit) {
759       units_[id / UNIT_SIZE] |= 1U << (id % UNIT_SIZE);
760     } else {
761       units_[id / UNIT_SIZE] &= ~(1U << (id % UNIT_SIZE));
762     }
763   }
764 
empty()765   bool empty() const {
766     return units_.empty();
767   }
num_ones()768   std::size_t num_ones() const {
769     return num_ones_;
770   }
size()771   std::size_t size() const {
772     return size_;
773   }
774 
append()775   void append() {
776     if ((size_ % UNIT_SIZE) == 0) {
777       units_.append(0);
778     }
779     ++size_;
780   }
781   void build();
782 
clear()783   void clear() {
784     units_.clear();
785     ranks_.clear();
786   }
787 
788  private:
789   enum { UNIT_SIZE = sizeof(id_type) * 8 };
790 
791   AutoPool<id_type> units_;
792   AutoArray<id_type> ranks_;
793   std::size_t num_ones_;
794   std::size_t size_;
795 
796   // Disallows copy and assignment.
797   BitVector(const BitVector &);
798   BitVector &operator=(const BitVector &);
799 
pop_count(id_type unit)800   static id_type pop_count(id_type unit) {
801     unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555);
802     unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333);
803     unit = ((unit >> 4) + unit) & 0x0F0F0F0F;
804     unit += unit >> 8;
805     unit += unit >> 16;
806     return unit & 0xFF;
807   }
808 };
809 
build()810 inline void BitVector::build() {
811   try {
812     ranks_.reset(new id_type[units_.size()]);
813   } catch (const std::bad_alloc &) {
814     DARTS_THROW("failed to build rank index: std::bad_alloc");
815   }
816 
817   num_ones_ = 0;
818   for (std::size_t i = 0; i < units_.size(); ++i) {
819     ranks_[i] = num_ones_;
820     num_ones_ += pop_count(units_[i]);
821   }
822 }
823 
824 //
825 // Keyset.
826 //
827 
828 template <typename T>
829 class Keyset {
830  public:
Keyset(std::size_t num_keys,const char_type * const * keys,const std::size_t * lengths,const T * values)831   Keyset(std::size_t num_keys, const char_type * const *keys,
832       const std::size_t *lengths, const T *values) :
833       num_keys_(num_keys), keys_(keys), lengths_(lengths), values_(values) {}
834 
num_keys()835   std::size_t num_keys() const {
836     return num_keys_;
837   }
keys(std::size_t id)838   const char_type *keys(std::size_t id) const {
839     return keys_[id];
840   }
keys(std::size_t key_id,std::size_t char_id)841   uchar_type keys(std::size_t key_id, std::size_t char_id) const {
842     if (has_lengths() && char_id >= lengths_[key_id])
843       return '\0';
844     return keys_[key_id][char_id];
845   }
846 
has_lengths()847   bool has_lengths() const {
848     return lengths_ != NULL;
849   }
lengths(std::size_t id)850   std::size_t lengths(std::size_t id) const {
851     if (has_lengths()) {
852       return lengths_[id];
853     }
854     std::size_t length = 0;
855     while (keys_[id][length] != '\0') {
856       ++length;
857     }
858     return length;
859   }
860 
has_values()861   bool has_values() const {
862     return values_ != NULL;
863   }
values(std::size_t id)864   const value_type values(std::size_t id) const {
865     if (has_values()) {
866       return static_cast<value_type>(values_[id]);
867     }
868     return static_cast<value_type>(id);
869   }
870 
871  private:
872   std::size_t num_keys_;
873   const char_type * const * keys_;
874   const std::size_t *lengths_;
875   const T *values_;
876 
877   // Disallows copy and assignment.
878   Keyset(const Keyset &);
879   Keyset &operator=(const Keyset &);
880 };
881 
882 //
883 // Node of Directed Acyclic Word Graph (DAWG).
884 //
885 
886 class DawgNode {
887  public:
DawgNode()888   DawgNode() : child_(0), sibling_(0), label_('\0'),
889     is_state_(false), has_sibling_(false) {}
890 
set_child(id_type child)891   void set_child(id_type child) {
892     child_ = child;
893   }
set_sibling(id_type sibling)894   void set_sibling(id_type sibling) {
895     sibling_ = sibling;
896   }
set_value(value_type value)897   void set_value(value_type value) {
898     child_ = value;
899   }
set_label(uchar_type label)900   void set_label(uchar_type label) {
901     label_ = label;
902   }
set_is_state(bool is_state)903   void set_is_state(bool is_state) {
904     is_state_ = is_state;
905   }
set_has_sibling(bool has_sibling)906   void set_has_sibling(bool has_sibling) {
907     has_sibling_ = has_sibling;
908   }
909 
child()910   id_type child() const {
911     return child_;
912   }
sibling()913   id_type sibling() const {
914     return sibling_;
915   }
value()916   value_type value() const {
917     return static_cast<value_type>(child_);
918   }
label()919   uchar_type label() const {
920     return label_;
921   }
is_state()922   bool is_state() const {
923     return is_state_;
924   }
has_sibling()925   bool has_sibling() const {
926     return has_sibling_;
927   }
928 
unit()929   id_type unit() const {
930     if (label_ == '\0') {
931       return (child_ << 1) | (has_sibling_ ? 1 : 0);
932     }
933     return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0);
934   }
935 
936  private:
937   id_type child_;
938   id_type sibling_;
939   uchar_type label_;
940   bool is_state_;
941   bool has_sibling_;
942 
943   // Copyable.
944 };
945 
946 //
947 // Fixed unit of Directed Acyclic Word Graph (DAWG).
948 //
949 
950 class DawgUnit {
951  public:
unit_(unit)952   explicit DawgUnit(id_type unit = 0) : unit_(unit) {}
DawgUnit(const DawgUnit & unit)953   DawgUnit(const DawgUnit &unit) : unit_(unit.unit_) {}
954 
955   DawgUnit &operator=(id_type unit) {
956     unit_ = unit;
957     return *this;
958   }
959 
unit()960   id_type unit() const {
961     return unit_;
962   }
963 
child()964   id_type child() const {
965     return unit_ >> 2;
966   }
has_sibling()967   bool has_sibling() const {
968     return (unit_ & 1) == 1;
969   }
value()970   value_type value() const {
971     return static_cast<value_type>(unit_ >> 1);
972   }
is_state()973   bool is_state() const {
974     return (unit_ & 2) == 2;
975   }
976 
977  private:
978   id_type unit_;
979 
980   // Copyable.
981 };
982 
983 //
984 // Directed Acyclic Word Graph (DAWG) builder.
985 //
986 
987 class DawgBuilder {
988  public:
DawgBuilder()989   DawgBuilder() : nodes_(), units_(), labels_(), is_intersections_(),
990     table_(), node_stack_(), recycle_bin_(), num_states_(0) {}
~DawgBuilder()991   ~DawgBuilder() {
992     clear();
993   }
994 
root()995   id_type root() const {
996     return 0;
997   }
998 
child(id_type id)999   id_type child(id_type id) const {
1000     return units_[id].child();
1001   }
sibling(id_type id)1002   id_type sibling(id_type id) const {
1003     return units_[id].has_sibling() ? (id + 1) : 0;
1004   }
value(id_type id)1005   int value(id_type id) const {
1006     return units_[id].value();
1007   }
1008 
is_leaf(id_type id)1009   bool is_leaf(id_type id) const {
1010     return label(id) == '\0';
1011   }
label(id_type id)1012   uchar_type label(id_type id) const {
1013     return labels_[id];
1014   }
1015 
is_intersection(id_type id)1016   bool is_intersection(id_type id) const {
1017     return is_intersections_[id];
1018   }
intersection_id(id_type id)1019   id_type intersection_id(id_type id) const {
1020     return is_intersections_.rank(id) - 1;
1021   }
1022 
num_intersections()1023   std::size_t num_intersections() const {
1024     return is_intersections_.num_ones();
1025   }
1026 
size()1027   std::size_t size() const {
1028     return units_.size();
1029   }
1030 
1031   void init();
1032   void finish();
1033 
1034   void insert(const char *key, std::size_t length, value_type value);
1035 
1036   void clear();
1037 
1038  private:
1039   enum { INITIAL_TABLE_SIZE = 1 << 10 };
1040 
1041   AutoPool<DawgNode> nodes_;
1042   AutoPool<DawgUnit> units_;
1043   AutoPool<uchar_type> labels_;
1044   BitVector is_intersections_;
1045   AutoPool<id_type> table_;
1046   AutoStack<id_type> node_stack_;
1047   AutoStack<id_type> recycle_bin_;
1048   std::size_t num_states_;
1049 
1050   // Disallows copy and assignment.
1051   DawgBuilder(const DawgBuilder &);
1052   DawgBuilder &operator=(const DawgBuilder &);
1053 
1054   void flush(id_type id);
1055 
1056   void expand_table();
1057 
1058   id_type find_unit(id_type id, id_type *hash_id) const;
1059   id_type find_node(id_type node_id, id_type *hash_id) const;
1060 
1061   bool are_equal(id_type node_id, id_type unit_id) const;
1062 
1063   id_type hash_unit(id_type id) const;
1064   id_type hash_node(id_type id) const;
1065 
1066   id_type append_node();
1067   id_type append_unit();
1068 
free_node(id_type id)1069   void free_node(id_type id) {
1070     recycle_bin_.push(id);
1071   }
1072 
hash(id_type key)1073   static id_type hash(id_type key) {
1074     key = ~key + (key << 15);  // key = (key << 15) - key - 1;
1075     key = key ^ (key >> 12);
1076     key = key + (key << 2);
1077     key = key ^ (key >> 4);
1078     key = key * 2057;  // key = (key + (key << 3)) + (key << 11);
1079     key = key ^ (key >> 16);
1080     return key;
1081   }
1082 };
1083 
init()1084 inline void DawgBuilder::init() {
1085   table_.resize(INITIAL_TABLE_SIZE, 0);
1086 
1087   append_node();
1088   append_unit();
1089 
1090   num_states_ = 1;
1091 
1092   nodes_[0].set_label(0xFF);
1093   node_stack_.push(0);
1094 }
1095 
finish()1096 inline void DawgBuilder::finish() {
1097   flush(0);
1098 
1099   units_[0] = nodes_[0].unit();
1100   labels_[0] = nodes_[0].label();
1101 
1102   nodes_.clear();
1103   table_.clear();
1104   node_stack_.clear();
1105   recycle_bin_.clear();
1106 
1107   is_intersections_.build();
1108 }
1109 
insert(const char * key,std::size_t length,value_type value)1110 inline void DawgBuilder::insert(const char *key, std::size_t length,
1111     value_type value) {
1112   if (value < 0) {
1113     DARTS_THROW("failed to insert key: negative value");
1114   } else if (length == 0) {
1115     DARTS_THROW("failed to insert key: zero-length key");
1116   }
1117 
1118   id_type id = 0;
1119   std::size_t key_pos = 0;
1120 
1121   for ( ; key_pos <= length; ++key_pos) {
1122     id_type child_id = nodes_[id].child();
1123     if (child_id == 0) {
1124       break;
1125     }
1126 
1127     uchar_type key_label = static_cast<uchar_type>(key[key_pos]);
1128     if (key_pos < length && key_label == '\0') {
1129       DARTS_THROW("failed to insert key: invalid null character");
1130     }
1131 
1132     uchar_type unit_label = nodes_[child_id].label();
1133     if (key_label < unit_label) {
1134       DARTS_THROW("failed to insert key: wrong key order");
1135     } else if (key_label > unit_label) {
1136       nodes_[child_id].set_has_sibling(true);
1137       flush(child_id);
1138       break;
1139     }
1140     id = child_id;
1141   }
1142 
1143   if (key_pos > length) {
1144     return;
1145   }
1146 
1147   for ( ; key_pos <= length; ++key_pos) {
1148     uchar_type key_label = static_cast<uchar_type>(
1149         (key_pos < length) ? key[key_pos] : '\0');
1150     id_type child_id = append_node();
1151 
1152     if (nodes_[id].child() == 0) {
1153       nodes_[child_id].set_is_state(true);
1154     }
1155     nodes_[child_id].set_sibling(nodes_[id].child());
1156     nodes_[child_id].set_label(key_label);
1157     nodes_[id].set_child(child_id);
1158     node_stack_.push(child_id);
1159 
1160     id = child_id;
1161   }
1162   nodes_[id].set_value(value);
1163 }
1164 
clear()1165 inline void DawgBuilder::clear() {
1166   nodes_.clear();
1167   units_.clear();
1168   labels_.clear();
1169   is_intersections_.clear();
1170   table_.clear();
1171   node_stack_.clear();
1172   recycle_bin_.clear();
1173   num_states_ = 0;
1174 }
1175 
flush(id_type id)1176 inline void DawgBuilder::flush(id_type id) {
1177   while (node_stack_.top() != id) {
1178     id_type node_id = node_stack_.top();
1179     node_stack_.pop();
1180 
1181     if (num_states_ >= table_.size() - (table_.size() >> 2)) {
1182       expand_table();
1183     }
1184 
1185     id_type num_siblings = 0;
1186     for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) {
1187       ++num_siblings;
1188     }
1189 
1190     id_type hash_id;
1191     id_type match_id = find_node(node_id, &hash_id);
1192     if (match_id != 0) {
1193       is_intersections_.set(match_id, true);
1194     } else {
1195       id_type unit_id = 0;
1196       for (id_type i = 0; i < num_siblings; ++i) {
1197         unit_id = append_unit();
1198       }
1199       for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) {
1200         units_[unit_id] = nodes_[i].unit();
1201         labels_[unit_id] = nodes_[i].label();
1202         --unit_id;
1203       }
1204       match_id = unit_id + 1;
1205       table_[hash_id] = match_id;
1206       ++num_states_;
1207     }
1208 
1209     for (id_type i = node_id, next; i != 0; i = next) {
1210       next = nodes_[i].sibling();
1211       free_node(i);
1212     }
1213 
1214     nodes_[node_stack_.top()].set_child(match_id);
1215   }
1216   node_stack_.pop();
1217 }
1218 
expand_table()1219 inline void DawgBuilder::expand_table() {
1220   std::size_t table_size = table_.size() << 1;
1221   table_.clear();
1222   table_.resize(table_size, 0);
1223 
1224   for (std::size_t i = 1; i < units_.size(); ++i) {
1225     id_type id = static_cast<id_type>(i);
1226     if (labels_[id] == '\0' || units_[id].is_state()) {
1227       id_type hash_id;
1228       find_unit(id, &hash_id);
1229       table_[hash_id] = id;
1230     }
1231   }
1232 }
1233 
find_unit(id_type id,id_type * hash_id)1234 inline id_type DawgBuilder::find_unit(id_type id, id_type *hash_id) const {
1235   *hash_id = hash_unit(id) % table_.size();
1236   for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) {
1237     id_type unit_id = table_[*hash_id];
1238     if (unit_id == 0) {
1239       break;
1240     }
1241 
1242     // There must not be the same unit.
1243   }
1244   return 0;
1245 }
1246 
find_node(id_type node_id,id_type * hash_id)1247 inline id_type DawgBuilder::find_node(id_type node_id,
1248     id_type *hash_id) const {
1249   *hash_id = hash_node(node_id) % table_.size();
1250   for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) {
1251     id_type unit_id = table_[*hash_id];
1252     if (unit_id == 0) {
1253       break;
1254     }
1255 
1256     if (are_equal(node_id, unit_id)) {
1257       return unit_id;
1258     }
1259   }
1260   return 0;
1261 }
1262 
are_equal(id_type node_id,id_type unit_id)1263 inline bool DawgBuilder::are_equal(id_type node_id, id_type unit_id) const {
1264   for (id_type i = nodes_[node_id].sibling(); i != 0;
1265       i = nodes_[i].sibling()) {
1266     if (units_[unit_id].has_sibling() == false) {
1267       return false;
1268     }
1269     ++unit_id;
1270   }
1271   if (units_[unit_id].has_sibling() == true) {
1272     return false;
1273   }
1274 
1275   for (id_type i = node_id; i != 0; i = nodes_[i].sibling(), --unit_id) {
1276     if (nodes_[i].unit() != units_[unit_id].unit() ||
1277         nodes_[i].label() != labels_[unit_id]) {
1278       return false;
1279     }
1280   }
1281   return true;
1282 }
1283 
hash_unit(id_type id)1284 inline id_type DawgBuilder::hash_unit(id_type id) const {
1285   id_type hash_value = 0;
1286   for ( ; id != 0; ++id) {
1287     id_type unit = units_[id].unit();
1288     uchar_type label = labels_[id];
1289     hash_value ^= hash((label << 24) ^ unit);
1290 
1291     if (units_[id].has_sibling() == false) {
1292       break;
1293     }
1294   }
1295   return hash_value;
1296 }
1297 
hash_node(id_type id)1298 inline id_type DawgBuilder::hash_node(id_type id) const {
1299   id_type hash_value = 0;
1300   for ( ; id != 0; id = nodes_[id].sibling()) {
1301     id_type unit = nodes_[id].unit();
1302     uchar_type label = nodes_[id].label();
1303     hash_value ^= hash((label << 24) ^ unit);
1304   }
1305   return hash_value;
1306 }
1307 
append_unit()1308 inline id_type DawgBuilder::append_unit() {
1309   is_intersections_.append();
1310   units_.append();
1311   labels_.append();
1312 
1313   return static_cast<id_type>(is_intersections_.size() - 1);
1314 }
1315 
append_node()1316 inline id_type DawgBuilder::append_node() {
1317   id_type id;
1318   if (recycle_bin_.empty()) {
1319     id = static_cast<id_type>(nodes_.size());
1320     nodes_.append();
1321   } else {
1322     id = recycle_bin_.top();
1323     nodes_[id] = DawgNode();
1324     recycle_bin_.pop();
1325   }
1326   return id;
1327 }
1328 
1329 //
1330 // Unit of double-array builder.
1331 //
1332 
1333 class DoubleArrayBuilderUnit {
1334  public:
DoubleArrayBuilderUnit()1335   DoubleArrayBuilderUnit() : unit_(0) {}
1336 
set_has_leaf(bool has_leaf)1337   void set_has_leaf(bool has_leaf) {
1338     if (has_leaf) {
1339       unit_ |= 1U << 8;
1340     } else {
1341       unit_ &= ~(1U << 8);
1342     }
1343   }
set_value(value_type value)1344   void set_value(value_type value) {
1345     unit_ = value | (1U << 31);
1346   }
set_label(uchar_type label)1347   void set_label(uchar_type label) {
1348     unit_ = (unit_ & ~0xFFU) | label;
1349   }
set_offset(id_type offset)1350   void set_offset(id_type offset) {
1351     if (offset >= 1U << 29) {
1352       DARTS_THROW("failed to modify unit: too large offset");
1353     }
1354     unit_ &= (1U << 31) | (1U << 8) | 0xFF;
1355     if (offset < 1U << 21) {
1356       unit_ |= (offset << 10);
1357     } else {
1358       unit_ |= (offset << 2) | (1U << 9);
1359     }
1360   }
1361 
1362  private:
1363   id_type unit_;
1364 
1365   // Copyable.
1366 };
1367 
1368 //
1369 // Extra unit of double-array builder.
1370 //
1371 
1372 class DoubleArrayBuilderExtraUnit {
1373  public:
DoubleArrayBuilderExtraUnit()1374   DoubleArrayBuilderExtraUnit() : prev_(0), next_(0),
1375       is_fixed_(false), is_used_(false) {}
1376 
set_prev(id_type prev)1377   void set_prev(id_type prev) {
1378     prev_ = prev;
1379   }
set_next(id_type next)1380   void set_next(id_type next) {
1381     next_ = next;
1382   }
set_is_fixed(bool is_fixed)1383   void set_is_fixed(bool is_fixed) {
1384     is_fixed_ = is_fixed;
1385   }
set_is_used(bool is_used)1386   void set_is_used(bool is_used) {
1387     is_used_ = is_used;
1388   }
1389 
prev()1390   id_type prev() const {
1391     return prev_;
1392   }
next()1393   id_type next() const {
1394     return next_;
1395   }
is_fixed()1396   bool is_fixed() const {
1397     return is_fixed_;
1398   }
is_used()1399   bool is_used() const {
1400     return is_used_;
1401   }
1402 
1403  private:
1404   id_type prev_;
1405   id_type next_;
1406   bool is_fixed_;
1407   bool is_used_;
1408 
1409   // Copyable.
1410 };
1411 
1412 //
1413 // DAWG -> double-array converter.
1414 //
1415 
1416 class DoubleArrayBuilder {
1417  public:
DoubleArrayBuilder(progress_func_type progress_func)1418   explicit DoubleArrayBuilder(progress_func_type progress_func)
1419       : progress_func_(progress_func), units_(), extras_(), labels_(),
1420         table_(), extras_head_(0) {}
~DoubleArrayBuilder()1421   ~DoubleArrayBuilder() {
1422     clear();
1423   }
1424 
1425   template <typename T>
1426   void build(const Keyset<T> &keyset);
1427   void copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const;
1428 
1429   void clear();
1430 
1431  private:
1432   enum { BLOCK_SIZE = 256 };
1433   enum { NUM_EXTRA_BLOCKS = 16 };
1434   enum { NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS };
1435 
1436   enum { UPPER_MASK = 0xFF << 21 };
1437   enum { LOWER_MASK = 0xFF };
1438 
1439   typedef DoubleArrayBuilderUnit unit_type;
1440   typedef DoubleArrayBuilderExtraUnit extra_type;
1441 
1442   progress_func_type progress_func_;
1443   AutoPool<unit_type> units_;
1444   AutoArray<extra_type> extras_;
1445   AutoPool<uchar_type> labels_;
1446   AutoArray<id_type> table_;
1447   id_type extras_head_;
1448 
1449   // Disallows copy and assignment.
1450   DoubleArrayBuilder(const DoubleArrayBuilder &);
1451   DoubleArrayBuilder &operator=(const DoubleArrayBuilder &);
1452 
num_blocks()1453   std::size_t num_blocks() const {
1454     return units_.size() / BLOCK_SIZE;
1455   }
1456 
extras(id_type id)1457   const extra_type &extras(id_type id) const {
1458     return extras_[id % NUM_EXTRAS];
1459   }
extras(id_type id)1460   extra_type &extras(id_type id) {
1461     return extras_[id % NUM_EXTRAS];
1462   }
1463 
1464   template <typename T>
1465   void build_dawg(const Keyset<T> &keyset, DawgBuilder *dawg_builder);
1466   void build_from_dawg(const DawgBuilder &dawg);
1467   void build_from_dawg(const DawgBuilder &dawg,
1468       id_type dawg_id, id_type dic_id);
1469   id_type arrange_from_dawg(const DawgBuilder &dawg,
1470       id_type dawg_id, id_type dic_id);
1471 
1472   template <typename T>
1473   void build_from_keyset(const Keyset<T> &keyset);
1474   template <typename T>
1475   void build_from_keyset(const Keyset<T> &keyset, std::size_t begin,
1476       std::size_t end, std::size_t depth, id_type dic_id);
1477   template <typename T>
1478   id_type arrange_from_keyset(const Keyset<T> &keyset, std::size_t begin,
1479       std::size_t end, std::size_t depth, id_type dic_id);
1480 
1481   id_type find_valid_offset(id_type id) const;
1482   bool is_valid_offset(id_type id, id_type offset) const;
1483 
1484   void reserve_id(id_type id);
1485   void expand_units();
1486 
1487   void fix_all_blocks();
1488   void fix_block(id_type block_id);
1489 };
1490 
1491 template <typename T>
build(const Keyset<T> & keyset)1492 void DoubleArrayBuilder::build(const Keyset<T> &keyset) {
1493   if (keyset.has_values()) {
1494     Details::DawgBuilder dawg_builder;
1495     build_dawg(keyset, &dawg_builder);
1496     build_from_dawg(dawg_builder);
1497     dawg_builder.clear();
1498   } else {
1499     build_from_keyset(keyset);
1500   }
1501 }
1502 
copy(std::size_t * size_ptr,DoubleArrayUnit ** buf_ptr)1503 inline void DoubleArrayBuilder::copy(std::size_t *size_ptr,
1504     DoubleArrayUnit **buf_ptr) const {
1505   if (size_ptr != NULL) {
1506     *size_ptr = units_.size();
1507   }
1508   if (buf_ptr != NULL) {
1509     *buf_ptr = new DoubleArrayUnit[units_.size()];
1510     unit_type *units = reinterpret_cast<unit_type *>(*buf_ptr);
1511     for (std::size_t i = 0; i < units_.size(); ++i) {
1512       units[i] = units_[i];
1513     }
1514   }
1515 }
1516 
clear()1517 inline void DoubleArrayBuilder::clear() {
1518   units_.clear();
1519   extras_.clear();
1520   labels_.clear();
1521   table_.clear();
1522   extras_head_ = 0;
1523 }
1524 
1525 template <typename T>
build_dawg(const Keyset<T> & keyset,DawgBuilder * dawg_builder)1526 void DoubleArrayBuilder::build_dawg(const Keyset<T> &keyset,
1527     DawgBuilder *dawg_builder) {
1528   dawg_builder->init();
1529   for (std::size_t i = 0; i < keyset.num_keys(); ++i) {
1530     dawg_builder->insert(keyset.keys(i), keyset.lengths(i), keyset.values(i));
1531     if (progress_func_ != NULL) {
1532       progress_func_(i + 1, keyset.num_keys() + 1);
1533     }
1534   }
1535   dawg_builder->finish();
1536 }
1537 
build_from_dawg(const DawgBuilder & dawg)1538 inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg) {
1539   std::size_t num_units = 1;
1540   while (num_units < dawg.size()) {
1541     num_units <<= 1;
1542   }
1543   units_.reserve(num_units);
1544 
1545   table_.reset(new id_type[dawg.num_intersections()]);
1546   for (std::size_t i = 0; i < dawg.num_intersections(); ++i) {
1547     table_[i] = 0;
1548   }
1549 
1550   extras_.reset(new extra_type[NUM_EXTRAS]);
1551 
1552   reserve_id(0);
1553   extras(0).set_is_used(true);
1554   units_[0].set_offset(1);
1555   units_[0].set_label('\0');
1556 
1557   if (dawg.child(dawg.root()) != 0) {
1558     build_from_dawg(dawg, dawg.root(), 0);
1559   }
1560 
1561   fix_all_blocks();
1562 
1563   extras_.clear();
1564   labels_.clear();
1565   table_.clear();
1566 }
1567 
build_from_dawg(const DawgBuilder & dawg,id_type dawg_id,id_type dic_id)1568 inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg,
1569     id_type dawg_id, id_type dic_id) {
1570   id_type dawg_child_id = dawg.child(dawg_id);
1571   if (dawg.is_intersection(dawg_child_id)) {
1572     id_type intersection_id = dawg.intersection_id(dawg_child_id);
1573     id_type offset = table_[intersection_id];
1574     if (offset != 0) {
1575       offset ^= dic_id;
1576       if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) {
1577         if (dawg.is_leaf(dawg_child_id)) {
1578           units_[dic_id].set_has_leaf(true);
1579         }
1580         units_[dic_id].set_offset(offset);
1581         return;
1582       }
1583     }
1584   }
1585 
1586   id_type offset = arrange_from_dawg(dawg, dawg_id, dic_id);
1587   if (dawg.is_intersection(dawg_child_id)) {
1588     table_[dawg.intersection_id(dawg_child_id)] = offset;
1589   }
1590 
1591   do {
1592     uchar_type child_label = dawg.label(dawg_child_id);
1593     id_type dic_child_id = offset ^ child_label;
1594     if (child_label != '\0') {
1595       build_from_dawg(dawg, dawg_child_id, dic_child_id);
1596     }
1597     dawg_child_id = dawg.sibling(dawg_child_id);
1598   } while (dawg_child_id != 0);
1599 }
1600 
arrange_from_dawg(const DawgBuilder & dawg,id_type dawg_id,id_type dic_id)1601 inline id_type DoubleArrayBuilder::arrange_from_dawg(const DawgBuilder &dawg,
1602     id_type dawg_id, id_type dic_id) {
1603   labels_.resize(0);
1604 
1605   id_type dawg_child_id = dawg.child(dawg_id);
1606   while (dawg_child_id != 0) {
1607     labels_.append(dawg.label(dawg_child_id));
1608     dawg_child_id = dawg.sibling(dawg_child_id);
1609   }
1610 
1611   id_type offset = find_valid_offset(dic_id);
1612   units_[dic_id].set_offset(dic_id ^ offset);
1613 
1614   dawg_child_id = dawg.child(dawg_id);
1615   for (std::size_t i = 0; i < labels_.size(); ++i) {
1616     id_type dic_child_id = offset ^ labels_[i];
1617     reserve_id(dic_child_id);
1618 
1619     if (dawg.is_leaf(dawg_child_id)) {
1620       units_[dic_id].set_has_leaf(true);
1621       units_[dic_child_id].set_value(dawg.value(dawg_child_id));
1622     } else {
1623       units_[dic_child_id].set_label(labels_[i]);
1624     }
1625 
1626     dawg_child_id = dawg.sibling(dawg_child_id);
1627   }
1628   extras(offset).set_is_used(true);
1629 
1630   return offset;
1631 }
1632 
1633 template <typename T>
build_from_keyset(const Keyset<T> & keyset)1634 void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset) {
1635   std::size_t num_units = 1;
1636   while (num_units < keyset.num_keys()) {
1637     num_units <<= 1;
1638   }
1639   units_.reserve(num_units);
1640 
1641   extras_.reset(new extra_type[NUM_EXTRAS]);
1642 
1643   reserve_id(0);
1644   extras(0).set_is_used(true);
1645   units_[0].set_offset(1);
1646   units_[0].set_label('\0');
1647 
1648   if (keyset.num_keys() > 0) {
1649     build_from_keyset(keyset, 0, keyset.num_keys(), 0, 0);
1650   }
1651 
1652   fix_all_blocks();
1653 
1654   extras_.clear();
1655   labels_.clear();
1656 }
1657 
1658 template <typename T>
build_from_keyset(const Keyset<T> & keyset,std::size_t begin,std::size_t end,std::size_t depth,id_type dic_id)1659 void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset,
1660     std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) {
1661   id_type offset = arrange_from_keyset(keyset, begin, end, depth, dic_id);
1662 
1663   while (begin < end) {
1664     if (keyset.keys(begin, depth) != '\0') {
1665       break;
1666     }
1667     ++begin;
1668   }
1669   if (begin == end) {
1670     return;
1671   }
1672 
1673   std::size_t last_begin = begin;
1674   uchar_type last_label = keyset.keys(begin, depth);
1675   while (++begin < end) {
1676     uchar_type label = keyset.keys(begin, depth);
1677     if (label != last_label) {
1678       build_from_keyset(keyset, last_begin, begin,
1679           depth + 1, offset ^ last_label);
1680       last_begin = begin;
1681       last_label = keyset.keys(begin, depth);
1682     }
1683   }
1684   build_from_keyset(keyset, last_begin, end, depth + 1, offset ^ last_label);
1685 }
1686 
1687 template <typename T>
arrange_from_keyset(const Keyset<T> & keyset,std::size_t begin,std::size_t end,std::size_t depth,id_type dic_id)1688 id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset<T> &keyset,
1689     std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) {
1690   labels_.resize(0);
1691 
1692   value_type value = -1;
1693   for (std::size_t i = begin; i < end; ++i) {
1694     uchar_type label = keyset.keys(i, depth);
1695     if (label == '\0') {
1696       if (keyset.has_lengths() && depth < keyset.lengths(i)) {
1697         DARTS_THROW("failed to build double-array: "
1698             "invalid null character");
1699       } else if (keyset.values(i) < 0) {
1700         DARTS_THROW("failed to build double-array: negative value");
1701       }
1702 
1703       if (value == -1) {
1704         value = keyset.values(i);
1705       }
1706       if (progress_func_ != NULL) {
1707         progress_func_(i + 1, keyset.num_keys() + 1);
1708       }
1709     }
1710 
1711     if (labels_.empty()) {
1712       labels_.append(label);
1713     } else if (label != labels_[labels_.size() - 1]) {
1714       if (label < labels_[labels_.size() - 1]) {
1715         DARTS_THROW("failed to build double-array: wrong key order");
1716       }
1717       labels_.append(label);
1718     }
1719   }
1720 
1721   id_type offset = find_valid_offset(dic_id);
1722   units_[dic_id].set_offset(dic_id ^ offset);
1723 
1724   for (std::size_t i = 0; i < labels_.size(); ++i) {
1725     id_type dic_child_id = offset ^ labels_[i];
1726     reserve_id(dic_child_id);
1727     if (labels_[i] == '\0') {
1728       units_[dic_id].set_has_leaf(true);
1729       units_[dic_child_id].set_value(value);
1730     } else {
1731       units_[dic_child_id].set_label(labels_[i]);
1732     }
1733   }
1734   extras(offset).set_is_used(true);
1735 
1736   return offset;
1737 }
1738 
find_valid_offset(id_type id)1739 inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const {
1740   if (extras_head_ >= units_.size()) {
1741     return units_.size() | (id & LOWER_MASK);
1742   }
1743 
1744   id_type unfixed_id = extras_head_;
1745   do {
1746     id_type offset = unfixed_id ^ labels_[0];
1747     if (is_valid_offset(id, offset)) {
1748       return offset;
1749     }
1750     unfixed_id = extras(unfixed_id).next();
1751   } while (unfixed_id != extras_head_);
1752 
1753   return units_.size() | (id & LOWER_MASK);
1754 }
1755 
is_valid_offset(id_type id,id_type offset)1756 inline bool DoubleArrayBuilder::is_valid_offset(id_type id,
1757     id_type offset) const {
1758   if (extras(offset).is_used()) {
1759     return false;
1760   }
1761 
1762   id_type rel_offset = id ^ offset;
1763   if ((rel_offset & LOWER_MASK) && (rel_offset & UPPER_MASK)) {
1764     return false;
1765   }
1766 
1767   for (std::size_t i = 1; i < labels_.size(); ++i) {
1768     if (extras(offset ^ labels_[i]).is_fixed()) {
1769       return false;
1770     }
1771   }
1772 
1773   return true;
1774 }
1775 
reserve_id(id_type id)1776 inline void DoubleArrayBuilder::reserve_id(id_type id) {
1777   if (id >= units_.size()) {
1778     expand_units();
1779   }
1780 
1781   if (id == extras_head_) {
1782     extras_head_ = extras(id).next();
1783     if (extras_head_ == id) {
1784       extras_head_ = units_.size();
1785     }
1786   }
1787   extras(extras(id).prev()).set_next(extras(id).next());
1788   extras(extras(id).next()).set_prev(extras(id).prev());
1789   extras(id).set_is_fixed(true);
1790 }
1791 
expand_units()1792 inline void DoubleArrayBuilder::expand_units() {
1793   id_type src_num_units = units_.size();
1794   id_type src_num_blocks = num_blocks();
1795 
1796   id_type dest_num_units = src_num_units + BLOCK_SIZE;
1797   id_type dest_num_blocks = src_num_blocks + 1;
1798 
1799   if (dest_num_blocks > NUM_EXTRA_BLOCKS) {
1800     fix_block(src_num_blocks - NUM_EXTRA_BLOCKS);
1801   }
1802 
1803   units_.resize(dest_num_units);
1804 
1805   if (dest_num_blocks > NUM_EXTRA_BLOCKS) {
1806     for (std::size_t id = src_num_units; id < dest_num_units; ++id) {
1807       extras(id).set_is_used(false);
1808       extras(id).set_is_fixed(false);
1809     }
1810   }
1811 
1812   for (id_type i = src_num_units + 1; i < dest_num_units; ++i) {
1813     extras(i - 1).set_next(i);
1814     extras(i).set_prev(i - 1);
1815   }
1816 
1817   extras(src_num_units).set_prev(dest_num_units - 1);
1818   extras(dest_num_units - 1).set_next(src_num_units);
1819 
1820   extras(src_num_units).set_prev(extras(extras_head_).prev());
1821   extras(dest_num_units - 1).set_next(extras_head_);
1822 
1823   extras(extras(extras_head_).prev()).set_next(src_num_units);
1824   extras(extras_head_).set_prev(dest_num_units - 1);
1825 }
1826 
fix_all_blocks()1827 inline void DoubleArrayBuilder::fix_all_blocks() {
1828   id_type begin = 0;
1829   if (num_blocks() > NUM_EXTRA_BLOCKS) {
1830     begin = num_blocks() - NUM_EXTRA_BLOCKS;
1831   }
1832   id_type end = num_blocks();
1833 
1834   for (id_type block_id = begin; block_id != end; ++block_id) {
1835     fix_block(block_id);
1836   }
1837 }
1838 
fix_block(id_type block_id)1839 inline void DoubleArrayBuilder::fix_block(id_type block_id) {
1840   id_type begin = block_id * BLOCK_SIZE;
1841   id_type end = begin + BLOCK_SIZE;
1842 
1843   id_type unused_offset = 0;
1844   for (id_type offset = begin; offset != end; ++offset) {
1845     if (!extras(offset).is_used()) {
1846       unused_offset = offset;
1847       break;
1848     }
1849   }
1850 
1851   for (id_type id = begin; id != end; ++id) {
1852     if (!extras(id).is_fixed()) {
1853       reserve_id(id);
1854       units_[id].set_label(static_cast<uchar_type>(id ^ unused_offset));
1855     }
1856   }
1857 }
1858 
1859 }  // namespace Details
1860 
1861 //
1862 // Member function build() of DoubleArrayImpl.
1863 //
1864 
1865 template <typename A, typename B, typename T, typename C>
build(std::size_t num_keys,const key_type * const * keys,const std::size_t * lengths,const value_type * values,Details::progress_func_type progress_func)1866 int DoubleArrayImpl<A, B, T, C>::build(std::size_t num_keys,
1867     const key_type * const *keys, const std::size_t *lengths,
1868     const value_type *values, Details::progress_func_type progress_func) {
1869   Details::Keyset<value_type> keyset(num_keys, keys, lengths, values);
1870 
1871   Details::DoubleArrayBuilder builder(progress_func);
1872   builder.build(keyset);
1873 
1874   std::size_t size = 0;
1875   unit_type *buf = NULL;
1876   builder.copy(&size, &buf);
1877 
1878   clear();
1879 
1880   size_ = size;
1881   array_ = buf;
1882   buf_ = buf;
1883 
1884   if (progress_func != NULL) {
1885     progress_func(num_keys + 1, num_keys + 1);
1886   }
1887 
1888   return 0;
1889 }
1890 
1891 }  // namespace Darts
1892 
1893 #undef DARTS_INT_TO_STR
1894 #undef DARTS_LINE_TO_STR
1895 #undef DARTS_LINE_STR
1896 #undef DARTS_THROW
1897 
1898 #endif  // DARTS_H_
1899