1 #ifndef DARTS_H_
2 #define DARTS_H_
3
4 #include <cstdio>
5 #include <exception>
6 #include <new>
7
8 #define DARTS_VERSION "0.32"
9
10 // DARTS_THROW() throws a <Darts::Exception> whose message starts with the
11 // file name and the line number. For example, DARTS_THROW("error message") at
12 // line 123 of "darts.h" throws a <Darts::Exception> which has a pointer to
13 // "darts.h:123: exception: error message". The message is available by using
14 // what() as well as that of <std::exception>.
15 #define DARTS_INT_TO_STR(value) #value
16 #define DARTS_LINE_TO_STR(line) DARTS_INT_TO_STR(line)
17 #define DARTS_LINE_STR DARTS_LINE_TO_STR(__LINE__)
18 #define DARTS_THROW(msg) throw Darts::Details::Exception( \
19 __FILE__ ":" DARTS_LINE_STR ": exception: " msg)
20
21 namespace Darts {
22
23 // The following namespace hides the internal types and classes.
24 namespace Details {
25
26 // This header assumes that <int> and <unsigned int> are 32-bit integer types.
27 //
28 // Darts-clone keeps values associated with keys. The type of the values is
29 // <value_type>. Note that the values must be positive integers because the
30 // most significant bit (MSB) of each value is used to represent whether the
31 // corresponding unit is a leaf or not. Also, the keys are represented by
32 // sequences of <char_type>s. <uchar_type> is the unsigned type of <char_type>.
33 typedef char char_type;
34 typedef unsigned char uchar_type;
35 typedef int value_type;
36
37 // The main structure of Darts-clone is an array of <DoubleArrayUnit>s, and the
38 // unit type is actually a wrapper of <id_type>.
39 typedef unsigned int id_type;
40
41 // <progress_func_type> is the type of callback functions for reporting the
42 // progress of building a dictionary. See also build() of <DoubleArray>.
43 // The 1st argument receives the progress value and the 2nd argument receives
44 // the maximum progress value. A usage example is to show the progress
45 // percentage, 100.0 * (the 1st argument) / (the 2nd argument).
46 typedef int (*progress_func_type)(std::size_t, std::size_t);
47
48 // <DoubleArrayUnit> is the type of double-array units and it is a wrapper of
49 // <id_type> in practice.
50 class DoubleArrayUnit {
51 public:
DoubleArrayUnit()52 DoubleArrayUnit() : unit_() {}
53
54 // has_leaf() returns whether a leaf unit is immediately derived from the
55 // unit (true) or not (false).
has_leaf()56 bool has_leaf() const {
57 return ((unit_ >> 8) & 1) == 1;
58 }
59 // value() returns the value stored in the unit, and thus value() is
60 // available when and only when the unit is a leaf unit.
value()61 value_type value() const {
62 return static_cast<value_type>(unit_ & ((1U << 31) - 1));
63 }
64
65 // label() returns the label associted with the unit. Note that a leaf unit
66 // always returns an invalid label. For this feature, leaf unit's label()
67 // returns an <id_type> that has the MSB of 1.
label()68 id_type label() const {
69 return unit_ & ((1U << 31) | 0xFF);
70 }
71 // offset() returns the offset from the unit to its derived units.
offset()72 id_type offset() const {
73 return (unit_ >> 10) << ((unit_ & (1U << 9)) >> 6);
74 }
75
76 private:
77 id_type unit_;
78
79 // Copyable.
80 };
81
82 // Darts-clone throws an <Exception> for memory allocation failure, invalid
83 // arguments or a too large offset. The last case means that there are too many
84 // keys in the given set of keys. Note that the `msg' of <Exception> must be a
85 // constant or static string because an <Exception> keeps only a pointer to
86 // that string.
87 class Exception : public std::exception {
88 public:
throw()89 explicit Exception(const char *msg = NULL) throw() : msg_(msg) {}
throw()90 Exception(const Exception &rhs) throw() : msg_(rhs.msg_) {}
~Exception()91 virtual ~Exception() throw() {}
92
93 // <Exception> overrides what() of <std::exception>.
what()94 virtual const char *what() const throw() {
95 return (msg_ != NULL) ? msg_ : "";
96 }
97
98 private:
99 const char *msg_;
100
101 // Disallows operator=.
102 Exception &operator=(const Exception &);
103 };
104
105 } // namespace Details
106
107 // <DoubleArrayImpl> is the interface of Darts-clone. Note that other
108 // classes should not be accessed from outside.
109 //
110 // <DoubleArrayImpl> has 4 template arguments but only the 3rd one is used as
111 // the type of values. Note that the given <T> is used only from outside, and
112 // the internal value type is not changed from <Darts::Details::value_type>.
113 // In build(), given values are casted from <T> to <Darts::Details::value_type>
114 // by using static_cast. On the other hand, values are casted from
115 // <Darts::Details::value_type> to <T> in searching dictionaries.
116 template <typename, typename, typename T, typename>
117 class DoubleArrayImpl {
118 public:
119 // Even if this <value_type> is changed, the internal value type is still
120 // <Darts::Details::value_type>. Other types, such as 64-bit integer types
121 // and floating-point number types, should not be used.
122 typedef T value_type;
123 // A key is reprenseted by a sequence of <key_type>s. For example,
124 // exactMatchSearch() takes a <const key_type *>.
125 typedef Details::char_type key_type;
126 // In searching dictionaries, the values associated with the matched keys are
127 // stored into or returned as <result_type>s.
128 typedef value_type result_type;
129
130 // <result_pair_type> enables applications to get the lengths of the matched
131 // keys in addition to the values.
132 struct result_pair_type {
133 value_type value;
134 std::size_t length;
135 };
136
137 // The constructor initializes member variables with 0 and NULLs.
DoubleArrayImpl()138 DoubleArrayImpl() : size_(0), array_(NULL), buf_(NULL) {}
139 // The destructor frees memory allocated for units and then initializes
140 // member variables with 0 and NULLs.
~DoubleArrayImpl()141 virtual ~DoubleArrayImpl() {
142 clear();
143 }
144
145 // <DoubleArrayImpl> has 2 kinds of set_result()s. The 1st set_result() is to
146 // set a value to a <value_type>. The 2nd set_result() is to set a value and
147 // a length to a <result_pair_type>. By using set_result()s, search methods
148 // can return the 2 kinds of results in the same way.
149 // Why the set_result()s are non-static? It is for compatibility.
150 //
151 // The 1st set_result() takes a length as the 3rd argument but it is not
152 // used. If a compiler does a good job, codes for getting the length may be
153 // removed.
set_result(value_type * result,value_type value,std::size_t)154 void set_result(value_type *result, value_type value, std::size_t) const {
155 *result = value;
156 }
157 // The 2nd set_result() uses both `value' and `length'.
set_result(result_pair_type * result,value_type value,std::size_t length)158 void set_result(result_pair_type *result,
159 value_type value, std::size_t length) const {
160 result->value = value;
161 result->length = length;
162 }
163
164 // set_array() calls clear() in order to free memory allocated to the old
165 // array and then sets a new array. This function is useful to set a memory-
166 // mapped array. Note that the array set by set_array() is not freed in
167 // clear() and the destructor of <DoubleArrayImpl>.
168 // set_array() can also set the size of the new array but the size is not
169 // used in search methods. So it works well even if the 2nd argument is 0 or
170 // omitted. Remember that size() and total_size() returns 0 in such a case.
171 void set_array(const void *ptr, std::size_t size = 0) {
172 clear();
173 array_ = static_cast<const unit_type *>(ptr);
174 size_ = size;
175 }
176 // array() returns a pointer to the array of units.
array()177 const void *array() const {
178 return array_;
179 }
180
181 // clear() frees memory allocated to units and then initializes member
182 // variables with 0 and NULLs. Note that clear() does not free memory if the
183 // array of units was set by set_array(). In such a case, `array_' is not
184 // NULL and `buf_' is NULL.
clear()185 void clear() {
186 size_ = 0;
187 array_ = NULL;
188 if (buf_ != NULL) {
189 delete[] buf_;
190 buf_ = NULL;
191 }
192 }
193
194 // unit_size() returns the size of each unit. The size must be 4 bytes.
unit_size()195 std::size_t unit_size() const {
196 return sizeof(unit_type);
197 }
198 // size() returns the number of units. It can be 0 if set_array() is used.
size()199 std::size_t size() const {
200 return size_;
201 }
202 // total_size() returns the number of bytes allocated to the array of units.
203 // It can be 0 if set_array() is used.
total_size()204 std::size_t total_size() const {
205 return unit_size() * size();
206 }
207 // nonzero_size() exists for compatibility. It always returns the number of
208 // units because it takes long time to count the number of non-zero units.
nonzero_size()209 std::size_t nonzero_size() const {
210 return size();
211 }
212
213 // build() constructs a dictionary from given key-value pairs. If `lengths'
214 // is NULL, `keys' is handled as an array of zero-terminated strings. If
215 // `values' is NULL, the index in `keys' is associated with each key, i.e.
216 // the ith key has (i - 1) as its value.
217 // Note that the key-value pairs must be arranged in key order and the values
218 // must not be negative. Also, if there are duplicate keys, only the first
219 // pair will be stored in the resultant dictionary.
220 // `progress_func' is a pointer to a callback function. If it is not NULL,
221 // it will be called in build() so that the caller can check the progress of
222 // dictionary construction. For details, please see the definition of
223 // <Darts::Details::progress_func_type>.
224 // The return value of build() is 0, and it indicates the success of the
225 // operation. Otherwise, build() throws a <Darts::Exception>, which is a
226 // derived class of <std::exception>.
227 // build() uses another construction algorithm if `values' is not NULL. In
228 // this case, Darts-clone uses a Directed Acyclic Word Graph (DAWG) instead
229 // of a trie because a DAWG is likely to be more compact than a trie.
230 int build(std::size_t num_keys, const key_type * const *keys,
231 const std::size_t *lengths = NULL, const value_type *values = NULL,
232 Details::progress_func_type progress_func = NULL);
233
234 // open() reads an array of units from the specified file. And if it goes
235 // well, the old array will be freed and replaced with the new array read
236 // from the file. `offset' specifies the number of bytes to be skipped before
237 // reading an array. `size' specifies the number of bytes to be read from the
238 // file. If the `size' is 0, the whole file will be read.
239 // open() returns 0 iff the operation succeeds. Otherwise, it returns a
240 // non-zero value or throws a <Darts::Exception>. The exception is thrown
241 // when and only when a memory allocation fails.
242 int open(const char *file_name, const char *mode = "rb",
243 std::size_t offset = 0, std::size_t size = 0);
244 // save() writes the array of units into the specified file. `offset'
245 // specifies the number of bytes to be skipped before writing the array.
246 // open() returns 0 iff the operation succeeds. Otherwise, it returns a
247 // non-zero value.
248 int save(const char *file_name, const char *mode = "wb",
249 std::size_t offset = 0) const;
250
251 // The 1st exactMatchSearch() tests whether the given key exists or not, and
252 // if it exists, its value and length are set to `result'. Otherwise, the
253 // value and the length of `result' are set to -1 and 0 respectively.
254 // Note that if `length' is 0, `key' is handled as a zero-terminated string.
255 // `node_pos' specifies the start position of matching. This argument enables
256 // the combination of exactMatchSearch() and traverse(). For example, if you
257 // want to test "xyzA", "xyzBC", and "xyzDE", you can use traverse() to get
258 // the node position corresponding to "xyz" and then you can use
259 // exactMatchSearch() to test "A", "BC", and "DE" from that position.
260 // Note that the length of `result' indicates the length from the `node_pos'.
261 // In the above example, the lengths are { 1, 2, 2 }, not { 4, 5, 5 }.
262 template <class U>
263 void exactMatchSearch(const key_type *key, U &result,
264 std::size_t length = 0, std::size_t node_pos = 0) const {
265 result = exactMatchSearch<U>(key, length, node_pos);
266 }
267 // The 2nd exactMatchSearch() returns a result instead of updating the 2nd
268 // argument. So, the following exactMatchSearch() has only 3 arguments.
269 template <class U>
270 inline U exactMatchSearch(const key_type *key, std::size_t length = 0,
271 std::size_t node_pos = 0) const;
272
273 // commonPrefixSearch() searches for keys which match a prefix of the given
274 // string. If `length' is 0, `key' is handled as a zero-terminated string.
275 // The values and the lengths of at most `max_num_results' matched keys are
276 // stored in `results'. commonPrefixSearch() returns the number of matched
277 // keys. Note that the return value can be larger than `max_num_results' if
278 // there are more than `max_num_results' matches. If you want to get all the
279 // results, allocate more spaces and call commonPrefixSearch() again.
280 // `node_pos' works as well as in exactMatchSearch().
281 template <class U>
282 inline std::size_t commonPrefixSearch(const key_type *key, U *results,
283 std::size_t max_num_results, std::size_t length = 0,
284 std::size_t node_pos = 0) const;
285
286 // In Darts-clone, a dictionary is a deterministic finite-state automaton
287 // (DFA) and traverse() tests transitions on the DFA. The initial state is
288 // `node_pos' and traverse() chooses transitions labeled key[key_pos],
289 // key[key_pos + 1], ... in order. If there is not a transition labeled
290 // key[key_pos + i], traverse() terminates the transitions at that state and
291 // returns -2. Otherwise, traverse() ends without a termination and returns
292 // -1 or a nonnegative value, -1 indicates that the final state was not an
293 // accept state. When a nonnegative value is returned, it is the value
294 // associated with the final accept state. That is, traverse() returns the
295 // value associated with the given key if it exists. Note that traverse()
296 // updates `node_pos' and `key_pos' after each transition.
297 inline value_type traverse(const key_type *key, std::size_t &node_pos,
298 std::size_t &key_pos, std::size_t length = 0) const;
299
300 private:
301 typedef Details::uchar_type uchar_type;
302 typedef Details::id_type id_type;
303 typedef Details::DoubleArrayUnit unit_type;
304
305 std::size_t size_;
306 const unit_type *array_;
307 unit_type *buf_;
308
309 // Disallows copy and assignment.
310 DoubleArrayImpl(const DoubleArrayImpl &);
311 DoubleArrayImpl &operator=(const DoubleArrayImpl &);
312 };
313
314 // <DoubleArray> is the typical instance of <DoubleArrayImpl>. It uses <int>
315 // as the type of values and it is suitable for most cases.
316 typedef DoubleArrayImpl<void, void, int, void> DoubleArray;
317
318 // The interface section ends here. For using Darts-clone, there is no need
319 // to read the remaining section, which gives the implementation of
320 // Darts-clone.
321
322 //
323 // Member functions of DoubleArrayImpl (except build()).
324 //
325
326 template <typename A, typename B, typename T, typename C>
open(const char * file_name,const char * mode,std::size_t offset,std::size_t size)327 int DoubleArrayImpl<A, B, T, C>::open(const char *file_name,
328 const char *mode, std::size_t offset, std::size_t size) {
329 #ifdef _MSC_VER
330 std::FILE *file;
331 if (::fopen_s(&file, file_name, mode) != 0) {
332 return -1;
333 }
334 #else
335 std::FILE *file = std::fopen(file_name, mode);
336 if (file == NULL) {
337 return -1;
338 }
339 #endif
340
341 if (size == 0) {
342 if (std::fseek(file, 0, SEEK_END) != 0) {
343 std::fclose(file);
344 return -1;
345 }
346 size = std::ftell(file) - offset;
347 }
348
349 if (std::fseek(file, offset, SEEK_SET) != 0) {
350 std::fclose(file);
351 return -1;
352 }
353
354 size /= unit_size();
355 unit_type *buf;
356 try {
357 buf = new unit_type[size];
358 } catch (const std::bad_alloc &) {
359 std::fclose(file);
360 DARTS_THROW("failed to open double-array: std::bad_alloc");
361 }
362
363 if (std::fread(buf, unit_size(), size, file) != size) {
364 std::fclose(file);
365 delete[] buf;
366 return -1;
367 }
368 std::fclose(file);
369
370 clear();
371
372 size_ = size;
373 array_ = buf;
374 buf_ = buf;
375 return 0;
376 }
377
378 template <typename A, typename B, typename T, typename C>
save(const char * file_name,const char * mode,std::size_t)379 int DoubleArrayImpl<A, B, T, C>::save(const char *file_name,
380 const char *mode, std::size_t) const {
381 if (size() == 0) {
382 return -1;
383 }
384
385 #ifdef _MSC_VER
386 std::FILE *file;
387 if (::fopen_s(&file, file_name, mode) != 0) {
388 return -1;
389 }
390 #else
391 std::FILE *file = std::fopen(file_name, mode);
392 if (file == NULL) {
393 return -1;
394 }
395 #endif
396
397 if (std::fwrite(array_, unit_size(), size(), file) != size()) {
398 std::fclose(file);
399 return -1;
400 }
401 std::fclose(file);
402 return 0;
403 }
404
405 template <typename A, typename B, typename T, typename C>
406 template <typename U>
exactMatchSearch(const key_type * key,std::size_t length,std::size_t node_pos)407 inline U DoubleArrayImpl<A, B, T, C>::exactMatchSearch(const key_type *key,
408 std::size_t length, std::size_t node_pos) const {
409 U result;
410 set_result(&result, static_cast<value_type>(-1), 0);
411
412 unit_type unit = array_[node_pos];
413 if (length != 0) {
414 for (std::size_t i = 0; i < length; ++i) {
415 node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[i]);
416 unit = array_[node_pos];
417 if (unit.label() != static_cast<uchar_type>(key[i])) {
418 return result;
419 }
420 }
421 } else {
422 for ( ; key[length] != '\0'; ++length) {
423 node_pos ^= unit.offset() ^ static_cast<uchar_type>(key[length]);
424 unit = array_[node_pos];
425 if (unit.label() != static_cast<uchar_type>(key[length])) {
426 return result;
427 }
428 }
429 }
430
431 if (!unit.has_leaf()) {
432 return result;
433 }
434 unit = array_[node_pos ^ unit.offset()];
435 set_result(&result, static_cast<value_type>(unit.value()), length);
436 return result;
437 }
438
439 template <typename A, typename B, typename T, typename C>
440 template <typename U>
commonPrefixSearch(const key_type * key,U * results,std::size_t max_num_results,std::size_t length,std::size_t node_pos)441 inline std::size_t DoubleArrayImpl<A, B, T, C>::commonPrefixSearch(
442 const key_type *key, U *results, std::size_t max_num_results,
443 std::size_t length, std::size_t node_pos) const {
444 std::size_t num_results = 0;
445
446 unit_type unit = array_[node_pos];
447 node_pos ^= unit.offset();
448 if (length != 0) {
449 for (std::size_t i = 0; i < length; ++i) {
450 node_pos ^= static_cast<uchar_type>(key[i]);
451 unit = array_[node_pos];
452 if (unit.label() != static_cast<uchar_type>(key[i])) {
453 return num_results;
454 }
455
456 node_pos ^= unit.offset();
457 if (unit.has_leaf()) {
458 if (num_results < max_num_results) {
459 set_result(&results[num_results], static_cast<value_type>(
460 array_[node_pos].value()), i + 1);
461 }
462 ++num_results;
463 }
464 }
465 } else {
466 for ( ; key[length] != '\0'; ++length) {
467 node_pos ^= static_cast<uchar_type>(key[length]);
468 unit = array_[node_pos];
469 if (unit.label() != static_cast<uchar_type>(key[length])) {
470 return num_results;
471 }
472
473 node_pos ^= unit.offset();
474 if (unit.has_leaf()) {
475 if (num_results < max_num_results) {
476 set_result(&results[num_results], static_cast<value_type>(
477 array_[node_pos].value()), length + 1);
478 }
479 ++num_results;
480 }
481 }
482 }
483
484 return num_results;
485 }
486
487 template <typename A, typename B, typename T, typename C>
488 inline typename DoubleArrayImpl<A, B, T, C>::value_type
traverse(const key_type * key,std::size_t & node_pos,std::size_t & key_pos,std::size_t length)489 DoubleArrayImpl<A, B, T, C>::traverse(const key_type *key,
490 std::size_t &node_pos, std::size_t &key_pos, std::size_t length) const {
491 id_type id = static_cast<id_type>(node_pos);
492 unit_type unit = array_[id];
493
494 if (length != 0) {
495 for ( ; key_pos < length; ++key_pos) {
496 id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]);
497 unit = array_[id];
498 if (unit.label() != static_cast<uchar_type>(key[key_pos])) {
499 return static_cast<value_type>(-2);
500 }
501 node_pos = id;
502 }
503 } else {
504 for ( ; key[key_pos] != '\0'; ++key_pos) {
505 id ^= unit.offset() ^ static_cast<uchar_type>(key[key_pos]);
506 unit = array_[id];
507 if (unit.label() != static_cast<uchar_type>(key[key_pos])) {
508 return static_cast<value_type>(-2);
509 }
510 node_pos = id;
511 }
512 }
513
514 if (!unit.has_leaf()) {
515 return static_cast<value_type>(-1);
516 }
517 unit = array_[id ^ unit.offset()];
518 return static_cast<value_type>(unit.value());
519 }
520
521 namespace Details {
522
523 //
524 // Memory management of array.
525 //
526
527 template <typename T>
528 class AutoArray {
529 public:
array_(array)530 explicit AutoArray(T *array = NULL) : array_(array) {}
~AutoArray()531 ~AutoArray() {
532 clear();
533 }
534
535 const T &operator[](std::size_t id) const {
536 return array_[id];
537 }
538 T &operator[](std::size_t id) {
539 return array_[id];
540 }
541
empty()542 bool empty() const {
543 return array_ == NULL;
544 }
545
clear()546 void clear() {
547 if (array_ != NULL) {
548 delete[] array_;
549 array_ = NULL;
550 }
551 }
swap(AutoArray * array)552 void swap(AutoArray *array) {
553 T *temp = array_;
554 array_ = array->array_;
555 array->array_ = temp;
556 }
557 void reset(T *array = NULL) {
558 AutoArray(array).swap(this);
559 }
560
561 private:
562 T *array_;
563
564 // Disallows copy and assignment.
565 AutoArray(const AutoArray &);
566 AutoArray &operator=(const AutoArray &);
567 };
568
569 //
570 // Memory management of resizable array.
571 //
572
573 template <typename T>
574 class AutoPool {
575 public:
AutoPool()576 AutoPool() : buf_(), size_(0), capacity_(0) {}
~AutoPool()577 ~AutoPool() { clear(); }
578
579 const T &operator[](std::size_t id) const {
580 return *(reinterpret_cast<const T *>(&buf_[0]) + id);
581 }
582 T &operator[](std::size_t id) {
583 return *(reinterpret_cast<T *>(&buf_[0]) + id);
584 }
585
empty()586 bool empty() const {
587 return size_ == 0;
588 }
size()589 std::size_t size() const {
590 return size_;
591 }
592
clear()593 void clear() {
594 resize(0);
595 buf_.clear();
596 size_ = 0;
597 capacity_ = 0;
598 }
599
push_back(const T & value)600 void push_back(const T &value) {
601 append(value);
602 }
pop_back()603 void pop_back() {
604 (*this)[--size_].~T();
605 }
606
append()607 void append() {
608 if (size_ == capacity_)
609 resize_buf(size_ + 1);
610 new(&(*this)[size_++]) T;
611 }
append(const T & value)612 void append(const T &value) {
613 if (size_ == capacity_)
614 resize_buf(size_ + 1);
615 new(&(*this)[size_++]) T(value);
616 }
617
resize(std::size_t size)618 void resize(std::size_t size) {
619 while (size_ > size) {
620 (*this)[--size_].~T();
621 }
622 if (size > capacity_) {
623 resize_buf(size);
624 }
625 while (size_ < size) {
626 new(&(*this)[size_++]) T;
627 }
628 }
resize(std::size_t size,const T & value)629 void resize(std::size_t size, const T &value) {
630 while (size_ > size) {
631 (*this)[--size_].~T();
632 }
633 if (size > capacity_) {
634 resize_buf(size);
635 }
636 while (size_ < size) {
637 new(&(*this)[size_++]) T(value);
638 }
639 }
640
reserve(std::size_t size)641 void reserve(std::size_t size) {
642 if (size > capacity_) {
643 resize_buf(size);
644 }
645 }
646
647 private:
648 AutoArray<char> buf_;
649 std::size_t size_;
650 std::size_t capacity_;
651
652 // Disallows copy and assignment.
653 AutoPool(const AutoPool &);
654 AutoPool &operator=(const AutoPool &);
655
656 void resize_buf(std::size_t size);
657 };
658
659 template <typename T>
resize_buf(std::size_t size)660 void AutoPool<T>::resize_buf(std::size_t size) {
661 std::size_t capacity;
662 if (size >= capacity_ * 2) {
663 capacity = size;
664 } else {
665 capacity = 1;
666 while (capacity < size) {
667 capacity <<= 1;
668 }
669 }
670
671 AutoArray<char> buf;
672 try {
673 buf.reset(new char[sizeof(T) * capacity]);
674 } catch (const std::bad_alloc &) {
675 DARTS_THROW("failed to resize pool: std::bad_alloc");
676 }
677
678 if (size_ > 0) {
679 T *src = reinterpret_cast<T *>(&buf_[0]);
680 T *dest = reinterpret_cast<T *>(&buf[0]);
681 for (std::size_t i = 0; i < size_; ++i) {
682 new(&dest[i]) T(src[i]);
683 src[i].~T();
684 }
685 }
686
687 buf_.swap(&buf);
688 capacity_ = capacity;
689 }
690
691 //
692 // Memory management of stack.
693 //
694
695 template <typename T>
696 class AutoStack {
697 public:
AutoStack()698 AutoStack() : pool_() {}
~AutoStack()699 ~AutoStack() {
700 clear();
701 }
702
top()703 const T &top() const {
704 return pool_[size() - 1];
705 }
top()706 T &top() {
707 return pool_[size() - 1];
708 }
709
empty()710 bool empty() const {
711 return pool_.empty();
712 }
size()713 std::size_t size() const {
714 return pool_.size();
715 }
716
push(const T & value)717 void push(const T &value) {
718 pool_.push_back(value);
719 }
pop()720 void pop() {
721 pool_.pop_back();
722 }
723
clear()724 void clear() {
725 pool_.clear();
726 }
727
728 private:
729 AutoPool<T> pool_;
730
731 // Disallows copy and assignment.
732 AutoStack(const AutoStack &);
733 AutoStack &operator=(const AutoStack &);
734 };
735
736 //
737 // Succinct bit vector.
738 //
739
740 class BitVector {
741 public:
BitVector()742 BitVector() : units_(), ranks_(), num_ones_(0), size_(0) {}
~BitVector()743 ~BitVector() {
744 clear();
745 }
746
747 bool operator[](std::size_t id) const {
748 return (units_[id / UNIT_SIZE] >> (id % UNIT_SIZE) & 1) == 1;
749 }
750
rank(std::size_t id)751 id_type rank(std::size_t id) const {
752 std::size_t unit_id = id / UNIT_SIZE;
753 return ranks_[unit_id] + pop_count(units_[unit_id]
754 & (~0U >> (UNIT_SIZE - (id % UNIT_SIZE) - 1)));
755 }
756
set(std::size_t id,bool bit)757 void set(std::size_t id, bool bit) {
758 if (bit) {
759 units_[id / UNIT_SIZE] |= 1U << (id % UNIT_SIZE);
760 } else {
761 units_[id / UNIT_SIZE] &= ~(1U << (id % UNIT_SIZE));
762 }
763 }
764
empty()765 bool empty() const {
766 return units_.empty();
767 }
num_ones()768 std::size_t num_ones() const {
769 return num_ones_;
770 }
size()771 std::size_t size() const {
772 return size_;
773 }
774
append()775 void append() {
776 if ((size_ % UNIT_SIZE) == 0) {
777 units_.append(0);
778 }
779 ++size_;
780 }
781 void build();
782
clear()783 void clear() {
784 units_.clear();
785 ranks_.clear();
786 }
787
788 private:
789 enum { UNIT_SIZE = sizeof(id_type) * 8 };
790
791 AutoPool<id_type> units_;
792 AutoArray<id_type> ranks_;
793 std::size_t num_ones_;
794 std::size_t size_;
795
796 // Disallows copy and assignment.
797 BitVector(const BitVector &);
798 BitVector &operator=(const BitVector &);
799
pop_count(id_type unit)800 static id_type pop_count(id_type unit) {
801 unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555);
802 unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333);
803 unit = ((unit >> 4) + unit) & 0x0F0F0F0F;
804 unit += unit >> 8;
805 unit += unit >> 16;
806 return unit & 0xFF;
807 }
808 };
809
build()810 inline void BitVector::build() {
811 try {
812 ranks_.reset(new id_type[units_.size()]);
813 } catch (const std::bad_alloc &) {
814 DARTS_THROW("failed to build rank index: std::bad_alloc");
815 }
816
817 num_ones_ = 0;
818 for (std::size_t i = 0; i < units_.size(); ++i) {
819 ranks_[i] = num_ones_;
820 num_ones_ += pop_count(units_[i]);
821 }
822 }
823
824 //
825 // Keyset.
826 //
827
828 template <typename T>
829 class Keyset {
830 public:
Keyset(std::size_t num_keys,const char_type * const * keys,const std::size_t * lengths,const T * values)831 Keyset(std::size_t num_keys, const char_type * const *keys,
832 const std::size_t *lengths, const T *values) :
833 num_keys_(num_keys), keys_(keys), lengths_(lengths), values_(values) {}
834
num_keys()835 std::size_t num_keys() const {
836 return num_keys_;
837 }
keys(std::size_t id)838 const char_type *keys(std::size_t id) const {
839 return keys_[id];
840 }
keys(std::size_t key_id,std::size_t char_id)841 uchar_type keys(std::size_t key_id, std::size_t char_id) const {
842 if (has_lengths() && char_id >= lengths_[key_id])
843 return '\0';
844 return keys_[key_id][char_id];
845 }
846
has_lengths()847 bool has_lengths() const {
848 return lengths_ != NULL;
849 }
lengths(std::size_t id)850 std::size_t lengths(std::size_t id) const {
851 if (has_lengths()) {
852 return lengths_[id];
853 }
854 std::size_t length = 0;
855 while (keys_[id][length] != '\0') {
856 ++length;
857 }
858 return length;
859 }
860
has_values()861 bool has_values() const {
862 return values_ != NULL;
863 }
values(std::size_t id)864 const value_type values(std::size_t id) const {
865 if (has_values()) {
866 return static_cast<value_type>(values_[id]);
867 }
868 return static_cast<value_type>(id);
869 }
870
871 private:
872 std::size_t num_keys_;
873 const char_type * const * keys_;
874 const std::size_t *lengths_;
875 const T *values_;
876
877 // Disallows copy and assignment.
878 Keyset(const Keyset &);
879 Keyset &operator=(const Keyset &);
880 };
881
882 //
883 // Node of Directed Acyclic Word Graph (DAWG).
884 //
885
886 class DawgNode {
887 public:
DawgNode()888 DawgNode() : child_(0), sibling_(0), label_('\0'),
889 is_state_(false), has_sibling_(false) {}
890
set_child(id_type child)891 void set_child(id_type child) {
892 child_ = child;
893 }
set_sibling(id_type sibling)894 void set_sibling(id_type sibling) {
895 sibling_ = sibling;
896 }
set_value(value_type value)897 void set_value(value_type value) {
898 child_ = value;
899 }
set_label(uchar_type label)900 void set_label(uchar_type label) {
901 label_ = label;
902 }
set_is_state(bool is_state)903 void set_is_state(bool is_state) {
904 is_state_ = is_state;
905 }
set_has_sibling(bool has_sibling)906 void set_has_sibling(bool has_sibling) {
907 has_sibling_ = has_sibling;
908 }
909
child()910 id_type child() const {
911 return child_;
912 }
sibling()913 id_type sibling() const {
914 return sibling_;
915 }
value()916 value_type value() const {
917 return static_cast<value_type>(child_);
918 }
label()919 uchar_type label() const {
920 return label_;
921 }
is_state()922 bool is_state() const {
923 return is_state_;
924 }
has_sibling()925 bool has_sibling() const {
926 return has_sibling_;
927 }
928
unit()929 id_type unit() const {
930 if (label_ == '\0') {
931 return (child_ << 1) | (has_sibling_ ? 1 : 0);
932 }
933 return (child_ << 2) | (is_state_ ? 2 : 0) | (has_sibling_ ? 1 : 0);
934 }
935
936 private:
937 id_type child_;
938 id_type sibling_;
939 uchar_type label_;
940 bool is_state_;
941 bool has_sibling_;
942
943 // Copyable.
944 };
945
946 //
947 // Fixed unit of Directed Acyclic Word Graph (DAWG).
948 //
949
950 class DawgUnit {
951 public:
unit_(unit)952 explicit DawgUnit(id_type unit = 0) : unit_(unit) {}
DawgUnit(const DawgUnit & unit)953 DawgUnit(const DawgUnit &unit) : unit_(unit.unit_) {}
954
955 DawgUnit &operator=(id_type unit) {
956 unit_ = unit;
957 return *this;
958 }
959
unit()960 id_type unit() const {
961 return unit_;
962 }
963
child()964 id_type child() const {
965 return unit_ >> 2;
966 }
has_sibling()967 bool has_sibling() const {
968 return (unit_ & 1) == 1;
969 }
value()970 value_type value() const {
971 return static_cast<value_type>(unit_ >> 1);
972 }
is_state()973 bool is_state() const {
974 return (unit_ & 2) == 2;
975 }
976
977 private:
978 id_type unit_;
979
980 // Copyable.
981 };
982
983 //
984 // Directed Acyclic Word Graph (DAWG) builder.
985 //
986
987 class DawgBuilder {
988 public:
DawgBuilder()989 DawgBuilder() : nodes_(), units_(), labels_(), is_intersections_(),
990 table_(), node_stack_(), recycle_bin_(), num_states_(0) {}
~DawgBuilder()991 ~DawgBuilder() {
992 clear();
993 }
994
root()995 id_type root() const {
996 return 0;
997 }
998
child(id_type id)999 id_type child(id_type id) const {
1000 return units_[id].child();
1001 }
sibling(id_type id)1002 id_type sibling(id_type id) const {
1003 return units_[id].has_sibling() ? (id + 1) : 0;
1004 }
value(id_type id)1005 int value(id_type id) const {
1006 return units_[id].value();
1007 }
1008
is_leaf(id_type id)1009 bool is_leaf(id_type id) const {
1010 return label(id) == '\0';
1011 }
label(id_type id)1012 uchar_type label(id_type id) const {
1013 return labels_[id];
1014 }
1015
is_intersection(id_type id)1016 bool is_intersection(id_type id) const {
1017 return is_intersections_[id];
1018 }
intersection_id(id_type id)1019 id_type intersection_id(id_type id) const {
1020 return is_intersections_.rank(id) - 1;
1021 }
1022
num_intersections()1023 std::size_t num_intersections() const {
1024 return is_intersections_.num_ones();
1025 }
1026
size()1027 std::size_t size() const {
1028 return units_.size();
1029 }
1030
1031 void init();
1032 void finish();
1033
1034 void insert(const char *key, std::size_t length, value_type value);
1035
1036 void clear();
1037
1038 private:
1039 enum { INITIAL_TABLE_SIZE = 1 << 10 };
1040
1041 AutoPool<DawgNode> nodes_;
1042 AutoPool<DawgUnit> units_;
1043 AutoPool<uchar_type> labels_;
1044 BitVector is_intersections_;
1045 AutoPool<id_type> table_;
1046 AutoStack<id_type> node_stack_;
1047 AutoStack<id_type> recycle_bin_;
1048 std::size_t num_states_;
1049
1050 // Disallows copy and assignment.
1051 DawgBuilder(const DawgBuilder &);
1052 DawgBuilder &operator=(const DawgBuilder &);
1053
1054 void flush(id_type id);
1055
1056 void expand_table();
1057
1058 id_type find_unit(id_type id, id_type *hash_id) const;
1059 id_type find_node(id_type node_id, id_type *hash_id) const;
1060
1061 bool are_equal(id_type node_id, id_type unit_id) const;
1062
1063 id_type hash_unit(id_type id) const;
1064 id_type hash_node(id_type id) const;
1065
1066 id_type append_node();
1067 id_type append_unit();
1068
free_node(id_type id)1069 void free_node(id_type id) {
1070 recycle_bin_.push(id);
1071 }
1072
hash(id_type key)1073 static id_type hash(id_type key) {
1074 key = ~key + (key << 15); // key = (key << 15) - key - 1;
1075 key = key ^ (key >> 12);
1076 key = key + (key << 2);
1077 key = key ^ (key >> 4);
1078 key = key * 2057; // key = (key + (key << 3)) + (key << 11);
1079 key = key ^ (key >> 16);
1080 return key;
1081 }
1082 };
1083
init()1084 inline void DawgBuilder::init() {
1085 table_.resize(INITIAL_TABLE_SIZE, 0);
1086
1087 append_node();
1088 append_unit();
1089
1090 num_states_ = 1;
1091
1092 nodes_[0].set_label(0xFF);
1093 node_stack_.push(0);
1094 }
1095
finish()1096 inline void DawgBuilder::finish() {
1097 flush(0);
1098
1099 units_[0] = nodes_[0].unit();
1100 labels_[0] = nodes_[0].label();
1101
1102 nodes_.clear();
1103 table_.clear();
1104 node_stack_.clear();
1105 recycle_bin_.clear();
1106
1107 is_intersections_.build();
1108 }
1109
insert(const char * key,std::size_t length,value_type value)1110 inline void DawgBuilder::insert(const char *key, std::size_t length,
1111 value_type value) {
1112 if (value < 0) {
1113 DARTS_THROW("failed to insert key: negative value");
1114 } else if (length == 0) {
1115 DARTS_THROW("failed to insert key: zero-length key");
1116 }
1117
1118 id_type id = 0;
1119 std::size_t key_pos = 0;
1120
1121 for ( ; key_pos <= length; ++key_pos) {
1122 id_type child_id = nodes_[id].child();
1123 if (child_id == 0) {
1124 break;
1125 }
1126
1127 uchar_type key_label = static_cast<uchar_type>(key[key_pos]);
1128 if (key_pos < length && key_label == '\0') {
1129 DARTS_THROW("failed to insert key: invalid null character");
1130 }
1131
1132 uchar_type unit_label = nodes_[child_id].label();
1133 if (key_label < unit_label) {
1134 DARTS_THROW("failed to insert key: wrong key order");
1135 } else if (key_label > unit_label) {
1136 nodes_[child_id].set_has_sibling(true);
1137 flush(child_id);
1138 break;
1139 }
1140 id = child_id;
1141 }
1142
1143 if (key_pos > length) {
1144 return;
1145 }
1146
1147 for ( ; key_pos <= length; ++key_pos) {
1148 uchar_type key_label = static_cast<uchar_type>(
1149 (key_pos < length) ? key[key_pos] : '\0');
1150 id_type child_id = append_node();
1151
1152 if (nodes_[id].child() == 0) {
1153 nodes_[child_id].set_is_state(true);
1154 }
1155 nodes_[child_id].set_sibling(nodes_[id].child());
1156 nodes_[child_id].set_label(key_label);
1157 nodes_[id].set_child(child_id);
1158 node_stack_.push(child_id);
1159
1160 id = child_id;
1161 }
1162 nodes_[id].set_value(value);
1163 }
1164
clear()1165 inline void DawgBuilder::clear() {
1166 nodes_.clear();
1167 units_.clear();
1168 labels_.clear();
1169 is_intersections_.clear();
1170 table_.clear();
1171 node_stack_.clear();
1172 recycle_bin_.clear();
1173 num_states_ = 0;
1174 }
1175
flush(id_type id)1176 inline void DawgBuilder::flush(id_type id) {
1177 while (node_stack_.top() != id) {
1178 id_type node_id = node_stack_.top();
1179 node_stack_.pop();
1180
1181 if (num_states_ >= table_.size() - (table_.size() >> 2)) {
1182 expand_table();
1183 }
1184
1185 id_type num_siblings = 0;
1186 for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) {
1187 ++num_siblings;
1188 }
1189
1190 id_type hash_id;
1191 id_type match_id = find_node(node_id, &hash_id);
1192 if (match_id != 0) {
1193 is_intersections_.set(match_id, true);
1194 } else {
1195 id_type unit_id = 0;
1196 for (id_type i = 0; i < num_siblings; ++i) {
1197 unit_id = append_unit();
1198 }
1199 for (id_type i = node_id; i != 0; i = nodes_[i].sibling()) {
1200 units_[unit_id] = nodes_[i].unit();
1201 labels_[unit_id] = nodes_[i].label();
1202 --unit_id;
1203 }
1204 match_id = unit_id + 1;
1205 table_[hash_id] = match_id;
1206 ++num_states_;
1207 }
1208
1209 for (id_type i = node_id, next; i != 0; i = next) {
1210 next = nodes_[i].sibling();
1211 free_node(i);
1212 }
1213
1214 nodes_[node_stack_.top()].set_child(match_id);
1215 }
1216 node_stack_.pop();
1217 }
1218
expand_table()1219 inline void DawgBuilder::expand_table() {
1220 std::size_t table_size = table_.size() << 1;
1221 table_.clear();
1222 table_.resize(table_size, 0);
1223
1224 for (std::size_t i = 1; i < units_.size(); ++i) {
1225 id_type id = static_cast<id_type>(i);
1226 if (labels_[id] == '\0' || units_[id].is_state()) {
1227 id_type hash_id;
1228 find_unit(id, &hash_id);
1229 table_[hash_id] = id;
1230 }
1231 }
1232 }
1233
find_unit(id_type id,id_type * hash_id)1234 inline id_type DawgBuilder::find_unit(id_type id, id_type *hash_id) const {
1235 *hash_id = hash_unit(id) % table_.size();
1236 for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) {
1237 id_type unit_id = table_[*hash_id];
1238 if (unit_id == 0) {
1239 break;
1240 }
1241
1242 // There must not be the same unit.
1243 }
1244 return 0;
1245 }
1246
find_node(id_type node_id,id_type * hash_id)1247 inline id_type DawgBuilder::find_node(id_type node_id,
1248 id_type *hash_id) const {
1249 *hash_id = hash_node(node_id) % table_.size();
1250 for ( ; ; *hash_id = (*hash_id + 1) % table_.size()) {
1251 id_type unit_id = table_[*hash_id];
1252 if (unit_id == 0) {
1253 break;
1254 }
1255
1256 if (are_equal(node_id, unit_id)) {
1257 return unit_id;
1258 }
1259 }
1260 return 0;
1261 }
1262
are_equal(id_type node_id,id_type unit_id)1263 inline bool DawgBuilder::are_equal(id_type node_id, id_type unit_id) const {
1264 for (id_type i = nodes_[node_id].sibling(); i != 0;
1265 i = nodes_[i].sibling()) {
1266 if (units_[unit_id].has_sibling() == false) {
1267 return false;
1268 }
1269 ++unit_id;
1270 }
1271 if (units_[unit_id].has_sibling() == true) {
1272 return false;
1273 }
1274
1275 for (id_type i = node_id; i != 0; i = nodes_[i].sibling(), --unit_id) {
1276 if (nodes_[i].unit() != units_[unit_id].unit() ||
1277 nodes_[i].label() != labels_[unit_id]) {
1278 return false;
1279 }
1280 }
1281 return true;
1282 }
1283
hash_unit(id_type id)1284 inline id_type DawgBuilder::hash_unit(id_type id) const {
1285 id_type hash_value = 0;
1286 for ( ; id != 0; ++id) {
1287 id_type unit = units_[id].unit();
1288 uchar_type label = labels_[id];
1289 hash_value ^= hash((label << 24) ^ unit);
1290
1291 if (units_[id].has_sibling() == false) {
1292 break;
1293 }
1294 }
1295 return hash_value;
1296 }
1297
hash_node(id_type id)1298 inline id_type DawgBuilder::hash_node(id_type id) const {
1299 id_type hash_value = 0;
1300 for ( ; id != 0; id = nodes_[id].sibling()) {
1301 id_type unit = nodes_[id].unit();
1302 uchar_type label = nodes_[id].label();
1303 hash_value ^= hash((label << 24) ^ unit);
1304 }
1305 return hash_value;
1306 }
1307
append_unit()1308 inline id_type DawgBuilder::append_unit() {
1309 is_intersections_.append();
1310 units_.append();
1311 labels_.append();
1312
1313 return static_cast<id_type>(is_intersections_.size() - 1);
1314 }
1315
append_node()1316 inline id_type DawgBuilder::append_node() {
1317 id_type id;
1318 if (recycle_bin_.empty()) {
1319 id = static_cast<id_type>(nodes_.size());
1320 nodes_.append();
1321 } else {
1322 id = recycle_bin_.top();
1323 nodes_[id] = DawgNode();
1324 recycle_bin_.pop();
1325 }
1326 return id;
1327 }
1328
1329 //
1330 // Unit of double-array builder.
1331 //
1332
1333 class DoubleArrayBuilderUnit {
1334 public:
DoubleArrayBuilderUnit()1335 DoubleArrayBuilderUnit() : unit_(0) {}
1336
set_has_leaf(bool has_leaf)1337 void set_has_leaf(bool has_leaf) {
1338 if (has_leaf) {
1339 unit_ |= 1U << 8;
1340 } else {
1341 unit_ &= ~(1U << 8);
1342 }
1343 }
set_value(value_type value)1344 void set_value(value_type value) {
1345 unit_ = value | (1U << 31);
1346 }
set_label(uchar_type label)1347 void set_label(uchar_type label) {
1348 unit_ = (unit_ & ~0xFFU) | label;
1349 }
set_offset(id_type offset)1350 void set_offset(id_type offset) {
1351 if (offset >= 1U << 29) {
1352 DARTS_THROW("failed to modify unit: too large offset");
1353 }
1354 unit_ &= (1U << 31) | (1U << 8) | 0xFF;
1355 if (offset < 1U << 21) {
1356 unit_ |= (offset << 10);
1357 } else {
1358 unit_ |= (offset << 2) | (1U << 9);
1359 }
1360 }
1361
1362 private:
1363 id_type unit_;
1364
1365 // Copyable.
1366 };
1367
1368 //
1369 // Extra unit of double-array builder.
1370 //
1371
1372 class DoubleArrayBuilderExtraUnit {
1373 public:
DoubleArrayBuilderExtraUnit()1374 DoubleArrayBuilderExtraUnit() : prev_(0), next_(0),
1375 is_fixed_(false), is_used_(false) {}
1376
set_prev(id_type prev)1377 void set_prev(id_type prev) {
1378 prev_ = prev;
1379 }
set_next(id_type next)1380 void set_next(id_type next) {
1381 next_ = next;
1382 }
set_is_fixed(bool is_fixed)1383 void set_is_fixed(bool is_fixed) {
1384 is_fixed_ = is_fixed;
1385 }
set_is_used(bool is_used)1386 void set_is_used(bool is_used) {
1387 is_used_ = is_used;
1388 }
1389
prev()1390 id_type prev() const {
1391 return prev_;
1392 }
next()1393 id_type next() const {
1394 return next_;
1395 }
is_fixed()1396 bool is_fixed() const {
1397 return is_fixed_;
1398 }
is_used()1399 bool is_used() const {
1400 return is_used_;
1401 }
1402
1403 private:
1404 id_type prev_;
1405 id_type next_;
1406 bool is_fixed_;
1407 bool is_used_;
1408
1409 // Copyable.
1410 };
1411
1412 //
1413 // DAWG -> double-array converter.
1414 //
1415
1416 class DoubleArrayBuilder {
1417 public:
DoubleArrayBuilder(progress_func_type progress_func)1418 explicit DoubleArrayBuilder(progress_func_type progress_func)
1419 : progress_func_(progress_func), units_(), extras_(), labels_(),
1420 table_(), extras_head_(0) {}
~DoubleArrayBuilder()1421 ~DoubleArrayBuilder() {
1422 clear();
1423 }
1424
1425 template <typename T>
1426 void build(const Keyset<T> &keyset);
1427 void copy(std::size_t *size_ptr, DoubleArrayUnit **buf_ptr) const;
1428
1429 void clear();
1430
1431 private:
1432 enum { BLOCK_SIZE = 256 };
1433 enum { NUM_EXTRA_BLOCKS = 16 };
1434 enum { NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS };
1435
1436 enum { UPPER_MASK = 0xFF << 21 };
1437 enum { LOWER_MASK = 0xFF };
1438
1439 typedef DoubleArrayBuilderUnit unit_type;
1440 typedef DoubleArrayBuilderExtraUnit extra_type;
1441
1442 progress_func_type progress_func_;
1443 AutoPool<unit_type> units_;
1444 AutoArray<extra_type> extras_;
1445 AutoPool<uchar_type> labels_;
1446 AutoArray<id_type> table_;
1447 id_type extras_head_;
1448
1449 // Disallows copy and assignment.
1450 DoubleArrayBuilder(const DoubleArrayBuilder &);
1451 DoubleArrayBuilder &operator=(const DoubleArrayBuilder &);
1452
num_blocks()1453 std::size_t num_blocks() const {
1454 return units_.size() / BLOCK_SIZE;
1455 }
1456
extras(id_type id)1457 const extra_type &extras(id_type id) const {
1458 return extras_[id % NUM_EXTRAS];
1459 }
extras(id_type id)1460 extra_type &extras(id_type id) {
1461 return extras_[id % NUM_EXTRAS];
1462 }
1463
1464 template <typename T>
1465 void build_dawg(const Keyset<T> &keyset, DawgBuilder *dawg_builder);
1466 void build_from_dawg(const DawgBuilder &dawg);
1467 void build_from_dawg(const DawgBuilder &dawg,
1468 id_type dawg_id, id_type dic_id);
1469 id_type arrange_from_dawg(const DawgBuilder &dawg,
1470 id_type dawg_id, id_type dic_id);
1471
1472 template <typename T>
1473 void build_from_keyset(const Keyset<T> &keyset);
1474 template <typename T>
1475 void build_from_keyset(const Keyset<T> &keyset, std::size_t begin,
1476 std::size_t end, std::size_t depth, id_type dic_id);
1477 template <typename T>
1478 id_type arrange_from_keyset(const Keyset<T> &keyset, std::size_t begin,
1479 std::size_t end, std::size_t depth, id_type dic_id);
1480
1481 id_type find_valid_offset(id_type id) const;
1482 bool is_valid_offset(id_type id, id_type offset) const;
1483
1484 void reserve_id(id_type id);
1485 void expand_units();
1486
1487 void fix_all_blocks();
1488 void fix_block(id_type block_id);
1489 };
1490
1491 template <typename T>
build(const Keyset<T> & keyset)1492 void DoubleArrayBuilder::build(const Keyset<T> &keyset) {
1493 if (keyset.has_values()) {
1494 Details::DawgBuilder dawg_builder;
1495 build_dawg(keyset, &dawg_builder);
1496 build_from_dawg(dawg_builder);
1497 dawg_builder.clear();
1498 } else {
1499 build_from_keyset(keyset);
1500 }
1501 }
1502
copy(std::size_t * size_ptr,DoubleArrayUnit ** buf_ptr)1503 inline void DoubleArrayBuilder::copy(std::size_t *size_ptr,
1504 DoubleArrayUnit **buf_ptr) const {
1505 if (size_ptr != NULL) {
1506 *size_ptr = units_.size();
1507 }
1508 if (buf_ptr != NULL) {
1509 *buf_ptr = new DoubleArrayUnit[units_.size()];
1510 unit_type *units = reinterpret_cast<unit_type *>(*buf_ptr);
1511 for (std::size_t i = 0; i < units_.size(); ++i) {
1512 units[i] = units_[i];
1513 }
1514 }
1515 }
1516
clear()1517 inline void DoubleArrayBuilder::clear() {
1518 units_.clear();
1519 extras_.clear();
1520 labels_.clear();
1521 table_.clear();
1522 extras_head_ = 0;
1523 }
1524
1525 template <typename T>
build_dawg(const Keyset<T> & keyset,DawgBuilder * dawg_builder)1526 void DoubleArrayBuilder::build_dawg(const Keyset<T> &keyset,
1527 DawgBuilder *dawg_builder) {
1528 dawg_builder->init();
1529 for (std::size_t i = 0; i < keyset.num_keys(); ++i) {
1530 dawg_builder->insert(keyset.keys(i), keyset.lengths(i), keyset.values(i));
1531 if (progress_func_ != NULL) {
1532 progress_func_(i + 1, keyset.num_keys() + 1);
1533 }
1534 }
1535 dawg_builder->finish();
1536 }
1537
build_from_dawg(const DawgBuilder & dawg)1538 inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg) {
1539 std::size_t num_units = 1;
1540 while (num_units < dawg.size()) {
1541 num_units <<= 1;
1542 }
1543 units_.reserve(num_units);
1544
1545 table_.reset(new id_type[dawg.num_intersections()]);
1546 for (std::size_t i = 0; i < dawg.num_intersections(); ++i) {
1547 table_[i] = 0;
1548 }
1549
1550 extras_.reset(new extra_type[NUM_EXTRAS]);
1551
1552 reserve_id(0);
1553 extras(0).set_is_used(true);
1554 units_[0].set_offset(1);
1555 units_[0].set_label('\0');
1556
1557 if (dawg.child(dawg.root()) != 0) {
1558 build_from_dawg(dawg, dawg.root(), 0);
1559 }
1560
1561 fix_all_blocks();
1562
1563 extras_.clear();
1564 labels_.clear();
1565 table_.clear();
1566 }
1567
build_from_dawg(const DawgBuilder & dawg,id_type dawg_id,id_type dic_id)1568 inline void DoubleArrayBuilder::build_from_dawg(const DawgBuilder &dawg,
1569 id_type dawg_id, id_type dic_id) {
1570 id_type dawg_child_id = dawg.child(dawg_id);
1571 if (dawg.is_intersection(dawg_child_id)) {
1572 id_type intersection_id = dawg.intersection_id(dawg_child_id);
1573 id_type offset = table_[intersection_id];
1574 if (offset != 0) {
1575 offset ^= dic_id;
1576 if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) {
1577 if (dawg.is_leaf(dawg_child_id)) {
1578 units_[dic_id].set_has_leaf(true);
1579 }
1580 units_[dic_id].set_offset(offset);
1581 return;
1582 }
1583 }
1584 }
1585
1586 id_type offset = arrange_from_dawg(dawg, dawg_id, dic_id);
1587 if (dawg.is_intersection(dawg_child_id)) {
1588 table_[dawg.intersection_id(dawg_child_id)] = offset;
1589 }
1590
1591 do {
1592 uchar_type child_label = dawg.label(dawg_child_id);
1593 id_type dic_child_id = offset ^ child_label;
1594 if (child_label != '\0') {
1595 build_from_dawg(dawg, dawg_child_id, dic_child_id);
1596 }
1597 dawg_child_id = dawg.sibling(dawg_child_id);
1598 } while (dawg_child_id != 0);
1599 }
1600
arrange_from_dawg(const DawgBuilder & dawg,id_type dawg_id,id_type dic_id)1601 inline id_type DoubleArrayBuilder::arrange_from_dawg(const DawgBuilder &dawg,
1602 id_type dawg_id, id_type dic_id) {
1603 labels_.resize(0);
1604
1605 id_type dawg_child_id = dawg.child(dawg_id);
1606 while (dawg_child_id != 0) {
1607 labels_.append(dawg.label(dawg_child_id));
1608 dawg_child_id = dawg.sibling(dawg_child_id);
1609 }
1610
1611 id_type offset = find_valid_offset(dic_id);
1612 units_[dic_id].set_offset(dic_id ^ offset);
1613
1614 dawg_child_id = dawg.child(dawg_id);
1615 for (std::size_t i = 0; i < labels_.size(); ++i) {
1616 id_type dic_child_id = offset ^ labels_[i];
1617 reserve_id(dic_child_id);
1618
1619 if (dawg.is_leaf(dawg_child_id)) {
1620 units_[dic_id].set_has_leaf(true);
1621 units_[dic_child_id].set_value(dawg.value(dawg_child_id));
1622 } else {
1623 units_[dic_child_id].set_label(labels_[i]);
1624 }
1625
1626 dawg_child_id = dawg.sibling(dawg_child_id);
1627 }
1628 extras(offset).set_is_used(true);
1629
1630 return offset;
1631 }
1632
1633 template <typename T>
build_from_keyset(const Keyset<T> & keyset)1634 void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset) {
1635 std::size_t num_units = 1;
1636 while (num_units < keyset.num_keys()) {
1637 num_units <<= 1;
1638 }
1639 units_.reserve(num_units);
1640
1641 extras_.reset(new extra_type[NUM_EXTRAS]);
1642
1643 reserve_id(0);
1644 extras(0).set_is_used(true);
1645 units_[0].set_offset(1);
1646 units_[0].set_label('\0');
1647
1648 if (keyset.num_keys() > 0) {
1649 build_from_keyset(keyset, 0, keyset.num_keys(), 0, 0);
1650 }
1651
1652 fix_all_blocks();
1653
1654 extras_.clear();
1655 labels_.clear();
1656 }
1657
1658 template <typename T>
build_from_keyset(const Keyset<T> & keyset,std::size_t begin,std::size_t end,std::size_t depth,id_type dic_id)1659 void DoubleArrayBuilder::build_from_keyset(const Keyset<T> &keyset,
1660 std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) {
1661 id_type offset = arrange_from_keyset(keyset, begin, end, depth, dic_id);
1662
1663 while (begin < end) {
1664 if (keyset.keys(begin, depth) != '\0') {
1665 break;
1666 }
1667 ++begin;
1668 }
1669 if (begin == end) {
1670 return;
1671 }
1672
1673 std::size_t last_begin = begin;
1674 uchar_type last_label = keyset.keys(begin, depth);
1675 while (++begin < end) {
1676 uchar_type label = keyset.keys(begin, depth);
1677 if (label != last_label) {
1678 build_from_keyset(keyset, last_begin, begin,
1679 depth + 1, offset ^ last_label);
1680 last_begin = begin;
1681 last_label = keyset.keys(begin, depth);
1682 }
1683 }
1684 build_from_keyset(keyset, last_begin, end, depth + 1, offset ^ last_label);
1685 }
1686
1687 template <typename T>
arrange_from_keyset(const Keyset<T> & keyset,std::size_t begin,std::size_t end,std::size_t depth,id_type dic_id)1688 id_type DoubleArrayBuilder::arrange_from_keyset(const Keyset<T> &keyset,
1689 std::size_t begin, std::size_t end, std::size_t depth, id_type dic_id) {
1690 labels_.resize(0);
1691
1692 value_type value = -1;
1693 for (std::size_t i = begin; i < end; ++i) {
1694 uchar_type label = keyset.keys(i, depth);
1695 if (label == '\0') {
1696 if (keyset.has_lengths() && depth < keyset.lengths(i)) {
1697 DARTS_THROW("failed to build double-array: "
1698 "invalid null character");
1699 } else if (keyset.values(i) < 0) {
1700 DARTS_THROW("failed to build double-array: negative value");
1701 }
1702
1703 if (value == -1) {
1704 value = keyset.values(i);
1705 }
1706 if (progress_func_ != NULL) {
1707 progress_func_(i + 1, keyset.num_keys() + 1);
1708 }
1709 }
1710
1711 if (labels_.empty()) {
1712 labels_.append(label);
1713 } else if (label != labels_[labels_.size() - 1]) {
1714 if (label < labels_[labels_.size() - 1]) {
1715 DARTS_THROW("failed to build double-array: wrong key order");
1716 }
1717 labels_.append(label);
1718 }
1719 }
1720
1721 id_type offset = find_valid_offset(dic_id);
1722 units_[dic_id].set_offset(dic_id ^ offset);
1723
1724 for (std::size_t i = 0; i < labels_.size(); ++i) {
1725 id_type dic_child_id = offset ^ labels_[i];
1726 reserve_id(dic_child_id);
1727 if (labels_[i] == '\0') {
1728 units_[dic_id].set_has_leaf(true);
1729 units_[dic_child_id].set_value(value);
1730 } else {
1731 units_[dic_child_id].set_label(labels_[i]);
1732 }
1733 }
1734 extras(offset).set_is_used(true);
1735
1736 return offset;
1737 }
1738
find_valid_offset(id_type id)1739 inline id_type DoubleArrayBuilder::find_valid_offset(id_type id) const {
1740 if (extras_head_ >= units_.size()) {
1741 return units_.size() | (id & LOWER_MASK);
1742 }
1743
1744 id_type unfixed_id = extras_head_;
1745 do {
1746 id_type offset = unfixed_id ^ labels_[0];
1747 if (is_valid_offset(id, offset)) {
1748 return offset;
1749 }
1750 unfixed_id = extras(unfixed_id).next();
1751 } while (unfixed_id != extras_head_);
1752
1753 return units_.size() | (id & LOWER_MASK);
1754 }
1755
is_valid_offset(id_type id,id_type offset)1756 inline bool DoubleArrayBuilder::is_valid_offset(id_type id,
1757 id_type offset) const {
1758 if (extras(offset).is_used()) {
1759 return false;
1760 }
1761
1762 id_type rel_offset = id ^ offset;
1763 if ((rel_offset & LOWER_MASK) && (rel_offset & UPPER_MASK)) {
1764 return false;
1765 }
1766
1767 for (std::size_t i = 1; i < labels_.size(); ++i) {
1768 if (extras(offset ^ labels_[i]).is_fixed()) {
1769 return false;
1770 }
1771 }
1772
1773 return true;
1774 }
1775
reserve_id(id_type id)1776 inline void DoubleArrayBuilder::reserve_id(id_type id) {
1777 if (id >= units_.size()) {
1778 expand_units();
1779 }
1780
1781 if (id == extras_head_) {
1782 extras_head_ = extras(id).next();
1783 if (extras_head_ == id) {
1784 extras_head_ = units_.size();
1785 }
1786 }
1787 extras(extras(id).prev()).set_next(extras(id).next());
1788 extras(extras(id).next()).set_prev(extras(id).prev());
1789 extras(id).set_is_fixed(true);
1790 }
1791
expand_units()1792 inline void DoubleArrayBuilder::expand_units() {
1793 id_type src_num_units = units_.size();
1794 id_type src_num_blocks = num_blocks();
1795
1796 id_type dest_num_units = src_num_units + BLOCK_SIZE;
1797 id_type dest_num_blocks = src_num_blocks + 1;
1798
1799 if (dest_num_blocks > NUM_EXTRA_BLOCKS) {
1800 fix_block(src_num_blocks - NUM_EXTRA_BLOCKS);
1801 }
1802
1803 units_.resize(dest_num_units);
1804
1805 if (dest_num_blocks > NUM_EXTRA_BLOCKS) {
1806 for (std::size_t id = src_num_units; id < dest_num_units; ++id) {
1807 extras(id).set_is_used(false);
1808 extras(id).set_is_fixed(false);
1809 }
1810 }
1811
1812 for (id_type i = src_num_units + 1; i < dest_num_units; ++i) {
1813 extras(i - 1).set_next(i);
1814 extras(i).set_prev(i - 1);
1815 }
1816
1817 extras(src_num_units).set_prev(dest_num_units - 1);
1818 extras(dest_num_units - 1).set_next(src_num_units);
1819
1820 extras(src_num_units).set_prev(extras(extras_head_).prev());
1821 extras(dest_num_units - 1).set_next(extras_head_);
1822
1823 extras(extras(extras_head_).prev()).set_next(src_num_units);
1824 extras(extras_head_).set_prev(dest_num_units - 1);
1825 }
1826
fix_all_blocks()1827 inline void DoubleArrayBuilder::fix_all_blocks() {
1828 id_type begin = 0;
1829 if (num_blocks() > NUM_EXTRA_BLOCKS) {
1830 begin = num_blocks() - NUM_EXTRA_BLOCKS;
1831 }
1832 id_type end = num_blocks();
1833
1834 for (id_type block_id = begin; block_id != end; ++block_id) {
1835 fix_block(block_id);
1836 }
1837 }
1838
fix_block(id_type block_id)1839 inline void DoubleArrayBuilder::fix_block(id_type block_id) {
1840 id_type begin = block_id * BLOCK_SIZE;
1841 id_type end = begin + BLOCK_SIZE;
1842
1843 id_type unused_offset = 0;
1844 for (id_type offset = begin; offset != end; ++offset) {
1845 if (!extras(offset).is_used()) {
1846 unused_offset = offset;
1847 break;
1848 }
1849 }
1850
1851 for (id_type id = begin; id != end; ++id) {
1852 if (!extras(id).is_fixed()) {
1853 reserve_id(id);
1854 units_[id].set_label(static_cast<uchar_type>(id ^ unused_offset));
1855 }
1856 }
1857 }
1858
1859 } // namespace Details
1860
1861 //
1862 // Member function build() of DoubleArrayImpl.
1863 //
1864
1865 template <typename A, typename B, typename T, typename C>
build(std::size_t num_keys,const key_type * const * keys,const std::size_t * lengths,const value_type * values,Details::progress_func_type progress_func)1866 int DoubleArrayImpl<A, B, T, C>::build(std::size_t num_keys,
1867 const key_type * const *keys, const std::size_t *lengths,
1868 const value_type *values, Details::progress_func_type progress_func) {
1869 Details::Keyset<value_type> keyset(num_keys, keys, lengths, values);
1870
1871 Details::DoubleArrayBuilder builder(progress_func);
1872 builder.build(keyset);
1873
1874 std::size_t size = 0;
1875 unit_type *buf = NULL;
1876 builder.copy(&size, &buf);
1877
1878 clear();
1879
1880 size_ = size;
1881 array_ = buf;
1882 buf_ = buf;
1883
1884 if (progress_func != NULL) {
1885 progress_func(num_keys + 1, num_keys + 1);
1886 }
1887
1888 return 0;
1889 }
1890
1891 } // namespace Darts
1892
1893 #undef DARTS_INT_TO_STR
1894 #undef DARTS_LINE_TO_STR
1895 #undef DARTS_LINE_STR
1896 #undef DARTS_THROW
1897
1898 #endif // DARTS_H_
1899