1 /*
2   MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
3 
4   $Id: darts.h,v 1.10 2004/09/20 09:59:16 taku-ku Exp $;
5 
6   Copyright (C) 2001-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp>
7   This is free software with ABSOLUTELY NO WARRANTY.
8 
9   This library is free software; you can redistribute it and/or
10   modify it under the terms of the GNU Lesser General Public
11   License as published by the Free Software Foundation; either
12   version 2.1 of the License, or (at your option) any later version.
13 
14   This library is distributed in the hope that it will be useful,
15   but WITHOUT ANY WARRANTY; without even the implied warranty of
16   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17   Lesser General Public License for more details.
18 
19   You should have received a copy of the GNU Lesser General Public
20   License along with this library; if not, write to the Free Software
21   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
22 */
23 #ifndef _DARTS_H
24 #define _DARTS_H
25 
26 #define DARTS_VERSION "0.3"
27 
28 #include <vector>
29 #include <cstring>
30 #include <cstdio>
31 
32 #ifdef HAVE_ZLIB_H
33 namespace zlib {
34 #include <zlib.h>
35 }
36 #define SH(p) ((unsigned short)(unsigned char)((p)[0]) | ((unsigned short)(unsigned char)((p)[1]) << 8))
37 #define LG(p) ((unsigned long)(SH(p)) | ((unsigned long)(SH((p)+2)) << 16))
38 #endif
39 
40 namespace Darts {
41 
_max(T x,T y)42   template <class T> inline T _max (T x, T y) { return (x > y) ? x : y; }
_resize(T * ptr,size_t n,size_t l,T v)43   template <class T> inline T* _resize (T* ptr, size_t n, size_t l, T v)
44   {
45     T *tmp = new T [l]; // realloc
46     for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i]; // copy
47     for (size_t i = n; i < l; ++i) tmp[i] = v;
48     delete [] ptr;
49     return tmp;
50   }
51 
52   template <class T>
53   class Length {
operator()54   public: size_t operator() (const T *str) const
55     { size_t i; for (i = 0; str[i] != (T)0; ++i) {}; return i; }
56   };
57 
58   template <> class Length<char> {
operator()59   public: size_t operator() (const char *str) const
60     { return std::strlen (str); };
61   };
62 
63   template  <class NodeType,  class NodeUType,
64 	     class ArrayType, class ArrayUType, class LengthFunc = Length<NodeType> >
65   class DoubleArrayImpl
66   {
67   private:
68 
69     struct node_t
70     {
71       ArrayUType code;
72       size_t     depth;
73       size_t     left;
74       size_t     right;
75     };
76 
77     struct unit_t
78     {
79       ArrayType    base;
80       ArrayUType   check;
81     };
82 
83     unit_t         *array;
84     size_t         *used;
85     size_t         size;
86     size_t         alloc_size;
87     NodeType       **str;
88     size_t         str_size;
89     size_t         *len;
90     ArrayType      *val;
91 
92     unsigned int   progress;
93     size_t         next_check_pos;
94     int            no_delete;
95     int            (*progress_func) (size_t, size_t);
96 
resize(const size_t new_size)97     size_t resize (const size_t new_size)
98     {
99       unit_t tmp;
100       tmp.base = 0;
101       tmp.check = 0;
102 
103       array = _resize (array, alloc_size, new_size, tmp);
104       used  = _resize (used,  alloc_size, new_size, (size_t)0);
105       alloc_size = new_size;
106 
107       return new_size;
108     }
109 
fetch(const node_t & parent,std::vector<node_t> & siblings)110     size_t fetch (const node_t &parent, std::vector <node_t> &siblings)
111     {
112       ArrayUType prev = 0;
113 
114       for (size_t i = parent.left; i < parent.right; ++i) {
115 	if ((len ? len[i] : LengthFunc()(str[i])) < parent.depth) continue;
116 
117 	const NodeUType *tmp = (NodeUType *)(str[i]);
118 
119 	ArrayUType cur = 0;
120 	if ((len ? len[i] : LengthFunc()(str[i])) != parent.depth)
121 	  cur = (ArrayUType)tmp[parent.depth] + 1;
122 
123 	if (prev > cur) throw -3;
124 
125 	if (cur != prev || siblings.empty()) {
126 	  node_t tmp_node;
127 	  tmp_node.depth = parent.depth + 1;
128 	  tmp_node.code  = cur;
129 	  tmp_node.left  = i;
130 	  if (! siblings.empty()) siblings[siblings.size()-1].right = i;
131 
132 	  siblings.push_back(tmp_node);
133 	}
134 
135 	prev = cur;
136       }
137 
138       if (! siblings.empty())
139 	siblings[siblings.size()-1].right = parent.right;
140 
141       return siblings.size();
142     }
143 
insert(const std::vector<node_t> & siblings)144     size_t insert (const std::vector <node_t> &siblings)
145     {
146       size_t begin       = 0;
147       size_t pos         = _max ((size_t)siblings[0].code + 1, next_check_pos) - 1;
148       size_t nonzero_num = 0;
149       int    first       = 0;
150 
151       if (alloc_size <= pos) resize (pos + 1);
152 
153       while (1) {
154       next:
155 	++pos;
156 
157 	if (array[pos].check) {
158 	  ++nonzero_num;
159 	  continue;
160 	} else if (! first) {
161 	  next_check_pos = pos;
162 	  first = 1;
163 	}
164 
165 	begin = pos - siblings[0].code;
166 	if (alloc_size < (begin + siblings[siblings.size()-1].code))
167 	  resize((size_t)(alloc_size * _max(1.05, 1.0 * str_size / progress)));
168 
169 	if (used[begin]) continue;
170 
171 	for (size_t i = 1; i < siblings.size(); ++i)
172 	  if (array[begin + siblings[i].code].check != 0) goto next;
173 
174 	break;
175       }
176 
177       // -- Simple heuristics --
178       // if the percentage of non-empty contents in check between the index
179       // 'next_check_pos' and 'check' is greater than some constant value (e.g. 0.9),
180       // new 'next_check_pos' index is written by 'check'.
181       if (1.0 * nonzero_num/(pos - next_check_pos + 1) >= 0.95) next_check_pos = pos;
182 
183       used[begin] = 1;
184       size = _max (size, begin + (size_t)siblings[siblings.size()-1].code + 1);
185 
186       for (size_t i = 0; i < siblings.size(); ++i)
187 	array[begin + siblings[i].code].check = begin;
188 
189       for (size_t i = 0; i < siblings.size(); ++i) {
190 	std::vector <node_t> new_siblings;
191 
192 	if (! fetch(siblings[i], new_siblings)) {
193 	  array[begin + siblings[i].code].base =
194 	    val ? (ArrayType)(-val[siblings[i].left]-1) : (ArrayType)(-siblings[i].left-1);
195 
196 	  if (val && (ArrayType)(-val[siblings[i].left]-1) >= 0) throw -2;
197 
198 	  ++progress;
199 	  if (progress_func) (*progress_func) (progress, str_size);
200 
201 	} else {
202 	  size_t h = insert(new_siblings);
203 	  array[begin + siblings[i].code].base = h;
204 	}
205       }
206 
207       return begin;
208     }
209 
210   public:
211 
212     typedef ArrayType   value_type;
213     typedef NodeType    key_type;
214 
215     typedef ArrayType   result_type;  // for compatibility
216 
217     struct value_pair_type
218     {
219       value_type value;
220       size_t     length;
221     };
222 
DoubleArrayImpl()223     DoubleArrayImpl (): array(0), used(0), size(0), alloc_size(0), no_delete(0) {}
~DoubleArrayImpl()224     ~DoubleArrayImpl () { clear(); }
225 
226     // helper function
setResult(value_type & x,value_type r,size_t l)227     void setResult (value_type&      x, value_type r, size_t l) { x = r; }
setResult(value_pair_type & x,value_type r,size_t l)228     void setResult (value_pair_type& x, value_type r, size_t l) { x.value = r; x.length = l; }
229 
230     int setArray (void *ptr, size_t array_size = 0)
231     {
232       clear();
233       array = (unit_t *)ptr;
234       no_delete = 1;
235       size = array_size;
236       return 1;
237     }
238 
getArray()239     void *getArray ()
240     {
241       return (void *)array;
242     }
243 
clear()244     void clear ()
245     {
246       if (! no_delete) delete [] array;
247       delete [] used;
248       array      = 0;
249       used       = 0;
250       alloc_size = 0;
251       size       = 0;
252       no_delete  = 0;
253     }
254 
getUnitSize()255     size_t getUnitSize() { return sizeof(unit_t); };
256 
getSize()257     size_t getSize() { return size; };
258 
getNonzeroSize()259     size_t getNonzeroSize ()
260     {
261       size_t result = 0;
262       for (size_t i = 0; i < size; ++i)
263 	if (array[i].check) ++result;
264       return result;
265     }
266 
267     int build (size_t        _str_size,
268 	       key_type      **_str,
269 	       size_t        *_len = 0,
270 	       value_type   *_val = 0,
271 	       int (*_progress_func)(size_t, size_t) = 0)
272     {
273       try {
274 	if (!_str_size || ! _str) return 0;
275 
276 	progress_func  = _progress_func;
277 	str            = _str;
278 	len            = _len;
279 	str_size       = _str_size;
280 	val            = _val;
281 	progress       = 0;
282 
283 	resize (1024 * 10);
284 
285 	array[0].base = 1;
286 	next_check_pos = 0;
287 
288 	node_t root_node;
289 	root_node.left  = 0;
290 	root_node.right = _str_size;
291 	root_node.depth = 0;
292 
293 	std::vector <node_t> siblings;
294 	fetch (root_node, siblings);
295 	insert (siblings);
296 
297 	size += sizeof (ArrayType);
298 	if (size > alloc_size) resize (size);
299 
300 	delete [] used;
301 	used  = 0;
302 	return 0;
303       }
304 
catch(int & e)305       catch (int &e) {
306 	delete [] used;
307 	used  = 0;
308 	clear ();
309 	return e;
310       }
311 
312       // swallow all errors
catch(...)313       catch (...) {
314 	delete [] used;
315 	used  = 0;
316 	clear ();
317 	return -1;
318       }
319     }
320 
321     int open (const char *file,
322 	      const char *mode = "rb",
323 	      size_t offset = 0,
324 	      size_t _size = 0)
325     {
326       std::FILE *fp = std::fopen(file, mode);
327       if (! fp) return -1;
328       if (std::fseek (fp, offset, SEEK_SET) != 0) return -1;
329 
330       if (! _size) {
331 	if (std::fseek (fp, 0L,     SEEK_END) != 0) return -1;
332 	_size = std::ftell (fp);
333 	if (std::fseek (fp, offset, SEEK_SET) != 0) return -1;
334       }
335 
336       clear();
337 
338       size = _size;
339       size /= sizeof(unit_t);
340       array  = new unit_t [size];
341       if (size != std::fread ((unit_t *)array,  sizeof(unit_t), size, fp)) return -1;
342       std::fclose (fp);
343 
344       return 0;
345     }
346 
347     int save (const char *file,
348 	      const char *mode = "wb",
349 	      size_t offset = 0)
350     {
351       if (! size) return -1;
352       std::FILE *fp = std::fopen(file, mode);
353       if (! fp) return -1;
354       if (size != std::fwrite((unit_t *)array,  sizeof(unit_t), size, fp)) return -1;
355       std::fclose (fp);
356       return 0;
357     }
358 
359 #ifdef HAVE_ZLIB_H
360     int gzopen (const char *file,
361     	        const char *mode = "rb",
362 	        size_t offset = 0,
363 	        size_t _size = 0)
364     {
365       std::FILE *fp  = std::fopen (file, mode);
366       if (! fp) return -1;
367       clear();
368 
369       size = _size;
370       if (! size) {
371         if (-1L != (long)std::fseek (fp, (-8), SEEK_END)) {
372 	  char buf [8];
373 	  if (std::fread ((char*)buf, 1, 8, fp) != sizeof(buf)) {
374 	    std::fclose(fp);
375 	    return -1;
376 	  }
377 	  size = LG (buf+4);
378 	  size /= sizeof (unit_t);
379         }
380       }
381       std::fclose(fp);
382 
383       if (! size) return -1;
384 
385       zlib::gzFile gzfp = zlib::gzopen(file, mode);
386       if (! gzfp) return -1;
387       array = new unit_t [size];
388       if (zlib::gzseek (gzfp, offset, SEEK_SET) != 0) return -1;
389       zlib::gzread (gzfp, (unit_t *)array,  sizeof(unit_t) * size);
390       zlib::gzclose (gzfp);
391       return 0;
392     }
393 
394     int gzsave (const char *file, const char *mode = "wb", size_t offset = 0)
395     {
396       zlib::gzFile gzfp = zlib::gzopen (file, mode);
397       if (!gzfp) return -1;
398       zlib::gzwrite (gzfp, (unit_t *)array,  sizeof(unit_t) * size);
399       zlib::gzclose (gzfp);
400       return 0;
401     }
402 #endif
403 
404     value_type exactMatchSearch (const key_type *key,
405 				  size_t len = 0,
406 				  size_t node_pos = 0)
407     {
408       if (! len) len = LengthFunc() (key);
409 
410       register ArrayType  b = array[node_pos].base;
411       register ArrayUType p;
412 
413       for (register size_t i = 0; i < len; ++i) {
414 	p = b + (NodeUType)(key[i]) + 1;
415 	if ((ArrayUType)b == array[p].check) b = array[p].base;
416 	else return -1;
417       }
418 
419       p = b;
420       ArrayType n = array[p].base;
421       if ((ArrayUType)b == array[p].check && n < 0) return -n-1;
422 
423       return -1;
424     }
425 
426     template <class T> size_t commonPrefixSearch (const key_type *key,
427 						  T* result,
428 						  size_t result_len,
429 						  size_t len = 0,
430 						  size_t node_pos = 0)
431     {
432       if (! len) len = LengthFunc() (key);
433 
434       register ArrayType  b   = array[node_pos].base;
435       register size_t     num = 0;
436       register ArrayType  n;
437       register ArrayUType p;
438 
439       for (register size_t i = 0; i < len; ++i) {
440 	p = b; // + 0;
441 	n = array[p].base;
442 	if ((ArrayUType) b == array[p].check && n < 0) {
443 	  if (num < result_len) setResult (result[num], -n-1, i); // result[num] = -n-1;
444 	  ++num;
445 	}
446 
447 	p = b + (NodeUType)(key[i]) + 1;
448 	if ((ArrayUType) b == array[p].check) b = array[p].base;
449 	else return num;
450       }
451 
452       p = b;
453       n = array[p].base;
454 
455       if ((ArrayUType)b == array[p].check && n < 0) {
456 	if (num < result_len) setResult(result[num], -n-1, len);
457 	++num;
458       }
459 
460       return num;
461     }
462 
463     value_type traverse (const key_type *key, size_t &node_pos, size_t &key_pos, size_t len = 0)
464     {
465       if (! len) len = LengthFunc() (key);
466 
467       register ArrayType  b = array[node_pos].base;
468       register ArrayUType p;
469 
470       for (; key_pos < len; ++key_pos) {
471         p = b + (NodeUType)(key[key_pos]) + 1;
472         if ((ArrayUType)b == array[p].check) { node_pos = p; b = array[p].base; }
473         else return -2; // no node
474       }
475 
476       p = b;
477       ArrayType n = array[p].base;
478       if ((ArrayUType)b == array[p].check && n < 0) return -n-1;
479 
480       return -1; // found, but no value
481     }
482   };
483 
484 #if 4 == 2
485   typedef Darts::DoubleArrayImpl<char, unsigned char, short, unsigned short> DoubleArray;
486 #define DARTS_ARRAY_SIZE_IS_DEFINED 1
487 #endif
488 
489 #if 4 == 4 && ! defined (DARTS_ARRAY_SIZE_IS_DEFINED)
490   typedef Darts::DoubleArrayImpl<char, unsigned char, int, unsigned int> DoubleArray;
491 #define DARTS_ARRAY_SIZE_IS_DEFINED 1
492 #endif
493 
494 #if 4 == 4 && ! defined (DARTS_ARRAY_SIZE_IS_DEFINED)
495   typedef Darts::DoubleArrayImpl<char, unsigned char, long, unsigned long> DoubleArray;
496 #define DARTS_ARRAY_SIZE_IS_DEFINED 1
497 #endif
498 
499 #if 4 == 8 && ! defined (DARTS_ARRAY_SIZE_IS_DEFINED)
500   typedef Darts::DoubleArrayImpl<char, unsigned char, long long, unsigned long long> DoubleArray;
501 #endif
502 }
503 #endif
504