1 /* 2 MeCab -- Yet Another Part-of-Speech and Morphological Analyzer 3 4 $Id: darts.h,v 1.10 2004/09/20 09:59:16 taku-ku Exp $; 5 6 Copyright (C) 2001-2004 Taku Kudo <taku-ku@is.aist-nara.ac.jp> 7 This is free software with ABSOLUTELY NO WARRANTY. 8 9 This library is free software; you can redistribute it and/or 10 modify it under the terms of the GNU Lesser General Public 11 License as published by the Free Software Foundation; either 12 version 2.1 of the License, or (at your option) any later version. 13 14 This library is distributed in the hope that it will be useful, 15 but WITHOUT ANY WARRANTY; without even the implied warranty of 16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 Lesser General Public License for more details. 18 19 You should have received a copy of the GNU Lesser General Public 20 License along with this library; if not, write to the Free Software 21 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22 */ 23 #ifndef _DARTS_H 24 #define _DARTS_H 25 26 #define DARTS_VERSION "0.3" 27 28 #include <vector> 29 #include <cstring> 30 #include <cstdio> 31 32 #ifdef HAVE_ZLIB_H 33 namespace zlib { 34 #include <zlib.h> 35 } 36 #define SH(p) ((unsigned short)(unsigned char)((p)[0]) | ((unsigned short)(unsigned char)((p)[1]) << 8)) 37 #define LG(p) ((unsigned long)(SH(p)) | ((unsigned long)(SH((p)+2)) << 16)) 38 #endif 39 40 namespace Darts { 41 _max(T x,T y)42 template <class T> inline T _max (T x, T y) { return (x > y) ? x : y; } _resize(T * ptr,size_t n,size_t l,T v)43 template <class T> inline T* _resize (T* ptr, size_t n, size_t l, T v) 44 { 45 T *tmp = new T [l]; // realloc 46 for (size_t i = 0; i < n; ++i) tmp[i] = ptr[i]; // copy 47 for (size_t i = n; i < l; ++i) tmp[i] = v; 48 delete [] ptr; 49 return tmp; 50 } 51 52 template <class T> 53 class Length { operator()54 public: size_t operator() (const T *str) const 55 { size_t i; for (i = 0; str[i] != (T)0; ++i) {}; return i; } 56 }; 57 58 template <> class Length<char> { operator()59 public: size_t operator() (const char *str) const 60 { return std::strlen (str); }; 61 }; 62 63 template <class NodeType, class NodeUType, 64 class ArrayType, class ArrayUType, class LengthFunc = Length<NodeType> > 65 class DoubleArrayImpl 66 { 67 private: 68 69 struct node_t 70 { 71 ArrayUType code; 72 size_t depth; 73 size_t left; 74 size_t right; 75 }; 76 77 struct unit_t 78 { 79 ArrayType base; 80 ArrayUType check; 81 }; 82 83 unit_t *array; 84 size_t *used; 85 size_t size; 86 size_t alloc_size; 87 NodeType **str; 88 size_t str_size; 89 size_t *len; 90 ArrayType *val; 91 92 unsigned int progress; 93 size_t next_check_pos; 94 int no_delete; 95 int (*progress_func) (size_t, size_t); 96 resize(const size_t new_size)97 size_t resize (const size_t new_size) 98 { 99 unit_t tmp; 100 tmp.base = 0; 101 tmp.check = 0; 102 103 array = _resize (array, alloc_size, new_size, tmp); 104 used = _resize (used, alloc_size, new_size, (size_t)0); 105 alloc_size = new_size; 106 107 return new_size; 108 } 109 fetch(const node_t & parent,std::vector<node_t> & siblings)110 size_t fetch (const node_t &parent, std::vector <node_t> &siblings) 111 { 112 ArrayUType prev = 0; 113 114 for (size_t i = parent.left; i < parent.right; ++i) { 115 if ((len ? len[i] : LengthFunc()(str[i])) < parent.depth) continue; 116 117 const NodeUType *tmp = (NodeUType *)(str[i]); 118 119 ArrayUType cur = 0; 120 if ((len ? len[i] : LengthFunc()(str[i])) != parent.depth) 121 cur = (ArrayUType)tmp[parent.depth] + 1; 122 123 if (prev > cur) throw -3; 124 125 if (cur != prev || siblings.empty()) { 126 node_t tmp_node; 127 tmp_node.depth = parent.depth + 1; 128 tmp_node.code = cur; 129 tmp_node.left = i; 130 if (! siblings.empty()) siblings[siblings.size()-1].right = i; 131 132 siblings.push_back(tmp_node); 133 } 134 135 prev = cur; 136 } 137 138 if (! siblings.empty()) 139 siblings[siblings.size()-1].right = parent.right; 140 141 return siblings.size(); 142 } 143 insert(const std::vector<node_t> & siblings)144 size_t insert (const std::vector <node_t> &siblings) 145 { 146 size_t begin = 0; 147 size_t pos = _max ((size_t)siblings[0].code + 1, next_check_pos) - 1; 148 size_t nonzero_num = 0; 149 int first = 0; 150 151 if (alloc_size <= pos) resize (pos + 1); 152 153 while (1) { 154 next: 155 ++pos; 156 157 if (array[pos].check) { 158 ++nonzero_num; 159 continue; 160 } else if (! first) { 161 next_check_pos = pos; 162 first = 1; 163 } 164 165 begin = pos - siblings[0].code; 166 if (alloc_size < (begin + siblings[siblings.size()-1].code)) 167 resize((size_t)(alloc_size * _max(1.05, 1.0 * str_size / progress))); 168 169 if (used[begin]) continue; 170 171 for (size_t i = 1; i < siblings.size(); ++i) 172 if (array[begin + siblings[i].code].check != 0) goto next; 173 174 break; 175 } 176 177 // -- Simple heuristics -- 178 // if the percentage of non-empty contents in check between the index 179 // 'next_check_pos' and 'check' is greater than some constant value (e.g. 0.9), 180 // new 'next_check_pos' index is written by 'check'. 181 if (1.0 * nonzero_num/(pos - next_check_pos + 1) >= 0.95) next_check_pos = pos; 182 183 used[begin] = 1; 184 size = _max (size, begin + (size_t)siblings[siblings.size()-1].code + 1); 185 186 for (size_t i = 0; i < siblings.size(); ++i) 187 array[begin + siblings[i].code].check = begin; 188 189 for (size_t i = 0; i < siblings.size(); ++i) { 190 std::vector <node_t> new_siblings; 191 192 if (! fetch(siblings[i], new_siblings)) { 193 array[begin + siblings[i].code].base = 194 val ? (ArrayType)(-val[siblings[i].left]-1) : (ArrayType)(-siblings[i].left-1); 195 196 if (val && (ArrayType)(-val[siblings[i].left]-1) >= 0) throw -2; 197 198 ++progress; 199 if (progress_func) (*progress_func) (progress, str_size); 200 201 } else { 202 size_t h = insert(new_siblings); 203 array[begin + siblings[i].code].base = h; 204 } 205 } 206 207 return begin; 208 } 209 210 public: 211 212 typedef ArrayType value_type; 213 typedef NodeType key_type; 214 215 typedef ArrayType result_type; // for compatibility 216 217 struct value_pair_type 218 { 219 value_type value; 220 size_t length; 221 }; 222 DoubleArrayImpl()223 DoubleArrayImpl (): array(0), used(0), size(0), alloc_size(0), no_delete(0) {} ~DoubleArrayImpl()224 ~DoubleArrayImpl () { clear(); } 225 226 // helper function setResult(value_type & x,value_type r,size_t l)227 void setResult (value_type& x, value_type r, size_t l) { x = r; } setResult(value_pair_type & x,value_type r,size_t l)228 void setResult (value_pair_type& x, value_type r, size_t l) { x.value = r; x.length = l; } 229 230 int setArray (void *ptr, size_t array_size = 0) 231 { 232 clear(); 233 array = (unit_t *)ptr; 234 no_delete = 1; 235 size = array_size; 236 return 1; 237 } 238 getArray()239 void *getArray () 240 { 241 return (void *)array; 242 } 243 clear()244 void clear () 245 { 246 if (! no_delete) delete [] array; 247 delete [] used; 248 array = 0; 249 used = 0; 250 alloc_size = 0; 251 size = 0; 252 no_delete = 0; 253 } 254 getUnitSize()255 size_t getUnitSize() { return sizeof(unit_t); }; 256 getSize()257 size_t getSize() { return size; }; 258 getNonzeroSize()259 size_t getNonzeroSize () 260 { 261 size_t result = 0; 262 for (size_t i = 0; i < size; ++i) 263 if (array[i].check) ++result; 264 return result; 265 } 266 267 int build (size_t _str_size, 268 key_type **_str, 269 size_t *_len = 0, 270 value_type *_val = 0, 271 int (*_progress_func)(size_t, size_t) = 0) 272 { 273 try { 274 if (!_str_size || ! _str) return 0; 275 276 progress_func = _progress_func; 277 str = _str; 278 len = _len; 279 str_size = _str_size; 280 val = _val; 281 progress = 0; 282 283 resize (1024 * 10); 284 285 array[0].base = 1; 286 next_check_pos = 0; 287 288 node_t root_node; 289 root_node.left = 0; 290 root_node.right = _str_size; 291 root_node.depth = 0; 292 293 std::vector <node_t> siblings; 294 fetch (root_node, siblings); 295 insert (siblings); 296 297 size += sizeof (ArrayType); 298 if (size > alloc_size) resize (size); 299 300 delete [] used; 301 used = 0; 302 return 0; 303 } 304 catch(int & e)305 catch (int &e) { 306 delete [] used; 307 used = 0; 308 clear (); 309 return e; 310 } 311 312 // swallow all errors catch(...)313 catch (...) { 314 delete [] used; 315 used = 0; 316 clear (); 317 return -1; 318 } 319 } 320 321 int open (const char *file, 322 const char *mode = "rb", 323 size_t offset = 0, 324 size_t _size = 0) 325 { 326 std::FILE *fp = std::fopen(file, mode); 327 if (! fp) return -1; 328 if (std::fseek (fp, offset, SEEK_SET) != 0) return -1; 329 330 if (! _size) { 331 if (std::fseek (fp, 0L, SEEK_END) != 0) return -1; 332 _size = std::ftell (fp); 333 if (std::fseek (fp, offset, SEEK_SET) != 0) return -1; 334 } 335 336 clear(); 337 338 size = _size; 339 size /= sizeof(unit_t); 340 array = new unit_t [size]; 341 if (size != std::fread ((unit_t *)array, sizeof(unit_t), size, fp)) return -1; 342 std::fclose (fp); 343 344 return 0; 345 } 346 347 int save (const char *file, 348 const char *mode = "wb", 349 size_t offset = 0) 350 { 351 if (! size) return -1; 352 std::FILE *fp = std::fopen(file, mode); 353 if (! fp) return -1; 354 if (size != std::fwrite((unit_t *)array, sizeof(unit_t), size, fp)) return -1; 355 std::fclose (fp); 356 return 0; 357 } 358 359 #ifdef HAVE_ZLIB_H 360 int gzopen (const char *file, 361 const char *mode = "rb", 362 size_t offset = 0, 363 size_t _size = 0) 364 { 365 std::FILE *fp = std::fopen (file, mode); 366 if (! fp) return -1; 367 clear(); 368 369 size = _size; 370 if (! size) { 371 if (-1L != (long)std::fseek (fp, (-8), SEEK_END)) { 372 char buf [8]; 373 if (std::fread ((char*)buf, 1, 8, fp) != sizeof(buf)) { 374 std::fclose(fp); 375 return -1; 376 } 377 size = LG (buf+4); 378 size /= sizeof (unit_t); 379 } 380 } 381 std::fclose(fp); 382 383 if (! size) return -1; 384 385 zlib::gzFile gzfp = zlib::gzopen(file, mode); 386 if (! gzfp) return -1; 387 array = new unit_t [size]; 388 if (zlib::gzseek (gzfp, offset, SEEK_SET) != 0) return -1; 389 zlib::gzread (gzfp, (unit_t *)array, sizeof(unit_t) * size); 390 zlib::gzclose (gzfp); 391 return 0; 392 } 393 394 int gzsave (const char *file, const char *mode = "wb", size_t offset = 0) 395 { 396 zlib::gzFile gzfp = zlib::gzopen (file, mode); 397 if (!gzfp) return -1; 398 zlib::gzwrite (gzfp, (unit_t *)array, sizeof(unit_t) * size); 399 zlib::gzclose (gzfp); 400 return 0; 401 } 402 #endif 403 404 value_type exactMatchSearch (const key_type *key, 405 size_t len = 0, 406 size_t node_pos = 0) 407 { 408 if (! len) len = LengthFunc() (key); 409 410 register ArrayType b = array[node_pos].base; 411 register ArrayUType p; 412 413 for (register size_t i = 0; i < len; ++i) { 414 p = b + (NodeUType)(key[i]) + 1; 415 if ((ArrayUType)b == array[p].check) b = array[p].base; 416 else return -1; 417 } 418 419 p = b; 420 ArrayType n = array[p].base; 421 if ((ArrayUType)b == array[p].check && n < 0) return -n-1; 422 423 return -1; 424 } 425 426 template <class T> size_t commonPrefixSearch (const key_type *key, 427 T* result, 428 size_t result_len, 429 size_t len = 0, 430 size_t node_pos = 0) 431 { 432 if (! len) len = LengthFunc() (key); 433 434 register ArrayType b = array[node_pos].base; 435 register size_t num = 0; 436 register ArrayType n; 437 register ArrayUType p; 438 439 for (register size_t i = 0; i < len; ++i) { 440 p = b; // + 0; 441 n = array[p].base; 442 if ((ArrayUType) b == array[p].check && n < 0) { 443 if (num < result_len) setResult (result[num], -n-1, i); // result[num] = -n-1; 444 ++num; 445 } 446 447 p = b + (NodeUType)(key[i]) + 1; 448 if ((ArrayUType) b == array[p].check) b = array[p].base; 449 else return num; 450 } 451 452 p = b; 453 n = array[p].base; 454 455 if ((ArrayUType)b == array[p].check && n < 0) { 456 if (num < result_len) setResult(result[num], -n-1, len); 457 ++num; 458 } 459 460 return num; 461 } 462 463 value_type traverse (const key_type *key, size_t &node_pos, size_t &key_pos, size_t len = 0) 464 { 465 if (! len) len = LengthFunc() (key); 466 467 register ArrayType b = array[node_pos].base; 468 register ArrayUType p; 469 470 for (; key_pos < len; ++key_pos) { 471 p = b + (NodeUType)(key[key_pos]) + 1; 472 if ((ArrayUType)b == array[p].check) { node_pos = p; b = array[p].base; } 473 else return -2; // no node 474 } 475 476 p = b; 477 ArrayType n = array[p].base; 478 if ((ArrayUType)b == array[p].check && n < 0) return -n-1; 479 480 return -1; // found, but no value 481 } 482 }; 483 484 #if 4 == 2 485 typedef Darts::DoubleArrayImpl<char, unsigned char, short, unsigned short> DoubleArray; 486 #define DARTS_ARRAY_SIZE_IS_DEFINED 1 487 #endif 488 489 #if 4 == 4 && ! defined (DARTS_ARRAY_SIZE_IS_DEFINED) 490 typedef Darts::DoubleArrayImpl<char, unsigned char, int, unsigned int> DoubleArray; 491 #define DARTS_ARRAY_SIZE_IS_DEFINED 1 492 #endif 493 494 #if 4 == 4 && ! defined (DARTS_ARRAY_SIZE_IS_DEFINED) 495 typedef Darts::DoubleArrayImpl<char, unsigned char, long, unsigned long> DoubleArray; 496 #define DARTS_ARRAY_SIZE_IS_DEFINED 1 497 #endif 498 499 #if 4 == 8 && ! defined (DARTS_ARRAY_SIZE_IS_DEFINED) 500 typedef Darts::DoubleArrayImpl<char, unsigned char, long long, unsigned long long> DoubleArray; 501 #endif 502 } 503 #endif 504