1#------------------------------------------------------------------------------------------------- 2# Perl binding of Hyper Estraier 3# Copyright (C) 2004-2007 Mikio Hirabayashi 4# This file is part of Hyper Estraier. 5# Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of 6# the GNU Lesser General Public License as published by the Free Software Foundation; either 7# version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope 8# that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 9# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 10# License for more details. 11# You should have received a copy of the GNU Lesser General Public License along with Hyper 12# Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 13# Boston, MA 02111-1307 USA. 14#------------------------------------------------------------------------------------------------- 15 16 17=head1 NAME 18 19Perl Binding of Hyper Estraier 20 21=head1 SYNOPSIS 22 23 use Estraier; 24 25=head1 INTRODUCTION 26 27Hyper Estraier is a full-text search system for communities. 28 29This is a package implementing the core API of Hyper Estraier ( http://hyperestraier.sourceforge.net/ ), including native codes written in C with XS macros. As it works on Linux, Mac OS X, Windows, and so on, native libraries for each environment are required to run programs. This package requires Perl 5.8.8 or later versions. 30 31=head2 Setting 32 33Install the latest version of Hyper Estraier. 34 35Enter the sub directory `perlnative' in the extracted package then perform installation. 36 37 cd perlnative 38 ./configure 39 make 40 su 41 make install 42 43On Linux and other UNIX systems: set the environment variable LD_LIBRARY_PATH to find libraries; "libestraier.so". 
On Mac OS X: set the environment variable DYLD_LIBRARY_PATH to find libraries; "libestraier.dylib". On Windows: set the environment variable PATH to find libraries; "estraier.dll". 44 45The package `Estraier' should be loaded in each source file of application programs. 46 47 use Estraier; 48 49If you want to enable runtime assertion, set the variable `$Estraier::DEBUG' to be true. 50 51 $Estraier::DEBUG = 1; 52 53=head1 DESCRIPTION 54 55=head2 Class Document 56 57=over 58 59=item $doc = new Document(I<draft>) 60 61Create a document object. `draft' specifies a string of draft data. If it is omitted, an empty document object is created. 62 63=item $doc-E<gt>add_attr(I<name>, I<value>) 64 65Add an attribute. `name' specifies the name of an attribute. `value' specifies the value of the attribute. If it is `undef', the attribute is removed. The return value is always `undef'. 66 67=item $doc-E<gt>add_text(I<text>) 68 69Add a sentence of text. `text' specifies a sentence of text. The return value is always `undef'. 70 71=item $doc-E<gt>add_hidden_text(I<text>) 72 73Add a hidden sentence. `text' specifies a hidden sentence. The return value is always `undef'. 74 75=item $doc-E<gt>set_keywords(I<kwords>) 76 77Attach keywords. `kwords' specifies the reference of a hash object of keywords. Keys of the hash should be keywords of the document and values should be their scores in decimal string. The return value is always `undef'. 78 79=item $doc-E<gt>set_score(I<score>) 80 81Set the substitute score. `score' specifies the substitute score. It should be zero or positive. The return value is always `undef'. 82 83=item $doc-E<gt>id() 84 85Get the ID number. The return value is the ID number of the document object. If the object has never been registered, -1 is returned. 86 87=item $doc-E<gt>attr_names() 88 89Get an array of attribute names of a document object. The return value is a reference of an array object of attribute names. 
90 91=item $doc-E<gt>attr(I<name>) 92 93Get the value of an attribute. `name' specifies the name of an attribute. The return value is the value of the attribute or `undef' if it does not exist. 94 95=item $doc-E<gt>texts() 96 97Get an array of sentences of the text. The return value is a reference of an array object of sentences of the text. 98 99=item $doc-E<gt>cat_texts() 100 101Concatenate sentences of the text of a document object. The return value is concatenated sentences. 102 103=item $doc-E<gt>keywords() 104 105Get attached keywords. The return value is a reference of a hash object of keywords and their scores in decimal string. If no keyword is attached, `undef' is returned. 106 107=item $doc-E<gt>score() 108 109Get the substitute score. The return value is the substitute score or -1 if it is not set. 110 111=item $doc-E<gt>dump_draft() 112 113Dump draft data of a document object. The return value is draft data. 114 115=item $doc-E<gt>make_snippet(I<words>, I<wwidth>, I<hwidth>, I<awidth>) 116 117Make a snippet of the body text. `words' specifies a reference of an array object of words to be highlighted. `wwidth' specifies whole width of the result. `hwidth' specifies width of strings picked up from the beginning of the text. `awidth' specifies width of strings picked up around each highlighted word. The return value is a snippet string of the body text. There are tab separated values. Each line is a string to be shown. Though most lines have only one field, some lines have two fields. If the second field exists, the first field is to be shown highlighted, and the second field means its normalized form. 
118 119=back 120 121=head2 Class Condition 122 123=over 124 125=item Condition::SURE = 1 << 0 126 127option: check every N-gram key 128 129=item Condition::USUAL = 1 << 1 130 131option: check N-gram keys skipping by one 132 133=item Condition::FAST = 1 << 2 134 135option: check N-gram keys skipping by two 136 137=item Condition::AGITO = 1 << 3 138 139option: check N-gram keys skipping by three 140 141=item Condition::NOIDF = 1 << 4 142 143option: without TF-IDF tuning 144 145=item Condition::SIMPLE = 1 << 10 146 147option: with the simplified phrase 148 149=item Condition::ROUGH = 1 << 11 150 151option: with the rough phrase 152 153=item Condition::UNION = 1 << 15 154 155option: with the union phrase 156 157=item Condition::ISECT = 1 << 16 158 159option: with the intersection phrase 160 161=item Condition::ECLSIMURL = 10.0 162 163eclipse tuning: consider URL 164 165=item Condition::ECLSERV = 100.0 166 167eclipse tuning: on server basis 168 169=item Condition::ECLDIR = 101.0 170 171eclipse tuning: on directory basis 172 173=item Condition::ECLFILE = 102.0 174 175eclipse tuning: on file basis 176 177=item $cond = new Condition() 178 179Create a search condition object. 180 181=item $cond-E<gt>set_phrase(I<phrase>) 182 183Set the search phrase. `phrase' specifies a search phrase. The return value is always `undef'. 184 185=item $cond-E<gt>add_attr(I<expr>) 186 187Add an expression for an attribute. `expr' specifies an expression for an attribute. The return value is always `undef'. 188 189=item $cond-E<gt>set_order(I<expr>) 190 191Set the order of a condition object. `expr' specifies an expression for the order. By default, the order is by score descending. The return value is always `undef'. 192 193=item $cond-E<gt>set_max(I<max>) 194 195Set the maximum number of retrieval. `max' specifies the maximum number of retrieval. By default, the number of retrieval is not limited. 196 197=item $cond-E<gt>set_skip(I<skip>) 198 199Set the number of skipped documents. 
`skip' specifies the number of documents to be skipped in the search result. The return value is always `undef'. 200 201=item $cond-E<gt>set_options(I<options>) 202 203Set options of retrieval. `options' specifies options: `Condition::SURE' specifies that it checks every N-gram key, `Condition::USUAL', which is the default, specifies that it checks N-gram keys with skipping one key, `Condition::FAST' skips two keys, `Condition::AGITO' skips three keys, `Condition::NOIDF' specifies not to perform TF-IDF tuning, `Condition::SIMPLE' specifies to use simplified phrase, `Condition::ROUGH' specifies to use rough phrase, `Condition::UNION' specifies to use union phrase, `Condition::ISECT' specifies to use intersection phrase. Each option can be specified at the same time by bitwise or. If keys are skipped, though search speed is improved, the relevance ratio grows less. The return value is always `undef'. 204 205=item $cond-E<gt>set_auxiliary(I<min>) 206 207Set permission to adopt result of the auxiliary index. `min' specifies the minimum hits to adopt result of the auxiliary index. If it is not more than 0, the auxiliary index is not used. By default, it is 32. 208 209=item $cond-E<gt>set_eclipse(I<limit>) 210 211Set the lower limit of similarity eclipse. `limit' specifies the lower limit of similarity for documents to be eclipsed. Similarity is between 0.0 and 1.0. If the limit is added by `Condition::ECLSIMURL', similarity is weighted by URL. If the limit is `Condition::ECLSERV', similarity is ignored and documents in the same server are eclipsed. If the limit is `Condition::ECLDIR', similarity is ignored and documents in the same directory are eclipsed. If the limit is `Condition::ECLFILE', similarity is ignored and documents of the same file are eclipsed. 212 213=item $cond-E<gt>set_distinct(I<name>) 214 215Set the attribute distinction filter. `name' specifies the name of an attribute to be distinct. The return value is always `undef'. 
216 217=back 218 219=head2 Class Result 220 221=over 222 223=item $result-E<gt>doc_num() 224 225Get the number of documents. The return value is the number of documents in the result. 226 227=item $result-E<gt>get_doc_id(I<index>) 228 229Get the ID number of a document. `index' specifies the index of a document. The return value is the ID number of the document or -1 if the index is out of bounds. 230 231=item $result-E<gt>get_dbidx(I<index>) 232 233Get the index of the container database of a document. `index' specifies the index of a document. The return value is the index of the container database of the document or -1 if the index is out of bounds. 234 235=item $result-E<gt>hint_words() 236 237Get an array of hint words. The return value is a reference of an array of hint words. 238 239=item $result-E<gt>hint(I<word>) 240 241Get the value of a hint word. `word' specifies a hint word. An empty string means the number of whole result. The return value is the number of documents corresponding to the hint word. If the word is in a negative condition, the value is negative. 242 243=item $result-E<gt>get_score(I<index>) 244 245Get the score of a document. `index' specifies the index of a document. The return value is the score of the document or -1 if the index is out of bounds. 246 247=item $result-E<gt>get_shadows(I<id>) 248 249Get an array of ID numbers of eclipsed documents of a document. `id' specifies the ID number of a parent document. The return value is a reference of an array whose elements express the ID numbers and their scores alternately. 
250 251=back 252 253=head2 Class Database 254 255=over 256 257=item Database::VERSION = "0.0.0" 258 259version of Hyper Estraier 260 261=item Database::ERRNOERR = 0 262 263error code: no error 264 265=item Database::ERRINVAL = 1 266 267error code: invalid argument 268 269=item Database::ERRACCES = 2 270 271error code: access forbidden 272 273=item Database::ERRLOCK = 3 274 275error code: lock failure 276 277=item Database::ERRDB = 4 278 279error code: database problem 280 281=item Database::ERRIO = 5 282 283error code: I/O problem 284 285=item Database::ERRNOITEM = 6 286 287error code: no item 288 289=item Database::ERRMISC = 9999 290 291error code: miscellaneous 292 293=item Database::DBREADER = 1 << 0 294 295open mode: open as a reader 296 297=item Database::DBWRITER = 1 << 1 298 299open mode: open as a writer 300 301=item Database::DBCREAT = 1 << 2 302 303open mode: a writer creating 304 305=item Database::DBTRUNC = 1 << 3 306 307open mode: a writer truncating 308 309=item Database::DBNOLCK = 1 << 4 310 311open mode: open without locking 312 313=item Database::DBLCKNB = 1 << 5 314 315open mode: lock without blocking 316 317=item Database::DBPERFNG = 1 << 10 318 319open mode: use perfect N-gram analyzer 320 321=item Database::DBCHRCAT = 1 << 11 322 323open mode: use character category analyzer 324 325=item Database::DBSMALL= 1 << 20 326 327open mode: small tuning 328 329=item Database::DBLARGE = 1 << 21 330 331open mode: large tuning 332 333=item Database::DBHUGE = 1 << 22 334 335open mode: huge tuning 336 337=item Database::DBHUGE2 = 1 << 23 338 339open mode: huge tuning second 340 341=item Database::DBHUGE3 = 1 << 24 342 343open mode: huge tuning third 344 345=item Database::DBSCVOID = 1 << 25 346 347open mode: store scores as void 348 349=item Database::DBSCINT = 1 << 26 350 351open mode: store scores as integer 352 353=item Database::DBSCASIS = 1 << 27 354 355open mode: refrain from adjustment of scores 356 357=item Database::IDXATTRSEQ = 0 358 359attribute 
index type: for multipurpose sequential access method 360 361=item Database::IDXATTRSTR = 1 362 363attribute index type: for narrowing with attributes as strings 364 365=item Database::IDXATTRNUM = 2 366 367attribute index type: for narrowing with attributes as numbers 368 369=item Database::OPTNOPURGE = 1 << 0 370 371optimize option: omit purging dispensable region of deleted documents 372 373=item Database::OPTNODBOPT = 1 << 1 374 375optimize option: omit optimization of the database files 376 377=item Database::MGCLEAN = 1 << 0 378 379merge option: clean up dispensable regions 380 381=item Database::PDCLEAN = 1 << 0 382 383put_doc option: clean up dispensable regions 384 385=item Database::PDWEIGHT = 1 << 1 386 387put_doc option: weight scores statically when indexing 388 389=item Database::ODCLEAN = 1 << 0 390 391out_doc option: clean up dispensable regions 392 393=item Database::GDNOATTR = 1 << 0 394 395get_doc option: no attributes 396 397=item Database::GDNOTEXT = 1 << 1 398 399get_doc option: no text 400 401=item Database::GDNOKWD = 1 << 2 402 403get_doc option: no keywords 404 405=item $db = new Database() 406 407Create a database object. 408 409=item Database::search_meta(I<dbs>, I<cond>) 410 411Search plural databases for documents corresponding to a condition. `dbs' specifies a reference of an array whose elements are database objects. `cond' specifies a condition object. The return value is a result object. On error, `undef' is returned. 412 413=item $db-E<gt>err_msg(I<ecode>) 414 415Get the string of an error code. `ecode' specifies an error code. The return value is the string of the error code. 416 417=item $db-E<gt>open(I<name>, I<omode>) 418 419Open a database. `name' specifies the name of a database directory. `omode' specifies open modes: `Database::DBWRITER' as a writer, `Database::DBREADER' as a reader. 
If the mode is `Database::DBWRITER', the following may be added by bitwise or: `Database::DBCREAT', which means it creates a new database if not exist, `Database::DBTRUNC', which means it creates a new database regardless if one exists. Both of `Database::DBREADER' and `Database::DBWRITER' can be added to by bitwise or: `Database::DBNOLCK', which means it opens a database file without file locking, or `Database::DBLCKNB', which means locking is performed without blocking. If `Database::DBNOLCK' is used, the application is responsible for exclusion control. `Database::DBCREAT' can be added to by bitwise or: `Database::DBPERFNG', which means N-gram analysis is performed against European text also, `Database::DBCHRCAT', which means character category analysis is performed instead of N-gram analysis, `Database::DBSMALL', which means the index is tuned to register less than 50000 documents, `Database::DBLARGE', which means the index is tuned to register more than 300000 documents, `Database::DBHUGE', which means the index is tuned to register more than 1000000 documents, `Database::DBHUGE2', which means the index is tuned to register more than 5000000 documents, `Database::DBHUGE3', which means the index is tuned to register more than 10000000 documents, `Database::DBSCVOID', which means scores are stored as void, `Database::DBSCINT', which means scores are stored as 32-bit integer, `Database::DBSCASIS', which means scores are stored as-is and marked not to be tuned when search. The return value is true if success, else it is false. 420 421=item $db-E<gt>close() 422 423Close the database. The return value is true if success, else it is false. 424 425=item $db-E<gt>error() 426 427Get the last happened error code. The return value is the last happened error code. 428 429=item $db-E<gt>fatal() 430 431Check whether the database has a fatal error. The return value is true if the database has fatal error, else it is false. 
432 433=item $db-E<gt>add_attr_index(I<name>, I<type>) 434 435Add an index for narrowing or sorting with document attributes. `name' specifies the name of an attribute. `type' specifies the data type of attribute index; `Database::IDXATTRSEQ' for multipurpose sequential access method, `Database::IDXATTRSTR' for narrowing with attributes as strings, `Database::IDXATTRNUM' for narrowing with attributes as numbers. The return value is true if success, else it is false. 436 437=item $db-E<gt>flush(I<max>) 438 439Flush index words in the cache. `max' specifies the maximum number of words to be flushed. If it is not more than zero, all words are flushed. The return value is true if success, else it is false. 440 441=item $db-E<gt>sync() 442 443Synchronize updating contents. The return value is true if success, else it is false. 444 445=item $db-E<gt>optimize(I<options>) 446 447Optimize the database. `options' specifies options: `Database::OPTNOPURGE' to omit purging dispensable region of deleted documents, `Database::OPTNODBOPT' to omit optimization of the database files. The two can be specified at the same time by bitwise or. The return value is true if success, else it is false. 448 449=item $db-E<gt>merge(I<name>, I<options>) 450 451Merge another database. `name' specifies the name of another database directory. `options' specifies options: `Database::MGCLEAN' to clean up dispensable regions of the deleted document. The return value is true if success, else it is false. 452 453=item $db-E<gt>put_doc(I<doc>, I<options>) 454 455Add a document. `doc' specifies a document object. The document object should have the URI attribute. `options' specifies options: `Database::PDCLEAN' to clean up dispensable regions of the overwritten document. The return value is true if success, else it is false. 456 457=item $db-E<gt>out_doc(I<id>, I<options>) 458 459Remove a document. `id' specifies the ID number of a registered document. 
`options' specifies options: `Database::ODCLEAN' to clean up dispensable regions of the deleted document. The return value is true if success, else it is false. 460 461=item $db-E<gt>edit_doc(I<doc>) 462 463Edit attributes of a document. `doc' specifies a document object. The return value is true if success, else it is false. 464 465=item $db-E<gt>get_doc(I<id>, I<options>) 466 467Retrieve a document. `id' specifies the ID number of a registered document. `options' specifies options: `Database::GDNOATTR' to ignore attributes, `Database::GDNOTEXT' to ignore the body text, `Database::GDNOKWD' to ignore keywords. The three can be specified at the same time by bitwise or. The return value is a document object. On error, `undef' is returned. 468 469=item $db-E<gt>get_doc_attr(I<id>, I<name>) 470 471Retrieve the value of an attribute of a document. `id' specifies the ID number of a registered document. `name' specifies the name of an attribute. The return value is the value of the attribute or `undef' if it does not exist. 472 473=item $db-E<gt>uri_to_id(I<uri>) 474 475Get the ID of a document specified by URI. `uri' specifies the URI of a registered document. The return value is the ID of the document. On error, -1 is returned. 476 477=item $db-E<gt>name() 478 479Get the name. The return value is the name of the database. 480 481=item $db-E<gt>doc_num() 482 483Get the number of documents. The return value is the number of documents in the database. 484 485=item $db-E<gt>word_num() 486 487Get the number of unique words. The return value is the number of unique words in the database. 488 489=item $db-E<gt>size() 490 491Get the size. The return value is the size of the database. 492 493=item $db-E<gt>search(I<cond>) 494 495Search for documents corresponding a condition. `cond' specifies a condition object. The return value is a result object. On error, `undef' is returned. 
496 497=item $db-E<gt>scan_doc(I<doc>, I<cond>) 498 499Check whether a document object matches the phrase of a search condition object definitely. `doc' specifies a document object. `cond' specifies a search condition object. The return value is true if the document matches the phrase of the condition object definitely, else it is false. 500 501=item $db-E<gt>set_cache_size(I<size>, I<anum>, I<tnum>, I<rnum>) 502 503Set the maximum size of the cache memory. `size' specifies the maximum size of the index cache. By default, it is 64MB. If it is not more than 0, the current size is not changed. `anum' specifies the maximum number of cached records for document attributes. By default, it is 8192. If it is not more than 0, the current size is not changed. `tnum' specifies the maximum number of cached records for document texts. By default, it is 1024. If it is not more than 0, the current size is not changed. `rnum' specifies the maximum number of cached records for occurrence results. By default, it is 256. If it is not more than 0, the current size is not changed. The return value is always `undef'. 504 505=item $db-E<gt>add_pseudo_index(I<path>) 506 507Add a pseudo index directory. `path' specifies the path of a pseudo index directory. The return value is true if success, else it is false. 508 509=item $db-E<gt>set_wildmax(I<num>) 510 511Set the maximum number of expansion of wild cards. `num' specifies the maximum number of expansion of wild cards. The return value is always `undef'. 512 513=item $db-E<gt>set_informer(I<informer>) 514 515Set the callback function to inform of database events. `informer' specifies the name of an arbitrary function. The function should have one parameter for a string of a message of each event. The return value is always `undef'. 516 517=back 518 519=head1 EXAMPLE 520 521=head2 Gatherer 522 523The following is the simplest implementation of a gatherer. 
524 525 use strict; 526 use warnings; 527 use Estraier; 528 $Estraier::DEBUG = 1; 529 530 # create the database object 531 my $db = new Database(); 532 533 # open the database 534 unless($db->open("casket", Database::DBWRITER | Database::DBCREAT)){ 535 printf("error: %s\n", $db->err_msg($db->error())); 536 exit; 537 } 538 539 # create a document object 540 my $doc = new Document(); 541 542 # add attributes to the document object 543 $doc->add_attr('@uri', "https://estraier.gov/example.txt"); 544 $doc->add_attr('@title', "Over the Rainbow"); 545 546 # add the body text to the document object 547 $doc->add_text("Somewhere over the rainbow. Way up high."); 548 $doc->add_text("There's a land that I heard of once in a lullaby."); 549 550 # register the document object to the database 551 unless($db->put_doc($doc, Database::PDCLEAN)){ 552 printf("error: %s\n", $db->err_msg($db->error())); 553 } 554 555 # close the database 556 unless($db->close()){ 557 printf("error: %s\n", $db->err_msg($db->error())); 558 } 559 560=head2 Searcher 561 562The following is the simplest implementation of a searcher. 
563 564 use strict; 565 use warnings; 566 use Estraier; 567 $Estraier::DEBUG = 1; 568 569 # create the database object 570 my $db = new Database(); 571 572 # open the database 573 unless($db->open("casket", Database::DBREADER)){ 574 printf("error: %s\n", $db->err_msg($db->error())); 575 exit; 576 } 577 578 # create a search condition object 579 my $cond = new Condition(); 580 581 # set the search phrase to the search condition object 582 $cond->set_phrase("rainbow AND lullaby"); 583 584 # get the result of search 585 my $result = $db->search($cond); 586 587 # for each document in the result 588 my $dnum = $result->doc_num(); 589 foreach my $i (0..$dnum-1){ 590 # retrieve the document object 591 my $doc = $db->get_doc($result->get_doc_id($i), 0); 592 next unless(defined($doc)); 593 # display attributes 594 my $uri = $doc->attr('@uri'); 595 printf("URI: %s\n", $uri) if defined($uri); 596 my $title = $doc->attr('@title'); 597 printf("Title: %s\n", $title) if defined($title); 598 # display the body text 599 my $texts = $doc->texts(); 600 foreach my $text (@$texts){ 601 printf("%s\n", $text); 602 } 603 } 604 605 # close the database 606 unless($db->close()){ 607 printf("error: %s\n", $db->err_msg($db->error())); 608 } 609 610=head1 LICENSE 611 612 Copyright (C) 2004-2007 Mikio Hirabayashi 613 All rights reserved. 614 615Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. 
You should have received a copy of the GNU Lesser General Public License along with Hyper Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 616