1#-------------------------------------------------------------------------------------------------
2# Perl binding of Hyper Estraier
3#                                                       Copyright (C) 2004-2007 Mikio Hirabayashi
4#  This file is part of Hyper Estraier.
5#  Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6#  the GNU Lesser General Public License as published by the Free Software Foundation; either
7#  version 2.1 of the License or any later version.  Hyper Estraier is distributed in the hope
8#  that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
10#  License for more details.
11#  You should have received a copy of the GNU Lesser General Public License along with Hyper
12#  Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13#  Boston, MA 02111-1307 USA.
14#-------------------------------------------------------------------------------------------------
15
16
17=head1 NAME
18
19Perl Binding of Hyper Estraier
20
21=head1 SYNOPSIS
22
23  use Estraier;
24
25=head1 INTRODUCTION
26
27Hyper Estraier is a full-text search system for communities.
28
29This is a package implementing the core API of Hyper Estraier ( http://hyperestraier.sourceforge.net/ ), including native codes written in C with XS macros.  As it works on Linux, Mac OS X, Windows, and so on, native libraries for each environment are required to run programs.  This package requires Perl 5.8.8 or later versions.
30
31=head2 Setting
32
33Install the latest version of Hyper Estraier.
34
35Enter the sub directory `perlnative' in the extracted package then perform installation.
36
37  cd perlnative
38  ./configure
39  make
40  su
41  make install
42
43On Linux and other UNIX systems: set the environment variable LD_LIBRARY_PATH to find libraries; "libestraier.so".  On Mac OS X: set the environment variable DYLD_LIBRARY_PATH to find libraries; "libestraier.dylib".  On Windows: set the environment variable PATH to find libraries; "estraier.dll".
44
45The package `Estraier' should be loaded in each source file of application programs.
46
47  use Estraier;
48
49If you want to enable runtime assertion, set the variable `$Estraier::DEBUG' to be true.
50
51  $Estraier::DEBUG = 1;
52
53=head1 DESCRIPTION
54
55=head2 Class Document
56
57=over
58
59=item $doc = new Document(I<draft>)
60
61Create a document object.  `draft' specifies a string of draft data.  If it is omitted, an empty document object is created.
62
63=item $doc-E<gt>add_attr(I<name>, I<value>)
64
65Add an attribute.  `name' specifies the name of an attribute.  `value' specifies the value of the attribute.  If it is `undef', the attribute is removed.  The return value is always `undef'.
66
67=item $doc-E<gt>add_text(I<text>)
68
69Add a sentence of text.  `text' specifies a sentence of text.  The return value is always `undef'.
70
71=item $doc-E<gt>add_hidden_text(I<text>)
72
73Add a hidden sentence.  `text' specifies a hidden sentence.  The return value is always `undef'.
74
75=item $doc-E<gt>set_keywords(I<kwords>)
76
77Attach keywords.  `kwords' specifies the reference of a hash object of keywords.  Keys of the hash should be keywords of the document and values should be their scores in decimal string.  The return value is always `undef'.
78
79=item $doc-E<gt>set_score(I<score>)
80
81Set the substitute score.  `score' specifies the substitute score.  It should be zero or positive.  The return value is always `undef'.
82
83=item $doc-E<gt>id()
84
85Get the ID number.  The return value is the ID number of the document object.  If the object has never been registered, -1 is returned.
86
87=item $doc-E<gt>attr_names()
88
89Get an array of attribute names of a document object.  The return value is a reference of an array object of attribute names.
90
91=item $doc-E<gt>attr(I<name>)
92
93Get the value of an attribute.  `name' specifies the name of an attribute.  The return value is the value of the attribute or `undef' if it does not exist.
94
95=item $doc-E<gt>texts()
96
97Get an array of sentences of the text.  The return value is a reference of an array object of sentences of the text.
98
99=item $doc-E<gt>cat_texts()
100
101Concatenate sentences of the text of a document object.  The return value is concatenated sentences.
102
103=item $doc-E<gt>keywords()
104
105Get attached keywords.  The return value is a reference of a hash object of keywords and their scores in decimal string.  If no keyword is attached, `undef' is returned.
106
107=item $doc-E<gt>score()
108
109Get the substitute score.  The return value is the substitute score or -1 if it is not set.
110
111=item $doc-E<gt>dump_draft()
112
113Dump draft data of a document object.  The return value is draft data.
114
115=item $doc-E<gt>make_snippet(I<words>, I<wwidth>, I<hwidth>, I<awidth>)
116
117Make a snippet of the body text.  `words' specifies a reference of an array object of words to be highlighted.  `wwidth' specifies whole width of the result.  `hwidth' specifies width of strings picked up from the beginning of the text.  `awidth' specifies width of strings picked up around each highlighted word.  The return value is a snippet string of the body text.  The snippet consists of tab separated values.  Each line is a string to be shown.  Though most lines have only one field, some lines have two fields.  If the second field exists, the first field is to be shown highlighted, and the second field means its normalized form.
118
119=back
120
121=head2 Class Condition
122
123=over
124
125=item Condition::SURE = 1 << 0
126
127option: check every N-gram key
128
129=item Condition::USUAL = 1 << 1
130
131option: check N-gram keys skipping by one
132
133=item Condition::FAST = 1 << 2
134
135option: check N-gram keys skipping by two
136
137=item Condition::AGITO = 1 << 3
138
139option: check N-gram keys skipping by three
140
141=item Condition::NOIDF = 1 << 4
142
143option: without TF-IDF tuning
144
145=item Condition::SIMPLE = 1 << 10
146
147option: with the simplified phrase
148
149=item Condition::ROUGH = 1 << 11
150
151option: with the rough phrase
152
153=item Condition::UNION = 1 << 15
154
155option: with the union phrase
156
157=item Condition::ISECT = 1 << 16
158
159option: with the intersection phrase
160
161=item Condition::ECLSIMURL = 10.0
162
163eclipse tuning: consider URL
164
165=item Condition::ECLSERV = 100.0
166
167eclipse tuning: on server basis
168
169=item Condition::ECLDIR = 101.0
170
171eclipse tuning: on directory basis
172
173=item Condition::ECLFILE = 102.0
174
175eclipse tuning: on file basis
176
177=item $cond = new Condition()
178
179Create a search condition object.
180
181=item $cond-E<gt>set_phrase(I<phrase>)
182
183Set the search phrase.  `phrase' specifies a search phrase.  The return value is always `undef'.
184
185=item $cond-E<gt>add_attr(I<expr>)
186
187Add an expression for an attribute.  `expr' specifies an expression for an attribute.  The return value is always `undef'.
188
189=item $cond-E<gt>set_order(I<expr>)
190
191Set the order of a condition object.  `expr' specifies an expression for the order.  By default, the order is by score descending.  The return value is always `undef'.
192
193=item $cond-E<gt>set_max(I<max>)
194
195Set the maximum number of retrieval.  `max' specifies the maximum number of retrieval.  By default, the number of retrieval is not limited.
196
197=item $cond-E<gt>set_skip(I<skip>)
198
199Set the number of skipped documents.  `skip' specifies the number of documents to be skipped in the search result.  The return value is always `undef'.
200
201=item $cond-E<gt>set_options(I<options>)
202
203Set options of retrieval.  `options' specifies options: `Condition::SURE' specifies that it checks every N-gram key, `Condition::USUAL', which is the default, specifies that it checks N-gram keys with skipping one key, `Condition::FAST' skips two keys, `Condition::AGITO' skips three keys, `Condition::NOIDF' specifies not to perform TF-IDF tuning, `Condition::SIMPLE' specifies to use simplified phrase, `Condition::ROUGH' specifies to use rough phrase, `Condition::UNION' specifies to use union phrase, `Condition::ISECT' specifies to use intersection phrase.  Each option can be specified at the same time by bitwise or.  If keys are skipped, though search speed is improved, the relevance ratio grows less.  The return value is always `undef'.
204
205=item $cond-E<gt>set_auxiliary(I<min>)
206
207Set permission to adopt result of the auxiliary index.  `min' specifies the minimum hits to adopt result of the auxiliary index.  If it is not more than 0, the auxiliary index is not used.  By default, it is 32.
208
209=item $cond-E<gt>set_eclipse(I<limit>)
210
211Set the lower limit of similarity eclipse.  `limit' specifies the lower limit of similarity for documents to be eclipsed.  Similarity is between 0.0 and 1.0.  If the limit is added by `Condition::ECLSIMURL', similarity is weighted by URL.  If the limit is `Condition::ECLSERV', similarity is ignored and documents in the same server are eclipsed.  If the limit is `Condition::ECLDIR', similarity is ignored and documents in the same directory are eclipsed.  If the limit is `Condition::ECLFILE', similarity is ignored and documents of the same file are eclipsed.
212
213=item $cond-E<gt>set_distinct(I<name>)
214
215Set the attribute distinction filter.  `name' specifies the name of an attribute to be distinct.  The return value is always `undef'.
216
217=back
218
219=head2 Class Result
220
221=over
222
223=item $result-E<gt>doc_num()
224
225Get the number of documents.  The return value is the number of documents in the result.
226
227=item $result-E<gt>get_doc_id(I<index>)
228
229Get the ID number of a document.  `index' specifies the index of a document.  The return value is the ID number of the document or -1 if the index is out of bounds.
230
231=item $result-E<gt>get_dbidx(I<index>)
232
233Get the index of the container database of a document.  `index' specifies the index of a document.  The return value is the index of the container database of the document or -1 if the index is out of bounds.
234
235=item $result-E<gt>hint_words()
236
237Get an array of hint words.  The return value is a reference of an array of hint words.
238
239=item $result-E<gt>hint(I<word>)
240
241Get the value of a hint word.  `word' specifies a hint word.  An empty string means the number of whole result.  The return value is the number of documents corresponding to the hint word.  If the word is in a negative condition, the value is negative.
242
243=item $result-E<gt>get_score(I<index>)
244
245Get the score of a document.  `index' specifies the index of a document.  The return value is the score of the document or -1 if the index is out of bounds.
246
247=item $result-E<gt>get_shadows(I<id>)
248
249Get an array of ID numbers of eclipsed documents of a document.  `id' specifies the ID number of a parent document.  The return value is a reference of an array whose elements express the ID numbers and their scores alternately.
250
251=back
252
253=head2 Class Database
254
255=over
256
257=item Database::VERSION = "0.0.0"
258
259version of Hyper Estraier
260
261=item Database::ERRNOERR = 0
262
263error code: no error
264
265=item Database::ERRINVAL = 1
266
267error code: invalid argument
268
269=item Database::ERRACCES = 2
270
271error code: access forbidden
272
273=item Database::ERRLOCK = 3
274
275error code: lock failure
276
277=item Database::ERRDB = 4
278
279error code: database problem
280
281=item Database::ERRIO = 5
282
283error code: I/O problem
284
285=item Database::ERRNOITEM = 6
286
287error code: no item
288
289=item Database::ERRMISC = 9999
290
291error code: miscellaneous
292
293=item Database::DBREADER = 1 << 0
294
295open mode: open as a reader
296
297=item Database::DBWRITER = 1 << 1
298
299open mode: open as a writer
300
301=item Database::DBCREAT = 1 << 2
302
303open mode: a writer creating
304
305=item Database::DBTRUNC = 1 << 3
306
307open mode: a writer truncating
308
309=item Database::DBNOLCK = 1 << 4
310
311open mode: open without locking
312
313=item Database::DBLCKNB = 1 << 5
314
315open mode: lock without blocking
316
317=item Database::DBPERFNG = 1 << 10
318
319open mode: use perfect N-gram analyzer
320
321=item Database::DBCHRCAT = 1 << 11
322
323open mode: use character category analyzer
324
325=item Database::DBSMALL = 1 << 20
326
327open mode: small tuning
328
329=item Database::DBLARGE = 1 << 21
330
331open mode: large tuning
332
333=item Database::DBHUGE = 1 << 22
334
335open mode: huge tuning
336
337=item Database::DBHUGE2 = 1 << 23
338
339open mode: huge tuning second
340
341=item Database::DBHUGE3 = 1 << 24
342
343open mode: huge tuning third
344
345=item Database::DBSCVOID = 1 << 25
346
347open mode: store scores as void
348
349=item Database::DBSCINT = 1 << 26
350
351open mode: store scores as integer
352
353=item Database::DBSCASIS = 1 << 27
354
355open mode: refrain from adjustment of scores
356
357=item Database::IDXATTRSEQ = 0
358
359attribute index type: for multipurpose sequential access method
360
361=item Database::IDXATTRSTR = 1
362
363attribute index type: for narrowing with attributes as strings
364
365=item Database::IDXATTRNUM = 2
366
367attribute index type: for narrowing with attributes as numbers
368
369=item Database::OPTNOPURGE = 1 << 0
370
371optimize option: omit purging dispensable region of deleted documents
372
373=item Database::OPTNODBOPT = 1 << 1
374
375optimize option: omit optimization of the database files
376
377=item Database::MGCLEAN = 1 << 0
378
379merge option: clean up dispensable regions
380
381=item Database::PDCLEAN = 1 << 0
382
383put_doc option: clean up dispensable regions
384
385=item Database::PDWEIGHT = 1 << 1
386
387put_doc option: weight scores statically when indexing
388
389=item Database::ODCLEAN = 1 << 0
390
391out_doc option: clean up dispensable regions
392
393=item Database::GDNOATTR = 1 << 0
394
395get_doc option: no attributes
396
397=item Database::GDNOTEXT = 1 << 1
398
399get_doc option: no text
400
401=item Database::GDNOKWD = 1 << 2
402
403get_doc option: no keywords
404
405=item $db = new Database()
406
407Create a database object.
408
409=item Database::search_meta(dbs, cond)
410
411Search plural databases for documents corresponding to a condition.  `dbs' specifies a reference of an array whose elements are database objects.  `cond' specifies a condition object.  The return value is a result object.  On error, `undef' is returned.
412
413=item $db-E<gt>err_msg(I<ecode>)
414
415Get the string of an error code.  `ecode' specifies an error code.  The return value is the string of the error code.
416
417=item $db-E<gt>open(I<name>, I<omode>)
418
419Open a database.  `name' specifies the name of a database directory.  `omode' specifies open modes: `Database::DBWRITER' as a writer, `Database::DBREADER' as a reader.  If the mode is `Database::DBWRITER', the following may be added by bitwise or: `Database::DBCREAT', which means it creates a new database if not exist, `Database::DBTRUNC', which means it creates a new database regardless if one exists.  Both of `Database::DBREADER' and `Database::DBWRITER' can be added to by bitwise or: `Database::DBNOLCK', which means it opens a database file without file locking, or `Database::DBLCKNB', which means locking is performed without blocking.  If `Database::DBNOLCK' is used, the application is responsible for exclusion control.  `Database::DBCREAT' can be added to by bitwise or: `Database::DBPERFNG', which means N-gram analysis is performed against European text also, `Database::DBCHRCAT', which means character category analysis is performed instead of N-gram analysis, `Database::DBSMALL', which means the index is tuned to register less than 50000 documents, `Database::DBLARGE', which means the index is tuned to register more than 300000 documents, `Database::DBHUGE', which means the index is tuned to register more than 1000000 documents, `Database::DBHUGE2', which means the index is tuned to register more than 5000000 documents, `Database::DBHUGE3', which means the index is tuned to register more than 10000000 documents, `Database::DBSCVOID', which means scores are stored as void, `Database::DBSCINT', which means scores are stored as 32-bit integer, `Database::DBSCASIS', which means scores are stored as-is and marked not to be tuned when search.  The return value is true if success, else it is false.
420
421=item $db-E<gt>close()
422
423Close the database.  The return value is true if success, else it is false.
424
425=item $db-E<gt>error()
426
427Get the last happened error code.  The return value is the last happened error code.
428
429=item $db-E<gt>fatal()
430
431Check whether the database has a fatal error.  The return value is true if the database has a fatal error, else it is false.
432
433=item $db-E<gt>add_attr_index(I<name>, I<type>)
434
435Add an index for narrowing or sorting with document attributes.  `name' specifies the name of an attribute.  `type' specifies the data type of attribute index; `Database::IDXATTRSEQ' for multipurpose sequential access method, `Database::IDXATTRSTR' for narrowing with attributes as strings, `Database::IDXATTRNUM' for narrowing with attributes as numbers.  The return value is true if success, else it is false.
436
437=item $db-E<gt>flush(I<max>)
438
439Flush index words in the cache.  `max' specifies the maximum number of words to be flushed.  If it is not more than zero, all words are flushed.  The return value is true if success, else it is false.
440
441=item $db-E<gt>sync()
442
443Synchronize updating contents.  The return value is true if success, else it is false.
444
445=item $db-E<gt>optimize(I<options>)
446
447Optimize the database.  `options' specifies options: `Database::OPTNOPURGE' to omit purging dispensable region of deleted documents, `Database::OPTNODBOPT' to omit optimization of the database files.  The two can be specified at the same time by bitwise or.  The return value is true if success, else it is false.
448
449=item $db-E<gt>merge(I<name>, I<options>)
450
451Merge another database.  `name' specifies the name of another database directory.  `options' specifies options: `Database::MGCLEAN' to clean up dispensable regions of the deleted document.  The return value is true if success, else it is false.
452
453=item $db-E<gt>put_doc(I<doc>, I<options>)
454
455Add a document.  `doc' specifies a document object.  The document object should have the URI attribute.  `options' specifies options: `Database::PDCLEAN' to clean up dispensable regions of the overwritten document, `Database::PDWEIGHT' to weight scores statically when indexing.  The two can be specified at the same time by bitwise or.  The return value is true if success, else it is false.
456
457=item $db-E<gt>out_doc(I<id>, I<options>)
458
459Remove a document.  `id' specifies the ID number of a registered document.  `options' specifies options: `Database::ODCLEAN' to clean up dispensable regions of the deleted document.  The return value is true if success, else it is false.
460
461=item $db-E<gt>edit_doc(I<doc>)
462
463Edit attributes of a document.  `doc' specifies a document object.  The return value is true if success, else it is false.
464
465=item $db-E<gt>get_doc(I<id>, I<options>)
466
467Retrieve a document.  `id' specifies the ID number of a registered document.  `options' specifies options: `Database::GDNOATTR' to ignore attributes, `Database::GDNOTEXT' to ignore the body text, `Database::GDNOKWD' to ignore keywords.  The three can be specified at the same time by bitwise or.  The return value is a document object.  On error, `undef' is returned.
468
469=item $db-E<gt>get_doc_attr(I<id>, I<name>)
470
471Retrieve the value of an attribute of a document.  `id' specifies the ID number of a registered document.  `name' specifies the name of an attribute.  The return value is the value of the attribute or `undef' if it does not exist.
472
473=item $db-E<gt>uri_to_id(I<uri>)
474
475Get the ID of a document specified by URI.  `uri' specifies the URI of a registered document.  The return value is the ID of the document.  On error, -1 is returned.
476
477=item $db-E<gt>name()
478
479Get the name.  The return value is the name of the database.
480
481=item $db-E<gt>doc_num()
482
483Get the number of documents.  The return value is the number of documents in the database.
484
485=item $db-E<gt>word_num()
486
487Get the number of unique words.  The return value is the number of unique words in the database.
488
489=item $db-E<gt>size()
490
491Get the size.  The return value is the size of the database.
492
493=item $db-E<gt>search(I<cond>)
494
495Search for documents corresponding to a condition.  `cond' specifies a condition object.  The return value is a result object.  On error, `undef' is returned.
496
497=item $db-E<gt>scan_doc(I<doc>, I<cond>)
498
499Check whether a document object matches the phrase of a search condition object definitely.  `doc' specifies a document object.  `cond' specifies a search condition object.  The return value is true if the document matches the phrase of the condition object definitely, else it is false.
500
501=item $db-E<gt>set_cache_size(I<size>, I<anum>, I<tnum>, I<rnum>)
502
503Set the maximum size of the cache memory.  `size' specifies the maximum size of the index cache.  By default, it is 64MB.  If it is not more than 0, the current size is not changed.  `anum' specifies the maximum number of cached records for document attributes.  By default, it is 8192.  If it is not more than 0, the current size is not changed.  `tnum' specifies the maximum number of cached records for document texts.  By default, it is 1024.  If it is not more than 0, the current size is not changed.  `rnum' specifies the maximum number of cached records for occurrence results.  By default, it is 256.  If it is not more than 0, the current size is not changed.  The return value is always `undef'.
504
505=item $db-E<gt>add_pseudo_index(I<path>)
506
507Add a pseudo index directory.  `path' specifies the path of a pseudo index directory.  The return value is true if success, else it is false.
508
509=item $db-E<gt>set_wildmax(I<num>)
510
511Set the maximum number of expansion of wild cards.  `num' specifies the maximum number of expansion of wild cards.  The return value is always `undef'.
512
513=item $db-E<gt>set_informer(I<informer>)
514
515Set the callback function to inform of database events.  `informer' specifies the name of an arbitrary function.  The function should have one parameter for a string of a message of each event.  The return value is always `undef'.
516
517=back
518
519=head1 EXAMPLE
520
521=head2 Gatherer
522
523The following is the simplest implementation of a gatherer.
524
525  use strict;
526  use warnings;
527  use Estraier;
528  $Estraier::DEBUG = 1;
529
530  # create the database object
531  my $db = new Database();
532
533  # open the database
534  unless($db->open("casket", Database::DBWRITER | Database::DBCREAT)){
535      printf("error: %s\n", $db->err_msg($db->error()));
536      exit;
537  }
538
539  # create a document object
540  my $doc = new Document();
541
542  # add attributes to the document object
543  $doc->add_attr('@uri', "https://estraier.gov/example.txt");
544  $doc->add_attr('@title', "Over the Rainbow");
545
546  # add the body text to the document object
547  $doc->add_text("Somewhere over the rainbow.  Way up high.");
548  $doc->add_text("There's a land that I heard of once in a lullaby.");
549
550  # register the document object to the database
551  unless($db->put_doc($doc, Database::PDCLEAN)){
552      printf("error: %s\n", $db->err_msg($db->error()));
553  }
554
555  # close the database
556  unless($db->close()){
557      printf("error: %s\n", $db->err_msg($db->error()));
558  }
559
560=head2 Searcher
561
562The following is the simplest implementation of a searcher.
563
564  use strict;
565  use warnings;
566  use Estraier;
567  $Estraier::DEBUG = 1;
568
569  # create the database object
570  my $db = new Database();
571
572  # open the database
573  unless($db->open("casket", Database::DBREADER)){
574      printf("error: %s\n", $db->err_msg($db->error()));
575      exit;
576  }
577
578  # create a search condition object
579  my $cond = new Condition();
580
581  # set the search phrase to the search condition object
582  $cond->set_phrase("rainbow AND lullaby");
583
584  # get the result of search
585  my $result = $db->search($cond);
586
587  # for each document in the result
588  my $dnum = $result->doc_num();
589  foreach my $i (0..$dnum-1){
590      # retrieve the document object
591      my $doc = $db->get_doc($result->get_doc_id($i), 0);
592      next unless(defined($doc));
593      # display attributes
594      my $uri = $doc->attr('@uri');
595      printf("URI: %s\n", $uri) if defined($uri);
596      my $title = $doc->attr('@title');
597      printf("Title: %s\n", $title) if defined($title);
598      # display the body text
599      my $texts = $doc->texts();
600      foreach my $text (@$texts){
601          printf("%s\n", $text);
602      }
603  }
604
605  # close the database
606  unless($db->close()){
607      printf("error: %s\n", $db->err_msg($db->error()));
608  }
609
610=head1 LICENSE
611
612 Copyright (C) 2004-2007 Mikio Hirabayashi
613 All rights reserved.
614
615Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License or any later version.  Hyper Estraier is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more details.  You should have received a copy of the GNU Lesser General Public License along with Hyper Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
616