1 //----------------------------------------------------------------
2 //
3 // libhtdig_api.h
4 //
5 // Header function for htdig shared library API
6 //
7 // 1/25/2002 created
8 //
9 // Neal Richter nealr@rightnow.com
10 //
11 // Part of the ht://Dig package   <http://www.htdig.org/>
12 // Copyright (c) 1995-2004 The ht://Dig Group
13 // For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
15 // <http://www.gnu.org/copyleft/lgpl.html>
16 //
17 // $Id: libhtdig_api.h,v 1.4 2004/05/28 13:15:29 lha Exp $
18 //
19 //----------------------------------------------------------------
20 
21 #ifndef LIBHTDIG_API_H
22 #define LIBHTDIG_API_H
23 
24 #include <time.h>
25 
/* Fallback boolean constants; a definition from an earlier header wins. */
#if !defined(TRUE)
#  define TRUE 1
#endif

#if !defined(FALSE)
#  define FALSE 0
#endif
33 
34 
/*
 * Fixed lengths for the char arrays declared in the API structs of this
 * header.
 */
#define HTDIG_MAX_FILENAME_PATH_L       1024
#define HTDIG_DOCUMENT_ID_L             32
#define HTDIG_DOCUMENT_TITLE_L          256
#define HTDIG_DOCUMENT_META_L           4096
#define HTDIG_DOCUMENT_CONTENT_TYPE_L   32

/* Keep this larger than the config value 'excerpt_length'. */
#define HTDIG_DOCUMENT_EXCERPT_L        1024

/*
 * Default failsafe size of the 'excerpt' document.
 * Keep this larger than the config value 'max_head_length'.
 */
#define HTDIG_DEFAULT_EXCERPT_SIZE      524288

/* Should be the same as the default value in HTDIG. */
#define HTDIG_MAX_QUERY_L               256

/* Custom MIME type string for text documents. */
#define HTDIG_CUSTOM_TEXT_MIME_TYPE     "text/vnd.customdocument"
52 
/*
 * htfuzzy algorithm selectors (listed in ascending bit order).
 *
 * Every numeric constant occupies its own bit, so several can be OR-ed
 * together into htfuzzy_parameters_struct.algorithms_flag.  The matching
 * *_STR macro holds the algorithm's name.
 */
#define HTDIG_ALG_SOUNDEX           0x00000001
#define HTDIG_ALG_SOUNDEX_STR       "soundex"

#define HTDIG_ALG_METAPHONE         0x00000010
#define HTDIG_ALG_METAPHONE_STR     "metaphone"

#define HTDIG_ALG_ACCENTS           0x00000100
#define HTDIG_ALG_ACCENTS_STR       "accents"

#define HTDIG_ALG_ENDINGS           0x00001000
#define HTDIG_ALG_ENDINGS_STR       "endings"

#define HTDIG_ALG_SYNONYMS          0x00010000
#define HTDIG_ALG_SYNONYMS_STR      "synonyms"
68 
69 
/*
 * Search method selectors for htsearch_query_struct.algorithms_flag
 * (listed in ascending bit order).  Each *_STR macro is the method's name.
 */
#define HTSEARCH_ALG_BOOLEAN            0x00000001
#define HTSEARCH_ALG_BOOLEAN_STR        "boolean"

#define HTSEARCH_ALG_OR                 0x00000010
#define HTSEARCH_ALG_OR_STR             "or"

#define HTSEARCH_ALG_AND                0x00000100
#define HTSEARCH_ALG_AND_STR            "and"


/* Result format selectors for htsearch_query_struct.format. */
#define HTSEARCH_FORMAT_LONG            0x00000001
#define HTSEARCH_FORMAT_LONG_STR        "long"

#define HTSEARCH_FORMAT_SHORT           0x00000010
#define HTSEARCH_FORMAT_SHORT_STR       "short"


/* Sort order selectors for htsearch_query_struct.sortby_flag. */
#define HTSEARCH_SORT_SCORE             0x00000001
#define HTSEARCH_SORT_SCORE_STR         "score"

#define HTSEARCH_SORT_REV_SCORE         0x00000010
#define HTSEARCH_SORT_REV_SCORE_STR     "reverse score"

#define HTSEARCH_SORT_TIME              0x00000100
#define HTSEARCH_SORT_TIME_STR          "time"

#define HTSEARCH_SORT_REV_TIME          0x00001000
#define HTSEARCH_SORT_REV_TIME_STR      "reverse time"

#define HTSEARCH_SORT_TITLE             0x00010000
#define HTSEARCH_SORT_TITLE_STR         "title"

#define HTSEARCH_SORT_REV_TITLE         0x00100000
#define HTSEARCH_SORT_REV_TITLE_STR     "reverse title"
105 
106 
107 
/*
 * Error codes, grouped by component in blocks of one hundred:
 *   -1xx  HTDIG_ERROR_*     (indexing; -109..-118 are the TESTURL codes)
 *   -2xx  HTSEARCH_ERROR_*  (searching)
 *   -3xx  HTMERGE_ERROR_*   (merging)
 *
 * Every replacement list is parenthesized so the leading unary minus cannot
 * combine with neighbouring tokens at the point of expansion.
 */
#define  HTDIG_ERROR_CONFIG_READ               (-101)
#define  HTDIG_ERROR_URL_PART                  (-102)
#define  HTDIG_ERROR_URL_REWRITE               (-103)
#define  HTDIG_ERROR_URL_CREATE_FILE           (-104)
#define  HTDIG_ERROR_IMAGE_CREATE_FILE         (-105)
#define  HTDIG_ERROR_OPEN_CREATE_DOCDB         (-106)
#define  HTDIG_ERROR_LOGFILE_OPEN              (-107)
#define  HTDIG_ERROR_LOGFILE_CLOSE             (-108)

#define  HTDIG_ERROR_TESTURL_EXCLUDE           (-109)
#define  HTDIG_ERROR_TESTURL_BADQUERY          (-110)
#define  HTDIG_ERROR_TESTURL_EXTENSION         (-111)
#define  HTDIG_ERROR_TESTURL_EXTENSION2        (-112)
#define  HTDIG_ERROR_TESTURL_LIMITS            (-113)
#define  HTDIG_ERROR_TESTURL_LIMITSNORM        (-114)
#define  HTDIG_ERROR_TESTURL_SRCH_RESTRICT     (-115)
#define  HTDIG_ERROR_TESTURL_SRCH_EXCLUDE      (-116)
#define  HTDIG_ERROR_TESTURL_REWRITE_EMPTY     (-117)
#define  HTDIG_ERROR_TESTURL_ROBOT_FORBID      (-118)

#define  HTSEARCH_ERROR_NO_MATCH               (-201)
#define  HTSEARCH_ERROR_BAD_MATCH_INDEX        (-202)
#define  HTSEARCH_ERROR_BAD_DOCUMENT           (-203)
#define  HTSEARCH_ERROR_TEMPLATE_ERROR         (-204)
#define  HTSEARCH_ERROR_LOGFILE_OPEN           (-205)
#define  HTSEARCH_ERROR_LOGFILE_CLOSE          (-206)
#define  HTSEARCH_ERROR_CONFIG_READ            (-207)
#define  HTSEARCH_ERROR_URL_PART               (-208)
#define  HTSEARCH_ERROR_WORDDB_READ            (-209)
#define  HTSEARCH_ERROR_DOCINDEX_READ          (-210)
#define  HTSEARCH_ERROR_DOCDB_READ             (-211)
#define  HTSEARCH_ERROR_EXCERPTDB_READ         (-212)

#define  HTMERGE_ERROR_LOGFILE_OPEN            (-301)
#define  HTMERGE_ERROR_LOGFILE_CLOSE           (-302)
#define  HTMERGE_ERROR_CONFIG_READ             (-303)
#define  HTMERGE_ERROR_URL_PART                (-304)
#define  HTMERGE_ERROR_WORDDB_READ             (-305)
#define  HTMERGE_ERROR_DOCINDEX_READ           (-306)
#define  HTMERGE_ERROR_DOCDB_READ              (-307)
#define  HTMERGE_ERROR_EXCERPTDB_READ          (-308)
149 
/*
 * Parameter-name strings.
 * NOTE(review): the PHP_ prefix suggests these are lookup keys used by a PHP
 * binding; no consumer is visible in this header -- confirm against callers.
 */
#define PHP_HTDIG_CONFIGFILE_PARM       "configFile"
#define PHP_HTDIG_URL_PARM              "URL"
#define PHP_HTDIG_LIMITTO_PARM          "limit_urls_to"
#define PHP_HTDIG_LIMITN_PARM           "limit_normalized"
#define PHP_HTDIG_EXCLUDEURLS_PARM      "exclude_urls"
#define PHP_HTDIG_SEARCHRESTRICT_PARM   "search_restrict"
#define PHP_HTDIG_SEARCHEXCLUDE_PARM    "search_exclude"
/*
 * NOTE(review): "max_hop_cont" looks like a typo for "max_hop_count" (the
 * field name in htdig_parameters_struct).  It is left untouched because
 * existing callers may already depend on this exact key.
 */
#define PHP_HTDIG_MAXHOPCOUNT_PARM      "max_hop_cont"
#define PHP_HTDIG_URLREWRITE_PARM       "url_rewrite_rules"
#define PHP_HTDIG_BAD_QUERYSTR_PARM     "bad_querystr"
160 
161 //=============================================================================
162 //===== HTDIG INDEXING API ====================================================
163 
164 
165 /***************************************************
166  * HTDIG_DOCUMENTATION for htdig_parameters_struct
167  *
168  *    DEBUGGING PARAMETERS
169  *
170  *    int debug
171  *        Verbose mode.  This increases the verbosity of the
172  *         program.  Using more than 2 is probably only useful
173  *         for debugging purposes.  The default verbose mode
174  *         gives a nice progress report while digging.
175  *
176  *    char logFile
177  *         File to stream debugging & error messages to!
178  *
179  *    BOOLEAN PARAMETERS
180  *
181  *    int initial
182  *         Initial.  Do not use any old databases.  This is
183  *        accomplished by first erasing the databases
184  *
185  *    int create_text_database
186  *         Create an ASCII version of the document database.
187  *        This database is easy to parse with other programs so
188  *         that information can be extracted from it.
189  *
190  *    int report_statistics
191  *         Report statistics after completion.
192  *
193  *    int alt_work_area
194  *         Use alternate work files.
195  *        Tells htdig to append .work to database files, causing
196  *        a second copy of the database to be built.  This allows
197  *        the original files to be used by htsearch during the
198  *        indexing run.
199  *
200  *
201  *    STRING PARAMETERS
202  *
203  *    char configFile
204  *         configfile
205  *         Use the specified configuration file instead of the
206  *         default.
207  *
208  *    char credentials
209  *        username:password
210  *        Tells htdig to send the supplied username and
211  *        password with each HTTP request.  The credentials
212  *        will be encoded using the 'Basic' authentication scheme.
213  *        There *HAS* to be a colon (:) between the username
214  *        and password.
215  *
216  *
 *    char max_hops    //9 digit limit
218  *         hopcount
219  *         Limit the stored documents to those which are at
220  *         most hopcount links away from the start URL.
221  *
222  *    char minimalFile
223  *
224  *    char URL
225  *         'command-line' URLs from stdin
226  *         fetches & indexes these URLs
227  *
228  ******************************************************************/
229 
230 typedef struct htdig_parameters_struct {
231 
232   char configFile[HTDIG_MAX_FILENAME_PATH_L];
233   char DBpath[HTDIG_MAX_FILENAME_PATH_L];
234   char credentials[HTDIG_MAX_FILENAME_PATH_L];
235   char max_hops[10];    //9 digit limit
236   char minimalFile[HTDIG_MAX_FILENAME_PATH_L];
237 
238   //debugging & logfile
239   char logFile[HTDIG_MAX_FILENAME_PATH_L];   //location of log file
240   int debug;            //0, 1 ,2, 3, 4, 5
241 
242   //booelan values
243   int initial;
244   int create_text_database;
245   int report_statistics;
246   int alt_work_area;
247   int use_cookies;
248 
249   //spidering filters
250   char URL[HTDIG_MAX_FILENAME_PATH_L];
251   char limit_urls_to[HTDIG_MAX_FILENAME_PATH_L];
252   char limit_normalized[HTDIG_MAX_FILENAME_PATH_L];
253   char exclude_urls[HTDIG_MAX_FILENAME_PATH_L];
254   char search_restrict[HTDIG_MAX_FILENAME_PATH_L];
255   char search_exclude[HTDIG_MAX_FILENAME_PATH_L];
256   char url_rewrite_rules[HTDIG_MAX_FILENAME_PATH_L];
257   char bad_querystr[HTDIG_MAX_FILENAME_PATH_L];
258   char locale[16];
259   char title_factor[16];
260   char text_factor[16];
261   char meta_description_factor[16];
262   int  max_hop_count;
263 
264   //the rewritten URL - OUTGOING after htdig_index_test_url
265   char rewritten_URL[HTDIG_MAX_FILENAME_PATH_L];
266 
267 } htdig_parameters_struct;
268 
269 /*****************************************************************
270  *  HTDIG_DOCUMENTATION for htdig_simple_doc_struct
271  *
272  *   STRING PARAMETERS
273  *
274  *    char location
 *          the 'URL' of the document.  Can be any useful string.
276  *
277  *    char documentid
278  *          document id of document  [NOT CURRENTLY USED - IGNORED]
279  *
280  *    char title
281  *          document title
282  *
283  *    char meta
 *          content that is indexed but won't appear in any search excerpts
285  *
286  *    char * contents
 *          pointer to a NULL TERMINATED string of information to be
 *          indexed.
289  *
290  *    char content_type
291  *          a MIME-like string
292  *          custom MIME-type defined above, others are supported by
293  *          htdig as well.
294  *
295  *
296  *****************************************************************/
297 
298 typedef struct htdig_simple_doc_struct {
299 
300     char location[HTDIG_MAX_FILENAME_PATH_L];
301     char documentid[HTDIG_DOCUMENT_ID_L];
302     char title[HTDIG_DOCUMENT_TITLE_L];
303     char meta[HTDIG_DOCUMENT_META_L];
304     char *contents;                               //MUST ALLOCATE & FREE!!!
305     char content_type[HTDIG_DOCUMENT_CONTENT_TYPE_L];   //MIME-ISH string
306     //struct tm time_tm;                      // use to override index time
307     time_t doc_time;
308 
309 } htdig_simple_doc_struct;
310 
311 
312 int htdig_index_open(htdig_parameters_struct *);
313 int htdig_index_simple_doc(htdig_simple_doc_struct * );
314 int htdig_index_urls(void);
315 int htdig_index_reset(void);
316 int htdig_index_close(void);
317 
318 int htdig_index_test_url(htdig_parameters_struct *htparms);
319 
320 int htdig_get_max_head_length(void);
321 
322 
323 
324 
325 //=============================================================================
326 //===== HTDIG MERGING API =====================================================
327 
328 /**************************************************
329  * HTDIG_DOCUMENTATION for  htmerge_parameters_struct
330  *
331  *   DEBUGGING PARAMETERS
332  *
333  *   int debug
334  *       Verbose mode.  This increases the verbosity of the
335  *       program.  Using more than 2 is probably only useful
336  *       for debugging purposes.  The default verbose mode
337  *       gives a progress on what it is doing and where it is.
338  *
339  *   char logFile
340  *        File to stream debugging & error messages to!
341  *
342  *
343  *   BOOLEAN PARAMETERS
344  *
345  *   int alt_work_area
346  *       Use alternate work files.
347  *       Tells htmerge to append .work to database files causing
348  *       a second copy of the database to be built.  This allows
349  *       original files to be used by htsearch during the indexing run.
350  *
351  *
352  *   STRING PARAMETERS
353  *
354  *   char configFile
355  *       configfile
356  *       Use the specified configuration file instead of the default.
357  *
358  *   char merge_configFile
359  *       merge_configfile
360  *       Merge the databases specified into the databases specified
361  *       by -c or the default.
362  *
363  *
364  *************************************************/
365 
366 typedef struct htmerge_parameters_struct {
367 
368   char configFile[HTDIG_MAX_FILENAME_PATH_L];
369   char merge_configFile[HTDIG_MAX_FILENAME_PATH_L];
370 
371   //debugging & logfile
372   char logFile[HTDIG_MAX_FILENAME_PATH_L];   //location of log file
373   int debug;            //0, 1 ,2, 3, 4, 5
374 
375   //booelan values
376   int alt_work_area;
377 
378 } htmerge_parameters_struct;
379 
380 int htmerge_index_merge(htmerge_parameters_struct *);
381 
382 
383 
384 
385 
386 //=============================================================================
387 //===== HTDIG HTFUZZY API =====================================================
388 
389 
390 
391 /**************************************************
392  *   HTDIG_DOCUMENTATION for  htfuzzy_parameters_struct
393  *
394  *    DEBUGGING PARAMETERS
395  *
396  *    int debug
397  *        Verbose mode.  This increases the verbosity of the
398  *         program.  Using more than 2 is probably only useful
399  *         for debugging purposes.
400  *
401  *    char logFile
402  *         File to stream debugging & error messages to!
403  *
404  *
405  *    PARAMETERS
406  *
407  *    char configFile
408  *        configfile
409  *        Use the specified configuration file instead of the default.
410  *
411  *    int algorithms_flag
412  *        Bitwise Flags to signal algorithms to be used
413  *
414  *     soundex    == HTDIG_ALG_SOUNDEX
415  *     metaphone  == HTDIG_ALG_METAPHONE
416  *     accents    == HTDIG_ALG_ACCENTS
417  *     endings    == HTDIG_ALG_ENDINGS
418  *     synonyms   == HTDIG_ALG_SYNONYMS
419  *
420  ***************************************************/
421 
422 
423 typedef struct htfuzzy_parameters_struct {
424 
425   char configFile[HTDIG_MAX_FILENAME_PATH_L];
426   int  algorithms_flag;
427 
428   //debugging & logfile
429   char logFile[HTDIG_MAX_FILENAME_PATH_L];   //location of log file
430   int debug;            //0, 1 ,2, 3, 4, 5
431 
432   //booelan values
433 
434 } htfuzzy_parameters_struct;
435 
436 
437 // htfuzzy functions
438 int htfuzzy_index(htfuzzy_parameters_struct *);
439 
440 
441 
442 
443 //==============================================================================
444 //===== HTDIG SEARCHING API ====================================================
445 
446 /************************************************
447  *  HTDIG_DOCUMENTATION for htsearch_parameters_struct
448  *
449  *   DEBUGGING PARAMETERS
450  *
451  *   int debug
 *       Verbose mode.  This increases the verbosity of the
 *       program.  Using more than 2 is probably only useful
 *       for debugging purposes.  The default verbose mode
 *       gives a progress on what it is doing and where it is.
456  *
457  *   char logFile
458  *        File to stream debugging & error messages to!
459  *
460  *   STRING PARAMETERS
461  *
462  *   char configFile
463  *       configfile
464  *       Use the specified configuration file instead of the default.
465  *
466  *
467  **************************************************/
468 
469 typedef struct htsearch_parameters_struct {
470 
471   char configFile[HTDIG_MAX_FILENAME_PATH_L];
472   char DBpath[HTDIG_MAX_FILENAME_PATH_L];
473   char locale[16];
474 
475   //debugging & logfile
476   char logFile[HTDIG_MAX_FILENAME_PATH_L];   //location of log file
477   int debug;            //0, 1 ,2, 3, 4, 5
478 
479   //filters
480   char search_restrict[HTDIG_MAX_FILENAME_PATH_L];
481   char search_exclude[HTDIG_MAX_FILENAME_PATH_L];
482   char title_factor[16];
483   char text_factor[16];
484   char meta_description_factor[16];
485 
486 } htsearch_parameters_struct;
487 
488 
489 
490 
491 /*****************************************************************
492  *  HTDIG_DOCUMENTATION for htsearch_query_struct
493  *
494  *  STRING PARAMETERS
495  *
496  *       char raw_query
497  *          STRING of text that is the search query -- syntax is important
498  *
499  *  INTEGER PARAMETERS
500  *
501  *      int algorithms_flag    [ALSO CALLED 'method' IN HTDIG]
502  *          HTSEARCH_ALG_BOOLEAN
503  *          HTSEARCH_ALG_OR
504  *          HTSEARCH_ALG_AND
505  *
506  *      int sortby_flag
507  *          score, date, title & reversed
508  *          HTSEARCH_SORT_SCORE
509  *          HTSEARCH_SORT_REV_SCORE
510  *          HTSEARCH_SORT_TIME
511  *          HTSEARCH_SORT_REV_TIME
512  *          HTSEARCH_SORT_TITLE
513  *          HTSEARCH_SORT_REV_TITLE
514  *
515  *      int format
516  *          short, long (with excerpt)
517  *          HTSEARCH_FORMAT_LONG
518  *          HTSEARCH_FORMAT_SHORT
519  *
520  *
521  *
522  *  TODO:  'Connect' these htsearch features to this API
523  *
524  *  config
525  *    Specifies the name of the configuration file.
526  *
527  *  exclude
528  *    This value is a pattern that specifies which URLs are to be excluded from
529  *    the search results.
530  *
531  *  keywords
532  *    Used to specify a list of required words that have to be in the documents.
533  *
534  *  restrict
535  *    This value is a pattern that all URLs of the search results will have to
536  *    match.
537  *
538  *  startyear, startmonth, startday, endyear, endmonth, endday
539  *    These values specify the allowed range of document modification dates
540  *    allowed in the search results.
541  *
542  *
543  *
544  *****************************************************************/
545 
546 typedef struct htsearch_query_struct {
547 
548   char raw_query[HTDIG_MAX_QUERY_L];
549 
550   int  algorithms_flag;
551   int  sortby_flag;
552   int format;
553 
554 } htsearch_query_struct;
555 
556 
557 /*****************************************************************
558  *  HTDIG_DOCUMENTATION for htsearch_query_match_struct
559  *
560  *  STRING PARAMETERS
561  *
562  *     char title
563  *          Title of document returned
564  *
565  *     char URL
566  *          URL/location-string of document returned
567  *
568  *     char excerpt
569  *          Excerpt with search words highlighted with
570  *          <strong>searchword</strong>
571  *
572  *  INTEGER PARAMETERS
573  *
574  *     int  score
575  *          score in 'number of stars'
576  *          [MAX NUMBER OF STARS DECLARED IN CONFIG FILE]
577  *
578  *     int  score_percent     //top result is 100%
579  *
 *     struct tm time_tm    [DOCUMENT TIME]
 *          (the struct has no separate 'time_t time' member)
582  *     int  size  [TOTAL DOCUMENT SIZE]
583  *
584  *
585  *****************************************************************/
586 
587 typedef struct htsearch_query_match_struct {
588 
589     char title[HTDIG_DOCUMENT_TITLE_L];
590     char URL[HTDIG_MAX_FILENAME_PATH_L];
591     char excerpt[HTDIG_DOCUMENT_EXCERPT_L];
592     int  score;
593     int  score_percent;     //top result is 100%
594     struct tm time_tm;
595     int  size;
596 
597 } htsearch_query_match_struct;
598 
599 
600 // htsearch functions
601 
602 int htsearch_open(htsearch_parameters_struct *);
603 int htsearch_query(htsearch_query_struct *);
604 
605 int htsearch_get_nth_match(int, htsearch_query_match_struct *);
606 int htsearch_close();
607 
608 //htsearch_free(indicator)
609 
610 char * htsearch_get_error();
611 
612 
613 #endif /* LIBHTDIG_API_H */
614 
615