1 /*
2  * Copyright (c) 1994-2008 Carnegie Mellon University.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. The name "Carnegie Mellon University" must not be used to
17  *    endorse or promote products derived from this software without
18  *    prior written permission. For permission or any legal
19  *    details, please contact
20  *      Carnegie Mellon University
21  *      Center for Technology Transfer and Enterprise Creation
22  *      4615 Forbes Avenue
23  *      Suite 302
24  *      Pittsburgh, PA  15213
25  *      (412) 268-7393, fax: (412) 268-7395
26  *      innovation@andrew.cmu.edu
27  *
28  * 4. Redistributions of any form whatsoever must retain the following
29  *    acknowledgment:
30  *    "This product includes software developed by Computing Services
31  *     at Carnegie Mellon University (http://www.cmu.edu/computing/)."
32  *
33  * CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO
34  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
35  * AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE
36  * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
37  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
38  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
39  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
40  */
41 
42 /*
43   SQUAT library public interface.
44   Robert O'Callahan
45 
46   SQUAT (Search QUery Answer Tool) is a library for full-text indexing
47   and searching.
48 
49   The primary design goals are simplicity and robustness. There are
50   two parts to the API:
51   -- Indexing: Build an index by feeding in a set of documents
52      composed of arbitrary binary text.
53   -- Searching: Specify an arbitrary substring (of length >=
54      SQUAT_WORD_SIZE, default 4) and return a superset of the
55      documents which contain the substring. (SQUAT can (and
56      occasionally does) return documents which do not match the
57      string, thus the documents must be searched 'by hand' after
58      obtaining a document list from SQUAT.)
59 
60   SQUAT provides the following features that other similar engines do not:
61   -- Simple generic API (see above). No "word" heuristics to tweak. No
62      confusion about what can or cannot be matched. This makes SQUAT
63      suitable for indexing non-English, non-ASCII documents.
64   -- Simple, robust design. SQUAT has a very small amount of code. It
65      has been engineered for robustness; arbitrary input documents are
66      allowed, and corrupt index files are always be detected and
67      handled by SQUAT (i.e., it should never crash or behave
68      unpredictably). Anything less is a bug.
69   -- Robust performance. SQUAT searches are always fast. SQUAT index
70      creation uses a two-pass algorithm that consumes little memory
71      even for very large document collections and runs in time linear
72      in the combined size of the documents.
73   -- Easy embedding. The simple C library API makes it easy to add
74      SQUAT index creation and searching functionality to existing
75      applications without the need for helper processes or
76      tools. SQUAT's robustness makes it safe to do so.
77 
78   NOTES:
79 
80   SQUAT is not thread safe. Make sure that only one thread is in SQUAT
81   code at a time.
82 
83   Arbitrary binary substring searching is often not what you want for
84   your application. Most users will want to canonicalize the indexed
85   text and/or queries before feeding them into SQUAT, for example by
86   converting to a uniform case, or by converting runs of whitespace to
87   a single space. Such conversions are the responsibility of the
88   client.
89 
90   Minimal index size is *not* a goal of SQUAT. SQUAT tries to build
91   small indices but does not use techniques such as aggregation of
92   small documents (blocking) or aggressive file compression. The main
93   reason for this is to keep the code simple and robust. Another
94   reason is that disk is cheap. (Of course, one could perform blocking
95   at the client level to feed approximately equal size documents into
96   SQUAT, which would reduce index size and possibly make searches
97   faster if the application needs to find the location of the search
98   text and/or verify the search match within each document.) In
99   practice I find that, indexing my email, the indexes are about the
100   size as the processed source text, which is about 1/3 of the total
101   size of my email (since Cyrus' processing strips whitespace, binary
102   attachments, etc).
103 
104   The index file format is platform and architecture independent. The
105   file format supports 64-bit file offsets so in theory >2GB index
106   files would work, but the code has not been tested on any 64-bit
107   architectures (64 bit addressing would be needed to mmap such
108   files), so >2GB index files probably don't work in practice.
109 */
110 
111 #ifndef __SQUAT_H
112 #define __SQUAT_H
113 
114 /* Don't change this unless you're SURE you know what you're doing.
115    Its only effect on the API is that searches for strings that are
116    shorter than SQUAT_WORD_SIZE are not allowed.
117    In SQUAT, a 'word' simply refers to a string of SQUAT_WORD_SIZE
118    arbitrary bytes.
119 */
120 #define SQUAT_WORD_SIZE 4
121 
122 /* Type used for an index under construction. */
123 typedef struct _SquatIndex SquatIndex;
124 /* Type used for an index being searched. */
125 typedef struct _SquatSearchIndex SquatSearchIndex;
126 
127 typedef long long SquatInt64;
128 typedef int       SquatInt32;
129 
130 /* All SQUAT index files start with this magic 8 bytes */
131 extern char const squat_index_file_header[8]; /* "SQUAT 1\n" */
132 
133 /* SQUAT return values */
134 #define SQUAT_OK           1
135 #define SQUAT_ERR          2
136 #define SQUAT_LAST_BUILTIN SQUAT_ERR
137 
138 /* SQUAT error codes */
139 #define SQUAT_ERR_OK                         1
140 /* was SQUAT_ERR_OUT_OF_MEMORY */
141 #define SQUAT_ERR_SYSERR                     3   /* check errno */
142 #define SQUAT_ERR_INVALID_INDEX_FILE         4
143 #define SQUAT_ERR_SEARCH_STRING_TOO_SHORT    5
144 #define SQUAT_ERR_SEARCH_STRING_INVALID_CHAR 6
145 int squat_get_last_error(void);
146 
147 
148 /***************************************
149    INDEX CONSTRUCTION API
150 ***************************************/
151 
152 /* You can get reports about the progress of index creation using a
153    "stats callback" function. Your callback function is called every
154    time an event occurs. */
155 #define SQUAT_STATS_COMPLETED_DOC          1 /* Finished processing a document */
156 #define SQUAT_STATS_COMPLETED_INITIAL_CHAR 2 /* Indexed all words
157                                                 beginning with a given
158                                                 byte */
159 typedef union { /* An event report */
160   struct {
161     int type;   /* the type of the event, a SQUAT_STATS_ constant */
162   } generic;
163   struct {      /* data for a COMPLETED_DOC event, issued during
164                    squat_index_close_document */
165     int type;
166     int const* num_unique_words; /* num_unique_words[i] gives the
167                                     number of unique words in this
168                                     source document beginning with the
169                                     byte i */
170   } completed_doc;
171   struct {      /* data for a COMPLETED_INITIAL_CHAR event, issued
172                    during squat_index_finish */
173     int type;
174     int completed_char;     /* We've just finished processing all
175                                words beginning with this byte */
176     int num_words;          /* How many unique words over all
177                                documents start with this byte */
178     int temp_file_size;     /* The size of the temporary file that was
179                                used for this byte */
180   } completed_initial_char;
181 } SquatStatsEvent;
182 typedef void (* SquatStatsCallback)(void* closure, SquatStatsEvent* params);
183 
184 /* Create a SQUAT index. The index is dumped into 'fd', which should
185    be an empty file opened for writing.
186 
187    SQUAT indexing takes space that may be up to 5 times the size of
188    the input documents in the worst case (average case is much
189    lower!). SQUAT will create hundreds of temporary files in /tmp or
190    the directory you specify.
191 
192    Once a SquatIndex is successfully initialized, the caller is
193    obligated to call "squat_index_destroy" or "squat_index_finish" on
194    the index.
195 */
196 #define SQUAT_OPTION_TMP_PATH    0x01  /* The tmp_path options field is valid. */
197 #define SQUAT_OPTION_VALID_CHARS 0x02  /* The valid_chars options field is valid. */
198 #define SQUAT_OPTION_STATISTICS  0x04  /* The stats_callback* options
199                                           fields are valid. */
200 typedef struct {
201   int option_mask;                   /* Which options fields have been
202                                         initialized? */
203   char const* tmp_path;              /* A directory where all
204                                         temporary files will be
205                                         created. Must not have any
206                                         trailing slash. */
207   char const* valid_chars;           /* A null-terminated string
208                                         containing the characters
209                                         which can appear in search
210                                         strings. (Sorry, if you use
211                                         this option, the null
212                                         character is never allowed in
213                                         search strings.) If you try to
214                                         use any other bytes in a
215                                         search string, you will get an
216                                         error. If you know in advance
217                                         that certain bytes cannot
218                                         appear in search strings, you
219                                         can improve performance using
220                                         this option (especially if
221                                         those bytes do occur in source
222                                         documents). */
223   SquatStatsCallback stats_callback; /* See above */
224   void* stats_callback_closure;      /* Private data passed down into
225                                         the callback function */
226 } SquatOptions;
227 SquatIndex* squat_index_init(int fd, SquatOptions const* options);
228 
229 
230 /* Start adding a new document to the index. The name is a
231    null-terminated UTF8 string which is associated with the document.
232    Call this after successfully calling squat_index_init or
233    squat_index_close_document.
234 */
235 int         squat_index_open_document(SquatIndex* index, char const* name);
236 
237 
238 /* Notify SQUAT about some more data in the current document. This
239    function can be called as many times as desired until all the data
240    in the document has been fed into SQUAT. Call this after
241    successfully calling squat_index_open_document or
242    squat_index_append_document. */
243 int         squat_index_append_document(SquatIndex* index, char const* data,
244                int data_len);
245 
246 
247 /* Notify SQUAT that the current document has ended.
248    Call this after successfully calling squat_index_open_document or
249    squat_index_append_document. */
250 int         squat_index_close_document(SquatIndex* index);
251 
252 
253 /* Notify SQUAT that there are no more documents. SQUAT will finish
254    generating the index. It is the client's responsibility to close
255    the original index file. All SQUAT resources associated with the
256    index are released whether this call succeeds or fails.
257    Call this after successfully calling squat_index_init or
258    squat_index_close_document. */
259 
260 int         squat_index_finish(SquatIndex* index);
261 
262 /* Notify SQUAT that something has gone wrong and index construction
263    must be aborted. It is the client's responsibility to close and/or
264    remove the original index file. All SQUAT resources associated with
265    the index are released whether this call succeeds or fails.
266    Call this anytime. */
267 int         squat_index_destroy(SquatIndex* index);
268 
269 
270 typedef int (* SquatScanCallback)(void* closure, char *name, int doc_ID);
271 
272 int         squat_scan(SquatSearchIndex* index, char first_char,
273                        SquatScanCallback handler,
274                        void* closure);
275 
276 int         squat_count_docs(SquatSearchIndex* index, char first_char,
277                              int *counter);
278 
279 /***************************************
280    INDEX SEARCH API
281 ***************************************/
282 
283 /* Open an index for searching. 'fd' should be an index file opened
284    for reading, positioned at the beginning of the file.
285 
286    This function mmaps the entire index file. If there is not enough
287    virtual address space available it will fail with SQUAT_ERR_SYSERR.
288 */
289 SquatSearchIndex* squat_search_open(int fd);
290 
291 /* Get a list of the documents included in the index.
292    The callback function is called once for each document. The
293    callback function returns one of the following results to control
294    the progress of the operation. Call this after successfully calling
295    squat_search_open, squat_search_list_docs, or squat_search_execute. */
296 #define SQUAT_CALLBACK_CONTINUE   1  /* return this from the callback
297                                         function to continue with the
298                                         operation. */
299 #define SQUAT_CALLBACK_ABORT      2  /* return this from the callback
300                                         function to abort the current
301                                         operation and return to the
302                                         client. */
303 typedef struct {
304   char const* doc_name;  /* The UTF8 name of the document. */
305   SquatInt64  size;      /* The total size of the document in bytes. */
306 } SquatListDoc;
307 typedef int (* SquatListDocCallback)(void* closure, SquatListDoc const* doc);
308 int               squat_search_list_docs(SquatSearchIndex* index,
309                     SquatListDocCallback handler, void* closure);
310 
311 
312 /* Get a list of the documents that may include the given search string.
313    The callback function is called once for each possibly-matching
314    document. The callback function returns one of the above results to
315    control the progress of the operation. Call this after successfully
316    calling squat_search_open, squat_search_list_docs, or
317    squat_search_execute. */
318 typedef int (* SquatSearchResultCallback)(void* closure, char const* doc_name);
319 int               squat_search_execute(SquatSearchIndex* index, char const* data,
320                     int data_len, SquatSearchResultCallback handler, void* closure);
321 
322 
323 /* Release the SQUAT resources associated with an index. The resources
324    are released whether this call succeeds or fails.
325    Call this anytime. */
326 int               squat_search_close(SquatSearchIndex* index);
327 
328 
329 typedef int (* SquatDocChooserCallback)(void* closure,
330                                         SquatListDoc const* doc);
331 
332 int squat_index_add_existing(SquatIndex* index,
333                              SquatSearchIndex *old_index,
334                              SquatDocChooserCallback choose_existing,
335                              void *closure);
336 #endif
337