1 /* 2 * Copyright (c) 1994-2008 Carnegie Mellon University. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in 13 * the documentation and/or other materials provided with the 14 * distribution. 15 * 16 * 3. The name "Carnegie Mellon University" must not be used to 17 * endorse or promote products derived from this software without 18 * prior written permission. For permission or any legal 19 * details, please contact 20 * Carnegie Mellon University 21 * Center for Technology Transfer and Enterprise Creation 22 * 4615 Forbes Avenue 23 * Suite 302 24 * Pittsburgh, PA 15213 25 * (412) 268-7393, fax: (412) 268-7395 26 * innovation@andrew.cmu.edu 27 * 28 * 4. Redistributions of any form whatsoever must retain the following 29 * acknowledgment: 30 * "This product includes software developed by Computing Services 31 * at Carnegie Mellon University (http://www.cmu.edu/computing/)." 32 * 33 * CARNEGIE MELLON UNIVERSITY DISCLAIMS ALL WARRANTIES WITH REGARD TO 34 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 35 * AND FITNESS, IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE 36 * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 37 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 38 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 39 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 40 */ 41 42 /* 43 SQUAT library public interface. 44 Robert O'Callahan 45 46 SQUAT (Search QUery Answer Tool) is a library for full-text indexing 47 and searching. 48 49 The primary design goals are simplicity and robustness. There are 50 two parts to the API: 51 -- Indexing: Build an index by feeding in a set of documents 52 composed of arbitrary binary text. 53 -- Searching: Specify an arbitrary substring (of length >= 54 SQUAT_WORD_SIZE, default 4) and return a superset of the 55 documents which contain the substring. (SQUAT can (and 56 occasionally does) return documents which do not match the 57 string, thus the documents must be searched 'by hand' after 58 obtaining a document list from SQUAT.) 59 60 SQUAT provides the following features that other similar engines do not: 61 -- Simple generic API (see above). No "word" heuristics to tweak. No 62 confusion about what can or cannot be matched. This makes SQUAT 63 suitable for indexing non-English, non-ASCII documents. 64 -- Simple, robust design. SQUAT has a very small amount of code. It 65 has been engineered for robustness; arbitrary input documents are 66 allowed, and corrupt index files are always be detected and 67 handled by SQUAT (i.e., it should never crash or behave 68 unpredictably). Anything less is a bug. 69 -- Robust performance. SQUAT searches are always fast. SQUAT index 70 creation uses a two-pass algorithm that consumes little memory 71 even for very large document collections and runs in time linear 72 in the combined size of the documents. 73 -- Easy embedding. The simple C library API makes it easy to add 74 SQUAT index creation and searching functionality to existing 75 applications without the need for helper processes or 76 tools. SQUAT's robustness makes it safe to do so. 77 78 NOTES: 79 80 SQUAT is not thread safe. Make sure that only one thread is in SQUAT 81 code at a time. 82 83 Arbitrary binary substring searching is often not what you want for 84 your application. Most users will want to canonicalize the indexed 85 text and/or queries before feeding them into SQUAT, for example by 86 converting to a uniform case, or by converting runs of whitespace to 87 a single space. Such conversions are the responsibility of the 88 client. 89 90 Minimal index size is *not* a goal of SQUAT. SQUAT tries to build 91 small indices but does not use techniques such as aggregation of 92 small documents (blocking) or aggressive file compression. The main 93 reason for this is to keep the code simple and robust. Another 94 reason is that disk is cheap. (Of course, one could perform blocking 95 at the client level to feed approximately equal size documents into 96 SQUAT, which would reduce index size and possibly make searches 97 faster if the application needs to find the location of the search 98 text and/or verify the search match within each document.) In 99 practice I find that, indexing my email, the indexes are about the 100 size as the processed source text, which is about 1/3 of the total 101 size of my email (since Cyrus' processing strips whitespace, binary 102 attachments, etc). 103 104 The index file format is platform and architecture independent. The 105 file format supports 64-bit file offsets so in theory >2GB index 106 files would work, but the code has not been tested on any 64-bit 107 architectures (64 bit addressing would be needed to mmap such 108 files), so >2GB index files probably don't work in practice. 109 */ 110 111 #ifndef __SQUAT_H 112 #define __SQUAT_H 113 114 /* Don't change this unless you're SURE you know what you're doing. 115 Its only effect on the API is that searches for strings that are 116 shorter than SQUAT_WORD_SIZE are not allowed. 117 In SQUAT, a 'word' simply refers to a string of SQUAT_WORD_SIZE 118 arbitrary bytes. 119 */ 120 #define SQUAT_WORD_SIZE 4 121 122 /* Type used for an index under construction. */ 123 typedef struct _SquatIndex SquatIndex; 124 /* Type used for an index being searched. */ 125 typedef struct _SquatSearchIndex SquatSearchIndex; 126 127 typedef long long SquatInt64; 128 typedef int SquatInt32; 129 130 /* All SQUAT index files start with this magic 8 bytes */ 131 extern char const squat_index_file_header[8]; /* "SQUAT 1\n" */ 132 133 /* SQUAT return values */ 134 #define SQUAT_OK 1 135 #define SQUAT_ERR 2 136 #define SQUAT_LAST_BUILTIN SQUAT_ERR 137 138 /* SQUAT error codes */ 139 #define SQUAT_ERR_OK 1 140 /* was SQUAT_ERR_OUT_OF_MEMORY */ 141 #define SQUAT_ERR_SYSERR 3 /* check errno */ 142 #define SQUAT_ERR_INVALID_INDEX_FILE 4 143 #define SQUAT_ERR_SEARCH_STRING_TOO_SHORT 5 144 #define SQUAT_ERR_SEARCH_STRING_INVALID_CHAR 6 145 int squat_get_last_error(void); 146 147 148 /*************************************** 149 INDEX CONSTRUCTION API 150 ***************************************/ 151 152 /* You can get reports about the progress of index creation using a 153 "stats callback" function. Your callback function is called every 154 time an event occurs. */ 155 #define SQUAT_STATS_COMPLETED_DOC 1 /* Finished processing a document */ 156 #define SQUAT_STATS_COMPLETED_INITIAL_CHAR 2 /* Indexed all words 157 beginning with a given 158 byte */ 159 typedef union { /* An event report */ 160 struct { 161 int type; /* the type of the event, a SQUAT_STATS_ constant */ 162 } generic; 163 struct { /* data for a COMPLETED_DOC event, issued during 164 squat_index_close_document */ 165 int type; 166 int const* num_unique_words; /* num_unique_words[i] gives the 167 number of unique words in this 168 source document beginning with the 169 byte i */ 170 } completed_doc; 171 struct { /* data for a COMPLETED_INITIAL_CHAR event, issued 172 during squat_index_finish */ 173 int type; 174 int completed_char; /* We've just finished processing all 175 words beginning with this byte */ 176 int num_words; /* How many unique words over all 177 documents start with this byte */ 178 int temp_file_size; /* The size of the temporary file that was 179 used for this byte */ 180 } completed_initial_char; 181 } SquatStatsEvent; 182 typedef void (* SquatStatsCallback)(void* closure, SquatStatsEvent* params); 183 184 /* Create a SQUAT index. The index is dumped into 'fd', which should 185 be an empty file opened for writing. 186 187 SQUAT indexing takes space that may be up to 5 times the size of 188 the input documents in the worst case (average case is much 189 lower!). SQUAT will create hundreds of temporary files in /tmp or 190 the directory you specify. 191 192 Once a SquatIndex is successfully initialized, the caller is 193 obligated to call "squat_index_destroy" or "squat_index_finish" on 194 the index. 195 */ 196 #define SQUAT_OPTION_TMP_PATH 0x01 /* The tmp_path options field is valid. */ 197 #define SQUAT_OPTION_VALID_CHARS 0x02 /* The valid_chars options field is valid. */ 198 #define SQUAT_OPTION_STATISTICS 0x04 /* The stats_callback* options 199 fields are valid. */ 200 typedef struct { 201 int option_mask; /* Which options fields have been 202 initialized? */ 203 char const* tmp_path; /* A directory where all 204 temporary files will be 205 created. Must not have any 206 trailing slash. */ 207 char const* valid_chars; /* A null-terminated string 208 containing the characters 209 which can appear in search 210 strings. (Sorry, if you use 211 this option, the null 212 character is never allowed in 213 search strings.) If you try to 214 use any other bytes in a 215 search string, you will get an 216 error. If you know in advance 217 that certain bytes cannot 218 appear in search strings, you 219 can improve performance using 220 this option (especially if 221 those bytes do occur in source 222 documents). */ 223 SquatStatsCallback stats_callback; /* See above */ 224 void* stats_callback_closure; /* Private data passed down into 225 the callback function */ 226 } SquatOptions; 227 SquatIndex* squat_index_init(int fd, SquatOptions const* options); 228 229 230 /* Start adding a new document to the index. The name is a 231 null-terminated UTF8 string which is associated with the document. 232 Call this after successfully calling squat_index_init or 233 squat_index_close_document. 234 */ 235 int squat_index_open_document(SquatIndex* index, char const* name); 236 237 238 /* Notify SQUAT about some more data in the current document. This 239 function can be called as many times as desired until all the data 240 in the document has been fed into SQUAT. Call this after 241 successfully calling squat_index_open_document or 242 squat_index_append_document. */ 243 int squat_index_append_document(SquatIndex* index, char const* data, 244 int data_len); 245 246 247 /* Notify SQUAT that the current document has ended. 248 Call this after successfully calling squat_index_open_document or 249 squat_index_append_document. */ 250 int squat_index_close_document(SquatIndex* index); 251 252 253 /* Notify SQUAT that there are no more documents. SQUAT will finish 254 generating the index. It is the client's responsibility to close 255 the original index file. All SQUAT resources associated with the 256 index are released whether this call succeeds or fails. 257 Call this after successfully calling squat_index_init or 258 squat_index_close_document. */ 259 260 int squat_index_finish(SquatIndex* index); 261 262 /* Notify SQUAT that something has gone wrong and index construction 263 must be aborted. It is the client's responsibility to close and/or 264 remove the original index file. All SQUAT resources associated with 265 the index are released whether this call succeeds or fails. 266 Call this anytime. */ 267 int squat_index_destroy(SquatIndex* index); 268 269 270 typedef int (* SquatScanCallback)(void* closure, char *name, int doc_ID); 271 272 int squat_scan(SquatSearchIndex* index, char first_char, 273 SquatScanCallback handler, 274 void* closure); 275 276 int squat_count_docs(SquatSearchIndex* index, char first_char, 277 int *counter); 278 279 /*************************************** 280 INDEX SEARCH API 281 ***************************************/ 282 283 /* Open an index for searching. 'fd' should be an index file opened 284 for reading, positioned at the beginning of the file. 285 286 This function mmaps the entire index file. If there is not enough 287 virtual address space available it will fail with SQUAT_ERR_SYSERR. 288 */ 289 SquatSearchIndex* squat_search_open(int fd); 290 291 /* Get a list of the documents included in the index. 292 The callback function is called once for each document. The 293 callback function returns one of the following results to control 294 the progress of the operation. Call this after successfully calling 295 squat_search_open, squat_search_list_docs, or squat_search_execute. */ 296 #define SQUAT_CALLBACK_CONTINUE 1 /* return this from the callback 297 function to continue with the 298 operation. */ 299 #define SQUAT_CALLBACK_ABORT 2 /* return this from the callback 300 function to abort the current 301 operation and return to the 302 client. */ 303 typedef struct { 304 char const* doc_name; /* The UTF8 name of the document. */ 305 SquatInt64 size; /* The total size of the document in bytes. */ 306 } SquatListDoc; 307 typedef int (* SquatListDocCallback)(void* closure, SquatListDoc const* doc); 308 int squat_search_list_docs(SquatSearchIndex* index, 309 SquatListDocCallback handler, void* closure); 310 311 312 /* Get a list of the documents that may include the given search string. 313 The callback function is called once for each possibly-matching 314 document. The callback function returns one of the above results to 315 control the progress of the operation. Call this after successfully 316 calling squat_search_open, squat_search_list_docs, or 317 squat_search_execute. */ 318 typedef int (* SquatSearchResultCallback)(void* closure, char const* doc_name); 319 int squat_search_execute(SquatSearchIndex* index, char const* data, 320 int data_len, SquatSearchResultCallback handler, void* closure); 321 322 323 /* Release the SQUAT resources associated with an index. The resources 324 are released whether this call succeeds or fails. 325 Call this anytime. */ 326 int squat_search_close(SquatSearchIndex* index); 327 328 329 typedef int (* SquatDocChooserCallback)(void* closure, 330 SquatListDoc const* doc); 331 332 int squat_index_add_existing(SquatIndex* index, 333 SquatSearchIndex *old_index, 334 SquatDocChooserCallback choose_existing, 335 void *closure); 336 #endif 337