1 #ifndef __RS_DOCUMENT_H__ 2 #define __RS_DOCUMENT_H__ 3 #include <pthread.h> 4 #include "redismodule.h" 5 #include "search_ctx.h" 6 #include "redisearch.h" 7 #include "tokenize.h" 8 #include "concurrent_ctx.h" 9 #include "byte_offsets.h" 10 #include "rmutil/args.h" 11 #include "query_error.h" 12 13 #ifdef __cplusplus 14 extern "C" { 15 #endif 16 17 //////////////////////////////////////////////////////////////////////////////// 18 //////////////////////////////////////////////////////////////////////////////// 19 /// General Architecture /// 20 //////////////////////////////////////////////////////////////////////////////// 21 //////////////////////////////////////////////////////////////////////////////// 22 23 /** 24 * To index a document, call Document_PrepareForAdd on the document itself. 25 * This initializes the Document structure for indexing purposes. Once the 26 * document has been prepared, acquire a new RSAddDocumentCtx() by calling 27 * NewAddDocumentCtx(). 28 * 29 * Once the new context has been received, call Document_AddToIndexes(). This 30 * will start tokenizing the documents, and should be called in a separate 31 * thread. This function will tokenize the document and send a reply back to 32 * the client. You may free the RSAddDocumentCtx structure by calling 33 * AddDocumentCtx_Free(). 34 * 35 * See document.c for the internals. 36 */ 37 38 typedef struct { 39 const char *name; // Can either be char or RMString 40 RedisModuleString *text; 41 FieldType indexAs; 42 } DocumentField; 43 44 typedef struct Document { 45 RedisModuleString *docKey; 46 DocumentField *fields; 47 uint32_t numFields; 48 RSLanguage language; 49 float score; 50 t_docId docId; 51 const char *payload; 52 size_t payloadSize; 53 uint32_t flags; 54 } Document; 55 56 /** 57 * Document should decrement the reference count to the contained strings. Used 58 * when the user does not want to retain his own reference to them. It effectively 59 * "steals" a reference. 60 * 61 * This only applies to _values_; not keys. Used internally by the C API 62 */ 63 #define DOCUMENT_F_OWNREFS 0x01 64 65 /** 66 * Indicates that the document owns a reference to the field contents, 67 * the language string, and the payload. 68 * 69 * The document always owns the field array, though. 70 */ 71 #define DOCUMENT_F_OWNSTRINGS 0x02 72 73 /** 74 * The document has been moved to another target. This is quicker than 75 * zero'ing the entire structure 76 */ 77 #define DOCUMENT_F_DEAD 0x08 78 79 struct RSAddDocumentCtx; 80 81 typedef void (*DocumentAddCompleted)(struct RSAddDocumentCtx *, RedisModuleCtx *, void *); 82 83 typedef struct { 84 uint32_t options; // DOCUMENT_ADD_XXX 85 RSLanguage language; // Language document should be indexed as 86 RedisModuleString *payload; // Arbitrary payload provided on return with WITHPAYLOADS 87 RedisModuleString **fieldsArray; // Field, Value, Field Value 88 size_t numFieldElems; // Number of elements 89 double score; // Score of the document 90 const char *evalExpr; // Only add the document if this expression evaluates to true. 91 DocumentAddCompleted donecb; // Callback to invoke when operation is done 92 } AddDocumentOptions; 93 94 void Document_AddField(Document *d, const char *fieldname, RedisModuleString *fieldval, 95 uint32_t typemask); 96 97 /** 98 * Add a simple char buffer value. This creates an RMString internally, so this 99 * must be used with F_OWNSTRINGS 100 */ 101 void Document_AddFieldC(Document *d, const char *fieldname, const char *val, size_t vallen, 102 uint32_t typemask); 103 /** 104 * Initialize document structure with the relevant fields. numFields will allocate 105 * the fields array, but you must still actually copy the data along. 106 * 107 * Note that this function assumes that the pointers passed in will remain valid 108 * throughout the lifetime of the document. If you need to make independent copies 109 * of the data within the document, call Document_Detach on the document (after 110 * calling this function). 111 */ 112 void Document_Init(Document *doc, RedisModuleString *docKey, double score, RSLanguage lang); 113 void Document_SetPayload(Document *doc, const void *payload, size_t n); 114 115 /** 116 * Make the document the owner of the strings it contains 117 */ 118 void Document_MakeStringsOwner(Document *doc); 119 120 /** 121 * Make the document object steal references to the document's strings. 122 */ 123 void Document_MakeRefOwner(Document *doc); 124 125 /** 126 * Clear the document of its fields. This does not free the document 127 * or clear its name 128 */ 129 void Document_Clear(Document *doc); 130 131 /** 132 * Move the contents of one document to another. This also manages ownership 133 * semantics 134 */ 135 void Document_Move(Document *dst, Document *src); 136 137 /** 138 * Load all fields specified in the schema to the document. Note that 139 * the document must then be freed using Document_Free(). 140 * 141 * The document must already have the docKey set 142 */ 143 int Document_LoadSchemaFields(Document *doc, RedisSearchCtx *sctx); 144 145 /** 146 * Load all the fields into the document. 147 */ 148 int Document_LoadAllFields(Document *doc, RedisModuleCtx *ctx); 149 150 void Document_LoadPairwiseArgs(Document *doc, RedisModuleString **args, size_t nargs); 151 152 /** 153 * Print contents of document to screen 154 */ 155 void Document_Dump(const Document *doc); // LCOV_EXCL_LINE debug 156 /** 157 * Free any copied data within the document. anyCtx is any non-NULL 158 * RedisModuleCtx. The reason for requiring a context is more related to the 159 * Redis Module API requiring a context for AutoMemory purposes, though in 160 * this case, the pointers are already removed from AutoMemory manangement 161 * anyway. 162 * 163 * This function also calls Document_Free 164 */ 165 void Document_FreeDetached(Document *doc, RedisModuleCtx *anyCtx); 166 167 /** 168 * Free the document's internals (like the field array). 169 */ 170 void Document_Free(Document *doc); 171 172 #define DOCUMENT_ADD_REPLACE 0x01 173 #define DOCUMENT_ADD_PARTIAL 0x02 174 #define DOCUMENT_ADD_NOSAVE 0x04 175 #define DOCUMENT_ADD_CURTHREAD 0x08 // Perform operation in main thread 176 #define DOCUMENT_ADD_NOCREATE 0x10 // Don't create document if not exist (replace ONLY) 177 178 struct ForwardIndex; 179 struct FieldIndexerData; 180 181 // The context has had its forward entries merged in the merge table. We can 182 // skip merging its tokens 183 #define ACTX_F_TEXTINDEXED 0x01 184 185 // The context has had an error and should not be processed further 186 #define ACTX_F_ERRORED 0x02 187 188 // Non-text fields have been indexed. 189 #define ACTX_F_OTHERINDEXED 0x04 190 191 // The content has indexable fields 192 #define ACTX_F_INDEXABLES 0x08 193 194 // The content has sortable fields 195 #define ACTX_F_SORTABLES 0x10 196 197 // Don't block/unblock the client when indexing. This is the case when the 198 // operation is being done from within the context of AOF 199 #define ACTX_F_NOBLOCK 0x20 200 201 // Document is entirely empty (no sortables, indexables) 202 #define ACTX_F_EMPTY 0x40 203 204 struct DocumentIndexer; 205 206 /** Context used when indexing documents */ 207 typedef struct RSAddDocumentCtx { 208 struct RSAddDocumentCtx *next; // Next context in the queue 209 Document doc; // Document which is being indexed 210 union { 211 RedisModuleBlockedClient *bc; // Client 212 RedisSearchCtx *sctx; 213 } client; 214 215 // Forward index. This contains all the terms found in the document 216 struct ForwardIndex *fwIdx; 217 218 struct DocumentIndexer *indexer; 219 220 // Sorting vector for the document. If the document has sortable fields, they 221 // are added to here as well 222 RSSortingVector *sv; 223 224 // Byte offsets for highlighting. If term offsets are stored, this contains 225 // the field byte offset for each term. 226 RSByteOffsets *byteOffsets; 227 ByteOffsetWriter offsetsWriter; 228 229 // Information about each field in the document. This is read from the spec 230 // and cached, so that we can look it up without holding the GIL 231 FieldSpec *fspecs; 232 RSTokenizer *tokenizer; 233 234 // Old document data. Contains sortables 235 RSDocumentMetadata *oldMd; 236 237 // New flags to assign to the document 238 RSDocumentFlags docFlags; 239 240 // Scratch space used by per-type field preprocessors (see the source) 241 struct FieldIndexerData *fdatas; 242 QueryError status; // Error message is placed here if there is an error during processing 243 uint32_t totalTokens; // Number of tokens, used for offset vector 244 uint32_t specFlags; // Cached index flags 245 uint8_t options; // Indexing options - i.e. DOCUMENT_ADD_xxx 246 uint8_t stateFlags; // Indexing state, ACTX_F_xxx 247 DocumentAddCompleted donecb; 248 void *donecbData; 249 } RSAddDocumentCtx; 250 251 #define AddDocumentCtx_IsBlockable(aCtx) (!((aCtx)->stateFlags & ACTX_F_NOBLOCK)) 252 253 /** 254 * Creates a new context used for adding documents. Once created, call 255 * Document_AddToIndexes on it. 256 * 257 * - client is a blocked client which will be used as the context for this 258 * operation. 259 * - sp is the index that this document will be added to 260 * - base is the document to be index. The context will take ownership of the 261 * document's contents (but not the structure itself). Thus, you should not 262 * call Document_Free on the document after a successful return of this 263 * function. 264 * 265 * When done, call AddDocumentCtx_Free 266 */ 267 RSAddDocumentCtx *NewAddDocumentCtx(IndexSpec *sp, Document *base, QueryError *status); 268 269 /** 270 * At this point the context will take over from the caller, and handle sending 271 * the replies and so on. 272 */ 273 void AddDocumentCtx_Submit(RSAddDocumentCtx *aCtx, RedisSearchCtx *sctx, uint32_t options); 274 275 /** 276 * Indicate that processing is finished on the current document 277 */ 278 void AddDocumentCtx_Finish(RSAddDocumentCtx *aCtx); 279 /** 280 * This function will tokenize the document and add the resultant tokens to 281 * the relevant inverted indexes. This function should be called from a 282 * worker thread (see ConcurrentSearch functions). 283 * 284 * 285 * When this function completes, it will send the reply to the client and 286 * unblock the client passed when the context was first created. 287 */ 288 int Document_AddToIndexes(RSAddDocumentCtx *ctx); 289 290 /** 291 * Free the AddDocumentCtx. Should be done once AddToIndexes() completes; or 292 * when the client is unblocked. 293 */ 294 void AddDocumentCtx_Free(RSAddDocumentCtx *aCtx); 295 296 /* Evaluate an IF expression (e.g. IF "@foo == 'bar'") against a document, by getting the 297 * properties from the sorting table or from the hash representation of the document. 298 * 299 * NOTE: This is disconnected from the document indexing flow, and loads the document and discards 300 * of it internally 301 * 302 * Returns REDISMODULE_ERR on failure, OK otherwise*/ 303 int Document_EvalExpression(RedisSearchCtx *sctx, RedisModuleString *key, const char *expr, 304 int *result, QueryError *err); 305 306 // Don't create document if it does not exist. Replace only 307 #define REDIS_SAVEDOC_NOCREATE 0x01 308 /** 309 * Save a document in the index. Used for returning contents in search results. 310 */ 311 int Redis_SaveDocument(RedisSearchCtx *ctx, Document *doc, int options, QueryError *status); 312 313 /* Serialzie the document's fields to a redis client */ 314 int Document_ReplyFields(RedisModuleCtx *ctx, Document *doc); 315 316 DocumentField *Document_GetField(Document *d, const char *fieldName); 317 318 // Document add functions: 319 int RSAddDocumentCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc); 320 int RSSafeAddDocumentCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc); 321 int RSAddHashCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc); 322 int RSSafeAddHashCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc); 323 324 int RS_AddDocument(RedisSearchCtx *sctx, RedisModuleString *name, const AddDocumentOptions *opts, 325 QueryError *status); 326 #ifdef __cplusplus 327 } 328 #endif 329 #endif 330