1 #ifndef __RS_DOCUMENT_H__
2 #define __RS_DOCUMENT_H__
3 #include <pthread.h>
4 #include "redismodule.h"
5 #include "search_ctx.h"
6 #include "redisearch.h"
7 #include "tokenize.h"
8 #include "concurrent_ctx.h"
9 #include "byte_offsets.h"
10 #include "rmutil/args.h"
11 #include "query_error.h"
12 
13 #ifdef __cplusplus
14 extern "C" {
15 #endif
16 
17 ////////////////////////////////////////////////////////////////////////////////
18 ////////////////////////////////////////////////////////////////////////////////
19 /// General Architecture                                                     ///
20 ////////////////////////////////////////////////////////////////////////////////
21 ////////////////////////////////////////////////////////////////////////////////
22 
/**
 * To index a document, call Document_PrepareForAdd on the document itself.
 * This initializes the Document structure for indexing purposes. Once the
 * document has been prepared, acquire a new RSAddDocumentCtx by calling
 * NewAddDocumentCtx().
 *
 * Once the new context has been received, call Document_AddToIndexes(). This
 * will start tokenizing the document, and should be called in a separate
 * thread. This function will tokenize the document and send a reply back to
 * the client. You may free the RSAddDocumentCtx structure by calling
 * AddDocumentCtx_Free().
 *
 * See document.c for the internals.
 */
37 
/**
 * A single named field of a Document: the field name, its value, and the
 * field type(s) it should be indexed as.
 */
typedef struct {
  // Field name. The original note here reads "Can either be char or RMString";
  // NOTE(review): the declared type is always const char *, so that note
  // presumably describes where the storage comes from - confirm in document.c.
  const char *name;
  // Field value.
  RedisModuleString *text;
  // Type(s) to index this field as (presumably the typemask passed to
  // Document_AddField - confirm).
  FieldType indexAs;
} DocumentField;
43 
/**
 * In-memory representation of a document: its key, its fields, and the
 * per-document metadata (language, score, payload) used when indexing.
 */
typedef struct Document {
  RedisModuleString *docKey;  // Key (name) of the document
  DocumentField *fields;      // Array of fields; see numFields for its length
  uint32_t numFields;         // Number of entries in fields
  RSLanguage language;        // Language of the document
  // Document score. NOTE(review): Document_Init and AddDocumentOptions carry
  // the score as a double, while it is stored here as a float.
  float score;
  t_docId docId;              // Document ID
  const char *payload;        // Optional payload buffer (see Document_SetPayload)
  size_t payloadSize;         // Size of payload, in bytes
  uint32_t flags;             // Bitmask of DOCUMENT_F_xxx
} Document;
55 
/**
 * The document should decrement the reference count of the strings it
 * contains. Used when the caller does not want to retain its own reference to
 * them; the document effectively "steals" a reference.
 *
 * This only applies to _values_, not keys. Used internally by the C API.
 */
#define DOCUMENT_F_OWNREFS 0x01

/**
 * Indicates that the document owns a reference to the field contents,
 * the language string, and the payload.
 *
 * The document always owns the field array, regardless of this flag.
 */
#define DOCUMENT_F_OWNSTRINGS 0x02

/**
 * The document has been moved to another target. Setting this flag is quicker
 * than zeroing the entire structure.
 *
 * (The value 0x04 is not assigned to any flag in this header.)
 */
#define DOCUMENT_F_DEAD 0x08
78 
// Forward declaration; the full definition appears further down in this header.
struct RSAddDocumentCtx;

/**
 * Completion callback type for an add-document operation (see the donecb
 * members of AddDocumentOptions and RSAddDocumentCtx).
 *
 * It receives the add context, a RedisModuleCtx, and an opaque pointer
 * (presumably RSAddDocumentCtx::donecbData - confirm in document.c).
 */
typedef void (*DocumentAddCompleted)(struct RSAddDocumentCtx *, RedisModuleCtx *, void *);
82 
/**
 * Options describing a document to be added; passed to RS_AddDocument().
 */
typedef struct {
  uint32_t options;                 // Bitmask of DOCUMENT_ADD_xxx
  RSLanguage language;              // Language the document should be indexed as
  RedisModuleString *payload;       // Arbitrary payload, returned to the user with WITHPAYLOADS
  RedisModuleString **fieldsArray;  // Flat array of pairs: field, value, field, value, ...
  size_t numFieldElems;             // Number of elements (presumably in fieldsArray - confirm)
  double score;                     // Score of the document
  const char *evalExpr;             // Only add the document if this expression evaluates to true
  DocumentAddCompleted donecb;      // Callback to invoke when the operation is done
} AddDocumentOptions;
93 
/**
 * Add a field with an existing RedisModuleString value to the document.
 *
 * - fieldname is the name of the field
 * - fieldval is the value of the field
 * - typemask is presumably a mask of the field types to index the value as
 *   (cf. DocumentField::indexAs) - confirm in document.c
 */
void Document_AddField(Document *d, const char *fieldname, RedisModuleString *fieldval,
                       uint32_t typemask);
96 
/**
 * Add a field whose value is a simple char buffer (val, of length vallen).
 * This creates an RMString internally, so it must be used on a document with
 * DOCUMENT_F_OWNSTRINGS.
 */
void Document_AddFieldC(Document *d, const char *fieldname, const char *val, size_t vallen,
                        uint32_t typemask);
/**
 * Initialize the document structure with the given key, score and language.
 *
 * Note that this function assumes that the pointers passed in will remain valid
 * throughout the lifetime of the document. If you need to make independent copies
 * of the data within the document, call Document_Detach on the document (after
 * calling this function).
 *
 * NOTE(review): this comment previously stated that "numFields will allocate
 * the fields array, but you must still actually copy the data along". The
 * current signature has no numFields parameter, and Document_Detach is not
 * declared in this header - confirm the current behavior against document.c.
 */
void Document_Init(Document *doc, RedisModuleString *docKey, double score, RSLanguage lang);
/**
 * Set the document's payload to the buffer `payload` of `n` bytes.
 * NOTE(review): whether the buffer is copied or only referenced is not visible
 * from this header - confirm in document.c.
 */
void Document_SetPayload(Document *doc, const void *payload, size_t n);
114 
/**
 * Make the document the owner of the strings it contains
 * (cf. DOCUMENT_F_OWNSTRINGS).
 */
void Document_MakeStringsOwner(Document *doc);
119 
/**
 * Make the document object steal references to the document's strings
 * (cf. DOCUMENT_F_OWNREFS).
 */
void Document_MakeRefOwner(Document *doc);
124 
/**
 * Clear the document of its fields. This neither frees the document
 * nor clears its name.
 */
void Document_Clear(Document *doc);
130 
/**
 * Move the contents of one document (src) to another (dst). This also manages
 * ownership semantics (see DOCUMENT_F_DEAD, which marks a document whose
 * contents have been moved elsewhere).
 */
void Document_Move(Document *dst, Document *src);
136 
/**
 * Load all fields specified in the schema into the document. Note that
 * the document must then be freed using Document_Free().
 *
 * The document must already have its docKey set.
 *
 * Returns an int status (presumably REDISMODULE_OK / REDISMODULE_ERR - confirm).
 */
int Document_LoadSchemaFields(Document *doc, RedisSearchCtx *sctx);
144 
/**
 * Load all the fields into the document.
 *
 * Returns an int status (presumably REDISMODULE_OK / REDISMODULE_ERR - confirm).
 */
int Document_LoadAllFields(Document *doc, RedisModuleCtx *ctx);
149 
/**
 * Load fields into the document from an argument array of `nargs` elements.
 * Judging by the name, args presumably holds alternating field names and
 * values (cf. AddDocumentOptions::fieldsArray) - confirm in document.c.
 */
void Document_LoadPairwiseArgs(Document *doc, RedisModuleString **args, size_t nargs);
151 
/**
 * Print the contents of the document to the screen (debugging aid).
 */
void Document_Dump(const Document *doc); // LCOV_EXCL_LINE debug
/**
 * Free any copied data within the document. anyCtx is any non-NULL
 * RedisModuleCtx. The reason for requiring a context is more related to the
 * Redis Module API requiring a context for AutoMemory purposes, though in
 * this case, the pointers are already removed from AutoMemory management
 * anyway.
 *
 * This function also calls Document_Free.
 */
void Document_FreeDetached(Document *doc, RedisModuleCtx *anyCtx);
166 
/**
 * Free the document's internals (such as the field array).
 */
void Document_Free(Document *doc);
171 
// Option bits for adding a document (see AddDocumentOptions::options and
// RSAddDocumentCtx::options). The first three presumably mirror the
// REPLACE / PARTIAL / NOSAVE arguments of the add command - confirm.
#define DOCUMENT_ADD_REPLACE 0x01
#define DOCUMENT_ADD_PARTIAL 0x02
#define DOCUMENT_ADD_NOSAVE 0x04
#define DOCUMENT_ADD_CURTHREAD 0x08  // Perform operation in main thread
#define DOCUMENT_ADD_NOCREATE 0x10   // Don't create document if not exist (replace ONLY)
177 
// Forward declarations of types that RSAddDocumentCtx only holds pointers to.
struct ForwardIndex;
struct FieldIndexerData;

// State bits for RSAddDocumentCtx::stateFlags (ACTX_F_xxx).

// The context has had its forward entries merged in the merge table. We can
// skip merging its tokens
#define ACTX_F_TEXTINDEXED 0x01

// The context has had an error and should not be processed further
#define ACTX_F_ERRORED 0x02

// Non-text fields have been indexed.
#define ACTX_F_OTHERINDEXED 0x04

// The content has indexable fields
#define ACTX_F_INDEXABLES 0x08

// The content has sortable fields
#define ACTX_F_SORTABLES 0x10

// Don't block/unblock the client when indexing. This is the case when the
// operation is being done from within the context of AOF
#define ACTX_F_NOBLOCK 0x20

// Document is entirely empty (no sortables, indexables)
#define ACTX_F_EMPTY 0x40

// Forward declaration; RSAddDocumentCtx only holds a pointer to the indexer.
struct DocumentIndexer;
205 
/**
 * Context used when indexing documents.
 *
 * Created with NewAddDocumentCtx(), handed over with AddDocumentCtx_Submit(),
 * and released with AddDocumentCtx_Free().
 */
typedef struct RSAddDocumentCtx {
  struct RSAddDocumentCtx *next;  // Next context in the queue
  Document doc;                   // Document which is being indexed
  // Either the blocked client or the search context.
  // NOTE(review): which member is active presumably depends on whether the
  // context is blockable (see ACTX_F_NOBLOCK) - confirm in document.c.
  union {
    RedisModuleBlockedClient *bc;  // Client
    RedisSearchCtx *sctx;
  } client;

  // Forward index. This contains all the terms found in the document
  struct ForwardIndex *fwIdx;

  // Indexer associated with this context
  struct DocumentIndexer *indexer;

  // Sorting vector for the document. If the document has sortable fields, they
  // are added to here as well
  RSSortingVector *sv;

  // Byte offsets for highlighting. If term offsets are stored, this contains
  // the field byte offset for each term.
  RSByteOffsets *byteOffsets;
  ByteOffsetWriter offsetsWriter;

  // Information about each field in the document. This is read from the spec
  // and cached, so that we can look it up without holding the GIL
  FieldSpec *fspecs;
  RSTokenizer *tokenizer;

  // Old document data. Contains sortables
  RSDocumentMetadata *oldMd;

  // New flags to assign to the document
  RSDocumentFlags docFlags;

  // Scratch space used by per-type field preprocessors (see the source)
  struct FieldIndexerData *fdatas;
  QueryError status;     // Error message is placed here if there is an error during processing
  uint32_t totalTokens;  // Number of tokens, used for offset vector
  uint32_t specFlags;    // Cached index flags
  // Indexing options - i.e. DOCUMENT_ADD_xxx. Only 8 bits wide; all
  // DOCUMENT_ADD_xxx values defined in this header (up to 0x10) fit.
  uint8_t options;
  // Indexing state, ACTX_F_xxx. Only 8 bits wide; all ACTX_F_xxx values
  // defined in this header (up to 0x40) fit.
  uint8_t stateFlags;
  DocumentAddCompleted donecb;  // Callback to invoke when the operation is done
  void *donecbData;             // Opaque data, presumably passed to donecb - confirm
} RSAddDocumentCtx;
250 
// Evaluates to 1 when ACTX_F_NOBLOCK is not set in the context's stateFlags
// (the client may be blocked/unblocked while indexing), and to 0 otherwise.
#define AddDocumentCtx_IsBlockable(aCtx) (((aCtx)->stateFlags & ACTX_F_NOBLOCK) == 0)
252 
/**
 * Creates a new context used for adding documents. Once created, call
 * Document_AddToIndexes on it.
 *
 * - sp is the index that this document will be added to
 * - base is the document to be indexed. The context will take ownership of the
 *   document's contents (but not the structure itself). Thus, you should not
 *   call Document_Free on the document after a successful return of this
 *   function.
 * - status presumably receives the error details on failure - confirm in
 *   document.c
 *
 * NOTE(review): this comment previously also documented a parameter "client",
 * described as "a blocked client which will be used as the context for this
 * operation". The current signature has no such parameter; the client/search
 * context is presumably supplied later (see AddDocumentCtx_Submit) - confirm.
 *
 * When done, call AddDocumentCtx_Free
 */
RSAddDocumentCtx *NewAddDocumentCtx(IndexSpec *sp, Document *base, QueryError *status);
268 
/**
 * Submit the context for processing. At this point the context will take over
 * from the caller, and handle sending the replies and so on.
 *
 * - options is presumably a mask of DOCUMENT_ADD_xxx - confirm in document.c
 */
void AddDocumentCtx_Submit(RSAddDocumentCtx *aCtx, RedisSearchCtx *sctx, uint32_t options);
274 
/**
 * Indicate that processing of the current document is finished.
 */
void AddDocumentCtx_Finish(RSAddDocumentCtx *aCtx);
/**
 * This function will tokenize the document and add the resultant tokens to
 * the relevant inverted indexes. This function should be called from a
 * worker thread (see ConcurrentSearch functions).
 *
 * When this function completes, it will send the reply to the client and
 * unblock the client passed when the context was first created.
 *
 * Returns an int status (presumably REDISMODULE_OK / REDISMODULE_ERR - confirm).
 */
int Document_AddToIndexes(RSAddDocumentCtx *ctx);
289 
/**
 * Free the AddDocumentCtx. This should be done once AddToIndexes() completes,
 * or when the client is unblocked.
 */
void AddDocumentCtx_Free(RSAddDocumentCtx *aCtx);
295 
/* Evaluate an IF expression (e.g. IF "@foo == 'bar'") against a document, by getting the
 * properties from the sorting table or from the hash representation of the document.
 *
 * NOTE: This is disconnected from the document indexing flow; it loads the document and
 * discards it internally.
 *
 * - key is the key of the document to evaluate against
 * - expr is the expression text
 * - result presumably receives the truth value of the expression - confirm in document.c
 * - err presumably receives the error details on failure - confirm
 *
 * Returns REDISMODULE_ERR on failure, OK otherwise. */
int Document_EvalExpression(RedisSearchCtx *sctx, RedisModuleString *key, const char *expr,
                            int *result, QueryError *err);
305 
// Option for Redis_SaveDocument: don't create the document if it does not
// exist. Replace only.
#define REDIS_SAVEDOC_NOCREATE 0x01
/**
 * Save a document in the index. Used for returning contents in search results.
 *
 * - options is a mask of REDIS_SAVEDOC_xxx
 * - status presumably receives the error details on failure - confirm
 */
int Redis_SaveDocument(RedisSearchCtx *ctx, Document *doc, int options, QueryError *status);
312 
/* Serialize the document's fields to a redis client */
int Document_ReplyFields(RedisModuleCtx *ctx, Document *doc);
315 
/**
 * Look up a field of the document by name.
 * NOTE(review): the return value for a missing field (presumably NULL) and the
 * case sensitivity of the name comparison are not visible from this header -
 * confirm in document.c.
 */
DocumentField *Document_GetField(Document *d, const char *fieldName);
317 
// Document add functions. These share the (ctx, argv, argc) signature used by
// Redis module command handlers.
// NOTE(review): how the "Safe" variants differ from the plain ones is not
// visible from this header - confirm in document.c.
int RSAddDocumentCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc);
int RSSafeAddDocumentCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc);
int RSAddHashCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc);
int RSSafeAddHashCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc);
323 
/**
 * Add a document named `name` to the index of `sctx`, as described by `opts`
 * (see AddDocumentOptions).
 *
 * - status presumably receives the error details on failure - confirm
 *
 * Returns an int status (presumably REDISMODULE_OK / REDISMODULE_ERR - confirm).
 */
int RS_AddDocument(RedisSearchCtx *sctx, RedisModuleString *name, const AddDocumentOptions *opts,
                   QueryError *status);
326 #ifdef __cplusplus
327 }
328 #endif
329 #endif
330