1 #ifndef __RS_DOCUMENT_H__
2 #define __RS_DOCUMENT_H__
3 #include <pthread.h>
4 #include "redismodule.h"
5 #include "search_ctx.h"
6 #include "redisearch.h"
7 #include "tokenize.h"
8 #include "concurrent_ctx.h"
9 #include "byte_offsets.h"
10 #include "rmutil/args.h"
11 #include "query_error.h"
12 #include "json.h"
13 
14 #ifdef __cplusplus
15 extern "C" {
16 #endif
17 
18 ////////////////////////////////////////////////////////////////////////////////
19 ////////////////////////////////////////////////////////////////////////////////
20 /// General Architecture                                                     ///
21 ////////////////////////////////////////////////////////////////////////////////
22 ////////////////////////////////////////////////////////////////////////////////
23 
/**
 * To index a document, call Document_PrepareForAdd on the document itself.
 * This initializes the Document structure for indexing purposes. Once the
 * document has been prepared, acquire a new RSAddDocumentCtx by calling
 * NewAddDocumentCtx().
 *
 * Once the new context has been received, call Document_AddToIndexes(). This
 * will start tokenizing the document, and should be called in a separate
 * thread. This function will tokenize the document and send a reply back to
 * the client. You may free the RSAddDocumentCtx structure by calling
 * AddDocumentCtx_Free().
 *
 * See document.c for the internals.
 */
38 
/**
 * Tag type used for DocumentField::unionType. Every enumerator occupies a
 * distinct bit, so the values can also be tested with a mask.
 */
typedef enum {
  FLD_VAR_T_RMS = 1 << 0,   // 0x01
  FLD_VAR_T_CSTR = 1 << 1,  // 0x02
  FLD_VAR_T_NUM = 1 << 2,   // 0x04
  FLD_VAR_T_GEO = 1 << 3,   // 0x08
  FLD_VAR_T_ARRAY = 1 << 4  // 0x10
} FieldVarType;
47 
/**
 * A single field value attached to a Document.
 *
 * The value lives in an anonymous union and unionType records which member is
 * in use. The pairing below is inferred from the member and enumerator names
 * (TODO confirm against document.c):
 *   FLD_VAR_T_RMS   -> text
 *   FLD_VAR_T_CSTR  -> strval / strlen
 *   FLD_VAR_T_NUM   -> numval
 *   FLD_VAR_T_GEO   -> lon / lat
 *   FLD_VAR_T_ARRAY -> multiVal / arrayLen
 */
typedef struct DocumentField{
  const char *name;  // Can either be char or RMString
  const char *path;  // NOTE(review): presumably the path the value was loaded from - confirm
  union {
    // TODO: consider removing RMS altogether
    RedisModuleString *text;
    // Raw character buffer together with its length
    struct {
      char *strval;
      size_t strlen;
    };
    // Numeric value
    double numval;
    // Geographic coordinate: longitude and latitude
    struct {
      double lon, lat;
    };
    // Array of C strings
    struct {
      char **multiVal;
      size_t arrayLen; // for multiVal TODO: use arr.h
    };
  };
  FieldVarType unionType;  // Which union member above is populated
  FieldType indexAs;       // NOTE(review): presumably the field type(s) to index this value as - confirm
} DocumentField;
70 
/**
 * In-memory representation of a document that is about to be indexed: its key,
 * its fields, and per-document metadata (language, score, payload).
 */
typedef struct Document {
  RedisModuleString *docKey;  // Key of the document
  DocumentField *fields;      // Array of fields; always owned by the document
  uint32_t numFields;         // Number of entries in `fields`
  RSLanguage language;        // Language of the document
  float score;                // Document score
  t_docId docId;              // Document id; NOTE(review): presumably assigned during indexing - confirm
  const char *payload;        // Optional payload bytes (see Document_SetPayload)
  size_t payloadSize;         // Length of `payload` in bytes
  uint32_t flags;             // NOTE(review): presumably DOCUMENT_F_xxx ownership bits - confirm
  DocumentType type;          // Kind of document (passed to Document_Init)
} Document;
83 
/**
 * Document should decrement the reference count to the contained strings. Used
 * when the user does not want to retain their own reference to them. It
 * effectively "steals" a reference.
 *
 * This only applies to _values_; not keys. Used internally by the C API
 */
#define DOCUMENT_F_OWNREFS 0x01

/**
 * Indicates that the document owns a reference to the field contents,
 * the language string, and the payload.
 *
 * The document always owns the field array, though.
 */
#define DOCUMENT_F_OWNSTRINGS 0x02

// Reserved names carrying document metadata.
// NOTE(review): presumably the field names under which key, score, payload and
// language are stored or looked up - confirm against the users of these macros.
#define UNDERSCORE_KEY "__key"
#define UNDERSCORE_SCORE "__score"
#define UNDERSCORE_PAYLOAD "__payload"
#define UNDERSCORE_LANGUAGE "__language"
105 
// Forward declaration; the full definition appears further down in this header.
struct RSAddDocumentCtx;

/**
 * Completion callback type for a document-add operation. It receives the add
 * context, a Redis module context and an opaque pointer.
 * NOTE(review): the opaque pointer is presumably RSAddDocumentCtx::donecbData - confirm.
 */
typedef void (*DocumentAddCompleted)(struct RSAddDocumentCtx *, RedisModuleCtx *, void *);

/**
 * Caller-supplied options describing how a document should be added.
 */
typedef struct {
  uint32_t options;            // DOCUMENT_ADD_XXX
  RSLanguage language;         // Language document should be indexed as
  RedisModuleString *payload;  // Arbitrary payload provided on return with WITHPAYLOADS
  arrayof(RedisModuleString *) fieldsArray;  // Field, Value, Field Value
  size_t numFieldElems;                      // Number of elements
  double score;                              // Score of the document
  const char *evalExpr;         // Only add the document if this expression evaluates to true.
  DocumentAddCompleted donecb;  // Callback to invoke when operation is done

  RedisModuleString *keyStr;       // key name for HSET
  RedisModuleString *scoreStr;     // score string for HSET
  RedisModuleString *languageStr;  // Language string for HSET
} AddDocumentOptions;
124 
/**
 * Add a field whose value is a RedisModuleString.
 * NOTE(review): typemask is presumably the set of field types the value should
 * be indexed as (see DocumentField::indexAs) - confirm in document.c. The same
 * applies to the typemask parameter of the functions below.
 */
void Document_AddField(Document *d, const char *fieldname, RedisModuleString *fieldval,
                       uint32_t typemask);

/**
 * Add a simple char buffer value. This creates an RMString internally, so this
 * must be used with F_OWNSTRINGS
 */
void Document_AddFieldC(Document *d, const char *fieldname, const char *val, size_t vallen,
                        uint32_t typemask);

/**
 * Add a field holding a numeric value.
 */
void Document_AddNumericField(Document *d, const char *fieldname,
                              double val, uint32_t typemask);

/**
 * Add a field holding a longitude/latitude pair.
 */
void Document_AddGeoField(Document *d, const char *fieldname,
                          double lon, double lat, uint32_t typemask);
146 
/**
 * Initialize document structure with the relevant fields. numFields will allocate
 * the fields array, but you must still actually copy the data along.
 *
 * NOTE(review): the signature below has no numFields parameter, so the sentence
 * above about allocating the fields array looks stale - confirm against
 * document.c.
 *
 * Note that this function assumes that the pointers passed in will remain valid
 * throughout the lifetime of the document. If you need to make independent copies
 * of the data within the document, call Document_Detach on the document (after
 * calling this function).
 *
 * NOTE(review): Document_Detach is not declared in this header - confirm the
 * intended function (possibly Document_MakeStringsOwner).
 */
void Document_Init(Document *doc, RedisModuleString *docKey, double score, RSLanguage lang, DocumentType type);

/**
 * Set the document's payload to the n bytes at `payload`.
 * NOTE(review): whether the bytes are copied or only referenced is not visible
 * from this header - confirm before freeing the buffer.
 */
void Document_SetPayload(Document *doc, const void *payload, size_t n);

/**
 * Make the document the owner of the strings it contains
 */
void Document_MakeStringsOwner(Document *doc);

/**
 * Make the document object steal references to the document's strings.
 */
void Document_MakeRefOwner(Document *doc);

/**
 * Clear the document of its fields. This does not free the document
 * or clear its name
 */
void Document_Clear(Document *doc);
174 
/**
 * Load all fields specified in the schema to the document. Note that
 * the document must then be freed using Document_Free().
 *
 * The document must already have the docKey set.
 *
 * Two variants are provided, one for hash documents and one for JSON documents.
 * NOTE(review): the int result is presumably REDISMODULE_OK / REDISMODULE_ERR -
 * confirm in document.c.
 */
int Document_LoadSchemaFieldHash(Document *doc, RedisSearchCtx *sctx);
int Document_LoadSchemaFieldJson(Document *doc, RedisSearchCtx *sctx);

/**
 * Load all the fields into the document.
 */
int Document_LoadAllFields(Document *doc, RedisModuleCtx *ctx);

/**
 * Populate the document from an argument vector of nargs strings.
 * NOTE(review): presumably alternating field name / value entries, as the
 * name suggests - confirm.
 */
void Document_LoadPairwiseArgs(Document *doc, RedisModuleString **args, size_t nargs);

/**
 * Populate the document from the given add options.
 * NOTE(review): presumably uses opts->fieldsArray and the HSET-related strings -
 * confirm in document.c.
 */
void Document_LoadHSetParams(Document *d, const AddDocumentOptions *opts);
191 
/**
 * Print contents of document to screen
 */
void Document_Dump(const Document *doc);  // LCOV_EXCL_LINE debug

/**
 * Free any copied data within the document. anyCtx is any non-NULL
 * RedisModuleCtx. The reason for requiring a context is more related to the
 * Redis Module API requiring a context for AutoMemory purposes, though in
 * this case, the pointers are already removed from AutoMemory management
 * anyway.
 *
 * This function also calls Document_Free
 */
void Document_FreeDetached(Document *doc, RedisModuleCtx *anyCtx);

/**
 * Free the document's internals (like the field array).
 */
void Document_Free(Document *doc);
211 
// Indexing options (DOCUMENT_ADD_xxx). These are the bits carried in
// AddDocumentOptions::options and RSAddDocumentCtx::options.
// NOTE(review): REPLACE / PARTIAL / NOSAVE are not described in this header;
// see document.c for their exact semantics.
#define DOCUMENT_ADD_REPLACE 0x01
#define DOCUMENT_ADD_PARTIAL 0x02
#define DOCUMENT_ADD_NOSAVE 0x04
#define DOCUMENT_ADD_NOCREATE 0x08  // Don't create document if not exist (replace ONLY)

// Forward declarations; only pointers to these types are used in this header.
struct ForwardIndex;
struct FieldIndexerData;

// Indexing state flags (ACTX_F_xxx), stored in RSAddDocumentCtx::stateFlags.
// That member is a uint8_t, so 0x80 is the last bit available.

// The context has had its forward entries merged in the merge table. We can
// skip merging its tokens
#define ACTX_F_TEXTINDEXED 0x01

// The context has had an error and should not be processed further
#define ACTX_F_ERRORED 0x02

// Non-text fields have been indexed.
#define ACTX_F_OTHERINDEXED 0x04

// The content has indexable fields
#define ACTX_F_INDEXABLES 0x08

// The content has sortable fields
#define ACTX_F_SORTABLES 0x10

// Don't block/unblock the client when indexing. This is the case when the
// operation is being done from within the context of AOF
#define ACTX_F_NOBLOCK 0x20

// Document is entirely empty (no sortables, indexables)
#define ACTX_F_EMPTY 0x40

// NOTE(review): undocumented; presumably tells the context not to free its
// document when the context itself is freed - confirm in document.c.
#define ACTX_F_NOFREEDOC 0x80

struct DocumentIndexer;
246 
/**
 * Context used when indexing documents.
 *
 * Created with NewAddDocumentCtx() and released with AddDocumentCtx_Free().
 * Contexts can be chained through `next`.
 */
typedef struct RSAddDocumentCtx {
  struct RSAddDocumentCtx *next;  // Next context in the queue
  Document *doc;                   // Document which is being indexed
  // NOTE(review): which member is active presumably depends on whether the
  // context is blockable (see ACTX_F_NOBLOCK) - confirm in document.c.
  union {
    RedisModuleBlockedClient *bc;  // Client
    RedisSearchCtx *sctx;
  } client;

  // Target index, plus its name and id.
  // NOTE(review): name and id are presumably kept so the spec can be looked up
  // and validated again later - confirm.
  IndexSpec *spec;
  char *specName;
  size_t specNameLen;
  uint64_t specId;

  // Forward index. This contains all the terms found in the document
  struct ForwardIndex *fwIdx;

  // Indexer associated with this context
  struct DocumentIndexer *indexer;

  // Sorting vector for the document. If the document has sortable fields, they
  // are added to here as well
  RSSortingVector *sv;

  // Byte offsets for highlighting. If term offsets are stored, this contains
  // the field byte offset for each term.
  RSByteOffsets *byteOffsets;
  ByteOffsetWriter offsetsWriter;

  // Information about each field in the document. This is read from the spec
  // and cached, so that we can look it up without holding the GIL
  FieldSpec *fspecs;
  RSTokenizer *tokenizer;

  // Old document data. Contains sortables
  RSDocumentMetadata *oldMd;

  // New flags to assign to the document
  RSDocumentFlags docFlags;

  // Scratch space used by per-type field preprocessors (see the source)
  struct FieldIndexerData *fdatas;
  QueryError status;     // Error message is placed here if there is an error during processing
  uint32_t totalTokens;  // Number of tokens, used for offset vector
  uint32_t specFlags;    // Cached index flags
  uint8_t options;       // Indexing options - i.e. DOCUMENT_ADD_xxx
  uint8_t stateFlags;    // Indexing state, ACTX_F_xxx
  DocumentAddCompleted donecb;  // Completion callback
  void *donecbData;             // NOTE(review): presumably the opaque pointer handed to donecb - confirm
} RSAddDocumentCtx;
296 
/** Evaluates to 1 when the ACTX_F_NOBLOCK bit is clear in aCtx->stateFlags, 0 otherwise. */
#define AddDocumentCtx_IsBlockable(aCtx) (((aCtx)->stateFlags & ACTX_F_NOBLOCK) == 0)
298 
/**
 * Creates a new context used for adding documents. Once created, call
 * Document_AddToIndexes on it.
 *
 * - client is a blocked client which will be used as the context for this
 *   operation.
 *   NOTE(review): the signature below has no client parameter, so this item
 *   looks stale - confirm.
 * - sp is the index that this document will be added to
 * - base is the document to be indexed. The context will take ownership of the
 *   document's contents (but not the structure itself). Thus, you should not
 *   call Document_Free on the document after a successful return of this
 *   function.
 * - status: NOTE(review): presumably receives the error details when creation
 *   fails - confirm.
 *
 * When done, call AddDocumentCtx_Free
 */
RSAddDocumentCtx *NewAddDocumentCtx(IndexSpec *sp, Document *base, QueryError *status);

/**
 * At this point the context will take over from the caller, and handle sending
 * the replies and so on.
 *
 * options: NOTE(review): presumably DOCUMENT_ADD_xxx bits. RSAddDocumentCtx
 * stores its options in a uint8_t, so only the low 8 bits can be kept there.
 */
void AddDocumentCtx_Submit(RSAddDocumentCtx *aCtx, RedisSearchCtx *sctx, uint32_t options);

/**
 * Indicate that processing is finished on the current document
 */
void AddDocumentCtx_Finish(RSAddDocumentCtx *aCtx);

/**
 * This function will tokenize the document and add the resultant tokens to
 * the relevant inverted indexes. This function should be called from a
 * worker thread (see ConcurrentSearch functions).
 *
 * When this function completes, it will send the reply to the client and
 * unblock the client passed when the context was first created.
 */
int Document_AddToIndexes(RSAddDocumentCtx *ctx);

/**
 * Free the AddDocumentCtx. Should be done once AddToIndexes() completes; or
 * when the client is unblocked.
 */
void AddDocumentCtx_Free(RSAddDocumentCtx *aCtx);
341 
/* Evaluate an IF expression (e.g. IF "@foo == 'bar'") against a document, by getting the
 * properties from the sorting table or from the hash representation of the document.
 *
 * NOTE: This is disconnected from the document indexing flow; it loads the document and
 * discards it internally.
 *
 * The outcome of the expression is written to *result.
 * NOTE(review): err presumably receives the error details on failure - confirm.
 *
 * Returns REDISMODULE_ERR on failure, OK otherwise. */
int Document_EvalExpression(RedisSearchCtx *sctx, RedisModuleString *key, const char *expr,
                            int *result, QueryError *err);

// Don't create document if it does not exist. Replace only
#define REDIS_SAVEDOC_NOCREATE 0x01
/**
 * Save a document in the index. Used for returning contents in search results.
 */
int Redis_SaveDocument(RedisSearchCtx *ctx, const AddDocumentOptions *opts, QueryError *status);
358 
/* Serialize the document's fields to a redis client */
int Document_ReplyAllFields(RedisModuleCtx *ctx, IndexSpec *spec, RedisModuleString *id);

/* Look up a field of the document by name.
 * NOTE(review): presumably returns NULL when no such field exists - confirm. */
DocumentField *Document_GetField(Document *d, const char *fieldName);

/* return value as c string */
const char *DocumentField_GetValueCStr(const DocumentField *df, size_t *len);

// Document add functions:
// These four share the (ctx, argv, argc) parameter list of a Redis module
// command handler.
int RSAddDocumentCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc);
int RSSafeAddDocumentCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc);
int RSAddHashCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc);
int RSSafeAddHashCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc);

// Add the document called `name` using the given options.
// NOTE(review): status presumably receives the error details on failure - confirm.
int RS_AddDocument(RedisSearchCtx *sctx, RedisModuleString *name, const AddDocumentOptions *opts,
                   QueryError *status);
375 
/**
 * Takes no arguments and returns nothing.
 * NOTE(review): presumably releases module-global strings used by the
 * document-add commands - confirm in document.c.
 *
 * Declared with (void) so that this is a real prototype in C: an empty
 * parameter list would leave the arguments unchecked by the compiler.
 */
void freeGlobalAddStrings(void);
377 
378 #ifdef __cplusplus
379 }
380 #endif
381 #endif
382