1 #ifndef FRAGMENTER_H
2 #define FRAGMENTER_H
3 
4 #include <stdlib.h>
5 #include <stdint.h>
6 #include <sys/uio.h>
7 #include "util/array.h"
8 #include "stemmer.h"
9 #include "redisearch.h"
10 #include "stopwords.h"
11 #include "byte_offsets.h"
12 
13 /**
14  *
15  ## Implementation
16 The summarization/highlight subsystem is implemented using an environment-agnostic
17 highlighter/fragmenter, and a higher level which is integrated with RediSearch and
18 the query keyword parser.
19 
20 The summarization process begins by tokenizing the requested field, and splitting
21 the document into *fragments*.
22 
23 When a matching token (or its stemmed variant) is found, a distance counter begins.
24 This counts the number of tokens following the matched token. If another matching
25 token occurs before the maximum token distance has been exceeded, the counter is
26 reset to 0 and the fragment is extended.
27 
28 Each time a token is found in a fragment, the fragment's score increases. The
29 score increase is dependent on the base token score (this is provided as
30 input to the fragmenter), and whether this term is being repeated, or if it is
31 a new occurrence (within the same fragment). New terms get higher scores; which
32 helps eliminate forms like "I said to Abraham: Abraham, why...".
33 
34 The input score for each term is calculated based on the term's overall frequency
35 in the DB (lower frequency means higher score), but this is consider out of bounds
36 for the fragmenter.
37 
38 Once all fragments are scored, they are then *contextualized*. The fragment's
39 context is determined to be X amount of tokens surrounding the given matched
40 tokens. Words in between the tokens are considered as well, ensuring that every
41 fragment is more or less the same size.
42  */
43 
44 typedef struct {
45   uint32_t tokPos;
46   uint32_t bytePos;
47   uint32_t termId;
48   uint32_t len;
49   float score;
50 } FragmentTerm;
51 
52 typedef struct {
53   RSByteOffsetIterator *byteIter;
54   RSOffsetIterator *offsetIter;
55   RSQueryTerm *curMatchRec;
56   uint32_t curTokPos;
57   uint32_t curByteOffset;
58   FragmentTerm tmpTerm;
59 } FragmentTermIterator;
60 
61 int FragmentTermIterator_Next(FragmentTermIterator *iter, FragmentTerm **termInfo);
62 void FragmentTermIterator_InitOffsets(FragmentTermIterator *iter, RSByteOffsetIterator *bytesIter,
63                                       RSOffsetIterator *offIter);
64 
65 typedef struct {
66   // Position in current fragment (bytes)
67   uint32_t offset;
68 
69   // Length of the token. This might be a stem, so not necessarily similar to termId
70   uint16_t len;
71 
72   // Index into FragmentList::terms
73   uint16_t termId;
74 } TermLoc;
75 
76 typedef struct Fragment {
77   const char *buf;
78   uint32_t len;
79 
80   // (token-wise) position of the last matched token
81   uint32_t lastMatchPos;
82 
83   // How many tokens are in this fragment
84   uint32_t totalTokens;
85 
86   // How many _matched_ tokens are in this fragment
87   uint32_t numMatches;
88 
89   // Inverted ranking (from 0..n) in the score array. A lower number means a higher score
90   uint32_t scoreRank;
91 
92   // Position within the array of fragments
93   uint32_t fragPos;
94 
95   // Score calculated from the number of matches
96   float score;
97   Array termLocs;  // TermLoc
98 } Fragment;
99 
100 typedef struct {
101   // Array of fragments
102   Array frags;
103 
104   // Array of indexes (in frags), sorted by score
105   const Fragment **sortedFrags;
106 
107   // Scratch space, used internally
108   const Fragment **scratchFrags;
109 
110   // Number of fragments
111   uint32_t numFrags;
112 
113   // Number of tokens since last match. Used in determining 'context ratio'
114   uint32_t numToksSinceLastMatch;
115 
116   const char *doc;
117   uint32_t docLen;
118 
119   // Maximum allowable distance between relevant terms to be called a 'fragment'
120   uint16_t maxDistance;
121 
122   // Average word size. Used when determining context.
123   uint8_t estAvgWordSize;
124 } FragmentList;
125 
FragmentList_Init(FragmentList * fragList,uint16_t maxDistance,uint8_t estWordSize)126 static inline void FragmentList_Init(FragmentList *fragList, uint16_t maxDistance,
127                                      uint8_t estWordSize) {
128   fragList->doc = NULL;
129   fragList->docLen = 0;
130   fragList->numFrags = 0;
131   fragList->maxDistance = maxDistance;
132   fragList->estAvgWordSize = estWordSize;
133   fragList->sortedFrags = NULL;
134   fragList->scratchFrags = NULL;
135   Array_Init(&fragList->frags);
136 }
137 
FragmentList_GetNumFrags(const FragmentList * fragList)138 static inline size_t FragmentList_GetNumFrags(const FragmentList *fragList) {
139   return ARRAY_GETSIZE_AS(&fragList->frags, Fragment);
140 }
141 
FragmentList_GetFragments(const FragmentList * fragList)142 static const Fragment *FragmentList_GetFragments(const FragmentList *fragList) {
143   return ARRAY_GETARRAY_AS(&fragList->frags, const Fragment *);
144 }
145 
146 #define FRAGMENT_TERM(buf_, len_, score_) \
147   { .tok = buf_, .len = len_, .score = score_ }
148 /**
149  * A single term to use for searching. Used when fragmenting a buffer
150  */
151 typedef struct {
152   const char *tok;
153   size_t len;
154   float score;
155 } FragmentSearchTerm;
156 
157 #define DOCLEN_NULTERM ((size_t)-1)
158 
159 /**
160  * Split a document into a list of fragments.
161  * - doc is the document to split
162  * - numTerms is the number of terms to search for. The terms themselves are
163  *   not searched, but each Fragment needs to have this memory made available.
164  *
165  * Returns a list of fragments.
166  */
167 void FragmentList_FragmentizeBuffer(FragmentList *fragList, const char *doc, Stemmer *stemmer,
168                                     StopWordList *stopwords, const FragmentSearchTerm *terms,
169                                     size_t numTerms);
170 
171 #define FRAGMENTIZE_TOKLEN_EXACT 0x01
172 void FragmentList_FragmentizeIter(FragmentList *fragList, const char *doc, size_t docLen,
173                                   FragmentTermIterator *iter, int options);
174 
175 typedef struct {
176   const char *openTag;
177   const char *closeTag;
178 } HighlightTags;
179 
180 void FragmentList_Free(FragmentList *frags);
181 
182 /** Highlight matches the entire document, returning a series of IOVs */
183 void FragmentList_HighlightWholeDocV(const FragmentList *fragList, const HighlightTags *tags,
184                                      Array *iovs);
185 
186 /** Highlight matches the entire document, returning it as a freeable NUL-terminated buffer */
187 char *FragmentList_HighlightWholeDocS(const FragmentList *fragList, const HighlightTags *tags);
188 
189 /**
190  * Return fragments by their score. The highest ranked fragment is returned fist
191  */
192 #define HIGHLIGHT_ORDER_SCORE 0x01
193 
194 /**
195  * Return fragments by their order in the document. The fragment with the lowest
196  * position is returned first.
197  */
198 #define HIGHLIGHT_ORDER_POS 0x02
199 
200 /**
201  * First select the highest scoring elements and then sort them by position
202  */
203 #define HIGHLIGHT_ORDER_SCOREPOS 0x03
204 /**
205  * Highlight fragments for each document.
206  *
207  * - contextSize is the size of the surrounding context, in estimated words,
208  * for each returned fragment. The function will use this as a hint.
209  *
210  * - iovBufList is an array of buffers. Each element corresponds to a fragment,
211  * and the fragments are always returned in order.
212  *
213  * - niovs If niovs is less than the number of fragments, then only the first
214  * <niov> fragments are returned.
215  *
216  * - order is one of the HIGHLIGHT_ORDER_ constants. See their documentation
217  * for more details
218  *
219  */
220 void FragmentList_HighlightFragments(FragmentList *fragList, const HighlightTags *tags,
221                                      size_t contextSize, Array *iovBufList, size_t niovs,
222                                      int order);
223 
224 void FragmentList_Dump(const FragmentList *fragList);
225 
226 #endif