1 #ifndef FRAGMENTER_H
2 #define FRAGMENTER_H
3
4 #include <stdlib.h>
5 #include <stdint.h>
6 #include <sys/uio.h>
7 #include "util/array.h"
8 #include "stemmer.h"
9 #include "redisearch.h"
10 #include "stopwords.h"
11 #include "byte_offsets.h"
12
13 /**
14 *
15 ## Implementation
16 The summarization/highlight subsystem is implemented using an environment-agnostic
17 highlighter/fragmenter, and a higher level which is integrated with RediSearch and
18 the query keyword parser.
19
20 The summarization process begins by tokenizing the requested field, and splitting
21 the document into *fragments*.
22
23 When a matching token (or its stemmed variant) is found, a distance counter begins.
24 This counts the number of tokens following the matched token. If another matching
25 token occurs before the maximum token distance has been exceeded, the counter is
26 reset to 0 and the fragment is extended.
27
28 Each time a token is found in a fragment, the fragment's score increases. The
29 score increase is dependent on the base token score (this is provided as
30 input to the fragmenter), and whether this term is being repeated, or if it is
31 a new occurrence (within the same fragment). New terms get higher scores; which
32 helps eliminate forms like "I said to Abraham: Abraham, why...".
33
34 The input score for each term is calculated based on the term's overall frequency
35 in the DB (lower frequency means higher score), but this is consider out of bounds
36 for the fragmenter.
37
38 Once all fragments are scored, they are then *contextualized*. The fragment's
39 context is determined to be X amount of tokens surrounding the given matched
40 tokens. Words in between the tokens are considered as well, ensuring that every
41 fragment is more or less the same size.
42 */
43
44 typedef struct {
45 uint32_t tokPos;
46 uint32_t bytePos;
47 uint32_t termId;
48 uint32_t len;
49 float score;
50 } FragmentTerm;
51
52 typedef struct {
53 RSByteOffsetIterator *byteIter;
54 RSOffsetIterator *offsetIter;
55 RSQueryTerm *curMatchRec;
56 uint32_t curTokPos;
57 uint32_t curByteOffset;
58 FragmentTerm tmpTerm;
59 } FragmentTermIterator;
60
61 int FragmentTermIterator_Next(FragmentTermIterator *iter, FragmentTerm **termInfo);
62 void FragmentTermIterator_InitOffsets(FragmentTermIterator *iter, RSByteOffsetIterator *bytesIter,
63 RSOffsetIterator *offIter);
64
65 typedef struct {
66 // Position in current fragment (bytes)
67 uint32_t offset;
68
69 // Length of the token. This might be a stem, so not necessarily similar to termId
70 uint16_t len;
71
72 // Index into FragmentList::terms
73 uint16_t termId;
74 } TermLoc;
75
76 typedef struct Fragment {
77 const char *buf;
78 uint32_t len;
79
80 // (token-wise) position of the last matched token
81 uint32_t lastMatchPos;
82
83 // How many tokens are in this fragment
84 uint32_t totalTokens;
85
86 // How many _matched_ tokens are in this fragment
87 uint32_t numMatches;
88
89 // Inverted ranking (from 0..n) in the score array. A lower number means a higher score
90 uint32_t scoreRank;
91
92 // Position within the array of fragments
93 uint32_t fragPos;
94
95 // Score calculated from the number of matches
96 float score;
97 Array termLocs; // TermLoc
98 } Fragment;
99
100 typedef struct {
101 // Array of fragments
102 Array frags;
103
104 // Array of indexes (in frags), sorted by score
105 const Fragment **sortedFrags;
106
107 // Scratch space, used internally
108 const Fragment **scratchFrags;
109
110 // Number of fragments
111 uint32_t numFrags;
112
113 // Number of tokens since last match. Used in determining 'context ratio'
114 uint32_t numToksSinceLastMatch;
115
116 const char *doc;
117 uint32_t docLen;
118
119 // Maximum allowable distance between relevant terms to be called a 'fragment'
120 uint16_t maxDistance;
121
122 // Average word size. Used when determining context.
123 uint8_t estAvgWordSize;
124 } FragmentList;
125
FragmentList_Init(FragmentList * fragList,uint16_t maxDistance,uint8_t estWordSize)126 static inline void FragmentList_Init(FragmentList *fragList, uint16_t maxDistance,
127 uint8_t estWordSize) {
128 fragList->doc = NULL;
129 fragList->docLen = 0;
130 fragList->numFrags = 0;
131 fragList->maxDistance = maxDistance;
132 fragList->estAvgWordSize = estWordSize;
133 fragList->sortedFrags = NULL;
134 fragList->scratchFrags = NULL;
135 Array_Init(&fragList->frags);
136 }
137
FragmentList_GetNumFrags(const FragmentList * fragList)138 static inline size_t FragmentList_GetNumFrags(const FragmentList *fragList) {
139 return ARRAY_GETSIZE_AS(&fragList->frags, Fragment);
140 }
141
FragmentList_GetFragments(const FragmentList * fragList)142 static const Fragment *FragmentList_GetFragments(const FragmentList *fragList) {
143 return ARRAY_GETARRAY_AS(&fragList->frags, const Fragment *);
144 }
145
146 #define FRAGMENT_TERM(buf_, len_, score_) \
147 { .tok = buf_, .len = len_, .score = score_ }
148 /**
149 * A single term to use for searching. Used when fragmenting a buffer
150 */
151 typedef struct {
152 const char *tok;
153 size_t len;
154 float score;
155 } FragmentSearchTerm;
156
157 #define DOCLEN_NULTERM ((size_t)-1)
158
159 /**
160 * Split a document into a list of fragments.
161 * - doc is the document to split
162 * - numTerms is the number of terms to search for. The terms themselves are
163 * not searched, but each Fragment needs to have this memory made available.
164 *
165 * Returns a list of fragments.
166 */
167 void FragmentList_FragmentizeBuffer(FragmentList *fragList, const char *doc, Stemmer *stemmer,
168 StopWordList *stopwords, const FragmentSearchTerm *terms,
169 size_t numTerms);
170
171 #define FRAGMENTIZE_TOKLEN_EXACT 0x01
172 void FragmentList_FragmentizeIter(FragmentList *fragList, const char *doc, size_t docLen,
173 FragmentTermIterator *iter, int options);
174
175 typedef struct {
176 const char *openTag;
177 const char *closeTag;
178 } HighlightTags;
179
180 void FragmentList_Free(FragmentList *frags);
181
182 /** Highlight matches the entire document, returning a series of IOVs */
183 void FragmentList_HighlightWholeDocV(const FragmentList *fragList, const HighlightTags *tags,
184 Array *iovs);
185
186 /** Highlight matches the entire document, returning it as a freeable NUL-terminated buffer */
187 char *FragmentList_HighlightWholeDocS(const FragmentList *fragList, const HighlightTags *tags);
188
189 /**
190 * Return fragments by their score. The highest ranked fragment is returned fist
191 */
192 #define HIGHLIGHT_ORDER_SCORE 0x01
193
194 /**
195 * Return fragments by their order in the document. The fragment with the lowest
196 * position is returned first.
197 */
198 #define HIGHLIGHT_ORDER_POS 0x02
199
200 /**
201 * First select the highest scoring elements and then sort them by position
202 */
203 #define HIGHLIGHT_ORDER_SCOREPOS 0x03
204 /**
205 * Highlight fragments for each document.
206 *
207 * - contextSize is the size of the surrounding context, in estimated words,
208 * for each returned fragment. The function will use this as a hint.
209 *
210 * - iovBufList is an array of buffers. Each element corresponds to a fragment,
211 * and the fragments are always returned in order.
212 *
213 * - niovs If niovs is less than the number of fragments, then only the first
214 * <niov> fragments are returned.
215 *
216 * - order is one of the HIGHLIGHT_ORDER_ constants. See their documentation
217 * for more details
218 *
219 */
220 void FragmentList_HighlightFragments(FragmentList *fragList, const HighlightTags *tags,
221 size_t contextSize, Array *iovBufList, size_t niovs,
222 int order);
223
224 void FragmentList_Dump(const FragmentList *fragList);
225
226 #endif