/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
// Distributable under the terms of either the Apache License (Version 2.0)
// or the GNU Lesser General Public License.
/////////////////////////////////////////////////////////////////////////////

#ifndef TOKEN_H
#define TOKEN_H

#include "Attribute.h"
#include "AttributeSource.h"

namespace Lucene {

/// A Token is an occurrence of a term from the text of a field.  It consists of a term's text, the start and end
/// offset of the term in the text of the field, and a type string.
///
/// The start and end offsets permit applications to re-associate a token with its source text, e.g., to display
/// highlighted query terms in a document browser, or to show matching text fragments in a
/// <abbr title="KeyWord In Context">KWIC</abbr> display, etc.
///
/// The type is a string, assigned by a lexical analyzer (a.k.a. tokenizer), naming the lexical or syntactic class
/// that the token belongs to.  For example, an end-of-sentence marker token might be implemented with type "eos".
/// The default token type is "word".
///
/// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable length byte array. Use {@link
/// TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads
/// from the index.
///
/// Tokenizers and TokenFilters should try to re-use a Token instance when possible for best performance, by implementing
/// the {@link TokenStream#incrementToken()} API.  Failing that, to create a new Token you should first use one of
/// the constructors that start with null text.  To load the token from a char[] use
/// {@link #setTermBuffer(char[], int, int)}.  To load from a String use {@link #setTermBuffer(String)} or {@link
/// #setTermBuffer(String, int, int)}.  Alternatively you can get the Token's termBuffer by calling either {@link
/// #termBuffer()}, if you know that your text is shorter than the capacity of the termBuffer, or {@link
/// #resizeTermBuffer(int)} if there is any possibility that you may need to grow the buffer. Fill the characters
/// of your term into this buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
/// or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
/// set the length of the term text.
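///
/// For example, a minimal sketch of that buffer-filling pattern (the reusableToken variable, the source text and
/// the use of std::copy from <algorithm> are illustrative assumptions, not part of this API):
/// <pre>
/// String text = L"example"; // hypothetical source text
/// CharArray buffer(reusableToken->resizeTermBuffer((int32_t)text.length()));
/// std::copy(text.begin(), text.end(), buffer.get());
/// reusableToken->setTermLength((int32_t)text.length());
/// </pre>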
///
/// Typical Token reuse patterns:
///
/// Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(string, startOffset, endOffset[, type]);
/// </pre>
///
/// Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(string, 0, string.length(), startOffset, endOffset[, type]);
/// </pre>
///
/// Copying text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
/// </pre>
///
/// Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(buffer, start, end - start, startOffset, endOffset[, type]);
/// </pre>
///
/// Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(source->termBuffer(), 0, source->termLength(), source->startOffset(), source->endOffset()[, source->type()]);
/// </pre>
///
/// A few things to note:
/// clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but
/// should affect no one.
/// Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.  The startOffset
/// and endOffset represent the start and end offset in the source text, so be careful in adjusting them.  When caching a
/// reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.
///
/// @see Payload
class LPPAPI Token : public Attribute {
public:
    /// Constructs a Token with null text.
    Token();

    /// Constructs a Token with null text and start and end offsets.
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    Token(int32_t start, int32_t end);

    /// Constructs a Token with null text and start and end offsets plus the Token type.
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    /// @param type the lexical type of this Token
    Token(int32_t start, int32_t end, const String& type);

    /// Constructs a Token with null text and start and end offsets plus flags.
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    /// @param flags The bits to set for this token
    Token(int32_t start, int32_t end, int32_t flags);

    /// Constructs a Token with the given term text, start and end offsets.  The type defaults to "word."
    /// NOTE: for better indexing speed you should instead use the char[] termBuffer methods to set the term text.
    /// @param text term text
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    Token(const String& text, int32_t start, int32_t end);

    /// Constructs a Token with the given term text, start and end offsets and type.
    /// NOTE: for better indexing speed you should instead use the char[] termBuffer methods to set the term text.
    /// @param text term text
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    /// @param type the lexical type of this Token
    Token(const String& text, int32_t start, int32_t end, const String& type);

    /// Constructs a Token with the given term text, start and end offsets and flags.
    /// NOTE: for better indexing speed you should instead use the char[] termBuffer methods to set the term text.
    /// @param text term text
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    /// @param flags The bits to set for this token
    Token(const String& text, int32_t start, int32_t end, int32_t flags);

    /// Constructs a Token with the given term buffer (offset and length), and start and end offsets.
    Token(CharArray startTermBuffer, int32_t termBufferOffset, int32_t termBufferLength, int32_t start, int32_t end);

    virtual ~Token();

    LUCENE_CLASS(Token);

public:
    static const String& DEFAULT_TYPE();

protected:
    static const int32_t MIN_BUFFER_SIZE;

    CharArray _termBuffer;
    int32_t _termLength;
    int32_t _startOffset;
    int32_t _endOffset;
    String _type;
    int32_t flags;
    PayloadPtr payload;
    int32_t positionIncrement;

public:
    /// Set the position increment.  This determines the position of this token relative to the previous Token
    /// in a {@link TokenStream}, used in phrase searching.
    ///
    /// The default value is one.
    ///
    /// Some common uses for this are:
    ///
    /// Set it to zero to put multiple terms in the same position.  This is useful if, e.g., a word has multiple
    /// stems.  Searches for phrases including either stem will match.  In this case, all but the first stem's
    /// increment should be set to zero: the increment of the first instance should be one.  Repeating a token
    /// with an increment of zero can also be used to boost the scores of matches on that token.
    ///
    /// Set it to values greater than one to inhibit exact phrase matches.  If, for example, one does not want
    /// phrases to match across removed stop words, then one could build a stop word filter that removes stop
    /// words and also sets the increment to the number of stop words removed before each non-stop word.  Then
    /// exact phrase queries will only match when the terms occur with no intervening stop words.
    ///
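    /// For example, a minimal sketch of the zero-increment case (the synonym text and the "token" variable are
    /// hypothetical):
    /// <pre>
    /// TokenPtr synonym(newLucene<Token>());
    /// synonym->reinit(L"automobile", token->startOffset(), token->endOffset());
    /// synonym->setPositionIncrement(0); // occupies the same position as the preceding token
    /// </pre>
    ///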
    /// @param positionIncrement the distance from the prior term
    /// @see TermPositions
    virtual void setPositionIncrement(int32_t positionIncrement);

    /// Returns the position increment of this Token.
    /// @see #setPositionIncrement
    virtual int32_t getPositionIncrement();

    /// Returns the Token's term text.
    ///
    /// This method has a performance penalty because the text is stored internally in a char[].  If possible,
    /// use {@link #termBuffer()} and {@link #termLength()} directly instead.  If you really need a String, use
    /// this method, which is nothing more than a convenience call to String(token->termBuffer(), token->termLength()).
    virtual String term();

    /// Copies the contents of buffer, starting at offset for length characters, into the termBuffer array.
    /// @param buffer the buffer to copy
    /// @param offset the index in the buffer of the first character to copy
    /// @param length the number of characters to copy
    virtual void setTermBuffer(const wchar_t* buffer, int32_t offset, int32_t length);

    /// Copies the contents of buffer into the termBuffer array.
    /// @param buffer the buffer to copy
    virtual void setTermBuffer(const String& buffer);

    /// Copies the contents of buffer, starting at offset and continuing for length characters, into the termBuffer array.
    /// @param buffer the buffer to copy
    /// @param offset the index in the buffer of the first character to copy
    /// @param length the number of characters to copy
    virtual void setTermBuffer(const String& buffer, int32_t offset, int32_t length);

    /// Returns the internal termBuffer character array which you can then directly alter.  If the array is too
    /// small for your token, use {@link #resizeTermBuffer(int)} to increase it.  After altering the buffer be sure
    /// to call {@link #setTermLength} to record the number of valid characters that were placed into the termBuffer.
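    ///
    /// For example, a minimal sketch of in-place editing (the lower-casing step is purely illustrative and
    /// assumes towlower from <cwctype>):
    /// <pre>
    /// CharArray buffer(token->termBuffer());
    /// int32_t length = token->termLength();
    /// for (int32_t i = 0; i < length; ++i) {
    ///     buffer[i] = (wchar_t)towlower((wint_t)buffer[i]);
    /// }
    /// token->setTermLength(length); // record the number of valid characters after editing
    /// </pre>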
    virtual CharArray termBuffer();

    /// Optimized implementation of termBuffer.
    virtual wchar_t* termBufferArray();

    /// Grows the termBuffer to at least size newSize, preserving the existing content. Note: If the next operation is
    /// to change the contents of the term buffer, use {@link #setTermBuffer(char[], int, int)}, {@link
    /// #setTermBuffer(String)}, or {@link #setTermBuffer(String, int, int)} to optimally combine the resize with the
    /// setting of the termBuffer.
    /// @param newSize minimum size of the new termBuffer
    /// @return newly created termBuffer with length >= newSize
    virtual CharArray resizeTermBuffer(int32_t newSize);

    /// Return number of valid characters (length of the term) in the termBuffer array.
    virtual int32_t termLength();

    /// Set number of valid characters (length of the term) in the termBuffer array. Use this to truncate the termBuffer
    /// or to synchronize with external manipulation of the termBuffer.  Note: to grow the size of the array, use {@link
    /// #resizeTermBuffer(int)} first.
    /// @param length the truncated length
    virtual void setTermLength(int32_t length);

    /// Returns this Token's starting offset, the position of the first character corresponding to this token in the
    /// source text.
    ///
    /// Note that the difference between endOffset() and startOffset() may not be equal to {@link #termLength}, as the
    /// term text may have been altered by a stemmer or some other filter.
    virtual int32_t startOffset();

    /// Set the starting offset.
    /// @see #startOffset()
    virtual void setStartOffset(int32_t offset);

    /// Returns this Token's ending offset, one greater than the position of the last character corresponding to this
    /// token in the source text.  The length of the token in the source text is (endOffset - startOffset).
    virtual int32_t endOffset();

    /// Set the ending offset.
    /// @see #endOffset()
    virtual void setEndOffset(int32_t offset);

    /// Set the starting and ending offset.
    /// @see #startOffset() and #endOffset()
    virtual void setOffset(int32_t startOffset, int32_t endOffset);

    /// Returns this Token's lexical type.  Defaults to "word".
    virtual String type();

    /// Set the lexical type.
    /// @see #type()
    virtual void setType(const String& type);

    /// Get the bitset for any bits that have been set.  This is completely distinct from {@link #type()}, although
    /// they do share similar purposes.  The flags can be used to encode information about the token for use by other
    /// {@link TokenFilter}s.
    ///
    /// @return The bits
    virtual int32_t getFlags();

    /// @see #getFlags()
    virtual void setFlags(int32_t flags);

    /// Returns this Token's payload.
    virtual PayloadPtr getPayload();

    /// Sets this Token's payload.
    virtual void setPayload(const PayloadPtr& payload);

    virtual String toString();

    /// Resets the term text, payload, flags, positionIncrement, startOffset, endOffset and token type to their defaults.
    virtual void clear();

    virtual LuceneObjectPtr clone(const LuceneObjectPtr& other = LuceneObjectPtr());

    /// Makes a clone, but replaces the term buffer and start/end offset in the process.  This is more efficient than
    /// doing a full clone (and then calling setTermBuffer) because it saves a wasted copy of the old termBuffer.
    TokenPtr clone(CharArray newTermBuffer, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset);

    virtual bool equals(const LuceneObjectPtr& other);
    virtual int32_t hashCode();

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(char[], int, int)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType}
    /// @return this Token instance
    TokenPtr reinit(CharArray newTermBuffer, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset, const String& newType);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(char[], int, int)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType} on Token::DEFAULT_TYPE
    /// @return this Token instance
    TokenPtr reinit(CharArray newTermBuffer, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(String)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType}
    /// @return this Token instance
    TokenPtr reinit(const String& newTerm, int32_t newStartOffset, int32_t newEndOffset, const String& newType);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(String, int, int)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType}
    /// @return this Token instance
    TokenPtr reinit(const String& newTerm, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset, const String& newType);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(String)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType} on Token::DEFAULT_TYPE
    /// @return this Token instance
    TokenPtr reinit(const String& newTerm, int32_t newStartOffset, int32_t newEndOffset);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(String, int, int)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType} on Token::DEFAULT_TYPE
    /// @return this Token instance
    TokenPtr reinit(const String& newTerm, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset);

    /// Copy the prototype token's fields into this one. Note: Payloads are shared.
    void reinit(const TokenPtr& prototype);

    /// Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
    void reinit(const TokenPtr& prototype, const String& newTerm);

    /// Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
    void reinit(const TokenPtr& prototype, CharArray newTermBuffer, int32_t offset, int32_t length);

    virtual void copyTo(const AttributePtr& target);

    /// Convenience factory that returns Token as the implementation for the basic attributes.
    static AttributeFactoryPtr TOKEN_ATTRIBUTE_FACTORY();

protected:
    /// Constructs a Token and initializes its values.
    void ConstructToken(int32_t start, int32_t end, const String& type, int32_t flags);

    /// Allocates a buffer char[] of at least newSize, without preserving the existing content.  It is always used in
    /// places that set the content.
    /// @param newSize minimum size of the buffer
    void growTermBuffer(int32_t newSize);

    void initTermBuffer();

    /// Like clear() but doesn't clear termBuffer/text
    void clearNoTermBuffer();
};

/// Creates a TokenAttributeFactory that returns {@link Token} as the instance for the basic attributes and, for all
/// other attributes, calls the given delegate factory.
class LPPAPI TokenAttributeFactory : public AttributeFactory {
public:
    TokenAttributeFactory(const AttributeFactoryPtr& delegate);
    virtual ~TokenAttributeFactory();

    LUCENE_CLASS(TokenAttributeFactory);

protected:
    AttributeFactoryPtr delegate;

public:
    virtual AttributePtr createAttributeInstance(const String& className);
    virtual bool equals(const LuceneObjectPtr& other);
    virtual int32_t hashCode();
};

}

#endif