/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
// Distributable under the terms of either the Apache License (Version 2.0)
// or the GNU Lesser General Public License.
/////////////////////////////////////////////////////////////////////////////

#ifndef TOKEN_H
#define TOKEN_H

#include "Attribute.h"
#include "AttributeSource.h"

namespace Lucene {

/// A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end
/// offset of the term in the text of the field and a type string.
///
/// The start and end offsets permit applications to re-associate a token with its source text, e.g., to display
/// highlighted query terms in a document browser, or to show matching text fragments in a
/// <abbr title="KeyWord In Context">KWIC</abbr> display, etc.
///
/// The type is a string, assigned by a lexical analyzer (a.k.a. tokenizer), naming the lexical or syntactic class
/// that the token belongs to. For example an end of sentence marker token might be implemented with type "eos".
/// The default token type is "word".
///
/// A Token can optionally have metadata (a.k.a. Payload) in the form of a variable length byte array. Use {@link
/// TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads
/// from the index.
///
/// Tokenizers and TokenFilters should try to re-use a Token instance when possible for best performance, by implementing
/// the {@link TokenStream#incrementToken()} API. Failing that, to create a new Token you should first use one of
/// the constructors that start with null text. To load the token from a char[] use
/// {@link #setTermBuffer(char[], int, int)}. To load from a String use {@link #setTermBuffer(String)} or {@link
/// #setTermBuffer(String, int, int)}. Alternatively you can get the Token's termBuffer by calling either {@link
/// #termBuffer()}, if you know that your text is shorter than the capacity of the termBuffer or {@link
/// #resizeTermBuffer(int)}, if there is any possibility that you may need to grow the buffer. Fill in the characters
/// of your term into this buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string,
/// or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to
/// set the length of the term text.
///
/// Typical Token reuse patterns:
///
/// Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(string, startOffset, endOffset[, type]);
/// </pre>
///
/// Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(string, 0, string.length(), startOffset, endOffset[, type]);
/// </pre>
///
/// Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
/// </pre>
///
/// Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(buffer, start, end - start, startOffset, endOffset[, type]);
/// </pre>
///
/// Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):
/// <pre>
/// return reusableToken->reinit(source->termBuffer(), 0, source->termLength(), source->startOffset(), source->endOffset()[, source->type()]);
/// </pre>
///
/// A few things to note:
/// clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but
/// should affect no one.
/// Because TokenStreams can be chained, one cannot assume that the Token's current type is correct. The startOffset
/// and endOffset represent the start and end offset in the source text, so be careful in adjusting them. When caching a
/// reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.
///
/// @see Payload
class LPPAPI Token : public Attribute {
public:
    /// Constructs a Token with null text.
    Token();

    /// Constructs a Token with null text and start and end offsets.
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    Token(int32_t start, int32_t end);

    /// Constructs a Token with null text and start and end offsets plus the Token type.
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    /// @param type the lexical type of this Token
    Token(int32_t start, int32_t end, const String& type);

    /// Constructs a Token with null text and start and end offsets plus flags.
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    /// @param flags The bits to set for this token
    Token(int32_t start, int32_t end, int32_t flags);

    /// Constructs a Token with the given term text, start and end offsets. The type defaults to "word."
    /// NOTE: for better indexing speed you should instead use the char[] termBuffer methods to set the term text.
    /// @param text term text
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    Token(const String& text, int32_t start, int32_t end);

    /// Constructs a Token with the given term text, start and end offsets and type.
    /// NOTE: for better indexing speed you should instead use the char[] termBuffer methods to set the term text.
    /// @param text term text
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    /// @param type the lexical type of this Token
    Token(const String& text, int32_t start, int32_t end, const String& type);

    /// Constructs a Token with the given term text, start and end offsets and flags.
    /// NOTE: for better indexing speed you should instead use the char[] termBuffer methods to set the term text.
    /// @param text term text
    /// @param start start offset in the source text
    /// @param end end offset in the source text
    /// @param flags The bits to set for this token
    Token(const String& text, int32_t start, int32_t end, int32_t flags);

    /// Constructs a Token with the given term buffer (offset and length), start and end offsets
    Token(CharArray startTermBuffer, int32_t termBufferOffset, int32_t termBufferLength, int32_t start, int32_t end);

    virtual ~Token();

    LUCENE_CLASS(Token);

public:
    /// The default lexical type ("word"); see {@link #type()}.
    static const String& DEFAULT_TYPE();

protected:
    /// Minimum capacity allocated for the term buffer.
    static const int32_t MIN_BUFFER_SIZE;

    CharArray _termBuffer;     // term text storage; may hold more capacity than _termLength valid chars
    int32_t _termLength;       // number of valid characters in _termBuffer
    int32_t _startOffset;      // start offset in the source text
    int32_t _endOffset;        // end offset in the source text
    String _type;              // lexical type, defaults to DEFAULT_TYPE ("word")
    int32_t flags;             // bitset usable by TokenFilters; see getFlags()
    PayloadPtr payload;        // optional per-token metadata; see getPayload()
    int32_t positionIncrement; // distance from the prior term; see setPositionIncrement()

public:
    /// Set the position increment. This determines the position of this token relative to the previous Token
    /// in a {@link TokenStream}, used in phrase searching.
    ///
    /// The default value is one.
    ///
    /// Some common uses for this are:
    ///
    /// Set it to zero to put multiple terms in the same position. This is useful if, e.g., a word has multiple
    /// stems. Searches for phrases including either stem will match. In this case, all but the first stem's
    /// increment should be set to zero: the increment of the first instance should be one. Repeating a token
    /// with an increment of zero can also be used to boost the scores of matches on that token.
    ///
    /// Set it to values greater than one to inhibit exact phrase matches. If, for example, one does not want
    /// phrases to match across removed stop words, then one could build a stop word filter that removes stop
    /// words and also sets the increment to the number of stop words removed before each non-stop word. Then
    /// exact phrase queries will only match when the terms occur with no intervening stop words.
    ///
    /// @param positionIncrement the distance from the prior term
    /// @see TermPositions
    virtual void setPositionIncrement(int32_t positionIncrement);

    /// Returns the position increment of this Token.
    /// @see #setPositionIncrement
    virtual int32_t getPositionIncrement();

    /// Returns the Token's term text.
    ///
    /// This method has a performance penalty because the text is stored internally in a char[]. If possible,
    /// use {@link #termBuffer()} and {@link #termLength()} directly instead. If you really need a String, use
    /// this method, which is nothing more than a convenience call to String(token->termBuffer(), token->termLength())
    virtual String term();

    /// Copies the contents of buffer, starting at offset for length characters, into the termBuffer array.
    /// @param buffer the buffer to copy
    /// @param offset the index in the buffer of the first character to copy
    /// @param length the number of characters to copy
    virtual void setTermBuffer(const wchar_t* buffer, int32_t offset, int32_t length);

    /// Copies the contents of buffer into the termBuffer array.
    /// @param buffer the buffer to copy
    virtual void setTermBuffer(const String& buffer);

    /// Copies the contents of buffer, starting at offset and continuing for length characters, into the termBuffer array.
    /// @param buffer the buffer to copy
    /// @param offset the index in the buffer of the first character to copy
    /// @param length the number of characters to copy
    virtual void setTermBuffer(const String& buffer, int32_t offset, int32_t length);

    /// Returns the internal termBuffer character array which you can then directly alter. If the array is too
    /// small for your token, use {@link #resizeTermBuffer(int)} to increase it. After altering the buffer be sure
    /// to call {@link #setTermLength} to record the number of valid characters that were placed into the termBuffer.
    virtual CharArray termBuffer();

    /// Optimized implementation of termBuffer.
    virtual wchar_t* termBufferArray();

    /// Grows the termBuffer to at least size newSize, preserving the existing content. Note: If the next operation is
    /// to change the contents of the term buffer use {@link #setTermBuffer(char[], int, int)}, {@link
    /// #setTermBuffer(String)}, or {@link #setTermBuffer(String, int, int)} to optimally combine the resize with the
    /// setting of the termBuffer.
    /// @param newSize minimum size of the new termBuffer
    /// @return newly created termBuffer with length >= newSize
    virtual CharArray resizeTermBuffer(int32_t newSize);

    /// Return number of valid characters (length of the term) in the termBuffer array.
    virtual int32_t termLength();

    /// Set number of valid characters (length of the term) in the termBuffer array. Use this to truncate the termBuffer
    /// or to synchronize with external manipulation of the termBuffer. Note: to grow the size of the array, use {@link
    /// #resizeTermBuffer(int)} first.
    /// @param length the truncated length
    virtual void setTermLength(int32_t length);

    /// Returns this Token's starting offset, the position of the first character corresponding to this token in the
    /// source text.
    ///
    /// Note that the difference between endOffset() and startOffset() may not be equal to {@link #termLength}, as the
    /// term text may have been altered by a stemmer or some other filter.
    virtual int32_t startOffset();

    /// Set the starting offset.
    /// @see #startOffset()
    virtual void setStartOffset(int32_t offset);

    /// Returns this Token's ending offset, one greater than the position of the last character corresponding to this
    /// token in the source text. The length of the token in the source text is (endOffset - startOffset).
    virtual int32_t endOffset();

    /// Set the ending offset.
    /// @see #endOffset()
    virtual void setEndOffset(int32_t offset);

    /// Set the starting and ending offset.
    /// @see #startOffset() and #endOffset()
    virtual void setOffset(int32_t startOffset, int32_t endOffset);

    /// Returns this Token's lexical type. Defaults to "word".
    virtual String type();

    /// Set the lexical type.
    /// @see #type()
    virtual void setType(const String& type);

    /// Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although
    /// they do share similar purposes. The flags can be used to encode information about the token for use by other
    /// {@link TokenFilter}s.
    ///
    /// @return The bits
    virtual int32_t getFlags();

    /// @see #getFlags()
    virtual void setFlags(int32_t flags);

    /// Returns this Token's payload.
    virtual PayloadPtr getPayload();

    /// Sets this Token's payload.
    virtual void setPayload(const PayloadPtr& payload);

    /// Returns a string representation of this Token for debugging.
    virtual String toString();

    /// Resets the term text, payload, flags, and positionIncrement, startOffset, endOffset and token type to default.
    virtual void clear();

    /// Returns a deep copy of this Token.
    virtual LuceneObjectPtr clone(const LuceneObjectPtr& other = LuceneObjectPtr());

    /// Makes a clone, but replaces the term buffer and start/end offset in the process. This is more efficient than
    /// doing a full clone (and then calling setTermBuffer) because it saves a wasted copy of the old termBuffer.
    TokenPtr clone(CharArray newTermBuffer, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset);

    virtual bool equals(const LuceneObjectPtr& other);
    virtual int32_t hashCode();

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(char[], int, int)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType}
    /// @return this Token instance
    TokenPtr reinit(CharArray newTermBuffer, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset, const String& newType);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(char[], int, int)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType} on Token::DEFAULT_TYPE
    /// @return this Token instance
    TokenPtr reinit(CharArray newTermBuffer, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(String)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType}
    /// @return this Token instance
    TokenPtr reinit(const String& newTerm, int32_t newStartOffset, int32_t newEndOffset, const String& newType);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(String, int, int)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType}
    /// @return this Token instance
    TokenPtr reinit(const String& newTerm, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset, const String& newType);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(String)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType} on Token::DEFAULT_TYPE
    /// @return this Token instance
    TokenPtr reinit(const String& newTerm, int32_t newStartOffset, int32_t newEndOffset);

    /// Shorthand for calling {@link #clear}, {@link #setTermBuffer(String, int, int)}, {@link #setStartOffset},
    /// {@link #setEndOffset}, {@link #setType} on Token::DEFAULT_TYPE
    /// @return this Token instance
    TokenPtr reinit(const String& newTerm, int32_t newTermOffset, int32_t newTermLength, int32_t newStartOffset, int32_t newEndOffset);

    /// Copy the prototype token's fields into this one. Note: Payloads are shared.
    void reinit(const TokenPtr& prototype);

    /// Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
    void reinit(const TokenPtr& prototype, const String& newTerm);

    /// Copy the prototype token's fields into this one, with a different term. Note: Payloads are shared.
    void reinit(const TokenPtr& prototype, CharArray newTermBuffer, int32_t offset, int32_t length);

    virtual void copyTo(const AttributePtr& target);

    /// Convenience factory that returns Token as implementation for the basic attributes
    static AttributeFactoryPtr TOKEN_ATTRIBUTE_FACTORY();

protected:
    /// Construct Token and initialize values
    void ConstructToken(int32_t start, int32_t end, const String& type, int32_t flags);

    /// Allocates a buffer char[] of at least newSize, without preserving the existing content. It's always used in
    /// places that set the content.
    /// @param newSize minimum size of the buffer
    void growTermBuffer(int32_t newSize);

    /// Lazily allocates the term buffer (at least MIN_BUFFER_SIZE) if it has not been created yet.
    void initTermBuffer();

    /// Like clear() but doesn't clear termBuffer/text
    void clearNoTermBuffer();
};

/// Creates a TokenAttributeFactory returning {@link Token} as instance for the basic attributes and for all other
/// attributes calls the given delegate factory.
class LPPAPI TokenAttributeFactory : public AttributeFactory {
public:
    TokenAttributeFactory(const AttributeFactoryPtr& delegate);
    virtual ~TokenAttributeFactory();

    LUCENE_CLASS(TokenAttributeFactory);

protected:
    /// Factory consulted for attributes not implemented by Token.
    AttributeFactoryPtr delegate;

public:
    /// Returns a Token for the basic attributes; otherwise defers to the delegate factory.
    virtual AttributePtr createAttributeInstance(const String& className);
    virtual bool equals(const LuceneObjectPtr& other);
    virtual int32_t hashCode();
};

}

#endif