1 2 /** 3 * Copyright (C) 2018-present MongoDB, Inc. 4 * 5 * This program is free software: you can redistribute it and/or modify 6 * it under the terms of the Server Side Public License, version 1, 7 * as published by MongoDB, Inc. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * Server Side Public License for more details. 13 * 14 * You should have received a copy of the Server Side Public License 15 * along with this program. If not, see 16 * <http://www.mongodb.com/licensing/server-side-public-license>. 17 * 18 * As a special exception, the copyright holders give permission to link the 19 * code of portions of this program with the OpenSSL library under certain 20 * conditions as described in each individual source file and distribute 21 * linked combinations including the program with the OpenSSL library. You 22 * must comply with the Server Side Public License in all respects for 23 * all of the code used other than as permitted herein. If you modify file(s) 24 * with this exception, you may extend this exception to your version of the 25 * file(s), but you are not obligated to do so. If you do not wish to do so, 26 * delete this exception statement from your version. If you delete this 27 * exception statement from all source files in the program, then also delete 28 * it in the license file. 29 */ 30 31 32 #pragma once 33 34 #include <cstdint> 35 36 #include "mongo/base/disallow_copying.h" 37 #include "mongo/base/string_data.h" 38 39 namespace mongo { 40 namespace fts { 41 42 class FTSLanguage; 43 class StopWords; 44 45 /** 46 * FTSTokenizer 47 * A iterator of "documents" where a document contains space delimited words. For each word returns 48 * a stem or lemma version of a word optimized for full text indexing. Supports various options to 49 * control how tokens are generated. 50 */ 51 class FTSTokenizer { 52 public: 53 virtual ~FTSTokenizer() = default; 54 55 /** 56 * Options for generating tokens. 57 */ 58 using Options = uint8_t; 59 60 /** 61 * Default means lower cased, diacritics removed, and stop words are not filtered. 62 */ 63 static const Options kNone = 0; 64 65 /** 66 * Do not lower case terms. 67 */ 68 static const Options kGenerateCaseSensitiveTokens = 1 << 0; 69 70 /** 71 * Filter out stop words from return tokens. 72 */ 73 static const Options kFilterStopWords = 1 << 1; 74 75 /** 76 * Do not remove diacritics from terms. 77 */ 78 static const Options kGenerateDiacriticSensitiveTokens = 1 << 2; 79 80 /** 81 * Process a new document, and discards any previous results. 82 * May be called multiple times on an instance of an iterator. 83 */ 84 virtual void reset(StringData document, Options options) = 0; 85 86 /** 87 * Moves to the next token in the iterator. 88 * Returns false when the iterator reaches end of the document. 89 */ 90 virtual bool moveNext() = 0; 91 92 /** 93 * Returns stemmed form, normalized, and lowercased depending on the parameter 94 * to the reset method. 95 * Returned StringData is valid until next call to moveNext(). 96 */ 97 virtual StringData get() const = 0; 98 }; 99 100 } // namespace fts 101 } // namespace mongo 102