1 
2 /**
3  *    Copyright (C) 2018-present MongoDB, Inc.
4  *
5  *    This program is free software: you can redistribute it and/or modify
6  *    it under the terms of the Server Side Public License, version 1,
7  *    as published by MongoDB, Inc.
8  *
9  *    This program is distributed in the hope that it will be useful,
10  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *    Server Side Public License for more details.
13  *
14  *    You should have received a copy of the Server Side Public License
15  *    along with this program. If not, see
16  *    <http://www.mongodb.com/licensing/server-side-public-license>.
17  *
18  *    As a special exception, the copyright holders give permission to link the
19  *    code of portions of this program with the OpenSSL library under certain
20  *    conditions as described in each individual source file and distribute
21  *    linked combinations including the program with the OpenSSL library. You
22  *    must comply with the Server Side Public License in all respects for
23  *    all of the code used other than as permitted herein. If you modify file(s)
24  *    with this exception, you may extend this exception to your version of the
25  *    file(s), but you are not obligated to do so. If you do not wish to do so,
26  *    delete this exception statement from your version. If you delete this
27  *    exception statement from all source files in the program, then also delete
28  *    it in the license file.
29  */
30 
31 
32 #pragma once
33 
34 #include <cstdint>
35 
36 #include "mongo/base/disallow_copying.h"
37 #include "mongo/base/string_data.h"
38 
39 namespace mongo {
40 namespace fts {
41 
42 class FTSLanguage;
43 class StopWords;
44 
45 /**
46  * FTSTokenizer
47  * A iterator of "documents" where a document contains space delimited words. For each word returns
48  * a stem or lemma version of a word optimized for full text indexing. Supports various options to
49  * control how tokens are generated.
50  */
51 class FTSTokenizer {
52 public:
53     virtual ~FTSTokenizer() = default;
54 
55     /**
56      * Options for generating tokens.
57      */
58     using Options = uint8_t;
59 
60     /**
61      * Default means lower cased, diacritics removed, and stop words are not filtered.
62      */
63     static const Options kNone = 0;
64 
65     /**
66      * Do not lower case terms.
67      */
68     static const Options kGenerateCaseSensitiveTokens = 1 << 0;
69 
70     /**
71      * Filter out stop words from return tokens.
72      */
73     static const Options kFilterStopWords = 1 << 1;
74 
75     /**
76      * Do not remove diacritics from terms.
77      */
78     static const Options kGenerateDiacriticSensitiveTokens = 1 << 2;
79 
80     /**
81      * Process a new document, and discards any previous results.
82      * May be called multiple times on an instance of an iterator.
83      */
84     virtual void reset(StringData document, Options options) = 0;
85 
86     /**
87      * Moves to the next token in the iterator.
88      * Returns false when the iterator reaches end of the document.
89      */
90     virtual bool moveNext() = 0;
91 
92     /**
93      * Returns stemmed form, normalized, and lowercased depending on the parameter
94      * to the reset method.
95      * Returned StringData is valid until next call to moveNext().
96      */
97     virtual StringData get() const = 0;
98 };
99 
100 }  // namespace fts
101 }  // namespace mongo
102