1 
2 /**
3  *    Copyright (C) 2018-present MongoDB, Inc.
4  *
5  *    This program is free software: you can redistribute it and/or modify
6  *    it under the terms of the Server Side Public License, version 1,
7  *    as published by MongoDB, Inc.
8  *
9  *    This program is distributed in the hope that it will be useful,
10  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *    Server Side Public License for more details.
13  *
14  *    You should have received a copy of the Server Side Public License
15  *    along with this program. If not, see
16  *    <http://www.mongodb.com/licensing/server-side-public-license>.
17  *
18  *    As a special exception, the copyright holders give permission to link the
19  *    code of portions of this program with the OpenSSL library under certain
20  *    conditions as described in each individual source file and distribute
21  *    linked combinations including the program with the OpenSSL library. You
22  *    must comply with the Server Side Public License in all respects for
23  *    all of the code used other than as permitted herein. If you modify file(s)
24  *    with this exception, you may extend this exception to your version of the
25  *    file(s), but you are not obligated to do so. If you do not wish to do so,
26  *    delete this exception statement from your version. If you delete this
27  *    exception statement from all source files in the program, then also delete
28  *    it in the license file.
29  */
30 
31 #include "mongo/db/fts/unicode/string.h"
32 
33 #include <algorithm>
34 #include <boost/algorithm/searching/boyer_moore.hpp>
35 #include <boost/version.hpp>
36 
37 #include "mongo/db/fts/unicode/byte_vector.h"
38 #include "mongo/platform/bits.h"
39 #include "mongo/shell/linenoise_utf8.h"
40 #include "mongo/util/assert_util.h"
41 
42 namespace mongo {
43 namespace unicode {
44 
45 namespace {
46 template <typename OutputIterator>
appendUtf8Codepoint(char32_t codepoint,OutputIterator * outputIt)47 inline void appendUtf8Codepoint(char32_t codepoint, OutputIterator* outputIt) {
48     if (codepoint <= 0x7f /* max 1-byte codepoint */) {
49         *(*outputIt)++ = (codepoint);
50     } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) {
51         *(*outputIt)++ = ((codepoint >> (6 * 1)) | 0xc0);  // 2 leading 1s.
52         *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
53     } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) {
54         *(*outputIt)++ = ((codepoint >> (6 * 2)) | 0xe0);  // 3 leading 1s.
55         *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
56         *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
57     } else {
58         uassert(ErrorCodes::BadValue, "text contains invalid UTF-8", codepoint <= 0x10FFFF);
59         *(*outputIt)++ = ((codepoint >> (6 * 3)) | 0xf0);  // 4 leading 1s.
60         *(*outputIt)++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80);
61         *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
62         *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
63     }
64 }
65 }
66 
67 using linenoise_utf8::copyString32to8;
68 using linenoise_utf8::copyString8to32;
69 
70 using std::u32string;
71 
String(const StringData utf8_src)72 String::String(const StringData utf8_src) {
73     // Convert UTF-8 input to UTF-32 data.
74     setData(utf8_src);
75 }
76 
resetData(const StringData utf8_src)77 void String::resetData(const StringData utf8_src) {
78     // Convert UTF-8 input to UTF-32 data.
79     setData(utf8_src);
80 }
81 
setData(const StringData utf8_src)82 void String::setData(const StringData utf8_src) {
83     // _data is the target, resize it so that it's guaranteed to fit all of the input characters,
84     // plus a null character if there isn't one.
85     _data.resize(utf8_src.size() + 1);
86 
87     int result = 0;
88     size_t resultSize = 0;
89 
90     // Although utf8_src.rawData() is not guaranteed to be null-terminated, copyString8to32 won't
91     // access bad memory because it is limited by the size of its output buffer, which is set to the
92     // size of utf8_src.
93     copyString8to32(&_data[0],
94                     reinterpret_cast<const unsigned char*>(&utf8_src.rawData()[0]),
95                     _data.size(),
96                     resultSize,
97                     result);
98 
99     uassert(28755, "text contains invalid UTF-8", result == 0);
100 
101     // Resize _data so it is only as big as what it contains.
102     _data.resize(resultSize);
103     _needsOutputConversion = true;
104 }
105 
toString()106 std::string String::toString() {
107     // _outputBuf is the target, resize it so that it's guaranteed to fit all of the input
108     // characters, plus a null character if there isn't one.
109     if (_needsOutputConversion) {
110         _outputBuf.resize(_data.size() * 4 + 1);
111         size_t resultSize = copyString32to8(
112             reinterpret_cast<unsigned char*>(&_outputBuf[0]), &_data[0], _outputBuf.size());
113 
114         // Resize output so it is only as large as what it contains.
115         _outputBuf.resize(resultSize);
116         _needsOutputConversion = false;
117     }
118     return _outputBuf;
119 }
120 
121 template <typename Func>
substrToBufWithTransform(StackBufBuilder * buffer,size_t pos,size_t len,Func func) const122 StringData String::substrToBufWithTransform(StackBufBuilder* buffer,
123                                             size_t pos,
124                                             size_t len,
125                                             Func func) const {
126     pos = std::min(pos, _data.size());
127     len = std::min(len, _data.size() - pos);
128 
129     buffer->reset();
130     auto outputIt = buffer->skip(len * 4);  // Reserve room for worst-case expansion.
131     auto inputIt = _data.begin() + pos;
132     for (size_t i = 0; i < len; i++) {
133         appendUtf8Codepoint(func(*inputIt++), &outputIt);
134     }
135     buffer->setlen(outputIt - buffer->buf());
136     return {buffer->buf(), size_t(buffer->len())};
137 }
138 
substrToBuf(StackBufBuilder * buffer,size_t pos,size_t len) const139 StringData String::substrToBuf(StackBufBuilder* buffer, size_t pos, size_t len) const {
140     const auto identityFunc = [](char32_t ch) { return ch; };
141     return substrToBufWithTransform(buffer, pos, len, identityFunc);
142 }
143 
toLowerToBuf(StackBufBuilder * buffer,CaseFoldMode mode,size_t pos,size_t len) const144 StringData String::toLowerToBuf(StackBufBuilder* buffer,
145                                 CaseFoldMode mode,
146                                 size_t pos,
147                                 size_t len) const {
148     const auto toLower = [mode](char32_t ch) { return codepointToLower(ch, mode); };
149     return substrToBufWithTransform(buffer, pos, len, toLower);
150 }
151 
152 
caseFoldAndStripDiacritics(StackBufBuilder * buffer,StringData utf8,SubstrMatchOptions options,CaseFoldMode mode)153 StringData String::caseFoldAndStripDiacritics(StackBufBuilder* buffer,
154                                               StringData utf8,
155                                               SubstrMatchOptions options,
156                                               CaseFoldMode mode) {
157     // This fires if your input buffer the same as your output buffer.
158     invariant(buffer->buf() != utf8.rawData());
159 
160     if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
161         // No transformation needed. Just return the input data unmodified.
162         return utf8;
163     }
164 
165     // Allocate space for up to 2x growth which is the worst possible case for stripping diacritics
166     // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2
167     // bytes. The biggest codepoint is 4 bytes which is also 2x 2 bytes. This holds as long as we
168     // don't map a single code point to more than one.
169     buffer->reset();
170     auto outputIt = buffer->skip(utf8.size() * 2);
171 
172     for (auto inputIt = utf8.begin(), endIt = utf8.end(); inputIt != endIt;) {
173 #ifdef MONGO_HAVE_FAST_BYTE_VECTOR
174         if (size_t(endIt - inputIt) >= ByteVector::size) {
175             // Try the fast path for 16 contiguous bytes of ASCII.
176             auto word = ByteVector::load(&*inputIt);
177 
178             // Count the bytes of ASCII.
179             uint32_t usableBytes = ByteVector::countInitialZeros(word.maskHigh());
180             if (usableBytes) {
181                 if (!(options & kCaseSensitive)) {
182                     if (mode == CaseFoldMode::kTurkish) {
183                         ByteVector::Mask iMask = word.compareEQ('I').maskAny();
184                         if (iMask) {
185                             usableBytes =
186                                 std::min(usableBytes, ByteVector::countInitialZeros(iMask));
187                         }
188                     }
189                     // 0xFF for each byte in word that is uppercase, 0x00 for all others.
190                     ByteVector uppercaseMask = word.compareGT('A' - 1) & word.compareLT('Z' + 1);
191                     word |= (uppercaseMask & ByteVector(0x20));  // Set the ascii lowercase bit.
192                 }
193 
194                 if (!(options & kDiacriticSensitive)) {
195                     ByteVector::Mask diacriticMask =
196                         word.compareEQ('^').maskAny() | word.compareEQ('`').maskAny();
197                     if (diacriticMask) {
198                         usableBytes =
199                             std::min(usableBytes, ByteVector::countInitialZeros(diacriticMask));
200                     }
201                 }
202 
203                 word.store(&*outputIt);
204                 outputIt += usableBytes;
205                 inputIt += usableBytes;
206                 if (usableBytes == ByteVector::size)
207                     continue;
208             }
209             // If we get here, inputIt is positioned on a byte that we know needs special handling.
210             // Either it isn't ASCII or it is a diacritic that needs to be stripped.
211         }
212 #endif
213         const uint8_t firstByte = *inputIt++;
214         char32_t codepoint = 0;
215         if (firstByte <= 0x7f) {
216             // ASCII special case. Can use faster operations.
217             if ((!(options & kCaseSensitive)) && (firstByte >= 'A' && firstByte <= 'Z')) {
218                 codepoint = (mode == CaseFoldMode::kTurkish && firstByte == 'I')
219                     ? 0x131                // In Turkish, I -> ı (i with no dot).
220                     : (firstByte | 0x20);  // Set the ascii lowercase bit on the character.
221             } else {
222                 // ASCII has two pure diacritics that should be skipped and no characters that
223                 // change when removing diacritics.
224                 if ((options & kDiacriticSensitive) || !(firstByte == '^' || firstByte == '`')) {
225                     *outputIt++ = (firstByte);
226                 }
227                 continue;
228             }
229         } else {
230             // firstByte indicates that it is not an ASCII char.
231             int leadingOnes = countLeadingZeros64(~(uint64_t(firstByte) << (64 - 8)));
232 
233             // Only checking enough to ensure that this code doesn't crash in the face of malformed
234             // utf-8. We make no guarantees about what results will be returned in this case.
235             uassert(ErrorCodes::BadValue,
236                     "text contains invalid UTF-8",
237                     leadingOnes > 1 && leadingOnes <= 4 && inputIt + leadingOnes - 1 <= endIt);
238 
239             codepoint = firstByte & (0xff >> leadingOnes);  // mask off the size indicator.
240             for (int subByteIx = 1; subByteIx < leadingOnes; subByteIx++) {
241                 const uint8_t subByte = *inputIt++;
242                 codepoint <<= 6;
243                 codepoint |= subByte & 0x3f;  // mask off continuation bits.
244             }
245 
246             if (!(options & kCaseSensitive)) {
247                 codepoint = codepointToLower(codepoint, mode);
248             }
249 
250             if (!(options & kDiacriticSensitive)) {
251                 codepoint = codepointRemoveDiacritics(codepoint);
252                 if (!codepoint)
253                     continue;  // codepoint is a pure diacritic.
254             }
255         }
256 
257         appendUtf8Codepoint(codepoint, &outputIt);
258     }
259 
260     buffer->setlen(outputIt - buffer->buf());
261     return {buffer->buf(), size_t(buffer->len())};
262 }
263 
substrMatch(const std::string & str,const std::string & find,SubstrMatchOptions options,CaseFoldMode cfMode)264 bool String::substrMatch(const std::string& str,
265                          const std::string& find,
266                          SubstrMatchOptions options,
267                          CaseFoldMode cfMode) {
268     if (cfMode == CaseFoldMode::kTurkish) {
269         // Turkish comparisons are always case insensitive due to their handling of I/i.
270         options &= ~kCaseSensitive;
271     }
272 
273     StackBufBuilder haystackBuf;
274     StackBufBuilder needleBuf;
275     auto haystack = caseFoldAndStripDiacritics(&haystackBuf, str, options, cfMode);
276     auto needle = caseFoldAndStripDiacritics(&needleBuf, find, options, cfMode);
277 
278 // Case sensitive and diacritic sensitive.
279 #if BOOST_VERSION < 106200
280     return boost::algorithm::boyer_moore_search(
281                haystack.begin(), haystack.end(), needle.begin(), needle.end()) != haystack.end();
282 #else
283     return boost::algorithm::boyer_moore_search(
284                haystack.begin(), haystack.end(), needle.begin(), needle.end()) !=
285         std::make_pair(haystack.end(), haystack.end());
286 #endif
287 }
288 
289 }  // namespace unicode
290 }  // namespace mongo
291