1
2 /**
3 * Copyright (C) 2018-present MongoDB, Inc.
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the Server Side Public License, version 1,
7 * as published by MongoDB, Inc.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * Server Side Public License for more details.
13 *
14 * You should have received a copy of the Server Side Public License
15 * along with this program. If not, see
16 * <http://www.mongodb.com/licensing/server-side-public-license>.
17 *
18 * As a special exception, the copyright holders give permission to link the
19 * code of portions of this program with the OpenSSL library under certain
20 * conditions as described in each individual source file and distribute
21 * linked combinations including the program with the OpenSSL library. You
22 * must comply with the Server Side Public License in all respects for
23 * all of the code used other than as permitted herein. If you modify file(s)
24 * with this exception, you may extend this exception to your version of the
25 * file(s), but you are not obligated to do so. If you do not wish to do so,
26 * delete this exception statement from your version. If you delete this
27 * exception statement from all source files in the program, then also delete
28 * it in the license file.
29 */
30
31 #include "mongo/db/fts/unicode/string.h"
32
33 #include <algorithm>
34 #include <boost/algorithm/searching/boyer_moore.hpp>
35 #include <boost/version.hpp>
36
37 #include "mongo/db/fts/unicode/byte_vector.h"
38 #include "mongo/platform/bits.h"
39 #include "mongo/shell/linenoise_utf8.h"
40 #include "mongo/util/assert_util.h"
41
42 namespace mongo {
43 namespace unicode {
44
45 namespace {
46 template <typename OutputIterator>
appendUtf8Codepoint(char32_t codepoint,OutputIterator * outputIt)47 inline void appendUtf8Codepoint(char32_t codepoint, OutputIterator* outputIt) {
48 if (codepoint <= 0x7f /* max 1-byte codepoint */) {
49 *(*outputIt)++ = (codepoint);
50 } else if (codepoint <= 0x7ff /* max 2-byte codepoint*/) {
51 *(*outputIt)++ = ((codepoint >> (6 * 1)) | 0xc0); // 2 leading 1s.
52 *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
53 } else if (codepoint <= 0xffff /* max 3-byte codepoint*/) {
54 *(*outputIt)++ = ((codepoint >> (6 * 2)) | 0xe0); // 3 leading 1s.
55 *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
56 *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
57 } else {
58 uassert(ErrorCodes::BadValue, "text contains invalid UTF-8", codepoint <= 0x10FFFF);
59 *(*outputIt)++ = ((codepoint >> (6 * 3)) | 0xf0); // 4 leading 1s.
60 *(*outputIt)++ = (((codepoint >> (6 * 2)) & 0x3f) | 0x80);
61 *(*outputIt)++ = (((codepoint >> (6 * 1)) & 0x3f) | 0x80);
62 *(*outputIt)++ = (((codepoint >> (6 * 0)) & 0x3f) | 0x80);
63 }
64 }
65 }
66
67 using linenoise_utf8::copyString32to8;
68 using linenoise_utf8::copyString8to32;
69
70 using std::u32string;
71
String(const StringData utf8_src)72 String::String(const StringData utf8_src) {
73 // Convert UTF-8 input to UTF-32 data.
74 setData(utf8_src);
75 }
76
resetData(const StringData utf8_src)77 void String::resetData(const StringData utf8_src) {
78 // Convert UTF-8 input to UTF-32 data.
79 setData(utf8_src);
80 }
81
setData(const StringData utf8_src)82 void String::setData(const StringData utf8_src) {
83 // _data is the target, resize it so that it's guaranteed to fit all of the input characters,
84 // plus a null character if there isn't one.
85 _data.resize(utf8_src.size() + 1);
86
87 int result = 0;
88 size_t resultSize = 0;
89
90 // Although utf8_src.rawData() is not guaranteed to be null-terminated, copyString8to32 won't
91 // access bad memory because it is limited by the size of its output buffer, which is set to the
92 // size of utf8_src.
93 copyString8to32(&_data[0],
94 reinterpret_cast<const unsigned char*>(&utf8_src.rawData()[0]),
95 _data.size(),
96 resultSize,
97 result);
98
99 uassert(28755, "text contains invalid UTF-8", result == 0);
100
101 // Resize _data so it is only as big as what it contains.
102 _data.resize(resultSize);
103 _needsOutputConversion = true;
104 }
105
toString()106 std::string String::toString() {
107 // _outputBuf is the target, resize it so that it's guaranteed to fit all of the input
108 // characters, plus a null character if there isn't one.
109 if (_needsOutputConversion) {
110 _outputBuf.resize(_data.size() * 4 + 1);
111 size_t resultSize = copyString32to8(
112 reinterpret_cast<unsigned char*>(&_outputBuf[0]), &_data[0], _outputBuf.size());
113
114 // Resize output so it is only as large as what it contains.
115 _outputBuf.resize(resultSize);
116 _needsOutputConversion = false;
117 }
118 return _outputBuf;
119 }
120
121 template <typename Func>
substrToBufWithTransform(StackBufBuilder * buffer,size_t pos,size_t len,Func func) const122 StringData String::substrToBufWithTransform(StackBufBuilder* buffer,
123 size_t pos,
124 size_t len,
125 Func func) const {
126 pos = std::min(pos, _data.size());
127 len = std::min(len, _data.size() - pos);
128
129 buffer->reset();
130 auto outputIt = buffer->skip(len * 4); // Reserve room for worst-case expansion.
131 auto inputIt = _data.begin() + pos;
132 for (size_t i = 0; i < len; i++) {
133 appendUtf8Codepoint(func(*inputIt++), &outputIt);
134 }
135 buffer->setlen(outputIt - buffer->buf());
136 return {buffer->buf(), size_t(buffer->len())};
137 }
138
substrToBuf(StackBufBuilder * buffer,size_t pos,size_t len) const139 StringData String::substrToBuf(StackBufBuilder* buffer, size_t pos, size_t len) const {
140 const auto identityFunc = [](char32_t ch) { return ch; };
141 return substrToBufWithTransform(buffer, pos, len, identityFunc);
142 }
143
toLowerToBuf(StackBufBuilder * buffer,CaseFoldMode mode,size_t pos,size_t len) const144 StringData String::toLowerToBuf(StackBufBuilder* buffer,
145 CaseFoldMode mode,
146 size_t pos,
147 size_t len) const {
148 const auto toLower = [mode](char32_t ch) { return codepointToLower(ch, mode); };
149 return substrToBufWithTransform(buffer, pos, len, toLower);
150 }
151
152
caseFoldAndStripDiacritics(StackBufBuilder * buffer,StringData utf8,SubstrMatchOptions options,CaseFoldMode mode)153 StringData String::caseFoldAndStripDiacritics(StackBufBuilder* buffer,
154 StringData utf8,
155 SubstrMatchOptions options,
156 CaseFoldMode mode) {
157 // This fires if your input buffer the same as your output buffer.
158 invariant(buffer->buf() != utf8.rawData());
159
160 if ((options & kCaseSensitive) && (options & kDiacriticSensitive)) {
161 // No transformation needed. Just return the input data unmodified.
162 return utf8;
163 }
164
165 // Allocate space for up to 2x growth which is the worst possible case for stripping diacritics
166 // and casefolding. Proof: the only case where 1 byte goes to >1 is 'I' in Turkish going to 2
167 // bytes. The biggest codepoint is 4 bytes which is also 2x 2 bytes. This holds as long as we
168 // don't map a single code point to more than one.
169 buffer->reset();
170 auto outputIt = buffer->skip(utf8.size() * 2);
171
172 for (auto inputIt = utf8.begin(), endIt = utf8.end(); inputIt != endIt;) {
173 #ifdef MONGO_HAVE_FAST_BYTE_VECTOR
174 if (size_t(endIt - inputIt) >= ByteVector::size) {
175 // Try the fast path for 16 contiguous bytes of ASCII.
176 auto word = ByteVector::load(&*inputIt);
177
178 // Count the bytes of ASCII.
179 uint32_t usableBytes = ByteVector::countInitialZeros(word.maskHigh());
180 if (usableBytes) {
181 if (!(options & kCaseSensitive)) {
182 if (mode == CaseFoldMode::kTurkish) {
183 ByteVector::Mask iMask = word.compareEQ('I').maskAny();
184 if (iMask) {
185 usableBytes =
186 std::min(usableBytes, ByteVector::countInitialZeros(iMask));
187 }
188 }
189 // 0xFF for each byte in word that is uppercase, 0x00 for all others.
190 ByteVector uppercaseMask = word.compareGT('A' - 1) & word.compareLT('Z' + 1);
191 word |= (uppercaseMask & ByteVector(0x20)); // Set the ascii lowercase bit.
192 }
193
194 if (!(options & kDiacriticSensitive)) {
195 ByteVector::Mask diacriticMask =
196 word.compareEQ('^').maskAny() | word.compareEQ('`').maskAny();
197 if (diacriticMask) {
198 usableBytes =
199 std::min(usableBytes, ByteVector::countInitialZeros(diacriticMask));
200 }
201 }
202
203 word.store(&*outputIt);
204 outputIt += usableBytes;
205 inputIt += usableBytes;
206 if (usableBytes == ByteVector::size)
207 continue;
208 }
209 // If we get here, inputIt is positioned on a byte that we know needs special handling.
210 // Either it isn't ASCII or it is a diacritic that needs to be stripped.
211 }
212 #endif
213 const uint8_t firstByte = *inputIt++;
214 char32_t codepoint = 0;
215 if (firstByte <= 0x7f) {
216 // ASCII special case. Can use faster operations.
217 if ((!(options & kCaseSensitive)) && (firstByte >= 'A' && firstByte <= 'Z')) {
218 codepoint = (mode == CaseFoldMode::kTurkish && firstByte == 'I')
219 ? 0x131 // In Turkish, I -> ı (i with no dot).
220 : (firstByte | 0x20); // Set the ascii lowercase bit on the character.
221 } else {
222 // ASCII has two pure diacritics that should be skipped and no characters that
223 // change when removing diacritics.
224 if ((options & kDiacriticSensitive) || !(firstByte == '^' || firstByte == '`')) {
225 *outputIt++ = (firstByte);
226 }
227 continue;
228 }
229 } else {
230 // firstByte indicates that it is not an ASCII char.
231 int leadingOnes = countLeadingZeros64(~(uint64_t(firstByte) << (64 - 8)));
232
233 // Only checking enough to ensure that this code doesn't crash in the face of malformed
234 // utf-8. We make no guarantees about what results will be returned in this case.
235 uassert(ErrorCodes::BadValue,
236 "text contains invalid UTF-8",
237 leadingOnes > 1 && leadingOnes <= 4 && inputIt + leadingOnes - 1 <= endIt);
238
239 codepoint = firstByte & (0xff >> leadingOnes); // mask off the size indicator.
240 for (int subByteIx = 1; subByteIx < leadingOnes; subByteIx++) {
241 const uint8_t subByte = *inputIt++;
242 codepoint <<= 6;
243 codepoint |= subByte & 0x3f; // mask off continuation bits.
244 }
245
246 if (!(options & kCaseSensitive)) {
247 codepoint = codepointToLower(codepoint, mode);
248 }
249
250 if (!(options & kDiacriticSensitive)) {
251 codepoint = codepointRemoveDiacritics(codepoint);
252 if (!codepoint)
253 continue; // codepoint is a pure diacritic.
254 }
255 }
256
257 appendUtf8Codepoint(codepoint, &outputIt);
258 }
259
260 buffer->setlen(outputIt - buffer->buf());
261 return {buffer->buf(), size_t(buffer->len())};
262 }
263
substrMatch(const std::string & str,const std::string & find,SubstrMatchOptions options,CaseFoldMode cfMode)264 bool String::substrMatch(const std::string& str,
265 const std::string& find,
266 SubstrMatchOptions options,
267 CaseFoldMode cfMode) {
268 if (cfMode == CaseFoldMode::kTurkish) {
269 // Turkish comparisons are always case insensitive due to their handling of I/i.
270 options &= ~kCaseSensitive;
271 }
272
273 StackBufBuilder haystackBuf;
274 StackBufBuilder needleBuf;
275 auto haystack = caseFoldAndStripDiacritics(&haystackBuf, str, options, cfMode);
276 auto needle = caseFoldAndStripDiacritics(&needleBuf, find, options, cfMode);
277
278 // Case sensitive and diacritic sensitive.
279 #if BOOST_VERSION < 106200
280 return boost::algorithm::boyer_moore_search(
281 haystack.begin(), haystack.end(), needle.begin(), needle.end()) != haystack.end();
282 #else
283 return boost::algorithm::boyer_moore_search(
284 haystack.begin(), haystack.end(), needle.begin(), needle.end()) !=
285 std::make_pair(haystack.end(), haystack.end());
286 #endif
287 }
288
289 } // namespace unicode
290 } // namespace mongo
291