1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 // IWYU pragma: private, include "nsString.h"
7
8 #ifndef nsReadableUtils_h___
9 #define nsReadableUtils_h___
10
11 /**
12 * I guess all the routines in this file are all mis-named.
13 * According to our conventions, they should be |NS_xxx|.
14 */
15
16 #include "mozilla/Assertions.h"
17 #include "nsAString.h"
18
19 #include "nsTArrayForwardDeclare.h"
20
21 // Can't include mozilla/Encoding.h here
22 extern "C" {
23 size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
24 size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
25 }
26
Distance(const nsReadingIterator<char16_t> & aStart,const nsReadingIterator<char16_t> & aEnd)27 inline size_t Distance(const nsReadingIterator<char16_t>& aStart,
28 const nsReadingIterator<char16_t>& aEnd) {
29 MOZ_ASSERT(aStart.get() <= aEnd.get());
30 return static_cast<size_t>(aEnd.get() - aStart.get());
31 }
Distance(const nsReadingIterator<char> & aStart,const nsReadingIterator<char> & aEnd)32 inline size_t Distance(const nsReadingIterator<char>& aStart,
33 const nsReadingIterator<char>& aEnd) {
34 MOZ_ASSERT(aStart.get() <= aEnd.get());
35 return static_cast<size_t>(aEnd.get() - aStart.get());
36 }
37
38 void LossyCopyUTF16toASCII(const nsAString& aSource, nsACString& aDest);
39 void CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest);
40 MOZ_MUST_USE bool CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest,
41 const mozilla::fallible_t&);
42
43 void LossyCopyUTF16toASCII(const char16ptr_t aSource, nsACString& aDest);
44 void CopyASCIItoUTF16(const char* aSource, nsAString& aDest);
45
46 void CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest);
47 MOZ_MUST_USE bool CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
48 const mozilla::fallible_t&);
49 void CopyUTF8toUTF16(const nsACString& aSource, nsAString& aDest);
50
51 void CopyUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest);
52 void CopyUTF8toUTF16(const char* aSource, nsAString& aDest);
53
54 void LossyAppendUTF16toASCII(const nsAString& aSource, nsACString& aDest);
55 void AppendASCIItoUTF16(const nsACString& aSource, nsAString& aDest);
56 MOZ_MUST_USE bool AppendASCIItoUTF16(const nsACString& aSource,
57 nsAString& aDest,
58 const mozilla::fallible_t&);
59
60 void LossyAppendUTF16toASCII(const char16ptr_t aSource, nsACString& aDest);
61 MOZ_MUST_USE bool AppendASCIItoUTF16(const char* aSource, nsAString& aDest,
62 const mozilla::fallible_t&);
63 void AppendASCIItoUTF16(const char* aSource, nsAString& aDest);
64
65 void AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest);
66 MOZ_MUST_USE bool AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
67 const mozilla::fallible_t&);
68 void AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest);
69 MOZ_MUST_USE bool AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest,
70 const mozilla::fallible_t&);
71
72 void AppendUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest);
73 void AppendUTF8toUTF16(const char* aSource, nsAString& aDest);
74
75 /**
76 * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
77 *
78 * Allocates and returns a new |char| buffer which you must free with |free|.
79 * Performs a lossy encoding conversion by chopping 16-bit wide characters down
80 * to 8-bits wide while copying |aSource| to your new buffer. This conversion is
81 * not well defined; but it reproduces legacy string behavior. The new buffer is
82 * zero-terminated, but that may not help you if |aSource| contains embedded
83 * nulls.
84 *
85 * @param aSource a 16-bit wide string
86 * @return a new |char| buffer you must free with |free|.
87 */
88 char* ToNewCString(const nsAString& aSource);
89
90 /**
91 * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
92 *
93 * Allocates and returns a new |char| buffer which you must free with |free|.
94 * The new buffer is zero-terminated, but that may not help you if |aSource|
95 * contains embedded nulls.
96 *
97 * @param aSource an 8-bit wide string
98 * @return a new |char| buffer you must free with |free|.
99 */
100 char* ToNewCString(const nsACString& aSource);
101
102 /**
103 * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
104 *
105 * Allocates and returns a new |char| buffer which you must free with
106 * |free|.
107 * Performs an encoding conversion from a UTF-16 string to a UTF-8 string
108 * copying |aSource| to your new buffer.
109 * The new buffer is zero-terminated, but that may not help you if |aSource|
110 * contains embedded nulls.
111 *
112 * @param aSource a UTF-16 string (made of char16_t's)
113 * @param aUTF8Count the number of 8-bit units that was returned
114 * @return a new |char| buffer you must free with |free|.
115 */
116
117 char* ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count = nullptr);
118
119 /**
120 * Returns a new |char16_t| buffer containing a zero-terminated copy of
121 * |aSource|.
122 *
123 * Allocates and returns a new |char16_t| buffer which you must free with
124 * |free|.
125 * The new buffer is zero-terminated, but that may not help you if |aSource|
126 * contains embedded nulls.
127 *
128 * @param aSource a UTF-16 string
129 * @return a new |char16_t| buffer you must free with |free|.
130 */
131 char16_t* ToNewUnicode(const nsAString& aSource);
132
133 /**
134 * Returns a new |char16_t| buffer containing a zero-terminated copy of
135 * |aSource|.
136 *
137 * Allocates and returns a new |char16_t| buffer which you must free with
138 * |free|. Performs an encoding conversion by 0-padding 8-bit wide characters up
139 * to 16-bits wide while copying |aSource| to your new buffer. This conversion
140 * is not well defined; but it reproduces legacy string behavior. The new buffer
141 * is zero-terminated, but that may not help you if |aSource| contains embedded
142 * nulls.
143 *
144 * @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
145 * @return a new |char16_t| buffer you must free with |free|.
146 */
147 char16_t* ToNewUnicode(const nsACString& aSource);
148
149 /**
150 * Returns the required length for a char16_t buffer holding
151 * a copy of aSource, using UTF-8 to UTF-16 conversion.
152 * The length does NOT include any space for zero-termination.
153 *
154 * @param aSource an 8-bit wide string, UTF-8 encoded
155 * @return length of UTF-16 encoded string copy, not zero-terminated
156 */
157 uint32_t CalcUTF8ToUnicodeLength(const nsACString& aSource);
158
159 /**
160 * Copies the source string into the specified buffer, converting UTF-8 to
161 * UTF-16 in the process. The conversion is well defined for valid UTF-8
162 * strings.
163 * The copied string will be zero-terminated! Any embedded nulls will be
 * copied nonetheless. It is the caller's responsibility to ensure the buffer
165 * is large enough to hold the string copy plus one char16_t for
166 * zero-termination!
167 *
168 * @see CalcUTF8ToUnicodeLength( const nsACString& )
169 * @see UTF8ToNewUnicode( const nsACString&, uint32_t* )
170 *
171 * @param aSource an 8-bit wide string, UTF-8 encoded
172 * @param aBuffer the buffer holding the converted string copy
173 * @param aUTF16Count receiving optionally the number of 16-bit units that
174 * were copied
175 * @return aBuffer pointer, for convenience
176 */
177 char16_t* UTF8ToUnicodeBuffer(const nsACString& aSource, char16_t* aBuffer,
178 uint32_t* aUTF16Count = nullptr);
179
180 /**
181 * Returns a new |char16_t| buffer containing a zero-terminated copy
182 * of |aSource|.
183 *
 * Allocates and returns a new |char16_t| buffer which you must free with
185 * |free|. Performs an encoding conversion from UTF-8 to UTF-16
186 * while copying |aSource| to your new buffer. This conversion is well defined
187 * for a valid UTF-8 string. The new buffer is zero-terminated, but that
188 * may not help you if |aSource| contains embedded nulls.
189 *
190 * @param aSource an 8-bit wide string, UTF-8 encoded
191 * @param aUTF16Count the number of 16-bit units that was returned
192 * @return a new |char16_t| buffer you must free with |free|.
193 * (UTF-16 encoded)
194 */
195 char16_t* UTF8ToNewUnicode(const nsACString& aSource,
196 uint32_t* aUTF16Count = nullptr);
197
198 /**
 * Copies |aLength| 16-bit code units, starting at offset |aSrcOffset| in
 * |aSource|, to the |char16_t| buffer |aDest|.
201 *
202 * After this operation |aDest| is not null terminated.
203 *
204 * @param aSource a UTF-16 string
205 * @param aSrcOffset start offset in the source string
206 * @param aDest a |char16_t| buffer
207 * @param aLength the number of 16-bit code units to copy
208 * @return pointer to destination buffer - identical to |aDest|
209 */
210 char16_t* CopyUnicodeTo(const nsAString& aSource, uint32_t aSrcOffset,
211 char16_t* aDest, uint32_t aLength);
212
213 /**
214 * Copies 16-bit characters between iterators |aSrcStart| and
215 * |aSrcEnd| to the writable string |aDest|. Similar to the
216 * |nsString::Mid| method.
217 *
218 * After this operation |aDest| is not null terminated.
219 *
220 * @param aSrcStart start source iterator
221 * @param aSrcEnd end source iterator
222 * @param aDest destination for the copy
223 */
224 void CopyUnicodeTo(const nsAString::const_iterator& aSrcStart,
225 const nsAString::const_iterator& aSrcEnd, nsAString& aDest);
226
227 /**
228 * Appends 16-bit characters between iterators |aSrcStart| and
229 * |aSrcEnd| to the writable string |aDest|.
230 *
231 * After this operation |aDest| is not null terminated.
232 *
233 * @param aSrcStart start source iterator
234 * @param aSrcEnd end source iterator
235 * @param aDest destination for the copy
236 */
237 void AppendUnicodeTo(const nsAString::const_iterator& aSrcStart,
238 const nsAString::const_iterator& aSrcEnd,
239 nsAString& aDest);
240
241 /**
242 * Returns |true| if |aString| contains only ASCII characters, that is,
 * characters in the range [0x00, 0x7F].
244 *
245 * @param aString a 16-bit wide string to scan
246 */
247 bool IsASCII(const nsAString& aString);
248
249 /**
250 * Returns |true| if |aString| contains only ASCII characters, that is,
251 * characters in the range (0x00, 0x7F).
252 *
253 * @param aString a 8-bit wide string to scan
254 */
IsASCII(const nsACString & aString)255 inline bool IsASCII(const nsACString& aString) {
256 size_t length = aString.Length();
257 const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
258 // For short strings, calling into Rust is a pessimization, and the SIMD
259 // code won't have a chance to kick in anyway. Additionally, handling the
260 // case of the empty string here makes null-checking ptr unnecessary.
261 // (Passing nullptr to Rust would technically be UB.)
262 if (length < 16) {
263 size_t accu = 0;
264 for (size_t i = 0; i < length; i++) {
265 accu |= ptr[i];
266 }
267 return accu < 0x80;
268 }
269 // This is not quite optimal, because it's not fail-fast when the by-register
270 // check already finds non-ASCII. Also, input to this function is almost
271 // always ASCII, so even the by-register check wouldn't need to be fail-fast
272 // and could be more like the loop above.
273 return length == encoding_ascii_valid_up_to(ptr, length);
274 }
275
276 /**
277 * Returns |true| if |aString| is a valid UTF-8 string.
278 *
279 * Note that this doesn't check whether the string might look like a valid
280 * string in another encoding, too, e.g. ISO-2022-JP.
281 *
282 * @param aString an 8-bit wide string to scan
283 */
IsUTF8(const nsACString & aString)284 inline bool IsUTF8(const nsACString& aString) {
285 size_t length = aString.Length();
286 const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
287 // For short strings, calling into Rust is a pessimization, and the SIMD
288 // code won't have a chance to kick in anyway. Additionally, handling the
289 // case of the empty string here makes null-checking ptr unnecessary.
290 // (Passing nullptr to Rust would technically be UB.)
291 if (length < 16) {
292 for (size_t i = 0; i < length; i++) {
293 if (ptr[i] >= 0x80) {
294 ptr += i;
295 length -= i;
296 goto end;
297 }
298 }
299 return true;
300 }
301 end:
302 return length == encoding_utf8_valid_up_to(ptr, length);
303 }
304
305 bool ParseString(const nsACString& aAstring, char aDelimiter,
306 nsTArray<nsCString>& aArray);
307
308 /**
309 * Converts case in place in the argument string.
310 */
311 void ToUpperCase(nsACString&);
312
313 void ToLowerCase(nsACString&);
314
315 void ToUpperCase(nsACString&);
316
317 void ToLowerCase(nsACString&);
318
319 /**
320 * Converts case from string aSource to aDest.
321 */
322 void ToUpperCase(const nsACString& aSource, nsACString& aDest);
323
324 void ToLowerCase(const nsACString& aSource, nsACString& aDest);
325
326 /**
327 * Finds the leftmost occurrence of |aPattern|, if any in the range
328 * |aSearchStart|..|aSearchEnd|.
329 *
330 * Returns |true| if a match was found, and adjusts |aSearchStart| and
331 * |aSearchEnd| to point to the match. If no match was found, returns |false|
332 * and makes |aSearchStart == aSearchEnd|.
333 *
334 * Currently, this is equivalent to the O(m*n) implementation previously on
335 * |ns[C]String|. If we need something faster, then we can implement that later.
336 */
337
338 bool FindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
339 nsAString::const_iterator&,
340 const nsStringComparator& = nsDefaultStringComparator());
341 bool FindInReadable(const nsACString& aPattern, nsACString::const_iterator&,
342 nsACString::const_iterator&,
343 const nsCStringComparator& = nsDefaultCStringComparator());
344
345 /* sometimes we don't care about where the string was, just that we
346 * found it or not */
347 inline bool FindInReadable(
348 const nsAString& aPattern, const nsAString& aSource,
349 const nsStringComparator& aCompare = nsDefaultStringComparator()) {
350 nsAString::const_iterator start, end;
351 aSource.BeginReading(start);
352 aSource.EndReading(end);
353 return FindInReadable(aPattern, start, end, aCompare);
354 }
355
356 inline bool FindInReadable(
357 const nsACString& aPattern, const nsACString& aSource,
358 const nsCStringComparator& aCompare = nsDefaultCStringComparator()) {
359 nsACString::const_iterator start, end;
360 aSource.BeginReading(start);
361 aSource.EndReading(end);
362 return FindInReadable(aPattern, start, end, aCompare);
363 }
364
365 bool CaseInsensitiveFindInReadable(const nsACString& aPattern,
366 nsACString::const_iterator&,
367 nsACString::const_iterator&);
368
369 /**
370 * Finds the rightmost occurrence of |aPattern|
371 * Returns |true| if a match was found, and adjusts |aSearchStart| and
372 * |aSearchEnd| to point to the match. If no match was found, returns |false|
373 * and makes |aSearchStart == aSearchEnd|.
374 *
375 */
376 bool RFindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
377 nsAString::const_iterator&,
378 const nsStringComparator& = nsDefaultStringComparator());
379 bool RFindInReadable(const nsACString& aPattern, nsACString::const_iterator&,
380 nsACString::const_iterator&,
381 const nsCStringComparator& = nsDefaultCStringComparator());
382
383 /**
384 * Finds the leftmost occurrence of |aChar|, if any in the range
385 * |aSearchStart|..|aSearchEnd|.
386 *
387 * Returns |true| if a match was found, and adjusts |aSearchStart| to
388 * point to the match. If no match was found, returns |false| and
389 * makes |aSearchStart == aSearchEnd|.
390 */
391 bool FindCharInReadable(char16_t aChar, nsAString::const_iterator& aSearchStart,
392 const nsAString::const_iterator& aSearchEnd);
393 bool FindCharInReadable(char aChar, nsACString::const_iterator& aSearchStart,
394 const nsACString::const_iterator& aSearchEnd);
395
396 /**
 * Finds the number of occurrences of |aChar| in the string |aStr|
398 */
399 uint32_t CountCharInReadable(const nsAString& aStr, char16_t aChar);
400 uint32_t CountCharInReadable(const nsACString& aStr, char aChar);
401
402 bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring);
403 bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring,
404 const nsStringComparator& aComparator);
405 bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring);
406 bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring,
407 const nsCStringComparator& aComparator);
408 bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring);
409 bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring,
410 const nsStringComparator& aComparator);
411 bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring);
412 bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring,
413 const nsCStringComparator& aComparator);
414
415 const nsString& EmptyString();
416 const nsCString& EmptyCString();
417
418 const nsString& VoidString();
419 const nsCString& VoidCString();
420
421 /**
422 * Compare a UTF-8 string to an UTF-16 string.
423 *
 * Returns 0 if the strings are equal, -1 if aUTF8String is less
 * than aUTF16String, and 1 in the reverse case. In case of fatal
426 * error (eg the strings are not valid UTF8 and UTF16 respectively),
427 * this method will return INT32_MIN.
428 */
429 int32_t CompareUTF8toUTF16(const nsACString& aUTF8String,
430 const nsAString& aUTF16String);
431
432 void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest);
433
/**
 * Sets the length of |aStr| to |aLen| and reports whether it took effect.
 *
 * @param aStr any object providing |SetLength(uint32_t)| and |Length()|
 * @param aLen the requested length
 * @return |true| if |aStr.Length()| equals |aLen| after the call to
 *         |SetLength|, |false| otherwise
 */
template <class T>
inline bool EnsureStringLength(T& aStr, uint32_t aLen) {
  aStr.SetLength(aLen);
  const bool lengthMatches = (aStr.Length() == aLen);
  return lengthMatches;
}
439
440 #endif // !defined(nsReadableUtils_h___)
441