1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 // IWYU pragma: private, include "nsString.h"
7 
8 #ifndef nsReadableUtils_h___
9 #define nsReadableUtils_h___
10 
11 /**
12  * I guess all the routines in this file are all mis-named.
13  * According to our conventions, they should be |NS_xxx|.
14  */
15 
16 #include "mozilla/Assertions.h"
17 #include "nsAString.h"
18 
19 #include "nsTArrayForwardDeclare.h"
20 
21 // Can't include mozilla/Encoding.h here
22 extern "C" {
23 size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
24 size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
25 }
26 
Distance(const nsReadingIterator<char16_t> & aStart,const nsReadingIterator<char16_t> & aEnd)27 inline size_t Distance(const nsReadingIterator<char16_t>& aStart,
28                        const nsReadingIterator<char16_t>& aEnd) {
29   MOZ_ASSERT(aStart.get() <= aEnd.get());
30   return static_cast<size_t>(aEnd.get() - aStart.get());
31 }
Distance(const nsReadingIterator<char> & aStart,const nsReadingIterator<char> & aEnd)32 inline size_t Distance(const nsReadingIterator<char>& aStart,
33                        const nsReadingIterator<char>& aEnd) {
34   MOZ_ASSERT(aStart.get() <= aEnd.get());
35   return static_cast<size_t>(aEnd.get() - aStart.get());
36 }
37 
38 void LossyCopyUTF16toASCII(const nsAString& aSource, nsACString& aDest);
39 void CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest);
40 MOZ_MUST_USE bool CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest,
41                                    const mozilla::fallible_t&);
42 
43 void LossyCopyUTF16toASCII(const char16ptr_t aSource, nsACString& aDest);
44 void CopyASCIItoUTF16(const char* aSource, nsAString& aDest);
45 
46 void CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest);
47 MOZ_MUST_USE bool CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
48                                   const mozilla::fallible_t&);
49 void CopyUTF8toUTF16(const nsACString& aSource, nsAString& aDest);
50 
51 void CopyUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest);
52 void CopyUTF8toUTF16(const char* aSource, nsAString& aDest);
53 
54 void LossyAppendUTF16toASCII(const nsAString& aSource, nsACString& aDest);
55 void AppendASCIItoUTF16(const nsACString& aSource, nsAString& aDest);
56 MOZ_MUST_USE bool AppendASCIItoUTF16(const nsACString& aSource,
57                                      nsAString& aDest,
58                                      const mozilla::fallible_t&);
59 
60 void LossyAppendUTF16toASCII(const char16ptr_t aSource, nsACString& aDest);
61 MOZ_MUST_USE bool AppendASCIItoUTF16(const char* aSource, nsAString& aDest,
62                                      const mozilla::fallible_t&);
63 void AppendASCIItoUTF16(const char* aSource, nsAString& aDest);
64 
65 void AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest);
66 MOZ_MUST_USE bool AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest,
67                                     const mozilla::fallible_t&);
68 void AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest);
69 MOZ_MUST_USE bool AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest,
70                                     const mozilla::fallible_t&);
71 
72 void AppendUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest);
73 void AppendUTF8toUTF16(const char* aSource, nsAString& aDest);
74 
75 /**
76  * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
77  *
78  * Allocates and returns a new |char| buffer which you must free with |free|.
79  * Performs a lossy encoding conversion by chopping 16-bit wide characters down
80  * to 8-bits wide while copying |aSource| to your new buffer. This conversion is
81  * not well defined; but it reproduces legacy string behavior. The new buffer is
82  * zero-terminated, but that may not help you if |aSource| contains embedded
83  * nulls.
84  *
85  * @param aSource a 16-bit wide string
86  * @return a new |char| buffer you must free with |free|.
87  */
88 char* ToNewCString(const nsAString& aSource);
89 
90 /**
91  * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
92  *
93  * Allocates and returns a new |char| buffer which you must free with |free|.
94  * The new buffer is zero-terminated, but that may not help you if |aSource|
95  * contains embedded nulls.
96  *
97  * @param aSource an 8-bit wide string
98  * @return a new |char| buffer you must free with |free|.
99  */
100 char* ToNewCString(const nsACString& aSource);
101 
102 /**
103  * Returns a new |char| buffer containing a zero-terminated copy of |aSource|.
104  *
105  * Allocates and returns a new |char| buffer which you must free with
106  * |free|.
107  * Performs an encoding conversion from a UTF-16 string to a UTF-8 string
108  * copying |aSource| to your new buffer.
109  * The new buffer is zero-terminated, but that may not help you if |aSource|
110  * contains embedded nulls.
111  *
112  * @param aSource a UTF-16 string (made of char16_t's)
113  * @param aUTF8Count the number of 8-bit units that was returned
114  * @return a new |char| buffer you must free with |free|.
115  */
116 
117 char* ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count = nullptr);
118 
119 /**
120  * Returns a new |char16_t| buffer containing a zero-terminated copy of
121  * |aSource|.
122  *
123  * Allocates and returns a new |char16_t| buffer which you must free with
124  * |free|.
125  * The new buffer is zero-terminated, but that may not help you if |aSource|
126  * contains embedded nulls.
127  *
128  * @param aSource a UTF-16 string
129  * @return a new |char16_t| buffer you must free with |free|.
130  */
131 char16_t* ToNewUnicode(const nsAString& aSource);
132 
133 /**
134  * Returns a new |char16_t| buffer containing a zero-terminated copy of
135  * |aSource|.
136  *
137  * Allocates and returns a new |char16_t| buffer which you must free with
138  * |free|. Performs an encoding conversion by 0-padding 8-bit wide characters up
139  * to 16-bits wide while copying |aSource| to your new buffer. This conversion
140  * is not well defined; but it reproduces legacy string behavior. The new buffer
141  * is zero-terminated, but that may not help you if |aSource| contains embedded
142  * nulls.
143  *
144  * @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
145  * @return a new |char16_t| buffer you must free with |free|.
146  */
147 char16_t* ToNewUnicode(const nsACString& aSource);
148 
149 /**
150  * Returns the required length for a char16_t buffer holding
151  * a copy of aSource, using UTF-8 to UTF-16 conversion.
152  * The length does NOT include any space for zero-termination.
153  *
154  * @param aSource an 8-bit wide string, UTF-8 encoded
155  * @return length of UTF-16 encoded string copy, not zero-terminated
156  */
157 uint32_t CalcUTF8ToUnicodeLength(const nsACString& aSource);
158 
/**
 * Copies the source string into the specified buffer, converting UTF-8 to
 * UTF-16 in the process. The conversion is well defined for valid UTF-8
 * strings.
 * The copied string will be zero-terminated! Any embedded nulls will be
 * copied nonetheless. It is the caller's responsibility to ensure the buffer
 * is large enough to hold the string copy plus one char16_t for
 * zero-termination!
 *
 * @see CalcUTF8ToUnicodeLength( const nsACString& )
 * @see UTF8ToNewUnicode( const nsACString&, uint32_t* )
 *
 * @param aSource an 8-bit wide string, UTF-8 encoded
 * @param aBuffer the buffer holding the converted string copy
 * @param aUTF16Count receiving optionally the number of 16-bit units that
 *                    were copied
 * @return aBuffer pointer, for convenience
 */
177 char16_t* UTF8ToUnicodeBuffer(const nsACString& aSource, char16_t* aBuffer,
178                               uint32_t* aUTF16Count = nullptr);
179 
/**
 * Returns a new |char16_t| buffer containing a zero-terminated copy
 * of |aSource|.
 *
 * Allocates and returns a new |char16_t| buffer which you must free with
 * |free|.  Performs an encoding conversion from UTF-8 to UTF-16
 * while copying |aSource| to your new buffer.  This conversion is well defined
 * for a valid UTF-8 string.  The new buffer is zero-terminated, but that
 * may not help you if |aSource| contains embedded nulls.
 *
 * @param aSource an 8-bit wide string, UTF-8 encoded
 * @param aUTF16Count the number of 16-bit units that was returned
 * @return a new |char16_t| buffer you must free with |free|.
 *         (UTF-16 encoded)
 */
195 char16_t* UTF8ToNewUnicode(const nsACString& aSource,
196                            uint32_t* aUTF16Count = nullptr);
197 
198 /**
199  * Copies |aLength| 16-bit code units from the start of |aSource| to the
200  * |char16_t| buffer |aDest|.
201  *
202  * After this operation |aDest| is not null terminated.
203  *
204  * @param aSource a UTF-16 string
205  * @param aSrcOffset start offset in the source string
206  * @param aDest a |char16_t| buffer
207  * @param aLength the number of 16-bit code units to copy
208  * @return pointer to destination buffer - identical to |aDest|
209  */
210 char16_t* CopyUnicodeTo(const nsAString& aSource, uint32_t aSrcOffset,
211                         char16_t* aDest, uint32_t aLength);
212 
213 /**
214  * Copies 16-bit characters between iterators |aSrcStart| and
215  * |aSrcEnd| to the writable string |aDest|. Similar to the
216  * |nsString::Mid| method.
217  *
218  * After this operation |aDest| is not null terminated.
219  *
220  * @param aSrcStart start source iterator
221  * @param aSrcEnd end source iterator
222  * @param aDest destination for the copy
223  */
224 void CopyUnicodeTo(const nsAString::const_iterator& aSrcStart,
225                    const nsAString::const_iterator& aSrcEnd, nsAString& aDest);
226 
227 /**
228  * Appends 16-bit characters between iterators |aSrcStart| and
229  * |aSrcEnd| to the writable string |aDest|.
230  *
231  * After this operation |aDest| is not null terminated.
232  *
233  * @param aSrcStart start source iterator
234  * @param aSrcEnd end source iterator
235  * @param aDest destination for the copy
236  */
237 void AppendUnicodeTo(const nsAString::const_iterator& aSrcStart,
238                      const nsAString::const_iterator& aSrcEnd,
239                      nsAString& aDest);
240 
/**
 * Returns |true| if |aString| contains only ASCII characters, that is,
 * characters in the inclusive range [0x00, 0x7F].
 *
 * @param aString a 16-bit wide string to scan
 */
247 bool IsASCII(const nsAString& aString);
248 
249 /**
250  * Returns |true| if |aString| contains only ASCII characters, that is,
251  * characters in the range (0x00, 0x7F).
252  *
253  * @param aString a 8-bit wide string to scan
254  */
IsASCII(const nsACString & aString)255 inline bool IsASCII(const nsACString& aString) {
256   size_t length = aString.Length();
257   const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
258   // For short strings, calling into Rust is a pessimization, and the SIMD
259   // code won't have a chance to kick in anyway. Additionally, handling the
260   // case of the empty string here makes null-checking ptr unnecessary.
261   // (Passing nullptr to Rust would technically be UB.)
262   if (length < 16) {
263     size_t accu = 0;
264     for (size_t i = 0; i < length; i++) {
265       accu |= ptr[i];
266     }
267     return accu < 0x80;
268   }
269   // This is not quite optimal, because it's not fail-fast when the by-register
270   // check already finds non-ASCII. Also, input to this function is almost
271   // always ASCII, so even the by-register check wouldn't need to be fail-fast
272   // and could be more like the loop above.
273   return length == encoding_ascii_valid_up_to(ptr, length);
274 }
275 
276 /**
277  * Returns |true| if |aString| is a valid UTF-8 string.
278  *
279  * Note that this doesn't check whether the string might look like a valid
280  * string in another encoding, too, e.g. ISO-2022-JP.
281  *
282  * @param aString an 8-bit wide string to scan
283  */
IsUTF8(const nsACString & aString)284 inline bool IsUTF8(const nsACString& aString) {
285   size_t length = aString.Length();
286   const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
287   // For short strings, calling into Rust is a pessimization, and the SIMD
288   // code won't have a chance to kick in anyway. Additionally, handling the
289   // case of the empty string here makes null-checking ptr unnecessary.
290   // (Passing nullptr to Rust would technically be UB.)
291   if (length < 16) {
292     for (size_t i = 0; i < length; i++) {
293       if (ptr[i] >= 0x80) {
294         ptr += i;
295         length -= i;
296         goto end;
297       }
298     }
299     return true;
300   }
301 end:
302   return length == encoding_utf8_valid_up_to(ptr, length);
303 }
304 
305 bool ParseString(const nsACString& aAstring, char aDelimiter,
306                  nsTArray<nsCString>& aArray);
307 
308 /**
309  * Converts case in place in the argument string.
310  */
311 void ToUpperCase(nsACString&);
312 
313 void ToLowerCase(nsACString&);
314 
315 void ToUpperCase(nsACString&);
316 
317 void ToLowerCase(nsACString&);
318 
319 /**
320  * Converts case from string aSource to aDest.
321  */
322 void ToUpperCase(const nsACString& aSource, nsACString& aDest);
323 
324 void ToLowerCase(const nsACString& aSource, nsACString& aDest);
325 
326 /**
327  * Finds the leftmost occurrence of |aPattern|, if any in the range
328  * |aSearchStart|..|aSearchEnd|.
329  *
330  * Returns |true| if a match was found, and adjusts |aSearchStart| and
331  * |aSearchEnd| to point to the match.  If no match was found, returns |false|
332  * and makes |aSearchStart == aSearchEnd|.
333  *
334  * Currently, this is equivalent to the O(m*n) implementation previously on
335  * |ns[C]String|. If we need something faster, then we can implement that later.
336  */
337 
338 bool FindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
339                     nsAString::const_iterator&,
340                     const nsStringComparator& = nsDefaultStringComparator());
341 bool FindInReadable(const nsACString& aPattern, nsACString::const_iterator&,
342                     nsACString::const_iterator&,
343                     const nsCStringComparator& = nsDefaultCStringComparator());
344 
345 /* sometimes we don't care about where the string was, just that we
346  * found it or not */
347 inline bool FindInReadable(
348     const nsAString& aPattern, const nsAString& aSource,
349     const nsStringComparator& aCompare = nsDefaultStringComparator()) {
350   nsAString::const_iterator start, end;
351   aSource.BeginReading(start);
352   aSource.EndReading(end);
353   return FindInReadable(aPattern, start, end, aCompare);
354 }
355 
356 inline bool FindInReadable(
357     const nsACString& aPattern, const nsACString& aSource,
358     const nsCStringComparator& aCompare = nsDefaultCStringComparator()) {
359   nsACString::const_iterator start, end;
360   aSource.BeginReading(start);
361   aSource.EndReading(end);
362   return FindInReadable(aPattern, start, end, aCompare);
363 }
364 
365 bool CaseInsensitiveFindInReadable(const nsACString& aPattern,
366                                    nsACString::const_iterator&,
367                                    nsACString::const_iterator&);
368 
369 /**
370  * Finds the rightmost occurrence of |aPattern|
371  * Returns |true| if a match was found, and adjusts |aSearchStart| and
372  * |aSearchEnd| to point to the match.  If no match was found, returns |false|
373  * and makes |aSearchStart == aSearchEnd|.
374  *
375  */
376 bool RFindInReadable(const nsAString& aPattern, nsAString::const_iterator&,
377                      nsAString::const_iterator&,
378                      const nsStringComparator& = nsDefaultStringComparator());
379 bool RFindInReadable(const nsACString& aPattern, nsACString::const_iterator&,
380                      nsACString::const_iterator&,
381                      const nsCStringComparator& = nsDefaultCStringComparator());
382 
383 /**
384  * Finds the leftmost occurrence of |aChar|, if any in the range
385  * |aSearchStart|..|aSearchEnd|.
386  *
387  * Returns |true| if a match was found, and adjusts |aSearchStart| to
388  * point to the match.  If no match was found, returns |false| and
389  * makes |aSearchStart == aSearchEnd|.
390  */
391 bool FindCharInReadable(char16_t aChar, nsAString::const_iterator& aSearchStart,
392                         const nsAString::const_iterator& aSearchEnd);
393 bool FindCharInReadable(char aChar, nsACString::const_iterator& aSearchStart,
394                         const nsACString::const_iterator& aSearchEnd);
395 
/**
 * Finds the number of occurrences of |aChar| in the string |aStr|.
 */
399 uint32_t CountCharInReadable(const nsAString& aStr, char16_t aChar);
400 uint32_t CountCharInReadable(const nsACString& aStr, char aChar);
401 
402 bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring);
403 bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring,
404                       const nsStringComparator& aComparator);
405 bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring);
406 bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring,
407                       const nsCStringComparator& aComparator);
408 bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring);
409 bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring,
410                     const nsStringComparator& aComparator);
411 bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring);
412 bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring,
413                     const nsCStringComparator& aComparator);
414 
415 const nsString& EmptyString();
416 const nsCString& EmptyCString();
417 
418 const nsString& VoidString();
419 const nsCString& VoidCString();
420 
/**
 * Compare a UTF-8 string to a UTF-16 string.
 *
 * Returns 0 if the strings are equal, -1 if aUTF8String is less
 * than aUTF16String, and 1 in the reverse case.  In case of fatal
 * error (eg the strings are not valid UTF8 and UTF16 respectively),
 * this method will return INT32_MIN.
 */
429 int32_t CompareUTF8toUTF16(const nsACString& aUTF8String,
430                            const nsAString& aUTF16String);
431 
432 void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest);
433 
/**
 * Sets the length of |aStr| to |aLen| and reports whether it took effect.
 *
 * Calls |aStr.SetLength(aLen)| and then compares |aStr.Length()| with |aLen|.
 *
 * @param aStr any object providing |SetLength(uint32_t)| and |Length()|
 * @param aLen the requested length
 * @return |true| if |aStr.Length()| equals |aLen| after the call
 */
template <class T>
inline bool EnsureStringLength(T& aStr, uint32_t aLen) {
  aStr.SetLength(aLen);
  return (aStr.Length() == aLen);
}
439 
440 #endif  // !defined(nsReadableUtils_h___)
441