1 /* This Source Code Form is subject to the terms of the Mozilla Public
2  * License, v. 2.0. If a copy of the MPL was not distributed with this
3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4 
5 /* Latin-1 operations (i.e. a byte is the corresponding code point).
6  * (Note: this is *not* the same as the encoding of windows-1252 or
7  * latin1 content on the web. In Web terms, this encoding
8  * corresponds to "isomorphic decode" / "isomorphic encoding" from
9  * the Infra Standard.)
10  */
11 
12 #ifndef mozilla_Latin1_h
13 #define mozilla_Latin1_h
14 
15 #include <type_traits>
16 
17 #include "mozilla/JsRust.h"
18 #include "mozilla/Span.h"
19 #include "mozilla/Tuple.h"
20 
21 #if MOZ_HAS_JSRUST()
22 #  include "encoding_rs_mem.h"
23 #endif
24 
25 namespace mozilla {
26 
namespace detail {

// Strings shorter than this many code units are processed by the inline
// scalar loops in this header instead of the out-of-line routines. The
// Latin1-ness checks and the inflation/deflation functions must all use
// the same limit, or optimizations that rely on them agreeing will fail.
// The value is 16 because that is the shortest length that
// inflates/deflates using SIMD.
constexpr size_t kShortStringLimitForInlinePaths = 16;

// Maps a character type to the type used for unsigned comparisons of its
// values. The general case defers to std::make_unsigned_t.
template <typename Char>
struct MakeUnsignedChar {
  using Type = std::make_unsigned_t<Char>;
};

// char16_t maps to itself rather than going through std::make_unsigned_t.
template <>
struct MakeUnsignedChar<char16_t> {
  using Type = char16_t;
};

// char32_t maps to itself rather than going through std::make_unsigned_t.
template <>
struct MakeUnsignedChar<char32_t> {
  using Type = char32_t;
};

}  // namespace detail
54 
55 /**
56  * Returns true iff |aChar| is Latin-1 but not ASCII, i.e. in the range
57  * [0x80, 0xFF].
58  */
59 template <typename Char>
IsNonAsciiLatin1(Char aChar)60 constexpr bool IsNonAsciiLatin1(Char aChar) {
61   using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
62   auto uc = static_cast<UnsignedChar>(aChar);
63   return uc >= 0x80 && uc <= 0xFF;
64 }
65 
66 #if MOZ_HAS_JSRUST()
67 
68 /**
69  * Returns |true| iff |aString| contains only Latin1 characters, that is,
70  * characters in the range [U+0000, U+00FF].
71  *
72  * @param aString a potentially-invalid UTF-16 string to scan
73  */
IsUtf16Latin1(mozilla::Span<const char16_t> aString)74 inline bool IsUtf16Latin1(mozilla::Span<const char16_t> aString) {
75   size_t length = aString.Length();
76   const char16_t* ptr = aString.Elements();
77   // For short strings, calling into Rust is a pessimization, and the SIMD
78   // code won't have a chance to kick in anyway.
79   // 16 is a bit larger than logically necessary for this function alone,
80   // but it's important that the limit here matches the limit used in
81   // LossyConvertUtf16toLatin1!
82   if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
83     char16_t accu = 0;
84     for (size_t i = 0; i < length; i++) {
85       accu |= ptr[i];
86     }
87     return accu < 0x100;
88   }
89   return encoding_mem_is_utf16_latin1(ptr, length);
90 }
91 
92 /**
93  * Returns |true| iff |aString| is valid UTF-8 containing only Latin-1
94  * characters.
95  *
96  * If you know that the argument is always absolutely guaranteed to be valid
97  * UTF-8, use the faster UnsafeIsValidUtf8Latin1() instead.
98  *
99  * @param aString potentially-invalid UTF-8 string to scan
100  */
IsUtf8Latin1(mozilla::Span<const char> aString)101 inline bool IsUtf8Latin1(mozilla::Span<const char> aString) {
102   return encoding_mem_is_utf8_latin1(aString.Elements(), aString.Length());
103 }
104 
105 /**
106  * Returns |true| iff |aString|, which MUST be valid UTF-8, contains only
107  * Latin1 characters, that is, characters in the range [U+0000, U+00FF].
108  * (If |aString| might not be valid UTF-8, use |IsUtf8Latin1| instead.)
109  *
110  * @param aString known-valid UTF-8 string to scan
111  */
UnsafeIsValidUtf8Latin1(mozilla::Span<const char> aString)112 inline bool UnsafeIsValidUtf8Latin1(mozilla::Span<const char> aString) {
113   return encoding_mem_is_str_latin1(aString.Elements(), aString.Length());
114 }
115 
116 /**
117  * Returns the index of first byte that starts an invalid byte
118  * sequence or a non-Latin1 byte sequence in a potentially-invalid UTF-8
119  * string, or the length of the string if there are neither.
120  *
121  * If you know that the argument is always absolutely guaranteed to be valid
122  * UTF-8, use the faster UnsafeValidUtf8Lati1UpTo() instead.
123  *
124  * @param aString potentially-invalid UTF-8 string to scan
125  */
Utf8Latin1UpTo(mozilla::Span<const char> aString)126 inline size_t Utf8Latin1UpTo(mozilla::Span<const char> aString) {
127   return encoding_mem_utf8_latin1_up_to(aString.Elements(), aString.Length());
128 }
129 
130 /**
131  * Returns the index of first byte that starts a non-Latin1 byte
132  * sequence in a known-valid UTF-8 string, or the length of the
133  * string if there are none. (If the string might not be valid
134  * UTF-8, use Utf8Latin1UpTo() instead.)
135  *
136  * @param aString known-valid UTF-8 string to scan
137  */
UnsafeValidUtf8Lati1UpTo(mozilla::Span<const char> aString)138 inline size_t UnsafeValidUtf8Lati1UpTo(mozilla::Span<const char> aString) {
139   return encoding_mem_str_latin1_up_to(aString.Elements(), aString.Length());
140 }
141 
142 /**
143  * If all the code points in the input are below U+0100, converts to Latin1,
144  * i.e. unsigned byte value is Unicode scalar value. If there are code points
145  * above U+00FF, produces unspecified garbage in a memory-safe way. The
146  * nature of the garbage must not be relied upon.
147  *
148  * The length of aDest must not be less than the length of aSource.
149  */
LossyConvertUtf16toLatin1(mozilla::Span<const char16_t> aSource,mozilla::Span<char> aDest)150 inline void LossyConvertUtf16toLatin1(mozilla::Span<const char16_t> aSource,
151                                       mozilla::Span<char> aDest) {
152   const char16_t* srcPtr = aSource.Elements();
153   size_t srcLen = aSource.Length();
154   char* dstPtr = aDest.Elements();
155   size_t dstLen = aDest.Length();
156   // Avoid function call overhead when SIMD isn't used anyway
157   // If you change the length limit here, be sure to change
158   // IsUtf16Latin1 and IsAscii to match so that optimizations don't
159   // fail!
160   if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
161     MOZ_ASSERT(dstLen >= srcLen);
162     uint8_t* unsignedPtr = reinterpret_cast<uint8_t*>(dstPtr);
163     const char16_t* end = srcPtr + srcLen;
164     while (srcPtr < end) {
165       *unsignedPtr = static_cast<uint8_t>(*srcPtr);
166       ++srcPtr;
167       ++unsignedPtr;
168     }
169     return;
170   }
171   encoding_mem_convert_utf16_to_latin1_lossy(srcPtr, srcLen, dstPtr, dstLen);
172 }
173 
174 /**
175  * If all the code points in the input are below U+0100, converts to Latin1,
176  * i.e. unsigned byte value is Unicode scalar value. If there are code points
177  * above U+00FF, produces unspecified garbage in a memory-safe way. The
178  * nature of the garbage must not be relied upon.
179  *
180  * Returns the number of code units written.
181  *
182  * The length of aDest must not be less than the length of aSource.
183  */
LossyConvertUtf8toLatin1(mozilla::Span<const char> aSource,mozilla::Span<char> aDest)184 inline size_t LossyConvertUtf8toLatin1(mozilla::Span<const char> aSource,
185                                        mozilla::Span<char> aDest) {
186   return encoding_mem_convert_utf8_to_latin1_lossy(
187       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
188 }
189 
190 /**
191  * Converts each byte of |aSource|, interpreted as a Unicode scalar value
192  * having that unsigned value, to its UTF-8 representation in |aDest|.
193  *
194  * Returns the number of code units written.
195  *
196  * The length of aDest must be at least twice the length of aSource.
197  */
ConvertLatin1toUtf8(mozilla::Span<const char> aSource,mozilla::Span<char> aDest)198 inline size_t ConvertLatin1toUtf8(mozilla::Span<const char> aSource,
199                                   mozilla::Span<char> aDest) {
200   return encoding_mem_convert_latin1_to_utf8(
201       aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
202 }
203 
204 /**
205  * Converts bytes whose unsigned value is interpreted as Unicode code point
206  * (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
207  * output space.
208  *
209  * Returns the number of bytes read and the number of bytes written.
210  *
211  * If the output isn't large enough, not all input is consumed.
212  *
213  * The conversion is guaranteed to be complete if the length of aDest is
214  * at least the length of aSource times two.
215  *
216  * The output is always valid UTF-8 ending on scalar value boundary
217  * even in the case of partial conversion.
218  *
219  * The semantics of this function match the semantics of
220  * TextEncoder.encodeInto.
221  * https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
222  */
ConvertLatin1toUtf8Partial(mozilla::Span<const char> aSource,mozilla::Span<char> aDest)223 inline mozilla::Tuple<size_t, size_t> ConvertLatin1toUtf8Partial(
224     mozilla::Span<const char> aSource, mozilla::Span<char> aDest) {
225   size_t srcLen = aSource.Length();
226   size_t dstLen = aDest.Length();
227   encoding_mem_convert_latin1_to_utf8_partial(aSource.Elements(), &srcLen,
228                                               aDest.Elements(), &dstLen);
229   return mozilla::MakeTuple(srcLen, dstLen);
230 }
231 
232 /**
233  * Converts Latin-1 code points (i.e. each byte is the identical code
234  * point) from |aSource| to UTF-16 code points in |aDest|.
235  *
236  * The length of aDest must not be less than the length of aSource.
237  */
ConvertLatin1toUtf16(mozilla::Span<const char> aSource,mozilla::Span<char16_t> aDest)238 inline void ConvertLatin1toUtf16(mozilla::Span<const char> aSource,
239                                  mozilla::Span<char16_t> aDest) {
240   const char* srcPtr = aSource.Elements();
241   size_t srcLen = aSource.Length();
242   char16_t* dstPtr = aDest.Elements();
243   size_t dstLen = aDest.Length();
244   // Avoid function call overhead when SIMD isn't used anyway
245   if (srcLen < mozilla::detail::kShortStringLimitForInlinePaths) {
246     MOZ_ASSERT(dstLen >= srcLen);
247     const uint8_t* unsignedPtr = reinterpret_cast<const uint8_t*>(srcPtr);
248     const uint8_t* end = unsignedPtr + srcLen;
249     while (unsignedPtr < end) {
250       *dstPtr = *unsignedPtr;
251       ++unsignedPtr;
252       ++dstPtr;
253     }
254     return;
255   }
256   encoding_mem_convert_latin1_to_utf16(srcPtr, srcLen, dstPtr, dstLen);
257 }
258 
259 #endif
260 
261 };  // namespace mozilla
262 
263 #endif  // mozilla_Latin1_h
264