1 /*
2 * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26 #include "third_party/blink/renderer/platform/wtf/text/text_codec_utf16.h"
27
28 #include <memory>
29 #include "third_party/blink/renderer/platform/wtf/text/character_names.h"
30 #include "third_party/blink/renderer/platform/wtf/text/string_buffer.h"
31 #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
32
33 namespace WTF {
34
RegisterEncodingNames(EncodingNameRegistrar registrar)35 void TextCodecUTF16::RegisterEncodingNames(EncodingNameRegistrar registrar) {
36 registrar("UTF-16LE", "UTF-16LE");
37 registrar("UTF-16BE", "UTF-16BE");
38
39 registrar("ISO-10646-UCS-2", "UTF-16LE");
40 registrar("UCS-2", "UTF-16LE");
41 registrar("UTF-16", "UTF-16LE");
42 registrar("Unicode", "UTF-16LE");
43 registrar("csUnicode", "UTF-16LE");
44 registrar("unicodeFEFF", "UTF-16LE");
45
46 registrar("unicodeFFFE", "UTF-16BE");
47 }
48
NewStreamingTextDecoderUTF16LE(const TextEncoding &,const void *)49 static std::unique_ptr<TextCodec> NewStreamingTextDecoderUTF16LE(
50 const TextEncoding&,
51 const void*) {
52 return std::make_unique<TextCodecUTF16>(true);
53 }
54
NewStreamingTextDecoderUTF16BE(const TextEncoding &,const void *)55 static std::unique_ptr<TextCodec> NewStreamingTextDecoderUTF16BE(
56 const TextEncoding&,
57 const void*) {
58 return std::make_unique<TextCodecUTF16>(false);
59 }
60
RegisterCodecs(TextCodecRegistrar registrar)61 void TextCodecUTF16::RegisterCodecs(TextCodecRegistrar registrar) {
62 registrar("UTF-16LE", NewStreamingTextDecoderUTF16LE, nullptr);
63 registrar("UTF-16BE", NewStreamingTextDecoderUTF16BE, nullptr);
64 }
65
Decode(const char * bytes,wtf_size_t length,FlushBehavior flush,bool,bool & saw_error)66 String TextCodecUTF16::Decode(const char* bytes,
67 wtf_size_t length,
68 FlushBehavior flush,
69 bool,
70 bool& saw_error) {
71 // For compatibility reasons, ignore flush from fetch EOF.
72 const bool really_flush = flush != FlushBehavior::kDoNotFlush &&
73 flush != FlushBehavior::kFetchEOF;
74
75 if (!length) {
76 if (really_flush && (have_lead_byte_ || have_lead_surrogate_)) {
77 have_lead_byte_ = have_lead_surrogate_ = false;
78 saw_error = true;
79 return String(&kReplacementCharacter, 1);
80 }
81 return String();
82 }
83
84 const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
85 const wtf_size_t num_bytes = length + have_lead_byte_;
86 const bool will_have_extra_byte = num_bytes & 1;
87 const wtf_size_t num_chars_in = num_bytes / 2;
88 const wtf_size_t max_chars_out =
89 num_chars_in + (have_lead_surrogate_ ? 1 : 0) +
90 (really_flush && will_have_extra_byte ? 1 : 0);
91
92 StringBuffer<UChar> buffer(max_chars_out);
93 UChar* q = buffer.Characters();
94
95 for (wtf_size_t i = 0; i < num_chars_in; ++i) {
96 UChar c;
97 if (have_lead_byte_) {
98 c = little_endian_ ? (lead_byte_ | (p[0] << 8))
99 : ((lead_byte_ << 8) | p[0]);
100 have_lead_byte_ = false;
101 ++p;
102 } else {
103 c = little_endian_ ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);
104 p += 2;
105 }
106
107 // TODO(jsbell): If necessary for performance, m_haveLeadByte handling
108 // can be pulled out and this loop split into distinct cases for
109 // big/little endian. The logic from here to the end of the loop is
110 // constant with respect to m_haveLeadByte and m_littleEndian.
111
112 if (have_lead_surrogate_ && U_IS_TRAIL(c)) {
113 *q++ = lead_surrogate_;
114 have_lead_surrogate_ = false;
115 *q++ = c;
116 } else {
117 if (have_lead_surrogate_) {
118 have_lead_surrogate_ = false;
119 saw_error = true;
120 *q++ = kReplacementCharacter;
121 }
122
123 if (U_IS_LEAD(c)) {
124 have_lead_surrogate_ = true;
125 lead_surrogate_ = c;
126 } else if (U_IS_TRAIL(c)) {
127 saw_error = true;
128 *q++ = kReplacementCharacter;
129 } else {
130 *q++ = c;
131 }
132 }
133 }
134
135 DCHECK(!have_lead_byte_);
136 if (will_have_extra_byte) {
137 have_lead_byte_ = true;
138 lead_byte_ = p[0];
139 }
140
141 if (really_flush && (have_lead_byte_ || have_lead_surrogate_)) {
142 have_lead_byte_ = have_lead_surrogate_ = false;
143 saw_error = true;
144 *q++ = kReplacementCharacter;
145 }
146
147 buffer.Shrink(static_cast<wtf_size_t>(q - buffer.Characters()));
148
149 return String::Adopt(buffer);
150 }
151
Encode(const UChar * characters,wtf_size_t length,UnencodableHandling)152 std::string TextCodecUTF16::Encode(const UChar* characters,
153 wtf_size_t length,
154 UnencodableHandling) {
155 // We need to be sure we can double the length without overflowing.
156 // Since the passed-in length is the length of an actual existing
157 // character buffer, each character is two bytes, and we know
158 // the buffer doesn't occupy the entire address space, we can
159 // assert here that doubling the length does not overflow wtf_size_t
160 // and there's no need for a runtime check.
161 DCHECK_LE(length, std::numeric_limits<wtf_size_t>::max() / 2);
162
163 std::string result(length * 2, '\0');
164
165 if (little_endian_) {
166 for (wtf_size_t i = 0; i < length; ++i) {
167 UChar c = characters[i];
168 result[i * 2] = static_cast<char>(c);
169 result[i * 2 + 1] = c >> 8;
170 }
171 } else {
172 for (wtf_size_t i = 0; i < length; ++i) {
173 UChar c = characters[i];
174 result[i * 2] = c >> 8;
175 result[i * 2 + 1] = static_cast<char>(c);
176 }
177 }
178
179 return result;
180 }
181
Encode(const LChar * characters,wtf_size_t length,UnencodableHandling)182 std::string TextCodecUTF16::Encode(const LChar* characters,
183 wtf_size_t length,
184 UnencodableHandling) {
185 // In the LChar case, we do actually need to perform this check in release. :)
186 CHECK_LE(length, std::numeric_limits<wtf_size_t>::max() / 2);
187
188 std::string result(length * 2, '\0');
189
190 if (little_endian_) {
191 for (wtf_size_t i = 0; i < length; ++i) {
192 result[i * 2] = characters[i];
193 result[i * 2 + 1] = 0;
194 }
195 } else {
196 for (wtf_size_t i = 0; i < length; ++i) {
197 result[i * 2] = 0;
198 result[i * 2 + 1] = characters[i];
199 }
200 }
201
202 return result;
203 }
204
205 } // namespace WTF
206