1 /*
2  * Copyright (C) 2004, 2006, 2008, 2010 Apple Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 
26 #include "third_party/blink/renderer/platform/wtf/text/text_codec_utf16.h"
27 
28 #include <memory>
29 #include "third_party/blink/renderer/platform/wtf/text/character_names.h"
30 #include "third_party/blink/renderer/platform/wtf/text/string_buffer.h"
31 #include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
32 
33 namespace WTF {
34 
RegisterEncodingNames(EncodingNameRegistrar registrar)35 void TextCodecUTF16::RegisterEncodingNames(EncodingNameRegistrar registrar) {
36   registrar("UTF-16LE", "UTF-16LE");
37   registrar("UTF-16BE", "UTF-16BE");
38 
39   registrar("ISO-10646-UCS-2", "UTF-16LE");
40   registrar("UCS-2", "UTF-16LE");
41   registrar("UTF-16", "UTF-16LE");
42   registrar("Unicode", "UTF-16LE");
43   registrar("csUnicode", "UTF-16LE");
44   registrar("unicodeFEFF", "UTF-16LE");
45 
46   registrar("unicodeFFFE", "UTF-16BE");
47 }
48 
NewStreamingTextDecoderUTF16LE(const TextEncoding &,const void *)49 static std::unique_ptr<TextCodec> NewStreamingTextDecoderUTF16LE(
50     const TextEncoding&,
51     const void*) {
52   return std::make_unique<TextCodecUTF16>(true);
53 }
54 
NewStreamingTextDecoderUTF16BE(const TextEncoding &,const void *)55 static std::unique_ptr<TextCodec> NewStreamingTextDecoderUTF16BE(
56     const TextEncoding&,
57     const void*) {
58   return std::make_unique<TextCodecUTF16>(false);
59 }
60 
RegisterCodecs(TextCodecRegistrar registrar)61 void TextCodecUTF16::RegisterCodecs(TextCodecRegistrar registrar) {
62   registrar("UTF-16LE", NewStreamingTextDecoderUTF16LE, nullptr);
63   registrar("UTF-16BE", NewStreamingTextDecoderUTF16BE, nullptr);
64 }
65 
Decode(const char * bytes,wtf_size_t length,FlushBehavior flush,bool,bool & saw_error)66 String TextCodecUTF16::Decode(const char* bytes,
67                               wtf_size_t length,
68                               FlushBehavior flush,
69                               bool,
70                               bool& saw_error) {
71   // For compatibility reasons, ignore flush from fetch EOF.
72   const bool really_flush = flush != FlushBehavior::kDoNotFlush &&
73                             flush != FlushBehavior::kFetchEOF;
74 
75   if (!length) {
76     if (really_flush && (have_lead_byte_ || have_lead_surrogate_)) {
77       have_lead_byte_ = have_lead_surrogate_ = false;
78       saw_error = true;
79       return String(&kReplacementCharacter, 1);
80     }
81     return String();
82   }
83 
84   const unsigned char* p = reinterpret_cast<const unsigned char*>(bytes);
85   const wtf_size_t num_bytes = length + have_lead_byte_;
86   const bool will_have_extra_byte = num_bytes & 1;
87   const wtf_size_t num_chars_in = num_bytes / 2;
88   const wtf_size_t max_chars_out =
89       num_chars_in + (have_lead_surrogate_ ? 1 : 0) +
90       (really_flush && will_have_extra_byte ? 1 : 0);
91 
92   StringBuffer<UChar> buffer(max_chars_out);
93   UChar* q = buffer.Characters();
94 
95   for (wtf_size_t i = 0; i < num_chars_in; ++i) {
96     UChar c;
97     if (have_lead_byte_) {
98       c = little_endian_ ? (lead_byte_ | (p[0] << 8))
99                          : ((lead_byte_ << 8) | p[0]);
100       have_lead_byte_ = false;
101       ++p;
102     } else {
103       c = little_endian_ ? (p[0] | (p[1] << 8)) : ((p[0] << 8) | p[1]);
104       p += 2;
105     }
106 
107     // TODO(jsbell): If necessary for performance, m_haveLeadByte handling
108     // can be pulled out and this loop split into distinct cases for
109     // big/little endian. The logic from here to the end of the loop is
110     // constant with respect to m_haveLeadByte and m_littleEndian.
111 
112     if (have_lead_surrogate_ && U_IS_TRAIL(c)) {
113       *q++ = lead_surrogate_;
114       have_lead_surrogate_ = false;
115       *q++ = c;
116     } else {
117       if (have_lead_surrogate_) {
118         have_lead_surrogate_ = false;
119         saw_error = true;
120         *q++ = kReplacementCharacter;
121       }
122 
123       if (U_IS_LEAD(c)) {
124         have_lead_surrogate_ = true;
125         lead_surrogate_ = c;
126       } else if (U_IS_TRAIL(c)) {
127         saw_error = true;
128         *q++ = kReplacementCharacter;
129       } else {
130         *q++ = c;
131       }
132     }
133   }
134 
135   DCHECK(!have_lead_byte_);
136   if (will_have_extra_byte) {
137     have_lead_byte_ = true;
138     lead_byte_ = p[0];
139   }
140 
141   if (really_flush && (have_lead_byte_ || have_lead_surrogate_)) {
142     have_lead_byte_ = have_lead_surrogate_ = false;
143     saw_error = true;
144     *q++ = kReplacementCharacter;
145   }
146 
147   buffer.Shrink(static_cast<wtf_size_t>(q - buffer.Characters()));
148 
149   return String::Adopt(buffer);
150 }
151 
Encode(const UChar * characters,wtf_size_t length,UnencodableHandling)152 std::string TextCodecUTF16::Encode(const UChar* characters,
153                                    wtf_size_t length,
154                                    UnencodableHandling) {
155   // We need to be sure we can double the length without overflowing.
156   // Since the passed-in length is the length of an actual existing
157   // character buffer, each character is two bytes, and we know
158   // the buffer doesn't occupy the entire address space, we can
159   // assert here that doubling the length does not overflow wtf_size_t
160   // and there's no need for a runtime check.
161   DCHECK_LE(length, std::numeric_limits<wtf_size_t>::max() / 2);
162 
163   std::string result(length * 2, '\0');
164 
165   if (little_endian_) {
166     for (wtf_size_t i = 0; i < length; ++i) {
167       UChar c = characters[i];
168       result[i * 2] = static_cast<char>(c);
169       result[i * 2 + 1] = c >> 8;
170     }
171   } else {
172     for (wtf_size_t i = 0; i < length; ++i) {
173       UChar c = characters[i];
174       result[i * 2] = c >> 8;
175       result[i * 2 + 1] = static_cast<char>(c);
176     }
177   }
178 
179   return result;
180 }
181 
Encode(const LChar * characters,wtf_size_t length,UnencodableHandling)182 std::string TextCodecUTF16::Encode(const LChar* characters,
183                                    wtf_size_t length,
184                                    UnencodableHandling) {
185   // In the LChar case, we do actually need to perform this check in release. :)
186   CHECK_LE(length, std::numeric_limits<wtf_size_t>::max() / 2);
187 
188   std::string result(length * 2, '\0');
189 
190   if (little_endian_) {
191     for (wtf_size_t i = 0; i < length; ++i) {
192       result[i * 2] = characters[i];
193       result[i * 2 + 1] = 0;
194     }
195   } else {
196     for (wtf_size_t i = 0; i < length; ++i) {
197       result[i * 2] = 0;
198       result[i * 2 + 1] = characters[i];
199     }
200   }
201 
202   return result;
203 }
204 
205 }  // namespace WTF
206