1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 #include <google/protobuf/util/internal/json_escaping.h>
32
33 #include <google/protobuf/stubs/logging.h>
34 #include <google/protobuf/stubs/common.h>
35
36 namespace google {
37 namespace protobuf {
38 namespace util {
39 namespace converter {
40
41 namespace {
42
// Lookup table that maps a nibble value (0-15) to its lowercase hexadecimal
// digit.
static const char kHex[] = "0123456789" "abcdef";
45
// Pre-computed escapes for the very common code points below 0xa0.
//
// For a unicode code point ch < 0xa0, kCommonEscapes[ch] holds:
//   - the escaped spelling of ch, when ch has to be escaped; or
//   - the empty string, when ch can be emitted unchanged.
//
// Each entry has room for the longest escape ("\uXXXX", 6 characters) plus
// the terminating NUL. The trailing comment on every row is the code point
// of the row's first entry.
static const char kCommonEscapes[160][7] = {
    // 0x00 - 0x1f: C0 (ASCII and derivatives) control characters.
    "\\u0000", "\\u0001", "\\u0002", "\\u0003",  // 0x00
    "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x04
    "\\b",     "\\t",     "\\n",     "\\u000b",  // 0x08
    "\\f",     "\\r",     "\\u000e", "\\u000f",  // 0x0c
    "\\u0010", "\\u0011", "\\u0012", "\\u0013",  // 0x10
    "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x14
    "\\u0018", "\\u0019", "\\u001a", "\\u001b",  // 0x18
    "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x1c
    // 0x20 - 0x7f: printable ASCII plus DEL.
    // Escaping of " and \ are required by www.json.org string definition.
    // Escaping of < and > are required for HTML security.
    "",        "",        "\\\"",    "",         // 0x20
    "",        "",        "",        "",         // 0x24
    "",        "",        "",        "",         // 0x28
    "",        "",        "",        "",         // 0x2c
    "",        "",        "",        "",         // 0x30
    "",        "",        "",        "",         // 0x34
    "",        "",        "",        "",         // 0x38
    "\\u003c", "",        "\\u003e", "",         // 0x3c
    "",        "",        "",        "",         // 0x40
    "",        "",        "",        "",         // 0x44
    "",        "",        "",        "",         // 0x48
    "",        "",        "",        "",         // 0x4c
    "",        "",        "",        "",         // 0x50
    "",        "",        "",        "",         // 0x54
    "",        "",        "",        "",         // 0x58
    "\\\\",    "",        "",        "",         // 0x5c
    "",        "",        "",        "",         // 0x60
    "",        "",        "",        "",         // 0x64
    "",        "",        "",        "",         // 0x68
    "",        "",        "",        "",         // 0x6c
    "",        "",        "",        "",         // 0x70
    "",        "",        "",        "",         // 0x74
    "",        "",        "",        "",         // 0x78
    "",        "",        "",        "\\u007f",  // 0x7c
    // 0x80 - 0x9f: C1 (ISO 8859 and Unicode) extended control characters.
    "\\u0080", "\\u0081", "\\u0082", "\\u0083",  // 0x80
    "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x84
    "\\u0088", "\\u0089", "\\u008a", "\\u008b",  // 0x88
    "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x8c
    "\\u0090", "\\u0091", "\\u0092", "\\u0093",  // 0x90
    "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x94
    "\\u0098", "\\u0099", "\\u009a", "\\u009b",  // 0x98
    "\\u009c", "\\u009d", "\\u009e", "\\u009f"   // 0x9c
};
86
87 // Determines if the given char value is a unicode surrogate code unit (either
88 // high-surrogate or low-surrogate).
IsSurrogate(uint32 c)89 inline bool IsSurrogate(uint32 c) {
90 // Optimized form of:
91 // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate;
92 // (Reduced from 3 ALU instructions to 2 ALU instructions)
93 return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate;
94 }
95
96 // Returns true if the given unicode code point cp is a valid
97 // unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint).
IsValidCodePoint(uint32 cp)98 inline bool IsValidCodePoint(uint32 cp) {
99 return cp <= JsonEscaping::kMaxCodePoint;
100 }
101
102 // Returns the low surrogate for the given unicode code point. The result is
103 // meaningless if the given code point is not a supplementary character.
ToLowSurrogate(uint32 cp)104 inline uint16 ToLowSurrogate(uint32 cp) {
105 return (cp & (JsonEscaping::kMaxLowSurrogate
106 - JsonEscaping::kMinLowSurrogate))
107 + JsonEscaping::kMinLowSurrogate;
108 }
109
110 // Returns the high surrogate for the given unicode code point. The result is
111 // meaningless if the given code point is not a supplementary character.
ToHighSurrogate(uint32 cp)112 inline uint16 ToHighSurrogate(uint32 cp) {
113 return (cp >> 10) + (JsonEscaping::kMinHighSurrogate -
114 (JsonEscaping::kMinSupplementaryCodePoint >> 10));
115 }
116
117 // Input str is encoded in UTF-8. A unicode code point could be encoded in
118 // UTF-8 using anywhere from 1 to 4 characters, and it could span multiple
119 // reads of the ByteSource.
120 //
121 // This function reads the next unicode code point from the input (str) at
122 // the given position (index), taking into account any left-over partial
123 // code point from the previous iteration (cp), together with the number
124 // of characters left to read to complete this code point (num_left).
125 //
126 // This function assumes that the input (str) is valid at the given position
127 // (index). In order words, at least one character could be read successfully.
128 //
129 // The code point read (partial or complete) is stored in (cp). Upon return,
130 // (num_left) stores the number of characters that has yet to be read in
131 // order to complete the current unicode code point. If the read is complete,
132 // then (num_left) is 0. Also, (num_read) is the number of characters read.
133 //
134 // Returns false if we encounter an invalid UTF-8 string. Returns true
135 // otherwise, including the case when we reach the end of the input (str)
136 // before a complete unicode code point is read.
ReadCodePoint(StringPiece str,int index,uint32 * cp,int * num_left,int * num_read)137 bool ReadCodePoint(StringPiece str, int index,
138 uint32 *cp, int* num_left, int *num_read) {
139 if (*num_left == 0) {
140 // Last read was complete. Start reading a new unicode code point.
141 *cp = static_cast<uint8>(str[index++]);
142 *num_read = 1;
143 // The length of the code point is determined from reading the first byte.
144 //
145 // If the first byte is between:
146 // 0..0x7f: that's the value of the code point.
147 // 0x80..0xbf: <invalid>
148 // 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
149 // bit 10-6, bit 5-0
150 // 0xe0..0xef: 16-bit code point encoded in 3 bytes.
151 // bit 15-12, bit 11-6, bit 5-0
152 // 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
153 // bit 20-18, bit 17-12, bit 11-6, bit 5-0
154 // 0xf8..0xff: <invalid>
155 //
156 // Meaning of each bit:
157 // <msb> bit 7: 0 - single byte code point: bits 6-0 are values.
158 // 1 - multibyte code point
159 // bit 6: 0 - subsequent bytes of multibyte code point:
160 // bits 5-0 are values.
161 // 1 - first byte of multibyte code point
162 // bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values.
163 // 1 - first byte of code point with >= 3 bytes.
164 // bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values.
165 // 1 - first byte of code point with >= 4 bytes.
166 // bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values.
167 // 1 - reserved for future expansion.
168 if (*cp <= 0x7f) {
169 return true;
170 } else if (*cp <= 0xbf) {
171 return false;
172 } else if (*cp <= 0xdf) {
173 *cp &= 0x1f;
174 *num_left = 1;
175 } else if (*cp <= 0xef) {
176 *cp &= 0x0f;
177 *num_left = 2;
178 } else if (*cp <= 0xf7) {
179 *cp &= 0x07;
180 *num_left = 3;
181 } else {
182 return false;
183 }
184 } else {
185 // Last read was partial. Initialize num_read to 0 and continue reading
186 // the last unicode code point.
187 *num_read = 0;
188 }
189 while (*num_left > 0 && index < str.size()) {
190 uint32 ch = static_cast<uint8>(str[index++]);
191 --(*num_left);
192 ++(*num_read);
193 *cp = (*cp << 6) | (ch & 0x3f);
194 if (ch < 0x80 || ch > 0xbf) return false;
195 }
196 return *num_left > 0 || (!IsSurrogate(*cp) && IsValidCodePoint(*cp));
197 }
198
199 // Stores the 16-bit unicode code point as its hexadecimal digits in buffer
200 // and returns a StringPiece that points to this buffer. The input buffer needs
201 // to be at least 6 bytes long.
ToHex(uint16 cp,char * buffer)202 StringPiece ToHex(uint16 cp, char* buffer) {
203 buffer[5] = kHex[cp & 0x0f];
204 cp >>= 4;
205 buffer[4] = kHex[cp & 0x0f];
206 cp >>= 4;
207 buffer[3] = kHex[cp & 0x0f];
208 cp >>= 4;
209 buffer[2] = kHex[cp & 0x0f];
210 return StringPiece(buffer).substr(0, 6);
211 }
212
213 // Stores the 32-bit unicode code point as its hexadecimal digits in buffer
214 // and returns a StringPiece that points to this buffer. The input buffer needs
215 // to be at least 12 bytes long.
ToSurrogateHex(uint32 cp,char * buffer)216 StringPiece ToSurrogateHex(uint32 cp, char* buffer) {
217 uint16 low = ToLowSurrogate(cp);
218 uint16 high = ToHighSurrogate(cp);
219
220 buffer[11] = kHex[low & 0x0f];
221 low >>= 4;
222 buffer[10] = kHex[low & 0x0f];
223 low >>= 4;
224 buffer[9] = kHex[low & 0x0f];
225 low >>= 4;
226 buffer[8] = kHex[low & 0x0f];
227
228 buffer[5] = kHex[high & 0x0f];
229 high >>= 4;
230 buffer[4] = kHex[high & 0x0f];
231 high >>= 4;
232 buffer[3] = kHex[high & 0x0f];
233 high >>= 4;
234 buffer[2] = kHex[high & 0x0f];
235
236 return StringPiece(buffer, 12);
237 }
238
239 // If the given unicode code point needs escaping, then returns the
240 // escaped form. The returned StringPiece either points to statically
241 // pre-allocated char[] or to the given buffer. The input buffer needs
242 // to be at least 12 bytes long.
243 //
244 // If the given unicode code point does not need escaping, an empty
245 // StringPiece is returned.
EscapeCodePoint(uint32 cp,char * buffer)246 StringPiece EscapeCodePoint(uint32 cp, char* buffer) {
247 if (cp < 0xa0) return kCommonEscapes[cp];
248 switch (cp) {
249 // These are not required by json spec
250 // but used to prevent security bugs in javascript.
251 case 0xfeff: // Zero width no-break space
252 case 0xfff9: // Interlinear annotation anchor
253 case 0xfffa: // Interlinear annotation separator
254 case 0xfffb: // Interlinear annotation terminator
255
256 case 0x00ad: // Soft-hyphen
257 case 0x06dd: // Arabic end of ayah
258 case 0x070f: // Syriac abbreviation mark
259 case 0x17b4: // Khmer vowel inherent Aq
260 case 0x17b5: // Khmer vowel inherent Aa
261 return ToHex(cp, buffer);
262
263 default:
264 if ((cp >= 0x0600 && cp <= 0x0603) || // Arabic signs
265 (cp >= 0x200b && cp <= 0x200f) || // Zero width etc.
266 (cp >= 0x2028 && cp <= 0x202e) || // Separators etc.
267 (cp >= 0x2060 && cp <= 0x2064) || // Invisible etc.
268 (cp >= 0x206a && cp <= 0x206f)) { // Shaping etc.
269 return ToHex(cp, buffer);
270 }
271
272 if (cp == 0x000e0001 || // Language tag
273 (cp >= 0x0001d173 && cp <= 0x0001d17a) || // Music formatting
274 (cp >= 0x000e0020 && cp <= 0x000e007f)) { // TAG symbols
275 return ToSurrogateHex(cp, buffer);
276 }
277 }
278 return StringPiece();
279 }
280
281 // Tries to escape the given code point first. If the given code point
282 // does not need to be escaped, but force_output is true, then render
283 // the given multi-byte code point in UTF8 in the buffer and returns it.
EscapeCodePoint(uint32 cp,char * buffer,bool force_output)284 StringPiece EscapeCodePoint(uint32 cp, char* buffer, bool force_output) {
285 StringPiece sp = EscapeCodePoint(cp, buffer);
286 if (force_output && sp.empty()) {
287 buffer[5] = (cp & 0x3f) | 0x80;
288 cp >>= 6;
289 if (cp <= 0x1f) {
290 buffer[4] = cp | 0xc0;
291 sp = StringPiece(buffer + 4, 2);
292 return sp;
293 }
294 buffer[4] = (cp & 0x3f) | 0x80;
295 cp >>= 6;
296 if (cp <= 0x0f) {
297 buffer[3] = cp | 0xe0;
298 sp = StringPiece(buffer + 3, 3);
299 return sp;
300 }
301 buffer[3] = (cp & 0x3f) | 0x80;
302 buffer[2] = ((cp >> 6) & 0x07) | 0xf0;
303 sp = StringPiece(buffer + 2, 4);
304 }
305 return sp;
306 }
307
308 } // namespace
309
Escape(strings::ByteSource * input,strings::ByteSink * output)310 void JsonEscaping::Escape(strings::ByteSource* input,
311 strings::ByteSink* output) {
312 char buffer[12] = "\\udead\\ubee";
313 uint32 cp = 0; // Current unicode code point.
314 int num_left = 0; // Num of chars to read to complete the code point.
315 while (input->Available() > 0) {
316 StringPiece str = input->Peek();
317 StringPiece escaped;
318 int i = 0;
319 int num_read;
320 bool ok;
321 bool cp_was_split = num_left > 0;
322 // Loop until we encounter either
323 // i) a code point that needs to be escaped; or
324 // ii) a split code point is completely read; or
325 // iii) a character that is not a valid utf8; or
326 // iv) end of the StringPiece str is reached.
327 do {
328 ok = ReadCodePoint(str, i, &cp, &num_left, &num_read);
329 if (num_left > 0 || !ok) break; // case iii or iv
330 escaped = EscapeCodePoint(cp, buffer, cp_was_split);
331 if (!escaped.empty()) break; // case i or ii
332 i += num_read;
333 num_read = 0;
334 } while (i < str.length()); // case iv
335 // First copy the un-escaped prefix, if any, to the output ByteSink.
336 if (i > 0) input->CopyTo(output, i);
337 if (num_read > 0) input->Skip(num_read);
338 if (!ok) {
339 // Case iii: Report error.
340 // TODO(wpoon): Add error reporting.
341 num_left = 0;
342 } else if (num_left == 0 && !escaped.empty()) {
343 // Case i or ii: Append the escaped code point to the output ByteSink.
344 output->Append(escaped.data(), escaped.size());
345 }
346 }
347 if (num_left > 0) {
348 // Treat as case iii: report error.
349 // TODO(wpoon): Add error reporting.
350 }
351 }
352
353 } // namespace converter
354 } // namespace util
355 } // namespace protobuf
356 } // namespace google
357