1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #include <google/protobuf/util/internal/json_escaping.h>
32 
33 #include <google/protobuf/stubs/logging.h>
34 #include <google/protobuf/stubs/common.h>
35 
36 namespace google {
37 namespace protobuf {
38 namespace util {
39 namespace converter {
40 
41 namespace {
42 
// Lowercase hexadecimal digits, indexed by nibble value (0..15).
static const char kHex[] = "0123456789abcdef";

// Lookup table for the code points below 0xa0, which are by far the most
// common input.
//
// For a code point ch < 0xa0, kCommonEscapes[ch] is:
//   - the NUL-terminated JSON escape sequence for ch, when ch must be
//     escaped; or
//   - the empty string, when ch can be emitted verbatim.
//
// Each row below holds eight consecutive code points; the trailing comment
// gives the index range covered by the row.
static const char kCommonEscapes[160][7] = {
    // C0 (ASCII and derivatives) control characters.
    "\\u0000", "\\u0001", "\\u0002", "\\u0003",
    "\\u0004", "\\u0005", "\\u0006", "\\u0007",  // 0x00-0x07
    "\\b", "\\t", "\\n", "\\u000b",
    "\\f", "\\r", "\\u000e", "\\u000f",  // 0x08-0x0f
    "\\u0010", "\\u0011", "\\u0012", "\\u0013",
    "\\u0014", "\\u0015", "\\u0016", "\\u0017",  // 0x10-0x17
    "\\u0018", "\\u0019", "\\u001a", "\\u001b",
    "\\u001c", "\\u001d", "\\u001e", "\\u001f",  // 0x18-0x1f

    // Printable ASCII.
    // Escaping of " and \ are required by www.json.org string definition.
    // Escaping of < and > are required for HTML security.
    "", "", "\\\"", "", "", "", "", "",            // 0x20-0x27 ('"' is 0x22)
    "", "", "", "", "", "", "", "",                // 0x28-0x2f
    "", "", "", "", "", "", "", "",                // 0x30-0x37
    "", "", "", "", "\\u003c", "", "\\u003e", "",  // 0x38-0x3f ('<', '>')
    "", "", "", "", "", "", "", "",                // 0x40-0x47
    "", "", "", "", "", "", "", "",                // 0x48-0x4f
    "", "", "", "", "", "", "", "",                // 0x50-0x57
    "", "", "", "", "\\\\", "", "", "",            // 0x58-0x5f ('\' is 0x5c)
    "", "", "", "", "", "", "", "",                // 0x60-0x67
    "", "", "", "", "", "", "", "",                // 0x68-0x6f
    "", "", "", "", "", "", "", "",                // 0x70-0x77
    "", "", "", "", "", "", "", "\\u007f",         // 0x78-0x7f (DEL)

    // C1 (ISO 8859 and Unicode) extended control characters.
    "\\u0080", "\\u0081", "\\u0082", "\\u0083",
    "\\u0084", "\\u0085", "\\u0086", "\\u0087",  // 0x80-0x87
    "\\u0088", "\\u0089", "\\u008a", "\\u008b",
    "\\u008c", "\\u008d", "\\u008e", "\\u008f",  // 0x88-0x8f
    "\\u0090", "\\u0091", "\\u0092", "\\u0093",
    "\\u0094", "\\u0095", "\\u0096", "\\u0097",  // 0x90-0x97
    "\\u0098", "\\u0099", "\\u009a", "\\u009b",
    "\\u009c", "\\u009d", "\\u009e", "\\u009f",  // 0x98-0x9f
};
77 
78 // Determines if the given char value is a unicode surrogate code unit (either
79 // high-surrogate or low-surrogate).
IsSurrogate(uint32 c)80 inline bool IsSurrogate(uint32 c) {
81   // Optimized form of:
82   // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate;
83   // (Reduced from 3 ALU instructions to 2 ALU instructions)
84   return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate;
85 }
86 
87 // Returns true if the given unicode code point cp is a valid
88 // unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint).
IsValidCodePoint(uint32 cp)89 inline bool IsValidCodePoint(uint32 cp) {
90   return cp <= JsonEscaping::kMaxCodePoint;
91 }
92 
93 // Returns the low surrogate for the given unicode code point. The result is
94 // meaningless if the given code point is not a supplementary character.
ToLowSurrogate(uint32 cp)95 inline uint16 ToLowSurrogate(uint32 cp) {
96   return (cp &
97           (JsonEscaping::kMaxLowSurrogate - JsonEscaping::kMinLowSurrogate)) +
98          JsonEscaping::kMinLowSurrogate;
99 }
100 
101 // Returns the high surrogate for the given unicode code point. The result is
102 // meaningless if the given code point is not a supplementary character.
ToHighSurrogate(uint32 cp)103 inline uint16 ToHighSurrogate(uint32 cp) {
104   return (cp >> 10) + (JsonEscaping::kMinHighSurrogate -
105                        (JsonEscaping::kMinSupplementaryCodePoint >> 10));
106 }
107 
108 // Input str is encoded in UTF-8. A unicode code point could be encoded in
109 // UTF-8 using anywhere from 1 to 4 characters, and it could span multiple
110 // reads of the ByteSource.
111 //
112 // This function reads the next unicode code point from the input (str) at
113 // the given position (index), taking into account any left-over partial
114 // code point from the previous iteration (cp), together with the number
115 // of characters left to read to complete this code point (num_left).
116 //
117 // This function assumes that the input (str) is valid at the given position
118 // (index). In order words, at least one character could be read successfully.
119 //
120 // The code point read (partial or complete) is stored in (cp). Upon return,
121 // (num_left) stores the number of characters that has yet to be read in
122 // order to complete the current unicode code point. If the read is complete,
123 // then (num_left) is 0. Also, (num_read) is the number of characters read.
124 //
125 // Returns false if we encounter an invalid UTF-8 string. Returns true
126 // otherwise, including the case when we reach the end of the input (str)
127 // before a complete unicode code point is read.
ReadCodePoint(StringPiece str,int index,uint32 * cp,int * num_left,int * num_read)128 bool ReadCodePoint(StringPiece str, int index, uint32* cp, int* num_left,
129                    int* num_read) {
130   if (*num_left == 0) {
131     // Last read was complete. Start reading a new unicode code point.
132     *cp = static_cast<uint8>(str[index++]);
133     *num_read = 1;
134     // The length of the code point is determined from reading the first byte.
135     //
136     // If the first byte is between:
137     //    0..0x7f: that's the value of the code point.
138     // 0x80..0xbf: <invalid>
139     // 0xc0..0xdf: 11-bit code point encoded in 2 bytes.
140     //                                   bit 10-6, bit 5-0
141     // 0xe0..0xef: 16-bit code point encoded in 3 bytes.
142     //                        bit 15-12, bit 11-6, bit 5-0
143     // 0xf0..0xf7: 21-bit code point encoded in 4 bytes.
144     //             bit 20-18, bit 17-12, bit 11-6, bit 5-0
145     // 0xf8..0xff: <invalid>
146     //
147     // Meaning of each bit:
148     // <msb> bit 7: 0 - single byte code point: bits 6-0 are values.
149     //              1 - multibyte code point
150     //       bit 6: 0 - subsequent bytes of multibyte code point:
151     //                  bits 5-0 are values.
152     //              1 - first byte of multibyte code point
153     //       bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values.
154     //              1 - first byte of code point with >= 3 bytes.
155     //       bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values.
156     //              1 - first byte of code point with >= 4 bytes.
157     //       bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values.
158     //              1 - reserved for future expansion.
159     if (*cp <= 0x7f) {
160       return true;
161     } else if (*cp <= 0xbf) {
162       return false;
163     } else if (*cp <= 0xdf) {
164       *cp &= 0x1f;
165       *num_left = 1;
166     } else if (*cp <= 0xef) {
167       *cp &= 0x0f;
168       *num_left = 2;
169     } else if (*cp <= 0xf7) {
170       *cp &= 0x07;
171       *num_left = 3;
172     } else {
173       return false;
174     }
175   } else {
176     // Last read was partial. Initialize num_read to 0 and continue reading
177     // the last unicode code point.
178     *num_read = 0;
179   }
180   while (*num_left > 0 && index < str.size()) {
181     uint32 ch = static_cast<uint8>(str[index++]);
182     --(*num_left);
183     ++(*num_read);
184     *cp = (*cp << 6) | (ch & 0x3f);
185     if (ch < 0x80 || ch > 0xbf) return false;
186   }
187   return *num_left > 0 || (!IsSurrogate(*cp) && IsValidCodePoint(*cp));
188 }
189 
190 // Stores the 16-bit unicode code point as its hexadecimal digits in buffer
191 // and returns a StringPiece that points to this buffer. The input buffer needs
192 // to be at least 6 bytes long.
ToHex(uint16 cp,char * buffer)193 StringPiece ToHex(uint16 cp, char* buffer) {
194   buffer[5] = kHex[cp & 0x0f];
195   cp >>= 4;
196   buffer[4] = kHex[cp & 0x0f];
197   cp >>= 4;
198   buffer[3] = kHex[cp & 0x0f];
199   cp >>= 4;
200   buffer[2] = kHex[cp & 0x0f];
201   return StringPiece(buffer, 6);
202 }
203 
204 // Stores the 32-bit unicode code point as its hexadecimal digits in buffer
205 // and returns a StringPiece that points to this buffer. The input buffer needs
206 // to be at least 12 bytes long.
ToSurrogateHex(uint32 cp,char * buffer)207 StringPiece ToSurrogateHex(uint32 cp, char* buffer) {
208   uint16 low = ToLowSurrogate(cp);
209   uint16 high = ToHighSurrogate(cp);
210 
211   buffer[11] = kHex[low & 0x0f];
212   low >>= 4;
213   buffer[10] = kHex[low & 0x0f];
214   low >>= 4;
215   buffer[9] = kHex[low & 0x0f];
216   low >>= 4;
217   buffer[8] = kHex[low & 0x0f];
218 
219   buffer[5] = kHex[high & 0x0f];
220   high >>= 4;
221   buffer[4] = kHex[high & 0x0f];
222   high >>= 4;
223   buffer[3] = kHex[high & 0x0f];
224   high >>= 4;
225   buffer[2] = kHex[high & 0x0f];
226 
227   return StringPiece(buffer, 12);
228 }
229 
230 // If the given unicode code point needs escaping, then returns the
231 // escaped form. The returned StringPiece either points to statically
232 // pre-allocated char[] or to the given buffer. The input buffer needs
233 // to be at least 12 bytes long.
234 //
235 // If the given unicode code point does not need escaping, an empty
236 // StringPiece is returned.
EscapeCodePoint(uint32 cp,char * buffer)237 StringPiece EscapeCodePoint(uint32 cp, char* buffer) {
238   if (cp < 0xa0) return kCommonEscapes[cp];
239   switch (cp) {
240     // These are not required by json spec
241     // but used to prevent security bugs in javascript.
242     case 0xfeff:  // Zero width no-break space
243     case 0xfff9:  // Interlinear annotation anchor
244     case 0xfffa:  // Interlinear annotation separator
245     case 0xfffb:  // Interlinear annotation terminator
246 
247     case 0x00ad:  // Soft-hyphen
248     case 0x06dd:  // Arabic end of ayah
249     case 0x070f:  // Syriac abbreviation mark
250     case 0x17b4:  // Khmer vowel inherent Aq
251     case 0x17b5:  // Khmer vowel inherent Aa
252       return ToHex(cp, buffer);
253 
254     default:
255       if ((cp >= 0x0600 && cp <= 0x0603) ||  // Arabic signs
256           (cp >= 0x200b && cp <= 0x200f) ||  // Zero width etc.
257           (cp >= 0x2028 && cp <= 0x202e) ||  // Separators etc.
258           (cp >= 0x2060 && cp <= 0x2064) ||  // Invisible etc.
259           (cp >= 0x206a && cp <= 0x206f)) {  // Shaping etc.
260         return ToHex(cp, buffer);
261       }
262 
263       if (cp == 0x000e0001 ||                        // Language tag
264           (cp >= 0x0001d173 && cp <= 0x0001d17a) ||  // Music formatting
265           (cp >= 0x000e0020 && cp <= 0x000e007f)) {  // TAG symbols
266         return ToSurrogateHex(cp, buffer);
267       }
268   }
269   return StringPiece();
270 }
271 
272 // Tries to escape the given code point first. If the given code point
273 // does not need to be escaped, but force_output is true, then render
274 // the given multi-byte code point in UTF8 in the buffer and returns it.
EscapeCodePoint(uint32 cp,char * buffer,bool force_output)275 StringPiece EscapeCodePoint(uint32 cp, char* buffer, bool force_output) {
276   StringPiece sp = EscapeCodePoint(cp, buffer);
277   if (force_output && sp.empty()) {
278     buffer[5] = (cp & 0x3f) | 0x80;
279     cp >>= 6;
280     if (cp <= 0x1f) {
281       buffer[4] = cp | 0xc0;
282       sp = StringPiece(buffer + 4, 2);
283       return sp;
284     }
285     buffer[4] = (cp & 0x3f) | 0x80;
286     cp >>= 6;
287     if (cp <= 0x0f) {
288       buffer[3] = cp | 0xe0;
289       sp = StringPiece(buffer + 3, 3);
290       return sp;
291     }
292     buffer[3] = (cp & 0x3f) | 0x80;
293     buffer[2] = ((cp >> 6) & 0x07) | 0xf0;
294     sp = StringPiece(buffer + 2, 4);
295   }
296   return sp;
297 }
298 
299 }  // namespace
300 
Escape(strings::ByteSource * input,strings::ByteSink * output)301 void JsonEscaping::Escape(strings::ByteSource* input,
302                           strings::ByteSink* output) {
303   char buffer[12] = "\\udead\\ubee";
304   uint32 cp = 0;     // Current unicode code point.
305   int num_left = 0;  // Num of chars to read to complete the code point.
306   while (input->Available() > 0) {
307     StringPiece str = input->Peek();
308     StringPiece escaped;
309     int i = 0;
310     int num_read;
311     bool ok;
312     bool cp_was_split = num_left > 0;
313     // Loop until we encounter either
314     //   i) a code point that needs to be escaped; or
315     //  ii) a split code point is completely read; or
316     // iii) a character that is not a valid utf8; or
317     //  iv) end of the StringPiece str is reached.
318     do {
319       ok = ReadCodePoint(str, i, &cp, &num_left, &num_read);
320       if (num_left > 0 || !ok) break;  // case iii or iv
321       escaped = EscapeCodePoint(cp, buffer, cp_was_split);
322       if (!escaped.empty()) break;  // case i or ii
323       i += num_read;
324       num_read = 0;
325     } while (i < str.length());  // case iv
326     // First copy the un-escaped prefix, if any, to the output ByteSink.
327     if (i > 0) input->CopyTo(output, i);
328     if (num_read > 0) input->Skip(num_read);
329     if (!ok) {
330       // Case iii: Report error.
331       // TODO(wpoon): Add error reporting.
332       num_left = 0;
333     } else if (num_left == 0 && !escaped.empty()) {
334       // Case i or ii: Append the escaped code point to the output ByteSink.
335       output->Append(escaped.data(), escaped.size());
336     }
337   }
338   if (num_left > 0) {
339     // Treat as case iii: report error.
340     // TODO(wpoon): Add error reporting.
341   }
342 }
343 
Escape(StringPiece input,strings::ByteSink * output)344 void JsonEscaping::Escape(StringPiece input, strings::ByteSink* output) {
345   const size_t len = input.length();
346   const char* p = input.data();
347 
348   bool can_skip_escaping = true;
349   for (int i = 0; i < len; i++) {
350     char c = p[i];
351     if (c < 0x20 || c >= 0x7F || c == '"' || c == '<' || c == '>' ||
352         c == '\\') {
353       can_skip_escaping = false;
354       break;
355     }
356   }
357 
358   if (can_skip_escaping) {
359     output->Append(input.data(), input.length());
360   } else {
361     strings::ArrayByteSource source(input);
362     Escape(&source, output);
363   }
364 }
365 
366 }  // namespace converter
367 }  // namespace util
368 }  // namespace protobuf
369 }  // namespace google
370