1 /*
2  * Copyright (c) 2007-2014, Lloyd Hilaiel <me@lloyd.io>
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 #include "yajl_encode.h"
18 
19 #include <assert.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include <stdio.h>
23 
/*
 * Store the two uppercase hexadecimal digits of c in hexBuf[0] (high
 * nibble) and hexBuf[1] (low nibble).  No terminator is written.
 */
static void CharToHex(unsigned char c, char * hexBuf)
{
    static const char digits[] = "0123456789ABCDEF";
    const unsigned int highNibble = c >> 4;
    const unsigned int lowNibble = c & 0x0F;

    hexBuf[0] = digits[highNibble];
    hexBuf[1] = digits[lowNibble];
}
30 
31 void
yajl_string_encode(const yajl_print_t print,void * ctx,const unsigned char * str,size_t len,int escape_solidus)32 yajl_string_encode(const yajl_print_t print,
33                    void * ctx,
34                    const unsigned char * str,
35                    size_t len,
36                    int escape_solidus)
37 {
38     size_t beg = 0;
39     size_t end = 0;
40     char hexBuf[7];
41     hexBuf[0] = '\\'; hexBuf[1] = 'u'; hexBuf[2] = '0'; hexBuf[3] = '0';
42     hexBuf[6] = 0;
43 
44     while (end < len) {
45         const char * escaped = NULL;
46         switch (str[end]) {
47             case '\r': escaped = "\\r"; break;
48             case '\n': escaped = "\\n"; break;
49             case '\\': escaped = "\\\\"; break;
50             /* it is not required to escape a solidus in JSON:
51              * read sec. 2.5: http://www.ietf.org/rfc/rfc4627.txt
52              * specifically, this production from the grammar:
53              *   unescaped = %x20-21 / %x23-5B / %x5D-10FFFF
54              */
55             case '/': if (escape_solidus) escaped = "\\/"; break;
56             case '"': escaped = "\\\""; break;
57             case '\f': escaped = "\\f"; break;
58             case '\b': escaped = "\\b"; break;
59             case '\t': escaped = "\\t"; break;
60             default:
61                 if ((unsigned char) str[end] < 32) {
62                     CharToHex(str[end], hexBuf + 4);
63                     escaped = hexBuf;
64                 }
65                 break;
66         }
67         if (escaped != NULL) {
68             print(ctx, (const char *) (str + beg), end - beg);
69             print(ctx, escaped, (unsigned int)strlen(escaped));
70             beg = ++end;
71         } else {
72             ++end;
73         }
74     }
75     print(ctx, (const char *) (str + beg), end - beg);
76 }
77 
/*
 * Shift the values of the four hexadecimal digits at hex[0..3] into *val,
 * most significant digit first.  *val is not reset first, so callers start
 * from zero when they want just the four-digit value.  Upper and lower
 * case letters are accepted; a non-hex character trips the assert.
 */
static void hexToDigit(unsigned int * val, const unsigned char * hex)
{
    const unsigned char * cur = hex;
    const unsigned char * const stop = hex + 4;

    while (cur != stop) {
        unsigned char nibble = *cur++;

        if (nibble >= 'A') {
            /* fold lower case onto upper case, then move 'A'.. next to '9' */
            nibble = (unsigned char) ((nibble & ~0x20) - 7);
        }
        nibble = (unsigned char) (nibble - '0');
        assert((nibble & 0xF0) == 0);
        *val = (*val << 4) | nibble;
    }
}
89 
/*
 * Write codepoint to utf8Buf as a NUL-terminated UTF-8 sequence of one to
 * four bytes.  Values of 0x200000 and above are written as "?".  utf8Buf
 * must have room for five bytes.
 */
static void Utf32toUtf8(unsigned int codepoint, char * utf8Buf)
{
    char * out = utf8Buf;

    if (codepoint >= 0x200000) {
        *out++ = '?';
    } else if (codepoint >= 0x10000) {
        /* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        *out++ = (char) ((codepoint >> 18) | 0xF0);
        *out++ = (char) (((codepoint >> 12) & 0x3F) | 0x80);
        *out++ = (char) (((codepoint >> 6) & 0x3F) | 0x80);
        *out++ = (char) ((codepoint & 0x3F) | 0x80);
    } else if (codepoint >= 0x0800) {
        /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
        *out++ = (char) ((codepoint >> 12) | 0xE0);
        *out++ = (char) (((codepoint >> 6) & 0x3F) | 0x80);
        *out++ = (char) ((codepoint & 0x3F) | 0x80);
    } else if (codepoint >= 0x80) {
        /* two bytes: 110xxxxx 10xxxxxx */
        *out++ = (char) ((codepoint >> 6) | 0xC0);
        *out++ = (char) ((codepoint & 0x3F) | 0x80);
    } else {
        /* plain ASCII */
        *out++ = (char) codepoint;
    }

    *out = 0;
}
115 
/*
 * Decode the JSON string escapes in str[0..len) and append the result to
 * buf.
 *
 * Bytes outside escape sequences are appended unchanged.  Two-character
 * escapes are replaced by the single byte they denote and \uXXXX escapes
 * are appended as UTF-8 (\u0000 is appended as one NUL byte).  A high
 * surrogate (D800-DBFF) immediately followed by a \uXXXX low surrogate
 * (DC00-DFFF) is combined into one code point.  A high surrogate without
 * such a partner is replaced by '?', and whatever follows it is decoded
 * normally instead of being skipped or merged into the code point.
 *
 * Every escape sequence is assumed to be complete and well formed: a
 * backslash is followed by a known escape character, and \u by four hex
 * digits.  Only the look-ahead for a low surrogate is bounds checked here.
 */
void yajl_string_decode(yajl_buf buf, const unsigned char * str,
                        size_t len)
{
    size_t beg = 0;
    size_t end = 0;

    while (end < len) {
        if (str[end] == '\\') {
            char utf8Buf[5];
            const char * unescaped = "?";
            /* flush the literal run that precedes this escape */
            yajl_buf_append(buf, str + beg, end - beg);
            switch (str[++end]) {
                case 'r': unescaped = "\r"; break;
                case 'n': unescaped = "\n"; break;
                case '\\': unescaped = "\\"; break;
                case '/': unescaped = "/"; break;
                case '"': unescaped = "\""; break;
                case 'f': unescaped = "\f"; break;
                case 'b': unescaped = "\b"; break;
                case 't': unescaped = "\t"; break;
                case 'u': {
                    unsigned int codepoint = 0;
                    hexToDigit(&codepoint, str + ++end);
                    /* leave end on the last hex digit of this escape */
                    end += 3;
                    /* check if this is a (high) surrogate */
                    if ((codepoint & 0xFC00) == 0xD800) {
                        const size_t next = end + 1;
                        unsigned int surrogate = 0;

                        /* only look at the partner escape when all six of
                         * its bytes lie inside the input */
                        if (next < len && len - next >= 6 &&
                            str[next] == '\\' && str[next + 1] == 'u') {
                            hexToDigit(&surrogate, str + next + 2);
                        }

                        if ((surrogate & 0xFC00) != 0xDC00) {
                            /* lone high surrogate: emit the placeholder and
                             * leave end untouched so the following bytes
                             * are decoded on the next iteration */
                            unescaped = "?";
                            break;
                        }

                        codepoint =
                            (((codepoint & 0x3F) << 10) |
                             ((((codepoint >> 6) & 0xF) + 1) << 16) |
                             (surrogate & 0x3FF));
                        /* skip to the last hex digit of the low surrogate */
                        end += 6;
                    }

                    Utf32toUtf8(codepoint, utf8Buf);
                    unescaped = utf8Buf;

                    /* strlen() below would yield 0 for an encoded NUL, so
                     * append that single byte explicitly */
                    if (codepoint == 0) {
                        yajl_buf_append(buf, unescaped, 1);
                        beg = ++end;
                        continue;
                    }

                    break;
                }
                default:
                    assert("this should never happen" && 0);
            }
            yajl_buf_append(buf, unescaped, (unsigned int)strlen(unescaped));
            beg = ++end;
        } else {
            end++;
        }
    }
    /* trailing literal run (possibly empty) */
    yajl_buf_append(buf, str + beg, end - beg);
}
179 
/*
 * Check that s[0..len) is well-formed UTF-8 as defined by RFC 3629.
 *
 * Returns 1 when the input is valid (an empty input is valid even when s
 * is NULL) and 0 otherwise.  Besides malformed or truncated sequences this
 * rejects overlong encodings, encoded UTF-16 surrogates (U+D800..U+DFFF)
 * and code points above U+10FFFF.  No byte at or beyond s + len is read.
 */
int yajl_string_validate_utf8(const unsigned char * s, size_t len)
{
    size_t i = 0;

    if (!len) return 1;
    if (!s) return 0;

    while (i < len) {
        const unsigned char lead = s[i];
        /* permitted range for the first continuation byte; narrowed below
         * for the lead bytes that could start a forbidden sequence */
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;
        size_t extra;
        size_t k;

        if (lead <= 0x7F) {
            /* single byte */
            i++;
            continue;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            /* two byte; C0 and C1 could only start overlong forms */
            extra = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            /* three byte */
            extra = 2;
            if (lead == 0xE0) lo = 0xA0;        /* below: overlong */
            else if (lead == 0xED) hi = 0x9F;   /* above: surrogates */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            /* four byte */
            extra = 3;
            if (lead == 0xF0) lo = 0x90;        /* below: overlong */
            else if (lead == 0xF4) hi = 0x8F;   /* above: > U+10FFFF */
        } else {
            /* stray continuation byte or invalid lead byte */
            return 0;
        }

        /* the lead byte plus `extra` continuation bytes must all be present */
        if (len - i <= extra) return 0;

        if (s[i + 1] < lo || s[i + 1] > hi) return 0;
        for (k = 2; k <= extra; k++) {
            if ((s[i + k] & 0xC0) != 0x80) return 0;
        }

        i += extra + 1;
    }

    return 1;
}
221