1 /*
2 charencode.c - miscellaneous character encoding
3
4 Copyright 2008 Olivia Mackall <olivia@selenic.com> and others
5
6 This software may be used and distributed according to the terms of
7 the GNU General Public License, incorporated herein by reference.
8 */
9
10 #define PY_SSIZE_T_CLEAN
11 #include <Python.h>
12 #include <assert.h>
13
14 #include "charencode.h"
15 #include "compat.h"
16 #include "util.h"
17
18 #ifdef IS_PY3K
/* The mapping of Python types is meant to be temporary to get Python
 * 3 to compile. We should remove this once Python 3 is fully
 * supported and proper types are used in the extensions themselves. */
22 #define PyInt_Type PyLong_Type
23 #define PyInt_AS_LONG PyLong_AS_LONG
24 #endif
25
26 /* clang-format off */
/*
 * ASCII lower-casing map, indexed by a 7-bit byte value: 'A'-'Z'
 * (0x41-0x5a) map to 'a'-'z' (0x61-0x7a); every other value maps to itself.
 */
static const char lowertable[128] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08 */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18 */
	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20 */
	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28 */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30 */
	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38 */
	0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* @, A-G */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* H-O */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* P-W */
	0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* X-Z, 0x5b-0x5f */
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60 */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68 */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70 */
	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f  /* 0x78 */
};
47
/*
 * ASCII upper-casing map, indexed by a 7-bit byte value: 'a'-'z'
 * (0x61-0x7a) map to 'A'-'Z' (0x41-0x5a); every other value maps to itself.
 */
static const char uppertable[128] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08 */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18 */
	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20 */
	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28 */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30 */
	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38 */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40 */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48 */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50 */
	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58 */
	0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60, a-g */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* h-o */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* p-w */
	0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f  /* x-z, 0x7b-0x7f */
};
68
/*
 * Number of output bytes each input byte occupies once JSON-escaped:
 *   1: copied through unescaped
 *   2: two-character backslash escape (backslash + one letter/symbol)
 *   6: six-character backslash-u escape with four hex digits
 * All bytes with the high bit set (0x80-0xff) have length 1.
 */
static const uint8_t jsonlentable[256] = {
	/* 0x00 */ 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 6, 2, 2, 6, 6, /* b t n f r */
	/* 0x10 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	/* 0x20 */ 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* " */
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, /* \\ */
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, /* DEL */
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
88
/*
 * Escaped-length table for "paranoid" JSON output, covering 7-bit input
 * only. Uses the same 1 / 2 / 6 length codes as the regular table, except
 * that '<' (0x3c) and '>' (0x3e) are also given the six-character form.
 */
static const uint8_t jsonparanoidlentable[128] = {
	/* 0x00 */ 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 6, 2, 2, 6, 6, /* b t n f r */
	/* 0x10 */ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
	/* 0x20 */ 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* " */
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 6, 1, /* <, > */
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, /* \\ */
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, /* DEL */
};
99
/* Lowercase hexadecimal digit for each nibble value 0-15. */
static const char hexchartable[16] = {
	'0', '1', '2', '3',
	'4', '5', '6', '7',
	'8', '9', 'a', 'b',
	'c', 'd', 'e', 'f',
};
104 /* clang-format on */
105
106 /*
107 * Turn a hex-encoded string into binary.
108 */
unhexlify(const char * str,Py_ssize_t len)109 PyObject *unhexlify(const char *str, Py_ssize_t len)
110 {
111 PyObject *ret;
112 char *d;
113 Py_ssize_t i;
114
115 ret = PyBytes_FromStringAndSize(NULL, len / 2);
116
117 if (!ret) {
118 return NULL;
119 }
120
121 d = PyBytes_AsString(ret);
122
123 for (i = 0; i < len;) {
124 int hi = hexdigit(str, i++);
125 int lo = hexdigit(str, i++);
126 *d++ = (hi << 4) | lo;
127 }
128
129 return ret;
130 }
131
/*
 * isasciistr(buf) -> bool
 *
 * Return True when no byte of the given buffer has its high bit (0x80)
 * set, False otherwise. An empty buffer yields True.
 */
PyObject *isasciistr(PyObject *self, PyObject *args)
{
	const char *buf;
	Py_ssize_t len;
	Py_ssize_t pos = 0;

	if (!PyArg_ParseTuple(args, PY23("s#:isasciistr", "y#:isasciistr"),
	                      &buf, &len)) {
		return NULL;
	}

	/* Test four bytes per iteration. The word is loaded with memcpy()
	 * rather than through a cast to uint32_t *, so the access is valid
	 * for any buffer alignment and does not break aliasing rules. The
	 * mask checks the high bit of each of the four bytes, so byte order
	 * is irrelevant. */
	for (; len - pos >= 4; pos += 4) {
		uint32_t word;
		memcpy(&word, buf + pos, sizeof(word));
		if (word & 0x80808080U) {
			Py_RETURN_FALSE;
		}
	}

	/* Remaining 0-3 trailing bytes. */
	for (; pos < len; pos++) {
		if (buf[pos] & 0x80) {
			Py_RETURN_FALSE;
		}
	}
	Py_RETURN_TRUE;
}
158
159 static inline PyObject *
_asciitransform(PyObject * str_obj,const char table[128],PyObject * fallback_fn)160 _asciitransform(PyObject *str_obj, const char table[128], PyObject *fallback_fn)
161 {
162 char *str, *newstr;
163 Py_ssize_t i, len;
164 PyObject *newobj = NULL;
165 PyObject *ret = NULL;
166
167 str = PyBytes_AS_STRING(str_obj);
168 len = PyBytes_GET_SIZE(str_obj);
169
170 newobj = PyBytes_FromStringAndSize(NULL, len);
171 if (!newobj) {
172 goto quit;
173 }
174
175 newstr = PyBytes_AS_STRING(newobj);
176
177 for (i = 0; i < len; i++) {
178 char c = str[i];
179 if (c & 0x80) {
180 if (fallback_fn != NULL) {
181 ret = PyObject_CallFunctionObjArgs(
182 fallback_fn, str_obj, NULL);
183 } else {
184 PyObject *err = PyUnicodeDecodeError_Create(
185 "ascii", str, len, i, (i + 1),
186 "unexpected code byte");
187 PyErr_SetObject(PyExc_UnicodeDecodeError, err);
188 Py_XDECREF(err);
189 }
190 goto quit;
191 }
192 newstr[i] = table[(unsigned char)c];
193 }
194
195 ret = newobj;
196 Py_INCREF(ret);
197 quit:
198 Py_XDECREF(newobj);
199 return ret;
200 }
201
/*
 * asciilower(bytes) -> bytes
 *
 * Lower-case an ASCII bytes object via lowertable. No fallback function is
 * supplied, so input containing a byte >= 0x80 makes _asciitransform raise
 * UnicodeDecodeError.
 */
PyObject *asciilower(PyObject *self, PyObject *args)
{
	PyObject *input = NULL;

	if (PyArg_ParseTuple(args, "O!:asciilower", &PyBytes_Type, &input)) {
		return _asciitransform(input, lowertable, NULL);
	}
	return NULL;
}
210
/*
 * asciiupper(bytes) -> bytes
 *
 * Upper-case an ASCII bytes object via uppertable. No fallback function is
 * supplied, so input containing a byte >= 0x80 makes _asciitransform raise
 * UnicodeDecodeError.
 */
PyObject *asciiupper(PyObject *self, PyObject *args)
{
	PyObject *input = NULL;

	if (PyArg_ParseTuple(args, "O!:asciiupper", &PyBytes_Type, &input)) {
		return _asciitransform(input, uppertable, NULL);
	}
	return NULL;
}
219
/*
 * make_file_foldmap(dmap, spec, normcase_fallback) -> dict
 *
 * Build a dict that maps the case-normalized form of each file name in
 * 'dmap' to the original name, considering only entries whose flags have
 * dirstate_flag_wc_tracked set.
 *
 * dmap:              dict whose values must pass dirstate_tuple_check()
 * spec:              integer normcase_spec value; NORMCASE_LOWER and
 *                    NORMCASE_UPPER select the ASCII fast path, while
 *                    NORMCASE_OTHER always calls normcase_fallback
 * normcase_fallback: Python function called with the key; on the fast path
 *                    it is only used for keys containing non-ASCII bytes
 *
 * Returns a new dict, or NULL with an exception set (TypeError for bad
 * arguments, an invalid spec, a non-dirstate value, or a non-bytes key on
 * the fast path; otherwise whatever the failing call raised).
 */
PyObject *make_file_foldmap(PyObject *self, PyObject *args)
{
	PyObject *dmap, *spec_obj, *normcase_fallback;
	PyObject *file_foldmap = NULL;
	enum normcase_spec spec;
	PyObject *k, *v;
	dirstateItemObject *tuple;
	Py_ssize_t pos = 0;
	const char *table = NULL;

	if (!PyArg_ParseTuple(args, "O!O!O!:make_file_foldmap", &PyDict_Type,
	                      &dmap, &PyInt_Type, &spec_obj, &PyFunction_Type,
	                      &normcase_fallback)) {
		goto quit;
	}

	spec = (int)PyInt_AS_LONG(spec_obj);
	switch (spec) {
	case NORMCASE_LOWER:
		table = lowertable;
		break;
	case NORMCASE_UPPER:
		table = uppertable;
		break;
	case NORMCASE_OTHER:
		table = NULL;
		break;
	default:
		PyErr_SetString(PyExc_TypeError, "invalid normcasespec");
		goto quit;
	}

	/* Add some more entries to deal with additions outside this
	   function. */
	file_foldmap = _dict_new_presized((PyDict_Size(dmap) / 10) * 11);
	if (file_foldmap == NULL) {
		goto quit;
	}

	while (PyDict_Next(dmap, &pos, &k, &v)) {
		PyObject *normed;

		if (!dirstate_tuple_check(v)) {
			PyErr_SetString(PyExc_TypeError,
			                "expected a dirstate tuple");
			goto quit;
		}

		tuple = (dirstateItemObject *)v;
		/* Test the flag bit with '&'. The previous '|' made the
		 * condition true whenever the flag constant is non-zero, so
		 * no entry was ever filtered out. */
		if (!(tuple->flags & dirstate_flag_wc_tracked)) {
			continue;
		}

		if (table != NULL) {
			/* _asciitransform reads the key through unchecked
			 * bytes macros, so verify the type first. */
			if (!PyBytes_Check(k)) {
				PyErr_SetString(PyExc_TypeError,
				                "expected a bytes key");
				goto quit;
			}
			normed = _asciitransform(k, table, normcase_fallback);
		} else {
			normed = PyObject_CallFunctionObjArgs(
			    normcase_fallback, k, NULL);
		}

		if (normed == NULL) {
			goto quit;
		}
		if (PyDict_SetItem(file_foldmap, normed, k) == -1) {
			Py_DECREF(normed);
			goto quit;
		}
		/* The dict holds its own reference to the key now. */
		Py_DECREF(normed);
	}
	return file_foldmap;
quit:
	Py_XDECREF(file_foldmap);
	return NULL;
}
292
293 /* calculate length of JSON-escaped string; returns -1 if unsupported */
jsonescapelen(const char * buf,Py_ssize_t len,bool paranoid)294 static Py_ssize_t jsonescapelen(const char *buf, Py_ssize_t len, bool paranoid)
295 {
296 Py_ssize_t i, esclen = 0;
297
298 if (paranoid) {
299 /* don't want to process multi-byte escapes in C */
300 for (i = 0; i < len; i++) {
301 char c = buf[i];
302 if (c & 0x80) {
303 PyErr_SetString(PyExc_ValueError,
304 "cannot process non-ascii str");
305 return -1;
306 }
307 esclen += jsonparanoidlentable[(unsigned char)c];
308 if (esclen < 0) {
309 PyErr_SetString(PyExc_MemoryError,
310 "overflow in jsonescapelen");
311 return -1;
312 }
313 }
314 } else {
315 for (i = 0; i < len; i++) {
316 char c = buf[i];
317 esclen += jsonlentable[(unsigned char)c];
318 if (esclen < 0) {
319 PyErr_SetString(PyExc_MemoryError,
320 "overflow in jsonescapelen");
321 return -1;
322 }
323 }
324 }
325
326 return esclen;
327 }
328
/*
 * Return the character that follows the backslash in the two-character
 * JSON escape for 'c' (e.g. newline -> 'n'). Bytes that have no
 * two-character escape yield '\0'; callers are not expected to pass those.
 */
static char jsonescapechar2(char c)
{
	/* Parallel tables: raw[k] is escaped as backslash + esc[k]. */
	static const char raw[7] = {'\b', '\t', '\n', '\f', '\r', '"', '\\'};
	static const char esc[7] = {'b', 't', 'n', 'f', 'r', '"', '\\'};
	unsigned int k;

	for (k = 0; k < sizeof(raw); k++) {
		if (raw[k] == c) {
			return esc[k];
		}
	}
	return '\0'; /* should not happen */
}
350
351 /* convert 'origbuf' to JSON-escaped form 'escbuf'; 'origbuf' should only
352 include characters mappable by json(paranoid)lentable */
encodejsonescape(char * escbuf,Py_ssize_t esclen,const char * origbuf,Py_ssize_t origlen,bool paranoid)353 static void encodejsonescape(char *escbuf, Py_ssize_t esclen,
354 const char *origbuf, Py_ssize_t origlen,
355 bool paranoid)
356 {
357 const uint8_t *lentable =
358 (paranoid) ? jsonparanoidlentable : jsonlentable;
359 Py_ssize_t i, j;
360
361 for (i = 0, j = 0; i < origlen; i++) {
362 char c = origbuf[i];
363 uint8_t l = lentable[(unsigned char)c];
364 assert(j + l <= esclen);
365 switch (l) {
366 case 1:
367 escbuf[j] = c;
368 break;
369 case 2:
370 escbuf[j] = '\\';
371 escbuf[j + 1] = jsonescapechar2(c);
372 break;
373 case 6:
374 memcpy(escbuf + j, "\\u00", 4);
375 escbuf[j + 4] = hexchartable[(unsigned char)c >> 4];
376 escbuf[j + 5] = hexchartable[(unsigned char)c & 0xf];
377 break;
378 }
379 j += l;
380 }
381 }
382
/*
 * jsonescapeu8fast(origstr, paranoid) -> bytes
 *
 * JSON-escape the bytes object 'origstr'. 'paranoid' is an integer flag
 * selecting the paranoid length table and the rejection of non-ASCII
 * input (see jsonescapelen).
 *
 * Returns 'origstr' itself (with a new reference) when nothing needs
 * escaping, otherwise a new bytes object. Returns NULL with an exception
 * set on bad arguments, unsupported input, length overflow or allocation
 * failure.
 */
PyObject *jsonescapeu8fast(PyObject *self, PyObject *args)
{
	PyObject *origstr, *escstr;
	const char *origbuf;
	Py_ssize_t origlen, esclen;
	int paranoid;

	/* "i" stores into an int, hence the address of 'paranoid'. */
	if (!PyArg_ParseTuple(args, "O!i:jsonescapeu8fast", &PyBytes_Type,
	                      &origstr, &paranoid)) {
		return NULL;
	}

	origbuf = PyBytes_AS_STRING(origstr);
	origlen = PyBytes_GET_SIZE(origstr);
	esclen = jsonescapelen(origbuf, origlen, paranoid);
	if (esclen < 0) {
		return NULL; /* unsupported char found or overflow */
	}
	if (origlen == esclen) {
		/* Every byte has escaped length 1: reuse the input. */
		Py_INCREF(origstr);
		return origstr;
	}

	escstr = PyBytes_FromStringAndSize(NULL, esclen);
	if (!escstr) {
		return NULL;
	}
	encodejsonescape(PyBytes_AS_STRING(escstr), esclen, origbuf, origlen,
	                 paranoid);

	return escstr;
}
414