1 /*
2  charencode.c - miscellaneous character encoding
3 
4  Copyright 2008 Olivia Mackall <olivia@selenic.com> and others
5 
6  This software may be used and distributed according to the terms of
7  the GNU General Public License, incorporated herein by reference.
8 */
9 
10 #define PY_SSIZE_T_CLEAN
11 #include <Python.h>
12 #include <assert.h>
13 
14 #include "charencode.h"
15 #include "compat.h"
16 #include "util.h"
17 
18 #ifdef IS_PY3K
/* The mapping of Python types is meant to be temporary to get Python
 * 3 to compile. We should remove this once Python 3 is fully
 * supported and proper types are used in the extensions themselves. */
22 #define PyInt_Type PyLong_Type
23 #define PyInt_AS_LONG PyLong_AS_LONG
24 #endif
25 
26 /* clang-format off */
/* ASCII lower-casing table, indexed by a 7-bit byte value: 0x41-0x5a
 * ('A'-'Z') map to 0x61-0x7a ('a'-'z'); every other entry maps to
 * itself. */
static const char lowertable[128] = {
	/* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	/* 0x08 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	/* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
	/* 0x18 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
	/* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
	/* 0x28 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
	/* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
	/* 0x38 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
	/* 0x40 */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* @, A-G */
	/* 0x48 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* H-O */
	/* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* P-W */
	/* 0x58 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* X-Z */
	/* 0x60 */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
	/* 0x68 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
	/* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
	/* 0x78 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f
};
47 
/* ASCII upper-casing table, indexed by a 7-bit byte value: 0x61-0x7a
 * ('a'-'z') map to 0x41-0x5a ('A'-'Z'); every other entry maps to
 * itself. */
static const char uppertable[128] = {
	/* 0x00 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	/* 0x08 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	/* 0x10 */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
	/* 0x18 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
	/* 0x20 */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
	/* 0x28 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
	/* 0x30 */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
	/* 0x38 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
	/* 0x40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
	/* 0x48 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
	/* 0x50 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
	/* 0x58 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
	/* 0x60 */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* `, a-g */
	/* 0x68 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* h-o */
	/* 0x70 */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* p-w */
	/* 0x78 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f  /* x-z */
};
68 
/* Number of output bytes each input byte occupies once JSON-escaped:
 *   1: copied as is, 2: backslash + one character, 6: \u00XX form.
 * Indexed by the full 8-bit byte value; bytes 0x80-0xff are all 1. */
static const uint8_t jsonlentable[256] = {
	/* 0x00 */ 6, 6, 6, 6, 6, 6, 6, 6,
	/* 0x08 */ 2, 2, 2, 6, 2, 2, 6, 6, /* b, t, n, f, r */
	/* 0x10 */ 6, 6, 6, 6, 6, 6, 6, 6,
	/* 0x18 */ 6, 6, 6, 6, 6, 6, 6, 6,
	/* 0x20 */ 1, 1, 2, 1, 1, 1, 1, 1, /* " */
	/* 0x28 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x38 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x48 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x58 */ 1, 1, 1, 1, 2, 1, 1, 1, /* \\ */
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x68 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x78 */ 1, 1, 1, 1, 1, 1, 1, 6, /* DEL */
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x88 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x98 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xa8 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xb8 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xc8 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xd8 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xe8 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xf8 */ 1, 1, 1, 1, 1, 1, 1, 1,
};
88 
/* Escaped length per input byte for the "paranoid" JSON mode. Same
 * 1/2/6 encoding as the regular length table, except that '<' (0x3c)
 * and '>' (0x3e) take the 6-byte form. Only 7-bit values are covered. */
static const uint8_t jsonparanoidlentable[128] = {
	/* 0x00 */ 6, 6, 6, 6, 6, 6, 6, 6,
	/* 0x08 */ 2, 2, 2, 6, 2, 2, 6, 6, /* b, t, n, f, r */
	/* 0x10 */ 6, 6, 6, 6, 6, 6, 6, 6,
	/* 0x18 */ 6, 6, 6, 6, 6, 6, 6, 6,
	/* 0x20 */ 1, 1, 2, 1, 1, 1, 1, 1, /* " */
	/* 0x28 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x38 */ 1, 1, 1, 1, 6, 1, 6, 1, /* <, > */
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x48 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x58 */ 1, 1, 1, 1, 2, 1, 1, 1, /* \\ */
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x68 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x78 */ 1, 1, 1, 1, 1, 1, 1, 6, /* DEL */
};
99 
/* Lower-case hexadecimal digit for each nibble value 0-15. */
static const char hexchartable[16] = {
	'0', '1', '2', '3',
	'4', '5', '6', '7',
	'8', '9', 'a', 'b',
	'c', 'd', 'e', 'f',
};
104 /* clang-format on */
105 
106 /*
107  * Turn a hex-encoded string into binary.
108  */
unhexlify(const char * str,Py_ssize_t len)109 PyObject *unhexlify(const char *str, Py_ssize_t len)
110 {
111 	PyObject *ret;
112 	char *d;
113 	Py_ssize_t i;
114 
115 	ret = PyBytes_FromStringAndSize(NULL, len / 2);
116 
117 	if (!ret) {
118 		return NULL;
119 	}
120 
121 	d = PyBytes_AsString(ret);
122 
123 	for (i = 0; i < len;) {
124 		int hi = hexdigit(str, i++);
125 		int lo = hexdigit(str, i++);
126 		*d++ = (hi << 4) | lo;
127 	}
128 
129 	return ret;
130 }
131 
/*
 * Python-callable: isasciistr(data) -> bool
 *
 * Returns True when no byte of the argument has its high bit (0x80)
 * set, False otherwise. Returns NULL if argument parsing fails.
 */
PyObject *isasciistr(PyObject *self, PyObject *args)
{
	const char *data;
	Py_ssize_t size;
	Py_ssize_t pos = 0;

	if (!PyArg_ParseTuple(args, PY23("s#:isasciistr", "y#:isasciistr"),
	                      &data, &size)) {
		return NULL;
	}

	/* The char array in a PyStringObject should be at least 4-byte
	 * aligned; when it is, test four bytes per iteration. */
	if (((uintptr_t)data & 3) == 0) {
		const uint32_t *words = (const uint32_t *)data;
		const Py_ssize_t nwords = size / 4;
		Py_ssize_t w;

		for (w = 0; w < nwords; w++) {
			if ((words[w] & 0x80808080U) != 0) {
				Py_RETURN_FALSE;
			}
		}
		/* resume the byte-wise scan right after the last full word */
		pos = w * 4;
	}

	/* remaining tail, or the whole buffer if it was not aligned */
	while (pos < size) {
		if ((data[pos] & 0x80) != 0) {
			Py_RETURN_FALSE;
		}
		pos++;
	}

	Py_RETURN_TRUE;
}
158 
159 static inline PyObject *
_asciitransform(PyObject * str_obj,const char table[128],PyObject * fallback_fn)160 _asciitransform(PyObject *str_obj, const char table[128], PyObject *fallback_fn)
161 {
162 	char *str, *newstr;
163 	Py_ssize_t i, len;
164 	PyObject *newobj = NULL;
165 	PyObject *ret = NULL;
166 
167 	str = PyBytes_AS_STRING(str_obj);
168 	len = PyBytes_GET_SIZE(str_obj);
169 
170 	newobj = PyBytes_FromStringAndSize(NULL, len);
171 	if (!newobj) {
172 		goto quit;
173 	}
174 
175 	newstr = PyBytes_AS_STRING(newobj);
176 
177 	for (i = 0; i < len; i++) {
178 		char c = str[i];
179 		if (c & 0x80) {
180 			if (fallback_fn != NULL) {
181 				ret = PyObject_CallFunctionObjArgs(
182 				    fallback_fn, str_obj, NULL);
183 			} else {
184 				PyObject *err = PyUnicodeDecodeError_Create(
185 				    "ascii", str, len, i, (i + 1),
186 				    "unexpected code byte");
187 				PyErr_SetObject(PyExc_UnicodeDecodeError, err);
188 				Py_XDECREF(err);
189 			}
190 			goto quit;
191 		}
192 		newstr[i] = table[(unsigned char)c];
193 	}
194 
195 	ret = newobj;
196 	Py_INCREF(ret);
197 quit:
198 	Py_XDECREF(newobj);
199 	return ret;
200 }
201 
/*
 * Python-callable: asciilower(bytes) -> bytes
 *
 * Lower-cases an ASCII bytes object via lowertable. No fallback is
 * supplied, so a byte with the high bit set raises UnicodeDecodeError.
 */
PyObject *asciilower(PyObject *self, PyObject *args)
{
	PyObject *input;

	if (PyArg_ParseTuple(args, "O!:asciilower", &PyBytes_Type, &input)) {
		return _asciitransform(input, lowertable, NULL);
	}
	return NULL;
}
210 
/*
 * Python-callable: asciiupper(bytes) -> bytes
 *
 * Upper-cases an ASCII bytes object via uppertable. No fallback is
 * supplied, so a byte with the high bit set raises UnicodeDecodeError.
 */
PyObject *asciiupper(PyObject *self, PyObject *args)
{
	PyObject *input;

	if (PyArg_ParseTuple(args, "O!:asciiupper", &PyBytes_Type, &input)) {
		return _asciitransform(input, uppertable, NULL);
	}
	return NULL;
}
219 
/*
 * Python-callable: make_file_foldmap(dmap, spec, normcase_fallback) -> dict
 *
 * Builds a dict mapping the case-normalized form of each key of 'dmap'
 * to the original key.
 *
 *   dmap              - dict; every value must pass dirstate_tuple_check()
 *   spec              - integer selecting the normalization:
 *                       NORMCASE_LOWER / NORMCASE_UPPER use the ASCII
 *                       tables, NORMCASE_OTHER always calls the fallback
 *   normcase_fallback - Python function called with the key when the
 *                       ASCII tables cannot be used
 *
 * Only entries whose flags have the dirstate_flag_wc_tracked bit set are
 * added. Returns a new dict, or NULL with an exception set.
 */
PyObject *make_file_foldmap(PyObject *self, PyObject *args)
{
	PyObject *dmap, *spec_obj, *normcase_fallback;
	PyObject *file_foldmap = NULL;
	enum normcase_spec spec;
	PyObject *k, *v;
	dirstateItemObject *tuple;
	Py_ssize_t pos = 0;
	const char *table;

	if (!PyArg_ParseTuple(args, "O!O!O!:make_file_foldmap", &PyDict_Type,
	                      &dmap, &PyInt_Type, &spec_obj, &PyFunction_Type,
	                      &normcase_fallback)) {
		goto quit;
	}

	spec = (int)PyInt_AS_LONG(spec_obj);
	switch (spec) {
	case NORMCASE_LOWER:
		table = lowertable;
		break;
	case NORMCASE_UPPER:
		table = uppertable;
		break;
	case NORMCASE_OTHER:
		/* no ASCII fast path: every key goes through the fallback */
		table = NULL;
		break;
	default:
		PyErr_SetString(PyExc_TypeError, "invalid normcasespec");
		goto quit;
	}

	/* Add some more entries to deal with additions outside this
	   function. */
	file_foldmap = _dict_new_presized((PyDict_Size(dmap) / 10) * 11);
	if (file_foldmap == NULL) {
		goto quit;
	}

	while (PyDict_Next(dmap, &pos, &k, &v)) {
		PyObject *normed;
		int rc;

		if (!dirstate_tuple_check(v)) {
			PyErr_SetString(PyExc_TypeError,
			                "expected a dirstate tuple");
			goto quit;
		}

		tuple = (dirstateItemObject *)v;
		/* Test the bit with '&'. A bitwise OR with a non-zero flag
		 * constant is always non-zero and would let every entry
		 * through regardless of its flags. */
		if (!(tuple->flags & dirstate_flag_wc_tracked)) {
			continue;
		}

		if (table != NULL) {
			normed = _asciitransform(k, table, normcase_fallback);
		} else {
			normed = PyObject_CallFunctionObjArgs(
			    normcase_fallback, k, NULL);
		}
		if (normed == NULL) {
			goto quit;
		}

		/* the dict holds its own references; drop ours either way */
		rc = PyDict_SetItem(file_foldmap, normed, k);
		Py_DECREF(normed);
		if (rc == -1) {
			goto quit;
		}
	}
	return file_foldmap;
quit:
	Py_XDECREF(file_foldmap);
	return NULL;
}
292 
293 /* calculate length of JSON-escaped string; returns -1 if unsupported */
jsonescapelen(const char * buf,Py_ssize_t len,bool paranoid)294 static Py_ssize_t jsonescapelen(const char *buf, Py_ssize_t len, bool paranoid)
295 {
296 	Py_ssize_t i, esclen = 0;
297 
298 	if (paranoid) {
299 		/* don't want to process multi-byte escapes in C */
300 		for (i = 0; i < len; i++) {
301 			char c = buf[i];
302 			if (c & 0x80) {
303 				PyErr_SetString(PyExc_ValueError,
304 				                "cannot process non-ascii str");
305 				return -1;
306 			}
307 			esclen += jsonparanoidlentable[(unsigned char)c];
308 			if (esclen < 0) {
309 				PyErr_SetString(PyExc_MemoryError,
310 				                "overflow in jsonescapelen");
311 				return -1;
312 			}
313 		}
314 	} else {
315 		for (i = 0; i < len; i++) {
316 			char c = buf[i];
317 			esclen += jsonlentable[(unsigned char)c];
318 			if (esclen < 0) {
319 				PyErr_SetString(PyExc_MemoryError,
320 				                "overflow in jsonescapelen");
321 				return -1;
322 			}
323 		}
324 	}
325 
326 	return esclen;
327 }
328 
/* Return the character that follows the backslash in the two-byte JSON
 * escape of 'c' (e.g. newline -> 'n'); '\0' for any byte that has no
 * such escape. */
static char jsonescapechar2(char c)
{
	if (c == '\b') {
		return 'b';
	}
	if (c == '\t') {
		return 't';
	}
	if (c == '\n') {
		return 'n';
	}
	if (c == '\f') {
		return 'f';
	}
	if (c == '\r') {
		return 'r';
	}
	if (c == '"' || c == '\\') {
		/* these two escape to themselves */
		return c;
	}
	return '\0'; /* should not happen */
}
350 
351 /* convert 'origbuf' to JSON-escaped form 'escbuf'; 'origbuf' should only
352    include characters mappable by json(paranoid)lentable */
encodejsonescape(char * escbuf,Py_ssize_t esclen,const char * origbuf,Py_ssize_t origlen,bool paranoid)353 static void encodejsonescape(char *escbuf, Py_ssize_t esclen,
354                              const char *origbuf, Py_ssize_t origlen,
355                              bool paranoid)
356 {
357 	const uint8_t *lentable =
358 	    (paranoid) ? jsonparanoidlentable : jsonlentable;
359 	Py_ssize_t i, j;
360 
361 	for (i = 0, j = 0; i < origlen; i++) {
362 		char c = origbuf[i];
363 		uint8_t l = lentable[(unsigned char)c];
364 		assert(j + l <= esclen);
365 		switch (l) {
366 		case 1:
367 			escbuf[j] = c;
368 			break;
369 		case 2:
370 			escbuf[j] = '\\';
371 			escbuf[j + 1] = jsonescapechar2(c);
372 			break;
373 		case 6:
374 			memcpy(escbuf + j, "\\u00", 4);
375 			escbuf[j + 4] = hexchartable[(unsigned char)c >> 4];
376 			escbuf[j + 5] = hexchartable[(unsigned char)c & 0xf];
377 			break;
378 		}
379 		j += l;
380 	}
381 }
382 
/*
 * Python-callable: jsonescapeu8fast(bytes, paranoid) -> bytes
 *
 * Returns the JSON-escaped form of the bytes argument. When nothing
 * needs escaping (escaped length equals the input length) the argument
 * itself is returned with a new reference. Returns NULL with an
 * exception set on a parse error, unsupported input, overflow, or
 * allocation failure.
 */
PyObject *jsonescapeu8fast(PyObject *self, PyObject *args)
{
	PyObject *input;
	PyObject *output;
	const char *src;
	Py_ssize_t srclen;
	Py_ssize_t outlen;
	int paranoid;

	if (!PyArg_ParseTuple(args, "O!i:jsonescapeu8fast", &PyBytes_Type,
	                      &input, &paranoid)) {
		return NULL;
	}

	src = PyBytes_AS_STRING(input);
	srclen = PyBytes_GET_SIZE(input);

	outlen = jsonescapelen(src, srclen, paranoid);
	if (outlen < 0) {
		return NULL; /* unsupported char found or overflow */
	}

	if (outlen == srclen) {
		Py_INCREF(input);
		return input;
	}

	output = PyBytes_FromStringAndSize(NULL, outlen);
	if (output != NULL) {
		encodejsonescape(PyBytes_AS_STRING(output), outlen, src,
		                 srclen, paranoid);
	}
	return output;
}
414