1 /* charsets.c
2 * Routines for handling character sets
3 *
4 * Wireshark - Network traffic analyzer
5 * By Gerald Combs <gerald@wireshark.org>
6 * Copyright 1998 Gerald Combs
7 *
8 * SPDX-License-Identifier: GPL-2.0-or-later
9 */
10
11 #include "config.h"
12
13 #include <errno.h>
14 #include <glib.h>
15
16 #include <epan/proto.h>
17 #include <epan/wmem_scopes.h>
18
19 #include <wsutil/pint.h>
20 #include <wsutil/unicode-utils.h>
21
22 #include "charsets.h"
23
24 /* REPLACEMENT CHARACTER */
25 #define UNREPL 0xFFFD
26
27 /*
28 * Wikipedia's "Character encoding" template, giving a pile of character
29 * encodings and Wikipedia pages for them:
30 *
31 * http://en.wikipedia.org/wiki/Template:Character_encoding
32 *
33 * Unicode character encoding model:
34 *
35 * https://www.unicode.org/reports/tr17/
36 *
37 * International Components for Unicode character set mapping tables:
38 *
39 * http://site.icu-project.org/charts/charset
40 *
41 * MSDN information on code pages:
42 *
43 * https://docs.microsoft.com/en-us/windows/win32/intl/code-pages
44 *
45 * ASCII-based code pages, from IBM:
46 *
47 * http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html
48 *
49 * EBCDIC code pages, from IBM:
50 *
51 * http://www-03.ibm.com/systems/i/software/globalization/codepages.html
52 *
53 * The IBM pages are no longer available; the versions archived on the
54 * Wayback Machine are, but the links to the PDF and text versions of
55 * the code pages don't all work (do *any* work?).
56 */
57
58 /*
59 * Given a wmem scope, a pointer, and a length, treat the string of bytes
60 * referred to by the pointer and length as an ASCII string, with all bytes
61 * with the high-order bit set being invalid, and return a pointer to a
62 * UTF-8 string, allocated using the wmem scope.
63 *
64 * Octets with the highest bit set will be converted to the Unicode
65 * REPLACEMENT CHARACTER.
66 */
67 guint8 *
get_ascii_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)68 get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
69 {
70 wmem_strbuf_t *str;
71
72 str = wmem_strbuf_sized_new(scope, length+1, 0);
73
74 while (length > 0) {
75 guint8 ch = *ptr;
76
77 if (ch < 0x80)
78 wmem_strbuf_append_c(str, ch);
79 else
80 wmem_strbuf_append_unichar(str, UNREPL);
81 ptr++;
82 length--;
83 }
84
85 return (guint8 *) wmem_strbuf_finalize(str);
86 }
87
88 /*
89 * Given a wmem scope, a pointer, and a length, treat the string of bytes
90 * referred to by the pointer and length as a UTF-8 string, and return a
91 * pointer to a UTF-8 string, allocated using the wmem scope, with all
92 * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
93 * according to the recommended "best practices" given in the Unicode
94 * Standard and specified by W3C/WHATWG.
95 *
96 * Note that in conformance with the Unicode Standard, this treats three
97 * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
98 * and two byte overlong encodings of 7-bit ASCII characters as invalid and
99 * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
100 * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
101 * be added later.
102 */
103 guint8 *
get_utf_8_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)104 get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
105 {
106 wmem_strbuf_t *str;
107 guint8 ch;
108 const guint8 *prev;
109
110 str = wmem_strbuf_sized_new(scope, length+1, 0);
111
112 /* See the Unicode Standard conformance chapter at
113 * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf especially
114 * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
115 * U+FFFD Substitution of Maximal Subparts. */
116 while (length > 0) {
117 gsize unichar_len;
118 ch = *ptr;
119
120 if (ch < 0x80) {
121 wmem_strbuf_append_c(str, ch);
122 } else if (ch < 0xc2 || ch > 0xf4) {
123 wmem_strbuf_append_unichar(str, UNREPL);
124 } else {
125 prev = ptr;
126 if (ch < 0xe0) { /* 110xxxxx, 2 byte char */
127 unichar_len = 2;
128 } else if (ch < 0xf0) { /* 1110xxxx, 3 byte char */
129 unichar_len = 3;
130 ptr++;
131 length--;
132 if (length < 1) {
133 wmem_strbuf_append_unichar(str, UNREPL);
134 continue;
135 }
136 switch (ch) {
137 case 0xe0:
138 if (*ptr < 0xa0 || *ptr > 0xbf) {
139 wmem_strbuf_append_unichar(str, UNREPL);
140 continue;
141 }
142 break;
143 case 0xed:
144 if (*ptr < 0x80 || *ptr > 0x9f) {
145 wmem_strbuf_append_unichar(str, UNREPL);
146 continue;
147 }
148 break;
149 default:
150 if (*ptr < 0x80 || *ptr > 0xbf) {
151 wmem_strbuf_append_unichar(str, UNREPL);
152 continue;
153 }
154 }
155 } else { /* 11110xxx, 4 byte char - > 0xf4 excluded above */
156 unichar_len = 4;
157 ptr++;
158 length--;
159 if (length < 1) {
160 wmem_strbuf_append_unichar(str, UNREPL);
161 continue;
162 }
163 switch (ch) {
164 case 0xf0:
165 if (*ptr < 0x90 || *ptr > 0xbf) {
166 wmem_strbuf_append_unichar(str, UNREPL);
167 continue;
168 }
169 break;
170 case 0xf4:
171 if (*ptr < 0x80 || *ptr > 0x8f) {
172 wmem_strbuf_append_unichar(str, UNREPL);
173 continue;
174 }
175 break;
176 default:
177 if (*ptr < 0x80 || *ptr > 0xbf) {
178 wmem_strbuf_append_unichar(str, UNREPL);
179 continue;
180 }
181 }
182 ptr++;
183 length--;
184 if (length < 1) {
185 wmem_strbuf_append_unichar(str, UNREPL);
186 continue;
187 }
188 if (*ptr < 0x80 || *ptr > 0xbf) {
189 wmem_strbuf_append_unichar(str, UNREPL);
190 continue;
191 }
192 }
193
194 ptr++;
195 length--;
196 if (length < 1) {
197 wmem_strbuf_append_unichar(str, UNREPL);
198 continue;
199 }
200 if (*ptr < 0x80 || *ptr > 0xbf) {
201 wmem_strbuf_append_unichar(str, UNREPL);
202 continue;
203 } else {
204 wmem_strbuf_append_len(str, prev, unichar_len);
205 }
206 }
207
208 ptr++;
209 length--;
210 }
211
212 return (guint8 *) wmem_strbuf_finalize(str);
213 }
214
215 /*
216 * ISO 646 "Basic code table".
217 */
218 const gunichar2 charset_table_iso_646_basic[0x80] = {
219 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, /* 0x00 - */
220 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* - 0x0F */
221 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, /* 0x10 - */
222 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* - 0x1F */
223 0x0020, 0x0021, 0x0022, UNREPL, UNREPL, 0x0025, 0x0026, 0x0027, /* 0x20 - */
224 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* - 0x2F */
225 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 0x30 - */
226 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* - 0x3F */
227 UNREPL, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 0x40 - */
228 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* - 0x4F */
229 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 0x50 - */
230 0x0058, 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, 0x005f, /* - 0x5F */
231 UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 0x60 - */
232 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* - 0x6F */
233 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 0x70 - */
234 0x0078, 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, 0x007f, /* - 0x7F */
235 };
236
237 /*
238 * Given a wmem scope, a pointer, a length, and a translation table,
239 * treat the string of bytes referred to by the pointer and length as a
240 * string encoded using one octet per character, with octets with the
241 * high-order bit clear being mapped by the translation table to 2-byte
242 * Unicode Basic Multilingual Plane characters (including REPLACEMENT
243 * CHARACTER) and octets with the high-order bit set being mapped to
244 * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
245 * allocated using the wmem scope.
246 */
247 guint8 *
get_iso_646_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const gunichar2 table[0x80])248 get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80])
249 {
250 wmem_strbuf_t *str;
251
252 str = wmem_strbuf_sized_new(scope, length+1, 0);
253
254 while (length > 0) {
255 guint8 ch = *ptr;
256
257 if (ch < 0x80)
258 wmem_strbuf_append_unichar(str, table[ch]);
259 else
260 wmem_strbuf_append_unichar(str, UNREPL);
261 ptr++;
262 length--;
263 }
264
265 return (guint8 *) wmem_strbuf_finalize(str);
266 }
267
268 /*
269 * Given a wmem scope, a pointer, and a length, treat the string of bytes
270 * referred to by the pointer and length as an ISO 8859/1 string, and
271 * return a pointer to a UTF-8 string, allocated using the wmem scope.
272 */
273 guint8 *
get_8859_1_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)274 get_8859_1_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
275 {
276 wmem_strbuf_t *str;
277
278 str = wmem_strbuf_sized_new(scope, length+1, 0);
279
280 while (length > 0) {
281 guint8 ch = *ptr;
282
283 if (ch < 0x80)
284 wmem_strbuf_append_c(str, ch);
285 else {
286 /*
287 * Note: we assume here that the code points
288 * 0x80-0x9F are used for C1 control characters,
289 * and thus have the same value as the corresponding
290 * Unicode code points.
291 */
292 wmem_strbuf_append_unichar(str, ch);
293 }
294 ptr++;
295 length--;
296 }
297
298 return (guint8 *) wmem_strbuf_finalize(str);
299 }
300
301 /*
302 * Translation tables that map the upper 128 code points in single-byte
303 * "extended ASCII" character encodings to Unicode code points in the
304 * Basic Multilingual Plane.
305 */
306
307 /* ISO-8859-2 (https://en.wikipedia.org/wiki/ISO/IEC_8859-2#Code_page_layout) */
308 const gunichar2 charset_table_iso_8859_2[0x80] = {
309 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
310 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
311 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
312 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
313 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, /* 0xA0 - */
314 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, /* - 0xAF */
315 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, /* 0xB0 - */
316 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, /* - 0xBF */
317 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */
318 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */
319 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */
320 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */
321 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */
322 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */
323 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */
324 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9 /* - 0xFF */
325 };
326
327 /* generated by ../tools/make_charset_ISO-8859-3 */
328 const gunichar2 charset_table_iso_8859_3[0x80] = {
329 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
330 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
331 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
332 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
333 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, UNREPL, 0x0124, 0x00a7, /* 0xA0 - */
334 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, UNREPL, 0x017b, /* - 0xAF */
335 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, /* 0xB0 - */
336 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, UNREPL, 0x017c, /* - 0xBF */
337 0x00c0, 0x00c1, 0x00c2, UNREPL, 0x00c4, 0x010a, 0x0108, 0x00c7, /* 0xC0 - */
338 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
339 UNREPL, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, /* 0xD0 - */
340 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, /* - 0xDF */
341 0x00e0, 0x00e1, 0x00e2, UNREPL, 0x00e4, 0x010b, 0x0109, 0x00e7, /* 0xE0 - */
342 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
343 UNREPL, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, /* 0xF0 - */
344 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, /* - 0xFF */
345 };
346
347 /* generated by ../tools/make_charset_ISO-8859-4 */
348 const gunichar2 charset_table_iso_8859_4[0x80] = {
349 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
350 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
351 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
352 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
353 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, /* 0xA0 - */
354 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, /* - 0xAF */
355 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, /* 0xB0 - */
356 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, /* - 0xBF */
357 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, /* 0xC0 - */
358 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, /* - 0xCF */
359 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
360 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, /* - 0xDF */
361 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, /* 0xE0 - */
362 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, /* - 0xEF */
363 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
364 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, /* - 0xFF */
365 };
366
367 /* ISO-8859-5 (https://en.wikipedia.org/wiki/ISO/IEC_8859-5#Code_page_layout) */
368 const gunichar2 charset_table_iso_8859_5[0x80] = {
369 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
370 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
371 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
372 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
373 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, /* 0xA0 - */
374 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, /* - 0xAF */
375 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0xB0 - */
376 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0xBF */
377 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0xC0 - */
378 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0xCF */
379 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xD0 - */
380 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xDF */
381 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xE0 - */
382 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xEF */
383 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, /* 0xF0 - */
384 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f /* - 0xFF */
385 };
386
387 /* generated by ../tools/make_charset_ISO-8859-6 */
388 const gunichar2 charset_table_iso_8859_6[0x80] = {
389 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
390 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
391 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
392 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
393 0x00a0, UNREPL, UNREPL, UNREPL, 0x00a4, UNREPL, UNREPL, UNREPL, /* 0xA0 - */
394 UNREPL, UNREPL, UNREPL, UNREPL, 0x060c, 0x00ad, UNREPL, UNREPL, /* - 0xAF */
395 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xB0 - */
396 UNREPL, UNREPL, UNREPL, 0x061b, UNREPL, UNREPL, UNREPL, 0x061f, /* - 0xBF */
397 UNREPL, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, /* 0xC0 - */
398 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f, /* - 0xCF */
399 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, /* 0xD0 - */
400 0x0638, 0x0639, 0x063a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xDF */
401 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, /* 0xE0 - */
402 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, /* - 0xEF */
403 0x0650, 0x0651, 0x0652, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xF0 - */
404 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xFF */
405 };
406
407 /* generated by ../tools/make_charset_ISO-8859-7 */
408 const gunichar2 charset_table_iso_8859_7[0x80] = {
409 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
410 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
411 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
412 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
413 0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, /* 0xA0 - */
414 0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, UNREPL, 0x2015, /* - 0xAF */
415 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, /* 0xB0 - */
416 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, /* - 0xBF */
417 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, /* 0xC0 - */
418 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, /* - 0xCF */
419 0x03a0, 0x03a1, UNREPL, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, /* 0xD0 - */
420 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, /* - 0xDF */
421 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, /* 0xE0 - */
422 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, /* - 0xEF */
423 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, /* 0xF0 - */
424 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, UNREPL, /* - 0xFF */
425 };
426
427 /* generated by ../tools/make_charset_ISO-8859-8 */
428 const gunichar2 charset_table_iso_8859_8[0x80] = {
429 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
430 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
431 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
432 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
433 0x00a0, UNREPL, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */
434 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
435 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
436 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, UNREPL, /* - 0xBF */
437 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xC0 - */
438 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xCF */
439 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xD0 - */
440 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, 0x2017, /* - 0xDF */
441 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7, /* 0xE0 - */
442 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df, /* - 0xEF */
443 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7, /* 0xF0 - */
444 0x05e8, 0x05e9, 0x05ea, UNREPL, UNREPL, 0x200e, 0x200f, UNREPL, /* - 0xFF */
445 };
446
447 /* ISO-8859-9 (https://en.wikipedia.org/wiki/ISO/IEC_8859-9#Code_page_layout) */
448 const gunichar2 charset_table_iso_8859_9[0x80] = {
449 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
450 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
451 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
452 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
453 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */
454 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
455 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
456 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* - 0xBF */
457 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
458 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
459 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
460 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, /* - 0xDF */
461 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
462 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
463 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
464 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff /* - 0xFF */
465 };
466
467 /* generated by ../tools/make_charset_ISO-8859-10 */
468 const gunichar2 charset_table_iso_8859_10[0x80] = {
469 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
470 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
471 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
472 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
473 0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7, /* 0xA0 - */
474 0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a, /* - 0xAF */
475 0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7, /* 0xB0 - */
476 0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b, /* - 0xBF */
477 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, /* 0xC0 - */
478 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
479 0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168, /* 0xD0 - */
480 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */
481 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, /* 0xE0 - */
482 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
483 0x00f0, 0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169, /* 0xF0 - */
484 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138, /* - 0xFF */
485 };
486
487 /* generated by ../tools/make_charset_ISO-8859-11 */
488 const gunichar2 charset_table_iso_8859_11[0x80] = {
489 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
490 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
491 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
492 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
493 0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07, /* 0xA0 - */
494 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, /* - 0xAF */
495 0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17, /* 0xB0 - */
496 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, /* - 0xBF */
497 0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27, /* 0xC0 - */
498 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, /* - 0xCF */
499 0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37, /* 0xD0 - */
500 0x0e38, 0x0e39, 0x0e3a, UNREPL, UNREPL, UNREPL, UNREPL, 0x0e3f, /* - 0xDF */
501 0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47, /* 0xE0 - */
502 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, /* - 0xEF */
503 0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57, /* 0xF0 - */
504 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xFF */
505 };
506
507 /* generated by ../tools/make_charset_ISO-8859-13 */
508 const gunichar2 charset_table_iso_8859_13[0x80] = {
509 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
510 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
511 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
512 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
513 0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7, /* 0xA0 - */
514 0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6, /* - 0xAF */
515 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
516 0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6, /* - 0xBF */
517 0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112, /* 0xC0 - */
518 0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b, /* - 0xCF */
519 0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
520 0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df, /* - 0xDF */
521 0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113, /* 0xE0 - */
522 0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c, /* - 0xEF */
523 0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
524 0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019, /* - 0xFF */
525 };
526
527 /* generated by ../tools/make_charset_ISO-8859-14 */
528 const gunichar2 charset_table_iso_8859_14[0x80] = {
529 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
530 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
531 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
532 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
533 0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, /* 0xA0 - */
534 0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, /* - 0xAF */
535 0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, /* 0xB0 - */
536 0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, /* - 0xBF */
537 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
538 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
539 0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, /* 0xD0 - */
540 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, /* - 0xDF */
541 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
542 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
543 0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, /* 0xF0 - */
544 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff, /* - 0xFF */
545 };
546
547 /* generated by ../tools/make_charset_ISO-8859-15 */
548 const gunichar2 charset_table_iso_8859_15[0x80] = {
549 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
550 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
551 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
552 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
553 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7, /* 0xA0 - */
554 0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
555 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
556 0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf, /* - 0xBF */
557 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
558 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
559 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
560 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */
561 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
562 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
563 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
564 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, /* - 0xFF */
565 };
566
567 /* generated by ../tools/make_charset_ISO-8859-16 */
568 const gunichar2 charset_table_iso_8859_16[0x80] = {
569 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
570 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
571 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
572 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
573 0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7, /* 0xA0 - */
574 0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b, /* - 0xAF */
575 0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7, /* 0xB0 - */
576 0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c, /* - 0xBF */
577 0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7, /* 0xC0 - */
578 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
579 0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a, /* 0xD0 - */
580 0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df, /* - 0xDF */
581 0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7, /* 0xE0 - */
582 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
583 0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b, /* 0xF0 - */
584 0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff, /* - 0xFF */
585 };
586
587 /*
588 * Windows-1250
589 *
590 * See:
591 * httpss://en.wikipedia.org/wiki/Windows-1250)
592 * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
593 */
594 const gunichar2 charset_table_cp1250[0x80] = {
595 0x20ac, UNREPL, 0x201a, UNREPL, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */
596 UNREPL, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, /* - 0x8F */
597 UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */
598 UNREPL, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, /* - 0x9F */
599 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, /* 0xA0 - */
600 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, /* - 0xAF */
601 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
602 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, /* - 0xBF */
603 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */
604 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */
605 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */
606 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */
607 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */
608 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */
609 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */
610 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, /* - 0xFF */
611 };
612
/*
 * Windows-1251
 *
 * Translation table for octets 0x80 through 0xFF; entry N holds the
 * UCS-2 code point for octet (0x80 + N).  The one position with no
 * assigned character (octet 0x98) holds UNREPL.
 *
 * See:
 *     https://en.wikipedia.org/wiki/Windows-1251
 *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
 */
const gunichar2 charset_table_cp1251[0x80] = {
    0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
    0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040B, 0x040f,        /*      - 0x8F */
    0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
    UNREPL, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,        /*      - 0x9F */
    0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,        /* 0xA0 -      */
    0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,        /*      - 0xAF */
    0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
    0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,        /*      - 0xBF */
    /* 0xC0 - 0xFF: the contiguous run U+0410 through U+044F */
    0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0xC0 -      */
    0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0xCF */
    0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0xD0 -      */
    0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0xDF */
    0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xE0 -      */
    0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xEF */
    0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xF0 -      */
    0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xFF */
};
638
639 /*
640 * Windows-1252
641 *
642 * See:
643 * https://en.wikipedia.org/wiki/Windows-1252
644 * https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
645 */
646 const gunichar2 charset_table_cp1252[0x80] = {
647 0x20ac, UNREPL, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */
648 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, UNREPL, 0x0172, UNREPL, /* - 0x8F */
649 UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */
650 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, UNREPL, 0x0173, 0x0178, /* - 0x9F */
651 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */
652 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
653 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
654 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* - 0xBF */
655 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
656 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
657 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
658 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */
659 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
660 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
661 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
662 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, /* - 0xFF */
663 };
664
/* generated by ./make_charset_table MACROMAN */
/* That's "MacRoman", not "Macro Man" (faster than a speeding recursive expansion!) */
/*
 * Translation table for octets 0x80 through 0xFF; entry N holds the
 * UCS-2 code point for octet (0x80 + N).  Every position is assigned,
 * so the table contains no UNREPL entries.
 */
const gunichar2 charset_table_mac_roman[0x80] = {
    0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,        /* 0x80 -      */
    0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,        /*      - 0x8F */
    0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,        /* 0x90 -      */
    0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,        /*      - 0x9F */
    0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,        /* 0xA0 -      */
    0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,        /*      - 0xAF */
    0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,        /* 0xB0 -      */
    0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8,        /*      - 0xBF */
    0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,        /* 0xC0 -      */
    0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,        /*      - 0xCF */
    0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,        /* 0xD0 -      */
    0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02,        /*      - 0xDF */
    0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,        /* 0xE0 -      */
    0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,        /*      - 0xEF */
    /* 0xF0 maps to U+F8FF, a code point in the Private Use Area */
    0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,        /* 0xF0 -      */
    0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,        /*      - 0xFF */
};
685
/* generated by ./make_charset_table CP437 */
/*
 * Translation table for octets 0x80 through 0xFF; entry N holds the
 * UCS-2 code point for octet (0x80 + N).  Every position is assigned,
 * so the table contains no UNREPL entries.
 */
const gunichar2 charset_table_cp437[0x80] = {
    0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,        /* 0x80 -      */
    0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,        /*      - 0x8F */
    0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,        /* 0x90 -      */
    0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,        /*      - 0x9F */
    0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,        /* 0xA0 -      */
    0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,        /*      - 0xAF */
    /* 0xB0 - 0xDF: shade, box-drawing and block-element characters (U+25xx) */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,        /* 0xB0 -      */
    0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,        /*      - 0xBF */
    0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,        /* 0xC0 -      */
    0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,        /*      - 0xCF */
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,        /* 0xD0 -      */
    0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,        /*      - 0xDF */
    0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4,        /* 0xE0 -      */
    0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,        /*      - 0xEF */
    0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,        /* 0xF0 -      */
    0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0,        /*      - 0xFF */
};
705
706 /*
707 * CP855
708 *
709 * See
710 * https://en.wikipedia.org/wiki/CP855
711 * https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT
712 *
713 * XXX - this doesn't have the graphics for 0x00 through 0x1F shown
714 * on the Wikipedia page, but not in the Microsoft mapping file;
715 * that would require a 256-code-point mapping table. (Are those
716 * positions used for the same graphics on all code pages - the PC
717 * graphics set, or whatever it's called?)
718 */
719 const gunichar2 charset_table_cp855[0x80] = {
720 0x0452, 0x0402, 0x0453, 0x0403, 0x0451, 0x0401, 0x0454, 0x0404, /* 0x80 - */
721 0x0455, 0x0405, 0x0456, 0x0406, 0x0457, 0x0407, 0x0458, 0x0408, /* - 0x8F */
722 0x0459, 0x0409, 0x045a, 0x040a, 0x045b, 0x040b, 0x045c, 0x040c, /* 0x90 - */
723 0x045e, 0x040e, 0x045f, 0x040f, 0x044e, 0x042e, 0x044a, 0x042a, /* - 0x9F */
724 0x0430, 0x0410, 0x0431, 0x0411, 0x0446, 0x0426, 0x0434, 0x0414, /* 0xA0 - */
725 0x0435, 0x0415, 0x0444, 0x0424, 0x0433, 0x0413, 0x00ab, 0x00bb, /* - 0xAF */
726 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0445, 0x0425, 0x0438, /* 0xB0 - */
727 0x0418, 0x2563, 0x2551, 0x2557, 0x2550, 0x0439, 0x0419, 0x2510, /* - 0xBF */
728 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x043a, 0x041a, /* 0xC0 - */
729 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, /* - 0xCF */
730 0x043b, 0x041b, 0x043c, 0x041c, 0x043d, 0x041d, 0x043e, 0x041e, /* 0xD0 - */
731 0x043f, 0x2518, 0x250c, 0x2588, 0x2584, 0x041f, 0x044f, 0x2580, /* - 0xDF */
732 0x042f, 0x0440, 0x0420, 0x0441, 0x0421, 0x0442, 0x0422, 0x0443, /* 0xE0 - */
733 0x0423, 0x0436, 0x0416, 0x0432, 0x0412, 0x044c, 0x042c, 0x2116, /* - 0xEF */
734 0x00ad, 0x044b, 0x042b, 0x0437, 0x0417, 0x0448, 0x0428, 0x044d, /* 0xF0 - */
735 0x042d, 0x0449, 0x0429, 0x0447, 0x0427, 0x00a7, 0x25a0, 0x00a0, /* - 0xFF */
736 };
737
738 /*
739 * CP866
740 *
741 * See:
742 * https://en.wikipedia.org/wiki/CP866
743 * https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT
744 */
745 const gunichar2 charset_table_cp866[0x80] = {
746 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0x80 - */
747 0x0418, 0x0419, 0x041A, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0x8F */
748 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0x90 - */
749 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0x9F */
750 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xA0 - */
751 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xAF */
752 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, /* 0xB0 - */
753 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, /* - 0xBF */
754 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, /* 0xC0 - */
755 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, /* - 0xCF */
756 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, /* 0xD0 - */
757 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* - 0xDF */
758 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xE0 - */
759 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xEF */
760 0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040e, 0x045e, /* 0xF0 - */
761 0x00b0, 0x2219, 0x00b7, 0x221a, 0x2216, 0x00a4, 0x25a0, 0x00a0, /* - 0xFF */
762 };
763
764 /*
765 * Given a wmem scope, a pointer, a length, and a translation table with
766 * 128 entries, treat the string of bytes referred to by the pointer and
767 * length as a string encoded using one octet per character, with octets
768 * with the high-order bit clear being ASCII and octets with the high-order
769 * bit set being mapped by the translation table to 2-byte Unicode Basic
770 * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
771 * return a pointer to a UTF-8 string, allocated using the wmem scope.
772 */
773 guint8 *
get_unichar2_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const gunichar2 table[0x80])774 get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80])
775 {
776 wmem_strbuf_t *str;
777
778 str = wmem_strbuf_sized_new(scope, length+1, 0);
779
780 while (length > 0) {
781 guint8 ch = *ptr;
782
783 if (ch < 0x80)
784 wmem_strbuf_append_c(str, ch);
785 else
786 wmem_strbuf_append_unichar(str, table[ch-0x80]);
787 ptr++;
788 length--;
789 }
790
791 return (guint8 *) wmem_strbuf_finalize(str);
792 }
793
794 /*
795 * Given a wmem scope, a pointer, and a length, treat the string of bytes
796 * referred to by the pointer and length as a UCS-2 encoded string
797 * containing characters from the Basic Multilingual Plane (plane 0) of
798 * Unicode, and return a pointer to a UTF-8 string, allocated with the
799 * wmem scope.
800 *
801 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
802 *
803 * Specify length in bytes.
804 *
805 * XXX - should map lead and trail surrogate values to REPLACEMENT
806 * CHARACTERs (0xFFFD)?
807 * XXX - if there are an odd number of bytes, should put a
808 * REPLACEMENT CHARACTER at the end.
809 */
810 guint8 *
get_ucs_2_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const guint encoding)811 get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
812 {
813 gunichar2 uchar;
814 gint i; /* Byte counter for string */
815 wmem_strbuf_t *strbuf;
816
817 strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
818
819 for(i = 0; i + 1 < length; i += 2) {
820 if (encoding == ENC_BIG_ENDIAN){
821 uchar = pntoh16(ptr + i);
822 }else{
823 uchar = pletoh16(ptr + i);
824 }
825 wmem_strbuf_append_unichar(strbuf, uchar);
826 }
827
828 /*
829 * XXX - if i < length, this means we were handed an odd
830 * number of bytes, so we're not a valid UCS-2 string.
831 */
832 return (guint8 *) wmem_strbuf_finalize(strbuf);
833 }
834
835 /*
836 * Given a wmem scope, a pointer, and a length, treat the string of bytes
837 * referred to by the pointer and length as a UTF-16 encoded string, and
838 * return a pointer to a UTF-8 string, allocated with the wmem scope.
839 *
840 * See RFC 2781 section 2.2.
841 *
842 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
843 *
844 * Specify length in bytes.
845 *
846 * XXX - should map invalid Unicode characters to REPLACEMENT CHARACTERs.
847 */
848 guint8 *
get_utf_16_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const guint encoding)849 get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
850 {
851 wmem_strbuf_t *strbuf;
852 gunichar2 uchar2, lead_surrogate;
853 gunichar uchar;
854 gint i; /* Byte counter for string */
855
856 strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
857
858 for(i = 0; i + 1 < length; i += 2) {
859 if (encoding == ENC_BIG_ENDIAN)
860 uchar2 = pntoh16(ptr + i);
861 else
862 uchar2 = pletoh16(ptr + i);
863
864 if (IS_LEAD_SURROGATE(uchar2)) {
865 /*
866 * Lead surrogate. Must be followed by
867 * a trail surrogate.
868 */
869 i += 2;
870 if (i + 1 >= length) {
871 /*
872 * Oops, string ends with a lead surrogate.
873 *
874 * Insert a REPLACEMENT CHARACTER to mark the error,
875 * and quit.
876 */
877 wmem_strbuf_append_unichar(strbuf, UNREPL);
878 break;
879 }
880 lead_surrogate = uchar2;
881 if (encoding == ENC_BIG_ENDIAN)
882 uchar2 = pntoh16(ptr + i);
883 else
884 uchar2 = pletoh16(ptr + i);
885 if (IS_TRAIL_SURROGATE(uchar2)) {
886 /* Trail surrogate. */
887 uchar = SURROGATE_VALUE(lead_surrogate, uchar2);
888 wmem_strbuf_append_unichar(strbuf, uchar);
889 } else {
890 /*
891 * Not a trail surrogate.
892 *
893 * Insert a REPLACEMENT CHARACTER to mark the error,
894 * and continue;
895 */
896 wmem_strbuf_append_unichar(strbuf, UNREPL);
897 }
898 } else {
899 if (IS_TRAIL_SURROGATE(uchar2)) {
900 /*
901 * Trail surrogate without a preceding
902 * lead surrogate.
903 *
904 * Insert a REPLACEMENT CHARACTER to mark the error,
905 * and continue;
906 */
907 wmem_strbuf_append_unichar(strbuf, UNREPL);
908 } else {
909 /*
910 * Non-surrogate; just append it.
911 */
912 wmem_strbuf_append_unichar(strbuf, uchar2);
913 }
914 }
915 }
916
917 /*
918 * If i < length, this means we were handed an odd number of bytes,
919 * so we're not a valid UTF-16 string; insert a REPLACEMENT CHARACTER
920 * to mark the error.
921 */
922 if (i < length)
923 wmem_strbuf_append_unichar(strbuf, UNREPL);
924 return (guint8 *) wmem_strbuf_finalize(strbuf);
925 }
926
927 /*
928 * Given a wmem scope, a pointer, and a length, treat the string of bytes
929 * referred to by the pointer and length as a UCS-4 encoded string, and
930 * return a pointer to a UTF-8 string, allocated with the wmem scope.
931 *
932 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
933 *
934 * Specify length in bytes
935 *
936 * XXX - should map lead and trail surrogate values to a "substitute"
937 * UTF-8 character?
938 * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
939 * XXX - if the number of bytes isn't a multiple of 4, should put a
940 * REPLACEMENT CHARACTER at the end.
941 */
942 guint8 *
get_ucs_4_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const guint encoding)943 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
944 {
945 gunichar uchar;
946 gint i; /* Byte counter for string */
947 wmem_strbuf_t *strbuf;
948
949 strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
950
951 for(i = 0; i + 3 < length; i += 4) {
952 if (encoding == ENC_BIG_ENDIAN)
953 uchar = pntoh32(ptr + i);
954 else
955 uchar = pletoh32(ptr + i);
956
957 wmem_strbuf_append_unichar(strbuf, uchar);
958 }
959
960 /*
961 * XXX - if i < length, this means we were handed a number
962 * of bytes that's not a multiple of 4, so we're not a valid
963 * UCS-4 string.
964 */
965 return (guint8 *)wmem_strbuf_finalize(strbuf);
966 }
967
968 /*
969 * FROM GNOKII
970 * gsm-encoding.c
971 * gsm-sms.c
972 */
973
/* ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet */
/*
 * Indexed by 7-bit GSM code point (0x00 - 0x7F); each entry is the
 * corresponding UCS-2 code point.
 *
 * Index 0x1b (GN_CHAR_ESCAPE) holds 0xa0; handle_ts_23_038_char() tests
 * for the escape code before calling GSM_to_UNICHAR(), so that path does
 * not look this entry up.
 */
static const gunichar2 gsm_default_alphabet[0x80] = {
    '@',   0xa3,  '$',   0xa5,  0xe8,  0xe9,  0xf9,  0xec,     /* 0x00 - 0x07 */
    0xf2,  0xc7,  '\n',  0xd8,  0xf8,  '\r',  0xc5,  0xe5,     /* 0x08 - 0x0f */
    0x394, '_',   0x3a6, 0x393, 0x39b, 0x3a9, 0x3a0, 0x3a8,    /* 0x10 - 0x17 */
    0x3a3, 0x398, 0x39e, 0xa0,  0xc6,  0xe6,  0xdf,  0xc9,     /* 0x18 - 0x1f */
    ' ',   '!',   '\"',  '#',   0xa4,  '%',   '&',   '\'',     /* 0x20 - 0x27 */
    '(',   ')',   '*',   '+',   ',',   '-',   '.',   '/',      /* 0x28 - 0x2f */
    '0',   '1',   '2',   '3',   '4',   '5',   '6',   '7',      /* 0x30 - 0x37 */
    '8',   '9',   ':',   ';',   '<',   '=',   '>',   '?',      /* 0x38 - 0x3f */
    0xa1,  'A',   'B',   'C',   'D',   'E',   'F',   'G',      /* 0x40 - 0x47 */
    'H',   'I',   'J',   'K',   'L',   'M',   'N',   'O',      /* 0x48 - 0x4f */
    'P',   'Q',   'R',   'S',   'T',   'U',   'V',   'W',      /* 0x50 - 0x57 */
    'X',   'Y',   'Z',   0xc4,  0xd6,  0xd1,  0xdc,  0xa7,     /* 0x58 - 0x5f */
    0xbf,  'a',   'b',   'c',   'd',   'e',   'f',   'g',      /* 0x60 - 0x67 */
    'h',   'i',   'j',   'k',   'l',   'm',   'n',   'o',      /* 0x68 - 0x6f */
    'p',   'q',   'r',   's',   't',   'u',   'v',   'w',      /* 0x70 - 0x77 */
    'x',   'y',   'z',   0xe4,  0xf6,  0xf1,  0xfc,  0xe0      /* 0x78 - 0x7f */
};
993
994 static gunichar
GSM_to_UNICHAR(guint8 c)995 GSM_to_UNICHAR(guint8 c)
996 {
997 if (c < G_N_ELEMENTS(gsm_default_alphabet))
998 return gsm_default_alphabet[c];
999
1000 return UNREPL;
1001 }
1002
1003 static gunichar
GSMext_to_UNICHAR(guint8 c)1004 GSMext_to_UNICHAR(guint8 c)
1005 {
1006 switch (c)
1007 {
1008 case 0x0a: return 0x0c; /* form feed */
1009 case 0x14: return '^';
1010 case 0x28: return '{';
1011 case 0x29: return '}';
1012 case 0x2f: return '\\';
1013 case 0x3c: return '[';
1014 case 0x3d: return '~';
1015 case 0x3e: return ']';
1016 case 0x40: return '|';
1017 case 0x65: return 0x20ac; /* euro */
1018 }
1019
1020 return UNREPL; /* invalid character */
1021 }
1022
/*
 * Mask covering the low-order `bits` bits of a value.
 *
 * NOTE: this macro takes no argument; it expands to an expression that
 * uses whatever variable named `bits` is in scope where it is used.
 */
#define GN_BYTE_MASK ((1 << bits) - 1)

/* Code point that char_is_escape() recognises as the escape. */
#define GN_CHAR_ESCAPE 0x1b
1026
1027 static gboolean
char_is_escape(unsigned char value)1028 char_is_escape(unsigned char value)
1029 {
1030 return (value == GN_CHAR_ESCAPE);
1031 }
1032
1033 static gboolean
handle_ts_23_038_char(wmem_strbuf_t * strbuf,guint8 code_point,gboolean saw_escape)1034 handle_ts_23_038_char(wmem_strbuf_t *strbuf, guint8 code_point,
1035 gboolean saw_escape)
1036 {
1037 gunichar uchar;
1038
1039 if (char_is_escape(code_point)) {
1040 /*
1041 * XXX - if saw_escape is TRUE here, then this is
1042 * the case where we escape to "another extension table",
1043 * but TS 128 038 V11.0 doesn't specify such an extension
1044 * table.
1045 */
1046 saw_escape = TRUE;
1047 } else {
1048 if (!(code_point & 0x80)) {
1049 /*
1050 * Code point is valid (7-bit).
1051 * Have we seen an escape?
1052 */
1053 if (saw_escape) {
1054 saw_escape = FALSE;
1055 uchar = GSMext_to_UNICHAR(code_point);
1056 } else {
1057 uchar = GSM_to_UNICHAR(code_point);
1058 }
1059 wmem_strbuf_append_unichar(strbuf, uchar);
1060 } else {
1061 /* Invalid - put in a REPLACEMENT CHARACTER */
1062 wmem_strbuf_append_unichar(strbuf, UNREPL);
1063 }
1064 }
1065 return saw_escape;
1066 }
1067
/*
 * Decode a string of packed 7-bit TS 23.038 code points to UTF-8.
 *
 * scope       - wmem scope used to allocate the result
 * ptr         - first octet of the packed data
 * bit_offset  - only the low three bits are used; they select how many
 *               low-order bits of the first octet belong to the first
 *               code point (0 is treated as 7, i.e. the string starts
 *               on a code point boundary)
 * no_of_chars - number of 7-bit code points to unpack; escape codes
 *               count towards this total
 *
 * There is no byte-length parameter: the loop is bounded only by
 * no_of_chars, so the caller must make sure enough octets are readable.
 *
 * Each unpacked code point is passed to handle_ts_23_038_char().  An
 * escape that is the last code point is shown as a REPLACEMENT
 * CHARACTER.
 */
guint8 *
get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const guint8 *ptr,
        const gint bit_offset, gint no_of_chars)
{
    wmem_strbuf_t *strbuf;
    gint           char_count;                  /* character counter for string */
    guint8         in_byte, out_byte, rest = 0x00;
    const guint8  *start_ptr = ptr;
    gboolean       saw_escape = FALSE;
    int            bits;    /* low-order bits of the next octet that finish the pending code point */

    strbuf = wmem_strbuf_sized_new(scope, no_of_chars+1, 0);

    bits = bit_offset & 0x07;
    if (!bits) {
        bits = 7;
    }

    /* One input octet per iteration; char_count advances inside the body. */
    for(char_count = 0; char_count < no_of_chars; ptr++) {
        /* Get the next byte from the string. */
        in_byte = *ptr;

        /*
         * Combine the bits we've accumulated with bits from
         * that byte to make a 7-bit code point.
         *
         * GN_BYTE_MASK picks up the local variable `bits`.
         */
        out_byte = ((in_byte & GN_BYTE_MASK) << (7 - bits)) | rest;

        /*
         * Leftover bits used in that code point.
         */
        rest = in_byte >> bits;

        /*
         * If we don't start from 0th bit, we shouldn't go to the
         * next char. Under *out_num we have now 0 and under Rest -
         * _first_ part of the char.
         *
         * That is: for the very first octet with bits != 7, out_byte
         * is not emitted; only `rest` is carried into the next octet.
         */
        if ((start_ptr != ptr) || (bits == 7)) {
            saw_escape = handle_ts_23_038_char(strbuf, out_byte,
                saw_escape);
            char_count++;
        }

        /*
         * After reading 7 octets we have read 7 full characters
         * but we have 7 bits as well. This is the next character.
         */
        if ((bits == 1) && (char_count < no_of_chars)) {
            saw_escape = handle_ts_23_038_char(strbuf, rest,
                saw_escape);
            char_count++;
            bits = 7;
            rest = 0x00;
        } else {
            bits--;
        }
    }

    if (saw_escape) {
        /*
         * Escape not followed by anything.
         *
         * XXX - for now, show the escape as a REPLACEMENT
         * CHARACTER.
         */
        wmem_strbuf_append_unichar(strbuf, UNREPL);
    }

    return (guint8 *)wmem_strbuf_finalize(strbuf);
}
1139
1140 guint8 *
get_ts_23_038_7bits_string_unpacked(wmem_allocator_t * scope,const guint8 * ptr,gint length)1141 get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const guint8 *ptr,
1142 gint length)
1143 {
1144 wmem_strbuf_t *strbuf;
1145 gint i; /* Byte counter for string */
1146 gboolean saw_escape = FALSE;
1147
1148 strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
1149
1150 for (i = 0; i < length; i++)
1151 saw_escape = handle_ts_23_038_char(strbuf, *ptr++, saw_escape);
1152
1153 return (guint8 *)wmem_strbuf_finalize(strbuf);
1154 }
1155
1156 /*
1157 * ETSI TS 102 221 Annex A.
1158 */
1159 guint8 *
get_etsi_ts_102_221_annex_a_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)1160 get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
1161 gint length)
1162 {
1163 guint8 string_type;
1164 guint8 string_len;
1165 gunichar2 ucs2_base;
1166 wmem_strbuf_t *strbuf;
1167 guint i; /* Byte counter for string */
1168 gboolean saw_escape = FALSE;
1169
1170 /*
1171 * get the first octet.
1172 */
1173 if (length == 0) {
1174 /* XXX - return error indication */
1175 strbuf = wmem_strbuf_new(scope, "");
1176 return (guint8 *)wmem_strbuf_finalize(strbuf);
1177 }
1178 string_type = *ptr;
1179 ptr++;
1180 length--;
1181
1182 if (string_type == 0x80) {
1183 /*
1184 * Annex A, coding scheme 1) - big-endian UCS-2.
1185 */
1186 return get_ucs_2_string(scope, ptr, length, ENC_BIG_ENDIAN);
1187 }
1188
1189 /*
1190 * Annex A, coding schemes 2) and 3):
1191 *
1192 * the second byte is the number of characters (characters,
1193 * not octets) in the string;
1194 *
1195 * for coding scheme 2), the third byte defines bits 15 to 8
1196 * of all UCS-2 characters in the string (all bit numbers are
1197 * 1-origin, so bit 1 is the low-order bit), with bit 16 being 0;
1198 *
1199 * for coding scheme 3), the third byte and fourth bytes, treated
1200 * as a big-endian value, define the base value for all UCS-2
1201 * characters in the string;
1202 *
1203 * for all subsequent bytes, if bit 8 is 0, it's a character
1204 * in the GSM Default Alphabet, otherwise, it is added to
1205 * the UCS-2 base value to give a UCS-2 character.
1206 *
1207 * XXX - that doesn't seem to indicate that a byte of 0x1b is
1208 * treated as an escape character, it just says that a single octet
1209 * with the 8th bit not set is a GSM Default Alphabet character.
1210 */
1211
1212 /*
1213 * Get the string length, in characters.
1214 */
1215 if (length == 0) {
1216 /* XXX - return error indication */
1217 strbuf = wmem_strbuf_new(scope, "");
1218 return (guint8 *)wmem_strbuf_finalize(strbuf);
1219 }
1220 string_len = *ptr;
1221 ptr++;
1222 length--;
1223
1224 strbuf = wmem_strbuf_sized_new(scope, 2*string_len+1, 0);
1225
1226 /*
1227 * Get the UCS-2 base.
1228 */
1229 if (string_type == 0x81) {
1230 if (length == 0) {
1231 /* XXX - return error indication */
1232 return (guint8 *)wmem_strbuf_finalize(strbuf);
1233 }
1234 ucs2_base = (*ptr) << 7;
1235 ptr++;
1236 length--;
1237 } else if (string_type == 0x82) {
1238 if (length == 0) {
1239 /* XXX - return error indication */
1240 return (guint8 *)wmem_strbuf_finalize(strbuf);
1241 }
1242 ucs2_base = (*ptr) << 8;
1243 ptr++;
1244 length--;
1245
1246 if (length == 0) {
1247 /* XXX - return error indication */
1248 return (guint8 *)wmem_strbuf_finalize(strbuf);
1249 }
1250 ucs2_base |= *ptr;
1251 ptr++;
1252 length--;
1253 } else {
1254 /* Invalid string type. */
1255 /* XXX - return error indication */
1256 return (guint8 *)wmem_strbuf_finalize(strbuf);
1257 }
1258
1259 for (i = 0; i < string_len; i++) {
1260 guint8 byte;
1261
1262 if (length == 0) {
1263 /* XXX - return error indication */
1264 return (guint8 *)wmem_strbuf_finalize(strbuf);
1265 }
1266 byte = *ptr;
1267 if ((byte & 0x80) == 0) {
1268 saw_escape = handle_ts_23_038_char(strbuf, byte, saw_escape);
1269 } else {
1270 gunichar2 uchar;
1271
1272 /*
1273 * XXX - if saw_escape is true, this is bogus.
1274 *
1275 * XXX - should map lead and trail surrogate values to
1276 * REPLACEMENT CHARACTERs (0xFFFD)?
1277 * XXX - if there are an odd number of bytes, should put a
1278 * REPLACEMENT CHARACTER at the end.
1279 */
1280 uchar = ucs2_base + (byte & 0x7f);
1281 wmem_strbuf_append_unichar(strbuf, uchar);
1282 }
1283 }
1284
1285 return (guint8 *)wmem_strbuf_finalize(strbuf);
1286 }
1287
/*
 * Unpack a string of packed 7-bit characters and append each 7-bit
 * value to the result as a single character.
 *
 * scope       - wmem scope used to allocate the result
 * ptr         - first octet of the packed data
 * bit_offset  - only the low three bits are used; 0 is treated as 7
 *               (the string starts on a character boundary)
 * no_of_chars - number of 7-bit characters to unpack
 *
 * Unlike get_ts_23_038_7bits_string_packed(), the pending character is
 * completed from the HIGH-order bits of each octet (the octet is shifted
 * right by 8 - bits), and the values are appended as-is with no alphabet
 * translation.
 *
 * There is no byte-length parameter: the loop is bounded only by
 * no_of_chars, so the caller must make sure enough octets are readable.
 */
guint8 *
get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
        const gint bit_offset, gint no_of_chars)
{
    wmem_strbuf_t *strbuf;
    gint           char_count;                  /* character counter for string */
    guint8         in_byte, out_byte, rest = 0x00;
    const guint8  *start_ptr = ptr;
    int            bits;    /* high-order bits of the next octet that finish the pending character */

    bits = bit_offset & 0x07;
    if (!bits) {
        bits = 7;
    }

    strbuf = wmem_strbuf_sized_new(scope, no_of_chars+1, 0);
    /* One input octet per iteration; char_count advances inside the body. */
    for(char_count = 0; char_count < no_of_chars; ptr++) {
        /* Get the next byte from the string. */
        in_byte = *ptr;

        /*
         * Combine the bits we've accumulated with bits from
         * that byte to make a 7-bit code point.
         */
        out_byte = (in_byte >> (8 - bits)) | rest;

        /*
         * Leftover bits used in that code point.
         *
         * They are shifted up so they form the high-order end of the
         * next 7-bit value.
         */
        rest = (in_byte << (bits - 1)) & 0x7f;

        /*
         * If we don't start from 0th bit, we shouldn't go to the
         * next char. Under *out_num we have now 0 and under Rest -
         * _first_ part of the char.
         */
        if ((start_ptr != ptr) || (bits == 7)) {
            wmem_strbuf_append_c(strbuf, out_byte);
            char_count++;
        }

        /*
         * After reading 7 octets we have read 7 full characters
         * but we have 7 bits as well. This is the next character.
         */
        if ((bits == 1) && (char_count < no_of_chars)) {
            wmem_strbuf_append_c(strbuf, rest);
            char_count++;
            bits = 7;
            rest = 0x00;
        } else {
            bits--;
        }
    }

    return (guint8 *)wmem_strbuf_finalize(strbuf);
}
1345
1346 /* ASCII/EBCDIC conversion tables from
1347 * https://web.archive.org/web/20060813174742/http://www.room42.com/store/computer_center/code_tables.shtml
1348 */
#if 0
/*
 * Everything down to the matching #endif is compiled out.
 *
 * 256-entry table indexed by an ASCII octet, giving an EBCDIC octet.
 * Index 0x7F and every index from 0x80 up hold 0x4B.
 *
 * NOTE(review): index 0x60 holds 0x7D, the same value as index 0x27;
 * confirm whether a different value (0x79?) was intended before
 * re-enabling this code.
 */
static const guint8 ASCII_translate_EBCDIC [ 256 ] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
    0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
    0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D,
    0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
    0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
    0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8,
    0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
    0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
    0x7D, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88,
    0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xC0, 0x6A, 0xD0, 0xA1, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B
};

/* Translate `bytes` octets of `buf` in place through ASCII_translate_EBCDIC. */
void
ASCII_to_EBCDIC(guint8 *buf, guint bytes)
{
    guint    i;
    guint8    *bufptr;

    bufptr = buf;

    for (i = 0; i < bytes; i++, bufptr++) {
        *bufptr = ASCII_translate_EBCDIC[*bufptr];
    }
}

/* Translate a single octet through ASCII_translate_EBCDIC. */
guint8
ASCII_to_EBCDIC1(guint8 c)
{
    return ASCII_translate_EBCDIC[c];
}
#endif
1404
/*
 * 256-entry table indexed by an EBCDIC octet, giving an ASCII octet.
 * Used by EBCDIC_to_ASCII() and EBCDIC_to_ASCII1().
 *
 * Many entries hold 0x2E ('.'); presumably that is the stand-in for
 * EBCDIC code points with no ASCII equivalent - confirm against the
 * source table referenced above.
 */
static const guint8 EBCDIC_translate_ASCII [ 256 ] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,     /* 0x00 - 0x07 */
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,     /* 0x08 - 0x0F */
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,     /* 0x10 - 0x17 */
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,     /* 0x18 - 0x1F */
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,     /* 0x20 - 0x27 */
    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,     /* 0x28 - 0x2F */
    0x2E, 0x2E, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,     /* 0x30 - 0x37 */
    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x2E, 0x3F,     /* 0x38 - 0x3F */
    0x20, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0x40 - 0x47 */
    0x2E, 0x2E, 0x2E, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,     /* 0x48 - 0x4F */
    0x26, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0x50 - 0x57 */
    0x2E, 0x2E, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,     /* 0x58 - 0x5F */
    0x2D, 0x2F, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0x60 - 0x67 */
    0x2E, 0x2E, 0x7C, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,     /* 0x68 - 0x6F */
    0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0x70 - 0x77 */
    0x2E, 0x2E, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,     /* 0x78 - 0x7F */
    0x2E, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,     /* 0x80 - 0x87 */
    0x68, 0x69, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0x88 - 0x8F */
    0x2E, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,     /* 0x90 - 0x97 */
    0x71, 0x72, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0x98 - 0x9F */
    0x2E, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,     /* 0xA0 - 0xA7 */
    0x79, 0x7A, 0x2E, 0x2E, 0x2E, 0x5B, 0x2E, 0x2E,     /* 0xA8 - 0xAF */
    0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0xB0 - 0xB7 */
    0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x5D, 0x2E, 0x2E,     /* 0xB8 - 0xBF */
    0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,     /* 0xC0 - 0xC7 */
    0x48, 0x49, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0xC8 - 0xCF */
    0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,     /* 0xD0 - 0xD7 */
    0x51, 0x52, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0xD8 - 0xDF */
    0x5C, 0x2E, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,     /* 0xE0 - 0xE7 */
    0x59, 0x5A, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,     /* 0xE8 - 0xEF */
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,     /* 0xF0 - 0xF7 */
    0x38, 0x39, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E      /* 0xF8 - 0xFF */
};
1439
1440 void
EBCDIC_to_ASCII(guint8 * buf,guint bytes)1441 EBCDIC_to_ASCII(guint8 *buf, guint bytes)
1442 {
1443 guint i;
1444 guint8 *bufptr;
1445
1446 bufptr = buf;
1447
1448 for (i = 0; i < bytes; i++, bufptr++) {
1449 *bufptr = EBCDIC_translate_ASCII[*bufptr];
1450 }
1451 }
1452
1453 guint8
EBCDIC_to_ASCII1(guint8 c)1454 EBCDIC_to_ASCII1(guint8 c)
1455 {
1456 return EBCDIC_translate_ASCII[c];
1457 }
1458
1459 /* Tables for EBCDIC code pages */
1460
/* EBCDIC common; based on the table in appendix H of ESA/370 Principles
   of Operation, but with some code points that don't correspond to
   the same characters in code pages 037 and 1158 mapped to REPLACEMENT
   CHARACTER - there may be more code points of that sort.

   Unlike the 128-entry tables for ASCII-based code pages, this table
   has an entry for every octet value (0x00 - 0xFF); entry N is the
   UCS-2 code point for EBCDIC octet N. */
const gunichar2 charset_table_ebcdic[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,    /* 0x00 - 0x07 */
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,    /* 0x08 - 0x0F */
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,    /* 0x10 - 0x17 */
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,    /* 0x18 - 0x1F */
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,    /* 0x20 - 0x27 */
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,    /* 0x28 - 0x2F */
    UNREPL, UNREPL, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,    /* 0x30 - 0x37 */
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, UNREPL, 0x001a,    /* 0x38 - 0x3F */
    0x0020, 0x00a0, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0x40 - 0x47 */
    UNREPL, UNREPL, UNREPL, 0x002e, 0x003c, 0x0028, 0x002b, UNREPL,    /* 0x48 - 0x4F */
    0x0026, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0x50 - 0x57 */
    UNREPL, UNREPL, UNREPL, 0x0024, 0x002a, 0x0029, 0x003b, UNREPL,    /* 0x58 - 0x5F */
    0x002d, 0x002f, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0x60 - 0x67 */
    UNREPL, UNREPL, UNREPL, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,    /* 0x68 - 0x6F */
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0x70 - 0x77 */
    UNREPL, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,    /* 0x78 - 0x7F */
    UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,    /* 0x80 - 0x87 */
    0x0068, 0x0069, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0x88 - 0x8F */
    UNREPL, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,    /* 0x90 - 0x97 */
    0x0071, 0x0072, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0x98 - 0x9F */
    UNREPL, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,    /* 0xA0 - 0xA7 */
    0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0xA8 - 0xAF */
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0xB0 - 0xB7 */
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0xB8 - 0xBF */
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,    /* 0xC0 - 0xC7 */
    0x0048, 0x0049, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0xC8 - 0xCF */
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,    /* 0xD0 - 0xD7 */
    0x0051, 0x0052, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0xD8 - 0xDF */
    0x005c, UNREPL, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,    /* 0xE0 - 0xE7 */
    0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0xE8 - 0xEF */
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,    /* 0xF0 - 0xF7 */
    0x0038, 0x0039, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,    /* 0xF8 - 0xFF */
};
1499
/* EBCDIC code page 037
 *
 * One entry per possible octet value (0x00-0xff), eight entries per row.
 * Every octet has a mapping in this table (no entry is UNREPL) and every
 * mapped value lies in the range 0x0000-0x00ff.
 */
const gunichar2 charset_table_ebcdic_cp037[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a,
    0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5,
    0x00e7, 0x00f1, 0x00a2, 0x002e, 0x003c, 0x0028, 0x002b, 0x007c,
    0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef,
    0x00ec, 0x00df, 0x0021, 0x0024, 0x002a, 0x0029, 0x003b, 0x00ac,
    0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5,
    0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
    0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf,
    0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
    0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1,
    0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
    0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4,
    0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
    0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae,
    0x005e, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc,
    0x00bd, 0x00be, 0x005b, 0x005d, 0x00af, 0x00a8, 0x00b4, 0x00d7,
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5,
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff,
    0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
    0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f,
};
1535
1536 /*
1537 * Given a wmem scope, a pointer, a length, and a translation table with
1538 * 256 entries, treat the string of bytes referred to by the pointer and
1539 * length as a string encoded using one octet per character, with octets
1540 * being mapped by the translation table to 2-byte Unicode Basic Multilingual
1541 * Plane characters (including REPLACEMENT CHARACTER), and return a
1542 * pointer to a UTF-8 string, allocated using the wmem scope.
1543 */
1544 guint8 *
get_nonascii_unichar2_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const gunichar2 table[256])1545 get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256])
1546 {
1547 wmem_strbuf_t *str;
1548
1549 str = wmem_strbuf_sized_new(scope, length+1, 0);
1550
1551 while (length > 0) {
1552 guint8 ch = *ptr;
1553
1554 wmem_strbuf_append_unichar(str, table[ch]);
1555 ptr++;
1556 length--;
1557 }
1558
1559 return (guint8 *) wmem_strbuf_finalize(str);
1560 }
1561
1562 /*
1563 * Given a wmem scope, a pointer, a length, and a string referring to an
1564 * encoding (recognized by iconv), treat the bytes referred to by the pointer
1565 * and length as a string in that encoding, and return a pointer to a UTF-8
1566 * string, allocated using the wmem scope, converted from the original
1567 * encoding having substituted REPLACEMENT CHARACTER according to the
1568 * Unicode Standard 5.22 U+FFFD Substitution for Conversion
1569 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1570 */
1571 static guint8 *
get_string_enc_iconv(wmem_allocator_t * scope,const guint8 * ptr,gint length,const gchar * encoding)1572 get_string_enc_iconv(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gchar *encoding)
1573 {
1574 GIConv cd;
1575 gsize inbytes, outbytes;
1576 gsize tempstr_size, bytes_written;
1577 gsize err;
1578 gsize max_subpart, tempinbytes;
1579 gchar *outptr, *tempstr;
1580
1581 wmem_strbuf_t *str;
1582
1583 if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1584 REPORT_DISSECTOR_BUG("Unable to allocate iconv() converter from %s to UTF-8", encoding);
1585 /* Most likely to be a programming error passing in a bad encoding
1586 * name. However, could be a issue with the iconv support on the
1587 * system running WS. GLib requires iconv/libiconv, but is it possible
1588 * that some versions don't support all common encodings? */
1589 }
1590
1591 inbytes = length;
1592 str = wmem_strbuf_sized_new(scope, length+1, 0);
1593 /* XXX: If speed becomes an issue, the faster way to do this would
1594 * involve passing the wmem_strbuf_t's string buffer directly into
1595 * g_iconv to avoid a memcpy later, but that requires changes to the
1596 * wmem_strbuf interface to have non const access to the string buffer,
1597 * and to manipulate the used length directly. */
1598 outbytes = tempstr_size = MAX(8, length);
1599 outptr = tempstr = (gchar *)g_malloc(outbytes);
1600 while (inbytes > 0) {
1601 err = g_iconv(cd, (gchar **)&ptr, &inbytes, &outptr, &outbytes);
1602 bytes_written = outptr - tempstr;
1603 wmem_strbuf_append_len(str, tempstr, bytes_written);
1604 outptr = tempstr;
1605 outbytes = tempstr_size;
1606
1607 if (err == (gsize) -1) {
1608 /* Errors */
1609 switch (errno) {
1610 case EINVAL:
1611 /* Incomplete sequence at the end, not an error */
1612 wmem_strbuf_append_unichar(str, UNREPL);
1613 inbytes = 0;
1614 break;
1615 case E2BIG:
1616 /* Not enough room (UTF-8 longer than the initial buffer),
1617 * start back at the beginning of the buffer */
1618 break;
1619 case EILSEQ:
1620 /* Find the maximal subpart of the ill-formed sequence */
1621 errno = EINVAL;
1622 for (max_subpart = 1; err == (gsize)-1 && errno == EINVAL; max_subpart++) {
1623 tempinbytes = max_subpart;
1624 err = g_iconv(cd, (gchar **)&ptr, &tempinbytes,
1625 &outptr, &outbytes);
1626 }
1627 max_subpart = MAX(1, max_subpart-1);
1628 ptr += max_subpart;
1629 inbytes -= max_subpart;
1630 wmem_strbuf_append_unichar(str, UNREPL);
1631 outptr = tempstr;
1632 outbytes = tempstr_size;
1633 break;
1634 default:
1635 /* Unexpected conversion error, unrecoverable */
1636 g_free(tempstr);
1637 g_iconv_close(cd);
1638 REPORT_DISSECTOR_BUG("Unexpected iconv() error when converting from %s to UTF-8", encoding);
1639 break;
1640 }
1641 } else {
1642 /* Otherwise err is the number of replacement characters used,
1643 * but we don't care about that. */
1644 /* If we were converting to ISO-2022-JP or some other stateful
1645 * decoder with shift sequences (e.g. EBCDIC mixed-byte), a
1646 * final call with NULL input in order to output the shift
1647 * sequence back to initial state might make sense, but not
1648 * needed for UTF-8. */
1649 }
1650 }
1651
1652 g_free(tempstr);
1653 g_iconv_close(cd);
1654 return (guint8 *) wmem_strbuf_finalize(str);
1655 }
1656
1657 /*
1658 * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1659 * by the pointer and length as a GB18030 encoded string, and return a pointer
1660 * to a UTF-8 string, allocated using the wmem scope, converted having
1661 * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1662 * 5.22 U+FFFD Substitution for Conversion.
1663 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1664 *
1665 * As expected, this will also decode GBK and GB2312 strings.
1666 */
1667 guint8 *
get_gb18030_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)1668 get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
1669 {
1670 /* iconv/libiconv support is guaranteed with GLib. Support this
1671 * via iconv, at least for now. */
1672 /* GNU libiconv has supported GB18030 (~ Windows Code page 54936) since
1673 * 2000-10-24 and version 1.4, is there is a system that compiles current
1674 * Wireshark yet its iconv only supports GBK (~ Windows Code page 936)? */
1675 const gchar *encoding = "GB18030";
1676 GIConv cd;
1677 if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1678 encoding = "GBK";
1679 /* GB18030 is backwards compatible, at worst this will mean a few
1680 * extra REPLACEMENT CHARACTERs - GBK lacks the four byte encodings
1681 * from GB18030, which are all pairs of two byte sequences
1682 * 0x[81-FE] 0x[30-39]; that trailing byte is illegal in GBK
1683 * and thus the 4 byte characters will be replaced with two
1684 * REPLACEMENT CHARACTERs. */
1685 } else {
1686 g_iconv_close(cd);
1687 }
1688 return get_string_enc_iconv(scope, ptr, length, encoding);
1689 }
1690
1691 /*
1692 * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1693 * by the pointer and length as a EUC-KR encoded string, and return a pointer
1694 * to a UTF-8 string, allocated using the wmem scope, converted having
1695 * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1696 * 5.22 U+FFFD Substitution for Conversion.
1697 * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1698 */
1699 guint8 *
get_euc_kr_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)1700 get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
1701 {
1702 /* iconv/libiconv support is guaranteed with GLib. Support this
1703 * via iconv, at least for now. */
1704 return get_string_enc_iconv(scope, ptr, length, "EUC-KR");
1705 }
1706
/* T.61 to UTF-8 conversion table from OpenLDAP project
 * https://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=blob;f=libraries/libldap/t61.c;hb=HEAD
 *
 * One entry per octet value (0x00-0xff), eight entries per row.  An
 * entry of 0 marks an octet with no mapping; get_t61_string() emits
 * REPLACEMENT CHARACTER for those (this includes octet 0x00 itself).
 * The entries for octets 0xc1-0xcf are code points in the U+03xx range
 * (0x300-0x332); get_t61_string() treats those octets as accent
 * prefixes that modify the octet that follows.
 */
static const gunichar2 t61_tab[] = {
    0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
    0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
    0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
    0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
    0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
    0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
    0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
    0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
    0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
    0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
    0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
    0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
    0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
    0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
    0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
    0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
    0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
    0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
    0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
    0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
    0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
    0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
    0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
    0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
    0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
    0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
    0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
    0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
    0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
};
1744
/* Fixed-size vectors of 16-bit code points: 16 entries (one per accent
 * octet 0xc0-0xcf) and 32 entries (one per value of the low five bits
 * of a base character).
 */
typedef gunichar2 wvec16[16];
typedef gunichar2 wvec32[32];

/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 * Indexed by the low four bits of the accent octet; a 0 entry means
 * there is no substitution for that accent.
 */
static const wvec16 accents = {
    0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
    0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
1752
/* In the following tables, base characters commented in (parentheses)
 * are not defined by T.61 but are mapped anyway since their Unicode
 * composite exists.
 */

/* Grave accented chars AEIOU (NWY)
 *
 * Composite code points for a base character that follows accent octet
 * 0xc1.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 c1_vec1 = {
    /* Upper case */
    0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
    0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
static const wvec32 c1_vec2 = {
    /* Lower case */
    0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
    0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};

/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *c1_grave[] = {
    NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
};
1771
/* Acute accented chars AEIOUYCLNRSZ (GKMPW)
 *
 * Composite code points for a base character that follows accent octet
 * 0xc2.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 c2_vec1 = {
    /* Upper case */
    0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
    0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
    0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
    0, 0xdd, 0x179, 0, 0, 0, 0, 0};
static const wvec32 c2_vec2 = {
    /* Lower case */
    0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
    0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
    0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
    0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
static const wvec32 c2_vec3 = {
    /* (AE and ae) - base octets 0xe1 and 0xf1 */
    0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f, slot 3 covers 0x60-0x7f and slot 7 covers 0xe0-0xff;
 * NULL means no vector. */
static const wvec32 *c2_acute[] = {
    NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
};
1793
/* Circumflex AEIOUYCGHJSW (Z)
 *
 * Composite code points for a base character that follows accent octet
 * 0xc3.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 c3_vec1 = {
    /* Upper case */
    0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
    0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
    0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
    0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
static const wvec32 c3_vec2 = {
    /* Lower case */
    0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
    0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
    0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
    0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *c3_circumflex[] = {
    NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
};
1810
/* Tilde AIOUN (EVY)
 *
 * Composite code points for a base character that follows accent octet
 * 0xc4.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 c4_vec1 = {
    /* Upper case */
    0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
    0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
static const wvec32 c4_vec2 = {
    /* Lower case */
    0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
    0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *c4_tilde[] = {
    NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
};
1823
/* Macron AEIOU (YG)
 *
 * Composite code points for a base character that follows accent octet
 * 0xc5.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 c5_vec1 = {
    /* Upper case */
    0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
    0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec2 = {
    /* Lower case */
    0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
    0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec3 = {
    /* (AE and ae) - base octets 0xe1 and 0xf1 */
    0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f, slot 3 covers 0x60-0x7f and slot 7 covers 0xe0-0xff;
 * NULL means no vector. */
static const wvec32 *c5_macron[] = {
    NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
};
1840
/* Breve AUG (EIO)
 *
 * Composite code points for a base character that follows accent octet
 * 0xc6.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 c6_vec1 = {
    /* Upper case */
    0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
    0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 c6_vec2 = {
    /* Lower case */
    0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
    0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *c6_breve[] = {
    NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
};
1853
/* Dot Above CEGIZ (AOBDFHMNPRSTWXY)
 *
 * Composite code points for a base character that follows accent octet
 * 0xc7.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 c7_vec1 = {
    /* Upper case */
    0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
    0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
    0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
    0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
static const wvec32 c7_vec2 = {
    /* Lower case */
    0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
    0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
    0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
    0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *c7_dotabove[] = {
    NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
};
1870
/* Diaeresis AEIOUY (HWXt)
 *
 * Composite code points for a base character that follows accent octet
 * 0xc8.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 c8_vec1 = {
    /* Upper case */
    0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
    0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
static const wvec32 c8_vec2 = {
    /* Lower case */
    0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
    0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *c8_diaeresis[] = {
    NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
};
1883
/* Ring Above AU (wy)
 *
 * Composite code points for a base character that follows accent octet
 * 0xca.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 ca_vec1 = {
    /* Upper case */
    0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ca_vec2 = {
    /* Lower case */
    0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *ca_ringabove[] = {
    NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
};
1896
/* Cedilla CGKLNRST (EDH)
 *
 * Composite code points for a base character that follows accent octet
 * 0xcb.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 cb_vec1 = {
    /* Upper case */
    0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
    0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
    0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cb_vec2 = {
    /* Lower case */
    0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
    0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
    0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *cb_cedilla[] = {
    NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
};
1911
/* Double Acute Accent OU
 *
 * Composite code points for a base character that follows accent octet
 * 0xcd.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 cd_vec1 = {
    /* Upper case */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
    0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 cd_vec2 = {
    /* Lower case */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
    0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *cd_doubleacute[] = {
    NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
};
1924
/* Ogonek AEIU (O)
 *
 * Composite code points for a base character that follows accent octet
 * 0xce.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 ce_vec1 = {
    /* Upper case */
    0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
    0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 ce_vec2 = {
    /* Lower case */
    0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
    0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *ce_ogonek[] = {
    NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
};
1937
/* Caron CDELNRSTZ (AIOUGKjH)
 *
 * Composite code points for a base character that follows accent octet
 * 0xcf.  Each vector is indexed by the low five bits of the base
 * character; a 0 entry means no composite is available.
 */
static const wvec32 cf_vec1 = {
    /* Upper case */
    0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
    0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
    0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
    0, 0, 0x17d, 0, 0, 0, 0, 0};
static const wvec32 cf_vec2 = {
    /* Lower case */
    0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
    0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
    0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
    0, 0, 0x17e, 0, 0, 0, 0, 0};
/* Selected by the top three bits of the base character: slot 2 covers
 * 0x40-0x5f and slot 3 covers 0x60-0x7f; NULL means no vector. */
static const wvec32 *cf_caron[] = {
    NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
};
1954
/* Top-level composite lookup, indexed by the low four bits of the accent
 * octet (0xc0-0xcf).  Each non-NULL entry is an eight-slot array that is
 * in turn indexed by the top three bits of the base character.  Slots
 * 0x0, 0x9 and 0xc are NULL: there are no composite tables for those
 * accent octets.
 */
static const wvec32 **cx_tab[] = {
    NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
    c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
    cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
1959
1960 guint8 *
get_t61_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)1961 get_t61_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
1962 {
1963 gint i;
1964 const guint8 *c;
1965 wmem_strbuf_t *strbuf;
1966
1967 strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
1968
1969 for (i = 0, c = ptr; i < length; c++, i++) {
1970 if (!t61_tab[*c]) {
1971 wmem_strbuf_append_unichar(strbuf, UNREPL);
1972 } else if (i < length - 1 && (*c & 0xf0) == 0xc0) {
1973 gint j = *c & 0x0f;
1974 /* If this is the end of the string, or if the base
1975 * character is just a space, treat this as a regular
1976 * spacing character.
1977 */
1978 if ((!c[1] || c[1] == 0x20) && accents[j]) {
1979 wmem_strbuf_append_unichar(strbuf, accents[j]);
1980 } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
1981 /* We have a composite mapping for this pair */
1982 (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
1983 wmem_strbuf_append_unichar(strbuf, (*cx_tab[j][c[1]>>5])[c[1]&0x1f]);
1984 } else {
1985 /* No mapping, just swap it around so the base
1986 * character comes first.
1987 */
1988 wmem_strbuf_append_unichar(strbuf, c[1]);
1989 wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1990 }
1991 c++; i++;
1992 continue;
1993 } else {
1994 wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1995 }
1996 }
1997
1998 return (guint8 *)wmem_strbuf_finalize(strbuf);
1999 }
2000
2001 /*
2002 * Editor modelines - https://www.wireshark.org/tools/modelines.html
2003 *
2004 * Local variables:
2005 * c-basic-offset: 4
2006 * tab-width: 8
2007 * indent-tabs-mode: nil
2008 * End:
2009 *
2010 * vi: set shiftwidth=4 tabstop=8 expandtab:
2011 * :indentSize=4:tabSize=8:noTabs=true:
2012 */
2013