1 /* charsets.c
2  * Routines for handling character sets
3  *
4  * Wireshark - Network traffic analyzer
5  * By Gerald Combs <gerald@wireshark.org>
6  * Copyright 1998 Gerald Combs
7  *
8  * SPDX-License-Identifier: GPL-2.0-or-later
9  */
10 
11 #include "config.h"
12 
13 #include <errno.h>
14 #include <glib.h>
15 
16 #include <epan/proto.h>
17 #include <epan/wmem_scopes.h>
18 
19 #include <wsutil/pint.h>
20 #include <wsutil/unicode-utils.h>
21 
22 #include "charsets.h"
23 
24 /* REPLACEMENT CHARACTER */
25 #define UNREPL 0xFFFD
26 
27 /*
28  * Wikipedia's "Character encoding" template, giving a pile of character
29  * encodings and Wikipedia pages for them:
30  *
31  *    http://en.wikipedia.org/wiki/Template:Character_encoding
32  *
33  * Unicode character encoding model:
34  *
35  *    https://www.unicode.org/reports/tr17/
36  *
37  * International Components for Unicode character set mapping tables:
38  *
39  *    http://site.icu-project.org/charts/charset
40  *
41  * MSDN information on code pages:
42  *
43  *    https://docs.microsoft.com/en-us/windows/win32/intl/code-pages
44  *
45  * ASCII-based code pages, from IBM:
46  *
47  *    http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html
48  *
49  * EBCDIC code pages, from IBM:
50  *
51  *    http://www-03.ibm.com/systems/i/software/globalization/codepages.html
52  *
53  * The IBM pages are no longer available; the versions archived on the
54  * Wayback Machine are, but the links to the PDF and text versions of
55  * the code pages don't all work (do *any* work?).
56  */
57 
58 /*
59  * Given a wmem scope, a pointer, and a length, treat the string of bytes
60  * referred to by the pointer and length as an ASCII string, with all bytes
61  * with the high-order bit set being invalid, and return a pointer to a
62  * UTF-8 string, allocated using the wmem scope.
63  *
64  * Octets with the highest bit set will be converted to the Unicode
65  * REPLACEMENT CHARACTER.
66  */
67 guint8 *
get_ascii_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)68 get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
69 {
70     wmem_strbuf_t *str;
71 
72     str = wmem_strbuf_sized_new(scope, length+1, 0);
73 
74     while (length > 0) {
75         guint8 ch = *ptr;
76 
77         if (ch < 0x80)
78             wmem_strbuf_append_c(str, ch);
79         else
80             wmem_strbuf_append_unichar(str, UNREPL);
81         ptr++;
82         length--;
83     }
84 
85     return (guint8 *) wmem_strbuf_finalize(str);
86 }
87 
88 /*
89  * Given a wmem scope, a pointer, and a length, treat the string of bytes
90  * referred to by the pointer and length as a UTF-8 string, and return a
91  * pointer to a UTF-8 string, allocated using the wmem scope, with all
92  * ill-formed sequences replaced with the Unicode REPLACEMENT CHARACTER
93  * according to the recommended "best practices" given in the Unicode
94  * Standard and specified by W3C/WHATWG.
95  *
96  * Note that in conformance with the Unicode Standard, this treats three
97  * byte sequences corresponding to UTF-16 surrogate halves (paired or unpaired)
98  * and two byte overlong encodings of 7-bit ASCII characters as invalid and
99  * substitutes REPLACEMENT CHARACTER for them. Explicit support for nonstandard
100  * derivative encoding formats (e.g. CESU-8, Java Modified UTF-8, WTF-8) could
101  * be added later.
102  */
103 guint8 *
get_utf_8_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)104 get_utf_8_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
105 {
106     wmem_strbuf_t *str;
107     guint8 ch;
108     const guint8 *prev;
109 
110     str = wmem_strbuf_sized_new(scope, length+1, 0);
111 
112     /* See the Unicode Standard conformance chapter at
113      * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf especially
114      * Table 3-7 "Well-Formed UTF-8 Byte Sequences" and
115      * U+FFFD Substitution of Maximal Subparts. */
116     while (length > 0) {
117         gsize unichar_len;
118         ch = *ptr;
119 
120         if (ch < 0x80) {
121             wmem_strbuf_append_c(str, ch);
122         } else if (ch < 0xc2 || ch > 0xf4) {
123             wmem_strbuf_append_unichar(str, UNREPL);
124         } else {
125             prev = ptr;
126             if (ch < 0xe0) { /* 110xxxxx, 2 byte char */
127                 unichar_len = 2;
128             } else if (ch < 0xf0) { /* 1110xxxx, 3 byte char */
129                 unichar_len = 3;
130                 ptr++;
131                 length--;
132                 if (length < 1) {
133                     wmem_strbuf_append_unichar(str, UNREPL);
134                     continue;
135                 }
136                 switch (ch) {
137                     case 0xe0:
138                         if (*ptr < 0xa0 || *ptr > 0xbf) {
139                             wmem_strbuf_append_unichar(str, UNREPL);
140                             continue;
141                         }
142                         break;
143                     case 0xed:
144                         if (*ptr < 0x80 || *ptr > 0x9f) {
145                             wmem_strbuf_append_unichar(str, UNREPL);
146                             continue;
147                         }
148                         break;
149                     default:
150                         if (*ptr < 0x80 || *ptr > 0xbf) {
151                             wmem_strbuf_append_unichar(str, UNREPL);
152                             continue;
153                         }
154                 }
155             } else { /* 11110xxx, 4 byte char - > 0xf4 excluded above */
156                 unichar_len = 4;
157                 ptr++;
158                 length--;
159                 if (length < 1) {
160                     wmem_strbuf_append_unichar(str, UNREPL);
161                     continue;
162                 }
163                 switch (ch) {
164                     case 0xf0:
165                         if (*ptr < 0x90 || *ptr > 0xbf) {
166                             wmem_strbuf_append_unichar(str, UNREPL);
167                             continue;
168                         }
169                         break;
170                     case 0xf4:
171                         if (*ptr < 0x80 || *ptr > 0x8f) {
172                             wmem_strbuf_append_unichar(str, UNREPL);
173                             continue;
174                         }
175                         break;
176                     default:
177                         if (*ptr < 0x80 || *ptr > 0xbf) {
178                             wmem_strbuf_append_unichar(str, UNREPL);
179                             continue;
180                         }
181                 }
182                 ptr++;
183                 length--;
184                 if (length < 1) {
185                     wmem_strbuf_append_unichar(str, UNREPL);
186                     continue;
187                 }
188                 if (*ptr < 0x80 || *ptr > 0xbf) {
189                     wmem_strbuf_append_unichar(str, UNREPL);
190                     continue;
191                 }
192             }
193 
194             ptr++;
195             length--;
196             if (length < 1) {
197                 wmem_strbuf_append_unichar(str, UNREPL);
198                 continue;
199             }
200             if (*ptr < 0x80 || *ptr > 0xbf) {
201                 wmem_strbuf_append_unichar(str, UNREPL);
202                 continue;
203             } else {
204                 wmem_strbuf_append_len(str, prev, unichar_len);
205             }
206         }
207 
208         ptr++;
209         length--;
210     }
211 
212     return (guint8 *) wmem_strbuf_finalize(str);
213 }
214 
215 /*
216  * ISO 646 "Basic code table".
217  */
218 const gunichar2 charset_table_iso_646_basic[0x80] = {
219     0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,        /* 0x00 -      */
220     0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,        /*      - 0x0F */
221     0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017,        /* 0x10 -      */
222     0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,        /*      - 0x1F */
223     0x0020, 0x0021, 0x0022, UNREPL, UNREPL, 0x0025, 0x0026, 0x0027,        /* 0x20 -      */
224     0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,        /*      - 0x2F */
225     0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,        /* 0x30 -      */
226     0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,        /*      - 0x3F */
227     UNREPL, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,        /* 0x40 -      */
228     0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,        /*      - 0x4F */
229     0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,        /* 0x50 -      */
230     0x0058, 0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, 0x005f,        /*      - 0x5F */
231     UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,        /* 0x60 -      */
232     0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,        /*      - 0x6F */
233     0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,        /* 0x70 -      */
234     0x0078, 0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, 0x007f,        /*      - 0x7F */
235 };
236 
237 /*
238  * Given a wmem scope, a pointer, a length, and a translation table,
239  * treat the string of bytes referred to by the pointer and length as a
240  * string encoded using one octet per character, with octets with the
241  * high-order bit clear being mapped by the translation table to 2-byte
242  * Unicode Basic Multilingual Plane characters (including REPLACEMENT
243  * CHARACTER) and octets with the high-order bit set being mapped to
244  * REPLACEMENT CHARACTER, and return a pointer to a UTF-8 string,
245  * allocated using the wmem scope.
246  */
247 guint8 *
get_iso_646_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const gunichar2 table[0x80])248 get_iso_646_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80])
249 {
250     wmem_strbuf_t *str;
251 
252     str = wmem_strbuf_sized_new(scope, length+1, 0);
253 
254     while (length > 0) {
255         guint8 ch = *ptr;
256 
257         if (ch < 0x80)
258             wmem_strbuf_append_unichar(str, table[ch]);
259         else
260             wmem_strbuf_append_unichar(str, UNREPL);
261         ptr++;
262         length--;
263     }
264 
265     return (guint8 *) wmem_strbuf_finalize(str);
266 }
267 
268 /*
269  * Given a wmem scope, a pointer, and a length, treat the string of bytes
270  * referred to by the pointer and length as an ISO 8859/1 string, and
271  * return a pointer to a UTF-8 string, allocated using the wmem scope.
272  */
273 guint8 *
get_8859_1_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)274 get_8859_1_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
275 {
276     wmem_strbuf_t *str;
277 
278     str = wmem_strbuf_sized_new(scope, length+1, 0);
279 
280     while (length > 0) {
281         guint8 ch = *ptr;
282 
283         if (ch < 0x80)
284             wmem_strbuf_append_c(str, ch);
285         else {
286             /*
287              * Note: we assume here that the code points
288              * 0x80-0x9F are used for C1 control characters,
289              * and thus have the same value as the corresponding
290              * Unicode code points.
291              */
292             wmem_strbuf_append_unichar(str, ch);
293         }
294         ptr++;
295         length--;
296     }
297 
298     return (guint8 *) wmem_strbuf_finalize(str);
299 }
300 
301 /*
302  * Translation tables that map the upper 128 code points in single-byte
303  * "extended ASCII" character encodings to Unicode code points in the
304  * Basic Multilingual Plane.
305  */
306 
307 /* ISO-8859-2 (https://en.wikipedia.org/wiki/ISO/IEC_8859-2#Code_page_layout) */
308 const gunichar2 charset_table_iso_8859_2[0x80] = {
309     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
310     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
311     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
312     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
313     0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7,        /* 0xA0 -      */
314     0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b,        /*      - 0xAF */
315     0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7,        /* 0xB0 -      */
316     0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c,        /*      - 0xBF */
317     0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,        /* 0xC0 -      */
318     0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,        /*      - 0xCF */
319     0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,        /* 0xD0 -      */
320     0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,        /*      - 0xDF */
321     0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,        /* 0xE0 -      */
322     0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,        /*      - 0xEF */
323     0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,        /* 0xF0 -      */
324     0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9         /*      - 0xFF */
325 };
326 
327 /* generated by ../tools/make_charset_ISO-8859-3 */
328 const gunichar2 charset_table_iso_8859_3[0x80] = {
329     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
330     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
331     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
332     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
333     0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, UNREPL, 0x0124, 0x00a7,        /* 0xA0 -      */
334     0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, UNREPL, 0x017b,        /*      - 0xAF */
335     0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7,        /* 0xB0 -      */
336     0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, UNREPL, 0x017c,        /*      - 0xBF */
337     0x00c0, 0x00c1, 0x00c2, UNREPL, 0x00c4, 0x010a, 0x0108, 0x00c7,        /* 0xC0 -      */
338     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
339     UNREPL, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7,        /* 0xD0 -      */
340     0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df,        /*      - 0xDF */
341     0x00e0, 0x00e1, 0x00e2, UNREPL, 0x00e4, 0x010b, 0x0109, 0x00e7,        /* 0xE0 -      */
342     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
343     UNREPL, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7,        /* 0xF0 -      */
344     0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9,        /*      - 0xFF */
345 };
346 
347 /* generated by ../tools/make_charset_ISO-8859-4 */
348 const gunichar2 charset_table_iso_8859_4[0x80] = {
349     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
350     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
351     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
352     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
353     0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7,        /* 0xA0 -      */
354     0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af,        /*      - 0xAF */
355     0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7,        /* 0xB0 -      */
356     0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b,        /*      - 0xBF */
357     0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,        /* 0xC0 -      */
358     0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a,        /*      - 0xCF */
359     0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
360     0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df,        /*      - 0xDF */
361     0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,        /* 0xE0 -      */
362     0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b,        /*      - 0xEF */
363     0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
364     0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9,        /*      - 0xFF */
365 };
366 
367 /* ISO-8859-5 (https://en.wikipedia.org/wiki/ISO/IEC_8859-5#Code_page_layout) */
368 const gunichar2 charset_table_iso_8859_5[0x80] = {
369     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
370     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
371     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
372     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
373     0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,        /* 0xA0 -      */
374     0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f,        /*      - 0xAF */
375     0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0xB0 -      */
376     0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0xBF */
377     0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0xC0 -      */
378     0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0xCF */
379     0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xD0 -      */
380     0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xDF */
381     0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xE0 -      */
382     0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xEF */
383     0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457,        /* 0xF0 -      */
384     0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f         /*      - 0xFF */
385 };
386 
387 /* generated by ../tools/make_charset_ISO-8859-6 */
388 const gunichar2 charset_table_iso_8859_6[0x80] = {
389     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
390     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
391     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
392     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
393     0x00a0, UNREPL, UNREPL, UNREPL, 0x00a4, UNREPL, UNREPL, UNREPL,        /* 0xA0 -      */
394     UNREPL, UNREPL, UNREPL, UNREPL, 0x060c, 0x00ad, UNREPL, UNREPL,        /*      - 0xAF */
395     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xB0 -      */
396     UNREPL, UNREPL, UNREPL, 0x061b, UNREPL, UNREPL, UNREPL, 0x061f,        /*      - 0xBF */
397     UNREPL, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627,        /* 0xC0 -      */
398     0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f,        /*      - 0xCF */
399     0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637,        /* 0xD0 -      */
400     0x0638, 0x0639, 0x063a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xDF */
401     0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647,        /* 0xE0 -      */
402     0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f,        /*      - 0xEF */
403     0x0650, 0x0651, 0x0652, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xF0 -      */
404     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xFF */
405 };
406 
407 /* generated by ../tools/make_charset_ISO-8859-7 */
408 const gunichar2 charset_table_iso_8859_7[0x80] = {
409     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
410     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
411     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
412     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
413     0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7,        /* 0xA0 -      */
414     0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, UNREPL, 0x2015,        /*      - 0xAF */
415     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7,        /* 0xB0 -      */
416     0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f,        /*      - 0xBF */
417     0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,        /* 0xC0 -      */
418     0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,        /*      - 0xCF */
419     0x03a0, 0x03a1, UNREPL, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,        /* 0xD0 -      */
420     0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af,        /*      - 0xDF */
421     0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7,        /* 0xE0 -      */
422     0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf,        /*      - 0xEF */
423     0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7,        /* 0xF0 -      */
424     0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, UNREPL,        /*      - 0xFF */
425 };
426 
427 /* generated by ../tools/make_charset_ISO-8859-8 */
428 const gunichar2 charset_table_iso_8859_8[0x80] = {
429     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
430     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
431     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
432     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
433     0x00a0, UNREPL, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
434     0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
435     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
436     0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, UNREPL,        /*      - 0xBF */
437     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xC0 -      */
438     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xCF */
439     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,        /* 0xD0 -      */
440     UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, 0x2017,        /*      - 0xDF */
441     0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7,        /* 0xE0 -      */
442     0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df,        /*      - 0xEF */
443     0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7,        /* 0xF0 -      */
444     0x05e8, 0x05e9, 0x05ea, UNREPL, UNREPL, 0x200e, 0x200f, UNREPL,        /*      - 0xFF */
445 };
446 
447 /* ISO-8859-9 (https://en.wikipedia.org/wiki/ISO/IEC_8859-9#Code_page_layout) */
448 const gunichar2 charset_table_iso_8859_9[0x80] = {
449     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
450     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
451     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
452     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
453     0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
454     0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
455     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
456     0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,        /*      - 0xBF */
457     0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
458     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
459     0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
460     0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df,        /*      - 0xDF */
461     0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
462     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
463     0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
464     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff         /*      - 0xFF */
465 };
466 
467 /* generated by ../tools/make_charset_ISO-8859-10 */
468 const gunichar2 charset_table_iso_8859_10[0x80] = {
469     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
470     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
471     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
472     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
473     0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7,        /* 0xA0 -      */
474     0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a,        /*      - 0xAF */
475     0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7,        /* 0xB0 -      */
476     0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b,        /*      - 0xBF */
477     0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e,        /* 0xC0 -      */
478     0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
479     0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168,        /* 0xD0 -      */
480     0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
481     0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f,        /* 0xE0 -      */
482     0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
483     0x00f0, 0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169,        /* 0xF0 -      */
484     0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138,        /*      - 0xFF */
485 };
486 
487 /* generated by ../tools/make_charset_ISO-8859-11 */
488 const gunichar2 charset_table_iso_8859_11[0x80] = {
489     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
490     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
491     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
492     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
493     0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07,        /* 0xA0 -      */
494     0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f,        /*      - 0xAF */
495     0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17,        /* 0xB0 -      */
496     0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f,        /*      - 0xBF */
497     0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27,        /* 0xC0 -      */
498     0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f,        /*      - 0xCF */
499     0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37,        /* 0xD0 -      */
500     0x0e38, 0x0e39, 0x0e3a, UNREPL, UNREPL, UNREPL, UNREPL, 0x0e3f,        /*      - 0xDF */
501     0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47,        /* 0xE0 -      */
502     0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f,        /*      - 0xEF */
503     0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57,        /* 0xF0 -      */
504     0x0e58, 0x0e59, 0x0e5a, 0x0e5b, UNREPL, UNREPL, UNREPL, UNREPL,        /*      - 0xFF */
505 };
506 
507 /* generated by ../tools/make_charset_ISO-8859-13 */
508 const gunichar2 charset_table_iso_8859_13[0x80] = {
509     0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
510     0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
511     0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
512     0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
513     0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7,        /* 0xA0 -      */
514     0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6,        /*      - 0xAF */
515     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
516     0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6,        /*      - 0xBF */
517     0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112,        /* 0xC0 -      */
518     0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b,        /*      - 0xCF */
519     0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
520     0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df,        /*      - 0xDF */
521     0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113,        /* 0xE0 -      */
522     0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c,        /*      - 0xEF */
523     0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
524     0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019,        /*      - 0xFF */
525 };
526 
/*
 * ISO-8859-14 translation table for octets with the high-order bit set.
 *
 * Entry i holds the 16-bit Unicode code point for octet 0x80 + i; the
 * comment at the end of each row gives the range of octets that row covers.
 * Octets 0x80 through 0x9F map to the same-numbered code points.
 */
/* generated by ../tools/make_charset_ISO-8859-14 */
const gunichar2 charset_table_iso_8859_14[0x80] = {
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
    0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7,        /* 0xA0 -      */
    0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178,        /*      - 0xAF */
    0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56,        /* 0xB0 -      */
    0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61,        /*      - 0xBF */
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
    0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a,        /* 0xD0 -      */
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df,        /*      - 0xDF */
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
    0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b,        /* 0xF0 -      */
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff,        /*      - 0xFF */
};
546 
/*
 * ISO-8859-15 translation table for octets with the high-order bit set.
 *
 * Entry i holds the 16-bit Unicode code point for octet 0x80 + i; the
 * comment at the end of each row gives the range of octets that row covers.
 * Most entries map to the same-numbered code point; the exceptions are
 * 0xA4, 0xA6, 0xA8, 0xB4, 0xB8, 0xBC, 0xBD and 0xBE.
 */
/* generated by ../tools/make_charset_ISO-8859-15 */
const gunichar2 charset_table_iso_8859_15[0x80] = {
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
    0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7,        /* 0xA0 -      */
    0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
    0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf,        /*      - 0xBF */
    0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
    0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
    0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
    0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,        /*      - 0xFF */
};
566 
/*
 * ISO-8859-16 translation table for octets with the high-order bit set.
 *
 * Entry i holds the 16-bit Unicode code point for octet 0x80 + i; the
 * comment at the end of each row gives the range of octets that row covers.
 * Octets 0x80 through 0x9F map to the same-numbered code points.
 */
/* generated by ../tools/make_charset_ISO-8859-16 */
const gunichar2 charset_table_iso_8859_16[0x80] = {
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,        /* 0x80 -      */
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,        /*      - 0x8F */
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,        /* 0x90 -      */
    0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,        /*      - 0x9F */
    0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7,        /* 0xA0 -      */
    0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b,        /*      - 0xAF */
    0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7,        /* 0xB0 -      */
    0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c,        /*      - 0xBF */
    0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7,        /* 0xC0 -      */
    0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
    0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a,        /* 0xD0 -      */
    0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df,        /*      - 0xDF */
    0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7,        /* 0xE0 -      */
    0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
    0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b,        /* 0xF0 -      */
    0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff,        /*      - 0xFF */
};
586 
587 /*
588  * Windows-1250
589  *
590  * See:
591  *     httpss://en.wikipedia.org/wiki/Windows-1250)
592  *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1250.TXT
593  */
594 const gunichar2 charset_table_cp1250[0x80] = {
595     0x20ac, UNREPL, 0x201a, UNREPL, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
596     UNREPL, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,        /*      - 0x8F */
597     UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
598     UNREPL, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,        /*      - 0x9F */
599     0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,        /* 0xA0 -      */
600     0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,        /*      - 0xAF */
601     0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
602     0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,        /*      - 0xBF */
603     0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,        /* 0xC0 -      */
604     0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,        /*      - 0xCF */
605     0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,        /* 0xD0 -      */
606     0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,        /*      - 0xDF */
607     0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,        /* 0xE0 -      */
608     0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,        /*      - 0xEF */
609     0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,        /* 0xF0 -      */
610     0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,        /*      - 0xFF */
611 };
612 
613 /*
614  * Windows-1251
615  *
616  * See:
617  *     https://en.wikipedia.org/wiki/Windows-1251
618  *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1251.TXT
619  */
620 const gunichar2 charset_table_cp1251[0x80] = {
621     0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
622     0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040B, 0x040f,        /*      - 0x8F */
623     0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
624     UNREPL, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,        /*      - 0x9F */
625     0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,        /* 0xA0 -      */
626     0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,        /*      - 0xAF */
627     0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
628     0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,        /*      - 0xBF */
629     0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0xC0 -      */
630     0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0xCF */
631     0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0xD0 -      */
632     0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0xDF */
633     0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xE0 -      */
634     0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xEF */
635     0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xF0 -      */
636     0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xFF */
637 };
638 
639 /*
640  * Windows-1252
641  *
642  * See:
643  *     https://en.wikipedia.org/wiki/Windows-1252
644  *     https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
645  */
646 const gunichar2 charset_table_cp1252[0x80] = {
647     0x20ac, UNREPL, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,        /* 0x80 -      */
648     0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, UNREPL, 0x0172, UNREPL,        /*      - 0x8F */
649     UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,        /* 0x90 -      */
650     0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, UNREPL, 0x0173, 0x0178,        /*      - 0x9F */
651     0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,        /* 0xA0 -      */
652     0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,        /*      - 0xAF */
653     0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,        /* 0xB0 -      */
654     0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,        /*      - 0xBF */
655     0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,        /* 0xC0 -      */
656     0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,        /*      - 0xCF */
657     0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,        /* 0xD0 -      */
658     0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,        /*      - 0xDF */
659     0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,        /* 0xE0 -      */
660     0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,        /*      - 0xEF */
661     0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,        /* 0xF0 -      */
662     0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,        /*      - 0xFF */
663 };
664 
/*
 * Mac OS Roman translation table for octets with the high-order bit set.
 *
 * Entry i holds the 16-bit Unicode code point for octet 0x80 + i; the
 * comment at the end of each row gives the range of octets that row covers.
 */
/* generated by ./make_charset_table MACROMAN */
/* That's "MacRoman", not "Macro Man" (faster than a speeding recursive expansion!) */
const gunichar2 charset_table_mac_roman[0x80] = {
    0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,        /* 0x80 -      */
    0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,        /*      - 0x8F */
    0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,        /* 0x90 -      */
    0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,        /*      - 0x9F */
    0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,        /* 0xA0 -      */
    0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,        /*      - 0xAF */
    0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,        /* 0xB0 -      */
    0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8,        /*      - 0xBF */
    0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,        /* 0xC0 -      */
    0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,        /*      - 0xCF */
    0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,        /* 0xD0 -      */
    0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02,        /*      - 0xDF */
    0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,        /* 0xE0 -      */
    0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,        /*      - 0xEF */
    0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,        /* 0xF0 -      */
    0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,        /*      - 0xFF */
};
685 
/*
 * Code page 437 translation table for octets with the high-order bit set.
 *
 * Entry i holds the 16-bit Unicode code point for octet 0x80 + i; the
 * comment at the end of each row gives the range of octets that row covers.
 */
/* generated by ./make_charset_table CP437 */
const gunichar2 charset_table_cp437[0x80] = {
    0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,        /* 0x80 -      */
    0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,        /*      - 0x8F */
    0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,        /* 0x90 -      */
    0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192,        /*      - 0x9F */
    0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,        /* 0xA0 -      */
    0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,        /*      - 0xAF */
    0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,        /* 0xB0 -      */
    0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,        /*      - 0xBF */
    0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,        /* 0xC0 -      */
    0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,        /*      - 0xCF */
    0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,        /* 0xD0 -      */
    0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,        /*      - 0xDF */
    0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4,        /* 0xE0 -      */
    0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229,        /*      - 0xEF */
    0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248,        /* 0xF0 -      */
    0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0,        /*      - 0xFF */
};
705 
706 /*
707  * CP855
708  *
709  * See
710  *     https://en.wikipedia.org/wiki/CP855
711  *     https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP855.TXT
712  *
713  * XXX - this doesn't have the graphics for 0x00 through 0x1F shown
714  * on the Wikipedia page, but not in the Microsoft mapping file;
715  * that would require a 256-code-point mapping table.  (Are those
716  * positions used for the same graphics on all code pages - the PC
717  * graphics set, or whatever it's called?)
718  */
719 const gunichar2 charset_table_cp855[0x80] = {
720     0x0452, 0x0402, 0x0453, 0x0403, 0x0451, 0x0401, 0x0454, 0x0404,        /* 0x80 -      */
721     0x0455, 0x0405, 0x0456, 0x0406, 0x0457, 0x0407, 0x0458, 0x0408,        /*      - 0x8F */
722     0x0459, 0x0409, 0x045a, 0x040a, 0x045b, 0x040b, 0x045c, 0x040c,        /* 0x90 -      */
723     0x045e, 0x040e, 0x045f, 0x040f, 0x044e, 0x042e, 0x044a, 0x042a,        /*      - 0x9F */
724     0x0430, 0x0410, 0x0431, 0x0411, 0x0446, 0x0426, 0x0434, 0x0414,        /* 0xA0 -      */
725     0x0435, 0x0415, 0x0444, 0x0424, 0x0433, 0x0413, 0x00ab, 0x00bb,        /*      - 0xAF */
726     0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0445, 0x0425, 0x0438,        /* 0xB0 -      */
727     0x0418, 0x2563, 0x2551, 0x2557, 0x2550, 0x0439, 0x0419, 0x2510,        /*      - 0xBF */
728     0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x043a, 0x041a,        /* 0xC0 -      */
729     0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,        /*      - 0xCF */
730     0x043b, 0x041b, 0x043c, 0x041c, 0x043d, 0x041d, 0x043e, 0x041e,        /* 0xD0 -      */
731     0x043f, 0x2518, 0x250c, 0x2588, 0x2584, 0x041f, 0x044f, 0x2580,        /*      - 0xDF */
732     0x042f, 0x0440, 0x0420, 0x0441, 0x0421, 0x0442, 0x0422, 0x0443,        /* 0xE0 -      */
733     0x0423, 0x0436, 0x0416, 0x0432, 0x0412, 0x044c, 0x042c, 0x2116,        /*      - 0xEF */
734     0x00ad, 0x044b, 0x042b, 0x0437, 0x0417, 0x0448, 0x0428, 0x044d,        /* 0xF0 -      */
735     0x042d, 0x0449, 0x0429, 0x0447, 0x0427, 0x00a7, 0x25a0, 0x00a0,        /*      - 0xFF */
736 };
737 
738 /*
739  * CP866
740  *
741  * See:
742  *     https://en.wikipedia.org/wiki/CP866
743  *     https://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP866.TXT
744  */
745 const gunichar2 charset_table_cp866[0x80] = {
746     0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,        /* 0x80 -      */
747     0x0418, 0x0419, 0x041A, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,        /*      - 0x8F */
748     0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,        /* 0x90 -      */
749     0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,        /*      - 0x9F */
750     0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,        /* 0xA0 -      */
751     0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,        /*      - 0xAF */
752     0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,        /* 0xB0 -      */
753     0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510,        /*      - 0xBF */
754     0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f,        /* 0xC0 -      */
755     0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567,        /*      - 0xCF */
756     0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b,        /* 0xD0 -      */
757     0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580,        /*      - 0xDF */
758     0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,        /* 0xE0 -      */
759     0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,        /*      - 0xEF */
760     0x0401, 0x0451, 0x0404, 0x0454, 0x0407, 0x0457, 0x040e, 0x045e,        /* 0xF0 -      */
761     0x00b0, 0x2219, 0x00b7, 0x221a, 0x2216, 0x00a4, 0x25a0, 0x00a0,        /*      - 0xFF */
762 };
763 
764 /*
765  * Given a wmem scope, a pointer, a length, and a translation table with
766  * 128 entries, treat the string of bytes referred to by the pointer and
767  * length as a string encoded using one octet per character, with octets
768  * with the high-order bit clear being ASCII and octets with the high-order
769  * bit set being mapped by the translation table to 2-byte Unicode Basic
770  * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
771  * return a pointer to a UTF-8 string, allocated using the wmem scope.
772  */
773 guint8 *
get_unichar2_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const gunichar2 table[0x80])774 get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80])
775 {
776     wmem_strbuf_t *str;
777 
778     str = wmem_strbuf_sized_new(scope, length+1, 0);
779 
780     while (length > 0) {
781         guint8 ch = *ptr;
782 
783         if (ch < 0x80)
784             wmem_strbuf_append_c(str, ch);
785         else
786             wmem_strbuf_append_unichar(str, table[ch-0x80]);
787         ptr++;
788         length--;
789     }
790 
791     return (guint8 *) wmem_strbuf_finalize(str);
792 }
793 
794 /*
795  * Given a wmem scope, a pointer, and a length, treat the string of bytes
796  * referred to by the pointer and length as a UCS-2 encoded string
797  * containing characters from the Basic Multilingual Plane (plane 0) of
798  * Unicode, and return a pointer to a UTF-8 string, allocated with the
799  * wmem scope.
800  *
801  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
802  *
803  * Specify length in bytes.
804  *
805  * XXX - should map lead and trail surrogate values to REPLACEMENT
806  * CHARACTERs (0xFFFD)?
807  * XXX - if there are an odd number of bytes, should put a
808  * REPLACEMENT CHARACTER at the end.
809  */
810 guint8 *
get_ucs_2_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const guint encoding)811 get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
812 {
813     gunichar2      uchar;
814     gint           i;       /* Byte counter for string */
815     wmem_strbuf_t *strbuf;
816 
817     strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
818 
819     for(i = 0; i + 1 < length; i += 2) {
820         if (encoding == ENC_BIG_ENDIAN){
821             uchar = pntoh16(ptr + i);
822         }else{
823             uchar = pletoh16(ptr + i);
824         }
825         wmem_strbuf_append_unichar(strbuf, uchar);
826     }
827 
828     /*
829      * XXX - if i < length, this means we were handed an odd
830      * number of bytes, so we're not a valid UCS-2 string.
831      */
832     return (guint8 *) wmem_strbuf_finalize(strbuf);
833 }
834 
835 /*
836  * Given a wmem scope, a pointer, and a length, treat the string of bytes
837  * referred to by the pointer and length as a UTF-16 encoded string, and
838  * return a pointer to a UTF-8 string, allocated with the wmem scope.
839  *
840  * See RFC 2781 section 2.2.
841  *
842  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
843  *
844  * Specify length in bytes.
845  *
846  * XXX - should map invalid Unicode characters to REPLACEMENT CHARACTERs.
847  */
848 guint8 *
get_utf_16_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const guint encoding)849 get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
850 {
851     wmem_strbuf_t *strbuf;
852     gunichar2      uchar2, lead_surrogate;
853     gunichar       uchar;
854     gint           i;       /* Byte counter for string */
855 
856     strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
857 
858     for(i = 0; i + 1 < length; i += 2) {
859         if (encoding == ENC_BIG_ENDIAN)
860             uchar2 = pntoh16(ptr + i);
861         else
862             uchar2 = pletoh16(ptr + i);
863 
864         if (IS_LEAD_SURROGATE(uchar2)) {
865             /*
866              * Lead surrogate.  Must be followed by
867              * a trail surrogate.
868              */
869             i += 2;
870             if (i + 1 >= length) {
871                 /*
872                  * Oops, string ends with a lead surrogate.
873                  *
874                  * Insert a REPLACEMENT CHARACTER to mark the error,
875                  * and quit.
876                  */
877                 wmem_strbuf_append_unichar(strbuf, UNREPL);
878                 break;
879             }
880             lead_surrogate = uchar2;
881             if (encoding == ENC_BIG_ENDIAN)
882                 uchar2 = pntoh16(ptr + i);
883             else
884                 uchar2 = pletoh16(ptr + i);
885             if (IS_TRAIL_SURROGATE(uchar2)) {
886                 /* Trail surrogate. */
887                 uchar = SURROGATE_VALUE(lead_surrogate, uchar2);
888                 wmem_strbuf_append_unichar(strbuf, uchar);
889             } else {
890                 /*
891                  * Not a trail surrogate.
892                  *
893                  * Insert a REPLACEMENT CHARACTER to mark the error,
894                  * and continue;
895                  */
896                 wmem_strbuf_append_unichar(strbuf, UNREPL);
897             }
898         } else {
899             if (IS_TRAIL_SURROGATE(uchar2)) {
900                 /*
901                  * Trail surrogate without a preceding
902                  * lead surrogate.
903                  *
904                  * Insert a REPLACEMENT CHARACTER to mark the error,
905                  * and continue;
906                  */
907                 wmem_strbuf_append_unichar(strbuf, UNREPL);
908             } else {
909                 /*
910                  * Non-surrogate; just append it.
911                  */
912                 wmem_strbuf_append_unichar(strbuf, uchar2);
913             }
914         }
915     }
916 
917     /*
918      * If i < length, this means we were handed an odd number of bytes,
919      * so we're not a valid UTF-16 string; insert a REPLACEMENT CHARACTER
920      * to mark the error.
921      */
922     if (i < length)
923         wmem_strbuf_append_unichar(strbuf, UNREPL);
924     return (guint8 *) wmem_strbuf_finalize(strbuf);
925 }
926 
927 /*
928  * Given a wmem scope, a pointer, and a length, treat the string of bytes
929  * referred to by the pointer and length as a UCS-4 encoded string, and
930  * return a pointer to a UTF-8 string, allocated with the wmem scope.
931  *
932  * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
933  *
934  * Specify length in bytes
935  *
936  * XXX - should map lead and trail surrogate values to a "substitute"
937  * UTF-8 character?
938  * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
939  * XXX - if the number of bytes isn't a multiple of 4, should put a
940  * REPLACEMENT CHARACTER at the end.
941  */
942 guint8 *
get_ucs_4_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const guint encoding)943 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
944 {
945     gunichar       uchar;
946     gint           i;       /* Byte counter for string */
947     wmem_strbuf_t *strbuf;
948 
949     strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
950 
951     for(i = 0; i + 3 < length; i += 4) {
952         if (encoding == ENC_BIG_ENDIAN)
953             uchar = pntoh32(ptr + i);
954         else
955             uchar = pletoh32(ptr + i);
956 
957         wmem_strbuf_append_unichar(strbuf, uchar);
958     }
959 
960     /*
961      * XXX - if i < length, this means we were handed a number
962      * of bytes that's not a multiple of 4, so we're not a valid
963      * UCS-4 string.
964      */
965     return (guint8 *)wmem_strbuf_finalize(strbuf);
966 }
967 
968 /*
969  * FROM GNOKII
970  * gsm-encoding.c
971  * gsm-sms.c
972  */
973 
974 /* ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet */
975 static const gunichar2 gsm_default_alphabet[0x80] = {
976     '@',   0xa3,  '$',   0xa5,  0xe8,  0xe9,  0xf9,  0xec,
977     0xf2,  0xc7,  '\n',  0xd8,  0xf8,  '\r',  0xc5,  0xe5,
978     0x394, '_',   0x3a6, 0x393, 0x39b, 0x3a9, 0x3a0, 0x3a8,
979     0x3a3, 0x398, 0x39e, 0xa0,  0xc6,  0xe6,  0xdf,  0xc9,
980     ' ',   '!',   '\"',  '#',   0xa4,  '%',   '&',   '\'',
981     '(',   ')',   '*',   '+',   ',',   '-',   '.',   '/',
982     '0',   '1',   '2',   '3',   '4',   '5',   '6',   '7',
983     '8',   '9',   ':',   ';',   '<',   '=',   '>',   '?',
984     0xa1,  'A',   'B',   'C',   'D',   'E',   'F',   'G',
985     'H',   'I',   'J',   'K',   'L',   'M',   'N',   'O',
986     'P',   'Q',   'R',   'S',   'T',   'U',   'V',   'W',
987     'X',   'Y',   'Z',   0xc4,  0xd6,  0xd1,  0xdc,  0xa7,
988     0xbf,  'a',   'b',   'c',   'd',   'e',   'f',   'g',
989     'h',   'i',   'j',   'k',   'l',   'm',   'n',   'o',
990     'p',   'q',   'r',   's',   't',   'u',   'v',   'w',
991     'x',   'y',   'z',   0xe4,  0xf6,  0xf1,  0xfc,  0xe0
992 };
993 
994 static gunichar
GSM_to_UNICHAR(guint8 c)995 GSM_to_UNICHAR(guint8 c)
996 {
997     if (c < G_N_ELEMENTS(gsm_default_alphabet))
998         return gsm_default_alphabet[c];
999 
1000     return UNREPL;
1001 }
1002 
1003 static gunichar
GSMext_to_UNICHAR(guint8 c)1004 GSMext_to_UNICHAR(guint8 c)
1005 {
1006     switch (c)
1007     {
1008         case 0x0a: return 0x0c; /* form feed */
1009         case 0x14: return '^';
1010         case 0x28: return '{';
1011         case 0x29: return '}';
1012         case 0x2f: return '\\';
1013         case 0x3c: return '[';
1014         case 0x3d: return '~';
1015         case 0x3e: return ']';
1016         case 0x40: return '|';
1017         case 0x65: return 0x20ac; /* euro */
1018     }
1019 
1020     return UNREPL; /* invalid character */
1021 }
1022 
1023 #define GN_BYTE_MASK ((1 << bits) - 1)
1024 
1025 #define GN_CHAR_ESCAPE 0x1b
1026 
1027 static gboolean
char_is_escape(unsigned char value)1028 char_is_escape(unsigned char value)
1029 {
1030     return (value == GN_CHAR_ESCAPE);
1031 }
1032 
1033 static gboolean
handle_ts_23_038_char(wmem_strbuf_t * strbuf,guint8 code_point,gboolean saw_escape)1034 handle_ts_23_038_char(wmem_strbuf_t *strbuf, guint8 code_point,
1035                       gboolean saw_escape)
1036 {
1037     gunichar       uchar;
1038 
1039     if (char_is_escape(code_point)) {
1040         /*
1041          * XXX - if saw_escape is TRUE here, then this is
1042          * the case where we escape to "another extension table",
1043          * but TS 128 038 V11.0 doesn't specify such an extension
1044          * table.
1045          */
1046         saw_escape = TRUE;
1047     } else {
1048         if (!(code_point & 0x80)) {
1049             /*
1050              * Code point is valid (7-bit).
1051              * Have we seen an escape?
1052              */
1053             if (saw_escape) {
1054                 saw_escape = FALSE;
1055                 uchar = GSMext_to_UNICHAR(code_point);
1056             } else {
1057                 uchar = GSM_to_UNICHAR(code_point);
1058             }
1059             wmem_strbuf_append_unichar(strbuf, uchar);
1060         } else {
1061             /* Invalid - put in a REPLACEMENT CHARACTER */
1062             wmem_strbuf_append_unichar(strbuf, UNREPL);
1063         }
1064     }
1065     return saw_escape;
1066 }
1067 
/*
 * Given a wmem scope, a pointer, a bit offset, and a number of characters,
 * unpack that many 7-bit code points from the packed octets starting at
 * the pointer, pass each of them to handle_ts_23_038_char(), and return
 * a pointer to the resulting UTF-8 string, allocated using the wmem scope.
 *
 * Only the low-order 3 bits of bit_offset are used.  If they are zero, the
 * first code point is the low-order 7 bits of the first octet; otherwise
 * nothing is emitted for the first octet and only its bits above that
 * position are used, as the start of the first code point.
 *
 * No input length is passed in; octets are read until no_of_chars code
 * points have been handled.
 * NOTE(review): presumably the caller guarantees that enough octets are
 * present at ptr - confirm.
 */
guint8 *
get_ts_23_038_7bits_string_packed(wmem_allocator_t *scope, const guint8 *ptr,
                                  const gint bit_offset, gint no_of_chars)
{
    wmem_strbuf_t *strbuf;
    gint           char_count;                  /* character counter for string */
    guint8         in_byte, out_byte, rest = 0x00;
    const guint8  *start_ptr = ptr;
    gboolean       saw_escape = FALSE;
    int            bits;

    strbuf = wmem_strbuf_sized_new(scope, no_of_chars+1, 0);

    /*
     * "bits" is the number of low-order bits of the current octet that
     * complete the current code point; the remaining high-order bits of
     * the octet are carried over, in "rest", as the low-order bits of
     * the next code point.  GN_BYTE_MASK expands to a mask of "bits" bits.
     */
    bits = bit_offset & 0x07;
    if (!bits) {
        bits = 7;
    }

    /* One octet is consumed per iteration; zero, one or two code points are produced. */
    for(char_count = 0; char_count < no_of_chars; ptr++) {
        /* Get the next byte from the string. */
        in_byte = *ptr;

        /*
         * Combine the bits we've accumulated with bits from
         * that byte to make a 7-bit code point.
         */
        out_byte = ((in_byte & GN_BYTE_MASK) << (7 - bits)) | rest;

        /*
         * Leftover bits used in that code point.
         */
        rest = in_byte >> bits;

        /*
         * If we don't start from the 0th bit, the first octet doesn't
         * complete a character: out_byte has no carried-over bits in
         * it, and rest holds the _first_ part of the first character.
         * So don't emit anything for the first octet in that case.
         */
        if ((start_ptr != ptr) || (bits == 7)) {
            saw_escape = handle_ts_23_038_char(strbuf, out_byte,
                saw_escape);
            char_count++;
        }

        /*
         * After reading 7 octets we have read 7 full characters
         * but we have 7 bits as well. This is the next character.
         * (It is only emitted if the caller asked for that many
         * characters.)
         */
        if ((bits == 1) && (char_count < no_of_chars)) {
            saw_escape = handle_ts_23_038_char(strbuf, rest,
                saw_escape);
            char_count++;
            bits = 7;
            rest = 0x00;
        } else {
            /* The next octet contributes one bit fewer to its code point. */
            bits--;
        }
    }

    if (saw_escape) {
        /*
         * Escape not followed by anything.
         *
         * XXX - for now, show the escape as a REPLACEMENT
         * CHARACTER.
         */
        wmem_strbuf_append_unichar(strbuf, UNREPL);
    }

    return (guint8 *)wmem_strbuf_finalize(strbuf);
}
1139 
1140 guint8 *
get_ts_23_038_7bits_string_unpacked(wmem_allocator_t * scope,const guint8 * ptr,gint length)1141 get_ts_23_038_7bits_string_unpacked(wmem_allocator_t *scope, const guint8 *ptr,
1142                            gint length)
1143 {
1144     wmem_strbuf_t *strbuf;
1145     gint           i;       /* Byte counter for string */
1146     gboolean       saw_escape = FALSE;
1147 
1148     strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
1149 
1150     for (i = 0; i < length; i++)
1151         saw_escape = handle_ts_23_038_char(strbuf, *ptr++, saw_escape);
1152 
1153     return (guint8 *)wmem_strbuf_finalize(strbuf);
1154 }
1155 
1156 /*
1157  * ETSI TS 102 221 Annex A.
1158  */
1159 guint8 *
get_etsi_ts_102_221_annex_a_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)1160 get_etsi_ts_102_221_annex_a_string(wmem_allocator_t *scope, const guint8 *ptr,
1161                                    gint length)
1162 {
1163     guint8         string_type;
1164     guint8         string_len;
1165     gunichar2      ucs2_base;
1166     wmem_strbuf_t *strbuf;
1167     guint          i;       /* Byte counter for string */
1168     gboolean       saw_escape = FALSE;
1169 
1170     /*
1171      * get the first octet.
1172      */
1173     if (length == 0) {
1174         /* XXX - return error indication */
1175         strbuf = wmem_strbuf_new(scope, "");
1176         return (guint8 *)wmem_strbuf_finalize(strbuf);
1177     }
1178     string_type = *ptr;
1179     ptr++;
1180     length--;
1181 
1182     if (string_type == 0x80) {
1183         /*
1184          * Annex A, coding scheme 1) - big-endian UCS-2.
1185          */
1186         return get_ucs_2_string(scope, ptr, length, ENC_BIG_ENDIAN);
1187     }
1188 
1189     /*
1190      * Annex A, coding schemes 2) and 3):
1191      *
1192      *    the second byte is the number of characters (characters,
1193      *    not octets) in the string;
1194      *
1195      *    for coding scheme 2), the third byte defines bits 15 to 8
1196      *    of all UCS-2 characters in the string (all bit numbers are
1197      *    1-origin, so bit 1 is the low-order bit), with bit 16 being 0;
1198      *
1199      *    for coding scheme 3), the third byte and fourth bytes, treated
1200      *    as a big-endian value, define the base value for all UCS-2
1201      *    characters in the string;
1202      *
1203      *    for all subsequent bytes, if bit 8 is 0, it's a character
1204      *    in the GSM Default Alphabet, otherwise, it is added to
1205      *    the UCS-2 base value to give a UCS-2 character.
1206      *
1207      * XXX - that doesn't seem to indicate that a byte of 0x1b is
1208      * treated as an escape character, it just says that a single octet
1209      * with the 8th bit not set is a GSM Default Alphabet character.
1210      */
1211 
1212     /*
1213      * Get the string length, in characters.
1214      */
1215     if (length == 0) {
1216         /* XXX - return error indication */
1217         strbuf = wmem_strbuf_new(scope, "");
1218         return (guint8 *)wmem_strbuf_finalize(strbuf);
1219     }
1220     string_len = *ptr;
1221     ptr++;
1222     length--;
1223 
1224     strbuf = wmem_strbuf_sized_new(scope, 2*string_len+1, 0);
1225 
1226     /*
1227      * Get the UCS-2 base.
1228      */
1229     if (string_type == 0x81) {
1230         if (length == 0) {
1231             /* XXX - return error indication */
1232             return (guint8 *)wmem_strbuf_finalize(strbuf);
1233 	}
1234         ucs2_base = (*ptr) << 7;
1235         ptr++;
1236         length--;
1237     } else if (string_type == 0x82) {
1238         if (length == 0) {
1239             /* XXX - return error indication */
1240             return (guint8 *)wmem_strbuf_finalize(strbuf);
1241 	}
1242         ucs2_base = (*ptr) << 8;
1243         ptr++;
1244         length--;
1245 
1246         if (length == 0) {
1247             /* XXX - return error indication */
1248             return (guint8 *)wmem_strbuf_finalize(strbuf);
1249 	}
1250         ucs2_base |= *ptr;
1251         ptr++;
1252         length--;
1253     } else {
1254         /* Invalid string type. */
1255         /* XXX - return error indication */
1256         return (guint8 *)wmem_strbuf_finalize(strbuf);
1257     }
1258 
1259     for (i = 0; i < string_len; i++) {
1260         guint8 byte;
1261 
1262         if (length == 0) {
1263             /* XXX - return error indication */
1264             return (guint8 *)wmem_strbuf_finalize(strbuf);
1265 	}
1266         byte = *ptr;
1267         if ((byte & 0x80) == 0) {
1268             saw_escape = handle_ts_23_038_char(strbuf, byte, saw_escape);
1269         } else {
1270             gunichar2 uchar;
1271 
1272             /*
1273              * XXX - if saw_escape is true, this is bogus.
1274              *
1275              * XXX - should map lead and trail surrogate values to
1276              * REPLACEMENT CHARACTERs (0xFFFD)?
1277              * XXX - if there are an odd number of bytes, should put a
1278              * REPLACEMENT CHARACTER at the end.
1279              */
1280             uchar = ucs2_base + (byte & 0x7f);
1281             wmem_strbuf_append_unichar(strbuf, uchar);
1282         }
1283     }
1284 
1285     return (guint8 *)wmem_strbuf_finalize(strbuf);
1286 }
1287 
1288 guint8 *
get_ascii_7bits_string(wmem_allocator_t * scope,const guint8 * ptr,const gint bit_offset,gint no_of_chars)1289 get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
1290                        const gint bit_offset, gint no_of_chars)
1291 {
1292     wmem_strbuf_t *strbuf;
1293     gint           char_count;                  /* character counter for string */
1294     guint8         in_byte, out_byte, rest = 0x00;
1295     const guint8  *start_ptr = ptr;
1296     int            bits;
1297 
1298     bits = bit_offset & 0x07;
1299     if (!bits) {
1300         bits = 7;
1301     }
1302 
1303     strbuf = wmem_strbuf_sized_new(scope, no_of_chars+1, 0);
1304     for(char_count = 0; char_count < no_of_chars; ptr++) {
1305         /* Get the next byte from the string. */
1306         in_byte = *ptr;
1307 
1308         /*
1309          * Combine the bits we've accumulated with bits from
1310          * that byte to make a 7-bit code point.
1311          */
1312         out_byte = (in_byte >> (8 - bits)) | rest;
1313 
1314         /*
1315          * Leftover bits used in that code point.
1316          */
1317         rest = (in_byte << (bits - 1)) & 0x7f;
1318 
1319         /*
1320          * If we don't start from 0th bit, we shouldn't go to the
1321          * next char. Under *out_num we have now 0 and under Rest -
1322          * _first_ part of the char.
1323          */
1324         if ((start_ptr != ptr) || (bits == 7)) {
1325             wmem_strbuf_append_c(strbuf, out_byte);
1326             char_count++;
1327         }
1328 
1329         /*
1330          * After reading 7 octets we have read 7 full characters
1331          * but we have 7 bits as well. This is the next character.
1332          */
1333         if ((bits == 1) && (char_count < no_of_chars)) {
1334             wmem_strbuf_append_c(strbuf, rest);
1335             char_count++;
1336             bits = 7;
1337             rest = 0x00;
1338         } else {
1339             bits--;
1340         }
1341     }
1342 
1343     return (guint8 *)wmem_strbuf_finalize(strbuf);
1344 }
1345 
/* ASCII/EBCDIC conversion tables from
 * https://web.archive.org/web/20060813174742/http://www.room42.com/store/computer_center/code_tables.shtml
 */
/*
 * The ASCII-to-EBCDIC direction is compiled out by the #if 0 below.
 *
 * ASCII_translate_EBCDIC is indexed by an ASCII octet and yields an EBCDIC
 * octet.  Index 0x7F and every index from 0x80 to 0xFF yield 0x4B.
 *
 * NOTE(review): indices 0x27 and 0x60 both yield 0x7D; the second looks
 * like a transcription error in the table - confirm before enabling.
 */
#if 0
static const guint8 ASCII_translate_EBCDIC [ 256 ] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
    0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
    0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D,
    0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
    0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
    0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8,
    0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
    0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
    0x7D, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88,
    0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
    0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xC0, 0x6A, 0xD0, 0xA1, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
    0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B
};

/*
 * Translate "bytes" octets at "buf" in place, replacing each octet with
 * its ASCII_translate_EBCDIC entry.
 */
void
ASCII_to_EBCDIC(guint8 *buf, guint bytes)
{
    guint    i;
    guint8    *bufptr;

    bufptr = buf;

    for (i = 0; i < bytes; i++, bufptr++) {
        *bufptr = ASCII_translate_EBCDIC[*bufptr];
    }
}

/*
 * Return the ASCII_translate_EBCDIC entry for a single octet.
 */
guint8
ASCII_to_EBCDIC1(guint8 c)
{
    return ASCII_translate_EBCDIC[c];
}
#endif
1404 
/*
 * Octet-to-octet translation table, indexed by an EBCDIC octet and
 * yielding an ASCII octet; used by EBCDIC_to_ASCII() and
 * EBCDIC_to_ASCII1().
 *
 * Many entries are 0x2E ('.'), presumably as a placeholder for EBCDIC
 * code points with no ASCII equivalent (0x2E is also the entry for index
 * 0x4B).
 */
static const guint8 EBCDIC_translate_ASCII [ 256 ] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    0x2E, 0x2E, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x2E, 0x3F,
    0x20, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x2E, 0x2E, 0x2E, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
    0x26, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x2E, 0x2E, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
    0x2D, 0x2F, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x2E, 0x2E, 0x7C, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
    0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x2E, 0x2E, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
    0x2E, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x2E, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70,
    0x71, 0x72, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x2E, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
    0x79, 0x7A, 0x2E, 0x2E, 0x2E, 0x5B, 0x2E, 0x2E,
    0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x5D, 0x2E, 0x2E,
    0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
    0x48, 0x49, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,
    0x51, 0x52, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x5C, 0x2E, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
    0x59, 0x5A, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E
};
1439 
1440 void
EBCDIC_to_ASCII(guint8 * buf,guint bytes)1441 EBCDIC_to_ASCII(guint8 *buf, guint bytes)
1442 {
1443     guint   i;
1444     guint8 *bufptr;
1445 
1446     bufptr = buf;
1447 
1448     for (i = 0; i < bytes; i++, bufptr++) {
1449         *bufptr = EBCDIC_translate_ASCII[*bufptr];
1450     }
1451 }
1452 
1453 guint8
EBCDIC_to_ASCII1(guint8 c)1454 EBCDIC_to_ASCII1(guint8 c)
1455 {
1456     return EBCDIC_translate_ASCII[c];
1457 }
1458 
1459 /* Tables for EBCDIC code pages */
1460 
/* EBCDIC common; based on the table in appendix H of ESA/370 Principles
   of Operation, but with some code points that don't correspond to
   the same characters in code pages 037 and 1158 mapped to REPLACEMENT
   CHARACTER - there may be more code points of that sort.

   The table has one entry per EBCDIC octet; each entry is a 16-bit
   Unicode code point, or UNREPL (U+FFFD). */
const gunichar2 charset_table_ebcdic[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
    UNREPL, UNREPL, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, UNREPL, 0x001a,
    0x0020, 0x00a0, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, 0x002e, 0x003c, 0x0028, 0x002b, UNREPL,
    0x0026, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, 0x0024, 0x002a, 0x0029, 0x003b, UNREPL,
    0x002d, 0x002f, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
    UNREPL, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
    0x0071, 0x0072, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
    0x0079, 0x007a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    0x005c, UNREPL, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
    0x0059, 0x005a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL,
};
1499 
/* EBCDIC code page 037.

   The table has one entry per EBCDIC octet; each entry is a 16-bit
   Unicode code point.  Unlike the common EBCDIC table, no entry here is
   REPLACEMENT CHARACTER. */
const gunichar2 charset_table_ebcdic_cp037[256] = {
    0x0000, 0x0001, 0x0002, 0x0003, 0x009c, 0x0009, 0x0086, 0x007f,
    0x0097, 0x008d, 0x008e, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
    0x0010, 0x0011, 0x0012, 0x0013, 0x009d, 0x0085, 0x0008, 0x0087,
    0x0018, 0x0019, 0x0092, 0x008f, 0x001c, 0x001d, 0x001e, 0x001f,
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x000a, 0x0017, 0x001b,
    0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x0005, 0x0006, 0x0007,
    0x0090, 0x0091, 0x0016, 0x0093, 0x0094, 0x0095, 0x0096, 0x0004,
    0x0098, 0x0099, 0x009a, 0x009b, 0x0014, 0x0015, 0x009e, 0x001a,
    0x0020, 0x00a0, 0x00e2, 0x00e4, 0x00e0, 0x00e1, 0x00e3, 0x00e5,
    0x00e7, 0x00f1, 0x00a2, 0x002e, 0x003c, 0x0028, 0x002b, 0x007c,
    0x0026, 0x00e9, 0x00ea, 0x00eb, 0x00e8, 0x00ed, 0x00ee, 0x00ef,
    0x00ec, 0x00df, 0x0021, 0x0024, 0x002a, 0x0029, 0x003b, 0x00ac,
    0x002d, 0x002f, 0x00c2, 0x00c4, 0x00c0, 0x00c1, 0x00c3, 0x00c5,
    0x00c7, 0x00d1, 0x00a6, 0x002c, 0x0025, 0x005f, 0x003e, 0x003f,
    0x00f8, 0x00c9, 0x00ca, 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf,
    0x00cc, 0x0060, 0x003a, 0x0023, 0x0040, 0x0027, 0x003d, 0x0022,
    0x00d8, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
    0x0068, 0x0069, 0x00ab, 0x00bb, 0x00f0, 0x00fd, 0x00fe, 0x00b1,
    0x00b0, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 0x0070,
    0x0071, 0x0072, 0x00aa, 0x00ba, 0x00e6, 0x00b8, 0x00c6, 0x00a4,
    0x00b5, 0x007e, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078,
    0x0079, 0x007a, 0x00a1, 0x00bf, 0x00d0, 0x00dd, 0x00de, 0x00ae,
    0x005e, 0x00a3, 0x00a5, 0x00b7, 0x00a9, 0x00a7, 0x00b6, 0x00bc,
    0x00bd, 0x00be, 0x005b, 0x005d, 0x00af, 0x00a8, 0x00b4, 0x00d7,
    0x007b, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    0x0048, 0x0049, 0x00ad, 0x00f4, 0x00f6, 0x00f2, 0x00f3, 0x00f5,
    0x007d, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, 0x0050,
    0x0051, 0x0052, 0x00b9, 0x00fb, 0x00fc, 0x00f9, 0x00fa, 0x00ff,
    0x005c, 0x00f7, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058,
    0x0059, 0x005a, 0x00b2, 0x00d4, 0x00d6, 0x00d2, 0x00d3, 0x00d5,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    0x0038, 0x0039, 0x00b3, 0x00db, 0x00dc, 0x00d9, 0x00da, 0x009f,
};
1535 
1536 /*
1537  * Given a wmem scope, a pointer, a length, and a translation table with
1538  * 256 entries, treat the string of bytes referred to by the pointer and
1539  * length as a string encoded using one octet per character, with octets
1540  * being mapped by the translation table to 2-byte Unicode Basic Multilingual
1541  * Plane characters (including REPLACEMENT CHARACTER), and return a
1542  * pointer to a UTF-8 string, allocated using the wmem scope.
1543  */
1544 guint8 *
get_nonascii_unichar2_string(wmem_allocator_t * scope,const guint8 * ptr,gint length,const gunichar2 table[256])1545 get_nonascii_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[256])
1546 {
1547     wmem_strbuf_t *str;
1548 
1549     str = wmem_strbuf_sized_new(scope, length+1, 0);
1550 
1551     while (length > 0) {
1552         guint8 ch = *ptr;
1553 
1554         wmem_strbuf_append_unichar(str, table[ch]);
1555         ptr++;
1556         length--;
1557     }
1558 
1559     return (guint8 *) wmem_strbuf_finalize(str);
1560 }
1561 
1562 /*
1563  * Given a wmem scope, a pointer, a length, and a string referring to an
1564  * encoding (recognized by iconv), treat the bytes referred to by the pointer
1565  * and length as a string in that encoding, and return a pointer to a UTF-8
1566  * string, allocated using the wmem scope, converted from the original
1567  * encoding having substituted REPLACEMENT CHARACTER according to the
1568  * Unicode Standard 5.22 U+FFFD Substitution for Conversion
1569  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1570  */
1571 static guint8 *
get_string_enc_iconv(wmem_allocator_t * scope,const guint8 * ptr,gint length,const gchar * encoding)1572 get_string_enc_iconv(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gchar *encoding)
1573 {
1574     GIConv cd;
1575     gsize inbytes, outbytes;
1576     gsize tempstr_size, bytes_written;
1577     gsize err;
1578     gsize max_subpart, tempinbytes;
1579     gchar *outptr, *tempstr;
1580 
1581     wmem_strbuf_t *str;
1582 
1583     if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1584         REPORT_DISSECTOR_BUG("Unable to allocate iconv() converter from %s to UTF-8", encoding);
1585         /* Most likely to be a programming error passing in a bad encoding
1586          * name. However, could be a issue with the iconv support on the
1587          * system running WS. GLib requires iconv/libiconv, but is it possible
1588          * that some versions don't support all common encodings? */
1589     }
1590 
1591     inbytes = length;
1592     str = wmem_strbuf_sized_new(scope, length+1, 0);
1593     /* XXX: If speed becomes an issue, the faster way to do this would
1594      * involve passing the wmem_strbuf_t's string buffer directly into
1595      * g_iconv to avoid a memcpy later, but that requires changes to the
1596      * wmem_strbuf interface to have non const access to the string buffer,
1597      * and to manipulate the used length directly. */
1598     outbytes = tempstr_size = MAX(8, length);
1599     outptr = tempstr = (gchar *)g_malloc(outbytes);
1600     while (inbytes > 0) {
1601         err = g_iconv(cd, (gchar **)&ptr, &inbytes, &outptr, &outbytes);
1602         bytes_written = outptr - tempstr;
1603         wmem_strbuf_append_len(str, tempstr, bytes_written);
1604         outptr = tempstr;
1605         outbytes = tempstr_size;
1606 
1607         if (err == (gsize) -1) {
1608             /* Errors */
1609             switch (errno) {
1610                 case EINVAL:
1611                     /* Incomplete sequence at the end, not an error */
1612                     wmem_strbuf_append_unichar(str, UNREPL);
1613                     inbytes = 0;
1614                     break;
1615                 case E2BIG:
1616                     /* Not enough room (UTF-8 longer than the initial buffer),
1617                      * start back at the beginning of the buffer */
1618                     break;
1619                 case EILSEQ:
1620                     /* Find the maximal subpart of the ill-formed sequence */
1621                     errno = EINVAL;
1622                     for (max_subpart = 1; err == (gsize)-1 && errno == EINVAL; max_subpart++) {
1623                         tempinbytes = max_subpart;
1624                         err = g_iconv(cd, (gchar **)&ptr, &tempinbytes,
1625                                 &outptr, &outbytes);
1626                     }
1627                     max_subpart = MAX(1, max_subpart-1);
1628                     ptr += max_subpart;
1629                     inbytes -= max_subpart;
1630                     wmem_strbuf_append_unichar(str, UNREPL);
1631                     outptr = tempstr;
1632                     outbytes = tempstr_size;
1633                     break;
1634                 default:
1635                     /* Unexpected conversion error, unrecoverable */
1636                     g_free(tempstr);
1637                     g_iconv_close(cd);
1638                     REPORT_DISSECTOR_BUG("Unexpected iconv() error when converting from %s to UTF-8", encoding);
1639                     break;
1640             }
1641         } else {
1642             /* Otherwise err is the number of replacement characters used,
1643              * but we don't care about that. */
1644             /* If we were converting to ISO-2022-JP or some other stateful
1645              * decoder with shift sequences (e.g. EBCDIC mixed-byte), a
1646              * final call with NULL input in order to output the shift
1647              * sequence back to initial state might make sense, but not
1648              * needed for UTF-8. */
1649         }
1650     }
1651 
1652     g_free(tempstr);
1653     g_iconv_close(cd);
1654     return (guint8 *) wmem_strbuf_finalize(str);
1655 }
1656 
1657 /*
1658  * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1659  * by the pointer and length as a GB18030 encoded string, and return a pointer
1660  * to a UTF-8 string, allocated using the wmem scope, converted having
1661  * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1662  * 5.22 U+FFFD Substitution for Conversion.
1663  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1664  *
1665  * As expected, this will also decode GBK and GB2312 strings.
1666  */
1667 guint8 *
get_gb18030_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)1668 get_gb18030_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
1669 {
1670     /* iconv/libiconv support is guaranteed with GLib. Support this
1671      * via iconv, at least for now. */
1672     /* GNU libiconv has supported GB18030 (~ Windows Code page 54936) since
1673      * 2000-10-24 and version 1.4, is there is a system that compiles current
1674      * Wireshark yet its iconv only supports GBK (~ Windows Code page 936)? */
1675     const gchar *encoding = "GB18030";
1676     GIConv cd;
1677     if ((cd = g_iconv_open("UTF-8", encoding)) == (GIConv) -1) {
1678         encoding = "GBK";
1679         /* GB18030 is backwards compatible, at worst this will mean a few
1680          * extra REPLACEMENT CHARACTERs - GBK lacks the four byte encodings
1681          * from GB18030, which are all pairs of two byte sequences
1682          * 0x[81-FE] 0x[30-39]; that trailing byte is illegal in GBK
1683          * and thus the 4 byte characters will be replaced with two
1684          * REPLACEMENT CHARACTERs. */
1685     } else {
1686         g_iconv_close(cd);
1687     }
1688     return get_string_enc_iconv(scope, ptr, length, encoding);
1689 }
1690 
1691 /*
1692  * Given a wmem scope, a pointer, and a length, treat the bytes referred to
1693  * by the pointer and length as a EUC-KR encoded string, and return a pointer
1694  * to a UTF-8 string, allocated using the wmem scope, converted having
1695  * substituted REPLACEMENT CHARACTER according to the Unicode Standard
1696  * 5.22 U+FFFD Substitution for Conversion.
1697  * ( https://www.unicode.org/versions/Unicode13.0.0/ch05.pdf )
1698  */
1699 guint8 *
get_euc_kr_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)1700 get_euc_kr_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
1701 {
1702     /* iconv/libiconv support is guaranteed with GLib. Support this
1703      * via iconv, at least for now. */
1704     return get_string_enc_iconv(scope, ptr, length, "EUC-KR");
1705 }
1706 
/* T.61 to UTF-8 conversion table from OpenLDAP project
 * https://www.openldap.org/devel/gitweb.cgi?p=openldap.git;a=blob;f=libraries/libldap/t61.c;hb=HEAD
 *
 * The table has 256 entries, one per T.61 octet, each a 16-bit Unicode
 * code point.  The entries for octets 0xc1-0xcf (except 0xc9) are
 * combining diacritical marks in the U+03xx range.  Apart from index 0,
 * entries of 0x000 presumably mark octets with no mapping; how they are
 * handled is decided by code outside this table - TODO confirm.
 */
static const gunichar2 t61_tab[] = {
    0x000, 0x001, 0x002, 0x003, 0x004, 0x005, 0x006, 0x007,
    0x008, 0x009, 0x00a, 0x00b, 0x00c, 0x00d, 0x00e, 0x00f,
    0x010, 0x011, 0x012, 0x013, 0x014, 0x015, 0x016, 0x017,
    0x018, 0x019, 0x01a, 0x01b, 0x01c, 0x01d, 0x01e, 0x01f,
    0x020, 0x021, 0x022, 0x000, 0x000, 0x025, 0x026, 0x027,
    0x028, 0x029, 0x02a, 0x02b, 0x02c, 0x02d, 0x02e, 0x02f,
    0x030, 0x031, 0x032, 0x033, 0x034, 0x035, 0x036, 0x037,
    0x038, 0x039, 0x03a, 0x03b, 0x03c, 0x03d, 0x03e, 0x03f,
    0x040, 0x041, 0x042, 0x043, 0x044, 0x045, 0x046, 0x047,
    0x048, 0x049, 0x04a, 0x04b, 0x04c, 0x04d, 0x04e, 0x04f,
    0x050, 0x051, 0x052, 0x053, 0x054, 0x055, 0x056, 0x057,
    0x058, 0x059, 0x05a, 0x05b, 0x000, 0x05d, 0x000, 0x05f,
    0x000, 0x061, 0x062, 0x063, 0x064, 0x065, 0x066, 0x067,
    0x068, 0x069, 0x06a, 0x06b, 0x06c, 0x06d, 0x06e, 0x06f,
    0x070, 0x071, 0x072, 0x073, 0x074, 0x075, 0x076, 0x077,
    0x078, 0x079, 0x07a, 0x000, 0x07c, 0x000, 0x000, 0x07f,
    0x080, 0x081, 0x082, 0x083, 0x084, 0x085, 0x086, 0x087,
    0x088, 0x089, 0x08a, 0x08b, 0x08c, 0x08d, 0x08e, 0x08f,
    0x090, 0x091, 0x092, 0x093, 0x094, 0x095, 0x096, 0x097,
    0x098, 0x099, 0x09a, 0x09b, 0x09c, 0x09d, 0x09e, 0x09f,
    0x0a0, 0x0a1, 0x0a2, 0x0a3, 0x024, 0x0a5, 0x023, 0x0a7,
    0x0a4, 0x000, 0x000, 0x0ab, 0x000, 0x000, 0x000, 0x000,
    0x0b0, 0x0b1, 0x0b2, 0x0b3, 0x0d7, 0x0b5, 0x0b6, 0x0b7,
    0x0f7, 0x000, 0x000, 0x0bb, 0x0bc, 0x0bd, 0x0be, 0x0bf,
    0x000, 0x300, 0x301, 0x302, 0x303, 0x304, 0x306, 0x307,
    0x308, 0x000, 0x30a, 0x327, 0x332, 0x30b, 0x328, 0x30c,
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    0x2126, 0xc6, 0x0d0, 0x0aa, 0x126, 0x000, 0x132, 0x13f,
    0x141, 0x0d8, 0x152, 0x0ba, 0x0de, 0x166, 0x14a, 0x149,
    0x138, 0x0e6, 0x111, 0x0f0, 0x127, 0x131, 0x133, 0x140,
    0x142, 0x0f8, 0x153, 0x0df, 0x0fe, 0x167, 0x14b, 0x000
};
1744 
/* Fixed-size vectors of 16-bit code points used by the T.61 accent tables:
 * wvec16 holds 16 entries and wvec32 holds 32 entries. */
typedef gunichar2 wvec16[16];
typedef gunichar2 wvec32[32];
1747 
/* Substitutions when 0xc1-0xcf appears by itself or with space 0x20.
 *
 * Presumably indexed by the low four bits of the accent octet, so that
 * entry n belongs to octet 0xc0+n (e.g. entry 1 is U+0060, the grave
 * accent) - TODO confirm against the lookup code.  Entries of 0x000 have
 * no substitution.
 */
static const wvec16 accents = {
    0x000, 0x060, 0x0b4, 0x05e, 0x07e, 0x0af, 0x2d8, 0x2d9,
    0x0a8, 0x000, 0x2da, 0x0b8, 0x000, 0x2dd, 0x2db, 0x2c7};
1752 
1753 /* In the following tables, base characters commented in (parentheses)
1754  * are not defined by T.61 but are mapped anyway since their Unicode
1755  * composite exists.
1756  */
1757 
/* Grave accented chars AEIOU (NWY) */
/* In each 32-entry vector, entry k holds the precomposed character for
 * the k-th letter of the alphabet (1 = A/a ... 26 = Z/z); 0 means that
 * no composite is provided. */
static const wvec32 c1_vec1 = {
    /* Upper case */
    0, 0xc0, 0, 0, 0, 0xc8, 0, 0, 0, 0xcc, 0, 0, 0, 0, 0x1f8, 0xd2,
    0, 0, 0, 0, 0, 0xd9, 0, 0x1e80, 0, 0x1ef2, 0, 0, 0, 0, 0, 0};
static const wvec32 c1_vec2 = {
    /* Lower case */
    0, 0xe0, 0, 0, 0, 0xe8, 0, 0, 0, 0xec, 0, 0, 0, 0, 0x1f9, 0xf2,
    0, 0, 0, 0, 0, 0xf9, 0, 0x1e81, 0, 0x1ef3, 0, 0, 0, 0, 0, 0};

/* Eight slots; slots 2 and 3 hold the upper- and lower-case vectors.
 * Presumably the slot is selected by the top three bits of the base
 * character and the entry by its low five bits - TODO confirm. */
static const wvec32 *c1_grave[] = {
    NULL, NULL, &c1_vec1, &c1_vec2, NULL, NULL, NULL, NULL
};
1771 
/* Acute accented chars AEIOUYCLNRSZ (GKMPW) */
/* In the upper- and lower-case vectors, entry k holds the precomposed
 * character for the k-th letter of the alphabet (1 = A/a ... 26 = Z/z);
 * 0 means that no composite is provided. */
static const wvec32 c2_vec1 = {
    /* Upper case */
    0, 0xc1, 0, 0x106, 0, 0xc9, 0, 0x1f4,
    0, 0xcd, 0, 0x1e30, 0x139, 0x1e3e, 0x143, 0xd3,
    0x1e54, 0, 0x154, 0x15a, 0, 0xda, 0, 0x1e82,
    0, 0xdd, 0x179, 0, 0, 0, 0, 0};
static const wvec32 c2_vec2 = {
    /* Lower case */
    0, 0xe1, 0, 0x107, 0, 0xe9, 0, 0x1f5,
    0, 0xed, 0, 0x1e31, 0x13a, 0x1e3f, 0x144, 0xf3,
    0x1e55, 0, 0x155, 0x15b, 0, 0xfa, 0, 0x1e83,
    0, 0xfd, 0x17a, 0, 0, 0, 0, 0};
static const wvec32 c2_vec3 = {
    /* (AE and ae) */
    /* Only entries 1 and 17 are set. */
    0, 0x1fc, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x1fd, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* Eight slots; slots 2 and 3 hold the upper- and lower-case vectors and
 * slot 7 the AE/ae vector.  Presumably the slot is selected by the top
 * three bits of the base character and the entry by its low five bits -
 * TODO confirm. */
static const wvec32 *c2_acute[] = {
    NULL, NULL, &c2_vec1, &c2_vec2, NULL, NULL, NULL, &c2_vec3
};
1793 
/* Circumflex AEIOUYCGHJSW (Z) */
/* In each 32-entry vector, entry k holds the precomposed character for
 * the k-th letter of the alphabet (1 = A/a ... 26 = Z/z); 0 means that
 * no composite is provided. */
static const wvec32 c3_vec1 = {
    /* Upper case */
    0, 0xc2, 0, 0x108, 0, 0xca, 0, 0x11c,
    0x124, 0xce, 0x134, 0, 0, 0, 0, 0xd4,
    0, 0, 0, 0x15c, 0, 0xdb, 0, 0x174,
    0, 0x176, 0x1e90, 0, 0, 0, 0, 0};
static const wvec32 c3_vec2 = {
    /* Lower case */
    0, 0xe2, 0, 0x109, 0, 0xea, 0, 0x11d,
    0x125, 0xee, 0x135, 0, 0, 0, 0, 0xf4,
    0, 0, 0, 0x15d, 0, 0xfb, 0, 0x175,
    0, 0x177, 0x1e91, 0, 0, 0, 0, 0};
/* Eight slots; slots 2 and 3 hold the upper- and lower-case vectors. */
static const wvec32 *c3_circumflex[] = {
    NULL, NULL, &c3_vec1, &c3_vec2, NULL, NULL, NULL, NULL
};
1810 
/* Tilde AIOUN (EVY) */
/* In each 32-entry vector, entry k holds the precomposed character for
 * the k-th letter of the alphabet (1 = A/a ... 26 = Z/z); 0 means that
 * no composite is provided. */
static const wvec32 c4_vec1 = {
    /* Upper case */
    0, 0xc3, 0, 0, 0, 0x1ebc, 0, 0, 0, 0x128, 0, 0, 0, 0, 0xd1, 0xd5,
    0, 0, 0, 0, 0, 0x168, 0x1e7c, 0, 0, 0x1ef8, 0, 0, 0, 0, 0, 0};
static const wvec32 c4_vec2 = {
    /* Lower case */
    0, 0xe3, 0, 0, 0, 0x1ebd, 0, 0, 0, 0x129, 0, 0, 0, 0, 0xf1, 0xf5,
    0, 0, 0, 0, 0, 0x169, 0x1e7d, 0, 0, 0x1ef9, 0, 0, 0, 0, 0, 0};
/* Eight slots; slots 2 and 3 hold the upper- and lower-case vectors. */
static const wvec32 *c4_tilde[] = {
    NULL, NULL, &c4_vec1, &c4_vec2, NULL, NULL, NULL, NULL
};
1823 
/* Macron AEIOU (YG) */
/* In the upper- and lower-case vectors, entry k holds the precomposed
 * character for the k-th letter of the alphabet (1 = A/a ... 26 = Z/z);
 * 0 means that no composite is provided. */
static const wvec32 c5_vec1 = {
    /* Upper case */
    0, 0x100, 0, 0, 0, 0x112, 0, 0x1e20, 0, 0x12a, 0, 0, 0, 0, 0, 0x14c,
    0, 0, 0, 0, 0, 0x16a, 0, 0, 0, 0x232, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec2 = {
    /* Lower case */
    0, 0x101, 0, 0, 0, 0x113, 0, 0x1e21, 0, 0x12b, 0, 0, 0, 0, 0, 0x14d,
    0, 0, 0, 0, 0, 0x16b, 0, 0, 0, 0x233, 0, 0, 0, 0, 0, 0};
static const wvec32 c5_vec3 = {
    /* (AE and ae) */
    /* Only entries 1 and 17 are set. */
    0, 0x1e2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0x1e3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Eight slots; slots 2 and 3 hold the upper- and lower-case vectors and
 * slot 7 the AE/ae vector. */
static const wvec32 *c5_macron[] = {
    NULL, NULL, &c5_vec1, &c5_vec2, NULL, NULL, NULL, &c5_vec3
};
1840 
/* Breve AUG (EIO) */
/* In each 32-entry vector, entry k holds the precomposed character for
 * the k-th letter of the alphabet (1 = A/a ... 26 = Z/z); 0 means that
 * no composite is provided. */
static const wvec32 c6_vec1 = {
    /* Upper case */
    0, 0x102, 0, 0, 0, 0x114, 0, 0x11e, 0, 0x12c, 0, 0, 0, 0, 0, 0x14e,
    0, 0, 0, 0, 0, 0x16c, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static const wvec32 c6_vec2 = {
    /* Lower case */
    0, 0x103, 0, 0, 0, 0x115, 0, 0x11f, 0, 0x12d, 0, 0, 0, 0, 0, 0x14f,
    0, 0, 0, 0, 0, 0x16d, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* Eight slots; slots 2 and 3 hold the upper- and lower-case vectors. */
static const wvec32 *c6_breve[] = {
    NULL, NULL, &c6_vec1, &c6_vec2, NULL, NULL, NULL, NULL
};
1853 
/* Dot Above CEGIZ (AOBDFHMNPRSTWXY) */
/* In each 32-entry vector, entry k holds the precomposed character for
 * the k-th letter of the alphabet (1 = A/a ... 26 = Z/z); 0 means that
 * no composite is provided.  The upper-case vector has an entry for I
 * (0x130) but the lower-case vector has none for i. */
static const wvec32 c7_vec1 = {
    /* Upper case */
    0, 0x226, 0x1e02, 0x10a, 0x1e0a, 0x116, 0x1e1e, 0x120,
    0x1e22, 0x130, 0, 0, 0, 0x1e40, 0x1e44, 0x22e,
    0x1e56, 0, 0x1e58, 0x1e60, 0x1e6a, 0, 0, 0x1e86,
    0x1e8a, 0x1e8e, 0x17b, 0, 0, 0, 0, 0};
static const wvec32 c7_vec2 = {
    /* Lower case */
    0, 0x227, 0x1e03, 0x10b, 0x1e0b, 0x117, 0x1e1f, 0x121,
    0x1e23, 0, 0, 0, 0, 0x1e41, 0x1e45, 0x22f,
    0x1e57, 0, 0x1e59, 0x1e61, 0x1e6b, 0, 0, 0x1e87,
    0x1e8b, 0x1e8f, 0x17c, 0, 0, 0, 0, 0};
/* Eight slots; slots 2 and 3 hold the upper- and lower-case vectors. */
static const wvec32 *c7_dotabove[] = {
    NULL, NULL, &c7_vec1, &c7_vec2, NULL, NULL, NULL, NULL
};
1870 
/* Diaeresis AEIOUY (HWXt) */
/* In each 32-entry vector, entry k holds the precomposed character for
 * the k-th letter of the alphabet (1 = A/a ... 26 = Z/z); 0 means that
 * no composite is provided.  Only the lower-case vector has an entry
 * for t (0x1e97). */
static const wvec32 c8_vec1 = {
    /* Upper case */
    0, 0xc4, 0, 0, 0, 0xcb, 0, 0, 0x1e26, 0xcf, 0, 0, 0, 0, 0, 0xd6,
    0, 0, 0, 0, 0, 0xdc, 0, 0x1e84, 0x1e8c, 0x178, 0, 0, 0, 0, 0, 0};
static const wvec32 c8_vec2 = {
    /* Lower case */
    0, 0xe4, 0, 0, 0, 0xeb, 0, 0, 0x1e27, 0xef, 0, 0, 0, 0, 0, 0xf6,
    0, 0, 0, 0, 0x1e97, 0xfc, 0, 0x1e85, 0x1e8d, 0xff, 0, 0, 0, 0, 0, 0};
/* Eight slots; slots 2 and 3 hold the upper- and lower-case vectors. */
static const wvec32 *c8_diaeresis[] = {
    NULL, NULL, &c8_vec1, &c8_vec2, NULL, NULL, NULL, NULL
};
1883 
1884 /* Ring Above AU (wy) */
1885 static const wvec32 ca_vec1 = {
1886     /* Upper case */
1887     0, 0xc5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1888     0, 0, 0, 0, 0, 0x16e, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1889 static const wvec32 ca_vec2 = {
1890     /* Lower case */
1891     0, 0xe5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1892     0, 0, 0, 0, 0, 0x16f, 0, 0x1e98, 0, 0x1e99, 0, 0, 0, 0, 0, 0};
1893 static const wvec32 *ca_ringabove[] = {
1894     NULL, NULL, &ca_vec1, &ca_vec2, NULL, NULL, NULL, NULL
1895 };
1896 
1897 /* Cedilla CGKLNRST (EDH) */
1898 static const wvec32 cb_vec1 = {
1899     /* Upper case */
1900     0, 0, 0, 0xc7, 0x1e10, 0x228, 0, 0x122,
1901     0x1e28, 0, 0, 0x136, 0x13b, 0, 0x145, 0,
1902     0, 0, 0x156, 0x15e, 0x162, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1903 static const wvec32 cb_vec2 = {
1904     /* Lower case */
1905     0, 0, 0, 0xe7, 0x1e11, 0x229, 0, 0x123,
1906     0x1e29, 0, 0, 0x137, 0x13c, 0, 0x146, 0,
1907     0, 0, 0x157, 0x15f, 0x163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1908 static const wvec32 *cb_cedilla[] = {
1909     NULL, NULL, &cb_vec1, &cb_vec2, NULL, NULL, NULL, NULL
1910 };
1911 
1912 /* Double Acute Accent OU */
1913 static const wvec32 cd_vec1 = {
1914     /* Upper case */
1915     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x150,
1916     0, 0, 0, 0, 0, 0x170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1917 static const wvec32 cd_vec2 = {
1918     /* Lower case */
1919     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x151,
1920     0, 0, 0, 0, 0, 0x171, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1921 static const wvec32 *cd_doubleacute[] = {
1922     NULL, NULL, &cd_vec1, &cd_vec2, NULL, NULL, NULL, NULL
1923 };
1924 
1925 /* Ogonek AEIU (O) */
1926 static const wvec32 ce_vec1 = {
1927     /* Upper case */
1928     0, 0x104, 0, 0, 0, 0x118, 0, 0, 0, 0x12e, 0, 0, 0, 0, 0, 0x1ea,
1929     0, 0, 0, 0, 0, 0x172, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1930 static const wvec32 ce_vec2 = {
1931     /* Lower case */
1932     0, 0x105, 0, 0, 0, 0x119, 0, 0, 0, 0x12f, 0, 0, 0, 0, 0, 0x1eb,
1933     0, 0, 0, 0, 0, 0x173, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1934 static const wvec32 *ce_ogonek[] = {
1935     NULL, NULL, &ce_vec1, &ce_vec2, NULL, NULL, NULL, NULL
1936 };
1937 
1938 /* Caron CDELNRSTZ (AIOUGKjH) */
1939 static const wvec32 cf_vec1 = {
1940     /* Upper case */
1941     0, 0x1cd, 0, 0x10c, 0x10e, 0x11a, 0, 0x1e6,
1942     0x21e, 0x1cf, 0, 0x1e8, 0x13d, 0, 0x147, 0x1d1,
1943     0, 0, 0x158, 0x160, 0x164, 0x1d3, 0, 0,
1944     0, 0, 0x17d, 0, 0, 0, 0, 0};
1945 static const wvec32 cf_vec2 = {
1946     /* Lower case */
1947     0, 0x1ce, 0, 0x10d, 0x10f, 0x11b, 0, 0x1e7,
1948     0x21f, 0x1d0, 0x1f0, 0x1e9, 0x13e, 0, 0x148, 0x1d2,
1949     0, 0, 0x159, 0x161, 0x165, 0x1d4, 0, 0,
1950     0, 0, 0x17e, 0, 0, 0, 0, 0};
1951 static const wvec32 *cf_caron[] = {
1952     NULL, NULL, &cf_vec1, &cf_vec2, NULL, NULL, NULL, NULL
1953 };
1954 
1955 static const wvec32 **cx_tab[] = {
1956     NULL, c1_grave, c2_acute, c3_circumflex, c4_tilde, c5_macron,
1957     c6_breve, c7_dotabove, c8_diaeresis, NULL, ca_ringabove,
1958     cb_cedilla, NULL, cd_doubleacute, ce_ogonek, cf_caron };
1959 
1960 guint8 *
get_t61_string(wmem_allocator_t * scope,const guint8 * ptr,gint length)1961 get_t61_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
1962 {
1963     gint           i;
1964     const guint8  *c;
1965     wmem_strbuf_t *strbuf;
1966 
1967     strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
1968 
1969     for (i = 0, c = ptr; i < length; c++, i++) {
1970         if (!t61_tab[*c]) {
1971             wmem_strbuf_append_unichar(strbuf, UNREPL);
1972         } else if (i < length - 1 && (*c & 0xf0) == 0xc0) {
1973             gint j = *c & 0x0f;
1974             /* If this is the end of the string, or if the base
1975              * character is just a space, treat this as a regular
1976              * spacing character.
1977              */
1978             if ((!c[1] || c[1] == 0x20) && accents[j]) {
1979                 wmem_strbuf_append_unichar(strbuf, accents[j]);
1980             } else if (cx_tab[j] && cx_tab[j][c[1]>>5] &&
1981                 /* We have a composite mapping for this pair */
1982                        (*cx_tab[j][c[1]>>5])[c[1]&0x1f]) {
1983                 wmem_strbuf_append_unichar(strbuf, (*cx_tab[j][c[1]>>5])[c[1]&0x1f]);
1984             } else {
1985                 /* No mapping, just swap it around so the base
1986                  * character comes first.
1987                  */
1988                 wmem_strbuf_append_unichar(strbuf, c[1]);
1989                 wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1990             }
1991             c++; i++;
1992             continue;
1993         } else {
1994             wmem_strbuf_append_unichar(strbuf, t61_tab[*c]);
1995         }
1996     }
1997 
1998     return (guint8 *)wmem_strbuf_finalize(strbuf);
1999 }
2000 
2001 /*
2002  * Editor modelines  -  https://www.wireshark.org/tools/modelines.html
2003  *
2004  * Local variables:
2005  * c-basic-offset: 4
2006  * tab-width: 8
2007  * indent-tabs-mode: nil
2008  * End:
2009  *
2010  * vi: set shiftwidth=4 tabstop=8 expandtab:
2011  * :indentSize=4:tabSize=8:noTabs=true:
2012  */
2013