1 /* melder_textencoding.cpp
2  *
3  * Copyright (C) 2007-2019 Paul Boersma
4  *
5  * This code is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or (at
8  * your option) any later version.
9  *
10  * This code is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13  * See the GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this work. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "melder.h"
20 #include "../sys/Preferences.h"
21 #include "../kar/UnicodeData.h"
22 
23 #ifdef _WIN32
24 	#include <windows.h>
25 #endif
26 #if defined (macintosh)
27 	#include "macport_on.h"
28 	#include <CoreFoundation/CoreFoundation.h>
29 	#include "macport_off.h"
30 #endif
31 
/*
	File-local storage for the two text-encoding preferences.
	The fields are read and written by the Melder_set/get...Encoding functions below
	and are handed to the preferences system in Melder_textEncoding_prefs ().
*/
static struct {
	kMelder_textInputEncoding inputEncoding;   // used by Melder_8to32_inplace () when the caller passes UNDEFINED
	kMelder_textOutputEncoding outputEncoding;
} preferences;
36 
Melder_setInputEncoding(kMelder_textInputEncoding encoding)37 void Melder_setInputEncoding (kMelder_textInputEncoding encoding) { preferences. inputEncoding = encoding; }
Melder_getInputEncoding()38 kMelder_textInputEncoding Melder_getInputEncoding () { return preferences. inputEncoding; }
39 
Melder_setOutputEncoding(kMelder_textOutputEncoding encoding)40 void Melder_setOutputEncoding (kMelder_textOutputEncoding encoding) { preferences. outputEncoding = encoding; }
Melder_getOutputEncoding()41 kMelder_textOutputEncoding Melder_getOutputEncoding () { return preferences. outputEncoding; }
42 
/*
	Hands the two encoding fields of `preferences` to Preferences_addEnum,
	under the keys "TextEncoding.inputEncoding" and "TextEncoding.outputEncoding",
	each with the DEFAULT member of its enum as the last argument.
	NOTE(review): Preferences_addEnum receives a type name as its third argument,
	so it is presumably a macro; keep the argument spelling exactly as it is.
*/
void Melder_textEncoding_prefs () {
	Preferences_addEnum (U"TextEncoding.inputEncoding", & preferences. inputEncoding, kMelder_textInputEncoding, kMelder_textInputEncoding::DEFAULT);
	Preferences_addEnum (U"TextEncoding.outputEncoding", & preferences. outputEncoding, kMelder_textOutputEncoding, kMelder_textOutputEncoding::DEFAULT);
}
47 
Melder_isValidAscii(conststring32 text)48 bool Melder_isValidAscii (conststring32 text) {
49 	for (; *text != U'\0'; text ++) {
50 		if (*text > 127)
51 			return false;
52 	}
53 	return true;
54 }
55 
Melder_isEncodable(conststring32 text,int outputEncoding)56 bool Melder_isEncodable (conststring32 text, int outputEncoding) {
57 	switch (outputEncoding) {
58 		case kMelder_textOutputEncoding_ASCII: {
59 			for (; *text != U'\0'; text ++) {
60 				if (*text > 127)
61 					return false;
62 			}
63 			return true;
64 		} break;
65 		case kMelder_textOutputEncoding_ISO_LATIN1: {
66 			for (; *text != U'\0'; text ++) {
67 				if (*text > 255)
68 					return false;
69 			}
70 			return true;
71 		} break;
72 		case (int) kMelder_textOutputEncoding::UTF8:
73 		case (int) kMelder_textOutputEncoding::UTF16:
74 		case (int) kMelder_textOutputEncoding::ASCII_THEN_UTF16:
75 		case (int) kMelder_textOutputEncoding::ISO_LATIN1_THEN_UTF16: {
76 			return true;
77 		}
78 	}
79 	return false;
80 }
81 
/*
	Returns true if the null-terminated byte string `string` is well-formed UTF-8.

	Besides checking that every lead byte is followed by the right number of
	continuation bytes (0x80..0xBF), this applies the restrictions on the second byte
	that the Unicode Standard imposes on well-formed sequences:
		E0 must be followed by A0..BF   (otherwise the sequence is overlong),
		ED must be followed by 80..9F   (otherwise it encodes a surrogate, U+D800..U+DFFF),
		F0 must be followed by 90..BF   (otherwise the sequence is overlong),
		F4 must be followed by 80..8F   (otherwise the value exceeds U+10FFFF).
	Lead bytes 80..C1 and F5..FF are never valid.

	A sequence cut short by the terminating null byte is invalid;
	the scan never reads beyond that terminator.
*/
bool Melder_str8IsValidUtf8 (const char *string) {
	const unsigned char *p = (const unsigned char *) string;   // bytes must be compared as unsigned values
	while (*p != '\0') {
		const unsigned char lead = *p ++;
		if (lead <= 0x7F)
			continue;   // ASCII
		int numberOfTrailBytes;
		unsigned char lowestSecond = 0x80, highestSecond = 0xBF;   // allowed range for the byte right after `lead`
		if (lead <= 0xC1) {
			return false;   // stray continuation byte, or overlong lead byte C0/C1
		} else if (lead <= 0xDF) {
			numberOfTrailBytes = 1;
		} else if (lead <= 0xEF) {
			numberOfTrailBytes = 2;
			if (lead == 0xE0)
				lowestSecond = 0xA0;   // exclude overlong forms
			else if (lead == 0xED)
				highestSecond = 0x9F;   // exclude surrogates
		} else if (lead <= 0xF4) {
			numberOfTrailBytes = 3;
			if (lead == 0xF0)
				lowestSecond = 0x90;   // exclude overlong forms
			else if (lead == 0xF4)
				highestSecond = 0x8F;   // exclude values above U+10FFFF
		} else {
			return false;   // F5..FF never occur in UTF-8
		}
		if (*p < lowestSecond || *p > highestSecond)
			return false;   // also catches the terminating null byte
		p ++;
		for (int itrail = 2; itrail <= numberOfTrailBytes; itrail ++) {
			if ((*p & 0xC0) != 0x80)
				return false;   // also catches the terminating null byte
			p ++;
		}
	}
	return true;
}
110 
Melder_killReturns_inplace(char * text)111 integer Melder_killReturns_inplace (char *text) {
112 	const char *from;
113 	char *to;
114 	for (from = text, to = text; *from != '\0'; from ++, to ++) {
115 		if (*from == 13) {   // carriage return?
116 			if (from [1] == '\n') {   // followed by linefeed? Must be a Windows text
117 				from ++;   // ignore carriage return
118 				*to = '\n';   // copy linefeed
119 			} else {   // bare carriage return? Must be a Macintosh text
120 				*to = '\n';   // change to linefeed
121 			}
122 		} else {
123 			*to = *from;
124 		}
125 	}
126 	*to = '\0';   // closing null byte
127 	return to - text;
128 }
129 
130 template <class CHAR>
Melder_killReturns_inplaceCHAR(CHAR * text)131 integer Melder_killReturns_inplaceCHAR (CHAR *text) {
132 	const CHAR *from;
133 	CHAR *to;
134 	for (from = text, to = text; *from != '\0'; from ++, to ++) {
135 		if (*from == 13) {   // carriage return?
136 			if (from [1] == '\n' || from [1] == 0x0085 /* NextLine */) {   // followed by linefeed? Must be a Windows text
137 				from ++;   // ignore carriage return
138 				*to = '\n';   // copy linefeed
139 			} else {   // bare carriage return? Must be a Macintosh text
140 				*to = '\n';   // change to linefeed (10)
141 			}
142 		} else if (*from == 0x0085 /* NextLine */ || *from == 0x000C /* FormFeed */ ||
143 		    *from == UNICODE_LINE_SEPARATOR || *from == UNICODE_PARAGRAPH_SEPARATOR)
144 		{
145 			*to = '\n';
146 		} else {
147 			*to = *from;
148 		}
149 	}
150 	*to = '\0';   // closing null character
151 	return to - text;
152 }
Melder_killReturns_inplace(char32 * text)153 integer Melder_killReturns_inplace (char32 *text) {
154 	return Melder_killReturns_inplaceCHAR <char32> (text);
155 }
156 
str32len_utf8(conststring32 string,bool nativizeNewlines)157 size_t str32len_utf8 (conststring32 string, bool nativizeNewlines) {
158 	size_t length = 0;
159 	for (const char32 *p = & string [0]; *p != U'\0'; p ++) {
160 		char32 kar = *p;
161 		if (kar <= 0x00'007F) {
162 			#ifdef _WIN32
163 				if (nativizeNewlines && kar == U'\n') length ++;
164 			#else
165 				(void) nativizeNewlines;
166 			#endif
167 			length ++;
168 		} else if (kar <= 0x00'07FF) {
169 			length += 2;
170 		} else if (kar <= 0x00'FFFF) {
171 			length += 3;
172 		} else {
173 			Melder_assert (kar <= 0x10'FFFF);
174 			length += 4;
175 		}
176 	}
177 	return length;
178 }
179 
str32len_utf16(conststring32 string,bool nativizeNewlines)180 size_t str32len_utf16 (conststring32 string, bool nativizeNewlines) {
181 	size_t length = 0;
182 	for (const char32 *p = & string [0]; *p != U'\0'; p ++) {
183 		char32 kar = *p;
184 		if (kar <= 0x00'007F) {
185 			#ifdef _WIN32
186 				if (nativizeNewlines && kar == U'\n') length ++;
187 			#else
188 				(void) nativizeNewlines;
189 			#endif
190 			length ++;
191 		} else if (kar >= 0x01'0000) {
192 			length += 2;
193 		} else {
194 			length += 1;
195 		}
196 	}
197 	return length;
198 }
199 
Melder_peek8to32(conststring8 textA)200 conststring32 Melder_peek8to32 (conststring8 textA) {
201 	if (! textA)
202 		return nullptr;
203 	static MelderString buffers [19];
204 	static int ibuffer = 0;
205 	if (++ ibuffer == 11)
206 		ibuffer = 0;
207 	MelderString_empty (& buffers [ibuffer]);
208 	uinteger n = strlen (textA), i, j;
209 	for (i = 0, j = 0; i <= n; i ++) {
210 		char8 kar1 = (char8) textA [i];   // convert sign
211 		if (kar1 <= 0x7F) {
212 			MelderString_appendCharacter (& buffers [ibuffer],
213 				(char32) kar1);
214 		} else if (kar1 <= 0xC1) {
215 			MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
216 		} else if (kar1 <= 0xDF) {
217 			char8 kar2 = textA [++ i];
218 			if ((kar2 & 0xC0) != 0x80)
219 				MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
220 			MelderString_appendCharacter (& buffers [ibuffer],
221 				(char32) ((char32) ((char32) kar1 & 0x00'001F) << 6) |
222 						  (char32) ((char32) kar2 & 0x00'003F));
223 		} else if (kar1 <= 0xEF) {
224 			char8 kar2 = textA [++ i];
225 			if ((kar2 & 0xC0) != 0x80)
226 				MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
227 			char8 kar3 = textA [++ i];
228 			if ((kar3 & 0xC0) != 0x80)
229 				MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
230 			MelderString_appendCharacter (& buffers [ibuffer],
231 				(char32) ((char32) ((char32) kar1 & 0x00'000F) << 12) |
232 				(char32) ((char32) ((char32) kar2 & 0x00'003F) << 6) |
233 						  (char32) ((char32) kar3 & 0x00'003F));
234 		} else if (kar1 <= 0xF4) {
235 			char8 kar2 = (char8) textA [++ i];
236 			if ((kar2 & 0xC0) != 0x80)
237 				MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
238 			char8 kar3 = (char8) textA [++ i];
239 			if ((kar3 & 0xC0) != 0x80)
240 				MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
241 			char8 kar4 = (char8) textA [++ i];
242 			if ((kar4 & 0xC0) != 0x80)
243 				MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
244 			char32 character =
245 				(char32) ((char32) ((char32) kar1 & 0x00'0007) << 18) |
246 				(char32) ((char32) ((char32) kar2 & 0x00'003F) << 12) |
247 				(char32) ((char32) ((char32) kar3 & 0x00'003F) << 6) |
248 						  (char32) ((char32) kar4 & 0x00'003F);
249 			MelderString_appendCharacter (& buffers [ibuffer], character);
250 		} else {
251 			MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
252 		}
253 	}
254 	return buffers [ibuffer]. string;
255 }
256 
257 char32 Melder_decodeMacRoman [256] = {
258 	  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,
259 	 20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
260 	 40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
261 	 60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
262 	 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
263 	100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
264 	120, 121, 122, 123, 124, 125, 126, 127,
265 	UNICODE_LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS, UNICODE_LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE,
266 	UNICODE_LATIN_CAPITAL_LETTER_C_WITH_CEDILLA, UNICODE_LATIN_CAPITAL_LETTER_E_WITH_ACUTE,
267 	UNICODE_LATIN_CAPITAL_LETTER_N_WITH_TILDE, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS,
268 	UNICODE_LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS, UNICODE_LATIN_SMALL_LETTER_A_WITH_ACUTE,
269 	UNICODE_LATIN_SMALL_LETTER_A_WITH_GRAVE, UNICODE_LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX,
270 	UNICODE_LATIN_SMALL_LETTER_A_WITH_DIAERESIS, UNICODE_LATIN_SMALL_LETTER_A_WITH_TILDE,
271 	UNICODE_LATIN_SMALL_LETTER_A_WITH_RING_ABOVE, UNICODE_LATIN_SMALL_LETTER_C_WITH_CEDILLA,
272 	UNICODE_LATIN_SMALL_LETTER_E_WITH_ACUTE, UNICODE_LATIN_SMALL_LETTER_E_WITH_GRAVE,
273 
274 	UNICODE_LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX, UNICODE_LATIN_SMALL_LETTER_E_WITH_DIAERESIS,
275 	UNICODE_LATIN_SMALL_LETTER_I_WITH_ACUTE, UNICODE_LATIN_SMALL_LETTER_I_WITH_GRAVE,
276 	UNICODE_LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX, UNICODE_LATIN_SMALL_LETTER_I_WITH_DIAERESIS,
277 	UNICODE_LATIN_SMALL_LETTER_N_WITH_TILDE, UNICODE_LATIN_SMALL_LETTER_O_WITH_ACUTE,
278 	UNICODE_LATIN_SMALL_LETTER_O_WITH_GRAVE, UNICODE_LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX,
279 	UNICODE_LATIN_SMALL_LETTER_O_WITH_DIAERESIS, UNICODE_LATIN_SMALL_LETTER_O_WITH_TILDE,
280 	UNICODE_LATIN_SMALL_LETTER_U_WITH_ACUTE, UNICODE_LATIN_SMALL_LETTER_U_WITH_GRAVE,
281 	UNICODE_LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX, UNICODE_LATIN_SMALL_LETTER_U_WITH_DIAERESIS,
282 
283 	UNICODE_DAGGER, UNICODE_DEGREE_SIGN, UNICODE_CENT_SIGN, UNICODE_POUND_SIGN,
284 	UNICODE_SECTION_SIGN, UNICODE_BULLET, UNICODE_PILCROW_SIGN, UNICODE_LATIN_SMALL_LETTER_SHARP_S,
285 	UNICODE_REGISTERED_SIGN, UNICODE_COPYRIGHT_SIGN, UNICODE_TRADE_MARK_SIGN, UNICODE_ACUTE_ACCENT,
286 	UNICODE_DIAERESIS, UNICODE_NOT_EQUAL_TO,
287 	UNICODE_LATIN_CAPITAL_LETTER_AE, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_STROKE,
288 
289 	UNICODE_INFINITY, UNICODE_PLUS_MINUS_SIGN, UNICODE_LESS_THAN_OR_EQUAL_TO, UNICODE_GREATER_THAN_OR_EQUAL_TO,
290 	UNICODE_YEN_SIGN, UNICODE_MICRO_SIGN, UNICODE_PARTIAL_DIFFERENTIAL, UNICODE_N_ARY_SUMMATION,
291 	UNICODE_N_ARY_PRODUCT, UNICODE_GREEK_SMALL_LETTER_PI, UNICODE_INTEGRAL,
292 	UNICODE_FEMININE_ORDINAL_INDICATOR, UNICODE_MASCULINE_ORDINAL_INDICATOR,
293 	UNICODE_GREEK_CAPITAL_LETTER_OMEGA, UNICODE_LATIN_SMALL_LETTER_AE,
294 	UNICODE_LATIN_SMALL_LETTER_O_WITH_STROKE,
295 
296 	UNICODE_INVERTED_QUESTION_MARK, UNICODE_INVERTED_EXCLAMATION_MARK, UNICODE_NOT_SIGN, UNICODE_SQUARE_ROOT,
297 	UNICODE_LATIN_SMALL_LETTER_F_WITH_HOOK, UNICODE_ALMOST_EQUAL_TO, UNICODE_INCREMENT,
298 	UNICODE_LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK, UNICODE_RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK,
299 	UNICODE_HORIZONTAL_ELLIPSIS, UNICODE_NO_BREAK_SPACE, UNICODE_LATIN_CAPITAL_LETTER_A_WITH_GRAVE,
300 	UNICODE_LATIN_CAPITAL_LETTER_A_WITH_TILDE, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_TILDE,
301 	UNICODE_LATIN_CAPITAL_LIGATURE_OE, UNICODE_LATIN_SMALL_LIGATURE_OE,
302 
303 	UNICODE_EN_DASH, UNICODE_EM_DASH, UNICODE_LEFT_DOUBLE_QUOTATION_MARK, UNICODE_RIGHT_DOUBLE_QUOTATION_MARK,
304 	UNICODE_LEFT_SINGLE_QUOTATION_MARK, UNICODE_RIGHT_SINGLE_QUOTATION_MARK, UNICODE_DIVISION_SIGN, UNICODE_LOZENGE,
305 	UNICODE_LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, UNICODE_LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS,
306 	UNICODE_FRACTION_SLASH, UNICODE_EURO_SIGN,
307 	UNICODE_SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK, UNICODE_SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK,
308 	UNICODE_LATIN_SMALL_LIGATURE_FI, UNICODE_LATIN_SMALL_LIGATURE_FL,
309 
310 	UNICODE_DOUBLE_DAGGER, UNICODE_MIDDLE_DOT,
311 	UNICODE_SINGLE_LOW_9_QUOTATION_MARK, UNICODE_DOUBLE_LOW_9_QUOTATION_MARK, UNICODE_PER_MILLE_SIGN,
312 	UNICODE_LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX, UNICODE_LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX,
313 	UNICODE_LATIN_CAPITAL_LETTER_A_WITH_ACUTE, UNICODE_LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS,
314 	UNICODE_LATIN_CAPITAL_LETTER_E_WITH_GRAVE, UNICODE_LATIN_CAPITAL_LETTER_I_WITH_ACUTE,
315 	UNICODE_LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX, UNICODE_LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS,
316 	UNICODE_LATIN_CAPITAL_LETTER_I_WITH_GRAVE, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_ACUTE,
317 	UNICODE_LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX,
318 
319 	0xf8ff /* Apple logo */, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_GRAVE, UNICODE_LATIN_CAPITAL_LETTER_U_WITH_ACUTE,
320 	UNICODE_LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX, UNICODE_LATIN_CAPITAL_LETTER_U_WITH_GRAVE,
321 	UNICODE_LATIN_SMALL_LETTER_DOTLESS_I, UNICODE_MODIFIER_LETTER_CIRCUMFLEX_ACCENT, UNICODE_SMALL_TILDE,
322 	UNICODE_MACRON, UNICODE_BREVE, UNICODE_DOT_ABOVE, UNICODE_RING_ABOVE, UNICODE_CEDILLA,
323 	UNICODE_DOUBLE_ACUTE_ACCENT, UNICODE_OGONEK, UNICODE_CARON };
324 
325 char32 Melder_decodeWindowsLatin1 [256] = {
326 	  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,
327 	 20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
328 	 40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
329 	 60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
330 	 80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
331 	100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
332 	120, 121, 122, 123, 124, 125, 126, 127,
333 	UNICODE_EURO_SIGN, 129, UNICODE_SINGLE_LOW_9_QUOTATION_MARK, UNICODE_LATIN_SMALL_LETTER_F_WITH_HOOK,
334 	UNICODE_DOUBLE_LOW_9_QUOTATION_MARK, UNICODE_HORIZONTAL_ELLIPSIS, UNICODE_DAGGER, UNICODE_DOUBLE_DAGGER,
335 	UNICODE_MODIFIER_LETTER_CIRCUMFLEX_ACCENT, UNICODE_PER_MILLE_SIGN, UNICODE_LATIN_CAPITAL_LETTER_S_WITH_CARON,
336 	UNICODE_SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK, UNICODE_LATIN_CAPITAL_LIGATURE_OE, 141,
337 	UNICODE_LATIN_CAPITAL_LETTER_Z_WITH_CARON, 143,
338 	144, UNICODE_LEFT_SINGLE_QUOTATION_MARK, UNICODE_RIGHT_SINGLE_QUOTATION_MARK, UNICODE_LEFT_DOUBLE_QUOTATION_MARK,
339 	UNICODE_RIGHT_DOUBLE_QUOTATION_MARK, UNICODE_BULLET, UNICODE_EN_DASH, UNICODE_EM_DASH,
340 	UNICODE_SMALL_TILDE, UNICODE_TRADE_MARK_SIGN, UNICODE_LATIN_SMALL_LETTER_S_WITH_CARON,
341 	UNICODE_SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK, UNICODE_LATIN_SMALL_LIGATURE_OE, 157,
342 	UNICODE_LATIN_SMALL_LETTER_Z_WITH_CARON, UNICODE_LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS,
343 	160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
344 	180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
345 	200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
346 	220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
347 	240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 };
348 
Melder_8to32_inplace(conststring8 string8,mutablestring32 string32,kMelder_textInputEncoding inputEncoding)349 void Melder_8to32_inplace (conststring8 string8, mutablestring32 string32, kMelder_textInputEncoding inputEncoding) {
350 	char32 *q = & string32 [0];
351 	if (inputEncoding == kMelder_textInputEncoding::UNDEFINED) {
352 		inputEncoding = preferences. inputEncoding;
353 		/*
354 		 * In case the preferences weren't initialized yet, use the platform defaults:
355 		 */
356 		if (inputEncoding == kMelder_textInputEncoding::UNDEFINED) {
357 			#if defined (macintosh)
358 				inputEncoding = kMelder_textInputEncoding::UTF8_THEN_MACROMAN;
359 			#elif defined (_WIN32)
360 				inputEncoding = kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1;
361 			#else
362 				inputEncoding = kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1;
363 			#endif
364 		}
365 	}
366 	if (inputEncoding == kMelder_textInputEncoding::UTF8 ||
367 		inputEncoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1 ||
368 		inputEncoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1 ||
369 		inputEncoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN)
370 	{
371 		if (Melder_str8IsValidUtf8 (string8)) {
372 			inputEncoding = kMelder_textInputEncoding::UTF8;
373 		} else if (inputEncoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1) {
374 			inputEncoding = kMelder_textInputEncoding::ISO_LATIN1;
375 		} else if (inputEncoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1) {
376 			inputEncoding = kMelder_textInputEncoding::WINDOWS_LATIN1;
377 		} else if (inputEncoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN) {
378 			inputEncoding = kMelder_textInputEncoding::MACROMAN;
379 		} else {
380 			Melder_assert (inputEncoding == kMelder_textInputEncoding::UTF8);
381 			Melder_throw (U"Text is not valid UTF-8; please try a different text input encoding.");
382 		}
383 	}
384 	const char8 *p = (const char8 *) & string8 [0];
385 	if (inputEncoding == kMelder_textInputEncoding::UTF8) {
386 		while (*p != '\0') {
387 			char32 kar1 = * p ++;   // convert up without sign extension
388 			if (kar1 <= 0x00'007F) {
389 				*q ++ = kar1;
390 			} else if (kar1 <= 0x00'00DF) {
391 				char32 kar2 = * p ++;   // convert up without sign extension
392 				*q ++ = ((kar1 & 0x00'001F) << 6) | (kar2 & 0x00'003F);
393 			} else if (kar1 <= 0x00'00EF) {
394 				char32 kar2 = * p ++, kar3 = * p ++;   // convert up without sign extension
395 				*q ++ = ((kar1 & 0x00'000F) << 12) | ((kar2 & 0x00'003F) << 6) | (kar3 & 0x00'003F);
396 			} else if (kar1 <= 0x00'00F4) {
397 				char32 kar2 = *p ++, kar3 = *p ++, kar4 = *p ++;   // convert up without sign extension
398 				char32 kar = ((kar1 & 0x00'0007) << 18) | ((kar2 & 0x00'003F) << 12) | ((kar3 & 0x00'003F) << 6) | (kar4 & 0x00'003F);
399 				*q ++ = kar;
400 			}
401 		}
402 	} else if (inputEncoding == kMelder_textInputEncoding::ISO_LATIN1) {
403 		while (*p != '\0')
404 			*q ++ = *p ++;
405 	} else if (inputEncoding == kMelder_textInputEncoding::WINDOWS_LATIN1) {
406 		while (*p != '\0')
407 			*q ++ = Melder_decodeWindowsLatin1 [*p ++];
408 	} else if (inputEncoding == kMelder_textInputEncoding::MACROMAN) {
409 		while (*p != '\0')
410 			*q ++ = Melder_decodeMacRoman [*p ++];
411 	} else if (inputEncoding != kMelder_textInputEncoding::UTF8) {
412 		Melder_fatal (U"Unknown text input encoding ", (int) inputEncoding, U".");
413 	}
414 	*q = U'\0';   // closing null character
415 	(void) Melder_killReturns_inplaceCHAR <char32> (string32);
416 }
417 
Melder_8to32(const char * string,kMelder_textInputEncoding inputEncoding)418 autostring32 Melder_8to32 (const char *string, kMelder_textInputEncoding inputEncoding) {
419 	if (! string)
420 		return autostring32();
421 	autostring32 result (strlen (string));
422 	Melder_8to32_inplace (string, result.get(), inputEncoding);
423 	return result;
424 }
425 
Melder_8to32(const char * string)426 autostring32 Melder_8to32 (const char *string) {
427 	if (! string)
428 		return autostring32();
429 	autostring32 result (strlen (string));
430 	Melder_8to32_inplace (string, result.get(), kMelder_textInputEncoding::UTF8);
431 	return result;
432 }
433 
Melder_peek16to32(conststring16 text)434 conststring32 Melder_peek16to32 (conststring16 text) {
435 	if (! text) return nullptr;
436 	static MelderString buffers [19];
437 	static int bufferNumber = 0;
438 	if (++ bufferNumber == 19)
439 		bufferNumber = 0;
440 	MelderString_empty (& buffers [bufferNumber]);
441 	for (;;) {
442 		char16 kar1 = *text ++;
443 		if (kar1 == u'\0') return buffers [bufferNumber]. string;
444 		if (kar1 < 0xD800) {
445 			MelderString_appendCharacter (& buffers [bufferNumber], (char32) kar1);   // convert up without sign extension
446 		} else if (kar1 < 0xDC00) {
447 			char16 kar2 = *text ++;
448 			if (kar2 >= 0xDC00 && kar2 <= 0xDFFF) {
449 				MelderString_appendCharacter (& buffers [bufferNumber],
450 					(char32) (0x01'0000 +
451 						(char32) (((char32) kar1 & 0x00'03FF) << 10) +
452 						(char32)  ((char32) kar2 & 0x00'03FF)));
453 			} else {
454 				MelderString_appendCharacter (& buffers [bufferNumber], UNICODE_REPLACEMENT_CHARACTER);
455 			}
456 		} else if (kar1 < 0xE000) {
457 			MelderString_appendCharacter (& buffers [bufferNumber], UNICODE_REPLACEMENT_CHARACTER);
458 		} else {
459 			MelderString_appendCharacter (& buffers [bufferNumber], (char32) kar1);   // convert up without sign extension
460 		}
461 	}
462 }
463 
Melder_16to32(conststring16 text)464 autostring32 Melder_16to32 (conststring16 text) {
465 	return Melder_dup (Melder_peek16to32 (text));
466 }
467 
Melder_32to8_inplace(conststring32 string,mutablestring8 utf8)468 void Melder_32to8_inplace (conststring32 string, mutablestring8 utf8) {;
469 	Melder_assert (utf8);   // and unassertable: utf8 should be a long enough buffer
470 	char8 *to = (char8 *) & utf8 [0];
471 	if (string) {
472 		const char32 *p = & string [0];
473 		while (*p != U'\0') {
474 			const char32 kar = *p ++;
475 			if (kar <= 0x00'007F) {   // 7 bits
476 				#ifdef _WIN32
477 					if (kar == U'\n')
478 						*to ++ = 13;
479 				#endif
480 				*to ++ = (char8) kar;   // guarded truncation
481 			} else if (kar <= 0x00'07FF) {   // 11 bits
482 				*to ++ = (char8) (0x00'00C0 | (kar >> 6));   // the upper 5 bits yield a number between 0xC4 and 0xDF
483 				*to ++ = (char8) (0x00'0080 | (kar & 0x00'003F));   // the lower 6 bits yield a number between 0x80 and 0xBF
484 			} else if (kar <= 0x00'FFFF) {   // 16 bits
485 				*to ++ = (char8) (0x00'00E0 | (kar >> 12));   // the upper 4 bits yield a number between 0xE0 and 0xEF
486 				*to ++ = (char8) (0x00'0080 | ((kar >> 6) & 0x00'003F));
487 				*to ++ = (char8) (0x00'0080 | (kar & 0x00'003F));
488 			} else {   // unguarded truncation to 21 bits
489 				*to ++ = (char8) (0x00'00F0 | (kar >> 18));   // the upper 3 bits yield a number between 0xF0 and 0xF4 (0x10FFFF >> 18 == 4)
490 				*to ++ = (char8) (0x00'0080 | ((kar >> 12) & 0x00'003F));   // the next 6 bits
491 				*to ++ = (char8) (0x00'0080 | ((kar >> 6) & 0x00'003F));   // the third 6 bits
492 				*to ++ = (char8) (0x00'0080 | (kar & 0x00'003F));   // the lower 6 bits
493 			}
494 		}
495 	}
496 	*to = '\0';
497 }
498 
Melder_peek32to8(conststring32 text)499 conststring8 Melder_peek32to8 (conststring32 text) {
500 	if (! text)
501 		return nullptr;
502 	static mutablestring8 buffers [19] { nullptr };
503 	static int64 bufferSizes [19] { 0 };
504 	static int bufferNumber = 0;
505 	if (++ bufferNumber == 19)
506 		bufferNumber = 0;
507 	constexpr int64 maximumNumberOfUTF8bytesPerUTF32point = 4;   // becausse we use only the lower 21 bits
508 	const int64 numberOfUTF32points = str32len (text);
509 	const int64 maximumNumberOfBytesNeeded = numberOfUTF32points * maximumNumberOfUTF8bytesPerUTF32point + 1;
510 	if ((bufferSizes [bufferNumber] - maximumNumberOfBytesNeeded) * (int64) sizeof (char) >= 10'000) {
511 		Melder_free (buffers [bufferNumber]);
512 		bufferSizes [bufferNumber] = 0;
513 	}
514 	if (maximumNumberOfBytesNeeded > bufferSizes [bufferNumber]) {
515 		const int64 newBufferSize = (int64) floor (maximumNumberOfBytesNeeded * 1.61803) + 100;
516 		buffers [bufferNumber] = (char *) Melder_realloc_f (buffers [bufferNumber], newBufferSize * (int64) sizeof (char));
517 		bufferSizes [bufferNumber] = newBufferSize;
518 	}
519 	Melder_32to8_inplace (text, buffers [bufferNumber]);
520 	return buffers [bufferNumber];
521 }
522 
Melder_32to8(conststring32 string)523 autostring8 Melder_32to8 (conststring32 string) {
524 	if (! string)
525 		return autostring8();
526 	autostring8 result (str32len_utf8 (string, true));
527 	Melder_32to8_inplace (string, result.get());
528 	return result;
529 }
530 
Melder_peek32to16(conststring32 text,bool nativizeNewlines)531 conststring16 Melder_peek32to16 (conststring32 text, bool nativizeNewlines) {
532 	if (! text)
533 		return nullptr;
534 	static MelderString16 buffers [19] { };
535 	static int bufferNumber = 0;
536 	if (++ bufferNumber == 19)
537 		bufferNumber = 0;
538 	MelderString16_empty (& buffers [bufferNumber]);
539 	int64 n = str32len (text);
540 	if (nativizeNewlines) {
541 		for (int64 i = 0; i <= n; i ++) {
542 			#ifdef _WIN32
543 				if (text [i] == U'\n')
544 					MelderString16_appendCharacter (& buffers [bufferNumber], (char32) 13);
545 			#endif
546 			MelderString16_appendCharacter (& buffers [bufferNumber], text [i]);
547 		}
548 	} else {
549 		for (int64 i = 0; i <= n; i ++)
550 			MelderString16_appendCharacter (& buffers [bufferNumber], text [i]);
551 	}
552 	return buffers [bufferNumber]. string;
553 }
Melder_peek32to16(conststring32 text)554 conststring16 Melder_peek32to16 (conststring32 text) {
555 	return Melder_peek32to16 (text, false);
556 }
557 
Melder_32to16(conststring32 text)558 autostring16 Melder_32to16 (conststring32 text) {
559 	conststring16 text16 = Melder_peek32to16 (text);
560 	int64 length = str16len (text16);
561 	autostring16 result (length);
562 	str16cpy (result.get(), text16);
563 	return result;
564 }
565 
566 #if defined (_WIN32)
Melder_32toW(conststring32 text)567 autostringW Melder_32toW (conststring32 text) {
568 	conststringW textW = Melder_peek32toW (text);
569 	int64 length = str16len ((conststring16) textW);
570 	autostringW result (length);
571 	str16cpy ((mutablestring16) result.get(), (conststring16) textW);
572 	return result;
573 }
Melder_peek32toW_fileSystem(conststring32 string)574 conststringW Melder_peek32toW_fileSystem (conststring32 string) {
575 	static wchar_t buffer [1 + kMelder_MAXPATH];
576 	//NormalizeStringW (NormalizationKC, -1, Melder_peek32toW (string), 1 + kMelder_MAXPATH, buffer);
577 	FoldStringW (MAP_PRECOMPOSED, Melder_peek32toW (string), -1, buffer, 1 + kMelder_MAXPATH);   // this works even on XP
578 	return buffer;
579 }
Melder_32toW_fileSystem(conststring32 text)580 autostringW Melder_32toW_fileSystem (conststring32 text) {
581 	conststringW textW = Melder_peek32toW_fileSystem (text);
582 	int64 length = str16len ((conststring16) textW);
583 	autostringW result (length);
584 	str16cpy ((mutablestring16) result.get(), (conststring16) textW);
585 	return result;
586 }
587 #endif
588 
Melder_32to8_fileSystem_inplace(conststring32 string,char * utf8)589 void Melder_32to8_fileSystem_inplace (conststring32 string, char *utf8) {
590 	#if defined (macintosh)
591 		/*
592 			On the Mac, the POSIX path name is stored in canonically decomposed UTF-8 encoding.
593 			The path is probably in precomposed UTF-32.
594 			So we first convert to UTF-16, then turn into CFString, then decompose, then convert to UTF-8.
595 		*/
596 		UniChar unipath [kMelder_MAXPATH+1];
597 		const int64 n = str32len (string);
598 		int n_utf16 = 0;
599 		for (int64 i = 0; i < n; i ++) {
600 			char32 kar = (char32) string [i];   // change sign (bit 32 is never used)
601 			if (kar <= 0x00'FFFF) {
602 				unipath [n_utf16 ++] = (UniChar) kar;   // including null byte; guarded truncation
603 			} else if (kar <= 0x10'FFFF) {
604 				kar -= 0x01'0000;
605 				unipath [n_utf16 ++] = (UniChar) (0x00'D800 | (kar >> 10));   // correct truncation, because UTF-32 has fewer than 27 bits (in fact it has 21 bits)
606 				unipath [n_utf16 ++] = (UniChar) (0x00'DC00 | (kar & 0x00'03FF));
607 			} else {
608 				unipath [n_utf16 ++] = UNICODE_REPLACEMENT_CHARACTER;
609 			}
610 		}
611 		unipath [n_utf16] = u'\0';
612 		CFStringRef cfpath = CFStringCreateWithCharacters (nullptr, unipath, n_utf16);
613 		CFMutableStringRef cfpath2 = CFStringCreateMutableCopy (nullptr, 0, cfpath);
614 		CFRelease (cfpath);
615 		CFStringNormalize (cfpath2, kCFStringNormalizationFormD);   // Mac requires decomposed characters
616 		CFStringGetCString (cfpath2, (char *) utf8, kMelder_MAXPATH+1, kCFStringEncodingUTF8);   // Mac POSIX requires UTF-8
617 		CFRelease (cfpath2);
618 	#elif defined (UNIX) || defined (__CYGWIN__)
619 		Melder_32to8_inplace (string, utf8);
620 	#elif defined (_WIN32)
621 		const int n = str32len (string);
622 		int j = 0;
623 		for (int i = 0; i < n; i ++)
624 			utf8 [j ++] = ( string [i] <= 255 ? string [i] : '?' );   // the usual replacement on Windows
625 		utf8 [j] = '\0';
626 	#else
627 		//#error Unsupported platform.
628 	#endif
629 }
Melder_peek32to8_fileSystem(conststring32 string)630 conststring8 Melder_peek32to8_fileSystem (conststring32 string) {
631 	static char buffer [1 + kMelder_MAXPATH];
632 	Melder_32to8_fileSystem_inplace (string, buffer);
633 	return buffer;
634 }
635 
636 #if defined (macintosh)
Melder_peek32toCfstring(conststring32 text)637 const void * Melder_peek32toCfstring (conststring32 text) {
638 	if (! text)
639 		return nullptr;
640 	static CFStringRef cfString [11];
641 	static int icfString = 0;
642 	if (++ icfString == 11)
643 		icfString = 0;
644 	if (cfString [icfString])
645 		CFRelease (cfString [icfString]);
646 	cfString [icfString] = CFStringCreateWithCString (nullptr, (const char *) Melder_peek32to8 (text), kCFStringEncodingUTF8);
647 	return cfString [icfString];
648 }
649 #endif
650 
651 /* End of file melder_textencoding.cpp */
652