/* melder_textencoding.cpp
 *
 * Copyright (C) 2007-2019 Paul Boersma
 *
 * This code is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This code is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this work. If not, see <http://www.gnu.org/licenses/>.
 */
18
19 #include "melder.h"
20 #include "../sys/Preferences.h"
21 #include "../kar/UnicodeData.h"
22
23 #ifdef _WIN32
24 #include <windows.h>
25 #endif
26 #if defined (macintosh)
27 #include "macport_on.h"
28 #include <CoreFoundation/CoreFoundation.h>
29 #include "macport_off.h"
30 #endif
31
32 static struct {
33 kMelder_textInputEncoding inputEncoding;
34 kMelder_textOutputEncoding outputEncoding;
35 } preferences;
36
Melder_setInputEncoding(kMelder_textInputEncoding encoding)37 void Melder_setInputEncoding (kMelder_textInputEncoding encoding) { preferences. inputEncoding = encoding; }
Melder_getInputEncoding()38 kMelder_textInputEncoding Melder_getInputEncoding () { return preferences. inputEncoding; }
39
Melder_setOutputEncoding(kMelder_textOutputEncoding encoding)40 void Melder_setOutputEncoding (kMelder_textOutputEncoding encoding) { preferences. outputEncoding = encoding; }
Melder_getOutputEncoding()41 kMelder_textOutputEncoding Melder_getOutputEncoding () { return preferences. outputEncoding; }
42
Melder_textEncoding_prefs()43 void Melder_textEncoding_prefs () {
44 Preferences_addEnum (U"TextEncoding.inputEncoding", & preferences. inputEncoding, kMelder_textInputEncoding, kMelder_textInputEncoding::DEFAULT);
45 Preferences_addEnum (U"TextEncoding.outputEncoding", & preferences. outputEncoding, kMelder_textOutputEncoding, kMelder_textOutputEncoding::DEFAULT);
46 }
47
Melder_isValidAscii(conststring32 text)48 bool Melder_isValidAscii (conststring32 text) {
49 for (; *text != U'\0'; text ++) {
50 if (*text > 127)
51 return false;
52 }
53 return true;
54 }
55
Melder_isEncodable(conststring32 text,int outputEncoding)56 bool Melder_isEncodable (conststring32 text, int outputEncoding) {
57 switch (outputEncoding) {
58 case kMelder_textOutputEncoding_ASCII: {
59 for (; *text != U'\0'; text ++) {
60 if (*text > 127)
61 return false;
62 }
63 return true;
64 } break;
65 case kMelder_textOutputEncoding_ISO_LATIN1: {
66 for (; *text != U'\0'; text ++) {
67 if (*text > 255)
68 return false;
69 }
70 return true;
71 } break;
72 case (int) kMelder_textOutputEncoding::UTF8:
73 case (int) kMelder_textOutputEncoding::UTF16:
74 case (int) kMelder_textOutputEncoding::ASCII_THEN_UTF16:
75 case (int) kMelder_textOutputEncoding::ISO_LATIN1_THEN_UTF16: {
76 return true;
77 }
78 }
79 return false;
80 }
81
/*
	Return true if the null-terminated byte string `string` is well-formed UTF-8.

	Besides checking that every lead byte is followed by the right number of
	continuation bytes (0x80..0xBF), this rejects the sequences that have the
	right shape but do not encode a Unicode scalar value:
	- overlong forms (lead 0xC0 or 0xC1; 0xE0 followed by 0x80..0x9F; 0xF0 followed by 0x80..0x8F);
	- UTF-16 surrogates U+D800..U+DFFF (0xED followed by 0xA0..0xBF);
	- values above U+10FFFF (0xF4 followed by 0x90..0xBF; lead bytes 0xF5..0xFF).
	A sequence that is cut off by the end of the string is rejected as well.
	An empty string is valid.
*/
bool Melder_str8IsValidUtf8 (const char *string) {
	const unsigned char *p = (const unsigned char *) & string [0];
	while (*p != '\0') {
		const unsigned char lead = *p ++;
		if (lead <= 0x7F)
			continue;   // plain ASCII
		/*
			Determine how many continuation bytes should follow,
			and which values are allowed for the first of them.
		*/
		int numberOfContinuationBytes;
		unsigned char lowestSecondByte = 0x80, highestSecondByte = 0xBF;
		if (lead <= 0xC1) {
			return false;   // a stray continuation byte (0x80..0xBF), or an overlong two-byte lead (0xC0, 0xC1)
		} else if (lead <= 0xDF) {
			numberOfContinuationBytes = 1;
		} else if (lead <= 0xEF) {
			numberOfContinuationBytes = 2;
			if (lead == 0xE0)
				lowestSecondByte = 0xA0;   // anything lower would be an overlong form of U+0000..U+07FF
			else if (lead == 0xED)
				highestSecondByte = 0x9F;   // anything higher would be a surrogate (U+D800..U+DFFF)
		} else if (lead <= 0xF4) {
			numberOfContinuationBytes = 3;
			if (lead == 0xF0)
				lowestSecondByte = 0x90;   // anything lower would be an overlong form of U+0000..U+FFFF
			else if (lead == 0xF4)
				highestSecondByte = 0x8F;   // anything higher would lie above U+10FFFF
		} else {
			return false;   // 0xF5..0xFF never occur in UTF-8
		}
		/*
			The terminating null byte fails each of the checks below,
			so we return before we could ever step beyond the end of the string.
		*/
		if (*p < lowestSecondByte || *p > highestSecondByte)
			return false;
		p ++;
		for (int icontinuation = 2; icontinuation <= numberOfContinuationBytes; icontinuation ++) {
			if ((*p & 0xC0) != 0x80)
				return false;
			p ++;
		}
	}
	return true;
}
110
Melder_killReturns_inplace(char * text)111 integer Melder_killReturns_inplace (char *text) {
112 const char *from;
113 char *to;
114 for (from = text, to = text; *from != '\0'; from ++, to ++) {
115 if (*from == 13) { // carriage return?
116 if (from [1] == '\n') { // followed by linefeed? Must be a Windows text
117 from ++; // ignore carriage return
118 *to = '\n'; // copy linefeed
119 } else { // bare carriage return? Must be a Macintosh text
120 *to = '\n'; // change to linefeed
121 }
122 } else {
123 *to = *from;
124 }
125 }
126 *to = '\0'; // closing null byte
127 return to - text;
128 }
129
130 template <class CHAR>
Melder_killReturns_inplaceCHAR(CHAR * text)131 integer Melder_killReturns_inplaceCHAR (CHAR *text) {
132 const CHAR *from;
133 CHAR *to;
134 for (from = text, to = text; *from != '\0'; from ++, to ++) {
135 if (*from == 13) { // carriage return?
136 if (from [1] == '\n' || from [1] == 0x0085 /* NextLine */) { // followed by linefeed? Must be a Windows text
137 from ++; // ignore carriage return
138 *to = '\n'; // copy linefeed
139 } else { // bare carriage return? Must be a Macintosh text
140 *to = '\n'; // change to linefeed (10)
141 }
142 } else if (*from == 0x0085 /* NextLine */ || *from == 0x000C /* FormFeed */ ||
143 *from == UNICODE_LINE_SEPARATOR || *from == UNICODE_PARAGRAPH_SEPARATOR)
144 {
145 *to = '\n';
146 } else {
147 *to = *from;
148 }
149 }
150 *to = '\0'; // closing null character
151 return to - text;
152 }
Melder_killReturns_inplace(char32 * text)153 integer Melder_killReturns_inplace (char32 *text) {
154 return Melder_killReturns_inplaceCHAR <char32> (text);
155 }
156
str32len_utf8(conststring32 string,bool nativizeNewlines)157 size_t str32len_utf8 (conststring32 string, bool nativizeNewlines) {
158 size_t length = 0;
159 for (const char32 *p = & string [0]; *p != U'\0'; p ++) {
160 char32 kar = *p;
161 if (kar <= 0x00'007F) {
162 #ifdef _WIN32
163 if (nativizeNewlines && kar == U'\n') length ++;
164 #else
165 (void) nativizeNewlines;
166 #endif
167 length ++;
168 } else if (kar <= 0x00'07FF) {
169 length += 2;
170 } else if (kar <= 0x00'FFFF) {
171 length += 3;
172 } else {
173 Melder_assert (kar <= 0x10'FFFF);
174 length += 4;
175 }
176 }
177 return length;
178 }
179
str32len_utf16(conststring32 string,bool nativizeNewlines)180 size_t str32len_utf16 (conststring32 string, bool nativizeNewlines) {
181 size_t length = 0;
182 for (const char32 *p = & string [0]; *p != U'\0'; p ++) {
183 char32 kar = *p;
184 if (kar <= 0x00'007F) {
185 #ifdef _WIN32
186 if (nativizeNewlines && kar == U'\n') length ++;
187 #else
188 (void) nativizeNewlines;
189 #endif
190 length ++;
191 } else if (kar >= 0x01'0000) {
192 length += 2;
193 } else {
194 length += 1;
195 }
196 }
197 return length;
198 }
199
Melder_peek8to32(conststring8 textA)200 conststring32 Melder_peek8to32 (conststring8 textA) {
201 if (! textA)
202 return nullptr;
203 static MelderString buffers [19];
204 static int ibuffer = 0;
205 if (++ ibuffer == 11)
206 ibuffer = 0;
207 MelderString_empty (& buffers [ibuffer]);
208 uinteger n = strlen (textA), i, j;
209 for (i = 0, j = 0; i <= n; i ++) {
210 char8 kar1 = (char8) textA [i]; // convert sign
211 if (kar1 <= 0x7F) {
212 MelderString_appendCharacter (& buffers [ibuffer],
213 (char32) kar1);
214 } else if (kar1 <= 0xC1) {
215 MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
216 } else if (kar1 <= 0xDF) {
217 char8 kar2 = textA [++ i];
218 if ((kar2 & 0xC0) != 0x80)
219 MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
220 MelderString_appendCharacter (& buffers [ibuffer],
221 (char32) ((char32) ((char32) kar1 & 0x00'001F) << 6) |
222 (char32) ((char32) kar2 & 0x00'003F));
223 } else if (kar1 <= 0xEF) {
224 char8 kar2 = textA [++ i];
225 if ((kar2 & 0xC0) != 0x80)
226 MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
227 char8 kar3 = textA [++ i];
228 if ((kar3 & 0xC0) != 0x80)
229 MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
230 MelderString_appendCharacter (& buffers [ibuffer],
231 (char32) ((char32) ((char32) kar1 & 0x00'000F) << 12) |
232 (char32) ((char32) ((char32) kar2 & 0x00'003F) << 6) |
233 (char32) ((char32) kar3 & 0x00'003F));
234 } else if (kar1 <= 0xF4) {
235 char8 kar2 = (char8) textA [++ i];
236 if ((kar2 & 0xC0) != 0x80)
237 MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
238 char8 kar3 = (char8) textA [++ i];
239 if ((kar3 & 0xC0) != 0x80)
240 MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
241 char8 kar4 = (char8) textA [++ i];
242 if ((kar4 & 0xC0) != 0x80)
243 MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
244 char32 character =
245 (char32) ((char32) ((char32) kar1 & 0x00'0007) << 18) |
246 (char32) ((char32) ((char32) kar2 & 0x00'003F) << 12) |
247 (char32) ((char32) ((char32) kar3 & 0x00'003F) << 6) |
248 (char32) ((char32) kar4 & 0x00'003F);
249 MelderString_appendCharacter (& buffers [ibuffer], character);
250 } else {
251 MelderString_appendCharacter (& buffers [ibuffer], UNICODE_REPLACEMENT_CHARACTER);
252 }
253 }
254 return buffers [ibuffer]. string;
255 }
256
257 char32 Melder_decodeMacRoman [256] = {
258 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
259 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
260 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
261 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
262 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
263 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
264 120, 121, 122, 123, 124, 125, 126, 127,
265 UNICODE_LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS, UNICODE_LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE,
266 UNICODE_LATIN_CAPITAL_LETTER_C_WITH_CEDILLA, UNICODE_LATIN_CAPITAL_LETTER_E_WITH_ACUTE,
267 UNICODE_LATIN_CAPITAL_LETTER_N_WITH_TILDE, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS,
268 UNICODE_LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS, UNICODE_LATIN_SMALL_LETTER_A_WITH_ACUTE,
269 UNICODE_LATIN_SMALL_LETTER_A_WITH_GRAVE, UNICODE_LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX,
270 UNICODE_LATIN_SMALL_LETTER_A_WITH_DIAERESIS, UNICODE_LATIN_SMALL_LETTER_A_WITH_TILDE,
271 UNICODE_LATIN_SMALL_LETTER_A_WITH_RING_ABOVE, UNICODE_LATIN_SMALL_LETTER_C_WITH_CEDILLA,
272 UNICODE_LATIN_SMALL_LETTER_E_WITH_ACUTE, UNICODE_LATIN_SMALL_LETTER_E_WITH_GRAVE,
273
274 UNICODE_LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX, UNICODE_LATIN_SMALL_LETTER_E_WITH_DIAERESIS,
275 UNICODE_LATIN_SMALL_LETTER_I_WITH_ACUTE, UNICODE_LATIN_SMALL_LETTER_I_WITH_GRAVE,
276 UNICODE_LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX, UNICODE_LATIN_SMALL_LETTER_I_WITH_DIAERESIS,
277 UNICODE_LATIN_SMALL_LETTER_N_WITH_TILDE, UNICODE_LATIN_SMALL_LETTER_O_WITH_ACUTE,
278 UNICODE_LATIN_SMALL_LETTER_O_WITH_GRAVE, UNICODE_LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX,
279 UNICODE_LATIN_SMALL_LETTER_O_WITH_DIAERESIS, UNICODE_LATIN_SMALL_LETTER_O_WITH_TILDE,
280 UNICODE_LATIN_SMALL_LETTER_U_WITH_ACUTE, UNICODE_LATIN_SMALL_LETTER_U_WITH_GRAVE,
281 UNICODE_LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX, UNICODE_LATIN_SMALL_LETTER_U_WITH_DIAERESIS,
282
283 UNICODE_DAGGER, UNICODE_DEGREE_SIGN, UNICODE_CENT_SIGN, UNICODE_POUND_SIGN,
284 UNICODE_SECTION_SIGN, UNICODE_BULLET, UNICODE_PILCROW_SIGN, UNICODE_LATIN_SMALL_LETTER_SHARP_S,
285 UNICODE_REGISTERED_SIGN, UNICODE_COPYRIGHT_SIGN, UNICODE_TRADE_MARK_SIGN, UNICODE_ACUTE_ACCENT,
286 UNICODE_DIAERESIS, UNICODE_NOT_EQUAL_TO,
287 UNICODE_LATIN_CAPITAL_LETTER_AE, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_STROKE,
288
289 UNICODE_INFINITY, UNICODE_PLUS_MINUS_SIGN, UNICODE_LESS_THAN_OR_EQUAL_TO, UNICODE_GREATER_THAN_OR_EQUAL_TO,
290 UNICODE_YEN_SIGN, UNICODE_MICRO_SIGN, UNICODE_PARTIAL_DIFFERENTIAL, UNICODE_N_ARY_SUMMATION,
291 UNICODE_N_ARY_PRODUCT, UNICODE_GREEK_SMALL_LETTER_PI, UNICODE_INTEGRAL,
292 UNICODE_FEMININE_ORDINAL_INDICATOR, UNICODE_MASCULINE_ORDINAL_INDICATOR,
293 UNICODE_GREEK_CAPITAL_LETTER_OMEGA, UNICODE_LATIN_SMALL_LETTER_AE,
294 UNICODE_LATIN_SMALL_LETTER_O_WITH_STROKE,
295
296 UNICODE_INVERTED_QUESTION_MARK, UNICODE_INVERTED_EXCLAMATION_MARK, UNICODE_NOT_SIGN, UNICODE_SQUARE_ROOT,
297 UNICODE_LATIN_SMALL_LETTER_F_WITH_HOOK, UNICODE_ALMOST_EQUAL_TO, UNICODE_INCREMENT,
298 UNICODE_LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK, UNICODE_RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK,
299 UNICODE_HORIZONTAL_ELLIPSIS, UNICODE_NO_BREAK_SPACE, UNICODE_LATIN_CAPITAL_LETTER_A_WITH_GRAVE,
300 UNICODE_LATIN_CAPITAL_LETTER_A_WITH_TILDE, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_TILDE,
301 UNICODE_LATIN_CAPITAL_LIGATURE_OE, UNICODE_LATIN_SMALL_LIGATURE_OE,
302
303 UNICODE_EN_DASH, UNICODE_EM_DASH, UNICODE_LEFT_DOUBLE_QUOTATION_MARK, UNICODE_RIGHT_DOUBLE_QUOTATION_MARK,
304 UNICODE_LEFT_SINGLE_QUOTATION_MARK, UNICODE_RIGHT_SINGLE_QUOTATION_MARK, UNICODE_DIVISION_SIGN, UNICODE_LOZENGE,
305 UNICODE_LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, UNICODE_LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS,
306 UNICODE_FRACTION_SLASH, UNICODE_EURO_SIGN,
307 UNICODE_SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK, UNICODE_SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK,
308 UNICODE_LATIN_SMALL_LIGATURE_FI, UNICODE_LATIN_SMALL_LIGATURE_FL,
309
310 UNICODE_DOUBLE_DAGGER, UNICODE_MIDDLE_DOT,
311 UNICODE_SINGLE_LOW_9_QUOTATION_MARK, UNICODE_DOUBLE_LOW_9_QUOTATION_MARK, UNICODE_PER_MILLE_SIGN,
312 UNICODE_LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX, UNICODE_LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX,
313 UNICODE_LATIN_CAPITAL_LETTER_A_WITH_ACUTE, UNICODE_LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS,
314 UNICODE_LATIN_CAPITAL_LETTER_E_WITH_GRAVE, UNICODE_LATIN_CAPITAL_LETTER_I_WITH_ACUTE,
315 UNICODE_LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX, UNICODE_LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS,
316 UNICODE_LATIN_CAPITAL_LETTER_I_WITH_GRAVE, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_ACUTE,
317 UNICODE_LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX,
318
319 0xf8ff /* Apple logo */, UNICODE_LATIN_CAPITAL_LETTER_O_WITH_GRAVE, UNICODE_LATIN_CAPITAL_LETTER_U_WITH_ACUTE,
320 UNICODE_LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX, UNICODE_LATIN_CAPITAL_LETTER_U_WITH_GRAVE,
321 UNICODE_LATIN_SMALL_LETTER_DOTLESS_I, UNICODE_MODIFIER_LETTER_CIRCUMFLEX_ACCENT, UNICODE_SMALL_TILDE,
322 UNICODE_MACRON, UNICODE_BREVE, UNICODE_DOT_ABOVE, UNICODE_RING_ABOVE, UNICODE_CEDILLA,
323 UNICODE_DOUBLE_ACUTE_ACCENT, UNICODE_OGONEK, UNICODE_CARON };
324
325 char32 Melder_decodeWindowsLatin1 [256] = {
326 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
327 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
328 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
329 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
330 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
331 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
332 120, 121, 122, 123, 124, 125, 126, 127,
333 UNICODE_EURO_SIGN, 129, UNICODE_SINGLE_LOW_9_QUOTATION_MARK, UNICODE_LATIN_SMALL_LETTER_F_WITH_HOOK,
334 UNICODE_DOUBLE_LOW_9_QUOTATION_MARK, UNICODE_HORIZONTAL_ELLIPSIS, UNICODE_DAGGER, UNICODE_DOUBLE_DAGGER,
335 UNICODE_MODIFIER_LETTER_CIRCUMFLEX_ACCENT, UNICODE_PER_MILLE_SIGN, UNICODE_LATIN_CAPITAL_LETTER_S_WITH_CARON,
336 UNICODE_SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK, UNICODE_LATIN_CAPITAL_LIGATURE_OE, 141,
337 UNICODE_LATIN_CAPITAL_LETTER_Z_WITH_CARON, 143,
338 144, UNICODE_LEFT_SINGLE_QUOTATION_MARK, UNICODE_RIGHT_SINGLE_QUOTATION_MARK, UNICODE_LEFT_DOUBLE_QUOTATION_MARK,
339 UNICODE_RIGHT_DOUBLE_QUOTATION_MARK, UNICODE_BULLET, UNICODE_EN_DASH, UNICODE_EM_DASH,
340 UNICODE_SMALL_TILDE, UNICODE_TRADE_MARK_SIGN, UNICODE_LATIN_SMALL_LETTER_S_WITH_CARON,
341 UNICODE_SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK, UNICODE_LATIN_SMALL_LIGATURE_OE, 157,
342 UNICODE_LATIN_SMALL_LETTER_Z_WITH_CARON, UNICODE_LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS,
343 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
344 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
345 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
346 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
347 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 };
348
Melder_8to32_inplace(conststring8 string8,mutablestring32 string32,kMelder_textInputEncoding inputEncoding)349 void Melder_8to32_inplace (conststring8 string8, mutablestring32 string32, kMelder_textInputEncoding inputEncoding) {
350 char32 *q = & string32 [0];
351 if (inputEncoding == kMelder_textInputEncoding::UNDEFINED) {
352 inputEncoding = preferences. inputEncoding;
353 /*
354 * In case the preferences weren't initialized yet, use the platform defaults:
355 */
356 if (inputEncoding == kMelder_textInputEncoding::UNDEFINED) {
357 #if defined (macintosh)
358 inputEncoding = kMelder_textInputEncoding::UTF8_THEN_MACROMAN;
359 #elif defined (_WIN32)
360 inputEncoding = kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1;
361 #else
362 inputEncoding = kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1;
363 #endif
364 }
365 }
366 if (inputEncoding == kMelder_textInputEncoding::UTF8 ||
367 inputEncoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1 ||
368 inputEncoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1 ||
369 inputEncoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN)
370 {
371 if (Melder_str8IsValidUtf8 (string8)) {
372 inputEncoding = kMelder_textInputEncoding::UTF8;
373 } else if (inputEncoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1) {
374 inputEncoding = kMelder_textInputEncoding::ISO_LATIN1;
375 } else if (inputEncoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1) {
376 inputEncoding = kMelder_textInputEncoding::WINDOWS_LATIN1;
377 } else if (inputEncoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN) {
378 inputEncoding = kMelder_textInputEncoding::MACROMAN;
379 } else {
380 Melder_assert (inputEncoding == kMelder_textInputEncoding::UTF8);
381 Melder_throw (U"Text is not valid UTF-8; please try a different text input encoding.");
382 }
383 }
384 const char8 *p = (const char8 *) & string8 [0];
385 if (inputEncoding == kMelder_textInputEncoding::UTF8) {
386 while (*p != '\0') {
387 char32 kar1 = * p ++; // convert up without sign extension
388 if (kar1 <= 0x00'007F) {
389 *q ++ = kar1;
390 } else if (kar1 <= 0x00'00DF) {
391 char32 kar2 = * p ++; // convert up without sign extension
392 *q ++ = ((kar1 & 0x00'001F) << 6) | (kar2 & 0x00'003F);
393 } else if (kar1 <= 0x00'00EF) {
394 char32 kar2 = * p ++, kar3 = * p ++; // convert up without sign extension
395 *q ++ = ((kar1 & 0x00'000F) << 12) | ((kar2 & 0x00'003F) << 6) | (kar3 & 0x00'003F);
396 } else if (kar1 <= 0x00'00F4) {
397 char32 kar2 = *p ++, kar3 = *p ++, kar4 = *p ++; // convert up without sign extension
398 char32 kar = ((kar1 & 0x00'0007) << 18) | ((kar2 & 0x00'003F) << 12) | ((kar3 & 0x00'003F) << 6) | (kar4 & 0x00'003F);
399 *q ++ = kar;
400 }
401 }
402 } else if (inputEncoding == kMelder_textInputEncoding::ISO_LATIN1) {
403 while (*p != '\0')
404 *q ++ = *p ++;
405 } else if (inputEncoding == kMelder_textInputEncoding::WINDOWS_LATIN1) {
406 while (*p != '\0')
407 *q ++ = Melder_decodeWindowsLatin1 [*p ++];
408 } else if (inputEncoding == kMelder_textInputEncoding::MACROMAN) {
409 while (*p != '\0')
410 *q ++ = Melder_decodeMacRoman [*p ++];
411 } else if (inputEncoding != kMelder_textInputEncoding::UTF8) {
412 Melder_fatal (U"Unknown text input encoding ", (int) inputEncoding, U".");
413 }
414 *q = U'\0'; // closing null character
415 (void) Melder_killReturns_inplaceCHAR <char32> (string32);
416 }
417
Melder_8to32(const char * string,kMelder_textInputEncoding inputEncoding)418 autostring32 Melder_8to32 (const char *string, kMelder_textInputEncoding inputEncoding) {
419 if (! string)
420 return autostring32();
421 autostring32 result (strlen (string));
422 Melder_8to32_inplace (string, result.get(), inputEncoding);
423 return result;
424 }
425
Melder_8to32(const char * string)426 autostring32 Melder_8to32 (const char *string) {
427 if (! string)
428 return autostring32();
429 autostring32 result (strlen (string));
430 Melder_8to32_inplace (string, result.get(), kMelder_textInputEncoding::UTF8);
431 return result;
432 }
433
Melder_peek16to32(conststring16 text)434 conststring32 Melder_peek16to32 (conststring16 text) {
435 if (! text) return nullptr;
436 static MelderString buffers [19];
437 static int bufferNumber = 0;
438 if (++ bufferNumber == 19)
439 bufferNumber = 0;
440 MelderString_empty (& buffers [bufferNumber]);
441 for (;;) {
442 char16 kar1 = *text ++;
443 if (kar1 == u'\0') return buffers [bufferNumber]. string;
444 if (kar1 < 0xD800) {
445 MelderString_appendCharacter (& buffers [bufferNumber], (char32) kar1); // convert up without sign extension
446 } else if (kar1 < 0xDC00) {
447 char16 kar2 = *text ++;
448 if (kar2 >= 0xDC00 && kar2 <= 0xDFFF) {
449 MelderString_appendCharacter (& buffers [bufferNumber],
450 (char32) (0x01'0000 +
451 (char32) (((char32) kar1 & 0x00'03FF) << 10) +
452 (char32) ((char32) kar2 & 0x00'03FF)));
453 } else {
454 MelderString_appendCharacter (& buffers [bufferNumber], UNICODE_REPLACEMENT_CHARACTER);
455 }
456 } else if (kar1 < 0xE000) {
457 MelderString_appendCharacter (& buffers [bufferNumber], UNICODE_REPLACEMENT_CHARACTER);
458 } else {
459 MelderString_appendCharacter (& buffers [bufferNumber], (char32) kar1); // convert up without sign extension
460 }
461 }
462 }
463
Melder_16to32(conststring16 text)464 autostring32 Melder_16to32 (conststring16 text) {
465 return Melder_dup (Melder_peek16to32 (text));
466 }
467
Melder_32to8_inplace(conststring32 string,mutablestring8 utf8)468 void Melder_32to8_inplace (conststring32 string, mutablestring8 utf8) {;
469 Melder_assert (utf8); // and unassertable: utf8 should be a long enough buffer
470 char8 *to = (char8 *) & utf8 [0];
471 if (string) {
472 const char32 *p = & string [0];
473 while (*p != U'\0') {
474 const char32 kar = *p ++;
475 if (kar <= 0x00'007F) { // 7 bits
476 #ifdef _WIN32
477 if (kar == U'\n')
478 *to ++ = 13;
479 #endif
480 *to ++ = (char8) kar; // guarded truncation
481 } else if (kar <= 0x00'07FF) { // 11 bits
482 *to ++ = (char8) (0x00'00C0 | (kar >> 6)); // the upper 5 bits yield a number between 0xC4 and 0xDF
483 *to ++ = (char8) (0x00'0080 | (kar & 0x00'003F)); // the lower 6 bits yield a number between 0x80 and 0xBF
484 } else if (kar <= 0x00'FFFF) { // 16 bits
485 *to ++ = (char8) (0x00'00E0 | (kar >> 12)); // the upper 4 bits yield a number between 0xE0 and 0xEF
486 *to ++ = (char8) (0x00'0080 | ((kar >> 6) & 0x00'003F));
487 *to ++ = (char8) (0x00'0080 | (kar & 0x00'003F));
488 } else { // unguarded truncation to 21 bits
489 *to ++ = (char8) (0x00'00F0 | (kar >> 18)); // the upper 3 bits yield a number between 0xF0 and 0xF4 (0x10FFFF >> 18 == 4)
490 *to ++ = (char8) (0x00'0080 | ((kar >> 12) & 0x00'003F)); // the next 6 bits
491 *to ++ = (char8) (0x00'0080 | ((kar >> 6) & 0x00'003F)); // the third 6 bits
492 *to ++ = (char8) (0x00'0080 | (kar & 0x00'003F)); // the lower 6 bits
493 }
494 }
495 }
496 *to = '\0';
497 }
498
Melder_peek32to8(conststring32 text)499 conststring8 Melder_peek32to8 (conststring32 text) {
500 if (! text)
501 return nullptr;
502 static mutablestring8 buffers [19] { nullptr };
503 static int64 bufferSizes [19] { 0 };
504 static int bufferNumber = 0;
505 if (++ bufferNumber == 19)
506 bufferNumber = 0;
507 constexpr int64 maximumNumberOfUTF8bytesPerUTF32point = 4; // becausse we use only the lower 21 bits
508 const int64 numberOfUTF32points = str32len (text);
509 const int64 maximumNumberOfBytesNeeded = numberOfUTF32points * maximumNumberOfUTF8bytesPerUTF32point + 1;
510 if ((bufferSizes [bufferNumber] - maximumNumberOfBytesNeeded) * (int64) sizeof (char) >= 10'000) {
511 Melder_free (buffers [bufferNumber]);
512 bufferSizes [bufferNumber] = 0;
513 }
514 if (maximumNumberOfBytesNeeded > bufferSizes [bufferNumber]) {
515 const int64 newBufferSize = (int64) floor (maximumNumberOfBytesNeeded * 1.61803) + 100;
516 buffers [bufferNumber] = (char *) Melder_realloc_f (buffers [bufferNumber], newBufferSize * (int64) sizeof (char));
517 bufferSizes [bufferNumber] = newBufferSize;
518 }
519 Melder_32to8_inplace (text, buffers [bufferNumber]);
520 return buffers [bufferNumber];
521 }
522
Melder_32to8(conststring32 string)523 autostring8 Melder_32to8 (conststring32 string) {
524 if (! string)
525 return autostring8();
526 autostring8 result (str32len_utf8 (string, true));
527 Melder_32to8_inplace (string, result.get());
528 return result;
529 }
530
Melder_peek32to16(conststring32 text,bool nativizeNewlines)531 conststring16 Melder_peek32to16 (conststring32 text, bool nativizeNewlines) {
532 if (! text)
533 return nullptr;
534 static MelderString16 buffers [19] { };
535 static int bufferNumber = 0;
536 if (++ bufferNumber == 19)
537 bufferNumber = 0;
538 MelderString16_empty (& buffers [bufferNumber]);
539 int64 n = str32len (text);
540 if (nativizeNewlines) {
541 for (int64 i = 0; i <= n; i ++) {
542 #ifdef _WIN32
543 if (text [i] == U'\n')
544 MelderString16_appendCharacter (& buffers [bufferNumber], (char32) 13);
545 #endif
546 MelderString16_appendCharacter (& buffers [bufferNumber], text [i]);
547 }
548 } else {
549 for (int64 i = 0; i <= n; i ++)
550 MelderString16_appendCharacter (& buffers [bufferNumber], text [i]);
551 }
552 return buffers [bufferNumber]. string;
553 }
Melder_peek32to16(conststring32 text)554 conststring16 Melder_peek32to16 (conststring32 text) {
555 return Melder_peek32to16 (text, false);
556 }
557
Melder_32to16(conststring32 text)558 autostring16 Melder_32to16 (conststring32 text) {
559 conststring16 text16 = Melder_peek32to16 (text);
560 int64 length = str16len (text16);
561 autostring16 result (length);
562 str16cpy (result.get(), text16);
563 return result;
564 }
565
566 #if defined (_WIN32)
Melder_32toW(conststring32 text)567 autostringW Melder_32toW (conststring32 text) {
568 conststringW textW = Melder_peek32toW (text);
569 int64 length = str16len ((conststring16) textW);
570 autostringW result (length);
571 str16cpy ((mutablestring16) result.get(), (conststring16) textW);
572 return result;
573 }
Melder_peek32toW_fileSystem(conststring32 string)574 conststringW Melder_peek32toW_fileSystem (conststring32 string) {
575 static wchar_t buffer [1 + kMelder_MAXPATH];
576 //NormalizeStringW (NormalizationKC, -1, Melder_peek32toW (string), 1 + kMelder_MAXPATH, buffer);
577 FoldStringW (MAP_PRECOMPOSED, Melder_peek32toW (string), -1, buffer, 1 + kMelder_MAXPATH); // this works even on XP
578 return buffer;
579 }
Melder_32toW_fileSystem(conststring32 text)580 autostringW Melder_32toW_fileSystem (conststring32 text) {
581 conststringW textW = Melder_peek32toW_fileSystem (text);
582 int64 length = str16len ((conststring16) textW);
583 autostringW result (length);
584 str16cpy ((mutablestring16) result.get(), (conststring16) textW);
585 return result;
586 }
587 #endif
588
Melder_32to8_fileSystem_inplace(conststring32 string,char * utf8)589 void Melder_32to8_fileSystem_inplace (conststring32 string, char *utf8) {
590 #if defined (macintosh)
591 /*
592 On the Mac, the POSIX path name is stored in canonically decomposed UTF-8 encoding.
593 The path is probably in precomposed UTF-32.
594 So we first convert to UTF-16, then turn into CFString, then decompose, then convert to UTF-8.
595 */
596 UniChar unipath [kMelder_MAXPATH+1];
597 const int64 n = str32len (string);
598 int n_utf16 = 0;
599 for (int64 i = 0; i < n; i ++) {
600 char32 kar = (char32) string [i]; // change sign (bit 32 is never used)
601 if (kar <= 0x00'FFFF) {
602 unipath [n_utf16 ++] = (UniChar) kar; // including null byte; guarded truncation
603 } else if (kar <= 0x10'FFFF) {
604 kar -= 0x01'0000;
605 unipath [n_utf16 ++] = (UniChar) (0x00'D800 | (kar >> 10)); // correct truncation, because UTF-32 has fewer than 27 bits (in fact it has 21 bits)
606 unipath [n_utf16 ++] = (UniChar) (0x00'DC00 | (kar & 0x00'03FF));
607 } else {
608 unipath [n_utf16 ++] = UNICODE_REPLACEMENT_CHARACTER;
609 }
610 }
611 unipath [n_utf16] = u'\0';
612 CFStringRef cfpath = CFStringCreateWithCharacters (nullptr, unipath, n_utf16);
613 CFMutableStringRef cfpath2 = CFStringCreateMutableCopy (nullptr, 0, cfpath);
614 CFRelease (cfpath);
615 CFStringNormalize (cfpath2, kCFStringNormalizationFormD); // Mac requires decomposed characters
616 CFStringGetCString (cfpath2, (char *) utf8, kMelder_MAXPATH+1, kCFStringEncodingUTF8); // Mac POSIX requires UTF-8
617 CFRelease (cfpath2);
618 #elif defined (UNIX) || defined (__CYGWIN__)
619 Melder_32to8_inplace (string, utf8);
620 #elif defined (_WIN32)
621 const int n = str32len (string);
622 int j = 0;
623 for (int i = 0; i < n; i ++)
624 utf8 [j ++] = ( string [i] <= 255 ? string [i] : '?' ); // the usual replacement on Windows
625 utf8 [j] = '\0';
626 #else
627 //#error Unsupported platform.
628 #endif
629 }
Melder_peek32to8_fileSystem(conststring32 string)630 conststring8 Melder_peek32to8_fileSystem (conststring32 string) {
631 static char buffer [1 + kMelder_MAXPATH];
632 Melder_32to8_fileSystem_inplace (string, buffer);
633 return buffer;
634 }
635
636 #if defined (macintosh)
Melder_peek32toCfstring(conststring32 text)637 const void * Melder_peek32toCfstring (conststring32 text) {
638 if (! text)
639 return nullptr;
640 static CFStringRef cfString [11];
641 static int icfString = 0;
642 if (++ icfString == 11)
643 icfString = 0;
644 if (cfString [icfString])
645 CFRelease (cfString [icfString]);
646 cfString [icfString] = CFStringCreateWithCString (nullptr, (const char *) Melder_peek32to8 (text), kCFStringEncodingUTF8);
647 return cfString [icfString];
648 }
649 #endif
650
/* End of file melder_textencoding.cpp */