1 /*
2  * Copyright (C) 2005 to 2014 by Jonathan Duddington
3  * email: jonsd@users.sourceforge.net
4  * Copyright (C) 2015-2017 Reece H. Dunn
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, see: <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "config.h"
21 
22 #include <ctype.h>
23 //#include <stdbool.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <wchar.h>
29 #include <wctype.h>
30 
31 #include "ucd.h"
32 #include "espeak_ng.h"
33 #include "encoding.h"
34 #include "speech.h"
35 #include "synthesize.h"
36 #include "translate.h"
37 
38 Translator *translator = NULL; // the main translator
39 Translator *translator2 = NULL; // secondary translator for certain words
40 static char translator2_language[20] = { 0 };
41 
42 FILE *f_trans = NULL; // phoneme output text
43 int option_tone2 = 0;
44 int option_tone_flags = 0; // bit 8=emphasize allcaps, bit 9=emphasize penultimate stress
45 int option_phonemes = 0;
46 int option_phoneme_events = 0;
47 int option_endpause = 0; // suppress pause after end of text
48 int option_capitals = 0;
49 int option_punctuation = 0;
50 int option_sayas = 0;
51 static int option_sayas2 = 0; // used in translate_clause()
52 static int option_emphasis = 0; // 0=normal, 1=normal, 2=weak, 3=moderate, 4=strong
53 int option_ssml = 0;
54 int option_phoneme_input = 0; // allow [[phonemes]] in input
55 int option_phoneme_variants = 0; // 0= don't display phoneme variant mnemonics
56 int option_wordgap = 0;
57 
58 static int count_sayas_digits;
59 int skip_sentences;
60 int skip_words;
61 int skip_characters;
62 char skip_marker[N_MARKER_LENGTH];
63 int skipping_text; // waiting until word count, sentence count, or named marker is reached
64 int end_character_position;
65 int count_sentences;
66 int count_words;
67 int clause_start_char;
68 int clause_start_word;
69 int new_sentence;
70 static int word_emphasis = 0; // set if emphasis level 3 or 4
71 static int embedded_flag = 0; // there are embedded commands to be applied to the next phoneme, used in TranslateWord2()
72 
73 static int prev_clause_pause = 0;
74 static int max_clause_pause = 0;
75 static int any_stressed_words;
76 int pre_pause;
77 ALPHABET *current_alphabet;
78 
79 // these were previously in translator class
80 char word_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes
81 int n_ph_list2;
82 PHONEME_LIST2 ph_list2[N_PHONEME_LIST]; // first stage of text->phonemes
83 
84 wchar_t option_punctlist[N_PUNCTLIST] = { 0 };
85 char ctrl_embedded = '\001'; // to allow an alternative CTRL for embedded commands
86 
87 // these are overridden by defaults set in the "speak" file
88 int option_linelength = 0;
89 
90 #define N_EMBEDDED_LIST  250
91 static int embedded_ix;
92 static int embedded_read;
93 unsigned int embedded_list[N_EMBEDDED_LIST];
94 
95 // the source text of a single clause (UTF8 bytes)
96 static char source[N_TR_SOURCE+40]; // extra space for embedded command & voice change info at end
97 
98 int n_replace_phonemes;
99 REPLACE_PHONEMES replace_phonemes[N_REPLACE_PHONEMES];
100 
// Bracket and quotation characters, as a zero-terminated list searched with lookupwchar().
// The characters 0x2014 to 0x201f are also treated as brackets, but they are tested
// as a range in IsBracket() and so don't need to be in this list.
static const unsigned short brackets[] = {
	'(', ')', '[', ']', '{', '}', '<', '>', '"', '\'', '`',
	0xab,   0xbb,   // double angle brackets
	0x300a, 0x300b, // double angle brackets (ideograph)
	0xe000+'<',     // private usage area
	0
};
109 
// Other characters which break a word, but don't produce a pause.
// Zero-terminated, in the same list format that lookupwchar() searches.
static const unsigned short breaks[] = { '_', 0 };
112 
113 // Tables of the relative lengths of vowels, depending on the
114 // type of the two phonemes that follow
115 // indexes are the "length_mod" value for the following phonemes
116 
// Use this table if the vowel is not the last in the word.
// 100 entries laid out as 10 rows of 10: the column is selected by the first
// following phoneme ("next") and the row by the second one ("next2").
// This table is entries 0 and 1 of length_mod_tabs[].
// NOTE(review): the values look like percentages (100 = unchanged) - confirm against the code that reads them.
// NOTE(review): the column header names nine classes but each row has ten entries; the last column is unlabelled.
static unsigned char length_mods_en[100] = {
//	a    ,    t    s    n    d    z    r    N    <- next
	100, 120, 100, 105, 100, 110, 110, 100,  95, 100, // a  <- next2
	105, 120, 105, 110, 125, 130, 135, 115, 125, 100, // ,
	105, 120,  75, 100,  75, 105, 120,  85,  75, 100, // t
	105, 120,  85, 105,  95, 115, 120, 100,  95, 100, // s
	110, 120,  95, 105, 100, 115, 120, 100, 100, 100, // n
	105, 120, 100, 105,  95, 115, 120, 110,  95, 100, // d
	105, 120, 100, 105, 105, 122, 125, 110, 105, 100, // z
	105, 120, 100, 105, 105, 122, 125, 110, 105, 100, // r
	105, 120,  95, 105, 100, 115, 120, 110, 100, 100, // N
	100, 120, 100, 100, 100, 100, 100, 100, 100, 100
};
131 
// Same layout as length_mods_en (10 rows of 10, column = "next", row = "next2"),
// but for the last syllable in a word.
// This table is entry 2 of length_mod_tabs[].
static unsigned char length_mods_en0[100] = {
//	a    ,    t    s    n    d    z    r    N    <- next
	100, 150, 100, 105, 110, 115, 110, 110, 110, 100, // a  <- next2
	105, 150, 105, 110, 125, 135, 140, 115, 135, 100, // ,
	105, 150,  90, 105,  90, 122, 135, 100,  90, 100, // t
	105, 150, 100, 105, 100, 122, 135, 100, 100, 100, // s
	105, 150, 100, 105, 105, 115, 135, 110, 105, 100, // n
	105, 150, 100, 105, 105, 122, 130, 120, 125, 100, // d
	105, 150, 100, 105, 110, 122, 125, 115, 110, 100, // z
	105, 150, 100, 105, 105, 122, 135, 120, 105, 100, // r
	105, 150, 100, 105, 105, 115, 135, 110, 105, 100, // N
	100, 100, 100, 100, 100, 100, 100, 100, 100, 100
};
146 
147 
// Same 10 x 10 layout as length_mods_en (column = "next", row = "next2").
// Every row is identical except the 't' row, so the result depends almost
// entirely on the first following phoneme.
// This table is entries 3, 4 and 5 of length_mod_tabs[].
static unsigned char length_mods_equal[100] = {
//	a    ,    t    s    n    d    z    r    N    <- next
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // a  <- next2
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // ,
	110, 120, 100, 110, 100, 110, 110, 110, 100, 110, // t
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // s
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // n
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // d
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // z
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // r
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // N
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110
};
161 
// The length modification tables which can be selected with SetLengthMods().
// SetLengthMods() uses (value % 100) as the index for langopts.length_mods, and
// (value / 100), when that is non-zero, as the index for langopts.length_mods0.
static unsigned char *length_mod_tabs[6] = {
	length_mods_en,    // 0
	length_mods_en,    // 1
	length_mods_en0,   // 2
	length_mods_equal, // 3
	length_mods_equal, // 4
	length_mods_equal  // 5
};
170 
void SetLengthMods(Translator *tr, int value)
{
	// Select the vowel length modification tables for a translator.
	// tr:    the translator whose langopts.length_mods and langopts.length_mods0 are set
	// value: (value % 100) is the index into length_mod_tabs[] used for both tables;
	//        (value / 100), if non-zero, is the index used instead for length_mods0.
	//
	// Indexes are range-checked so that a bad value cannot read outside
	// length_mod_tabs[]: an invalid main index selects table 0, and an invalid
	// length_mods0 index is ignored (length_mods0 then stays equal to length_mods).

	const int n_tabs = (int)(sizeof(length_mod_tabs) / sizeof(length_mod_tabs[0]));
	int ix_main = value % 100;
	int ix_last = value / 100;

	if ((ix_main < 0) || (ix_main >= n_tabs))
		ix_main = 0;

	tr->langopts.length_mods0 = tr->langopts.length_mods = length_mod_tabs[ix_main];

	if ((ix_last > 0) && (ix_last < n_tabs))
		tr->langopts.length_mods0 = length_mod_tabs[ix_last];
}
179 
int IsAlpha(unsigned int c)
{
	// Replacement for iswalpha() which also accepts some in-word symbols
	// (vowel marks, combining accents) and several scripts.
	// Returns 1 if c is treated as alphabetic, otherwise 0.

	static const unsigned short extra_indic_alphas[] = {
		0xa70, 0xa71, // Gurmukhi: tippi, addak
		0
	};

	// inclusive ranges of character codes which are always accepted
	static const struct {
		unsigned int first;
		unsigned int last;
	} alpha_ranges[] = {
		{ 0x5b0,  0x5c2  }, // Hebrew vowel marks
		{ 0x605,  0x605  },
		{ 0x670,  0x670  }, // arabic vowel mark
		{ 0x64b,  0x65e  }, // arabic vowel marks
		{ 0x300,  0x36f  }, // combining accents
		{ 0x780,  0x7b1  }, // taani/divehi (maldives)
		{ 0xf40,  0xfbc  }, // tibetan
		{ 0x1100, 0x11ff }, // Korean jamo
		{ 0x2800, 0x28ff }, // braille
		{ 0x3041, 0xa700 }, // Chinese/Japanese.  Should never get here, but Mac OS 10.4's iswalpha seems to be broken, so just make sure
	};
	unsigned int ix;

	if (iswalpha(c))
		return 1;

	if (c < 0x300)
		return 0;

	if ((c >= 0x901) && (c <= 0xdf7)) {
		// Indic scripts: Devanagari, Tamil, etc.  Nothing else in this block is accepted.
		return ((c & 0x7f) < 0x64)
		       || (lookupwchar(extra_indic_alphas, c) != 0)
		       || ((c >= 0xd7a) && (c <= 0xd7f)); // malayalam chillu characters
	}

	for (ix = 0; ix < sizeof(alpha_ranges) / sizeof(alpha_ranges[0]); ix++) {
		if ((c >= alpha_ranges[ix].first) && (c <= alpha_ranges[ix].last))
			return 1;
	}
	return 0;
}
236 
int IsDigit09(unsigned int c)
{
	// Returns 1 only for the ASCII digits '0' to '9', otherwise 0
	return ((c >= '0') && (c <= '9')) ? 1 : 0;
}
243 
int IsDigit(unsigned int c)
{
	// Returns 1 for any character accepted by iswdigit(), and also for
	// the characters 0x966 to 0x96f.  Otherwise returns 0.
	const int in_range_966_96f = (c >= 0x966) && (c <= 0x96f);

	return (iswdigit(c) || in_range_966_96f) ? 1 : 0;
}
254 
static int IsSpace(unsigned int c)
{
	// Extended white-space test.
	// Returns 0 for the NUL character, 1 for box drawing characters and
	// unicode specials, otherwise the result of iswspace().
	const int box_drawing = (c >= 0x2500) && (c < 0x25a0);
	const int unicode_special = (c >= 0xfff9) && (c <= 0xffff);

	if (c == 0)
		return 0;

	if (box_drawing || unicode_special)
		return 1;

	return iswspace(c);
}
265 
int isspace2(unsigned int c)
{
	// can't use isspace() because on Windows, isspace(0xe1) gives TRUE !
	// Returns 1 if c is a control character or space (1 to 0x20), otherwise 0.
	// A value whose low byte is zero is never a space.
	if ((c & 0xff) == 0)
		return 0;

	return (c <= ' ') ? 1 : 0;
}
275 
void DeleteTranslator(Translator *tr)
{
	// Free a translator and the dictionary data which it owns.
	// tr: the translator to free; NULL is accepted and does nothing.

	if (tr == NULL)
		return;

	free(tr->data_dictlist); // free(NULL) is a no-op, so no check is needed
	free(tr);
}
282 
int lookupwchar(const unsigned short *list, int c)
{
	// Is the character c in the zero-terminated list ?
	// Returns its position in the list counting from 1, or 0 if it is not found.
	const unsigned short *entry;

	for (entry = list; *entry != 0; entry++) {
		if (*entry == c)
			return (int)(entry - list) + 1;
	}
	return 0;
}
294 
int lookupwchar2(const unsigned short *list, int c)
{
	// Replace character c by another character.
	// list holds pairs (character, replacement) and ends with a zero character.
	// Returns the replacement for c, or 0 if c is not found.
	// A replacement value of 1 means delete the character.
	const unsigned short *pair = list;

	while (pair[0] != 0) {
		if (pair[0] == c)
			return pair[1];
		pair += 2;
	}
	return 0;
}
308 
IsBracket(int c)309 int IsBracket(int c)
310 {
311 	if ((c >= 0x2014) && (c <= 0x201f))
312 		return 1;
313 	return lookupwchar(brackets, c);
314 }
315 
int utf8_nbytes(const char *buf)
{
	// Returns the number of bytes for the first UTF-8 character in buf,
	// judged only from the value of its first byte.
	const unsigned char lead = (unsigned char)buf[0];

	return (lead < 0x80) ? 1
	     : (lead < 0xe0) ? 2
	     : (lead < 0xf0) ? 3
	     : 4;
}
329 
int utf8_in2(int *c, const char *buf, int backwards)
{
	// Reads a unicode character from a UTF8 string.
	// Returns the number of UTF8 bytes used, counted from the first byte of the character.
	// c: receives the character code
	// buf: the string to read from.  If it points at a non-initial byte of a
	//      multi-byte character, the start of the next (or previous) character is found first.
	// backwards: set if we are moving backwards through the UTF8 string
	//
	// If the string ends in the middle of a multi-byte character, reading stops at the
	// terminating zero byte, so that the returned length never extends past the terminator.

	int c1;
	int n_bytes;
	int ix;
	static const unsigned char mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };

	// find the start of the next/previous character
	while ((*buf & 0xc0) == 0x80) {
		// skip over non-initial bytes of a multi-byte utf8 character
		if (backwards)
			buf--;
		else
			buf++;
	}

	n_bytes = 0;

	if ((c1 = *buf++) & 0x80) {
		// the first byte gives the number of bytes which follow it
		if ((c1 & 0xe0) == 0xc0)
			n_bytes = 1;
		else if ((c1 & 0xf0) == 0xe0)
			n_bytes = 2;
		else if ((c1 & 0xf8) == 0xf0)
			n_bytes = 3;

		c1 &= mask[n_bytes]; // keep only the data bits of the first byte
		for (ix = 0; ix < n_bytes; ix++) {
			if (*buf == 0)
				break; // truncated character, don't read past the end of the string
			c1 = (c1 << 6) + (*buf++ & 0x3f);
		}
		n_bytes = ix; // the number of following bytes which were actually used
	}
	*c = c1;
	return n_bytes+1;
}
369 
370 #pragma GCC visibility push(default)
int utf8_in(int *c, const char *buf)
{
	/* Read a unicode character from a UTF8 string, moving forwards.
	 * This calls utf8_in2() with backwards = 0.
	 * Returns the number of UTF8 bytes used.
	 * buf: the string to read from. It is passed by value, so the caller's
	 * pointer is not changed; the caller advances it by the returned count.
	 * c: holds integer representation of multibyte character by
	 * skipping UTF-8 header bits of bytes in following way:
	 * 2-byte character "ā":
	 * hex            binary
	 * c481           1100010010000001
	 *    |           11000100  000001
	 *    V              \    \ |    |
	 * 0101           0000000100000001
	 * 3-byte character "ꙅ":
	 * ea9985 111010101001100110000101
	 *            1010  011001  000101
	 *    |       +  +--.\   \  |    |
	 *    V        `--.  \`.  `.|    |
	 *   A645         0001001101000101
	 * 4-byte character "𠜎":
	 * f0a09c8e 11110000101000001001110010001110
	 *    V          000  100000  011100  001110
	 *   02070e         000000100000011100001110
	 */
	return utf8_in2(c, buf, 0);
}
397 #pragma GCC visibility pop
398 
int utf8_out(unsigned int c, char *buf)
{
	// Write a unicode character into a buffer as utf8.
	// Returns the number of bytes written (1 to 4).  No terminating zero is written.
	// A character code of 0x110000 or above is written as a single space.

	// marker bits for the first byte, indexed by the number of bytes which follow it
	static const unsigned char lead_marker[4] = { 0, 0xc0, 0xe0, 0xf0 };
	int n_following;
	int bits_left;
	int pos;

	if (c < 0x80) {
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}

	n_following = (c < 0x0800) ? 1 : ((c < 0x10000) ? 2 : 3);

	bits_left = 6 * n_following;
	buf[0] = lead_marker[n_following] | (c >> bits_left);
	for (pos = 1; pos <= n_following; pos++) {
		// each following byte carries the next 6 bits of the character code
		bits_left -= 6;
		buf[pos] = 0x80 + ((c >> bits_left) & 0x3f);
	}
	return n_following + 1;
}
432 
char *strchr_w(const char *s, int c)
{
	// strchr() for a character code which may be outside the ascii range.
	// Returns NULL for any non-ascii character (0x80 and above), which can't match a single byte of s.
	// (char *) is needed for Borland compiler
	return (c < 0x80) ? strchr((char *)s, c) : NULL;
}
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word)
{
	// Translate a word letter by letter, by calling TranslateLetter() for each
	// character up to the next space or the end of the string.
	// spell_word: > 1 also speak the character code for unknown letters, > 2 also speak 'capital'
	// Returns the position in word after the last letter, or NULL if a letter
	// needs a change of language (phonSWITCH); word_phonemes is then set from phonemes.

	int n_letters = 0;
	int letter_flags = 0;
	int not_first = 0;

	if (spell_word > 2)
		letter_flags = 2; // speak 'capital'
	if (spell_word > 1)
		letter_flags |= 4; // speak character code for unknown letters

	for (; (*word != 0) && (*word != ' '); n_letters++) {
		word += TranslateLetter(tr, word, phonemes, letter_flags | not_first);
		not_first = 1;

		if (phonemes[0] == phonSWITCH) {
			// change to another language in order to translate this word
			strcpy(word_phonemes, phonemes);
			return NULL;
		}
	}

	SetSpellingStress(tr, phonemes, spell_word, n_letters);
	return word;
}
464 
CheckDottedAbbrev(char * word1)465 static int CheckDottedAbbrev(char *word1)
466 {
467 	int wc;
468 	int count = 0;
469 	int nbytes;
470 	int ok;
471 	int ix;
472 	char *word;
473 	char *wbuf;
474 	char word_buf[80];
475 
476 	word = word1;
477 	wbuf = word_buf;
478 
479 	for (;;) {
480 		ok = 0;
481 		nbytes = utf8_in(&wc, word);
482 		if ((word[nbytes] == ' ') && IsAlpha(wc)) {
483 			if (word[nbytes+1] == '.') {
484 				if (word[nbytes+2] == ' ')
485 					ok = 1;
486 				else if (word[nbytes+2] == '\'') {
487 					nbytes += 2; // delete the final dot (eg. u.s.a.'s)
488 					ok = 2;
489 				}
490 			} else if ((count > 0) && (word[nbytes] == ' '))
491 				ok = 2;
492 		}
493 
494 		if (ok == 0)
495 			break;
496 
497 		for (ix = 0; ix < nbytes; ix++)
498 			*wbuf++ = word[ix];
499 
500 		count++;
501 
502 		if (ok == 2) {
503 			word += nbytes;
504 			break;
505 		}
506 
507 		word += (nbytes + 3);
508 	}
509 
510 	if (count > 1) {
511 		ix = wbuf - word_buf;
512 		memcpy(word1, word_buf, ix);
513 		while (&word1[ix] < word)
514 			word1[ix++] = ' ';
515 		dictionary_skipwords = (count - 1)*2;
516 	}
517 	return count;
518 }
519 
520 extern char *phondata_ptr;
521 
TranslateWord3(Translator * tr,char * word_start,WORD_TAB * wtab,char * word_out)522 static int TranslateWord3(Translator *tr, char *word_start, WORD_TAB *wtab, char *word_out)
523 {
524 	// word1 is terminated by space (0x20) character
525 
526 	char *word1;
527 	int word_length;
528 	int ix;
529 	char *p;
530 	int pfix;
531 	int n_chars;
532 	unsigned int dictionary_flags[2];
533 	unsigned int dictionary_flags2[2];
534 	int end_type = 0;
535 	int end_type1 = 0;
536 	int prefix_type = 0;
537 	int prefix_stress;
538 	char *wordx;
539 	char phonemes[N_WORD_PHONEMES];
540 	char phonemes2[N_WORD_PHONEMES];
541 	char prefix_phonemes[N_WORD_PHONEMES];
542 	char unpron_phonemes[N_WORD_PHONEMES];
543 	char end_phonemes[N_WORD_PHONEMES];
544 	char end_phonemes2[N_WORD_PHONEMES];
545 	char word_copy[N_WORD_BYTES];
546 	char word_copy2[N_WORD_BYTES];
547 	int word_copy_length;
548 	char prefix_chars[0x3f + 2];
549 	bool found = false;
550 	int end_flags;
551 	int c_temp; // save a character byte while we temporarily replace it with space
552 	int first_char;
553 	int last_char = 0;
554 	int prefix_flags = 0;
555 	int more_suffixes;
556 	int confirm_prefix;
557 	int spell_word;
558 	int emphasize_allcaps = 0;
559 	int wflags;
560 	int wmark;
561 	int was_unpronouncable = 0;
562 	int loopcount;
563 	int add_suffix_phonemes = 0;
564 	WORD_TAB wtab_null[8];
565 
566 	// translate these to get pronunciations of plural 's' suffix (different forms depending on
567 	// the preceding letter
568 	static char word_zz[4] = { 0, 'z', 'z', 0 };
569 	static char word_iz[4] = { 0, 'i', 'z', 0 };
570 	static char word_ss[4] = { 0, 's', 's', 0 };
571 
572 	if (wtab == NULL) {
573 		memset(wtab_null, 0, sizeof(wtab_null));
574 		wtab = wtab_null;
575 	}
576 	wflags = wtab->flags;
577 	wmark = wtab->wmark;
578 
579 	dictionary_flags[0] = 0;
580 	dictionary_flags[1] = 0;
581 	dictionary_flags2[0] = 0;
582 	dictionary_flags2[1] = 0;
583 	dictionary_skipwords = 0;
584 
585 	phonemes[0] = 0;
586 	unpron_phonemes[0] = 0;
587 	prefix_phonemes[0] = 0;
588 	end_phonemes[0] = 0;
589 
590 	if (tr->data_dictlist == NULL) {
591 		// dictionary is not loaded
592 		word_phonemes[0] = 0;
593 		return 0;
594 	}
595 
596 	// count the length of the word
597 	word1 = word_start;
598 	if (*word1 == ' ') word1++; // possibly a dot was replaced by space:  $dot
599 	wordx = word1;
600 
601 	utf8_in(&first_char, wordx);
602 	word_length = 0;
603 	while ((*wordx != 0) && (*wordx != ' ')) {
604 		wordx += utf8_in(&last_char, wordx);
605 		word_length++;
606 	}
607 
608 	word_copy_length = wordx - word_start;
609 	if (word_copy_length >= N_WORD_BYTES)
610 		word_copy_length = N_WORD_BYTES-1;
611 	memcpy(word_copy2, word_start, word_copy_length);
612 
613 	spell_word = 0;
614 
615 	if ((word_length == 1) && (wflags & FLAG_TRANSLATOR2)) {
616 		// retranslating a 1-character word using a different language, say its name
617 		utf8_in(&c_temp, wordx+1); // the next character
618 		if (!IsAlpha(c_temp) || (AlphabetFromChar(last_char) != AlphabetFromChar(c_temp)))
619 			spell_word = 1;
620 	}
621 
622 	if (option_sayas == SAYAS_KEY) {
623 		if (word_length == 1)
624 			spell_word = 4;
625 		else {
626 			// is there a translation for this keyname ?
627 			word1--;
628 			*word1 = '_'; // prefix keyname with '_'
629 			found = LookupDictList(tr, &word1, phonemes, dictionary_flags, 0, wtab);
630 		}
631 	}
632 
633 	// try an initial lookup in the dictionary list, we may find a pronunciation specified, or
634 	// we may just find some flags
635 	if (option_sayas & 0x10) {
636 		// SAYAS_CHAR, SAYAS_GYLPH, or SAYAS_SINGLE_CHAR
637 		spell_word = option_sayas & 0xf; // 2,3,4
638 	} else {
639 		if (!found)
640 			found = LookupDictList(tr, &word1, phonemes, dictionary_flags, FLAG_ALLOW_TEXTMODE, wtab);   // the original word
641 
642 		if ((dictionary_flags[0] & (FLAG_ALLOW_DOT | FLAG_NEEDS_DOT)) && (wordx[1] == '.'))
643 			wordx[1] = ' '; // remove a Dot after this word
644 
645 		if (dictionary_flags[0] & FLAG_TEXTMODE) {
646 			if (word_out != NULL)
647 				strcpy(word_out, word1);
648 
649 			return dictionary_flags[0];
650 		} else if ((found == false) && (dictionary_flags[0] & FLAG_SKIPWORDS) && !(dictionary_flags[0] & FLAG_ABBREV)) {
651 			// grouped words, but no translation.  Join the words with hyphens.
652 			wordx = word1;
653 			ix = 0;
654 			while (ix < dictionary_skipwords) {
655 				if (*wordx == ' ') {
656 					*wordx = '-';
657 					ix++;
658 				}
659 				wordx++;
660 			}
661 		}
662 
663 		if ((word_length == 1) && (dictionary_skipwords == 0)) {
664 			// is this a series of single letters separated by dots?
665 			if (CheckDottedAbbrev(word1)) {
666 				dictionary_flags[0] = 0;
667 				dictionary_flags[1] = 0;
668 				spell_word = 1;
669 				if (dictionary_skipwords)
670 					dictionary_flags[0] = FLAG_SKIPWORDS;
671 			}
672 		}
673 
674 		if (phonemes[0] == phonSWITCH) {
675 			// change to another language in order to translate this word
676 			strcpy(word_phonemes, phonemes);
677 			return 0;
678 		}
679 
680 		if ((wmark > 0) && (wmark < 8)) {
681 			// the stressed syllable has been specified in the text  (TESTING)
682 			dictionary_flags[0] = (dictionary_flags[0] & ~0xf) | wmark;
683 		}
684 
685 		if (!found && (dictionary_flags[0] & FLAG_ABBREV)) {
686 			// the word has $abbrev flag, but no pronunciation specified.  Speak as individual letters
687 			spell_word = 1;
688 		}
689 
690 		if (!found && iswdigit(first_char)) {
691 			Lookup(tr, "_0lang", word_phonemes);
692 			if (word_phonemes[0] == phonSWITCH)
693 				return 0;
694 
695 			if ((tr->langopts.numbers2 & NUM2_ENGLISH_NUMERALS) && !(wtab->flags & FLAG_CHAR_REPLACED)) {
696 				// for this language, speak English numerals (0-9) with the English voice
697 				sprintf(word_phonemes, "%c", phonSWITCH);
698 				return 0;
699 			}
700 
701 			found = TranslateNumber(tr, word1, phonemes, dictionary_flags, wtab, 0);
702 		}
703 
704 		if (!found && ((wflags & FLAG_UPPERS) != FLAG_FIRST_UPPER)) {
705 			// either all upper or all lower case
706 
707 			if ((tr->langopts.numbers & NUM_ROMAN) || ((tr->langopts.numbers & NUM_ROMAN_CAPITALS) && (wflags & FLAG_ALL_UPPER))) {
708 				if ((wflags & FLAG_LAST_WORD) || !(wtab[1].flags & FLAG_NOSPACE)) {
709 					// don't use Roman number if this word is not separated from the next word (eg. "XLTest")
710 					if ((found = TranslateRoman(tr, word1, phonemes, wtab)) != 0)
711 						dictionary_flags[0] |= FLAG_ABBREV; // prevent emphasis if capitals
712 				}
713 			}
714 		}
715 
716 		if ((wflags & FLAG_ALL_UPPER) && (word_length > 1) && iswalpha(first_char)) {
717 			if ((option_tone_flags & OPTION_EMPHASIZE_ALLCAPS) && !(dictionary_flags[0] & FLAG_ABBREV)) {
718 				// emphasize words which are in capitals
719 				emphasize_allcaps = FLAG_EMPHASIZED;
720 			} else if (!found && !(dictionary_flags[0] &  FLAG_SKIPWORDS) && (word_length < 4) && (tr->clause_lower_count > 3)
721 			           && (tr->clause_upper_count <= tr->clause_lower_count)) {
722 				// An upper case word in a lower case clause. This could be an abbreviation.
723 				spell_word = 1;
724 			}
725 		}
726 	}
727 
728 	if (spell_word > 0) {
729 		// Speak as individual letters
730 		phonemes[0] = 0;
731 
732 		if (SpeakIndividualLetters(tr, word1, phonemes, spell_word) == NULL) {
733 			if (word_length > 1)
734 				return FLAG_SPELLWORD; // a mixture of languages, retranslate as individual letters, separated by spaces
735 			return 0;
736 		}
737 		strcpy(word_phonemes, phonemes);
738 		if (wflags & FLAG_TRANSLATOR2)
739 			return 0;
740 		return dictionary_flags[0] & FLAG_SKIPWORDS; // for "b.c.d"
741 	} else if (found == false) {
742 		// word's pronunciation is not given in the dictionary list, although
743 		// dictionary_flags may have ben set there
744 
745 		int posn;
746 		int non_initial;
747 		int length;
748 
749 		posn = 0;
750 		non_initial = 0;
751 		length = 999;
752 		wordx = word1;
753 
754 		while (((length < 3) && (length > 0)) || (word_length > 1 && Unpronouncable(tr, wordx, posn))) {
755 			// This word looks "unpronouncable", so speak letters individually until we
756 			// find a remainder that we can pronounce.
757 			was_unpronouncable = FLAG_WAS_UNPRONOUNCABLE;
758 			emphasize_allcaps = 0;
759 
760 			if (wordx[0] == '\'')
761 				break;
762 
763 			if (posn > 0)
764 				non_initial = 1;
765 
766 			wordx += TranslateLetter(tr, wordx, unpron_phonemes, non_initial);
767 			posn++;
768 			if (unpron_phonemes[0] == phonSWITCH) {
769 				// change to another language in order to translate this word
770 				strcpy(word_phonemes, unpron_phonemes);
771 				if (strcmp(&unpron_phonemes[1], "en") == 0)
772 					return FLAG_SPELLWORD; // _^_en must have been set in TranslateLetter(), not *_rules which uses only _^_
773 				return 0;
774 			}
775 
776 			length = 0;
777 			while (wordx[length] != ' ') length++;
778 		}
779 		SetSpellingStress(tr, unpron_phonemes, 0, posn);
780 
781 		// anything left ?
782 		if (*wordx != ' ') {
783 			if ((unpron_phonemes[0] != 0) && (wordx[0] != '\'')) {
784 				// letters which have been spoken individually from affecting the pronunciation of the pronuncable part
785 				wordx[-1] = ' ';
786 			}
787 
788 			// Translate the stem
789 			end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
790 
791 			if (phonemes[0] == phonSWITCH) {
792 				// change to another language in order to translate this word
793 				strcpy(word_phonemes, phonemes);
794 				return 0;
795 			}
796 
797 			if ((phonemes[0] == 0) && (end_phonemes[0] == 0)) {
798 				int wc;
799 				// characters not recognised, speak them individually
800 				// ?? should we say super/sub-script numbers and letters here?
801 				utf8_in(&wc, wordx);
802 				if ((word_length == 1) && (IsAlpha(wc) || IsSuperscript(wc))) {
803 					if ((wordx = SpeakIndividualLetters(tr, wordx, phonemes, spell_word)) == NULL)
804 						return 0;
805 					strcpy(word_phonemes, phonemes);
806 					return 0;
807 				}
808 			}
809 
810 			c_temp = wordx[-1];
811 
812 			found = false;
813 			confirm_prefix = 1;
814 			for (loopcount = 0; (loopcount < 50) && (end_type & SUFX_P); loopcount++) {
815 				// Found a standard prefix, remove it and retranslate
816 				// loopcount guards against an endless loop
817 				if (confirm_prefix && !(end_type & SUFX_B)) {
818 					int end2;
819 					char end_phonemes22[N_WORD_PHONEMES];
820 
821 					// remove any standard suffix and confirm that the prefix is still recognised
822 					phonemes2[0] = 0;
823 					end2 = TranslateRules(tr, wordx, phonemes2, N_WORD_PHONEMES, end_phonemes22, wflags|FLAG_NO_PREFIX|FLAG_NO_TRACE, dictionary_flags);
824 					if (end2) {
825 						RemoveEnding(tr, wordx, end2, word_copy);
826 						end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags|FLAG_NO_TRACE, dictionary_flags);
827 						memcpy(wordx, word_copy, strlen(word_copy));
828 						if ((end_type & SUFX_P) == 0) {
829 							// after removing the suffix, the prefix is no longer recognised.
830 							// Keep the suffix, but don't use the prefix
831 							end_type = end2;
832 							strcpy(phonemes, phonemes2);
833 							strcpy(end_phonemes, end_phonemes22);
834 							if (option_phonemes & espeakPHONEMES_TRACE) {
835 								DecodePhonemes(end_phonemes, end_phonemes22);
836 								fprintf(f_trans, "  suffix [%s]\n\n", end_phonemes22);
837 							}
838 						}
839 						confirm_prefix = 0;
840 						continue;
841 					}
842 				}
843 
844 				prefix_type = end_type;
845 
846 				if (prefix_type & SUFX_V)
847 					tr->expect_verb = 1; // use the verb form of the word
848 
849 				wordx[-1] = c_temp;
850 
851 				if ((prefix_type & SUFX_B) == 0) {
852 					for (ix = (prefix_type & 0xf); ix > 0; ix--) { // num. of characters to remove
853 						wordx++;
854 						while ((*wordx & 0xc0) == 0x80) wordx++; // for multibyte characters
855 					}
856 				} else {
857 					pfix = 1;
858 					prefix_chars[0] = 0;
859 					n_chars = prefix_type & 0x3f;
860 
861 					for (ix = 0; ix < n_chars; ix++) { // num. of bytes to remove
862 						prefix_chars[pfix++] = *wordx++;
863 
864 						if ((prefix_type & SUFX_B) && (ix == (n_chars-1)))
865 							prefix_chars[pfix-1] = 0; // discard the last character of the prefix, this is the separator character
866 					}
867 					prefix_chars[pfix] = 0;
868 				}
869 				c_temp = wordx[-1];
870 				wordx[-1] = ' ';
871 				confirm_prefix = 1;
872 				wflags |= FLAG_PREFIX_REMOVED;
873 
874 				if (prefix_type & SUFX_B) {
875 					// SUFX_B is used for Turkish, tr_rules contains " ' (Pb"
876 					// examine the prefix part
877 					char *wordpf;
878 					char prefix_phonemes2[12];
879 
880 					strncpy0(prefix_phonemes2, end_phonemes, sizeof(prefix_phonemes2));
881 					wordpf = &prefix_chars[1];
882 					strcpy(prefix_phonemes, phonemes);
883 
884 					// look for stress marker or $abbrev
885 					found = LookupDictList(tr, &wordpf, phonemes, dictionary_flags, 0, wtab);
886 					if (found)
887 						strcpy(prefix_phonemes, phonemes);
888 					if (dictionary_flags[0] & FLAG_ABBREV) {
889 						prefix_phonemes[0] = 0;
890 						SpeakIndividualLetters(tr, wordpf, prefix_phonemes, 1);
891 					}
892 				} else
893 					strcat(prefix_phonemes, end_phonemes);
894 				end_phonemes[0] = 0;
895 
896 				end_type = 0;
897 				found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, SUFX_P, wtab); // without prefix
898 				if (dictionary_flags[0] == 0) {
899 					dictionary_flags[0] = dictionary_flags2[0];
900 					dictionary_flags[1] = dictionary_flags2[1];
901 				} else
902 					prefix_flags = 1;
903 				if (found == false) {
904 					end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags & (FLAG_HYPHEN_AFTER | FLAG_PREFIX_REMOVED), dictionary_flags);
905 
906 					if (phonemes[0] == phonSWITCH) {
907 						// change to another language in order to translate this word
908 						wordx[-1] = c_temp;
909 						strcpy(word_phonemes, phonemes);
910 						return 0;
911 					}
912 				}
913 			}
914 
915 			if ((end_type != 0) && !(end_type & SUFX_P)) {
916 				end_type1 = end_type;
917 				strcpy(phonemes2, phonemes);
918 
919 				// The word has a standard ending, re-translate without this ending
920 				end_flags = RemoveEnding(tr, wordx, end_type, word_copy);
921 				more_suffixes = 1;
922 
923 				while (more_suffixes) {
924 					more_suffixes = 0;
925 					phonemes[0] = 0;
926 
927 					if (prefix_phonemes[0] != 0) {
928 						// lookup the stem without the prefix removed
929 						wordx[-1] = c_temp;
930 						found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab);  // include prefix, but not suffix
931 						wordx[-1] = ' ';
932 						if (phonemes[0] == phonSWITCH) {
933 							// change to another language in order to translate this word
934 							memcpy(wordx, word_copy, strlen(word_copy));
935 							strcpy(word_phonemes, phonemes);
936 							return 0;
937 						}
938 						if (dictionary_flags[0] == 0) {
939 							dictionary_flags[0] = dictionary_flags2[0];
940 							dictionary_flags[1] = dictionary_flags2[1];
941 						}
942 						if (found)
943 							prefix_phonemes[0] = 0; // matched whole word, don't need prefix now
944 
945 						if ((found == false) && (dictionary_flags2[0] != 0))
946 							prefix_flags = 1;
947 					}
948 					if (found == false) {
949 						found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, end_flags, wtab);  // without prefix and suffix
950 						if (phonemes[0] == phonSWITCH) {
951 							// change to another language in order to translate this word
952 							memcpy(wordx, word_copy, strlen(word_copy));
953 							strcpy(word_phonemes, phonemes);
954 							return 0;
955 						}
956 
957 						if (dictionary_flags[0] == 0) {
958 							dictionary_flags[0] = dictionary_flags2[0];
959 							dictionary_flags[1] = dictionary_flags2[1];
960 						}
961 					}
962 					if (found == false) {
963 						if (end_type & SUFX_Q) {
964 							// don't retranslate, use the original lookup result
965 							strcpy(phonemes, phonemes2);
966 						} else {
967 							if (end_flags & FLAG_SUFX)
968 								wflags |= FLAG_SUFFIX_REMOVED;
969 							if (end_type & SUFX_A)
970 								wflags |= FLAG_SUFFIX_VOWEL;
971 
972 							if (end_type & SUFX_M) {
973 								// allow more suffixes before this suffix
974 								strcpy(end_phonemes2, end_phonemes);
975 								end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
976 								strcat(end_phonemes, end_phonemes2); // add the phonemes for the previous suffixes after this one
977 
978 								if ((end_type != 0) && !(end_type & SUFX_P)) {
979 									// there is another suffix
980 									end_flags = RemoveEnding(tr, wordx, end_type, NULL);
981 									more_suffixes = 1;
982 								}
983 							} else {
984 								// don't remove any previous suffix
985 								TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, NULL, wflags, dictionary_flags);
986 								end_type = 0;
987 							}
988 
989 							if (phonemes[0] == phonSWITCH) {
990 								// change to another language in order to translate this word
991 								strcpy(word_phonemes, phonemes);
992 								memcpy(wordx, word_copy, strlen(word_copy));
993 								wordx[-1] = c_temp;
994 								return 0;
995 							}
996 						}
997 					}
998 				}
999 
1000 
1001 				if ((end_type1 & SUFX_T) == 0) {
1002 					// the default is to add the suffix and then determine the word's stress pattern
1003 					AppendPhonemes(tr, phonemes, N_WORD_PHONEMES, end_phonemes);
1004 					end_phonemes[0] = 0;
1005 				}
1006 				memcpy(wordx, word_copy, strlen(word_copy));
1007 			}
1008 
1009 			wordx[-1] = c_temp;
1010 		}
1011 	}
1012 
1013 	if (wflags & FLAG_HAS_PLURAL) {
1014 		// s or 's suffix, append [s], [z] or [Iz] depending on previous letter
1015 		if (last_char == 'f')
1016 			TranslateRules(tr, &word_ss[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1017 		else if ((last_char == 0) || (strchr_w("hsx", last_char) == NULL))
1018 			TranslateRules(tr, &word_zz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1019 		else
1020 			TranslateRules(tr, &word_iz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1021 	}
1022 
1023 	wflags |= emphasize_allcaps;
1024 
1025 	// determine stress pattern for this word
1026 
1027 	add_suffix_phonemes = 0;
1028 	if (end_phonemes[0] != 0)
1029 		add_suffix_phonemes = 2;
1030 
1031 	prefix_stress = 0;
1032 	for (p = prefix_phonemes; *p != 0; p++) {
1033 		if ((*p == phonSTRESS_P) || (*p == phonSTRESS_P2))
1034 			prefix_stress = *p;
1035 	}
1036 	if (prefix_flags || (prefix_stress != 0)) {
1037 		if ((tr->langopts.param[LOPT_PREFIXES]) || (prefix_type & SUFX_T)) {
1038 			char *p_local;
1039 			// German, keep a secondary stress on the stem
1040 			SetWordStress(tr, phonemes, dictionary_flags, 3, 0);
1041 
1042 			// reduce all but the first primary stress
1043 			ix = 0;
1044 			for (p_local = prefix_phonemes; *p_local != 0; p_local++) {
1045 				if (*p_local == phonSTRESS_P) {
1046 					if (ix == 0)
1047 						ix = 1;
1048 					else
1049 						*p_local = phonSTRESS_3;
1050 				}
1051 			}
1052 			snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1053 			word_phonemes[N_WORD_PHONEMES-1] = 0;
1054 			SetWordStress(tr, word_phonemes, dictionary_flags, -1, 0);
1055 		} else {
1056 			// stress position affects the whole word, including prefix
1057 			snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1058 			word_phonemes[N_WORD_PHONEMES-1] = 0;
1059 			SetWordStress(tr, word_phonemes, dictionary_flags, -1, 0);
1060 		}
1061 	} else {
1062 		SetWordStress(tr, phonemes, dictionary_flags, -1, add_suffix_phonemes);
1063 		snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1064 		word_phonemes[N_WORD_PHONEMES-1] = 0;
1065 	}
1066 
1067 	if (end_phonemes[0] != 0) {
1068 		// a suffix had the SUFX_T option set, add the suffix after the stress pattern has been determined
1069 		ix = strlen(word_phonemes);
1070 		end_phonemes[N_WORD_PHONEMES-1-ix] = 0; // ensure no buffer overflow
1071 		strcpy(&word_phonemes[ix], end_phonemes);
1072 	}
1073 
1074 	if (wflags & FLAG_LAST_WORD) {
1075 		// don't use $brk pause before the last word of a sentence
1076 		// (but allow it for emphasis, see below
1077 		dictionary_flags[0] &= ~FLAG_PAUSE1;
1078 	}
1079 
1080 	if ((wflags & FLAG_HYPHEN) && (tr->langopts.stress_flags & S_HYPEN_UNSTRESS))
1081 		ChangeWordStress(tr, word_phonemes, 3);
1082 	else if (wflags & FLAG_EMPHASIZED2) {
1083 		// A word is indicated in the source text as stressed
1084 		// Give it stress level 6 (for the intonation module)
1085 		ChangeWordStress(tr, word_phonemes, 6);
1086 
1087 		if (wflags & FLAG_EMPHASIZED)
1088 			dictionary_flags[0] |= FLAG_PAUSE1; // precede by short pause
1089 	} else if (wtab[dictionary_skipwords].flags & FLAG_LAST_WORD) {
1090 		// the word has attribute to stress or unstress when at end of clause
1091 		if (dictionary_flags[0] & (FLAG_STRESS_END | FLAG_STRESS_END2))
1092 			ChangeWordStress(tr, word_phonemes, 4);
1093 		else if ((dictionary_flags[0] & FLAG_UNSTRESS_END) && (any_stressed_words))
1094 			ChangeWordStress(tr, word_phonemes, 3);
1095 	}
1096 
1097 	// dictionary flags for this word give a clue about which alternative pronunciations of
1098 	// following words to use.
1099 	if (end_type1 & SUFX_F) {
1100 		// expect a verb form, with or without -s suffix
1101 		tr->expect_verb = 2;
1102 		tr->expect_verb_s = 2;
1103 	}
1104 
1105 	if (dictionary_flags[1] & FLAG_PASTF) {
1106 		// expect perfect tense in next two words
1107 		tr->expect_past = 3;
1108 		tr->expect_verb = 0;
1109 		tr->expect_noun = 0;
1110 	} else if (dictionary_flags[1] & FLAG_VERBF) {
1111 		// expect a verb in the next word
1112 		tr->expect_verb = 2;
1113 		tr->expect_verb_s = 0; // verb won't have -s suffix
1114 		tr->expect_noun = 0;
1115 	} else if (dictionary_flags[1] & FLAG_VERBSF) {
1116 		// expect a verb, must have a -s suffix
1117 		tr->expect_verb = 0;
1118 		tr->expect_verb_s = 2;
1119 		tr->expect_past = 0;
1120 		tr->expect_noun = 0;
1121 	} else if (dictionary_flags[1] & FLAG_NOUNF) {
1122 		// not expecting a verb next
1123 		tr->expect_noun = 2;
1124 		tr->expect_verb = 0;
1125 		tr->expect_verb_s = 0;
1126 		tr->expect_past = 0;
1127 	}
1128 
1129 	if ((wordx[0] != 0) && (!(dictionary_flags[1] & FLAG_VERB_EXT))) {
1130 		if (tr->expect_verb > 0)
1131 			tr->expect_verb--;
1132 
1133 		if (tr->expect_verb_s > 0)
1134 			tr->expect_verb_s--;
1135 
1136 		if (tr->expect_noun > 0)
1137 			tr->expect_noun--;
1138 
1139 		if (tr->expect_past > 0)
1140 			tr->expect_past--;
1141 	}
1142 
1143 	if ((word_length == 1) && (tr->translator_name == L('e', 'n')) && iswalpha(first_char) && (first_char != 'i')) {
1144 		// English Specific !!!!
1145 		// any single letter before a dot is an abbreviation, except 'I'
1146 		dictionary_flags[0] |= FLAG_ALLOW_DOT;
1147 	}
1148 
1149 	if ((tr->langopts.param[LOPT_ALT] & 2) && ((dictionary_flags[0] & (FLAG_ALT_TRANS | FLAG_ALT2_TRANS)) != 0))
1150 		ApplySpecialAttribute2(tr, word_phonemes, dictionary_flags[0]);
1151 
1152 	dictionary_flags[0] |= was_unpronouncable;
1153 	memcpy(word_start, word_copy2, word_copy_length);
1154 	return dictionary_flags[0];
1155 }
1156 
// Translate one word into phoneme codes, leaving the result in word_phonemes.
//
// The word is first handed to TranslateWord3().  If that call reports
// FLAG_TEXTMODE and the caller supplied a word_out buffer, word_out holds
// replacement text: each word of that text is translated in turn and the
// results are joined, separated by phonEND_WORD, into word_phonemes.
//
//   tr          translator to use
//   word_start  the word to translate
//   wtab        word table entry for this word; FLAG_FIRST_UPPER is updated
//               here while replacement words are processed
//   word_out    receives replacement text from TranslateWord3(), may be NULL
//
// Returns the flags from the initial TranslateWord3() call.
int TranslateWord(Translator *tr, char *word_start, WORD_TAB *wtab, char *word_out)
{
	char words_phonemes[N_WORD_PHONEMES]; // phoneme codes collected for all replacement words
	char *phonemes = words_phonemes;
	int available = N_WORD_PHONEMES;
	int first_word = 1;

	// make sure the final copy below never reads an uninitialised buffer
	// (the loop does not run at all when the replacement text is empty)
	words_phonemes[0] = 0;

	int flags = TranslateWord3(tr, word_start, wtab, word_out);
	if (flags & FLAG_TEXTMODE && word_out) {
		// Ensure that start of word rules match with the replaced text,
		// so that emoji and other characters are pronounced correctly.
		char word[N_WORD_BYTES+1];
		word[0] = 0;
		word[1] = ' ';
		snprintf(word+2, sizeof(word)-2, "%s", word_out); // bounded copy of the replacement text
		word_out = word+2;

		while (*word_out && available > 1) {
			int c;
			int n;

			utf8_in(&c, word_out);
			if (iswupper(c)) {
				wtab->flags |= FLAG_FIRST_UPPER;
				utf8_out(tolower(c), word_out);
			} else {
				wtab->flags &= ~FLAG_FIRST_UPPER;
			}

			TranslateWord3(tr, word_out, wtab, NULL);

			if (first_word) {
				n = snprintf(phonemes, available, "%s", word_phonemes);
				first_word = 0;
			} else {
				n = snprintf(phonemes, available, "%c%s", phonEND_WORD, word_phonemes);
			}

			if (n < 0)
				break; // formatting failed, keep what has been collected so far
			if (n >= available)
				n = available - 1; // output was truncated; only available-1 characters were stored

			available -= n;
			phonemes += n;

			// skip to the next word in a multi-word replacement.
			// Stop at the terminator: isspace(0) is false, so without this check
			// the scan would run off the end of the last word.
			while (*word_out && !isspace((unsigned char)*word_out)) ++word_out;
			while (isspace((unsigned char)*word_out))  ++word_out;
		}
		snprintf(word_phonemes, sizeof(word_phonemes), "%s", words_phonemes);
	}
	return flags;
}
1205 
// Fill in a first-stage phoneme list entry for phoneme 'phcode'.
// Any pending embedded-command flag is transferred to the entry and then cleared.
static void SetPlist2(PHONEME_LIST2 *p, unsigned char phcode)
{
	// hand the pending embedded flag over to this entry
	p->synthflags = embedded_flag;
	embedded_flag = 0;

	// everything else starts out neutral
	p->sourceix = 0;
	p->tone_ph = 0;
	p->stresslevel = 0;

	p->phcode = phcode;
}
1215 
CountSyllables(unsigned char * phonemes)1216 static int CountSyllables(unsigned char *phonemes)
1217 {
1218 	int count = 0;
1219 	int phon;
1220 	while ((phon = *phonemes++) != 0) {
1221 		if (phoneme_tab[phon]->type == phVOWEL)
1222 			count++;
1223 	}
1224 	return count;
1225 }
1226 
Word_EmbeddedCmd()1227 static void Word_EmbeddedCmd()
1228 {
1229 	// Process embedded commands for emphasis, sayas, and break
1230 	int embedded_cmd;
1231 	int value;
1232 
1233 	do {
1234 		embedded_cmd = embedded_list[embedded_read++];
1235 		value = embedded_cmd >> 8;
1236 
1237 		switch (embedded_cmd & 0x1f)
1238 		{
1239 		case EMBED_Y:
1240 			option_sayas = value;
1241 			break;
1242 
1243 		case EMBED_F:
1244 			option_emphasis = value;
1245 			break;
1246 
1247 		case EMBED_B:
1248 			// break command
1249 			if (value == 0)
1250 				pre_pause = 0; // break=none
1251 			else
1252 				pre_pause += value;
1253 			break;
1254 		}
1255 	} while (((embedded_cmd & 0x80) == 0) && (embedded_read < embedded_ix));
1256 }
1257 
SetTranslator2(const char * new_language)1258 int SetTranslator2(const char *new_language)
1259 {
1260 	// Set translator2 to a second language
1261 	int new_phoneme_tab;
1262 
1263 	if ((new_phoneme_tab = SelectPhonemeTableName(new_language)) >= 0) {
1264 		if ((translator2 != NULL) && (strcmp(new_language, translator2_language) != 0)) {
1265 			// we already have an alternative translator, but not for the required language, delete it
1266 			DeleteTranslator(translator2);
1267 			translator2 = NULL;
1268 		}
1269 
1270 		if (translator2 == NULL) {
1271 			translator2 = SelectTranslator(new_language);
1272 			strcpy(translator2_language, new_language);
1273 
1274 			if (LoadDictionary(translator2, translator2->dictionary_name, 0) != 0) {
1275 				SelectPhonemeTable(voice->phoneme_tab_ix); // revert to original phoneme table
1276 				new_phoneme_tab = -1;
1277 				translator2_language[0] = 0;
1278 			}
1279 			translator2->phoneme_tab_ix = new_phoneme_tab;
1280 		}
1281 	}
1282 	if (translator2 != NULL)
1283 		translator2->phonemes_repeat[0] = 0;
1284 	return new_phoneme_tab;
1285 }
1286 
// Translate one word and append its phonemes to the first stage phoneme list (ph_list2).
//
//   tr        translator whose language options and per-word state are used
//   word      the word text; it may be modified temporarily and is restored
//             from a saved copy on the paths that re-translate it
//   wtab      word table entry for this word (wtab[1] and wtab[-1] are also read)
//   prepause  amount of pause to insert before the word; it is turned into
//             phonPAUSE (2 units) and phonPAUSE_NOLINK (1 unit) entries
//
// Returns the dictionary flags for the word.  FLAG_SPELLWORD is returned
// (without adding phonemes) when the word should be re-translated as
// individual letters.
static int TranslateWord2(Translator *tr, char *word, WORD_TAB *wtab, int prepause)
{
	int flags = 0;
	int stress;
	int next_stress;
	int next_tone = 0;
	unsigned char *p;
	int srcix;
	int found_dict_flag;
	unsigned char ph_code;
	PHONEME_LIST2 *plist2;
	PHONEME_TAB *ph;
	int max_stress;
	int max_stress_ix = 0;
	int prev_vowel = -1;
	int pitch_raised = 0;
	int switch_phonemes = -1;
	int first_phoneme = 1;
	int source_ix;
	int len;
	int ix;
	int sylimit; // max. number of syllables in a word to be combined with a preceding preposition
	const char *new_language;
	int bad_phoneme;
	int word_flags;
	int word_copy_len;
	char word_copy[N_WORD_BYTES+1];
	char word_replaced[N_WORD_BYTES+1];
	char old_dictionary_name[40];

	len = wtab->length;
	if (len > 31) len = 31;
	source_ix = (wtab->sourceix & 0x7ff) | (len << 11); // bits 0-10 sourceix, bits 11-15 word length

	word_flags = wtab[0].flags;
	if (word_flags & FLAG_EMBEDDED) {
		wtab[0].flags &= ~FLAG_EMBEDDED; // clear it in case we call TranslateWord2() again for the same word
		embedded_flag = SFLAG_EMBEDDED;

		Word_EmbeddedCmd();
	}

	if ((word[0] == 0) || (word_flags & FLAG_DELETE_WORD)) {
		// nothing to translate.  Add a dummy phoneme to carry any embedded commands
		if (embedded_flag) {
			ph_list2[n_ph_list2].phcode = phonEND_WORD;
			ph_list2[n_ph_list2].stresslevel = 0;
			ph_list2[n_ph_list2].wordstress = 0;
			ph_list2[n_ph_list2].tone_ph = 0;
			ph_list2[n_ph_list2].synthflags = embedded_flag;
			ph_list2[n_ph_list2].sourceix = 0;
			n_ph_list2++;
			embedded_flag = 0;
		}
		word_phonemes[0] = 0;
		return 0;
	}

	// after a $pause word attribute, ignore a $pause attribute on the next two words
	if (tr->prepause_timeout > 0)
		tr->prepause_timeout--;

	if ((option_sayas & 0xf0) == 0x10) {
		if (!(word_flags & FLAG_FIRST_WORD)) {
			// SAYAS_CHARS, SAYAS_GLYPHS, or SAYAS_SINGLECHARS.  Pause between each word.
			prepause += 4;
		}
	}

	if (word_flags & FLAG_FIRST_UPPER) {
		if ((option_capitals > 2) && (embedded_ix < N_EMBEDDED_LIST-6)) {
			// indicate capital letter by raising pitch
			if (embedded_flag)
				embedded_list[embedded_ix-1] &= ~0x80; // already embedded command before this word, remove terminator
			if ((pitch_raised = option_capitals) == 3)
				pitch_raised = 20; // default pitch raise for capitals
			embedded_list[embedded_ix++] = EMBED_P+0x40+0x80 + (pitch_raised << 8); // raise pitch
			embedded_flag = SFLAG_EMBEDDED;
		}
	}

	p = (unsigned char *)word_phonemes;
	if (word_flags & FLAG_PHONEMES) {
		// The input is in phoneme mnemonics, not language text
		int c1;
		int name_too_long = 0;
		char lang_name[12];

		if (memcmp(word, "_^_", 3) == 0) {
			// switch languages
			word += 3;
			for (ix = 0;;) {
				c1 = *word++;
				if ((c1 == ' ') || (c1 == 0))
					break;
				// The name comes straight from the input text, so never write past
				// the end of lang_name.  The rest of an over-long name is still
				// consumed so that 'word' ends up in the same place.
				if (ix < (int)sizeof(lang_name) - 1)
					lang_name[ix++] = tolower((unsigned char)c1);
				else
					name_too_long = 1;
			}
			lang_name[ix] = 0;

			// an over-long name is not looked up
			if (!name_too_long && ((ix = LookupPhonemeTable(lang_name)) > 0)) {
				SelectPhonemeTable(ix);
				word_phonemes[0] = phonSWITCH;
				word_phonemes[1] = ix;
				word_phonemes[2] = 0;
			}
		} else
			EncodePhonemes(word, word_phonemes, &bad_phoneme);
		flags = FLAG_FOUND;
	} else {
		int c2;
		// keep a copy of the word so that it can be restored before re-translating
		ix = 0;
		while (((c2 = word_copy[ix] = word[ix]) != ' ') && (c2 != 0) && (ix < N_WORD_BYTES)) ix++;
		word_copy_len = ix;

		word_replaced[2] = 0;
		flags = TranslateWord(translator, word, wtab, &word_replaced[2]);

		if (flags & FLAG_SPELLWORD) {
			// re-translate the word as individual letters, separated by spaces
			memcpy(word, word_copy, word_copy_len);
			return flags;
		}

		if ((flags & FLAG_COMBINE) && !(wtab[1].flags & FLAG_PHONEMES)) {
			char *p2;
			int ok = 1;
			unsigned int flags2[2];
			int c_word2;
			char ph_buf[N_WORD_PHONEMES];

			flags2[0] = 0;
			sylimit = tr->langopts.param[LOPT_COMBINE_WORDS];

			// LANG=cs,sk
			// combine a preposition with the following word
			p2 = word;
			while (*p2 != ' ') p2++;

			utf8_in(&c_word2, p2+1); // first character of the next word;
			if (!iswalpha(c_word2))
				ok = 0;

			if (ok != 0) {
				// save this word's phonemes, translating the next word overwrites word_phonemes
				strcpy(ph_buf, word_phonemes);

				flags2[0] = TranslateWord(translator, p2+1, wtab+1, NULL);
				if ((flags2[0] & FLAG_WAS_UNPRONOUNCABLE) || (word_phonemes[0] == phonSWITCH))
					ok = 0;

				if (sylimit & 0x100) {
					// only if the second word has $alt attribute
					if ((flags2[0] & FLAG_ALT_TRANS) == 0)
						ok = 0;
				}

				if ((sylimit & 0x200) && ((wtab+1)->flags & FLAG_LAST_WORD)) {
					// not if the next word is end-of-sentence
					ok = 0;
				}

				if (ok == 0)
					strcpy(word_phonemes, ph_buf);
			}

			if (ok) {
				*p2 = '-'; // replace next space by hyphen
				wtab[0].flags &= ~FLAG_ALL_UPPER; // prevent it being considered an abbreviation
				flags = TranslateWord(translator, word, wtab, NULL); // translate the combined word
				if ((sylimit > 0) && (CountSyllables(p) > (sylimit & 0x1f))) {
					// revert to separate words
					*p2 = ' ';
					flags = TranslateWord(translator, word, wtab, NULL);
				} else {
					if (flags == 0)
						flags = flags2[0]; // no flags for the combined word, so use flags from the second word eg. lang-hu "nem december 7-e"
					flags |= FLAG_SKIPWORDS;
					dictionary_skipwords = 1;
				}
			}
		}

		if (p[0] == phonSWITCH) {
			int switch_attempt;
			strcpy(old_dictionary_name, dictionary_name);
			for (switch_attempt = 0; switch_attempt < 2; switch_attempt++) {
				// this word uses a different language
				memcpy(word, word_copy, word_copy_len);

				// the language name follows the phonSWITCH code; empty means English
				new_language = (char *)(&p[1]);
				if (new_language[0] == 0)
					new_language = "en";

				switch_phonemes = SetTranslator2(new_language);

				if (switch_phonemes >= 0) {
					// re-translate the word using the new translator
					wtab[0].flags |= FLAG_TRANSLATOR2;
					if (word_replaced[2] != 0) {
						word_replaced[0] = 0; // byte before the start of the word
						word_replaced[1] = ' ';
						flags = TranslateWord(translator2, &word_replaced[1], wtab, NULL);
					} else
						flags = TranslateWord(translator2, word, wtab, &word_replaced[2]);
				}

				if (p[0] != phonSWITCH)
					break;
			}

			if (p[0] == phonSWITCH)
				return FLAG_SPELLWORD;

			if (switch_phonemes < 0) {
				// language code is not recognised or 2nd translator won't translate it
				p[0] = phonSCHWA; // just say something
				p[1] = phonSCHWA;
				p[2] = 0;
			}

			if (switch_phonemes == -1) {
				strcpy(dictionary_name, old_dictionary_name);
				SelectPhonemeTable(voice->phoneme_tab_ix);

				// leave switch_phonemes set, but use the original phoneme table number.
				// This will suppress LOPT_REGRESSIVE_VOICING
				switch_phonemes = voice->phoneme_tab_ix; // original phoneme table
			}
		}

		if (!(word_flags & FLAG_HYPHEN)) {
			if (flags & FLAG_PAUSE1) {
				if (prepause < 1)
					prepause = 1;
			}
			if ((flags & FLAG_PREPAUSE) && !(word_flags & (FLAG_LAST_WORD | FLAG_FIRST_WORD)) && !(wtab[-1].flags & FLAG_FIRST_WORD) && (tr->prepause_timeout == 0)) {
				// the word is marked in the dictionary list with $pause
				if (prepause < 4) prepause = 4;
				tr->prepause_timeout = 3;
			}
		}

		if ((option_emphasis >= 3) && (prepause < 1))
			prepause = 1;
	}

	stress = 0;
	next_stress = 1;
	srcix = 0;
	max_stress = -1;

	found_dict_flag = 0;
	if ((flags & FLAG_FOUND) && !(flags & FLAG_TEXTMODE))
		found_dict_flag = SFLAG_DICTIONARY;

	while ((prepause > 0) && (n_ph_list2 < N_PHONEME_LIST-4)) {
		// add pause phonemes here. Either because of punctuation (brackets or quotes) in the
		// text, or because the word is marked in the dictionary lookup as a conjunction
		if (prepause > 1) {
			SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE);
			prepause -= 2;
		} else {
			SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE_NOLINK);
			prepause--;
		}
		tr->end_stressed_vowel = 0; // forget about the previous word
		tr->prev_dict_flags[0] = 0;
		tr->prev_dict_flags[1] = 0;
	}
	// remember where this word starts in the list, its sourceix is set at the end
	plist2 = &ph_list2[n_ph_list2];

	if ((option_capitals == 1) && (word_flags & FLAG_FIRST_UPPER)) {
		SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE_SHORT);
		SetPlist2(&ph_list2[n_ph_list2++], phonCAPITAL);
		if ((word_flags & FLAG_ALL_UPPER) && IsAlpha(word[1])) {
			// word > 1 letter and all capitals
			SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE_SHORT);
			SetPlist2(&ph_list2[n_ph_list2++], phonCAPITAL);
		}
	}

	if (switch_phonemes >= 0) {
		if ((p[0] == phonPAUSE) && (p[1] == phonSWITCH)) {
			// the new word starts with a phoneme table switch, so there's no need to switch before it.
			if (ph_list2[n_ph_list2-1].phcode == phonSWITCH) {
				// previous phoneme is also a phonSWITCH, delete it
				n_ph_list2--;
			}
		} else {
			// this word uses a different phoneme table
			if (ph_list2[n_ph_list2-1].phcode == phonSWITCH) {
				// previous phoneme is also a phonSWITCH, just change its phoneme table number
				n_ph_list2--;
			} else
				SetPlist2(&ph_list2[n_ph_list2], phonSWITCH);
			ph_list2[n_ph_list2++].tone_ph = switch_phonemes; // temporary phoneme table number
		}
	}

	// remove initial pause from a word if it follows a hyphen
	if ((word_flags & FLAG_HYPHEN) && (phoneme_tab[*p]->type == phPAUSE))
		p++;

	if ((p[0] == 0) && (embedded_flag)) {
		// no phonemes.  Insert a very short pause to carry an embedded command
		p[0] = phonPAUSE_VSHORT;
		p[1] = 0;
	}

	while (((ph_code = *p++) != 0) && (n_ph_list2 < N_PHONEME_LIST-4)) {
		if (ph_code == 255)
			continue; // unknown phoneme

		// Add the phonemes to the first stage phoneme list (ph_list2)
		ph = phoneme_tab[ph_code];

		if (ph_code == phonSWITCH) {
			// the byte after phonSWITCH is the phoneme table number
			ph_list2[n_ph_list2].phcode = ph_code;
			ph_list2[n_ph_list2].sourceix = 0;
			ph_list2[n_ph_list2].synthflags = 0;
			ph_list2[n_ph_list2++].tone_ph = *p;
			SelectPhonemeTable(*p);
			p++;
		} else if (ph->type == phSTRESS) {
			// don't add stress phonemes codes to the list, but give their stress
			// value to the next vowel phoneme
			// std_length is used to hold stress number or (if >10) a tone number for a tone language
			if (ph->program == 0)
				next_stress = ph->std_length;
			else {
				// for tone languages, the tone number for a syllable follows the vowel
				if (prev_vowel >= 0)
					ph_list2[prev_vowel].tone_ph = ph_code;
				else
					next_tone = ph_code; // no previous vowel, apply to the next vowel
			}
		} else if (ph_code == phonSYLLABIC) {
			// mark the previous phoneme as a syllabic consonant
			prev_vowel = n_ph_list2-1;
			ph_list2[prev_vowel].synthflags |= SFLAG_SYLLABLE;
			ph_list2[prev_vowel].stresslevel = next_stress;
		} else if (ph_code == phonLENGTHEN)
			ph_list2[n_ph_list2-1].synthflags |= SFLAG_LENGTHEN;
		else if (ph_code == phonEND_WORD) {
			// a || symbol in a phoneme string was used to indicate a word boundary
			// Don't add this phoneme to the list, but make sure the next phoneme has
			// a newword indication
			srcix = source_ix+1;
		} else if (ph_code == phonX1) {
			// a language specific action
			if (tr->langopts.param[LOPT_IT_DOUBLING])
				flags |= FLAG_DOUBLING;
		} else {
			ph_list2[n_ph_list2].phcode = ph_code;
			ph_list2[n_ph_list2].tone_ph = 0;
			ph_list2[n_ph_list2].synthflags = embedded_flag | found_dict_flag;
			embedded_flag = 0;
			ph_list2[n_ph_list2].sourceix = srcix;
			srcix = 0;

			if (ph->type == phVOWEL) {
				stress = next_stress;
				next_stress = 1; // default is 'unstressed'

				if (stress >= 4)
					any_stressed_words = 1;

				if ((prev_vowel >= 0) && (n_ph_list2-1) != prev_vowel)
					ph_list2[n_ph_list2-1].stresslevel = stress; // set stress for previous consonant

				ph_list2[n_ph_list2].synthflags |= SFLAG_SYLLABLE;
				prev_vowel = n_ph_list2;

				// remember the most strongly stressed vowel (the first one, if there are several)
				if (stress > max_stress) {
					max_stress = stress;
					max_stress_ix = n_ph_list2;
				}
				if (next_tone != 0) {
					ph_list2[n_ph_list2].tone_ph = next_tone;
					next_tone = 0;
				}
			} else {
				if (first_phoneme && tr->langopts.param[LOPT_IT_DOUBLING]) {
					if (((tr->prev_dict_flags[0] & FLAG_DOUBLING) && (tr->langopts.param[LOPT_IT_DOUBLING] & 1)) ||
					    (tr->end_stressed_vowel && (tr->langopts.param[LOPT_IT_DOUBLING] & 2))) {
						// italian, double the initial consonant if the previous word ends with a
						// stressed vowel, or is marked with a flag
						ph_list2[n_ph_list2].synthflags |= SFLAG_LENGTHEN;
					}
				}
			}

			ph_list2[n_ph_list2].stresslevel = stress;
			n_ph_list2++;
			first_phoneme = 0;
		}
	}

	if (word_flags & FLAG_COMMA_AFTER)
		SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE_CLAUSE);

	// don't set new-word if there is a hyphen before it
	if ((word_flags & FLAG_HYPHEN) == 0)
		plist2->sourceix = source_ix;

	tr->end_stressed_vowel = 0;
	if ((stress >= 4) && (phoneme_tab[ph_list2[n_ph_list2-1].phcode]->type == phVOWEL))
		tr->end_stressed_vowel = 1; // word ends with a stressed vowel

	if (switch_phonemes >= 0) {
		// this word uses a different phoneme table, now switch back
		strcpy(dictionary_name, old_dictionary_name);
		SelectPhonemeTable(voice->phoneme_tab_ix);
		SetPlist2(&ph_list2[n_ph_list2], phonSWITCH);
		ph_list2[n_ph_list2++].tone_ph = voice->phoneme_tab_ix; // original phoneme table number
	}


	if (pitch_raised > 0) {
		// undo the pitch raise that was added for the capital letter
		embedded_list[embedded_ix++] = EMBED_P+0x60+0x80 + (pitch_raised << 8); // lower pitch
		SetPlist2(&ph_list2[n_ph_list2], phonPAUSE_SHORT);
		ph_list2[n_ph_list2++].synthflags = SFLAG_EMBEDDED;
	}

	if (flags & FLAG_STRESS_END2) {
		// this's word's stress could be increased later
		ph_list2[max_stress_ix].synthflags |= SFLAG_PROMOTE_STRESS;
	}

	tr->prev_dict_flags[0] = flags;
	return flags;
}
1717 
EmbeddedCommand(unsigned int * source_index_out)1718 static int EmbeddedCommand(unsigned int *source_index_out)
1719 {
1720 	// An embedded command to change the pitch, volume, etc.
1721 	// returns number of commands added to embedded_list
1722 
1723 	// pitch,speed,amplitude,expression,reverb,tone,voice,sayas
1724 	const char *commands = "PSARHTIVYMUBF";
1725 	int value = -1;
1726 	int sign = 0;
1727 	unsigned char c;
1728 	char *p;
1729 	int cmd;
1730 	int source_index = *source_index_out;
1731 
1732 	c = source[source_index];
1733 	if (c == '+') {
1734 		sign = 0x40;
1735 		source_index++;
1736 	} else if (c == '-') {
1737 		sign = 0x60;
1738 		source_index++;
1739 	}
1740 
1741 	if (IsDigit09(source[source_index])) {
1742 		value = atoi(&source[source_index]);
1743 		while (IsDigit09(source[source_index]))
1744 			source_index++;
1745 	}
1746 
1747 	c = source[source_index++];
1748 	if (embedded_ix >= (N_EMBEDDED_LIST - 2))
1749 		return 0; // list is full
1750 
1751 	if ((p = strchr_w(commands, c)) == NULL)
1752 		return 0;
1753 	cmd = (p - commands)+1;
1754 	if (value == -1) {
1755 		value = embedded_default[cmd];
1756 		sign = 0;
1757 	}
1758 
1759 	if (cmd == EMBED_Y) {
1760 		option_sayas2 = value;
1761 		count_sayas_digits = 0;
1762 	}
1763 	if (cmd == EMBED_F) {
1764 		if (value >= 3)
1765 			word_emphasis = FLAG_EMPHASIZED;
1766 		else
1767 			word_emphasis = 0;
1768 	}
1769 
1770 	embedded_list[embedded_ix++] = cmd + sign + (value << 8);
1771 	*source_index_out = source_index;
1772 	return 1;
1773 }
1774 
// Apply the language's character replacement list (langopts.replace_chars) to 'c'.
//
// The list holds pairs of values.  The low 16 bits of the first value are the
// (lower case) character to match; if its high 16 bits are non-zero they must
// also match the lower-cased following character, which is then skipped on the
// next call.  The second value is the replacement, which may carry a second
// character to insert in its upper bits.
//
// Returns the replacement character, 'c' itself if nothing matched, 0 for
// c == 0, or 8 for a character consumed by the previous call's match.
// '*insert' receives the second replacement character, if there is one, and
// FLAG_CHAR_REPLACED is set in '*wordflags' when a replacement is made.
static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{
	static int ignore_next = 0; // the previous call matched a two-character sequence
	const unsigned int *table;
	const unsigned int *entry;
	unsigned int key_char;
	unsigned int replacement = 0;
	int was_upper = 0;

	if (ignore_next) {
		ignore_next = 0;
		return 8;
	}
	if (c == 0)
		return 0;

	table = tr->langopts.replace_chars;
	if (table == NULL)
		return c;

	// the list is matched against the lower case form of the character
	key_char = c;
	if (iswupper(c)) {
		key_char = towlower2(c);
		was_upper = 1;
	}

	for (entry = table; entry[0] != 0; entry += 2) {
		unsigned int match_next = entry[0] >> 16;

		if (key_char != (entry[0] & 0xffff))
			continue;

		if (match_next == 0) {
			// single character match
			replacement = entry[1];
			break;
		}
		if (match_next == (unsigned int)towlower2(next_in)) {
			// matched this character together with the next one
			replacement = entry[1];
			ignore_next = 1;
			break;
		}
	}

	if (replacement == 0)
		return c; // no substitution

	if (replacement & 0xffe00000) {
		// there is a second character to be inserted
		// don't convert the case of the second character unless the next letter is also upper case
		unsigned int second = replacement >> 16;
		if (was_upper && iswupper(next_in))
			second = toupper(second);
		*insert = second;
		replacement &= 0xffff;
	}

	// NOTE(review): toupper() is given a character code here, which is only
	// defined for values that fit in an unsigned char - confirm this is intended.
	if (was_upper)
		replacement = toupper(replacement);

	*wordflags |= FLAG_CHAR_REPLACED;
	return replacement;
}
1834 
// To allow language specific examination and replacement of characters
//
//   tr         translator (selects the language specific handling)
//   ptr        points into the text being read; ptr[0] may be overwritten with
//              a space to delete that character
//   prev_in    the previous input character
//   c          the character to examine
//   next_in    the following input character
//   insert     receives a character to be inserted after the returned one
//   wordflags  passed on to SubstituteChar()
//
// Returns the character to use in place of 'c'.
static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{
	int code;
	int initial;
	int medial;
	int final;
	int next2;

	// Hangul compatibility jamo U+3130..U+3163, given as offsets from U+1100.
	// Index 0x0f (U+313F, rieul-phieuph) is 0xb5 (U+11B5); it previously
	// repeated the 0xb4 of the entry before it (U+313E, rieul-thieuth).
	static const unsigned char hangul_compatibility[0x34] = {
		0,  0x00, 0x01, 0xaa, 0x02, 0xac, 0xad, 0x03,
		0x04, 0x05, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5,
		0xb6, 0x06, 0x07, 0x08, 0xb9, 0x09, 0x0a, 0xbc,
		0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x61,
		0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
		0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71,
		0x72, 0x73, 0x74, 0x75
	};

	// check for Korean Hangul letters
	if (((code = c - 0xac00) >= 0) && (c <= 0xd7af)) {
		// break a syllable hangul into 2 or 3 individual jamo
		// (a syllable code is ((initial * 21) + medial) * 28 + final)
		initial = (code/28)/21;
		medial = (code/28) % 21;
		final = code % 28;

		if (initial == 11) {
			// null initial
			c = medial + 0x1161;
			if (final > 0)
				*insert = final + 0x11a7;
		} else {
			// extract the initial and insert the remainder with a null initial
			c = initial + 0x1100;
			*insert = (11*28*21) + (medial*28) + final + 0xac00;
		}
		return c;
	} else if (((code = c - 0x3130) >= 0) && (code < 0x34)) {
		// Hangul compatibility jamo
		return hangul_compatibility[code] + 0x1100;
	}

	switch (tr->translator_name)
	{
	case L('a', 'f'):
	case L('n', 'l'):
		// look for 'n  and replace by a special character (unicode: schwa)

		if (!iswalpha(prev_in)) {
			utf8_in(&next2, &ptr[1]); // the character after next_in

			if ((c == '\'') && IsSpace(next2)) {
				if ((next_in == 'n') && (tr->translator_name == L('a', 'f'))) {
					// n preceded by either apostrophe or U2019 "right single quotation mark"
					ptr[0] = ' '; // delete the n
					return 0x0259; // replace  '  by  unicode schwa character
				}
				if ((next_in == 'n') || (next_in == 't')) {
					// Dutch, [@n] and [@t]
					return 0x0259; // replace  '  by  unicode schwa character
				}
			}
		}
		break;
	}
	return SubstituteChar(tr, c, next_in, insert, wordflags);
}
1903 
1904 static const char *UCase_ga[] = { "bp", "bhf", "dt", "gc", "hA", "mb", "nd", "ng", "ts", "tA", "nA", NULL };
1905 
UpperCaseInWord(Translator * tr,char * word,int c)1906 static int UpperCaseInWord(Translator *tr, char *word, int c)
1907 {
1908 	int ix;
1909 	int len;
1910 	const char *p;
1911 
1912 	if (tr->translator_name == L('g', 'a')) {
1913 		// Irish
1914 		for (ix = 0;; ix++) {
1915 			if ((p = UCase_ga[ix]) == NULL)
1916 				break;
1917 
1918 			len = strlen(p);
1919 			if ((word[-len] == ' ') && (memcmp(&word[-len+1], p, len-1) == 0)) {
1920 				if ((c == p[len-1]) || ((p[len-1] == 'A') && IsVowel(tr, c)))
1921 					return 1;
1922 			}
1923 		}
1924 	}
1925 	return 0;
1926 }
1927 
// Translate one clause of text into phonemes.
//
// Reads a clause with ReadClause() into 'source', splits it into space-separated
// words in a local buffer (sbuf) while recording per-word flags in 'words',
// translates each word with TranslateWord2(), then calls MakePhonemeList().
//
// tr:           translator to use; nothing is done if this is NULL.
// tone_out:     if not NULL, receives the intonation type chosen for the clause.
// voice_change: if not NULL, receives a pointer to a static buffer holding the
//               voice name when the clause terminator has CLAUSE_TYPE_VOICE_CHANGE
//               set, otherwise receives NULL.
void TranslateClause(Translator *tr, int *tone_out, char **voice_change)
{
	int ix;
	int c;
	int cc = 0;
	unsigned int source_index = 0;
	unsigned int prev_source_index = 0;
	int source_index_word = 0;
	int prev_in;
	int prev_out = ' ';
	int prev_out2;
	int prev_in_save = 0;
	int next_in;
	int next_in_nbytes;
	int char_inserted = 0;
	int clause_pause;
	int pre_pause_add = 0;
	int word_mark = 0;
	int all_upper_case = FLAG_ALL_UPPER;
	int finished;
	int single_quoted;
	int phoneme_mode = 0;
	int dict_flags = 0; // returned from dictionary lookup
	int word_flags; // set here
	int next_word_flags;
	int new_sentence2;
	int embedded_count = 0;
	int letter_count = 0;
	int space_inserted = 0;
	int syllable_marked = 0;
	int decimal_sep_count = 0;
	char *word;
	char *p;
	int j, k;
	int n_digits;
	int charix_top = 0;

	short charix[N_TR_SOURCE+4];
	WORD_TAB words[N_CLAUSE_WORDS];
	static char voice_change_name[40];
	int word_count = 0; // index into words

	char sbuf[N_TR_SOURCE];

	int terminator;
	int tone;
	int tone2;

	if (tr == NULL)
		return;

	embedded_ix = 0;
	embedded_read = 0;
	pre_pause = 0;
	any_stressed_words = 0;

	if ((clause_start_char = count_characters) < 0)
		clause_start_char = 0;
	clause_start_word = count_words + 1;

	for (ix = 0; ix < N_TR_SOURCE; ix++)
		charix[ix] = 0;
	terminator = ReadClause(tr, source, charix, &charix_top, N_TR_SOURCE, &tone2, voice_change_name);

	// sentinel entries after the last index, so that the scans over charix[] below terminate
	charix[charix_top+1] = 0;
	charix[charix_top+2] = 0x7fff;
	charix[charix_top+3] = 0;

	clause_pause = (terminator & CLAUSE_PAUSE) * 10; // mS
	if (terminator & CLAUSE_PAUSE_LONG)
		clause_pause = clause_pause * 32; // pause value is *320mS not *10mS

	tone = (terminator & CLAUSE_INTONATION_TYPE) >> 12;
	if (tone2 != 0) {
		// override the tone type
		tone = tone2;
	}

	for (p = source; *p != 0; p++) {
		if (!isspace2(*p))
			break;
	}
	if (*p == 0) {
		// No characters except spaces. This is not a sentence.
		// Don't add this pause, just make up the previous pause to this value;
		clause_pause -= max_clause_pause;
		if (clause_pause < 0)
			clause_pause = 0;

		if (new_sentence)
			terminator |= CLAUSE_TYPE_SENTENCE; // carry forward an end-of-sentence indicator
		max_clause_pause += clause_pause;
		new_sentence2 = 0;
	} else {
		max_clause_pause = clause_pause;
		new_sentence2 = new_sentence;
	}
	tr->clause_terminator = terminator;

	if (new_sentence2) {
		count_sentences++;
		if (skip_sentences > 0) {
			skip_sentences--;
			if (skip_sentences == 0)
				skipping_text = 0;
		}
	}

	// the phoneme list for the clause starts with a short pause
	memset(&ph_list2[0], 0, sizeof(ph_list2[0]));
	ph_list2[0].phcode = phonPAUSE_SHORT;

	n_ph_list2 = 1;
	tr->prev_last_stress = 0;
	tr->prepause_timeout = 0;
	tr->expect_verb = 0;
	tr->expect_noun = 0;
	tr->expect_past = 0;
	tr->expect_verb_s = 0;
	tr->phonemes_repeat_count = 0;
	tr->end_stressed_vowel = 0;
	tr->prev_dict_flags[0] = 0;
	tr->prev_dict_flags[1] = 0;

	word_count = 0;
	single_quoted = 0;
	word_flags = 0;
	next_word_flags = 0;

	// sbuf starts with a zero byte and two spaces, the first word starts at index 3
	sbuf[0] = 0;
	sbuf[1] = ' ';
	sbuf[2] = ' ';
	ix = 3;
	prev_in = ' ';

	words[0].start = ix;
	words[0].flags = 0;
	finished = 0;

	for (j = 0; charix[j] <= 0; j++) ;
	words[0].sourceix = charix[j];
	k = 0;
	while (charix[j] != 0) {
		// count the number of characters (excluding multibyte continuation bytes)
		if (charix[j++] != -1)
			k++;
	}
	words[0].length = k;

	// Stop 4 bytes short of the end of sbuf. A single pass can append an extra
	// separator space, an end-of-word space and a pre-pause space (or one UTF-8
	// character), and the terminating zero is written at sbuf[ix] after the loop.
	// Testing against the full size allowed those writes to go past the buffer.
	while (!finished && (ix < (int)sizeof(sbuf) - 4) && (n_ph_list2 < N_PHONEME_LIST-4)) {
		prev_out2 = prev_out;
		utf8_in2(&prev_out, &sbuf[ix-1], 1);

		if (tr->langopts.tone_numbers && IsDigit09(prev_out) && IsAlpha(prev_out2)) {
			// tone numbers can be part of a word, consider them as alphabetic
			prev_out = 'a';
		}

		if (prev_in_save != 0) {
			prev_in = prev_in_save;
			prev_in_save = 0;
		} else if (source_index > 0)
			utf8_in2(&prev_in, &source[source_index-1], 1);

		prev_source_index = source_index;

		if (char_inserted) {
			// a character queued by the previous pass is processed before reading more input
			c = char_inserted;
			char_inserted = 0;
		} else {
			source_index += utf8_in(&cc, &source[source_index]);
			c = cc;
		}
		next_in_nbytes = utf8_in(&next_in, &source[source_index]);

		if (c == 0) {
			finished = 1;
			c = ' ';
		}

		if ((c == CTRL_EMBEDDED) || (c == ctrl_embedded)) {
			// start of embedded command in the text
			int srcix = source_index-1;

			if (prev_in != ' ') {
				// end the current word first, the command is read again on a later pass
				c = ' ';
				prev_in_save = c;
				source_index--;
			} else {
				embedded_count += EmbeddedCommand(&source_index);
				prev_in_save = prev_in;
				// replace the embedded command by spaces
				memset(&source[srcix], ' ', source_index-srcix);
				source_index = srcix;
				continue;
			}
		}

		if ((option_sayas2 == SAYAS_KEY) && (c != ' ')) {
			if ((prev_in == ' ') && (next_in == ' '))
				option_sayas2 = SAYAS_SINGLE_CHARS; // single character, speak its name
			c = towlower2(c);
		}


		if (phoneme_mode) {
			all_upper_case = FLAG_PHONEMES;

			if ((c == ']') && (next_in == ']')) {
				phoneme_mode = 0;
				source_index++;
				c = ' ';
			}
		} else if ((option_sayas2 & 0xf0) == SAYAS_DIGITS) {
			if (iswdigit(c)) {
				count_sayas_digits++;
				if (count_sayas_digits > (option_sayas2 & 0xf)) {
					// break after the specified number of digits
					c = ' ';
					space_inserted = 1;
					count_sayas_digits = 0;
				}
			} else {
				count_sayas_digits = 0;
				if (iswdigit(prev_out)) {
					c = ' ';
					space_inserted = 1;
				}
			}
		} else if ((option_sayas2 & 0x10) == 0) {
			// speak as words

			if ((c == 0x92) || (c == 0xb4) || (c == 0x2019) || (c == 0x2032))
				c = '\''; // 'microsoft' quote or sexed closing single quote, or prime - possibly used as apostrophe

			if (((c == 0x2018) || (c == '?')) && IsAlpha(prev_out) && IsAlpha(next_in)) {
				// ? between two letters may be a smart-quote replaced by ?
				c = '\'';
			}

			if (c == CHAR_EMPHASIS) {
				// this character is a marker that the previous word is the focus of the clause
				c = ' ';
				word_flags |= FLAG_FOCUS;
			}

			if (c == CHAR_COMMA_BREAK) {
				c = ' ';
				word_flags |= FLAG_COMMA_AFTER;
			}

			c = TranslateChar(tr, &source[source_index], prev_in, c, next_in, &char_inserted, &word_flags);  // optional language specific function
			if (c == 8)
				continue; // ignore this character

			if (char_inserted)
				next_in = char_inserted;

			// allow certain punctuation within a word (usually only apostrophe)
			if (!IsAlpha(c) && !IsSpace(c) && (wcschr(tr->punct_within_word, c) == 0)) {
				if (IsAlpha(prev_out)) {
					if (tr->langopts.tone_numbers && IsDigit09(c) && !IsDigit09(next_in)) {
						// allow a tone number as part of the word
					} else {
						c = ' '; // ensure we have an end-of-word terminator
						space_inserted = 1;
					}
				}
			}

			if (iswdigit(prev_out)) {
				if (!iswdigit(c) && (c != '.') && (c != ',') && (c != ' ')) {
					c = ' '; // terminate digit string with a space
					space_inserted = 1;
				}
			} else { // Prev output is not digit
				if (prev_in == ',') {
					// Workaround for several consecutive commas -
					// replace current character with space
					if (c == ',')
						c = ' ';
				} else {
					decimal_sep_count = 0;
				}
			}

			if (c == '[') {
				if ((next_in == '\002') || ((next_in == '[') && option_phoneme_input)) {
					//  "[\002" is used internally to start phoneme mode
					phoneme_mode = FLAG_PHONEMES;
					source_index++;
					continue;
				}
			}

			if (IsAlpha(c)) {
				if (!IsAlpha(prev_out) || (tr->langopts.ideographs && ((c > 0x3040) || (prev_out > 0x3040)))) {
					if (wcschr(tr->punct_within_word, prev_out) == 0)
						letter_count = 0; // don't reset count for an apostrophe within a word

					if ((prev_out != ' ') && (wcschr(tr->punct_within_word, prev_out) == 0)) {
						// start of word, insert space if not one there already
						c = ' ';
						space_inserted = 1;

						if (!IsBracket(prev_out)) // ?? perhaps only set FLAG_NOSPACE for . - /  (hyphenated words, URLs, etc)
							next_word_flags |= FLAG_NOSPACE;
					} else {
						if (iswupper(c))
							word_flags |= FLAG_FIRST_UPPER;

						if ((prev_out == ' ') && iswdigit(sbuf[ix-2]) && !iswdigit(prev_in)) {
							// word, following a number, but with a space between
							// Add an extra space, to distinguish "2 a" from "2a"
							sbuf[ix++] = ' ';
							words[word_count].start++;
						}
					}
				}

				if (c != ' ') {
					letter_count++;

					if (tr->letter_bits_offset > 0) {
						if (((c < 0x250) && (prev_out >= tr->letter_bits_offset)) ||
						    ((c >= tr->letter_bits_offset) && (letter_count > 1) && (prev_out < 0x250))) {
							// Don't mix native and Latin characters in the same word
							// Break into separate words
							if (IsAlpha(prev_out)) {
								c = ' ';
								space_inserted = 1;
								word_flags |= FLAG_HYPHEN_AFTER;
								next_word_flags |= FLAG_HYPHEN;
							}
						}
					}
				}

				if (iswupper(c)) {
					c = towlower2(c);

					if ((j = tr->langopts.param[LOPT_CAPS_IN_WORD]) > 0) {
						if ((j == 2) && (syllable_marked == 0)) {
							char_inserted = c;
							c = 0x2c8; // stress marker
							syllable_marked = 1;
						}
					} else {
						if (iswlower(prev_in)) {
							// lower case followed by upper case in a word
							if (UpperCaseInWord(tr, &sbuf[ix], c) == 1) {
								// convert to lower case and continue
								c = towlower2(c);
							} else {
								c = ' '; // lower case followed by upper case, treat as new word
								space_inserted = 1;
								prev_in_save = c;
							}
						} else if ((c != ' ') && iswupper(prev_in) && iswlower(next_in)) {
							int next2_in;
							utf8_in(&next2_in, &source[source_index + next_in_nbytes]);

							if ((tr->translator_name == L('n', 'l')) && (letter_count == 2) && (c == 'j') && (prev_in == 'I')) {
								// Dutch words may capitalise initial IJ, don't split
							} else if (IsAlpha(next2_in)) {
								// changing from upper to lower case, start new word at the last uppercase, if 3 or more letters
								c = ' ';
								space_inserted = 1;
								prev_in_save = c;
								next_word_flags |= FLAG_NOSPACE;
							}
						}
					}
				} else {
					if ((all_upper_case) && (letter_count > 2)) {
						if ((c == 's') && (next_in == ' ')) {
							c = ' ';
							all_upper_case |= FLAG_HAS_PLURAL;

							if (sbuf[ix-1] == '\'')
								sbuf[ix-1] = ' ';
						} else
							all_upper_case = 0; // current word contains lower case letters, not "'s"
					} else
						all_upper_case = 0;
				}
			} else if (c == '-') {
				if (!IsSpace(prev_in) && IsAlpha(next_in)) {
					if (prev_out != ' ') {
						// previous 'word' not yet ended (not alpha or numeric), start new word now.
						c = ' ';
						space_inserted = 1;
					} else {
						// '-' between two letters is a hyphen, treat as a space
						word_flags |= FLAG_HYPHEN;
						if (word_count > 0)
							words[word_count-1].flags |= FLAG_HYPHEN_AFTER;
						c = ' ';
					}
				} else if ((prev_in == ' ') && (next_in == ' ')) {
					// ' - ' dash between two spaces, treat as pause
					c = ' ';
					pre_pause_add = 4;
				} else if (next_in == '-') {
					// double hyphen, treat as pause
					source_index++;
					c = ' ';
					pre_pause_add = 4;
				} else if ((prev_out == ' ') && IsAlpha(prev_out2) && !IsAlpha(prev_in)) {
					// insert extra space between a word + space + hyphen, to distinguish 'a -2' from 'a-2'
					sbuf[ix++] = ' ';
					words[word_count].start++;
				}
			} else if (c == '.') {
				if (prev_out == '.') {
					// multiple dots, separate by spaces. Note >3 dots has been replaced by ellipsis
					c = ' ';
					space_inserted = 1;
				} else if ((word_count > 0) && !(words[word_count-1].flags & FLAG_NOSPACE) && IsAlpha(prev_in)) {
					// dot after a word, with space following, probably an abbreviation
					words[word_count-1].flags |= FLAG_HAS_DOT;

					if (IsSpace(next_in) || (next_in == '-'))
						c = ' '; // remove the dot if it's followed by a space or hyphen, so that it's not pronounced
				}
			} else if (c == '\'') {
				if (((prev_in == '.') || iswalnum(prev_in)) && IsAlpha(next_in)) {
					// between two letters, or in an abbreviation (eg. u.s.a.'s). Consider the apostrophe as part of the word
					single_quoted = 0;
				} else if ((tr->langopts.param[LOPT_APOSTROPHE] & 1) && IsAlpha(next_in))
					single_quoted = 0; // apostrophe at start of word is part of the word
				else if ((tr->langopts.param[LOPT_APOSTROPHE] & 2) && IsAlpha(prev_in))
					single_quoted = 0; // apostrophe at end of word is part of the word
				else if ((wcschr(tr->char_plus_apostrophe, prev_in) != 0) && (prev_out2 == ' ')) {
					// consider single character plus apostrophe as a word
					single_quoted = 0;
					if (next_in == ' ')
						source_index++; // skip following space
				} else {
					if ((prev_out == 's') && (single_quoted == 0)) {
						// looks like apostrophe after an 's'
						c = ' ';
					} else {
						if (IsSpace(prev_out))
							single_quoted = 1;
						else
							single_quoted = 0;

						pre_pause_add = 4; // single quote
						c = ' ';
					}
				}
			} else if (lookupwchar(breaks, c) != 0)
				c = ' '; // various characters to treat as space
			else if (iswdigit(c)) {
				if (tr->langopts.tone_numbers && IsAlpha(prev_out) && !IsDigit(next_in)) {
					// tone number after a letter: keep it as part of the word
				} else if ((prev_out != ' ') && !iswdigit(prev_out)) {
					if ((prev_out != tr->langopts.decimal_sep) || ((decimal_sep_count > 0) && (tr->langopts.decimal_sep == ','))) {
						c = ' ';
						space_inserted = 1;
					} else
						decimal_sep_count = 1;
				} else if ((prev_out == ' ') && IsAlpha(prev_out2) && !IsAlpha(prev_in)) {
					// insert extra space between a word and a number, to distinguish 'a 2' from 'a2'
					sbuf[ix++] = ' ';
					words[word_count].start++;
				}
			}
		}

		if (IsSpace(c)) {
			if (prev_out == ' ') {
				word_flags |= FLAG_MULTIPLE_SPACES;
				continue; // multiple spaces
			}

			if ((cc == 0x09) || (cc == 0x0a))
				next_word_flags |= FLAG_MULTIPLE_SPACES; // tab or newline, not a simple space

			if (space_inserted) {
				// count the number of characters since the start of the word
				j = 0;
				k = source_index - 1;
				while ((k >= source_index_word) && (charix[k] != 0)) {
					if (charix[k] > 0) // don't count initial bytes of multi-byte character
						j++;
					k--;
				}
				words[word_count].length = j;
			}

			source_index_word = source_index;

			// end of 'word'
			sbuf[ix++] = ' ';

			if ((word_count < N_CLAUSE_WORDS-1) && (ix > words[word_count].start)) {
				if (embedded_count > 0) {
					// there are embedded commands before this word
					embedded_list[embedded_ix-1] |= 0x80; // terminate list of commands for this word
					words[word_count].flags |= FLAG_EMBEDDED;
					embedded_count = 0;
				}
				words[word_count].pre_pause = pre_pause;
				words[word_count].flags |= (all_upper_case | word_flags | word_emphasis);
				words[word_count].wmark = word_mark;

				if (pre_pause > 0) {
					// insert an extra space before the word, to prevent influence from previous word across the pause
					for (j = ix; j > words[word_count].start; j--)
						sbuf[j] = sbuf[j-1];
					sbuf[j] = ' ';
					words[word_count].start++;
					ix++;
				}

				// start the entry for the next word
				word_count++;
				words[word_count].start = ix;
				words[word_count].flags = 0;

				for (j = source_index; charix[j] <= 0; j++) // skip blanks
					;
				words[word_count].sourceix = charix[j];
				k = 0;
				while (charix[j] != 0) {
					// count the number of characters (excluding multibyte continuation bytes)
					if (charix[j++] != -1)
						k++;
				}
				words[word_count].length = k;

				word_flags = next_word_flags;
				next_word_flags = 0;
				pre_pause = 0;
				word_mark = 0;
				all_upper_case = FLAG_ALL_UPPER;
				syllable_marked = 0;
			}

			if (space_inserted) {
				source_index = prev_source_index; // rewind to the previous character
				char_inserted = 0;
				space_inserted = 0;
			}
		} else {
			if ((ix < (N_TR_SOURCE - 4)))
				ix += utf8_out(c, &sbuf[ix]);
		}
		if (pre_pause_add > pre_pause)
			pre_pause = pre_pause_add;
		pre_pause_add = 0;
	}

	if ((word_count == 0) && (embedded_count > 0)) {
		// add a null 'word' to carry the embedded command flag
		embedded_list[embedded_ix-1] |= 0x80;
		words[word_count].flags |= FLAG_EMBEDDED;
		word_count = 1;
	}

	tr->clause_end = &sbuf[ix-1];
	sbuf[ix] = 0;
	words[0].pre_pause = 0; // don't add extra pause at beginning of clause
	words[word_count].pre_pause = 8;
	if (word_count > 0) {
		ix = word_count-1;
		while ((ix > 0) && (IsBracket(sbuf[words[ix].start])))
			ix--; // the last word is a bracket, mark the previous word as last
		words[ix].flags |= FLAG_LAST_WORD;

		// FLAG_NOSPACE check to avoid recognizing  .mr  -mr
		if ((terminator & CLAUSE_DOT_AFTER_LAST_WORD) && !(words[word_count-1].flags & FLAG_NOSPACE))
			words[word_count-1].flags |= FLAG_HAS_DOT;
	}
	words[0].flags |= FLAG_FIRST_WORD;

	// translate each word in turn
	for (ix = 0; ix < word_count; ix++) {
		int nx;
		int c_temp;
		char *pn;
		char *pw;
		int nw;
		char number_buf[150];
		WORD_TAB num_wtab[50]; // copy of 'words', when splitting numbers into parts

		// start speaking at a specified word position in the text?
		count_words++;
		if (skip_words > 0) {
			skip_words--;
			if (skip_words == 0)
				skipping_text = 0;
		}
		if (skipping_text)
			continue;

		current_alphabet = NULL;

		// digits should have been converted to Latin alphabet ('0' to '9')
		word = pw = &sbuf[words[ix].start];

		if (iswdigit(word[0]) && (tr->langopts.break_numbers != BREAK_THOUSANDS)) {
			// Languages with 100000 numbers.  Remove thousands separators so that we can insert them again later
			pn = number_buf;
			while (pn < &number_buf[sizeof(number_buf)-20]) {
				if (iswdigit(*pw))
					*pn++ = *pw++;
				else if ((*pw == tr->langopts.thousands_sep) && (pw[1] == ' ')
				           && iswdigit(pw[2]) && (pw[3] != ' ') && (pw[4] != ' ')) { // don't allow only 1 or 2 digits in the final part
					pw += 2;
					ix++; // skip "word"
				} else {
					// write the joined digits back over the original text, padded with spaces
					nx = pw - word;
					memset(word, ' ', nx);
					nx = pn - number_buf;
					memcpy(word, number_buf, nx);
					break;
				}
			}
			pw = word;
		}

		for (n_digits = 0; iswdigit(word[n_digits]); n_digits++) // count consecutive digits
			;

		if (n_digits > 4) {
			// word is entirely digits, insert commas and break into 3 digit "words"
			number_buf[0] = ' ';
			pn = &number_buf[1];
			nx = n_digits;
			nw = 0;

			if ((n_digits > tr->langopts.max_digits) || (word[0] == '0'))
				words[ix].flags |= FLAG_INDIVIDUAL_DIGITS;

			while (pn < &number_buf[sizeof(number_buf)-20]) {
				if (!IsDigit09(c = *pw++) && (c != tr->langopts.decimal_sep))
					break;

				*pn++ = c;
				nx--;
				if ((nx > 0) && (tr->langopts.break_numbers & (1 << nx))) {
					memcpy(&num_wtab[nw++], &words[ix], sizeof(WORD_TAB)); // copy the 'words' entry for each word of numbers

					if (tr->langopts.thousands_sep != ' ')
						*pn++ = tr->langopts.thousands_sep;
					*pn++ = ' ';

					if ((words[ix].flags & FLAG_INDIVIDUAL_DIGITS) == 0) {
						if (tr->langopts.break_numbers & (1 << (nx-1))) {
							// the next group only has 1 digits, make it three
							*pn++ = '0';
							*pn++ = '0';
						}
						if (tr->langopts.break_numbers & (1 << (nx-2))) {
							// the next group only has 2 digits (eg. Indian languages), make it three
							*pn++ = '0';
						}
					}
				}
			}
			pw--;
			memcpy(&num_wtab[nw], &words[ix], sizeof(WORD_TAB)*2); // the original number word, and the word after it

			for (j = 1; j <= nw; j++)
				num_wtab[j].flags &= ~(FLAG_MULTIPLE_SPACES | FLAG_EMBEDDED); // don't use these flags for subsequent parts when splitting a number

			// include the next few characters, in case there are an ordinal indicator or other suffix
			// NOTE(review): this always copies 16 bytes from sbuf starting at pw; if pw is within
			// 16 bytes of the end of sbuf the read goes past the array - confirm and bound if so.
			memcpy(pn, pw, 16);
			pn[16] = 0;
			nw = 0;

			for (pw = &number_buf[1]; pw < pn;) {
				// keep wflags for each part, for FLAG_HYPHEN_AFTER
				dict_flags = TranslateWord2(tr, pw, &num_wtab[nw++], words[ix].pre_pause);
				while (*pw++ != ' ')
					;
				words[ix].pre_pause = 0;
			}
		} else {
			pre_pause = 0;

			dict_flags = TranslateWord2(tr, word, &words[ix], words[ix].pre_pause);

			// if pre_pause is non-zero after translating this word, carry it over to the next word
			if (pre_pause > words[ix+1].pre_pause) {
				words[ix+1].pre_pause = pre_pause;
				pre_pause = 0;
			}

			if (dict_flags & FLAG_SPELLWORD) {
				// redo the word, speaking single letters
				for (pw = word; *pw != ' ';) {
					memset(number_buf, ' ', 9);
					nx = utf8_in(&c_temp, pw);
					memcpy(&number_buf[2], pw, nx);
					TranslateWord2(tr, &number_buf[2], &words[ix], 0);
					pw += nx;
				}
			}

			if ((dict_flags & (FLAG_ALLOW_DOT | FLAG_NEEDS_DOT)) && (ix == word_count - 1 - dictionary_skipwords) && (terminator & CLAUSE_DOT_AFTER_LAST_WORD)) {
				// probably an abbreviation such as Mr. or B. rather than end of sentence
				clause_pause = 10;
				tone = 4;
			}
		}

		if (dict_flags & FLAG_SKIPWORDS) {
			// dictionary indicates skip next word(s)
			while (dictionary_skipwords > 0) {
				words[ix+dictionary_skipwords].flags |= FLAG_DELETE_WORD;
				dictionary_skipwords--;
			}
		}
	}

	if (embedded_read < embedded_ix) {
		// any embedded commands not yet processed?
		Word_EmbeddedCmd();
	}

	for (ix = 0; ix < 2; ix++) {
		// terminate the clause with 2 PAUSE phonemes
		PHONEME_LIST2 *p2;
		p2 = &ph_list2[n_ph_list2 + ix];
		p2->phcode = phonPAUSE;
		p2->stresslevel = 0;
		p2->sourceix = source_index;
		p2->synthflags = 0;
	}
	n_ph_list2 += 2;

	if (count_words == 0)
		clause_pause = 0;
	if (Eof() && ((word_count == 0) || (option_endpause == 0)))
		clause_pause = 10;

	MakePhonemeList(tr, clause_pause, new_sentence2);
	phoneme_list[N_PHONEME_LIST].ph = NULL; // recognize end of phoneme_list array, in Generate()
	phoneme_list[N_PHONEME_LIST].sourceix = 1;

	if (embedded_count) { // ???? is this needed
		phoneme_list[n_phoneme_list-2].synthflags = SFLAG_EMBEDDED;
		embedded_list[embedded_ix-1] |= 0x80;
		embedded_list[embedded_ix] = 0x80;
	}

	prev_clause_pause = clause_pause;

	if (tone_out != NULL)
		*tone_out = tone;

	new_sentence = 0;
	if (terminator & CLAUSE_TYPE_SENTENCE)
		new_sentence = 1; // next clause is a new sentence

	if (voice_change != NULL) {
		// return new voice name if an embedded voice change command terminated the clause
		if (terminator & CLAUSE_TYPE_VOICE_CHANGE)
			*voice_change = voice_change_name;
		else
			*voice_change = NULL;
	}
}
2690 
InitText(int control)2691 void InitText(int control)
2692 {
2693 	count_sentences = 0;
2694 	count_words = 0;
2695 	end_character_position = 0;
2696 	skip_sentences = 0;
2697 	skip_marker[0] = 0;
2698 	skip_words = 0;
2699 	skip_characters = 0;
2700 	skipping_text = 0;
2701 	new_sentence = 1;
2702 
2703 	prev_clause_pause = 0;
2704 
2705 	option_sayas = 0;
2706 	option_sayas2 = 0;
2707 	option_emphasis = 0;
2708 	word_emphasis = 0;
2709 	embedded_flag = 0;
2710 
2711 	InitText2();
2712 
2713 	if ((control & espeakKEEP_NAMEDATA) == 0)
2714 		InitNamedata();
2715 }
2716