1 /*
2 * Copyright (C) 2005 to 2014 by Jonathan Duddington
3 * email: jonsd@users.sourceforge.net
4 * Copyright (C) 2015-2017 Reece H. Dunn
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, see: <http://www.gnu.org/licenses/>.
18 */
19
20 #include "config.h"
21
22 #include <ctype.h>
23 //#include <stdbool.h>
24 #include <stdint.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <wchar.h>
29 #include <wctype.h>
30
31 #include "ucd.h"
32 #include "espeak_ng.h"
33 #include "encoding.h"
34 #include "speech.h"
35 #include "synthesize.h"
36 #include "translate.h"
37
// The translators used by this file. Both pointers start out as NULL.
Translator *translator = NULL; // the main translator
Translator *translator2 = NULL; // secondary translator for certain words
static char translator2_language[20] = { 0 }; // presumably the language name that translator2 was set up for -- TODO confirm where it is written

// Global option settings. Every one of them is zero (or NULL) until some other code assigns it.
FILE *f_trans = NULL; // phoneme output text; TranslateWord3() also prints its "suffix" trace line to this stream
int option_tone2 = 0;
int option_tone_flags = 0; // bit 8=emphasize allcaps, bit 9=emphasize penultimate stress
int option_phonemes = 0; // TranslateWord3() tests this against espeakPHONEMES_TRACE before printing trace output
int option_phoneme_events = 0;
int option_endpause = 0; // suppress pause after end of text
int option_capitals = 0;
int option_punctuation = 0;
int option_sayas = 0; // TranslateWord3() compares this with SAYAS_KEY; if bit 0x10 is set, the low 4 bits become the spelling mode
static int option_sayas2 = 0; // used in translate_clause()
static int option_emphasis = 0; // 0=normal, 1=normal, 2=weak, 3=moderate, 4=strong
int option_ssml = 0;
int option_phoneme_input = 0; // allow [[phonemes]] in input
int option_phoneme_variants = 0; // 0= don't display phoneme variant mnemonics
int option_wordgap = 0;
57
// Position, counting and skipping state. The names suggest these track progress
// through the input text, but none of them is read or written in this part of
// the file -- NOTE(review): confirm the exact meaning against the code that uses them.
static int count_sayas_digits;
int skip_sentences;
int skip_words;
int skip_characters;
char skip_marker[N_MARKER_LENGTH];
int skipping_text; // waiting until word count, sentence count, or named marker is reached
int end_character_position;
int count_sentences;
int count_words;
int clause_start_char;
int clause_start_word;
int new_sentence;
static int word_emphasis = 0; // set if emphasis level 3 or 4
static int embedded_flag = 0; // there are embedded commands to be applied to the next phoneme, used in TranslateWord2()

// Pause and stress bookkeeping; the two clause-pause values start at zero.
static int prev_clause_pause = 0;
static int max_clause_pause = 0;
static int any_stressed_words;
int pre_pause;
ALPHABET *current_alphabet;
78
// these were previously in translator class
char word_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes; TranslateWord3() and SpeakIndividualLetters() write their result here
int n_ph_list2;
PHONEME_LIST2 ph_list2[N_PHONEME_LIST]; // first stage of text->phonemes

wchar_t option_punctlist[N_PUNCTLIST] = { 0 };
char ctrl_embedded = '\001'; // to allow an alternative CTRL for embedded commands

// these are overridden by defaults set in the "speak" file
int option_linelength = 0;

// Storage for up to N_EMBEDDED_LIST embedded command values.
// embedded_ix and embedded_read are presumably the write and read positions
// in embedded_list -- TODO confirm against the code that uses them.
#define N_EMBEDDED_LIST 250
static int embedded_ix;
static int embedded_read;
unsigned int embedded_list[N_EMBEDDED_LIST];

// the source text of a single clause (UTF8 bytes)
static char source[N_TR_SOURCE+40]; // extra space for embedded command & voice change info at end

// Phoneme replacement table and the number of entries in use
// (presumably filled in by voice loading code -- not visible here).
int n_replace_phonemes;
REPLACE_PHONEMES replace_phonemes[N_REPLACE_PHONEMES];
100
// Characters which IsBracket() treats as brackets or quotes.
// The list is zero-terminated so that it can be searched with lookupwchar().
// Characters 0x2014 to 0x201f are also accepted by IsBracket(), through a
// separate range test, so they don't need to be in this list.
static const unsigned short brackets[] = {
	'(', ')', '[', ']', '{', '}', '<', '>', '"', '\'', '`',
	0xab, 0xbb, // double angle brackets
	0x300a, 0x300b, // double angle brackets (ideograph)
	0xe000+'<', // private usage area
	0
};

// other characters which break a word, but don't produce a pause
// (zero-terminated list, same layout as brackets[])
static const unsigned short breaks[] = { '_', 0 };
112
// Tables of the relative lengths of vowels, depending on the
// type of the two phonemes that follow
// indexes are the "length_mod" value for the following phonemes
//
// Each table holds 100 values, written as 10 rows of 10. The column labels
// refer to the next phoneme and the row labels to the one after it ("next2").
// NOTE(review): only nine labels are given for ten columns/rows; the meaning of
// the unlabelled last column and last row is not stated here -- confirm.

// use this table if vowel is not the last in the word
static unsigned char length_mods_en[100] = {
	// a , t s n d z r N <- next
	100, 120, 100, 105, 100, 110, 110, 100, 95, 100, // a <- next2
	105, 120, 105, 110, 125, 130, 135, 115, 125, 100, // ,
	105, 120, 75, 100, 75, 105, 120, 85, 75, 100, // t
	105, 120, 85, 105, 95, 115, 120, 100, 95, 100, // s
	110, 120, 95, 105, 100, 115, 120, 100, 100, 100, // n
	105, 120, 100, 105, 95, 115, 120, 110, 95, 100, // d
	105, 120, 100, 105, 105, 122, 125, 110, 105, 100, // z
	105, 120, 100, 105, 105, 122, 125, 110, 105, 100, // r
	105, 120, 95, 105, 100, 115, 120, 110, 100, 100, // N
	100, 120, 100, 100, 100, 100, 100, 100, 100, 100
};

// as above, but for the last syllable in a word
static unsigned char length_mods_en0[100] = {
	// a , t s n d z r N <- next
	100, 150, 100, 105, 110, 115, 110, 110, 110, 100, // a <- next2
	105, 150, 105, 110, 125, 135, 140, 115, 135, 100, // ,
	105, 150, 90, 105, 90, 122, 135, 100, 90, 100, // t
	105, 150, 100, 105, 100, 122, 135, 100, 100, 100, // s
	105, 150, 100, 105, 105, 115, 135, 110, 105, 100, // n
	105, 150, 100, 105, 105, 122, 130, 120, 125, 100, // d
	105, 150, 100, 105, 110, 122, 125, 115, 110, 100, // z
	105, 150, 100, 105, 105, 122, 135, 120, 105, 100, // r
	105, 150, 100, 105, 105, 115, 135, 110, 105, 100, // N
	100, 100, 100, 100, 100, 100, 100, 100, 100, 100
};


// A nearly uniform table: every row is the same except the 't' row,
// which has 100 instead of 110 in two places.
static unsigned char length_mods_equal[100] = {
	// a , t s n d z r N <- next
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // a <- next2
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // ,
	110, 120, 100, 110, 100, 110, 110, 110, 100, 110, // t
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // s
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // n
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // d
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // z
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // r
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110, // N
	110, 120, 100, 110, 110, 110, 110, 110, 110, 110
};

// The tables which SetLengthMods() can select. It indexes this array with
// (value % 100) and, when non-zero, with (value / 100).
static unsigned char *length_mod_tabs[6] = {
	length_mods_en,
	length_mods_en, // 1
	length_mods_en0, // 2
	length_mods_equal, // 3
	length_mods_equal, // 4
	length_mods_equal // 5
};
170
void SetLengthMods(Translator *tr, int value)
{
	// Select the vowel length-modification tables for a translator.
	//   value % 100 : index into length_mod_tabs[] of the table used for both
	//                 langopts.length_mods and langopts.length_mods0
	//   value / 100 : if non-zero, index of a different table to use for
	//                 langopts.length_mods0 only
	//
	// Neither index was range-checked before, so a value outside the table
	// (including any negative value) read past the end of length_mod_tabs[].
	// Now an out-of-range main index selects entry 0, and an out-of-range
	// override index is ignored.

	const int n_tabs = (int)(sizeof(length_mod_tabs) / sizeof(length_mod_tabs[0]));
	int ix_main = value % 100;
	int ix_override = value / 100;

	if ((ix_main < 0) || (ix_main >= n_tabs))
		ix_main = 0;

	tr->langopts.length_mods0 = tr->langopts.length_mods = length_mod_tabs[ix_main];

	if ((ix_override > 0) && (ix_override < n_tabs))
		tr->langopts.length_mods0 = length_mod_tabs[ix_override];
}
179
int IsAlpha(unsigned int c)
{
	// Replacement for iswalpha() which also accepts some in-word symbols
	// (vowel signs, combining marks, etc.) of several scripts.
	// Returns 1 if c counts as alphabetic, otherwise 0.

	// Indic characters which are alphabetic although they fail the general
	// (c & 0x7f) < 0x64 test that is applied to the Indic range below
	static const unsigned short extra_indic_alphas[] = {
		0xa70, 0xa71, // Gurmukhi: tippi, addak
		0
	};

	// Inclusive character ranges, outside the Indic range, which are accepted
	static const struct {
		unsigned int first;
		unsigned int last;
	} alpha_ranges[] = {
		{ 0x0300, 0x036f }, // combining accents
		{ 0x05b0, 0x05c2 }, // Hebrew vowel marks
		{ 0x0605, 0x0605 },
		{ 0x064b, 0x065e }, // Arabic vowel marks
		{ 0x0670, 0x0670 }, // Arabic vowel mark
		{ 0x0780, 0x07b1 }, // Thaana/Divehi (Maldives)
		{ 0x0f40, 0x0fbc }, // Tibetan
		{ 0x1100, 0x11ff }, // Korean jamo
		{ 0x2800, 0x28ff }, // braille
		// Chinese/Japanese. Should never get here, but Mac OS 10.4's iswalpha
		// seems to be broken, so just make sure
		{ 0x3041, 0xa700 }
	};
	unsigned int ix;

	if (iswalpha(c))
		return 1;

	// nothing below 0x300 is added to what iswalpha() accepts
	if (c < 0x300)
		return 0;

	if ((c >= 0x901) && (c <= 0xdf7)) {
		// Indic scripts: Devanagari, Tamil, etc
		int is_indic_alpha = ((c & 0x7f) < 0x64)
			|| (lookupwchar(extra_indic_alphas, c) != 0)
			|| ((c >= 0xd7a) && (c <= 0xd7f)); // Malayalam chillu characters

		return is_indic_alpha ? 1 : 0;
	}

	for (ix = 0; ix < sizeof(alpha_ranges) / sizeof(alpha_ranges[0]); ix++) {
		if ((c >= alpha_ranges[ix].first) && (c <= alpha_ranges[ix].last))
			return 1;
	}
	return 0;
}
236
int IsDigit09(unsigned int c)
{
	// Returns 1 if c is one of the ASCII digits '0' to '9', otherwise 0.
	// c is unsigned, so a value below '0' wraps round to a large number in
	// the subtraction and fails the single comparison.
	return ((c - '0') <= 9u) ? 1 : 0;
}
243
int IsDigit(unsigned int c)
{
	// Returns 1 if c is a digit according to iswdigit(), or lies in the range
	// 0x966 to 0x96f (the Devanagari digits in Unicode); otherwise 0.
	const int in_extra_range = (c >= 0x966) && (c <= 0x96f);

	return (iswdigit(c) || in_extra_range) ? 1 : 0;
}
254
static int IsSpace(unsigned int c)
{
	// Like iswspace(), but also treats box drawing characters and the
	// characters 0xfff9 to 0xffff as spaces. Zero is never a space.
	// For any other character the result is whatever iswspace() returns,
	// which is non-zero (not necessarily 1) for a space.
	const int box_drawing = (c >= 0x2500) && (c < 0x25a0);
	const int unicode_special = (c >= 0xfff9) && (c <= 0xffff);

	if (c == 0)
		return 0;
	if (box_drawing || unicode_special)
		return 1;
	return iswspace(c);
}
265
int isspace2(unsigned int c)
{
	// can't use isspace() because on Windows, isspace(0xe1) gives TRUE !
	//
	// Returns 1 for the character values 1 to ' ' (0x20), otherwise 0.
	// Zero itself, and anything above ' ', is not a space.
	return ((c != 0) && (c <= ' ')) ? 1 : 0;
}
275
void DeleteTranslator(Translator *tr)
{
	// Free a translator and the dictionary data which it owns.
	// tr: the translator to delete. NULL is accepted and ignored (previously
	//     a NULL tr was dereferenced).

	if (tr == NULL)
		return;

	free(tr->data_dictlist); // free(NULL) is a no-op, so no guard is needed
	free(tr);
}
282
int lookupwchar(const unsigned short *list, int c)
{
	// Is the character c in the zero-terminated list ?
	// Returns its 1-based position in the list, or 0 if it is not found.
	const unsigned short *entry;

	for (entry = list; *entry != 0; entry++) {
		if (*entry == c)
			return (int)(entry - list) + 1;
	}
	return 0;
}
294
int lookupwchar2(const unsigned short *list, int c)
{
	// Replace character c by another character.
	// list holds (character, replacement) pairs and ends with a zero in the
	// "character" position.
	// Returns the replacement for c, or 0 if c is not in the list.
	// A replacement value of 1 means: delete the character.
	const unsigned short *pair = list;

	while (pair[0] != 0) {
		if (pair[0] == c)
			return pair[1];
		pair += 2;
	}
	return 0;
}
308
IsBracket(int c)309 int IsBracket(int c)
310 {
311 if ((c >= 0x2014) && (c <= 0x201f))
312 return 1;
313 return lookupwchar(brackets, c);
314 }
315
int utf8_nbytes(const char *buf)
{
	// Returns the number of bytes for the first UTF-8 character in buf,
	// judged from its first byte only:
	//   below 0x80 -> 1,  below 0xe0 -> 2,  below 0xf0 -> 3,  otherwise 4
	const unsigned char lead = (unsigned char)buf[0];
	int n_bytes = 1;

	// each threshold which the lead byte reaches adds one more byte
	if (lead >= 0x80)
		n_bytes++;
	if (lead >= 0xe0)
		n_bytes++;
	if (lead >= 0xf0)
		n_bytes++;
	return n_bytes;
}
329
int utf8_in2(int *c, const char *buf, int backwards)
{
	// Reads a unicode character from a UTF8 string.
	// c: receives the integer value of the (possibly multi-byte) character
	// buf: where to start reading. If it points into the middle of a
	//      multi-byte character, the continuation bytes are skipped first
	//      (forwards, or backwards if 'backwards' is set) to find a lead byte.
	//      buf is passed by value; the caller's pointer is not changed.
	// backwards: set if we are moving backwards through the UTF8 string
	//
	// Returns the number of bytes consumed, counted from the lead byte
	// (any continuation bytes skipped before it are not counted).
	//
	// A multi-byte character which is cut short by the string terminator is
	// no longer read past the terminator: decoding stops at the zero byte and
	// only the bytes which were consumed are counted, so that a caller which
	// advances by the return value stops on the terminator rather than
	// stepping over it. For complete characters the result is unchanged.

	int c1;
	int n_bytes;    // number of continuation bytes announced by the lead byte
	int n_read = 0; // number of continuation bytes actually consumed
	static const unsigned char mask[4] = { 0xff, 0x1f, 0x0f, 0x07 };

	// find the start of the next/previous character
	while ((*buf & 0xc0) == 0x80) {
		// skip over non-initial bytes of a multi-byte utf8 character
		if (backwards)
			buf--;
		else
			buf++;
	}

	n_bytes = 0;

	if ((c1 = *buf++) & 0x80) {
		if ((c1 & 0xe0) == 0xc0)
			n_bytes = 1;
		else if ((c1 & 0xf0) == 0xe0)
			n_bytes = 2;
		else if ((c1 & 0xf8) == 0xf0)
			n_bytes = 3;

		// keep only the payload bits of the lead byte
		// (an unrecognised lead byte keeps all 8 bits, n_bytes = 0)
		c1 &= mask[n_bytes];
		for (n_read = 0; n_read < n_bytes; n_read++) {
			if (*buf == 0)
				break; // truncated character: don't read beyond the terminator
			c1 = (c1 << 6) + (*buf++ & 0x3f);
		}
	}
	*c = c1;
	return n_read+1;
}
369
#pragma GCC visibility push(default)
int utf8_in(int *c, const char *buf)
{
	/* Read one unicode character from a UTF8 string, scanning forwards.
	 * This simply calls utf8_in2() with backwards = 0.
	 *
	 * Returns the number of UTF8 bytes used, as reported by utf8_in2().
	 * buf: where to read from. It is passed by value, so the caller's
	 *      pointer is not moved; add the return value to step to the
	 *      next character.
	 * c:   receives the integer value of the character, which is built by
	 *      dropping the UTF-8 header bits of each byte and joining the
	 *      remaining payload bits:
	 *
	 * 2-byte character "ā":
	 *   bytes (hex)     c4 81
	 *   bytes (binary)  110 00100   10 000001
	 *   payload             00100      000001
	 *   value           0000 0001 0000 0001            = 0x0101
	 *
	 * 3-byte character "ꙅ":
	 *   bytes (hex)     ea 99 85
	 *   bytes (binary)  1110 1010   10 011001   10 000101
	 *   payload              1010      011001      000101
	 *   value           1010 0110 0100 0101            = 0xA645
	 *
	 * 4-byte character "𠜎":
	 *   bytes (hex)     f0 a0 9c 8e
	 *   bytes (binary)  11110 000   10 100000   10 011100   10 001110
	 *   payload               000      100000      011100      001110
	 *   value           0000 0010 0000 0111 0000 1110  = 0x02070E
	 */
	return utf8_in2(c, buf, 0);
}
#pragma GCC visibility pop
398
int utf8_out(unsigned int c, char *buf)
{
	// write a unicode character into a buffer as utf8
	// returns the number of bytes written (1 to 4)
	// No terminating zero is written. A character code of 0x110000 or above
	// is out of range and is written as a single space.

	if (c < 0x80) {
		// plain ASCII
		buf[0] = c;
		return 1;
	}
	if (c >= 0x110000) {
		buf[0] = ' '; // out of range character code
		return 1;
	}

	if (c < 0x0800) {
		// 110xxxxx 10xxxxxx
		buf[0] = 0xc0 | (c >> 6);
		buf[1] = 0x80 + (c & 0x3f);
		return 2;
	}

	if (c < 0x10000) {
		// 1110xxxx 10xxxxxx 10xxxxxx
		buf[0] = 0xe0 | (c >> 12);
		buf[1] = 0x80 + ((c >> 6) & 0x3f);
		buf[2] = 0x80 + (c & 0x3f);
		return 3;
	}

	// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	buf[0] = 0xf0 | (c >> 18);
	buf[1] = 0x80 + ((c >> 12) & 0x3f);
	buf[2] = 0x80 + ((c >> 6) & 0x3f);
	buf[3] = 0x80 + (c & 0x3f);
	return 4;
}
432
char *strchr_w(const char *s, int c)
{
	// strchr() for a character code which may be outside the ASCII range.
	// Returns NULL for any character code of 0x80 or above, otherwise the
	// result of strchr(s, c).
	return (c < 0x80)
		? strchr((char *)s, c) // (char *) is needed for Borland compiler
		: NULL;
}
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word)
{
	// Translate a word letter by letter, up to the next space or the end of
	// the string, adding to 'phonemes' through TranslateLetter().
	//
	// spell_word: above 1, also speak the character code for unknown letters;
	//             above 2, also speak 'capital'
	//
	// Returns the position in 'word' after the last letter translated.
	// Returns NULL if a letter produced phonSWITCH (change to another language
	// in order to translate this word); 'phonemes' has then been copied into
	// word_phonemes.

	char *p = word;
	int letter_flags;
	int n_letters;

	if (spell_word > 2)
		letter_flags = 2 | 4; // speak 'capital', and character codes for unknown letters
	else if (spell_word > 1)
		letter_flags = 4;     // speak character code for unknown letters
	else
		letter_flags = 0;

	n_letters = 0;
	while ((*p != ' ') && (*p != 0)) {
		// bit 0 marks every letter except the first one as non-initial
		p += TranslateLetter(tr, p, phonemes, letter_flags | ((n_letters > 0) ? 1 : 0));
		n_letters++;

		if (phonemes[0] == phonSWITCH) {
			// change to another language in order to translate this word
			strcpy(word_phonemes, phonemes);
			return NULL;
		}
	}

	SetSpellingStress(tr, phonemes, spell_word, n_letters);
	return p;
}
464
CheckDottedAbbrev(char * word1)465 static int CheckDottedAbbrev(char *word1)
466 {
467 int wc;
468 int count = 0;
469 int nbytes;
470 int ok;
471 int ix;
472 char *word;
473 char *wbuf;
474 char word_buf[80];
475
476 word = word1;
477 wbuf = word_buf;
478
479 for (;;) {
480 ok = 0;
481 nbytes = utf8_in(&wc, word);
482 if ((word[nbytes] == ' ') && IsAlpha(wc)) {
483 if (word[nbytes+1] == '.') {
484 if (word[nbytes+2] == ' ')
485 ok = 1;
486 else if (word[nbytes+2] == '\'') {
487 nbytes += 2; // delete the final dot (eg. u.s.a.'s)
488 ok = 2;
489 }
490 } else if ((count > 0) && (word[nbytes] == ' '))
491 ok = 2;
492 }
493
494 if (ok == 0)
495 break;
496
497 for (ix = 0; ix < nbytes; ix++)
498 *wbuf++ = word[ix];
499
500 count++;
501
502 if (ok == 2) {
503 word += nbytes;
504 break;
505 }
506
507 word += (nbytes + 3);
508 }
509
510 if (count > 1) {
511 ix = wbuf - word_buf;
512 memcpy(word1, word_buf, ix);
513 while (&word1[ix] < word)
514 word1[ix++] = ' ';
515 dictionary_skipwords = (count - 1)*2;
516 }
517 return count;
518 }
519
// Declared here only; the definition is not in this part of the file.
// NOTE(review): no use of phondata_ptr is visible in this section -- confirm it is still needed.
extern char *phondata_ptr;
521
TranslateWord3(Translator * tr,char * word_start,WORD_TAB * wtab,char * word_out)522 static int TranslateWord3(Translator *tr, char *word_start, WORD_TAB *wtab, char *word_out)
523 {
524 // word1 is terminated by space (0x20) character
525
526 char *word1;
527 int word_length;
528 int ix;
529 char *p;
530 int pfix;
531 int n_chars;
532 unsigned int dictionary_flags[2];
533 unsigned int dictionary_flags2[2];
534 int end_type = 0;
535 int end_type1 = 0;
536 int prefix_type = 0;
537 int prefix_stress;
538 char *wordx;
539 char phonemes[N_WORD_PHONEMES];
540 char phonemes2[N_WORD_PHONEMES];
541 char prefix_phonemes[N_WORD_PHONEMES];
542 char unpron_phonemes[N_WORD_PHONEMES];
543 char end_phonemes[N_WORD_PHONEMES];
544 char end_phonemes2[N_WORD_PHONEMES];
545 char word_copy[N_WORD_BYTES];
546 char word_copy2[N_WORD_BYTES];
547 int word_copy_length;
548 char prefix_chars[0x3f + 2];
549 bool found = false;
550 int end_flags;
551 int c_temp; // save a character byte while we temporarily replace it with space
552 int first_char;
553 int last_char = 0;
554 int prefix_flags = 0;
555 int more_suffixes;
556 int confirm_prefix;
557 int spell_word;
558 int emphasize_allcaps = 0;
559 int wflags;
560 int wmark;
561 int was_unpronouncable = 0;
562 int loopcount;
563 int add_suffix_phonemes = 0;
564 WORD_TAB wtab_null[8];
565
566 // translate these to get pronunciations of plural 's' suffix (different forms depending on
567 // the preceding letter
568 static char word_zz[4] = { 0, 'z', 'z', 0 };
569 static char word_iz[4] = { 0, 'i', 'z', 0 };
570 static char word_ss[4] = { 0, 's', 's', 0 };
571
572 if (wtab == NULL) {
573 memset(wtab_null, 0, sizeof(wtab_null));
574 wtab = wtab_null;
575 }
576 wflags = wtab->flags;
577 wmark = wtab->wmark;
578
579 dictionary_flags[0] = 0;
580 dictionary_flags[1] = 0;
581 dictionary_flags2[0] = 0;
582 dictionary_flags2[1] = 0;
583 dictionary_skipwords = 0;
584
585 phonemes[0] = 0;
586 unpron_phonemes[0] = 0;
587 prefix_phonemes[0] = 0;
588 end_phonemes[0] = 0;
589
590 if (tr->data_dictlist == NULL) {
591 // dictionary is not loaded
592 word_phonemes[0] = 0;
593 return 0;
594 }
595
596 // count the length of the word
597 word1 = word_start;
598 if (*word1 == ' ') word1++; // possibly a dot was replaced by space: $dot
599 wordx = word1;
600
601 utf8_in(&first_char, wordx);
602 word_length = 0;
603 while ((*wordx != 0) && (*wordx != ' ')) {
604 wordx += utf8_in(&last_char, wordx);
605 word_length++;
606 }
607
608 word_copy_length = wordx - word_start;
609 if (word_copy_length >= N_WORD_BYTES)
610 word_copy_length = N_WORD_BYTES-1;
611 memcpy(word_copy2, word_start, word_copy_length);
612
613 spell_word = 0;
614
615 if ((word_length == 1) && (wflags & FLAG_TRANSLATOR2)) {
616 // retranslating a 1-character word using a different language, say its name
617 utf8_in(&c_temp, wordx+1); // the next character
618 if (!IsAlpha(c_temp) || (AlphabetFromChar(last_char) != AlphabetFromChar(c_temp)))
619 spell_word = 1;
620 }
621
622 if (option_sayas == SAYAS_KEY) {
623 if (word_length == 1)
624 spell_word = 4;
625 else {
626 // is there a translation for this keyname ?
627 word1--;
628 *word1 = '_'; // prefix keyname with '_'
629 found = LookupDictList(tr, &word1, phonemes, dictionary_flags, 0, wtab);
630 }
631 }
632
633 // try an initial lookup in the dictionary list, we may find a pronunciation specified, or
634 // we may just find some flags
635 if (option_sayas & 0x10) {
636 // SAYAS_CHAR, SAYAS_GYLPH, or SAYAS_SINGLE_CHAR
637 spell_word = option_sayas & 0xf; // 2,3,4
638 } else {
639 if (!found)
640 found = LookupDictList(tr, &word1, phonemes, dictionary_flags, FLAG_ALLOW_TEXTMODE, wtab); // the original word
641
642 if ((dictionary_flags[0] & (FLAG_ALLOW_DOT | FLAG_NEEDS_DOT)) && (wordx[1] == '.'))
643 wordx[1] = ' '; // remove a Dot after this word
644
645 if (dictionary_flags[0] & FLAG_TEXTMODE) {
646 if (word_out != NULL)
647 strcpy(word_out, word1);
648
649 return dictionary_flags[0];
650 } else if ((found == false) && (dictionary_flags[0] & FLAG_SKIPWORDS) && !(dictionary_flags[0] & FLAG_ABBREV)) {
651 // grouped words, but no translation. Join the words with hyphens.
652 wordx = word1;
653 ix = 0;
654 while (ix < dictionary_skipwords) {
655 if (*wordx == ' ') {
656 *wordx = '-';
657 ix++;
658 }
659 wordx++;
660 }
661 }
662
663 if ((word_length == 1) && (dictionary_skipwords == 0)) {
664 // is this a series of single letters separated by dots?
665 if (CheckDottedAbbrev(word1)) {
666 dictionary_flags[0] = 0;
667 dictionary_flags[1] = 0;
668 spell_word = 1;
669 if (dictionary_skipwords)
670 dictionary_flags[0] = FLAG_SKIPWORDS;
671 }
672 }
673
674 if (phonemes[0] == phonSWITCH) {
675 // change to another language in order to translate this word
676 strcpy(word_phonemes, phonemes);
677 return 0;
678 }
679
680 if ((wmark > 0) && (wmark < 8)) {
681 // the stressed syllable has been specified in the text (TESTING)
682 dictionary_flags[0] = (dictionary_flags[0] & ~0xf) | wmark;
683 }
684
685 if (!found && (dictionary_flags[0] & FLAG_ABBREV)) {
686 // the word has $abbrev flag, but no pronunciation specified. Speak as individual letters
687 spell_word = 1;
688 }
689
690 if (!found && iswdigit(first_char)) {
691 Lookup(tr, "_0lang", word_phonemes);
692 if (word_phonemes[0] == phonSWITCH)
693 return 0;
694
695 if ((tr->langopts.numbers2 & NUM2_ENGLISH_NUMERALS) && !(wtab->flags & FLAG_CHAR_REPLACED)) {
696 // for this language, speak English numerals (0-9) with the English voice
697 sprintf(word_phonemes, "%c", phonSWITCH);
698 return 0;
699 }
700
701 found = TranslateNumber(tr, word1, phonemes, dictionary_flags, wtab, 0);
702 }
703
704 if (!found && ((wflags & FLAG_UPPERS) != FLAG_FIRST_UPPER)) {
705 // either all upper or all lower case
706
707 if ((tr->langopts.numbers & NUM_ROMAN) || ((tr->langopts.numbers & NUM_ROMAN_CAPITALS) && (wflags & FLAG_ALL_UPPER))) {
708 if ((wflags & FLAG_LAST_WORD) || !(wtab[1].flags & FLAG_NOSPACE)) {
709 // don't use Roman number if this word is not separated from the next word (eg. "XLTest")
710 if ((found = TranslateRoman(tr, word1, phonemes, wtab)) != 0)
711 dictionary_flags[0] |= FLAG_ABBREV; // prevent emphasis if capitals
712 }
713 }
714 }
715
716 if ((wflags & FLAG_ALL_UPPER) && (word_length > 1) && iswalpha(first_char)) {
717 if ((option_tone_flags & OPTION_EMPHASIZE_ALLCAPS) && !(dictionary_flags[0] & FLAG_ABBREV)) {
718 // emphasize words which are in capitals
719 emphasize_allcaps = FLAG_EMPHASIZED;
720 } else if (!found && !(dictionary_flags[0] & FLAG_SKIPWORDS) && (word_length < 4) && (tr->clause_lower_count > 3)
721 && (tr->clause_upper_count <= tr->clause_lower_count)) {
722 // An upper case word in a lower case clause. This could be an abbreviation.
723 spell_word = 1;
724 }
725 }
726 }
727
728 if (spell_word > 0) {
729 // Speak as individual letters
730 phonemes[0] = 0;
731
732 if (SpeakIndividualLetters(tr, word1, phonemes, spell_word) == NULL) {
733 if (word_length > 1)
734 return FLAG_SPELLWORD; // a mixture of languages, retranslate as individual letters, separated by spaces
735 return 0;
736 }
737 strcpy(word_phonemes, phonemes);
738 if (wflags & FLAG_TRANSLATOR2)
739 return 0;
740 return dictionary_flags[0] & FLAG_SKIPWORDS; // for "b.c.d"
741 } else if (found == false) {
742 // word's pronunciation is not given in the dictionary list, although
743 // dictionary_flags may have ben set there
744
745 int posn;
746 int non_initial;
747 int length;
748
749 posn = 0;
750 non_initial = 0;
751 length = 999;
752 wordx = word1;
753
754 while (((length < 3) && (length > 0)) || (word_length > 1 && Unpronouncable(tr, wordx, posn))) {
755 // This word looks "unpronouncable", so speak letters individually until we
756 // find a remainder that we can pronounce.
757 was_unpronouncable = FLAG_WAS_UNPRONOUNCABLE;
758 emphasize_allcaps = 0;
759
760 if (wordx[0] == '\'')
761 break;
762
763 if (posn > 0)
764 non_initial = 1;
765
766 wordx += TranslateLetter(tr, wordx, unpron_phonemes, non_initial);
767 posn++;
768 if (unpron_phonemes[0] == phonSWITCH) {
769 // change to another language in order to translate this word
770 strcpy(word_phonemes, unpron_phonemes);
771 if (strcmp(&unpron_phonemes[1], "en") == 0)
772 return FLAG_SPELLWORD; // _^_en must have been set in TranslateLetter(), not *_rules which uses only _^_
773 return 0;
774 }
775
776 length = 0;
777 while (wordx[length] != ' ') length++;
778 }
779 SetSpellingStress(tr, unpron_phonemes, 0, posn);
780
781 // anything left ?
782 if (*wordx != ' ') {
783 if ((unpron_phonemes[0] != 0) && (wordx[0] != '\'')) {
784 // letters which have been spoken individually from affecting the pronunciation of the pronuncable part
785 wordx[-1] = ' ';
786 }
787
788 // Translate the stem
789 end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
790
791 if (phonemes[0] == phonSWITCH) {
792 // change to another language in order to translate this word
793 strcpy(word_phonemes, phonemes);
794 return 0;
795 }
796
797 if ((phonemes[0] == 0) && (end_phonemes[0] == 0)) {
798 int wc;
799 // characters not recognised, speak them individually
800 // ?? should we say super/sub-script numbers and letters here?
801 utf8_in(&wc, wordx);
802 if ((word_length == 1) && (IsAlpha(wc) || IsSuperscript(wc))) {
803 if ((wordx = SpeakIndividualLetters(tr, wordx, phonemes, spell_word)) == NULL)
804 return 0;
805 strcpy(word_phonemes, phonemes);
806 return 0;
807 }
808 }
809
810 c_temp = wordx[-1];
811
812 found = false;
813 confirm_prefix = 1;
814 for (loopcount = 0; (loopcount < 50) && (end_type & SUFX_P); loopcount++) {
815 // Found a standard prefix, remove it and retranslate
816 // loopcount guards against an endless loop
817 if (confirm_prefix && !(end_type & SUFX_B)) {
818 int end2;
819 char end_phonemes22[N_WORD_PHONEMES];
820
821 // remove any standard suffix and confirm that the prefix is still recognised
822 phonemes2[0] = 0;
823 end2 = TranslateRules(tr, wordx, phonemes2, N_WORD_PHONEMES, end_phonemes22, wflags|FLAG_NO_PREFIX|FLAG_NO_TRACE, dictionary_flags);
824 if (end2) {
825 RemoveEnding(tr, wordx, end2, word_copy);
826 end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags|FLAG_NO_TRACE, dictionary_flags);
827 memcpy(wordx, word_copy, strlen(word_copy));
828 if ((end_type & SUFX_P) == 0) {
829 // after removing the suffix, the prefix is no longer recognised.
830 // Keep the suffix, but don't use the prefix
831 end_type = end2;
832 strcpy(phonemes, phonemes2);
833 strcpy(end_phonemes, end_phonemes22);
834 if (option_phonemes & espeakPHONEMES_TRACE) {
835 DecodePhonemes(end_phonemes, end_phonemes22);
836 fprintf(f_trans, " suffix [%s]\n\n", end_phonemes22);
837 }
838 }
839 confirm_prefix = 0;
840 continue;
841 }
842 }
843
844 prefix_type = end_type;
845
846 if (prefix_type & SUFX_V)
847 tr->expect_verb = 1; // use the verb form of the word
848
849 wordx[-1] = c_temp;
850
851 if ((prefix_type & SUFX_B) == 0) {
852 for (ix = (prefix_type & 0xf); ix > 0; ix--) { // num. of characters to remove
853 wordx++;
854 while ((*wordx & 0xc0) == 0x80) wordx++; // for multibyte characters
855 }
856 } else {
857 pfix = 1;
858 prefix_chars[0] = 0;
859 n_chars = prefix_type & 0x3f;
860
861 for (ix = 0; ix < n_chars; ix++) { // num. of bytes to remove
862 prefix_chars[pfix++] = *wordx++;
863
864 if ((prefix_type & SUFX_B) && (ix == (n_chars-1)))
865 prefix_chars[pfix-1] = 0; // discard the last character of the prefix, this is the separator character
866 }
867 prefix_chars[pfix] = 0;
868 }
869 c_temp = wordx[-1];
870 wordx[-1] = ' ';
871 confirm_prefix = 1;
872 wflags |= FLAG_PREFIX_REMOVED;
873
874 if (prefix_type & SUFX_B) {
875 // SUFX_B is used for Turkish, tr_rules contains " ' (Pb"
876 // examine the prefix part
877 char *wordpf;
878 char prefix_phonemes2[12];
879
880 strncpy0(prefix_phonemes2, end_phonemes, sizeof(prefix_phonemes2));
881 wordpf = &prefix_chars[1];
882 strcpy(prefix_phonemes, phonemes);
883
884 // look for stress marker or $abbrev
885 found = LookupDictList(tr, &wordpf, phonemes, dictionary_flags, 0, wtab);
886 if (found)
887 strcpy(prefix_phonemes, phonemes);
888 if (dictionary_flags[0] & FLAG_ABBREV) {
889 prefix_phonemes[0] = 0;
890 SpeakIndividualLetters(tr, wordpf, prefix_phonemes, 1);
891 }
892 } else
893 strcat(prefix_phonemes, end_phonemes);
894 end_phonemes[0] = 0;
895
896 end_type = 0;
897 found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, SUFX_P, wtab); // without prefix
898 if (dictionary_flags[0] == 0) {
899 dictionary_flags[0] = dictionary_flags2[0];
900 dictionary_flags[1] = dictionary_flags2[1];
901 } else
902 prefix_flags = 1;
903 if (found == false) {
904 end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags & (FLAG_HYPHEN_AFTER | FLAG_PREFIX_REMOVED), dictionary_flags);
905
906 if (phonemes[0] == phonSWITCH) {
907 // change to another language in order to translate this word
908 wordx[-1] = c_temp;
909 strcpy(word_phonemes, phonemes);
910 return 0;
911 }
912 }
913 }
914
915 if ((end_type != 0) && !(end_type & SUFX_P)) {
916 end_type1 = end_type;
917 strcpy(phonemes2, phonemes);
918
919 // The word has a standard ending, re-translate without this ending
920 end_flags = RemoveEnding(tr, wordx, end_type, word_copy);
921 more_suffixes = 1;
922
923 while (more_suffixes) {
924 more_suffixes = 0;
925 phonemes[0] = 0;
926
927 if (prefix_phonemes[0] != 0) {
928 // lookup the stem without the prefix removed
929 wordx[-1] = c_temp;
930 found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab); // include prefix, but not suffix
931 wordx[-1] = ' ';
932 if (phonemes[0] == phonSWITCH) {
933 // change to another language in order to translate this word
934 memcpy(wordx, word_copy, strlen(word_copy));
935 strcpy(word_phonemes, phonemes);
936 return 0;
937 }
938 if (dictionary_flags[0] == 0) {
939 dictionary_flags[0] = dictionary_flags2[0];
940 dictionary_flags[1] = dictionary_flags2[1];
941 }
942 if (found)
943 prefix_phonemes[0] = 0; // matched whole word, don't need prefix now
944
945 if ((found == false) && (dictionary_flags2[0] != 0))
946 prefix_flags = 1;
947 }
948 if (found == false) {
949 found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, end_flags, wtab); // without prefix and suffix
950 if (phonemes[0] == phonSWITCH) {
951 // change to another language in order to translate this word
952 memcpy(wordx, word_copy, strlen(word_copy));
953 strcpy(word_phonemes, phonemes);
954 return 0;
955 }
956
957 if (dictionary_flags[0] == 0) {
958 dictionary_flags[0] = dictionary_flags2[0];
959 dictionary_flags[1] = dictionary_flags2[1];
960 }
961 }
962 if (found == false) {
963 if (end_type & SUFX_Q) {
964 // don't retranslate, use the original lookup result
965 strcpy(phonemes, phonemes2);
966 } else {
967 if (end_flags & FLAG_SUFX)
968 wflags |= FLAG_SUFFIX_REMOVED;
969 if (end_type & SUFX_A)
970 wflags |= FLAG_SUFFIX_VOWEL;
971
972 if (end_type & SUFX_M) {
973 // allow more suffixes before this suffix
974 strcpy(end_phonemes2, end_phonemes);
975 end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
976 strcat(end_phonemes, end_phonemes2); // add the phonemes for the previous suffixes after this one
977
978 if ((end_type != 0) && !(end_type & SUFX_P)) {
979 // there is another suffix
980 end_flags = RemoveEnding(tr, wordx, end_type, NULL);
981 more_suffixes = 1;
982 }
983 } else {
984 // don't remove any previous suffix
985 TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, NULL, wflags, dictionary_flags);
986 end_type = 0;
987 }
988
989 if (phonemes[0] == phonSWITCH) {
990 // change to another language in order to translate this word
991 strcpy(word_phonemes, phonemes);
992 memcpy(wordx, word_copy, strlen(word_copy));
993 wordx[-1] = c_temp;
994 return 0;
995 }
996 }
997 }
998 }
999
1000
1001 if ((end_type1 & SUFX_T) == 0) {
1002 // the default is to add the suffix and then determine the word's stress pattern
1003 AppendPhonemes(tr, phonemes, N_WORD_PHONEMES, end_phonemes);
1004 end_phonemes[0] = 0;
1005 }
1006 memcpy(wordx, word_copy, strlen(word_copy));
1007 }
1008
1009 wordx[-1] = c_temp;
1010 }
1011 }
1012
1013 if (wflags & FLAG_HAS_PLURAL) {
1014 // s or 's suffix, append [s], [z] or [Iz] depending on previous letter
1015 if (last_char == 'f')
1016 TranslateRules(tr, &word_ss[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1017 else if ((last_char == 0) || (strchr_w("hsx", last_char) == NULL))
1018 TranslateRules(tr, &word_zz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1019 else
1020 TranslateRules(tr, &word_iz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1021 }
1022
1023 wflags |= emphasize_allcaps;
1024
1025 // determine stress pattern for this word
1026
1027 add_suffix_phonemes = 0;
1028 if (end_phonemes[0] != 0)
1029 add_suffix_phonemes = 2;
1030
1031 prefix_stress = 0;
1032 for (p = prefix_phonemes; *p != 0; p++) {
1033 if ((*p == phonSTRESS_P) || (*p == phonSTRESS_P2))
1034 prefix_stress = *p;
1035 }
1036 if (prefix_flags || (prefix_stress != 0)) {
1037 if ((tr->langopts.param[LOPT_PREFIXES]) || (prefix_type & SUFX_T)) {
1038 char *p_local;
1039 // German, keep a secondary stress on the stem
1040 SetWordStress(tr, phonemes, dictionary_flags, 3, 0);
1041
1042 // reduce all but the first primary stress
1043 ix = 0;
1044 for (p_local = prefix_phonemes; *p_local != 0; p_local++) {
1045 if (*p_local == phonSTRESS_P) {
1046 if (ix == 0)
1047 ix = 1;
1048 else
1049 *p_local = phonSTRESS_3;
1050 }
1051 }
1052 snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1053 word_phonemes[N_WORD_PHONEMES-1] = 0;
1054 SetWordStress(tr, word_phonemes, dictionary_flags, -1, 0);
1055 } else {
1056 // stress position affects the whole word, including prefix
1057 snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1058 word_phonemes[N_WORD_PHONEMES-1] = 0;
1059 SetWordStress(tr, word_phonemes, dictionary_flags, -1, 0);
1060 }
1061 } else {
1062 SetWordStress(tr, phonemes, dictionary_flags, -1, add_suffix_phonemes);
1063 snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1064 word_phonemes[N_WORD_PHONEMES-1] = 0;
1065 }
1066
1067 if (end_phonemes[0] != 0) {
1068 // a suffix had the SUFX_T option set, add the suffix after the stress pattern has been determined
1069 ix = strlen(word_phonemes);
1070 end_phonemes[N_WORD_PHONEMES-1-ix] = 0; // ensure no buffer overflow
1071 strcpy(&word_phonemes[ix], end_phonemes);
1072 }
1073
1074 if (wflags & FLAG_LAST_WORD) {
1075 // don't use $brk pause before the last word of a sentence
1076 // (but allow it for emphasis, see below
1077 dictionary_flags[0] &= ~FLAG_PAUSE1;
1078 }
1079
1080 if ((wflags & FLAG_HYPHEN) && (tr->langopts.stress_flags & S_HYPEN_UNSTRESS))
1081 ChangeWordStress(tr, word_phonemes, 3);
1082 else if (wflags & FLAG_EMPHASIZED2) {
1083 // A word is indicated in the source text as stressed
1084 // Give it stress level 6 (for the intonation module)
1085 ChangeWordStress(tr, word_phonemes, 6);
1086
1087 if (wflags & FLAG_EMPHASIZED)
1088 dictionary_flags[0] |= FLAG_PAUSE1; // precede by short pause
1089 } else if (wtab[dictionary_skipwords].flags & FLAG_LAST_WORD) {
1090 // the word has attribute to stress or unstress when at end of clause
1091 if (dictionary_flags[0] & (FLAG_STRESS_END | FLAG_STRESS_END2))
1092 ChangeWordStress(tr, word_phonemes, 4);
1093 else if ((dictionary_flags[0] & FLAG_UNSTRESS_END) && (any_stressed_words))
1094 ChangeWordStress(tr, word_phonemes, 3);
1095 }
1096
1097 // dictionary flags for this word give a clue about which alternative pronunciations of
1098 // following words to use.
1099 if (end_type1 & SUFX_F) {
1100 // expect a verb form, with or without -s suffix
1101 tr->expect_verb = 2;
1102 tr->expect_verb_s = 2;
1103 }
1104
1105 if (dictionary_flags[1] & FLAG_PASTF) {
1106 // expect perfect tense in next two words
1107 tr->expect_past = 3;
1108 tr->expect_verb = 0;
1109 tr->expect_noun = 0;
1110 } else if (dictionary_flags[1] & FLAG_VERBF) {
1111 // expect a verb in the next word
1112 tr->expect_verb = 2;
1113 tr->expect_verb_s = 0; // verb won't have -s suffix
1114 tr->expect_noun = 0;
1115 } else if (dictionary_flags[1] & FLAG_VERBSF) {
1116 // expect a verb, must have a -s suffix
1117 tr->expect_verb = 0;
1118 tr->expect_verb_s = 2;
1119 tr->expect_past = 0;
1120 tr->expect_noun = 0;
1121 } else if (dictionary_flags[1] & FLAG_NOUNF) {
1122 // not expecting a verb next
1123 tr->expect_noun = 2;
1124 tr->expect_verb = 0;
1125 tr->expect_verb_s = 0;
1126 tr->expect_past = 0;
1127 }
1128
1129 if ((wordx[0] != 0) && (!(dictionary_flags[1] & FLAG_VERB_EXT))) {
1130 if (tr->expect_verb > 0)
1131 tr->expect_verb--;
1132
1133 if (tr->expect_verb_s > 0)
1134 tr->expect_verb_s--;
1135
1136 if (tr->expect_noun > 0)
1137 tr->expect_noun--;
1138
1139 if (tr->expect_past > 0)
1140 tr->expect_past--;
1141 }
1142
1143 if ((word_length == 1) && (tr->translator_name == L('e', 'n')) && iswalpha(first_char) && (first_char != 'i')) {
1144 // English Specific !!!!
1145 // any single letter before a dot is an abbreviation, except 'I'
1146 dictionary_flags[0] |= FLAG_ALLOW_DOT;
1147 }
1148
1149 if ((tr->langopts.param[LOPT_ALT] & 2) && ((dictionary_flags[0] & (FLAG_ALT_TRANS | FLAG_ALT2_TRANS)) != 0))
1150 ApplySpecialAttribute2(tr, word_phonemes, dictionary_flags[0]);
1151
1152 dictionary_flags[0] |= was_unpronouncable;
1153 memcpy(word_start, word_copy2, word_copy_length);
1154 return dictionary_flags[0];
1155 }
1156
// Translate one word and leave its phoneme codes in the global word_phonemes.
//
// tr:         translator used for the lookup
// word_start: the word text; passed straight through to TranslateWord3()
// wtab:       word table entry for this word. Its FLAG_FIRST_UPPER bit is
//             rewritten while replacement text is being re-translated.
// word_out:   if not NULL, buffer which TranslateWord3() may fill with replacement text
//
// Returns the flags from the first TranslateWord3() call.
//
// If those flags include FLAG_TEXTMODE and word_out was supplied, each word of the
// replacement text is translated in turn and the results are joined, separated by
// phonEND_WORD, into word_phonemes.
int TranslateWord(Translator *tr, char *word_start, WORD_TAB *wtab, char *word_out)
{
	char words_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes
	char *phonemes = words_phonemes;
	int available = N_WORD_PHONEMES;
	int first_word = 1;

	int flags = TranslateWord3(tr, word_start, wtab, word_out);
	if ((flags & FLAG_TEXTMODE) && word_out) {
		// Ensure that start of word rules match with the replaced text,
		// so that emoji and other characters are pronounced correctly.
		char word[N_WORD_BYTES+1];
		word[0] = 0;
		word[1] = ' ';
		// bounded copy: never write past the end of the local buffer
		snprintf(word+2, sizeof(word)-2, "%s", word_out);
		word_out = word+2;

		// an empty replacement must give an empty phoneme string, not uninitialised data
		words_phonemes[0] = 0;

		while (*word_out && available > 1) {
			int c;
			utf8_in(&c, word_out);
			if (iswupper(c)) {
				wtab->flags |= FLAG_FIRST_UPPER;
				// tolower() is only defined for unsigned char values (and EOF);
				// leave wider code points unchanged
				utf8_out(((unsigned int)c <= 0xff) ? tolower(c) : c, word_out);
			} else {
				wtab->flags &= ~FLAG_FIRST_UPPER;
			}

			TranslateWord3(tr, word_out, wtab, NULL);

			int n;
			if (first_word) {
				n = snprintf(phonemes, available, "%s", word_phonemes);
				first_word = 0;
			} else {
				n = snprintf(phonemes, available, "%c%s", phonEND_WORD, word_phonemes);
			}

			// snprintf() returns the length it wanted to write; on truncation only
			// (available - 1) characters were actually stored
			if (n < 0)
				n = 0;
			else if (n >= available)
				n = available - 1;

			available -= n;
			phonemes += n;

			// skip to the next word in a multi-word replacement,
			// stopping at the end of the text if there is no trailing space
			while (*word_out && !isspace((unsigned char)*word_out)) ++word_out;
			while (isspace((unsigned char)*word_out)) ++word_out;
		}
		snprintf(word_phonemes, sizeof(word_phonemes), "%s", words_phonemes);
	}
	return flags;
}
1205
// Fill in the phoneme list entry *p for phoneme code phcode.
// Stress level, tone and source index are zeroed. Any pending embedded-command
// flag (the global embedded_flag) is moved onto this entry and then cleared.
static void SetPlist2(PHONEME_LIST2 *p, unsigned char phcode)
{
	// hand the pending embedded flag over to this phoneme
	p->synthflags = embedded_flag;
	embedded_flag = 0;

	p->phcode = phcode;
	p->sourceix = 0;
	p->tone_ph = 0;
	p->stresslevel = 0;
}
1215
CountSyllables(unsigned char * phonemes)1216 static int CountSyllables(unsigned char *phonemes)
1217 {
1218 int count = 0;
1219 int phon;
1220 while ((phon = *phonemes++) != 0) {
1221 if (phoneme_tab[phon]->type == phVOWEL)
1222 count++;
1223 }
1224 return count;
1225 }
1226
Word_EmbeddedCmd()1227 static void Word_EmbeddedCmd()
1228 {
1229 // Process embedded commands for emphasis, sayas, and break
1230 int embedded_cmd;
1231 int value;
1232
1233 do {
1234 embedded_cmd = embedded_list[embedded_read++];
1235 value = embedded_cmd >> 8;
1236
1237 switch (embedded_cmd & 0x1f)
1238 {
1239 case EMBED_Y:
1240 option_sayas = value;
1241 break;
1242
1243 case EMBED_F:
1244 option_emphasis = value;
1245 break;
1246
1247 case EMBED_B:
1248 // break command
1249 if (value == 0)
1250 pre_pause = 0; // break=none
1251 else
1252 pre_pause += value;
1253 break;
1254 }
1255 } while (((embedded_cmd & 0x80) == 0) && (embedded_read < embedded_ix));
1256 }
1257
SetTranslator2(const char * new_language)1258 int SetTranslator2(const char *new_language)
1259 {
1260 // Set translator2 to a second language
1261 int new_phoneme_tab;
1262
1263 if ((new_phoneme_tab = SelectPhonemeTableName(new_language)) >= 0) {
1264 if ((translator2 != NULL) && (strcmp(new_language, translator2_language) != 0)) {
1265 // we already have an alternative translator, but not for the required language, delete it
1266 DeleteTranslator(translator2);
1267 translator2 = NULL;
1268 }
1269
1270 if (translator2 == NULL) {
1271 translator2 = SelectTranslator(new_language);
1272 strcpy(translator2_language, new_language);
1273
1274 if (LoadDictionary(translator2, translator2->dictionary_name, 0) != 0) {
1275 SelectPhonemeTable(voice->phoneme_tab_ix); // revert to original phoneme table
1276 new_phoneme_tab = -1;
1277 translator2_language[0] = 0;
1278 }
1279 translator2->phoneme_tab_ix = new_phoneme_tab;
1280 }
1281 }
1282 if (translator2 != NULL)
1283 translator2->phonemes_repeat[0] = 0;
1284 return new_phoneme_tab;
1285 }
1286
// Translate one word of a clause and append its phonemes to ph_list2[].
//
// tr:       translator state for the clause (pause timeout, previous-word flags, language options)
// word:     the word text; in phoneme-input mode (FLAG_PHONEMES) it holds phoneme mnemonics
// wtab:     word table entry for this word; neighbouring entries are also consulted
// prepause: amount of pause to insert before the word (consumed as pause phonemes below)
//
// Returns the dictionary flags for the word. FLAG_SPELLWORD is returned early, with
// nothing appended, when the word should be re-translated as individual letters.
// Returns 0 for an empty or deleted word.
static int TranslateWord2(Translator *tr, char *word, WORD_TAB *wtab, int prepause)
{
	int flags = 0;
	int stress;
	int next_stress;
	int next_tone = 0;
	unsigned char *p;
	int srcix;
	int found_dict_flag;
	unsigned char ph_code;
	PHONEME_LIST2 *plist2;
	PHONEME_TAB *ph;
	int max_stress;
	int max_stress_ix = 0;
	int prev_vowel = -1;
	int pitch_raised = 0;
	int switch_phonemes = -1;
	int first_phoneme = 1;
	int source_ix;
	int len;
	int ix;
	int sylimit; // max. number of syllables in a word to be combined with a preceding preposition
	const char *new_language;
	int bad_phoneme;
	int word_flags;
	int word_copy_len;
	char word_copy[N_WORD_BYTES+1];
	char word_replaced[N_WORD_BYTES+1];
	char old_dictionary_name[40];

	len = wtab->length;
	if (len > 31) len = 31;
	source_ix = (wtab->sourceix & 0x7ff) | (len << 11); // bits 0-10 sourceix, bits 11-15 word length

	word_flags = wtab[0].flags;
	if (word_flags & FLAG_EMBEDDED) {
		wtab[0].flags &= ~FLAG_EMBEDDED; // clear it in case we call TranslateWord2() again for the same word
		embedded_flag = SFLAG_EMBEDDED;

		Word_EmbeddedCmd();
	}

	if ((word[0] == 0) || (word_flags & FLAG_DELETE_WORD)) {
		// nothing to translate. Add a dummy phoneme to carry any embedded commands
		if (embedded_flag) {
			ph_list2[n_ph_list2].phcode = phonEND_WORD;
			ph_list2[n_ph_list2].stresslevel = 0;
			ph_list2[n_ph_list2].wordstress = 0;
			ph_list2[n_ph_list2].tone_ph = 0;
			ph_list2[n_ph_list2].synthflags = embedded_flag;
			ph_list2[n_ph_list2].sourceix = 0;
			n_ph_list2++;
			embedded_flag = 0;
		}
		word_phonemes[0] = 0;
		return 0;
	}

	// after a $pause word attribute, ignore a $pause attribute on the next two words
	if (tr->prepause_timeout > 0)
		tr->prepause_timeout--;

	if ((option_sayas & 0xf0) == 0x10) {
		if (!(word_flags & FLAG_FIRST_WORD)) {
			// SAYAS_CHARS, SAYAS_GLYPHS, or SAYAS_SINGLECHARS. Pause between each word.
			prepause += 4;
		}
	}

	if (word_flags & FLAG_FIRST_UPPER) {
		if ((option_capitals > 2) && (embedded_ix < N_EMBEDDED_LIST-6)) {
			// indicate capital letter by raising pitch
			if (embedded_flag)
				embedded_list[embedded_ix-1] &= ~0x80; // already embedded command before this word, remove terminator
			if ((pitch_raised = option_capitals) == 3)
				pitch_raised = 20; // default pitch raise for capitals
			embedded_list[embedded_ix++] = EMBED_P+0x40+0x80 + (pitch_raised << 8); // raise pitch
			embedded_flag = SFLAG_EMBEDDED;
		}
	}

	// p walks the global word_phonemes buffer, which the translation calls below fill in
	p = (unsigned char *)word_phonemes;
	if (word_flags & FLAG_PHONEMES) {
		// The input is in phoneme mnemonics, not language text
		int c1;
		char lang_name[12];

		if (memcmp(word, "_^_", 3) == 0) {
			// switch languages
			word += 3;
			for (ix = 0;;) {
				c1 = *word++;
				if ((c1 == ' ') || (c1 == 0))
					break;
				// keep within lang_name[]; an over-long name is truncated
				// rather than written past the end of the buffer
				if (ix < (int)sizeof(lang_name) - 1)
					lang_name[ix++] = tolower((unsigned char)c1);
			}
			lang_name[ix] = 0;

			if ((ix = LookupPhonemeTable(lang_name)) > 0) {
				SelectPhonemeTable(ix);
				word_phonemes[0] = phonSWITCH;
				word_phonemes[1] = ix;
				word_phonemes[2] = 0;
			}
		} else
			EncodePhonemes(word, word_phonemes, &bad_phoneme);
		flags = FLAG_FOUND;
	} else {
		int c2;
		// keep a copy of the word text so that it can be restored before re-translation
		ix = 0;
		while (((c2 = word_copy[ix] = word[ix]) != ' ') && (c2 != 0) && (ix < N_WORD_BYTES)) ix++;
		word_copy_len = ix;

		word_replaced[2] = 0;
		flags = TranslateWord(translator, word, wtab, &word_replaced[2]);

		if (flags & FLAG_SPELLWORD) {
			// re-translate the word as individual letters, separated by spaces
			memcpy(word, word_copy, word_copy_len);
			return flags;
		}

		if ((flags & FLAG_COMBINE) && !(wtab[1].flags & FLAG_PHONEMES)) {
			char *p2;
			int ok = 1;
			unsigned int flags2[2];
			int c_word2;
			char ph_buf[N_WORD_PHONEMES];

			flags2[0] = 0;
			sylimit = tr->langopts.param[LOPT_COMBINE_WORDS];

			// LANG=cs,sk
			// combine a preposition with the following word
			p2 = word;
			while (*p2 != ' ') p2++;

			utf8_in(&c_word2, p2+1); // first character of the next word;
			if (!iswalpha(c_word2))
				ok = 0;

			if (ok != 0) {
				// save this word's phonemes; translating the next word overwrites word_phonemes
				strcpy(ph_buf, word_phonemes);

				flags2[0] = TranslateWord(translator, p2+1, wtab+1, NULL);
				if ((flags2[0] & FLAG_WAS_UNPRONOUNCABLE) || (word_phonemes[0] == phonSWITCH))
					ok = 0;

				if (sylimit & 0x100) {
					// only if the second word has $alt attribute
					if ((flags2[0] & FLAG_ALT_TRANS) == 0)
						ok = 0;
				}

				if ((sylimit & 0x200) && ((wtab+1)->flags & FLAG_LAST_WORD)) {
					// not if the next word is end-of-sentence
					ok = 0;
				}

				if (ok == 0)
					strcpy(word_phonemes, ph_buf);
			}

			if (ok) {
				*p2 = '-'; // replace next space by hyphen
				wtab[0].flags &= ~FLAG_ALL_UPPER; // prevent it being considered an abbreviation
				flags = TranslateWord(translator, word, wtab, NULL); // translate the combined word
				if ((sylimit > 0) && (CountSyllables(p) > (sylimit & 0x1f))) {
					// revert to separate words
					*p2 = ' ';
					flags = TranslateWord(translator, word, wtab, NULL);
				} else {
					if (flags == 0)
						flags = flags2[0]; // no flags for the combined word, so use flags from the second word eg. lang-hu "nem december 7-e"
					flags |= FLAG_SKIPWORDS;
					dictionary_skipwords = 1;
				}
			}
		}

		if (p[0] == phonSWITCH) {
			int switch_attempt;
			strcpy(old_dictionary_name, dictionary_name);
			for (switch_attempt = 0; switch_attempt < 2; switch_attempt++) {
				// this word uses a different language
				memcpy(word, word_copy, word_copy_len);

				// the language name follows the phonSWITCH code; empty means English
				new_language = (char *)(&p[1]);
				if (new_language[0] == 0)
					new_language = "en";

				switch_phonemes = SetTranslator2(new_language);

				if (switch_phonemes >= 0) {
					// re-translate the word using the new translator
					wtab[0].flags |= FLAG_TRANSLATOR2;
					if (word_replaced[2] != 0) {
						word_replaced[0] = 0; // byte before the start of the word
						word_replaced[1] = ' ';
						flags = TranslateWord(translator2, &word_replaced[1], wtab, NULL);
					} else
						flags = TranslateWord(translator2, word, wtab, &word_replaced[2]);
				}

				if (p[0] != phonSWITCH)
					break;
			}

			if (p[0] == phonSWITCH)
				return FLAG_SPELLWORD;

			if (switch_phonemes < 0) {
				// language code is not recognised or 2nd translator won't translate it
				p[0] = phonSCHWA; // just say something
				p[1] = phonSCHWA;
				p[2] = 0;
			}

			if (switch_phonemes == -1) {
				strcpy(dictionary_name, old_dictionary_name);
				SelectPhonemeTable(voice->phoneme_tab_ix);

				// leave switch_phonemes set, but use the original phoneme table number.
				// This will suppress LOPT_REGRESSIVE_VOICING
				switch_phonemes = voice->phoneme_tab_ix; // original phoneme table
			}
		}

		if (!(word_flags & FLAG_HYPHEN)) {
			if (flags & FLAG_PAUSE1) {
				if (prepause < 1)
					prepause = 1;
			}
			if ((flags & FLAG_PREPAUSE) && !(word_flags & (FLAG_LAST_WORD | FLAG_FIRST_WORD)) && !(wtab[-1].flags & FLAG_FIRST_WORD) && (tr->prepause_timeout == 0)) {
				// the word is marked in the dictionary list with $pause
				if (prepause < 4) prepause = 4;
				tr->prepause_timeout = 3;
			}
		}

		if ((option_emphasis >= 3) && (prepause < 1))
			prepause = 1;
	}

	stress = 0;
	next_stress = 1;
	srcix = 0;
	max_stress = -1;

	found_dict_flag = 0;
	if ((flags & FLAG_FOUND) && !(flags & FLAG_TEXTMODE))
		found_dict_flag = SFLAG_DICTIONARY;

	while ((prepause > 0) && (n_ph_list2 < N_PHONEME_LIST-4)) {
		// add pause phonemes here. Either because of punctuation (brackets or quotes) in the
		// text, or because the word is marked in the dictionary lookup as a conjunction
		if (prepause > 1) {
			SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE);
			prepause -= 2;
		} else {
			SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE_NOLINK);
			prepause--;
		}
		tr->end_stressed_vowel = 0; // forget about the previous word
		tr->prev_dict_flags[0] = 0;
		tr->prev_dict_flags[1] = 0;
	}
	// remember where this word's entries start, to set its sourceix at the end
	plist2 = &ph_list2[n_ph_list2];

	if ((option_capitals == 1) && (word_flags & FLAG_FIRST_UPPER)) {
		SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE_SHORT);
		SetPlist2(&ph_list2[n_ph_list2++], phonCAPITAL);
		if ((word_flags & FLAG_ALL_UPPER) && IsAlpha(word[1])) {
			// word > 1 letter and all capitals
			SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE_SHORT);
			SetPlist2(&ph_list2[n_ph_list2++], phonCAPITAL);
		}
	}

	if (switch_phonemes >= 0) {
		if ((p[0] == phonPAUSE) && (p[1] == phonSWITCH)) {
			// the new word starts with a phoneme table switch, so there's no need to switch before it.
			if (ph_list2[n_ph_list2-1].phcode == phonSWITCH) {
				// previous phoneme is also a phonSWITCH, delete it
				n_ph_list2--;
			}
		} else {
			// this word uses a different phoneme table
			if (ph_list2[n_ph_list2-1].phcode == phonSWITCH) {
				// previous phoneme is also a phonSWITCH, just change its phoneme table number
				n_ph_list2--;
			} else
				SetPlist2(&ph_list2[n_ph_list2], phonSWITCH);
			ph_list2[n_ph_list2++].tone_ph = switch_phonemes; // temporary phoneme table number
		}
	}

	// remove initial pause from a word if it follows a hyphen
	if ((word_flags & FLAG_HYPHEN) && (phoneme_tab[*p]->type == phPAUSE))
		p++;

	if ((p[0] == 0) && (embedded_flag)) {
		// no phonemes. Insert a very short pause to carry an embedded command
		p[0] = phonPAUSE_VSHORT;
		p[1] = 0;
	}

	while (((ph_code = *p++) != 0) && (n_ph_list2 < N_PHONEME_LIST-4)) {
		if (ph_code == 255)
			continue; // unknown phoneme

		// Add the phonemes to the first stage phoneme list (ph_list2)
		ph = phoneme_tab[ph_code];

		if (ph_code == phonSWITCH) {
			// the byte after phonSWITCH is the phoneme table number
			ph_list2[n_ph_list2].phcode = ph_code;
			ph_list2[n_ph_list2].sourceix = 0;
			ph_list2[n_ph_list2].synthflags = 0;
			ph_list2[n_ph_list2++].tone_ph = *p;
			SelectPhonemeTable(*p);
			p++;
		} else if (ph->type == phSTRESS) {
			// don't add stress phonemes codes to the list, but give their stress
			// value to the next vowel phoneme
			// std_length is used to hold stress number or (if >10) a tone number for a tone language
			if (ph->program == 0)
				next_stress = ph->std_length;
			else {
				// for tone languages, the tone number for a syllable follows the vowel
				if (prev_vowel >= 0)
					ph_list2[prev_vowel].tone_ph = ph_code;
				else
					next_tone = ph_code; // no previous vowel, apply to the next vowel
			}
		} else if (ph_code == phonSYLLABIC) {
			// mark the previous phoneme as a syllabic consonant
			prev_vowel = n_ph_list2-1;
			ph_list2[prev_vowel].synthflags |= SFLAG_SYLLABLE;
			ph_list2[prev_vowel].stresslevel = next_stress;
		} else if (ph_code == phonLENGTHEN)
			ph_list2[n_ph_list2-1].synthflags |= SFLAG_LENGTHEN;
		else if (ph_code == phonEND_WORD) {
			// a || symbol in a phoneme string was used to indicate a word boundary
			// Don't add this phoneme to the list, but make sure the next phoneme has
			// a newword indication
			srcix = source_ix+1;
		} else if (ph_code == phonX1) {
			// a language specific action
			if (tr->langopts.param[LOPT_IT_DOUBLING])
				flags |= FLAG_DOUBLING;
		} else {
			ph_list2[n_ph_list2].phcode = ph_code;
			ph_list2[n_ph_list2].tone_ph = 0;
			ph_list2[n_ph_list2].synthflags = embedded_flag | found_dict_flag;
			embedded_flag = 0;
			ph_list2[n_ph_list2].sourceix = srcix;
			srcix = 0;

			if (ph->type == phVOWEL) {
				stress = next_stress;
				next_stress = 1; // default is 'unstressed'

				if (stress >= 4)
					any_stressed_words = 1;

				if ((prev_vowel >= 0) && (n_ph_list2-1) != prev_vowel)
					ph_list2[n_ph_list2-1].stresslevel = stress; // set stress for previous consonant

				ph_list2[n_ph_list2].synthflags |= SFLAG_SYLLABLE;
				prev_vowel = n_ph_list2;

				if (stress > max_stress) {
					max_stress = stress;
					max_stress_ix = n_ph_list2;
				}
				if (next_tone != 0) {
					ph_list2[n_ph_list2].tone_ph = next_tone;
					next_tone = 0;
				}
			} else {
				if (first_phoneme && tr->langopts.param[LOPT_IT_DOUBLING]) {
					if (((tr->prev_dict_flags[0] & FLAG_DOUBLING) && (tr->langopts.param[LOPT_IT_DOUBLING] & 1)) ||
					    (tr->end_stressed_vowel && (tr->langopts.param[LOPT_IT_DOUBLING] & 2))) {
						// italian, double the initial consonant if the previous word ends with a
						// stressed vowel, or is marked with a flag
						ph_list2[n_ph_list2].synthflags |= SFLAG_LENGTHEN;
					}
				}
			}

			ph_list2[n_ph_list2].stresslevel = stress;
			n_ph_list2++;
			first_phoneme = 0;
		}
	}

	if (word_flags & FLAG_COMMA_AFTER)
		SetPlist2(&ph_list2[n_ph_list2++], phonPAUSE_CLAUSE);

	// don't set new-word if there is a hyphen before it
	if ((word_flags & FLAG_HYPHEN) == 0)
		plist2->sourceix = source_ix;

	tr->end_stressed_vowel = 0;
	if ((stress >= 4) && (phoneme_tab[ph_list2[n_ph_list2-1].phcode]->type == phVOWEL))
		tr->end_stressed_vowel = 1; // word ends with a stressed vowel

	if (switch_phonemes >= 0) {
		// this word uses a different phoneme table, now switch back
		strcpy(dictionary_name, old_dictionary_name);
		SelectPhonemeTable(voice->phoneme_tab_ix);
		SetPlist2(&ph_list2[n_ph_list2], phonSWITCH);
		ph_list2[n_ph_list2++].tone_ph = voice->phoneme_tab_ix; // original phoneme table number
	}


	if (pitch_raised > 0) {
		embedded_list[embedded_ix++] = EMBED_P+0x60+0x80 + (pitch_raised << 8); // lower pitch
		SetPlist2(&ph_list2[n_ph_list2], phonPAUSE_SHORT);
		ph_list2[n_ph_list2++].synthflags = SFLAG_EMBEDDED;
	}

	if (flags & FLAG_STRESS_END2) {
		// this's word's stress could be increased later
		ph_list2[max_stress_ix].synthflags |= SFLAG_PROMOTE_STRESS;
	}

	tr->prev_dict_flags[0] = flags;
	return flags;
}
1717
EmbeddedCommand(unsigned int * source_index_out)1718 static int EmbeddedCommand(unsigned int *source_index_out)
1719 {
1720 // An embedded command to change the pitch, volume, etc.
1721 // returns number of commands added to embedded_list
1722
1723 // pitch,speed,amplitude,expression,reverb,tone,voice,sayas
1724 const char *commands = "PSARHTIVYMUBF";
1725 int value = -1;
1726 int sign = 0;
1727 unsigned char c;
1728 char *p;
1729 int cmd;
1730 int source_index = *source_index_out;
1731
1732 c = source[source_index];
1733 if (c == '+') {
1734 sign = 0x40;
1735 source_index++;
1736 } else if (c == '-') {
1737 sign = 0x60;
1738 source_index++;
1739 }
1740
1741 if (IsDigit09(source[source_index])) {
1742 value = atoi(&source[source_index]);
1743 while (IsDigit09(source[source_index]))
1744 source_index++;
1745 }
1746
1747 c = source[source_index++];
1748 if (embedded_ix >= (N_EMBEDDED_LIST - 2))
1749 return 0; // list is full
1750
1751 if ((p = strchr_w(commands, c)) == NULL)
1752 return 0;
1753 cmd = (p - commands)+1;
1754 if (value == -1) {
1755 value = embedded_default[cmd];
1756 sign = 0;
1757 }
1758
1759 if (cmd == EMBED_Y) {
1760 option_sayas2 = value;
1761 count_sayas_digits = 0;
1762 }
1763 if (cmd == EMBED_F) {
1764 if (value >= 3)
1765 word_emphasis = FLAG_EMPHASIZED;
1766 else
1767 word_emphasis = 0;
1768 }
1769
1770 embedded_list[embedded_ix++] = cmd + sign + (value << 8);
1771 *source_index_out = source_index;
1772 return 1;
1773 }
1774
// Apply the language's character replacement list (tr->langopts.replace_chars) to c.
//
// tr:        translator whose replace_chars list is used (NULL list means no replacement)
// c:         the character to examine
// next_in:   the following input character, used for two-character matches
// insert:    set to a second output character when the replacement produces two
// wordflags: FLAG_CHAR_REPLACED is OR-ed in when a replacement is made
//
// Returns the replacement character, or c if there is none, or 0 if c is 0.
// After a two-character match the next call returns 8 without looking at its
// arguments (presumably so that the caller drops the consumed character - confirm
// against the caller).
//
// The list holds pairs of values, ended by 0. In the first value of a pair the low 16
// bits are the character to match and the high 16 bits, if non-zero, are a required
// following character. In the second value the low 16 bits are the replacement and the
// high bits, if set, are a second character to insert.
static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{
	int ix;
	unsigned int word;
	unsigned int new_c, c2, c_lower;
	int upper_case = 0;
	static int ignore_next = 0;
	const unsigned int *replace_chars;

	if (ignore_next) {
		ignore_next = 0;
		return 8;
	}
	if (c == 0) return 0;

	if ((replace_chars = tr->langopts.replace_chars) == NULL)
		return c;

	// there is a list of character codes to be substituted with alternative codes

	if (iswupper(c_lower = c)) {
		c_lower = towlower2(c);
		upper_case = 1;
	}

	new_c = 0;
	for (ix = 0; (word = replace_chars[ix]) != 0; ix += 2) {
		if (c_lower == (word & 0xffff)) {
			if ((word >> 16) == 0) {
				new_c = replace_chars[ix+1];
				break;
			}
			if ((word >> 16) == (unsigned int)towlower2(next_in)) {
				new_c = replace_chars[ix+1];
				ignore_next = 1;
				break;
			}
		}
	}

	if (new_c == 0)
		return c; // no substitution

	// The replacement characters can be any 16-bit code point, so the wide-character
	// towupper() is used: toupper() is only defined for unsigned char values.
	if (new_c & 0xffe00000) {
		// there is a second character to be inserted
		// don't convert the case of the second character unless the next letter is also upper case
		c2 = new_c >> 16;
		if (upper_case && iswupper(next_in))
			c2 = towupper(c2);
		*insert = c2;
		new_c &= 0xffff;
	}

	if (upper_case)
		new_c = towupper(new_c);

	*wordflags |= FLAG_CHAR_REPLACED;
	return new_c;
}
1834
// Language specific examination and replacement of a single input character.
//
// tr:        the translator; its translator_name selects the language specific rules
// ptr:       points into the text at the position following c (ptr[0] may be overwritten)
// prev_in:   the previous input character
// c:         the character to translate
// next_in:   the next input character
// insert:    receives a second character to be output after the returned one, if any
// wordflags: passed on to SubstituteChar()
//
// Returns the character to use in place of c.
static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{
	static const unsigned char hangul_compatibility[0x34] = {
		0,    0x00, 0x01, 0xaa, 0x02, 0xac, 0xad, 0x03,
		0x04, 0x05, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb4,
		0xb6, 0x06, 0x07, 0x08, 0xb9, 0x09, 0x0a, 0xbc,
		0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x61,
		0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
		0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71,
		0x72, 0x73, 0x74, 0x75
	};

	// Korean Hangul syllable block: split into 2 or 3 individual jamo
	if ((c >= 0xac00) && (c <= 0xd7af)) {
		int syllable = c - 0xac00;
		int initial = (syllable/28)/21;
		int medial = (syllable/28) % 21;
		int final = syllable % 28;

		if (initial != 11) {
			// output the initial now, and queue the rest of the syllable with a null initial
			*insert = (11*28*21) + (medial*28) + final + 0xac00;
			return initial + 0x1100;
		}

		// null initial: output the vowel, followed by the final consonant if there is one
		if (final > 0)
			*insert = final + 0x11a7;
		return medial + 0x1161;
	}

	// Hangul compatibility jamo
	if ((c >= 0x3130) && (c < 0x3130 + 0x34))
		return hangul_compatibility[c - 0x3130] + 0x1100;

	if ((tr->translator_name == L('a', 'f')) || (tr->translator_name == L('n', 'l'))) {
		// look for 'n and replace by a special character (unicode: schwa)
		if (!iswalpha(prev_in)) {
			int next2;
			utf8_in(&next2, &ptr[1]);

			if ((c == '\'') && IsSpace(next2)) {
				if (next_in == 'n') {
					// n preceded by either apostrophe or U2019 "right single quotation mark"
					if (tr->translator_name == L('a', 'f'))
						ptr[0] = ' '; // Afrikaans: delete the n
					return 0x0259; // replace ' by unicode schwa character
				}
				if (next_in == 't') {
					// Dutch, [@t]
					return 0x0259; // replace ' by unicode schwa character
				}
			}
		}
	}

	return SubstituteChar(tr, c, next_in, insert, wordflags);
}
1903
1904 static const char *UCase_ga[] = { "bp", "bhf", "dt", "gc", "hA", "mb", "nd", "ng", "ts", "tA", "nA", NULL };
1905
UpperCaseInWord(Translator * tr,char * word,int c)1906 static int UpperCaseInWord(Translator *tr, char *word, int c)
1907 {
1908 int ix;
1909 int len;
1910 const char *p;
1911
1912 if (tr->translator_name == L('g', 'a')) {
1913 // Irish
1914 for (ix = 0;; ix++) {
1915 if ((p = UCase_ga[ix]) == NULL)
1916 break;
1917
1918 len = strlen(p);
1919 if ((word[-len] == ' ') && (memcmp(&word[-len+1], p, len-1) == 0)) {
1920 if ((c == p[len-1]) || ((p[len-1] == 'A') && IsVowel(tr, c)))
1921 return 1;
1922 }
1923 }
1924 }
1925 return 0;
1926 }
1927
TranslateClause(Translator * tr,int * tone_out,char ** voice_change)1928 void TranslateClause(Translator *tr, int *tone_out, char **voice_change)
1929 {
1930 int ix;
1931 int c;
1932 int cc = 0;
1933 unsigned int source_index = 0;
1934 unsigned int prev_source_index = 0;
1935 int source_index_word = 0;
1936 int prev_in;
1937 int prev_out = ' ';
1938 int prev_out2;
1939 int prev_in_save = 0;
1940 int next_in;
1941 int next_in_nbytes;
1942 int char_inserted = 0;
1943 int clause_pause;
1944 int pre_pause_add = 0;
1945 int word_mark = 0;
1946 int all_upper_case = FLAG_ALL_UPPER;
1947 int finished;
1948 int single_quoted;
1949 int phoneme_mode = 0;
1950 int dict_flags = 0; // returned from dictionary lookup
1951 int word_flags; // set here
1952 int next_word_flags;
1953 int new_sentence2;
1954 int embedded_count = 0;
1955 int letter_count = 0;
1956 int space_inserted = 0;
1957 int syllable_marked = 0;
1958 int decimal_sep_count = 0;
1959 char *word;
1960 char *p;
1961 int j, k;
1962 int n_digits;
1963 int charix_top = 0;
1964
1965 short charix[N_TR_SOURCE+4];
1966 WORD_TAB words[N_CLAUSE_WORDS];
1967 static char voice_change_name[40];
1968 int word_count = 0; // index into words
1969
1970 char sbuf[N_TR_SOURCE];
1971
1972 int terminator;
1973 int tone;
1974 int tone2;
1975
1976 if (tr == NULL)
1977 return;
1978
1979 embedded_ix = 0;
1980 embedded_read = 0;
1981 pre_pause = 0;
1982 any_stressed_words = 0;
1983
1984 if ((clause_start_char = count_characters) < 0)
1985 clause_start_char = 0;
1986 clause_start_word = count_words + 1;
1987
1988 for (ix = 0; ix < N_TR_SOURCE; ix++)
1989 charix[ix] = 0;
1990 terminator = ReadClause(tr, source, charix, &charix_top, N_TR_SOURCE, &tone2, voice_change_name);
1991
1992 charix[charix_top+1] = 0;
1993 charix[charix_top+2] = 0x7fff;
1994 charix[charix_top+3] = 0;
1995
1996 clause_pause = (terminator & CLAUSE_PAUSE) * 10; // mS
1997 if (terminator & CLAUSE_PAUSE_LONG)
1998 clause_pause = clause_pause * 32; // pause value is *320mS not *10mS
1999
2000 tone = (terminator & CLAUSE_INTONATION_TYPE) >> 12;
2001 if (tone2 != 0) {
2002 // override the tone type
2003 tone = tone2;
2004 }
2005
2006 for (p = source; *p != 0; p++) {
2007 if (!isspace2(*p))
2008 break;
2009 }
2010 if (*p == 0) {
2011 // No characters except spaces. This is not a sentence.
2012 // Don't add this pause, just make up the previous pause to this value;
2013 clause_pause -= max_clause_pause;
2014 if (clause_pause < 0)
2015 clause_pause = 0;
2016
2017 if (new_sentence)
2018 terminator |= CLAUSE_TYPE_SENTENCE; // carry forward an end-of-sentence indicator
2019 max_clause_pause += clause_pause;
2020 new_sentence2 = 0;
2021 } else {
2022 max_clause_pause = clause_pause;
2023 new_sentence2 = new_sentence;
2024 }
2025 tr->clause_terminator = terminator;
2026
2027 if (new_sentence2) {
2028 count_sentences++;
2029 if (skip_sentences > 0) {
2030 skip_sentences--;
2031 if (skip_sentences == 0)
2032 skipping_text = 0;
2033 }
2034 }
2035
2036 memset(&ph_list2[0], 0, sizeof(ph_list2[0]));
2037 ph_list2[0].phcode = phonPAUSE_SHORT;
2038
2039 n_ph_list2 = 1;
2040 tr->prev_last_stress = 0;
2041 tr->prepause_timeout = 0;
2042 tr->expect_verb = 0;
2043 tr->expect_noun = 0;
2044 tr->expect_past = 0;
2045 tr->expect_verb_s = 0;
2046 tr->phonemes_repeat_count = 0;
2047 tr->end_stressed_vowel = 0;
2048 tr->prev_dict_flags[0] = 0;
2049 tr->prev_dict_flags[1] = 0;
2050
2051 word_count = 0;
2052 single_quoted = 0;
2053 word_flags = 0;
2054 next_word_flags = 0;
2055
2056 sbuf[0] = 0;
2057 sbuf[1] = ' ';
2058 sbuf[2] = ' ';
2059 ix = 3;
2060 prev_in = ' ';
2061
2062 words[0].start = ix;
2063 words[0].flags = 0;
2064 finished = 0;
2065
2066 for (j = 0; charix[j] <= 0; j++) ;
2067 words[0].sourceix = charix[j];
2068 k = 0;
2069 while (charix[j] != 0) {
2070 // count the number of characters (excluding multibyte continuation bytes)
2071 if (charix[j++] != -1)
2072 k++;
2073 }
2074 words[0].length = k;
2075
2076 while (!finished && (ix < (int)sizeof(sbuf)) && (n_ph_list2 < N_PHONEME_LIST-4)) {
2077 prev_out2 = prev_out;
2078 utf8_in2(&prev_out, &sbuf[ix-1], 1);
2079
2080 if (tr->langopts.tone_numbers && IsDigit09(prev_out) && IsAlpha(prev_out2)) {
2081 // tone numbers can be part of a word, consider them as alphabetic
2082 prev_out = 'a';
2083 }
2084
2085 if (prev_in_save != 0) {
2086 prev_in = prev_in_save;
2087 prev_in_save = 0;
2088 } else if (source_index > 0)
2089 utf8_in2(&prev_in, &source[source_index-1], 1);
2090
2091 prev_source_index = source_index;
2092
2093 if (char_inserted) {
2094 c = char_inserted;
2095 char_inserted = 0;
2096 } else {
2097 source_index += utf8_in(&cc, &source[source_index]);
2098 c = cc;
2099 }
2100 next_in_nbytes = utf8_in(&next_in, &source[source_index]);
2101
2102 if (c == 0) {
2103 finished = 1;
2104 c = ' ';
2105 }
2106
2107 if ((c == CTRL_EMBEDDED) || (c == ctrl_embedded)) {
2108 // start of embedded command in the text
2109 int srcix = source_index-1;
2110
2111 if (prev_in != ' ') {
2112 c = ' ';
2113 prev_in_save = c;
2114 source_index--;
2115 } else {
2116 embedded_count += EmbeddedCommand(&source_index);
2117 prev_in_save = prev_in;
2118 // replace the embedded command by spaces
2119 memset(&source[srcix], ' ', source_index-srcix);
2120 source_index = srcix;
2121 continue;
2122 }
2123 }
2124
2125 if ((option_sayas2 == SAYAS_KEY) && (c != ' ')) {
2126 if ((prev_in == ' ') && (next_in == ' '))
2127 option_sayas2 = SAYAS_SINGLE_CHARS; // single character, speak its name
2128 c = towlower2(c);
2129 }
2130
2131
2132 if (phoneme_mode) {
2133 all_upper_case = FLAG_PHONEMES;
2134
2135 if ((c == ']') && (next_in == ']')) {
2136 phoneme_mode = 0;
2137 source_index++;
2138 c = ' ';
2139 }
2140 } else if ((option_sayas2 & 0xf0) == SAYAS_DIGITS) {
2141 if (iswdigit(c)) {
2142 count_sayas_digits++;
2143 if (count_sayas_digits > (option_sayas2 & 0xf)) {
2144 // break after the specified number of digits
2145 c = ' ';
2146 space_inserted = 1;
2147 count_sayas_digits = 0;
2148 }
2149 } else {
2150 count_sayas_digits = 0;
2151 if (iswdigit(prev_out)) {
2152 c = ' ';
2153 space_inserted = 1;
2154 }
2155 }
2156 } else if ((option_sayas2 & 0x10) == 0) {
2157 // speak as words
2158
2159 if ((c == 0x92) || (c == 0xb4) || (c == 0x2019) || (c == 0x2032))
2160 c = '\''; // 'microsoft' quote or sexed closing single quote, or prime - possibly used as apostrophe
2161
2162 if (((c == 0x2018) || (c == '?')) && IsAlpha(prev_out) && IsAlpha(next_in)) {
2163 // ? between two letters may be a smart-quote replaced by ?
2164 c = '\'';
2165 }
2166
2167 if (c == CHAR_EMPHASIS) {
2168 // this character is a marker that the previous word is the focus of the clause
2169 c = ' ';
2170 word_flags |= FLAG_FOCUS;
2171 }
2172
2173 if (c == CHAR_COMMA_BREAK) {
2174 c = ' ';
2175 word_flags |= FLAG_COMMA_AFTER;
2176 }
2177
2178 c = TranslateChar(tr, &source[source_index], prev_in, c, next_in, &char_inserted, &word_flags); // optional language specific function
2179 if (c == 8)
2180 continue; // ignore this character
2181
2182 if (char_inserted)
2183 next_in = char_inserted;
2184
2185 // allow certain punctuation within a word (usually only apostrophe)
2186 if (!IsAlpha(c) && !IsSpace(c) && (wcschr(tr->punct_within_word, c) == 0)) {
2187 if (IsAlpha(prev_out)) {
2188 if (tr->langopts.tone_numbers && IsDigit09(c) && !IsDigit09(next_in)) {
2189 // allow a tone number as part of the word
2190 } else {
2191 c = ' '; // ensure we have an end-of-word terminator
2192 space_inserted = 1;
2193 }
2194 }
2195 }
2196
2197 if (iswdigit(prev_out)) {
2198 if (!iswdigit(c) && (c != '.') && (c != ',') && (c != ' ')) {
2199 c = ' '; // terminate digit string with a space
2200 space_inserted = 1;
2201 }
2202 } else { // Prev output is not digit
2203 if (prev_in == ',') {
2204 // Workaround for several consecutive commas —
2205 // replace current character with space
2206 if (c == ',')
2207 c = ' ';
2208 } else {
2209 decimal_sep_count = 0;
2210 }
2211 }
2212
2213 if (c == '[') {
2214 if ((next_in == '\002') || ((next_in == '[') && option_phoneme_input)) {
2215 // "[\002" is used internally to start phoneme mode
2216 phoneme_mode = FLAG_PHONEMES;
2217 source_index++;
2218 continue;
2219 }
2220 }
2221
2222 if (IsAlpha(c)) {
2223 if (!IsAlpha(prev_out) || (tr->langopts.ideographs && ((c > 0x3040) || (prev_out > 0x3040)))) {
2224 if (wcschr(tr->punct_within_word, prev_out) == 0)
2225 letter_count = 0; // don't reset count for an apostrophy within a word
2226
2227 if ((prev_out != ' ') && (wcschr(tr->punct_within_word, prev_out) == 0)) {
2228 // start of word, insert space if not one there already
2229 c = ' ';
2230 space_inserted = 1;
2231
2232 if (!IsBracket(prev_out)) // ?? perhaps only set FLAG_NOSPACE for . - / (hyphenated words, URLs, etc)
2233 next_word_flags |= FLAG_NOSPACE;
2234 } else {
2235 if (iswupper(c))
2236 word_flags |= FLAG_FIRST_UPPER;
2237
2238 if ((prev_out == ' ') && iswdigit(sbuf[ix-2]) && !iswdigit(prev_in)) {
2239 // word, following a number, but with a space between
2240 // Add an extra space, to distinguish "2 a" from "2a"
2241 sbuf[ix++] = ' ';
2242 words[word_count].start++;
2243 }
2244 }
2245 }
2246
2247 if (c != ' ') {
2248 letter_count++;
2249
2250 if (tr->letter_bits_offset > 0) {
2251 if (((c < 0x250) && (prev_out >= tr->letter_bits_offset)) ||
2252 ((c >= tr->letter_bits_offset) && (letter_count > 1) && (prev_out < 0x250))) {
2253 // Don't mix native and Latin characters in the same word
2254 // Break into separate words
2255 if (IsAlpha(prev_out)) {
2256 c = ' ';
2257 space_inserted = 1;
2258 word_flags |= FLAG_HYPHEN_AFTER;
2259 next_word_flags |= FLAG_HYPHEN;
2260 }
2261 }
2262 }
2263 }
2264
2265 if (iswupper(c)) {
2266 c = towlower2(c);
2267
2268 if ((j = tr->langopts.param[LOPT_CAPS_IN_WORD]) > 0) {
2269 if ((j == 2) && (syllable_marked == 0)) {
2270 char_inserted = c;
2271 c = 0x2c8; // stress marker
2272 syllable_marked = 1;
2273 }
2274 } else {
2275 if (iswlower(prev_in)) {
2276 // lower case followed by upper case in a word
2277 if (UpperCaseInWord(tr, &sbuf[ix], c) == 1) {
2278 // convert to lower case and continue
2279 c = towlower2(c);
2280 } else {
2281 c = ' '; // lower case followed by upper case, treat as new word
2282 space_inserted = 1;
2283 prev_in_save = c;
2284 }
2285 } else if ((c != ' ') && iswupper(prev_in) && iswlower(next_in)) {
2286 int next2_in;
2287 utf8_in(&next2_in, &source[source_index + next_in_nbytes]);
2288
2289 if ((tr->translator_name == L('n', 'l')) && (letter_count == 2) && (c == 'j') && (prev_in == 'I')) {
2290 // Dutch words may capitalise initial IJ, don't split
2291 } else if (IsAlpha(next2_in)) {
2292 // changing from upper to lower case, start new word at the last uppercase, if 3 or more letters
2293 c = ' ';
2294 space_inserted = 1;
2295 prev_in_save = c;
2296 next_word_flags |= FLAG_NOSPACE;
2297 }
2298 }
2299 }
2300 } else {
2301 if ((all_upper_case) && (letter_count > 2)) {
2302 if ((c == 's') && (next_in == ' ')) {
2303 c = ' ';
2304 all_upper_case |= FLAG_HAS_PLURAL;
2305
2306 if (sbuf[ix-1] == '\'')
2307 sbuf[ix-1] = ' ';
2308 } else
2309 all_upper_case = 0; // current word contains lower case letters, not "'s"
2310 } else
2311 all_upper_case = 0;
2312 }
2313 } else if (c == '-') {
2314 if (!IsSpace(prev_in) && IsAlpha(next_in)) {
2315 if (prev_out != ' ') {
2316 // previous 'word' not yet ended (not alpha or numeric), start new word now.
2317 c = ' ';
2318 space_inserted = 1;
2319 } else {
2320 // '-' between two letters is a hyphen, treat as a space
2321 word_flags |= FLAG_HYPHEN;
2322 if (word_count > 0)
2323 words[word_count-1].flags |= FLAG_HYPHEN_AFTER;
2324 c = ' ';
2325 }
2326 } else if ((prev_in == ' ') && (next_in == ' ')) {
2327 // ' - ' dash between two spaces, treat as pause
2328 c = ' ';
2329 pre_pause_add = 4;
2330 } else if (next_in == '-') {
2331 // double hyphen, treat as pause
2332 source_index++;
2333 c = ' ';
2334 pre_pause_add = 4;
2335 } else if ((prev_out == ' ') && IsAlpha(prev_out2) && !IsAlpha(prev_in)) {
2336 // insert extra space between a word + space + hyphen, to distinguish 'a -2' from 'a-2'
2337 sbuf[ix++] = ' ';
2338 words[word_count].start++;
2339 }
2340 } else if (c == '.') {
2341 if (prev_out == '.') {
2342 // multiple dots, separate by spaces. Note >3 dots has been replaced by elipsis
2343 c = ' ';
2344 space_inserted = 1;
2345 } else if ((word_count > 0) && !(words[word_count-1].flags & FLAG_NOSPACE) && IsAlpha(prev_in)) {
2346 // dot after a word, with space following, probably an abbreviation
2347 words[word_count-1].flags |= FLAG_HAS_DOT;
2348
2349 if (IsSpace(next_in) || (next_in == '-'))
2350 c = ' '; // remove the dot if it's followed by a space or hyphen, so that it's not pronounced
2351 }
2352 } else if (c == '\'') {
2353 if (((prev_in == '.') || iswalnum(prev_in)) && IsAlpha(next_in)) {
2354 // between two letters, or in an abbreviation (eg. u.s.a.'s). Consider the apostrophe as part of the word
2355 single_quoted = 0;
2356 } else if ((tr->langopts.param[LOPT_APOSTROPHE] & 1) && IsAlpha(next_in))
2357 single_quoted = 0; // apostrophe at start of word is part of the word
2358 else if ((tr->langopts.param[LOPT_APOSTROPHE] & 2) && IsAlpha(prev_in))
2359 single_quoted = 0; // apostrophe at end of word is part of the word
2360 else if ((wcschr(tr->char_plus_apostrophe, prev_in) != 0) && (prev_out2 == ' ')) {
2361 // consider single character plus apostrophe as a word
2362 single_quoted = 0;
2363 if (next_in == ' ')
2364 source_index++; // skip following space
2365 } else {
2366 if ((prev_out == 's') && (single_quoted == 0)) {
2367 // looks like apostrophe after an 's'
2368 c = ' ';
2369 } else {
2370 if (IsSpace(prev_out))
2371 single_quoted = 1;
2372 else
2373 single_quoted = 0;
2374
2375 pre_pause_add = 4; // single quote
2376 c = ' ';
2377 }
2378 }
2379 } else if (lookupwchar(breaks, c) != 0)
2380 c = ' '; // various characters to treat as space
2381 else if (iswdigit(c)) {
2382 if (tr->langopts.tone_numbers && IsAlpha(prev_out) && !IsDigit(next_in)) {
2383 } else if ((prev_out != ' ') && !iswdigit(prev_out)) {
2384 if ((prev_out != tr->langopts.decimal_sep) || ((decimal_sep_count > 0) && (tr->langopts.decimal_sep == ','))) {
2385 c = ' ';
2386 space_inserted = 1;
2387 } else
2388 decimal_sep_count = 1;
2389 } else if ((prev_out == ' ') && IsAlpha(prev_out2) && !IsAlpha(prev_in)) {
2390 // insert extra space between a word and a number, to distinguish 'a 2' from 'a2'
2391 sbuf[ix++] = ' ';
2392 words[word_count].start++;
2393 }
2394 }
2395 }
2396
2397 if (IsSpace(c)) {
2398 if (prev_out == ' ') {
2399 word_flags |= FLAG_MULTIPLE_SPACES;
2400 continue; // multiple spaces
2401 }
2402
2403 if ((cc == 0x09) || (cc == 0x0a))
2404 next_word_flags |= FLAG_MULTIPLE_SPACES; // tab or newline, not a simple space
2405
2406 if (space_inserted) {
2407 // count the number of characters since the start of the word
2408 j = 0;
2409 k = source_index - 1;
2410 while ((k >= source_index_word) && (charix[k] != 0)) {
2411 if (charix[k] > 0) // don't count initial bytes of multi-byte character
2412 j++;
2413 k--;
2414 }
2415 words[word_count].length = j;
2416 }
2417
2418 source_index_word = source_index;
2419
2420 // end of 'word'
2421 sbuf[ix++] = ' ';
2422
2423 if ((word_count < N_CLAUSE_WORDS-1) && (ix > words[word_count].start)) {
2424 if (embedded_count > 0) {
2425 // there are embedded commands before this word
2426 embedded_list[embedded_ix-1] |= 0x80; // terminate list of commands for this word
2427 words[word_count].flags |= FLAG_EMBEDDED;
2428 embedded_count = 0;
2429 }
2430 words[word_count].pre_pause = pre_pause;
2431 words[word_count].flags |= (all_upper_case | word_flags | word_emphasis);
2432 words[word_count].wmark = word_mark;
2433
2434 if (pre_pause > 0) {
2435 // insert an extra space before the word, to prevent influence from previous word across the pause
2436 for (j = ix; j > words[word_count].start; j--)
2437 sbuf[j] = sbuf[j-1];
2438 sbuf[j] = ' ';
2439 words[word_count].start++;
2440 ix++;
2441 }
2442
2443 word_count++;
2444 words[word_count].start = ix;
2445 words[word_count].flags = 0;
2446
2447 for (j = source_index; charix[j] <= 0; j++) // skip blanks
2448 ;
2449 words[word_count].sourceix = charix[j];
2450 k = 0;
2451 while (charix[j] != 0) {
2452 // count the number of characters (excluding multibyte continuation bytes)
2453 if (charix[j++] != -1)
2454 k++;
2455 }
2456 words[word_count].length = k;
2457
2458 word_flags = next_word_flags;
2459 next_word_flags = 0;
2460 pre_pause = 0;
2461 word_mark = 0;
2462 all_upper_case = FLAG_ALL_UPPER;
2463 syllable_marked = 0;
2464 }
2465
2466 if (space_inserted) {
2467 source_index = prev_source_index; // rewind to the previous character
2468 char_inserted = 0;
2469 space_inserted = 0;
2470 }
2471 } else {
2472 if ((ix < (N_TR_SOURCE - 4)))
2473 ix += utf8_out(c, &sbuf[ix]);
2474 }
2475 if (pre_pause_add > pre_pause)
2476 pre_pause = pre_pause_add;
2477 pre_pause_add = 0;
2478 }
2479
2480 if ((word_count == 0) && (embedded_count > 0)) {
2481 // add a null 'word' to carry the embedded command flag
2482 embedded_list[embedded_ix-1] |= 0x80;
2483 words[word_count].flags |= FLAG_EMBEDDED;
2484 word_count = 1;
2485 }
2486
2487 tr->clause_end = &sbuf[ix-1];
2488 sbuf[ix] = 0;
2489 words[0].pre_pause = 0; // don't add extra pause at beginning of clause
2490 words[word_count].pre_pause = 8;
2491 if (word_count > 0) {
2492 ix = word_count-1;
2493 while ((ix > 0) && (IsBracket(sbuf[words[ix].start])))
2494 ix--; // the last word is a bracket, mark the previous word as last
2495 words[ix].flags |= FLAG_LAST_WORD;
2496
2497 // FLAG_NOSPACE check to avoid recognizing .mr -mr
2498 if ((terminator & CLAUSE_DOT_AFTER_LAST_WORD) && !(words[word_count-1].flags & FLAG_NOSPACE))
2499 words[word_count-1].flags |= FLAG_HAS_DOT;
2500 }
2501 words[0].flags |= FLAG_FIRST_WORD;
2502
2503 for (ix = 0; ix < word_count; ix++) {
2504 int nx;
2505 int c_temp;
2506 char *pn;
2507 char *pw;
2508 int nw;
2509 char number_buf[150];
2510 WORD_TAB num_wtab[50]; // copy of 'words', when splitting numbers into parts
2511
2512 // start speaking at a specified word position in the text?
2513 count_words++;
2514 if (skip_words > 0) {
2515 skip_words--;
2516 if (skip_words == 0)
2517 skipping_text = 0;
2518 }
2519 if (skipping_text)
2520 continue;
2521
2522 current_alphabet = NULL;
2523
2524 // digits should have been converted to Latin alphabet ('0' to '9')
2525 word = pw = &sbuf[words[ix].start];
2526
2527 if (iswdigit(word[0]) && (tr->langopts.break_numbers != BREAK_THOUSANDS)) {
2528 // Languages with 100000 numbers. Remove thousands separators so that we can insert them again later
2529 pn = number_buf;
2530 while (pn < &number_buf[sizeof(number_buf)-20]) {
2531 if (iswdigit(*pw))
2532 *pn++ = *pw++;
2533 else if ((*pw == tr->langopts.thousands_sep) && (pw[1] == ' ')
2534 && iswdigit(pw[2]) && (pw[3] != ' ') && (pw[4] != ' ')) { // don't allow only 1 or 2 digits in the final part
2535 pw += 2;
2536 ix++; // skip "word"
2537 } else {
2538 nx = pw - word;
2539 memset(word, ' ', nx);
2540 nx = pn - number_buf;
2541 memcpy(word, number_buf, nx);
2542 break;
2543 }
2544 }
2545 pw = word;
2546 }
2547
2548 for (n_digits = 0; iswdigit(word[n_digits]); n_digits++) // count consecutive digits
2549 ;
2550
2551 if (n_digits > 4) {
2552 // word is entirely digits, insert commas and break into 3 digit "words"
2553 number_buf[0] = ' ';
2554 pn = &number_buf[1];
2555 nx = n_digits;
2556 nw = 0;
2557
2558 if ((n_digits > tr->langopts.max_digits) || (word[0] == '0'))
2559 words[ix].flags |= FLAG_INDIVIDUAL_DIGITS;
2560
2561 while (pn < &number_buf[sizeof(number_buf)-20]) {
2562 if (!IsDigit09(c = *pw++) && (c != tr->langopts.decimal_sep))
2563 break;
2564
2565 *pn++ = c;
2566 nx--;
2567 if ((nx > 0) && (tr->langopts.break_numbers & (1 << nx))) {
2568 memcpy(&num_wtab[nw++], &words[ix], sizeof(WORD_TAB)); // copy the 'words' entry for each word of numbers
2569
2570 if (tr->langopts.thousands_sep != ' ')
2571 *pn++ = tr->langopts.thousands_sep;
2572 *pn++ = ' ';
2573
2574 if ((words[ix].flags & FLAG_INDIVIDUAL_DIGITS) == 0) {
2575 if (tr->langopts.break_numbers & (1 << (nx-1))) {
2576 // the next group only has 1 digits, make it three
2577 *pn++ = '0';
2578 *pn++ = '0';
2579 }
2580 if (tr->langopts.break_numbers & (1 << (nx-2))) {
2581 // the next group only has 2 digits (eg. Indian languages), make it three
2582 *pn++ = '0';
2583 }
2584 }
2585 }
2586 }
2587 pw--;
2588 memcpy(&num_wtab[nw], &words[ix], sizeof(WORD_TAB)*2); // the original number word, and the word after it
2589
2590 for (j = 1; j <= nw; j++)
2591 num_wtab[j].flags &= ~(FLAG_MULTIPLE_SPACES | FLAG_EMBEDDED); // don't use these flags for subsequent parts when splitting a number
2592
2593 // include the next few characters, in case there are an ordinal indicator or other suffix
2594 memcpy(pn, pw, 16);
2595 pn[16] = 0;
2596 nw = 0;
2597
2598 for (pw = &number_buf[1]; pw < pn;) {
2599 // keep wflags for each part, for FLAG_HYPHEN_AFTER
2600 dict_flags = TranslateWord2(tr, pw, &num_wtab[nw++], words[ix].pre_pause);
2601 while (*pw++ != ' ')
2602 ;
2603 words[ix].pre_pause = 0;
2604 }
2605 } else {
2606 pre_pause = 0;
2607
2608 dict_flags = TranslateWord2(tr, word, &words[ix], words[ix].pre_pause);
2609
2610 if (pre_pause > words[ix+1].pre_pause) {
2611 words[ix+1].pre_pause = pre_pause;
2612 pre_pause = 0;
2613 }
2614
2615 if (dict_flags & FLAG_SPELLWORD) {
2616 // redo the word, speaking single letters
2617 for (pw = word; *pw != ' ';) {
2618 memset(number_buf, ' ', 9);
2619 nx = utf8_in(&c_temp, pw);
2620 memcpy(&number_buf[2], pw, nx);
2621 TranslateWord2(tr, &number_buf[2], &words[ix], 0);
2622 pw += nx;
2623 }
2624 }
2625
2626 if ((dict_flags & (FLAG_ALLOW_DOT | FLAG_NEEDS_DOT)) && (ix == word_count - 1 - dictionary_skipwords) && (terminator & CLAUSE_DOT_AFTER_LAST_WORD)) {
2627 // probably an abbreviation such as Mr. or B. rather than end of sentence
2628 clause_pause = 10;
2629 tone = 4;
2630 }
2631 }
2632
2633 if (dict_flags & FLAG_SKIPWORDS) {
2634 // dictionary indicates skip next word(s)
2635 while (dictionary_skipwords > 0) {
2636 words[ix+dictionary_skipwords].flags |= FLAG_DELETE_WORD;
2637 dictionary_skipwords--;
2638 }
2639 }
2640 }
2641
2642 if (embedded_read < embedded_ix) {
2643 // any embedded commands not yet processed?
2644 Word_EmbeddedCmd();
2645 }
2646
2647 for (ix = 0; ix < 2; ix++) {
2648 // terminate the clause with 2 PAUSE phonemes
2649 PHONEME_LIST2 *p2;
2650 p2 = &ph_list2[n_ph_list2 + ix];
2651 p2->phcode = phonPAUSE;
2652 p2->stresslevel = 0;
2653 p2->sourceix = source_index;
2654 p2->synthflags = 0;
2655 }
2656 n_ph_list2 += 2;
2657
2658 if (count_words == 0)
2659 clause_pause = 0;
2660 if (Eof() && ((word_count == 0) || (option_endpause == 0)))
2661 clause_pause = 10;
2662
2663 MakePhonemeList(tr, clause_pause, new_sentence2);
2664 phoneme_list[N_PHONEME_LIST].ph = NULL; // recognize end of phoneme_list array, in Generate()
2665 phoneme_list[N_PHONEME_LIST].sourceix = 1;
2666
2667 if (embedded_count) { // ???? is this needed
2668 phoneme_list[n_phoneme_list-2].synthflags = SFLAG_EMBEDDED;
2669 embedded_list[embedded_ix-1] |= 0x80;
2670 embedded_list[embedded_ix] = 0x80;
2671 }
2672
2673 prev_clause_pause = clause_pause;
2674
2675 if (tone_out != NULL)
2676 *tone_out = tone;
2677
2678 new_sentence = 0;
2679 if (terminator & CLAUSE_TYPE_SENTENCE)
2680 new_sentence = 1; // next clause is a new sentence
2681
2682 if (voice_change != NULL) {
2683 // return new voice name if an embedded voice change command terminated the clause
2684 if (terminator & CLAUSE_TYPE_VOICE_CHANGE)
2685 *voice_change = voice_change_name;
2686 else
2687 *voice_change = NULL;
2688 }
2689 }
2690
InitText(int control)2691 void InitText(int control)
2692 {
2693 count_sentences = 0;
2694 count_words = 0;
2695 end_character_position = 0;
2696 skip_sentences = 0;
2697 skip_marker[0] = 0;
2698 skip_words = 0;
2699 skip_characters = 0;
2700 skipping_text = 0;
2701 new_sentence = 1;
2702
2703 prev_clause_pause = 0;
2704
2705 option_sayas = 0;
2706 option_sayas2 = 0;
2707 option_emphasis = 0;
2708 word_emphasis = 0;
2709 embedded_flag = 0;
2710
2711 InitText2();
2712
2713 if ((control & espeakKEEP_NAMEDATA) == 0)
2714 InitNamedata();
2715 }
2716