1 /***************************************************************************
2 * Copyright (C) 2005 to 2014 by Jonathan Duddington *
3 * email: jonsd@users.sourceforge.net *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 3 of the License, or *
8 * (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, see: *
17 * <http://www.gnu.org/licenses/>. *
18 ***************************************************************************/
19
20 #include "StdAfx.h"
21
22 #include <stdio.h>
23 #include <ctype.h>
24 #include <stdlib.h>
25 #include <string.h>
26
27 #include <wctype.h>
28 #include <wchar.h>
29
30 #include "speak_lib.h"
31 #include "speech.h"
32 #include "phoneme.h"
33 #include "synthesize.h"
34 #include "voice.h"
35 #include "translate.h"
36
37 #define WORD_STRESS_CHAR '*'
38
39
// ---- File-level state for the text -> phoneme translation stage ----
// The two translator instances. Both start as NULL.
40 Translator *translator = NULL; // the main translator
41 Translator *translator2 = NULL; // secondary translator for certain words
42 static char translator2_language[20] = {0}; // NOTE(review): presumably the language name translator2 was created for - confirm
43
// Option values. All are initialised to 0 (or NULL) here.
44 FILE *f_trans = NULL; // phoneme output text
45 int option_tone2 = 0;
46 int option_tone_flags = 0; // bit 8=emphasize allcaps, bit 9=emphasize penultimate stress
47 int option_phonemes = 0;
48 int option_phoneme_events = 0;
49 int option_quiet = 0;
50 int option_endpause = 0; // suppress pause after end of text
51 int option_capitals = 0;
52 int option_punctuation = 0;
53 int option_sayas = 0;
54 static int option_sayas2 = 0; // used in translate_clause()
55 static int option_emphasis = 0; // 0=normal, 1=normal, 2=weak, 3=moderate, 4=strong
56 int option_ssml = 0;
57 int option_phoneme_input = 0; // allow [[phonemes]] in input
58 int option_phoneme_variants = 0; // 0= don't display phoneme variant mnemonics
59 int option_wordgap = 0;
60
// Counters and position tracking used while skipping/reading text.
61 static int count_sayas_digits;
62 int skip_sentences;
63 int skip_words;
64 int skip_characters;
65 char skip_marker[N_MARKER_LENGTH];
66 int skipping_text; // waiting until word count, sentence count, or named marker is reached
67 int end_character_position;
68 int count_sentences;
69 int count_words;
70 int clause_start_char;
71 int clause_start_word;
72 int new_sentence;
73 static int word_emphasis = 0; // set if emphasis level 3 or 4
74 static int embedded_flag = 0; // there are embedded commands to be applied to the next phoneme, used in TranslateWord2()
75
// Per-clause pause/stress bookkeeping.
76 static int prev_clause_pause=0;
77 static int max_clause_pause = 0;
78 static int any_stressed_words;
79 int pre_pause;
80 ALPHABET *current_alphabet;
81
82
83 // these were previously in translator class
84 #ifdef PLATFORM_WINDOWS
85 char word_phonemes[N_WORD_PHONEMES*2]; // longer, because snprintf() is not available
86 #else
87 char word_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes
88 #endif
89 int n_ph_list2; // NOTE(review): presumably the number of entries in use in ph_list2 - confirm
90 PHONEME_LIST2 ph_list2[N_PHONEME_LIST]; // first stage of text->phonemes
91
92
93
94 wchar_t option_punctlist[N_PUNCTLIST]= {0};
95 char ctrl_embedded = '\001'; // to allow an alternative CTRL for embedded commands
96 int option_multibyte=espeakCHARS_AUTO; // 0=auto, 1=utf8, 2=8bit, 3=wchar, 4=16bit
97
98 // these are overridden by defaults set in the "speak" file
99 int option_linelength = 0;
100
// Storage for embedded commands, with separate write (embedded_ix) and
// read (embedded_read) positions.
// NOTE(review): the write/read roles are inferred from the names - confirm against the users of these variables.
101 #define N_EMBEDDED_LIST 250
102 static int embedded_ix;
103 static int embedded_read;
104 unsigned int embedded_list[N_EMBEDDED_LIST];
105
106 // the source text of a single clause (UTF8 bytes)
107 static char source[N_TR_SOURCE+40]; // extra space for embedded command & voice change info at end
108
109 int n_replace_phonemes;
110 REPLACE_PHONEMES replace_phonemes[N_REPLACE_PHONEMES];
111
112
113 // brackets, also 0x2014 to 0x201f which don't need to be in this list
// (IsBracket() tests that range directly before searching this list).
// The list is terminated by 0, as required by lookupwchar().
114 static const unsigned short brackets[] = {
115 '(',')','[',']','{','}','<','>','"','\'','`',
116 0xab,0xbb, // double angle brackets
117 0x300a,0x300b, // double angle brackets (ideograph)
118 0xe000+'<', // private usage area
119 0
120 };
121
122 // other characters which break a word, but don't produce a pause
// (0-terminated list)
123 static const unsigned short breaks[] = {'_', 0};
124
125 // treat these characters as spaces, in addition to iswspace()
126 // static const wchar_t chars_space[] = {0x2500,0x2501,0}; // box drawing horiz
127
128
129 // Translate character codes 0xA0 to 0xFF into their unicode values.
// Each charset table below has 0x60 entries; entry [n] is the Unicode value
// for character code 0xA0 + n (the row comments give the code of the first
// entry in each row).
130 // ISO_8859_1 is set as default
// In this table every code maps to the Unicode value equal to the code itself.
131 static const unsigned short ISO_8859_1[0x60] = {
132 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
133 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
134 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
135 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // b8
136 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
137 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
138 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
139 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // d8
140 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
141 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
142 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
143 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, // f8
144 };
145
// ISO 8859-2: Unicode values for character codes 0xA0 to 0xFF
// (entry [n] is for code 0xA0 + n).
146 static const unsigned short ISO_8859_2[0x60] = {
147 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, // a0
148 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, // a8
149 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, // b0
150 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, // b8
151 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, // c0
152 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, // c8
153 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, // d0
154 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, // d8
155 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, // e0
156 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, // e8
157 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, // f0
158 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, // f8
159 };
160
// ISO 8859-3: Unicode values for character codes 0xA0 to 0xFF
// (entry [n] is for code 0xA0 + n).
// NOTE(review): the 0x0000 entries presumably mark codes with no assigned character - confirm.
161 static const unsigned short ISO_8859_3[0x60] = {
162 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, 0x0000, 0x0124, 0x00a7, // a0
163 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, 0x0000, 0x017b, // a8
164 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, // b0
165 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, 0x0000, 0x017c, // b8
166 0x00c0, 0x00c1, 0x00c2, 0x0000, 0x00c4, 0x010a, 0x0108, 0x00c7, // c0
167 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
168 0x0000, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, // d0
169 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, // d8
170 0x00e0, 0x00e1, 0x00e2, 0x0000, 0x00e4, 0x010b, 0x0109, 0x00e7, // e0
171 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
172 0x0000, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, // f0
173 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, // f8
174 };
175
// ISO 8859-4: Unicode values for character codes 0xA0 to 0xFF
// (entry [n] is for code 0xA0 + n).
176 static const unsigned short ISO_8859_4[0x60] = {
177 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, // a0
178 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, // a8
179 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, // b0
180 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, // b8
181 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, // c0
182 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, // c8
183 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
184 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, // d8
185 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, // e0
186 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, // e8
187 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
188 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, // f8
189 };
190
// ISO 8859-5 (Cyrillic): Unicode values for character codes 0xA0 to 0xFF
// (entry [n] is for code 0xA0 + n).
191 static const unsigned short ISO_8859_5[0x60] = {
192 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, // a0 Cyrillic
193 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f, // a8
194 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, // b0
195 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, // b8
196 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, // c0
197 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, // c8
198 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, // d0
199 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, // d8
200 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, // e0
201 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, // e8
202 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, // f0
203 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f, // f8
204 };
205
// ISO 8859-7 (Greek): Unicode values for character codes 0xA0 to 0xFF
// (entry [n] is for code 0xA0 + n).
// NOTE(review): the 0x0000 entries presumably mark codes with no assigned character - confirm.
206 static const unsigned short ISO_8859_7[0x60] = {
207 0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, // a0 Greek
208 0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, 0x0000, 0x2015, // a8
209 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, // b0
210 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, // b8
211 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, // c0
212 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, // c8
213 0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, // d0
214 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, // d8
215 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, // e0
216 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, // e8
217 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, // f0
218 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, 0x0000, // f8
219 };
220
// ISO 8859-9: Unicode values for character codes 0xA0 to 0xFF
// (entry [n] is for code 0xA0 + n).
// Differs from the ISO_8859_1 table only at codes d0, dd, de, f0, fd and fe.
221 static const unsigned short ISO_8859_9[0x60] = {
222 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
223 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
224 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
225 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // b8
226 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
227 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
228 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
229 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, // d8
230 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
231 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
232 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
233 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff, // f8
234 };
235
// ISO 8859-14 (used here for Welsh): Unicode values for character codes
// 0xA0 to 0xFF (entry [n] is for code 0xA0 + n).
236 static const unsigned short ISO_8859_14[0x60] = {
237 0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, // a0 Welsh
238 0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, // a8
239 0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, // b0
240 0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, // b8
241 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
242 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
243 0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, // d0
244 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, // d8
245 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
246 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
247 0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, // f0
248 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff, // f8
249 };
250
// KOI8-R (Russian): Unicode values for character codes 0xA0 to 0xFF
// (entry [n] is for code 0xA0 + n). Codes a0-bf are mostly box-drawing
// characters (0x25xx); the Cyrillic letters occupy c0-ff.
251 static const unsigned short KOI8_R[0x60] = {
252 0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556, // a0 Russian
253 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, // a8
254 0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565, // b0
255 0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9, // b8
256 0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, // c0
257 0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, // c8
258 0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, // d0
259 0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, // d8
260 0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, // e0
261 0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, // e8
262 0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, // f0
263 0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a, // f8
264 };
265
// ISCII: Unicode values (Devanagari block, 0x09xx) for character codes
// 0xA0 to 0xFF (entry [n] is for code 0xA0 + n).
// Several codes are mapped to a plain space (0x20), and codes f1-fa map to
// the ASCII digits '0'-'9'.
266 static const unsigned short ISCII[0x60] = {
267 0x0020, 0x0901, 0x0902, 0x0903, 0x0905, 0x0906, 0x0907, 0x0908, // a0
268 0x0909, 0x090a, 0x090b, 0x090e, 0x090f, 0x0910, 0x090d, 0x0912, // a8
269 0x0913, 0x0914, 0x0911, 0x0915, 0x0916, 0x0917, 0x0918, 0x0919, // b0
270 0x091a, 0x091b, 0x091c, 0x091d, 0x091e, 0x091f, 0x0920, 0x0921, // b8
271 0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, 0x0928, 0x0929, // c0
272 0x092a, 0x092b, 0x092c, 0x092d, 0x092e, 0x092f, 0x095f, 0x0930, // c8
273 0x0931, 0x0932, 0x0933, 0x0934, 0x0935, 0x0936, 0x0937, 0x0938, // d0
274 0x0939, 0x0020, 0x093e, 0x093f, 0x0940, 0x0941, 0x0942, 0x0943, // d8
275 0x0946, 0x0947, 0x0948, 0x0945, 0x094a, 0x094b, 0x094c, 0x0949, // e0
276 0x094d, 0x093c, 0x0964, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // e8
277 0x0020, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, // f0
278 0x0037, 0x0038, 0x0039, 0x20, 0x20, 0x20, 0x20, 0x20, // f8
279 };
280
// Charset tables indexed by charset number. Index n selects the table for
// ISO 8859-n where one is defined above (2, 3, 4, 5, 7, 9, 14); index 18 is
// KOI8_R and index 19 is ISCII. All other slots use ISO_8859_1.
// NOTE(review): this initialiser has 20 entries; N_CHARSETS is defined elsewhere - confirm they agree.
281 const unsigned short *charsets[N_CHARSETS] = {
282 ISO_8859_1, // 0
283 ISO_8859_1, // 1
284 ISO_8859_2, // 2
285 ISO_8859_3, // 3
286 ISO_8859_4, // 4
287 ISO_8859_5, // 5
288 ISO_8859_1, // 6
289 ISO_8859_7, // 7
290 ISO_8859_1, // 8
291 ISO_8859_9, // 9
292 ISO_8859_1, // 10
293 ISO_8859_1, // 11
294 ISO_8859_1, // 12
295 ISO_8859_1, // 13
296 ISO_8859_14, // 14
297 ISO_8859_1, // 15
298 ISO_8859_1, // 16
299 ISO_8859_1, // 17
300 KOI8_R, // 18
301 ISCII // 19
302 };
303
304 // Tables of the relative lengths of vowels, depending on the
305 // type of the two phonemes that follow
306 // indexes are the "length_mod" value for the following phonemes
// Each table is 10 rows x 10 columns: the column is selected by the next
// phoneme and the row by the one after it ("next2"), as labelled below.
// The tenth row and column are spare.
// NOTE(review): values look like percentages (100 = no change) - confirm against the code that reads langopts.length_mods.
307
308 // use this table if vowel is not the last in the word
309 static unsigned char length_mods_en[100] = {
310 /* a , t s n d z r N <- next */
311 100,120,100,105,100,110,110,100, 95, 100, /* a <- next2 */
312 105,120,105,110,125,130,135,115,125, 100, /* , */
313 105,120, 75,100, 75,105,120, 85, 75, 100, /* t */
314 105,120, 85,105, 95,115,120,100, 95, 100, /* s */
315 110,120, 95,105,100,115,120,100,100, 100, /* n */
316 105,120,100,105, 95,115,120,110, 95, 100, /* d */
317 105,120,100,105,105,122,125,110,105, 100, /* z */
318 105,120,100,105,105,122,125,110,105, 100, /* r */
319 105,120, 95,105,100,115,120,110,100, 100, /* N */
320 100,120,100,100,100,100,100,100,100, 100
321 }; // SPARE (refers to the final row above)
322
323 // as above, but for the last syllable in a word
// Same 10 x 10 layout: column = next phoneme, row = the phoneme after that.
324 static unsigned char length_mods_en0[100] = {
325 /* a , t s n d z r N <- next */
326 100,150,100,105,110,115,110,110,110, 100, /* a <- next2 */
327 105,150,105,110,125,135,140,115,135, 100, /* , */
328 105,150, 90,105, 90,122,135,100, 90, 100, /* t */
329 105,150,100,105,100,122,135,100,100, 100, /* s */
330 105,150,100,105,105,115,135,110,105, 100, /* n */
331 105,150,100,105,105,122,130,120,125, 100, /* d */
332 105,150,100,105,110,122,125,115,110, 100, /* z */
333 105,150,100,105,105,122,135,120,105, 100, /* r */
334 105,150,100,105,105,115,135,110,105, 100, /* N */
335 100,100,100,100,100,100,100,100,100, 100
336 }; // SPARE (refers to the final row above)
337
338
// Nearly uniform length table: almost every entry is 110, with 120 in the
// ',' column and 100 in the 't' column (plus two 100s in the 't' row).
// Same 10 x 10 layout as length_mods_en.
339 static unsigned char length_mods_equal[100] = {
340 /* a , t s n d z r N <- next */
341 110,120,100,110,110,110,110,110,110, 110, /* a <- next2 */
342 110,120,100,110,110,110,110,110,110, 110, /* , */
343 110,120,100,110,100,110,110,110,100, 110, /* t */
344 110,120,100,110,110,110,110,110,110, 110, /* s */
345 110,120,100,110,110,110,110,110,110, 110, /* n */
346 110,120,100,110,110,110,110,110,110, 110, /* d */
347 110,120,100,110,110,110,110,110,110, 110, /* z */
348 110,120,100,110,110,110,110,110,110, 110, /* r */
349 110,120,100,110,110,110,110,110,110, 110, /* N */
350 110,120,100,110,110,110,110,110,110, 110
351 }; // SPARE (refers to the final row above)
352
353
// Length tables selectable by number; SetLengthMods() indexes this array
// with (value % 100) and, when non-zero, (value / 100).
354 static unsigned char *length_mod_tabs[6] = {
355 length_mods_en, // 0
356 length_mods_en, // 1
357 length_mods_en0, // 2
358 length_mods_equal, // 3
359 length_mods_equal, // 4
360 length_mods_equal // 5
361 };
362
363
void SetLengthMods(Translator *tr, int value)
{//==========================================
	// Select the vowel length tables for a translator.
	// value: two table numbers packed in decimal. (value % 100) selects the
	//        table stored in langopts.length_mods. (value / 100), if it is
	//        not zero, selects a different table for langopts.length_mods0;
	//        otherwise length_mods0 uses the same table as length_mods.
	// Both numbers are used directly as indexes into length_mod_tabs[].
	const int main_ix = value % 100;
	const int alt_ix = value / 100;
	unsigned char *main_tab = length_mod_tabs[main_ix];

	tr->langopts.length_mods = main_tab;
	tr->langopts.length_mods0 = (alt_ix != 0) ? length_mod_tabs[alt_ix] : main_tab;
}
374
375
376
int IsAlpha(unsigned int c)
{//========================
	// Replacement for iswalpha() which also accepts some in-word symbols
	// (vowel marks, combining accents, etc).
	// Returns 1 if c counts as alphabetic, otherwise 0.

	static const unsigned short extra_indic_alphas[] = {
		0xa70,0xa71,   // Gurmukhi: tippi, addak
		0
	};

	// Inclusive ranges, outside the Indic block, that are always accepted.
	static const unsigned short extra_ranges[][2] = {
		{0x0300, 0x036f},   // combining accents
		{0x05b0, 0x05c2},   // Hebrew vowel marks
		{0x0605, 0x0605},
		{0x064b, 0x065e},   // Arabic vowel marks
		{0x0670, 0x0670},   // Arabic vowel mark
		{0x0780, 0x07b1},   // Thaana/Divehi (Maldives)
		{0x0f40, 0x0fbc},   // Tibetan
		{0x1100, 0x11ff},   // Korean jamo
		{0x2800, 0x28ff},   // braille
		{0x3041, 0xa700},   // Chinese/Japanese. Should never be needed, but Mac OS 10.4's iswalpha seems to be broken, so just make sure
	};
	const unsigned int n_ranges = sizeof(extra_ranges) / sizeof(extra_ranges[0]);
	unsigned int ix;

	if(iswalpha2(c))
		return(1);

	if(c < 0x300)
		return(0);

	if((c >= 0x901) && (c <= 0xdf7))
	{
		// Indic scripts: Devanagari, Tamil, etc.
		// Accept the low part of each 128-character block, the extra
		// Gurmukhi signs, and the Malayalam chillu characters (0xd7a-0xd7f).
		return(((c & 0x7f) < 0x64)
		       || (lookupwchar(extra_indic_alphas, c) != 0)
		       || ((c >= 0xd7a) && (c <= 0xd7f)));
	}

	for(ix = 0; ix < n_ranges; ix++)
	{
		if((c >= extra_ranges[ix][0]) && (c <= extra_ranges[ix][1]))
			return(1);
	}
	return(0);
}
434
int IsDigit09(unsigned int c)
{//============================
	// Returns 1 only for the ASCII digits '0' to '9', otherwise 0.
	return((c >= '0') && (c <= '9'));
}
441
int IsDigit(unsigned int c)
{//========================
	// Returns 1 if c is a digit according to iswdigit(), or lies in the
	// range 0x966 to 0x96f (Devanagari digits); otherwise 0.
	const int in_0966_096f = (c >= 0x966) && (c <= 0x96f);

	return((iswdigit(c) || in_0966_096f) ? 1 : 0);
}
452
int IsSpace(unsigned int c)
{//========================
	// Like iswspace(), but additionally treats box drawing characters and
	// the unicode specials 0xfff9-0xffff as spaces. The NUL character is
	// never a space.
	// Returns 0 or 1 for the special cases, otherwise the iswspace() result.
	const int box_drawing = (c >= 0x2500) && (c < 0x25a0);
	const int unicode_special = (c >= 0xfff9) && (c <= 0xffff);

	if(c == 0)
		return(0);
	if(box_drawing || unicode_special)
		return(1);
	return(iswspace(c));
}
465
466
void DeleteTranslator(Translator *tr)
{//==================================
	// Release a translator together with its dictionary data
	// (tr->data_dictlist).
	// tr may be NULL, in which case nothing is done.
	if(tr == NULL)
		return;

	if(tr->data_dictlist != NULL)
		Free(tr->data_dictlist);
	Free(tr);
}
473
474
int lookupwchar(const unsigned short *list,int c)
{//==============================================
	// Is the character c in the list ?
	// list: array of character codes terminated by 0.
	// Returns the 1-based position of the first match, or 0 if c is not
	// in the list.
	const unsigned short *entry;

	for(entry = list; *entry != 0; entry++)
	{
		if(*entry == c)
			return((int)(entry - list) + 1);
	}
	return(0);
}
487
488
int lookupwchar2(const unsigned short *list,int c)
{//==============================================
	// Replace character c by another character.
	// list: pairs of (character, replacement), terminated by a 0 in the
	//       character position.
	// Returns the replacement stored for c, or 0 = not found.
	// A stored replacement of 1 means: delete the character.
	const unsigned short *pair = list;

	while(pair[0] != 0)
	{
		if(pair[0] == c)
			return(pair[1]);
		pair += 2;
	}
	return(0);
}
502
503
IsBracket(int c)504 int IsBracket(int c)
505 {//=================
506 if((c >= 0x2014) && (c <= 0x201f))
507 return(1);
508 return(lookupwchar(brackets,c));
509 }
510
511
int utf8_out(unsigned int c, char *buf)
{//====================================
	// write a unicode character into a buffer as utf8
	// returns the number of bytes written (1 to 4). No terminating zero
	// byte is written.
	// A code of 0x110000 or above is out of range and is written as a
	// single space.
	if(c < 0x80)
	{
		buf[0] = c;
		return(1);
	}

	if(c < 0x0800)
	{
		buf[0] = 0xc0 | (c >> 6);
		buf[1] = 0x80 + (c & 0x3f);
		return(2);
	}

	if(c < 0x10000)
	{
		buf[0] = 0xe0 | (c >> 12);
		buf[1] = 0x80 + ((c >> 6) & 0x3f);
		buf[2] = 0x80 + (c & 0x3f);
		return(3);
	}

	if(c < 0x110000)
	{
		buf[0] = 0xf0 | (c >> 18);
		buf[1] = 0x80 + ((c >> 12) & 0x3f);
		buf[2] = 0x80 + ((c >> 6) & 0x3f);
		buf[3] = 0x80 + (c & 0x3f);
		return(4);
	}

	buf[0] = ' ';   // out of range character code
	return(1);
}   // end of utf8_out
547
548
int utf8_nbytes(const char *buf)
{//=============================
	// Returns the number of bytes for the first UTF-8 character in buf,
	// judged only from its first byte:
	//   below 0x80 -> 1, below 0xe0 -> 2, below 0xf0 -> 3, otherwise 4.
	const unsigned char lead = (unsigned char)buf[0];

	if(lead >= 0xf0)
		return(4);
	if(lead >= 0xe0)
		return(3);
	if(lead >= 0x80)
		return(2);
	return(1);
}
561
562
int utf8_in2(int *c, const char *buf, int backwards)
{//=================================================
	// Read a unicode character from a UTF8 string.
	// c:         receives the decoded character code.
	// buf:       points into a zero-terminated UTF8 string.
	// backwards: set if we are moving backwards through the UTF8 string.
	// Returns the number of UTF8 bytes used by the character (not counting
	// any continuation bytes that were skipped to find its first byte).
	//
	// If the string ends in the middle of a multi-byte character, decoding
	// stops at the terminating zero byte and only the bytes actually
	// present are counted, so that a caller which advances by the returned
	// length never steps past the end of the string.
	static const unsigned char mask[4] = {0xff,0x1f,0x0f,0x07};
	int c1;
	int n_bytes = 0;
	int ix;

	// find the start of the next/previous character
	while((*buf & 0xc0) == 0x80)
	{
		// skip over non-initial bytes of a multi-byte utf8 character
		if(backwards)
			buf--;
		else
			buf++;
	}

	c1 = (unsigned char)*buf++;
	if(c1 & 0x80)
	{
		// the first byte indicates how many continuation bytes follow
		if((c1 & 0xe0) == 0xc0)
			n_bytes = 1;
		else if((c1 & 0xf0) == 0xe0)
			n_bytes = 2;
		else if((c1 & 0xf8) == 0xf0)
			n_bytes = 3;

		c1 &= mask[n_bytes];
		for(ix=0; ix<n_bytes; ix++)
		{
			if(*buf == 0)
				break;   // truncated character: don't read beyond the end of the string
			c1 = (c1 << 6) + (*buf++ & 0x3f);
		}
		n_bytes = ix;   // the number of continuation bytes actually read
	}
	*c = c1;
	return(n_bytes+1);
}
603
604
int utf8_in(int *c, const char *buf)
{//=================================
	// Read a unicode character from a UTF8 string, moving forwards.
	// c receives the character code.
	// Returns the number of UTF8 bytes used.
	const int forwards = 0;

	return(utf8_in2(c, buf, forwards));
}
611
612
char *strchr_w(const char *s, int c)
{//=================================
	// strchr() for a character code which may be outside the ASCII range.
	// Returns NULL for any non-ascii character (c >= 0x80), otherwise the
	// result of strchr(s, c).
	// The (char *) cast is needed for the Borland compiler.
	return((c < 0x80) ? strchr((char *)s,c) : NULL);
}
620
621
int IsAllUpper(const char *word)
{//=============================
	// Returns 1 if every character of the word, up to the first space
	// character or the end of the string, is upper case according to
	// iswupper2(). Returns 0 at the first character which is not.
	// An empty word gives 1.
	const char *p = word;
	int letter;

	for(;;)
	{
		if((*p == 0) || isspace2(*p))
			return(1);

		p += utf8_in(&letter, p);
		if(!iswupper2(letter))
			return(0);
	}
}
633
634
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word)
{//============================================================================================
	// Translate a word letter by letter, using TranslateLetter() for each
	// character up to the next space or the end of the string.
	// spell_word: above 1 = also speak the character code for unknown
	//             letters, above 2 = also speak 'capital'.
	// Returns a pointer to the character after the word, or NULL if a
	// switch to another language is needed; in that case the phonemes are
	// copied into word_phonemes.
	int letter_flags = 0;
	int n_letters = 0;
	int non_initial;

	if(spell_word > 1)
		letter_flags |= 4;   // speak character code for unknown letters
	if(spell_word > 2)
		letter_flags |= 2;   // speak 'capital'

	while((*word != 0) && (*word != ' '))
	{
		non_initial = (n_letters > 0) ? 1 : 0;
		word += TranslateLetter(tr, word, phonemes, letter_flags | non_initial);
		n_letters++;

		if(phonemes[0] == phonSWITCH)
		{
			// change to another language in order to translate this word
			strcpy(word_phonemes,phonemes);
			return(NULL);
		}
	}

	SetSpellingStress(tr,phonemes,spell_word,n_letters);
	return(word);
}  // end of SpeakIndividualLetters
661
662
663
static int CheckDottedAbbrev(char *word1, WORD_TAB *wtab)
{//=====================================================
	// Is word1 the start of a series of single letters separated by dots
	// (eg. "u.s.a.")?
	// On entry each letter is expected to be followed by: space, '.', space
	// and then the next letter.
	// word1: the text, which is modified in place when more than one letter
	//        is found: the collected letters are moved together at the start
	//        of word1 and the rest of the consumed text is filled with spaces.
	// wtab:  not used here.
	// Returns the number of letters collected. If this is more than 1 then
	// dictionary_skipwords is set to (count - 1)*2.
	//
	// The letters are collected in a fixed-size buffer. If a run of letters
	// would not fit, collection stops there, exactly as if the run had ended,
	// and the remaining text is left unchanged.
	int wc;
	int count = 0;
	int nbytes;
	int ok;
	int ix;
	char *word;
	char *wbuf;
	char word_buf[80];

	word = word1;
	wbuf = word_buf;

	for(;;)
	{
		ok = 0;
		nbytes = utf8_in(&wc, word);
		if((word[nbytes] == ' ') && IsAlpha(wc))
		{
			if(word[nbytes+1] == '.')
			{
				if(word[nbytes+2] == ' ')
					ok = 1;   // letter + dot, more may follow
				else if(word[nbytes+2] =='\'')
				{
					nbytes += 2;  // delete the final dot (eg. u.s.a.'s)
					ok = 2;
				}
			}
			else if(count > 0)
				ok = 2;   // a final letter which has no dot after it
		}

		if(ok == 0)
			break;

		if((int)(wbuf - word_buf) + nbytes > (int)sizeof(word_buf))
			break;   // no room left in word_buf: stop here rather than overflow it

		for(ix=0; ix < nbytes; ix++)
			*wbuf++ = word[ix];

		count++;

		if(ok == 2)
		{
			word += nbytes;
			break;
		}

		word += (nbytes + 3);   // skip the letter and the " . " after it
	}

	if(count > 1)
	{
		// replace the dotted letters by the letters alone, padded with spaces
		ix = wbuf - word_buf;
		memcpy(word1, word_buf, ix);
		while(&word1[ix] < word)
			word1[ix++] = ' ';
		dictionary_skipwords = (count - 1)*2;
	}
	return(count);
}  // end of CheckDottedAbbrev
726
727
728 extern char *phondata_ptr;
729
int ChangeEquivalentPhonemes(Translator *tr, int lang2, char *phonemes)
{//====================================================================
	// Replace phoneme codes produced with another language's phoneme table
	// by their equivalents for the original language.
	// tr: the original language
	// lang2: phoneme table number for the temporary language
	// phonemes: the phonemes to be replaced (zero terminated, changed in place)
	// Returns 1 if the phonemes were rewritten, 0 if the original language
	// has no equivalence table for lang2 (phonemes is then left unchanged).
	//
	// The result is built in a buffer of N_WORD_PHONEMES bytes. If the
	// replacements would not fit, the result is cut short at that point
	// rather than writing past the end of the buffer.

	int ix;
	int len;
	int space_left;
	char phon;
	char *p;
	unsigned char *pb;
	char *eqlist;
	char *p_out;
	char *p_in;
	int remove_stress = 0;
	char phonbuf[N_WORD_PHONEMES];

	// has a phoneme equivalence table been specified for this language pair?
	if((ix = phoneme_tab_list[tr->phoneme_tab_ix].equivalence_tables) == 0)
		return(0);

	pb = (unsigned char *)&phondata_ptr[ix];

	// step through the equivalence tables until the one for lang2 is found
	for(;;)
	{
		if(pb[0] == 0)
			return(0);   // table not found

		if(pb[0] == lang2)
			break;

		len = (pb[2] << 8) + pb[3];   // size of this table in words
		if(len == 0)
			return(0);   // a zero size would never advance: treat as table not found
		pb += (len * 4);
	}
	remove_stress = pb[1];

	if(option_phonemes == 2)
	{
		DecodePhonemes(phonemes, phonbuf);
		fprintf(f_trans,"(%s) %s  -> (%s) ", phoneme_tab_list[lang2].name, phonbuf, phoneme_tab_list[tr->phoneme_tab_ix].name);
	}

	p_in = phonemes;
	eqlist = (char *)&pb[8];
	p_out = phonbuf;

	while((phon = *p_in++) != 0)
	{
		if(remove_stress && ((phon & 0xff) < phonSTRESS_PREV))
			continue;  // remove stress marks

		// bytes still free in phonbuf, keeping one for the terminating zero
		space_left = (int)(sizeof(phonbuf) - 1) - (int)(p_out - phonbuf);

		// is there a translation for this phoneme code?
		// each entry is: phoneme code, replacement string, terminating zero
		p = eqlist;
		while(*p != 0)
		{
			len = strlen(&p[1]);
			if(*p == phon)
				break;
			p += (len + 2);
		}

		if(*p != 0)
		{
			// translation found, len is the length of the replacement
			if(len > space_left)
				break;   // no room for the replacement
			memcpy(p_out, &p[1], len);
			p_out += len;
		}
		else
		{
			// no translation found
			if(space_left < 1)
				break;   // no room
			*p_out++ = phon;
		}
	}
	*p_out = 0;

	if(remove_stress)
	{
		SetWordStress(tr, phonbuf, NULL, -1, 0);
	}

	strcpy(phonemes, phonbuf);

	if(option_phonemes == 2)
	{
		SelectPhonemeTable(tr->phoneme_tab_ix);
		DecodePhonemes(phonemes, phonbuf);
		fprintf(f_trans,"%s\n\n", phonbuf);
	}
	return(1);
}  // end of ChangeEquivalentPhonemes
817
818
819
820
TranslateWord(Translator * tr,char * word_start,int next_pause,WORD_TAB * wtab,char * word_out)821 int TranslateWord(Translator *tr, char *word_start, int next_pause, WORD_TAB *wtab, char *word_out)
822 {//==================================================================================================
823 // word1 is terminated by space (0x20) character
824
825 char *word1;
826 int word_length;
827 int ix;
828 char *p;
829 int pfix;
830 int n_chars;
831 unsigned int dictionary_flags[2];
832 unsigned int dictionary_flags2[2];
833 int end_type=0;
834 int end_type1=0;
835 int prefix_type=0;
836 int prefix_stress;
837 char *wordx;
838 char phonemes[N_WORD_PHONEMES];
839 char phonemes2[N_WORD_PHONEMES];
840 char prefix_phonemes[N_WORD_PHONEMES];
841 char unpron_phonemes[N_WORD_PHONEMES];
842 char end_phonemes[N_WORD_PHONEMES];
843 char end_phonemes2[N_WORD_PHONEMES];
844 char word_copy[N_WORD_BYTES];
845 char word_copy2[N_WORD_BYTES];
846 int word_copy_length;
847 char prefix_chars[0x3f + 2];
848 int found=0;
849 int end_flags;
850 int c_temp; // save a character byte while we temporarily replace it with space
851 int first_char;
852 int last_char = 0;
853 int add_plural_suffix = 0;
854 int prefix_flags = 0;
855 int more_suffixes;
856 int confirm_prefix;
857 int spell_word;
858 int stress_bits;
859 int emphasize_allcaps = 0;
860 int wflags;
861 int wmark;
862 int was_unpronouncable = 0;
863 int loopcount;
864 WORD_TAB wtab_null[8];
865
866 // translate these to get pronunciations of plural 's' suffix (different forms depending on
867 // the preceding letter
868 static char word_zz[4] = {0,'z','z',0};
869 static char word_iz[4] = {0,'i','z',0};
870 static char word_ss[4] = {0,'s','s',0};
871
872 if(wtab == NULL)
873 {
874 memset(wtab_null, 0, sizeof(wtab_null));
875 wtab = wtab_null;
876 }
877 wflags = wtab->flags;
878 wmark = wtab->wmark;
879
880 dictionary_flags[0] = 0;
881 dictionary_flags[1] = 0;
882 dictionary_flags2[0] = 0;
883 dictionary_flags2[1] = 0;
884 dictionary_skipwords = 0;
885
886 phonemes[0] = 0;
887 unpron_phonemes[0] = 0;
888 prefix_phonemes[0] = 0;
889 end_phonemes[0] = 0;
890
891 if(tr->data_dictlist == NULL)
892 {
893 // dictionary is not loaded
894 word_phonemes[0] = 0;
895 return(0);
896 }
897
898 // count the length of the word
899 word1 = word_start;
900 if(*word1 == ' ') word1++; // possibly a dot was replaced by space: $dot
901 wordx = word1;
902
903 utf8_in(&first_char,wordx);
904 word_length = 0;
905 while((*wordx != 0) && (*wordx != ' '))
906 {
907 wordx += utf8_in(&last_char,wordx);
908 word_length++;
909 }
910
911 word_copy_length = wordx - word_start;
912 if(word_copy_length >= N_WORD_BYTES)
913 word_copy_length = N_WORD_BYTES-1;
914 memcpy(word_copy2, word_start, word_copy_length);
915
916 spell_word = 0;
917
918 if((word_length == 1) && (wflags & FLAG_TRANSLATOR2))
919 {
920 // retranslating a 1-character word using a different language, say its name
921 utf8_in(&c_temp, wordx+1); // the next character
922 if(!IsAlpha(c_temp) || (AlphabetFromChar(last_char) != AlphabetFromChar(c_temp)))
923 spell_word = 1;
924 }
925
926 if(option_sayas == SAYAS_KEY)
927 {
928 if(word_length == 1)
929 spell_word = 4;
930 else
931 {
932 // is there a translation for this keyname ?
933 word1--;
934 *word1 = '_'; // prefix keyname with '_'
935 found = LookupDictList(tr, &word1, phonemes, dictionary_flags, 0, wtab);
936 }
937 }
938
939 // try an initial lookup in the dictionary list, we may find a pronunciation specified, or
940 // we may just find some flags
941 if(option_sayas & 0x10)
942 {
943 // SAYAS_CHAR, SAYAS_GYLPH, or SAYAS_SINGLE_CHAR
944 spell_word = option_sayas & 0xf; // 2,3,4
945 }
946 else
947 {
948 if(!found)
949 found = LookupDictList(tr, &word1, phonemes, dictionary_flags, FLAG_ALLOW_TEXTMODE, wtab); // the original word
950
951
952 if((dictionary_flags[0] & (FLAG_ALLOW_DOT || FLAG_NEEDS_DOT)) && (wordx[1] == '.'))
953 {
954 wordx[1] = ' '; // remove a Dot after this word
955 }
956
957 if(dictionary_flags[0] & FLAG_TEXTMODE)
958 {
959 if(word_out != NULL)
960 strcpy(word_out, word1);
961
962 first_char = word1[0];
963 stress_bits = dictionary_flags[0] & 0x7f;
964 found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, 0, wtab); // the text replacement
965 if(dictionary_flags2[0]!=0)
966 {
967 dictionary_flags[0] = dictionary_flags2[0];
968 dictionary_flags[1] = dictionary_flags2[1];
969 if(stress_bits != 0)
970 {
971 // keep any stress information from the original word
972 dictionary_flags[0] = (dictionary_flags[0] & ~0x7f) | stress_bits;
973 }
974 }
975 }
976 else if((found==0) && (dictionary_flags[0] & FLAG_SKIPWORDS) && !(dictionary_flags[0] & FLAG_ABBREV))
977 {
978 // grouped words, but no translation. Join the words with hyphens.
979 wordx = word1;
980 ix = 0;
981 while(ix < dictionary_skipwords)
982 {
983 if(*wordx == ' ')
984 {
985 *wordx = '-';
986 ix++;
987 }
988 wordx++;
989 }
990 }
991
992 if((word_length == 1) && (dictionary_skipwords == 0))
993 {
994 // is this a series of single letters separated by dots?
995 if(CheckDottedAbbrev(word1, wtab))
996 {
997 dictionary_flags[0] = 0;
998 dictionary_flags[1] = 0;
999 spell_word = 1;
1000 if(dictionary_skipwords)
1001 dictionary_flags[0] = FLAG_SKIPWORDS;
1002 }
1003 }
1004
1005
1006 // if textmode, LookupDictList() replaces word1 by the new text and returns found=0
1007
1008 if(phonemes[0] == phonSWITCH)
1009 {
1010 // change to another language in order to translate this word
1011 strcpy(word_phonemes,phonemes);
1012 return(0);
1013 }
1014
1015 if((wmark > 0) && (wmark < 8))
1016 {
1017 // the stressed syllable has been specified in the text (TESTING)
1018 dictionary_flags[0] = (dictionary_flags[0] & ~0xf) | wmark;
1019 }
1020
1021 if(!found && (dictionary_flags[0] & FLAG_ABBREV))
1022 {
1023 // the word has $abbrev flag, but no pronunciation specified. Speak as individual letters
1024 spell_word = 1;
1025 }
1026
1027 if(!found && iswdigit(first_char))
1028 {
1029 Lookup(tr,"_0lang",word_phonemes);
1030 if(word_phonemes[0] == phonSWITCH)
1031 return(0);
1032
1033 if((tr->langopts.numbers2 & NUM2_ENGLISH_NUMERALS) && !(wtab->flags & FLAG_CHAR_REPLACED))
1034 {
1035 // for this language, speak English numerals (0-9) with the English voice
1036 sprintf(word_phonemes,"%c",phonSWITCH);
1037 return(0);
1038 }
1039
1040 found = TranslateNumber(tr, word1, phonemes, dictionary_flags, wtab, 0);
1041 }
1042
1043 if(!found && ((wflags & FLAG_UPPERS) != FLAG_FIRST_UPPER))
1044 {
1045 // either all upper or all lower case
1046
1047 if((tr->langopts.numbers & NUM_ROMAN) || ((tr->langopts.numbers & NUM_ROMAN_CAPITALS) && (wflags & FLAG_ALL_UPPER)))
1048 {
1049 if((wflags & FLAG_LAST_WORD) || !(wtab[1].flags & FLAG_NOSPACE))
1050 {
1051 // don't use Roman number if this word is not separated from the next word (eg. "XLTest")
1052 if((found = TranslateRoman(tr, word1, phonemes, wtab)) != 0)
1053 dictionary_flags[0] |= FLAG_ABBREV; // prevent emphasis if capitals
1054 }
1055 }
1056 }
1057
1058 if((wflags & FLAG_ALL_UPPER) && (word_length > 1)&& iswalpha2(first_char))
1059 {
1060 if((option_tone_flags & OPTION_EMPHASIZE_ALLCAPS) && !(dictionary_flags[0] & FLAG_ABBREV))
1061 {
1062 // emphasize words which are in capitals
1063 emphasize_allcaps = FLAG_EMPHASIZED;
1064 }
1065 else if(!found && !(dictionary_flags[0] & FLAG_SKIPWORDS) && (word_length<4) && (tr->clause_lower_count > 3)
1066 && (tr->clause_upper_count <= tr->clause_lower_count))
1067 {
1068 // An upper case word in a lower case clause. This could be an abbreviation.
1069 spell_word = 1;
1070 }
1071 }
1072 }
1073
1074 if(spell_word > 0)
1075 {
1076 // Speak as individual letters
1077 phonemes[0] = 0;
1078 end_type = 0;
1079
1080 if(SpeakIndividualLetters(tr, word1, phonemes, spell_word) == NULL)
1081 {
1082 if(word_length > 1)
1083 return(FLAG_SPELLWORD); // a mixture of languages, retranslate as individual letters, separated by spaces
1084 if(phonemes[0] == phonSWITCH)
1085 {
1086 // problem with espeak -vbg "b.c.d.e.f"
1087 }
1088 return(0);
1089 }
1090 strcpy(word_phonemes, phonemes);
1091 if(wflags & FLAG_TRANSLATOR2)
1092 return(0);
1093 return(dictionary_flags[0] & FLAG_SKIPWORDS); // for "b.c.d"
1094 }
1095 else if(found == 0)
1096 {
1097 int posn;
1098 int non_initial;
1099 int length;
1100 // word's pronunciation is not given in the dictionary list, although
1101 // dictionary_flags may have ben set there
1102
1103 posn = 0;
1104 non_initial = 0;
1105 length = 999;
1106 wordx = word1;
1107
1108 while(((length < 3) && (length > 0))|| (word_length > 1 && Unpronouncable(tr, wordx, posn)))
1109 {
1110 // This word looks "unpronouncable", so speak letters individually until we
1111 // find a remainder that we can pronounce.
1112 was_unpronouncable = FLAG_WAS_UNPRONOUNCABLE;
1113 emphasize_allcaps = 0;
1114
1115 if(wordx[0] == '\'')
1116 break;
1117
1118 if(posn > 0)
1119 non_initial = 1;
1120
1121 wordx += TranslateLetter(tr, wordx, unpron_phonemes, non_initial);
1122 posn++;
1123 if(unpron_phonemes[0] == phonSWITCH)
1124 {
1125 // change to another language in order to translate this word
1126 strcpy(word_phonemes,unpron_phonemes);
1127 if(strcmp(&unpron_phonemes[1],"en")==0)
1128 return(FLAG_SPELLWORD); // _^_en must have been set in TranslateLetter(), not *_rules which uses only _^_
1129 return(0);
1130 }
1131
1132 #ifdef deleted
1133 p = &wordx[word_length-3]; // this looks wrong. Doesn't consider multi-byte chars.
1134 if(memcmp(p,"'s ",3) == 0)
1135 {
1136 // remove a 's suffix and pronounce this separately (not as an individual letter)
1137 add_plural_suffix = 1;
1138 p[0] = ' ';
1139 p[1] = ' ';
1140 last_char = p[-1];
1141 }
1142 #endif
1143 length=0;
1144 while(wordx[length] != ' ') length++;
1145 }
1146 SetSpellingStress(tr,unpron_phonemes,0,posn);
1147
1148 // anything left ?
1149 if(*wordx != ' ')
1150 {
1151 if((unpron_phonemes[0] != 0) && (wordx[0] != '\''))
1152 {
1153 // letters which have been spoken individually from affecting the pronunciation of the pronuncable part
1154 wordx[-1] = ' ';
1155 }
1156
1157 // Translate the stem
1158 end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
1159
1160 if(phonemes[0] == phonSWITCH)
1161 {
1162 // change to another language in order to translate this word
1163 strcpy(word_phonemes,phonemes);
1164 return(0);
1165 }
1166
1167 #ifdef deleted
1168 // ?? allow $unpr while translating rules, not just on initial FLAG_UNPRON_TEST
1169 if(end_type & SUFX_UNPRON)
1170 {
1171 phonemes[0] = 0; // discard and retranslate as individual letters
1172 SpeakIndividualLetters(tr, wordx, phonemes, 0);
1173 strcpy(word_phonemes, phonemes);
1174 return(0);
1175 }
1176 #endif
1177
1178 if((phonemes[0] == 0) && (end_phonemes[0] == 0))
1179 {
1180 int wc;
1181 // characters not recognised, speak them individually
1182 // ?? should we say super/sub-script numbers and letters here?
1183 utf8_in(&wc, wordx);
1184 if((word_length == 1) && (IsAlpha(wc) || IsSuperscript(wc)))
1185 {
1186 if((wordx = SpeakIndividualLetters(tr, wordx, phonemes, spell_word)) == NULL)
1187 {
1188 return(0);
1189 }
1190 strcpy(word_phonemes, phonemes);
1191 return(0);
1192 }
1193 }
1194
1195 c_temp = wordx[-1];
1196
1197 found = 0;
1198 confirm_prefix = 1;
1199 for (loopcount = 0; (loopcount < 50) && (end_type & SUFX_P); loopcount++)
1200 {
1201 // Found a standard prefix, remove it and retranslate
1202 // loopcount guards against an endless loop
1203 if(confirm_prefix && !(end_type & SUFX_B))
1204 {
1205 int end2;
1206 char end_phonemes2[N_WORD_PHONEMES];
1207
1208 // remove any standard suffix and confirm that the prefix is still recognised
1209 phonemes2[0] = 0;
1210 end2 = TranslateRules(tr, wordx, phonemes2, N_WORD_PHONEMES, end_phonemes2, wflags|FLAG_NO_PREFIX|FLAG_NO_TRACE, dictionary_flags);
1211 if(end2)
1212 {
1213 RemoveEnding(tr, wordx, end2, word_copy);
1214 end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags|FLAG_NO_TRACE, dictionary_flags);
1215 memcpy(wordx,word_copy,strlen(word_copy));
1216 if((end_type & SUFX_P) == 0)
1217 {
1218 // after removing the suffix, the prefix is no longer recognised.
1219 // Keep the suffix, but don't use the prefix
1220 end_type = end2;
1221 strcpy(phonemes,phonemes2);
1222 strcpy(end_phonemes,end_phonemes2);
1223 if(option_phonemes == 2)
1224 {
1225 DecodePhonemes(end_phonemes,end_phonemes2);
1226 fprintf(f_trans," suffix [%s]\n\n",end_phonemes2);
1227 }
1228 }
1229 confirm_prefix = 0;
1230 continue;
1231 }
1232 }
1233
1234 prefix_type = end_type;
1235
1236 if(prefix_type & SUFX_V)
1237 {
1238 tr->expect_verb = 1; // use the verb form of the word
1239 }
1240
1241 wordx[-1] = c_temp;
1242
1243 if((prefix_type & SUFX_B) == 0)
1244 {
1245 for(ix=(prefix_type & 0xf); ix>0; ix--) // num. of characters to remove
1246 {
1247 wordx++;
1248 while((*wordx & 0xc0) == 0x80) wordx++; // for multibyte characters
1249 }
1250 }
1251 else
1252 {
1253 pfix = 1;
1254 prefix_chars[0] = 0;
1255 n_chars = prefix_type & 0x3f;
1256
1257 for(ix=0; ix < n_chars; ix++) // num. of bytes to remove
1258 {
1259 prefix_chars[pfix++] = *wordx++;
1260
1261 if((prefix_type & SUFX_B) && (ix == (n_chars-1)))
1262 {
1263 prefix_chars[pfix-1] = 0; // discard the last character of the prefix, this is the separator character
1264 }
1265 }
1266 prefix_chars[pfix] = 0;
1267 }
1268 c_temp = wordx[-1];
1269 wordx[-1] = ' ';
1270 confirm_prefix = 1;
1271 wflags |= FLAG_PREFIX_REMOVED;
1272
1273 if(prefix_type & SUFX_B)
1274 {
1275 // SUFX_B is used for Turkish, tr_rules contains " ' (Pb"
1276 // examine the prefix part
1277 char *wordpf;
1278 char prefix_phonemes2[12];
1279
1280 strncpy0(prefix_phonemes2,end_phonemes,sizeof(prefix_phonemes2));
1281 wordpf = &prefix_chars[1];
1282 strcpy(prefix_phonemes, phonemes);
1283
1284 // look for stress marker or $abbrev
1285 found = LookupDictList(tr, &wordpf, phonemes, dictionary_flags, 0, wtab);
1286 if(found)
1287 {
1288 strcpy(prefix_phonemes, phonemes);
1289 }
1290 if(dictionary_flags[0] & FLAG_ABBREV)
1291 {
1292 prefix_phonemes[0] = 0;
1293 SpeakIndividualLetters(tr, wordpf, prefix_phonemes, 1);
1294 }
1295 }
1296 else
1297 {
1298 strcat(prefix_phonemes,end_phonemes);
1299 }
1300 end_phonemes[0] = 0;
1301
1302 end_type = 0;
1303 found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, SUFX_P, wtab); // without prefix
1304 if(dictionary_flags[0]==0)
1305 {
1306 dictionary_flags[0] = dictionary_flags2[0];
1307 dictionary_flags[1] = dictionary_flags2[1];
1308 }
1309 else
1310 prefix_flags = 1;
1311 if(found == 0)
1312 {
1313 end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags & (FLAG_HYPHEN_AFTER | FLAG_PREFIX_REMOVED), dictionary_flags);
1314
1315 if(phonemes[0] == phonSWITCH)
1316 {
1317 // change to another language in order to translate this word
1318 wordx[-1] = c_temp;
1319 strcpy(word_phonemes,phonemes);
1320 return(0);
1321 }
1322 }
1323 }
1324
1325
1326
1327
1328 if((end_type != 0) && !(end_type & SUFX_P))
1329 {
1330 end_type1 = end_type;
1331 strcpy(phonemes2,phonemes);
1332
1333 // The word has a standard ending, re-translate without this ending
1334 end_flags = RemoveEnding(tr, wordx, end_type, word_copy);
1335 more_suffixes = 1;
1336
1337 while(more_suffixes)
1338 {
1339 more_suffixes = 0;
1340 phonemes[0] = 0;
1341
1342 if(prefix_phonemes[0] != 0)
1343 {
1344 // lookup the stem without the prefix removed
1345 wordx[-1] = c_temp;
1346 found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab); // include prefix, but not suffix
1347 wordx[-1] = ' ';
1348 if(phonemes[0] == phonSWITCH)
1349 {
1350 // change to another language in order to translate this word
1351 memcpy(wordx,word_copy,strlen(word_copy));
1352 strcpy(word_phonemes,phonemes);
1353 return(0);
1354 }
1355 if(dictionary_flags[0]==0)
1356 {
1357 dictionary_flags[0] = dictionary_flags2[0];
1358 dictionary_flags[1] = dictionary_flags2[1];
1359 }
1360 if(found)
1361 prefix_phonemes[0] = 0; // matched whole word, don't need prefix now
1362
1363 if((found==0) && (dictionary_flags2[0] != 0))
1364 prefix_flags = 1;
1365 }
1366 if(found == 0)
1367 {
1368 found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, end_flags, wtab); // without prefix and suffix
1369 if(phonemes[0] == phonSWITCH)
1370 {
1371 // change to another language in order to translate this word
1372 memcpy(wordx,word_copy,strlen(word_copy));
1373 strcpy(word_phonemes,phonemes);
1374 return(0);
1375 }
1376
1377 if(dictionary_flags2[0] & FLAG_ABBREV)
1378 {
1379 // Removing the suffix leaves a word which should be spoken as individual letters
1380 // Not yet implemented
1381 }
1382 if(dictionary_flags[0]==0)
1383 {
1384 dictionary_flags[0] = dictionary_flags2[0];
1385 dictionary_flags[1] = dictionary_flags2[1];
1386 }
1387 }
1388 if(found == 0)
1389 {
1390 if(end_type & SUFX_Q)
1391 {
1392 // don't retranslate, use the original lookup result
1393 strcpy(phonemes,phonemes2);
1394 }
1395 else
1396 {
1397 if(end_flags & FLAG_SUFX)
1398 wflags |= FLAG_SUFFIX_REMOVED;
1399 if(end_type & SUFX_A)
1400 wflags |= FLAG_SUFFIX_VOWEL;
1401
1402 if(end_type & SUFX_M)
1403 {
1404 // allow more suffixes before this suffix
1405 strcpy(end_phonemes2, end_phonemes);
1406 end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
1407 strcat(end_phonemes, end_phonemes2); // add the phonemes for the previous suffixes after this one
1408
1409 if((end_type != 0) && !(end_type & SUFX_P))
1410 {
1411 // there is another suffix
1412 end_flags = RemoveEnding(tr, wordx, end_type, NULL);
1413 more_suffixes = 1;
1414 }
1415 }
1416 else
1417 {
1418 // don't remove any previous suffix
1419 TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, NULL, wflags, dictionary_flags);
1420 end_type = 0;
1421 }
1422
1423 if(phonemes[0] == phonSWITCH)
1424 {
1425 // change to another language in order to translate this word
1426 strcpy(word_phonemes,phonemes);
1427 memcpy(wordx,word_copy,strlen(word_copy));
1428 wordx[-1] = c_temp;
1429 return(0);
1430 }
1431 }
1432 }
1433 }
1434
1435
1436 if((end_type1 & SUFX_T) == 0)
1437 {
1438 // the default is to add the suffix and then determine the word's stress pattern
1439 AppendPhonemes(tr,phonemes, N_WORD_PHONEMES, end_phonemes);
1440 end_phonemes[0] = 0;
1441 }
1442 memcpy(wordx,word_copy,strlen(word_copy));
1443 }
1444
1445
1446
1447
1448 wordx[-1] = c_temp;
1449 }
1450 }
1451
1452 if((add_plural_suffix) || (wflags & FLAG_HAS_PLURAL))
1453 {
1454 // s or 's suffix, append [s], [z] or [Iz] depending on previous letter
1455 if(last_char == 'f')
1456 TranslateRules(tr, &word_ss[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1457 else if((last_char==0) || (strchr_w("hsx",last_char)==NULL))
1458 TranslateRules(tr, &word_zz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1459 else
1460 TranslateRules(tr, &word_iz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1461 }
1462
1463 wflags |= emphasize_allcaps;
1464
1465
1466 /* determine stress pattern for this word */
1467 /******************************************/
1468 prefix_stress = 0;
1469 for(p = prefix_phonemes; *p != 0; p++)
1470 {
1471 if((*p == phonSTRESS_P) || (*p == phonSTRESS_P2))
1472 {
1473 prefix_stress = *p;
1474 }
1475 }
1476 if(prefix_flags || (prefix_stress != 0))
1477 {
1478 if((tr->langopts.param[LOPT_PREFIXES]) || (prefix_type & SUFX_T))
1479 {
1480 char *p;
1481 // German, keep a secondary stress on the stem
1482 SetWordStress(tr, phonemes, dictionary_flags, 3, 0);
1483
1484 // reduce all but the first primary stress
1485 ix=0;
1486 for(p=prefix_phonemes; *p != 0; p++)
1487 {
1488 if(*p == phonSTRESS_P)
1489 {
1490 if(ix==0)
1491 ix=1;
1492 else
1493 *p = phonSTRESS_3;
1494 }
1495 }
1496 #ifdef PLATFORM_WINDOWS
1497 sprintf(word_phonemes, "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1498 #else
1499 snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1500 #endif
1501 word_phonemes[N_WORD_PHONEMES-1] = 0;
1502 SetWordStress(tr, word_phonemes, dictionary_flags, -1, 0);
1503 }
1504 else
1505 {
1506 // stress position affects the whole word, including prefix
1507 #ifdef PLATFORM_WINDOWS
1508 sprintf(word_phonemes, "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1509 #else
1510 snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1511 #endif
1512 word_phonemes[N_WORD_PHONEMES-1] = 0;
1513 SetWordStress(tr, word_phonemes, dictionary_flags, -1, 0);
1514 }
1515 }
1516 else
1517 {
1518 if(prefix_phonemes[0] == 0)
1519 SetWordStress(tr, phonemes, dictionary_flags, -1, 0);
1520 else
1521 SetWordStress(tr, phonemes, dictionary_flags, -1, 0);
1522 #ifdef PLATFORM_WINDOWS
1523 sprintf(word_phonemes, "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1524 #else
1525 snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1526 #endif
1527 word_phonemes[N_WORD_PHONEMES-1] = 0;
1528 }
1529
1530 if(end_phonemes[0] != 0)
1531 {
1532 // a suffix had the SUFX_T option set, add the suffix after the stress pattern has been determined
1533 ix = strlen(word_phonemes);
1534 end_phonemes[N_WORD_PHONEMES-1-ix] = 0; // ensure no buffer overflow
1535 strcpy(&word_phonemes[ix], end_phonemes);
1536 }
1537
1538 if(wflags & FLAG_LAST_WORD)
1539 {
1540 // don't use $brk pause before the last word of a sentence
1541 // (but allow it for emphasis, see below
1542 dictionary_flags[0] &= ~FLAG_PAUSE1;
1543 }
1544
1545 #ifdef deleted
1546 // but it causes problems if these are not a person name
1547 if(tr->translator_name == L('h','u'))
1548 {
1549 // lang=hu, If the last two words of a clause have capital letters (eg. a person name), unstress the last word.
1550 if((wflags & (FLAG_LAST_WORD | FLAG_FIRST_UPPER | FLAG_ALL_UPPER | FLAG_FIRST_WORD)) == (FLAG_LAST_WORD | FLAG_FIRST_UPPER))
1551 {
1552 if(((wtab[-1].flags & (FLAG_FIRST_UPPER | FLAG_ALL_UPPER)) == FLAG_FIRST_UPPER) && ((tr->clause_terminator != 0x90028) || (wflags & FLAG_HAS_DOT)))
1553 {
1554 ChangeWordStress(tr,word_phonemes,3);
1555 }
1556 }
1557 }
1558 #endif
1559
1560 if((wflags & FLAG_HYPHEN) && (tr->langopts.stress_flags & S_HYPEN_UNSTRESS))
1561 {
1562 ChangeWordStress(tr,word_phonemes,3);
1563 }
1564 else if(wflags & FLAG_EMPHASIZED2)
1565 {
1566 // A word is indicated in the source text as stressed
1567 // Give it stress level 6 (for the intonation module)
1568 ChangeWordStress(tr,word_phonemes,6);
1569
1570 if(wflags & FLAG_EMPHASIZED)
1571 dictionary_flags[0] |= FLAG_PAUSE1; // precede by short pause
1572 }
1573 else if(wtab[dictionary_skipwords].flags & FLAG_LAST_WORD)
1574 {
1575 // the word has attribute to stress or unstress when at end of clause
1576 if(dictionary_flags[0] & (FLAG_STRESS_END | FLAG_STRESS_END2))
1577 ChangeWordStress(tr,word_phonemes,4);
1578 else if((dictionary_flags[0] & FLAG_UNSTRESS_END) && (any_stressed_words))
1579 ChangeWordStress(tr,word_phonemes,3);
1580 }
1581
1582
1583 // dictionary flags for this word give a clue about which alternative pronunciations of
1584 // following words to use.
1585 if(end_type1 & SUFX_F)
1586 {
1587 // expect a verb form, with or without -s suffix
1588 tr->expect_verb = 2;
1589 tr->expect_verb_s = 2;
1590 }
1591
1592 if(dictionary_flags[1] & FLAG_PASTF)
1593 {
1594 /* expect perfect tense in next two words */
1595 tr->expect_past = 3;
1596 tr->expect_verb = 0;
1597 tr->expect_noun = 0;
1598 }
1599 else if(dictionary_flags[1] & FLAG_VERBF)
1600 {
1601 /* expect a verb in the next word */
1602 tr->expect_verb = 2;
1603 tr->expect_verb_s = 0; /* verb won't have -s suffix */
1604 tr->expect_noun = 0;
1605 }
1606 else if(dictionary_flags[1] & FLAG_VERBSF)
1607 {
1608 // expect a verb, must have a -s suffix
1609 tr->expect_verb = 0;
1610 tr->expect_verb_s = 2;
1611 tr->expect_past = 0;
1612 tr->expect_noun = 0;
1613 }
1614 else if(dictionary_flags[1] & FLAG_NOUNF)
1615 {
1616 /* not expecting a verb next */
1617 tr->expect_noun = 2;
1618 tr->expect_verb = 0;
1619 tr->expect_verb_s = 0;
1620 tr->expect_past = 0;
1621 }
1622
1623 if((wordx[0] != 0) && (!(dictionary_flags[1] & FLAG_VERB_EXT)))
1624 {
1625 if(tr->expect_verb > 0)
1626 tr->expect_verb--;
1627
1628 if(tr->expect_verb_s > 0)
1629 tr->expect_verb_s--;
1630
1631 if(tr->expect_noun >0)
1632 tr->expect_noun--;
1633
1634 if(tr->expect_past > 0)
1635 tr->expect_past--;
1636 }
1637
1638 if((word_length == 1) && (tr->translator_name == L('e','n')) && iswalpha2(first_char) && (first_char != 'i'))
1639 {
1640 // English Specific !!!!
1641 // any single letter before a dot is an abbreviation, except 'I'
1642 dictionary_flags[0] |= FLAG_ALLOW_DOT;
1643 }
1644
1645 if((tr->langopts.param[LOPT_ALT] & 2) && ((dictionary_flags[0] & (FLAG_ALT_TRANS | FLAG_ALT2_TRANS)) != 0))
1646 {
1647 ApplySpecialAttribute2(tr,word_phonemes,dictionary_flags[0]);
1648 }
1649
1650 dictionary_flags[0] |= was_unpronouncable;
1651 memcpy(word_start, word_copy2, word_copy_length);
1652 return(dictionary_flags[0]);
1653 } // end of TranslateWord
1654
1655
1656
static void SetPlist2(PHONEME_LIST2 *p, unsigned char phcode)
{//==========================================================
	// Fill in the phoneme list entry *p for phoneme code 'phcode'.
	// The stress level, tone phoneme and source index are zeroed.
	// Any pending embedded-command flag (embedded_flag) is handed over to the
	// entry's synthflags, and the pending flag is then cleared so that it is
	// attached to this one entry only.
	// No other fields of *p are written here.

	// take the pending flag first, it is reset at the end
	p->synthflags = embedded_flag;

	p->sourceix = 0;
	p->tone_ph = 0;
	p->stresslevel = 0;
	p->phcode = phcode;

	embedded_flag = 0;
}
1666
CountSyllables(unsigned char * phonemes)1667 static int CountSyllables(unsigned char *phonemes)
1668 {//===============================================
1669 int count = 0;
1670 int phon;
1671 while((phon = *phonemes++) != 0)
1672 {
1673 if(phoneme_tab[phon]->type == phVOWEL)
1674 count++;
1675 }
1676 return(count);
1677 }
1678
1679
Word_EmbeddedCmd()1680 void Word_EmbeddedCmd()
1681 {//====================
1682 // Process embedded commands for emphasis, sayas, and break
1683 int embedded_cmd;
1684 int value;
1685
1686 do
1687 {
1688 embedded_cmd = embedded_list[embedded_read++];
1689 value = embedded_cmd >> 8;
1690
1691 switch(embedded_cmd & 0x1f)
1692 {
1693 case EMBED_Y:
1694 option_sayas = value;
1695 break;
1696
1697 case EMBED_F:
1698 option_emphasis = value;
1699 break;
1700
1701 case EMBED_B:
1702 // break command
1703 if(value == 0)
1704 pre_pause = 0; // break=none
1705 else
1706 pre_pause += value;
1707 break;
1708 }
1709 } while(((embedded_cmd & 0x80) == 0) && (embedded_read < embedded_ix));
1710 } // end of Word_EmbeddedCmd
1711
1712
SetTranslator2(const char * new_language)1713 int SetTranslator2(const char *new_language)
1714 {//=========================================
1715 // Set translator2 to a second language
1716 int new_phoneme_tab;
1717 const char *new_phtab_name;
1718 int bitmap;
1719 int dialect = 0;
1720
1721 new_phtab_name = new_language;
1722 if((bitmap = translator->langopts.dict_dialect) != 0)
1723 {
1724 if((bitmap & (1 << DICTDIALECT_EN_US)) && (strcmp(new_language, "en") == 0))
1725 {
1726 new_phtab_name = "en-us";
1727 dialect = DICTDIALECT_EN_US;
1728 }
1729 if((bitmap & (1 << DICTDIALECT_ES_LA)) && (strcmp(new_language, "es") == 0))
1730 {
1731 new_phtab_name = "es-la";
1732 dialect = DICTDIALECT_ES_LA;
1733 }
1734 }
1735
1736 if((new_phoneme_tab = SelectPhonemeTableName(new_phtab_name)) >= 0)
1737 {
1738 if((translator2 != NULL) && (strcmp(new_language,translator2_language) != 0))
1739 {
1740 // we already have an alternative translator, but not for the required language, delete it
1741 DeleteTranslator(translator2);
1742 translator2 = NULL;
1743 }
1744
1745 if(translator2 == NULL)
1746 {
1747 translator2 = SelectTranslator(new_language);
1748 strcpy(translator2_language,new_language);
1749
1750 if(LoadDictionary(translator2, translator2->dictionary_name, 0) != 0)
1751 {
1752 SelectPhonemeTable(voice->phoneme_tab_ix); // revert to original phoneme table
1753 new_phoneme_tab = -1;
1754 translator2_language[0] = 0;
1755 }
1756 else
1757 {
1758 if(dialect == DICTDIALECT_EN_US)
1759 {
1760 // en-us
1761 translator2->dict_condition = 0x48; // bits 3, 6
1762 translator2->langopts.param[LOPT_REDUCE_T] = 1;
1763 }
1764 if(dialect == DICTDIALECT_ES_LA)
1765 {
1766 translator2->dict_condition = 0x04; // bit 2
1767 }
1768 }
1769 translator2->phoneme_tab_ix = new_phoneme_tab;
1770 }
1771 }
1772 if(translator2 != NULL)
1773 translator2->phonemes_repeat[0] = 0;
1774 return(new_phoneme_tab);
1775 } // end of SetTranslator2
1776
1777
1778
TranslateWord2(Translator * tr,char * word,WORD_TAB * wtab,int pre_pause,int next_pause)1779 static int TranslateWord2(Translator *tr, char *word, WORD_TAB *wtab, int pre_pause, int next_pause)
1780 {//=================================================================================================
1781 int flags=0;
1782 int stress;
1783 int next_stress;
1784 int next_tone=0;
1785 unsigned char *p;
1786 int srcix;
1787 int found_dict_flag;
1788 unsigned char ph_code;
1789 PHONEME_LIST2 *plist2;
1790 PHONEME_TAB *ph;
1791 int max_stress;
1792 int max_stress_ix=0;
1793 int prev_vowel = -1;
1794 int pitch_raised = 0;
1795 int switch_phonemes = -1;
1796 int first_phoneme = 1;
1797 int source_ix;
1798 int len;
1799 int ix;
1800 int sylimit; // max. number of syllables in a word to be combined with a preceding preposition
1801 const char *new_language;
1802 int bad_phoneme;
1803 int word_flags;
1804 int word_copy_len;
1805 char word_copy[N_WORD_BYTES+1];
1806 char word_replaced[N_WORD_BYTES+1];
1807 char old_dictionary_name[40];
1808
1809 if((f_logespeak != NULL) && (logging_type & 8))
1810 {
1811 fprintf(f_logespeak,"WORD: flg=%.5x len=%d '",wtab->flags,wtab->length);
1812 for(ix=0; ix<40; ix++)
1813 {
1814 if(word[ix]==0) break;
1815 fputc(word[ix], f_logespeak);
1816 }
1817 fprintf(f_logespeak,"'\n");
1818 }
1819
1820 len = wtab->length;
1821 if(len > 31) len = 31;
1822 source_ix = (wtab->sourceix & 0x7ff) | (len << 11); // bits 0-10 sourceix, bits 11-15 word length
1823
1824 word_flags = wtab[0].flags;
1825 if(word_flags & FLAG_EMBEDDED)
1826 {
1827 wtab[0].flags &= ~FLAG_EMBEDDED; // clear it in case we call TranslateWord2() again for the same word
1828 embedded_flag = SFLAG_EMBEDDED;
1829
1830 Word_EmbeddedCmd();
1831 }
1832
1833 if((word[0] == 0) || (word_flags & FLAG_DELETE_WORD))
1834 {
1835 // nothing to translate. Add a dummy phoneme to carry any embedded commands
1836 if(embedded_flag)
1837 {
1838 ph_list2[n_ph_list2].phcode = phonEND_WORD;
1839 ph_list2[n_ph_list2].stresslevel = 0;
1840 ph_list2[n_ph_list2].wordstress = 0;
1841 ph_list2[n_ph_list2].tone_ph = 0;
1842 ph_list2[n_ph_list2].synthflags = embedded_flag;
1843 ph_list2[n_ph_list2].sourceix = 0;
1844 n_ph_list2++;
1845 embedded_flag = 0;
1846 }
1847 word_phonemes[0] = 0;
1848 return(0);
1849 }
1850
1851 // after a $pause word attribute, ignore a $pause attribute on the next two words
1852 if(tr->prepause_timeout > 0)
1853 tr->prepause_timeout--;
1854
1855 if((option_sayas & 0xf0) == 0x10)
1856 {
1857 if(!(word_flags & FLAG_FIRST_WORD))
1858 {
1859 // SAYAS_CHARS, SAYAS_GLYPHS, or SAYAS_SINGLECHARS. Pause between each word.
1860 pre_pause += 4;
1861 }
1862 }
1863
1864 if(word_flags & FLAG_FIRST_UPPER)
1865 {
1866 if((option_capitals > 2) && (embedded_ix < N_EMBEDDED_LIST-6))
1867 {
1868 // indicate capital letter by raising pitch
1869 if(embedded_flag)
1870 embedded_list[embedded_ix-1] &= ~0x80; // already embedded command before this word, remove terminator
1871 if((pitch_raised = option_capitals) == 3)
1872 pitch_raised = 20; // default pitch raise for capitals
1873 embedded_list[embedded_ix++] = EMBED_P+0x40+0x80 + (pitch_raised << 8); // raise pitch
1874 embedded_flag = SFLAG_EMBEDDED;
1875 }
1876 }
1877
1878 p = (unsigned char *)word_phonemes;
1879 if(word_flags & FLAG_PHONEMES)
1880 {
1881 // The input is in phoneme mnemonics, not language text
1882 int c1;
1883 char lang_name[12];
1884
1885 if(memcmp(word,"_^_",3)==0)
1886 {
1887 // switch languages
1888 word+=3;
1889 for(ix=0;;)
1890 {
1891 c1 = *word++;
1892 if((c1==' ') || (c1==0))
1893 break;
1894 lang_name[ix++] = tolower(c1);
1895 }
1896 lang_name[ix] = 0;
1897
1898 if((ix = LookupPhonemeTable(lang_name)) > 0)
1899 {
1900 SelectPhonemeTable(ix);
1901 word_phonemes[0] = phonSWITCH;
1902 word_phonemes[1] = ix;
1903 word_phonemes[2] = 0;
1904 }
1905 }
1906 else
1907 {
1908 EncodePhonemes(word,word_phonemes,&bad_phoneme);
1909 }
1910 flags = FLAG_FOUND;
1911 }
1912 else
1913 {
1914 int c2;
1915 ix = 0;
1916 while(((c2 = word_copy[ix] = word[ix]) != ' ') && (c2 != 0) && (ix < N_WORD_BYTES)) ix++;
1917 word_copy_len = ix;
1918
1919 word_replaced[2] = 0;
1920 flags = TranslateWord(translator, word, next_pause, wtab, &word_replaced[2]);
1921
1922 if(flags & FLAG_SPELLWORD)
1923 {
1924 // re-translate the word as individual letters, separated by spaces
1925 memcpy(word, word_copy, word_copy_len);
1926 return(flags);
1927 }
1928
1929 if((flags & FLAG_COMBINE) && !(wtab[1].flags & FLAG_PHONEMES))
1930 {
1931 char *p2;
1932 int ok = 1;
1933 unsigned int flags2[2];
1934 int c_word2;
1935 char ph_buf[N_WORD_PHONEMES];
1936
1937 flags2[0] = 0;
1938 sylimit = tr->langopts.param[LOPT_COMBINE_WORDS];
1939
1940 // LANG=cs,sk
1941 // combine a preposition with the following word
1942 p2 = word;
1943 while(*p2 != ' ') p2++;
1944
1945 utf8_in(&c_word2, p2+1); // first character of the next word;
1946 if(!iswalpha2(c_word2))
1947 {
1948 ok =0;
1949 }
1950
1951 if(ok != 0)
1952 {
1953 strcpy(ph_buf,word_phonemes);
1954
1955 flags2[0] = TranslateWord(translator, p2+1, 0, wtab+1, NULL);
1956 if((flags2[0] & FLAG_WAS_UNPRONOUNCABLE) || (word_phonemes[0] == phonSWITCH))
1957 ok = 0;
1958
1959 if(sylimit & 0x100)
1960 {
1961 // only if the second word has $alt attribute
1962 if((flags2[0] & FLAG_ALT_TRANS) == 0)
1963 {
1964 ok = 0;
1965 }
1966 }
1967
1968 if((sylimit & 0x200) && ((wtab+1)->flags & FLAG_LAST_WORD))
1969 {
1970 // not if the next word is end-of-sentence
1971 ok = 0;
1972 }
1973
1974 if(ok == 0)
1975 {
1976 strcpy(word_phonemes,ph_buf);
1977 }
1978 }
1979
1980 if(ok)
1981 {
1982 *p2 = '-'; // replace next space by hyphen
1983 wtab[0].flags &= ~FLAG_ALL_UPPER; // prevent it being considered an abbreviation
1984 flags = TranslateWord(translator, word, next_pause, wtab, NULL); // translate the combined word
1985 if((sylimit > 0) && (CountSyllables(p) > (sylimit & 0x1f)))
1986 {
1987 // revert to separate words
1988 *p2 = ' ';
1989 flags = TranslateWord(translator, word, next_pause, wtab, NULL);
1990 }
1991 else
1992 {
1993 if(flags == 0)
1994 flags = flags2[0]; // no flags for the combined word, so use flags from the second word eg. lang-hu "nem december 7-e"
1995 flags |= FLAG_SKIPWORDS;
1996 dictionary_skipwords = 1;
1997 }
1998 }
1999 }
2000
2001 if(p[0]==phonSWITCH)
2002 {
2003 int switch_attempt;
2004 strcpy(old_dictionary_name, dictionary_name);
2005 for(switch_attempt=0; switch_attempt < 2; switch_attempt++)
2006 {
2007 // this word uses a different language
2008 memcpy(word, word_copy, word_copy_len);
2009
2010 new_language = (char *)(&p[1]);
2011 if(new_language[0]==0)
2012 new_language = "en";
2013
2014 switch_phonemes = SetTranslator2(new_language);
2015
2016 if(switch_phonemes >= 0)
2017 {
2018 // re-translate the word using the new translator
2019 wtab[0].flags |= FLAG_TRANSLATOR2;
2020 if(word_replaced[2] != 0)
2021 {
2022 word_replaced[0] = 0; // byte before the start of the word
2023 word_replaced[1] = ' ';
2024 flags = TranslateWord(translator2, &word_replaced[1], next_pause, wtab, NULL);
2025 }
2026 else
2027 flags = TranslateWord(translator2, word, next_pause, wtab, &word_replaced[2]);
2028 }
2029
2030 if(p[0] != phonSWITCH)
2031 break;
2032 }
2033
2034 // strcpy((char *)p,translator2->word_phonemes);
2035
2036 if(p[0] == phonSWITCH)
2037 return(FLAG_SPELLWORD);
2038
2039 if(switch_phonemes < 0)
2040 {
2041 // language code is not recognised or 2nd translator won't translate it
2042 p[0] = phonSCHWA; // just say something
2043 p[1] = phonSCHWA;
2044 p[2] = 0;
2045 }
2046
2047 // ?? Option to convert from language2 phonemes to the equivalent language1 phonemes
2048 // ?? Option to set the word-stress according to language1 rules eg. lang=fr)
2049 if(ChangeEquivalentPhonemes(tr, switch_phonemes, (char *)p))
2050 {
2051 // Phonemes have been converted from the foreign language to the native language
2052 switch_phonemes = -1;
2053 }
2054
2055 if(switch_phonemes == -1)
2056 {
2057 strcpy(dictionary_name, old_dictionary_name);
2058 SelectPhonemeTable(voice->phoneme_tab_ix);
2059
2060 // leave switch_phonemes set, but use the original phoneme table number.
2061 // This will suppress LOPT_REGRESSIVE_VOICING
2062 switch_phonemes = voice->phoneme_tab_ix; // original phoneme table
2063 }
2064 }
2065
2066 if(!(word_flags & FLAG_HYPHEN))
2067 {
2068 if(flags & FLAG_PAUSE1)
2069 {
2070 if(pre_pause < 1)
2071 pre_pause = 1;
2072 }
2073 if((flags & FLAG_PREPAUSE) && !(word_flags && (FLAG_LAST_WORD | FLAG_FIRST_WORD)) && !(wtab[-1].flags & FLAG_FIRST_WORD) && (tr->prepause_timeout == 0))
2074 {
2075 // the word is marked in the dictionary list with $pause
2076 if(pre_pause < 4) pre_pause = 4;
2077 tr->prepause_timeout = 3;
2078 }
2079 }
2080
2081 if((option_emphasis >= 3) && (pre_pause < 1))
2082 pre_pause = 1;
2083 }
2084
2085 stress = 0;
2086 next_stress = 1;
2087 srcix = 0;
2088 max_stress = -1;
2089
2090 found_dict_flag = 0;
2091 if((flags & FLAG_FOUND) && !(flags & FLAG_TEXTMODE))
2092 found_dict_flag = SFLAG_DICTIONARY;
2093
2094 while((pre_pause > 0) && (n_ph_list2 < N_PHONEME_LIST-4))
2095 {
2096 // add pause phonemes here. Either because of punctuation (brackets or quotes) in the
2097 // text, or because the word is marked in the dictionary lookup as a conjunction
2098 if(pre_pause > 1)
2099 {
2100 SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE);
2101 pre_pause -= 2;
2102 }
2103 else
2104 {
2105 SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_NOLINK);
2106 pre_pause--;
2107 }
2108 tr->end_stressed_vowel = 0; // forget about the previous word
2109 tr->prev_dict_flags[0] = 0;
2110 tr->prev_dict_flags[1] = 0;
2111 }
2112 plist2 = &ph_list2[n_ph_list2];
2113
2114 if((option_capitals==1) && (word_flags & FLAG_FIRST_UPPER))
2115 {
2116 SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_SHORT);
2117 SetPlist2(&ph_list2[n_ph_list2++],phonCAPITAL);
2118 if((word_flags & FLAG_ALL_UPPER) && IsAlpha(word[1]))
2119 {
2120 // word > 1 letter and all capitals
2121 SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_SHORT);
2122 SetPlist2(&ph_list2[n_ph_list2++],phonCAPITAL);
2123 }
2124 }
2125
2126 if(switch_phonemes >= 0)
2127 {
2128 if((p[0] == phonPAUSE) && (p[1] == phonSWITCH))
2129 {
2130 // the new word starts with a phoneme table switch, so there's no need to switch before it.
2131 if(ph_list2[n_ph_list2-1].phcode == phonSWITCH)
2132 {
2133 //previous phoneme is also a phonSWITCH, delete it
2134 n_ph_list2--;
2135 }
2136 }
2137 else
2138 {
2139 // this word uses a different phoneme table
2140 if(ph_list2[n_ph_list2-1].phcode == phonSWITCH)
2141 {
2142 //previous phoneme is also a phonSWITCH, just change its phoneme table number
2143 n_ph_list2--;
2144 }
2145 else
2146 {
2147 SetPlist2(&ph_list2[n_ph_list2],phonSWITCH);
2148 }
2149 ph_list2[n_ph_list2++].tone_ph = switch_phonemes; // temporary phoneme table number
2150 }
2151 }
2152
2153 // remove initial pause from a word if it follows a hyphen
2154 if((word_flags & FLAG_HYPHEN) && (phoneme_tab[*p]->type == phPAUSE))
2155 p++;
2156
2157 if((p[0] == 0) && (embedded_flag))
2158 {
2159 // no phonemes. Insert a very short pause to carry an embedded command
2160 p[0] = phonPAUSE_VSHORT;
2161 p[1] = 0;
2162 }
2163
2164 while(((ph_code = *p++) != 0) && (n_ph_list2 < N_PHONEME_LIST-4))
2165 {
2166 if(ph_code == 255)
2167 continue; // unknown phoneme
2168
2169 // Add the phonemes to the first stage phoneme list (ph_list2)
2170 ph = phoneme_tab[ph_code];
2171
2172 if(ph_code == phonSWITCH)
2173 {
2174 ph_list2[n_ph_list2].phcode = ph_code;
2175 ph_list2[n_ph_list2].sourceix = 0;
2176 ph_list2[n_ph_list2].synthflags = 0;
2177 ph_list2[n_ph_list2++].tone_ph = *p;
2178 SelectPhonemeTable(*p);
2179 p++;
2180 }
2181 else if(ph->type == phSTRESS)
2182 {
2183 // don't add stress phonemes codes to the list, but give their stress
2184 // value to the next vowel phoneme
2185 // std_length is used to hold stress number or (if >10) a tone number for a tone language
2186 if(ph->program == 0)
2187 next_stress = ph->std_length;
2188 else
2189 {
2190 // for tone languages, the tone number for a syllable follows the vowel
2191 if(prev_vowel >= 0)
2192 {
2193 ph_list2[prev_vowel].tone_ph = ph_code;
2194 }
2195 else
2196 {
2197 next_tone = ph_code; // no previous vowel, apply to the next vowel
2198 }
2199 }
2200 }
2201 else if(ph_code == phonSYLLABIC)
2202 {
2203 // mark the previous phoneme as a syllabic consonant
2204 prev_vowel = n_ph_list2-1;
2205 ph_list2[prev_vowel].synthflags |= SFLAG_SYLLABLE;
2206 ph_list2[prev_vowel].stresslevel = next_stress;
2207 }
2208 else if(ph_code == phonLENGTHEN)
2209 {
2210 ph_list2[n_ph_list2-1].synthflags |= SFLAG_LENGTHEN;
2211 }
2212 else if(ph_code == phonEND_WORD)
2213 {
2214 // a || symbol in a phoneme string was used to indicate a word boundary
2215 // Don't add this phoneme to the list, but make sure the next phoneme has
2216 // a newword indication
2217 srcix = source_ix+1;
2218 }
2219 else if(ph_code == phonX1)
2220 {
2221 // a language specific action
2222 if(tr->langopts.param[LOPT_IT_DOUBLING])
2223 {
2224 flags |= FLAG_DOUBLING;
2225 }
2226 }
2227 else
2228 {
2229 ph_list2[n_ph_list2].phcode = ph_code;
2230 ph_list2[n_ph_list2].tone_ph = 0;
2231 ph_list2[n_ph_list2].synthflags = embedded_flag | found_dict_flag;
2232 embedded_flag = 0;
2233 ph_list2[n_ph_list2].sourceix = srcix;
2234 srcix = 0;
2235
2236 if(ph->type == phVOWEL)
2237 {
2238 stress = next_stress;
2239 next_stress = 1; // default is 'unstressed'
2240
2241 if(stress >= 4)
2242 {
2243 any_stressed_words = 1;
2244 }
2245
2246 if((prev_vowel >= 0) && (n_ph_list2-1) != prev_vowel)
2247 ph_list2[n_ph_list2-1].stresslevel = stress; // set stress for previous consonant
2248
2249 ph_list2[n_ph_list2].synthflags |= SFLAG_SYLLABLE;
2250 prev_vowel = n_ph_list2;
2251
2252 if(stress > max_stress)
2253 {
2254 max_stress = stress;
2255 max_stress_ix = n_ph_list2;
2256 }
2257 if(next_tone != 0)
2258 {
2259 ph_list2[n_ph_list2].tone_ph = next_tone;
2260 next_tone=0;
2261 }
2262 }
2263 else
2264 {
2265 if(first_phoneme && tr->langopts.param[LOPT_IT_DOUBLING])
2266 {
2267 if(((tr->prev_dict_flags[0] & FLAG_DOUBLING) && (tr->langopts.param[LOPT_IT_DOUBLING] & 1)) ||
2268 (tr->end_stressed_vowel && (tr->langopts.param[LOPT_IT_DOUBLING] & 2)))
2269 {
2270 // italian, double the initial consonant if the previous word ends with a
2271 // stressed vowel, or is marked with a flag
2272 ph_list2[n_ph_list2].synthflags |= SFLAG_LENGTHEN;
2273 }
2274 }
2275 }
2276
2277 ph_list2[n_ph_list2].stresslevel = stress;
2278 n_ph_list2++;
2279 first_phoneme = 0;
2280 }
2281 }
2282
2283 if(word_flags & FLAG_COMMA_AFTER)
2284 {
2285 SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_CLAUSE);
2286 }
2287
2288 // don't set new-word if there is a hyphen before it
2289 if((word_flags & FLAG_HYPHEN) == 0)
2290 {
2291 plist2->sourceix = source_ix;
2292 }
2293
2294 tr->end_stressed_vowel = 0;
2295 if((stress >= 4) && (phoneme_tab[ph_list2[n_ph_list2-1].phcode]->type == phVOWEL))
2296 {
2297 tr->end_stressed_vowel = 1; // word ends with a stressed vowel
2298 }
2299
2300 if(switch_phonemes >= 0)
2301 {
2302 // this word uses a different phoneme table, now switch back
2303 strcpy(dictionary_name, old_dictionary_name);
2304 SelectPhonemeTable(voice->phoneme_tab_ix);
2305 SetPlist2(&ph_list2[n_ph_list2],phonSWITCH);
2306 ph_list2[n_ph_list2++].tone_ph = voice->phoneme_tab_ix; // original phoneme table number
2307 }
2308
2309
2310 if(pitch_raised > 0)
2311 {
2312 embedded_list[embedded_ix++] = EMBED_P+0x60+0x80 + (pitch_raised << 8); // lower pitch
2313 SetPlist2(&ph_list2[n_ph_list2],phonPAUSE_SHORT);
2314 ph_list2[n_ph_list2++].synthflags = SFLAG_EMBEDDED;
2315 }
2316
2317 if(flags & FLAG_STRESS_END2)
2318 {
2319 // this word's stress could be increased later
2320 ph_list2[max_stress_ix].synthflags |= SFLAG_PROMOTE_STRESS;
2321 }
2322
2323 tr->prev_dict_flags[0] = flags;
2324 return(flags);
2325 } // end of TranslateWord2
2326
2327
2328
EmbeddedCommand(unsigned int * source_index_out)2329 static int EmbeddedCommand(unsigned int *source_index_out)
2330 {//=======================================================
2331 // An embedded command to change the pitch, volume, etc.
2332 // returns number of commands added to embedded_list
2333
2334 // pitch,speed,amplitude,expression,reverb,tone,voice,sayas
2335 const char *commands = "PSARHTIVYMUBF";
2336 int value = -1;
2337 int sign = 0;
2338 unsigned char c;
2339 char *p;
2340 int cmd;
2341 int source_index = *source_index_out;
2342
2343 c = source[source_index];
2344 if(c == '+')
2345 {
2346 sign = 0x40;
2347 source_index++;
2348 }
2349 else if(c == '-')
2350 {
2351 sign = 0x60;
2352 source_index++;
2353 }
2354
2355 if(IsDigit09(source[source_index]))
2356 {
2357 value = atoi(&source[source_index]);
2358 while(IsDigit09(source[source_index]))
2359 source_index++;
2360 }
2361
2362 c = source[source_index++];
2363 if(embedded_ix >= (N_EMBEDDED_LIST - 2))
2364 return(0); // list is full
2365
2366 if((p = strchr_w(commands,c)) == NULL)
2367 return(0);
2368 cmd = (p - commands)+1;
2369 if(value == -1)
2370 {
2371 value = embedded_default[cmd];
2372 sign = 0;
2373 }
2374
2375 if(cmd == EMBED_Y)
2376 {
2377 option_sayas2 = value;
2378 count_sayas_digits = 0;
2379 }
2380 if(cmd == EMBED_F)
2381 {
2382 if(value >= 3)
2383 word_emphasis = FLAG_EMPHASIZED;
2384 else
2385 word_emphasis = 0;
2386 }
2387
2388 embedded_list[embedded_ix++] = cmd + sign + (value << 8);
2389 *source_index_out = source_index;
2390 return(1);
2391 } // end of EmbeddedCommand
2392
2393
2394
static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{//=========================================================================================
	// Apply the language's character replacement list (tr->langopts.replace_chars) to 'c'.
	//
	// The list is a sequence of (key, value) pairs, ended by a zero key:
	//   key:   low 16 bits  = the (lower case) character to match
	//          high 16 bits = if non-zero, the character which must follow it
	//   value: low 16 bits  = the replacement character
	//          upper bits   = optionally, a second character which is passed back in *insert
	//
	// Returns:
	//   8      this character was the second of a pair matched by the previous call, ignore it
	//   0      if c is 0
	//   c      unchanged, if there is no list or no replacement applies
	//   otherwise the replacement character, with FLAG_CHAR_REPLACED added to *wordflags.
	//          The case of the original character is kept.

	static int skip_following = 0;   // set when a two-character key has been matched
	const unsigned int *entry;
	unsigned int key;
	unsigned int follower;
	unsigned int folded;
	unsigned int replacement = 0;
	unsigned int second;
	int was_upper = 0;

	if(skip_following)
	{
		skip_following = 0;
		return(8);
	}

	if(c == 0)
		return(0);

	entry = tr->langopts.replace_chars;
	if(entry == NULL)
		return(c);

	// the list holds lower case characters, so compare with a lower case copy
	folded = c;
	if(iswupper2(c))
	{
		was_upper = 1;
		folded = towlower2(c);
	}

	for( ; (key = entry[0]) != 0; entry += 2)
	{
		if((key & 0xffff) != folded)
			continue;

		follower = key >> 16;
		if(follower == 0)
		{
			// single character key
			replacement = entry[1];
			break;
		}
		if(follower == (unsigned int)towlower2(next_in))
		{
			// two character key, the next input character is consumed too
			replacement = entry[1];
			skip_following = 1;
			break;
		}
	}

	if(replacement == 0)
		return(c);   // no substitution

	if(replacement & 0xffe00000)
	{
		// there is a second character to be inserted
		// don't convert the case of the second character unless the next letter is also upper case
		second = replacement >> 16;
		if(was_upper && iswupper2(next_in))
			second = towupper2(second);
		*insert = second;
		replacement &= 0xffff;
	}

	if(was_upper)
		replacement = towupper2(replacement);

	*wordflags |= FLAG_CHAR_REPLACED;
	return(replacement);
}
2462
2463
static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{//================================================================================================================
	// To allow language specific examination and replacement of characters
	//
	// c        the current character
	// prev_in  the previous input character
	// next_in  the next input character
	// ptr      points into the input text; ptr[0] may be overwritten with a space, ptr[1] is examined
	// insert   receives an extra character to be processed after the returned one
	//
	// Returns the character to use in place of c.  Characters which get no special
	// treatment here are passed on to SubstituteChar().

	// the jamo (as an offset from 0x1100) for each Hangul compatibility jamo, 0x3130 onwards
	static const unsigned char hangul_compatibility[0x34] = {
		0,  0x00,0x01,0xaa,0x02,0xac,0xad,0x03,
		0x04,0x05,0xb0,0xb1,0xb2,0xb3,0xb4,0xb4,
		0xb6,0x06,0x07,0x08,0xb9,0x09,0x0a,0xbc,
		0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x61,
		0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,
		0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,0x70,0x71,
		0x72,0x73,0x74,0x75
	};

	int syllable;
	int initial;
	int medial;
	int final;
	int next2;
	int is_afrikaans;

	// Korean Hangul syllables
	if((c >= 0xac00) && (c <= 0xd7af))
	{
		// split the syllable into its 2 or 3 individual jamo
		syllable = c - 0xac00;
		final = syllable % 28;
		medial = (syllable / 28) % 21;
		initial = (syllable / 28) / 21;

		if(initial != 11)
		{
			// return the initial, and insert the rest of the syllable with a null initial
			*insert = (11*28*21) + (medial*28) + final + 0xac00;
			return(initial + 0x1100);
		}

		// null initial: return the medial, and insert the final if there is one
		if(final > 0)
			*insert = final + 0x11a7;
		return(medial + 0x1161);
	}

	// Hangul compatibility jamo
	if((c >= 0x3130) && (c < (0x3130 + 0x34)))
		return(hangul_compatibility[c - 0x3130] + 0x1100);

	// Afrikaans and Dutch: look for 'n (Dutch also 't) and replace the apostrophe
	// by a special character (unicode: schwa)
	is_afrikaans = (tr->translator_name == L('a','f'));
	if((is_afrikaans || (tr->translator_name == L('n','l'))) && !iswalpha2(prev_in))
	{
		utf8_in(&next2, &ptr[1]);

		if((c == '\'') && IsSpace(next2))
		{
			if(next_in == 'n')
			{
				if(is_afrikaans)
					ptr[0] = ' ';   // Afrikaans: delete the n
				return(0x0259);     // replace ' by unicode schwa character
			}
			if(next_in == 't')
				return(0x0259);     // [@t], replace ' by unicode schwa character
		}
	}

	return(SubstituteChar(tr, c, next_in, insert, wordflags));
}
2543
2544
2545 static const char *UCase_ga[] = {"bp","bhf","dt","gc","hA","mb","nd","ng","ts","tA","nA",NULL};
2546
UpperCaseInWord(Translator * tr,char * word,int c)2547 int UpperCaseInWord(Translator *tr, char *word, int c)
2548 {//=====================================================
2549 int ix;
2550 int len;
2551 const char *p;
2552
2553 if(tr->translator_name == L('g','a'))
2554 {
2555 // Irish
2556 for(ix=0; ; ix++)
2557 {
2558 if((p = UCase_ga[ix]) == NULL)
2559 break;
2560
2561 len = strlen(p);
2562 if((word[-len]==' ') && (memcmp(&word[-len+1], p, len-1) == 0))
2563 {
2564 if((c == p[len-1]) || ((p[len-1]=='A') && IsVowel(tr, c)))
2565 return(1);
2566 }
2567 }
2568 }
2569 return(0);
2570 }
2571
2572
TranslateClause(Translator * tr,FILE * f_text,const void * vp_input,int * tone_out,char ** voice_change)2573 void *TranslateClause(Translator *tr, FILE *f_text, const void *vp_input, int *tone_out, char **voice_change)
2574 {//==========================================================================================================
2575 int ix;
2576 int c;
2577 int cc;
2578 unsigned int source_index=0;
2579 unsigned int prev_source_index=0;
2580 int source_index_word=0;
2581 int prev_in;
2582 int prev_out=' ';
2583 int prev_out2;
2584 int prev_in_save=0;
2585 int next_in;
2586 int next_in_nbytes;
2587 int char_inserted=0;
2588 int clause_pause;
2589 int pre_pause_add=0;
2590 int word_mark = 0;
2591 int all_upper_case=FLAG_ALL_UPPER;
2592 int finished;
2593 int single_quoted;
2594 int phoneme_mode = 0;
2595 int dict_flags = 0; // returned from dictionary lookup
2596 int word_flags; // set here
2597 int next_word_flags;
2598 int new_sentence2;
2599 int embedded_count = 0;
2600 int letter_count = 0;
2601 int space_inserted = 0;
2602 int syllable_marked = 0;
2603 int decimal_sep_count = 0;
2604 char *word;
2605 char *p;
2606 int j, k;
2607 int n_digits;
2608 int charix_top=0;
2609
2610 short charix[N_TR_SOURCE+4];
2611 WORD_TAB words[N_CLAUSE_WORDS];
2612 static char voice_change_name[40];
2613 int word_count=0; // index into words
2614
2615 char sbuf[N_TR_SOURCE];
2616
2617 int terminator;
2618 int tone;
2619 int tone2;
2620
2621 if(tr==NULL)
2622 {
2623 return(NULL);
2624 }
2625
2626 p_textinput = (unsigned char *)vp_input;
2627 p_wchar_input = (wchar_t *)vp_input;
2628
2629 embedded_ix = 0;
2630 embedded_read = 0;
2631 pre_pause = 0;
2632 any_stressed_words = 0;
2633
2634 if((clause_start_char = count_characters) < 0)
2635 clause_start_char = 0;
2636 clause_start_word = count_words + 1;
2637
2638 for(ix=0; ix<N_TR_SOURCE; ix++)
2639 charix[ix] = 0;
2640 terminator = ReadClause(tr, f_text, source, charix, &charix_top, N_TR_SOURCE, &tone2, voice_change_name);
2641
2642 if((f_logespeak != NULL) && (logging_type & 4))
2643 {
2644 fprintf(f_logespeak,"CLAUSE %x:\n",terminator);
2645 for(p=source; *p != 0; p++)
2646 fputc(*p, f_logespeak);
2647 fprintf(f_logespeak,"ENDCLAUSE\n");
2648 fflush(f_logespeak);
2649 }
2650 p = source;
2651
2652 charix[charix_top+1] = 0;
2653 charix[charix_top+2] = 0x7fff;
2654 charix[charix_top+3] = 0;
2655
2656 clause_pause = (terminator & 0xfff) * 10; // mS
2657 if(terminator & CLAUSE_PAUSE_LONG)
2658 clause_pause = clause_pause * 32 ; // pause value is *320mS not *10mS
2659
2660 tone = (terminator >> 12) & 0x7;
2661 if(tone2 != 0)
2662 {
2663 // override the tone type
2664 tone = tone2;
2665 }
2666
2667 for(p=source; *p != 0; p++)
2668 {
2669 if(!isspace2(*p))
2670 {
2671 break;
2672 }
2673 }
2674 if(*p == 0)
2675 {
2676 // No characters except spaces. This is not a sentence.
2677 // Don't add this pause, just make up the previous pause to this value;
2678 clause_pause -= max_clause_pause;
2679 if(clause_pause < 0)
2680 clause_pause = 0;
2681
2682 if(new_sentence)
2683 terminator |= CLAUSE_BIT_SENTENCE; // carry forward an end-of-sentence indicator
2684 max_clause_pause += clause_pause;
2685 new_sentence2 = 0;
2686 }
2687 else
2688 {
2689 max_clause_pause = clause_pause;
2690 new_sentence2 = new_sentence;
2691 }
2692 tr->clause_terminator = terminator;
2693
2694 if(new_sentence2)
2695 {
2696 count_sentences++;
2697 if(skip_sentences > 0)
2698 {
2699 skip_sentences--;
2700 if(skip_sentences == 0)
2701 skipping_text = 0;
2702 }
2703 }
2704
2705 memset(&ph_list2[0],0,sizeof(ph_list2[0]));
2706 ph_list2[0].phcode = phonPAUSE_SHORT;
2707
2708 n_ph_list2 = 1;
2709 tr->prev_last_stress = 0;
2710 tr->prepause_timeout = 0;
2711 tr->expect_verb=0;
2712 tr->expect_noun=0;
2713 tr->expect_past=0;
2714 tr->expect_verb_s=0;
2715 tr->phonemes_repeat_count = 0;
2716 tr->end_stressed_vowel=0;
2717 tr->prev_dict_flags[0] = 0;
2718 tr->prev_dict_flags[1] = 0;
2719
2720 word_count = 0;
2721 single_quoted = 0;
2722 word_flags = 0;
2723 next_word_flags = 0;
2724
2725 sbuf[0] = 0;
2726 sbuf[1] = ' ';
2727 sbuf[2] = ' ';
2728 ix = 3;
2729 prev_in = ' ';
2730
2731 words[0].start = ix;
2732 words[0].flags = 0;
2733 finished = 0;
2734
2735 for(j=0; charix[j]<=0; j++);
2736 words[0].sourceix = charix[j];
2737 k = 0;
2738 while(charix[j] != 0)
2739 {
2740 // count the number of characters (excluding multibyte continuation bytes)
2741 if(charix[j++] != -1)
2742 k++;
2743 }
2744 words[0].length = k;
2745
2746 while(!finished && (ix < (int)sizeof(sbuf))&& (n_ph_list2 < N_PHONEME_LIST-4))
2747 {
2748 prev_out2 = prev_out;
2749 utf8_in2(&prev_out,&sbuf[ix-1],1); // prev_out = sbuf[ix-1];
2750
2751 if(tr->langopts.tone_numbers && IsDigit09(prev_out) && IsAlpha(prev_out2))
2752 {
2753 // tone numbers can be part of a word, consider them as alphabetic
2754 prev_out = 'a';
2755 }
2756
2757 if(prev_in_save != 0)
2758 {
2759 prev_in = prev_in_save;
2760 prev_in_save = 0;
2761 }
2762 else if(source_index > 0)
2763 {
2764 utf8_in2(&prev_in,&source[source_index-1],1); // prev_in = source[source_index-1];
2765 }
2766
2767 prev_source_index = source_index;
2768
2769 if(char_inserted)
2770 {
2771 c = char_inserted;
2772 char_inserted = 0;
2773 }
2774 else
2775 {
2776 source_index += utf8_in(&cc,&source[source_index]); // cc = source[source_index++];
2777 c = cc;
2778 }
2779 next_in_nbytes = utf8_in(&next_in,&source[source_index]);
2780
2781 if(c == 0)
2782 {
2783 finished = 1;
2784 c = ' ';
2785 }
2786
2787 if((c == CTRL_EMBEDDED) || (c == ctrl_embedded))
2788 {
2789 // start of embedded command in the text
2790 int srcix = source_index-1;
2791
2792 if(prev_in != ' ')
2793 {
2794 c = ' ';
2795 prev_in_save = c;
2796 source_index--;
2797 }
2798 else
2799 {
2800 embedded_count += EmbeddedCommand(&source_index);
2801 prev_in_save = prev_in;
2802 // replace the embedded command by spaces
2803 memset(&source[srcix],' ',source_index-srcix);
2804 source_index = srcix;
2805 continue;
2806 }
2807 }
2808
2809 if((option_sayas2 == SAYAS_KEY) && (c != ' '))
2810 {
2811 if((prev_in == ' ') && (next_in == ' '))
2812 option_sayas2 = SAYAS_SINGLE_CHARS; // single character, speak its name
2813 c = towlower2(c);
2814 }
2815
2816
2817 if(phoneme_mode)
2818 {
2819 all_upper_case = FLAG_PHONEMES;
2820
2821 if((c == ']') && (next_in == ']'))
2822 {
2823 phoneme_mode = 0;
2824 source_index++;
2825 c = ' ';
2826 }
2827 }
2828 else if((option_sayas2 & 0xf0) == SAYAS_DIGITS)
2829 {
2830 if(iswdigit(c))
2831 {
2832 count_sayas_digits++;
2833 if(count_sayas_digits > (option_sayas2 & 0xf))
2834 {
2835 // break after the specified number of digits
2836 c = ' ';
2837 space_inserted = 1;
2838 count_sayas_digits = 0;
2839 }
2840 }
2841 else
2842 {
2843 count_sayas_digits = 0;
2844 if(iswdigit(prev_out))
2845 {
2846 c = ' ';
2847 space_inserted = 1;
2848 }
2849 }
2850 }
2851 else if((option_sayas2 & 0x10) == 0)
2852 {
2853 // speak as words
2854
2855 #ifdef deleted
2856 if((c == '/') && (tr->langopts.testing & 2) && IsDigit09(next_in) && IsAlpha(prev_out))
2857 {
2858 // TESTING, explicit indication of stressed syllable by /2 after the word
2859 word_mark = next_in-'0';
2860 source_index++;
2861 c = ' ';
2862 }
2863 #endif
2864 if((c == 0x92) || (c == 0xb4) || (c == 0x2019) || (c == 0x2032))
2865 c = '\''; // 'microsoft' quote or sexed closing single quote, or prime - possibly used as apostrophe
2866
2867 if(((c == 0x2018) || (c == '?')) && IsAlpha(prev_out) && IsAlpha(next_in))
2868 {
2869 // ? between two letters may be a smart-quote replaced by ?
2870 c = '\'';
2871 }
2872
2873 if(c == CHAR_EMPHASIS)
2874 {
2875 // this character is a marker that the previous word is the focus of the clause
2876 c = ' ';
2877 word_flags |= FLAG_FOCUS;
2878 }
2879
2880 if(c == CHAR_COMMA_BREAK)
2881 {
2882 c = ' ';
2883 word_flags |= FLAG_COMMA_AFTER;
2884 }
2885
2886 c = TranslateChar(tr, &source[source_index], prev_in,c, next_in, &char_inserted, &word_flags); // optional language specific function
2887 if(c == 8)
2888 continue; // ignore this character
2889
2890 if(char_inserted)
2891 next_in = char_inserted;
2892
2893 // allow certain punctuation within a word (usually only apostrophe)
2894 if(!IsAlpha(c) && !IsSpace(c) && (wcschr(tr->punct_within_word,c) == 0))
2895 {
2896 if(IsAlpha(prev_out))
2897 {
2898 if(tr->langopts.tone_numbers && IsDigit09(c) && !IsDigit09(next_in))
2899 {
2900 // allow a tone number as part of the word
2901 }
2902 else
2903 {
2904 c = ' '; // ensure we have an end-of-word terminator
2905 space_inserted = 1;
2906 }
2907 }
2908 }
2909
2910 if(iswdigit(prev_out))
2911 {
2912 if(!iswdigit(c) && (c != '.') && (c != ',') && (c != ' '))
2913 {
2914 c = ' '; // terminate digit string with a space
2915 space_inserted = 1;
2916 }
2917 }
2918 else
2919 {
2920 if(prev_in != ',')
2921 {
2922 decimal_sep_count = 0;
2923 }
2924 }
2925
2926 if(c == '[')
2927 {
2928 if((next_in == '\002') || ((next_in == '[') && option_phoneme_input))
2929 {
2930 // "[\002" is used internally to start phoneme mode
2931 phoneme_mode = FLAG_PHONEMES;
2932 source_index++;
2933 continue;
2934 }
2935 }
2936
2937 if(IsAlpha(c))
2938 {
2939 if(!IsAlpha(prev_out) || (tr->langopts.ideographs && ((c > 0x3040) || (prev_out > 0x3040))))
2940 {
2941 if(wcschr(tr->punct_within_word,prev_out) == 0)
2942 letter_count = 0; // don't reset count for an apostrophy within a word
2943
2944 if((prev_out != ' ') && (wcschr(tr->punct_within_word,prev_out) == 0))
2945 {
2946 // start of word, insert space if not one there already
2947 c = ' ';
2948 space_inserted = 1;
2949
2950 if(!IsBracket(prev_out)) // ?? perhaps only set FLAG_NOSPACE for . - / (hyphenated words, URLs, etc)
2951 {
2952 next_word_flags |= FLAG_NOSPACE;
2953 }
2954 }
2955 else
2956 {
2957 if(iswupper2(c))
2958 word_flags |= FLAG_FIRST_UPPER;
2959
2960 if((prev_out == ' ') && iswdigit(sbuf[ix-2]) && !iswdigit(prev_in))
2961 {
2962 // word, following a number, but with a space between
2963 // Add an extra space, to distinguish "2 a" from "2a"
2964 sbuf[ix++] = ' ';
2965 words[word_count].start++;
2966 }
2967 }
2968 }
2969
2970 if(c != ' ')
2971 {
2972 letter_count++;
2973
2974 if(tr->letter_bits_offset > 0)
2975 {
2976 if(((c < 0x250) && (prev_out >= tr->letter_bits_offset)) ||
2977 ((c >= tr->letter_bits_offset) && (letter_count > 1) && (prev_out < 0x250)))
2978 {
2979 // Don't mix native and Latin characters in the same word
2980 // Break into separate words
2981 if(IsAlpha(prev_out))
2982 {
2983 c = ' ';
2984 space_inserted = 1;
2985 word_flags |= FLAG_HYPHEN_AFTER;
2986 next_word_flags |= FLAG_HYPHEN;
2987 }
2988 }
2989 }
2990 }
2991
2992 if(iswupper2(c))
2993 {
2994 c = towlower2(c);
2995
2996 if((j = tr->langopts.param[LOPT_CAPS_IN_WORD]) > 0)
2997 {
2998 if((j == 2) && (syllable_marked == 0))
2999 {
3000 char_inserted = c;
3001 c = 0x2c8; // stress marker
3002 syllable_marked = 1;
3003 }
3004 }
3005 else
3006 {
3007 if(iswlower2(prev_in))
3008 {
3009 // lower case followed by upper case in a word
3010 if(UpperCaseInWord(tr, &sbuf[ix], c) == 1)
3011 {
3012 // convert to lower case and continue
3013 c = towlower2(c);
3014 }
3015 else
3016 {
3017 c = ' '; // lower case followed by upper case, treat as new word
3018 space_inserted = 1;
3019 prev_in_save = c;
3020 // next_word_flags |= FLAG_NOSPACE; // problem: prevents FLAG_HAS_DOT being set
3021 }
3022 }
3023 else if((c != ' ') && iswupper2(prev_in) && iswlower2(next_in))
3024 {
3025 int next2_in;
3026 utf8_in(&next2_in,&source[source_index + next_in_nbytes]);
3027
3028 if((tr->translator_name == L('n','l')) && (letter_count==2) && (c == 'j') && (prev_in == 'I'))
3029 {
3030 // Dutch words may capitalise initial IJ, don't split
3031 }
3032 else
3033 if(IsAlpha(next2_in))
3034 {
3035 // changing from upper to lower case, start new word at the last uppercase, if 3 or more letters
3036 c = ' ';
3037 space_inserted = 1;
3038 prev_in_save = c;
3039 next_word_flags |= FLAG_NOSPACE;
3040 }
3041 }
3042 }
3043 }
3044 else
3045 {
3046 if((all_upper_case) && (letter_count > 2))
3047 {
3048 if((c == 's') && (next_in==' '))
3049 {
3050 c = ' ';
3051 all_upper_case |= FLAG_HAS_PLURAL;
3052
3053 if(sbuf[ix-1] == '\'')
3054 sbuf[ix-1] = ' ';
3055 }
3056 else
3057 all_upper_case = 0; // current word contains lower case letters, not "'s"
3058 }
3059 else
3060 all_upper_case = 0;
3061 }
3062 }
3063 else if(c=='-')
3064 {
3065 if(!IsSpace(prev_in) && IsAlpha(next_in))
3066 {
3067 if(prev_out != ' ')
3068 {
3069 // previous 'word' not yet ended (not alpha or numeric), start new word now.
3070 c = ' ';
3071 space_inserted = 1;
3072 }
3073 else
3074 {
3075 // '-' between two letters is a hyphen, treat as a space
3076 word_flags |= FLAG_HYPHEN;
3077 if(word_count > 0)
3078 words[word_count-1].flags |= FLAG_HYPHEN_AFTER;
3079 c = ' ';
3080 }
3081 }
3082 else if((prev_in==' ') && (next_in==' '))
3083 {
3084 // ' - ' dash between two spaces, treat as pause
3085 c = ' ';
3086 pre_pause_add = 4;
3087 }
3088 else if(next_in=='-')
3089 {
3090 // double hyphen, treat as pause
3091 source_index++;
3092 c = ' ';
3093 pre_pause_add = 4;
3094 }
3095 else if((prev_out == ' ') && IsAlpha(prev_out2) && !IsAlpha(prev_in))
3096 {
3097 // insert extra space between a word + space + hyphen, to distinguish 'a -2' from 'a-2'
3098 sbuf[ix++] = ' ';
3099 words[word_count].start++;
3100 }
3101 }
3102 else if(c == '.')
3103 {
3104 if(prev_out == '.')
3105 {
3106 // multiple dots, separate by spaces. Note >3 dots have been replaced by an ellipsis
3107 c = ' ';
3108 space_inserted = 1;
3109 }
3110 else if((word_count > 0) && !(words[word_count-1].flags & FLAG_NOSPACE) && IsAlpha(prev_in))
3111 {
3112 // dot after a word, with space following, probably an abbreviation
3113 words[word_count-1].flags |= FLAG_HAS_DOT;
3114
3115 if(IsSpace(next_in) || (next_in == '-'))
3116 c = ' '; // remove the dot if it's followed by a space or hyphen, so that it's not pronounced
3117 }
3118 }
3119 else if(c == '\'')
3120 {
3121 if(((prev_in == '.') || iswalnum(prev_in)) && IsAlpha(next_in))
3122 {
3123 // between two letters, or in an abbreviation (eg. u.s.a.'s). Consider the apostrophe as part of the word
3124 single_quoted = 0;
3125 }
3126 else if((tr->langopts.param[LOPT_APOSTROPHE] & 1) && IsAlpha(next_in))
3127 {
3128 single_quoted = 0; // apostrophe at start of word is part of the word
3129 }
3130 else if((tr->langopts.param[LOPT_APOSTROPHE] & 2) && IsAlpha(prev_in))
3131 {
3132 single_quoted = 0; // apostrophe at end of word is part of the word
3133 }
3134 else if((wcschr(tr->char_plus_apostrophe,prev_in) != 0) && (prev_out2 == ' '))
3135 {
3136 // consider single character plus apostrophe as a word
3137 single_quoted = 0;
3138 if(next_in == ' ')
3139 {
3140 source_index++; // skip following space
3141 }
3142 }
3143 else
3144 {
3145 if((prev_out == 's') && (single_quoted==0))
3146 {
3147 // looks like apostrophe after an 's'
3148 c = ' ';
3149 }
3150 else
3151 {
3152 if(IsSpace(prev_out))
3153 single_quoted = 1;
3154 else
3155 single_quoted = 0;
3156
3157 pre_pause_add = 4; // single quote
3158 c = ' ';
3159 }
3160 }
3161 }
3162 else
3163 #ifdef deleted
3164 // Brackets are now recognised in TranslateRules()
3165 if(IsBracket(c))
3166 {
3167 pre_pause_add = 4;
3168 c = ' ';
3169 }
3170 else
3171 #endif
3172 if(lookupwchar(breaks,c) != 0)
3173 {
3174 c = ' '; // various characters to treat as space
3175 }
3176 else if(iswdigit(c))
3177 {
3178 if(tr->langopts.tone_numbers && IsAlpha(prev_out) && !IsDigit(next_in))
3179 {
3180 }
3181 else if((prev_out != ' ') && !iswdigit(prev_out))
3182 {
3183 if((prev_out != tr->langopts.decimal_sep) || ((decimal_sep_count > 0) && (tr->langopts.decimal_sep == ',')))
3184 {
3185 c = ' ';
3186 space_inserted = 1;
3187 }
3188 else
3189 {
3190 decimal_sep_count = 1;
3191 }
3192 }
3193 else if((prev_out == ' ') && IsAlpha(prev_out2) && !IsAlpha(prev_in))
3194 {
3195 // insert extra space between a word and a number, to distinguish 'a 2' from 'a2'
3196 sbuf[ix++] = ' ';
3197 words[word_count].start++;
3198 }
3199 }
3200 }
3201
3202 if(IsSpace(c))
3203 {
3204 if(prev_out == ' ')
3205 {
3206 word_flags |= FLAG_MULTIPLE_SPACES;
3207 continue; // multiple spaces
3208 }
3209
3210 if((cc == 0x09) || (cc == 0x0a))
3211 {
3212 next_word_flags |= FLAG_MULTIPLE_SPACES; // tab or newline, not a simple space
3213 }
3214
3215 if(space_inserted)
3216 {
3217 // count the number of characters since the start of the word
3218 j = 0;
3219 k = source_index - 1;
3220 while((k >= source_index_word) && (charix[k] != 0))
3221 {
3222 if(charix[k] > 0) // don't count initial bytes of multi-byte character
3223 j++;
3224 k--;
3225 }
3226 words[word_count].length = j;
3227 }
3228
3229 source_index_word = source_index;
3230
3231 // end of 'word'
3232 sbuf[ix++] = ' ';
3233
3234 if((word_count < N_CLAUSE_WORDS-1) && (ix > words[word_count].start))
3235 {
3236 if(embedded_count > 0)
3237 {
3238 // there are embedded commands before this word
3239 embedded_list[embedded_ix-1] |= 0x80; // terminate list of commands for this word
3240 words[word_count].flags |= FLAG_EMBEDDED;
3241 embedded_count = 0;
3242 }
3243 words[word_count].pre_pause = pre_pause;
3244 words[word_count].flags |= (all_upper_case | word_flags | word_emphasis);
3245 words[word_count].wmark = word_mark;
3246
3247 if(pre_pause > 0)
3248 {
3249 // insert an extra space before the word, to prevent influence from previous word across the pause
3250 for(j=ix; j>words[word_count].start; j--)
3251 {
3252 sbuf[j] = sbuf[j-1];
3253 }
3254 sbuf[j] = ' ';
3255 words[word_count].start++;
3256 ix++;
3257 }
3258
3259 word_count++;
3260 words[word_count].start = ix;
3261 words[word_count].flags = 0;
3262
3263 for(j=source_index; charix[j] <= 0; j++); // skip blanks
3264 words[word_count].sourceix = charix[j];
3265 k = 0;
3266 while(charix[j] != 0)
3267 {
3268 // count the number of characters (excluding multibyte continuation bytes)
3269 if(charix[j++] != -1)
3270 k++;
3271 }
3272 words[word_count].length = k;
3273
3274 word_flags = next_word_flags;
3275 next_word_flags = 0;
3276 pre_pause = 0;
3277 word_mark = 0;
3278 all_upper_case = FLAG_ALL_UPPER;
3279 syllable_marked = 0;
3280 }
3281
3282 if(space_inserted)
3283 {
3284 source_index = prev_source_index; // rewind to the previous character
3285 char_inserted = 0;
3286 space_inserted = 0;
3287 }
3288 }
3289 else
3290 {
3291 if((ix < (N_TR_SOURCE - 4)))
3292 ix += utf8_out(c,&sbuf[ix]); // sbuf[ix++] = c;
3293 }
3294 if(pre_pause_add > pre_pause)
3295 pre_pause = pre_pause_add;
3296 pre_pause_add = 0;
3297 }
3298
3299 if((word_count==0) && (embedded_count > 0))
3300 {
3301 // add a null 'word' to carry the embedded command flag
3302 embedded_list[embedded_ix-1] |= 0x80;
3303 words[word_count].flags |= FLAG_EMBEDDED;
3304 word_count = 1;
3305 }
3306
3307 tr->clause_end = &sbuf[ix-1];
3308 sbuf[ix] = 0;
3309 words[0].pre_pause = 0; // don't add extra pause at beginning of clause
3310 words[word_count].pre_pause = 8;
3311 if(word_count > 0)
3312 {
3313 ix = word_count-1;
3314 while((ix > 0) && (IsBracket(sbuf[words[ix].start])))
3315 ix--; // the last word is a bracket, mark the previous word as last
3316 words[ix].flags |= FLAG_LAST_WORD;
3317
3318 // FLAG_NOSPACE check to avoid recognizing .mr -mr
3319 if((terminator & CLAUSE_DOT) && !(words[word_count-1].flags & FLAG_NOSPACE))
3320 words[word_count-1].flags |= FLAG_HAS_DOT;
3321 }
3322 words[0].flags |= FLAG_FIRST_WORD;
3323
3324
3325 for(ix=0; ix < word_count; ix++)
3326 {
3327 int nx;
3328 int c_temp;
3329 char *pn;
3330 char *pw;
3331 int nw;
3332 char number_buf[150];
3333 WORD_TAB num_wtab[50]; // copy of 'words', when splitting numbers into parts
3334
3335 // start speaking at a specified word position in the text?
3336 count_words++;
3337 if(skip_words > 0)
3338 {
3339 skip_words--;
3340 if(skip_words == 0)
3341 skipping_text = 0;
3342 }
3343 if(skipping_text)
3344 continue;
3345
3346 current_alphabet = NULL;
3347
3348 // digits should have been converted to Latin alphabet ('0' to '9')
3349 word = pw = &sbuf[words[ix].start];
3350
3351 if(iswdigit(word[0]) && (tr->langopts.break_numbers != BREAK_THOUSANDS))
3352 {
3353 // Languages with 100000 numbers. Remove thousands separators so that we can insert them again later
3354 pn = number_buf;
3355 while(pn < &number_buf[sizeof(number_buf)-20])
3356 {
3357 if(iswdigit(*pw))
3358 {
3359 *pn++ = *pw++;
3360 }
3361 else if((*pw == tr->langopts.thousands_sep) && (pw[1] == ' ')
3362 && iswdigit(pw[2]) && (pw[3] != ' ') && (pw[4] != ' ')) // don't allow only 1 or 2 digits in the final part
3363 {
3364 pw += 2;
3365 ix++; // skip "word"
3366 }
3367 else
3368 {
3369 nx = pw - word;
3370 memset(word,' ',nx);
3371 nx = pn - number_buf;
3372 memcpy(word,number_buf,nx);
3373 break;
3374 }
3375 }
3376 pw = word;
3377 }
3378
3379 for(n_digits=0; iswdigit(word[n_digits]); n_digits++); // count consecutive digits
3380
3381 if(n_digits > 4)
3382 {
3383 // word is entirely digits, insert commas and break into 3 digit "words"
3384 number_buf[0] = ' ';
3385 pn = &number_buf[1];
3386 nx = n_digits;
3387 nw = 0;
3388
3389 if((n_digits > tr->langopts.max_digits) || (word[0] == '0'))
3390 words[ix].flags |= FLAG_INDIVIDUAL_DIGITS;
3391
3392 while(pn < &number_buf[sizeof(number_buf)-20])
3393 {
3394 if(!IsDigit09(c = *pw++) && (c != tr->langopts.decimal_sep))
3395 break;
3396
3397 *pn++ = c;
3398 nx--;
3399 if((nx > 0) && (tr->langopts.break_numbers & (1 << nx)))
3400 {
3401 memcpy(&num_wtab[nw++], &words[ix], sizeof(WORD_TAB)); // copy the 'words' entry for each word of numbers
3402
3403 if(tr->langopts.thousands_sep != ' ')
3404 {
3405 *pn++ = tr->langopts.thousands_sep;
3406 }
3407 *pn++ = ' ';
3408
3409 if((words[ix].flags & FLAG_INDIVIDUAL_DIGITS) == 0)
3410 {
3411 if(tr->langopts.break_numbers & (1 << (nx-1)))
3412 {
3413 // the next group only has 1 digits, make it three
3414 *pn++ = '0';
3415 *pn++ = '0';
3416 }
3417 if(tr->langopts.break_numbers & (1 << (nx-2)))
3418 {
3419 // the next group only has 2 digits (eg. Indian languages), make it three
3420 *pn++ = '0';
3421 }
3422 }
3423 }
3424 }
3425 pw--;
3426 memcpy(&num_wtab[nw], &words[ix], sizeof(WORD_TAB)*2); // the original number word, and the word after it
3427
3428 for(j=1; j<=nw; j++)
3429 {
3430 num_wtab[j].flags &= ~(FLAG_MULTIPLE_SPACES | FLAG_EMBEDDED); // don't use these flags for subsequent parts when splitting a number
3431 }
3432
3433 // include the next few characters, in case there are an ordinal indicator or other suffix
3434 memcpy(pn, pw, 16);
3435 pn[16] = 0;
3436 nw = 0;
3437
3438 for(pw = &number_buf[1]; pw < pn;)
3439 {
3440 // keep wflags for each part, for FLAG_HYPHEN_AFTER
3441 dict_flags = TranslateWord2(tr, pw, &num_wtab[nw++], words[ix].pre_pause,0 );
3442 while(*pw++ != ' ');
3443 words[ix].pre_pause = 0;
3444 }
3445 }
3446 else
3447 {
3448 pre_pause = 0;
3449
3450 dict_flags = TranslateWord2(tr, word, &words[ix], words[ix].pre_pause, words[ix+1].pre_pause);
3451
3452 if(pre_pause > words[ix+1].pre_pause)
3453 {
3454 words[ix+1].pre_pause = pre_pause;
3455 pre_pause = 0;
3456 }
3457
3458 if(dict_flags & FLAG_SPELLWORD)
3459 {
3460 // redo the word, speaking single letters
3461 for(pw = word; *pw != ' ';)
3462 {
3463 memset(number_buf,' ',9);
3464 nx = utf8_in(&c_temp, pw);
3465 memcpy(&number_buf[2],pw,nx);
3466 TranslateWord2(tr, &number_buf[2], &words[ix], 0, 0 );
3467 pw += nx;
3468 }
3469 }
3470
3471 if((dict_flags & (FLAG_ALLOW_DOT | FLAG_NEEDS_DOT)) && (ix == word_count - 1 - dictionary_skipwords) && (terminator & CLAUSE_DOT))
3472 {
3473 // probably an abbreviation such as Mr. or B. rather than end of sentence
3474 clause_pause = 10;
3475 tone = 4;
3476 }
3477 }
3478
3479 if(dict_flags & FLAG_SKIPWORDS)
3480 {
3481 // dictionary indicates skip next word(s)
3482 while(dictionary_skipwords > 0)
3483 {
3484 words[ix+dictionary_skipwords].flags |= FLAG_DELETE_WORD;
3485 dictionary_skipwords--;
3486 }
3487 }
3488 }
3489
3490 if(embedded_read < embedded_ix)
3491 {
3492 // any embedded commands not yet processed?
3493 Word_EmbeddedCmd();
3494 }
3495
3496 for(ix=0; ix<2; ix++)
3497 {
3498 // terminate the clause with 2 PAUSE phonemes
3499 PHONEME_LIST2 *p2;
3500 p2 = &ph_list2[n_ph_list2 + ix];
3501 p2->phcode = phonPAUSE;
3502 p2->stresslevel = 0;
3503 p2->sourceix = source_index;
3504 p2->synthflags = 0;
3505 }
3506 n_ph_list2 += 2;
3507
3508 if(count_words == 0)
3509 {
3510 clause_pause = 0;
3511 }
3512 if(Eof() && ((word_count == 0) || (option_endpause==0)))
3513 {
3514 clause_pause = 10;
3515 }
3516
3517 MakePhonemeList(tr, clause_pause, new_sentence2);
3518 phoneme_list[N_PHONEME_LIST].ph = NULL; // recognize end of phoneme_list array, in Generate()
3519 phoneme_list[N_PHONEME_LIST].sourceix = 1;
3520
3521 if(embedded_count) // ???? is this needed
3522 {
3523 phoneme_list[n_phoneme_list-2].synthflags = SFLAG_EMBEDDED;
3524 embedded_list[embedded_ix-1] |= 0x80;
3525 embedded_list[embedded_ix] = 0x80;
3526 }
3527
3528
3529 prev_clause_pause = clause_pause;
3530
3531 if(tone_out != NULL)
3532 *tone_out = tone;
3533
3534 new_sentence = 0;
3535 if(terminator & CLAUSE_BIT_SENTENCE)
3536 {
3537 new_sentence = 1; // next clause is a new sentence
3538 }
3539
3540
3541 if(voice_change != NULL)
3542 {
3543 // return new voice name if an embedded voice change command terminated the clause
3544 if(terminator & CLAUSE_BIT_VOICE)
3545 *voice_change = voice_change_name;
3546 else
3547 *voice_change = NULL;
3548 }
3549
3550 if(Eof() || (vp_input==NULL))
3551 return(NULL);
3552
3553 if(option_multibyte == espeakCHARS_WCHAR)
3554 return((void *)p_wchar_input);
3555 else
3556 return((void *)p_textinput);
3557 } // end of TranslateClause
3558
3559
3560
3561
3562
InitText(int control)3563 void InitText(int control)
3564 {//=======================
3565 count_sentences = 0;
3566 count_words = 0;
3567 end_character_position = 0;
3568 skip_sentences = 0;
3569 skip_marker[0] = 0;
3570 skip_words = 0;
3571 skip_characters = 0;
3572 skipping_text = 0;
3573 new_sentence = 1;
3574
3575 prev_clause_pause = 0;
3576
3577 option_sayas = 0;
3578 option_sayas2 = 0;
3579 option_emphasis = 0;
3580 word_emphasis = 0;
3581 embedded_flag = 0;
3582
3583 InitText2();
3584
3585 if((control & espeakKEEP_NAMEDATA) == 0)
3586 {
3587 InitNamedata();
3588 }
3589 }
3590
3591