1 /***************************************************************************
2  *   Copyright (C) 2005 to 2014 by Jonathan Duddington                     *
3  *   email: jonsd@users.sourceforge.net                                    *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 3 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, see:                                 *
17  *               <http://www.gnu.org/licenses/>.                           *
18  ***************************************************************************/
19 
20 #include "StdAfx.h"
21 
22 #include <stdio.h>
23 #include <ctype.h>
24 #include <stdlib.h>
25 #include <string.h>
26 
27 #include <wctype.h>
28 #include <wchar.h>
29 
30 #include "speak_lib.h"
31 #include "speech.h"
32 #include "phoneme.h"
33 #include "synthesize.h"
34 #include "voice.h"
35 #include "translate.h"
36 
37 #define WORD_STRESS_CHAR   '*'
38 
39 
// ---- Global translator state and user-settable options ----

Translator *translator = NULL;    // the main translator
Translator *translator2 = NULL;   // secondary translator for certain words
static char translator2_language[20] = {0};   // NOTE(review): presumably the language name loaded into translator2 - confirm

FILE *f_trans = NULL;     // phoneme output text
int option_tone2 = 0;
int option_tone_flags = 0;   // bit 8=emphasize allcaps, bit 9=emphasize penultimate stress
int option_phonemes = 0;     // when 2, ChangeEquivalentPhonemes() traces its substitutions to f_trans
int option_phoneme_events = 0;
int option_quiet = 0;
int option_endpause = 0;  // suppress pause after end of text
int option_capitals = 0;
int option_punctuation = 0;
int option_sayas = 0;     // TranslateWord() compares this with SAYAS_KEY and tests bit 0x10 for the spelling modes
static int option_sayas2 = 0;  // used in translate_clause()
static int option_emphasis = 0;  // 0=normal, 1=normal, 2=weak, 3=moderate, 4=strong
int option_ssml = 0;
int option_phoneme_input = 0;  // allow [[phonemes]] in input
int option_phoneme_variants = 0;  // 0= don't display phoneme variant mnemonics
int option_wordgap = 0;

static int count_sayas_digits;
int skip_sentences;
int skip_words;
int skip_characters;
char skip_marker[N_MARKER_LENGTH];
int skipping_text;   // waiting until word count, sentence count, or named marker is reached
int end_character_position;
int count_sentences;
int count_words;
int clause_start_char;
int clause_start_word;
int new_sentence;
static int word_emphasis = 0;    // set if emphasis level 3 or 4
static int embedded_flag = 0;    // there are embedded commands to be applied to the next phoneme, used in TranslateWord2()

static int prev_clause_pause=0;
static int max_clause_pause = 0;
static int any_stressed_words;
int pre_pause;
ALPHABET *current_alphabet;


// these were previously in translator class
#ifdef PLATFORM_WINDOWS
char word_phonemes[N_WORD_PHONEMES*2];    // longer, because snprintf() is not available
#else
char word_phonemes[N_WORD_PHONEMES];    // a word translated into phoneme codes
#endif
int n_ph_list2;
PHONEME_LIST2 ph_list2[N_PHONEME_LIST];	// first stage of text->phonemes



wchar_t option_punctlist[N_PUNCTLIST]= {0};
char ctrl_embedded = '\001';    // to allow an alternative CTRL for embedded commands
int option_multibyte=espeakCHARS_AUTO;   // 0=auto, 1=utf8, 2=8bit, 3=wchar, 4=16bit

// these are overridden by defaults set in the "speak" file
int option_linelength = 0;

#define N_EMBEDDED_LIST  250
static int embedded_ix;
static int embedded_read;
unsigned int embedded_list[N_EMBEDDED_LIST];

// the source text of a single clause (UTF8 bytes)
static char source[N_TR_SOURCE+40];     // extra space for embedded command & voice change info at end

int n_replace_phonemes;
REPLACE_PHONEMES replace_phonemes[N_REPLACE_PHONEMES];
111 
112 
// brackets, also 0x2014 to 0x201f which don't need to be in this list
// (IsBracket() tests that range directly before searching this list).
// The list is terminated by 0, as required by lookupwchar().
static const unsigned short brackets[] = {
	'(',')','[',']','{','}','<','>','"','\'','`',
	0xab,0xbb,  // double angle brackets
	0x300a,0x300b,  // double angle brackets (ideograph)
	0xe000+'<',  // private usage area
	0
};

// other characters which break a word, but don't produce a pause
// (0-terminated list)
static const unsigned short breaks[] = {'_', 0};
124 
125 // treat these characters as spaces, in addition to iswspace()
126 // static const wchar_t chars_space[] = {0x2500,0x2501,0};  // box drawing horiz
127 
128 
// Translate character codes 0xA0 to 0xFF into their unicode values
// ISO_8859_1 is set as default
// Each table has 0x60 entries; entry [n] is the unicode value for byte code 0xA0+n
// (the trailing comment on each row gives the byte code of its first entry).
static const unsigned short ISO_8859_1[0x60] = {
	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
	0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
	0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // b8
	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, // d8
	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
	0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, // f8
};
145 
// ISO 8859-2: unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
static const unsigned short ISO_8859_2[0x60] = {
	0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, // a0
	0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, // a8
	0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, // b0
	0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, // b8
	0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, // c0
	0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, // c8
	0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, // d0
	0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, // d8
	0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, // e0
	0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, // e8
	0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, // f0
	0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, // f8
};
160 
// ISO 8859-3: unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
// Several entries are 0x0000; presumably these byte codes are unassigned in this charset.
static const unsigned short ISO_8859_3[0x60] = {
	0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, 0x0000, 0x0124, 0x00a7, // a0
	0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, 0x0000, 0x017b, // a8
	0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, // b0
	0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, 0x0000, 0x017c, // b8
	0x00c0, 0x00c1, 0x00c2, 0x0000, 0x00c4, 0x010a, 0x0108, 0x00c7, // c0
	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
	0x0000, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, // d0
	0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, // d8
	0x00e0, 0x00e1, 0x00e2, 0x0000, 0x00e4, 0x010b, 0x0109, 0x00e7, // e0
	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
	0x0000, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, // f0
	0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, // f8
};
175 
// ISO 8859-4: unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
static const unsigned short ISO_8859_4[0x60] = {
	0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, // a0
	0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, // a8
	0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, // b0
	0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, // b8
	0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, // c0
	0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, // c8
	0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
	0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, // d8
	0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, // e0
	0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, // e8
	0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
	0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, // f8
};
190 
// ISO 8859-5 (Cyrillic): unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
static const unsigned short ISO_8859_5[0x60] = {
	0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, // a0  Cyrillic
	0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x00ad, 0x040e, 0x040f, // a8
	0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, // b0
	0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, // b8
	0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, // c0
	0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, // c8
	0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, // d0
	0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, // d8
	0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, // e0
	0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, // e8
	0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, // f0
	0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f, // f8
};
205 
// ISO 8859-7 (Greek): unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
// Some entries are 0x0000; presumably these byte codes are unassigned in this charset.
static const unsigned short ISO_8859_7[0x60] = {
	0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, // a0  Greek
	0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, 0x0000, 0x2015, // a8
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, // b0
	0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, // b8
	0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, // c0
	0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, // c8
	0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, // d0
	0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, // d8
	0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, // e0
	0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, // e8
	0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, // f0
	0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, 0x0000, // f8
};
220 
// ISO 8859-9: unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
// Identical to ISO_8859_1 except for the entries at d0, dd, de, f0, fd and fe.
static const unsigned short ISO_8859_9[0x60] = {
	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, // a0
	0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, // a8
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, // b0
	0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, // b8
	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
	0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, // d0
	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, // d8
	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
	0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, // f0
	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff, // f8
};
235 
// ISO 8859-14 (Welsh): unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
static const unsigned short ISO_8859_14[0x60] = {
	0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, // a0  Welsh
	0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, // a8
	0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, // b0
	0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, // b8
	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // c0
	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // c8
	0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, // d0
	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, // d8
	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, // e0
	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, // e8
	0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, // f0
	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff, // f8
};
250 
// KOI8-R (Russian): unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
// Only byte codes from 0xA0 upwards are covered by this table.
static const unsigned short KOI8_R[0x60] = {
	0x2550, 0x2551, 0x2552, 0x0451, 0x2553, 0x2554, 0x2555, 0x2556, // a0  Russian
	0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d, 0x255e, // a8
	0x255f, 0x2560, 0x2561, 0x0401, 0x2562, 0x2563, 0x2564, 0x2565, // b0
	0x2566, 0x2567, 0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x00a9, // b8
	0x044e, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, // c0
	0x0445, 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, // c8
	0x043f, 0x044f, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, // d0
	0x044c, 0x044b, 0x0437, 0x0448, 0x044d, 0x0449, 0x0447, 0x044a, // d8
	0x042e, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, // e0
	0x0425, 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, // e8
	0x041f, 0x042f, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, // f0
	0x042c, 0x042b, 0x0417, 0x0428, 0x042d, 0x0429, 0x0427, 0x042a, // f8
};
265 
// ISCII: unicode values for byte codes 0xA0 to 0xFF (entry [n] is for byte 0xA0+n)
// The targets are in the Devanagari block (0x09xx) plus ASCII digits;
// entries of 0x0020 (space) presumably mark byte codes with no mapping.
static const unsigned short ISCII[0x60] = {
	0x0020, 0x0901, 0x0902, 0x0903, 0x0905, 0x0906, 0x0907, 0x0908, // a0
	0x0909, 0x090a, 0x090b, 0x090e, 0x090f, 0x0910, 0x090d, 0x0912, // a8
	0x0913, 0x0914, 0x0911, 0x0915, 0x0916, 0x0917, 0x0918, 0x0919, // b0
	0x091a, 0x091b, 0x091c, 0x091d, 0x091e, 0x091f, 0x0920, 0x0921, // b8
	0x0922, 0x0923, 0x0924, 0x0925, 0x0926, 0x0927, 0x0928, 0x0929, // c0
	0x092a, 0x092b, 0x092c, 0x092d, 0x092e, 0x092f, 0x095f, 0x0930, // c8
	0x0931, 0x0932, 0x0933, 0x0934, 0x0935, 0x0936, 0x0937, 0x0938, // d0
	0x0939, 0x0020, 0x093e, 0x093f, 0x0940, 0x0941, 0x0942, 0x0943, // d8
	0x0946, 0x0947, 0x0948, 0x0945, 0x094a, 0x094b, 0x094c, 0x0949, // e0
	0x094d, 0x093c, 0x0964, 0x0020, 0x0020, 0x0020, 0x0020, 0x0020, // e8
	0x0020, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, // f0
	0x0037, 0x0038, 0x0039, 0x20,   0x20,   0x20,   0x20,   0x20,   // f8
};
280 
// Charset conversion tables, selected by charset number.
// Entries 1 to 17 correspond to ISO 8859-n; numbers for which no dedicated
// table is defined in this file use ISO_8859_1 (as does entry 0).
// NOTE(review): 20 initialisers are listed; N_CHARSETS is presumably 20 - confirm.
const unsigned short *charsets[N_CHARSETS] = {
	ISO_8859_1,      // 0
	ISO_8859_1,      // 1
	ISO_8859_2,      // 2
	ISO_8859_3,      // 3
	ISO_8859_4,      // 4
	ISO_8859_5,      // 5
	ISO_8859_1,      // 6  (no dedicated table)
	ISO_8859_7,      // 7
	ISO_8859_1,      // 8  (no dedicated table)
	ISO_8859_9,      // 9
	ISO_8859_1,      // 10 (no dedicated table)
	ISO_8859_1,      // 11 (no dedicated table)
	ISO_8859_1,      // 12 (no dedicated table)
	ISO_8859_1,      // 13 (no dedicated table)
	ISO_8859_14,     // 14
	ISO_8859_1,      // 15 (no dedicated table)
	ISO_8859_1,      // 16 (no dedicated table)
	ISO_8859_1,      // 17 (no dedicated table)
	KOI8_R,          // 18
	ISCII            // 19
};
303 
// Tables of the relative lengths of vowels, depending on the
// type of the two phonemes that follow
// indexes are the "length_mod" value for the following phonemes
// Each table is 10 x 10: the column is selected by the next phoneme and the
// row by the one after it (see the "next" / "next2" labels).
// Only nine classes are labelled; the 10th row and column are presumably spare.

// use this table if vowel is not the last in the word
static unsigned char length_mods_en[100] = {
	/*  a   ,   t   s   n   d   z   r   N   <- next */
	100,120,100,105,100,110,110,100, 95, 100,  /* a  <- next2 */
	105,120,105,110,125,130,135,115,125, 100,  /* , */
	105,120, 75,100, 75,105,120, 85, 75, 100,  /* t */
	105,120, 85,105, 95,115,120,100, 95, 100,  /* s */
	110,120, 95,105,100,115,120,100,100, 100,  /* n */
	105,120,100,105, 95,115,120,110, 95, 100,  /* d */
	105,120,100,105,105,122,125,110,105, 100,  /* z */
	105,120,100,105,105,122,125,110,105, 100,  /* r */
	105,120, 95,105,100,115,120,110,100, 100,  /* N */
	100,120,100,100,100,100,100,100,100, 100
}; // SPARE
322 
// as above, but for the last syllable in a word
// (same 10 x 10 layout: column = next phoneme, row = the phoneme after that)
static unsigned char length_mods_en0[100] = {
	/*  a   ,   t   s   n   d   z   r    N  <- next */
	100,150,100,105,110,115,110,110,110, 100,  /* a  <- next2 */
	105,150,105,110,125,135,140,115,135, 100,  /* , */
	105,150, 90,105, 90,122,135,100, 90, 100,  /* t */
	105,150,100,105,100,122,135,100,100, 100,  /* s */
	105,150,100,105,105,115,135,110,105, 100,  /* n */
	105,150,100,105,105,122,130,120,125, 100,  /* d */
	105,150,100,105,110,122,125,115,110, 100,  /* z */
	105,150,100,105,105,122,135,120,105, 100,  /* r */
	105,150,100,105,105,115,135,110,105, 100,  /* N */
	100,100,100,100,100,100,100,100,100, 100
}; // SPARE
337 
338 
// A nearly uniform table (same 10 x 10 layout as length_mods_en): almost every
// entry is 110, with 120 in the ',' column and 100 in the 't' column.
// It is used for entries 3, 4 and 5 of length_mod_tabs.
static unsigned char length_mods_equal[100] = {
	/*  a   ,   t   s   n   d   z   r   N   <- next */
	110,120,100,110,110,110,110,110,110, 110,  /* a  <- next2 */
	110,120,100,110,110,110,110,110,110, 110,  /* , */
	110,120,100,110,100,110,110,110,100, 110,  /* t */
	110,120,100,110,110,110,110,110,110, 110,  /* s */
	110,120,100,110,110,110,110,110,110, 110,  /* n */
	110,120,100,110,110,110,110,110,110, 110,  /* d */
	110,120,100,110,110,110,110,110,110, 110,  /* z */
	110,120,100,110,110,110,110,110,110, 110,  /* r */
	110,120,100,110,110,110,110,110,110, 110,  /* N */
	110,120,100,110,110,110,110,110,110, 110
}; // SPARE
352 
353 
// The selectable length modification tables.
// SetLengthMods() indexes this array with (value % 100) and, when non-zero,
// with (value / 100), so valid indexes are 0 to 5.
static unsigned char *length_mod_tabs[6] = {
	length_mods_en,     // 0
	length_mods_en,     // 1
	length_mods_en0,    // 2
	length_mods_equal,  // 3
	length_mods_equal,  // 4
	length_mods_equal   // 5
};
362 
363 
void SetLengthMods(Translator *tr, int value)
{//==========================================
// Select the vowel length modification tables for a translator.
//
// tr:     translator whose langopts.length_mods / langopts.length_mods0 are set
// value:  (value % 100) is the index into length_mod_tabs used for both
//         length_mods and length_mods0.  If (value / 100) is non-zero it is
//         the index of a separate table for length_mods0.
//
// length_mod_tabs has only a few entries, so indexes are range-checked:
// an out-of-range (value % 100) selects table 0, and an out-of-range
// (value / 100) is ignored, instead of reading outside the array.
	const int n_tabs = (int)(sizeof(length_mod_tabs) / sizeof(length_mod_tabs[0]));
	int ix = value % 100;
	int ix0 = value / 100;

	if((ix < 0) || (ix >= n_tabs))
		ix = 0;   // unknown table number (or negative value): use the default table

	tr->langopts.length_mods = length_mod_tabs[ix];
	tr->langopts.length_mods0 = length_mod_tabs[ix];

	if((ix0 > 0) && (ix0 < n_tabs))
	{
		// a different table was requested for length_mods0
		tr->langopts.length_mods0 = length_mod_tabs[ix0];
	}
}
374 
375 
376 
int IsAlpha(unsigned int c)
{//========================
// Replacement for iswalpha() which also accepts some in-word symbols
// (vowel marks, combining accents, etc).
// Returns 1 if c counts as alphabetic, otherwise 0.

	// inclusive character ranges, outside the Indic block, which are accepted
	static const unsigned short extra_ranges[][2] = {
		{0x0300, 0x036f},   // combining accents
		{0x05b0, 0x05c2},   // Hebrew vowel marks
		{0x0605, 0x0605},
		{0x064b, 0x065e},   // arabic vowel marks
		{0x0670, 0x0670},   // arabic vowel mark
		{0x0780, 0x07b1},   // taani/divehi (maldives)
		{0x0f40, 0x0fbc},   // tibetan
		{0x1100, 0x11ff},   // Korean jamo
		{0x2800, 0x28ff},   // braille
		{0x3041, 0xa700},   // Chinese/Japanese.  Should never get here, but Mac OS 10.4's iswalpha seems to be broken, so just make sure
	};
	unsigned int ix;

	if(iswalpha2(c))
		return(1);

	if(c < 0x300)
		return(0);

	if((c >= 0x901) && (c <= 0xdf7))
	{
		// Indic scripts: Devanagari, Tamil, etc
		int indic_alpha = 0;

		if((c & 0x7f) < 0x64)
			indic_alpha = 1;
		else if((c == 0xa70) || (c == 0xa71))
			indic_alpha = 1;   // Gurmukhi: tippi, addak
		else if((c >= 0xd7a) && (c <= 0xd7f))
			indic_alpha = 1;   // malayalam chillu characters

		return(indic_alpha);
	}

	for(ix = 0; ix < (sizeof(extra_ranges) / sizeof(extra_ranges[0])); ix++)
	{
		if((c >= extra_ranges[ix][0]) && (c <= extra_ranges[ix][1]))
			return(1);
	}

	return(0);
}
434 
int IsDigit09(unsigned int c)
{//============================
// Returns 1 only for the ASCII digits '0' to '9', otherwise 0.
	// unsigned subtraction wraps for c < '0', so a single comparison covers both bounds
	return(((c - '0') <= 9u) ? 1 : 0);
}
441 
int IsDigit(unsigned int c)
{//========================
// Returns 1 if c is a digit according to iswdigit(), or lies in the
// range 0x966 to 0x96f; otherwise 0.
	const int in_0966_range = (c >= 0x966) && (c <= 0x96f);

	return((iswdigit(c) || in_0966_range) ? 1 : 0);
}
452 
int IsSpace(unsigned int c)
{//========================
// Is c treated as a space?
// Returns 0 for the character 0, 1 for the two extra ranges below,
// and otherwise the result of iswspace().
	const int box_drawing = (c >= 0x2500) && (c < 0x25a0);       // box drawing characters
	const int unicode_special = (c >= 0xfff9) && (c <= 0xffff);  // unicode specials

	if(c == 0)
		return(0);

	if(box_drawing || unicode_special)
		return(1);

	return(iswspace(c));
}
465 
466 
void DeleteTranslator(Translator *tr)
{//==================================
// Release a translator together with its dictionary data (data_dictlist).
// A NULL translator is accepted and ignored, so callers can release a
// translator pointer without first checking that one was created.
	if(tr == NULL)
		return;

	if(tr->data_dictlist != NULL)
		Free(tr->data_dictlist);
	Free(tr);
}
473 
474 
int lookupwchar(const unsigned short *list,int c)
{//==============================================
// Is the character c in the 0-terminated list ?
// Returns the 1-based position of the first match, or 0 if c is not present.
	const unsigned short *entry;

	for(entry = list; *entry != 0; entry++)
	{
		if(*entry == c)
			return((int)(entry - list) + 1);
	}
	return(0);
}
487 
488 
int lookupwchar2(const unsigned short *list,int c)
{//==============================================
// Replace character c by another character.
// list holds (character, replacement) pairs and ends with a 0 character.
// Returns the replacement from the first matching pair, or 0 = not found.
// (A replacement value of 1 means: delete the character.)
	const unsigned short *pair = list;

	while(pair[0] != 0)
	{
		if(pair[0] == c)
			return(pair[1]);
		pair += 2;
	}
	return(0);
}
502 
503 
IsBracket(int c)504 int IsBracket(int c)
505 {//=================
506 	if((c >= 0x2014) && (c <= 0x201f))
507 		return(1);
508 	return(lookupwchar(brackets,c));
509 }
510 
511 
int utf8_out(unsigned int c, char *buf)
{//====================================
// write a unicode character into a buffer as utf8
// returns the number of bytes written (1 to 4); no terminating 0 is added.
// A character code of 0x110000 or above is written as a single space.
	static const unsigned char lead_bits[4] = {0,0xc0,0xe0,0xf0};
	int n_trail;   // number of continuation bytes after the first byte
	int pos;

	if(c < 0x80)
	{
		buf[0] = c;
		return(1);
	}
	if(c >= 0x110000)
	{
		buf[0] = ' ';      // out of range character code
		return(1);
	}

	n_trail = (c < 0x0800) ? 1 : ((c < 0x10000) ? 2 : 3);

	// first byte: length marker plus the most significant bits
	buf[0] = lead_bits[n_trail] | (c >> (6 * n_trail));

	// continuation bytes: 6 bits each, most significant group first
	for(pos = 1; pos <= n_trail; pos++)
	{
		buf[pos] = 0x80 + ((c >> (6 * (n_trail - pos))) & 0x3f);
	}
	return(n_trail + 1);
}  // end of utf8_out
547 
548 
int utf8_nbytes(const char *buf)
{//=============================
// Returns the number of bytes for the first UTF-8 character in buf,
// judged only from the value of its first byte.
	const unsigned int lead = (unsigned char)buf[0];

	if(lead >= 0xf0)
		return(4);
	if(lead >= 0xe0)
		return(3);
	if(lead >= 0x80)
		return(2);
	return(1);
}
561 
562 
int utf8_in2(int *c, const char *buf, int backwards)
{//=================================================
// Read a unicode character from a UTF8 string
// c:  receives the character code
// buf:  the UTF8 text; if it points at a continuation byte (10xxxxxx) the
//       start of the next/previous character is found first
// backwards: set if we are moving backwards through the UTF8 string
//
// Returns the number of UTF8 bytes used by the character (continuation
// bytes skipped while finding its start are not counted).
//
// If the string ends in the middle of a multi-byte character, decoding stops
// at the terminating 0 and only the bytes actually present are counted, so a
// caller which advances by the return value never steps over the terminator.
	int c1;
	int n_bytes = 0;   // continuation bytes announced by the first byte
	int n_read = 0;    // continuation bytes actually consumed
	static const unsigned char mask[4] = {0xff,0x1f,0x0f,0x07};

	// find the start of the next/previous character
	while((*buf & 0xc0) == 0x80)
	{
		// skip over non-initial bytes of a multi-byte utf8 character
		if(backwards)
			buf--;
		else
			buf++;
	}

	if((c1 = *buf++) & 0x80)
	{
		if((c1 & 0xe0) == 0xc0)
			n_bytes = 1;
		else if((c1 & 0xf0) == 0xe0)
			n_bytes = 2;
		else if((c1 & 0xf8) == 0xf0)
			n_bytes = 3;

		c1 &= mask[n_bytes];
		for(n_read=0; n_read<n_bytes; n_read++)
		{
			if(*buf == 0)
				break;   // truncated character: don't read past the end of the string

			c1 = (c1 << 6) + (*buf++ & 0x3f);
		}
	}
	*c = c1;
	return(n_read+1);
}
603 
604 
int utf8_in(int *c, const char *buf)
{//=================================
// Read a unicode character from a UTF8 string, moving forwards.
// Returns the number of UTF8 bytes used, as reported by utf8_in2().
	const int forwards = 0;   // value of the 'backwards' argument of utf8_in2()

	return(utf8_in2(c, buf, forwards));
}
611 
612 
char *strchr_w(const char *s, int c)
{//=================================
// strchr() for a character code which may be non-ascii.
// Returns NULL for any character code of 0x80 or above,
// otherwise the result of strchr().
	char *found = NULL;

	if(c < 0x80)
		found = strchr((char *)s,c);    // (char *) is needed for Borland compiler

	return(found);
}
620 
621 
int IsAllUpper(const char *word)
{//=============================
// Returns 1 if every character of the word satisfies iswupper2(), 0 otherwise.
// The word ends at a 0 byte or at a byte for which isspace2() is true;
// an empty word gives 1.
	int wc;
	const char *p = word;

	for(;;)
	{
		if((*p == 0) || isspace2(*p))
			return(1);   // reached the end without finding a non-upper-case character

		p += utf8_in(&wc, p);
		if(!iswupper2(wc))
			return(0);
	}
}
633 
634 
static char *SpeakIndividualLetters(Translator *tr, char *word, char *phonemes, int spell_word)
{//============================================================================================
// Translate a word (ended by a space or 0) one character at a time
// by calling TranslateLetter() for each character.
//
// spell_word: more than 1 also sets flag bit 2 (speak character code for
//             unknown letters); more than 2 also sets flag bit 1 (speak 'capital').
//
// Returns a pointer to the end of the word.  If the phonemes start with
// phonSWITCH, they are copied to word_phonemes and NULL is returned.
	int n_letters = 0;
	int letter_flags = 0;
	char *p = word;

	if(spell_word > 2)
		letter_flags = 2;   // speak 'capital'
	if(spell_word > 1)
		letter_flags |= 4;  // speak character code for unknown letters

	for(; (*p != ' ') && (*p != 0); n_letters++)
	{
		// bit 0 of the flags is set for every letter except the first
		p += TranslateLetter(tr, p, phonemes, letter_flags | (n_letters > 0));

		if(phonemes[0] == phonSWITCH)
		{
			// change to another language in order to translate this word
			strcpy(word_phonemes,phonemes);
			return(NULL);
		}
	}

	SetSpellingStress(tr,phonemes,spell_word,n_letters);
	return(p);
}  // end of SpeakIndividualLetters
661 
662 
663 
static int CheckDottedAbbrev(char *word1, WORD_TAB *wtab)
{//=====================================================
// Look for a dotted abbreviation starting at word1: single letters which are
// each followed by " . " in the text (letter, space, dot, space).
//
// If more than one letter is found, the letters are moved together at the
// start of word1, the text they used to occupy is filled with spaces, and
// dictionary_skipwords is set to (count - 1)*2.
//
// wtab: not used.
// Returns the number of letters found.
//
// At most sizeof(word_buf) bytes of letters are collected; a longer sequence
// is cut off at that point instead of overflowing the local buffer.
	int wc;
	int count = 0;
	int nbytes;
	int ok;
	int ix;
	char *word;
	char *wbuf;
	char word_buf[80];

	(void)wtab;

	word = word1;
	wbuf = word_buf;

	for(;;)
	{
		ok = 0;
		nbytes = utf8_in(&wc, word);
		if((word[nbytes] == ' ') && IsAlpha(wc))
		{
			if(word[nbytes+1] == '.')
			{
				if(word[nbytes+2] == ' ')
					ok = 1;    // letter followed by " . ", more letters may follow
				else if(word[nbytes+2] =='\'')
				{
					nbytes += 2;   // delete the final dot (eg. u.s.a.'s)
					ok = 2;
				}
			}
			else if(count > 0)
				ok = 2;    // a final letter which is not followed by a dot
		}

		if(ok == 0)
			break;

		if((int)(wbuf - word_buf) + nbytes > (int)sizeof(word_buf))
			break;   // no room for this letter, end the abbreviation here

		memcpy(wbuf, word, nbytes);
		wbuf += nbytes;

		count++;

		if(ok == 2)
		{
			word += nbytes;
			break;
		}

		word += (nbytes + 3);   // skip the letter and the " . " which follows it
	}

	if(count > 1)
	{
		ix = wbuf - word_buf;
		memcpy(word1, word_buf, ix);
		while(&word1[ix] < word)
			word1[ix++] = ' ';
		dictionary_skipwords = (count - 1)*2;
	}
	return(count);
}  // end of CheckDottedAbbrev
726 
727 
728 extern char *phondata_ptr;
729 
int ChangeEquivalentPhonemes(Translator *tr, int lang2, char *phonemes)
{//====================================================================
// Replace phonemes which were produced using another language's phoneme
// table by their equivalents in the original language.
//
// tr:  the original language
// lang2:  phoneme table number for the temporary language
// phonemes: the phonemes to be replaced; overwritten with the result
//
// Returns 1 if an equivalence table for lang2 was found and applied,
// 0 if there is none (phonemes is then left unchanged).
//
// Layout of an equivalence table, as read here:
//   byte 0     phoneme table number of the other language (0 ends the list of tables)
//   byte 1     non-zero: remove stress marks, then call SetWordStress() on the result
//   bytes 2,3  size of this table in 4-byte words (high byte first)
//   byte 8...  entries: a phoneme code, then its 0-terminated replacement
//              string; a 0 phoneme code ends the entries
//
// The result is built in a local buffer of N_WORD_PHONEMES bytes.  If the
// replacements would not fit, the remaining phonemes are dropped rather than
// writing past the end of that buffer.
// NOTE(review): assumes the caller's phonemes buffer holds at least
// N_WORD_PHONEMES bytes - confirm against the callers.

	int ix;
	int len = 0;
	char  phon;
	char *p;
	unsigned char *pb;
	char *eqlist;
	char *p_out;
	char *p_out_end;
	char *p_in;
	int  remove_stress = 0;
	char phonbuf[N_WORD_PHONEMES];

	// has a phoneme equivalence table been specified for this language pair?
	if((ix = phoneme_tab_list[tr->phoneme_tab_ix].equivalence_tables) == 0)
		return(0);

	pb = (unsigned char *)&phondata_ptr[ix];

	for(;;)
	{
		if(pb[0] == 0)
			return(0);   // table not found

		if(pb[0] == lang2)
			break;

		len = (pb[2] << 8) + pb[3];   // size of this table in words
		pb += (len * 4);
	}
	remove_stress = pb[1];

	if(option_phonemes == 2)
	{
		DecodePhonemes(phonemes, phonbuf);
		fprintf(f_trans,"(%s) %s  -> (%s) ", phoneme_tab_list[lang2].name, phonbuf, phoneme_tab_list[tr->phoneme_tab_ix].name);
	}

	p_in = phonemes;
	eqlist = (char *)&pb[8];
	p_out = phonbuf;
	p_out_end = &phonbuf[sizeof(phonbuf) - 1];   // keep the last byte for the terminating 0

	while((phon = *p_in++) != 0)
	{
		if(remove_stress && ((phon & 0xff) < phonSTRESS_PREV))
			continue;   // remove stress marks

		// is there a translation for this phoneme code?
		p = eqlist;
		while(*p != 0)
		{
			len = strlen(&p[1]);
			if(*p == phon)
				break;
			p += (len + 2);
		}

		if(*p != 0)
		{
			// translation found, len is the length of the replacement string
			if(len > (int)(p_out_end - p_out))
				break;   // no room for the replacement
			memcpy(p_out, &p[1], len);
			p_out += len;
		}
		else
		{
			// no translation found, keep the phoneme unchanged
			if(p_out >= p_out_end)
				break;   // no room
			*p_out++ = phon;
		}
	}
	*p_out = 0;

	if(remove_stress)
	{
		SetWordStress(tr, phonbuf, NULL, -1, 0);
	}

	strcpy(phonemes, phonbuf);

	if(option_phonemes == 2)
	{
		SelectPhonemeTable(tr->phoneme_tab_ix);
		DecodePhonemes(phonemes, phonbuf);
		fprintf(f_trans,"%s\n\n", phonbuf);
	}
	return(1);
}  // end of ChangeEquivalentPhonemes
817 
818 
819 
820 
TranslateWord(Translator * tr,char * word_start,int next_pause,WORD_TAB * wtab,char * word_out)821 int TranslateWord(Translator *tr, char *word_start, int next_pause, WORD_TAB *wtab, char *word_out)
822 {//==================================================================================================
823 // word1 is terminated by space (0x20) character
824 
825 	char *word1;
826 	int word_length;
827 	int ix;
828 	char *p;
829 	int pfix;
830 	int n_chars;
831 	unsigned int dictionary_flags[2];
832 	unsigned int dictionary_flags2[2];
833 	int end_type=0;
834 	int end_type1=0;
835 	int prefix_type=0;
836 	int prefix_stress;
837 	char *wordx;
838 	char phonemes[N_WORD_PHONEMES];
839 	char phonemes2[N_WORD_PHONEMES];
840 	char prefix_phonemes[N_WORD_PHONEMES];
841 	char unpron_phonemes[N_WORD_PHONEMES];
842 	char end_phonemes[N_WORD_PHONEMES];
843 	char end_phonemes2[N_WORD_PHONEMES];
844 	char word_copy[N_WORD_BYTES];
845 	char word_copy2[N_WORD_BYTES];
846 	int word_copy_length;
847 	char prefix_chars[0x3f + 2];
848 	int found=0;
849 	int end_flags;
850 	int c_temp;   // save a character byte while we temporarily replace it with space
851 	int first_char;
852 	int last_char = 0;
853 	int add_plural_suffix = 0;
854 	int prefix_flags = 0;
855 	int more_suffixes;
856 	int confirm_prefix;
857 	int spell_word;
858 	int stress_bits;
859 	int emphasize_allcaps = 0;
860 	int wflags;
861 	int wmark;
862 	int was_unpronouncable = 0;
863 	int loopcount;
864 	WORD_TAB wtab_null[8];
865 
866 	// translate these to get pronunciations of plural 's' suffix (different forms depending on
867 	// the preceding letter
868 	static char word_zz[4] = {0,'z','z',0};
869 	static char word_iz[4] = {0,'i','z',0};
870 	static char word_ss[4] = {0,'s','s',0};
871 
872 	if(wtab == NULL)
873 	{
874 		memset(wtab_null, 0, sizeof(wtab_null));
875 		wtab = wtab_null;
876 	}
877 	wflags = wtab->flags;
878 	wmark = wtab->wmark;
879 
880 	dictionary_flags[0] = 0;
881 	dictionary_flags[1] = 0;
882 	dictionary_flags2[0] = 0;
883 	dictionary_flags2[1] = 0;
884 	dictionary_skipwords = 0;
885 
886 	phonemes[0] = 0;
887 	unpron_phonemes[0] = 0;
888 	prefix_phonemes[0] = 0;
889 	end_phonemes[0] = 0;
890 
891 	if(tr->data_dictlist == NULL)
892 	{
893 		// dictionary is not loaded
894 		word_phonemes[0] = 0;
895 		return(0);
896 	}
897 
898 	// count the length of the word
899 	word1 = word_start;
900 	if(*word1 == ' ') word1++;   // possibly a dot was replaced by space:  $dot
901 	wordx = word1;
902 
903 	utf8_in(&first_char,wordx);
904 	word_length = 0;
905 	while((*wordx != 0) && (*wordx != ' '))
906 	{
907 		wordx += utf8_in(&last_char,wordx);
908 		word_length++;
909 	}
910 
911 	word_copy_length = wordx - word_start;
912 	if(word_copy_length >= N_WORD_BYTES)
913 		word_copy_length = N_WORD_BYTES-1;
914 	memcpy(word_copy2, word_start, word_copy_length);
915 
916 	spell_word = 0;
917 
918 	if((word_length == 1) && (wflags & FLAG_TRANSLATOR2))
919 	{
920 		// retranslating a 1-character word using a different language, say its name
921 		utf8_in(&c_temp, wordx+1);  // the next character
922 		if(!IsAlpha(c_temp) || (AlphabetFromChar(last_char) != AlphabetFromChar(c_temp)))
923 			spell_word = 1;
924 	}
925 
926 	if(option_sayas == SAYAS_KEY)
927 	{
928 		if(word_length == 1)
929 			spell_word = 4;
930 		else
931 		{
932 			// is there a translation for this keyname ?
933 			word1--;
934 			*word1 = '_';   // prefix keyname with '_'
935 			found = LookupDictList(tr, &word1, phonemes, dictionary_flags, 0, wtab);
936 		}
937 	}
938 
939 	// try an initial lookup in the dictionary list, we may find a pronunciation specified, or
940 	// we may just find some flags
941 	if(option_sayas & 0x10)
942 	{
943 		// SAYAS_CHAR, SAYAS_GYLPH, or SAYAS_SINGLE_CHAR
944 		spell_word = option_sayas & 0xf;    // 2,3,4
945 	}
946 	else
947 	{
948 		if(!found)
949 			found = LookupDictList(tr, &word1, phonemes, dictionary_flags, FLAG_ALLOW_TEXTMODE, wtab);   // the original word
950 
951 
952 		if((dictionary_flags[0] & (FLAG_ALLOW_DOT || FLAG_NEEDS_DOT)) && (wordx[1] == '.'))
953 		{
954 			wordx[1] = ' ';   // remove a Dot after this word
955 		}
956 
957 		if(dictionary_flags[0] & FLAG_TEXTMODE)
958 		{
959 			if(word_out != NULL)
960 				strcpy(word_out, word1);
961 
962 			first_char = word1[0];
963 			stress_bits = dictionary_flags[0] & 0x7f;
964 			found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, 0, wtab);   // the text replacement
965 			if(dictionary_flags2[0]!=0)
966 			{
967 				dictionary_flags[0] = dictionary_flags2[0];
968 				dictionary_flags[1] = dictionary_flags2[1];
969 				if(stress_bits != 0)
970 				{
971 					// keep any stress information from the original word
972 					dictionary_flags[0] = (dictionary_flags[0] & ~0x7f) | stress_bits;
973 				}
974 			}
975 		}
976 		else if((found==0) && (dictionary_flags[0] & FLAG_SKIPWORDS) && !(dictionary_flags[0] & FLAG_ABBREV))
977 		{
978 			// grouped words, but no translation.  Join the words with hyphens.
979 			wordx = word1;
980 			ix = 0;
981 			while(ix < dictionary_skipwords)
982 			{
983 				if(*wordx == ' ')
984 				{
985 					*wordx = '-';
986 					ix++;
987 				}
988 				wordx++;
989 			}
990 		}
991 
992 		if((word_length == 1) && (dictionary_skipwords == 0))
993 		{
994 			// is this a series of single letters separated by dots?
995 			if(CheckDottedAbbrev(word1, wtab))
996 			{
997 				dictionary_flags[0] = 0;
998 				dictionary_flags[1] = 0;
999 				spell_word = 1;
1000 				if(dictionary_skipwords)
1001 					dictionary_flags[0] = FLAG_SKIPWORDS;
1002 			}
1003 		}
1004 
1005 
1006 		// if textmode, LookupDictList() replaces word1 by the new text and returns found=0
1007 
1008 		if(phonemes[0] == phonSWITCH)
1009 		{
1010 			// change to another language in order to translate this word
1011 			strcpy(word_phonemes,phonemes);
1012 			return(0);
1013 		}
1014 
1015 		if((wmark > 0) && (wmark < 8))
1016 		{
1017 			// the stressed syllable has been specified in the text  (TESTING)
1018 			dictionary_flags[0] = (dictionary_flags[0] & ~0xf) | wmark;
1019 		}
1020 
1021 		if(!found && (dictionary_flags[0] & FLAG_ABBREV))
1022 		{
1023 			// the word has $abbrev flag, but no pronunciation specified.  Speak as individual letters
1024 			spell_word = 1;
1025 		}
1026 
1027 		if(!found && iswdigit(first_char))
1028 		{
1029 			Lookup(tr,"_0lang",word_phonemes);
1030 			if(word_phonemes[0] == phonSWITCH)
1031 				return(0);
1032 
1033 			if((tr->langopts.numbers2 & NUM2_ENGLISH_NUMERALS) && !(wtab->flags & FLAG_CHAR_REPLACED))
1034 			{
1035 				// for this language, speak English numerals (0-9) with the English voice
1036 				sprintf(word_phonemes,"%c",phonSWITCH);
1037 				return(0);
1038 			}
1039 
1040 			found = TranslateNumber(tr, word1, phonemes, dictionary_flags, wtab, 0);
1041 		}
1042 
1043 		if(!found && ((wflags & FLAG_UPPERS) != FLAG_FIRST_UPPER))
1044 		{
1045 			// either all upper or all lower case
1046 
1047 			if((tr->langopts.numbers & NUM_ROMAN) || ((tr->langopts.numbers & NUM_ROMAN_CAPITALS) && (wflags & FLAG_ALL_UPPER)))
1048 			{
1049 				if((wflags & FLAG_LAST_WORD) || !(wtab[1].flags & FLAG_NOSPACE))
1050 				{
1051 					// don't use Roman number if this word is not separated from the next word (eg. "XLTest")
1052 					if((found = TranslateRoman(tr, word1, phonemes, wtab)) != 0)
1053 						dictionary_flags[0] |= FLAG_ABBREV;   // prevent emphasis if capitals
1054 				}
1055 			}
1056 		}
1057 
1058 		if((wflags & FLAG_ALL_UPPER) && (word_length > 1)&& iswalpha2(first_char))
1059 		{
1060 			if((option_tone_flags & OPTION_EMPHASIZE_ALLCAPS) && !(dictionary_flags[0] & FLAG_ABBREV))
1061 			{
1062 				// emphasize words which are in capitals
1063 				emphasize_allcaps = FLAG_EMPHASIZED;
1064 			}
1065 			else if(!found && !(dictionary_flags[0] &  FLAG_SKIPWORDS) && (word_length<4) && (tr->clause_lower_count > 3)
1066 					&& (tr->clause_upper_count <= tr->clause_lower_count))
1067 			{
1068 				// An upper case word in a lower case clause. This could be an abbreviation.
1069 				spell_word = 1;
1070 			}
1071 		}
1072 	}
1073 
1074 	if(spell_word > 0)
1075 	{
1076 		// Speak as individual letters
1077 		phonemes[0] = 0;
1078 		end_type = 0;
1079 
1080 		if(SpeakIndividualLetters(tr, word1, phonemes, spell_word) == NULL)
1081 		{
1082 			if(word_length > 1)
1083 				return(FLAG_SPELLWORD);  // a mixture of languages, retranslate as individual letters, separated by spaces
1084 			if(phonemes[0] == phonSWITCH)
1085 			{
1086 // problem with espeak -vbg "b.c.d.e.f"
1087 			}
1088 			return(0);
1089 		}
1090 		strcpy(word_phonemes, phonemes);
1091 		if(wflags & FLAG_TRANSLATOR2)
1092 			return(0);
1093 		return(dictionary_flags[0] & FLAG_SKIPWORDS);  // for "b.c.d"
1094 	}
1095 	else if(found == 0)
1096 	{
1097 		int posn;
1098 		int non_initial;
1099 		int length;
1100 		// word's pronunciation is not given in the dictionary list, although
1101 		// dictionary_flags may have ben set there
1102 
1103 		posn = 0;
1104 		non_initial = 0;
1105 		length = 999;
1106 		wordx = word1;
1107 
1108 		while(((length < 3) && (length > 0))|| (word_length > 1 && Unpronouncable(tr, wordx, posn)))
1109 		{
1110 			// This word looks "unpronouncable", so speak letters individually until we
1111 			// find a remainder that we can pronounce.
1112 			was_unpronouncable = FLAG_WAS_UNPRONOUNCABLE;
1113 			emphasize_allcaps = 0;
1114 
1115 			if(wordx[0] == '\'')
1116 				break;
1117 
1118 			if(posn > 0)
1119 				non_initial = 1;
1120 
1121 			wordx += TranslateLetter(tr, wordx, unpron_phonemes, non_initial);
1122 			posn++;
1123 			if(unpron_phonemes[0] == phonSWITCH)
1124 			{
1125 				// change to another language in order to translate this word
1126 				strcpy(word_phonemes,unpron_phonemes);
1127 				if(strcmp(&unpron_phonemes[1],"en")==0)
1128 					return(FLAG_SPELLWORD);   // _^_en must have been set in TranslateLetter(), not *_rules which uses only _^_
1129 				return(0);
1130 			}
1131 
1132 #ifdef deleted
1133 			p = &wordx[word_length-3];    // this looks wrong.  Doesn't consider multi-byte chars.
1134 			if(memcmp(p,"'s ",3) == 0)
1135 			{
1136 				// remove a 's suffix and pronounce this separately (not as an individual letter)
1137 				add_plural_suffix = 1;
1138 				p[0] = ' ';
1139 				p[1] = ' ';
1140 				last_char = p[-1];
1141 			}
1142 #endif
1143 			length=0;
1144 			while(wordx[length] != ' ') length++;
1145 		}
1146 		SetSpellingStress(tr,unpron_phonemes,0,posn);
1147 
1148 		// anything left ?
1149 		if(*wordx != ' ')
1150 		{
1151 			if((unpron_phonemes[0] != 0) && (wordx[0] != '\''))
1152 			{
1153 				// letters which have been spoken individually from affecting the pronunciation of the pronuncable part
1154 				wordx[-1] = ' ';
1155 			}
1156 
1157 			// Translate the stem
1158 			end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
1159 
1160 			if(phonemes[0] == phonSWITCH)
1161 			{
1162 				// change to another language in order to translate this word
1163 				strcpy(word_phonemes,phonemes);
1164 				return(0);
1165 			}
1166 
1167 #ifdef deleted
1168 // ?? allow $unpr while translating rules, not just on initial FLAG_UNPRON_TEST
1169 if(end_type & SUFX_UNPRON)
1170 {
1171 	phonemes[0] = 0;  // discard and retranslate as individual letters
1172 	SpeakIndividualLetters(tr, wordx, phonemes, 0);
1173 	strcpy(word_phonemes, phonemes);
1174 	return(0);
1175 }
1176 #endif
1177 
1178 			if((phonemes[0] == 0) && (end_phonemes[0] == 0))
1179 			{
1180 				int wc;
1181 				// characters not recognised, speak them individually
1182 				// ?? should we say super/sub-script numbers and letters here?
1183 				utf8_in(&wc, wordx);
1184 				if((word_length == 1) && (IsAlpha(wc) || IsSuperscript(wc)))
1185 				{
1186 					if((wordx = SpeakIndividualLetters(tr, wordx, phonemes, spell_word)) == NULL)
1187 					{
1188 						return(0);
1189 					}
1190 					strcpy(word_phonemes, phonemes);
1191 					return(0);
1192 				}
1193 			}
1194 
1195 			c_temp = wordx[-1];
1196 
1197 			found = 0;
1198 			confirm_prefix = 1;
1199 			for (loopcount = 0; (loopcount < 50) && (end_type & SUFX_P); loopcount++)
1200 			{
1201 				// Found a standard prefix, remove it and retranslate
1202 				// loopcount guards against an endless loop
1203 				if(confirm_prefix && !(end_type & SUFX_B))
1204 				{
1205 					int end2;
1206 					char end_phonemes2[N_WORD_PHONEMES];
1207 
1208 					// remove any standard suffix and confirm that the prefix is still recognised
1209 					phonemes2[0] = 0;
1210 					end2 = TranslateRules(tr, wordx, phonemes2, N_WORD_PHONEMES, end_phonemes2, wflags|FLAG_NO_PREFIX|FLAG_NO_TRACE, dictionary_flags);
1211 					if(end2)
1212 					{
1213 						RemoveEnding(tr, wordx, end2, word_copy);
1214 						end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags|FLAG_NO_TRACE, dictionary_flags);
1215 						memcpy(wordx,word_copy,strlen(word_copy));
1216 						if((end_type & SUFX_P) == 0)
1217 						{
1218 							// after removing the suffix, the prefix is no longer recognised.
1219 							// Keep the suffix, but don't use the prefix
1220 							end_type = end2;
1221 							strcpy(phonemes,phonemes2);
1222 							strcpy(end_phonemes,end_phonemes2);
1223 							if(option_phonemes == 2)
1224 							{
1225 								DecodePhonemes(end_phonemes,end_phonemes2);
1226 								fprintf(f_trans,"  suffix [%s]\n\n",end_phonemes2);
1227 							}
1228 						}
1229 						confirm_prefix = 0;
1230 						continue;
1231 					}
1232 				}
1233 
1234 				prefix_type = end_type;
1235 
1236 				if(prefix_type & SUFX_V)
1237 				{
1238 					tr->expect_verb = 1;      // use the verb form of the word
1239 				}
1240 
1241 				wordx[-1] = c_temp;
1242 
1243 				if((prefix_type & SUFX_B) == 0)
1244 				{
1245 					for(ix=(prefix_type & 0xf); ix>0; ix--)    // num. of characters to remove
1246 					{
1247 						wordx++;
1248 						while((*wordx & 0xc0) == 0x80) wordx++;  // for multibyte characters
1249 					}
1250 				}
1251 				else
1252 				{
1253 					pfix = 1;
1254 					prefix_chars[0] = 0;
1255 					n_chars = prefix_type & 0x3f;
1256 
1257 					for(ix=0; ix < n_chars; ix++)    // num. of bytes to remove
1258 					{
1259 						prefix_chars[pfix++] = *wordx++;
1260 
1261 						if((prefix_type & SUFX_B) && (ix == (n_chars-1)))
1262 						{
1263 							prefix_chars[pfix-1] = 0;  // discard the last character of the prefix, this is the separator character
1264 						}
1265 					}
1266 					prefix_chars[pfix] = 0;
1267 				}
1268 				c_temp = wordx[-1];
1269 				wordx[-1] = ' ';
1270 				confirm_prefix = 1;
1271 				wflags |= FLAG_PREFIX_REMOVED;
1272 
1273 				if(prefix_type & SUFX_B)
1274 				{
1275 // SUFX_B is used for Turkish, tr_rules contains " ' (Pb"
1276 					// examine the prefix part
1277 					char *wordpf;
1278 					char prefix_phonemes2[12];
1279 
1280 					strncpy0(prefix_phonemes2,end_phonemes,sizeof(prefix_phonemes2));
1281 					wordpf = &prefix_chars[1];
1282 					strcpy(prefix_phonemes, phonemes);
1283 
1284 					// look for stress marker or $abbrev
1285 					found = LookupDictList(tr, &wordpf, phonemes, dictionary_flags, 0, wtab);
1286 					if(found)
1287 					{
1288 						strcpy(prefix_phonemes, phonemes);
1289 					}
1290 					if(dictionary_flags[0] & FLAG_ABBREV)
1291 					{
1292 						prefix_phonemes[0] = 0;
1293 						SpeakIndividualLetters(tr, wordpf, prefix_phonemes, 1);
1294 					}
1295 				}
1296 				else
1297 				{
1298 					strcat(prefix_phonemes,end_phonemes);
1299 				}
1300 				end_phonemes[0] = 0;
1301 
1302 				end_type = 0;
1303 				found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, SUFX_P, wtab);   // without prefix
1304 				if(dictionary_flags[0]==0)
1305 				{
1306 					dictionary_flags[0] = dictionary_flags2[0];
1307 					dictionary_flags[1] = dictionary_flags2[1];
1308 				}
1309 				else
1310 					prefix_flags = 1;
1311 				if(found == 0)
1312 				{
1313 					end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags & (FLAG_HYPHEN_AFTER | FLAG_PREFIX_REMOVED), dictionary_flags);
1314 
1315 					if(phonemes[0] == phonSWITCH)
1316 					{
1317 						// change to another language in order to translate this word
1318 						wordx[-1] = c_temp;
1319 						strcpy(word_phonemes,phonemes);
1320 						return(0);
1321 					}
1322 				}
1323 			}
1324 
1325 
1326 
1327 
1328 			if((end_type != 0) && !(end_type & SUFX_P))
1329 			{
1330 				end_type1 = end_type;
1331 				strcpy(phonemes2,phonemes);
1332 
1333 				// The word has a standard ending, re-translate without this ending
1334 				end_flags = RemoveEnding(tr, wordx, end_type, word_copy);
1335 				more_suffixes = 1;
1336 
1337 				while(more_suffixes)
1338 				{
1339 					more_suffixes = 0;
1340 					phonemes[0] = 0;
1341 
1342 					if(prefix_phonemes[0] != 0)
1343 					{
1344 						// lookup the stem without the prefix removed
1345 						wordx[-1] = c_temp;
1346 						found = LookupDictList(tr, &word1, phonemes, dictionary_flags2, end_flags, wtab);  // include prefix, but not suffix
1347 						wordx[-1] = ' ';
1348 						if(phonemes[0] == phonSWITCH)
1349 						{
1350 							// change to another language in order to translate this word
1351 							memcpy(wordx,word_copy,strlen(word_copy));
1352 							strcpy(word_phonemes,phonemes);
1353 							return(0);
1354 						}
1355 						if(dictionary_flags[0]==0)
1356 						{
1357 							dictionary_flags[0] = dictionary_flags2[0];
1358 							dictionary_flags[1] = dictionary_flags2[1];
1359 						}
1360 						if(found)
1361 							prefix_phonemes[0] = 0;  // matched whole word, don't need prefix now
1362 
1363 						if((found==0) && (dictionary_flags2[0] != 0))
1364 							prefix_flags = 1;
1365 					}
1366 					if(found == 0)
1367 					{
1368 						found = LookupDictList(tr, &wordx, phonemes, dictionary_flags2, end_flags, wtab);  // without prefix and suffix
1369 						if(phonemes[0] == phonSWITCH)
1370 						{
1371 							// change to another language in order to translate this word
1372 							memcpy(wordx,word_copy,strlen(word_copy));
1373 							strcpy(word_phonemes,phonemes);
1374 							return(0);
1375 						}
1376 
1377 						if(dictionary_flags2[0] & FLAG_ABBREV)
1378 						{
1379 							// Removing the suffix leaves a word which should be spoken as individual letters
1380 							// Not yet implemented
1381 						}
1382 						if(dictionary_flags[0]==0)
1383 						{
1384 							dictionary_flags[0] = dictionary_flags2[0];
1385 							dictionary_flags[1] = dictionary_flags2[1];
1386 						}
1387 					}
1388 					if(found == 0)
1389 					{
1390 						if(end_type & SUFX_Q)
1391 						{
1392 							// don't retranslate, use the original lookup result
1393 							strcpy(phonemes,phonemes2);
1394 						}
1395 						else
1396 						{
1397 							if(end_flags & FLAG_SUFX)
1398 								wflags |= FLAG_SUFFIX_REMOVED;
1399 							if(end_type & SUFX_A)
1400 								wflags |= FLAG_SUFFIX_VOWEL;
1401 
1402 							if(end_type & SUFX_M)
1403 							{
1404 								// allow more suffixes before this suffix
1405 								strcpy(end_phonemes2, end_phonemes);
1406 								end_type = TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, end_phonemes, wflags, dictionary_flags);
1407 								strcat(end_phonemes, end_phonemes2);   // add the phonemes for the previous suffixes after this one
1408 
1409 								if((end_type != 0) && !(end_type & SUFX_P))
1410 								{
1411 									// there is another suffix
1412 									end_flags = RemoveEnding(tr, wordx, end_type, NULL);
1413 									more_suffixes = 1;
1414 								}
1415 							}
1416 							else
1417 							{
1418 								// don't remove any previous suffix
1419 								TranslateRules(tr, wordx, phonemes, N_WORD_PHONEMES, NULL, wflags, dictionary_flags);
1420 								end_type = 0;
1421 							}
1422 
1423 							if(phonemes[0] == phonSWITCH)
1424 							{
1425 								// change to another language in order to translate this word
1426 								strcpy(word_phonemes,phonemes);
1427 								memcpy(wordx,word_copy,strlen(word_copy));
1428 								wordx[-1] = c_temp;
1429 								return(0);
1430 							}
1431 						}
1432 					}
1433 				}
1434 
1435 
1436 				if((end_type1 & SUFX_T) == 0)
1437 				{
1438 					// the default is to add the suffix and then determine the word's stress pattern
1439 					AppendPhonemes(tr,phonemes, N_WORD_PHONEMES, end_phonemes);
1440 					end_phonemes[0] = 0;
1441 				}
1442 				memcpy(wordx,word_copy,strlen(word_copy));
1443 			}
1444 
1445 
1446 
1447 
1448 			wordx[-1] = c_temp;
1449 		}
1450 	}
1451 
1452 	if((add_plural_suffix) || (wflags & FLAG_HAS_PLURAL))
1453 	{
1454 		// s or 's suffix, append [s], [z] or [Iz] depending on previous letter
1455 		if(last_char == 'f')
1456 			TranslateRules(tr, &word_ss[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1457 		else if((last_char==0) || (strchr_w("hsx",last_char)==NULL))
1458 			TranslateRules(tr, &word_zz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1459 		else
1460 			TranslateRules(tr, &word_iz[1], phonemes, N_WORD_PHONEMES, NULL, 0, NULL);
1461 	}
1462 
1463 	wflags |= emphasize_allcaps;
1464 
1465 
1466 	/* determine stress pattern for this word */
1467 	/******************************************/
1468 	prefix_stress = 0;
1469 	for(p = prefix_phonemes; *p != 0; p++)
1470 	{
1471 		if((*p == phonSTRESS_P) || (*p == phonSTRESS_P2))
1472 		{
1473 			prefix_stress = *p;
1474 		}
1475 	}
1476 	if(prefix_flags || (prefix_stress != 0))
1477 	{
1478 		if((tr->langopts.param[LOPT_PREFIXES]) || (prefix_type & SUFX_T))
1479 		{
1480 			char *p;
1481 			// German, keep a secondary stress on the stem
1482 			SetWordStress(tr, phonemes, dictionary_flags, 3, 0);
1483 
1484 			// reduce all but the first primary stress
1485 			ix=0;
1486 			for(p=prefix_phonemes; *p != 0; p++)
1487 			{
1488 				if(*p == phonSTRESS_P)
1489 				{
1490 					if(ix==0)
1491 						ix=1;
1492 					else
1493 						*p = phonSTRESS_3;
1494 				}
1495 			}
1496 #ifdef PLATFORM_WINDOWS
1497 			sprintf(word_phonemes, "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1498 #else
1499 			snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1500 #endif
1501 			word_phonemes[N_WORD_PHONEMES-1] = 0;
1502 			SetWordStress(tr, word_phonemes, dictionary_flags, -1, 0);
1503 		}
1504 		else
1505 		{
1506 			// stress position affects the whole word, including prefix
1507 #ifdef PLATFORM_WINDOWS
1508 			sprintf(word_phonemes, "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1509 #else
1510 			snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1511 #endif
1512 			word_phonemes[N_WORD_PHONEMES-1] = 0;
1513 			SetWordStress(tr, word_phonemes, dictionary_flags, -1, 0);
1514 		}
1515 	}
1516 	else
1517 	{
1518 		if(prefix_phonemes[0] == 0)
1519 			SetWordStress(tr, phonemes, dictionary_flags, -1, 0);
1520 		else
1521 			SetWordStress(tr, phonemes, dictionary_flags, -1, 0);
1522 #ifdef PLATFORM_WINDOWS
1523 		sprintf(word_phonemes, "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1524 #else
1525 		snprintf(word_phonemes, sizeof(word_phonemes), "%s%s%s", unpron_phonemes, prefix_phonemes, phonemes);
1526 #endif
1527 		word_phonemes[N_WORD_PHONEMES-1] = 0;
1528 	}
1529 
1530 	if(end_phonemes[0] != 0)
1531 	{
1532 		// a suffix had the SUFX_T option set, add the suffix after the stress pattern has been determined
1533 		ix = strlen(word_phonemes);
1534 		end_phonemes[N_WORD_PHONEMES-1-ix] = 0;   // ensure no buffer overflow
1535 		strcpy(&word_phonemes[ix], end_phonemes);
1536 	}
1537 
1538 	if(wflags & FLAG_LAST_WORD)
1539 	{
1540 		// don't use $brk pause before the last word of a sentence
1541 		// (but allow it for emphasis, see below
1542 		dictionary_flags[0] &= ~FLAG_PAUSE1;
1543 	}
1544 
1545 #ifdef deleted
1546 // but it causes problems if these are not a person name
1547 	if(tr->translator_name == L('h','u'))
1548 	{
1549 		// lang=hu, If the last two words of a clause have capital letters (eg. a person name), unstress the last word.
1550 		if((wflags & (FLAG_LAST_WORD | FLAG_FIRST_UPPER | FLAG_ALL_UPPER | FLAG_FIRST_WORD)) == (FLAG_LAST_WORD | FLAG_FIRST_UPPER))
1551 		{
1552 			if(((wtab[-1].flags & (FLAG_FIRST_UPPER | FLAG_ALL_UPPER)) == FLAG_FIRST_UPPER) && ((tr->clause_terminator != 0x90028) || (wflags & FLAG_HAS_DOT)))
1553 			{
1554 				ChangeWordStress(tr,word_phonemes,3);
1555 			}
1556 		}
1557 	}
1558 #endif
1559 
1560 	if((wflags & FLAG_HYPHEN) && (tr->langopts.stress_flags & S_HYPEN_UNSTRESS))
1561 	{
1562 		ChangeWordStress(tr,word_phonemes,3);
1563 	}
1564 	else if(wflags & FLAG_EMPHASIZED2)
1565 	{
1566 		// A word is indicated in the source text as stressed
1567 		// Give it stress level 6 (for the intonation module)
1568 		ChangeWordStress(tr,word_phonemes,6);
1569 
1570 		if(wflags & FLAG_EMPHASIZED)
1571 			dictionary_flags[0] |= FLAG_PAUSE1;   // precede by short pause
1572 	}
1573 	else if(wtab[dictionary_skipwords].flags & FLAG_LAST_WORD)
1574 	{
1575 		// the word has attribute to stress or unstress when at end of clause
1576 		if(dictionary_flags[0] & (FLAG_STRESS_END | FLAG_STRESS_END2))
1577 			ChangeWordStress(tr,word_phonemes,4);
1578 		else if((dictionary_flags[0] & FLAG_UNSTRESS_END) && (any_stressed_words))
1579 			ChangeWordStress(tr,word_phonemes,3);
1580 	}
1581 
1582 
1583 	// dictionary flags for this word give a clue about which alternative pronunciations of
1584 	// following words to use.
1585 	if(end_type1 & SUFX_F)
1586 	{
1587 		// expect a verb form, with or without -s suffix
1588 		tr->expect_verb = 2;
1589 		tr->expect_verb_s = 2;
1590 	}
1591 
1592 	if(dictionary_flags[1] & FLAG_PASTF)
1593 	{
1594 		/* expect perfect tense in next two words */
1595 		tr->expect_past = 3;
1596 		tr->expect_verb = 0;
1597 		tr->expect_noun = 0;
1598 	}
1599 	else if(dictionary_flags[1] & FLAG_VERBF)
1600 	{
1601 		/* expect a verb in the next word */
1602 		tr->expect_verb = 2;
1603 		tr->expect_verb_s = 0;   /* verb won't have -s suffix */
1604 		tr->expect_noun = 0;
1605 	}
1606 	else if(dictionary_flags[1] & FLAG_VERBSF)
1607 	{
1608 		// expect a verb, must have a -s suffix
1609 		tr->expect_verb = 0;
1610 		tr->expect_verb_s = 2;
1611 		tr->expect_past = 0;
1612 		tr->expect_noun = 0;
1613 	}
1614 	else if(dictionary_flags[1] & FLAG_NOUNF)
1615 	{
1616 		/* not expecting a verb next */
1617 		tr->expect_noun = 2;
1618 		tr->expect_verb = 0;
1619 		tr->expect_verb_s = 0;
1620 		tr->expect_past = 0;
1621 	}
1622 
1623 	if((wordx[0] != 0) && (!(dictionary_flags[1] & FLAG_VERB_EXT)))
1624 	{
1625 		if(tr->expect_verb > 0)
1626 			tr->expect_verb--;
1627 
1628 		if(tr->expect_verb_s > 0)
1629 			tr->expect_verb_s--;
1630 
1631 		if(tr->expect_noun >0)
1632 			tr->expect_noun--;
1633 
1634 		if(tr->expect_past > 0)
1635 			tr->expect_past--;
1636 	}
1637 
1638 	if((word_length == 1) && (tr->translator_name == L('e','n')) && iswalpha2(first_char) && (first_char != 'i'))
1639 	{
1640 // English Specific !!!!
1641 		// any single letter before a dot is an abbreviation, except 'I'
1642 		dictionary_flags[0] |= FLAG_ALLOW_DOT;
1643 	}
1644 
1645 	if((tr->langopts.param[LOPT_ALT] & 2) && ((dictionary_flags[0] & (FLAG_ALT_TRANS | FLAG_ALT2_TRANS)) != 0))
1646 	{
1647 		ApplySpecialAttribute2(tr,word_phonemes,dictionary_flags[0]);
1648 	}
1649 
1650 	dictionary_flags[0] |= was_unpronouncable;
1651 	memcpy(word_start, word_copy2, word_copy_length);
1652 	return(dictionary_flags[0]);
1653 }  //  end of TranslateWord
1654 
1655 
1656 
// Fill in the phoneme-list entry *p for phoneme code 'phcode'.
// The stress level, tone phoneme and source index of the entry are cleared.
// The current value of the global embedded_flag is stored in the entry's
// synthflags, and embedded_flag is then reset to 0, so a later call does not
// see it again unless it has been set anew.
static void SetPlist2(PHONEME_LIST2 *p, unsigned char phcode)
{//==========================================================
	// hand the pending embedded-command flag over to this entry and consume it
	p->synthflags = embedded_flag;
	embedded_flag = 0;

	p->phcode = phcode;

	// everything else starts out cleared
	p->sourceix = 0;
	p->tone_ph = 0;
	p->stresslevel = 0;
}
1666 
CountSyllables(unsigned char * phonemes)1667 static int CountSyllables(unsigned char *phonemes)
1668 {//===============================================
1669 	int count = 0;
1670 	int phon;
1671 	while((phon = *phonemes++) != 0)
1672 	{
1673 		if(phoneme_tab[phon]->type == phVOWEL)
1674 			count++;
1675 	}
1676 	return(count);
1677 }
1678 
1679 
Word_EmbeddedCmd()1680 void Word_EmbeddedCmd()
1681 {//====================
1682 // Process embedded commands for emphasis, sayas, and break
1683 	int embedded_cmd;
1684 	int value;
1685 
1686 	do
1687 	{
1688 		embedded_cmd = embedded_list[embedded_read++];
1689 		value = embedded_cmd >> 8;
1690 
1691 		switch(embedded_cmd & 0x1f)
1692 		{
1693 		case EMBED_Y:
1694 			option_sayas = value;
1695 			break;
1696 
1697 		case EMBED_F:
1698 			option_emphasis = value;
1699 			break;
1700 
1701 		case EMBED_B:
1702 			// break command
1703 			if(value == 0)
1704 				pre_pause = 0;  // break=none
1705 			else
1706 				pre_pause += value;
1707 			break;
1708 		}
1709 	} while(((embedded_cmd & 0x80) == 0) && (embedded_read < embedded_ix));
1710 }  // end of Word_EmbeddedCmd
1711 
1712 
SetTranslator2(const char * new_language)1713 int SetTranslator2(const char *new_language)
1714 {//=========================================
1715 // Set translator2 to a second language
1716 	int new_phoneme_tab;
1717 	const char *new_phtab_name;
1718 	int bitmap;
1719 	int dialect = 0;
1720 
1721 	new_phtab_name = new_language;
1722 	if((bitmap = translator->langopts.dict_dialect) != 0)
1723 	{
1724 		if((bitmap & (1 << DICTDIALECT_EN_US)) && (strcmp(new_language, "en") == 0))
1725 		{
1726 			new_phtab_name = "en-us";
1727 			dialect = DICTDIALECT_EN_US;
1728 		}
1729 		if((bitmap & (1 << DICTDIALECT_ES_LA)) && (strcmp(new_language, "es") == 0))
1730 		{
1731 			new_phtab_name = "es-la";
1732 			dialect = DICTDIALECT_ES_LA;
1733 		}
1734 	}
1735 
1736 	if((new_phoneme_tab = SelectPhonemeTableName(new_phtab_name)) >= 0)
1737 	{
1738 		if((translator2 != NULL) && (strcmp(new_language,translator2_language) != 0))
1739 		{
1740 			// we already have an alternative translator, but not for the required language, delete it
1741 			DeleteTranslator(translator2);
1742 			translator2 = NULL;
1743 		}
1744 
1745 		if(translator2 == NULL)
1746 		{
1747 			translator2 = SelectTranslator(new_language);
1748 			strcpy(translator2_language,new_language);
1749 
1750 			if(LoadDictionary(translator2, translator2->dictionary_name, 0) != 0)
1751 			{
1752 				SelectPhonemeTable(voice->phoneme_tab_ix);  // revert to original phoneme table
1753 				new_phoneme_tab = -1;
1754 				translator2_language[0] = 0;
1755 			}
1756 			else
1757 			{
1758 				if(dialect == DICTDIALECT_EN_US)
1759 				{
1760 					// en-us
1761 					translator2->dict_condition = 0x48;  // bits 3, 6
1762 					translator2->langopts.param[LOPT_REDUCE_T] = 1;
1763 				}
1764 				if(dialect == DICTDIALECT_ES_LA)
1765 				{
1766 					translator2->dict_condition = 0x04;  // bit 2
1767 				}
1768 			}
1769 			translator2->phoneme_tab_ix = new_phoneme_tab;
1770 		}
1771 	}
1772 	if(translator2 != NULL)
1773 		translator2->phonemes_repeat[0] = 0;
1774 	return(new_phoneme_tab);
1775 }  // end of SetTranslator2
1776 
1777 
1778 
TranslateWord2(Translator * tr,char * word,WORD_TAB * wtab,int pre_pause,int next_pause)1779 static int TranslateWord2(Translator *tr, char *word, WORD_TAB *wtab, int pre_pause, int next_pause)
1780 {//=================================================================================================
1781 	int flags=0;
1782 	int stress;
1783 	int next_stress;
1784 	int next_tone=0;
1785 	unsigned char *p;
1786 	int srcix;
1787 	int found_dict_flag;
1788 	unsigned char ph_code;
1789 	PHONEME_LIST2 *plist2;
1790 	PHONEME_TAB *ph;
1791 	int max_stress;
1792 	int max_stress_ix=0;
1793 	int prev_vowel = -1;
1794 	int pitch_raised = 0;
1795 	int switch_phonemes = -1;
1796 	int first_phoneme = 1;
1797 	int source_ix;
1798 	int len;
1799 	int ix;
1800 	int sylimit;        // max. number of syllables in a word to be combined with a preceding preposition
1801 	const char *new_language;
1802 	int bad_phoneme;
1803 	int word_flags;
1804 	int word_copy_len;
1805 	char word_copy[N_WORD_BYTES+1];
1806 	char word_replaced[N_WORD_BYTES+1];
1807 	char old_dictionary_name[40];
1808 
1809 	if((f_logespeak != NULL) && (logging_type & 8))
1810 	{
1811 		fprintf(f_logespeak,"WORD: flg=%.5x len=%d  '",wtab->flags,wtab->length);
1812 		for(ix=0; ix<40; ix++)
1813 		{
1814 			if(word[ix]==0) break;
1815 			fputc(word[ix], f_logespeak);
1816 		}
1817 		fprintf(f_logespeak,"'\n");
1818 	}
1819 
1820 	len = wtab->length;
1821 	if(len > 31) len = 31;
1822 	source_ix = (wtab->sourceix & 0x7ff) | (len << 11); // bits 0-10 sourceix, bits 11-15 word length
1823 
1824 	word_flags = wtab[0].flags;
1825 	if(word_flags & FLAG_EMBEDDED)
1826 	{
1827 		wtab[0].flags &= ~FLAG_EMBEDDED;  // clear it in case we call TranslateWord2() again for the same word
1828 		embedded_flag = SFLAG_EMBEDDED;
1829 
1830 		Word_EmbeddedCmd();
1831 	}
1832 
1833 	if((word[0] == 0) || (word_flags & FLAG_DELETE_WORD))
1834 	{
1835 		// nothing to translate.  Add a dummy phoneme to carry any embedded commands
1836 		if(embedded_flag)
1837 		{
1838 			ph_list2[n_ph_list2].phcode = phonEND_WORD;
1839 			ph_list2[n_ph_list2].stresslevel = 0;
1840 			ph_list2[n_ph_list2].wordstress = 0;
1841 			ph_list2[n_ph_list2].tone_ph = 0;
1842 			ph_list2[n_ph_list2].synthflags = embedded_flag;
1843 			ph_list2[n_ph_list2].sourceix = 0;
1844 			n_ph_list2++;
1845 			embedded_flag = 0;
1846 		}
1847 		word_phonemes[0] = 0;
1848 		return(0);
1849 	}
1850 
1851 	// after a $pause word attribute, ignore a $pause attribute on the next two words
1852 	if(tr->prepause_timeout > 0)
1853 		tr->prepause_timeout--;
1854 
1855 	if((option_sayas & 0xf0) == 0x10)
1856 	{
1857 		if(!(word_flags & FLAG_FIRST_WORD))
1858 		{
1859 			// SAYAS_CHARS, SAYAS_GLYPHS, or SAYAS_SINGLECHARS.  Pause between each word.
1860 			pre_pause += 4;
1861 		}
1862 	}
1863 
1864 	if(word_flags & FLAG_FIRST_UPPER)
1865 	{
1866 		if((option_capitals > 2) && (embedded_ix < N_EMBEDDED_LIST-6))
1867 		{
1868 			// indicate capital letter by raising pitch
1869 			if(embedded_flag)
1870 				embedded_list[embedded_ix-1] &= ~0x80;   // already embedded command before this word, remove terminator
1871 			if((pitch_raised = option_capitals) == 3)
1872 				pitch_raised = 20;  // default pitch raise for capitals
1873 			embedded_list[embedded_ix++] = EMBED_P+0x40+0x80 + (pitch_raised << 8);  // raise pitch
1874 			embedded_flag = SFLAG_EMBEDDED;
1875 		}
1876 	}
1877 
1878 	p = (unsigned char *)word_phonemes;
1879 	if(word_flags & FLAG_PHONEMES)
1880 	{
1881 		// The input is in phoneme mnemonics, not language text
1882 		int c1;
1883 		char lang_name[12];
1884 
1885 		if(memcmp(word,"_^_",3)==0)
1886 		{
1887 			// switch languages
1888 			word+=3;
1889 			for(ix=0;;)
1890 			{
1891 				c1 = *word++;
1892 				if((c1==' ') || (c1==0))
1893 					break;
1894 				lang_name[ix++] = tolower(c1);
1895 			}
1896 			lang_name[ix] = 0;
1897 
1898 			if((ix = LookupPhonemeTable(lang_name)) > 0)
1899 			{
1900 				SelectPhonemeTable(ix);
1901 				word_phonemes[0] = phonSWITCH;
1902 				word_phonemes[1] = ix;
1903 				word_phonemes[2] = 0;
1904 			}
1905 		}
1906 		else
1907 		{
1908 			EncodePhonemes(word,word_phonemes,&bad_phoneme);
1909 		}
1910 		flags = FLAG_FOUND;
1911 	}
1912 	else
1913 	{
1914 		int c2;
1915 		ix = 0;
1916 		while(((c2 = word_copy[ix] = word[ix]) != ' ') && (c2 != 0) && (ix < N_WORD_BYTES)) ix++;
1917 		word_copy_len = ix;
1918 
1919 		word_replaced[2] = 0;
1920 		flags = TranslateWord(translator, word, next_pause, wtab, &word_replaced[2]);
1921 
1922 		if(flags & FLAG_SPELLWORD)
1923 		{
1924 			// re-translate the word as individual letters, separated by spaces
1925 			memcpy(word, word_copy, word_copy_len);
1926 			return(flags);
1927 		}
1928 
1929 		if((flags & FLAG_COMBINE) && !(wtab[1].flags & FLAG_PHONEMES))
1930 		{
1931 			char *p2;
1932 			int ok = 1;
1933 			unsigned int flags2[2];
1934 			int c_word2;
1935 			char ph_buf[N_WORD_PHONEMES];
1936 
1937 			flags2[0] = 0;
1938 			sylimit = tr->langopts.param[LOPT_COMBINE_WORDS];
1939 
1940 			// LANG=cs,sk
1941 			// combine a preposition with the following word
1942 			p2 = word;
1943 			while(*p2 != ' ') p2++;
1944 
1945 			utf8_in(&c_word2, p2+1);   // first character of the next word;
1946 			if(!iswalpha2(c_word2))
1947 			{
1948 				ok =0;
1949 			}
1950 
1951 			if(ok != 0)
1952 			{
1953 				strcpy(ph_buf,word_phonemes);
1954 
1955 				flags2[0] = TranslateWord(translator, p2+1, 0, wtab+1, NULL);
1956 				if((flags2[0] & FLAG_WAS_UNPRONOUNCABLE) || (word_phonemes[0] == phonSWITCH))
1957 					ok = 0;
1958 
1959 				if(sylimit & 0x100)
1960 				{
1961 					// only if the second word has $alt attribute
1962 					if((flags2[0] & FLAG_ALT_TRANS) == 0)
1963 					{
1964 						ok = 0;
1965 					}
1966 				}
1967 
1968 				if((sylimit & 0x200) && ((wtab+1)->flags & FLAG_LAST_WORD))
1969 				{
1970 					// not if the next word is end-of-sentence
1971 					ok = 0;
1972 				}
1973 
1974 				if(ok == 0)
1975 				{
1976 					strcpy(word_phonemes,ph_buf);
1977 				}
1978 			}
1979 
1980 			if(ok)
1981 			{
1982 				*p2 = '-'; // replace next space by hyphen
1983 				wtab[0].flags &= ~FLAG_ALL_UPPER;  // prevent it being considered an abbreviation
1984 				flags = TranslateWord(translator, word, next_pause, wtab, NULL);  // translate the combined word
1985 				if((sylimit > 0) && (CountSyllables(p) > (sylimit & 0x1f)))
1986 				{
1987 					// revert to separate words
1988 					*p2 = ' ';
1989 					flags = TranslateWord(translator, word, next_pause, wtab, NULL);
1990 				}
1991 				else
1992 				{
1993 					if(flags == 0)
1994 						flags = flags2[0];   // no flags for the combined word, so use flags from the second word eg. lang-hu "nem december 7-e"
1995 					flags |= FLAG_SKIPWORDS;
1996 					dictionary_skipwords = 1;
1997 				}
1998 			}
1999 		}
2000 
2001 		if(p[0]==phonSWITCH)
2002 		{
2003 			int switch_attempt;
2004 			strcpy(old_dictionary_name, dictionary_name);
2005 			for(switch_attempt=0; switch_attempt < 2; switch_attempt++)
2006 			{
2007 				// this word uses a different language
2008 				memcpy(word, word_copy, word_copy_len);
2009 
2010 				new_language = (char *)(&p[1]);
2011 				if(new_language[0]==0)
2012 					new_language = "en";
2013 
2014 				switch_phonemes = SetTranslator2(new_language);
2015 
2016 				if(switch_phonemes >= 0)
2017 				{
2018 					// re-translate the word using the new translator
2019 					wtab[0].flags |= FLAG_TRANSLATOR2;
2020 					if(word_replaced[2] != 0)
2021 					{
2022 						word_replaced[0] = 0;   // byte before the start of the word
2023 						word_replaced[1] = ' ';
2024 						flags = TranslateWord(translator2, &word_replaced[1], next_pause, wtab, NULL);
2025 					}
2026 					else
2027 						flags = TranslateWord(translator2, word, next_pause, wtab, &word_replaced[2]);
2028 				}
2029 
2030 				if(p[0] != phonSWITCH)
2031 					break;
2032 			}
2033 
2034 			//				strcpy((char *)p,translator2->word_phonemes);
2035 
2036 			if(p[0] == phonSWITCH)
2037 				return(FLAG_SPELLWORD);
2038 
2039 			if(switch_phonemes < 0)
2040 			{
2041 				// language code is not recognised or 2nd translator won't translate it
2042 				p[0] = phonSCHWA;  // just say something
2043 				p[1] = phonSCHWA;
2044 				p[2] = 0;
2045 			}
2046 
2047 // ?? Option to convert from language2 phonemes to the equivalent language1 phonemes
2048 // ?? Option to set the word-stress according to language1 rules eg. lang=fr)
2049 			if(ChangeEquivalentPhonemes(tr, switch_phonemes, (char *)p))
2050 			{
2051 				// Phonemes have been converted from the foreign language to the native language
2052 				switch_phonemes = -1;
2053 			}
2054 
2055 			if(switch_phonemes == -1)
2056 			{
2057 				strcpy(dictionary_name, old_dictionary_name);
2058 				SelectPhonemeTable(voice->phoneme_tab_ix);
2059 
2060 				// leave switch_phonemes set, but use the original phoneme table number.
2061 				// This will suppress LOPT_REGRESSIVE_VOICING
2062 				switch_phonemes = voice->phoneme_tab_ix;   // original phoneme table
2063 			}
2064 		}
2065 
2066 		if(!(word_flags & FLAG_HYPHEN))
2067 		{
2068 			if(flags & FLAG_PAUSE1)
2069 			{
2070 				if(pre_pause < 1)
2071 					pre_pause = 1;
2072 			}
2073 			if((flags & FLAG_PREPAUSE) && !(word_flags && (FLAG_LAST_WORD | FLAG_FIRST_WORD)) && !(wtab[-1].flags & FLAG_FIRST_WORD) && (tr->prepause_timeout == 0))
2074 			{
2075 				// the word is marked in the dictionary list with $pause
2076 				if(pre_pause < 4) pre_pause = 4;
2077 				tr->prepause_timeout = 3;
2078 			}
2079 		}
2080 
2081 		if((option_emphasis >= 3) && (pre_pause < 1))
2082 			pre_pause = 1;
2083 	}
2084 
2085 	stress = 0;
2086 	next_stress = 1;
2087 	srcix = 0;
2088 	max_stress = -1;
2089 
2090 	found_dict_flag = 0;
2091 	if((flags & FLAG_FOUND) && !(flags & FLAG_TEXTMODE))
2092 		found_dict_flag = SFLAG_DICTIONARY;
2093 
2094 	while((pre_pause > 0) && (n_ph_list2 < N_PHONEME_LIST-4))
2095 	{
2096 		// add pause phonemes here. Either because of punctuation (brackets or quotes) in the
2097 		// text, or because the word is marked in the dictionary lookup as a conjunction
2098 		if(pre_pause > 1)
2099 		{
2100 			SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE);
2101 			pre_pause -= 2;
2102 		}
2103 		else
2104 		{
2105 			SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_NOLINK);
2106 			pre_pause--;
2107 		}
2108 		tr->end_stressed_vowel = 0;   // forget about the previous word
2109 		tr->prev_dict_flags[0] = 0;
2110 		tr->prev_dict_flags[1] = 0;
2111 	}
2112 	plist2 = &ph_list2[n_ph_list2];
2113 
2114 	if((option_capitals==1) && (word_flags & FLAG_FIRST_UPPER))
2115 	{
2116 		SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_SHORT);
2117 		SetPlist2(&ph_list2[n_ph_list2++],phonCAPITAL);
2118 		if((word_flags & FLAG_ALL_UPPER) && IsAlpha(word[1]))
2119 		{
2120 			// word > 1 letter and all capitals
2121 			SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_SHORT);
2122 			SetPlist2(&ph_list2[n_ph_list2++],phonCAPITAL);
2123 		}
2124 	}
2125 
2126 	if(switch_phonemes >= 0)
2127 	{
2128 		if((p[0] == phonPAUSE) && (p[1] == phonSWITCH))
2129 		{
2130 			// the new word starts with a phoneme table switch, so there's no need to switch before it.
2131 			if(ph_list2[n_ph_list2-1].phcode == phonSWITCH)
2132 			{
2133 				//previous phoneme is also a phonSWITCH, delete it
2134 				n_ph_list2--;
2135 			}
2136 		}
2137 		else
2138 		{
2139 			// this word uses a different phoneme table
2140 			if(ph_list2[n_ph_list2-1].phcode == phonSWITCH)
2141 			{
2142 				//previous phoneme is also a phonSWITCH, just change its phoneme table number
2143 				n_ph_list2--;
2144 			}
2145 			else
2146 			{
2147 				SetPlist2(&ph_list2[n_ph_list2],phonSWITCH);
2148 			}
2149 			ph_list2[n_ph_list2++].tone_ph = switch_phonemes;  // temporary phoneme table number
2150 		}
2151 	}
2152 
2153 	// remove initial pause from a word if it follows a hyphen
2154 	if((word_flags & FLAG_HYPHEN) && (phoneme_tab[*p]->type == phPAUSE))
2155 		p++;
2156 
2157 	if((p[0] == 0) && (embedded_flag))
2158 	{
2159 		// no phonemes.  Insert a very short pause to carry an embedded command
2160 		p[0] = phonPAUSE_VSHORT;
2161 		p[1] = 0;
2162 	}
2163 
2164 	while(((ph_code = *p++) != 0) && (n_ph_list2 < N_PHONEME_LIST-4))
2165 	{
2166 		if(ph_code == 255)
2167 			continue;      // unknown phoneme
2168 
2169 		// Add the phonemes to the first stage phoneme list (ph_list2)
2170 		ph = phoneme_tab[ph_code];
2171 
2172 		if(ph_code == phonSWITCH)
2173 		{
2174 			ph_list2[n_ph_list2].phcode = ph_code;
2175 			ph_list2[n_ph_list2].sourceix = 0;
2176 			ph_list2[n_ph_list2].synthflags = 0;
2177 			ph_list2[n_ph_list2++].tone_ph = *p;
2178 			SelectPhonemeTable(*p);
2179 			p++;
2180 		}
2181 		else if(ph->type == phSTRESS)
2182 		{
2183 			// don't add stress phonemes codes to the list, but give their stress
2184 			// value to the next vowel phoneme
2185 			// std_length is used to hold stress number or (if >10) a tone number for a tone language
2186 			if(ph->program == 0)
2187 				next_stress = ph->std_length;
2188 			else
2189 			{
2190 				// for tone languages, the tone number for a syllable follows the vowel
2191 				if(prev_vowel >= 0)
2192 				{
2193 					ph_list2[prev_vowel].tone_ph = ph_code;
2194 				}
2195 				else
2196 				{
2197 					next_tone = ph_code;       // no previous vowel, apply to the next vowel
2198 				}
2199 			}
2200 		}
2201 		else if(ph_code == phonSYLLABIC)
2202 		{
2203 			// mark the previous phoneme as a syllabic consonant
2204 			prev_vowel = n_ph_list2-1;
2205 			ph_list2[prev_vowel].synthflags |= SFLAG_SYLLABLE;
2206 			ph_list2[prev_vowel].stresslevel = next_stress;
2207 		}
2208 		else if(ph_code == phonLENGTHEN)
2209 		{
2210 			ph_list2[n_ph_list2-1].synthflags |= SFLAG_LENGTHEN;
2211 		}
2212 		else if(ph_code == phonEND_WORD)
2213 		{
2214 			// a || symbol in a phoneme string was used to indicate a word boundary
2215 			// Don't add this phoneme to the list, but make sure the next phoneme has
2216 			// a newword indication
2217 			srcix = source_ix+1;
2218 		}
2219 		else if(ph_code == phonX1)
2220 		{
2221 			// a language specific action
2222 			if(tr->langopts.param[LOPT_IT_DOUBLING])
2223 			{
2224 				flags |= FLAG_DOUBLING;
2225 			}
2226 		}
2227 		else
2228 		{
2229 			ph_list2[n_ph_list2].phcode = ph_code;
2230 			ph_list2[n_ph_list2].tone_ph = 0;
2231 			ph_list2[n_ph_list2].synthflags = embedded_flag | found_dict_flag;
2232 			embedded_flag = 0;
2233 			ph_list2[n_ph_list2].sourceix = srcix;
2234 			srcix = 0;
2235 
2236 			if(ph->type == phVOWEL)
2237 			{
2238 				stress = next_stress;
2239 				next_stress = 1;  // default is 'unstressed'
2240 
2241 				if(stress >= 4)
2242 				{
2243 					any_stressed_words = 1;
2244 				}
2245 
2246 				if((prev_vowel >= 0) && (n_ph_list2-1) != prev_vowel)
2247 					ph_list2[n_ph_list2-1].stresslevel = stress;  // set stress for previous consonant
2248 
2249 				ph_list2[n_ph_list2].synthflags |= SFLAG_SYLLABLE;
2250 				prev_vowel = n_ph_list2;
2251 
2252 				if(stress > max_stress)
2253 				{
2254 					max_stress = stress;
2255 					max_stress_ix = n_ph_list2;
2256 				}
2257 				if(next_tone != 0)
2258 				{
2259 					ph_list2[n_ph_list2].tone_ph = next_tone;
2260 					next_tone=0;
2261 				}
2262 			}
2263 			else
2264 			{
2265 				if(first_phoneme && tr->langopts.param[LOPT_IT_DOUBLING])
2266 				{
2267 					if(((tr->prev_dict_flags[0] & FLAG_DOUBLING) && (tr->langopts.param[LOPT_IT_DOUBLING] & 1)) ||
2268 							(tr->end_stressed_vowel && (tr->langopts.param[LOPT_IT_DOUBLING] & 2)))
2269 					{
2270 						// italian, double the initial consonant if the previous word ends with a
2271 						// stressed vowel, or is marked with a flag
2272 						ph_list2[n_ph_list2].synthflags |= SFLAG_LENGTHEN;
2273 					}
2274 				}
2275 			}
2276 
2277 			ph_list2[n_ph_list2].stresslevel = stress;
2278 			n_ph_list2++;
2279 			first_phoneme = 0;
2280 		}
2281 	}
2282 
2283 	if(word_flags & FLAG_COMMA_AFTER)
2284 	{
2285 		SetPlist2(&ph_list2[n_ph_list2++],phonPAUSE_CLAUSE);
2286 	}
2287 
2288 	// don't set new-word if there is a hyphen before it
2289 	if((word_flags & FLAG_HYPHEN) == 0)
2290 	{
2291 		plist2->sourceix = source_ix;
2292 	}
2293 
2294 	tr->end_stressed_vowel = 0;
2295 	if((stress >= 4) && (phoneme_tab[ph_list2[n_ph_list2-1].phcode]->type == phVOWEL))
2296 	{
2297 		tr->end_stressed_vowel = 1;   // word ends with a stressed vowel
2298 	}
2299 
2300 	if(switch_phonemes >= 0)
2301 	{
2302 		// this word uses a different phoneme table, now switch back
2303 		strcpy(dictionary_name, old_dictionary_name);
2304 		SelectPhonemeTable(voice->phoneme_tab_ix);
2305 		SetPlist2(&ph_list2[n_ph_list2],phonSWITCH);
2306 		ph_list2[n_ph_list2++].tone_ph = voice->phoneme_tab_ix;  // original phoneme table number
2307 	}
2308 
2309 
2310 	if(pitch_raised > 0)
2311 	{
2312 		embedded_list[embedded_ix++] = EMBED_P+0x60+0x80 + (pitch_raised << 8);  // lower pitch
2313 		SetPlist2(&ph_list2[n_ph_list2],phonPAUSE_SHORT);
2314 		ph_list2[n_ph_list2++].synthflags = SFLAG_EMBEDDED;
2315 	}
2316 
2317 	if(flags & FLAG_STRESS_END2)
2318 	{
2319 		// this's word's stress could be increased later
2320 		ph_list2[max_stress_ix].synthflags |= SFLAG_PROMOTE_STRESS;
2321 	}
2322 
2323 	tr->prev_dict_flags[0] = flags;
2324 	return(flags);
2325 }  //  end of TranslateWord2
2326 
2327 
2328 
EmbeddedCommand(unsigned int * source_index_out)2329 static int EmbeddedCommand(unsigned int *source_index_out)
2330 {//=======================================================
2331 	// An embedded command to change the pitch, volume, etc.
2332 	// returns number of commands added to embedded_list
2333 
2334 	// pitch,speed,amplitude,expression,reverb,tone,voice,sayas
2335 	const char *commands = "PSARHTIVYMUBF";
2336 	int value = -1;
2337 	int sign = 0;
2338 	unsigned char c;
2339 	char *p;
2340 	int cmd;
2341 	int source_index = *source_index_out;
2342 
2343 	c = source[source_index];
2344 	if(c == '+')
2345 	{
2346 		sign = 0x40;
2347 		source_index++;
2348 	}
2349 	else if(c == '-')
2350 	{
2351 		sign = 0x60;
2352 		source_index++;
2353 	}
2354 
2355 	if(IsDigit09(source[source_index]))
2356 	{
2357 		value = atoi(&source[source_index]);
2358 		while(IsDigit09(source[source_index]))
2359 			source_index++;
2360 	}
2361 
2362 	c = source[source_index++];
2363 	if(embedded_ix >= (N_EMBEDDED_LIST - 2))
2364 		return(0);  // list is full
2365 
2366 	if((p = strchr_w(commands,c)) == NULL)
2367 		return(0);
2368 	cmd = (p - commands)+1;
2369 	if(value == -1)
2370 	{
2371 		value = embedded_default[cmd];
2372 		sign = 0;
2373 	}
2374 
2375 	if(cmd == EMBED_Y)
2376 	{
2377 		option_sayas2 = value;
2378 		count_sayas_digits = 0;
2379 	}
2380 	if(cmd == EMBED_F)
2381 	{
2382 		if(value >= 3)
2383 			word_emphasis = FLAG_EMPHASIZED;
2384 		else
2385 			word_emphasis = 0;
2386 	}
2387 
2388 	embedded_list[embedded_ix++] = cmd + sign + (value << 8);
2389 	*source_index_out = source_index;
2390 	return(1);
2391 }  //  end of EmbeddedCommand
2392 
2393 
2394 
static int SubstituteChar(Translator *tr, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{//=========================================================================================
	// Replace a character according to the language's replace_chars list.
	//
	// The list is a sequence of (key, replacement) pairs, ended by a key of 0:
	//   key:         bits 0-15   the (lower case) character to look for
	//                bits 16-31  if non-zero, the character which must follow it;
	//                            that following character is then consumed too
	//   replacement: the new character.  If any of bits 21-31 are set, bits 16-31
	//                hold a second character, which is passed back through *insert
	//
	// c:         the character to examine
	// next_in:   the character which follows it in the input
	// insert:    receives a second character to be inserted after the returned one
	// wordflags: FLAG_CHAR_REPLACED is added when a substitution is made
	//
	// Returns the character to use (c itself if there is no substitution),
	// 0 if c is 0, or 8 if this character was consumed by the previous call
	// as the second character of a two-character key.

	static int ignore_next = 0;   // set when the previous call also matched its next character
	const unsigned int *entry;
	unsigned int key;
	unsigned int match_next;
	unsigned int replacement = 0;
	unsigned int second;
	unsigned int c_lower = c;
	int is_upper = 0;

	if(ignore_next)
	{
		ignore_next = 0;
		return(8);
	}

	if(c == 0)
		return(0);

	entry = tr->langopts.replace_chars;
	if(entry == NULL)
		return(c);    // this language has no substitution list

	// the list is matched against the lower case form; remember the case so it can be restored
	if(iswupper2(c))
	{
		is_upper = 1;
		c_lower = towlower2(c);
	}

	for(; (key = entry[0]) != 0; entry += 2)
	{
		if((key & 0xffff) != c_lower)
			continue;

		match_next = key >> 16;
		if(match_next == 0)
		{
			// single character key
			replacement = entry[1];
			break;
		}
		if(match_next == (unsigned int)towlower2(next_in))
		{
			// two character key, the next input character is part of the match
			replacement = entry[1];
			ignore_next = 1;
			break;
		}
	}

	if(replacement == 0)
		return(c);    // no substitution

	if(replacement & 0xffe00000)
	{
		// there is a second character to be inserted
		// only make it upper case if the next input letter is upper case too
		second = replacement >> 16;
		if(is_upper && iswupper2(next_in))
			second = towupper2(second);
		*insert = second;
		replacement &= 0xffff;
	}

	if(is_upper)
		replacement = towupper2(replacement);

	*wordflags |= FLAG_CHAR_REPLACED;
	return(replacement);

}
2462 
2463 
static int TranslateChar(Translator *tr, char *ptr, int prev_in, unsigned int c, unsigned int next_in, int *insert, int *wordflags)
{//================================================================================================================
	// Language specific examination and replacement of a character.
	//
	// ptr:       points into the source text at the character after c (it may be modified)
	// prev_in:   the input character before c
	// c:         the character to examine
	// next_in:   the input character after c
	// insert:    receives a character to be inserted after the returned one
	// wordflags: passed on to SubstituteChar()
	//
	// Returns the character to use in place of c.  Characters which are not
	// handled here get the result of SubstituteChar().

	// indexed by (c - 0x3130); the value is added to 0x1100 to give the jamo code
	static const unsigned char hangul_compatibility[0x34] = {
		0,  0x00,0x01,0xaa,0x02,0xac,0xad,0x03,
		0x04,0x05,0xb0,0xb1,0xb2,0xb3,0xb4,0xb4,
		0xb6,0x06,0x07,0x08,0xb9,0x09,0x0a,0xbc,
		0x0c,0x0d,0x0e,0x0f,0x10,0x11,0x12,0x61,
		0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,
		0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,0x70,0x71,
		0x72,0x73,0x74,0x75
	};

	int syllable_ix;
	int lead;
	int vowel;
	int tail;
	int following;
	int is_af;
	int is_nl;

	// check for Korean Hangul letters
	if((c >= 0xac00) && (c <= 0xd7af))
	{
		// break a syllable hangul into 2 or 3 individual jamo
		syllable_ix = c - 0xac00;
		tail = syllable_ix % 28;
		syllable_ix /= 28;
		lead = syllable_ix / 21;
		vowel = syllable_ix % 21;

		if(lead != 11)
		{
			// extract the initial, and insert the remainder as a syllable with a null initial
			*insert = (11*28*21) + (vowel*28) + tail + 0xac00;
			return(lead + 0x1100);
		}

		// null initial: give the vowel now, and insert the final (if any) after it
		if(tail > 0)
			*insert = tail + 0x11a7;
		return(vowel + 0x1161);
	}

	if((c >= 0x3130) && (c < (0x3130 + 0x34)))
	{
		// Hangul compatibility jamo
		return(hangul_compatibility[c - 0x3130] + 0x1100);
	}

	is_af = (tr->translator_name == L('a','f'));
	is_nl = (tr->translator_name == L('n','l'));

	if(is_af || is_nl)
	{
		// look for 'n  and replace by a special character (unicode: schwa)
		if(!iswalpha2(prev_in))
		{
			utf8_in(&following, &ptr[1]);   // the character after next_in

			if((c == '\'') && IsSpace(following))
			{
				if(next_in == 'n')
				{
					// n preceded by either apostrophe or U2019 "right single quotation mark"
					if(is_af)
						ptr[0] = ' ';  // Afrikaans: delete the  n
					return(0x0259);    // replace  '  by  unicode schwa character
				}
				if(next_in == 't')
				{
					// [@t]
					return(0x0259);    // replace  '  by  unicode schwa character
				}
			}
		}
	}

	return(SubstituteChar(tr, c, next_in, insert, wordflags));
}
2543 
2544 
2545 static const char *UCase_ga[] = {"bp","bhf","dt","gc","hA","mb","nd","ng","ts","tA","nA",NULL};
2546 
UpperCaseInWord(Translator * tr,char * word,int c)2547 int UpperCaseInWord(Translator *tr, char *word, int c)
2548 {//=====================================================
2549 	int ix;
2550 	int len;
2551 	const char *p;
2552 
2553 	if(tr->translator_name == L('g','a'))
2554 	{
2555 		// Irish
2556 		for(ix=0; ; ix++)
2557 		{
2558 			if((p = UCase_ga[ix]) == NULL)
2559 				break;
2560 
2561 			len = strlen(p);
2562 			if((word[-len]==' ') && (memcmp(&word[-len+1], p, len-1) == 0))
2563 			{
2564 				if((c == p[len-1]) || ((p[len-1]=='A') && IsVowel(tr, c)))
2565 					return(1);
2566 			}
2567 		}
2568 	}
2569 	return(0);
2570 }
2571 
2572 
TranslateClause(Translator * tr,FILE * f_text,const void * vp_input,int * tone_out,char ** voice_change)2573 void *TranslateClause(Translator *tr, FILE *f_text, const void *vp_input, int *tone_out, char **voice_change)
2574 {//==========================================================================================================
2575 	int ix;
2576 	int c;
2577 	int cc;
2578 	unsigned int source_index=0;
2579 	unsigned int prev_source_index=0;
2580 	int source_index_word=0;
2581 	int prev_in;
2582 	int prev_out=' ';
2583 	int prev_out2;
2584 	int prev_in_save=0;
2585 	int next_in;
2586 	int next_in_nbytes;
2587 	int char_inserted=0;
2588 	int clause_pause;
2589 	int pre_pause_add=0;
2590 	int word_mark = 0;
2591 	int all_upper_case=FLAG_ALL_UPPER;
2592 	int finished;
2593 	int single_quoted;
2594 	int phoneme_mode = 0;
2595 	int dict_flags = 0;        // returned from dictionary lookup
2596 	int word_flags;        // set here
2597 	int next_word_flags;
2598 	int new_sentence2;
2599 	int embedded_count = 0;
2600 	int letter_count = 0;
2601 	int space_inserted = 0;
2602 	int syllable_marked = 0;
2603 	int decimal_sep_count = 0;
2604 	char *word;
2605 	char *p;
2606 	int j, k;
2607 	int n_digits;
2608 	int charix_top=0;
2609 
2610 	short charix[N_TR_SOURCE+4];
2611 	WORD_TAB words[N_CLAUSE_WORDS];
2612 	static char voice_change_name[40];
2613 	int word_count=0;      // index into words
2614 
2615 	char sbuf[N_TR_SOURCE];
2616 
2617 	int terminator;
2618 	int tone;
2619 	int tone2;
2620 
2621 	if(tr==NULL)
2622 	{
2623 		return(NULL);
2624 	}
2625 
2626 	p_textinput = (unsigned char *)vp_input;
2627 	p_wchar_input = (wchar_t *)vp_input;
2628 
2629 	embedded_ix = 0;
2630 	embedded_read = 0;
2631 	pre_pause = 0;
2632 	any_stressed_words = 0;
2633 
2634 	if((clause_start_char = count_characters) < 0)
2635 		clause_start_char = 0;
2636 	clause_start_word = count_words + 1;
2637 
2638 	for(ix=0; ix<N_TR_SOURCE; ix++)
2639 		charix[ix] = 0;
2640 	terminator = ReadClause(tr, f_text, source, charix, &charix_top, N_TR_SOURCE, &tone2, voice_change_name);
2641 
2642 	if((f_logespeak != NULL) && (logging_type & 4))
2643 	{
2644 		fprintf(f_logespeak,"CLAUSE %x:\n",terminator);
2645 		for(p=source; *p != 0; p++)
2646 			fputc(*p, f_logespeak);
2647 		fprintf(f_logespeak,"ENDCLAUSE\n");
2648 		fflush(f_logespeak);
2649 	}
2650 	p = source;
2651 
2652 	charix[charix_top+1] = 0;
2653 	charix[charix_top+2] = 0x7fff;
2654 	charix[charix_top+3] = 0;
2655 
2656 	clause_pause = (terminator & 0xfff) * 10;  // mS
2657 	if(terminator & CLAUSE_PAUSE_LONG)
2658 	  clause_pause = clause_pause * 32 ;  // pause value is *320mS not *10mS
2659 
2660 	tone = (terminator >> 12) & 0x7;
2661 	if(tone2 != 0)
2662 	{
2663 		// override the tone type
2664 		tone = tone2;
2665 	}
2666 
2667 	for(p=source; *p != 0; p++)
2668 	{
2669 		if(!isspace2(*p))
2670 		{
2671 			break;
2672 		}
2673 	}
2674 	if(*p == 0)
2675 	{
2676 		// No characters except spaces. This is not a sentence.
2677 		// Don't add this pause, just make up the previous pause to this value;
2678 		clause_pause -= max_clause_pause;
2679 		if(clause_pause < 0)
2680 			clause_pause = 0;
2681 
2682 		if(new_sentence)
2683 			terminator |= CLAUSE_BIT_SENTENCE;  // carry forward an end-of-sentence indicator
2684 		max_clause_pause += clause_pause;
2685 		new_sentence2 = 0;
2686 	}
2687 	else
2688 	{
2689 		max_clause_pause = clause_pause;
2690 		new_sentence2 = new_sentence;
2691 	}
2692 	tr->clause_terminator = terminator;
2693 
2694 	if(new_sentence2)
2695 	{
2696 		count_sentences++;
2697 		if(skip_sentences > 0)
2698 		{
2699 			skip_sentences--;
2700 			if(skip_sentences == 0)
2701 				skipping_text = 0;
2702 		}
2703 	}
2704 
2705 	memset(&ph_list2[0],0,sizeof(ph_list2[0]));
2706 	ph_list2[0].phcode = phonPAUSE_SHORT;
2707 
2708 	n_ph_list2 = 1;
2709 	tr->prev_last_stress = 0;
2710 	tr->prepause_timeout = 0;
2711 	tr->expect_verb=0;
2712 	tr->expect_noun=0;
2713 	tr->expect_past=0;
2714 	tr->expect_verb_s=0;
2715 	tr->phonemes_repeat_count = 0;
2716 	tr->end_stressed_vowel=0;
2717 	tr->prev_dict_flags[0] = 0;
2718 	tr->prev_dict_flags[1] = 0;
2719 
2720 	word_count = 0;
2721 	single_quoted = 0;
2722 	word_flags = 0;
2723 	next_word_flags = 0;
2724 
2725 	sbuf[0] = 0;
2726 	sbuf[1] = ' ';
2727 	sbuf[2] = ' ';
2728 	ix = 3;
2729 	prev_in = ' ';
2730 
2731 	words[0].start = ix;
2732 	words[0].flags = 0;
2733 	finished = 0;
2734 
2735 	for(j=0; charix[j]<=0; j++);
2736 	words[0].sourceix = charix[j];
2737 	k = 0;
2738 	while(charix[j] != 0)
2739 	{
2740 		// count the number of characters (excluding multibyte continuation bytes)
2741 		if(charix[j++] != -1)
2742 			k++;
2743 	}
2744 	words[0].length = k;
2745 
2746 	while(!finished && (ix < (int)sizeof(sbuf))&& (n_ph_list2 < N_PHONEME_LIST-4))
2747 	{
2748 		prev_out2 = prev_out;
2749 		utf8_in2(&prev_out,&sbuf[ix-1],1);   // prev_out = sbuf[ix-1];
2750 
2751 		if(tr->langopts.tone_numbers && IsDigit09(prev_out) && IsAlpha(prev_out2))
2752 		{
2753 			// tone numbers can be part of a word, consider them as alphabetic
2754 			prev_out = 'a';
2755 		}
2756 
2757 		if(prev_in_save != 0)
2758 		{
2759 			prev_in = prev_in_save;
2760 			prev_in_save = 0;
2761 		}
2762 		else if(source_index > 0)
2763 		{
2764 			utf8_in2(&prev_in,&source[source_index-1],1);  //  prev_in = source[source_index-1];
2765 		}
2766 
2767 		prev_source_index = source_index;
2768 
2769 		if(char_inserted)
2770 		{
2771 			c = char_inserted;
2772 			char_inserted = 0;
2773 		}
2774 		else
2775 		{
2776 			source_index += utf8_in(&cc,&source[source_index]);   // cc = source[source_index++];
2777 			c = cc;
2778 		}
2779 		next_in_nbytes = utf8_in(&next_in,&source[source_index]);
2780 
2781 		if(c == 0)
2782 		{
2783 			finished = 1;
2784 			c = ' ';
2785 		}
2786 
2787 		if((c == CTRL_EMBEDDED) || (c == ctrl_embedded))
2788 		{
2789 			// start of embedded command in the text
2790 			int srcix = source_index-1;
2791 
2792 			if(prev_in != ' ')
2793 			{
2794 				c = ' ';
2795 				prev_in_save = c;
2796 				source_index--;
2797 			}
2798 			else
2799 			{
2800 				embedded_count += EmbeddedCommand(&source_index);
2801 				prev_in_save = prev_in;
2802 				// replace the embedded command by spaces
2803 				memset(&source[srcix],' ',source_index-srcix);
2804 				source_index = srcix;
2805 				continue;
2806 			}
2807 		}
2808 
2809 		if((option_sayas2 == SAYAS_KEY) && (c != ' '))
2810 		{
2811 			if((prev_in == ' ') && (next_in == ' '))
2812 				option_sayas2 = SAYAS_SINGLE_CHARS;   // single character, speak its name
2813 			c = towlower2(c);
2814 		}
2815 
2816 
2817 		if(phoneme_mode)
2818 		{
2819 			all_upper_case = FLAG_PHONEMES;
2820 
2821 			if((c == ']') && (next_in == ']'))
2822 			{
2823 				phoneme_mode = 0;
2824 				source_index++;
2825 				c = ' ';
2826 			}
2827 		}
2828 		else if((option_sayas2 & 0xf0) == SAYAS_DIGITS)
2829 		{
2830 			if(iswdigit(c))
2831 			{
2832 				count_sayas_digits++;
2833 				if(count_sayas_digits > (option_sayas2 & 0xf))
2834 				{
2835 					// break after the specified number of digits
2836 					c = ' ';
2837 					space_inserted = 1;
2838 					count_sayas_digits = 0;
2839 				}
2840 			}
2841 			else
2842 			{
2843 				count_sayas_digits = 0;
2844 				if(iswdigit(prev_out))
2845 				{
2846 					c = ' ';
2847 					space_inserted = 1;
2848 				}
2849 			}
2850 		}
2851 		else if((option_sayas2 & 0x10) == 0)
2852 		{
2853 			// speak as words
2854 
2855 #ifdef deleted
2856 if((c == '/') && (tr->langopts.testing & 2) && IsDigit09(next_in) && IsAlpha(prev_out))
2857 {
2858 	// TESTING, explicit indication of stressed syllable by /2 after the word
2859 	word_mark = next_in-'0';
2860 	source_index++;
2861 	c = ' ';
2862 }
2863 #endif
2864 			if((c == 0x92) || (c == 0xb4) || (c == 0x2019) || (c == 0x2032))
2865 				c = '\'';    // 'microsoft' quote or sexed closing single quote, or prime - possibly used as apostrophe
2866 
2867 			if(((c == 0x2018) || (c == '?')) && IsAlpha(prev_out) && IsAlpha(next_in))
2868 			{
2869 				// ? between two letters may be a smart-quote replaced by ?
2870 				c = '\'';
2871 			}
2872 
2873 			if(c == CHAR_EMPHASIS)
2874 			{
2875 				// this character is a marker that the previous word is the focus of the clause
2876 				c = ' ';
2877 				word_flags |= FLAG_FOCUS;
2878 			}
2879 
2880 			if(c == CHAR_COMMA_BREAK)
2881 			{
2882 				c = ' ';
2883 				word_flags |= FLAG_COMMA_AFTER;
2884 			}
2885 
2886 			c = TranslateChar(tr, &source[source_index], prev_in,c, next_in, &char_inserted, &word_flags);  // optional language specific function
2887 			if(c == 8)
2888 				continue;  // ignore this character
2889 
2890 			if(char_inserted)
2891 				next_in = char_inserted;
2892 
2893 			// allow certain punctuation within a word (usually only apostrophe)
2894 			if(!IsAlpha(c) && !IsSpace(c) && (wcschr(tr->punct_within_word,c) == 0))
2895 			{
2896 				if(IsAlpha(prev_out))
2897 				{
2898 					if(tr->langopts.tone_numbers && IsDigit09(c) && !IsDigit09(next_in))
2899 					{
2900 						// allow a tone number as part of the word
2901 					}
2902 					else
2903 					{
2904 						c = ' ';   // ensure we have an end-of-word terminator
2905 						space_inserted = 1;
2906 					}
2907 				}
2908 			}
2909 
2910 			if(iswdigit(prev_out))
2911 			{
2912 				if(!iswdigit(c) && (c != '.') && (c != ',') && (c != ' '))
2913 				{
2914 					c = ' ';   // terminate digit string with a space
2915 					space_inserted = 1;
2916 				}
2917 			}
2918 			else
2919 			{
2920 				if(prev_in != ',')
2921 				{
2922 					decimal_sep_count = 0;
2923 				}
2924 			}
2925 
2926 			if(c == '[')
2927 			{
2928 				if((next_in == '\002') || ((next_in == '[') && option_phoneme_input))
2929 				{
2930 					//  "[\002" is used internally to start phoneme mode
2931 					phoneme_mode = FLAG_PHONEMES;
2932 					source_index++;
2933 					continue;
2934 				}
2935 			}
2936 
2937 			if(IsAlpha(c))
2938 			{
2939 				if(!IsAlpha(prev_out) || (tr->langopts.ideographs && ((c > 0x3040) || (prev_out > 0x3040))))
2940 				{
2941 					if(wcschr(tr->punct_within_word,prev_out) == 0)
2942 						letter_count = 0;    // don't reset count for an apostrophy within a word
2943 
2944 					if((prev_out != ' ') && (wcschr(tr->punct_within_word,prev_out) == 0))
2945 					{
2946 						// start of word, insert space if not one there already
2947 						c = ' ';
2948 						space_inserted = 1;
2949 
2950 						if(!IsBracket(prev_out))    // ?? perhaps only set FLAG_NOSPACE for . - /  (hyphenated words, URLs, etc)
2951 						{
2952 							next_word_flags |= FLAG_NOSPACE;
2953 						}
2954 					}
2955 					else
2956 					{
2957 						if(iswupper2(c))
2958 							word_flags |= FLAG_FIRST_UPPER;
2959 
2960 						if((prev_out == ' ') && iswdigit(sbuf[ix-2]) && !iswdigit(prev_in))
2961 						{
2962 							// word, following a number, but with a space between
2963 							// Add an extra space, to distinguish "2 a" from "2a"
2964 							sbuf[ix++] = ' ';
2965 							words[word_count].start++;
2966 						}
2967 					}
2968 				}
2969 
2970 				if(c != ' ')
2971 				{
2972 					letter_count++;
2973 
2974 					if(tr->letter_bits_offset > 0)
2975 					{
2976 						if(((c < 0x250) && (prev_out >= tr->letter_bits_offset)) ||
2977 								((c >= tr->letter_bits_offset) && (letter_count > 1) && (prev_out < 0x250)))
2978 						{
2979 							// Don't mix native and Latin characters in the same word
2980 							// Break into separate words
2981 							if(IsAlpha(prev_out))
2982 							{
2983 								c = ' ';
2984 								space_inserted = 1;
2985 								word_flags |= FLAG_HYPHEN_AFTER;
2986 								next_word_flags |= FLAG_HYPHEN;
2987 							}
2988 						}
2989 					}
2990 				}
2991 
2992 				if(iswupper2(c))
2993 				{
2994 					c = towlower2(c);
2995 
2996 					if((j = tr->langopts.param[LOPT_CAPS_IN_WORD]) > 0)
2997 					{
2998 						if((j == 2) && (syllable_marked == 0))
2999 						{
3000 							char_inserted = c;
3001 							c = 0x2c8;   // stress marker
3002 							syllable_marked = 1;
3003 						}
3004 					}
3005 					else
3006 					{
3007 						if(iswlower2(prev_in))
3008 						{
3009 							// lower case followed by upper case in a word
3010 							if(UpperCaseInWord(tr, &sbuf[ix], c) == 1)
3011 							{
3012 								// convert to lower case and continue
3013 								c = towlower2(c);
3014 							}
3015 							else
3016 							{
3017 								c = ' ';      // lower case followed by upper case, treat as new word
3018 								space_inserted = 1;
3019 								prev_in_save = c;
3020 								//							next_word_flags |= FLAG_NOSPACE;  // problem: prevents FLAG_HAS_DOT being set
3021 							}
3022 						}
3023 						else if((c != ' ') && iswupper2(prev_in) && iswlower2(next_in))
3024 						{
3025 							int next2_in;
3026 							utf8_in(&next2_in,&source[source_index + next_in_nbytes]);
3027 
3028 							if((tr->translator_name == L('n','l')) && (letter_count==2) && (c == 'j') && (prev_in == 'I'))
3029 							{
3030 								// Dutch words may capitalise initial IJ, don't split
3031 							}
3032 							else
3033 							if(IsAlpha(next2_in))
3034 							{
3035 								// changing from upper to lower case, start new word at the last uppercase, if 3 or more letters
3036 								c = ' ';
3037 								space_inserted = 1;
3038 								prev_in_save = c;
3039 								next_word_flags |= FLAG_NOSPACE;
3040 							}
3041 						}
3042 					}
3043 				}
3044 				else
3045 				{
3046 					if((all_upper_case) && (letter_count > 2))
3047 					{
3048 						if((c == 's') && (next_in==' '))
3049 						{
3050 							c = ' ';
3051 							all_upper_case |= FLAG_HAS_PLURAL;
3052 
3053 							if(sbuf[ix-1] == '\'')
3054 								sbuf[ix-1] = ' ';
3055 						}
3056 						else
3057 							all_upper_case = 0;  // current word contains lower case letters, not "'s"
3058 					}
3059 					else
3060 						all_upper_case = 0;
3061 				}
3062 			}
3063 			else if(c=='-')
3064 			{
3065 				if(!IsSpace(prev_in) && IsAlpha(next_in))
3066 				{
3067 					if(prev_out != ' ')
3068 					{
3069 						// previous 'word' not yet ended (not alpha or numeric), start new word now.
3070 						c = ' ';
3071 						space_inserted = 1;
3072 					}
3073 					else
3074 					{
3075 						// '-' between two letters is a hyphen, treat as a space
3076 						word_flags |= FLAG_HYPHEN;
3077 						if(word_count > 0)
3078 							words[word_count-1].flags |= FLAG_HYPHEN_AFTER;
3079 						c = ' ';
3080 					}
3081 				}
3082 				else if((prev_in==' ') && (next_in==' '))
3083 				{
3084 					// ' - ' dash between two spaces, treat as pause
3085 					c = ' ';
3086 					pre_pause_add = 4;
3087 				}
3088 				else if(next_in=='-')
3089 				{
3090 					// double hyphen, treat as pause
3091 					source_index++;
3092 					c = ' ';
3093 					pre_pause_add = 4;
3094 				}
3095 				else if((prev_out == ' ') && IsAlpha(prev_out2) && !IsAlpha(prev_in))
3096 				{
3097 					// insert extra space between a word + space + hyphen, to distinguish 'a -2' from 'a-2'
3098 					sbuf[ix++] = ' ';
3099 					words[word_count].start++;
3100 				}
3101 			}
3102 			else if(c == '.')
3103 			{
3104 				if(prev_out == '.')
3105 				{
3106 					// multiple dots, separate by spaces. Note >3 dots has been replaced by elipsis
3107 					c = ' ';
3108 					space_inserted = 1;
3109 				}
3110 				else if((word_count > 0) && !(words[word_count-1].flags & FLAG_NOSPACE) && IsAlpha(prev_in))
3111 				{
3112 					// dot after a word, with space following, probably an abbreviation
3113 					words[word_count-1].flags |= FLAG_HAS_DOT;
3114 
3115 					if(IsSpace(next_in) || (next_in == '-'))
3116 						c = ' ';   // remove the dot if it's followed by a space or hyphen, so that it's not pronounced
3117 				}
3118 			}
3119 			else if(c == '\'')
3120 			{
3121 				if(((prev_in == '.') || iswalnum(prev_in)) && IsAlpha(next_in))
3122 				{
3123 					// between two letters, or in an abbreviation (eg. u.s.a.'s). Consider the apostrophe as part of the word
3124 					single_quoted = 0;
3125 				}
3126 				else if((tr->langopts.param[LOPT_APOSTROPHE] & 1) && IsAlpha(next_in))
3127 				{
3128 					single_quoted = 0;   // apostrophe at start of word is part of the word
3129 				}
3130 				else if((tr->langopts.param[LOPT_APOSTROPHE] & 2) && IsAlpha(prev_in))
3131 				{
3132 					single_quoted = 0;   // apostrophe at end of word is part of the word
3133 				}
3134 				else if((wcschr(tr->char_plus_apostrophe,prev_in) != 0) && (prev_out2 == ' '))
3135 				{
3136 					// consider single character plus apostrophe as a word
3137 					single_quoted = 0;
3138 					if(next_in == ' ')
3139 					{
3140 						source_index++;  // skip following space
3141 					}
3142 				}
3143 				else
3144 				{
3145 					if((prev_out == 's') && (single_quoted==0))
3146 					{
3147 						// looks like apostrophe after an 's'
3148 						c = ' ';
3149 					}
3150 					else
3151 					{
3152 						if(IsSpace(prev_out))
3153 							single_quoted = 1;
3154 						else
3155 							single_quoted = 0;
3156 
3157 						pre_pause_add = 4;   // single quote
3158 						c = ' ';
3159 					}
3160 				}
3161 			}
3162 			else
3163 #ifdef deleted
3164 // Brackets are now recognised in TranslateRules()
3165 				if(IsBracket(c))
3166 				{
3167 					pre_pause_add = 4;
3168 					c = ' ';
3169 				}
3170 				else
3171 #endif
3172 					if(lookupwchar(breaks,c) != 0)
3173 					{
3174 						c = ' ';  // various characters to treat as space
3175 					}
3176 					else if(iswdigit(c))
3177 					{
3178 						if(tr->langopts.tone_numbers && IsAlpha(prev_out) && !IsDigit(next_in))
3179 						{
3180 						}
3181 						else if((prev_out != ' ') && !iswdigit(prev_out))
3182 						{
3183 							if((prev_out != tr->langopts.decimal_sep) || ((decimal_sep_count > 0) && (tr->langopts.decimal_sep == ',')))
3184 							{
3185 								c = ' ';
3186 								space_inserted = 1;
3187 							}
3188 							else
3189 							{
3190 								decimal_sep_count = 1;
3191 							}
3192 						}
3193 						else if((prev_out == ' ') && IsAlpha(prev_out2) && !IsAlpha(prev_in))
3194 						{
3195 							// insert extra space between a word and a number, to distinguish 'a 2' from 'a2'
3196 							sbuf[ix++] = ' ';
3197 							words[word_count].start++;
3198 						}
3199 					}
3200 		}
3201 
3202 		if(IsSpace(c))
3203 		{
3204 			if(prev_out == ' ')
3205 			{
3206 				word_flags |= FLAG_MULTIPLE_SPACES;
3207 				continue;   // multiple spaces
3208 			}
3209 
3210 			if((cc == 0x09) || (cc == 0x0a))
3211 			{
3212 				next_word_flags |= FLAG_MULTIPLE_SPACES;   // tab or newline, not a simple space
3213 			}
3214 
3215 			if(space_inserted)
3216 			{
3217 				// count the number of characters since the start of the word
3218 				j = 0;
3219 				k = source_index - 1;
3220 				while((k >= source_index_word) && (charix[k] != 0))
3221 				{
3222 					if(charix[k] > 0)    // don't count initial bytes of multi-byte character
3223 						j++;
3224 					k--;
3225 				}
3226 				words[word_count].length = j;
3227 			}
3228 
3229 			source_index_word = source_index;
3230 
3231 			// end of 'word'
3232 			sbuf[ix++] = ' ';
3233 
3234 			if((word_count < N_CLAUSE_WORDS-1) && (ix > words[word_count].start))
3235 			{
3236 				if(embedded_count > 0)
3237 				{
3238 					// there are embedded commands before this word
3239 					embedded_list[embedded_ix-1] |= 0x80;   // terminate list of commands for this word
3240 					words[word_count].flags |= FLAG_EMBEDDED;
3241 					embedded_count = 0;
3242 				}
3243 				words[word_count].pre_pause = pre_pause;
3244 				words[word_count].flags |= (all_upper_case | word_flags | word_emphasis);
3245 				words[word_count].wmark = word_mark;
3246 
3247 				if(pre_pause > 0)
3248 				{
3249 					// insert an extra space before the word, to prevent influence from previous word across the pause
3250 					for(j=ix; j>words[word_count].start; j--)
3251 					{
3252 						sbuf[j] = sbuf[j-1];
3253 					}
3254 					sbuf[j] = ' ';
3255 					words[word_count].start++;
3256 					ix++;
3257 				}
3258 
3259 				word_count++;
3260 				words[word_count].start = ix;
3261 				words[word_count].flags = 0;
3262 
3263 				for(j=source_index; charix[j] <= 0; j++);   // skip blanks
3264 				words[word_count].sourceix = charix[j];
3265 				k = 0;
3266 				while(charix[j] != 0)
3267 				{
3268 					// count the number of characters (excluding multibyte continuation bytes)
3269 					if(charix[j++] != -1)
3270 						k++;
3271 				}
3272 				words[word_count].length = k;
3273 
3274 				word_flags = next_word_flags;
3275 				next_word_flags = 0;
3276 				pre_pause = 0;
3277 				word_mark = 0;
3278 				all_upper_case = FLAG_ALL_UPPER;
3279 				syllable_marked = 0;
3280 			}
3281 
3282 			if(space_inserted)
3283 			{
3284 				source_index = prev_source_index;    // rewind to the previous character
3285 				char_inserted = 0;
3286 				space_inserted = 0;
3287 			}
3288 		}
3289 		else
3290 		{
3291 			if((ix < (N_TR_SOURCE - 4)))
3292 				ix += utf8_out(c,&sbuf[ix]);   // sbuf[ix++] = c;
3293 		}
3294 		if(pre_pause_add > pre_pause)
3295 			pre_pause = pre_pause_add;
3296 		pre_pause_add = 0;
3297 	}
3298 
3299 	if((word_count==0) && (embedded_count > 0))
3300 	{
3301 		// add a null 'word' to carry the embedded command flag
3302 		embedded_list[embedded_ix-1] |= 0x80;
3303 		words[word_count].flags |= FLAG_EMBEDDED;
3304 		word_count = 1;
3305 	}
3306 
3307 	tr->clause_end = &sbuf[ix-1];
3308 	sbuf[ix] = 0;
3309 	words[0].pre_pause = 0;  // don't add extra pause at beginning of clause
3310 	words[word_count].pre_pause = 8;
3311 	if(word_count > 0)
3312 	{
3313 		ix = word_count-1;
3314 		while((ix > 0) && (IsBracket(sbuf[words[ix].start])))
3315 			ix--;  // the last word is a bracket, mark the previous word as last
3316 		words[ix].flags |= FLAG_LAST_WORD;
3317 
3318 		// FLAG_NOSPACE check to avoid recognizing  .mr  -mr
3319 		if((terminator & CLAUSE_DOT) && !(words[word_count-1].flags & FLAG_NOSPACE))
3320 			words[word_count-1].flags |= FLAG_HAS_DOT;
3321 	}
3322 	words[0].flags |= FLAG_FIRST_WORD;
3323 
3324 
3325 	for(ix=0; ix < word_count; ix++)
3326 	{
3327 		int nx;
3328 		int c_temp;
3329 		char *pn;
3330 		char *pw;
3331 		int nw;
3332 		char number_buf[150];
3333 		WORD_TAB num_wtab[50];  // copy of 'words', when splitting numbers into parts
3334 
3335 		// start speaking at a specified word position in the text?
3336 		count_words++;
3337 		if(skip_words > 0)
3338 		{
3339 			skip_words--;
3340 			if(skip_words == 0)
3341 				skipping_text = 0;
3342 		}
3343 		if(skipping_text)
3344 			continue;
3345 
3346 		current_alphabet = NULL;
3347 
3348 		// digits should have been converted to Latin alphabet ('0' to '9')
3349 		word = pw = &sbuf[words[ix].start];
3350 
3351 		if(iswdigit(word[0]) && (tr->langopts.break_numbers != BREAK_THOUSANDS))
3352 		{
3353 			// Languages with 100000 numbers.  Remove thousands separators so that we can insert them again later
3354 			pn = number_buf;
3355 			while(pn < &number_buf[sizeof(number_buf)-20])
3356 			{
3357 				if(iswdigit(*pw))
3358 				{
3359 					*pn++ = *pw++;
3360 				}
3361 				else if((*pw == tr->langopts.thousands_sep) && (pw[1] == ' ')
3362 						&& iswdigit(pw[2]) && (pw[3] != ' ') && (pw[4] != ' '))  // don't allow only 1 or 2 digits in the final part
3363 				{
3364 					pw += 2;
3365 					ix++;  // skip "word"
3366 				}
3367 				else
3368 				{
3369 					nx = pw - word;
3370 					memset(word,' ',nx);
3371 					nx = pn - number_buf;
3372 					memcpy(word,number_buf,nx);
3373 					break;
3374 				}
3375 			}
3376 			pw = word;
3377 		}
3378 
3379 		for(n_digits=0; iswdigit(word[n_digits]); n_digits++);  // count consecutive digits
3380 
3381 		if(n_digits > 4)
3382 		{
3383 			// word is entirely digits, insert commas and break into 3 digit "words"
3384 			number_buf[0] = ' ';
3385 			pn = &number_buf[1];
3386 			nx = n_digits;
3387 			nw = 0;
3388 
3389 			if((n_digits > tr->langopts.max_digits) || (word[0] == '0'))
3390 				words[ix].flags |= FLAG_INDIVIDUAL_DIGITS;
3391 
3392 			while(pn < &number_buf[sizeof(number_buf)-20])
3393 			{
3394 				if(!IsDigit09(c = *pw++) && (c != tr->langopts.decimal_sep))
3395 					break;
3396 
3397 				*pn++ = c;
3398 				nx--;
3399 				if((nx > 0) && (tr->langopts.break_numbers & (1 << nx)))
3400 				{
3401 					memcpy(&num_wtab[nw++], &words[ix], sizeof(WORD_TAB));   // copy the 'words' entry for each word of numbers
3402 
3403 					if(tr->langopts.thousands_sep != ' ')
3404 					{
3405 						*pn++ = tr->langopts.thousands_sep;
3406 					}
3407 					*pn++ = ' ';
3408 
3409 					if((words[ix].flags & FLAG_INDIVIDUAL_DIGITS) == 0)
3410 					{
3411 						if(tr->langopts.break_numbers & (1 << (nx-1)))
3412 						{
3413 							// the next group only has 1 digits, make it three
3414 							*pn++ = '0';
3415 							*pn++ = '0';
3416 						}
3417 						if(tr->langopts.break_numbers & (1 << (nx-2)))
3418 						{
3419 							// the next group only has 2 digits (eg. Indian languages), make it three
3420 							*pn++ = '0';
3421 						}
3422 					}
3423 				}
3424 			}
3425 			pw--;
3426 			memcpy(&num_wtab[nw], &words[ix], sizeof(WORD_TAB)*2);    // the original number word, and the word after it
3427 
3428 			for(j=1; j<=nw; j++)
3429 			{
3430 				num_wtab[j].flags &= ~(FLAG_MULTIPLE_SPACES | FLAG_EMBEDDED);     // don't use these flags for subsequent parts when splitting a number
3431 			}
3432 
3433 			// include the next few characters, in case there are an ordinal indicator or other suffix
3434 			memcpy(pn, pw, 16);
3435 			pn[16] = 0;
3436 			nw = 0;
3437 
3438 			for(pw = &number_buf[1]; pw < pn;)
3439 			{
3440 				// keep wflags for each part, for FLAG_HYPHEN_AFTER
3441 				dict_flags = TranslateWord2(tr, pw, &num_wtab[nw++], words[ix].pre_pause,0 );
3442 				while(*pw++ != ' ');
3443 				words[ix].pre_pause = 0;
3444 			}
3445 		}
3446 		else
3447 		{
3448 			pre_pause = 0;
3449 
3450 			dict_flags = TranslateWord2(tr, word, &words[ix], words[ix].pre_pause, words[ix+1].pre_pause);
3451 
3452 			if(pre_pause > words[ix+1].pre_pause)
3453 			{
3454 				words[ix+1].pre_pause = pre_pause;
3455 				pre_pause = 0;
3456 			}
3457 
3458 			if(dict_flags & FLAG_SPELLWORD)
3459 			{
3460 				// redo the word, speaking single letters
3461 				for(pw = word; *pw != ' ';)
3462 				{
3463 					memset(number_buf,' ',9);
3464 					nx = utf8_in(&c_temp, pw);
3465 					memcpy(&number_buf[2],pw,nx);
3466 					TranslateWord2(tr, &number_buf[2], &words[ix], 0, 0 );
3467 					pw += nx;
3468 				}
3469 			}
3470 
3471 			if((dict_flags & (FLAG_ALLOW_DOT | FLAG_NEEDS_DOT)) && (ix == word_count - 1 - dictionary_skipwords) && (terminator & CLAUSE_DOT))
3472 			{
3473 				// probably an abbreviation such as Mr. or B. rather than end of sentence
3474 				clause_pause = 10;
3475 				tone = 4;
3476 			}
3477 		}
3478 
3479 		if(dict_flags & FLAG_SKIPWORDS)
3480 		{
3481 			// dictionary indicates skip next word(s)
3482 			while(dictionary_skipwords > 0)
3483 			{
3484 				words[ix+dictionary_skipwords].flags |= FLAG_DELETE_WORD;
3485 				dictionary_skipwords--;
3486 			}
3487 		}
3488 	}
3489 
3490 	if(embedded_read < embedded_ix)
3491 	{
3492 		// any embedded commands not yet processed?
3493 		Word_EmbeddedCmd();
3494 	}
3495 
3496 	for(ix=0; ix<2; ix++)
3497 	{
3498 		// terminate the clause with 2 PAUSE phonemes
3499 		PHONEME_LIST2 *p2;
3500 		p2 = &ph_list2[n_ph_list2 + ix];
3501 		p2->phcode = phonPAUSE;
3502 		p2->stresslevel = 0;
3503 		p2->sourceix = source_index;
3504 		p2->synthflags = 0;
3505 	}
3506 	n_ph_list2 += 2;
3507 
3508 	if(count_words == 0)
3509 	{
3510 		clause_pause = 0;
3511 	}
3512 	if(Eof() && ((word_count == 0) || (option_endpause==0)))
3513 	{
3514 		clause_pause = 10;
3515 	}
3516 
3517 	MakePhonemeList(tr, clause_pause, new_sentence2);
3518 	phoneme_list[N_PHONEME_LIST].ph = NULL;   // recognize end of phoneme_list array, in Generate()
3519 	phoneme_list[N_PHONEME_LIST].sourceix = 1;
3520 
3521 	if(embedded_count)   // ???? is this needed
3522 	{
3523 		phoneme_list[n_phoneme_list-2].synthflags = SFLAG_EMBEDDED;
3524 		embedded_list[embedded_ix-1] |= 0x80;
3525 		embedded_list[embedded_ix] = 0x80;
3526 	}
3527 
3528 
3529 	prev_clause_pause = clause_pause;
3530 
3531 	if(tone_out != NULL)
3532 		*tone_out = tone;
3533 
3534 	new_sentence = 0;
3535 	if(terminator & CLAUSE_BIT_SENTENCE)
3536 	{
3537 		new_sentence = 1;  // next clause is a new sentence
3538 	}
3539 
3540 
3541 	if(voice_change != NULL)
3542 	{
3543 		// return new voice name if an embedded voice change command terminated the clause
3544 		if(terminator & CLAUSE_BIT_VOICE)
3545 			*voice_change = voice_change_name;
3546 		else
3547 			*voice_change = NULL;
3548 	}
3549 
3550 	if(Eof() || (vp_input==NULL))
3551 		return(NULL);
3552 
3553 	if(option_multibyte == espeakCHARS_WCHAR)
3554 		return((void *)p_wchar_input);
3555 	else
3556 		return((void *)p_textinput);
3557 }  //  end of TranslateClause
3558 
3559 
3560 
3561 
3562 
InitText(int control)3563 void InitText(int control)
3564 {//=======================
3565 	count_sentences = 0;
3566 	count_words = 0;
3567 	end_character_position = 0;
3568 	skip_sentences = 0;
3569 	skip_marker[0] = 0;
3570 	skip_words = 0;
3571 	skip_characters = 0;
3572 	skipping_text = 0;
3573 	new_sentence = 1;
3574 
3575 	prev_clause_pause = 0;
3576 
3577 	option_sayas = 0;
3578 	option_sayas2 = 0;
3579 	option_emphasis = 0;
3580 	word_emphasis = 0;
3581 	embedded_flag = 0;
3582 
3583 	InitText2();
3584 
3585 	if((control & espeakKEEP_NAMEDATA) == 0)
3586 	{
3587 		InitNamedata();
3588 	}
3589 }
3590 
3591