1 /***************************************************************************
2  *   Copyright (C) 2005 to 2014 by Jonathan Duddington                     *
3  *   email: jonsd@users.sourceforge.net                                    *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 3 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write see:                           *
17  *               <http://www.gnu.org/licenses/>.                           *
18  ***************************************************************************/
19 
20 #include "StdAfx.h"
21 
22 #include <stdio.h>
23 #include <ctype.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <wctype.h>
27 
28 #include "speak_lib.h"
29 #include "speech.h"
30 #include "phoneme.h"
31 #include "synthesize.h"
32 #include "translate.h"
33 
// Functions used in this file; their definitions are not in this part of the source.
extern void Write4Bytes(FILE *f, int value);
int HashDictionary(const char *string);

// Stream for compiler messages. Some callers test it for NULL before writing, others do not.
static FILE *f_log = NULL;
extern char *dir_dictionary;

extern char word_phonemes[N_WORD_PHONEMES];    // a word translated into phoneme codes

static int linenum;                    // line number of the source line being compiled, printed in messages
static int error_count;                // incremented for every error that is reported
static int text_mode = 0;              // set by $textmode, cleared by $phonememode: entries give replacement text
static int debug_flag = 0;
static int error_need_dictionary = 0;  // number of "Need to compile dictionary again" reports so far

static int hash_counts[N_HASH_DICT];   // entries per hash value; compile_dictlist_end() overwrites these with file offsets
static char *hash_chains[N_HASH_DICT]; // per-hash linked lists; each node is a 'next' pointer followed by the entry bytes
static char letterGroupsDefined[N_LETTER_GROUPS];  // non-zero for each letter group (Lnn) that rules may reference
51 
// Keywords which may follow '$' inside the pre/post part of a rule, and the code which
// is stored with RULE_DOLLAR for each of them.
// copy_rule_string() matches these by prefix in table order, so a name must come before
// any shorter name that is a prefix of it. DecodeRule() converts the codes back to names
// with LookupMnemName(), which returns the first entry with a matching value.
MNEM_TAB mnem_rules[] = {
	{"unpr",   DOLLAR_UNPR},
	{"noprefix", DOLLAR_NOPREFIX},  // rule fails if a prefix has been removed
	{"list",   DOLLAR_LIST},    // a pronunciation is given in the *_list file

	{"w_alt1", 0x11},
	{"w_alt2", 0x12},
	{"w_alt3", 0x13},
	{"w_alt4", 0x14},
	{"w_alt5", 0x15},
	{"w_alt6", 0x16},
	{"w_alt", 0x11},   // note: put longer names before their sub-strings

	{"p_alt1", 0x21},
	{"p_alt2", 0x22},
	{"p_alt3", 0x23},
	{"p_alt4", 0x24},
	{"p_alt5", 0x25},
	{"p_alt6", 0x26},
	{"p_alt", 0x21},
	{NULL, -1}         // end of table
};
74 
// Keywords ("$name") which can be given on a line of a *_list file, and the flag code
// which compile_line() stores in the dictionary entry for each of them.
// print_dictionary_flags() uses the same table to convert flag bits back into names.
MNEM_TAB mnem_flags[] = {
	// these in the first group put a value in bits0-3 of dictionary_flags
	{"$1", 0x41},           // stress on 1st syllable
	{"$2", 0x42},           // stress on 2nd syllable
	{"$3", 0x43},
	{"$4", 0x44},
	{"$5", 0x45},
	{"$6", 0x46},
	{"$7", 0x47},
	{"$u", 0x48},           // reduce to unstressed
	{"$u1", 0x49},
	{"$u2", 0x4a},
	{"$u3", 0x4b},
	{"$u+",  0x4c},           // reduce to unstressed, but stress at end of clause
	{"$u1+", 0x4d},
	{"$u2+", 0x4e},
	{"$u3+", 0x4f},


	// these set the corresponding numbered bit in dictionary_flags
	{"$pause",     8},    // ensure pause before this word
	{"$strend",    9},   // full stress if at end of clause
	{"$strend2",   10},   // full stress if at end of clause, or only followed by unstressed
	{"$unstressend",11},  // reduce stress at end of clause
	{"$abbrev",    13},   // use this pronunciation rather than split into letters

// language specific
	{"$double",    14},   // IT double the initial consonant of next word
	{"$alt",       15},   // use alternative pronunciation
	{"$alt1",      15},   // synonym for $alt
	{"$alt2",      16},
	{"$alt3",      17},
	{"$alt4",      18},
	{"$alt5",      19},
	{"$alt6",      20},

	{"$combine",   23},   // Combine with the next word

	{"$dot",       24},   // ignore '.' after this word (abbreviation)
	{"$hasdot",    25},   // use this pronunciation if there is a dot after the word

	{"$max3",      27},   // limit to 3 repetitions
	{"$brk",       28},   // a shorter $pause
	{"$text",      29},   // word translates to replacement text, not phonemes

// flags in dictionary word 2
	{"$verbf",   0x20},   // verb follows
	{"$verbsf",  0x21},   // verb follows, allow -s suffix
	{"$nounf",   0x22},   // noun follows
	{"$pastf",   0x23},   // past tense follows
	{"$verb",    0x24},   // use this pronunciation when it's a verb
	{"$noun",    0x25},   // use this pronunciation when it's a noun
	{"$past",    0x26},   // use this pronunciation when it's past tense
	{"$verbextend",0x28}, // extend influence of 'verb follows'
	{"$capital", 0x29},   // use this pronunciation if initial letter is upper case
	{"$allcaps", 0x2a},   // presumably: use this pronunciation if the whole word is upper case (TODO confirm; the old comment duplicated $capital)
	{"$accent",  0x2b},   // character name is base-character name + accent name
	{"$sentence",0x2d},   // only if this clause is a sentence (i.e. terminator is {. ? !} not {, ; :}
	{"$only",    0x2e},   // only match on this word without suffix
	{"$onlys",   0x2f},   // only match with none, or with 's' suffix
	{"$stem",    0x30},   // must have a suffix
	{"$atend",   0x31},   // use this pronunciation if at end of clause
	{"$atstart", 0x32},   // use this pronunciation at start of clause
	{"$native",  0x33},   // not if we've switched translators

	// doesn't set dictionary_flags
	{"$?",        100},   // conditional rule, followed by byte giving the condition number

	// these two are not stored in the entry; compile_line() uses them to switch text_mode on and off
	{"$textmode",  200},
	{"$phonememode", 201},
	{NULL,   -1}          // end of table
};
147 
148 
// Maximum length of a rule group name, not counting the terminating zero
#define LEN_GROUP_NAME  12

// Record describing one rule group.
// NOTE(review): none of these fields is used in this part of the file; 'start' and 'length'
// presumably locate the group's compiled rules - confirm against the code which fills them.
typedef struct {
	char name[LEN_GROUP_NAME+1];
	unsigned int start;
	unsigned int length;
	int group3_ix;
} RGROUP;
157 
158 
int isspace2(unsigned int c)
{//=========================
// Replacement for isspace(), which on Windows reports TRUE for isspace(0xe1).
// Returns 1 for the character codes 1 to ' ' (space) inclusive, and 0 for
// everything else, including zero.
	return(((c != 0) && (c <= ' ')) ? 1 : 0);
}
168 
169 
170 
fopen_log(const char * fname,const char * access)171 static FILE *fopen_log(const char *fname,const char *access)
172 {//==================================================
173 // performs fopen, but produces error message to f_log if it fails
174 	FILE *f;
175 
176 	if((f = fopen(fname,access)) == NULL)
177 	{
178 		if(f_log != NULL)
179 			fprintf(f_log,"Can't access (%s) file '%s'\n",access,fname);
180 	}
181 	return(f);
182 }
183 
184 
const char *LookupMnemName(MNEM_TAB *table, const int value)
//==========================================================
/* Look up a value in a mnemonic table and return the name of the first entry
   which has that value.  The table must end with an entry whose name is NULL.
   Returns an empty string if no entry has the value. */
{
	MNEM_TAB *entry;

	for(entry = table; entry->mnem != NULL; entry++)
	{
		if(entry->value == value)
			return(entry->mnem);
	}
	return("");   /* not found */
}   /* end of LookupMnemName */
197 
198 
print_dictionary_flags(unsigned int * flags,char * buf,int buf_len)199 void print_dictionary_flags(unsigned int *flags, char *buf, int buf_len)
200 {//========================================================================
201 	int stress;
202 	int ix;
203 	const char *name;
204 	int len;
205 	int total = 0;
206 
207 	buf[0] = 0;
208 	if((stress = flags[0] & 0xf) != 0)
209 	{
210 		sprintf(buf, "%s", LookupMnemName(mnem_flags, stress + 0x40));
211 		total = strlen(buf);
212 		buf += total;
213 	}
214 
215 	for(ix=8; ix<64; ix++)
216 	{
217 		if(((ix < 30) && (flags[0] & (1 << ix))) || ((ix >= 0x20) && (flags[1] & (1 << (ix-0x20)))))
218 		{
219 			name = LookupMnemName(mnem_flags, ix);
220 			len = strlen(name) + 1;
221 			total += len;
222 			if(total >= buf_len)
223 				continue;
224 			sprintf(buf, " %s", name);
225 			buf += len;
226 		}
227 	}
228 }
229 
230 
231 
232 
DecodeRule(const char * group_chars,int group_length,char * rule,int control)233 char *DecodeRule(const char *group_chars, int group_length, char *rule, int control)
234 {//=================================================================================
235 	/* Convert compiled match template to ascii */
236 
237 	unsigned char rb;
238 	unsigned char c;
239 	char *p;
240 	char *p_end;
241 	int  ix;
242 	int  match_type;
243 	int  finished=0;
244 	int  value;
245 	int  linenum=0;
246 	int  flags;
247 	int  suffix_char;
248 	int  condition_num=0;
249 	int  at_start = 0;
250 	const char *name;
251 	char buf[200];
252 	char buf_pre[200];
253 	char suffix[20];
254 	static char output[80];
255 
256 	static char symbols[] =
257 		{' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
258 		'&','%','+','#','S','D','Z','A','L','!',' ','@','?','J','N','K','V','?','T','X','?','W'
259 		};
260 
261 	static char symbols_lg[] = {'A','B','C','H','F','G','Y'};
262 
263 	match_type = 0;
264 	buf_pre[0] = 0;
265 
266 	for(ix=0; ix<group_length; ix++)
267 	{
268 		buf[ix] = group_chars[ix];
269 	}
270 	buf[ix] = 0;
271 
272 	p = &buf[strlen(buf)];
273 	while(!finished)
274 	{
275 		rb = *rule++;
276 
277 		if(rb <= RULE_LINENUM)
278 		{
279 			switch(rb)
280 			{
281 			case 0:
282 			case RULE_PHONEMES:
283 				finished=1;
284 				break;
285 			case RULE_PRE_ATSTART:
286 				at_start = 1;  // drop through to next case
287 			case RULE_PRE:
288 				match_type = RULE_PRE;
289 				*p = 0;
290 				p = buf_pre;
291 				break;
292 			case RULE_POST:
293 				match_type = RULE_POST;
294 				*p = 0;
295 				strcat(buf," (");
296 				p = &buf[strlen(buf)];
297 				break;
298 			case RULE_PH_COMMON:
299 				break;
300 			case RULE_CONDITION:
301 				/* conditional rule, next byte gives condition number */
302 				condition_num = *rule++;
303 				break;
304 			case RULE_LINENUM:
305 				value = (rule[1] & 0xff) - 1;
306 				linenum = (rule[0] & 0xff) - 1 + (value * 255);
307 				rule+=2;
308 				break;
309 			}
310 			continue;
311 		}
312 
313 		if(rb == RULE_DOLLAR)
314 		{
315 			value = *rule++ & 0xff;
316 			if((value != 0x01) || (control & FLAG_UNPRON_TEST))
317 			{
318 				// TODO write the string backwards if in RULE_PRE
319 				p[0] = '$';
320 				name = LookupMnemName(mnem_rules, value);
321 				strcpy(&p[1],name);
322 				p += (strlen(name)+1);
323 			}
324 			c = ' ';
325 		}
326 		else if(rb == RULE_ENDING)
327 		{
328 			static const char *flag_chars = "eipvdfq tba ";
329 			flags = ((rule[0] & 0x7f)<< 8) + (rule[1] & 0x7f);
330 			suffix_char = 'S';
331 			if(flags & (SUFX_P >> 8))
332 				suffix_char = 'P';
333 			sprintf(suffix,"%c%d",suffix_char,rule[2] & 0x7f);
334 			rule += 3;
335 			for(ix=0; ix<9; ix++)
336 			{
337 				if(flags & 1)
338 					sprintf(&suffix[strlen(suffix)],"%c",flag_chars[ix]);
339 				flags = (flags >> 1);
340 			}
341 			strcpy(p,suffix);
342 			p += strlen(suffix);
343 			c = ' ';
344 		}
345 		else if(rb == RULE_LETTERGP)
346 		{
347 			c = symbols_lg[*rule++ - 'A'];
348 		}
349 		else if(rb == RULE_LETTERGP2)
350 		{
351 			value = *rule++ - 'A';
352 			p[0] = 'L';
353 			p[1] = (value / 10) + '0';
354 			c = (value % 10) + '0';
355 
356 			if(match_type == RULE_PRE)
357 			{
358 				p[0] = c;
359 				c = 'L';
360 			}
361 			p+=2;
362 		}
363 		else if(rb <= RULE_LAST_RULE)
364 			c = symbols[rb];
365 		else if(rb == RULE_SPACE)
366 			c = '_';
367 		else
368 			c = rb;
369 		*p++ = c;
370 	}
371 	*p = 0;
372 
373 	p = output;
374 	p_end = p + sizeof(output) - 1;
375 
376 	if(linenum > 0)
377 	{
378 		sprintf(p,"%5d:\t",linenum);
379 		p += 7;
380 	}
381 	if(condition_num > 0)
382 	{
383 		sprintf(p,"?%d ",condition_num);
384 		p = &p[strlen(p)];
385 	}
386 	if(((ix = strlen(buf_pre)) > 0) || at_start)
387 	{
388 		if(at_start)
389 			*p++ = '_';
390 		while((--ix >= 0) && (p < p_end-3))
391 			*p++ = buf_pre[ix];
392 		*p++ = ')';
393 		*p++ = ' ';
394 	}
395 	*p = 0;
396 
397 	buf[p_end - p] = 0;  // prevent overflow in output[]
398 	strcat(p,buf);
399 	ix = strlen(output);
400 	while(ix < 8)
401 		output[ix++]=' ';
402 	output[ix]=0;
403 	return(output);
404 }   /* end of DecodeRule */
405 
406 
407 
408 
compile_line(char * linebuf,char * dict_line,int * hash)409 static int compile_line(char *linebuf, char *dict_line, int *hash)
410 {//===============================================================
411 // Compile a line in the language_list file
412 	unsigned char  c;
413 	char *p;
414 	char *word;
415 	char *phonetic;
416 	unsigned int  ix;
417 	int  step;
418 	unsigned int  n_flag_codes = 0;
419 	int flagnum;
420 	int  flag_offset;
421 	int  length;
422 	int  multiple_words = 0;
423 	int  multiple_numeric_hyphen = 0;
424 	char *multiple_string = NULL;
425 	char *multiple_string_end = NULL;
426 
427 	int len_word;
428 	int len_phonetic;
429 	int text_not_phonemes;   // this word specifies replacement text, not phonemes
430 	unsigned int  wc;
431 	int all_upper_case;
432 
433 	char *mnemptr;
434 	unsigned char flag_codes[100];
435 	char encoded_ph[200];
436 	char bad_phoneme_str[4];
437 	int bad_phoneme;
438 	static char nullstring[] = {0};
439 
440 	text_not_phonemes = 0;
441 	phonetic = word = nullstring;
442 
443 	p = linebuf;
444 //	while(isspace2(*p)) p++;
445 
446 #ifdef deleted
447 	if(*p == '$')
448 	{
449 		if(memcmp(p,"$textmode",9) == 0)
450 		{
451 			text_mode = 1;
452 			return(0);
453 		}
454 		if(memcmp(p,"$phonememode",12) == 0)
455 		{
456 			text_mode = 0;
457 			return(0);
458 		}
459 	}
460 #endif
461 
462 	step = 0;
463 
464 	c = 0;
465 	while(c != '\n')
466 	{
467 		c = *p;
468 
469 		if((c == '?') && (step==0))
470 		{
471 			// conditional rule, allow only if the numbered condition is set for the voice
472 			flag_offset = 100;
473 
474 			p++;
475 			if(*p == '!')
476 			{
477 				// allow only if the numbered condition is NOT set
478 				flag_offset = 132;
479 				p++;
480 			}
481 
482 			ix = 0;
483 			if(IsDigit09(*p))
484 			{
485 				ix += (*p-'0');
486 				p++;
487 			}
488 			if(IsDigit09(*p))
489 			{
490 				ix = ix*10 + (*p-'0');
491 				p++;
492 			}
493 			flag_codes[n_flag_codes++] = ix + flag_offset;
494 			c = *p;
495 		}
496 
497 		if((c == '$') && isalnum(p[1]))
498 		{
499 			/* read keyword parameter */
500 			mnemptr = p;
501 			while(!isspace2(c = *p)) p++;
502 			*p = 0;
503 
504 			flagnum = LookupMnem(mnem_flags,mnemptr);
505 			if(flagnum > 0)
506 			{
507 				if(flagnum == 200)
508 				{
509 					text_mode = 1;
510 				}
511 				else if(flagnum == 201)
512 				{
513 					text_mode = 0;
514 				}
515 				else if(flagnum == BITNUM_FLAG_TEXTMODE)
516 				{
517 					text_not_phonemes = 1;
518 				}
519 				else
520 				{
521 					flag_codes[n_flag_codes++] = flagnum;
522 				}
523 			}
524 			else
525 			{
526 				fprintf(f_log,"%5d: Unknown keyword: %s\n",linenum,mnemptr);
527 				error_count++;
528 			}
529 		}
530 
531 		if((c == '/') && (p[1] == '/') && (multiple_words==0))
532 		{
533 			c = '\n';   /* "//" treat comment as end of line */
534 		}
535 
536 		switch(step)
537 		{
538 		case 0:
539 			if(c == '(')
540 			{
541 				multiple_words = 1;
542 				word = p+1;
543 				step = 1;
544 			}
545 			else if(!isspace2(c))
546 			{
547 				word = p;
548 				step = 1;
549 			}
550 			break;
551 
552 		case 1:
553 			if((c == '-') && multiple_words)
554 			{
555 				if(IsDigit09(word[0]))
556 				{
557 					multiple_numeric_hyphen = 1;
558 				}
559 //				else  // ???
560 				{
561 					flag_codes[n_flag_codes++] = BITNUM_FLAG_HYPHENATED;
562 				}
563 				c = ' ';
564 			}
565 			if(isspace2(c))
566 			{
567 				p[0] = 0;   /* terminate english word */
568 
569 				if(multiple_words)
570 				{
571 					multiple_string = multiple_string_end = p+1;
572 					step = 2;
573 				}
574 				else
575 				{
576 					step = 3;
577 				}
578 			}
579 			else if(c == ')')
580 			{
581 				if(multiple_words)
582 				{
583 					p[0] = 0;
584 					multiple_words = 0;
585 					step = 3;
586 				}
587 				else if(word[0] != '_')
588 				{
589 					fprintf(f_log, "%5d: Missing '('\n", linenum);
590 					error_count++;
591 					step = 3;
592 				}
593 			}
594 			break;
595 
596 		case 2:
597 			if(isspace2(c))
598 			{
599 				multiple_words++;
600 			}
601 			else if(c == ')')
602 			{
603 				p[0] = ' ';   // terminate extra string
604 				multiple_string_end = p+1;
605 				step = 3;
606 			}
607 			break;
608 
609 		case 3:
610 			if(!isspace2(c))
611 			{
612 				phonetic = p;
613 				step = 4;
614 			}
615 			break;
616 
617 		case 4:
618 			if(isspace2(c))
619 			{
620 				p[0] = 0;   /* terminate phonetic */
621 				step = 5;
622 			}
623 			break;
624 
625 		case 5:
626 			break;
627 		}
628 		p++;
629 	}
630 
631 	if(word[0] == 0)
632 	{
633 		return(0);   /* blank line */
634 	}
635 
636 	if(text_mode)
637 		text_not_phonemes = 1;
638 
639 	if(text_not_phonemes)
640 	{
641 		if(word[0] == '_')
642 		{
643 			// This is a special word, used by eSpeak.  Translate this into phonemes now
644 			strcat(phonetic, " ");     // need a space to indicate word-boundary
645 
646 			// PROBLEM  vowel reductions are not applied to the translated phonemes
647 			// condition rules are not applied
648 			TranslateWord(translator,phonetic,0,NULL,NULL);
649 			text_not_phonemes = 0;
650 			strncpy0(encoded_ph, word_phonemes, N_WORD_BYTES-4);
651 
652 			if((word_phonemes[0] == 0) && (error_need_dictionary < 3))
653 			{
654 				// the dictionary was not loaded, we need a second attempt
655 				error_need_dictionary++;
656 				fprintf(f_log,"%5d: Need to compile dictionary again\n",linenum);
657 			}
658 			{
659 //char decoded_phonemes[128];
660 //DecodePhonemes(word_phonemes,decoded_phonemes);
661 //printf("Translator %x  %s  [%s] [%s]\n",translator->translator_name,word,phonetic,decoded_phonemes);
662 			}
663 		}
664 		else
665 		{
666 			// this is replacement text, so don't encode as phonemes. Restrict the length of the replacement word
667 			strncpy0(encoded_ph,phonetic,N_WORD_BYTES-4);
668 		}
669 	}
670 	else
671 	{
672 		EncodePhonemes(phonetic,encoded_ph,&bad_phoneme);
673 		if(strchr(encoded_ph,phonSWITCH) != 0)
674 		{
675 			flag_codes[n_flag_codes++] = BITNUM_FLAG_ONLY_S;  // don't match on suffixes (except 's') when switching languages
676 		}
677 
678 		// check for errors in the phonemes codes
679 		if(bad_phoneme != 0)
680 		{
681 			// unrecognised phoneme, report error
682 			bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
683 			fprintf(f_log,"%5d: Bad phoneme [%s] (U+%x) in: %s  %s\n",linenum,bad_phoneme_str,bad_phoneme,word,phonetic);
684 			error_count++;
685 		}
686 	}
687 
688 	if(text_not_phonemes != translator->langopts.textmode)
689 	{
690 		flag_codes[n_flag_codes++] = BITNUM_FLAG_TEXTMODE;
691 	}
692 
693 
694 	if(sscanf(word,"U+%x",&wc) == 1)
695 	{
696 		// Character code
697 		ix = utf8_out(wc, word);
698 		word[ix] = 0;
699 	}
700 	else if(word[0] != '_')
701 	{
702 		// convert to lower case, and note if the word is all-capitals
703 		int c2;
704 
705 		all_upper_case = 1;
706 		p = word;
707 		for(p=word;;)
708 		{
709 			// this assumes that the lower case char is the same length as the upper case char
710 			// OK, except for Turkish "I", but use towlower() rather than towlower2()
711 			ix = utf8_in(&c2,p);
712 			if(c2 == 0)
713 				break;
714 			if(iswupper2(c2))
715 			{
716 				utf8_out(towlower2(c2),p);
717 			}
718 			else
719 			{
720 				all_upper_case = 0;
721 			}
722 			p += ix;
723 		}
724 		if(all_upper_case)
725 		{
726 			flag_codes[n_flag_codes++] = BITNUM_FLAG_ALLCAPS;
727 		}
728 	}
729 
730 	len_word = strlen(word);
731 
732 	if(translator->transpose_min > 0)
733 	{
734 		len_word = TransposeAlphabet(translator, word);
735 	}
736 
737 	*hash = HashDictionary(word);
738 	len_phonetic = strlen(encoded_ph);
739 
740 	dict_line[1] = len_word;   // bit 6 indicates whether the word has been compressed
741 	len_word &= 0x3f;
742 
743 	memcpy(&dict_line[2],word,len_word);
744 
745 	if(len_phonetic == 0)
746 	{
747 		// no phonemes specified. set bit 7
748 		dict_line[1] |= 0x80;
749 		length = len_word + 2;
750 	}
751 	else
752 	{
753 		length = len_word + len_phonetic + 3;
754 		strcpy(&dict_line[(len_word)+2],encoded_ph);
755 	}
756 
757 	for(ix=0; ix<n_flag_codes; ix++)
758 	{
759 		dict_line[ix+length] = flag_codes[ix];
760 	}
761 	length += n_flag_codes;
762 
763 	if((multiple_string != NULL) && (multiple_words > 0))
764 	{
765 		if(multiple_words > 10)
766 		{
767 			fprintf(f_log,"%5d: Two many parts in a multi-word entry: %d\n",linenum,multiple_words);
768 			error_count++;
769 		}
770 		else
771 		{
772 			dict_line[length++] = 80 + multiple_words;
773 			ix = multiple_string_end - multiple_string;
774 			if(multiple_numeric_hyphen)
775 			{
776 				dict_line[length++] = ' ';   // ???
777 			}
778 			memcpy(&dict_line[length],multiple_string,ix);
779 			length += ix;
780 		}
781 	}
782 	dict_line[0] = length;
783 
784 
785 	return(length);
786 }  /* end of compile_line */
787 
788 
789 
compile_dictlist_start(void)790 static void compile_dictlist_start(void)
791 {//=====================================
792 // initialise dictionary list
793 	int ix;
794 	char *p;
795 	char *p2;
796 
797 	for(ix=0; ix<N_HASH_DICT; ix++)
798 	{
799 		p = hash_chains[ix];
800 		while(p != NULL)
801 		{
802 			memcpy(&p2,p,sizeof(char *));
803 			free(p);
804 			p = p2;
805 		}
806 		hash_chains[ix] = NULL;
807 		hash_counts[ix]=0;
808 	}
809 }
810 
811 
compile_dictlist_end(FILE * f_out)812 static void compile_dictlist_end(FILE *f_out)
813 {//==========================================
814 // Write out the compiled dictionary list
815 	int hash;
816 	int length;
817 	char *p;
818 
819 	if(f_log != NULL)
820 	{
821 #ifdef OUTPUT_FORMAT
822 		for(hash=0; hash<N_HASH_DICT; hash++)
823 		{
824 			fprintf(f_log,"%8d",hash_counts[hash]);
825 			if((hash & 7) == 7)
826 				fputc('\n',f_log);
827 		}
828 		fflush(f_log);
829 #endif
830 	}
831 
832 	for(hash=0; hash<N_HASH_DICT; hash++)
833 	{
834 		p = hash_chains[hash];
835 		hash_counts[hash] = (int)ftell(f_out);
836 
837 		while(p != NULL)
838 		{
839 			length = *(p+sizeof(char *));
840 			fwrite(p+sizeof(char *),length,1,f_out);
841 			memcpy(&p,p,sizeof(char *));
842 		}
843 		fputc(0,f_out);
844 	}
845 }
846 
847 
848 
compile_dictlist_file(const char * path,const char * filename)849 static int compile_dictlist_file(const char *path, const char* filename)
850 {//=====================================================================
851 	int  length;
852 	int  hash;
853 	char *p;
854 	int  count=0;
855 	FILE *f_in;
856 	char buf[200];
857 	char fname[sizeof(path_home)+45];
858 	char dict_line[128];
859 
860 	text_mode = 0;
861 
862 	// try with and without '.txt' extension
863 	sprintf(fname,"%s%s.txt",path,filename);
864 	if((f_in = fopen(fname,"r")) == NULL)
865 	{
866 		sprintf(fname,"%s%s",path,filename);
867 		if((f_in = fopen(fname,"r")) == NULL)
868 			return(-1);
869 	}
870 
871 	fprintf(f_log,"Compiling: '%s'\n",fname);
872 
873 	linenum=0;
874 
875 	while(fgets(buf,sizeof(buf),f_in) != NULL)
876 	{
877 		linenum++;
878 
879 		length = compile_line(buf,dict_line,&hash);
880 		if(length == 0)  continue;   /* blank line */
881 
882 		hash_counts[hash]++;
883 
884 		p = (char *)malloc(length+sizeof(char *));
885 		if(p == NULL)
886 		{
887 			if(f_log != NULL)
888 			{
889 				fprintf(f_log,"Can't allocate memory\n");
890 				error_count++;
891 			}
892 			break;
893 		}
894 
895 		memcpy(p,&hash_chains[hash],sizeof(char *));
896 		hash_chains[hash] = p;
897 		memcpy(p+sizeof(char *),dict_line,length);
898 		count++;
899 	}
900 
901 	fprintf(f_log,"\t%d entries\n",count);
902 	fclose(f_in);
903 	return(0);
904 }   /* end of compile_dictlist_file */
905 
906 
907 
// Work buffers for the rule which is being compiled. compile_rule() empties them, and
// copy_rule_string() fills the one selected by its state:
// 0 = condition, 1 = pre, 2 = match, 3 = post, 4 = phonemes
static char rule_cond[80];
static char rule_pre[80];
static char rule_post[80];
static char rule_match[80];
static char rule_phonemes[80];
static char group_name[LEN_GROUP_NAME+1];   // compile_rule() substitutes this for a match string of "$group"
static int group3_ix;

#define N_RULES 2000		// max rules for each group
917 
918 
919 
int isHexDigit(int c)
{
// Returns the value (0 to 15) of a hexadecimal digit character, upper or lower
// case, or -1 if 'c' is not a hexadecimal digit.
	int digit = -1;

	if((c >= '0') && (c <= '9'))
		digit = c - '0';
	else if((c >= 'a') && (c <= 'f'))
		digit = 10 + (c - 'a');
	else if((c >= 'A') && (c <= 'F'))
		digit = 10 + (c - 'A');

	return(digit);
}
930 
931 
copy_rule_string(char * string,int * state_out)932 static void copy_rule_string(char *string, int *state_out)
933 {//=======================================================
934 // state 0: conditional, 1=pre, 2=match, 3=post, 4=phonemes
935 	static char *outbuf[5] = {rule_cond, rule_pre, rule_match, rule_post, rule_phonemes};
936 	static int next_state[5] = {2,2,4,4,4};
937 	char *output;
938 	char *p;
939 	int ix;
940 	int len;
941 	char c;
942 	int c2, c3;
943 	int  sxflags;
944 	int  value;
945 	int  literal;
946 	int  hexdigit_input = 0;
947 	int state = *state_out;
948 	MNEM_TAB *mr;
949 
950 	if(string[0] == 0) return;
951 
952 	output = outbuf[state];
953 	if(state==4)
954 	{
955 		// append to any previous phoneme string, i.e. allow spaces in the phoneme string
956 		len = strlen(rule_phonemes);
957 		if(len > 0)
958 			rule_phonemes[len++] = ' ';
959 		output = &rule_phonemes[len];
960 	}
961 	sxflags = 0x808000;           // to ensure non-zero bytes
962 
963 	for(p=string,ix=0;;)
964 	{
965 		literal = 0;
966 		c = *p++;
967 		if((c == '0') && (p[0] == 'x') && (isHexDigit(p[1]) >= 0) && (isHexDigit(p[2]) >= 0))
968 		{
969 			hexdigit_input = 1;
970 			c = p[1];
971 			p+= 2;
972 		}
973 		if(c == '\\')
974 		{
975 			c = *p++;   // treat next character literally
976 //#ifdef deleted
977 			if((c >= '0') && (c <= '3') && (p[0] >= '0') && (p[0] <= '7') && (p[1] >= '0') && (p[1] <= '7'))
978 			{
979 				// character code given by 3 digit octal value;
980 				c = (c-'0')*64 + (p[0]-'0')*8 + (p[1]-'0');
981 				p += 2;
982 			}
983 //endif
984 			literal = 1;
985 		}
986 		if(hexdigit_input)
987 		{
988 			if(((c2 = isHexDigit(c)) >= 0) && ((c3 = isHexDigit(p[0])) >= 0))
989 			{
990 				c = c2 * 16 + c3;
991 				literal = 1;
992 				p++;
993 			}
994 			else
995 			{
996 				hexdigit_input = 0;
997 			}
998 		}
999 		if((state==1) || (state==3))
1000 		{
1001 			// replace special characters (note: 'E' is reserved for a replaced silent 'e')
1002 			if(literal == 0)
1003 			{
1004 				static const char lettergp_letters[9] = {LETTERGP_A,LETTERGP_B,LETTERGP_C,0,0,LETTERGP_F,LETTERGP_G,LETTERGP_H,LETTERGP_Y};
1005 				switch(c)
1006 				{
1007 				case '_':
1008 					c = RULE_SPACE;
1009 					break;
1010 
1011 				case 'Y':
1012 					c = 'I';   // drop through to next case
1013 				case 'A':   // vowel
1014 				case 'B':
1015 				case 'C':
1016 				case 'H':
1017 				case 'F':
1018 				case 'G':
1019 					if(state == 1)
1020 					{
1021 						// pre-rule, put the number before the RULE_LETTERGP;
1022 						output[ix++] = lettergp_letters[c-'A'] + 'A';
1023 						c = RULE_LETTERGP;
1024 					}
1025 					else
1026 					{
1027 						output[ix++] = RULE_LETTERGP;
1028 						c = lettergp_letters[c-'A'] + 'A';
1029 					}
1030 					break;
1031 				case 'D':
1032 					c = RULE_DIGIT;
1033 					break;
1034 				case 'K':
1035 					c = RULE_NOTVOWEL;
1036 					break;
1037 				case 'N':
1038 					c = RULE_NO_SUFFIX;
1039 					break;
1040 				case 'V':
1041 					c = RULE_IFVERB;
1042 					break;
1043 				case 'Z':
1044 					c = RULE_NONALPHA;
1045 					break;
1046 				case '+':
1047 					c = RULE_INC_SCORE;
1048 					break;
1049 				case '@':
1050 					c = RULE_SYLLABLE;
1051 					break;
1052 				case '&':
1053 					c = RULE_STRESSED;
1054 					break;
1055 				case '%':
1056 					c = RULE_DOUBLE;
1057 					break;
1058 				case '#':
1059 					c = RULE_DEL_FWD;
1060 					break;
1061 				case '!':
1062 					c = RULE_CAPITAL;
1063 					break;
1064 				case 'T':
1065 					output[ix++] = RULE_DOLLAR;
1066 					c = 0x11;
1067 					break;
1068 				case 'W':
1069 					c = RULE_SPELLING;
1070 					break;
1071 				case 'X':
1072 					c = RULE_NOVOWELS;
1073 					break;
1074 				case 'J':
1075 					c = RULE_SKIPCHARS;
1076 					break;
1077 				case 'L':
1078 					// expect two digits
1079 					c = *p++ - '0';
1080 					value = *p++ - '0';
1081 					c = c * 10 + value;
1082 					if((value < 0) || (value > 9))
1083 					{
1084 						c = 0;
1085 						fprintf(f_log,"%5d: Expected 2 digits after 'L'\n",linenum);
1086 						error_count++;
1087 					}
1088 					else if((c <= 0) || (c >= N_LETTER_GROUPS) || (letterGroupsDefined[(int)c] == 0))
1089 					{
1090 						fprintf(f_log,"%5d: Letter group L%.2d not defined\n",linenum,c);
1091 						error_count++;
1092 					}
1093 					c += 'A';
1094 					if(state == 1)
1095 					{
1096 						// pre-rule, put the group number before the RULE_LETTERGP command
1097 						output[ix++] = c;
1098 						c = RULE_LETTERGP2;
1099 					}
1100 					else
1101 					{
1102 						output[ix++] = RULE_LETTERGP2;
1103 					}
1104 					break;
1105 
1106 				case '$':
1107 					value = 0;
1108 					mr = mnem_rules;
1109 					while(mr->mnem != NULL)
1110 					{
1111 						len = strlen(mr->mnem);
1112 						if(memcmp(p, mr->mnem, len) == 0)
1113 						{
1114 							value = mr->value;
1115 							p += len;
1116 							break;
1117 						}
1118 						mr++;
1119 					}
1120 
1121 					if(state == 1)
1122 					{
1123 						// pre-rule, put the number before the RULE_DOLLAR
1124 						output[ix++] = value;
1125 						c = RULE_DOLLAR;
1126 					}
1127 					else
1128 					{
1129 						output[ix++] = RULE_DOLLAR;
1130 						c = value;
1131 					}
1132 
1133 					if(value == 0)
1134 					{
1135 						fprintf(f_log,"%5d: $ command not recognized\n",linenum);
1136 						error_count++;
1137 					}
1138 					break;
1139 
1140 				case 'P':
1141 					sxflags |= SUFX_P;   // Prefix, now drop through to Suffix
1142 				case 'S':
1143 					output[ix++] = RULE_ENDING;
1144 					value = 0;
1145 					while(!isspace2(c = *p++) && (c != 0))
1146 					{
1147 						switch(c)
1148 						{
1149 						case 'e':
1150 							sxflags |= SUFX_E;
1151 							break;
1152 						case 'i':
1153 							sxflags |= SUFX_I;
1154 							break;
1155 						case 'p':	// obsolete, replaced by 'P' above
1156 							sxflags |= SUFX_P;
1157 							break;
1158 						case 'v':
1159 							sxflags |= SUFX_V;
1160 							break;
1161 						case 'd':
1162 							sxflags |= SUFX_D;
1163 							break;
1164 						case 'f':
1165 							sxflags |= SUFX_F;
1166 							break;
1167 						case 'q':
1168 							sxflags |= SUFX_Q;
1169 							break;
1170 						case 't':
1171 							sxflags |= SUFX_T;
1172 							break;
1173 						case 'b':
1174 							sxflags |= SUFX_B;
1175 							break;
1176 						case 'a':
1177 							sxflags |= SUFX_A;
1178 							break;
1179 						case 'm':
1180 							sxflags |= SUFX_M;
1181 							break;
1182 						default:
1183 							if(IsDigit09(c))
1184 								value = (value*10) + (c - '0');
1185 							break;
1186 						}
1187 					}
1188 					p--;
1189 					output[ix++] = sxflags >> 16;
1190 					output[ix++] = sxflags >> 8;
1191 					c = value | 0x80;
1192 					break;
1193 				}
1194 			}
1195 		}
1196 		output[ix++] = c;
1197 		if(c == 0) break;
1198 	}
1199 
1200 	*state_out = next_state[state];
1201 }  //  end of copy_rule_string
1202 
1203 
1204 
compile_rule(char * input)1205 static char *compile_rule(char *input)
1206 {//===================================
1207 	int ix;
1208 	unsigned char c;
1209 	int wc;
1210 	char *p;
1211 	char *prule;
1212 	int len;
1213 	int len_name;
1214 	int start;
1215 	int state=2;
1216 	int finish=0;
1217 	char buf[80];
1218 	char output[150];
1219 	int bad_phoneme;
1220 	char bad_phoneme_str[4];
1221 
1222 	buf[0]=0;
1223 	rule_cond[0]=0;
1224 	rule_pre[0]=0;
1225 	rule_post[0]=0;
1226 	rule_match[0]=0;
1227 	rule_phonemes[0]=0;
1228 
1229 	p = buf;
1230 
1231 	for(ix=0; finish==0; ix++)
1232 	{
1233 		c = input[ix];
1234 
1235 		switch(c = input[ix])
1236 		{
1237 		case ')':		// end of prefix section
1238 			*p = 0;
1239 			state = 1;
1240 			copy_rule_string(buf,&state);
1241 			p = buf;
1242 			break;
1243 
1244 		case '(':		// start of suffix section
1245 			*p = 0;
1246 			state = 2;
1247 			copy_rule_string(buf,&state);
1248 			state = 3;
1249 			p = buf;
1250 			if(input[ix+1] == ' ')
1251 			{
1252 				fprintf(f_log,"%5d: Syntax error. Space after (\n",linenum);
1253 				error_count++;
1254 			}
1255 			break;
1256 
1257 		case '\n':		// end of line
1258 		case '\r':
1259 		case 0:			// end of line
1260 			*p = 0;
1261 			copy_rule_string(buf,&state);
1262 			finish=1;
1263 			break;
1264 
1265 		case '\t':		// end of section section
1266 		case ' ':
1267 			*p = 0;
1268 			copy_rule_string(buf,&state);
1269 			p = buf;
1270 			break;
1271 
1272 		case '?':
1273 			if(state==2)
1274 				state=0;
1275 			else
1276 				*p++ = c;
1277 			break;
1278 
1279 		default:
1280 			*p++ = c;
1281 			break;
1282 		}
1283 	}
1284 
1285 	if(strcmp(rule_match,"$group")==0)
1286 		strcpy(rule_match,group_name);
1287 
1288 	if(rule_match[0]==0)
1289 	{
1290 		if(rule_post[0] != 0)
1291 		{
1292 			fprintf(f_log,"%5d: Syntax error\n",linenum);
1293 			error_count++;
1294 		}
1295 		return(NULL);
1296 	}
1297 
1298 	EncodePhonemes(rule_phonemes,buf,&bad_phoneme);
1299 	if(bad_phoneme != 0)
1300 	{
1301 		bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
1302 		fprintf(f_log,"%5d: Bad phoneme [%s] (U+%x) in: %s\n",linenum,bad_phoneme_str,bad_phoneme,input);
1303 		error_count++;
1304 	}
1305 	strcpy(output,buf);
1306 	len = strlen(buf)+1;
1307 
1308 	len_name = strlen(group_name);
1309 	if((len_name > 0) && (memcmp(rule_match,group_name,len_name) != 0))
1310 	{
1311 		utf8_in(&wc,rule_match);
1312 		if((group_name[0] == '9') && IsDigit(wc))
1313 		{
1314 			// numeric group, rule_match starts with a digit, so OK
1315 		}
1316 		else
1317 		{
1318 			fprintf(f_log,"%5d: Wrong initial letters '%s' for group '%s'\n",linenum,rule_match,group_name);
1319 			error_count++;
1320 		}
1321 	}
1322 	strcpy(&output[len],rule_match);
1323 	len += strlen(rule_match);
1324 
1325 	if(debug_flag)
1326 	{
1327 		output[len] = RULE_LINENUM;
1328 		output[len+1] = (linenum % 255) + 1;
1329 		output[len+2] = (linenum / 255) + 1;
1330 		len+=3;
1331 	}
1332 
1333 	if(rule_cond[0] != 0)
1334 	{
1335 		ix = -1;
1336 		if(rule_cond[0] == '!')
1337 		{
1338 			// allow the rule only if the condition number is NOT set for the voice
1339 			ix = atoi(&rule_cond[1]) + 32;
1340 		}
1341 		else
1342 		{
1343 			// allow the rule only if the condition number is set for the voice
1344 			ix = atoi(rule_cond);
1345 		}
1346 
1347 		if((ix > 0) && (ix < 255))
1348 		{
1349 			output[len++] = RULE_CONDITION;
1350 			output[len++] = ix;
1351 		}
1352 		else
1353 		{
1354 			fprintf(f_log,"%5d: bad condition number ?%d\n",linenum,ix);
1355 			error_count++;
1356 		}
1357 	}
1358 	if(rule_pre[0] != 0)
1359 	{
1360 		start = 0;
1361 		if(rule_pre[0] == RULE_SPACE)
1362 		{
1363 			// omit '_' at the beginning of the pre-string and imply it by using RULE_PRE_ATSTART
1364 			c = RULE_PRE_ATSTART;
1365 			start = 1;
1366 		}
1367 		else
1368 		{
1369 			c = RULE_PRE;
1370 		}
1371 		output[len++] = c;
1372 
1373 		// output PRE string in reverse order
1374 		for(ix = strlen(rule_pre)-1; ix>=start; ix--)
1375 			output[len++] = rule_pre[ix];
1376 	}
1377 
1378 	if(rule_post[0] != 0)
1379 	{
1380 		sprintf(&output[len],"%c%s",RULE_POST,rule_post);
1381 		len += (strlen(rule_post)+1);
1382 	}
1383 	output[len++]=0;
1384 	prule = (char *)malloc(len);
1385 	memcpy(prule,output,len);
1386 	return(prule);
1387 }  //  end of compile_rule
1388 
1389 
string_sorter(char ** a,char ** b)1390 int __cdecl string_sorter(char **a, char **b)
1391 {//===========================================
1392 	char *pa, *pb;
1393 	int ix;
1394 
1395 	if((ix = strcmp(pa = *a,pb = *b)) != 0)
1396 		return(ix);
1397 	pa += (strlen(pa)+1);
1398 	pb += (strlen(pb)+1);
1399 	return(strcmp(pa,pb));
1400 }   /* end of string_sorter */
1401 
1402 
rgroup_sorter(RGROUP * a,RGROUP * b)1403 static int __cdecl rgroup_sorter(RGROUP *a, RGROUP *b)
1404 {//===================================================
1405 // Sort long names before short names
1406 	int ix;
1407 	ix = strlen(b->name) - strlen(a->name);
1408 	if(ix != 0) return(ix);
1409 	ix = strcmp(a->name,b->name);
1410 	if(ix != 0) return(ix);
1411 	return(a->start-b->start);
1412 }
1413 
1414 
1415 #ifdef OUTPUT_FORMAT
print_rule_group(FILE * f_out,int n_rules,char ** rules,char * name)1416 static void print_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
1417 {//=============================================================================
1418 	int rule;
1419 	int ix;
1420 	unsigned char c;
1421 	int len1;
1422 	int len2;
1423 	int spaces;
1424 	char *p;
1425 	char *pout;
1426 	int condition;
1427 	char buf[80];
1428 	char suffix[12];
1429 
1430 	static unsigned char symbols[] = {'@','&','%','+','#','$','D','Z','A','B','C','F'};
1431 
1432 	fprintf(f_out,"\n$group %s\n",name);
1433 
1434 	for(rule=0; rule<n_rules; rule++)
1435 	{
1436 		p = rules[rule];
1437 		len1 = strlen(p) + 1;
1438 		p = &p[len1];
1439 		len2 = strlen(p);
1440 
1441 		rule_match[0]=0;
1442 		rule_pre[0]=0;
1443 		rule_post[0]=0;
1444 		condition = 0;
1445 
1446 		pout = rule_match;
1447 		for(ix=0; ix<len2; ix++)
1448 		{
1449 			switch(c = p[ix])
1450 			{
1451 			case RULE_PRE:
1452 				*pout = 0;
1453 				pout = rule_pre;
1454 				break;
1455 			case RULE_POST:
1456 				*pout = 0;
1457 				pout = rule_post;
1458 				break;
1459 			case RULE_CONDITION:
1460 				condition = p[++ix];
1461 				break;
1462 			case RULE_ENDING:
1463 				sprintf(suffix,"$%d[%x]",(p[ix+2]),p[ix+1] & 0x7f);
1464 				ix += 2;
1465 				strcpy(pout,suffix);
1466 				pout += strlen(suffix);
1467 				break;
1468 			default:
1469 				if(c <= RULE_LETTER7)
1470 					c = symbols[c-RULE_SYLLABLE];
1471 				if(c == ' ')
1472 					c = '_';
1473 				*pout++ = c;
1474 				break;
1475 			}
1476 		}
1477 		*pout = 0;
1478 
1479 		spaces = 12;
1480 		if(condition > 0)
1481 		{
1482 			sprintf(buf,"?%d ",condition);
1483 			spaces -= strlen(buf);
1484 			fprintf(f_out,"%s",buf);
1485 		}
1486 
1487 		if(rule_pre[0] != 0)
1488 		{
1489 			p = buf;
1490 			for(ix=strlen(rule_pre)-1; ix>=0; ix--)
1491 				*p++ = rule_pre[ix];
1492 			sprintf(p,") ");
1493 			spaces -= strlen(buf);
1494 			for(ix=0; ix<spaces; ix++)
1495 				fputc(' ',f_out);
1496 			fprintf(f_out,"%s",buf);
1497 			spaces = 0;
1498 		}
1499 
1500 		for(ix=0; ix<spaces; ix++)
1501 			fputc(' ',f_out);
1502 
1503 		spaces = 14;
1504 		sprintf(buf," %s ",rule_match);
1505 		if(rule_post[0] != 0)
1506 		{
1507 			p = &buf[strlen(buf)];
1508 			sprintf(p,"(%s ",rule_post);
1509 		}
1510 		fprintf(f_out,"%s",buf);
1511 		spaces -= strlen(buf);
1512 
1513 		for(ix=0; ix<spaces; ix++)
1514 			fputc(' ',f_out);
1515 		DecodePhonemes(rules[rule],buf);
1516 		fprintf(f_out,"%s\n",buf);   // phonemes
1517 	}
1518 }
1519 #endif
1520 
1521 
1522 //#define LIST_GROUP_INFO
output_rule_group(FILE * f_out,int n_rules,char ** rules,char * name)1523 static void output_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
1524 {//==============================================================================
1525 	int ix;
1526 	int len1;
1527 	int len2;
1528 	int len_name;
1529 	char *p;
1530 	char *p2, *p3;
1531 	const char *common;
1532 
1533 	short nextchar_count[256];
1534 	memset(nextchar_count,0,sizeof(nextchar_count));
1535 
1536 	len_name = strlen(name);
1537 
1538 #ifdef OUTPUT_FORMAT
1539 	print_rule_group(f_log,n_rules,rules,name);
1540 #endif
1541 
1542 	// sort the rules in this group by their phoneme string
1543 	common = "";
1544 	qsort((void *)rules,n_rules,sizeof(char *),(int (__cdecl *)(const void *,const void *))string_sorter);
1545 
1546 	if(strcmp(name,"9")==0)
1547 		len_name = 0;    //  don't remove characters from numeric match strings
1548 
1549 	for(ix=0; ix<n_rules; ix++)
1550 	{
1551 		p = rules[ix];
1552 		len1 = strlen(p) + 1;  // phoneme string
1553 		p3 = &p[len1];
1554 		p2 = p3 + len_name;        // remove group name from start of match string
1555 		len2 = strlen(p2);
1556 
1557 		nextchar_count[(unsigned char)(p2[0])]++;   // the next byte after the group name
1558 
1559 		if((common[0] != 0) && (strcmp(p,common)==0))
1560 		{
1561 			fwrite(p2,len2,1,f_out);
1562 			fputc(0,f_out);		// no phoneme string, it's the same as previous rule
1563 		}
1564 		else
1565 		{
1566 			if((ix < n_rules-1) && (strcmp(p,rules[ix+1])==0))
1567 			{
1568 				common = rules[ix];   // phoneme string is same as next, set as common
1569 				fputc(RULE_PH_COMMON,f_out);
1570 			}
1571 
1572 			fwrite(p2,len2,1,f_out);
1573 			fputc(RULE_PHONEMES,f_out);
1574 			fwrite(p,len1,1,f_out);
1575 		}
1576 	}
1577 
1578 #ifdef LIST_GROUP_INFO
1579 	for(ix=32; ix<256; ix++)
1580 	{
1581 		if(nextchar_count[ix] > 30)
1582 			printf("Group %s   %c  %d\n",name,ix,nextchar_count[ix]);
1583 	}
1584 #endif
1585 }  //  end of output_rule_group
1586 
1587 
1588 
compile_lettergroup(char * input,FILE * f_out)1589 static int compile_lettergroup(char *input, FILE *f_out)
1590 {//=====================================================
1591 	char *p;
1592 	char *p_start;
1593 	int group;
1594 	int ix;
1595 	int n_items;
1596 	int length;
1597 	int max_length = 0;
1598 
1599 #define N_LETTERGP_ITEMS 200
1600 	char *items[N_LETTERGP_ITEMS];
1601 	char item_length[N_LETTERGP_ITEMS];
1602 
1603 	p = input;
1604 	if(!IsDigit09(p[0]) || !IsDigit09(p[1]))
1605 	{
1606 		fprintf(f_log,"%5d: Expected 2 digits after '.L'\n",linenum);
1607 		error_count++;
1608 		return(1);
1609 	}
1610 
1611 	group = atoi(&p[0]);
1612 	if(group >= N_LETTER_GROUPS)
1613 	{
1614 		fprintf(f_log,"%5d: lettergroup out of range (01-%.2d)\n",linenum,N_LETTER_GROUPS-1);
1615 		error_count++;
1616 		return(1);
1617 	}
1618 
1619 	while(!isspace2(*p)) p++;
1620 
1621 	fputc(RULE_GROUP_START,f_out);
1622 	fputc(RULE_LETTERGP2,f_out);
1623 	fputc(group + 'A', f_out);
1624 	if(letterGroupsDefined[group] != 0)
1625 	{
1626 		fprintf(f_log,"%5d: lettergroup L%.2d is already defined\n",linenum,group);
1627 		error_count++;
1628 	}
1629 	letterGroupsDefined[group] = 1;
1630 
1631 	n_items = 0;
1632 	while(n_items < N_LETTERGP_ITEMS)
1633 	{
1634 		while(isspace2(*p)) p++;
1635 		if(*p == 0)
1636 			break;
1637 
1638 		items[n_items] = p_start = p;
1639 		while((*p & 0xff) > ' ')
1640 		{
1641 			if (*p == '_') *p = ' ';   // allow '_' for word break
1642 			p++;
1643 		}
1644 		*p++ = 0;
1645 		length = p - p_start;
1646 		if(length > max_length)
1647 			max_length = length;
1648 		item_length[n_items++] = length;
1649 	}
1650 
1651 	// write out the items, longest first
1652 	while(max_length > 1)
1653 	{
1654 		for(ix=0; ix < n_items; ix++)
1655 		{
1656 			if(item_length[ix] == max_length)
1657 			{
1658 				fwrite(items[ix],1,max_length,f_out);
1659 			}
1660 		}
1661 		max_length--;
1662 	}
1663 
1664 	fputc(RULE_GROUP_END,f_out);
1665 
1666 	return(0);
1667 }
1668 
1669 
compile_dictrules(FILE * f_in,FILE * f_out,char * fname_temp)1670 static int compile_dictrules(FILE *f_in, FILE *f_out, char *fname_temp)
1671 {//====================================================================
1672 	char *prule;
1673 	unsigned char *p;
1674 	int ix;
1675 	int c;
1676 	int gp;
1677 	FILE *f_temp;
1678 	int n_rules=0;
1679 	int count=0;
1680 	int different;
1681 	int wc;
1682 	const char *prev_rgroup_name;
1683 	unsigned int char_code;
1684 	int compile_mode=0;
1685 	char *buf;
1686 	char buf1[500];
1687 	char *rules[N_RULES];
1688 
1689 	int n_rgroups = 0;
1690 	int n_groups3 = 0;
1691 	RGROUP rgroup[N_RULE_GROUP2];
1692 
1693 	linenum = 0;
1694 	group_name[0] = 0;
1695 
1696 	if((f_temp = fopen_log(fname_temp,"wb")) == NULL)
1697 		return(1);
1698 
1699 	for(;;)
1700 	{
1701 		linenum++;
1702 		buf = fgets(buf1,sizeof(buf1),f_in);
1703 		if(buf != NULL)
1704 		{
1705 			if((p = (unsigned char *)strstr(buf,"//")) != NULL)
1706 				*p = 0;
1707 
1708 			if(buf[0] == '\r') buf++;  // ignore extra \r in \r\n
1709 		}
1710 
1711 		if((buf == NULL) || (buf[0] == '.'))
1712 		{
1713 			// next .group or end of file, write out the previous group
1714 
1715 			if(n_rules > 0)
1716 			{
1717 				strcpy(rgroup[n_rgroups].name,group_name);
1718 				rgroup[n_rgroups].group3_ix = group3_ix;
1719 				rgroup[n_rgroups].start = ftell(f_temp);
1720 				output_rule_group(f_temp,n_rules,rules,group_name);
1721 				rgroup[n_rgroups].length = ftell(f_temp) - rgroup[n_rgroups].start;
1722 				n_rgroups++;
1723 
1724 				count += n_rules;
1725 			}
1726 			n_rules = 0;
1727 
1728 			if(compile_mode == 2)
1729 			{
1730 				// end of the character replacements section
1731 				fwrite(&n_rules,1,4,f_out);   // write a zero word to terminate the replacemenmt list
1732 				compile_mode = 0;
1733 			}
1734 
1735 			if(buf == NULL) break;   // end of file
1736 
1737 			if(memcmp(buf,".L",2)==0)
1738 			{
1739 				compile_lettergroup(&buf[2], f_out);
1740 				continue;
1741 			}
1742 
1743 			if(memcmp(buf,".replace",8)==0)
1744 			{
1745 				compile_mode = 2;
1746 				fputc(RULE_GROUP_START,f_out);
1747 				fputc(RULE_REPLACEMENTS,f_out);
1748 
1749 				// advance to next word boundary
1750 				while((ftell(f_out) & 3) != 0)
1751 					fputc(0,f_out);
1752 			}
1753 
1754 			if(memcmp(buf,".group",6)==0)
1755 			{
1756 				compile_mode = 1;
1757 
1758 				p = (unsigned char *)&buf[6];
1759 				while((p[0]==' ') || (p[0]=='\t')) p++;    // Note: Windows isspace(0xe1) gives TRUE !
1760 				ix = 0;
1761 				while((*p > ' ') && (ix < LEN_GROUP_NAME))
1762 					group_name[ix++] = *p++;
1763 				group_name[ix]=0;
1764 				group3_ix = 0;
1765 
1766 				if(sscanf(group_name,"0x%x",&char_code)==1)
1767 				{
1768 					// group character is given as a character code (max 16 bits)
1769 					p = (unsigned char *)group_name;
1770 
1771 					if(char_code > 0x100)
1772 					{
1773 						*p++ = (char_code >> 8);
1774 					}
1775 					*p++ = char_code;
1776 					*p = 0;
1777 				}
1778 				else
1779 				{
1780 					if(translator->letter_bits_offset > 0)
1781 					{
1782 						utf8_in(&wc, group_name);
1783 						if(((ix = (wc - translator->letter_bits_offset)) >= 0) && (ix < 128))
1784 						{
1785 							group3_ix = ix+1;   // not zero
1786 						}
1787 					}
1788 				}
1789 
1790 				if((group3_ix == 0) && (strlen(group_name) > 2))
1791 				{
1792 					if(utf8_in(&c,group_name) < 2)
1793 					{
1794 						fprintf(f_log,"%5d: Group name longer than 2 bytes (UTF8)",linenum);
1795 						error_count++;
1796 					}
1797 
1798 					group_name[2] = 0;
1799 				}
1800 			}
1801 
1802 			continue;
1803 		}
1804 
1805 		switch(compile_mode)
1806 		{
1807 		case 1:    //  .group
1808 			prule = compile_rule(buf);
1809 			if((prule != NULL) && (n_rules < N_RULES))
1810 			{
1811 				rules[n_rules++] = prule;
1812 			}
1813 			break;
1814 
1815 		case 2:   //  .replace
1816 		{
1817 			int replace1;
1818 			int replace2;
1819 			char *p;
1820 
1821 			p = buf;
1822 			replace1 = 0;
1823 			replace2 = 0;
1824 			while(isspace2(*p)) p++;
1825 			ix = 0;
1826 			while((unsigned char)(*p) > 0x20)   // not space or zero-byte
1827 			{
1828 				p += utf8_in(&c,p);
1829 				replace1 += (c << ix);
1830 				ix += 16;
1831 			}
1832 			while(isspace2(*p)) p++;
1833 			ix = 0;
1834 			while((unsigned char)(*p) > 0x20)
1835 			{
1836 				p += utf8_in(&c,p);
1837 				replace2 += (c << ix);
1838 				ix += 16;
1839 			}
1840 			if(replace1 != 0)
1841 			{
1842 				Write4Bytes(f_out,replace1);   // write as little-endian
1843 				Write4Bytes(f_out,replace2);   // if big-endian, reverse the bytes in LoadDictionary()
1844 			}
1845 		}
1846 		break;
1847 		}
1848 	}
1849 	fclose(f_temp);
1850 
1851 	qsort((void *)rgroup,n_rgroups,sizeof(rgroup[0]),(int (__cdecl *)(const void *,const void *))rgroup_sorter);
1852 
1853 	if((f_temp = fopen(fname_temp,"rb"))==NULL)
1854 		return(2);
1855 
1856 	prev_rgroup_name = "\n";
1857 
1858 	for(gp = 0; gp < n_rgroups; gp++)
1859 	{
1860 		fseek(f_temp,rgroup[gp].start,SEEK_SET);
1861 
1862 		if((different = strcmp(rgroup[gp].name, prev_rgroup_name)) != 0)
1863 		{
1864 			// not the same as the previous group
1865 			if(gp > 0)
1866 				fputc(RULE_GROUP_END,f_out);
1867 			fputc(RULE_GROUP_START,f_out);
1868 
1869 			if(rgroup[gp].group3_ix != 0)
1870 			{
1871 				n_groups3++;
1872 				fputc(1,f_out);
1873 				fputc(rgroup[gp].group3_ix, f_out);
1874 			}
1875 			else
1876 			{
1877 				fprintf(f_out, "%s", prev_rgroup_name = rgroup[gp].name);
1878 			}
1879 			fputc(0,f_out);
1880 		}
1881 
1882 		for(ix=rgroup[gp].length; ix>0; ix--)
1883 		{
1884 			c = fgetc(f_temp);
1885 			fputc(c,f_out);
1886 		}
1887 
1888 		if(different)
1889 		{
1890 		}
1891 	}
1892 	fputc(RULE_GROUP_END,f_out);
1893 	fputc(0,f_out);
1894 
1895 	fclose(f_temp);
1896 	remove(fname_temp);
1897 
1898 	fprintf(f_log,"\t%d rules, %d groups (%d)\n\n",count,n_rgroups,n_groups3);
1899 	return(0);
1900 }  //  end of compile_dictrules
1901 
1902 
1903 
CompileDictionary(const char * dsource,const char * dict_name,FILE * log,char * fname_err,int flags)1904 int CompileDictionary(const char *dsource, const char *dict_name, FILE *log, char *fname_err, int flags)
1905 {//=====================================================================================================
1906 // fname:  space to write the filename in case of error
1907 // flags: bit 0:  include source line number information, for debug purposes.
1908 
1909 	FILE *f_in;
1910 	FILE *f_out;
1911 	int offset_rules=0;
1912 	int value;
1913 	char fname_in[sizeof(path_home)+45];
1914 	char fname_out[sizeof(path_home)+15];
1915 	char fname_temp[sizeof(path_home)+15];
1916 	char path[sizeof(path_home)+40];       // path_dsource+20
1917 
1918 	error_count = 0;
1919 	error_need_dictionary = 0;
1920 	memset(letterGroupsDefined,0,sizeof(letterGroupsDefined));
1921 
1922 	debug_flag = flags & 1;
1923 
1924 	if(dsource == NULL)
1925 		dsource = "";
1926 
1927 	f_log = log;
1928 //f_log = fopen("log2.txt","w");
1929 	if(f_log == NULL)
1930 		f_log = stderr;
1931 
1932 	// try with and without '.txt' extension
1933 	sprintf(path,"%s%s_",dsource,dict_name);
1934 	sprintf(fname_in,"%srules.txt",path);
1935 	if((f_in = fopen(fname_in,"r")) == NULL)
1936 	{
1937 		sprintf(fname_in,"%srules",path);
1938 		if((f_in = fopen_log(fname_in,"r")) == NULL)
1939 		{
1940 			if(fname_err)
1941 				strcpy(fname_err,fname_in);
1942 			return(-1);
1943 		}
1944 	}
1945 
1946 	sprintf(fname_out,"%s%c%s_dict",path_home,PATHSEP,dict_name);
1947 	if((f_out = fopen_log(fname_out,"wb+")) == NULL)
1948 	{
1949 		if(fname_err)
1950 			strcpy(fname_err,fname_out);
1951 		return(-1);
1952 	}
1953 	sprintf(fname_temp,"%s%ctemp",path_home,PATHSEP);
1954 
1955 	value = N_HASH_DICT;
1956 	Write4Bytes(f_out,value);
1957 	Write4Bytes(f_out,offset_rules);
1958 
1959 	compile_dictlist_start();
1960 
1961 	fprintf(f_log,"Using phonemetable: '%s'\n",phoneme_tab_list[phoneme_tab_number].name);
1962 	compile_dictlist_file(path,"roots");
1963 	if(translator->langopts.listx)
1964 	{
1965 		compile_dictlist_file(path,"list");
1966 		compile_dictlist_file(path,"listx");
1967 	}
1968 	else
1969 	{
1970 		compile_dictlist_file(path,"listx");
1971 		compile_dictlist_file(path,"list");
1972 	}
1973 	compile_dictlist_file(path,"extra");
1974 
1975 	compile_dictlist_end(f_out);
1976 	offset_rules = ftell(f_out);
1977 
1978 	fprintf(f_log,"Compiling: '%s'\n",fname_in);
1979 
1980 	compile_dictrules(f_in,f_out,fname_temp);
1981 	fclose(f_in);
1982 
1983 	fseek(f_out,4,SEEK_SET);
1984 	Write4Bytes(f_out,offset_rules);
1985 	fclose(f_out);
1986 	fflush(f_log);
1987 
1988 	LoadDictionary(translator, dict_name, 0);
1989 
1990 	return(error_count);
1991 }  //  end of compile_dictionary
1992 
1993