1 /***************************************************************************
2 * Copyright (C) 2005 to 2014 by Jonathan Duddington *
3 * email: jonsd@users.sourceforge.net *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 3 of the License, or *
8 * (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write see: *
17 * <http://www.gnu.org/licenses/>. *
18 ***************************************************************************/
19
20 #include "StdAfx.h"
21
22 #include <stdio.h>
23 #include <ctype.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <wctype.h>
27
28 #include "speak_lib.h"
29 #include "speech.h"
30 #include "phoneme.h"
31 #include "synthesize.h"
32 #include "translate.h"
33
// Functions and data declared here but not defined in the visible part of this file
extern void Write4Bytes(FILE *f, int value);
int HashDictionary(const char *string);

static FILE *f_log = NULL;   // stream that receives progress and error messages; may be NULL (see fopen_log)
extern char *dir_dictionary;

extern char word_phonemes[N_WORD_PHONEMES]; // a word translated into phoneme codes

static int linenum;                    // number of the source line being compiled, printed in error messages
static int error_count;                // incremented each time an error is reported
static int text_mode = 0;              // set by "$textmode", cleared by "$phonememode" (see compile_line)
static int debug_flag = 0;             // non-zero: compile_rule() appends RULE_LINENUM and the line number to each rule
static int error_need_dictionary = 0;  // number of "Need to compile dictionary again" reports made so far

static int hash_counts[N_HASH_DICT];    // per hash value: number of entries; compile_dictlist_end() then stores the file offset here
static char *hash_chains[N_HASH_DICT];  // per hash value: head of a linked list of malloc'd entries (link pointer, then entry bytes)
static char letterGroupsDefined[N_LETTER_GROUPS];  // tested by copy_rule_string(): zero means letter group Lnn is not defined
51
// Names which may follow '$' in the pre or post part of a rule, and the byte value
// that copy_rule_string() stores with RULE_DOLLAR for each of them.
// copy_rule_string() takes the first entry whose name is a prefix of the input,
// which is why longer names must precede their sub-strings.
MNEM_TAB mnem_rules[] = {
	{"unpr", DOLLAR_UNPR},
	{"noprefix", DOLLAR_NOPREFIX}, // rule fails if a prefix has been removed
	{"list", DOLLAR_LIST}, // a pronunciation is given in the *_list file

	{"w_alt1", 0x11},
	{"w_alt2", 0x12},
	{"w_alt3", 0x13},
	{"w_alt4", 0x14},
	{"w_alt5", 0x15},
	{"w_alt6", 0x16},
	{"w_alt", 0x11}, // note: put longer names before their sub-strings

	{"p_alt1", 0x21},
	{"p_alt2", 0x22},
	{"p_alt3", 0x23},
	{"p_alt4", 0x24},
	{"p_alt5", 0x25},
	{"p_alt6", 0x26},
	{"p_alt", 0x21},   // same value as p_alt1
	{NULL, -1}         // end of table
};
74
// Keywords which may appear in a line of a *_list file, and the flag code which
// compile_line() stores in the dictionary entry for each of them.
// print_dictionary_flags() uses the same table to turn flag bits back into names.
MNEM_TAB mnem_flags[] = {
	// these in the first group put a value in bits0-3 of dictionary_flags
	{"$1", 0x41}, // stress on 1st syllable
	{"$2", 0x42}, // stress on 2nd syllable
	{"$3", 0x43},
	{"$4", 0x44},
	{"$5", 0x45},
	{"$6", 0x46},
	{"$7", 0x47},
	{"$u", 0x48}, // reduce to unstressed
	{"$u1", 0x49},
	{"$u2", 0x4a},
	{"$u3", 0x4b},
	{"$u+", 0x4c}, // reduce to unstressed, but stress at end of clause
	{"$u1+", 0x4d},
	{"$u2+", 0x4e},
	{"$u3+", 0x4f},


	// these set the corresponding numbered bit of dictionary_flags
	{"$pause", 8}, // ensure pause before this word
	{"$strend", 9}, // full stress if at end of clause
	{"$strend2", 10}, // full stress if at end of clause, or only followed by unstressed
	{"$unstressend",11}, // reduce stress at end of clause
	{"$abbrev", 13}, // use this pronunciation rather than split into letters

	// language specific
	{"$double", 14}, // IT double the initial consonant of next word
	{"$alt", 15}, // use alternative pronunciation
	{"$alt1", 15}, // synonym for $alt
	{"$alt2", 16},
	{"$alt3", 17},
	{"$alt4", 18},
	{"$alt5", 19},
	{"$alt6", 20},

	{"$combine", 23}, // Combine with the next word

	{"$dot", 24}, // ignore '.' after this word (abbreviation)
	{"$hasdot", 25}, // use this pronunciation if there is a dot after the word

	{"$max3", 27}, // limit to 3 repetitions
	{"$brk", 28}, // a shorter $pause
	{"$text", 29}, // word translates to replacement text, not phonemes

	// flags in dictionary word 2
	{"$verbf", 0x20}, // verb follows
	{"$verbsf", 0x21}, // verb follows, allow -s suffix
	{"$nounf", 0x22}, // noun follows
	{"$pastf", 0x23}, // past tense follows
	{"$verb", 0x24}, // use this pronunciation when it's a verb
	{"$noun", 0x25}, // use this pronunciation when it's a noun
	{"$past", 0x26}, // use this pronunciation when it's past tense
	{"$verbextend",0x28}, // extend influence of 'verb follows'
	{"$capital", 0x29}, // use this pronunciation if initial letter is upper case
	{"$allcaps", 0x2a}, // use this pronunciation if the word is all upper case (going by the keyword name - TODO confirm)
	{"$accent", 0x2b}, // character name is base-character name + accent name
	{"$sentence",0x2d}, // only if this clause is a sentence (i.e. terminator is {. ? !} not {, ; :}
	{"$only", 0x2e}, // only match on this word without suffix
	{"$onlys", 0x2f}, // only match with none, or with 's' suffix
	{"$stem", 0x30}, // must have a suffix
	{"$atend", 0x31}, // use this pronunciation if at end of clause
	{"$atstart", 0x32}, // use this pronunciation at start of clause
	{"$native", 0x33}, // not if we've switched translators

	// doesn't set dictionary_flags
	{"$?", 100}, // conditional rule, followed by byte giving the condition number

	// handled directly by compile_line(): switch text_mode on / off
	{"$textmode", 200},
	{"$phonememode", 201},
	{NULL, -1}   // end of table
};
147
148
#define LEN_GROUP_NAME 12   // maximum length of a rule group name, excluding the terminator

// Describes one rule group. rgroup_sorter() orders these by name length
// (longest first), then by name, then by 'start'.
typedef struct {
	char name[LEN_GROUP_NAME+1];   // group name, zero terminated
	unsigned int start;            // NOTE(review): presumably where the group's rules begin in the output - confirm against the code that fills this in
	unsigned int length;           // NOTE(review): presumably the size of the group's rules - confirm
	int group3_ix;                 // NOTE(review): set by code outside this view - confirm meaning
} RGROUP;
157
158
int isspace2(unsigned int c)
{//=========================
	// Replacement for isspace(), which can't be used because on Windows
	// isspace(0xe1) gives TRUE.
	// Returns 1 for a code in the range 1 to ' ' (space), otherwise 0.

	if(c > ' ')
		return(0);

	// here c <= ' ', so only zero itself has a zero low byte
	return(((c & 0xff) != 0) ? 1 : 0);
}
168
169
170
fopen_log(const char * fname,const char * access)171 static FILE *fopen_log(const char *fname,const char *access)
172 {//==================================================
173 // performs fopen, but produces error message to f_log if it fails
174 FILE *f;
175
176 if((f = fopen(fname,access)) == NULL)
177 {
178 if(f_log != NULL)
179 fprintf(f_log,"Can't access (%s) file '%s'\n",access,fname);
180 }
181 return(f);
182 }
183
184
const char *LookupMnemName(MNEM_TAB *table, const int value)
//==========================================================
/* Reverse lookup: find the first entry in a mnemonic table which has the
   given value and return its name.  The table ends with an entry whose
   name is NULL.  Returns an empty string if the value is not present. */
{
	MNEM_TAB *entry;

	for(entry = table; entry->mnem != NULL; entry++)
	{
		if(entry->value == value)
			return(entry->mnem);
	}
	return("");   /* not found */
}   /* end of LookupMnemName */
197
198
print_dictionary_flags(unsigned int * flags,char * buf,int buf_len)199 void print_dictionary_flags(unsigned int *flags, char *buf, int buf_len)
200 {//========================================================================
201 int stress;
202 int ix;
203 const char *name;
204 int len;
205 int total = 0;
206
207 buf[0] = 0;
208 if((stress = flags[0] & 0xf) != 0)
209 {
210 sprintf(buf, "%s", LookupMnemName(mnem_flags, stress + 0x40));
211 total = strlen(buf);
212 buf += total;
213 }
214
215 for(ix=8; ix<64; ix++)
216 {
217 if(((ix < 30) && (flags[0] & (1 << ix))) || ((ix >= 0x20) && (flags[1] & (1 << (ix-0x20)))))
218 {
219 name = LookupMnemName(mnem_flags, ix);
220 len = strlen(name) + 1;
221 total += len;
222 if(total >= buf_len)
223 continue;
224 sprintf(buf, " %s", name);
225 buf += len;
226 }
227 }
228 }
229
230
231
232
char *DecodeRule(const char *group_chars, int group_length, char *rule, int control)
{//=================================================================================
	/* Convert compiled match template to ascii */
	// group_chars, group_length: characters copied to the start of the match text
	// rule: the compiled rule bytes, read until a 0 or RULE_PHONEMES byte
	// control: only the FLAG_UNPRON_TEST bit is examined here
	// Returns a pointer to a static 80 byte buffer, so the text is overwritten
	// by the next call.

	unsigned char rb;
	unsigned char c;
	char *p;
	char *p_end;
	int ix;
	int match_type;
	int finished=0;
	int value;
	int linenum=0;   // note: hides the file-scope 'linenum'
	int flags;
	int suffix_char;
	int condition_num=0;
	int at_start = 0;
	const char *name;
	char buf[200];       // match text, followed by " (" and the post text
	char buf_pre[200];   // pre text, held in the compiled (reversed) order
	char suffix[20];
	static char output[80];

	// character shown for each rule code below RULE_SPACE, indexed by the code
	static char symbols[] =
	{' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
	 '&','%','+','#','S','D','Z','A','L','!',' ','@','?','J','N','K','V','?','T','X','?','W'
	};

	// character shown for the letter-group byte which follows RULE_LETTERGP
	static char symbols_lg[] = {'A','B','C','H','F','G','Y'};

	match_type = 0;
	buf_pre[0] = 0;

	for(ix=0; ix<group_length; ix++)
	{
		buf[ix] = group_chars[ix];
	}
	buf[ix] = 0;

	p = &buf[strlen(buf)];
	while(!finished)
	{
		rb = *rule++;

		if(rb <= RULE_LINENUM)
		{
			// control codes: these switch section or carry data, they emit no character themselves
			switch(rb)
			{
			case 0:
			case RULE_PHONEMES:
				finished=1;
				break;
			case RULE_PRE_ATSTART:
				at_start = 1;  // drop through to next case
			case RULE_PRE:
				match_type = RULE_PRE;
				*p = 0;
				p = buf_pre;   // following characters belong to the pre text
				break;
			case RULE_POST:
				match_type = RULE_POST;
				*p = 0;
				strcat(buf," (");
				p = &buf[strlen(buf)];
				break;
			case RULE_PH_COMMON:
				break;
			case RULE_CONDITION:
				/* conditional rule, next byte gives condition number */
				condition_num = *rule++;
				break;
			case RULE_LINENUM:
				// two bytes, each stored +1 so that neither is zero: low part is modulo 255
				value = (rule[1] & 0xff) - 1;
				linenum = (rule[0] & 0xff) - 1 + (value * 255);
				rule+=2;
				break;
			}
			continue;
		}

		if(rb == RULE_DOLLAR)
		{
			value = *rule++ & 0xff;
			if((value != 0x01) || (control & FLAG_UNPRON_TEST))
			{
				// TODO write the string backwards if in RULE_PRE
				p[0] = '$';
				name = LookupMnemName(mnem_rules, value);
				strcpy(&p[1],name);
				p += (strlen(name)+1);
			}
			c = ' ';
		}
		else if(rb == RULE_ENDING)
		{
			// three data bytes: two bytes of flags and the number which follows 'S' or 'P'
			static const char *flag_chars = "eipvdfq tba ";
			flags = ((rule[0] & 0x7f)<< 8) + (rule[1] & 0x7f);
			suffix_char = 'S';
			if(flags & (SUFX_P >> 8))
				suffix_char = 'P';
			sprintf(suffix,"%c%d",suffix_char,rule[2] & 0x7f);
			rule += 3;
			for(ix=0; ix<9; ix++)
			{
				if(flags & 1)
					sprintf(&suffix[strlen(suffix)],"%c",flag_chars[ix]);
				flags = (flags >> 1);
			}
			strcpy(p,suffix);
			p += strlen(suffix);
			c = ' ';
		}
		else if(rb == RULE_LETTERGP)
		{
			c = symbols_lg[*rule++ - 'A'];
		}
		else if(rb == RULE_LETTERGP2)
		{
			// letter group Lnn: the next byte is the group number + 'A'
			value = *rule++ - 'A';
			p[0] = 'L';
			p[1] = (value / 10) + '0';
			c = (value % 10) + '0';

			if(match_type == RULE_PRE)
			{
				// the pre text is reversed on output, so store the three characters backwards
				p[0] = c;
				c = 'L';
			}
			p+=2;
		}
		else if(rb <= RULE_LAST_RULE)
			c = symbols[rb];
		else if(rb == RULE_SPACE)
			c = '_';
		else
			c = rb;
		*p++ = c;
	}
	*p = 0;

	// assemble the result: [linenum] [?condition] [pre text + ")"] match text [" (" post text]
	p = output;
	p_end = p + sizeof(output) - 1;

	if(linenum > 0)
	{
		sprintf(p,"%5d:\t",linenum);
		p += 7;   // 5 digit field + ':' + tab
	}
	if(condition_num > 0)
	{
		sprintf(p,"?%d ",condition_num);
		p = &p[strlen(p)];
	}
	if(((ix = strlen(buf_pre)) > 0) || at_start)
	{
		if(at_start)
			*p++ = '_';
		while((--ix >= 0) && (p < p_end-3))
			*p++ = buf_pre[ix];   // copy backwards to restore the source order
		*p++ = ')';
		*p++ = ' ';
	}
	*p = 0;

	buf[p_end - p] = 0;   // prevent overflow in output[]
	strcat(p,buf);
	ix = strlen(output);
	while(ix < 8)
		output[ix++]=' ';   // pad short results to 8 characters
	output[ix]=0;
	return(output);
}   /* end of DecodeRule */
405
406
407
408
compile_line(char * linebuf,char * dict_line,int * hash)409 static int compile_line(char *linebuf, char *dict_line, int *hash)
410 {//===============================================================
411 // Compile a line in the language_list file
412 unsigned char c;
413 char *p;
414 char *word;
415 char *phonetic;
416 unsigned int ix;
417 int step;
418 unsigned int n_flag_codes = 0;
419 int flagnum;
420 int flag_offset;
421 int length;
422 int multiple_words = 0;
423 int multiple_numeric_hyphen = 0;
424 char *multiple_string = NULL;
425 char *multiple_string_end = NULL;
426
427 int len_word;
428 int len_phonetic;
429 int text_not_phonemes; // this word specifies replacement text, not phonemes
430 unsigned int wc;
431 int all_upper_case;
432
433 char *mnemptr;
434 unsigned char flag_codes[100];
435 char encoded_ph[200];
436 char bad_phoneme_str[4];
437 int bad_phoneme;
438 static char nullstring[] = {0};
439
440 text_not_phonemes = 0;
441 phonetic = word = nullstring;
442
443 p = linebuf;
444 // while(isspace2(*p)) p++;
445
446 #ifdef deleted
447 if(*p == '$')
448 {
449 if(memcmp(p,"$textmode",9) == 0)
450 {
451 text_mode = 1;
452 return(0);
453 }
454 if(memcmp(p,"$phonememode",12) == 0)
455 {
456 text_mode = 0;
457 return(0);
458 }
459 }
460 #endif
461
462 step = 0;
463
464 c = 0;
465 while(c != '\n')
466 {
467 c = *p;
468
469 if((c == '?') && (step==0))
470 {
471 // conditional rule, allow only if the numbered condition is set for the voice
472 flag_offset = 100;
473
474 p++;
475 if(*p == '!')
476 {
477 // allow only if the numbered condition is NOT set
478 flag_offset = 132;
479 p++;
480 }
481
482 ix = 0;
483 if(IsDigit09(*p))
484 {
485 ix += (*p-'0');
486 p++;
487 }
488 if(IsDigit09(*p))
489 {
490 ix = ix*10 + (*p-'0');
491 p++;
492 }
493 flag_codes[n_flag_codes++] = ix + flag_offset;
494 c = *p;
495 }
496
497 if((c == '$') && isalnum(p[1]))
498 {
499 /* read keyword parameter */
500 mnemptr = p;
501 while(!isspace2(c = *p)) p++;
502 *p = 0;
503
504 flagnum = LookupMnem(mnem_flags,mnemptr);
505 if(flagnum > 0)
506 {
507 if(flagnum == 200)
508 {
509 text_mode = 1;
510 }
511 else if(flagnum == 201)
512 {
513 text_mode = 0;
514 }
515 else if(flagnum == BITNUM_FLAG_TEXTMODE)
516 {
517 text_not_phonemes = 1;
518 }
519 else
520 {
521 flag_codes[n_flag_codes++] = flagnum;
522 }
523 }
524 else
525 {
526 fprintf(f_log,"%5d: Unknown keyword: %s\n",linenum,mnemptr);
527 error_count++;
528 }
529 }
530
531 if((c == '/') && (p[1] == '/') && (multiple_words==0))
532 {
533 c = '\n'; /* "//" treat comment as end of line */
534 }
535
536 switch(step)
537 {
538 case 0:
539 if(c == '(')
540 {
541 multiple_words = 1;
542 word = p+1;
543 step = 1;
544 }
545 else if(!isspace2(c))
546 {
547 word = p;
548 step = 1;
549 }
550 break;
551
552 case 1:
553 if((c == '-') && multiple_words)
554 {
555 if(IsDigit09(word[0]))
556 {
557 multiple_numeric_hyphen = 1;
558 }
559 // else // ???
560 {
561 flag_codes[n_flag_codes++] = BITNUM_FLAG_HYPHENATED;
562 }
563 c = ' ';
564 }
565 if(isspace2(c))
566 {
567 p[0] = 0; /* terminate english word */
568
569 if(multiple_words)
570 {
571 multiple_string = multiple_string_end = p+1;
572 step = 2;
573 }
574 else
575 {
576 step = 3;
577 }
578 }
579 else if(c == ')')
580 {
581 if(multiple_words)
582 {
583 p[0] = 0;
584 multiple_words = 0;
585 step = 3;
586 }
587 else if(word[0] != '_')
588 {
589 fprintf(f_log, "%5d: Missing '('\n", linenum);
590 error_count++;
591 step = 3;
592 }
593 }
594 break;
595
596 case 2:
597 if(isspace2(c))
598 {
599 multiple_words++;
600 }
601 else if(c == ')')
602 {
603 p[0] = ' '; // terminate extra string
604 multiple_string_end = p+1;
605 step = 3;
606 }
607 break;
608
609 case 3:
610 if(!isspace2(c))
611 {
612 phonetic = p;
613 step = 4;
614 }
615 break;
616
617 case 4:
618 if(isspace2(c))
619 {
620 p[0] = 0; /* terminate phonetic */
621 step = 5;
622 }
623 break;
624
625 case 5:
626 break;
627 }
628 p++;
629 }
630
631 if(word[0] == 0)
632 {
633 return(0); /* blank line */
634 }
635
636 if(text_mode)
637 text_not_phonemes = 1;
638
639 if(text_not_phonemes)
640 {
641 if(word[0] == '_')
642 {
643 // This is a special word, used by eSpeak. Translate this into phonemes now
644 strcat(phonetic, " "); // need a space to indicate word-boundary
645
646 // PROBLEM vowel reductions are not applied to the translated phonemes
647 // condition rules are not applied
648 TranslateWord(translator,phonetic,0,NULL,NULL);
649 text_not_phonemes = 0;
650 strncpy0(encoded_ph, word_phonemes, N_WORD_BYTES-4);
651
652 if((word_phonemes[0] == 0) && (error_need_dictionary < 3))
653 {
654 // the dictionary was not loaded, we need a second attempt
655 error_need_dictionary++;
656 fprintf(f_log,"%5d: Need to compile dictionary again\n",linenum);
657 }
658 {
659 //char decoded_phonemes[128];
660 //DecodePhonemes(word_phonemes,decoded_phonemes);
661 //printf("Translator %x %s [%s] [%s]\n",translator->translator_name,word,phonetic,decoded_phonemes);
662 }
663 }
664 else
665 {
666 // this is replacement text, so don't encode as phonemes. Restrict the length of the replacement word
667 strncpy0(encoded_ph,phonetic,N_WORD_BYTES-4);
668 }
669 }
670 else
671 {
672 EncodePhonemes(phonetic,encoded_ph,&bad_phoneme);
673 if(strchr(encoded_ph,phonSWITCH) != 0)
674 {
675 flag_codes[n_flag_codes++] = BITNUM_FLAG_ONLY_S; // don't match on suffixes (except 's') when switching languages
676 }
677
678 // check for errors in the phonemes codes
679 if(bad_phoneme != 0)
680 {
681 // unrecognised phoneme, report error
682 bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
683 fprintf(f_log,"%5d: Bad phoneme [%s] (U+%x) in: %s %s\n",linenum,bad_phoneme_str,bad_phoneme,word,phonetic);
684 error_count++;
685 }
686 }
687
688 if(text_not_phonemes != translator->langopts.textmode)
689 {
690 flag_codes[n_flag_codes++] = BITNUM_FLAG_TEXTMODE;
691 }
692
693
694 if(sscanf(word,"U+%x",&wc) == 1)
695 {
696 // Character code
697 ix = utf8_out(wc, word);
698 word[ix] = 0;
699 }
700 else if(word[0] != '_')
701 {
702 // convert to lower case, and note if the word is all-capitals
703 int c2;
704
705 all_upper_case = 1;
706 p = word;
707 for(p=word;;)
708 {
709 // this assumes that the lower case char is the same length as the upper case char
710 // OK, except for Turkish "I", but use towlower() rather than towlower2()
711 ix = utf8_in(&c2,p);
712 if(c2 == 0)
713 break;
714 if(iswupper2(c2))
715 {
716 utf8_out(towlower2(c2),p);
717 }
718 else
719 {
720 all_upper_case = 0;
721 }
722 p += ix;
723 }
724 if(all_upper_case)
725 {
726 flag_codes[n_flag_codes++] = BITNUM_FLAG_ALLCAPS;
727 }
728 }
729
730 len_word = strlen(word);
731
732 if(translator->transpose_min > 0)
733 {
734 len_word = TransposeAlphabet(translator, word);
735 }
736
737 *hash = HashDictionary(word);
738 len_phonetic = strlen(encoded_ph);
739
740 dict_line[1] = len_word; // bit 6 indicates whether the word has been compressed
741 len_word &= 0x3f;
742
743 memcpy(&dict_line[2],word,len_word);
744
745 if(len_phonetic == 0)
746 {
747 // no phonemes specified. set bit 7
748 dict_line[1] |= 0x80;
749 length = len_word + 2;
750 }
751 else
752 {
753 length = len_word + len_phonetic + 3;
754 strcpy(&dict_line[(len_word)+2],encoded_ph);
755 }
756
757 for(ix=0; ix<n_flag_codes; ix++)
758 {
759 dict_line[ix+length] = flag_codes[ix];
760 }
761 length += n_flag_codes;
762
763 if((multiple_string != NULL) && (multiple_words > 0))
764 {
765 if(multiple_words > 10)
766 {
767 fprintf(f_log,"%5d: Two many parts in a multi-word entry: %d\n",linenum,multiple_words);
768 error_count++;
769 }
770 else
771 {
772 dict_line[length++] = 80 + multiple_words;
773 ix = multiple_string_end - multiple_string;
774 if(multiple_numeric_hyphen)
775 {
776 dict_line[length++] = ' '; // ???
777 }
778 memcpy(&dict_line[length],multiple_string,ix);
779 length += ix;
780 }
781 }
782 dict_line[0] = length;
783
784
785 return(length);
786 } /* end of compile_line */
787
788
789
compile_dictlist_start(void)790 static void compile_dictlist_start(void)
791 {//=====================================
792 // initialise dictionary list
793 int ix;
794 char *p;
795 char *p2;
796
797 for(ix=0; ix<N_HASH_DICT; ix++)
798 {
799 p = hash_chains[ix];
800 while(p != NULL)
801 {
802 memcpy(&p2,p,sizeof(char *));
803 free(p);
804 p = p2;
805 }
806 hash_chains[ix] = NULL;
807 hash_counts[ix]=0;
808 }
809 }
810
811
compile_dictlist_end(FILE * f_out)812 static void compile_dictlist_end(FILE *f_out)
813 {//==========================================
814 // Write out the compiled dictionary list
815 int hash;
816 int length;
817 char *p;
818
819 if(f_log != NULL)
820 {
821 #ifdef OUTPUT_FORMAT
822 for(hash=0; hash<N_HASH_DICT; hash++)
823 {
824 fprintf(f_log,"%8d",hash_counts[hash]);
825 if((hash & 7) == 7)
826 fputc('\n',f_log);
827 }
828 fflush(f_log);
829 #endif
830 }
831
832 for(hash=0; hash<N_HASH_DICT; hash++)
833 {
834 p = hash_chains[hash];
835 hash_counts[hash] = (int)ftell(f_out);
836
837 while(p != NULL)
838 {
839 length = *(p+sizeof(char *));
840 fwrite(p+sizeof(char *),length,1,f_out);
841 memcpy(&p,p,sizeof(char *));
842 }
843 fputc(0,f_out);
844 }
845 }
846
847
848
compile_dictlist_file(const char * path,const char * filename)849 static int compile_dictlist_file(const char *path, const char* filename)
850 {//=====================================================================
851 int length;
852 int hash;
853 char *p;
854 int count=0;
855 FILE *f_in;
856 char buf[200];
857 char fname[sizeof(path_home)+45];
858 char dict_line[128];
859
860 text_mode = 0;
861
862 // try with and without '.txt' extension
863 sprintf(fname,"%s%s.txt",path,filename);
864 if((f_in = fopen(fname,"r")) == NULL)
865 {
866 sprintf(fname,"%s%s",path,filename);
867 if((f_in = fopen(fname,"r")) == NULL)
868 return(-1);
869 }
870
871 fprintf(f_log,"Compiling: '%s'\n",fname);
872
873 linenum=0;
874
875 while(fgets(buf,sizeof(buf),f_in) != NULL)
876 {
877 linenum++;
878
879 length = compile_line(buf,dict_line,&hash);
880 if(length == 0) continue; /* blank line */
881
882 hash_counts[hash]++;
883
884 p = (char *)malloc(length+sizeof(char *));
885 if(p == NULL)
886 {
887 if(f_log != NULL)
888 {
889 fprintf(f_log,"Can't allocate memory\n");
890 error_count++;
891 }
892 break;
893 }
894
895 memcpy(p,&hash_chains[hash],sizeof(char *));
896 hash_chains[hash] = p;
897 memcpy(p+sizeof(char *),dict_line,length);
898 count++;
899 }
900
901 fprintf(f_log,"\t%d entries\n",count);
902 fclose(f_in);
903 return(0);
904 } /* end of compile_dictlist_file */
905
906
907
// The sections of the rule being compiled.  compile_rule() clears them, then
// copy_rule_string() fills them (its states 0 to 4 select them in this order).
static char rule_cond[80];       // condition number text, from after '?'
static char rule_pre[80];        // pre section, from before ')'
static char rule_post[80];       // post section, from after '('
static char rule_match[80];      // the characters to match
static char rule_phonemes[80];   // phoneme mnemonics; successive strings are joined with spaces
static char group_name[LEN_GROUP_NAME+1];   // name of the current rule group; compile_rule() checks each rule against it
static int group3_ix;

#define N_RULES 2000 // max rules for each group
917
918
919
int isHexDigit(int c)
{
	// Returns the value (0 to 15) of a hexadecimal digit character,
	// accepting both upper and lower case letters, or -1 if c is not a hex digit.
	int value = -1;

	if(('0' <= c) && (c <= '9'))
		value = c - '0';
	else if(('a' <= c) && (c <= 'f'))
		value = 10 + (c - 'a');
	else if(('A' <= c) && (c <= 'F'))
		value = 10 + (c - 'A');

	return(value);
}
930
931
copy_rule_string(char * string,int * state_out)932 static void copy_rule_string(char *string, int *state_out)
933 {//=======================================================
934 // state 0: conditional, 1=pre, 2=match, 3=post, 4=phonemes
935 static char *outbuf[5] = {rule_cond, rule_pre, rule_match, rule_post, rule_phonemes};
936 static int next_state[5] = {2,2,4,4,4};
937 char *output;
938 char *p;
939 int ix;
940 int len;
941 char c;
942 int c2, c3;
943 int sxflags;
944 int value;
945 int literal;
946 int hexdigit_input = 0;
947 int state = *state_out;
948 MNEM_TAB *mr;
949
950 if(string[0] == 0) return;
951
952 output = outbuf[state];
953 if(state==4)
954 {
955 // append to any previous phoneme string, i.e. allow spaces in the phoneme string
956 len = strlen(rule_phonemes);
957 if(len > 0)
958 rule_phonemes[len++] = ' ';
959 output = &rule_phonemes[len];
960 }
961 sxflags = 0x808000; // to ensure non-zero bytes
962
963 for(p=string,ix=0;;)
964 {
965 literal = 0;
966 c = *p++;
967 if((c == '0') && (p[0] == 'x') && (isHexDigit(p[1]) >= 0) && (isHexDigit(p[2]) >= 0))
968 {
969 hexdigit_input = 1;
970 c = p[1];
971 p+= 2;
972 }
973 if(c == '\\')
974 {
975 c = *p++; // treat next character literally
976 //#ifdef deleted
977 if((c >= '0') && (c <= '3') && (p[0] >= '0') && (p[0] <= '7') && (p[1] >= '0') && (p[1] <= '7'))
978 {
979 // character code given by 3 digit octal value;
980 c = (c-'0')*64 + (p[0]-'0')*8 + (p[1]-'0');
981 p += 2;
982 }
983 //endif
984 literal = 1;
985 }
986 if(hexdigit_input)
987 {
988 if(((c2 = isHexDigit(c)) >= 0) && ((c3 = isHexDigit(p[0])) >= 0))
989 {
990 c = c2 * 16 + c3;
991 literal = 1;
992 p++;
993 }
994 else
995 {
996 hexdigit_input = 0;
997 }
998 }
999 if((state==1) || (state==3))
1000 {
1001 // replace special characters (note: 'E' is reserved for a replaced silent 'e')
1002 if(literal == 0)
1003 {
1004 static const char lettergp_letters[9] = {LETTERGP_A,LETTERGP_B,LETTERGP_C,0,0,LETTERGP_F,LETTERGP_G,LETTERGP_H,LETTERGP_Y};
1005 switch(c)
1006 {
1007 case '_':
1008 c = RULE_SPACE;
1009 break;
1010
1011 case 'Y':
1012 c = 'I'; // drop through to next case
1013 case 'A': // vowel
1014 case 'B':
1015 case 'C':
1016 case 'H':
1017 case 'F':
1018 case 'G':
1019 if(state == 1)
1020 {
1021 // pre-rule, put the number before the RULE_LETTERGP;
1022 output[ix++] = lettergp_letters[c-'A'] + 'A';
1023 c = RULE_LETTERGP;
1024 }
1025 else
1026 {
1027 output[ix++] = RULE_LETTERGP;
1028 c = lettergp_letters[c-'A'] + 'A';
1029 }
1030 break;
1031 case 'D':
1032 c = RULE_DIGIT;
1033 break;
1034 case 'K':
1035 c = RULE_NOTVOWEL;
1036 break;
1037 case 'N':
1038 c = RULE_NO_SUFFIX;
1039 break;
1040 case 'V':
1041 c = RULE_IFVERB;
1042 break;
1043 case 'Z':
1044 c = RULE_NONALPHA;
1045 break;
1046 case '+':
1047 c = RULE_INC_SCORE;
1048 break;
1049 case '@':
1050 c = RULE_SYLLABLE;
1051 break;
1052 case '&':
1053 c = RULE_STRESSED;
1054 break;
1055 case '%':
1056 c = RULE_DOUBLE;
1057 break;
1058 case '#':
1059 c = RULE_DEL_FWD;
1060 break;
1061 case '!':
1062 c = RULE_CAPITAL;
1063 break;
1064 case 'T':
1065 output[ix++] = RULE_DOLLAR;
1066 c = 0x11;
1067 break;
1068 case 'W':
1069 c = RULE_SPELLING;
1070 break;
1071 case 'X':
1072 c = RULE_NOVOWELS;
1073 break;
1074 case 'J':
1075 c = RULE_SKIPCHARS;
1076 break;
1077 case 'L':
1078 // expect two digits
1079 c = *p++ - '0';
1080 value = *p++ - '0';
1081 c = c * 10 + value;
1082 if((value < 0) || (value > 9))
1083 {
1084 c = 0;
1085 fprintf(f_log,"%5d: Expected 2 digits after 'L'\n",linenum);
1086 error_count++;
1087 }
1088 else if((c <= 0) || (c >= N_LETTER_GROUPS) || (letterGroupsDefined[(int)c] == 0))
1089 {
1090 fprintf(f_log,"%5d: Letter group L%.2d not defined\n",linenum,c);
1091 error_count++;
1092 }
1093 c += 'A';
1094 if(state == 1)
1095 {
1096 // pre-rule, put the group number before the RULE_LETTERGP command
1097 output[ix++] = c;
1098 c = RULE_LETTERGP2;
1099 }
1100 else
1101 {
1102 output[ix++] = RULE_LETTERGP2;
1103 }
1104 break;
1105
1106 case '$':
1107 value = 0;
1108 mr = mnem_rules;
1109 while(mr->mnem != NULL)
1110 {
1111 len = strlen(mr->mnem);
1112 if(memcmp(p, mr->mnem, len) == 0)
1113 {
1114 value = mr->value;
1115 p += len;
1116 break;
1117 }
1118 mr++;
1119 }
1120
1121 if(state == 1)
1122 {
1123 // pre-rule, put the number before the RULE_DOLLAR
1124 output[ix++] = value;
1125 c = RULE_DOLLAR;
1126 }
1127 else
1128 {
1129 output[ix++] = RULE_DOLLAR;
1130 c = value;
1131 }
1132
1133 if(value == 0)
1134 {
1135 fprintf(f_log,"%5d: $ command not recognized\n",linenum);
1136 error_count++;
1137 }
1138 break;
1139
1140 case 'P':
1141 sxflags |= SUFX_P; // Prefix, now drop through to Suffix
1142 case 'S':
1143 output[ix++] = RULE_ENDING;
1144 value = 0;
1145 while(!isspace2(c = *p++) && (c != 0))
1146 {
1147 switch(c)
1148 {
1149 case 'e':
1150 sxflags |= SUFX_E;
1151 break;
1152 case 'i':
1153 sxflags |= SUFX_I;
1154 break;
1155 case 'p': // obsolete, replaced by 'P' above
1156 sxflags |= SUFX_P;
1157 break;
1158 case 'v':
1159 sxflags |= SUFX_V;
1160 break;
1161 case 'd':
1162 sxflags |= SUFX_D;
1163 break;
1164 case 'f':
1165 sxflags |= SUFX_F;
1166 break;
1167 case 'q':
1168 sxflags |= SUFX_Q;
1169 break;
1170 case 't':
1171 sxflags |= SUFX_T;
1172 break;
1173 case 'b':
1174 sxflags |= SUFX_B;
1175 break;
1176 case 'a':
1177 sxflags |= SUFX_A;
1178 break;
1179 case 'm':
1180 sxflags |= SUFX_M;
1181 break;
1182 default:
1183 if(IsDigit09(c))
1184 value = (value*10) + (c - '0');
1185 break;
1186 }
1187 }
1188 p--;
1189 output[ix++] = sxflags >> 16;
1190 output[ix++] = sxflags >> 8;
1191 c = value | 0x80;
1192 break;
1193 }
1194 }
1195 }
1196 output[ix++] = c;
1197 if(c == 0) break;
1198 }
1199
1200 *state_out = next_state[state];
1201 } // end of copy_rule_string
1202
1203
1204
compile_rule(char * input)1205 static char *compile_rule(char *input)
1206 {//===================================
1207 int ix;
1208 unsigned char c;
1209 int wc;
1210 char *p;
1211 char *prule;
1212 int len;
1213 int len_name;
1214 int start;
1215 int state=2;
1216 int finish=0;
1217 char buf[80];
1218 char output[150];
1219 int bad_phoneme;
1220 char bad_phoneme_str[4];
1221
1222 buf[0]=0;
1223 rule_cond[0]=0;
1224 rule_pre[0]=0;
1225 rule_post[0]=0;
1226 rule_match[0]=0;
1227 rule_phonemes[0]=0;
1228
1229 p = buf;
1230
1231 for(ix=0; finish==0; ix++)
1232 {
1233 c = input[ix];
1234
1235 switch(c = input[ix])
1236 {
1237 case ')': // end of prefix section
1238 *p = 0;
1239 state = 1;
1240 copy_rule_string(buf,&state);
1241 p = buf;
1242 break;
1243
1244 case '(': // start of suffix section
1245 *p = 0;
1246 state = 2;
1247 copy_rule_string(buf,&state);
1248 state = 3;
1249 p = buf;
1250 if(input[ix+1] == ' ')
1251 {
1252 fprintf(f_log,"%5d: Syntax error. Space after (\n",linenum);
1253 error_count++;
1254 }
1255 break;
1256
1257 case '\n': // end of line
1258 case '\r':
1259 case 0: // end of line
1260 *p = 0;
1261 copy_rule_string(buf,&state);
1262 finish=1;
1263 break;
1264
1265 case '\t': // end of section section
1266 case ' ':
1267 *p = 0;
1268 copy_rule_string(buf,&state);
1269 p = buf;
1270 break;
1271
1272 case '?':
1273 if(state==2)
1274 state=0;
1275 else
1276 *p++ = c;
1277 break;
1278
1279 default:
1280 *p++ = c;
1281 break;
1282 }
1283 }
1284
1285 if(strcmp(rule_match,"$group")==0)
1286 strcpy(rule_match,group_name);
1287
1288 if(rule_match[0]==0)
1289 {
1290 if(rule_post[0] != 0)
1291 {
1292 fprintf(f_log,"%5d: Syntax error\n",linenum);
1293 error_count++;
1294 }
1295 return(NULL);
1296 }
1297
1298 EncodePhonemes(rule_phonemes,buf,&bad_phoneme);
1299 if(bad_phoneme != 0)
1300 {
1301 bad_phoneme_str[utf8_out(bad_phoneme, bad_phoneme_str)] = 0;
1302 fprintf(f_log,"%5d: Bad phoneme [%s] (U+%x) in: %s\n",linenum,bad_phoneme_str,bad_phoneme,input);
1303 error_count++;
1304 }
1305 strcpy(output,buf);
1306 len = strlen(buf)+1;
1307
1308 len_name = strlen(group_name);
1309 if((len_name > 0) && (memcmp(rule_match,group_name,len_name) != 0))
1310 {
1311 utf8_in(&wc,rule_match);
1312 if((group_name[0] == '9') && IsDigit(wc))
1313 {
1314 // numeric group, rule_match starts with a digit, so OK
1315 }
1316 else
1317 {
1318 fprintf(f_log,"%5d: Wrong initial letters '%s' for group '%s'\n",linenum,rule_match,group_name);
1319 error_count++;
1320 }
1321 }
1322 strcpy(&output[len],rule_match);
1323 len += strlen(rule_match);
1324
1325 if(debug_flag)
1326 {
1327 output[len] = RULE_LINENUM;
1328 output[len+1] = (linenum % 255) + 1;
1329 output[len+2] = (linenum / 255) + 1;
1330 len+=3;
1331 }
1332
1333 if(rule_cond[0] != 0)
1334 {
1335 ix = -1;
1336 if(rule_cond[0] == '!')
1337 {
1338 // allow the rule only if the condition number is NOT set for the voice
1339 ix = atoi(&rule_cond[1]) + 32;
1340 }
1341 else
1342 {
1343 // allow the rule only if the condition number is set for the voice
1344 ix = atoi(rule_cond);
1345 }
1346
1347 if((ix > 0) && (ix < 255))
1348 {
1349 output[len++] = RULE_CONDITION;
1350 output[len++] = ix;
1351 }
1352 else
1353 {
1354 fprintf(f_log,"%5d: bad condition number ?%d\n",linenum,ix);
1355 error_count++;
1356 }
1357 }
1358 if(rule_pre[0] != 0)
1359 {
1360 start = 0;
1361 if(rule_pre[0] == RULE_SPACE)
1362 {
1363 // omit '_' at the beginning of the pre-string and imply it by using RULE_PRE_ATSTART
1364 c = RULE_PRE_ATSTART;
1365 start = 1;
1366 }
1367 else
1368 {
1369 c = RULE_PRE;
1370 }
1371 output[len++] = c;
1372
1373 // output PRE string in reverse order
1374 for(ix = strlen(rule_pre)-1; ix>=start; ix--)
1375 output[len++] = rule_pre[ix];
1376 }
1377
1378 if(rule_post[0] != 0)
1379 {
1380 sprintf(&output[len],"%c%s",RULE_POST,rule_post);
1381 len += (strlen(rule_post)+1);
1382 }
1383 output[len++]=0;
1384 prule = (char *)malloc(len);
1385 memcpy(prule,output,len);
1386 return(prule);
1387 } // end of compile_rule
1388
1389
int __cdecl string_sorter(char **a, char **b)
{//===========================================
	// qsort() comparison callback for compiled rules.
	// Each element points at two consecutive zero-terminated strings.  The
	// leading strings are compared first; only when they are identical is the
	// order decided by the strings which follow their terminators.
	const char *first = *a;
	const char *second = *b;
	int result = strcmp(first, second);

	if(result == 0)
	{
		// leading strings match: step over them (and their terminators)
		first += strlen(first) + 1;
		second += strlen(second) + 1;
		result = strcmp(first, second);
	}
	return(result);
}   /* end of string_sorter */
1401
1402
rgroup_sorter(RGROUP * a,RGROUP * b)1403 static int __cdecl rgroup_sorter(RGROUP *a, RGROUP *b)
1404 {//===================================================
1405 // Sort long names before short names
1406 int ix;
1407 ix = strlen(b->name) - strlen(a->name);
1408 if(ix != 0) return(ix);
1409 ix = strcmp(a->name,b->name);
1410 if(ix != 0) return(ix);
1411 return(a->start-b->start);
1412 }
1413
1414
#ifdef OUTPUT_FORMAT
static void write_padding(FILE *f_out, int count)
{//==============================================
	// Write 'count' spaces to f_out (nothing if count is zero or negative)
	while(count-- > 0)
		fputc(' ',f_out);
}


static void print_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
{//=============================================================================
	// Only compiled when OUTPUT_FORMAT is defined.
	// Writes a "$group <name>" header to f_out, then one line per compiled rule:
	// the condition ("?n"), the pre string, the match string, the post string and
	// finally the phoneme string as decoded by DecodePhonemes().
	// The rule is unpacked into the rule_match, rule_pre and rule_post buffers.
	static unsigned char symbols[] = {'@','&','%','+','#','$','D','Z','A','B','C','F'};

	int rule_ix;
	char line[80];
	char ending[12];

	fprintf(f_out,"\n$group %s\n",name);

	for(rule_ix=0; rule_ix<n_rules; rule_ix++)
	{
		char *match;
		char *dest;
		char *q;
		int n_bytes;
		int pos;
		int column;
		int cond;
		unsigned char ch;

		// the match data follows the zero-terminated phoneme string
		match = rules[rule_ix];
		match += strlen(match) + 1;
		n_bytes = strlen(match);

		rule_match[0] = 0;
		rule_pre[0] = 0;
		rule_post[0] = 0;
		cond = 0;

		// split the match data into its match, pre and post parts
		dest = rule_match;
		for(pos=0; pos<n_bytes; pos++)
		{
			ch = match[pos];

			if(ch == RULE_PRE)
			{
				// terminate the current part, the pre string follows
				*dest = 0;
				dest = rule_pre;
			}
			else if(ch == RULE_POST)
			{
				// terminate the current part, the post string follows
				*dest = 0;
				dest = rule_post;
			}
			else if(ch == RULE_CONDITION)
			{
				// the condition number is the next byte
				cond = match[++pos];
			}
			else if(ch == RULE_ENDING)
			{
				// two data bytes follow, shown as $<2nd byte>[<1st byte & 0x7f, hex>]
				sprintf(ending,"$%d[%x]",(match[pos+2]),match[pos+1] & 0x7f);
				pos += 2;
				strcpy(dest,ending);
				dest += strlen(ending);
			}
			else
			{
				// low codes are shown using their symbol character
				if(ch <= RULE_LETTER7)
					ch = symbols[ch-RULE_SYLLABLE];
				if(ch == ' ')
					ch = '_';
				*dest++ = ch;
			}
		}
		*dest = 0;

		// condition and pre string, right-aligned in a 12 character column
		column = 12;
		if(cond > 0)
		{
			sprintf(line,"?%d ",cond);
			column -= strlen(line);
			fprintf(f_out,"%s",line);
		}

		if(rule_pre[0] != 0)
		{
			// the pre string is held in reverse order, so copy it backwards
			q = line;
			for(pos=strlen(rule_pre)-1; pos>=0; pos--)
				*q++ = rule_pre[pos];
			sprintf(q,") ");
			column -= strlen(line);
			write_padding(f_out,column);
			fprintf(f_out,"%s",line);
			column = 0;
		}
		write_padding(f_out,column);

		// match and post strings, padded to a 14 character column
		sprintf(line," %s ",rule_match);
		if(rule_post[0] != 0)
		{
			q = &line[strlen(line)];
			sprintf(q,"(%s ",rule_post);
		}
		fprintf(f_out,"%s",line);
		column = 14;
		column -= strlen(line);
		write_padding(f_out,column);

		DecodePhonemes(rules[rule_ix],line);
		fprintf(f_out,"%s\n",line);   // phonemes
	}
}
#endif
1520
1521
1522 //#define LIST_GROUP_INFO
output_rule_group(FILE * f_out,int n_rules,char ** rules,char * name)1523 static void output_rule_group(FILE *f_out, int n_rules, char **rules, char *name)
1524 {//==============================================================================
1525 int ix;
1526 int len1;
1527 int len2;
1528 int len_name;
1529 char *p;
1530 char *p2, *p3;
1531 const char *common;
1532
1533 short nextchar_count[256];
1534 memset(nextchar_count,0,sizeof(nextchar_count));
1535
1536 len_name = strlen(name);
1537
1538 #ifdef OUTPUT_FORMAT
1539 print_rule_group(f_log,n_rules,rules,name);
1540 #endif
1541
1542 // sort the rules in this group by their phoneme string
1543 common = "";
1544 qsort((void *)rules,n_rules,sizeof(char *),(int (__cdecl *)(const void *,const void *))string_sorter);
1545
1546 if(strcmp(name,"9")==0)
1547 len_name = 0; // don't remove characters from numeric match strings
1548
1549 for(ix=0; ix<n_rules; ix++)
1550 {
1551 p = rules[ix];
1552 len1 = strlen(p) + 1; // phoneme string
1553 p3 = &p[len1];
1554 p2 = p3 + len_name; // remove group name from start of match string
1555 len2 = strlen(p2);
1556
1557 nextchar_count[(unsigned char)(p2[0])]++; // the next byte after the group name
1558
1559 if((common[0] != 0) && (strcmp(p,common)==0))
1560 {
1561 fwrite(p2,len2,1,f_out);
1562 fputc(0,f_out); // no phoneme string, it's the same as previous rule
1563 }
1564 else
1565 {
1566 if((ix < n_rules-1) && (strcmp(p,rules[ix+1])==0))
1567 {
1568 common = rules[ix]; // phoneme string is same as next, set as common
1569 fputc(RULE_PH_COMMON,f_out);
1570 }
1571
1572 fwrite(p2,len2,1,f_out);
1573 fputc(RULE_PHONEMES,f_out);
1574 fwrite(p,len1,1,f_out);
1575 }
1576 }
1577
1578 #ifdef LIST_GROUP_INFO
1579 for(ix=32; ix<256; ix++)
1580 {
1581 if(nextchar_count[ix] > 30)
1582 printf("Group %s %c %d\n",name,ix,nextchar_count[ix]);
1583 }
1584 #endif
1585 } // end of output_rule_group
1586
1587
1588
compile_lettergroup(char * input,FILE * f_out)1589 static int compile_lettergroup(char *input, FILE *f_out)
1590 {//=====================================================
1591 char *p;
1592 char *p_start;
1593 int group;
1594 int ix;
1595 int n_items;
1596 int length;
1597 int max_length = 0;
1598
1599 #define N_LETTERGP_ITEMS 200
1600 char *items[N_LETTERGP_ITEMS];
1601 char item_length[N_LETTERGP_ITEMS];
1602
1603 p = input;
1604 if(!IsDigit09(p[0]) || !IsDigit09(p[1]))
1605 {
1606 fprintf(f_log,"%5d: Expected 2 digits after '.L'\n",linenum);
1607 error_count++;
1608 return(1);
1609 }
1610
1611 group = atoi(&p[0]);
1612 if(group >= N_LETTER_GROUPS)
1613 {
1614 fprintf(f_log,"%5d: lettergroup out of range (01-%.2d)\n",linenum,N_LETTER_GROUPS-1);
1615 error_count++;
1616 return(1);
1617 }
1618
1619 while(!isspace2(*p)) p++;
1620
1621 fputc(RULE_GROUP_START,f_out);
1622 fputc(RULE_LETTERGP2,f_out);
1623 fputc(group + 'A', f_out);
1624 if(letterGroupsDefined[group] != 0)
1625 {
1626 fprintf(f_log,"%5d: lettergroup L%.2d is already defined\n",linenum,group);
1627 error_count++;
1628 }
1629 letterGroupsDefined[group] = 1;
1630
1631 n_items = 0;
1632 while(n_items < N_LETTERGP_ITEMS)
1633 {
1634 while(isspace2(*p)) p++;
1635 if(*p == 0)
1636 break;
1637
1638 items[n_items] = p_start = p;
1639 while((*p & 0xff) > ' ')
1640 {
1641 if (*p == '_') *p = ' '; // allow '_' for word break
1642 p++;
1643 }
1644 *p++ = 0;
1645 length = p - p_start;
1646 if(length > max_length)
1647 max_length = length;
1648 item_length[n_items++] = length;
1649 }
1650
1651 // write out the items, longest first
1652 while(max_length > 1)
1653 {
1654 for(ix=0; ix < n_items; ix++)
1655 {
1656 if(item_length[ix] == max_length)
1657 {
1658 fwrite(items[ix],1,max_length,f_out);
1659 }
1660 }
1661 max_length--;
1662 }
1663
1664 fputc(RULE_GROUP_END,f_out);
1665
1666 return(0);
1667 }
1668
1669
compile_dictrules(FILE * f_in,FILE * f_out,char * fname_temp)1670 static int compile_dictrules(FILE *f_in, FILE *f_out, char *fname_temp)
1671 {//====================================================================
1672 char *prule;
1673 unsigned char *p;
1674 int ix;
1675 int c;
1676 int gp;
1677 FILE *f_temp;
1678 int n_rules=0;
1679 int count=0;
1680 int different;
1681 int wc;
1682 const char *prev_rgroup_name;
1683 unsigned int char_code;
1684 int compile_mode=0;
1685 char *buf;
1686 char buf1[500];
1687 char *rules[N_RULES];
1688
1689 int n_rgroups = 0;
1690 int n_groups3 = 0;
1691 RGROUP rgroup[N_RULE_GROUP2];
1692
1693 linenum = 0;
1694 group_name[0] = 0;
1695
1696 if((f_temp = fopen_log(fname_temp,"wb")) == NULL)
1697 return(1);
1698
1699 for(;;)
1700 {
1701 linenum++;
1702 buf = fgets(buf1,sizeof(buf1),f_in);
1703 if(buf != NULL)
1704 {
1705 if((p = (unsigned char *)strstr(buf,"//")) != NULL)
1706 *p = 0;
1707
1708 if(buf[0] == '\r') buf++; // ignore extra \r in \r\n
1709 }
1710
1711 if((buf == NULL) || (buf[0] == '.'))
1712 {
1713 // next .group or end of file, write out the previous group
1714
1715 if(n_rules > 0)
1716 {
1717 strcpy(rgroup[n_rgroups].name,group_name);
1718 rgroup[n_rgroups].group3_ix = group3_ix;
1719 rgroup[n_rgroups].start = ftell(f_temp);
1720 output_rule_group(f_temp,n_rules,rules,group_name);
1721 rgroup[n_rgroups].length = ftell(f_temp) - rgroup[n_rgroups].start;
1722 n_rgroups++;
1723
1724 count += n_rules;
1725 }
1726 n_rules = 0;
1727
1728 if(compile_mode == 2)
1729 {
1730 // end of the character replacements section
1731 fwrite(&n_rules,1,4,f_out); // write a zero word to terminate the replacemenmt list
1732 compile_mode = 0;
1733 }
1734
1735 if(buf == NULL) break; // end of file
1736
1737 if(memcmp(buf,".L",2)==0)
1738 {
1739 compile_lettergroup(&buf[2], f_out);
1740 continue;
1741 }
1742
1743 if(memcmp(buf,".replace",8)==0)
1744 {
1745 compile_mode = 2;
1746 fputc(RULE_GROUP_START,f_out);
1747 fputc(RULE_REPLACEMENTS,f_out);
1748
1749 // advance to next word boundary
1750 while((ftell(f_out) & 3) != 0)
1751 fputc(0,f_out);
1752 }
1753
1754 if(memcmp(buf,".group",6)==0)
1755 {
1756 compile_mode = 1;
1757
1758 p = (unsigned char *)&buf[6];
1759 while((p[0]==' ') || (p[0]=='\t')) p++; // Note: Windows isspace(0xe1) gives TRUE !
1760 ix = 0;
1761 while((*p > ' ') && (ix < LEN_GROUP_NAME))
1762 group_name[ix++] = *p++;
1763 group_name[ix]=0;
1764 group3_ix = 0;
1765
1766 if(sscanf(group_name,"0x%x",&char_code)==1)
1767 {
1768 // group character is given as a character code (max 16 bits)
1769 p = (unsigned char *)group_name;
1770
1771 if(char_code > 0x100)
1772 {
1773 *p++ = (char_code >> 8);
1774 }
1775 *p++ = char_code;
1776 *p = 0;
1777 }
1778 else
1779 {
1780 if(translator->letter_bits_offset > 0)
1781 {
1782 utf8_in(&wc, group_name);
1783 if(((ix = (wc - translator->letter_bits_offset)) >= 0) && (ix < 128))
1784 {
1785 group3_ix = ix+1; // not zero
1786 }
1787 }
1788 }
1789
1790 if((group3_ix == 0) && (strlen(group_name) > 2))
1791 {
1792 if(utf8_in(&c,group_name) < 2)
1793 {
1794 fprintf(f_log,"%5d: Group name longer than 2 bytes (UTF8)",linenum);
1795 error_count++;
1796 }
1797
1798 group_name[2] = 0;
1799 }
1800 }
1801
1802 continue;
1803 }
1804
1805 switch(compile_mode)
1806 {
1807 case 1: // .group
1808 prule = compile_rule(buf);
1809 if((prule != NULL) && (n_rules < N_RULES))
1810 {
1811 rules[n_rules++] = prule;
1812 }
1813 break;
1814
1815 case 2: // .replace
1816 {
1817 int replace1;
1818 int replace2;
1819 char *p;
1820
1821 p = buf;
1822 replace1 = 0;
1823 replace2 = 0;
1824 while(isspace2(*p)) p++;
1825 ix = 0;
1826 while((unsigned char)(*p) > 0x20) // not space or zero-byte
1827 {
1828 p += utf8_in(&c,p);
1829 replace1 += (c << ix);
1830 ix += 16;
1831 }
1832 while(isspace2(*p)) p++;
1833 ix = 0;
1834 while((unsigned char)(*p) > 0x20)
1835 {
1836 p += utf8_in(&c,p);
1837 replace2 += (c << ix);
1838 ix += 16;
1839 }
1840 if(replace1 != 0)
1841 {
1842 Write4Bytes(f_out,replace1); // write as little-endian
1843 Write4Bytes(f_out,replace2); // if big-endian, reverse the bytes in LoadDictionary()
1844 }
1845 }
1846 break;
1847 }
1848 }
1849 fclose(f_temp);
1850
1851 qsort((void *)rgroup,n_rgroups,sizeof(rgroup[0]),(int (__cdecl *)(const void *,const void *))rgroup_sorter);
1852
1853 if((f_temp = fopen(fname_temp,"rb"))==NULL)
1854 return(2);
1855
1856 prev_rgroup_name = "\n";
1857
1858 for(gp = 0; gp < n_rgroups; gp++)
1859 {
1860 fseek(f_temp,rgroup[gp].start,SEEK_SET);
1861
1862 if((different = strcmp(rgroup[gp].name, prev_rgroup_name)) != 0)
1863 {
1864 // not the same as the previous group
1865 if(gp > 0)
1866 fputc(RULE_GROUP_END,f_out);
1867 fputc(RULE_GROUP_START,f_out);
1868
1869 if(rgroup[gp].group3_ix != 0)
1870 {
1871 n_groups3++;
1872 fputc(1,f_out);
1873 fputc(rgroup[gp].group3_ix, f_out);
1874 }
1875 else
1876 {
1877 fprintf(f_out, "%s", prev_rgroup_name = rgroup[gp].name);
1878 }
1879 fputc(0,f_out);
1880 }
1881
1882 for(ix=rgroup[gp].length; ix>0; ix--)
1883 {
1884 c = fgetc(f_temp);
1885 fputc(c,f_out);
1886 }
1887
1888 if(different)
1889 {
1890 }
1891 }
1892 fputc(RULE_GROUP_END,f_out);
1893 fputc(0,f_out);
1894
1895 fclose(f_temp);
1896 remove(fname_temp);
1897
1898 fprintf(f_log,"\t%d rules, %d groups (%d)\n\n",count,n_rgroups,n_groups3);
1899 return(0);
1900 } // end of compile_dictrules
1901
1902
1903
CompileDictionary(const char * dsource,const char * dict_name,FILE * log,char * fname_err,int flags)1904 int CompileDictionary(const char *dsource, const char *dict_name, FILE *log, char *fname_err, int flags)
1905 {//=====================================================================================================
1906 // fname: space to write the filename in case of error
1907 // flags: bit 0: include source line number information, for debug purposes.
1908
1909 FILE *f_in;
1910 FILE *f_out;
1911 int offset_rules=0;
1912 int value;
1913 char fname_in[sizeof(path_home)+45];
1914 char fname_out[sizeof(path_home)+15];
1915 char fname_temp[sizeof(path_home)+15];
1916 char path[sizeof(path_home)+40]; // path_dsource+20
1917
1918 error_count = 0;
1919 error_need_dictionary = 0;
1920 memset(letterGroupsDefined,0,sizeof(letterGroupsDefined));
1921
1922 debug_flag = flags & 1;
1923
1924 if(dsource == NULL)
1925 dsource = "";
1926
1927 f_log = log;
1928 //f_log = fopen("log2.txt","w");
1929 if(f_log == NULL)
1930 f_log = stderr;
1931
1932 // try with and without '.txt' extension
1933 sprintf(path,"%s%s_",dsource,dict_name);
1934 sprintf(fname_in,"%srules.txt",path);
1935 if((f_in = fopen(fname_in,"r")) == NULL)
1936 {
1937 sprintf(fname_in,"%srules",path);
1938 if((f_in = fopen_log(fname_in,"r")) == NULL)
1939 {
1940 if(fname_err)
1941 strcpy(fname_err,fname_in);
1942 return(-1);
1943 }
1944 }
1945
1946 sprintf(fname_out,"%s%c%s_dict",path_home,PATHSEP,dict_name);
1947 if((f_out = fopen_log(fname_out,"wb+")) == NULL)
1948 {
1949 if(fname_err)
1950 strcpy(fname_err,fname_out);
1951 return(-1);
1952 }
1953 sprintf(fname_temp,"%s%ctemp",path_home,PATHSEP);
1954
1955 value = N_HASH_DICT;
1956 Write4Bytes(f_out,value);
1957 Write4Bytes(f_out,offset_rules);
1958
1959 compile_dictlist_start();
1960
1961 fprintf(f_log,"Using phonemetable: '%s'\n",phoneme_tab_list[phoneme_tab_number].name);
1962 compile_dictlist_file(path,"roots");
1963 if(translator->langopts.listx)
1964 {
1965 compile_dictlist_file(path,"list");
1966 compile_dictlist_file(path,"listx");
1967 }
1968 else
1969 {
1970 compile_dictlist_file(path,"listx");
1971 compile_dictlist_file(path,"list");
1972 }
1973 compile_dictlist_file(path,"extra");
1974
1975 compile_dictlist_end(f_out);
1976 offset_rules = ftell(f_out);
1977
1978 fprintf(f_log,"Compiling: '%s'\n",fname_in);
1979
1980 compile_dictrules(f_in,f_out,fname_temp);
1981 fclose(f_in);
1982
1983 fseek(f_out,4,SEEK_SET);
1984 Write4Bytes(f_out,offset_rules);
1985 fclose(f_out);
1986 fflush(f_log);
1987
1988 LoadDictionary(translator, dict_name, 0);
1989
1990 return(error_count);
1991 } // end of compile_dictionary
1992
1993