1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                     University of Edinburgh, UK                       */
5 /*                         Copyright (c) 1996                            */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*                     Author :  Alan W Black                            */
34 /*                     Date   :  December 1997                           */
35 /*-----------------------------------------------------------------------*/
36 /*                                                                       */
37 /* A LTS rule compiler, where rules are contextual rewrite rules.  Rules */
38 /* are of the for LC [ x ] RC => y where LC and RC are regexs on the     */
39 /* tape only and x and y are simple strings on symbols.  That is the     */
40 /* standard form of LTS rules used in Festival.                          */
41 /*                                                                       */
42 /*=======================================================================*/
43 #include <iostream>
44 #include "EST_cutils.h"
45 #include "EST_WFST.h"
46 
47 static LISP lts_find_feasible_pairs(LISP rules);
48 static LISP make_fp(LISP in, LISP out);
49 static LISP find_outs(LISP rule);
50 static LISP find_ins(LISP rule);
51 static LISP add_alpha(LISP n, LISP existing);
52 static LISP lts_find_alphabets(LISP rules);
53 static void ltsrule_compile(LISP inalpha, LISP outalpha,
54 			    LISP fp, LISP sets, LISP rule,
55 			    EST_WFST &a, EST_WFST &not_a);
56 static LISP analyse_rule(LISP rule);
57 static LISP expand_sets(LISP l, LISP fp, LISP sets);
58 static LISP expand_set(LISP p, LISP fp, LISP sets);
59 static LISP find_notMAP(LISP MAP,LISP fp);
60 
ltscompile(LISP lts_rules,EST_WFST & all_wfst)61 void ltscompile(LISP lts_rules, EST_WFST &all_wfst)
62 {
63     // Build a transducer from given LTS rules.  Because the interpretation
64     // of these rules is normally ordered and the WFST is not, the
65     // complement of each cumulative WFST must be generated before
66     // adding the next rule
67     LISP r;
68     LISP fp;  // feasible pairs, those pairs with rules (rather than IxO)
69     LISP inalpha,outalpha,alphas;
70     LISP sets=siod_nth(2,lts_rules);
71     LISP rules=siod_nth(3,lts_rules);
72     EST_WFST nots;
73 
74     alphas = lts_find_alphabets(lts_rules);
75     inalpha = car(alphas);
76     outalpha = cdr(alphas);
77 
78     fp = lts_find_feasible_pairs(rules);
79 
80     // set up an empty WFST, accepts nothing
81     all_wfst.build_from_regex(inalpha,outalpha,NIL);
82     // matches things not matched by rules, everything to begin with
83     nots.build_from_regex(inalpha,outalpha,
84 			  cons(rintern("*"),
85 			       cons(cons(rintern("or"),fp),NIL)));
86     nots.save("-");
87 
88     for (r=rules; r != NIL; r=cdr(r))
89     {
90 	EST_WFST a, not_a,b,c,d;
91 
92 	// all = all u (all' n r)
93 	ltsrule_compile(inalpha,outalpha,fp,sets,car(r),a,not_a);
94 	pprint(car(r));
95 	a.save("-");
96 	c.intersection(a,nots);
97 	c.save("-");
98 
99 	// Add for next rule
100 	b.uunion(nots,not_a);
101 	not_a.save("-");
102 	b.save("-");
103 	nots = b;
104 
105 	d.uunion(all_wfst,c);
106 	all_wfst = d;
107 	all_wfst.save("-");
108     }
109 }
110 
lts_find_alphabets(LISP rules)111 static LISP lts_find_alphabets(LISP rules)
112 {
113     // Find the alphabets used in the rules
114     LISP r;
115     LISP in=NIL, out=NIL;
116 
117     for (r=siod_nth(3,rules); r != NIL; r=cdr(r))
118     {
119 	in = add_alpha(find_ins(car(r)),in);
120 	out = add_alpha(find_outs(car(r)),out);
121     }
122 
123     return cons(in,out);
124 }
125 
add_alpha(LISP n,LISP existing)126 static LISP add_alpha(LISP n, LISP existing)
127 {
128     // Add values in n if not already in existing
129     LISP t;
130     LISP e=existing;
131 
132     for (t=n; t != NIL; t=cdr(t))
133 	if (!siod_member_str(get_c_string(car(t)),e))
134 	    e = cons(car(t),e);
135 
136     return e;
137 }
138 
find_ins(LISP rule)139 static LISP find_ins(LISP rule)
140 {
141     // find all symbols in [] in rule
142     LISP c;
143     int state=FALSE;
144     LISP ins = NIL;
145 
146     for (c=rule; c != NIL; c=cdr(c))
147     {
148 	if (streq("[",get_c_string(car(c))))
149 	    state=TRUE;
150 	else if (streq("]",get_c_string(car(c))))
151 	    break;
152 	else if (state)
153 	    ins = cons(car(c),ins);
154     }
155     return reverse(ins);
156 }
157 
find_outs(LISP rule)158 static LISP find_outs(LISP rule)
159 {
160     // find all symbols after = rule
161     LISP c;
162     int state=FALSE;
163     LISP outs = NIL;
164 
165     for (c=rule; c != NIL; c=cdr(c))
166     {
167 	if (streq("=",get_c_string(car(c))))
168 	    state=TRUE;
169 	else if (state)
170 	    outs = cons(car(c),outs);
171     }
172     return reverse(outs);
173 }
174 
lts_find_feasible_pairs(LISP rules)175 static LISP lts_find_feasible_pairs(LISP rules)
176 {
177     // Find the set of pairs that have rules associated with them
178     // This effectively defines the transducer alphabet.
179     // We take the surface part in [] and the part after the = and
180     // linearly match them to form a set of pairs, padded with epsilon
181     // if necessary.
182     LISP fp = NIL;
183     LISP r;
184 
185     for (r=rules; r != NIL; r=cdr(r))
186     {
187 	LISP in = find_ins(car(r));
188 	LISP out = find_outs(car(r));
189 
190 	LISP pairs = make_fp(in,out);
191 
192 	for (LISP p=pairs; p != NIL; p=cdr(p))
193 	{
194 	    if (!siod_member_str(get_c_string(car(p)),fp))
195 		fp = cons(car(p),fp);
196 	}
197     }
198     return fp;
199 }
200 
make_fp(LISP in,LISP out)201 static LISP make_fp(LISP in, LISP out)
202 {
203     // Returns a list of pairs by matching each member of in to out
204     // padding the shorted one with _epsilon_ if necessary
205     LISP i,o;
206     LISP fp=NIL;
207     EST_String is,os;
208     int m;
209 
210     if (siod_llength(in) > siod_llength(out))
211 	m = siod_llength(in);
212     else
213 	m = siod_llength(out);
214 
215     for (i=in,o=out ; m > 0; --m,i=cdr(i),o=cdr(o))
216     {
217 	if (i == NIL)
218 	    is = "__epsilon__";
219 	else
220 	    is = get_c_string(car(i));
221 	if (o == NIL)
222 	    os = "__epsilon__";
223 	else
224 	    os = get_c_string(car(o));
225         fp = cons(strintern(is+"/"+os),fp);
226     }
227     return reverse(fp);
228 }
229 
ltsrule_compile(LISP inalpha,LISP outalpha,LISP fp,LISP sets,LISP rule,EST_WFST & a,EST_WFST & not_a)230 static void ltsrule_compile(LISP inalpha, LISP outalpha,
231 			    LISP fp, LISP sets, LISP rule,
232 			    EST_WFST &a, EST_WFST &not_a)
233 {
234     // Return two regexs, one matching with rewrites and another
235     // that matches things this rule doesn't match.
236     LISP LC,MAP,RC,notMAP,r;
237 
238     r = analyse_rule(rule);
239     LC = siod_nth(0,r);
240     MAP = siod_nth(1,r);
241     RC = siod_nth(2,r);
242 
243     LC = expand_sets(LC,fp,sets);
244     RC = expand_sets(RC,fp,sets);
245     notMAP = find_notMAP(MAP,fp);
246 
247 
248     LISP kk = cons(LC,cons(MAP,cons(RC,NIL)));
249     cout << "kk rule" << endl;
250     pprint(kk);
251     a.kkrule_compile(inalpha,outalpha,fp,kk,NIL);
252 
253     // (or (* <fp>) (not <rule>))  ;;  everything except the rule
254     LISP regex_r = cons(rintern("and"),append(LC,append(MAP,RC)));
255 //    LISP nn = cons(rintern("or"),
256 //		   cons(cons(rintern("*"),cons(cons(rintern("or"),fp),NIL)),
257 //			cons(cons(rintern("not"),cons(regex_r,NIL)),
258 //			     NIL)));
259     LISP nn = cons(rintern("not"),cons(regex_r,NIL));
260     not_a.build_from_regex(inalpha,outalpha,nn);
261 
262 }
263 
analyse_rule(LISP rule)264 static LISP analyse_rule(LISP rule)
265 {
266     // return the left context, map and right context;
267     LISP LC=NIL, RC=NIL, in=NIL, out=NIL;
268     LISP l;
269     int state=0;
270 
271     for (l=rule; l != NIL; l=cdr(l))
272     {
273 	if ((state==0) && (!streq("[",get_c_string(car(l)))))
274 	    LC = cons(car(l),LC);
275 	else if ((state==0) && (streq("[",get_c_string(car(l)))))
276 	    state = 1;
277 	else if ((state==1) && (!streq("]",get_c_string(car(l)))))
278 	    in = cons(car(l),in);
279 	else if ((state==1) && (streq("]",get_c_string(car(l)))))
280 	    state = 2;
281 	else if ((state==2) && (!streq("=",get_c_string(car(l)))))
282 	    RC = cons(car(l),RC);
283 	else if ((state==2) && (streq("=",get_c_string(car(l)))))
284 	    state = 3;
285 	else if (state == 3)
286 	{
287 	    out = l;
288 	    break;
289 	}
290     }
291 
292     return cons(reverse(LC),
293 	    cons(make_fp(reverse(in),out),
294 	     cons(reverse(RC),NIL)));
295 
296 }
297 
expand_sets(LISP l,LISP fp,LISP sets)298 static LISP expand_sets(LISP l, LISP fp, LISP sets)
299 {
300     // Expand sets in l and fix regex characters
301     LISP r,es=NIL;
302 
303     for (r=l; r != NIL; r=cdr(r))
304     {
305 	LISP s = expand_set(car(r),fp,sets);
306 	if (cdr(r) && (streq("*",get_c_string(car(cdr(r))))))
307 	{
308 	    es = cons(cons(rintern("*"),s),es);
309 	    r=cdr(r);
310 	}
311 	else if (cdr(r) && (streq("+",get_c_string(car(cdr(r))))))
312 	{
313 	    es = cons(cons(rintern("+"),s),es);
314 	    r=cdr(r);
315 	}
316 	else
317 	    es = cons(cons(rintern("and"),s),es);
318     }
319     return reverse(es);
320 }
321 
expand_set(LISP p,LISP fp,LISP sets)322 static LISP expand_set(LISP p, LISP fp, LISP sets)
323 {
324     // expand p with respect to sets and feasible pairs
325     LISP set = siod_assoc_str(get_c_string(p),sets);
326     LISP s,f;
327     LISP r=NIL;
328 
329     if (set == NIL)
330 	set = cons(p,NIL);
331 
332     for (s=set; s != NIL; s=cdr(s))
333     {
334 	for (f=fp; f != NIL; f=cdr(f))
335 	{
336 	    EST_String ss = get_c_string(car(s));
337 	    EST_String sf = get_c_string(car(f));
338 
339 	    if (sf.contains(ss+"/"))
340 		r = cons(car(f),r);
341 	}
342     }
343 
344     return reverse(r);
345 }
346 
find_notMAP(LISP MAP,LISP fp)347 static LISP find_notMAP(LISP MAP,LISP fp)
348 {
349     // Returns REGEX that matches everything except MAP,  this doesn't
350     // try all possible epsilons though
351     LISP r,notrp=NIL,m,np;
352     EST_String s,l,p,sr,lr,rr;
353 
354     for (m=MAP; m != NIL; m=cdr(m))
355     {
356 	p = get_c_string(car(m));
357 	if (p.contains("/"))
358 	{
359 	    s = p.before("/");
360 	    l = p.after("/");
361 	}
362 	else
363 	{
364 	    s = p;
365 	    l = p;
366 	}
367 
368 	for (np=NIL,r=fp; r != NIL; r = cdr(r))
369 	{
370 	    rr = get_c_string(car(r));
371 	    if (rr.contains("/"))
372 	    {
373 		sr = rr.before("/");
374 		lr = rr.after("/");
375 	    }
376 	    else
377 	    {
378 		sr = rr;
379 		lr = rr;
380 	    }
381 	    if ((s == sr) && (l != lr))
382 		np = cons(car(r),np);
383 	}
384 	notrp = cons(cons(rintern("or"),np),notrp);
385     }
386 
387     return reverse(notrp);
388 }
389 
390