1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : December 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* A LTS rule compiler, where rules are contextual rewrite rules. Rules */
38 /* are of the for LC [ x ] RC => y where LC and RC are regexs on the */
39 /* tape only and x and y are simple strings on symbols. That is the */
40 /* standard form of LTS rules used in Festival. */
41 /* */
42 /*=======================================================================*/
43 #include <iostream>
44 #include "EST_cutils.h"
45 #include "EST_WFST.h"
46
47 static LISP lts_find_feasible_pairs(LISP rules);
48 static LISP make_fp(LISP in, LISP out);
49 static LISP find_outs(LISP rule);
50 static LISP find_ins(LISP rule);
51 static LISP add_alpha(LISP n, LISP existing);
52 static LISP lts_find_alphabets(LISP rules);
53 static void ltsrule_compile(LISP inalpha, LISP outalpha,
54 LISP fp, LISP sets, LISP rule,
55 EST_WFST &a, EST_WFST ¬_a);
56 static LISP analyse_rule(LISP rule);
57 static LISP expand_sets(LISP l, LISP fp, LISP sets);
58 static LISP expand_set(LISP p, LISP fp, LISP sets);
59 static LISP find_notMAP(LISP MAP,LISP fp);
60
ltscompile(LISP lts_rules,EST_WFST & all_wfst)61 void ltscompile(LISP lts_rules, EST_WFST &all_wfst)
62 {
63 // Build a transducer from given LTS rules. Because the interpretation
64 // of these rules is normally ordered and the WFST is not, the
65 // complement of each cumulative WFST must be generated before
66 // adding the next rule
67 LISP r;
68 LISP fp; // feasible pairs, those pairs with rules (rather than IxO)
69 LISP inalpha,outalpha,alphas;
70 LISP sets=siod_nth(2,lts_rules);
71 LISP rules=siod_nth(3,lts_rules);
72 EST_WFST nots;
73
74 alphas = lts_find_alphabets(lts_rules);
75 inalpha = car(alphas);
76 outalpha = cdr(alphas);
77
78 fp = lts_find_feasible_pairs(rules);
79
80 // set up an empty WFST, accepts nothing
81 all_wfst.build_from_regex(inalpha,outalpha,NIL);
82 // matches things not matched by rules, everything to begin with
83 nots.build_from_regex(inalpha,outalpha,
84 cons(rintern("*"),
85 cons(cons(rintern("or"),fp),NIL)));
86 nots.save("-");
87
88 for (r=rules; r != NIL; r=cdr(r))
89 {
90 EST_WFST a, not_a,b,c,d;
91
92 // all = all u (all' n r)
93 ltsrule_compile(inalpha,outalpha,fp,sets,car(r),a,not_a);
94 pprint(car(r));
95 a.save("-");
96 c.intersection(a,nots);
97 c.save("-");
98
99 // Add for next rule
100 b.uunion(nots,not_a);
101 not_a.save("-");
102 b.save("-");
103 nots = b;
104
105 d.uunion(all_wfst,c);
106 all_wfst = d;
107 all_wfst.save("-");
108 }
109 }
110
lts_find_alphabets(LISP rules)111 static LISP lts_find_alphabets(LISP rules)
112 {
113 // Find the alphabets used in the rules
114 LISP r;
115 LISP in=NIL, out=NIL;
116
117 for (r=siod_nth(3,rules); r != NIL; r=cdr(r))
118 {
119 in = add_alpha(find_ins(car(r)),in);
120 out = add_alpha(find_outs(car(r)),out);
121 }
122
123 return cons(in,out);
124 }
125
add_alpha(LISP n,LISP existing)126 static LISP add_alpha(LISP n, LISP existing)
127 {
128 // Add values in n if not already in existing
129 LISP t;
130 LISP e=existing;
131
132 for (t=n; t != NIL; t=cdr(t))
133 if (!siod_member_str(get_c_string(car(t)),e))
134 e = cons(car(t),e);
135
136 return e;
137 }
138
find_ins(LISP rule)139 static LISP find_ins(LISP rule)
140 {
141 // find all symbols in [] in rule
142 LISP c;
143 int state=FALSE;
144 LISP ins = NIL;
145
146 for (c=rule; c != NIL; c=cdr(c))
147 {
148 if (streq("[",get_c_string(car(c))))
149 state=TRUE;
150 else if (streq("]",get_c_string(car(c))))
151 break;
152 else if (state)
153 ins = cons(car(c),ins);
154 }
155 return reverse(ins);
156 }
157
find_outs(LISP rule)158 static LISP find_outs(LISP rule)
159 {
160 // find all symbols after = rule
161 LISP c;
162 int state=FALSE;
163 LISP outs = NIL;
164
165 for (c=rule; c != NIL; c=cdr(c))
166 {
167 if (streq("=",get_c_string(car(c))))
168 state=TRUE;
169 else if (state)
170 outs = cons(car(c),outs);
171 }
172 return reverse(outs);
173 }
174
lts_find_feasible_pairs(LISP rules)175 static LISP lts_find_feasible_pairs(LISP rules)
176 {
177 // Find the set of pairs that have rules associated with them
178 // This effectively defines the transducer alphabet.
179 // We take the surface part in [] and the part after the = and
180 // linearly match them to form a set of pairs, padded with epsilon
181 // if necessary.
182 LISP fp = NIL;
183 LISP r;
184
185 for (r=rules; r != NIL; r=cdr(r))
186 {
187 LISP in = find_ins(car(r));
188 LISP out = find_outs(car(r));
189
190 LISP pairs = make_fp(in,out);
191
192 for (LISP p=pairs; p != NIL; p=cdr(p))
193 {
194 if (!siod_member_str(get_c_string(car(p)),fp))
195 fp = cons(car(p),fp);
196 }
197 }
198 return fp;
199 }
200
make_fp(LISP in,LISP out)201 static LISP make_fp(LISP in, LISP out)
202 {
203 // Returns a list of pairs by matching each member of in to out
204 // padding the shorted one with _epsilon_ if necessary
205 LISP i,o;
206 LISP fp=NIL;
207 EST_String is,os;
208 int m;
209
210 if (siod_llength(in) > siod_llength(out))
211 m = siod_llength(in);
212 else
213 m = siod_llength(out);
214
215 for (i=in,o=out ; m > 0; --m,i=cdr(i),o=cdr(o))
216 {
217 if (i == NIL)
218 is = "__epsilon__";
219 else
220 is = get_c_string(car(i));
221 if (o == NIL)
222 os = "__epsilon__";
223 else
224 os = get_c_string(car(o));
225 fp = cons(strintern(is+"/"+os),fp);
226 }
227 return reverse(fp);
228 }
229
ltsrule_compile(LISP inalpha,LISP outalpha,LISP fp,LISP sets,LISP rule,EST_WFST & a,EST_WFST & not_a)230 static void ltsrule_compile(LISP inalpha, LISP outalpha,
231 LISP fp, LISP sets, LISP rule,
232 EST_WFST &a, EST_WFST ¬_a)
233 {
234 // Return two regexs, one matching with rewrites and another
235 // that matches things this rule doesn't match.
236 LISP LC,MAP,RC,notMAP,r;
237
238 r = analyse_rule(rule);
239 LC = siod_nth(0,r);
240 MAP = siod_nth(1,r);
241 RC = siod_nth(2,r);
242
243 LC = expand_sets(LC,fp,sets);
244 RC = expand_sets(RC,fp,sets);
245 notMAP = find_notMAP(MAP,fp);
246
247
248 LISP kk = cons(LC,cons(MAP,cons(RC,NIL)));
249 cout << "kk rule" << endl;
250 pprint(kk);
251 a.kkrule_compile(inalpha,outalpha,fp,kk,NIL);
252
253 // (or (* <fp>) (not <rule>)) ;; everything except the rule
254 LISP regex_r = cons(rintern("and"),append(LC,append(MAP,RC)));
255 // LISP nn = cons(rintern("or"),
256 // cons(cons(rintern("*"),cons(cons(rintern("or"),fp),NIL)),
257 // cons(cons(rintern("not"),cons(regex_r,NIL)),
258 // NIL)));
259 LISP nn = cons(rintern("not"),cons(regex_r,NIL));
260 not_a.build_from_regex(inalpha,outalpha,nn);
261
262 }
263
analyse_rule(LISP rule)264 static LISP analyse_rule(LISP rule)
265 {
266 // return the left context, map and right context;
267 LISP LC=NIL, RC=NIL, in=NIL, out=NIL;
268 LISP l;
269 int state=0;
270
271 for (l=rule; l != NIL; l=cdr(l))
272 {
273 if ((state==0) && (!streq("[",get_c_string(car(l)))))
274 LC = cons(car(l),LC);
275 else if ((state==0) && (streq("[",get_c_string(car(l)))))
276 state = 1;
277 else if ((state==1) && (!streq("]",get_c_string(car(l)))))
278 in = cons(car(l),in);
279 else if ((state==1) && (streq("]",get_c_string(car(l)))))
280 state = 2;
281 else if ((state==2) && (!streq("=",get_c_string(car(l)))))
282 RC = cons(car(l),RC);
283 else if ((state==2) && (streq("=",get_c_string(car(l)))))
284 state = 3;
285 else if (state == 3)
286 {
287 out = l;
288 break;
289 }
290 }
291
292 return cons(reverse(LC),
293 cons(make_fp(reverse(in),out),
294 cons(reverse(RC),NIL)));
295
296 }
297
expand_sets(LISP l,LISP fp,LISP sets)298 static LISP expand_sets(LISP l, LISP fp, LISP sets)
299 {
300 // Expand sets in l and fix regex characters
301 LISP r,es=NIL;
302
303 for (r=l; r != NIL; r=cdr(r))
304 {
305 LISP s = expand_set(car(r),fp,sets);
306 if (cdr(r) && (streq("*",get_c_string(car(cdr(r))))))
307 {
308 es = cons(cons(rintern("*"),s),es);
309 r=cdr(r);
310 }
311 else if (cdr(r) && (streq("+",get_c_string(car(cdr(r))))))
312 {
313 es = cons(cons(rintern("+"),s),es);
314 r=cdr(r);
315 }
316 else
317 es = cons(cons(rintern("and"),s),es);
318 }
319 return reverse(es);
320 }
321
expand_set(LISP p,LISP fp,LISP sets)322 static LISP expand_set(LISP p, LISP fp, LISP sets)
323 {
324 // expand p with respect to sets and feasible pairs
325 LISP set = siod_assoc_str(get_c_string(p),sets);
326 LISP s,f;
327 LISP r=NIL;
328
329 if (set == NIL)
330 set = cons(p,NIL);
331
332 for (s=set; s != NIL; s=cdr(s))
333 {
334 for (f=fp; f != NIL; f=cdr(f))
335 {
336 EST_String ss = get_c_string(car(s));
337 EST_String sf = get_c_string(car(f));
338
339 if (sf.contains(ss+"/"))
340 r = cons(car(f),r);
341 }
342 }
343
344 return reverse(r);
345 }
346
find_notMAP(LISP MAP,LISP fp)347 static LISP find_notMAP(LISP MAP,LISP fp)
348 {
349 // Returns REGEX that matches everything except MAP, this doesn't
350 // try all possible epsilons though
351 LISP r,notrp=NIL,m,np;
352 EST_String s,l,p,sr,lr,rr;
353
354 for (m=MAP; m != NIL; m=cdr(m))
355 {
356 p = get_c_string(car(m));
357 if (p.contains("/"))
358 {
359 s = p.before("/");
360 l = p.after("/");
361 }
362 else
363 {
364 s = p;
365 l = p;
366 }
367
368 for (np=NIL,r=fp; r != NIL; r = cdr(r))
369 {
370 rr = get_c_string(car(r));
371 if (rr.contains("/"))
372 {
373 sr = rr.before("/");
374 lr = rr.after("/");
375 }
376 else
377 {
378 sr = rr;
379 lr = rr;
380 }
381 if ((s == sr) && (l != lr))
382 np = cons(car(r),np);
383 }
384 notrp = cons(cons(rintern("or"),np),notrp);
385 }
386
387 return reverse(notrp);
388 }
389
390