1 /*************************************************************************/
2 /*                                                                       */
3 /*                Centre for Speech Technology Research                  */
4 /*                     University of Edinburgh, UK                       */
5 /*                      Copyright (c) 1996-1998                          */
6 /*                        All Rights Reserved.                           */
7 /*                                                                       */
8 /*  Permission is hereby granted, free of charge, to use and distribute  */
9 /*  this software and its documentation without restriction, including   */
10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
12 /*  permit persons to whom this work is furnished to do so, subject to   */
13 /*  the following conditions:                                            */
14 /*   1. The code must retain the above copyright notice, this list of    */
15 /*      conditions and the following disclaimer.                         */
16 /*   2. Any modifications must be clearly marked as such.                */
17 /*   3. Original authors' names are not deleted.                         */
18 /*   4. The authors' names are not used to endorse or promote products   */
19 /*      derived from this software without specific prior written        */
20 /*      permission.                                                      */
21 /*                                                                       */
22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
30 /*  THIS SOFTWARE.                                                       */
31 /*                                                                       */
32 /*************************************************************************/
33 /*                     Author :  Alan W Black                            */
34 /*                     Date   :  December 1997                           */
35 /*-----------------------------------------------------------------------*/
36 /*                                                                       */
37 /* A Koskenniemi/Kay/Kaplan rule compiler to WFST using the techniques   */
38 /* Ritchie et al.'s "Computational Morphology" (but followed through to  */
39 /* make real WFSTs).                                                     */
40 /*                                                                       */
41 /*=======================================================================*/
42 #include <iostream>
43 #include "EST_WFST.h"
44 #include "EST_cutils.h"
45 
operator <<(ostream & s,const EST_WFST & w)46 ostream &operator << (ostream &s, const EST_WFST &w)
47 {
48   (void)w;
49   return s << "<<EST_WFST>>";
50 }
51 
52 Declare_TList(EST_WFST)
53 
54 #if defined(INSTANTIATE_TEMPLATES)
55 #include "../base_class/EST_TList.cc"
56 
57 Instantiate_TList(EST_WFST)
58 
59 #endif
60 
61 static LISP expand_fp(const EST_String p,LISP fp);
62 static LISP find_feasible_pairs(LISP rules);
63 static LISP all_but(LISP rulepair,LISP fp);
64 static LISP expand_sets(LISP sets,LISP fp);
65 static LISP inline_sets(LISP l, LISP sets);
66 static void full_kkcompile(LISP inalpha,LISP outalpha,
67 			   LISP fp, LISP rules, LISP sets,
68 			   EST_WFST &all_wfst);
69 
kkcompile(LISP ruleset,EST_WFST & all_wfst)70 void kkcompile(LISP ruleset, EST_WFST &all_wfst)
71 {
72     // Build a transducer from given kkrule (Kay/Kaplan/Koskenniemi)
73     // Rules are of the form LeftContext Map RightContext
74 
75     // The WFST is recognizing all string except the rulepair unless
76     // its in the proper context.
77     LISP fp;  // feasible pairs, those pairs with rules (rather than IxO)
78     LISP inalpha = siod_nth(1,siod_assoc_str("Alphabets",cdr(cdr(ruleset))));
79     LISP outalpha = siod_nth(2,siod_assoc_str("Alphabets",cdr(cdr(ruleset))));
80     LISP sets = cdr(siod_assoc_str("Sets",ruleset));
81     LISP rules = cdr(siod_assoc_str("Rules",ruleset));
82 
83     fp = find_feasible_pairs(rules);
84     sets = expand_sets(sets,fp);
85 
86     full_kkcompile(inalpha,outalpha,fp,rules,sets,all_wfst);
87 }
88 
full_kkcompile(LISP inalpha,LISP outalpha,LISP fp,LISP rules,LISP sets,EST_WFST & all_wfst)89 static void full_kkcompile(LISP inalpha,LISP outalpha,
90 			   LISP fp, LISP rules, LISP sets,
91 			   EST_WFST &all_wfst)
92 {
93     wfst_list rulelist;
94     LISP r;
95 
96     for (r=rules; r != NIL; r=cdr(r))
97     {
98 	EST_WFST r_wfst,base_wfst,det_wfst;
99 	rulelist.append(r_wfst);
100 	EST_WFST &rr_wfst = rulelist.last(); // to avoid copying the filled one
101 	cout << "Rule: " << siod_llength(rules)-siod_llength(r) << endl;
102 	pprint(car(r));
103 	base_wfst.kkrule_compile(inalpha,outalpha,fp,car(r),sets);
104 	cout << "          base " << base_wfst.summary() << endl;
105 	det_wfst.determinize(base_wfst);
106 	cout << "  determinized " << det_wfst.summary() << endl;
107 	rr_wfst.minimize(det_wfst);
108 	cout << "     minimized " << rr_wfst.summary() << endl;
109     }
110 
111     cout << "WFST: intersecting " << rulelist.length() << " rules" << endl;
112     EST_Litem *p,*nnp;
113     int i;
114     for (i=0,p=rulelist.head(); p->next() != 0; p=nnp)
115     {
116 	EST_WFST r_wfst,base_wfst,det_wfst;
117 	EST_WFST mmm;
118 	rulelist.append(r_wfst);
119 	EST_WFST &rr_wfst = rulelist.last(); // to avoid copying the filled one
120 	cout << "intersecting " << i << " and " << i+1 << " " <<
121 	    rulelist.length()-2 << " left" << endl;
122 	cout << "   " << rulelist(p).summary() << " and " << endl;
123 	cout << "   " << rulelist(p->next()).summary() << " becomes " << endl;
124 	mmm.intersection(rulelist(p),rulelist(p->next()));
125 	cout << "   " << mmm.summary() << " minimizes to " << endl;
126 	rr_wfst.minimize(mmm);
127 	cout << "   " << rr_wfst.summary() << endl;
128 	nnp=p->next()->next();
129 	i+=2;
130 	rulelist.remove(p->next());
131 	rulelist.remove(p);
132     }
133 
134     all_wfst = rulelist.first();
135 
136 }
137 
expand_sets(LISP sets,LISP fp)138 static LISP expand_sets(LISP sets,LISP fp)
139 {
140     // Expand sets into regexes that represent them.  Single
141     // char values are converted to disjunctions of feasible pairs
142     // that have the same surface character
143     LISP s,es=NIL,e,ne;
144 
145     for (s=sets; s != NIL; s=cdr(s))
146     {
147 	for (ne=NIL,e=cdr(car(s)); e != NIL; e=cdr(e))
148 	{
149 	    EST_String ss = get_c_string(car(e));
150 	    if (ss.contains("/"))
151 		ne = cons(car(e),ne);
152 	    else
153 		ne = append(expand_fp(ss,fp),ne);
154 	}
155 	if (ne == NIL)
156 	{
157 	    cerr << "WFST: kkcompile: set " << get_c_string(car(car(s))) <<
158 		" has no feasible pairs" << endl;
159 	}
160 
161 	else if (siod_llength(ne) == 1)
162 	    es = cons(cons(car(car(s)),ne),es);
163 	else
164 	    es = cons(cons(car(car(s)),
165 			   cons(cons(rintern("or"),reverse(ne)),
166 				NIL)),es);
167     }
168 
169     return reverse(es);
170 }
171 
expand_fp(const EST_String p,LISP fp)172 static LISP expand_fp(const EST_String p,LISP fp)
173 {
174     // Find all fp's that have this p as their surface char
175     LISP m=NIL,f;
176     EST_Regex rg(EST_String("^")+p+"/.*");
177 
178     for (f=fp; f != NIL; f=cdr(f))
179     {
180 	EST_String ss = get_c_string(car(f));
181 	if ((p == ss) || (ss.matches(rg)))
182 	    m = cons(car(f),m);
183     }
184     return m;
185 }
186 
find_feasible_pairs(LISP rules)187 static LISP find_feasible_pairs(LISP rules)
188 {
189     // Find the set of pairs that have rules associated with them
190     // This effectively defines the transducer alphabet.
191     LISP fp = NIL;
192     LISP r;
193 
194     for (r=rules; r != NIL; r=cdr(r))
195     {
196 	if (siod_member_str(get_c_string(siod_nth(0,car(r))),fp) == NIL)
197 	    fp = cons(siod_nth(0,car(r)),fp);
198     }
199     return fp;
200 }
201 
surface_coercion(LISP rt)202 static int surface_coercion(LISP rt)
203 {
204     return (streq("<=",get_c_string(rt)));
205 }
206 
context_restriction(LISP rt)207 static int context_restriction(LISP rt)
208 {
209     return (streq("=>",get_c_string(rt)));
210 }
211 
composite(LISP rt)212 static int composite(LISP rt)
213 {
214     return (streq("<=>",get_c_string(rt)));
215 }
216 
inline_sets(LISP l,LISP sets)217 static LISP inline_sets(LISP l, LISP sets)
218 {
219     // Replace any set name with the regex equivalent
220     LISP s;
221     if (l == NIL)
222 	return NIL;
223     else if (consp(l))
224 	return cons(inline_sets(car(l),sets),inline_sets(cdr(l),sets));
225     else if ((s=siod_assoc_str(get_c_string(l),sets)) != NIL)
226 	return car(cdr(s));
227     else
228 	return l;
229 }
230 
kkrule_compile(LISP inalpha,LISP outalpha,LISP fp,LISP rule,LISP sets)231 void EST_WFST::kkrule_compile(LISP inalpha, LISP outalpha, LISP fp,
232 			      LISP rule,LISP sets)
233 {
234     // Build a WFST to transduce this particular rule
235     // Accepts any other combination of feasible pairs too
236     LISP leftcontext = inline_sets(siod_nth(2,rule),sets);
237     LISP rulepair = siod_nth(0,rule);
238     LISP ruletype = siod_nth(1,rule);
239     LISP rightcontext = inline_sets(siod_nth(4,rule),sets);
240     LISP p;
241     int i;
242     int end_LC,end_RP,end_NOTRP,end_RC,err_state;
243 
244     // Initialize alphabets
245     init(inalpha,outalpha);  // should be passed as discretes
246 
247     p_start_state = add_state(wfst_final);  // empty WFST
248     // Add transitions for all pairs except rulepair
249     for (p=fp; p != NIL; p=cdr(p))
250 	if ((!equal(rulepair,car(p))) ||
251 	    (surface_coercion(ruletype)))
252 	    build_wfst(p_start_state,p_start_state,car(p));
253 
254     // build for LC
255     if (leftcontext)
256     {
257 	end_LC = add_state(wfst_final);
258 	build_wfst(p_start_state,end_LC,leftcontext);
259 	// for all states in LC mark final & add epsilon to p_start_state
260 	for (i=end_LC; i < p_num_states; i++)
261 	{
262 	    build_wfst(i,p_start_state,epsilon_label());
263 	    p_states[i]->set_type(wfst_final);
264 	}
265     }
266     else   // no LC
267 	end_LC = p_start_state;
268 
269     // build for RP and RC from end_LC
270     if (composite(ruletype) || context_restriction(ruletype))
271     {
272 	if (rightcontext)
273 	{
274 	    end_RP = add_state(wfst_nonfinal);
275 	    build_wfst(end_LC,end_RP,rulepair);
276 	    // build for RC from end map to p_start_state
277 	    build_wfst(end_RP,p_start_state,rightcontext);
278 	    err_state = add_state(wfst_error);
279 	    for (i=end_RP; i < err_state; i++)
280 	    {   // for everything other that the correct path go to err_state
281 		// without this explicit error state the epsilon to start
282 		// allows almost everything
283 		if (transition(i,get_c_string(epsilon_label()))
284 		    != WFST_ERROR_STATE)
285 		    break;  // not a state require extra transitions
286 		for (p=fp; p != NIL; p=cdr(p))
287 		    if (transition(i,get_c_string(car(p))) == WFST_ERROR_STATE)
288 			build_wfst(i,err_state,car(p));
289 		build_wfst(i,p_start_state,epsilon_label());
290 		p_states[i]->set_type(wfst_licence);
291 	    }
292 	}
293 	else  // no RC, so end back at start
294 	    build_wfst(end_LC,p_start_state,rulepair);
295     }
296 
297     // Build for notRP and RC from end_LC
298     if (composite(ruletype) || surface_coercion(ruletype))
299     {
300 	LISP abrp = all_but(rulepair,fp);
301 	if (abrp)
302 	{
303 	    if (rightcontext)
304 	    {
305 		end_RC = add_state(wfst_error);
306 		end_NOTRP = add_state(wfst_nonfinal);
307 		build_wfst(end_LC,end_NOTRP,abrp);
308 		// build for RC from end RP to error state
309 		build_wfst(end_NOTRP,end_RC,rightcontext);
310 		// for all states in RC except final one mark final & add
311 		// epsilon to p_start_state
312 		for (i=end_NOTRP; i < p_num_states; i++)
313 		{
314 		    build_wfst(i,p_start_state,epsilon_label());
315 		    p_states[i]->set_type(wfst_final);
316 		}
317 	    }
318 	    else			// no RC,
319 	    {
320 		end_RC = add_state(wfst_error);
321 		build_wfst(end_LC,end_RC,abrp);
322 	    }
323 	}
324     }
325 }
326 
all_but(LISP rulepair,LISP fp)327 static LISP all_but(LISP rulepair,LISP fp)
328 {
329     // Returns pairs that have the same surface symbol as rulepair
330     // but different lexical symbol
331     LISP r,notrp=NIL;
332     EST_String s,l,p,sr,lr,rr;
333 
334     p = get_c_string(rulepair);
335     if (p.contains("/"))
336     {
337 	s = p.before("/");
338 	l = p.after("/");
339     }
340     else
341     {
342 	s = p;
343 	l = p;
344     }
345 
346     for (r=fp; r != NIL; r = cdr(r))
347     {
348 	rr = get_c_string(car(r));
349 	if (rr.contains("/"))
350 	{
351 	    sr = rr.before("/");
352 	    lr = rr.after("/");
353 	}
354 	else
355 	{
356 	    sr = rr;
357 	    lr = rr;
358 	}
359 	if ((l != lr) && (s == sr))
360 	    notrp = cons(car(r),notrp);
361     }
362 
363     if (siod_llength(notrp) > 1)
364 	notrp = cons(strintern("or"),notrp);
365     return notrp;
366 }
367 
intersect(wfst_list & wl,EST_WFST & all)368 void intersect(wfst_list &wl, EST_WFST &all)
369 {
370     // Intersect the wfst's in wl into all
371 
372     all.intersection(wl);
373 
374 }
375 
376