1  /************************************************************************/
2  /*                                                                      */
3  /*                Centre for Speech Technology Research                 */
4  /*                     University of Edinburgh, UK                      */
5  /*                       Copyright (c) 2002                             */
6  /*                        All Rights Reserved.                          */
7  /*                                                                      */
8  /*  Permission is hereby granted, free of charge, to use and distribute */
9  /*  this software and its documentation without restriction, including  */
10  /*  without limitation the rights to use, copy, modify, merge, publish, */
11  /*  distribute, sublicense, and/or sell copies of this work, and to     */
12  /*  permit persons to whom this work is furnished to do so, subject to  */
13  /*  the following conditions:                                           */
14  /*   1. The code must retain the above copyright notice, this list of   */
15  /*      conditions and the following disclaimer.                        */
16  /*   2. Any modifications must be clearly marked as such.               */
17  /*   3. Original authors' names are not deleted.                        */
18  /*   4. The authors' names are not used to endorse or promote products  */
19  /*      derived from this software without specific prior written       */
20  /*      permission.                                                     */
21  /*                                                                      */
22  /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK       */
23  /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING     */
24  /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT  */
25  /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE    */
26  /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   */
27  /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN  */
28  /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,         */
29  /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF      */
30  /*  THIS SOFTWARE.                                                      */
31  /*                                                                      */
32  /*************************************************************************/
33  /*                                                                       */
34  /*                 Author: Rob Clark  (robert@cstr.ed.ac.uk)             */
35  /* --------------------------------------------------------------------  */
36  /* Code to read APML format XML as utterances.                           */
37  /*                                                                       */
38  /*************************************************************************/
39 
40 #include <cstdlib>
41 #include <cstdio>
42 #include "EST_THash.h"
43 #include "EST_error.h"
44 #include "apml.h"
45 #include "rxp/XML_Parser.h"
46 
// Patterns for "#id(w<digits>)" style references (a single id, and a pair
// of ids).  Neither is referenced in the part of this file visible here.
// NOTE(review): presumably "\\(" / "\\)" are EST_Regex group markers and the
// bare parentheses are literals -- confirm against the EST_Regex syntax.
static EST_Regex simpleIDRegex(".*#id(w\\([0-9]+\\))");
static EST_Regex rangeIDRegex(".*#id(w\\([0-9]+\\)).*id(w\\([0-9]+\\))");
// One or more of the punctuation characters . , ? ! "
// Used by pcdata() to recognise and strip punctuation around tokens.
static EST_Regex RXpunc("[\\.,\\?\\!\"]+");
50 
51 class Parse_State
52   {
53 public:
54     int depth;
55     int maxid;
56     EST_Utterance *utt;
57     EST_Relation *tokens;
58     EST_Relation *perf;
59     EST_Relation *com;
60     EST_Relation *semstruct;
61     EST_Relation *emphasis;
62     EST_Relation *boundary;
63     EST_Relation *pause;
64     EST_Item *parent;
65     EST_Item *pending;
66     EST_Item *last_token;
67   };
68 
69 class Apml_Parser_Class : public XML_Parser_Class
70 {
71 protected:
72   virtual void document_open(XML_Parser_Class &c,
73 			XML_Parser &p,
74 			void *data);
75   virtual void document_close(XML_Parser_Class &c,
76 			 XML_Parser &p,
77 			 void *data);
78 
79   virtual void element_open(XML_Parser_Class &c,
80 		       XML_Parser &p,
81 		       void *data,
82 		       const char *name,
83 		       XML_Attribute_List &attributes);
84   virtual void element(XML_Parser_Class &c,
85 		  XML_Parser &p,
86 		  void *data,
87 		  const char *name,
88 		  XML_Attribute_List &attributes);
89   virtual void element_close(XML_Parser_Class &c,
90 			XML_Parser &p,
91 			void *data,
92 			const char *name);
93 
94   virtual void pcdata(XML_Parser_Class &c,
95 		 XML_Parser &p,
96 		 void *data,
97 		 const char *chars);
98   virtual void cdata(XML_Parser_Class &c,
99 		XML_Parser &p,
100 		void *data,
101 		const char *chars);
102 
103   virtual void processing(XML_Parser_Class &c,
104 		     XML_Parser &p,
105 		     void *data,
106 		     const char *instruction);
107   virtual void error(XML_Parser_Class &c,
108 		XML_Parser &p,
109 		void *data);
110 };
111 
print_attributes(XML_Attribute_List & attributes)112 static void print_attributes(XML_Attribute_List &attributes)
113 {
114   XML_Attribute_List::Entries them;
115 
116   for(them.begin(attributes); them ; them++)
117     printf(" %s='%s'",
118 	   (const char *)them->k,
119 	   (const char *)them->v);
120 }
121 
apml_read(FILE * file,const EST_String & name,EST_Utterance & u,int & max_id)122 EST_read_status apml_read(FILE *file,
123 			     const EST_String &name,
124 			     EST_Utterance &u,
125 			     int &max_id)
126 {
127   (void)max_id;
128   (void)print_attributes;	// just to shut -Wall up.
129   Apml_Parser_Class pclass;
130   Parse_State state;
131 
132   u.clear();
133 
134   state.utt=&u;
135 
136   XML_Parser *parser = pclass.make_parser(file, name, &state);
137   parser->track_context(TRUE);
138 
139   CATCH_ERRORS()
140     return read_format_error;
141 
142   parser->go();
143 
144   END_CATCH_ERRORS();
145 
146   return read_ok;
147 }
148 
149 
150 
151 /** Now we define the callbacks.
152   */
153 
document_open(XML_Parser_Class & c,XML_Parser & p,void * data)154 void Apml_Parser_Class::document_open(XML_Parser_Class &c,
155 		      XML_Parser &p,
156 		      void *data)
157 {
158   (void)c; (void)p;
159   Parse_State *state = (Parse_State *)data;
160 
161   state->maxid=0;
162 
163   state->depth=1;
164   state->parent=NULL;
165   state->pending=NULL;
166   state->last_token=NULL;
167 
168   // create relations:
169   state->perf = state->utt->create_relation("Perfomative");
170   state->com = state->utt->create_relation("Communicative");
171   state->tokens = state->utt->create_relation("Token");
172   state->semstruct = state->utt->create_relation("SemStructure");
173   state->emphasis = state->utt->create_relation("Emphasis");
174   state->boundary = state->utt->create_relation("Boundary");
175   state->pause = state->utt->create_relation("Pause");
176 
177 
178 }
179 
document_close(XML_Parser_Class & c,XML_Parser & p,void * data)180 void Apml_Parser_Class::document_close(XML_Parser_Class &c,
181 		    XML_Parser &p,
182 		    void *data)
183 {
184   (void)c; (void)p; (void)data;
185 }
186 
187 
element_open(XML_Parser_Class & c,XML_Parser & p,void * data,const char * name,XML_Attribute_List & attributes)188 void Apml_Parser_Class::element_open(XML_Parser_Class &c,
189 		  XML_Parser &p,
190 		  void *data,
191 		  const char *name,
192 		  XML_Attribute_List &attributes)
193 {
194   (void)c; (void)p; (void)attributes;
195   Parse_State *state = (Parse_State *)data;
196 
197   //cout << " In element_open: " << name << "\n";
198 
199   if (strcmp(name, "turnallocation")==0)
200     {
201       // currently ignore
202       return;
203     }
204 
205   if (strcmp(name, "apml")==0)
206     return;  // ignore
207 
208   state->depth++;
209 
210   if( strcmp(name, "performative")==0
211       || strcmp(name, "rheme")==0
212       || strcmp(name, "theme")==0
213       || strcmp(name, "emphasis")==0
214       || strcmp(name, "boundary")==0
215       || strcmp(name, "pause")==0)
216     {
217 
218       // create new item content
219       EST_Item_Content *cont = new EST_Item_Content();
220       cont->set_name(name);
221 
222       XML_Attribute_List::Entries them;
223       for(them.begin(attributes); them ; them++)
224 	{
225 	  EST_String k = them->k;
226 	  EST_String v = them->v;
227 	  cont->f.set(k,v);
228 	}
229 
230       EST_Item *item;
231 
232       if( strcmp(name, "emphasis")==0 )
233 	{
234 	  item = state->emphasis->append();
235 	  state->pending = item;
236 	}
237       else if(strcmp(name, "boundary")==0 )
238 	{
239 	  item = state->boundary->append();
240 	  if(state->last_token)
241 	    item->append_daughter(state->last_token);
242 	}
243       else if(strcmp(name, "pause")==0 )
244 	{
245 	  item = state->pause->append();
246 	  if(state->last_token)
247 	    item->append_daughter(state->last_token);
248 	}
249       else
250 	{
251 	  if (state->parent == NULL)
252 	    item = state->semstruct->append();
253 	  else
254 	    item = state->parent->append_daughter();
255 	  state->parent=item;
256 	}
257 
258       item->set_contents(cont);
259 
260 
261     }
262   else
263     EST_warning("APML Parser: unknown element %s", name);
264 }
265 
266 
element(XML_Parser_Class & c,XML_Parser & p,void * data,const char * name,XML_Attribute_List & attributes)267 void Apml_Parser_Class::element(XML_Parser_Class &c,
268 				XML_Parser &p,
269 				void *data,
270 				const char *name,
271 				XML_Attribute_List &attributes)
272 {
273   (void)c; (void)p; (void)attributes;
274 
275   element_open(c, p, data, name, attributes);
276   element_close(c, p, data, name);
277 }
278 
279 
element_close(XML_Parser_Class & c,XML_Parser & p,void * data,const char * name)280 void Apml_Parser_Class::element_close(XML_Parser_Class &c,
281 		   XML_Parser &p,
282 		   void *data,
283 		   const char *name)
284 {
285   (void)c; (void)p; (void)name;
286   Parse_State *state = (Parse_State *)data;
287 
288   if ( strcmp(name, "emphasis")==0
289        || strcmp(name, "boundary")==0
290        || strcmp(name, "pause")==0 )
291     {
292       state->depth--;
293       state->pending=NULL;
294     }
295 
296 
297   if (strcmp(name, "performative")==0
298       || strcmp(name, "theme")==0
299       || strcmp(name, "rheme")==0)
300     {
301       state->depth--;
302       state->pending = NULL;
303       state->parent=state->parent->up();
304     }
305 }
306 
307 
pcdata(XML_Parser_Class & c,XML_Parser & p,void * data,const char * chars)308 void Apml_Parser_Class::pcdata(XML_Parser_Class &c,
309 	    XML_Parser &p,
310 	    void *data,
311 	    const char *chars)
312 {
313   (void)c;
314 
315  Parse_State *state = (Parse_State *)data;
316  EST_String strings[255];
317 
318  split(chars,strings,255,RXwhite);
319 
320  //   for(int cc=0 ; cc < 20 ; ++cc)
321  //  cout << cc << ": \"" << strings[cc] << "\" (" << strings[cc].length() << ")\n";
322 
323  int s=0;
324 
325  while( s < 1 || strings[s].length() > 0 )
326    {
327      if(strings[s].length() > 0 )
328        {
329 	 // Just Punctuation
330 	 if(strings[s].matches(RXpunc))
331 	   {
332 	     state->last_token->set("punc",strings[s]);
333 	   }
334 	 // Text and possibly punc
335 	 else
336 	   {
337 	     EST_Item_Content *cont = new EST_Item_Content();
338 	     EST_Item *item;
339 
340 	     if (state->parent == NULL)
341 	       item = state->semstruct->append();
342 	     else
343 	       item = state->parent->append_daughter();
344 	     item->set_contents(cont);
345 
346 	     // strip pre-punc here.
347 	     int i = strings[s].index(RXpunc);
348 	     EST_String ps = strings[s].at(RXpunc);
349 	     EST_String intermediate;
350 	     if( ps.length() > 0 && i == 0)
351 	       {
352 		 cout << "Got pre punc: " << ps << endl;
353 		 intermediate = strings[s].after(RXpunc);
354 		 // cont->set_name(strings[s].before(RXpunc));
355 		 item->set("prepunctuation",ps);
356 	       }
357 	     else
358 	       {
359 		 intermediate = strings[s];
360 		 item->set("prepunctuation","");
361 	       }
362 	     // now strip punc
363 	     ps = intermediate.at(RXpunc);
364 	     if( ps.length() > 0 )
365 	       {
366 		 cout << "Got punc: " << ps << endl;
367 		 cont->set_name(intermediate.before(RXpunc));
368 		 item->set("punc",ps);
369 	       }
370 	     else
371 	       {
372 		 cont->set_name(intermediate);
373 		 item->set("punc","");
374 	       }
375 
376 	   state->tokens->append(item);
377 	   state->last_token = item;
378 
379 	   if(state->pending)
380 	     {
381 	       state->pending->append_daughter(item);
382 	     }
383 
384 	   //  if (state->parent != NULL && p.context(0) == "w")
385 	   //  state->parent->set(EST_String("token"), chars);
386 
387 	   //cout << "  got token: " << item->name() << "\n";
388 	   }
389        }
390      ++s;
391    }
392 }
393 
394 
cdata(XML_Parser_Class & c,XML_Parser & p,void * data,const char * chars)395 void Apml_Parser_Class::cdata(XML_Parser_Class &c,
396 	   XML_Parser &p,
397 	   void *data,
398 	   const char *chars)
399 {
400   (void)c; (void)p; (void)data; (void)chars;
401   // Parse_State *state = (Parse_State *)data;
402 
403   //   printf("APML XML Parser [cdata[%s]] %d\n", chars, state->depth);
404 }
405 
406 
processing(XML_Parser_Class & c,XML_Parser & p,void * data,const char * instruction)407 void Apml_Parser_Class::processing(XML_Parser_Class &c,
408 		XML_Parser &p,
409 		void *data,
410 		const char *instruction)
411 {
412   (void)c; (void)p;
413   Parse_State *state = (Parse_State *)data;
414 
415   printf("APML XML Parser [proc[%s]] %d\n", instruction, state->depth);
416 }
417 
418 
error(XML_Parser_Class & c,XML_Parser & p,void * data)419 void Apml_Parser_Class::error(XML_Parser_Class &c,
420 	   XML_Parser &p,
421 	   void *data)
422 {
423   (void)c; (void)p;  (void)data;
424   // Parse_State *state = (Parse_State *)data;
425 
426   EST_error("APML Parser %s", get_error(p));
427 
428   est_error_throw();
429 }
430 
431 
432 
433 
434 
435 
436 
437