1  /************************************************************************/
2  /*                                                                      */
3  /*                Centre for Speech Technology Research                 */
4  /*                     University of Edinburgh, UK                      */
5  /*                        Copyright (c) 1997                            */
6  /*                        All Rights Reserved.                          */
7  /*                                                                      */
8  /*  Permission is hereby granted, free of charge, to use and distribute */
9  /*  this software and its documentation without restriction, including  */
10  /*  without limitation the rights to use, copy, modify, merge, publish, */
11  /*  distribute, sublicense, and/or sell copies of this work, and to     */
12  /*  permit persons to whom this work is furnished to do so, subject to  */
13  /*  the following conditions:                                           */
14  /*   1. The code must retain the above copyright notice, this list of   */
15  /*      conditions and the following disclaimer.                        */
16  /*   2. Any modifications must be clearly marked as such.               */
17  /*   3. Original authors' names are not deleted.                        */
18  /*   4. The authors' names are not used to endorse or promote products  */
19  /*      derived from this software without specific prior written       */
20  /*      permission.                                                     */
21  /*                                                                      */
22  /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK       */
23  /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING     */
24  /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT  */
25  /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE    */
26  /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   */
27  /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN  */
28  /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,         */
29  /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF      */
30  /*  THIS SOFTWARE.                                                      */
31  /*                                                                      */
32  /************************************************************************/
33  /*	       Author : Richard Caley (rjc@cstr.ed.ac.uk)		 */
34  /*                 Date  : February 1997                                */
35  /* -------------------------------------------------------------------- */
36  /*                                                                      */
37  /* A Regular expression class to go with the CSTR EST_String class. Uses*/
/* Henry Spencer's regexp routines which allocate space dynamically     */
39  /* using malloc, so we use free in here rather than wfree because       */
40  /* wfree might at some time start doing something more than just be a   */
41  /* safe wrapper around free. If you try and use another regexp          */
42  /* package, beware of changes to how memory is allocated.               */
43  /*                                                                      */
44  /* We maintain two compiled versions, one for substring matches and     */
45  /* one for whole string matches (because sometimes the regexp           */
46  /* compiler can special case the latter). These are compiled when       */
47  /* first used.                                                          */
48  /*                                                                      */
49  /************************************************************************/
50 
51 #ifdef NO_EST
52 #    include <unistd.h>
53 #else
54 #    include "EST_unix.h"
55 #endif
56 #include <cstdlib>
57 #include <cstdio>
58 #include <cstring>
59 #include "EST_String.h"
60 #include "EST_Regex.h"
61 
62 #ifdef sun
63 #ifndef __svr4__
64 /* SunOS */
65 #include <cstring>
66 #endif
67 #endif
68 
69 // extern "C" {
70 #include "regexp.h"
71 
72 /*
73 void *t_regcomp(void *v)
74 {
75   return v;
76 }
77 
78 void *cpp_regcomp(void *v)
79 {
80   return v;
81 }
82 */
83 // #define wfree(P) (1==1)
84 
85 // These define the different escape conventions for the FSF's
86 // regexp code and Henry Spencer's
87 
88 static const char *fsf_magic="^$*+?[].\\";
89 static const char *fsf_magic_backslashed="()|<>";
90 static const char *spencer_magic="^$*+?[].()|\\\n";
91 static const char *spencer_magic_backslashed="<>";
92 
93 EST_Regex RXwhite("[ \n\t\r]+");
94 EST_Regex RXalpha("[A-Za-z]+");
95 EST_Regex RXlowercase("[a-z]+");
96 EST_Regex RXuppercase("[A-Z]+");
97 EST_Regex RXalphanum("[0-9A-Za-z]+");
98 EST_Regex RXidentifier("[A-Za-z_][0-9A-Za-z_]+");
99 EST_Regex RXint("-?[0-9]+");
100 EST_Regex RXdouble("-?\\(\\([0-9]+\\.[0-9]*\\)\\|\\([0-9]+\\)\\|\\(\\.[0-9]+\\)\\)\\([eE][---+]?[0-9]+\\)?");
101 
// Compiled regexes are released with free() rather than wfree(), since
// the regexp package allocates with malloc and wfree might end up doing
// something clever.
104 
105 /* extern "C" void free(void *p); */
106 
107 #if NSUBEXP != EST_Regex_max_subexpressions
108 #   error "EST_Regex_max_subexpressions must be equal to  NSUBEXP"
109 #endif
110 
EST_Regex(void)111 EST_Regex::EST_Regex(void) : EST_String()
112 {
113   compiled = NULL;
114   compiled_match = NULL;
115 }
116 
EST_Regex(const char * s)117 EST_Regex::EST_Regex(const char *s) : EST_String(s)
118 
119 {
120 
121   compiled = NULL;
122   compiled_match = NULL;
123 
124 }
125 
EST_Regex(EST_String s)126 EST_Regex::EST_Regex(EST_String s) : EST_String(s)
127 
128 {
129   compiled = NULL;
130   compiled_match = NULL;
131 }
132 
// Copy constructor. Only the pattern text is copied; the compiled forms
// of ex are not shared, so the copy compiles its own when first used.
EST_Regex::EST_Regex(const EST_Regex &ex)
  : EST_String(ex),
    compiled(NULL),
    compiled_match(NULL)
{
}
138 
139 
~EST_Regex()140 EST_Regex::~EST_Regex()
141 {
142     if (compiled_match)
143       free(compiled_match);
144     if (compiled)
145       free(compiled);
146 }
147 
// Convert a regular expression from the external syntax (defined by
// the FSF library) to the one expected by the regexp routines (which
// say it's V8 syntax).
151 
regularize(int match) const152 char *EST_Regex::regularize(int match) const
153 {
154   char *reg = walloc(char, size()*2+3);
155   char *r=reg;
156   const char *e;
157   int magic=0,last_was_bs=0;
158   const char * in_brackets=NULL;
159   const char *ex = (size()==0)?"":str();
160 
161   if (match && *ex != '^')
162     *(r++) = '^';
163 
164   for(e=ex; *e ; e++)
165     {
166      if (*e == '\\' && !last_was_bs)
167        {
168 	 last_was_bs=1;
169 	 continue;
170        }
171 
172      magic=strchr((last_was_bs?fsf_magic_backslashed:fsf_magic), *e)!=NULL;
173 
174      if (in_brackets)
175        {
176 	 *(r++) = *e;
177 	 if (*e  == ']' && (e-in_brackets)>1)
178 	   in_brackets=0;
179        }
180      else if (magic)
181        {
182 	 if (strchr(spencer_magic_backslashed, *e))
183 	   *(r++) = '\\';
184 
185 	 *(r++) = *e;
186 	 if (*e  == '[')
187 	   in_brackets=e;
188        }
189      else
190        {
191 	 if (strchr(spencer_magic, *e))
192 	     *(r++) = '\\';
193 
194 	 *(r++) = *e;
195        }
196      last_was_bs=0;
197     }
198 
199   if (match && (e==ex || *(e-1) != '$'))
200     {
201       if (last_was_bs)
202 	*(r++) = '\\';
203       *(r++) = '$';
204     }
205 
206   *r='\0';
207 
208   //  cerr<<"reg||"<<ex<<"||"<<reg<<"\n";
209 
210   return reg;
211 }
212 
compile()213 void EST_Regex::compile()
214 {
215   if (!compiled)
216     {
217       char *reg=regularize(0);
218       void * t =(void *)hs_regcomp(reg);
219       compiled=t;
220       wfree(reg);
221     }
222 
223   if (!compiled)
224     cerr << "EST_Regex: can't compile '" << str() << "'\n";
225 }
226 
compile_match()227 void EST_Regex::compile_match()
228 {
229   if (!compiled_match)
230     {
231       char *reg=regularize(1);
232 
233       void * t =(void *)hs_regcomp(reg);
234       compiled_match=t;
235       wfree(reg);
236     }
237 
238   if (!compiled_match)
239       cerr << "EST_Regex: can't compile '" << str() << "'\n";
240 }
241 
run(const char * on,int from,int & start,int & end,int * starts,int * ends)242 int EST_Regex::run(const char *on, int from, int &start, int &end, int *starts, int *ends)
243 {
244 
245   compile();
246 
247   if (compiled && from <= (int)strlen(on))
248     {
249       if (hs_regexec((hs_regexp *)compiled, on+from))
250 	{
251 	  hs_regexp *re = (hs_regexp *)compiled;
252 
253 	  start = re->startp[0] - on;
254 	  end   = re->endp[0]- on;
255 
256 	  if (starts)
257 	    {
258 	      int i;
259 	      for (i=0; i<EST_Regex_max_subexpressions; i++)
260 		starts[i] = re->startp[i]?(re->startp[i] - on):-1;
261 	    }
262 	  if (ends)
263 	    {
264 	      int i;
265 	      for (i=0; i<EST_Regex_max_subexpressions; i++)
266 		  ends[i] = re->endp[i]?(re->endp[i] - on):-1;
267 	    }
268 
269 	  return 1;
270 	}
271     }
272   return 0;
273 }
274 
run_match(const char * on,int from,int * starts,int * ends)275 int EST_Regex::run_match(const char *on, int from, int *starts, int *ends)
276 {
277 
278   compile_match();
279 
280   hs_regexp *re = (hs_regexp *)compiled_match;
281 
282   if (compiled_match && from <= (int)strlen(on))
283     if (hs_regexec(re, on+from))
284       {
285 	  if (starts)
286 	    {
287 	      int i;
288 	      for (i=0; i<EST_Regex_max_subexpressions; i++)
289 		starts[i] = re->startp[i]?(re->startp[i] - on):-1;
290 	    }
291 	  if (ends)
292 	    {
293 	      int i;
294 	      for (i=0; i<EST_Regex_max_subexpressions; i++)
295 		ends[i] = re->endp[i]?(re->endp[i] - on):-1;
296 	    }
297 	  return 1;
298       }
299 
300   return 0;
301 }
302 
// Assign the pattern of another expression to this one. Only the
// pattern text is taken; this object compiles its own forms again when
// next used. Returns *this.
EST_Regex &EST_Regex::operator = (const EST_Regex ex)
{
  ((EST_String &)(*this)) = (EST_String)ex;

  // The cached compiled forms belong to the old pattern. Release them
  // (with free, as the destructor does) before forgetting them;
  // otherwise every assignment to an already used expression leaks.
  free(compiled_match);
  free(compiled);
  compiled = NULL;
  compiled_match = NULL;

  return *this;
}
311 
operator =(const EST_String s)312 EST_Regex &EST_Regex::operator = (const EST_String s)
313 {
314   ((EST_String &)(*this)) = s;
315   compiled = NULL;
316   compiled_match = NULL;
317 
318   return *this;
319 }
320 
operator =(const char * s)321 EST_Regex &EST_Regex::operator = (const char *s)
322 {
323   ((EST_String &)(*this)) = s;
324   compiled = NULL;
325   compiled_match = NULL;
326 
327   return *this;
328 }
329 
operator <<(ostream & s,const EST_Regex & str)330 ostream &operator << (ostream &s, const EST_Regex &str)
331 {
332   return s << (EST_String)str;
333 }
334 
335