1 /************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /************************************************************************/
33 /* Author : Richard Caley (rjc@cstr.ed.ac.uk) */
34 /* Date : February 1997 */
35 /* -------------------------------------------------------------------- */
36 /* */
37 /* A Regular expression class to go with the CSTR EST_String class. Uses*/
38 /* Henry Spencer`s regexp routines which allocate space dynamically */
39 /* using malloc, so we use free in here rather than wfree because */
40 /* wfree might at some time start doing something more than just be a */
41 /* safe wrapper around free. If you try and use another regexp */
42 /* package, beware of changes to how memory is allocated. */
43 /* */
44 /* We maintain two compiled versions, one for substring matches and */
45 /* one for whole string matches (because sometimes the regexp */
46 /* compiler can special case the latter). These are compiled when */
47 /* first used. */
48 /* */
49 /************************************************************************/
50
51 #ifdef NO_EST
52 # include <unistd.h>
53 #else
54 # include "EST_unix.h"
55 #endif
56 #include <cstdlib>
57 #include <cstdio>
58 #include <cstring>
59 #include "EST_String.h"
60 #include "EST_Regex.h"
61
62 #ifdef sun
63 #ifndef __svr4__
64 /* SunOS */
65 #include <cstring>
66 #endif
67 #endif
68
69 // extern "C" {
70 #include "regexp.h"
71
72 /*
73 void *t_regcomp(void *v)
74 {
75 return v;
76 }
77
78 void *cpp_regcomp(void *v)
79 {
80 return v;
81 }
82 */
83 // #define wfree(P) (1==1)
84
// Escape conventions of the two regular expression dialects involved:
// the FSF one, in which expressions are written, and Henry Spencer's,
// which is what the regexp routines used here understand.  For each
// dialect the first table lists the characters that act as operators
// when written bare, the second those that act as operators only when
// preceded by a backslash.

static const char *const fsf_magic                 = "^$*+?[].\\";
static const char *const fsf_magic_backslashed     = "()|<>";
static const char *const spencer_magic             = "^$*+?[].()|\\\n";
static const char *const spencer_magic_backslashed = "<>";
92
93 EST_Regex RXwhite("[ \n\t\r]+");
94 EST_Regex RXalpha("[A-Za-z]+");
95 EST_Regex RXlowercase("[a-z]+");
96 EST_Regex RXuppercase("[A-Z]+");
97 EST_Regex RXalphanum("[0-9A-Za-z]+");
98 EST_Regex RXidentifier("[A-Za-z_][0-9A-Za-z_]+");
99 EST_Regex RXint("-?[0-9]+");
100 EST_Regex RXdouble("-?\\(\\([0-9]+\\.[0-9]*\\)\\|\\([0-9]+\\)\\|\\(\\.[0-9]+\\)\\)\\([eE][---+]?[0-9]+\\)?");
101
// Compiled expressions are released with free() rather than wfree(),
// since the regexp package allocates them with malloc and wfree might
// end up doing something clever.
104
105 /* extern "C" void free(void *p); */
106
// run() and run_match() report EST_Regex_max_subexpressions start and
// end positions taken from the compiled expression, so our limit has to
// agree with the regexp package's NSUBEXP.
#if EST_Regex_max_subexpressions != NSUBEXP
#  error "EST_Regex_max_subexpressions must be equal to NSUBEXP"
#endif
110
EST_Regex(void)111 EST_Regex::EST_Regex(void) : EST_String()
112 {
113 compiled = NULL;
114 compiled_match = NULL;
115 }
116
EST_Regex(const char * s)117 EST_Regex::EST_Regex(const char *s) : EST_String(s)
118
119 {
120
121 compiled = NULL;
122 compiled_match = NULL;
123
124 }
125
EST_Regex(EST_String s)126 EST_Regex::EST_Regex(EST_String s) : EST_String(s)
127
128 {
129 compiled = NULL;
130 compiled_match = NULL;
131 }
132
// Copy an expression.  Only the text is copied: the compiled forms of
// `ex` are neither shared nor duplicated, the copy builds its own when
// they are first needed.

EST_Regex::EST_Regex(const EST_Regex &ex)
    : EST_String(ex),
      compiled(NULL),
      compiled_match(NULL)
{
}
138
139
~EST_Regex()140 EST_Regex::~EST_Regex()
141 {
142 if (compiled_match)
143 free(compiled_match);
144 if (compiled)
145 free(compiled);
146 }
147
148 // Convert a regular expression from the external syntax (defined by the
149 // the FSF library) to the one expected by the regexp routines (which
150 // say it's V8 syntax).
151
regularize(int match) const152 char *EST_Regex::regularize(int match) const
153 {
154 char *reg = walloc(char, size()*2+3);
155 char *r=reg;
156 const char *e;
157 int magic=0,last_was_bs=0;
158 const char * in_brackets=NULL;
159 const char *ex = (size()==0)?"":str();
160
161 if (match && *ex != '^')
162 *(r++) = '^';
163
164 for(e=ex; *e ; e++)
165 {
166 if (*e == '\\' && !last_was_bs)
167 {
168 last_was_bs=1;
169 continue;
170 }
171
172 magic=strchr((last_was_bs?fsf_magic_backslashed:fsf_magic), *e)!=NULL;
173
174 if (in_brackets)
175 {
176 *(r++) = *e;
177 if (*e == ']' && (e-in_brackets)>1)
178 in_brackets=0;
179 }
180 else if (magic)
181 {
182 if (strchr(spencer_magic_backslashed, *e))
183 *(r++) = '\\';
184
185 *(r++) = *e;
186 if (*e == '[')
187 in_brackets=e;
188 }
189 else
190 {
191 if (strchr(spencer_magic, *e))
192 *(r++) = '\\';
193
194 *(r++) = *e;
195 }
196 last_was_bs=0;
197 }
198
199 if (match && (e==ex || *(e-1) != '$'))
200 {
201 if (last_was_bs)
202 *(r++) = '\\';
203 *(r++) = '$';
204 }
205
206 *r='\0';
207
208 // cerr<<"reg||"<<ex<<"||"<<reg<<"\n";
209
210 return reg;
211 }
212
compile()213 void EST_Regex::compile()
214 {
215 if (!compiled)
216 {
217 char *reg=regularize(0);
218 void * t =(void *)hs_regcomp(reg);
219 compiled=t;
220 wfree(reg);
221 }
222
223 if (!compiled)
224 cerr << "EST_Regex: can't compile '" << str() << "'\n";
225 }
226
compile_match()227 void EST_Regex::compile_match()
228 {
229 if (!compiled_match)
230 {
231 char *reg=regularize(1);
232
233 void * t =(void *)hs_regcomp(reg);
234 compiled_match=t;
235 wfree(reg);
236 }
237
238 if (!compiled_match)
239 cerr << "EST_Regex: can't compile '" << str() << "'\n";
240 }
241
run(const char * on,int from,int & start,int & end,int * starts,int * ends)242 int EST_Regex::run(const char *on, int from, int &start, int &end, int *starts, int *ends)
243 {
244
245 compile();
246
247 if (compiled && from <= (int)strlen(on))
248 {
249 if (hs_regexec((hs_regexp *)compiled, on+from))
250 {
251 hs_regexp *re = (hs_regexp *)compiled;
252
253 start = re->startp[0] - on;
254 end = re->endp[0]- on;
255
256 if (starts)
257 {
258 int i;
259 for (i=0; i<EST_Regex_max_subexpressions; i++)
260 starts[i] = re->startp[i]?(re->startp[i] - on):-1;
261 }
262 if (ends)
263 {
264 int i;
265 for (i=0; i<EST_Regex_max_subexpressions; i++)
266 ends[i] = re->endp[i]?(re->endp[i] - on):-1;
267 }
268
269 return 1;
270 }
271 }
272 return 0;
273 }
274
run_match(const char * on,int from,int * starts,int * ends)275 int EST_Regex::run_match(const char *on, int from, int *starts, int *ends)
276 {
277
278 compile_match();
279
280 hs_regexp *re = (hs_regexp *)compiled_match;
281
282 if (compiled_match && from <= (int)strlen(on))
283 if (hs_regexec(re, on+from))
284 {
285 if (starts)
286 {
287 int i;
288 for (i=0; i<EST_Regex_max_subexpressions; i++)
289 starts[i] = re->startp[i]?(re->startp[i] - on):-1;
290 }
291 if (ends)
292 {
293 int i;
294 for (i=0; i<EST_Regex_max_subexpressions; i++)
295 ends[i] = re->endp[i]?(re->endp[i] - on):-1;
296 }
297 return 1;
298 }
299
300 return 0;
301 }
302
// Take over the text of another expression.  The compiled forms held by
// this object belong to the old text, so they are released here and get
// rebuilt from the new text when next needed.  Setting the pointers to
// NULL without freeing them first would leak both forms on every
// assignment.  `ex` arrives by value, so assigning an expression to
// itself is safe.

EST_Regex &EST_Regex::operator = (const EST_Regex ex)
{
    ((EST_String &)(*this)) = (EST_String)ex;

    // The regexp package allocates with malloc, hence free; free(NULL)
    // does nothing, so forms that were never built need no check.
    free(compiled_match);
    free(compiled);
    compiled = NULL;
    compiled_match = NULL;

    return *this;
}
311
operator =(const EST_String s)312 EST_Regex &EST_Regex::operator = (const EST_String s)
313 {
314 ((EST_String &)(*this)) = s;
315 compiled = NULL;
316 compiled_match = NULL;
317
318 return *this;
319 }
320
operator =(const char * s)321 EST_Regex &EST_Regex::operator = (const char *s)
322 {
323 ((EST_String &)(*this)) = s;
324 compiled = NULL;
325 compiled_match = NULL;
326
327 return *this;
328 }
329
operator <<(ostream & s,const EST_Regex & str)330 ostream &operator << (ostream &s, const EST_Regex &str)
331 {
332 return s << (EST_String)str;
333 }
334
335