1 /* $Id: rx.c 398 2004-02-25 00:00:32Z dvd $ */
2
3 #include <string.h> /*strlen,strcpy,strcmp*/
4 #include <assert.h>
5 #include "u.h" /*u_get,u_strlen*/
6 #include "xmlc.h"
7 #include "m.h"
8 #include "s.h"
9 #include "ht.h"
10 #include "ll.h"
11 #include "er.h"
12 #include "rx.h"
13
14 #define LEN_P RX_LEN_P
15 #define PRIME_P RX_PRIME_P
16 #define LIM_P RX_LIM_P
17 #define LEN_2 RX_LEN_2
18 #define PRIME_2 RX_PRIME_2
19 #define LEN_R RX_LEN_R
20 #define PRIME_R RX_PRIME_R
21
22 #define R_AVG_SIZE 16
23
24 /* it is good to have few patterns when deltas are memoized */
25 #define P_ERROR 0
26 #define P_NOT_ALLOWED 1
27 #define P_EMPTY 2
28 #define P_CHOICE 3
29 #define P_GROUP 4
30 #define P_ONE_OR_MORE 5 /*+*/
31 #define P_EXCEPT 6 /*single-single*/
32 #define P_RANGE 7 /*lower,upper inclusive*/
33 #define P_CLASS 8 /*complement is .-*/
34 #define P_ANY 9
35 #define P_CHAR 10
36
37 #define P_SIZE 3
38 #define P_AVG_SIZE 2
39
40 static int p_size[]={1,1,1,3,3,2,3,3,2,1,2};
41
42 #define P_TYP(i) (pattern[i]&0xF)
43 #define P_IS(i,x) (x==P_TYP(i))
44 #define P_CHK(i,x) assert(P_IS(i,x))
45
46 #define P_unop(TYP,p,p1) P_CHK(p,TYP); p1=pattern[p+1]
47 #define P_binop(TYP,p,p1,p2) P_unop(TYP,p,p1); p2=pattern[p+2]
48 #define NotAllowed(p) P_CHK(p,P_NotAllowed)
49 #define Empty(p) P_CHK(p,P_Empty)
50 #define Any(p) P_CHK(p,P_Any)
51 #define Choice(p,p1,p2) P_binop(P_CHOICE,p,p1,p2)
52 #define Group(p,p1,p2) P_binop(P_GROUP,p,p1,p2)
53 #define OneOrMore(p,p1) P_unop(P_ONE_OR_MORE,p,p1)
54 #define Except(p,p1,p2) P_binop(P_EXCEPT,p,p1,p2)
55 #define Range(p,cf,cl) P_binop(P_RANGE,p,cf,cl)
56 #define Class(p,cn) P_unop(P_CLASS,p,cn)
57 #define Char(p,c) P_unop(P_CHAR,p,c)
58
59 #define P_NUL 0x100
60
61 #define setNullable(x) if(x) pattern[i_p]|=P_NUL
62 #define nullable(p) (pattern[p]&P_NUL)
63
64 int rx_compact=0;
65 /* 'compact' in drv and rx do different things.
66 In drv, it limits the size of the table of memoized deltas. In rx, it limits the size
67 of the buffer for cached regular expressions; memoized deltas are always limited by LIM_M,
68 since the whole repertoire of unicode characters can blow up the buffer.
69 */
70
71 static char *regex;
72 static int *pattern;
73 static int (*r2p)[2];
74 static struct hashtable ht_r,ht_p,ht_2;
75 static int i_p,len_p,i_r,len_r,i_2,len_2;
76 static int empty,notAllowed,any;
77
accept_p(void)78 static int accept_p(void) {
79 int j;
80 if((j=ht_get(&ht_p,i_p))==-1) {
81 ht_put(&ht_p,j=i_p);
82 i_p+=p_size[P_TYP(i_p)];
83 if(i_p+P_SIZE>len_p) pattern=(int*)m_stretch(pattern,len_p=2*(i_p+P_SIZE),i_p,sizeof(int));
84 }
85 return j;
86 }
87
88 #define P_NEW(x) (pattern[i_p]=x)
89
90 #define P_newunop(TYP,p1) P_NEW(TYP); pattern[i_p+1]=p1
91 #define P_newbinop(TYP,p1,p2) P_newunop(TYP,p1); pattern[i_p+2]=p2
newNotAllowed(void)92 static int newNotAllowed(void) {P_NEW(P_NOT_ALLOWED); return accept_p();}
newEmpty(void)93 static int newEmpty(void) {P_NEW(P_EMPTY); setNullable(1); return accept_p();}
newAny(void)94 static int newAny(void) {P_NEW(P_ANY); return accept_p();}
newChoice(int p1,int p2)95 static int newChoice(int p1,int p2) {P_newbinop(P_CHOICE,p1,p2); setNullable(nullable(p1)||nullable(p2)); return accept_p();}
newGroup(int p1,int p2)96 static int newGroup(int p1,int p2) {P_newbinop(P_GROUP,p1,p2); setNullable(nullable(p1)&&nullable(p2)); return accept_p();}
newOneOrMore(int p1)97 static int newOneOrMore(int p1) {P_newunop(P_ONE_OR_MORE,p1); setNullable(nullable(p1)); return accept_p();}
newExcept(int p1,int p2)98 static int newExcept(int p1,int p2) {P_newbinop(P_EXCEPT,p1,p2); return accept_p();}
newRange(int cf,int cl)99 static int newRange(int cf,int cl) {P_newbinop(P_RANGE,cf,cl); return accept_p();}
newClass(int cn)100 static int newClass(int cn) {P_newunop(P_CLASS,cn); return accept_p();}
newChar(int c)101 static int newChar(int c) {P_newunop(P_CHAR,c); return accept_p();}
102
one_or_more(int p)103 static int one_or_more(int p) {
104 if(P_IS(p,P_EMPTY)) return p;
105 if(P_IS(p,P_NOT_ALLOWED)) return p;
106 return newOneOrMore(p);
107 }
108
group(int p1,int p2)109 static int group(int p1,int p2) {
110 if(P_IS(p1,P_NOT_ALLOWED)) return p1;
111 if(P_IS(p2,P_NOT_ALLOWED)) return p2;
112 if(P_IS(p1,P_EMPTY)) return p2;
113 if(P_IS(p2,P_EMPTY)) return p1;
114 return newGroup(p1,p2);
115 }
116
samechoice(int p1,int p2)117 static int samechoice(int p1,int p2) {
118 if(P_IS(p1,P_CHOICE)) {
119 int p11,p12; Choice(p1,p11,p12);
120 return p12==p2||samechoice(p11,p2);
121 } else return p1==p2;
122 }
123
choice(int p1,int p2)124 static int choice(int p1,int p2) {
125 if(P_IS(p1,P_NOT_ALLOWED)) return p2;
126 if(P_IS(p2,P_NOT_ALLOWED)) return p1;
127 if(P_IS(p2,P_CHOICE)) {
128 int p21,p22; Choice(p2,p21,p22);
129 p1=choice(p1,p21); return choice(p1,p22);
130 }
131 if(samechoice(p1,p2)) return p1;
132 if(nullable(p1) && (P_IS(p2,P_EMPTY))) return p1;
133 if(nullable(p2) && (P_IS(p1,P_EMPTY))) return p2;
134 return newChoice(p1,p2);
135 }
136
cls(int cn)137 static int cls(int cn) {
138 if(cn<0) return newExcept(any,newClass(-cn));
139 if(cn==0) return notAllowed;
140 return newClass(cn);
141 }
142
equal_r(int r1,int r2)143 static int equal_r(int r1,int r2) {return strcmp(regex+r1,regex+r2)==0;}
hash_r(int r)144 static int hash_r(int r) {return s_hval(regex+r);}
145
equal_p(int p1,int p2)146 static int equal_p(int p1,int p2) {
147 int *pp1=pattern+p1,*pp2=pattern+p2;
148 if(P_TYP(p1)!=P_TYP(p2)) return 0;
149 switch(p_size[P_TYP(p1)]) {
150 case 3: if(pp1[2]!=pp2[2]) return 0;
151 case 2: if(pp1[1]!=pp2[1]) return 0;
152 case 1: return 1;
153 default: assert(0);
154 }
155 return 0;
156 }
hash_p(int p)157 static int hash_p(int p) {
158 int *pp=pattern+p; int h=0;
159 switch(p_size[P_TYP(p)]) {
160 case 1: h=pp[0]&0xF; break;
161 case 2: h=(pp[0]&0xF)|(pp[1]<<4); break;
162 case 3: h=(pp[0]&0xF)|((pp[1]^pp[2])<<4); break;
163 default: assert(0);
164 }
165 return h*PRIME_P;
166 }
167
equal_2(int x1,int x2)168 static int equal_2(int x1,int x2) {return r2p[x1][0]==r2p[x2][0];}
hash_2(int x)169 static int hash_2(int x) {return r2p[x][0]*PRIME_2;}
170
add_r(char * rx)171 static int add_r(char *rx) {
172 int len=strlen(rx)+1;
173 if(i_r+len>len_r) regex=(char*)m_stretch(regex,len_r=2*(i_r+len),i_r,sizeof(char));
174 strcpy(regex+i_r,rx);
175 return len;
176 }
177
178 #define ERRPOS
179
180 #define err(msg) (*er_vprintf)(msg" in \"%s\" at offset %i\n",ap)
rx_default_verror_handler(int erno,va_list ap)181 void rx_default_verror_handler(int erno,va_list ap) {
182 (*er_printf)("regular expressions: ");
183 switch(erno) {
184 case RX_ER_BADCH: err("bad character"); break;
185 case RX_ER_UNFIN: err("unfinished expression"); break;
186 case RX_ER_NOLSQ: err("'[' expected"); break;
187 case RX_ER_NORSQ: err("']' expected"); break;
188 case RX_ER_NOLCU: err("'{' expected"); break;
189 case RX_ER_NORCU: err("'}' expected"); break;
190 case RX_ER_NOLPA: err("'(' expected"); break;
191 case RX_ER_NORPA: err("')' expected"); break;
192 case RX_ER_BADCL: err("unknown class"); break;
193 case RX_ER_NODGT: err("digit expected"); break;
194 case RX_ER_DNUOB: err("reversed bounds"); break;
195 case RX_ER_NOTRC: err("range or class expected"); break;
196 default: assert(0);
197 }
198 }
199
200 void (*rx_verror_handler)(int erno,va_list ap)=&rx_default_verror_handler;
201
error_handler(int erno,...)202 static void error_handler(int erno,...) {
203 va_list ap; va_start(ap,erno); (*rx_verror_handler)(erno,ap); va_end(ap);
204 }
205
206 #define LEN_M RX_LEN_M
207 #define PRIME_M RX_PRIME_M
208 #define LIM_M RX_LIM_M
209
210 #define M_SIZE 3
211
212 #define M_SET(p) memo[i_m][M_SIZE-1]=p
213 #define M_RET(m) memo[m][M_SIZE-1]
214
215 static int (*memo)[M_SIZE];
216 static int i_m,len_m;
217 static struct hashtable ht_m;
218
new_memo(int p,int c)219 static int new_memo(int p,int c) {
220 int *me=memo[i_m];
221 ht_deli(&ht_m,i_m);
222 me[0]=p; me[1]=c;
223 return ht_get(&ht_m,i_m);
224 }
225
equal_m(int m1,int m2)226 static int equal_m(int m1,int m2) {
227 int *me1=memo[m1],*me2=memo[m2];
228 return (me1[0]==me2[0])&&(me1[1]==me2[1]);
229 }
hash_m(int m)230 static int hash_m(int m) {
231 int *me=memo[m];
232 return (me[0]^me[1])*PRIME_M;
233 }
234
accept_m(void)235 static void accept_m(void) {
236 if(ht_get(&ht_m,i_m)!=-1) ht_del(&ht_m,i_m);
237 ht_put(&ht_m,i_m++);
238 if(i_m>=LIM_M) i_m=0;
239 if(i_m==len_m) memo=(int(*)[M_SIZE])m_stretch(memo,len_m=i_m*2,i_m,sizeof(int[M_SIZE]));
240 }
241
242 static void windup(void);
243 static int initialized=0;
rx_init(void)244 void rx_init(void) {
245 if(!initialized) { initialized=1;
246 pattern=(int *)m_alloc(len_p=P_AVG_SIZE*LEN_P,sizeof(int));
247 r2p=(int (*)[2])m_alloc(len_2=LEN_2,sizeof(int[2]));
248 regex=(char*)m_alloc(len_r=R_AVG_SIZE*LEN_R,sizeof(char));
249 memo=(int (*)[M_SIZE])m_alloc(len_m=LEN_M,sizeof(int[M_SIZE]));
250
251 ht_init(&ht_p,LEN_P,&hash_p,&equal_p);
252 ht_init(&ht_2,LEN_2,&hash_2,&equal_2);
253 ht_init(&ht_r,LEN_R,&hash_r,&equal_r);
254 ht_init(&ht_m,LEN_M,&hash_m,&equal_m);
255
256 windup();
257 }
258 }
259
rx_clear(void)260 void rx_clear(void) {
261 ht_clear(&ht_p); ht_clear(&ht_2); ht_clear(&ht_r); ht_clear(&ht_m);
262 windup();
263 }
264
windup(void)265 static void windup(void) {
266 i_p=i_r=i_2=i_m=0;
267 pattern[0]=P_ERROR; accept_p();
268 empty=newEmpty(); notAllowed=newNotAllowed(); any=newAny();
269 }
270
271 #define SYM_END 0
272 #define SYM_CLS 1
273 #define SYM_ESC 2
274 #define SYM_CHR 3
275
276 static int r0,ri,sym,val,errors;
277
error(int erno)278 static void error(int erno) {
279 if(!errors) error_handler(erno,regex+r0,u_strlen(regex+r0)-u_strlen(regex+ri));
280 ++errors;
281 }
282
283 #include "rx_cls_u.c"
284
chclass(void)285 static int chclass(void) {
286 int u,cl,rj;
287 ri+=u_get(&u,regex+ri);
288 if(u=='\0') {--ri; error(RX_ER_NOLCU); return 0;}
289 if(u!='{') {error(RX_ER_NOLCU); return 0;}
290 rj=ri;
291 for(;;) {
292 if(regex[rj]=='\0') {ri=rj; error(RX_ER_NORCU); return 0;}
293 if(regex[rj]=='}') {
294 if((cl=s_ntab(regex+ri,rj-ri,clstab,NUM_CLS_U))==NUM_CLS_U) {error(RX_ER_BADCL); cl=0;}
295 ri=rj+1;
296 return cl;
297 }
298 ++rj;
299 }
300 }
301
302 #define CLS_NL (NUM_CLS_U+1)
303 #define CLS_S (NUM_CLS_U+2)
304 #define CLS_I (NUM_CLS_U+3)
305 #define CLS_C (NUM_CLS_U+4)
306 #define CLS_W (NUM_CLS_U+5)
307 #define NUM_CLS (NUM_CLS_U+6)
308
getsym(void)309 static void getsym(void) {
310 int u;
311 if(regex[ri]=='\0') sym=SYM_END; else {
312 ri+=u_get(&u,regex+ri);
313 if(u=='\\') {
314 ri+=u_get(&u,regex+ri);
315 switch(u) {
316 case '\0': --ri; error(RX_ER_UNFIN); sym=SYM_END; break;
317 case 'p': sym=SYM_CLS; val=chclass(); break;
318 case 'P': sym=SYM_CLS; val=-chclass(); break;
319 case 's': sym=SYM_CLS; val=CLS_S; break;
320 case 'S': sym=SYM_CLS; val=-CLS_S; break;
321 case 'i': sym=SYM_CLS; val=CLS_I; break;
322 case 'I': sym=SYM_CLS; val=-CLS_I; break;
323 case 'c': sym=SYM_CLS; val=CLS_C; break;
324 case 'C': sym=SYM_CLS; val=-CLS_C; break;
325 case 'd': sym=SYM_CLS; val=CLS_U_Nd; break;
326 case 'D': sym=SYM_CLS; val=-CLS_U_Nd; break;
327 case 'w': sym=SYM_CLS; val=CLS_W; break;
328 case 'W': sym=SYM_CLS; val=-CLS_W; break;
329 case 'n': sym=SYM_ESC; val=0xA; break;
330 case 'r': sym=SYM_ESC; val=0xD; break;
331 case 't': sym=SYM_ESC; val=0x9; break;
332 case '\\': case '|': case '.': case '-': case '^': case '?': case '*': case '+':
333 case '{': case '}': case '[': case ']': case '(': case ')':
334 sym=SYM_ESC; val=u; break;
335 default: error(RX_ER_BADCH); sym=SYM_ESC; val=u; break;
336 }
337 } else {
338 switch(u) {
339 case '.': sym=SYM_CLS; val=-CLS_NL; break;
340 default: sym=SYM_CHR; val=u; break;
341 }
342 }
343 }
344 }
345
chk_get(int v,int erno)346 static void chk_get(int v,int erno) {if(sym!=SYM_CHR||val!=v) error(erno); getsym();}
347
348
349 #define chkrch(val) if((val)=='['||(val)==']'||(val)=='-') error(RX_ER_NOTRC)
350
chgroup(void)351 static int chgroup(void) {
352 int p=notAllowed,c;
353 for(;;) {
354 switch(sym) {
355 case SYM_CHR: chkrch(val);
356 case SYM_ESC: c=val; getsym();
357 if(sym==SYM_CHR&&val=='-') {
358 if(regex[ri]=='[') {
359 p=choice(p,newChar(c));
360 goto END_OF_GROUP;
361 } else {
362 getsym();
363 switch(sym) {
364 case SYM_CHR: chkrch(val);
365 case SYM_ESC: p=choice(p,newRange(c,val)); getsym(); break;
366 default: error(RX_ER_BADCH); getsym(); break;
367 }
368 }
369 } else {
370 p=choice(p,newChar(c));
371 }
372 break;
373 case SYM_CLS: p=choice(p,cls(val)); getsym(); break;
374 case SYM_END: error(RX_ER_NORSQ); goto END_OF_GROUP;
375 default: assert(0);
376 }
377 if(sym==SYM_CHR&&(val==']'||val=='-')) goto END_OF_GROUP;
378 }
379 END_OF_GROUP:;
380 return p;
381 }
382
chexpr(void)383 static int chexpr(void) {
384 int p;
385 if(sym==SYM_CHR&&val=='^') { getsym();
386 p=newExcept(any,chgroup());
387 } else {
388 p=chgroup();
389 }
390 if(sym==SYM_CHR&&val=='-') { getsym();
391 chk_get('[',RX_ER_NOLSQ); p=newExcept(p,chexpr()); chk_get(']',RX_ER_NORSQ);
392 }
393 return p;
394 }
395
396 static int expression(void);
atom(void)397 static int atom(void) {
398 int p=0;
399 switch(sym) {
400 case SYM_CHR:
401 switch(val) {
402 case '[': getsym(); p=chexpr(); chk_get(']',RX_ER_NORSQ); break;
403 case '(': getsym(); p=expression(); chk_get(')',RX_ER_NORPA); break;
404 case '{': case '?': case '*': case '+': case '|':
405 case ')': case ']': case '}': error(RX_ER_BADCH); getsym(); break;
406 default: p=newChar(val); getsym(); break;
407 }
408 break;
409 case SYM_ESC: p=newChar(val); getsym(); break;
410 case SYM_CLS: p=cls(val); getsym(); break;
411 default: error(RX_ER_BADCH); getsym(); break;
412 }
413 return p;
414 }
415
number(void)416 static int number(void) {
417 int n=0,m;
418 for(;;) {
419 if(sym!=SYM_CHR) goto END_OF_DIGITS;
420 switch(val) {
421 case '0': m=0; break;
422 case '1': m=1; break;
423 case '2': m=2; break;
424 case '3': m=3; break;
425 case '4': m=4; break;
426 case '5': m=5; break;
427 case '6': m=6; break;
428 case '7': m=7; break;
429 case '8': m=8; break;
430 case '9': m=9; break;
431 default: goto END_OF_DIGITS;
432 }
433 n=n*10+m;
434 getsym();
435 }
436 END_OF_DIGITS:;
437 return n;
438 }
439
quantifier(int p0)440 static int quantifier(int p0) {
441 int p=empty,n,n0;
442 n=n0=number();
443 while(n--) p=group(p,p0);
444 if(sym==SYM_CHR) {
445 if(val==',') {
446 getsym();
447 if(sym==SYM_CHR && val=='}') {
448 p=group(p,choice(empty,one_or_more(p0)));
449 } else {
450 n=number()-n0; if(n<0) {error(RX_ER_DNUOB); n=0;}
451 while(n--) p=group(p,choice(empty,p0));
452 }
453 }
454 } else error(RX_ER_NODGT);
455 return p;
456 }
457
piece(void)458 static int piece(void) {
459 int p;
460 p=atom();
461 if(sym==SYM_CHR) {
462 switch(val) {
463 case '{': getsym(); p=quantifier(p); chk_get('}',RX_ER_NOLCU); break;
464 case '?': getsym(); p=choice(empty,p); break;
465 case '*': getsym(); p=choice(empty,one_or_more(p)); break;
466 case '+': getsym(); p=one_or_more(p); break;
467 default: break;
468 }
469 }
470 return p;
471 }
472
branch(void)473 static int branch(void) {
474 int p;
475 p=empty;
476 while(!(sym==SYM_END||(sym==SYM_CHR&&(val=='|'||val==')')))) p=group(p,piece());
477 return p;
478 }
479
expression(void)480 static int expression(void) {
481 int p;
482 p=branch();
483 while(sym==SYM_CHR&&val=='|') {
484 getsym();
485 p=choice(p,branch());
486 }
487 return p;
488 }
489
bind(int r)490 static void bind(int r) {
491 r0=ri=r; sym=-1; errors=0;
492 getsym();
493 }
494
compile(char * rx)495 static int compile(char *rx) {
496 int r=0,p=0,d_r;
497 d_r=add_r(rx);
498 if((r=ht_get(&ht_r,i_r))==-1) {
499 if(rx_compact&&i_p>=P_AVG_SIZE*LIM_P) {rx_clear(); d_r=add_r(rx);}
500 ht_put(&ht_r,r=i_r);
501 i_r+=d_r;
502 bind(r); p=expression(); if(sym!=SYM_END) error(RX_ER_BADCH);
503 r2p[i_2][0]=r; r2p[i_2][1]=p;
504 ht_put(&ht_2,i_2++);
505 if(i_2==len_2) r2p=(int(*)[2])m_stretch(r2p,len_2=2*i_2,i_2,sizeof(int[2]));
506 } else {
507 r2p[i_2][0]=r;
508 p=r2p[ht_get(&ht_2,i_2)][1];
509 }
510 return p;
511 }
512
513 #include "rx_cls_ranges.c"
514
in_class(int c,int cn)515 static int in_class(int c,int cn) {
516 switch(cn) {
517 case 0: return 0;
518 case CLS_U_C: return in_class(c,CLS_U_Cc)||in_class(c,CLS_U_Cf)||in_class(c,CLS_U_Co);
519 case CLS_U_Cc: return u_in_ranges(c,CcRanges,sizeof(CcRanges)/sizeof(int[2]));
520 case CLS_U_Cf: return u_in_ranges(c,CfRanges,sizeof(CfRanges)/sizeof(int[2]));
521 case CLS_U_Co: return u_in_ranges(c,CoRanges,sizeof(CoRanges)/sizeof(int[2]));
522 case CLS_U_IsAlphabeticPresentationForms: return u_in_ranges(c,IsAlphabeticPresentationFormsRanges,sizeof(IsAlphabeticPresentationFormsRanges)/sizeof(int[2]));
523 case CLS_U_IsArabic: return u_in_ranges(c,IsArabicRanges,sizeof(IsArabicRanges)/sizeof(int[2]));
524 case CLS_U_IsArabicPresentationForms_A: return u_in_ranges(c,IsArabicPresentationForms_ARanges,sizeof(IsArabicPresentationForms_ARanges)/sizeof(int[2]));
525 case CLS_U_IsArabicPresentationForms_B: return u_in_ranges(c,IsArabicPresentationForms_BRanges,sizeof(IsArabicPresentationForms_BRanges)/sizeof(int[2]));
526 case CLS_U_IsArmenian: return u_in_ranges(c,IsArmenianRanges,sizeof(IsArmenianRanges)/sizeof(int[2]));
527 case CLS_U_IsArrows: return u_in_ranges(c,IsArrowsRanges,sizeof(IsArrowsRanges)/sizeof(int[2]));
528 case CLS_U_IsBasicLatin: return u_in_ranges(c,IsBasicLatinRanges,sizeof(IsBasicLatinRanges)/sizeof(int[2]));
529 case CLS_U_IsBengali: return u_in_ranges(c,IsBengaliRanges,sizeof(IsBengaliRanges)/sizeof(int[2]));
530 case CLS_U_IsBlockElements: return u_in_ranges(c,IsBlockElementsRanges,sizeof(IsBlockElementsRanges)/sizeof(int[2]));
531 case CLS_U_IsBopomofo: return u_in_ranges(c,IsBopomofoRanges,sizeof(IsBopomofoRanges)/sizeof(int[2]));
532 case CLS_U_IsBopomofoExtended: return u_in_ranges(c,IsBopomofoExtendedRanges,sizeof(IsBopomofoExtendedRanges)/sizeof(int[2]));
533 case CLS_U_IsBoxDrawing: return u_in_ranges(c,IsBoxDrawingRanges,sizeof(IsBoxDrawingRanges)/sizeof(int[2]));
534 case CLS_U_IsBraillePatterns: return u_in_ranges(c,IsBraillePatternsRanges,sizeof(IsBraillePatternsRanges)/sizeof(int[2]));
535 case CLS_U_IsByzantineMusicalSymbols: return u_in_ranges(c,IsByzantineMusicalSymbolsRanges,sizeof(IsByzantineMusicalSymbolsRanges)/sizeof(int[2]));
536 case CLS_U_IsCJKCompatibility: return u_in_ranges(c,IsCJKCompatibilityRanges,sizeof(IsCJKCompatibilityRanges)/sizeof(int[2]));
537 case CLS_U_IsCJKCompatibilityForms: return u_in_ranges(c,IsCJKCompatibilityFormsRanges,sizeof(IsCJKCompatibilityFormsRanges)/sizeof(int[2]));
538 case CLS_U_IsCJKCompatibilityIdeographs: return u_in_ranges(c,IsCJKCompatibilityIdeographsRanges,sizeof(IsCJKCompatibilityIdeographsRanges)/sizeof(int[2]));
539 case CLS_U_IsCJKCompatibilityIdeographsSupplement: return u_in_ranges(c,IsCJKCompatibilityIdeographsSupplementRanges,sizeof(IsCJKCompatibilityIdeographsSupplementRanges)/sizeof(int[2]));
540 case CLS_U_IsCJKRadicalsSupplement: return u_in_ranges(c,IsCJKRadicalsSupplementRanges,sizeof(IsCJKRadicalsSupplementRanges)/sizeof(int[2]));
541 case CLS_U_IsCJKSymbolsandPunctuation: return u_in_ranges(c,IsCJKSymbolsandPunctuationRanges,sizeof(IsCJKSymbolsandPunctuationRanges)/sizeof(int[2]));
542 case CLS_U_IsCJKUnifiedIdeographs: return u_in_ranges(c,IsCJKUnifiedIdeographsRanges,sizeof(IsCJKUnifiedIdeographsRanges)/sizeof(int[2]));
543 case CLS_U_IsCJKUnifiedIdeographsExtensionA: return u_in_ranges(c,IsCJKUnifiedIdeographsExtensionARanges,sizeof(IsCJKUnifiedIdeographsExtensionARanges)/sizeof(int[2]));
544 case CLS_U_IsCJKUnifiedIdeographsExtensionB: return u_in_ranges(c,IsCJKUnifiedIdeographsExtensionBRanges,sizeof(IsCJKUnifiedIdeographsExtensionBRanges)/sizeof(int[2]));
545 case CLS_U_IsCherokee: return u_in_ranges(c,IsCherokeeRanges,sizeof(IsCherokeeRanges)/sizeof(int[2]));
546 case CLS_U_IsCombiningDiacriticalMarks: return u_in_ranges(c,IsCombiningDiacriticalMarksRanges,sizeof(IsCombiningDiacriticalMarksRanges)/sizeof(int[2]));
547 case CLS_U_IsCombiningHalfMarks: return u_in_ranges(c,IsCombiningHalfMarksRanges,sizeof(IsCombiningHalfMarksRanges)/sizeof(int[2]));
548 case CLS_U_IsCombiningMarksforSymbols: return u_in_ranges(c,IsCombiningMarksforSymbolsRanges,sizeof(IsCombiningMarksforSymbolsRanges)/sizeof(int[2]));
549 case CLS_U_IsControlPictures: return u_in_ranges(c,IsControlPicturesRanges,sizeof(IsControlPicturesRanges)/sizeof(int[2]));
550 case CLS_U_IsCurrencySymbols: return u_in_ranges(c,IsCurrencySymbolsRanges,sizeof(IsCurrencySymbolsRanges)/sizeof(int[2]));
551 case CLS_U_IsCyrillic: return u_in_ranges(c,IsCyrillicRanges,sizeof(IsCyrillicRanges)/sizeof(int[2]));
552 case CLS_U_IsDeseret: return u_in_ranges(c,IsDeseretRanges,sizeof(IsDeseretRanges)/sizeof(int[2]));
553 case CLS_U_IsDevanagari: return u_in_ranges(c,IsDevanagariRanges,sizeof(IsDevanagariRanges)/sizeof(int[2]));
554 case CLS_U_IsDingbats: return u_in_ranges(c,IsDingbatsRanges,sizeof(IsDingbatsRanges)/sizeof(int[2]));
555 case CLS_U_IsEnclosedAlphanumerics: return u_in_ranges(c,IsEnclosedAlphanumericsRanges,sizeof(IsEnclosedAlphanumericsRanges)/sizeof(int[2]));
556 case CLS_U_IsEnclosedCJKLettersandMonths: return u_in_ranges(c,IsEnclosedCJKLettersandMonthsRanges,sizeof(IsEnclosedCJKLettersandMonthsRanges)/sizeof(int[2]));
557 case CLS_U_IsEthiopic: return u_in_ranges(c,IsEthiopicRanges,sizeof(IsEthiopicRanges)/sizeof(int[2]));
558 case CLS_U_IsGeneralPunctuation: return u_in_ranges(c,IsGeneralPunctuationRanges,sizeof(IsGeneralPunctuationRanges)/sizeof(int[2]));
559 case CLS_U_IsGeometricShapes: return u_in_ranges(c,IsGeometricShapesRanges,sizeof(IsGeometricShapesRanges)/sizeof(int[2]));
560 case CLS_U_IsGeorgian: return u_in_ranges(c,IsGeorgianRanges,sizeof(IsGeorgianRanges)/sizeof(int[2]));
561 case CLS_U_IsGothic: return u_in_ranges(c,IsGothicRanges,sizeof(IsGothicRanges)/sizeof(int[2]));
562 case CLS_U_IsGreek: return u_in_ranges(c,IsGreekRanges,sizeof(IsGreekRanges)/sizeof(int[2]));
563 case CLS_U_IsGreekExtended: return u_in_ranges(c,IsGreekExtendedRanges,sizeof(IsGreekExtendedRanges)/sizeof(int[2]));
564 case CLS_U_IsGujarati: return u_in_ranges(c,IsGujaratiRanges,sizeof(IsGujaratiRanges)/sizeof(int[2]));
565 case CLS_U_IsGurmukhi: return u_in_ranges(c,IsGurmukhiRanges,sizeof(IsGurmukhiRanges)/sizeof(int[2]));
566 case CLS_U_IsHalfwidthandFullwidthForms: return u_in_ranges(c,IsHalfwidthandFullwidthFormsRanges,sizeof(IsHalfwidthandFullwidthFormsRanges)/sizeof(int[2]));
567 case CLS_U_IsHangulCompatibilityJamo: return u_in_ranges(c,IsHangulCompatibilityJamoRanges,sizeof(IsHangulCompatibilityJamoRanges)/sizeof(int[2]));
568 case CLS_U_IsHangulJamo: return u_in_ranges(c,IsHangulJamoRanges,sizeof(IsHangulJamoRanges)/sizeof(int[2]));
569 case CLS_U_IsHangulSyllables: return u_in_ranges(c,IsHangulSyllablesRanges,sizeof(IsHangulSyllablesRanges)/sizeof(int[2]));
570 case CLS_U_IsHebrew: return u_in_ranges(c,IsHebrewRanges,sizeof(IsHebrewRanges)/sizeof(int[2]));
571 case CLS_U_IsHiragana: return u_in_ranges(c,IsHiraganaRanges,sizeof(IsHiraganaRanges)/sizeof(int[2]));
572 case CLS_U_IsIPAExtensions: return u_in_ranges(c,IsIPAExtensionsRanges,sizeof(IsIPAExtensionsRanges)/sizeof(int[2]));
573 case CLS_U_IsIdeographicDescriptionCharacters: return u_in_ranges(c,IsIdeographicDescriptionCharactersRanges,sizeof(IsIdeographicDescriptionCharactersRanges)/sizeof(int[2]));
574 case CLS_U_IsKanbun: return u_in_ranges(c,IsKanbunRanges,sizeof(IsKanbunRanges)/sizeof(int[2]));
575 case CLS_U_IsKangxiRadicals: return u_in_ranges(c,IsKangxiRadicalsRanges,sizeof(IsKangxiRadicalsRanges)/sizeof(int[2]));
576 case CLS_U_IsKannada: return u_in_ranges(c,IsKannadaRanges,sizeof(IsKannadaRanges)/sizeof(int[2]));
577 case CLS_U_IsKatakana: return u_in_ranges(c,IsKatakanaRanges,sizeof(IsKatakanaRanges)/sizeof(int[2]));
578 case CLS_U_IsKhmer: return u_in_ranges(c,IsKhmerRanges,sizeof(IsKhmerRanges)/sizeof(int[2]));
579 case CLS_U_IsLao: return u_in_ranges(c,IsLaoRanges,sizeof(IsLaoRanges)/sizeof(int[2]));
580 case CLS_U_IsLatin_1Supplement: return u_in_ranges(c,IsLatin_1SupplementRanges,sizeof(IsLatin_1SupplementRanges)/sizeof(int[2]));
581 case CLS_U_IsLatinExtended_A: return u_in_ranges(c,IsLatinExtended_ARanges,sizeof(IsLatinExtended_ARanges)/sizeof(int[2]));
582 case CLS_U_IsLatinExtended_B: return u_in_ranges(c,IsLatinExtended_BRanges,sizeof(IsLatinExtended_BRanges)/sizeof(int[2]));
583 case CLS_U_IsLatinExtendedAdditional: return u_in_ranges(c,IsLatinExtendedAdditionalRanges,sizeof(IsLatinExtendedAdditionalRanges)/sizeof(int[2]));
584 case CLS_U_IsLetterlikeSymbols: return u_in_ranges(c,IsLetterlikeSymbolsRanges,sizeof(IsLetterlikeSymbolsRanges)/sizeof(int[2]));
585 case CLS_U_IsMalayalam: return u_in_ranges(c,IsMalayalamRanges,sizeof(IsMalayalamRanges)/sizeof(int[2]));
586 case CLS_U_IsMathematicalAlphanumericSymbols: return u_in_ranges(c,IsMathematicalAlphanumericSymbolsRanges,sizeof(IsMathematicalAlphanumericSymbolsRanges)/sizeof(int[2]));
587 case CLS_U_IsMathematicalOperators: return u_in_ranges(c,IsMathematicalOperatorsRanges,sizeof(IsMathematicalOperatorsRanges)/sizeof(int[2]));
588 case CLS_U_IsMiscellaneousSymbols: return u_in_ranges(c,IsMiscellaneousSymbolsRanges,sizeof(IsMiscellaneousSymbolsRanges)/sizeof(int[2]));
589 case CLS_U_IsMiscellaneousTechnical: return u_in_ranges(c,IsMiscellaneousTechnicalRanges,sizeof(IsMiscellaneousTechnicalRanges)/sizeof(int[2]));
590 case CLS_U_IsMongolian: return u_in_ranges(c,IsMongolianRanges,sizeof(IsMongolianRanges)/sizeof(int[2]));
591 case CLS_U_IsMusicalSymbols: return u_in_ranges(c,IsMusicalSymbolsRanges,sizeof(IsMusicalSymbolsRanges)/sizeof(int[2]));
592 case CLS_U_IsMyanmar: return u_in_ranges(c,IsMyanmarRanges,sizeof(IsMyanmarRanges)/sizeof(int[2]));
593 case CLS_U_IsNumberForms: return u_in_ranges(c,IsNumberFormsRanges,sizeof(IsNumberFormsRanges)/sizeof(int[2]));
594 case CLS_U_IsOgham: return u_in_ranges(c,IsOghamRanges,sizeof(IsOghamRanges)/sizeof(int[2]));
595 case CLS_U_IsOldItalic: return u_in_ranges(c,IsOldItalicRanges,sizeof(IsOldItalicRanges)/sizeof(int[2]));
596 case CLS_U_IsOpticalCharacterRecognition: return u_in_ranges(c,IsOpticalCharacterRecognitionRanges,sizeof(IsOpticalCharacterRecognitionRanges)/sizeof(int[2]));
597 case CLS_U_IsOriya: return u_in_ranges(c,IsOriyaRanges,sizeof(IsOriyaRanges)/sizeof(int[2]));
598 case CLS_U_IsPrivateUse: return u_in_ranges(c,IsPrivateUseRanges,sizeof(IsPrivateUseRanges)/sizeof(int[2]));
599 case CLS_U_IsRunic: return u_in_ranges(c,IsRunicRanges,sizeof(IsRunicRanges)/sizeof(int[2]));
600 case CLS_U_IsSinhala: return u_in_ranges(c,IsSinhalaRanges,sizeof(IsSinhalaRanges)/sizeof(int[2]));
601 case CLS_U_IsSmallFormVariants: return u_in_ranges(c,IsSmallFormVariantsRanges,sizeof(IsSmallFormVariantsRanges)/sizeof(int[2]));
602 case CLS_U_IsSpacingModifierLetters: return u_in_ranges(c,IsSpacingModifierLettersRanges,sizeof(IsSpacingModifierLettersRanges)/sizeof(int[2]));
603 case CLS_U_IsSpecials: return u_in_ranges(c,IsSpecialsRanges,sizeof(IsSpecialsRanges)/sizeof(int[2]));
604 case CLS_U_IsSuperscriptsandSubscripts: return u_in_ranges(c,IsSuperscriptsandSubscriptsRanges,sizeof(IsSuperscriptsandSubscriptsRanges)/sizeof(int[2]));
605 case CLS_U_IsSyriac: return u_in_ranges(c,IsSyriacRanges,sizeof(IsSyriacRanges)/sizeof(int[2]));
606 case CLS_U_IsTags: return u_in_ranges(c,IsTagsRanges,sizeof(IsTagsRanges)/sizeof(int[2]));
607 case CLS_U_IsTamil: return u_in_ranges(c,IsTamilRanges,sizeof(IsTamilRanges)/sizeof(int[2]));
608 case CLS_U_IsTelugu: return u_in_ranges(c,IsTeluguRanges,sizeof(IsTeluguRanges)/sizeof(int[2]));
609 case CLS_U_IsThaana: return u_in_ranges(c,IsThaanaRanges,sizeof(IsThaanaRanges)/sizeof(int[2]));
610 case CLS_U_IsThai: return u_in_ranges(c,IsThaiRanges,sizeof(IsThaiRanges)/sizeof(int[2]));
611 case CLS_U_IsTibetan: return u_in_ranges(c,IsTibetanRanges,sizeof(IsTibetanRanges)/sizeof(int[2]));
612 case CLS_U_IsUnifiedCanadianAboriginalSyllabics: return u_in_ranges(c,IsUnifiedCanadianAboriginalSyllabicsRanges,sizeof(IsUnifiedCanadianAboriginalSyllabicsRanges)/sizeof(int[2]));
613 case CLS_U_IsYiRadicals: return u_in_ranges(c,IsYiRadicalsRanges,sizeof(IsYiRadicalsRanges)/sizeof(int[2]));
614 case CLS_U_IsYiSyllables: return u_in_ranges(c,IsYiSyllablesRanges,sizeof(IsYiSyllablesRanges)/sizeof(int[2]));
615 case CLS_U_L: return in_class(c,CLS_U_Ll)||in_class(c,CLS_U_Lm)||in_class(c,CLS_U_Lo)||in_class(c,CLS_U_Lt)||in_class(c,CLS_U_Lu);
616 case CLS_U_Ll: return u_in_ranges(c,LlRanges,sizeof(LlRanges)/sizeof(int[2]));
617 case CLS_U_Lm: return u_in_ranges(c,LmRanges,sizeof(LmRanges)/sizeof(int[2]));
618 case CLS_U_Lo: return u_in_ranges(c,LoRanges,sizeof(LoRanges)/sizeof(int[2]));
619 case CLS_U_Lt: return u_in_ranges(c,LtRanges,sizeof(LtRanges)/sizeof(int[2]));
620 case CLS_U_Lu: return u_in_ranges(c,LuRanges,sizeof(LuRanges)/sizeof(int[2]));
621 case CLS_U_M: return in_class(c,CLS_U_Mc)||in_class(c,CLS_U_Me)||in_class(c,CLS_U_Mn);
622 case CLS_U_Mc: return u_in_ranges(c,McRanges,sizeof(McRanges)/sizeof(int[2]));
623 case CLS_U_Me: return u_in_ranges(c,MeRanges,sizeof(MeRanges)/sizeof(int[2]));
624 case CLS_U_Mn: return u_in_ranges(c,MnRanges,sizeof(MnRanges)/sizeof(int[2]));
625 case CLS_U_N: return in_class(c,CLS_U_Nd)||in_class(c,CLS_U_Nl)||in_class(c,CLS_U_No);
626 case CLS_U_Nd: return u_in_ranges(c,NdRanges,sizeof(NdRanges)/sizeof(int[2]));
627 case CLS_U_Nl: return u_in_ranges(c,NlRanges,sizeof(NlRanges)/sizeof(int[2]));
628 case CLS_U_No: return u_in_ranges(c,NoRanges,sizeof(NoRanges)/sizeof(int[2]));
629 case CLS_U_P: return in_class(c,CLS_U_Pc)||in_class(c,CLS_U_Pd)||in_class(c,CLS_U_Pe)||in_class(c,CLS_U_Pf)||in_class(c,CLS_U_Pi)||in_class(c,CLS_U_Po)||in_class(c,CLS_U_Ps);
630 case CLS_U_Pc: return u_in_ranges(c,PcRanges,sizeof(PcRanges)/sizeof(int[2]));
631 case CLS_U_Pd: return u_in_ranges(c,PdRanges,sizeof(PdRanges)/sizeof(int[2]));
632 case CLS_U_Pe: return u_in_ranges(c,PeRanges,sizeof(PeRanges)/sizeof(int[2]));
633 case CLS_U_Pf: return u_in_ranges(c,PfRanges,sizeof(PfRanges)/sizeof(int[2]));
634 case CLS_U_Pi: return u_in_ranges(c,PiRanges,sizeof(PiRanges)/sizeof(int[2]));
635 case CLS_U_Po: return u_in_ranges(c,PoRanges,sizeof(PoRanges)/sizeof(int[2]));
636 case CLS_U_Ps: return u_in_ranges(c,PsRanges,sizeof(PsRanges)/sizeof(int[2]));
637 case CLS_U_S: return in_class(c,CLS_U_Sc)||in_class(c,CLS_U_Sk)||in_class(c,CLS_U_Sm)||in_class(c,CLS_U_So);
638 case CLS_U_Sc: return u_in_ranges(c,ScRanges,sizeof(ScRanges)/sizeof(int[2]));
639 case CLS_U_Sk: return u_in_ranges(c,SkRanges,sizeof(SkRanges)/sizeof(int[2]));
640 case CLS_U_Sm: return u_in_ranges(c,SmRanges,sizeof(SmRanges)/sizeof(int[2]));
641 case CLS_U_So: return u_in_ranges(c,SoRanges,sizeof(SoRanges)/sizeof(int[2]));
642 case CLS_U_Z: return in_class(c,CLS_U_Zl)||in_class(c,CLS_U_Zp)||in_class(c,CLS_U_Zs);
643 case CLS_U_Zl: return u_in_ranges(c,ZlRanges,sizeof(ZlRanges)/sizeof(int[2]));
644 case CLS_U_Zp: return u_in_ranges(c,ZpRanges,sizeof(ZpRanges)/sizeof(int[2]));
645 case CLS_U_Zs: return u_in_ranges(c,ZsRanges,sizeof(ZsRanges)/sizeof(int[2]));
646 case CLS_NL: return c=='\n'||c=='\r';
647 case CLS_S: return xmlc_white_space(c);
648 case CLS_I: return xmlc_base_char(c)||xmlc_ideographic(c)||c=='_'||c==':';
649 case CLS_C: return in_class(c,CLS_I)||xmlc_digit(c)||xmlc_combining_char(c)||xmlc_extender(c)||c=='.'||c=='-';
650 case CLS_W: return !(in_class(c,CLS_U_P)||in_class(c,CLS_U_Z)||in_class(c,CLS_U_C));
651 default: assert(0);
652 }
653 return 0;
654 }
655
656
drv(int p,int c)657 static int drv(int p,int c) {
658 int p1,p2,cf,cl,cn,ret,m;
659 assert(!P_IS(p,P_ERROR));
660 m=new_memo(p,c);
661 if(m!=-1) return M_RET(m);
662 switch(P_TYP(p)) {
663 case P_NOT_ALLOWED: case P_EMPTY: ret=notAllowed; break;
664 case P_CHOICE: Choice(p,p1,p2); ret=choice(drv(p1,c),drv(p2,c)); break;
665 case P_GROUP: Group(p,p1,p2); {int p11=group(drv(p1,c),p2); ret=nullable(p1)?choice(p11,drv(p2,c)):p11;} break;
666 case P_ONE_OR_MORE: OneOrMore(p,p1); ret=group(drv(p1,c),choice(empty,p)); break;
667 case P_EXCEPT: Except(p,p1,p2); ret=nullable(drv(p1,c))&&!nullable(drv(p2,c))?empty:notAllowed; break;
668 case P_RANGE: Range(p,cf,cl); ret=cf<=c&&c<=cl?empty:notAllowed; break;
669 case P_CLASS: Class(p,cn); ret=in_class(c,cn)?empty:notAllowed; break;
670 case P_ANY: ret=empty; break;
671 case P_CHAR: Char(p,cf); ret=c==cf?empty:notAllowed; break;
672 default: ret=0; assert(0);
673 }
674 new_memo(p,c); M_SET(ret);
675 accept_m();
676 return ret;
677 }
678
rx_check(char * rx)679 int rx_check(char *rx) {(void)compile(rx); return !errors;}
680
rx_match(char * rx,char * s,int n)681 int rx_match(char *rx,char *s,int n) {
682 int p=compile(rx);
683 if(!errors) {
684 char *end=s+n;
685 int u;
686 for(;;) {
687 if(p==notAllowed) return 0;
688 if(s==end) return nullable(p);
689 s+=u_get(&u,s);
690 p=drv(p,u);
691 }
692 } else return 0;
693 }
694
rx_rmatch(char * rx,char * s,int n)695 int rx_rmatch(char *rx,char *s,int n) {
696 int p=compile(rx);
697 if(!errors) {
698 char *end=s+n;
699 int u;
700 for(;;) {
701 if(p==notAllowed) return 0;
702 if(s==end) return nullable(p);
703 s+=u_get(&u,s);
704 if(xmlc_white_space(u)) u=' ';
705 p=drv(p,u);
706 }
707 } else return 0;
708 }
709
rx_cmatch(char * rx,char * s,int n)710 int rx_cmatch(char *rx,char *s,int n) {
711 int p=compile(rx);
712 if(!errors) {
713 char *end=s+n;
714 int u;
715 SKIP_SPACE: for(;;) {
716 if(s==end) return nullable(p);
717 s+=u_get(&u,s);
718 if(!xmlc_white_space(u)) break;
719 }
720 for(;;) {
721 if(p==notAllowed) return 0;
722 if(xmlc_white_space(u)) { u=' ';
723 p=drv(p,u);
724 if(p==notAllowed) {
725 for(;;) {
726 if(s==end) return 1;
727 s+=u_get(&u,s);
728 if(!xmlc_white_space(u)) return 0;
729 }
730 } else goto SKIP_SPACE;
731 }
732 p=drv(p,u);
733 if(s==end) goto SKIP_SPACE;
734 s+=u_get(&u,s);
735 }
736 } else return 0;
737 }
738
739