1 /* $Id: rx.c 398 2004-02-25 00:00:32Z dvd $ */
2 
3 #include <string.h> /*strlen,strcpy,strcmp*/
4 #include <assert.h>
5 #include "u.h" /*u_get,u_strlen*/
6 #include "xmlc.h"
7 #include "m.h"
8 #include "s.h"
9 #include "ht.h"
10 #include "ll.h"
11 #include "er.h"
12 #include "rx.h"
13 
14 #define LEN_P RX_LEN_P
15 #define PRIME_P RX_PRIME_P
16 #define LIM_P RX_LIM_P
17 #define LEN_2 RX_LEN_2
18 #define PRIME_2 RX_PRIME_2
19 #define LEN_R RX_LEN_R
20 #define PRIME_R RX_PRIME_R
21 
22 #define R_AVG_SIZE 16
23 
24 /* it is good to have few patterns when deltas are memoized */
25 #define P_ERROR 0
26 #define P_NOT_ALLOWED  1
27 #define P_EMPTY 2
28 #define P_CHOICE 3
29 #define P_GROUP 4
30 #define P_ONE_OR_MORE 5 /*+*/
31 #define P_EXCEPT 6 /*single-single*/
32 #define P_RANGE 7 /*lower,upper inclusive*/
33 #define P_CLASS 8 /*complement is .-*/
34 #define P_ANY 9
35 #define P_CHAR 10
36 
37 #define P_SIZE 3
38 #define P_AVG_SIZE 2
39 
40 static int p_size[]={1,1,1,3,3,2,3,3,2,1,2};
41 
42 #define P_TYP(i) (pattern[i]&0xF)
43 #define P_IS(i,x)  (x==P_TYP(i))
44 #define P_CHK(i,x)  assert(P_IS(i,x))
45 
46 #define P_unop(TYP,p,p1) P_CHK(p,TYP); p1=pattern[p+1]
47 #define P_binop(TYP,p,p1,p2) P_unop(TYP,p,p1); p2=pattern[p+2]
48 #define NotAllowed(p) P_CHK(p,P_NotAllowed)
49 #define Empty(p) P_CHK(p,P_Empty)
50 #define Any(p) P_CHK(p,P_Any)
51 #define Choice(p,p1,p2) P_binop(P_CHOICE,p,p1,p2)
52 #define Group(p,p1,p2) P_binop(P_GROUP,p,p1,p2)
53 #define OneOrMore(p,p1) P_unop(P_ONE_OR_MORE,p,p1)
54 #define Except(p,p1,p2) P_binop(P_EXCEPT,p,p1,p2)
55 #define Range(p,cf,cl) P_binop(P_RANGE,p,cf,cl)
56 #define Class(p,cn) P_unop(P_CLASS,p,cn)
57 #define Char(p,c) P_unop(P_CHAR,p,c)
58 
59 #define P_NUL 0x100
60 
61 #define setNullable(x) if(x) pattern[i_p]|=P_NUL
62 #define nullable(p) (pattern[p]&P_NUL)
63 
64 int rx_compact=0;
65 /* 'compact' in drv and rx do different things.
66  In drv, it limits the size of the table of memoized deltas. In rx, it limits the size
67  of the buffer for cached regular expressions; memoized deltas are always limited by LIM_M,
68  since the whole repertoire of unicode characters can blow up the buffer.
69  */
70 
71 static char *regex;
72 static int *pattern;
73 static int (*r2p)[2];
74 static struct hashtable ht_r,ht_p,ht_2;
75 static int i_p,len_p,i_r,len_r,i_2,len_2;
76 static int empty,notAllowed,any;
77 
accept_p(void)78 static int accept_p(void) {
79   int j;
80   if((j=ht_get(&ht_p,i_p))==-1) {
81     ht_put(&ht_p,j=i_p);
82     i_p+=p_size[P_TYP(i_p)];
83     if(i_p+P_SIZE>len_p) pattern=(int*)m_stretch(pattern,len_p=2*(i_p+P_SIZE),i_p,sizeof(int));
84   }
85   return j;
86 }
87 
88 #define P_NEW(x) (pattern[i_p]=x)
89 
90 #define P_newunop(TYP,p1) P_NEW(TYP); pattern[i_p+1]=p1
91 #define P_newbinop(TYP,p1,p2) P_newunop(TYP,p1); pattern[i_p+2]=p2
newNotAllowed(void)92 static int newNotAllowed(void) {P_NEW(P_NOT_ALLOWED); return accept_p();}
newEmpty(void)93 static int newEmpty(void) {P_NEW(P_EMPTY); setNullable(1); return accept_p();}
newAny(void)94 static int newAny(void) {P_NEW(P_ANY); return accept_p();}
newChoice(int p1,int p2)95 static int newChoice(int p1,int p2) {P_newbinop(P_CHOICE,p1,p2); setNullable(nullable(p1)||nullable(p2)); return accept_p();}
newGroup(int p1,int p2)96 static int newGroup(int p1,int p2) {P_newbinop(P_GROUP,p1,p2); setNullable(nullable(p1)&&nullable(p2)); return accept_p();}
newOneOrMore(int p1)97 static int newOneOrMore(int p1) {P_newunop(P_ONE_OR_MORE,p1); setNullable(nullable(p1)); return accept_p();}
newExcept(int p1,int p2)98 static int newExcept(int p1,int p2) {P_newbinop(P_EXCEPT,p1,p2); return accept_p();}
newRange(int cf,int cl)99 static int newRange(int cf,int cl) {P_newbinop(P_RANGE,cf,cl); return accept_p();}
newClass(int cn)100 static int newClass(int cn) {P_newunop(P_CLASS,cn); return accept_p();}
newChar(int c)101 static int newChar(int c) {P_newunop(P_CHAR,c); return accept_p();}
102 
one_or_more(int p)103 static int one_or_more(int p) {
104   if(P_IS(p,P_EMPTY)) return p;
105   if(P_IS(p,P_NOT_ALLOWED)) return p;
106   return newOneOrMore(p);
107 }
108 
group(int p1,int p2)109 static int group(int p1,int p2) {
110   if(P_IS(p1,P_NOT_ALLOWED)) return p1;
111   if(P_IS(p2,P_NOT_ALLOWED)) return p2;
112   if(P_IS(p1,P_EMPTY)) return p2;
113   if(P_IS(p2,P_EMPTY)) return p1;
114   return newGroup(p1,p2);
115 }
116 
samechoice(int p1,int p2)117 static int samechoice(int p1,int p2) {
118   if(P_IS(p1,P_CHOICE)) {
119     int p11,p12; Choice(p1,p11,p12);
120     return p12==p2||samechoice(p11,p2);
121   } else return p1==p2;
122 }
123 
choice(int p1,int p2)124 static int choice(int p1,int p2) {
125   if(P_IS(p1,P_NOT_ALLOWED)) return p2;
126   if(P_IS(p2,P_NOT_ALLOWED)) return p1;
127   if(P_IS(p2,P_CHOICE)) {
128     int p21,p22; Choice(p2,p21,p22);
129     p1=choice(p1,p21); return choice(p1,p22);
130   }
131   if(samechoice(p1,p2)) return p1;
132   if(nullable(p1) && (P_IS(p2,P_EMPTY))) return p1;
133   if(nullable(p2) && (P_IS(p1,P_EMPTY))) return p2;
134   return newChoice(p1,p2);
135 }
136 
cls(int cn)137 static int cls(int cn) {
138   if(cn<0) return newExcept(any,newClass(-cn));
139   if(cn==0) return notAllowed;
140   return newClass(cn);
141 }
142 
equal_r(int r1,int r2)143 static int equal_r(int r1,int r2) {return strcmp(regex+r1,regex+r2)==0;}
hash_r(int r)144 static int hash_r(int r) {return s_hval(regex+r);}
145 
equal_p(int p1,int p2)146 static int equal_p(int p1,int p2) {
147   int *pp1=pattern+p1,*pp2=pattern+p2;
148   if(P_TYP(p1)!=P_TYP(p2)) return 0;
149   switch(p_size[P_TYP(p1)]) {
150   case 3: if(pp1[2]!=pp2[2]) return 0;
151   case 2: if(pp1[1]!=pp2[1]) return 0;
152   case 1: return 1;
153   default: assert(0);
154   }
155   return 0;
156 }
hash_p(int p)157 static int hash_p(int p) {
158   int *pp=pattern+p; int h=0;
159   switch(p_size[P_TYP(p)]) {
160   case 1: h=pp[0]&0xF; break;
161   case 2: h=(pp[0]&0xF)|(pp[1]<<4); break;
162   case 3: h=(pp[0]&0xF)|((pp[1]^pp[2])<<4); break;
163   default: assert(0);
164   }
165   return h*PRIME_P;
166 }
167 
equal_2(int x1,int x2)168 static int equal_2(int x1,int x2) {return r2p[x1][0]==r2p[x2][0];}
hash_2(int x)169 static int hash_2(int x) {return r2p[x][0]*PRIME_2;}
170 
add_r(char * rx)171 static int add_r(char *rx) {
172   int len=strlen(rx)+1;
173   if(i_r+len>len_r) regex=(char*)m_stretch(regex,len_r=2*(i_r+len),i_r,sizeof(char));
174   strcpy(regex+i_r,rx);
175   return len;
176 }
177 
178 #define ERRPOS
179 
180 #define err(msg) (*er_vprintf)(msg" in \"%s\" at offset %i\n",ap)
rx_default_verror_handler(int erno,va_list ap)181 void rx_default_verror_handler(int erno,va_list ap) {
182   (*er_printf)("regular expressions: ");
183   switch(erno) {
184   case RX_ER_BADCH: err("bad character"); break;
185   case RX_ER_UNFIN: err("unfinished expression"); break;
186   case RX_ER_NOLSQ: err("'[' expected"); break;
187   case RX_ER_NORSQ: err("']' expected"); break;
188   case RX_ER_NOLCU: err("'{' expected"); break;
189   case RX_ER_NORCU: err("'}' expected"); break;
190   case RX_ER_NOLPA: err("'(' expected"); break;
191   case RX_ER_NORPA: err("')' expected"); break;
192   case RX_ER_BADCL: err("unknown class"); break;
193   case RX_ER_NODGT: err("digit expected"); break;
194   case RX_ER_DNUOB: err("reversed bounds"); break;
195   case RX_ER_NOTRC: err("range or class expected"); break;
196   default: assert(0);
197   }
198 }
199 
200 void (*rx_verror_handler)(int erno,va_list ap)=&rx_default_verror_handler;
201 
error_handler(int erno,...)202 static void error_handler(int erno,...) {
203   va_list ap; va_start(ap,erno); (*rx_verror_handler)(erno,ap); va_end(ap);
204 }
205 
206 #define LEN_M RX_LEN_M
207 #define PRIME_M RX_PRIME_M
208 #define LIM_M RX_LIM_M
209 
210 #define M_SIZE 3
211 
212 #define M_SET(p) memo[i_m][M_SIZE-1]=p
213 #define M_RET(m) memo[m][M_SIZE-1]
214 
215 static int (*memo)[M_SIZE];
216 static int i_m,len_m;
217 static struct hashtable ht_m;
218 
new_memo(int p,int c)219 static int new_memo(int p,int c) {
220   int *me=memo[i_m];
221   ht_deli(&ht_m,i_m);
222   me[0]=p; me[1]=c;
223   return ht_get(&ht_m,i_m);
224 }
225 
equal_m(int m1,int m2)226 static int equal_m(int m1,int m2) {
227   int *me1=memo[m1],*me2=memo[m2];
228   return (me1[0]==me2[0])&&(me1[1]==me2[1]);
229 }
hash_m(int m)230 static int hash_m(int m) {
231   int *me=memo[m];
232   return (me[0]^me[1])*PRIME_M;
233 }
234 
accept_m(void)235 static void accept_m(void) {
236   if(ht_get(&ht_m,i_m)!=-1) ht_del(&ht_m,i_m);
237   ht_put(&ht_m,i_m++);
238   if(i_m>=LIM_M) i_m=0;
239   if(i_m==len_m) memo=(int(*)[M_SIZE])m_stretch(memo,len_m=i_m*2,i_m,sizeof(int[M_SIZE]));
240 }
241 
242 static void windup(void);
243 static int initialized=0;
rx_init(void)244 void rx_init(void) {
245   if(!initialized) { initialized=1;
246     pattern=(int *)m_alloc(len_p=P_AVG_SIZE*LEN_P,sizeof(int));
247     r2p=(int (*)[2])m_alloc(len_2=LEN_2,sizeof(int[2]));
248     regex=(char*)m_alloc(len_r=R_AVG_SIZE*LEN_R,sizeof(char));
249     memo=(int (*)[M_SIZE])m_alloc(len_m=LEN_M,sizeof(int[M_SIZE]));
250 
251     ht_init(&ht_p,LEN_P,&hash_p,&equal_p);
252     ht_init(&ht_2,LEN_2,&hash_2,&equal_2);
253     ht_init(&ht_r,LEN_R,&hash_r,&equal_r);
254     ht_init(&ht_m,LEN_M,&hash_m,&equal_m);
255 
256     windup();
257   }
258 }
259 
rx_clear(void)260 void rx_clear(void) {
261   ht_clear(&ht_p); ht_clear(&ht_2); ht_clear(&ht_r); ht_clear(&ht_m);
262   windup();
263 }
264 
windup(void)265 static void windup(void) {
266   i_p=i_r=i_2=i_m=0;
267   pattern[0]=P_ERROR;  accept_p();
268   empty=newEmpty(); notAllowed=newNotAllowed(); any=newAny();
269 }
270 
271 #define SYM_END 0
272 #define SYM_CLS 1
273 #define SYM_ESC 2
274 #define SYM_CHR 3
275 
276 static int r0,ri,sym,val,errors;
277 
error(int erno)278 static void error(int erno) {
279   if(!errors) error_handler(erno,regex+r0,u_strlen(regex+r0)-u_strlen(regex+ri));
280   ++errors;
281 }
282 
283 #include "rx_cls_u.c"
284 
chclass(void)285 static int chclass(void) {
286   int u,cl,rj;
287   ri+=u_get(&u,regex+ri);
288   if(u=='\0') {--ri; error(RX_ER_NOLCU); return 0;}
289   if(u!='{') {error(RX_ER_NOLCU); return 0;}
290   rj=ri;
291   for(;;) {
292     if(regex[rj]=='\0') {ri=rj; error(RX_ER_NORCU); return 0;}
293     if(regex[rj]=='}') {
294       if((cl=s_ntab(regex+ri,rj-ri,clstab,NUM_CLS_U))==NUM_CLS_U) {error(RX_ER_BADCL); cl=0;}
295       ri=rj+1;
296       return cl;
297     }
298     ++rj;
299   }
300 }
301 
302 #define CLS_NL (NUM_CLS_U+1)
303 #define CLS_S (NUM_CLS_U+2)
304 #define CLS_I (NUM_CLS_U+3)
305 #define CLS_C (NUM_CLS_U+4)
306 #define CLS_W (NUM_CLS_U+5)
307 #define NUM_CLS (NUM_CLS_U+6)
308 
getsym(void)309 static void getsym(void) {
310   int u;
311   if(regex[ri]=='\0') sym=SYM_END; else {
312     ri+=u_get(&u,regex+ri);
313     if(u=='\\') {
314       ri+=u_get(&u,regex+ri);
315       switch(u) {
316       case '\0': --ri; error(RX_ER_UNFIN); sym=SYM_END; break;
317       case 'p': sym=SYM_CLS; val=chclass(); break;
318       case 'P': sym=SYM_CLS; val=-chclass(); break;
319       case 's': sym=SYM_CLS; val=CLS_S; break;
320       case 'S': sym=SYM_CLS; val=-CLS_S; break;
321       case 'i': sym=SYM_CLS; val=CLS_I; break;
322       case 'I': sym=SYM_CLS; val=-CLS_I; break;
323       case 'c': sym=SYM_CLS; val=CLS_C; break;
324       case 'C': sym=SYM_CLS; val=-CLS_C; break;
325       case 'd': sym=SYM_CLS; val=CLS_U_Nd; break;
326       case 'D': sym=SYM_CLS; val=-CLS_U_Nd; break;
327       case 'w': sym=SYM_CLS; val=CLS_W; break;
328       case 'W': sym=SYM_CLS; val=-CLS_W; break;
329       case 'n': sym=SYM_ESC; val=0xA; break;
330       case 'r': sym=SYM_ESC; val=0xD; break;
331       case 't': sym=SYM_ESC; val=0x9; break;
332       case '\\': case '|': case '.': case '-': case '^': case '?': case '*': case '+':
333       case '{': case '}': case '[': case ']': case '(': case ')':
334 	sym=SYM_ESC; val=u; break;
335       default: error(RX_ER_BADCH); sym=SYM_ESC; val=u; break;
336       }
337     } else {
338       switch(u) {
339       case '.': sym=SYM_CLS; val=-CLS_NL; break;
340       default: sym=SYM_CHR; val=u; break;
341       }
342     }
343   }
344 }
345 
chk_get(int v,int erno)346 static void chk_get(int v,int erno) {if(sym!=SYM_CHR||val!=v) error(erno); getsym();}
347 
348 
349 #define chkrch(val) if((val)=='['||(val)==']'||(val)=='-') error(RX_ER_NOTRC)
350 
chgroup(void)351 static int chgroup(void) {
352   int p=notAllowed,c;
353   for(;;) {
354     switch(sym) {
355     case SYM_CHR: chkrch(val);
356     case SYM_ESC: c=val; getsym();
357       if(sym==SYM_CHR&&val=='-') {
358 	if(regex[ri]=='[') {
359 	  p=choice(p,newChar(c));
360 	  goto END_OF_GROUP;
361 	} else {
362 	  getsym();
363 	  switch(sym) {
364 	  case SYM_CHR: chkrch(val);
365 	  case SYM_ESC: p=choice(p,newRange(c,val)); getsym(); break;
366 	  default: error(RX_ER_BADCH); getsym(); break;
367 	  }
368 	}
369       } else {
370 	p=choice(p,newChar(c));
371       }
372       break;
373     case SYM_CLS: p=choice(p,cls(val)); getsym(); break;
374     case SYM_END: error(RX_ER_NORSQ); goto END_OF_GROUP;
375     default: assert(0);
376     }
377     if(sym==SYM_CHR&&(val==']'||val=='-')) goto END_OF_GROUP;
378   }
379   END_OF_GROUP:;
380   return p;
381 }
382 
chexpr(void)383 static int chexpr(void) {
384   int p;
385   if(sym==SYM_CHR&&val=='^') { getsym();
386     p=newExcept(any,chgroup());
387   } else {
388     p=chgroup();
389   }
390   if(sym==SYM_CHR&&val=='-') { getsym();
391     chk_get('[',RX_ER_NOLSQ); p=newExcept(p,chexpr()); chk_get(']',RX_ER_NORSQ);
392   }
393   return p;
394 }
395 
396 static int expression(void);
atom(void)397 static int atom(void) {
398   int p=0;
399   switch(sym) {
400   case SYM_CHR:
401     switch(val) {
402     case '[': getsym(); p=chexpr(); chk_get(']',RX_ER_NORSQ); break;
403     case '(': getsym(); p=expression(); chk_get(')',RX_ER_NORPA); break;
404     case '{': case '?': case '*': case '+': case '|':
405     case ')': case ']': case '}': error(RX_ER_BADCH); getsym(); break;
406     default: p=newChar(val); getsym(); break;
407     }
408     break;
409   case SYM_ESC: p=newChar(val); getsym(); break;
410   case SYM_CLS: p=cls(val); getsym(); break;
411   default: error(RX_ER_BADCH); getsym(); break;
412   }
413   return p;
414 }
415 
number(void)416 static int number(void) {
417   int n=0,m;
418   for(;;) {
419     if(sym!=SYM_CHR) goto END_OF_DIGITS;
420     switch(val) {
421     case '0': m=0; break;
422     case '1': m=1; break;
423     case '2': m=2; break;
424     case '3': m=3; break;
425     case '4': m=4; break;
426     case '5': m=5; break;
427     case '6': m=6; break;
428     case '7': m=7; break;
429     case '8': m=8; break;
430     case '9': m=9; break;
431     default: goto END_OF_DIGITS;
432     }
433     n=n*10+m;
434     getsym();
435   }
436   END_OF_DIGITS:;
437   return n;
438 }
439 
quantifier(int p0)440 static int quantifier(int p0) {
441   int p=empty,n,n0;
442   n=n0=number();
443   while(n--) p=group(p,p0);
444   if(sym==SYM_CHR) {
445     if(val==',') {
446       getsym();
447       if(sym==SYM_CHR && val=='}') {
448 	p=group(p,choice(empty,one_or_more(p0)));
449       } else {
450 	n=number()-n0; if(n<0) {error(RX_ER_DNUOB); n=0;}
451 	while(n--) p=group(p,choice(empty,p0));
452       }
453     }
454   } else error(RX_ER_NODGT);
455   return p;
456 }
457 
piece(void)458 static int piece(void) {
459   int p;
460   p=atom();
461   if(sym==SYM_CHR) {
462     switch(val) {
463     case '{': getsym(); p=quantifier(p); chk_get('}',RX_ER_NOLCU); break;
464     case '?': getsym(); p=choice(empty,p); break;
465     case '*': getsym(); p=choice(empty,one_or_more(p)); break;
466     case '+': getsym(); p=one_or_more(p); break;
467     default: break;
468     }
469   }
470   return p;
471 }
472 
branch(void)473 static int branch(void) {
474   int p;
475   p=empty;
476   while(!(sym==SYM_END||(sym==SYM_CHR&&(val=='|'||val==')')))) p=group(p,piece());
477   return p;
478 }
479 
expression(void)480 static int expression(void) {
481   int p;
482   p=branch();
483   while(sym==SYM_CHR&&val=='|') {
484     getsym();
485     p=choice(p,branch());
486   }
487   return p;
488 }
489 
bind(int r)490 static void bind(int r) {
491   r0=ri=r; sym=-1; errors=0;
492   getsym();
493 }
494 
compile(char * rx)495 static int compile(char *rx) {
496   int r=0,p=0,d_r;
497   d_r=add_r(rx);
498   if((r=ht_get(&ht_r,i_r))==-1) {
499     if(rx_compact&&i_p>=P_AVG_SIZE*LIM_P) {rx_clear(); d_r=add_r(rx);}
500     ht_put(&ht_r,r=i_r);
501     i_r+=d_r;
502     bind(r); p=expression(); if(sym!=SYM_END) error(RX_ER_BADCH);
503     r2p[i_2][0]=r; r2p[i_2][1]=p;
504     ht_put(&ht_2,i_2++);
505     if(i_2==len_2) r2p=(int(*)[2])m_stretch(r2p,len_2=2*i_2,i_2,sizeof(int[2]));
506   } else {
507     r2p[i_2][0]=r;
508     p=r2p[ht_get(&ht_2,i_2)][1];
509   }
510   return p;
511 }
512 
513 #include "rx_cls_ranges.c"
514 
in_class(int c,int cn)515 static int in_class(int c,int cn) {
516   switch(cn) {
517   case 0: return 0;
518   case CLS_U_C: return in_class(c,CLS_U_Cc)||in_class(c,CLS_U_Cf)||in_class(c,CLS_U_Co);
519   case CLS_U_Cc: return u_in_ranges(c,CcRanges,sizeof(CcRanges)/sizeof(int[2]));
520   case CLS_U_Cf: return u_in_ranges(c,CfRanges,sizeof(CfRanges)/sizeof(int[2]));
521   case CLS_U_Co: return u_in_ranges(c,CoRanges,sizeof(CoRanges)/sizeof(int[2]));
522   case CLS_U_IsAlphabeticPresentationForms: return u_in_ranges(c,IsAlphabeticPresentationFormsRanges,sizeof(IsAlphabeticPresentationFormsRanges)/sizeof(int[2]));
523   case CLS_U_IsArabic: return u_in_ranges(c,IsArabicRanges,sizeof(IsArabicRanges)/sizeof(int[2]));
524   case CLS_U_IsArabicPresentationForms_A: return u_in_ranges(c,IsArabicPresentationForms_ARanges,sizeof(IsArabicPresentationForms_ARanges)/sizeof(int[2]));
525   case CLS_U_IsArabicPresentationForms_B: return u_in_ranges(c,IsArabicPresentationForms_BRanges,sizeof(IsArabicPresentationForms_BRanges)/sizeof(int[2]));
526   case CLS_U_IsArmenian: return u_in_ranges(c,IsArmenianRanges,sizeof(IsArmenianRanges)/sizeof(int[2]));
527   case CLS_U_IsArrows: return u_in_ranges(c,IsArrowsRanges,sizeof(IsArrowsRanges)/sizeof(int[2]));
528   case CLS_U_IsBasicLatin: return u_in_ranges(c,IsBasicLatinRanges,sizeof(IsBasicLatinRanges)/sizeof(int[2]));
529   case CLS_U_IsBengali: return u_in_ranges(c,IsBengaliRanges,sizeof(IsBengaliRanges)/sizeof(int[2]));
530   case CLS_U_IsBlockElements: return u_in_ranges(c,IsBlockElementsRanges,sizeof(IsBlockElementsRanges)/sizeof(int[2]));
531   case CLS_U_IsBopomofo: return u_in_ranges(c,IsBopomofoRanges,sizeof(IsBopomofoRanges)/sizeof(int[2]));
532   case CLS_U_IsBopomofoExtended: return u_in_ranges(c,IsBopomofoExtendedRanges,sizeof(IsBopomofoExtendedRanges)/sizeof(int[2]));
533   case CLS_U_IsBoxDrawing: return u_in_ranges(c,IsBoxDrawingRanges,sizeof(IsBoxDrawingRanges)/sizeof(int[2]));
534   case CLS_U_IsBraillePatterns: return u_in_ranges(c,IsBraillePatternsRanges,sizeof(IsBraillePatternsRanges)/sizeof(int[2]));
535   case CLS_U_IsByzantineMusicalSymbols: return u_in_ranges(c,IsByzantineMusicalSymbolsRanges,sizeof(IsByzantineMusicalSymbolsRanges)/sizeof(int[2]));
536   case CLS_U_IsCJKCompatibility: return u_in_ranges(c,IsCJKCompatibilityRanges,sizeof(IsCJKCompatibilityRanges)/sizeof(int[2]));
537   case CLS_U_IsCJKCompatibilityForms: return u_in_ranges(c,IsCJKCompatibilityFormsRanges,sizeof(IsCJKCompatibilityFormsRanges)/sizeof(int[2]));
538   case CLS_U_IsCJKCompatibilityIdeographs: return u_in_ranges(c,IsCJKCompatibilityIdeographsRanges,sizeof(IsCJKCompatibilityIdeographsRanges)/sizeof(int[2]));
539   case CLS_U_IsCJKCompatibilityIdeographsSupplement: return u_in_ranges(c,IsCJKCompatibilityIdeographsSupplementRanges,sizeof(IsCJKCompatibilityIdeographsSupplementRanges)/sizeof(int[2]));
540   case CLS_U_IsCJKRadicalsSupplement: return u_in_ranges(c,IsCJKRadicalsSupplementRanges,sizeof(IsCJKRadicalsSupplementRanges)/sizeof(int[2]));
541   case CLS_U_IsCJKSymbolsandPunctuation: return u_in_ranges(c,IsCJKSymbolsandPunctuationRanges,sizeof(IsCJKSymbolsandPunctuationRanges)/sizeof(int[2]));
542   case CLS_U_IsCJKUnifiedIdeographs: return u_in_ranges(c,IsCJKUnifiedIdeographsRanges,sizeof(IsCJKUnifiedIdeographsRanges)/sizeof(int[2]));
543   case CLS_U_IsCJKUnifiedIdeographsExtensionA: return u_in_ranges(c,IsCJKUnifiedIdeographsExtensionARanges,sizeof(IsCJKUnifiedIdeographsExtensionARanges)/sizeof(int[2]));
544   case CLS_U_IsCJKUnifiedIdeographsExtensionB: return u_in_ranges(c,IsCJKUnifiedIdeographsExtensionBRanges,sizeof(IsCJKUnifiedIdeographsExtensionBRanges)/sizeof(int[2]));
545   case CLS_U_IsCherokee: return u_in_ranges(c,IsCherokeeRanges,sizeof(IsCherokeeRanges)/sizeof(int[2]));
546   case CLS_U_IsCombiningDiacriticalMarks: return u_in_ranges(c,IsCombiningDiacriticalMarksRanges,sizeof(IsCombiningDiacriticalMarksRanges)/sizeof(int[2]));
547   case CLS_U_IsCombiningHalfMarks: return u_in_ranges(c,IsCombiningHalfMarksRanges,sizeof(IsCombiningHalfMarksRanges)/sizeof(int[2]));
548   case CLS_U_IsCombiningMarksforSymbols: return u_in_ranges(c,IsCombiningMarksforSymbolsRanges,sizeof(IsCombiningMarksforSymbolsRanges)/sizeof(int[2]));
549   case CLS_U_IsControlPictures: return u_in_ranges(c,IsControlPicturesRanges,sizeof(IsControlPicturesRanges)/sizeof(int[2]));
550   case CLS_U_IsCurrencySymbols: return u_in_ranges(c,IsCurrencySymbolsRanges,sizeof(IsCurrencySymbolsRanges)/sizeof(int[2]));
551   case CLS_U_IsCyrillic: return u_in_ranges(c,IsCyrillicRanges,sizeof(IsCyrillicRanges)/sizeof(int[2]));
552   case CLS_U_IsDeseret: return u_in_ranges(c,IsDeseretRanges,sizeof(IsDeseretRanges)/sizeof(int[2]));
553   case CLS_U_IsDevanagari: return u_in_ranges(c,IsDevanagariRanges,sizeof(IsDevanagariRanges)/sizeof(int[2]));
554   case CLS_U_IsDingbats: return u_in_ranges(c,IsDingbatsRanges,sizeof(IsDingbatsRanges)/sizeof(int[2]));
555   case CLS_U_IsEnclosedAlphanumerics: return u_in_ranges(c,IsEnclosedAlphanumericsRanges,sizeof(IsEnclosedAlphanumericsRanges)/sizeof(int[2]));
556   case CLS_U_IsEnclosedCJKLettersandMonths: return u_in_ranges(c,IsEnclosedCJKLettersandMonthsRanges,sizeof(IsEnclosedCJKLettersandMonthsRanges)/sizeof(int[2]));
557   case CLS_U_IsEthiopic: return u_in_ranges(c,IsEthiopicRanges,sizeof(IsEthiopicRanges)/sizeof(int[2]));
558   case CLS_U_IsGeneralPunctuation: return u_in_ranges(c,IsGeneralPunctuationRanges,sizeof(IsGeneralPunctuationRanges)/sizeof(int[2]));
559   case CLS_U_IsGeometricShapes: return u_in_ranges(c,IsGeometricShapesRanges,sizeof(IsGeometricShapesRanges)/sizeof(int[2]));
560   case CLS_U_IsGeorgian: return u_in_ranges(c,IsGeorgianRanges,sizeof(IsGeorgianRanges)/sizeof(int[2]));
561   case CLS_U_IsGothic: return u_in_ranges(c,IsGothicRanges,sizeof(IsGothicRanges)/sizeof(int[2]));
562   case CLS_U_IsGreek: return u_in_ranges(c,IsGreekRanges,sizeof(IsGreekRanges)/sizeof(int[2]));
563   case CLS_U_IsGreekExtended: return u_in_ranges(c,IsGreekExtendedRanges,sizeof(IsGreekExtendedRanges)/sizeof(int[2]));
564   case CLS_U_IsGujarati: return u_in_ranges(c,IsGujaratiRanges,sizeof(IsGujaratiRanges)/sizeof(int[2]));
565   case CLS_U_IsGurmukhi: return u_in_ranges(c,IsGurmukhiRanges,sizeof(IsGurmukhiRanges)/sizeof(int[2]));
566   case CLS_U_IsHalfwidthandFullwidthForms: return u_in_ranges(c,IsHalfwidthandFullwidthFormsRanges,sizeof(IsHalfwidthandFullwidthFormsRanges)/sizeof(int[2]));
567   case CLS_U_IsHangulCompatibilityJamo: return u_in_ranges(c,IsHangulCompatibilityJamoRanges,sizeof(IsHangulCompatibilityJamoRanges)/sizeof(int[2]));
568   case CLS_U_IsHangulJamo: return u_in_ranges(c,IsHangulJamoRanges,sizeof(IsHangulJamoRanges)/sizeof(int[2]));
569   case CLS_U_IsHangulSyllables: return u_in_ranges(c,IsHangulSyllablesRanges,sizeof(IsHangulSyllablesRanges)/sizeof(int[2]));
570   case CLS_U_IsHebrew: return u_in_ranges(c,IsHebrewRanges,sizeof(IsHebrewRanges)/sizeof(int[2]));
571   case CLS_U_IsHiragana: return u_in_ranges(c,IsHiraganaRanges,sizeof(IsHiraganaRanges)/sizeof(int[2]));
572   case CLS_U_IsIPAExtensions: return u_in_ranges(c,IsIPAExtensionsRanges,sizeof(IsIPAExtensionsRanges)/sizeof(int[2]));
573   case CLS_U_IsIdeographicDescriptionCharacters: return u_in_ranges(c,IsIdeographicDescriptionCharactersRanges,sizeof(IsIdeographicDescriptionCharactersRanges)/sizeof(int[2]));
574   case CLS_U_IsKanbun: return u_in_ranges(c,IsKanbunRanges,sizeof(IsKanbunRanges)/sizeof(int[2]));
575   case CLS_U_IsKangxiRadicals: return u_in_ranges(c,IsKangxiRadicalsRanges,sizeof(IsKangxiRadicalsRanges)/sizeof(int[2]));
576   case CLS_U_IsKannada: return u_in_ranges(c,IsKannadaRanges,sizeof(IsKannadaRanges)/sizeof(int[2]));
577   case CLS_U_IsKatakana: return u_in_ranges(c,IsKatakanaRanges,sizeof(IsKatakanaRanges)/sizeof(int[2]));
578   case CLS_U_IsKhmer: return u_in_ranges(c,IsKhmerRanges,sizeof(IsKhmerRanges)/sizeof(int[2]));
579   case CLS_U_IsLao: return u_in_ranges(c,IsLaoRanges,sizeof(IsLaoRanges)/sizeof(int[2]));
580   case CLS_U_IsLatin_1Supplement: return u_in_ranges(c,IsLatin_1SupplementRanges,sizeof(IsLatin_1SupplementRanges)/sizeof(int[2]));
581   case CLS_U_IsLatinExtended_A: return u_in_ranges(c,IsLatinExtended_ARanges,sizeof(IsLatinExtended_ARanges)/sizeof(int[2]));
582   case CLS_U_IsLatinExtended_B: return u_in_ranges(c,IsLatinExtended_BRanges,sizeof(IsLatinExtended_BRanges)/sizeof(int[2]));
583   case CLS_U_IsLatinExtendedAdditional: return u_in_ranges(c,IsLatinExtendedAdditionalRanges,sizeof(IsLatinExtendedAdditionalRanges)/sizeof(int[2]));
584   case CLS_U_IsLetterlikeSymbols: return u_in_ranges(c,IsLetterlikeSymbolsRanges,sizeof(IsLetterlikeSymbolsRanges)/sizeof(int[2]));
585   case CLS_U_IsMalayalam: return u_in_ranges(c,IsMalayalamRanges,sizeof(IsMalayalamRanges)/sizeof(int[2]));
586   case CLS_U_IsMathematicalAlphanumericSymbols: return u_in_ranges(c,IsMathematicalAlphanumericSymbolsRanges,sizeof(IsMathematicalAlphanumericSymbolsRanges)/sizeof(int[2]));
587   case CLS_U_IsMathematicalOperators: return u_in_ranges(c,IsMathematicalOperatorsRanges,sizeof(IsMathematicalOperatorsRanges)/sizeof(int[2]));
588   case CLS_U_IsMiscellaneousSymbols: return u_in_ranges(c,IsMiscellaneousSymbolsRanges,sizeof(IsMiscellaneousSymbolsRanges)/sizeof(int[2]));
589   case CLS_U_IsMiscellaneousTechnical: return u_in_ranges(c,IsMiscellaneousTechnicalRanges,sizeof(IsMiscellaneousTechnicalRanges)/sizeof(int[2]));
590   case CLS_U_IsMongolian: return u_in_ranges(c,IsMongolianRanges,sizeof(IsMongolianRanges)/sizeof(int[2]));
591   case CLS_U_IsMusicalSymbols: return u_in_ranges(c,IsMusicalSymbolsRanges,sizeof(IsMusicalSymbolsRanges)/sizeof(int[2]));
592   case CLS_U_IsMyanmar: return u_in_ranges(c,IsMyanmarRanges,sizeof(IsMyanmarRanges)/sizeof(int[2]));
593   case CLS_U_IsNumberForms: return u_in_ranges(c,IsNumberFormsRanges,sizeof(IsNumberFormsRanges)/sizeof(int[2]));
594   case CLS_U_IsOgham: return u_in_ranges(c,IsOghamRanges,sizeof(IsOghamRanges)/sizeof(int[2]));
595   case CLS_U_IsOldItalic: return u_in_ranges(c,IsOldItalicRanges,sizeof(IsOldItalicRanges)/sizeof(int[2]));
596   case CLS_U_IsOpticalCharacterRecognition: return u_in_ranges(c,IsOpticalCharacterRecognitionRanges,sizeof(IsOpticalCharacterRecognitionRanges)/sizeof(int[2]));
597   case CLS_U_IsOriya: return u_in_ranges(c,IsOriyaRanges,sizeof(IsOriyaRanges)/sizeof(int[2]));
598   case CLS_U_IsPrivateUse: return u_in_ranges(c,IsPrivateUseRanges,sizeof(IsPrivateUseRanges)/sizeof(int[2]));
599   case CLS_U_IsRunic: return u_in_ranges(c,IsRunicRanges,sizeof(IsRunicRanges)/sizeof(int[2]));
600   case CLS_U_IsSinhala: return u_in_ranges(c,IsSinhalaRanges,sizeof(IsSinhalaRanges)/sizeof(int[2]));
601   case CLS_U_IsSmallFormVariants: return u_in_ranges(c,IsSmallFormVariantsRanges,sizeof(IsSmallFormVariantsRanges)/sizeof(int[2]));
602   case CLS_U_IsSpacingModifierLetters: return u_in_ranges(c,IsSpacingModifierLettersRanges,sizeof(IsSpacingModifierLettersRanges)/sizeof(int[2]));
603   case CLS_U_IsSpecials: return u_in_ranges(c,IsSpecialsRanges,sizeof(IsSpecialsRanges)/sizeof(int[2]));
604   case CLS_U_IsSuperscriptsandSubscripts: return u_in_ranges(c,IsSuperscriptsandSubscriptsRanges,sizeof(IsSuperscriptsandSubscriptsRanges)/sizeof(int[2]));
605   case CLS_U_IsSyriac: return u_in_ranges(c,IsSyriacRanges,sizeof(IsSyriacRanges)/sizeof(int[2]));
606   case CLS_U_IsTags: return u_in_ranges(c,IsTagsRanges,sizeof(IsTagsRanges)/sizeof(int[2]));
607   case CLS_U_IsTamil: return u_in_ranges(c,IsTamilRanges,sizeof(IsTamilRanges)/sizeof(int[2]));
608   case CLS_U_IsTelugu: return u_in_ranges(c,IsTeluguRanges,sizeof(IsTeluguRanges)/sizeof(int[2]));
609   case CLS_U_IsThaana: return u_in_ranges(c,IsThaanaRanges,sizeof(IsThaanaRanges)/sizeof(int[2]));
610   case CLS_U_IsThai: return u_in_ranges(c,IsThaiRanges,sizeof(IsThaiRanges)/sizeof(int[2]));
611   case CLS_U_IsTibetan: return u_in_ranges(c,IsTibetanRanges,sizeof(IsTibetanRanges)/sizeof(int[2]));
612   case CLS_U_IsUnifiedCanadianAboriginalSyllabics: return u_in_ranges(c,IsUnifiedCanadianAboriginalSyllabicsRanges,sizeof(IsUnifiedCanadianAboriginalSyllabicsRanges)/sizeof(int[2]));
613   case CLS_U_IsYiRadicals: return u_in_ranges(c,IsYiRadicalsRanges,sizeof(IsYiRadicalsRanges)/sizeof(int[2]));
614   case CLS_U_IsYiSyllables: return u_in_ranges(c,IsYiSyllablesRanges,sizeof(IsYiSyllablesRanges)/sizeof(int[2]));
615   case CLS_U_L: return in_class(c,CLS_U_Ll)||in_class(c,CLS_U_Lm)||in_class(c,CLS_U_Lo)||in_class(c,CLS_U_Lt)||in_class(c,CLS_U_Lu);
616   case CLS_U_Ll: return u_in_ranges(c,LlRanges,sizeof(LlRanges)/sizeof(int[2]));
617   case CLS_U_Lm: return u_in_ranges(c,LmRanges,sizeof(LmRanges)/sizeof(int[2]));
618   case CLS_U_Lo: return u_in_ranges(c,LoRanges,sizeof(LoRanges)/sizeof(int[2]));
619   case CLS_U_Lt: return u_in_ranges(c,LtRanges,sizeof(LtRanges)/sizeof(int[2]));
620   case CLS_U_Lu: return u_in_ranges(c,LuRanges,sizeof(LuRanges)/sizeof(int[2]));
621   case CLS_U_M: return in_class(c,CLS_U_Mc)||in_class(c,CLS_U_Me)||in_class(c,CLS_U_Mn);
622   case CLS_U_Mc: return u_in_ranges(c,McRanges,sizeof(McRanges)/sizeof(int[2]));
623   case CLS_U_Me: return u_in_ranges(c,MeRanges,sizeof(MeRanges)/sizeof(int[2]));
624   case CLS_U_Mn: return u_in_ranges(c,MnRanges,sizeof(MnRanges)/sizeof(int[2]));
625   case CLS_U_N: return in_class(c,CLS_U_Nd)||in_class(c,CLS_U_Nl)||in_class(c,CLS_U_No);
626   case CLS_U_Nd: return u_in_ranges(c,NdRanges,sizeof(NdRanges)/sizeof(int[2]));
627   case CLS_U_Nl: return u_in_ranges(c,NlRanges,sizeof(NlRanges)/sizeof(int[2]));
628   case CLS_U_No: return u_in_ranges(c,NoRanges,sizeof(NoRanges)/sizeof(int[2]));
629   case CLS_U_P: return in_class(c,CLS_U_Pc)||in_class(c,CLS_U_Pd)||in_class(c,CLS_U_Pe)||in_class(c,CLS_U_Pf)||in_class(c,CLS_U_Pi)||in_class(c,CLS_U_Po)||in_class(c,CLS_U_Ps);
630   case CLS_U_Pc: return u_in_ranges(c,PcRanges,sizeof(PcRanges)/sizeof(int[2]));
631   case CLS_U_Pd: return u_in_ranges(c,PdRanges,sizeof(PdRanges)/sizeof(int[2]));
632   case CLS_U_Pe: return u_in_ranges(c,PeRanges,sizeof(PeRanges)/sizeof(int[2]));
633   case CLS_U_Pf: return u_in_ranges(c,PfRanges,sizeof(PfRanges)/sizeof(int[2]));
634   case CLS_U_Pi: return u_in_ranges(c,PiRanges,sizeof(PiRanges)/sizeof(int[2]));
635   case CLS_U_Po: return u_in_ranges(c,PoRanges,sizeof(PoRanges)/sizeof(int[2]));
636   case CLS_U_Ps: return u_in_ranges(c,PsRanges,sizeof(PsRanges)/sizeof(int[2]));
637   case CLS_U_S: return in_class(c,CLS_U_Sc)||in_class(c,CLS_U_Sk)||in_class(c,CLS_U_Sm)||in_class(c,CLS_U_So);
638   case CLS_U_Sc: return u_in_ranges(c,ScRanges,sizeof(ScRanges)/sizeof(int[2]));
639   case CLS_U_Sk: return u_in_ranges(c,SkRanges,sizeof(SkRanges)/sizeof(int[2]));
640   case CLS_U_Sm: return u_in_ranges(c,SmRanges,sizeof(SmRanges)/sizeof(int[2]));
641   case CLS_U_So: return u_in_ranges(c,SoRanges,sizeof(SoRanges)/sizeof(int[2]));
642   case CLS_U_Z: return in_class(c,CLS_U_Zl)||in_class(c,CLS_U_Zp)||in_class(c,CLS_U_Zs);
643   case CLS_U_Zl: return u_in_ranges(c,ZlRanges,sizeof(ZlRanges)/sizeof(int[2]));
644   case CLS_U_Zp: return u_in_ranges(c,ZpRanges,sizeof(ZpRanges)/sizeof(int[2]));
645   case CLS_U_Zs: return u_in_ranges(c,ZsRanges,sizeof(ZsRanges)/sizeof(int[2]));
646   case CLS_NL: return c=='\n'||c=='\r';
647   case CLS_S: return xmlc_white_space(c);
648   case CLS_I: return xmlc_base_char(c)||xmlc_ideographic(c)||c=='_'||c==':';
649   case CLS_C: return in_class(c,CLS_I)||xmlc_digit(c)||xmlc_combining_char(c)||xmlc_extender(c)||c=='.'||c=='-';
650   case CLS_W: return !(in_class(c,CLS_U_P)||in_class(c,CLS_U_Z)||in_class(c,CLS_U_C));
651   default: assert(0);
652   }
653   return 0;
654 }
655 
656 
drv(int p,int c)657 static int drv(int p,int c) {
658   int p1,p2,cf,cl,cn,ret,m;
659   assert(!P_IS(p,P_ERROR));
660   m=new_memo(p,c);
661   if(m!=-1) return M_RET(m);
662   switch(P_TYP(p)) {
663   case P_NOT_ALLOWED: case P_EMPTY: ret=notAllowed; break;
664   case P_CHOICE: Choice(p,p1,p2); ret=choice(drv(p1,c),drv(p2,c)); break;
665   case P_GROUP: Group(p,p1,p2); {int p11=group(drv(p1,c),p2); ret=nullable(p1)?choice(p11,drv(p2,c)):p11;} break;
666   case P_ONE_OR_MORE: OneOrMore(p,p1); ret=group(drv(p1,c),choice(empty,p)); break;
667   case P_EXCEPT: Except(p,p1,p2); ret=nullable(drv(p1,c))&&!nullable(drv(p2,c))?empty:notAllowed; break;
668   case P_RANGE: Range(p,cf,cl); ret=cf<=c&&c<=cl?empty:notAllowed; break;
669   case P_CLASS: Class(p,cn); ret=in_class(c,cn)?empty:notAllowed; break;
670   case P_ANY: ret=empty; break;
671   case P_CHAR: Char(p,cf); ret=c==cf?empty:notAllowed; break;
672   default: ret=0; assert(0);
673   }
674   new_memo(p,c); M_SET(ret);
675   accept_m();
676   return ret;
677 }
678 
rx_check(char * rx)679 int rx_check(char *rx) {(void)compile(rx); return !errors;}
680 
rx_match(char * rx,char * s,int n)681 int rx_match(char *rx,char *s,int n) {
682   int p=compile(rx);
683   if(!errors) {
684     char *end=s+n;
685     int u;
686     for(;;) {
687       if(p==notAllowed) return 0;
688       if(s==end) return nullable(p);
689       s+=u_get(&u,s);
690       p=drv(p,u);
691     }
692   } else return 0;
693 }
694 
rx_rmatch(char * rx,char * s,int n)695 int rx_rmatch(char *rx,char *s,int n) {
696   int p=compile(rx);
697   if(!errors) {
698     char *end=s+n;
699     int u;
700     for(;;) {
701       if(p==notAllowed) return 0;
702       if(s==end) return nullable(p);
703       s+=u_get(&u,s);
704       if(xmlc_white_space(u)) u=' ';
705       p=drv(p,u);
706     }
707   } else return 0;
708 }
709 
rx_cmatch(char * rx,char * s,int n)710 int rx_cmatch(char *rx,char *s,int n) {
711   int p=compile(rx);
712   if(!errors) {
713     char *end=s+n;
714     int u;
715     SKIP_SPACE: for(;;) {
716       if(s==end) return nullable(p);
717       s+=u_get(&u,s);
718       if(!xmlc_white_space(u)) break;
719     }
720     for(;;) {
721       if(p==notAllowed) return 0;
722       if(xmlc_white_space(u)) { u=' ';
723 	p=drv(p,u);
724 	if(p==notAllowed) {
725 	  for(;;) {
726 	    if(s==end) return 1;
727 	    s+=u_get(&u,s);
728 	    if(!xmlc_white_space(u)) return 0;
729 	  }
730 	} else goto SKIP_SPACE;
731       }
732       p=drv(p,u);
733       if(s==end) goto SKIP_SPACE;
734       s+=u_get(&u,s);
735     }
736   } else return 0;
737 }
738 
739