1 
2 #include "lpeg.h"
3 
4 
5 /*
6 ** $Id: lpprint.c,v 1.7 2013/04/12 16:29:49 roberto Exp $
7 ** Copyright 2007, Lua.org & PUC-Rio  (see 'lpeg.html' for license)
8 */
9 
10 /* #include <ctype.h>*/
11 /* #include <limits.h>*/
12 /* #include <stdio.h>*/
13 
14 
15 /* #include "lptypes.h"*/
16 /* #include "lpprint.h"*/
17 /* #include "lpcode.h"*/
18 
19 
20 #if defined(LPEG_DEBUG)
21 
22 /*
23 ** {======================================================
24 ** Printing patterns (for debugging)
25 ** =======================================================
26 */
27 
28 
printcharset(const byte * st)29 void printcharset (const byte *st) {
30   int i;
31   printf("[");
32   for (i = 0; i <= UCHAR_MAX; i++) {
33     int first = i;
34     while (testchar(st, i) && i <= UCHAR_MAX) i++;
35     if (i - 1 == first)  /* unary range? */
36       printf("(%02x)", first);
37     else if (i - 1 > first)  /* non-empty range? */
38       printf("(%02x-%02x)", first, i - 1);
39   }
40   printf("]");
41 }
42 
43 
printcapkind(int kind)44 static void printcapkind (int kind) {
45   const char *const modes[] = {
46     "close", "position", "constant", "backref",
47     "argument", "simple", "table", "function",
48     "query", "string", "num", "substitution", "fold",
49     "runtime", "group"};
50   printf("%s", modes[kind]);
51 }
52 
53 
printjmp(const Instruction * op,const Instruction * p)54 static void printjmp (const Instruction *op, const Instruction *p) {
55   printf("-> %d", (int)(p + (p + 1)->offset - op));
56 }
57 
58 
printinst(const Instruction * op,const Instruction * p)59 static void printinst (const Instruction *op, const Instruction *p) {
60   const char *const names[] = {
61     "any", "char", "set",
62     "testany", "testchar", "testset",
63     "span", "behind",
64     "ret", "end",
65     "choice", "jmp", "call", "open_call",
66     "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup",
67      "fullcapture", "opencapture", "closecapture", "closeruntime"
68   };
69   printf("%02ld: %s ", (long)(p - op), names[p->i.code]);
70   switch ((Opcode)p->i.code) {
71     case IChar: {
72       printf("'%c'", p->i.aux);
73       break;
74     }
75     case ITestChar: {
76       printf("'%c'", p->i.aux); printjmp(op, p);
77       break;
78     }
79     case IFullCapture: {
80       printcapkind(getkind(p));
81       printf(" (size = %d)  (idx = %d)", getoff(p), p->i.key);
82       break;
83     }
84     case IOpenCapture: {
85       printcapkind(getkind(p));
86       printf(" (idx = %d)", p->i.key);
87       break;
88     }
89     case ISet: {
90       printcharset((p+1)->buff);
91       break;
92     }
93     case ITestSet: {
94       printcharset((p+2)->buff); printjmp(op, p);
95       break;
96     }
97     case ISpan: {
98       printcharset((p+1)->buff);
99       break;
100     }
101     case IOpenCall: {
102       printf("-> %d", (p + 1)->offset);
103       break;
104     }
105     case IBehind: {
106       printf("%d", p->i.aux);
107       break;
108     }
109     case IJmp: case ICall: case ICommit: case IChoice:
110     case IPartialCommit: case IBackCommit: case ITestAny: {
111       printjmp(op, p);
112       break;
113     }
114     default: break;
115   }
116   printf("\n");
117 }
118 
119 
printpatt(Instruction * p,int n)120 void printpatt (Instruction *p, int n) {
121   Instruction *op = p;
122   while (p < op + n) {
123     printinst(op, p);
124     p += sizei(p);
125   }
126 }
127 
128 
129 #if defined(LPEG_DEBUG)
printcap(Capture * cap)130 static void printcap (Capture *cap) {
131   printcapkind(cap->kind);
132   printf(" (idx: %d - size: %d) -> %p\n", cap->idx, cap->siz, cap->s);
133 }
134 
135 
printcaplist(Capture * cap,Capture * limit)136 void printcaplist (Capture *cap, Capture *limit) {
137   printf(">======\n");
138   for (; cap->s && (limit == NULL || cap < limit); cap++)
139     printcap(cap);
140   printf("=======\n");
141 }
142 #endif
143 
144 /* }====================================================== */
145 
146 
147 /*
148 ** {======================================================
149 ** Printing trees (for debugging)
150 ** =======================================================
151 */
152 
153 static const char *tagnames[] = {
154   "char", "set", "any",
155   "true", "false",
156   "rep",
157   "seq", "choice",
158   "not", "and",
159   "call", "opencall", "rule", "grammar",
160   "behind",
161   "capture", "run-time"
162 };
163 
164 
printtree(TTree * tree,int ident)165 void printtree (TTree *tree, int ident) {
166   int i;
167   for (i = 0; i < ident; i++) printf(" ");
168   printf("%s", tagnames[tree->tag]);
169   switch (tree->tag) {
170     case TChar: {
171       int c = tree->u.n;
172       if (isprint(c))
173         printf(" '%c'\n", c);
174       else
175         printf(" (%02X)\n", c);
176       break;
177     }
178     case TSet: {
179       printcharset(treebuffer(tree));
180       printf("\n");
181       break;
182     }
183     case TOpenCall: case TCall: {
184       printf(" key: %d\n", tree->key);
185       break;
186     }
187     case TBehind: {
188       printf(" %d\n", tree->u.n);
189         printtree(sib1(tree), ident + 2);
190       break;
191     }
192     case TCapture: {
193       printf(" cap: %d  key: %d  n: %d\n", tree->cap, tree->key, tree->u.n);
194       printtree(sib1(tree), ident + 2);
195       break;
196     }
197     case TRule: {
198       printf(" n: %d  key: %d\n", tree->cap, tree->key);
199       printtree(sib1(tree), ident + 2);
200       break;  /* do not print next rule as a sibling */
201     }
202     case TGrammar: {
203       TTree *rule = sib1(tree);
204       printf(" %d\n", tree->u.n);  /* number of rules */
205       for (i = 0; i < tree->u.n; i++) {
206         printtree(rule, ident + 2);
207         rule = sib2(rule);
208       }
209       assert(rule->tag == TTrue);  /* sentinel */
210       break;
211     }
212     default: {
213       int sibs = numsiblings[tree->tag];
214       printf("\n");
215       if (sibs >= 1) {
216         printtree(sib1(tree), ident + 2);
217         if (sibs >= 2)
218           printtree(sib2(tree), ident + 2);
219       }
220       break;
221     }
222   }
223 }
224 
225 
printktable(lua_State * L,int idx)226 void printktable (lua_State *L, int idx) {
227   int n, i;
228   lua_getfenv(L, idx);
229   if (lua_isnil(L, -1))  /* no ktable? */
230     return;
231   n = lua_objlen(L, -1);
232   printf("[");
233   for (i = 1; i <= n; i++) {
234     printf("%d = ", i);
235     lua_rawgeti(L, -1, i);
236     if (lua_isstring(L, -1))
237       printf("%s  ", lua_tostring(L, -1));
238     else
239       printf("%s  ", lua_typename(L, lua_type(L, -1)));
240     lua_pop(L, 1);
241   }
242   printf("]\n");
243   /* leave ktable at the stack */
244 }
245 
246 /* }====================================================== */
247 
248 #endif
249 /*
250 ** $Id: lpvm.c,v 1.5 2013/04/12 16:29:49 roberto Exp $
251 ** Copyright 2007, Lua.org & PUC-Rio  (see 'lpeg.html' for license)
252 */
253 
254 /* #include <limits.h>*/
255 /* #include <string.h>*/
256 
257 
258 /* #include "lua.h"*/
259 /* #include "lauxlib.h"*/
260 
261 /* #include "lpcap.h"*/
262 /* #include "lptypes.h"*/
263 /* #include "lpvm.h"*/
264 /* #include "lpprint.h"*/
265 
266 
267 /* initial size for call/backtrack stack */
268 #if !defined(INITBACK)
269 #define INITBACK	100
270 #endif
271 
272 
273 #define getoffset(p)	(((p) + 1)->offset)
274 
275 static const Instruction giveup = {{IGiveup, 0, 0}};
276 
277 
278 /*
279 ** {======================================================
280 ** Virtual Machine
281 ** =======================================================
282 */
283 
284 
285 typedef struct Stack {
286   const char *s;  /* saved position (or NULL for calls) */
287   const Instruction *p;  /* next instruction */
288   int caplevel;
289 } Stack;
290 
291 
292 #define getstackbase(L, ptop)	((Stack *)lua_touserdata(L, stackidx(ptop)))
293 
294 
295 /*
296 ** Double the size of the array of captures
297 */
doublecap(lua_State * L,Capture * cap,int captop,int ptop)298 static Capture *doublecap (lua_State *L, Capture *cap, int captop, int ptop) {
299   Capture *newc;
300   if (captop >= INT_MAX/((int)sizeof(Capture) * 2))
301     luaL_error(L, "too many captures");
302   newc = (Capture *)lua_newuserdata(L, captop * 2 * sizeof(Capture));
303   memcpy(newc, cap, captop * sizeof(Capture));
304   lua_replace(L, caplistidx(ptop));
305   return newc;
306 }
307 
308 
309 /*
310 ** Double the size of the stack
311 */
doublestack(lua_State * L,Stack ** stacklimit,int ptop)312 static Stack *doublestack (lua_State *L, Stack **stacklimit, int ptop) {
313   Stack *stack = getstackbase(L, ptop);
314   Stack *newstack;
315   int n = *stacklimit - stack;  /* current stack size */
316   int max, newn;
317   lua_getfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX);
318   max = lua_tointeger(L, -1);  /* maximum allowed size */
319   lua_pop(L, 1);
320   if (n >= max)  /* already at maximum size? */
321     luaL_error(L, "too many pending calls/choices");
322   newn = 2 * n;  /* new size */
323   if (newn > max) newn = max;
324   newstack = (Stack *)lua_newuserdata(L, newn * sizeof(Stack));
325   memcpy(newstack, stack, n * sizeof(Stack));
326   lua_replace(L, stackidx(ptop));
327   *stacklimit = newstack + newn;
328   return newstack + n;  /* return next position */
329 }
330 
331 
332 /*
333 ** Interpret the result of a dynamic capture: false -> fail;
334 ** true -> keep current position; number -> next position.
335 ** Return new subject position. 'fr' is stack index where
336 ** is the result; 'curr' is current subject position; 'limit'
337 ** is subject's size.
338 */
resdyncaptures(lua_State * L,int fr,int curr,int limit)339 static int resdyncaptures (lua_State *L, int fr, int curr, int limit) {
340   lua_Integer res;
341   if (!lua_toboolean(L, fr)) {  /* false value? */
342     lua_settop(L, fr - 1);  /* remove results */
343     return -1;  /* and fail */
344   }
345   else if (lua_isboolean(L, fr))  /* true? */
346     res = curr;  /* keep current position */
347   else {
348     res = lua_tointeger(L, fr) - 1;  /* new position */
349     if (res < curr || res > limit)
350       luaL_error(L, "invalid position returned by match-time capture");
351   }
352   lua_remove(L, fr);  /* remove first result (offset) */
353   return res;
354 }
355 
356 
357 /*
358 ** Add capture values returned by a dynamic capture to the capture list
359 ** 'base', nested inside a group capture. 'fd' indexes the first capture
360 ** value, 'n' is the number of values (at least 1).
361 */
adddyncaptures(const char * s,Capture * base,int n,int fd)362 static void adddyncaptures (const char *s, Capture *base, int n, int fd) {
363   int i;
364   /* Cgroup capture is already there */
365   assert(base[0].kind == Cgroup && base[0].siz == 0);
366   base[0].idx = 0;  /* make it an anonymous group */
367   for (i = 1; i <= n; i++) {  /* add runtime captures */
368     base[i].kind = Cruntime;
369     base[i].siz = 1;  /* mark it as closed */
370     base[i].idx = fd + i - 1;  /* stack index of capture value */
371     base[i].s = s;
372   }
373   base[i].kind = Cclose;  /* close group */
374   base[i].siz = 1;
375   base[i].s = s;
376 }
377 
378 
379 /*
380 ** Remove dynamic captures from the Lua stack (called in case of failure)
381 */
removedyncap(lua_State * L,Capture * capture,int level,int last)382 static int removedyncap (lua_State *L, Capture *capture,
383                          int level, int last) {
384   int id = finddyncap(capture + level, capture + last);  /* index of 1st cap. */
385   int top = lua_gettop(L);
386   if (id == 0) return 0;  /* no dynamic captures? */
387   lua_settop(L, id - 1);  /* remove captures */
388   return top - id + 1;  /* number of values removed */
389 }
390 
391 
392 /*
393 ** Opcode interpreter
394 */
match(lua_State * L,const char * o,const char * s,const char * e,Instruction * op,Capture * capture,int ptop)395 const char *match (lua_State *L, const char *o, const char *s, const char *e,
396                    Instruction *op, Capture *capture, int ptop) {
397   Stack stackbase[INITBACK];
398   Stack *stacklimit = stackbase + INITBACK;
399   Stack *stack = stackbase;  /* point to first empty slot in stack */
400   int capsize = INITCAPSIZE;
401   int captop = 0;  /* point to first empty slot in captures */
402   int ndyncap = 0;  /* number of dynamic captures (in Lua stack) */
403   const Instruction *p = op;  /* current instruction */
404   stack->p = &giveup; stack->s = s; stack->caplevel = 0; stack++;
405   lua_pushlightuserdata(L, stackbase);
406   for (;;) {
407 #if defined(DEBUG)
408       printf("s: |%s| stck:%d, dyncaps:%d, caps:%d  ",
409              s, stack - getstackbase(L, ptop), ndyncap, captop);
410       printinst(op, p);
411       printcaplist(capture, capture + captop);
412 #endif
413     assert(stackidx(ptop) + ndyncap == lua_gettop(L) && ndyncap <= captop);
414     switch ((Opcode)p->i.code) {
415       case IEnd: {
416         assert(stack == getstackbase(L, ptop) + 1);
417         capture[captop].kind = Cclose;
418         capture[captop].s = NULL;
419         return s;
420       }
421       case IGiveup: {
422         assert(stack == getstackbase(L, ptop));
423         return NULL;
424       }
425       case IRet: {
426         assert(stack > getstackbase(L, ptop) && (stack - 1)->s == NULL);
427         p = (--stack)->p;
428         continue;
429       }
430       case IAny: {
431         if (s < e) { p++; s++; }
432         else goto fail;
433         continue;
434       }
435       case ITestAny: {
436         if (s < e) p += 2;
437         else p += getoffset(p);
438         continue;
439       }
440       case IChar: {
441         if ((byte)*s == p->i.aux && s < e) { p++; s++; }
442         else goto fail;
443         continue;
444       }
445       case ITestChar: {
446         if ((byte)*s == p->i.aux && s < e) p += 2;
447         else p += getoffset(p);
448         continue;
449       }
450       case ISet: {
451         int c = (byte)*s;
452         if (testchar((p+1)->buff, c) && s < e)
453           { p += CHARSETINSTSIZE; s++; }
454         else goto fail;
455         continue;
456       }
457       case ITestSet: {
458         int c = (byte)*s;
459         if (testchar((p + 2)->buff, c) && s < e)
460           p += 1 + CHARSETINSTSIZE;
461         else p += getoffset(p);
462         continue;
463       }
464       case IBehind: {
465         int n = p->i.aux;
466         if (n > s - o) goto fail;
467         s -= n; p++;
468         continue;
469       }
470       case ISpan: {
471         for (; s < e; s++) {
472           int c = (byte)*s;
473           if (!testchar((p+1)->buff, c)) break;
474         }
475         p += CHARSETINSTSIZE;
476         continue;
477       }
478       case IJmp: {
479         p += getoffset(p);
480         continue;
481       }
482       case IChoice: {
483         if (stack == stacklimit)
484           stack = doublestack(L, &stacklimit, ptop);
485         stack->p = p + getoffset(p);
486         stack->s = s;
487         stack->caplevel = captop;
488         stack++;
489         p += 2;
490         continue;
491       }
492       case ICall: {
493         if (stack == stacklimit)
494           stack = doublestack(L, &stacklimit, ptop);
495         stack->s = NULL;
496         stack->p = p + 2;  /* save return address */
497         stack++;
498         p += getoffset(p);
499         continue;
500       }
501       case ICommit: {
502         assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
503         stack--;
504         p += getoffset(p);
505         continue;
506       }
507       case IPartialCommit: {
508         assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
509         (stack - 1)->s = s;
510         (stack - 1)->caplevel = captop;
511         p += getoffset(p);
512         continue;
513       }
514       case IBackCommit: {
515         assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL);
516         s = (--stack)->s;
517         captop = stack->caplevel;
518         p += getoffset(p);
519         continue;
520       }
521       case IFailTwice:
522         assert(stack > getstackbase(L, ptop));
523         stack--;
524         /* go through */
525       case IFail:
526       fail: { /* pattern failed: try to backtrack */
527         do {  /* remove pending calls */
528           assert(stack > getstackbase(L, ptop));
529           s = (--stack)->s;
530         } while (s == NULL);
531         if (ndyncap > 0)  /* is there matchtime captures? */
532           ndyncap -= removedyncap(L, capture, stack->caplevel, captop);
533         captop = stack->caplevel;
534         p = stack->p;
535         continue;
536       }
537       case ICloseRunTime: {
538         CapState cs;
539         int rem, res, n;
540         int fr = lua_gettop(L) + 1;  /* stack index of first result */
541         cs.s = o; cs.L = L; cs.ocap = capture; cs.ptop = ptop;
542         n = runtimecap(&cs, capture + captop, s, &rem);  /* call function */
543         captop -= n;  /* remove nested captures */
544         fr -= rem;  /* 'rem' items were popped from Lua stack */
545         res = resdyncaptures(L, fr, s - o, e - o);  /* get result */
546         if (res == -1)  /* fail? */
547           goto fail;
548         s = o + res;  /* else update current position */
549         n = lua_gettop(L) - fr + 1;  /* number of new captures */
550         ndyncap += n - rem;  /* update number of dynamic captures */
551         if (n > 0) {  /* any new capture? */
552           if ((captop += n + 2) >= capsize) {
553             capture = doublecap(L, capture, captop, ptop);
554             capsize = 2 * captop;
555           }
556           /* add new captures to 'capture' list */
557           adddyncaptures(s, capture + captop - n - 2, n, fr);
558         }
559         p++;
560         continue;
561       }
562       case ICloseCapture: {
563         const char *s1 = s;
564         assert(captop > 0);
565         /* if possible, turn capture into a full capture */
566         if (capture[captop - 1].siz == 0 &&
567             s1 - capture[captop - 1].s < UCHAR_MAX) {
568           capture[captop - 1].siz = s1 - capture[captop - 1].s + 1;
569           p++;
570           continue;
571         }
572         else {
573           capture[captop].siz = 1;  /* mark entry as closed */
574           capture[captop].s = s;
575           goto pushcapture;
576         }
577       }
578       case IOpenCapture:
579         capture[captop].siz = 0;  /* mark entry as open */
580         capture[captop].s = s;
581         goto pushcapture;
582       case IFullCapture:
583         capture[captop].siz = getoff(p) + 1;  /* save capture size */
584         capture[captop].s = s - getoff(p);
585         /* goto pushcapture; */
586       pushcapture: {
587         capture[captop].idx = p->i.key;
588         capture[captop].kind = getkind(p);
589         if (++captop >= capsize) {
590           capture = doublecap(L, capture, captop, ptop);
591           capsize = 2 * captop;
592         }
593         p++;
594         continue;
595       }
596       default: assert(0); return NULL;
597     }
598   }
599 }
600 
601 /* }====================================================== */
602 
603 
604 /*
605 ** $Id: lpcode.c,v 1.21 2014/12/12 17:01:29 roberto Exp $
606 ** Copyright 2007, Lua.org & PUC-Rio  (see 'lpeg.html' for license)
607 */
608 
609 /* #include <limits.h>*/
610 
611 
612 /* #include "lua.h"*/
613 /* #include "lauxlib.h"*/
614 
615 /* #include "lptypes.h"*/
616 /* #include "lpcode.h"*/
617 
618 
619 /* signals a "no-instruction */
620 #define NOINST		-1
621 
622 
623 
624 static const Charset fullset_ =
625   {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
626     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
627     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
628     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}};
629 
630 static const Charset *fullset = &fullset_;
631 
632 /*
633 ** {======================================================
634 ** Analysis and some optimizations
635 ** =======================================================
636 */
637 
638 /*
639 ** Check whether a charset is empty (returns IFail), singleton (IChar),
640 ** full (IAny), or none of those (ISet). When singleton, '*c' returns
641 ** which character it is. (When generic set, the set was the input,
642 ** so there is no need to return it.)
643 */
charsettype(const byte * cs,int * c)644 static Opcode charsettype (const byte *cs, int *c) {
645   int count = 0;  /* number of characters in the set */
646   int i;
647   int candidate = -1;  /* candidate position for the singleton char */
648   for (i = 0; i < CHARSETSIZE; i++) {  /* for each byte */
649     int b = cs[i];
650     if (b == 0) {  /* is byte empty? */
651       if (count > 1)  /* was set neither empty nor singleton? */
652         return ISet;  /* neither full nor empty nor singleton */
653       /* else set is still empty or singleton */
654     }
655     else if (b == 0xFF) {  /* is byte full? */
656       if (count < (i * BITSPERCHAR))  /* was set not full? */
657         return ISet;  /* neither full nor empty nor singleton */
658       else count += BITSPERCHAR;  /* set is still full */
659     }
660     else if ((b & (b - 1)) == 0) {  /* has byte only one bit? */
661       if (count > 0)  /* was set not empty? */
662         return ISet;  /* neither full nor empty nor singleton */
663       else {  /* set has only one char till now; track it */
664         count++;
665         candidate = i;
666       }
667     }
668     else return ISet;  /* byte is neither empty, full, nor singleton */
669   }
670   switch (count) {
671     case 0: return IFail;  /* empty set */
672     case 1: {  /* singleton; find character bit inside byte */
673       int b = cs[candidate];
674       *c = candidate * BITSPERCHAR;
675       if ((b & 0xF0) != 0) { *c += 4; b >>= 4; }
676       if ((b & 0x0C) != 0) { *c += 2; b >>= 2; }
677       if ((b & 0x02) != 0) { *c += 1; }
678       return IChar;
679     }
680     default: {
681        assert(count == CHARSETSIZE * BITSPERCHAR);  /* full set */
682        return IAny;
683     }
684   }
685 }
686 
687 
688 /*
689 ** A few basic operations on Charsets
690 */
cs_complement(Charset * cs)691 static void cs_complement (Charset *cs) {
692   loopset(i, cs->cs[i] = ~cs->cs[i]);
693 }
694 
cs_equal(const byte * cs1,const byte * cs2)695 static int cs_equal (const byte *cs1, const byte *cs2) {
696   loopset(i, if (cs1[i] != cs2[i]) return 0);
697   return 1;
698 }
699 
cs_disjoint(const Charset * cs1,const Charset * cs2)700 static int cs_disjoint (const Charset *cs1, const Charset *cs2) {
701   loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;)
702   return 1;
703 }
704 
705 
706 /*
707 ** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a
708 ** charset and return 1; else return 0.
709 */
tocharset(TTree * tree,Charset * cs)710 int tocharset (TTree *tree, Charset *cs) {
711   switch (tree->tag) {
712     case TSet: {  /* copy set */
713       loopset(i, cs->cs[i] = treebuffer(tree)[i]);
714       return 1;
715     }
716     case TChar: {  /* only one char */
717       assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX);
718       loopset(i, cs->cs[i] = 0);  /* erase all chars */
719       setchar(cs->cs, tree->u.n);  /* add that one */
720       return 1;
721     }
722     case TAny: {
723       loopset(i, cs->cs[i] = 0xFF);  /* add all characters to the set */
724       return 1;
725     }
726     default: return 0;
727   }
728 }
729 
730 
731 /*
732 ** Check whether a pattern tree has captures
733 */
hascaptures(TTree * tree)734 int hascaptures (TTree *tree) {
735  tailcall:
736   switch (tree->tag) {
737     case TCapture: case TRunTime:
738       return 1;
739     case TCall:
740       tree = sib2(tree); goto tailcall;  /* return hascaptures(sib2(tree)); */
741     case TOpenCall: assert(0);
742     default: {
743       switch (numsiblings[tree->tag]) {
744         case 1:  /* return hascaptures(sib1(tree)); */
745           tree = sib1(tree); goto tailcall;
746         case 2:
747           if (hascaptures(sib1(tree))) return 1;
748           /* else return hascaptures(sib2(tree)); */
749           tree = sib2(tree); goto tailcall;
750         default: assert(numsiblings[tree->tag] == 0); return 0;
751       }
752     }
753   }
754 }
755 
756 
757 /*
758 ** Checks how a pattern behaves regarding the empty string,
759 ** in one of two different ways:
760 ** A pattern is *nullable* if it can match without consuming any character;
761 ** A pattern is *nofail* if it never fails for any string
762 ** (including the empty string).
763 ** The difference is only for predicates and run-time captures;
764 ** for other patterns, the two properties are equivalent.
765 ** (With predicates, &'a' is nullable but not nofail. Of course,
766 ** nofail => nullable.)
767 ** These functions are all convervative in the following way:
768 **    p is nullable => nullable(p)
769 **    nofail(p) => p cannot fail
770 ** The function assumes that TOpenCall is not nullable;
771 ** this will be checked again when the grammar is fixed.
772 ** Run-time captures can do whatever they want, so the result
773 ** is conservative.
774 */
checkaux(TTree * tree,int pred)775 int checkaux (TTree *tree, int pred) {
776  tailcall:
777   switch (tree->tag) {
778     case TChar: case TSet: case TAny:
779     case TFalse: case TOpenCall:
780       return 0;  /* not nullable */
781     case TRep: case TTrue:
782       return 1;  /* no fail */
783     case TNot: case TBehind:  /* can match empty, but can fail */
784       if (pred == PEnofail) return 0;
785       else return 1;  /* PEnullable */
786     case TAnd:  /* can match empty; fail iff body does */
787       if (pred == PEnullable) return 1;
788       /* else return checkaux(sib1(tree), pred); */
789       tree = sib1(tree); goto tailcall;
790     case TRunTime:  /* can fail; match empty iff body does */
791       if (pred == PEnofail) return 0;
792       /* else return checkaux(sib1(tree), pred); */
793       tree = sib1(tree); goto tailcall;
794     case TSeq:
795       if (!checkaux(sib1(tree), pred)) return 0;
796       /* else return checkaux(sib2(tree), pred); */
797       tree = sib2(tree); goto tailcall;
798     case TChoice:
799       if (checkaux(sib2(tree), pred)) return 1;
800       /* else return checkaux(sib1(tree), pred); */
801       tree = sib1(tree); goto tailcall;
802     case TCapture: case TGrammar: case TRule:
803       /* return checkaux(sib1(tree), pred); */
804       tree = sib1(tree); goto tailcall;
805     case TCall:  /* return checkaux(sib2(tree), pred); */
806       tree = sib2(tree); goto tailcall;
807     default: assert(0); return 0;
808   }
809 }
810 
811 
812 /*
813 ** number of characters to match a pattern (or -1 if variable)
814 ** ('count' avoids infinite loops for grammars)
815 */
fixedlenx(TTree * tree,int count,int len)816 int fixedlenx (TTree *tree, int count, int len) {
817  tailcall:
818   switch (tree->tag) {
819     case TChar: case TSet: case TAny:
820       return len + 1;
821     case TFalse: case TTrue: case TNot: case TAnd: case TBehind:
822       return len;
823     case TRep: case TRunTime: case TOpenCall:
824       return -1;
825     case TCapture: case TRule: case TGrammar:
826       /* return fixedlenx(sib1(tree), count); */
827       tree = sib1(tree); goto tailcall;
828     case TCall:
829       if (count++ >= MAXRULES)
830         return -1;  /* may be a loop */
831       /* else return fixedlenx(sib2(tree), count); */
832       tree = sib2(tree); goto tailcall;
833     case TSeq: {
834       len = fixedlenx(sib1(tree), count, len);
835       if (len < 0) return -1;
836       /* else return fixedlenx(sib2(tree), count, len); */
837       tree = sib2(tree); goto tailcall;
838     }
839     case TChoice: {
840       int n1, n2;
841       n1 = fixedlenx(sib1(tree), count, len);
842       if (n1 < 0) return -1;
843       n2 = fixedlenx(sib2(tree), count, len);
844       if (n1 == n2) return n1;
845       else return -1;
846     }
847     default: assert(0); return 0;
848   };
849 }
850 
851 
852 /*
853 ** Computes the 'first set' of a pattern.
854 ** The result is a conservative aproximation:
855 **   match p ax -> x (for some x) ==> a belongs to first(p)
856 ** or
857 **   a not in first(p) ==> match p ax -> fail (for all x)
858 **
859 ** The set 'follow' is the first set of what follows the
860 ** pattern (full set if nothing follows it).
861 **
862 ** The function returns 0 when this resulting set can be used for
863 ** test instructions that avoid the pattern altogether.
864 ** A non-zero return can happen for two reasons:
865 ** 1) match p '' -> ''            ==> return has bit 1 set
866 ** (tests cannot be used because they would always fail for an empty input);
867 ** 2) there is a match-time capture ==> return has bit 2 set
868 ** (optimizations should not bypass match-time captures).
869 */
getfirst(TTree * tree,const Charset * follow,Charset * firstset)870 static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) {
871  tailcall:
872   switch (tree->tag) {
873     case TChar: case TSet: case TAny: {
874       tocharset(tree, firstset);
875       return 0;
876     }
877     case TTrue: {
878       loopset(i, firstset->cs[i] = follow->cs[i]);
879       return 1;  /* accepts the empty string */
880     }
881     case TFalse: {
882       loopset(i, firstset->cs[i] = 0);
883       return 0;
884     }
885     case TChoice: {
886       Charset csaux;
887       int e1 = getfirst(sib1(tree), follow, firstset);
888       int e2 = getfirst(sib2(tree), follow, &csaux);
889       loopset(i, firstset->cs[i] |= csaux.cs[i]);
890       return e1 | e2;
891     }
892     case TSeq: {
893       if (!nullable(sib1(tree))) {
894         /* when p1 is not nullable, p2 has nothing to contribute;
895            return getfirst(sib1(tree), fullset, firstset); */
896         tree = sib1(tree); follow = fullset; goto tailcall;
897       }
898       else {  /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */
899         Charset csaux;
900         int e2 = getfirst(sib2(tree), follow, &csaux);
901         int e1 = getfirst(sib1(tree), &csaux, firstset);
902         if (e1 == 0) return 0;  /* 'e1' ensures that first can be used */
903         else if ((e1 | e2) & 2)  /* one of the children has a matchtime? */
904           return 2;  /* pattern has a matchtime capture */
905         else return e2;  /* else depends on 'e2' */
906       }
907     }
908     case TRep: {
909       getfirst(sib1(tree), follow, firstset);
910       loopset(i, firstset->cs[i] |= follow->cs[i]);
911       return 1;  /* accept the empty string */
912     }
913     case TCapture: case TGrammar: case TRule: {
914       /* return getfirst(sib1(tree), follow, firstset); */
915       tree = sib1(tree); goto tailcall;
916     }
917     case TRunTime: {  /* function invalidates any follow info. */
918       int e = getfirst(sib1(tree), fullset, firstset);
919       if (e) return 2;  /* function is not "protected"? */
920       else return 0;  /* pattern inside capture ensures first can be used */
921     }
922     case TCall: {
923       /* return getfirst(sib2(tree), follow, firstset); */
924       tree = sib2(tree); goto tailcall;
925     }
926     case TAnd: {
927       int e = getfirst(sib1(tree), follow, firstset);
928       loopset(i, firstset->cs[i] &= follow->cs[i]);
929       return e;
930     }
931     case TNot: {
932       if (tocharset(sib1(tree), firstset)) {
933         cs_complement(firstset);
934         return 1;
935       }
936       /* else go through */
937     }
938     case TBehind: {  /* instruction gives no new information */
939       /* call 'getfirst' only to check for math-time captures */
940       int e = getfirst(sib1(tree), follow, firstset);
941       loopset(i, firstset->cs[i] = follow->cs[i]);  /* uses follow */
942       return e | 1;  /* always can accept the empty string */
943     }
944     default: assert(0); return 0;
945   }
946 }
947 
948 
949 /*
950 ** If 'headfail(tree)' true, then 'tree' can fail only depending on the
951 ** next character of the subject.
952 */
headfail(TTree * tree)953 static int headfail (TTree *tree) {
954  tailcall:
955   switch (tree->tag) {
956     case TChar: case TSet: case TAny: case TFalse:
957       return 1;
958     case TTrue: case TRep: case TRunTime: case TNot:
959     case TBehind:
960       return 0;
961     case TCapture: case TGrammar: case TRule: case TAnd:
962       tree = sib1(tree); goto tailcall;  /* return headfail(sib1(tree)); */
963     case TCall:
964       tree = sib2(tree); goto tailcall;  /* return headfail(sib2(tree)); */
965     case TSeq:
966       if (!nofail(sib2(tree))) return 0;
967       /* else return headfail(sib1(tree)); */
968       tree = sib1(tree); goto tailcall;
969     case TChoice:
970       if (!headfail(sib1(tree))) return 0;
971       /* else return headfail(sib2(tree)); */
972       tree = sib2(tree); goto tailcall;
973     default: assert(0); return 0;
974   }
975 }
976 
977 
978 /*
979 ** Check whether the code generation for the given tree can benefit
980 ** from a follow set (to avoid computing the follow set when it is
981 ** not needed)
982 */
needfollow(TTree * tree)983 static int needfollow (TTree *tree) {
984  tailcall:
985   switch (tree->tag) {
986     case TChar: case TSet: case TAny:
987     case TFalse: case TTrue: case TAnd: case TNot:
988     case TRunTime: case TGrammar: case TCall: case TBehind:
989       return 0;
990     case TChoice: case TRep:
991       return 1;
992     case TCapture:
993       tree = sib1(tree); goto tailcall;
994     case TSeq:
995       tree = sib2(tree); goto tailcall;
996     default: assert(0); return 0;
997   }
998 }
999 
1000 /* }====================================================== */
1001 
1002 
1003 
1004 /*
1005 ** {======================================================
1006 ** Code generation
1007 ** =======================================================
1008 */
1009 
1010 
1011 /*
1012 ** size of an instruction
1013 */
sizei(const Instruction * i)1014 int sizei (const Instruction *i) {
1015   switch((Opcode)i->i.code) {
1016     case ISet: case ISpan: return CHARSETINSTSIZE;
1017     case ITestSet: return CHARSETINSTSIZE + 1;
1018     case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall:
1019     case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit:
1020       return 2;
1021     default: return 1;
1022   }
1023 }
1024 
1025 
1026 /*
1027 ** state for the compiler
1028 */
1029 typedef struct CompileState {
1030   Pattern *p;  /* pattern being compiled */
1031   int ncode;  /* next position in p->code to be filled */
1032   lua_State *L;
1033 } CompileState;
1034 
1035 
1036 /*
1037 ** code generation is recursive; 'opt' indicates that the code is
1038 ** being generated under a 'IChoice' operator jumping to its end
1039 ** (that is, the match is "optional").
1040 ** 'tt' points to a previous test protecting this code. 'fl' is
1041 ** the follow set of the pattern.
1042 */
1043 static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
1044                      const Charset *fl);
1045 
1046 
realloccode(lua_State * L,Pattern * p,int nsize)1047 void realloccode (lua_State *L, Pattern *p, int nsize) {
1048   void *ud;
1049   lua_Alloc f = lua_getallocf(L, &ud);
1050   void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction),
1051                                   nsize * sizeof(Instruction));
1052   if (newblock == NULL && nsize > 0)
1053     luaL_error(L, "not enough memory");
1054   p->code = (Instruction *)newblock;
1055   p->codesize = nsize;
1056 }
1057 
1058 
nextinstruction(CompileState * compst)1059 static int nextinstruction (CompileState *compst) {
1060   int size = compst->p->codesize;
1061   if (compst->ncode >= size)
1062     realloccode(compst->L, compst->p, size * 2);
1063   return compst->ncode++;
1064 }
1065 
1066 
1067 #define getinstr(cs,i)		((cs)->p->code[i])
1068 
1069 
addinstruction(CompileState * compst,Opcode op,int aux)1070 static int addinstruction (CompileState *compst, Opcode op, int aux) {
1071   int i = nextinstruction(compst);
1072   getinstr(compst, i).i.code = op;
1073   getinstr(compst, i).i.aux = aux;
1074   return i;
1075 }
1076 
1077 
1078 /*
1079 ** Add an instruction followed by space for an offset (to be set later)
1080 */
addoffsetinst(CompileState * compst,Opcode op)1081 static int addoffsetinst (CompileState *compst, Opcode op) {
1082   int i = addinstruction(compst, op, 0);  /* instruction */
1083   addinstruction(compst, (Opcode)0, 0);  /* open space for offset */
1084   assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2);
1085   return i;
1086 }
1087 
1088 
1089 /*
1090 ** Set the offset of an instruction
1091 */
setoffset(CompileState * compst,int instruction,int offset)1092 static void setoffset (CompileState *compst, int instruction, int offset) {
1093   getinstr(compst, instruction + 1).offset = offset;
1094 }
1095 
1096 
1097 /*
1098 ** Add a capture instruction:
1099 ** 'op' is the capture instruction; 'cap' the capture kind;
1100 ** 'key' the key into ktable; 'aux' is the optional capture offset
1101 **
1102 */
addinstcap(CompileState * compst,Opcode op,int cap,int key,int aux)1103 static int addinstcap (CompileState *compst, Opcode op, int cap, int key,
1104                        int aux) {
1105   int i = addinstruction(compst, op, joinkindoff(cap, aux));
1106   getinstr(compst, i).i.key = key;
1107   return i;
1108 }
1109 
1110 
1111 #define gethere(compst) 	((compst)->ncode)
1112 
1113 #define target(code,i)		((i) + code[i + 1].offset)
1114 
1115 
1116 /*
1117 ** Patch 'instruction' to jump to 'target'
1118 */
jumptothere(CompileState * compst,int instruction,int target)1119 static void jumptothere (CompileState *compst, int instruction, int target) {
1120   if (instruction >= 0)
1121     setoffset(compst, instruction, target - instruction);
1122 }
1123 
1124 
1125 /*
1126 ** Patch 'instruction' to jump to current position
1127 */
jumptohere(CompileState * compst,int instruction)1128 static void jumptohere (CompileState *compst, int instruction) {
1129   jumptothere(compst, instruction, gethere(compst));
1130 }
1131 
1132 
1133 /*
1134 ** Code an IChar instruction, or IAny if there is an equivalent
1135 ** test dominating it
1136 */
codechar(CompileState * compst,int c,int tt)1137 static void codechar (CompileState *compst, int c, int tt) {
1138   if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar &&
1139                  getinstr(compst, tt).i.aux == c)
1140     addinstruction(compst, IAny, 0);
1141   else
1142     addinstruction(compst, IChar, c);
1143 }
1144 
1145 
1146 /*
1147 ** Add a charset posfix to an instruction
1148 */
addcharset(CompileState * compst,const byte * cs)1149 static void addcharset (CompileState *compst, const byte *cs) {
1150   int p = gethere(compst);
1151   int i;
1152   for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++)
1153     nextinstruction(compst);  /* space for buffer */
1154   /* fill buffer with charset */
1155   loopset(j, getinstr(compst, p).buff[j] = cs[j]);
1156 }
1157 
1158 
1159 /*
1160 ** code a char set, optimizing unit sets for IChar, "complete"
1161 ** sets for IAny, and empty sets for IFail; also use an IAny
1162 ** when instruction is dominated by an equivalent test.
1163 */
codecharset(CompileState * compst,const byte * cs,int tt)1164 static void codecharset (CompileState *compst, const byte *cs, int tt) {
1165   int c = 0;  /* (=) to avoid warnings */
1166   Opcode op = charsettype(cs, &c);
1167   switch (op) {
1168     case IChar: codechar(compst, c, tt); break;
1169     case ISet: {  /* non-trivial set? */
1170       if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet &&
1171           cs_equal(cs, getinstr(compst, tt + 2).buff))
1172         addinstruction(compst, IAny, 0);
1173       else {
1174         addinstruction(compst, ISet, 0);
1175         addcharset(compst, cs);
1176       }
1177       break;
1178     }
1179     default: addinstruction(compst, op, c); break;
1180   }
1181 }
1182 
1183 
1184 /*
1185 ** code a test set, optimizing unit sets for ITestChar, "complete"
1186 ** sets for ITestAny, and empty sets for IJmp (always fails).
1187 ** 'e' is true iff test should accept the empty string. (Test
1188 ** instructions in the current VM never accept the empty string.)
1189 */
codetestset(CompileState * compst,Charset * cs,int e)1190 static int codetestset (CompileState *compst, Charset *cs, int e) {
1191   if (e) return NOINST;  /* no test */
1192   else {
1193     int c = 0;
1194     Opcode op = charsettype(cs->cs, &c);
1195     switch (op) {
1196       case IFail: return addoffsetinst(compst, IJmp);  /* always jump */
1197       case IAny: return addoffsetinst(compst, ITestAny);
1198       case IChar: {
1199         int i = addoffsetinst(compst, ITestChar);
1200         getinstr(compst, i).i.aux = c;
1201         return i;
1202       }
1203       case ISet: {
1204         int i = addoffsetinst(compst, ITestSet);
1205         addcharset(compst, cs->cs);
1206         return i;
1207       }
1208       default: assert(0); return 0;
1209     }
1210   }
1211 }
1212 
1213 
1214 /*
1215 ** Find the final destination of a sequence of jumps
1216 */
finaltarget(Instruction * code,int i)1217 static int finaltarget (Instruction *code, int i) {
1218   while (code[i].i.code == IJmp)
1219     i = target(code, i);
1220   return i;
1221 }
1222 
1223 
1224 /*
1225 ** final label (after traversing any jumps)
1226 */
finallabel(Instruction * code,int i)1227 static int finallabel (Instruction *code, int i) {
1228   return finaltarget(code, target(code, i));
1229 }
1230 
1231 
1232 /*
1233 ** <behind(p)> == behind n; <p>   (where n = fixedlen(p))
1234 */
codebehind(CompileState * compst,TTree * tree)1235 static void codebehind (CompileState *compst, TTree *tree) {
1236   if (tree->u.n > 0)
1237     addinstruction(compst, IBehind, tree->u.n);
1238   codegen(compst, sib1(tree), 0, NOINST, fullset);
1239 }
1240 
1241 
1242 /*
1243 ** Choice; optimizations:
1244 ** - when p1 is headfail
1245 ** - when first(p1) and first(p2) are disjoint; than
1246 ** a character not in first(p1) cannot go to p1, and a character
1247 ** in first(p1) cannot go to p2 (at it is not in first(p2)).
1248 ** (The optimization is not valid if p1 accepts the empty string,
1249 ** as then there is no character at all...)
1250 ** - when p2 is empty and opt is true; a IPartialCommit can resuse
1251 ** the Choice already active in the stack.
1252 */
codechoice(CompileState * compst,TTree * p1,TTree * p2,int opt,const Charset * fl)1253 static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt,
1254                         const Charset *fl) {
1255   int emptyp2 = (p2->tag == TTrue);
1256   Charset cs1, cs2;
1257   int e1 = getfirst(p1, fullset, &cs1);
1258   if (headfail(p1) ||
1259       (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) {
1260     /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */
1261     int test = codetestset(compst, &cs1, 0);
1262     int jmp = NOINST;
1263     codegen(compst, p1, 0, test, fl);
1264     if (!emptyp2)
1265       jmp = addoffsetinst(compst, IJmp);
1266     jumptohere(compst, test);
1267     codegen(compst, p2, opt, NOINST, fl);
1268     jumptohere(compst, jmp);
1269   }
1270   else if (opt && emptyp2) {
1271     /* p1? == IPartialCommit; p1 */
1272     jumptohere(compst, addoffsetinst(compst, IPartialCommit));
1273     codegen(compst, p1, 1, NOINST, fullset);
1274   }
1275   else {
1276     /* <p1 / p2> ==
1277         test(fail(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */
1278     int pcommit;
1279     int test = codetestset(compst, &cs1, e1);
1280     int pchoice = addoffsetinst(compst, IChoice);
1281     codegen(compst, p1, emptyp2, test, fullset);
1282     pcommit = addoffsetinst(compst, ICommit);
1283     jumptohere(compst, pchoice);
1284     jumptohere(compst, test);
1285     codegen(compst, p2, opt, NOINST, fl);
1286     jumptohere(compst, pcommit);
1287   }
1288 }
1289 
1290 
1291 /*
1292 ** And predicate
1293 ** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
1294 ** (valid only when 'p' has no captures)
1295 */
codeand(CompileState * compst,TTree * tree,int tt)1296 static void codeand (CompileState *compst, TTree *tree, int tt) {
1297   int n = fixedlen(tree);
1298   if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) {
1299     codegen(compst, tree, 0, tt, fullset);
1300     if (n > 0)
1301       addinstruction(compst, IBehind, n);
1302   }
1303   else {  /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */
1304     int pcommit;
1305     int pchoice = addoffsetinst(compst, IChoice);
1306     codegen(compst, tree, 0, tt, fullset);
1307     pcommit = addoffsetinst(compst, IBackCommit);
1308     jumptohere(compst, pchoice);
1309     addinstruction(compst, IFail, 0);
1310     jumptohere(compst, pcommit);
1311   }
1312 }
1313 
1314 
1315 /*
1316 ** Captures: if pattern has fixed (and not too big) length, use
1317 ** a single IFullCapture instruction after the match; otherwise,
1318 ** enclose the pattern with OpenCapture - CloseCapture.
1319 */
codecapture(CompileState * compst,TTree * tree,int tt,const Charset * fl)1320 static void codecapture (CompileState *compst, TTree *tree, int tt,
1321                          const Charset *fl) {
1322   int len = fixedlen(sib1(tree));
1323   if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) {
1324     codegen(compst, sib1(tree), 0, tt, fl);
1325     addinstcap(compst, IFullCapture, tree->cap, tree->key, len);
1326   }
1327   else {
1328     addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0);
1329     codegen(compst, sib1(tree), 0, tt, fl);
1330     addinstcap(compst, ICloseCapture, Cclose, 0, 0);
1331   }
1332 }
1333 
1334 
coderuntime(CompileState * compst,TTree * tree,int tt)1335 static void coderuntime (CompileState *compst, TTree *tree, int tt) {
1336   addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0);
1337   codegen(compst, sib1(tree), 0, tt, fullset);
1338   addinstcap(compst, ICloseRunTime, Cclose, 0, 0);
1339 }
1340 
1341 
1342 /*
1343 ** Repetion; optimizations:
1344 ** When pattern is a charset, can use special instruction ISpan.
1345 ** When pattern is head fail, or if it starts with characters that
1346 ** are disjoint from what follows the repetions, a simple test
1347 ** is enough (a fail inside the repetition would backtrack to fail
1348 ** again in the following pattern, so there is no need for a choice).
1349 ** When 'opt' is true, the repetion can reuse the Choice already
1350 ** active in the stack.
1351 */
coderep(CompileState * compst,TTree * tree,int opt,const Charset * fl)1352 static void coderep (CompileState *compst, TTree *tree, int opt,
1353                      const Charset *fl) {
1354   Charset st;
1355   if (tocharset(tree, &st)) {
1356     addinstruction(compst, ISpan, 0);
1357     addcharset(compst, st.cs);
1358   }
1359   else {
1360     int e1 = getfirst(tree, fullset, &st);
1361     if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) {
1362       /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */
1363       int jmp;
1364       int test = codetestset(compst, &st, 0);
1365       codegen(compst, tree, opt, test, fullset);
1366       jmp = addoffsetinst(compst, IJmp);
1367       jumptohere(compst, test);
1368       jumptothere(compst, jmp, test);
1369     }
1370     else {
1371       /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */
1372       /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */
1373       int commit, l2;
1374       int test = codetestset(compst, &st, e1);
1375       int pchoice = NOINST;
1376       if (opt)
1377         jumptohere(compst, addoffsetinst(compst, IPartialCommit));
1378       else
1379         pchoice = addoffsetinst(compst, IChoice);
1380       l2 = gethere(compst);
1381       codegen(compst, tree, 0, NOINST, fullset);
1382       commit = addoffsetinst(compst, IPartialCommit);
1383       jumptothere(compst, commit, l2);
1384       jumptohere(compst, pchoice);
1385       jumptohere(compst, test);
1386     }
1387   }
1388 }
1389 
1390 
1391 /*
1392 ** Not predicate; optimizations:
1393 ** In any case, if first test fails, 'not' succeeds, so it can jump to
1394 ** the end. If pattern is headfail, that is all (it cannot fail
1395 ** in other parts); this case includes 'not' of simple sets. Otherwise,
1396 ** use the default code (a choice plus a failtwice).
1397 */
codenot(CompileState * compst,TTree * tree)1398 static void codenot (CompileState *compst, TTree *tree) {
1399   Charset st;
1400   int e = getfirst(tree, fullset, &st);
1401   int test = codetestset(compst, &st, e);
1402   if (headfail(tree))  /* test (fail(p1)) -> L1; fail; L1:  */
1403     addinstruction(compst, IFail, 0);
1404   else {
1405     /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1:  */
1406     int pchoice = addoffsetinst(compst, IChoice);
1407     codegen(compst, tree, 0, NOINST, fullset);
1408     addinstruction(compst, IFailTwice, 0);
1409     jumptohere(compst, pchoice);
1410   }
1411   jumptohere(compst, test);
1412 }
1413 
1414 
1415 /*
1416 ** change open calls to calls, using list 'positions' to find
1417 ** correct offsets; also optimize tail calls
1418 */
correctcalls(CompileState * compst,int * positions,int from,int to)1419 static void correctcalls (CompileState *compst, int *positions,
1420                           int from, int to) {
1421   int i;
1422   Instruction *code = compst->p->code;
1423   for (i = from; i < to; i += sizei(&code[i])) {
1424     if (code[i].i.code == IOpenCall) {
1425       int n = code[i].i.key;  /* rule number */
1426       int rule = positions[n];  /* rule position */
1427       assert(rule == from || code[rule - 1].i.code == IRet);
1428       if (code[finaltarget(code, i + 2)].i.code == IRet)  /* call; ret ? */
1429         code[i].i.code = IJmp;  /* tail call */
1430       else
1431         code[i].i.code = ICall;
1432       jumptothere(compst, i, rule);  /* call jumps to respective rule */
1433     }
1434   }
1435   assert(i == to);
1436 }
1437 
1438 
1439 /*
1440 ** Code for a grammar:
1441 ** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
1442 */
codegrammar(CompileState * compst,TTree * grammar)1443 static void codegrammar (CompileState *compst, TTree *grammar) {
1444   int positions[MAXRULES];
1445   int rulenumber = 0;
1446   TTree *rule;
1447   int firstcall = addoffsetinst(compst, ICall);  /* call initial rule */
1448   int jumptoend = addoffsetinst(compst, IJmp);  /* jump to the end */
1449   int start = gethere(compst);  /* here starts the initial rule */
1450   jumptohere(compst, firstcall);
1451   for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
1452     positions[rulenumber++] = gethere(compst);  /* save rule position */
1453     codegen(compst, sib1(rule), 0, NOINST, fullset);  /* code rule */
1454     addinstruction(compst, IRet, 0);
1455   }
1456   assert(rule->tag == TTrue);
1457   jumptohere(compst, jumptoend);
1458   correctcalls(compst, positions, start, gethere(compst));
1459 }
1460 
1461 
codecall(CompileState * compst,TTree * call)1462 static void codecall (CompileState *compst, TTree *call) {
1463   int c = addoffsetinst(compst, IOpenCall);  /* to be corrected later */
1464   getinstr(compst, c).i.key = sib2(call)->cap;  /* rule number */
1465   assert(sib2(call)->tag == TRule);
1466 }
1467 
1468 
1469 /*
1470 ** Code first child of a sequence
1471 ** (second child is called in-place to allow tail call)
1472 ** Return 'tt' for second child
1473 */
codeseq1(CompileState * compst,TTree * p1,TTree * p2,int tt,const Charset * fl)1474 static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2,
1475                      int tt, const Charset *fl) {
1476   if (needfollow(p1)) {
1477     Charset fl1;
1478     getfirst(p2, fl, &fl1);  /* p1 follow is p2 first */
1479     codegen(compst, p1, 0, tt, &fl1);
1480   }
1481   else  /* use 'fullset' as follow */
1482     codegen(compst, p1, 0, tt, fullset);
1483   if (fixedlen(p1) != 0)  /* can 'p1' consume anything? */
1484     return  NOINST;  /* invalidate test */
1485   else return tt;  /* else 'tt' still protects sib2 */
1486 }
1487 
1488 
1489 /*
1490 ** Main code-generation function: dispatch to auxiliar functions
1491 ** according to kind of tree. ('needfollow' should return true
1492 ** only for consructions that use 'fl'.)
1493 */
codegen(CompileState * compst,TTree * tree,int opt,int tt,const Charset * fl)1494 static void codegen (CompileState *compst, TTree *tree, int opt, int tt,
1495                      const Charset *fl) {
1496  tailcall:
1497   switch (tree->tag) {
1498     case TChar: codechar(compst, tree->u.n, tt); break;
1499     case TAny: addinstruction(compst, IAny, 0); break;
1500     case TSet: codecharset(compst, treebuffer(tree), tt); break;
1501     case TTrue: break;
1502     case TFalse: addinstruction(compst, IFail, 0); break;
1503     case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break;
1504     case TRep: coderep(compst, sib1(tree), opt, fl); break;
1505     case TBehind: codebehind(compst, tree); break;
1506     case TNot: codenot(compst, sib1(tree)); break;
1507     case TAnd: codeand(compst, sib1(tree), tt); break;
1508     case TCapture: codecapture(compst, tree, tt, fl); break;
1509     case TRunTime: coderuntime(compst, tree, tt); break;
1510     case TGrammar: codegrammar(compst, tree); break;
1511     case TCall: codecall(compst, tree); break;
1512     case TSeq: {
1513       tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl);  /* code 'p1' */
1514       /* codegen(compst, p2, opt, tt, fl); */
1515       tree = sib2(tree); goto tailcall;
1516     }
1517     default: assert(0);
1518   }
1519 }
1520 
1521 
1522 /*
1523 ** Optimize jumps and other jump-like instructions.
1524 ** * Update labels of instructions with labels to their final
1525 ** destinations (e.g., choice L1; ... L1: jmp L2: becomes
1526 ** choice L2)
1527 ** * Jumps to other instructions that do jumps become those
1528 ** instructions (e.g., jump to return becomes a return; jump
1529 ** to commit becomes a commit)
1530 */
peephole(CompileState * compst)1531 static void peephole (CompileState *compst) {
1532   Instruction *code = compst->p->code;
1533   int i;
1534   for (i = 0; i < compst->ncode; i += sizei(&code[i])) {
1535    redo:
1536     switch (code[i].i.code) {
1537       case IChoice: case ICall: case ICommit: case IPartialCommit:
1538       case IBackCommit: case ITestChar: case ITestSet:
1539       case ITestAny: {  /* instructions with labels */
1540         jumptothere(compst, i, finallabel(code, i));  /* optimize label */
1541         break;
1542       }
1543       case IJmp: {
1544         int ft = finaltarget(code, i);
1545         switch (code[ft].i.code) {  /* jumping to what? */
1546           case IRet: case IFail: case IFailTwice:
1547           case IEnd: {  /* instructions with unconditional implicit jumps */
1548             code[i] = code[ft];  /* jump becomes that instruction */
1549             code[i + 1].i.code = IAny;  /* 'no-op' for target position */
1550             break;
1551           }
1552           case ICommit: case IPartialCommit:
1553           case IBackCommit: {  /* inst. with unconditional explicit jumps */
1554             int fft = finallabel(code, ft);
1555             code[i] = code[ft];  /* jump becomes that instruction... */
1556             jumptothere(compst, i, fft);  /* but must correct its offset */
1557             goto redo;  /* reoptimize its label */
1558           }
1559           default: {
1560             jumptothere(compst, i, ft);  /* optimize label */
1561             break;
1562           }
1563         }
1564         break;
1565       }
1566       default: break;
1567     }
1568   }
1569   assert(code[i - 1].i.code == IEnd);
1570 }
1571 
1572 
1573 /*
1574 ** Compile a pattern
1575 */
compile(lua_State * L,Pattern * p)1576 Instruction *compile (lua_State *L, Pattern *p) {
1577   CompileState compst;
1578   compst.p = p;  compst.ncode = 0;  compst.L = L;
1579   realloccode(L, p, 2);  /* minimum initial size */
1580   codegen(&compst, p->tree, 0, NOINST, fullset);
1581   addinstruction(&compst, IEnd, 0);
1582   realloccode(L, p, compst.ncode);  /* set final size */
1583   peephole(&compst);
1584   return p->code;
1585 }
1586 
1587 
1588 /* }====================================================== */
1589 
1590 /*
1591 ** $Id: lpcap.c,v 1.5 2014/12/12 16:58:47 roberto Exp $
1592 ** Copyright 2007, Lua.org & PUC-Rio  (see 'lpeg.html' for license)
1593 */
1594 
1595 /* #include "lua.h"*/
1596 /* #include "lauxlib.h"*/
1597 
1598 /* #include "lpcap.h"*/
1599 /* #include "lptypes.h"*/
1600 
1601 
1602 #define captype(cap)	((cap)->kind)
1603 
1604 #define isclosecap(cap)	(captype(cap) == Cclose)
1605 
1606 #define closeaddr(c)	((c)->s + (c)->siz - 1)
1607 
1608 #define isfullcap(cap)	((cap)->siz != 0)
1609 
1610 #define getfromktable(cs,v)	lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v)
1611 
1612 #define pushluaval(cs)		getfromktable(cs, (cs)->cap->idx)
1613 
1614 
1615 
1616 /*
1617 ** Put at the cache for Lua values the value indexed by 'v' in ktable
1618 ** of the running pattern (if it is not there yet); returns its index.
1619 */
updatecache(CapState * cs,int v)1620 static int updatecache (CapState *cs, int v) {
1621   int idx = cs->ptop + 1;  /* stack index of cache for Lua values */
1622   if (v != cs->valuecached) {  /* not there? */
1623     getfromktable(cs, v);  /* get value from 'ktable' */
1624     lua_replace(cs->L, idx);  /* put it at reserved stack position */
1625     cs->valuecached = v;  /* keep track of what is there */
1626   }
1627   return idx;
1628 }
1629 
1630 
1631 static int pushcapture (CapState *cs);
1632 
1633 
1634 /*
1635 ** Goes back in a list of captures looking for an open capture
1636 ** corresponding to a close
1637 */
findopen(Capture * cap)1638 static Capture *findopen (Capture *cap) {
1639   int n = 0;  /* number of closes waiting an open */
1640   for (;;) {
1641     cap--;
1642     if (isclosecap(cap)) n++;  /* one more open to skip */
1643     else if (!isfullcap(cap))
1644       if (n-- == 0) return cap;
1645   }
1646 }
1647 
1648 
1649 /*
1650 ** Go to the next capture
1651 */
nextcap(CapState * cs)1652 static void nextcap (CapState *cs) {
1653   Capture *cap = cs->cap;
1654   if (!isfullcap(cap)) {  /* not a single capture? */
1655     int n = 0;  /* number of opens waiting a close */
1656     for (;;) {  /* look for corresponding close */
1657       cap++;
1658       if (isclosecap(cap)) {
1659         if (n-- == 0) break;
1660       }
1661       else if (!isfullcap(cap)) n++;
1662     }
1663   }
1664   cs->cap = cap + 1;  /* + 1 to skip last close (or entire single capture) */
1665 }
1666 
1667 
1668 /*
1669 ** Push on the Lua stack all values generated by nested captures inside
1670 ** the current capture. Returns number of values pushed. 'addextra'
1671 ** makes it push the entire match after all captured values. The
1672 ** entire match is pushed also if there are no other nested values,
1673 ** so the function never returns zero.
1674 */
pushnestedvalues(CapState * cs,int addextra)1675 static int pushnestedvalues (CapState *cs, int addextra) {
1676   Capture *co = cs->cap;
1677   if (isfullcap(cs->cap++)) {  /* no nested captures? */
1678     lua_pushlstring(cs->L, co->s, co->siz - 1);  /* push whole match */
1679     return 1;  /* that is it */
1680   }
1681   else {
1682     int n = 0;
1683     while (!isclosecap(cs->cap))  /* repeat for all nested patterns */
1684       n += pushcapture(cs);
1685     if (addextra || n == 0) {  /* need extra? */
1686       lua_pushlstring(cs->L, co->s, cs->cap->s - co->s);  /* push whole match */
1687       n++;
1688     }
1689     cs->cap++;  /* skip close entry */
1690     return n;
1691   }
1692 }
1693 
1694 
1695 /*
1696 ** Push only the first value generated by nested captures
1697 */
pushonenestedvalue(CapState * cs)1698 static void pushonenestedvalue (CapState *cs) {
1699   int n = pushnestedvalues(cs, 0);
1700   if (n > 1)
1701     lua_pop(cs->L, n - 1);  /* pop extra values */
1702 }
1703 
1704 
1705 /*
1706 ** Try to find a named group capture with the name given at the top of
1707 ** the stack; goes backward from 'cap'.
1708 */
findback(CapState * cs,Capture * cap)1709 static Capture *findback (CapState *cs, Capture *cap) {
1710   lua_State *L = cs->L;
1711   while (cap-- > cs->ocap) {  /* repeat until end of list */
1712     if (isclosecap(cap))
1713       cap = findopen(cap);  /* skip nested captures */
1714     else if (!isfullcap(cap))
1715       continue; /* opening an enclosing capture: skip and get previous */
1716     if (captype(cap) == Cgroup) {
1717       getfromktable(cs, cap->idx);  /* get group name */
1718       if (lua_equal(L, -2, -1)) {  /* right group? */
1719         lua_pop(L, 2);  /* remove reference name and group name */
1720         return cap;
1721       }
1722       else lua_pop(L, 1);  /* remove group name */
1723     }
1724   }
1725   luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1));
1726   return NULL;  /* to avoid warnings */
1727 }
1728 
1729 
1730 /*
1731 ** Back-reference capture. Return number of values pushed.
1732 */
backrefcap(CapState * cs)1733 static int backrefcap (CapState *cs) {
1734   int n;
1735   Capture *curr = cs->cap;
1736   pushluaval(cs);  /* reference name */
1737   cs->cap = findback(cs, curr);  /* find corresponding group */
1738   n = pushnestedvalues(cs, 0);  /* push group's values */
1739   cs->cap = curr + 1;
1740   return n;
1741 }
1742 
1743 
1744 /*
1745 ** Table capture: creates a new table and populates it with nested
1746 ** captures.
1747 */
tablecap(CapState * cs)1748 static int tablecap (CapState *cs) {
1749   lua_State *L = cs->L;
1750   int n = 0;
1751   lua_newtable(L);
1752   if (isfullcap(cs->cap++))
1753     return 1;  /* table is empty */
1754   while (!isclosecap(cs->cap)) {
1755     if (captype(cs->cap) == Cgroup && cs->cap->idx != 0) {  /* named group? */
1756       pushluaval(cs);  /* push group name */
1757       pushonenestedvalue(cs);
1758       lua_settable(L, -3);
1759     }
1760     else {  /* not a named group */
1761       int i;
1762       int k = pushcapture(cs);
1763       for (i = k; i > 0; i--)  /* store all values into table */
1764         lua_rawseti(L, -(i + 1), n + i);
1765       n += k;
1766     }
1767   }
1768   cs->cap++;  /* skip close entry */
1769   return 1;  /* number of values pushed (only the table) */
1770 }
1771 
1772 
1773 /*
1774 ** Table-query capture
1775 */
querycap(CapState * cs)1776 static int querycap (CapState *cs) {
1777   int idx = cs->cap->idx;
1778   pushonenestedvalue(cs);  /* get nested capture */
1779   lua_gettable(cs->L, updatecache(cs, idx));  /* query cap. value at table */
1780   if (!lua_isnil(cs->L, -1))
1781     return 1;
1782   else {  /* no value */
1783     lua_pop(cs->L, 1);  /* remove nil */
1784     return 0;
1785   }
1786 }
1787 
1788 
1789 /*
1790 ** Fold capture
1791 */
foldcap(CapState * cs)1792 static int foldcap (CapState *cs) {
1793   int n;
1794   lua_State *L = cs->L;
1795   int idx = cs->cap->idx;
1796   if (isfullcap(cs->cap++) ||  /* no nested captures? */
1797       isclosecap(cs->cap) ||  /* no nested captures (large subject)? */
1798       (n = pushcapture(cs)) == 0)  /* nested captures with no values? */
1799     return luaL_error(L, "no initial value for fold capture");
1800   if (n > 1)
1801     lua_pop(L, n - 1);  /* leave only one result for accumulator */
1802   while (!isclosecap(cs->cap)) {
1803     lua_pushvalue(L, updatecache(cs, idx));  /* get folding function */
1804     lua_insert(L, -2);  /* put it before accumulator */
1805     n = pushcapture(cs);  /* get next capture's values */
1806     lua_call(L, n + 1, 1);  /* call folding function */
1807   }
1808   cs->cap++;  /* skip close entry */
1809   return 1;  /* only accumulator left on the stack */
1810 }
1811 
1812 
1813 /*
1814 ** Function capture
1815 */
functioncap(CapState * cs)1816 static int functioncap (CapState *cs) {
1817   int n;
1818   int top = lua_gettop(cs->L);
1819   pushluaval(cs);  /* push function */
1820   n = pushnestedvalues(cs, 0);  /* push nested captures */
1821   lua_call(cs->L, n, LUA_MULTRET);  /* call function */
1822   return lua_gettop(cs->L) - top;  /* return function's results */
1823 }
1824 
1825 
1826 /*
1827 ** Select capture
1828 */
numcap(CapState * cs)1829 static int numcap (CapState *cs) {
1830   int idx = cs->cap->idx;  /* value to select */
1831   if (idx == 0) {  /* no values? */
1832     nextcap(cs);  /* skip entire capture */
1833     return 0;  /* no value produced */
1834   }
1835   else {
1836     int n = pushnestedvalues(cs, 0);
1837     if (n < idx)  /* invalid index? */
1838       return luaL_error(cs->L, "no capture '%d'", idx);
1839     else {
1840       lua_pushvalue(cs->L, -(n - idx + 1));  /* get selected capture */
1841       lua_replace(cs->L, -(n + 1));  /* put it in place of 1st capture */
1842       lua_pop(cs->L, n - 1);  /* remove other captures */
1843       return 1;
1844     }
1845   }
1846 }
1847 
1848 
1849 /*
1850 ** Return the stack index of the first runtime capture in the given
1851 ** list of captures (or zero if no runtime captures)
1852 */
finddyncap(Capture * cap,Capture * last)1853 int finddyncap (Capture *cap, Capture *last) {
1854   for (; cap < last; cap++) {
1855     if (cap->kind == Cruntime)
1856       return cap->idx;  /* stack position of first capture */
1857   }
1858   return 0;  /* no dynamic captures in this segment */
1859 }
1860 
1861 
1862 /*
1863 ** Calls a runtime capture. Returns number of captures removed by
1864 ** the call, including the initial Cgroup. (Captures to be added are
1865 ** on the Lua stack.)
1866 */
runtimecap(CapState * cs,Capture * close,const char * s,int * rem)1867 int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) {
1868   int n, id;
1869   lua_State *L = cs->L;
1870   int otop = lua_gettop(L);
1871   Capture *open = findopen(close);
1872   assert(captype(open) == Cgroup);
1873   id = finddyncap(open, close);  /* get first dynamic capture argument */
1874   close->kind = Cclose;  /* closes the group */
1875   close->s = s;
1876   cs->cap = open; cs->valuecached = 0;  /* prepare capture state */
1877   luaL_checkstack(L, 4, "too many runtime captures");
1878   pushluaval(cs);  /* push function to be called */
1879   lua_pushvalue(L, SUBJIDX);  /* push original subject */
1880   lua_pushinteger(L, s - cs->s + 1);  /* push current position */
1881   n = pushnestedvalues(cs, 0);  /* push nested captures */
1882   lua_call(L, n + 2, LUA_MULTRET);  /* call dynamic function */
1883   if (id > 0) {  /* are there old dynamic captures to be removed? */
1884     int i;
1885     for (i = id; i <= otop; i++)
1886       lua_remove(L, id);  /* remove old dynamic captures */
1887     *rem = otop - id + 1;  /* total number of dynamic captures removed */
1888   }
1889   else
1890     *rem = 0;  /* no dynamic captures removed */
1891   return close - open;  /* number of captures of all kinds removed */
1892 }
1893 
1894 
1895 /*
1896 ** Auxiliary structure for substitution and string captures: keep
1897 ** information about nested captures for future use, avoiding to push
1898 ** string results into Lua
1899 */
1900 typedef struct StrAux {
1901   int isstring;  /* whether capture is a string */
1902   union {
1903     Capture *cp;  /* if not a string, respective capture */
1904     struct {  /* if it is a string... */
1905       const char *s;  /* ... starts here */
1906       const char *e;  /* ... ends here */
1907     } s;
1908   } u;
1909 } StrAux;
1910 
1911 #define MAXSTRCAPS	10
1912 
1913 /*
1914 ** Collect values from current capture into array 'cps'. Current
1915 ** capture must be Cstring (first call) or Csimple (recursive calls).
1916 ** (In first call, fills %0 with whole match for Cstring.)
1917 ** Returns number of elements in the array that were filled.
1918 */
getstrcaps(CapState * cs,StrAux * cps,int n)1919 static int getstrcaps (CapState *cs, StrAux *cps, int n) {
1920   int k = n++;
1921   cps[k].isstring = 1;  /* get string value */
1922   cps[k].u.s.s = cs->cap->s;  /* starts here */
1923   if (!isfullcap(cs->cap++)) {  /* nested captures? */
1924     while (!isclosecap(cs->cap)) {  /* traverse them */
1925       if (n >= MAXSTRCAPS)  /* too many captures? */
1926         nextcap(cs);  /* skip extra captures (will not need them) */
1927       else if (captype(cs->cap) == Csimple)  /* string? */
1928         n = getstrcaps(cs, cps, n);  /* put info. into array */
1929       else {
1930         cps[n].isstring = 0;  /* not a string */
1931         cps[n].u.cp = cs->cap;  /* keep original capture */
1932         nextcap(cs);
1933         n++;
1934       }
1935     }
1936     cs->cap++;  /* skip close */
1937   }
1938   cps[k].u.s.e = closeaddr(cs->cap - 1);  /* ends here */
1939   return n;
1940 }
1941 
1942 
1943 /*
1944 ** add next capture value (which should be a string) to buffer 'b'
1945 */
1946 static int addonestring (luaL_Buffer *b, CapState *cs, const char *what);
1947 
1948 
1949 /*
1950 ** String capture: add result to buffer 'b' (instead of pushing
1951 ** it into the stack)
1952 */
stringcap(luaL_Buffer * b,CapState * cs)1953 static void stringcap (luaL_Buffer *b, CapState *cs) {
1954   StrAux cps[MAXSTRCAPS];
1955   int n;
1956   size_t len, i;
1957   const char *fmt;  /* format string */
1958   fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len);
1959   n = getstrcaps(cs, cps, 0) - 1;  /* collect nested captures */
1960   for (i = 0; i < len; i++) {  /* traverse them */
1961     if (fmt[i] != '%')  /* not an escape? */
1962       luaL_addchar(b, fmt[i]);  /* add it to buffer */
1963     else if (fmt[++i] < '0' || fmt[i] > '9')  /* not followed by a digit? */
1964       luaL_addchar(b, fmt[i]);  /* add to buffer */
1965     else {
1966       int l = fmt[i] - '0';  /* capture index */
1967       if (l > n)
1968         luaL_error(cs->L, "invalid capture index (%d)", l);
1969       else if (cps[l].isstring)
1970         luaL_addlstring(b, cps[l].u.s.s, cps[l].u.s.e - cps[l].u.s.s);
1971       else {
1972         Capture *curr = cs->cap;
1973         cs->cap = cps[l].u.cp;  /* go back to evaluate that nested capture */
1974         if (!addonestring(b, cs, "capture"))
1975           luaL_error(cs->L, "no values in capture index %d", l);
1976         cs->cap = curr;  /* continue from where it stopped */
1977       }
1978     }
1979   }
1980 }
1981 
1982 
1983 /*
1984 ** Substitution capture: add result to buffer 'b'
1985 */
substcap(luaL_Buffer * b,CapState * cs)1986 static void substcap (luaL_Buffer *b, CapState *cs) {
1987   const char *curr = cs->cap->s;
1988   if (isfullcap(cs->cap))  /* no nested captures? */
1989     luaL_addlstring(b, curr, cs->cap->siz - 1);  /* keep original text */
1990   else {
1991     cs->cap++;  /* skip open entry */
1992     while (!isclosecap(cs->cap)) {  /* traverse nested captures */
1993       const char *next = cs->cap->s;
1994       luaL_addlstring(b, curr, next - curr);  /* add text up to capture */
1995       if (addonestring(b, cs, "replacement"))
1996         curr = closeaddr(cs->cap - 1);  /* continue after match */
1997       else  /* no capture value */
1998         curr = next;  /* keep original text in final result */
1999     }
2000     luaL_addlstring(b, curr, cs->cap->s - curr);  /* add last piece of text */
2001   }
2002   cs->cap++;  /* go to next capture */
2003 }
2004 
2005 
2006 /*
2007 ** Evaluates a capture and adds its first value to buffer 'b'; returns
2008 ** whether there was a value
2009 */
addonestring(luaL_Buffer * b,CapState * cs,const char * what)2010 static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) {
2011   switch (captype(cs->cap)) {
2012     case Cstring:
2013       stringcap(b, cs);  /* add capture directly to buffer */
2014       return 1;
2015     case Csubst:
2016       substcap(b, cs);  /* add capture directly to buffer */
2017       return 1;
2018     default: {
2019       lua_State *L = cs->L;
2020       int n = pushcapture(cs);
2021       if (n > 0) {
2022         if (n > 1) lua_pop(L, n - 1);  /* only one result */
2023         if (!lua_isstring(L, -1))
2024           luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1));
2025         luaL_addvalue(b);
2026       }
2027       return n;
2028     }
2029   }
2030 }
2031 
2032 
2033 /*
2034 ** Push all values of the current capture into the stack; returns
2035 ** number of values pushed
2036 */
pushcapture(CapState * cs)2037 static int pushcapture (CapState *cs) {
2038   lua_State *L = cs->L;
2039   luaL_checkstack(L, 4, "too many captures");
2040   switch (captype(cs->cap)) {
2041     case Cposition: {
2042       lua_pushinteger(L, cs->cap->s - cs->s + 1);
2043       cs->cap++;
2044       return 1;
2045     }
2046     case Cconst: {
2047       pushluaval(cs);
2048       cs->cap++;
2049       return 1;
2050     }
2051     case Carg: {
2052       int arg = (cs->cap++)->idx;
2053       if (arg + FIXEDARGS > cs->ptop)
2054         return luaL_error(L, "reference to absent extra argument #%d", arg);
2055       lua_pushvalue(L, arg + FIXEDARGS);
2056       return 1;
2057     }
2058     case Csimple: {
2059       int k = pushnestedvalues(cs, 1);
2060       lua_insert(L, -k);  /* make whole match be first result */
2061       return k;
2062     }
2063     case Cruntime: {
2064       lua_pushvalue(L, (cs->cap++)->idx);  /* value is in the stack */
2065       return 1;
2066     }
2067     case Cstring: {
2068       luaL_Buffer b;
2069       luaL_buffinit(L, &b);
2070       stringcap(&b, cs);
2071       luaL_pushresult(&b);
2072       return 1;
2073     }
2074     case Csubst: {
2075       luaL_Buffer b;
2076       luaL_buffinit(L, &b);
2077       substcap(&b, cs);
2078       luaL_pushresult(&b);
2079       return 1;
2080     }
2081     case Cgroup: {
2082       if (cs->cap->idx == 0)  /* anonymous group? */
2083         return pushnestedvalues(cs, 0);  /* add all nested values */
2084       else {  /* named group: add no values */
2085         nextcap(cs);  /* skip capture */
2086         return 0;
2087       }
2088     }
2089     case Cbackref: return backrefcap(cs);
2090     case Ctable: return tablecap(cs);
2091     case Cfunction: return functioncap(cs);
2092     case Cnum: return numcap(cs);
2093     case Cquery: return querycap(cs);
2094     case Cfold: return foldcap(cs);
2095     default: assert(0); return 0;
2096   }
2097 }
2098 
2099 
2100 /*
2101 ** Prepare a CapState structure and traverse the entire list of
2102 ** captures in the stack pushing its results. 's' is the subject
2103 ** string, 'r' is the final position of the match, and 'ptop'
2104 ** the index in the stack where some useful values were pushed.
2105 ** Returns the number of results pushed. (If the list produces no
2106 ** results, push the final position of the match.)
2107 */
getcaptures(lua_State * L,const char * s,const char * r,int ptop)2108 int getcaptures (lua_State *L, const char *s, const char *r, int ptop) {
2109   Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop));
2110   int n = 0;
2111   if (!isclosecap(capture)) {  /* is there any capture? */
2112     CapState cs;
2113     cs.ocap = cs.cap = capture; cs.L = L;
2114     cs.s = s; cs.valuecached = 0; cs.ptop = ptop;
2115     do {  /* collect their values */
2116       n += pushcapture(&cs);
2117     } while (!isclosecap(cs.cap));
2118   }
2119   if (n == 0) {  /* no capture values? */
2120     lua_pushinteger(L, r - s + 1);  /* return only end position */
2121     n = 1;
2122   }
2123   return n;
2124 }
2125 
2126 
2127 /*
2128 ** $Id: lptree.c,v 1.15 2015/03/04 17:23:00 roberto Exp $
2129 ** Copyright 2013, Lua.org & PUC-Rio  (see 'lpeg.html' for license)
2130 */
2131 
2132 /* #include <ctype.h>*/
2133 /* #include <limits.h>*/
2134 /* #include <string.h>*/
2135 
2136 
2137 /* #include "lua.h"*/
2138 /* #include "lauxlib.h"*/
2139 
2140 /* #include "lptypes.h"*/
2141 /* #include "lpcap.h"*/
2142 /* #include "lpcode.h"*/
2143 /* #include "lpprint.h"*/
2144 /* #include "lptree.h"*/
2145 
2146 
2147 /* number of siblings for each tree */
2148 const byte numsiblings[] = {
2149   0, 0, 0,	/* char, set, any */
2150   0, 0,		/* true, false */
2151   1,		/* rep */
2152   2, 2,		/* seq, choice */
2153   1, 1,		/* not, and */
2154   0, 0, 2, 1,  /* call, opencall, rule, grammar */
2155   1,  /* behind */
2156   1, 1  /* capture, runtime capture */
2157 };
2158 
2159 
2160 static TTree *newgrammar (lua_State *L, int arg);
2161 
2162 
2163 /*
2164 ** returns a reasonable name for value at index 'idx' on the stack
2165 */
val2str(lua_State * L,int idx)2166 static const char *val2str (lua_State *L, int idx) {
2167   const char *k = lua_tostring(L, idx);
2168   if (k != NULL)
2169     return lua_pushfstring(L, "%s", k);
2170   else
2171     return lua_pushfstring(L, "(a %s)", luaL_typename(L, idx));
2172 }
2173 
2174 
2175 /*
2176 ** Fix a TOpenCall into a TCall node, using table 'postable' to
2177 ** translate a key to its rule address in the tree. Raises an
2178 ** error if key does not exist.
2179 */
fixonecall(lua_State * L,int postable,TTree * g,TTree * t)2180 static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) {
2181   int n;
2182   lua_rawgeti(L, -1, t->key);  /* get rule's name */
2183   lua_gettable(L, postable);  /* query name in position table */
2184   n = lua_tonumber(L, -1);  /* get (absolute) position */
2185   lua_pop(L, 1);  /* remove position */
2186   if (n == 0) {  /* no position? */
2187     lua_rawgeti(L, -1, t->key);  /* get rule's name again */
2188     luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1));
2189   }
2190   t->tag = TCall;
2191   t->u.ps = n - (t - g);  /* position relative to node */
2192   assert(sib2(t)->tag == TRule);
2193   sib2(t)->key = t->key;
2194 }
2195 
2196 
2197 /*
2198 ** Transform left associative constructions into right
2199 ** associative ones, for sequence and choice; that is:
2200 ** (t11 + t12) + t2  =>  t11 + (t12 + t2)
2201 ** (t11 * t12) * t2  =>  t11 * (t12 * t2)
2202 ** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2))
2203 */
correctassociativity(TTree * tree)2204 static void correctassociativity (TTree *tree) {
2205   TTree *t1 = sib1(tree);
2206   assert(tree->tag == TChoice || tree->tag == TSeq);
2207   while (t1->tag == tree->tag) {
2208     int n1size = tree->u.ps - 1;  /* t1 == Op t11 t12 */
2209     int n11size = t1->u.ps - 1;
2210     int n12size = n1size - n11size - 1;
2211     memmove(sib1(tree), sib1(t1), n11size * sizeof(TTree)); /* move t11 */
2212     tree->u.ps = n11size + 1;
2213     sib2(tree)->tag = tree->tag;
2214     sib2(tree)->u.ps = n12size + 1;
2215   }
2216 }
2217 
2218 
2219 /*
2220 ** Make final adjustments in a tree. Fix open calls in tree 't',
2221 ** making them refer to their respective rules or raising appropriate
2222 ** errors (if not inside a grammar). Correct associativity of associative
2223 ** constructions (making them right associative). Assume that tree's
2224 ** ktable is at the top of the stack (for error messages).
2225 */
finalfix(lua_State * L,int postable,TTree * g,TTree * t)2226 static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) {
2227  tailcall:
2228   switch (t->tag) {
2229     case TGrammar:  /* subgrammars were already fixed */
2230       return;
2231     case TOpenCall: {
2232       if (g != NULL)  /* inside a grammar? */
2233         fixonecall(L, postable, g, t);
2234       else {  /* open call outside grammar */
2235         lua_rawgeti(L, -1, t->key);
2236         luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1));
2237       }
2238       break;
2239     }
2240     case TSeq: case TChoice:
2241       correctassociativity(t);
2242       break;
2243   }
2244   switch (numsiblings[t->tag]) {
2245     case 1: /* finalfix(L, postable, g, sib1(t)); */
2246       t = sib1(t); goto tailcall;
2247     case 2:
2248       finalfix(L, postable, g, sib1(t));
2249       t = sib2(t); goto tailcall;  /* finalfix(L, postable, g, sib2(t)); */
2250     default: assert(numsiblings[t->tag] == 0); break;
2251   }
2252 }
2253 
2254 
2255 
2256 /*
2257 ** {===================================================================
2258 ** KTable manipulation
2259 **
2260 ** - The ktable of a pattern 'p' can be shared by other patterns that
2261 ** contain 'p' and no other constants. Because of this sharing, we
2262 ** should not add elements to a 'ktable' unless it was freshly created
2263 ** for the new pattern.
2264 **
2265 ** - The maximum index in a ktable is USHRT_MAX, because trees and
2266 ** patterns use unsigned shorts to store those indices.
2267 ** ====================================================================
2268 */
2269 
2270 /*
2271 ** Create a new 'ktable' to the pattern at the top of the stack.
2272 */
newktable(lua_State * L,int n)2273 static void newktable (lua_State *L, int n) {
2274   lua_createtable(L, n, 0);  /* create a fresh table */
2275   lua_setfenv(L, -2);  /* set it as 'ktable' for pattern */
2276 }
2277 
2278 
2279 /*
2280 ** Add element 'idx' to 'ktable' of pattern at the top of the stack;
2281 ** Return index of new element.
2282 ** If new element is nil, does not add it to table (as it would be
2283 ** useless) and returns 0, as ktable[0] is always nil.
2284 */
addtoktable(lua_State * L,int idx)2285 static int addtoktable (lua_State *L, int idx) {
2286   if (lua_isnil(L, idx))  /* nil value? */
2287     return 0;
2288   else {
2289     int n;
2290     lua_getfenv(L, -1);  /* get ktable from pattern */
2291     n = lua_objlen(L, -1);
2292     if (n >= USHRT_MAX)
2293       luaL_error(L, "too many Lua values in pattern");
2294     lua_pushvalue(L, idx);  /* element to be added */
2295     lua_rawseti(L, -2, ++n);
2296     lua_pop(L, 1);  /* remove 'ktable' */
2297     return n;
2298   }
2299 }
2300 
2301 
2302 /*
2303 ** Return the number of elements in the ktable at 'idx'.
2304 ** In Lua 5.2/5.3, default "environment" for patterns is nil, not
2305 ** a table. Treat it as an empty table. In Lua 5.1, assumes that
2306 ** the environment has no numeric indices (len == 0)
2307 */
ktablelen(lua_State * L,int idx)2308 static int ktablelen (lua_State *L, int idx) {
2309   if (!lua_istable(L, idx)) return 0;
2310   else return lua_objlen(L, idx);
2311 }
2312 
2313 
2314 /*
2315 ** Concatentate the contents of table 'idx1' into table 'idx2'.
2316 ** (Assume that both indices are negative.)
2317 ** Return the original length of table 'idx2' (or 0, if no
2318 ** element was added, as there is no need to correct any index).
2319 */
concattable(lua_State * L,int idx1,int idx2)2320 static int concattable (lua_State *L, int idx1, int idx2) {
2321   int i;
2322   int n1 = ktablelen(L, idx1);
2323   int n2 = ktablelen(L, idx2);
2324   if (n1 + n2 > USHRT_MAX)
2325     luaL_error(L, "too many Lua values in pattern");
2326   if (n1 == 0) return 0;  /* nothing to correct */
2327   for (i = 1; i <= n1; i++) {
2328     lua_rawgeti(L, idx1, i);
2329     lua_rawseti(L, idx2 - 1, n2 + i);  /* correct 'idx2' */
2330   }
2331   return n2;
2332 }
2333 
2334 
2335 /*
2336 ** When joining 'ktables', constants from one of the subpatterns must
2337 ** be renumbered; 'correctkeys' corrects their indices (adding 'n'
2338 ** to each of them)
2339 */
correctkeys(TTree * tree,int n)2340 static void correctkeys (TTree *tree, int n) {
2341   if (n == 0) return;  /* no correction? */
2342  tailcall:
2343   switch (tree->tag) {
2344     case TOpenCall: case TCall: case TRunTime: case TRule: {
2345       if (tree->key > 0)
2346         tree->key += n;
2347       break;
2348     }
2349     case TCapture: {
2350       if (tree->key > 0 && tree->cap != Carg && tree->cap != Cnum)
2351         tree->key += n;
2352       break;
2353     }
2354     default: break;
2355   }
2356   switch (numsiblings[tree->tag]) {
2357     case 1:  /* correctkeys(sib1(tree), n); */
2358       tree = sib1(tree); goto tailcall;
2359     case 2:
2360       correctkeys(sib1(tree), n);
2361       tree = sib2(tree); goto tailcall;  /* correctkeys(sib2(tree), n); */
2362     default: assert(numsiblings[tree->tag] == 0); break;
2363   }
2364 }
2365 
2366 
2367 /*
2368 ** Join the ktables from p1 and p2 the ktable for the new pattern at the
2369 ** top of the stack, reusing them when possible.
2370 */
joinktables(lua_State * L,int p1,TTree * t2,int p2)2371 static void joinktables (lua_State *L, int p1, TTree *t2, int p2) {
2372   int n1, n2;
2373   lua_getfenv(L, p1);  /* get ktables */
2374   lua_getfenv(L, p2);
2375   n1 = ktablelen(L, -2);
2376   n2 = ktablelen(L, -1);
2377   if (n1 == 0 && n2 == 0)  /* are both tables empty? */
2378     lua_pop(L, 2);  /* nothing to be done; pop tables */
2379   else if (n2 == 0 || lua_equal(L, -2, -1)) {  /* 2nd table empty or equal? */
2380     lua_pop(L, 1);  /* pop 2nd table */
2381     lua_setfenv(L, -2);  /* set 1st ktable into new pattern */
2382   }
2383   else if (n1 == 0) {  /* first table is empty? */
2384     lua_setfenv(L, -3);  /* set 2nd table into new pattern */
2385     lua_pop(L, 1);  /* pop 1st table */
2386   }
2387   else {
2388     lua_createtable(L, n1 + n2, 0);  /* create ktable for new pattern */
2389     /* stack: new p; ktable p1; ktable p2; new ktable */
2390     concattable(L, -3, -1);  /* from p1 into new ktable */
2391     concattable(L, -2, -1);  /* from p2 into new ktable */
2392     lua_setfenv(L, -4);  /* new ktable becomes 'p' environment */
2393     lua_pop(L, 2);  /* pop other ktables */
2394     correctkeys(t2, n1);  /* correction for indices from p2 */
2395   }
2396 }
2397 
2398 
2399 /*
2400 ** copy 'ktable' of element 'idx' to new tree (on top of stack)
2401 */
copyktable(lua_State * L,int idx)2402 static void copyktable (lua_State *L, int idx) {
2403   lua_getfenv(L, idx);
2404   lua_setfenv(L, -2);
2405 }
2406 
2407 
2408 /*
2409 ** merge 'ktable' from 'stree' at stack index 'idx' into 'ktable'
2410 ** from tree at the top of the stack, and correct corresponding
2411 ** tree.
2412 */
mergektable(lua_State * L,int idx,TTree * stree)2413 static void mergektable (lua_State *L, int idx, TTree *stree) {
2414   int n;
2415   lua_getfenv(L, -1);  /* get ktables */
2416   lua_getfenv(L, idx);
2417   n = concattable(L, -1, -2);
2418   lua_pop(L, 2);  /* remove both ktables */
2419   correctkeys(stree, n);
2420 }
2421 
2422 
2423 /*
2424 ** Create a new 'ktable' to the pattern at the top of the stack, adding
2425 ** all elements from pattern 'p' (if not 0) plus element 'idx' to it.
2426 ** Return index of new element.
2427 */
addtonewktable(lua_State * L,int p,int idx)2428 static int addtonewktable (lua_State *L, int p, int idx) {
2429   newktable(L, 1);
2430   if (p)
2431     mergektable(L, p, NULL);
2432   return addtoktable(L, idx);
2433 }
2434 
2435 /* }====================================================== */
2436 
2437 
2438 /*
2439 ** {======================================================
2440 ** Tree generation
2441 ** =======================================================
2442 */
2443 
2444 /*
2445 ** In 5.2, could use 'luaL_testudata'...
2446 */
testpattern(lua_State * L,int idx)2447 static int testpattern (lua_State *L, int idx) {
2448   if (lua_touserdata(L, idx)) {  /* value is a userdata? */
2449     if (lua_getmetatable(L, idx)) {  /* does it have a metatable? */
2450       luaL_getmetatable(L, PATTERN_T);
2451       if (lua_rawequal(L, -1, -2)) {  /* does it have the correct mt? */
2452         lua_pop(L, 2);  /* remove both metatables */
2453         return 1;
2454       }
2455     }
2456   }
2457   return 0;
2458 }
2459 
2460 
getpattern(lua_State * L,int idx)2461 static Pattern *getpattern (lua_State *L, int idx) {
2462   return (Pattern *)luaL_checkudata(L, idx, PATTERN_T);
2463 }
2464 
2465 
getsize(lua_State * L,int idx)2466 static int getsize (lua_State *L, int idx) {
2467   return (lua_objlen(L, idx) - sizeof(Pattern)) / sizeof(TTree) + 1;
2468 }
2469 
2470 
gettree(lua_State * L,int idx,int * len)2471 static TTree *gettree (lua_State *L, int idx, int *len) {
2472   Pattern *p = getpattern(L, idx);
2473   if (len)
2474     *len = getsize(L, idx);
2475   return p->tree;
2476 }
2477 
2478 
2479 /*
2480 ** create a pattern
2481 */
newtree(lua_State * L,int len)2482 static TTree *newtree (lua_State *L, int len) {
2483   size_t size = (len - 1) * sizeof(TTree) + sizeof(Pattern);
2484   Pattern *p = (Pattern *)lua_newuserdata(L, size);
2485   luaL_getmetatable(L, PATTERN_T);
2486   lua_setmetatable(L, -2);
2487   p->code = NULL;  p->codesize = 0;
2488   return p->tree;
2489 }
2490 
2491 
newleaf(lua_State * L,int tag)2492 static TTree *newleaf (lua_State *L, int tag) {
2493   TTree *tree = newtree(L, 1);
2494   tree->tag = tag;
2495   return tree;
2496 }
2497 
2498 
newcharset(lua_State * L)2499 static TTree *newcharset (lua_State *L) {
2500   TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1);
2501   tree->tag = TSet;
2502   loopset(i, treebuffer(tree)[i] = 0);
2503   return tree;
2504 }
2505 
2506 
2507 /*
2508 ** add to tree a sequence where first sibling is 'sib' (with size
2509 ** 'sibsize'); returns position for second sibling
2510 */
seqaux(TTree * tree,TTree * sib,int sibsize)2511 static TTree *seqaux (TTree *tree, TTree *sib, int sibsize) {
2512   tree->tag = TSeq; tree->u.ps = sibsize + 1;
2513   memcpy(sib1(tree), sib, sibsize * sizeof(TTree));
2514   return sib2(tree);
2515 }
2516 
2517 
2518 /*
2519 ** Build a sequence of 'n' nodes, each with tag 'tag' and 'u.n' got
2520 ** from the array 's' (or 0 if array is NULL). (TSeq is binary, so it
2521 ** must build a sequence of sequence of sequence...)
2522 */
fillseq(TTree * tree,int tag,int n,const char * s)2523 static void fillseq (TTree *tree, int tag, int n, const char *s) {
2524   int i;
2525   for (i = 0; i < n - 1; i++) {  /* initial n-1 copies of Seq tag; Seq ... */
2526     tree->tag = TSeq; tree->u.ps = 2;
2527     sib1(tree)->tag = tag;
2528     sib1(tree)->u.n = s ? (byte)s[i] : 0;
2529     tree = sib2(tree);
2530   }
2531   tree->tag = tag;  /* last one does not need TSeq */
2532   tree->u.n = s ? (byte)s[i] : 0;
2533 }
2534 
2535 
2536 /*
2537 ** Numbers as patterns:
2538 ** 0 == true (always match); n == TAny repeated 'n' times;
2539 ** -n == not (TAny repeated 'n' times)
2540 */
numtree(lua_State * L,int n)2541 static TTree *numtree (lua_State *L, int n) {
2542   if (n == 0)
2543     return newleaf(L, TTrue);
2544   else {
2545     TTree *tree, *nd;
2546     if (n > 0)
2547       tree = nd = newtree(L, 2 * n - 1);
2548     else {  /* negative: code it as !(-n) */
2549       n = -n;
2550       tree = newtree(L, 2 * n);
2551       tree->tag = TNot;
2552       nd = sib1(tree);
2553     }
2554     fillseq(nd, TAny, n, NULL);  /* sequence of 'n' any's */
2555     return tree;
2556   }
2557 }
2558 
2559 
2560 /*
2561 ** Convert value at index 'idx' to a pattern
2562 */
getpatt(lua_State * L,int idx,int * len)2563 static TTree *getpatt (lua_State *L, int idx, int *len) {
2564   TTree *tree;
2565   switch (lua_type(L, idx)) {
2566     case LUA_TSTRING: {
2567       size_t slen;
2568       const char *s = lua_tolstring(L, idx, &slen);  /* get string */
2569       if (slen == 0)  /* empty? */
2570         tree = newleaf(L, TTrue);  /* always match */
2571       else {
2572         tree = newtree(L, 2 * (slen - 1) + 1);
2573         fillseq(tree, TChar, slen, s);  /* sequence of 'slen' chars */
2574       }
2575       break;
2576     }
2577     case LUA_TNUMBER: {
2578       int n = lua_tointeger(L, idx);
2579       tree = numtree(L, n);
2580       break;
2581     }
2582     case LUA_TBOOLEAN: {
2583       tree = (lua_toboolean(L, idx) ? newleaf(L, TTrue) : newleaf(L, TFalse));
2584       break;
2585     }
2586     case LUA_TTABLE: {
2587       tree = newgrammar(L, idx);
2588       break;
2589     }
2590     case LUA_TFUNCTION: {
2591       tree = newtree(L, 2);
2592       tree->tag = TRunTime;
2593       tree->key = addtonewktable(L, 0, idx);
2594       sib1(tree)->tag = TTrue;
2595       break;
2596     }
2597     default: {
2598       return gettree(L, idx, len);
2599     }
2600   }
2601   lua_replace(L, idx);  /* put new tree into 'idx' slot */
2602   if (len)
2603     *len = getsize(L, idx);
2604   return tree;
2605 }
2606 
2607 
2608 /*
2609 ** create a new tree, whith a new root and one sibling.
2610 ** Sibling must be on the Lua stack, at index 1.
2611 */
newroot1sib(lua_State * L,int tag)2612 static TTree *newroot1sib (lua_State *L, int tag) {
2613   int s1;
2614   TTree *tree1 = getpatt(L, 1, &s1);
2615   TTree *tree = newtree(L, 1 + s1);  /* create new tree */
2616   tree->tag = tag;
2617   memcpy(sib1(tree), tree1, s1 * sizeof(TTree));
2618   copyktable(L, 1);
2619   return tree;
2620 }
2621 
2622 
2623 /*
2624 ** create a new tree, whith a new root and 2 siblings.
2625 ** Siblings must be on the Lua stack, first one at index 1.
2626 */
newroot2sib(lua_State * L,int tag)2627 static TTree *newroot2sib (lua_State *L, int tag) {
2628   int s1, s2;
2629   TTree *tree1 = getpatt(L, 1, &s1);
2630   TTree *tree2 = getpatt(L, 2, &s2);
2631   TTree *tree = newtree(L, 1 + s1 + s2);  /* create new tree */
2632   tree->tag = tag;
2633   tree->u.ps =  1 + s1;
2634   memcpy(sib1(tree), tree1, s1 * sizeof(TTree));
2635   memcpy(sib2(tree), tree2, s2 * sizeof(TTree));
2636   joinktables(L, 1, sib2(tree), 2);
2637   return tree;
2638 }
2639 
2640 
lp_P(lua_State * L)2641 static int lp_P (lua_State *L) {
2642   luaL_checkany(L, 1);
2643   getpatt(L, 1, NULL);
2644   lua_settop(L, 1);
2645   return 1;
2646 }
2647 
2648 
2649 /*
2650 ** sequence operator; optimizations:
2651 ** false x => false, x true => x, true x => x
2652 ** (cannot do x . false => false because x may have runtime captures)
2653 */
lp_seq(lua_State * L)2654 static int lp_seq (lua_State *L) {
2655   TTree *tree1 = getpatt(L, 1, NULL);
2656   TTree *tree2 = getpatt(L, 2, NULL);
2657   if (tree1->tag == TFalse || tree2->tag == TTrue)
2658     lua_pushvalue(L, 1);  /* false . x == false, x . true = x */
2659   else if (tree1->tag == TTrue)
2660     lua_pushvalue(L, 2);  /* true . x = x */
2661   else
2662     newroot2sib(L, TSeq);
2663   return 1;
2664 }
2665 
2666 
2667 /*
2668 ** choice operator; optimizations:
2669 ** charset / charset => charset
2670 ** true / x => true, x / false => x, false / x => x
2671 ** (x / true is not equivalent to true)
2672 */
lp_choice(lua_State * L)2673 static int lp_choice (lua_State *L) {
2674   Charset st1, st2;
2675   TTree *t1 = getpatt(L, 1, NULL);
2676   TTree *t2 = getpatt(L, 2, NULL);
2677   if (tocharset(t1, &st1) && tocharset(t2, &st2)) {
2678     TTree *t = newcharset(L);
2679     loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]);
2680   }
2681   else if (nofail(t1) || t2->tag == TFalse)
2682     lua_pushvalue(L, 1);  /* true / x => true, x / false => x */
2683   else if (t1->tag == TFalse)
2684     lua_pushvalue(L, 2);  /* false / x => x */
2685   else
2686     newroot2sib(L, TChoice);
2687   return 1;
2688 }
2689 
2690 
2691 /*
2692 ** p^n
2693 */
lp_star(lua_State * L)2694 static int lp_star (lua_State *L) {
2695   int size1;
2696   int n = (int)luaL_checkinteger(L, 2);
2697   TTree *tree1 = getpatt(L, 1, &size1);
2698   if (n >= 0) {  /* seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) */
2699     TTree *tree = newtree(L, (n + 1) * (size1 + 1));
2700     if (nullable(tree1))
2701       luaL_error(L, "loop body may accept empty string");
2702     while (n--)  /* repeat 'n' times */
2703       tree = seqaux(tree, tree1, size1);
2704     tree->tag = TRep;
2705     memcpy(sib1(tree), tree1, size1 * sizeof(TTree));
2706   }
2707   else {  /* choice (seq tree1 ... choice tree1 true ...) true */
2708     TTree *tree;
2709     n = -n;
2710     /* size = (choice + seq + tree1 + true) * n, but the last has no seq */
2711     tree = newtree(L, n * (size1 + 3) - 1);
2712     for (; n > 1; n--) {  /* repeat (n - 1) times */
2713       tree->tag = TChoice; tree->u.ps = n * (size1 + 3) - 2;
2714       sib2(tree)->tag = TTrue;
2715       tree = sib1(tree);
2716       tree = seqaux(tree, tree1, size1);
2717     }
2718     tree->tag = TChoice; tree->u.ps = size1 + 1;
2719     sib2(tree)->tag = TTrue;
2720     memcpy(sib1(tree), tree1, size1 * sizeof(TTree));
2721   }
2722   copyktable(L, 1);
2723   return 1;
2724 }
2725 
2726 
2727 /*
2728 ** #p == &p
2729 */
lp_and(lua_State * L)2730 static int lp_and (lua_State *L) {
2731   newroot1sib(L, TAnd);
2732   return 1;
2733 }
2734 
2735 
2736 /*
2737 ** -p == !p
2738 */
lp_not(lua_State * L)2739 static int lp_not (lua_State *L) {
2740   newroot1sib(L, TNot);
2741   return 1;
2742 }
2743 
2744 
2745 /*
2746 ** [t1 - t2] == Seq (Not t2) t1
2747 ** If t1 and t2 are charsets, make their difference.
2748 */
lp_sub(lua_State * L)2749 static int lp_sub (lua_State *L) {
2750   Charset st1, st2;
2751   int s1, s2;
2752   TTree *t1 = getpatt(L, 1, &s1);
2753   TTree *t2 = getpatt(L, 2, &s2);
2754   if (tocharset(t1, &st1) && tocharset(t2, &st2)) {
2755     TTree *t = newcharset(L);
2756     loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]);
2757   }
2758   else {
2759     TTree *tree = newtree(L, 2 + s1 + s2);
2760     tree->tag = TSeq;  /* sequence of... */
2761     tree->u.ps =  2 + s2;
2762     sib1(tree)->tag = TNot;  /* ...not... */
2763     memcpy(sib1(sib1(tree)), t2, s2 * sizeof(TTree));  /* ...t2 */
2764     memcpy(sib2(tree), t1, s1 * sizeof(TTree));  /* ... and t1 */
2765     joinktables(L, 1, sib1(tree), 2);
2766   }
2767   return 1;
2768 }
2769 
2770 
lp_set(lua_State * L)2771 static int lp_set (lua_State *L) {
2772   size_t l;
2773   const char *s = luaL_checklstring(L, 1, &l);
2774   TTree *tree = newcharset(L);
2775   while (l--) {
2776     setchar(treebuffer(tree), (byte)(*s));
2777     s++;
2778   }
2779   return 1;
2780 }
2781 
2782 
lp_range(lua_State * L)2783 static int lp_range (lua_State *L) {
2784   int arg;
2785   int top = lua_gettop(L);
2786   TTree *tree = newcharset(L);
2787   for (arg = 1; arg <= top; arg++) {
2788     int c;
2789     size_t l;
2790     const char *r = luaL_checklstring(L, arg, &l);
2791     luaL_argcheck(L, l == 2, arg, "range must have two characters");
2792     for (c = (byte)r[0]; c <= (byte)r[1]; c++)
2793       setchar(treebuffer(tree), c);
2794   }
2795   return 1;
2796 }
2797 
2798 
2799 /*
2800 ** Look-behind predicate
2801 */
lp_behind(lua_State * L)2802 static int lp_behind (lua_State *L) {
2803   TTree *tree;
2804   TTree *tree1 = getpatt(L, 1, NULL);
2805   int n = fixedlen(tree1);
2806   luaL_argcheck(L, n > 0, 1, "pattern may not have fixed length");
2807   luaL_argcheck(L, !hascaptures(tree1), 1, "pattern have captures");
2808   luaL_argcheck(L, n <= MAXBEHIND, 1, "pattern too long to look behind");
2809   tree = newroot1sib(L, TBehind);
2810   tree->u.n = n;
2811   return 1;
2812 }
2813 
2814 
2815 /*
2816 ** Create a non-terminal
2817 */
lp_V(lua_State * L)2818 static int lp_V (lua_State *L) {
2819   TTree *tree = newleaf(L, TOpenCall);
2820   luaL_argcheck(L, !lua_isnoneornil(L, 1), 1, "non-nil value expected");
2821   tree->key = addtonewktable(L, 0, 1);
2822   return 1;
2823 }
2824 
2825 
2826 /*
2827 ** Create a tree for a non-empty capture, with a body and
2828 ** optionally with an associated Lua value (at index 'labelidx' in the
2829 ** stack)
2830 */
capture_aux(lua_State * L,int cap,int labelidx)2831 static int capture_aux (lua_State *L, int cap, int labelidx) {
2832   TTree *tree = newroot1sib(L, TCapture);
2833   tree->cap = cap;
2834   tree->key = (labelidx == 0) ? 0 : addtonewktable(L, 1, labelidx);
2835   return 1;
2836 }
2837 
2838 
2839 /*
2840 ** Fill a tree with an empty capture, using an empty (TTrue) sibling.
2841 */
auxemptycap(TTree * tree,int cap)2842 static TTree *auxemptycap (TTree *tree, int cap) {
2843   tree->tag = TCapture;
2844   tree->cap = cap;
2845   sib1(tree)->tag = TTrue;
2846   return tree;
2847 }
2848 
2849 
2850 /*
2851 ** Create a tree for an empty capture
2852 */
newemptycap(lua_State * L,int cap)2853 static TTree *newemptycap (lua_State *L, int cap) {
2854   return auxemptycap(newtree(L, 2), cap);
2855 }
2856 
2857 
2858 /*
2859 ** Create a tree for an empty capture with an associated Lua value
2860 */
newemptycapkey(lua_State * L,int cap,int idx)2861 static TTree *newemptycapkey (lua_State *L, int cap, int idx) {
2862   TTree *tree = auxemptycap(newtree(L, 2), cap);
2863   tree->key = addtonewktable(L, 0, idx);
2864   return tree;
2865 }
2866 
2867 
2868 /*
2869 ** Captures with syntax p / v
2870 ** (function capture, query capture, string capture, or number capture)
2871 */
lp_divcapture(lua_State * L)2872 static int lp_divcapture (lua_State *L) {
2873   switch (lua_type(L, 2)) {
2874     case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2);
2875     case LUA_TTABLE: return capture_aux(L, Cquery, 2);
2876     case LUA_TSTRING: return capture_aux(L, Cstring, 2);
2877     case LUA_TNUMBER: {
2878       int n = lua_tointeger(L, 2);
2879       TTree *tree = newroot1sib(L, TCapture);
2880       luaL_argcheck(L, 0 <= n && n <= SHRT_MAX, 1, "invalid number");
2881       tree->cap = Cnum;
2882       tree->key = n;
2883       return 1;
2884     }
2885     default: return luaL_argerror(L, 2, "invalid replacement value");
2886   }
2887 }
2888 
2889 
lp_substcapture(lua_State * L)2890 static int lp_substcapture (lua_State *L) {
2891   return capture_aux(L, Csubst, 0);
2892 }
2893 
2894 
lp_tablecapture(lua_State * L)2895 static int lp_tablecapture (lua_State *L) {
2896   return capture_aux(L, Ctable, 0);
2897 }
2898 
2899 
lp_groupcapture(lua_State * L)2900 static int lp_groupcapture (lua_State *L) {
2901   if (lua_isnoneornil(L, 2))
2902     return capture_aux(L, Cgroup, 0);
2903   else {
2904     luaL_checkstring(L, 2);
2905     return capture_aux(L, Cgroup, 2);
2906   }
2907 }
2908 
2909 
lp_foldcapture(lua_State * L)2910 static int lp_foldcapture (lua_State *L) {
2911   luaL_checktype(L, 2, LUA_TFUNCTION);
2912   return capture_aux(L, Cfold, 2);
2913 }
2914 
2915 
lp_simplecapture(lua_State * L)2916 static int lp_simplecapture (lua_State *L) {
2917   return capture_aux(L, Csimple, 0);
2918 }
2919 
2920 
lp_poscapture(lua_State * L)2921 static int lp_poscapture (lua_State *L) {
2922   newemptycap(L, Cposition);
2923   return 1;
2924 }
2925 
2926 
lp_argcapture(lua_State * L)2927 static int lp_argcapture (lua_State *L) {
2928   int n = (int)luaL_checkinteger(L, 1);
2929   TTree *tree = newemptycap(L, Carg);
2930   tree->key = n;
2931   luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index");
2932   return 1;
2933 }
2934 
2935 
lp_backref(lua_State * L)2936 static int lp_backref (lua_State *L) {
2937   luaL_checkstring(L, 1);
2938   newemptycapkey(L, Cbackref, 1);
2939   return 1;
2940 }
2941 
2942 
2943 /*
2944 ** Constant capture
2945 */
lp_constcapture(lua_State * L)2946 static int lp_constcapture (lua_State *L) {
2947   int i;
2948   int n = lua_gettop(L);  /* number of values */
2949   if (n == 0)  /* no values? */
2950     newleaf(L, TTrue);  /* no capture */
2951   else if (n == 1)
2952     newemptycapkey(L, Cconst, 1);  /* single constant capture */
2953   else {  /* create a group capture with all values */
2954     TTree *tree = newtree(L, 1 + 3 * (n - 1) + 2);
2955     newktable(L, n);  /* create a 'ktable' for new tree */
2956     tree->tag = TCapture;
2957     tree->cap = Cgroup;
2958     tree->key = 0;
2959     tree = sib1(tree);
2960     for (i = 1; i <= n - 1; i++) {
2961       tree->tag = TSeq;
2962       tree->u.ps = 3;  /* skip TCapture and its sibling */
2963       auxemptycap(sib1(tree), Cconst);
2964       sib1(tree)->key = addtoktable(L, i);
2965       tree = sib2(tree);
2966     }
2967     auxemptycap(tree, Cconst);
2968     tree->key = addtoktable(L, i);
2969   }
2970   return 1;
2971 }
2972 
2973 
lp_matchtime(lua_State * L)2974 static int lp_matchtime (lua_State *L) {
2975   TTree *tree;
2976   luaL_checktype(L, 2, LUA_TFUNCTION);
2977   tree = newroot1sib(L, TRunTime);
2978   tree->key = addtonewktable(L, 1, 2);
2979   return 1;
2980 }
2981 
2982 /* }====================================================== */
2983 
2984 
2985 /*
2986 ** {======================================================
2987 ** Grammar - Tree generation
2988 ** =======================================================
2989 */
2990 
2991 /*
2992 ** push on the stack the index and the pattern for the
2993 ** initial rule of grammar at index 'arg' in the stack;
2994 ** also add that index into position table.
2995 */
getfirstrule(lua_State * L,int arg,int postab)2996 static void getfirstrule (lua_State *L, int arg, int postab) {
2997   lua_rawgeti(L, arg, 1);  /* access first element */
2998   if (lua_isstring(L, -1)) {  /* is it the name of initial rule? */
2999     lua_pushvalue(L, -1);  /* duplicate it to use as key */
3000     lua_gettable(L, arg);  /* get associated rule */
3001   }
3002   else {
3003     lua_pushinteger(L, 1);  /* key for initial rule */
3004     lua_insert(L, -2);  /* put it before rule */
3005   }
3006   if (!testpattern(L, -1)) {  /* initial rule not a pattern? */
3007     if (lua_isnil(L, -1))
3008       luaL_error(L, "grammar has no initial rule");
3009     else
3010       luaL_error(L, "initial rule '%s' is not a pattern", lua_tostring(L, -2));
3011   }
3012   lua_pushvalue(L, -2);  /* push key */
3013   lua_pushinteger(L, 1);  /* push rule position (after TGrammar) */
3014   lua_settable(L, postab);  /* insert pair at position table */
3015 }
3016 
3017 /*
3018 ** traverse grammar at index 'arg', pushing all its keys and patterns
3019 ** into the stack. Create a new table (before all pairs key-pattern) to
3020 ** collect all keys and their associated positions in the final tree
3021 ** (the "position table").
3022 ** Return the number of rules and (in 'totalsize') the total size
3023 ** for the new tree.
3024 */
collectrules(lua_State * L,int arg,int * totalsize)3025 static int collectrules (lua_State *L, int arg, int *totalsize) {
3026   int n = 1;  /* to count number of rules */
3027   int postab = lua_gettop(L) + 1;  /* index of position table */
3028   int size;  /* accumulator for total size */
3029   lua_newtable(L);  /* create position table */
3030   getfirstrule(L, arg, postab);
3031   size = 2 + getsize(L, postab + 2);  /* TGrammar + TRule + rule */
3032   lua_pushnil(L);  /* prepare to traverse grammar table */
3033   while (lua_next(L, arg) != 0) {
3034     if (lua_tonumber(L, -2) == 1 ||
3035         lua_equal(L, -2, postab + 1)) {  /* initial rule? */
3036       lua_pop(L, 1);  /* remove value (keep key for lua_next) */
3037       continue;
3038     }
3039     if (!testpattern(L, -1))  /* value is not a pattern? */
3040       luaL_error(L, "rule '%s' is not a pattern", val2str(L, -2));
3041     luaL_checkstack(L, LUA_MINSTACK, "grammar has too many rules");
3042     lua_pushvalue(L, -2);  /* push key (to insert into position table) */
3043     lua_pushinteger(L, size);
3044     lua_settable(L, postab);
3045     size += 1 + getsize(L, -1);  /* update size */
3046     lua_pushvalue(L, -2);  /* push key (for next lua_next) */
3047     n++;
3048   }
3049   *totalsize = size + 1;  /* TTrue to finish list of rules */
3050   return n;
3051 }
3052 
3053 
buildgrammar(lua_State * L,TTree * grammar,int frule,int n)3054 static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) {
3055   int i;
3056   TTree *nd = sib1(grammar);  /* auxiliary pointer to traverse the tree */
3057   for (i = 0; i < n; i++) {  /* add each rule into new tree */
3058     int ridx = frule + 2*i + 1;  /* index of i-th rule */
3059     int rulesize;
3060     TTree *rn = gettree(L, ridx, &rulesize);
3061     nd->tag = TRule;
3062     nd->key = 0;
3063     nd->cap = i;  /* rule number */
3064     nd->u.ps = rulesize + 1;  /* point to next rule */
3065     memcpy(sib1(nd), rn, rulesize * sizeof(TTree));  /* copy rule */
3066     mergektable(L, ridx, sib1(nd));  /* merge its ktable into new one */
3067     nd = sib2(nd);  /* move to next rule */
3068   }
3069   nd->tag = TTrue;  /* finish list of rules */
3070 }
3071 
3072 
3073 /*
3074 ** Check whether a tree has potential infinite loops
3075 */
checkloops(TTree * tree)3076 static int checkloops (TTree *tree) {
3077  tailcall:
3078   if (tree->tag == TRep && nullable(sib1(tree)))
3079     return 1;
3080   else if (tree->tag == TGrammar)
3081     return 0;  /* sub-grammars already checked */
3082   else {
3083     switch (numsiblings[tree->tag]) {
3084       case 1:  /* return checkloops(sib1(tree)); */
3085         tree = sib1(tree); goto tailcall;
3086       case 2:
3087         if (checkloops(sib1(tree))) return 1;
3088         /* else return checkloops(sib2(tree)); */
3089         tree = sib2(tree); goto tailcall;
3090       default: assert(numsiblings[tree->tag] == 0); return 0;
3091     }
3092   }
3093 }
3094 
3095 
verifyerror(lua_State * L,int * passed,int npassed)3096 static int verifyerror (lua_State *L, int *passed, int npassed) {
3097   int i, j;
3098   for (i = npassed - 1; i >= 0; i--) {  /* search for a repetition */
3099     for (j = i - 1; j >= 0; j--) {
3100       if (passed[i] == passed[j]) {
3101         lua_rawgeti(L, -1, passed[i]);  /* get rule's key */
3102         return luaL_error(L, "rule '%s' may be left recursive", val2str(L, -1));
3103       }
3104     }
3105   }
3106   return luaL_error(L, "too many left calls in grammar");
3107 }
3108 
3109 
3110 /*
3111 ** Check whether a rule can be left recursive; raise an error in that
3112 ** case; otherwise return 1 iff pattern is nullable. Assume ktable at
3113 ** the top of the stack.
3114 */
verifyrule(lua_State * L,TTree * tree,int * passed,int npassed,int nullable)3115 static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed,
3116                        int nullable) {
3117  tailcall:
3118   switch (tree->tag) {
3119     case TChar: case TSet: case TAny:
3120     case TFalse:
3121       return nullable;  /* cannot pass from here */
3122     case TTrue:
3123     case TBehind:  /* look-behind cannot have calls */
3124       return 1;
3125     case TNot: case TAnd: case TRep:
3126       /* return verifyrule(L, sib1(tree), passed, npassed, 1); */
3127       tree = sib1(tree); nullable = 1; goto tailcall;
3128     case TCapture: case TRunTime:
3129       /* return verifyrule(L, sib1(tree), passed, npassed); */
3130       tree = sib1(tree); goto tailcall;
3131     case TCall:
3132       /* return verifyrule(L, sib2(tree), passed, npassed); */
3133       tree = sib2(tree); goto tailcall;
3134     case TSeq:  /* only check 2nd child if first is nullable */
3135       if (!verifyrule(L, sib1(tree), passed, npassed, 0))
3136         return nullable;
3137       /* else return verifyrule(L, sib2(tree), passed, npassed); */
3138       tree = sib2(tree); goto tailcall;
3139     case TChoice:  /* must check both children */
3140       nullable = verifyrule(L, sib1(tree), passed, npassed, nullable);
3141       /* return verifyrule(L, sib2(tree), passed, npassed, nullable); */
3142       tree = sib2(tree); goto tailcall;
3143     case TRule:
3144       if (npassed >= MAXRULES)
3145         return verifyerror(L, passed, npassed);
3146       else {
3147         passed[npassed++] = tree->key;
3148         /* return verifyrule(L, sib1(tree), passed, npassed); */
3149         tree = sib1(tree); goto tailcall;
3150       }
3151     case TGrammar:
3152       return nullable(tree);  /* sub-grammar cannot be left recursive */
3153     default: assert(0); return 0;
3154   }
3155 }
3156 
3157 
verifygrammar(lua_State * L,TTree * grammar)3158 static void verifygrammar (lua_State *L, TTree *grammar) {
3159   int passed[MAXRULES];
3160   TTree *rule;
3161   /* check left-recursive rules */
3162   for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
3163     if (rule->key == 0) continue;  /* unused rule */
3164     verifyrule(L, sib1(rule), passed, 0, 0);
3165   }
3166   assert(rule->tag == TTrue);
3167   /* check infinite loops inside rules */
3168   for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) {
3169     if (rule->key == 0) continue;  /* unused rule */
3170     if (checkloops(sib1(rule))) {
3171       lua_rawgeti(L, -1, rule->key);  /* get rule's key */
3172       luaL_error(L, "empty loop in rule '%s'", val2str(L, -1));
3173     }
3174   }
3175   assert(rule->tag == TTrue);
3176 }
3177 
3178 
3179 /*
3180 ** Give a name for the initial rule if it is not referenced
3181 */
initialrulename(lua_State * L,TTree * grammar,int frule)3182 static void initialrulename (lua_State *L, TTree *grammar, int frule) {
3183   if (sib1(grammar)->key == 0) {  /* initial rule is not referenced? */
3184     int n = lua_objlen(L, -1) + 1;  /* index for name */
3185     lua_pushvalue(L, frule);  /* rule's name */
3186     lua_rawseti(L, -2, n);  /* ktable was on the top of the stack */
3187     sib1(grammar)->key = n;
3188   }
3189 }
3190 
3191 
newgrammar(lua_State * L,int arg)3192 static TTree *newgrammar (lua_State *L, int arg) {
3193   int treesize;
3194   int frule = lua_gettop(L) + 2;  /* position of first rule's key */
3195   int n = collectrules(L, arg, &treesize);
3196   TTree *g = newtree(L, treesize);
3197   luaL_argcheck(L, n <= MAXRULES, arg, "grammar has too many rules");
3198   g->tag = TGrammar;  g->u.n = n;
3199   lua_newtable(L);  /* create 'ktable' */
3200   lua_setfenv(L, -2);
3201   buildgrammar(L, g, frule, n);
3202   lua_getfenv(L, -1);  /* get 'ktable' for new tree */
3203   finalfix(L, frule - 1, g, sib1(g));
3204   initialrulename(L, g, frule);
3205   verifygrammar(L, g);
3206   lua_pop(L, 1);  /* remove 'ktable' */
3207   lua_insert(L, -(n * 2 + 2));  /* move new table to proper position */
3208   lua_pop(L, n * 2 + 1);  /* remove position table + rule pairs */
3209   return g;  /* new table at the top of the stack */
3210 }
3211 
3212 /* }====================================================== */
3213 
3214 
prepcompile(lua_State * L,Pattern * p,int idx)3215 static Instruction *prepcompile (lua_State *L, Pattern *p, int idx) {
3216   lua_getfenv(L, idx);  /* push 'ktable' (may be used by 'finalfix') */
3217   finalfix(L, 0, NULL, p->tree);
3218   lua_pop(L, 1);  /* remove 'ktable' */
3219   return compile(L, p);
3220 }
3221 
3222 
lp_printtree(lua_State * L)3223 static int lp_printtree (lua_State *L) {
3224   TTree *tree = getpatt(L, 1, NULL);
3225   int c = lua_toboolean(L, 2);
3226   if (c) {
3227     lua_getfenv(L, 1);  /* push 'ktable' (may be used by 'finalfix') */
3228     finalfix(L, 0, NULL, tree);
3229     lua_pop(L, 1);  /* remove 'ktable' */
3230   }
3231   printktable(L, 1);
3232   printtree(tree, 0);
3233   return 0;
3234 }
3235 
3236 
lp_printcode(lua_State * L)3237 static int lp_printcode (lua_State *L) {
3238   Pattern *p = getpattern(L, 1);
3239   printktable(L, 1);
3240   if (p->code == NULL)  /* not compiled yet? */
3241     prepcompile(L, p, 1);
3242   printpatt(p->code, p->codesize);
3243   return 0;
3244 }
3245 
3246 
3247 /*
3248 ** Get the initial position for the match, interpreting negative
3249 ** values from the end of the subject
3250 */
initposition(lua_State * L,size_t len)3251 static size_t initposition (lua_State *L, size_t len) {
3252   lua_Integer ii = luaL_optinteger(L, 3, 1);
3253   if (ii > 0) {  /* positive index? */
3254     if ((size_t)ii <= len)  /* inside the string? */
3255       return (size_t)ii - 1;  /* return it (corrected to 0-base) */
3256     else return len;  /* crop at the end */
3257   }
3258   else {  /* negative index */
3259     if ((size_t)(-ii) <= len)  /* inside the string? */
3260       return len - ((size_t)(-ii));  /* return position from the end */
3261     else return 0;  /* crop at the beginning */
3262   }
3263 }
3264 
3265 
3266 /*
3267 ** Main match function
3268 */
lp_match(lua_State * L)3269 static int lp_match (lua_State *L) {
3270   Capture capture[INITCAPSIZE];
3271   const char *r;
3272   size_t l;
3273   Pattern *p = (getpatt(L, 1, NULL), getpattern(L, 1));
3274   Instruction *code = (p->code != NULL) ? p->code : prepcompile(L, p, 1);
3275   const char *s = luaL_checklstring(L, SUBJIDX, &l);
3276   size_t i = initposition(L, l);
3277   int ptop = lua_gettop(L);
3278   lua_pushnil(L);  /* initialize subscache */
3279   lua_pushlightuserdata(L, capture);  /* initialize caplistidx */
3280   lua_getfenv(L, 1);  /* initialize penvidx */
3281   r = match(L, s, s + i, s + l, code, capture, ptop);
3282   if (r == NULL) {
3283     lua_pushnil(L);
3284     return 1;
3285   }
3286   return getcaptures(L, s, r, ptop);
3287 }
3288 
3289 
3290 
3291 /*
3292 ** {======================================================
3293 ** Library creation and functions not related to matching
3294 ** =======================================================
3295 */
3296 
lp_setmax(lua_State * L)3297 static int lp_setmax (lua_State *L) {
3298   luaL_optinteger(L, 1, -1);
3299   lua_settop(L, 1);
3300   lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX);
3301   return 0;
3302 }
3303 
3304 
lp_version(lua_State * L)3305 static int lp_version (lua_State *L) {
3306   lua_pushstring(L, VERSION);
3307   return 1;
3308 }
3309 
3310 
lp_type(lua_State * L)3311 static int lp_type (lua_State *L) {
3312   if (testpattern(L, 1))
3313     lua_pushliteral(L, "pattern");
3314   else
3315     lua_pushnil(L);
3316   return 1;
3317 }
3318 
3319 
lp_gc(lua_State * L)3320 int lp_gc (lua_State *L) {
3321   Pattern *p = getpattern(L, 1);
3322   if (p->codesize > 0)
3323     realloccode(L, p, 0);
3324   return 0;
3325 }
3326 
3327 
createcat(lua_State * L,const char * catname,int (catf)(int))3328 static void createcat (lua_State *L, const char *catname, int (catf) (int)) {
3329   TTree *t = newcharset(L);
3330   int i;
3331   for (i = 0; i <= UCHAR_MAX; i++)
3332     if (catf(i)) setchar(treebuffer(t), i);
3333   lua_setfield(L, -2, catname);
3334 }
3335 
3336 
lp_locale(lua_State * L)3337 static int lp_locale (lua_State *L) {
3338   if (lua_isnoneornil(L, 1)) {
3339     lua_settop(L, 0);
3340     lua_createtable(L, 0, 12);
3341   }
3342   else {
3343     luaL_checktype(L, 1, LUA_TTABLE);
3344     lua_settop(L, 1);
3345   }
3346   createcat(L, "alnum", isalnum);
3347   createcat(L, "alpha", isalpha);
3348   createcat(L, "cntrl", iscntrl);
3349   createcat(L, "digit", isdigit);
3350   createcat(L, "graph", isgraph);
3351   createcat(L, "lower", islower);
3352   createcat(L, "print", isprint);
3353   createcat(L, "punct", ispunct);
3354   createcat(L, "space", isspace);
3355   createcat(L, "upper", isupper);
3356   createcat(L, "xdigit", isxdigit);
3357   return 1;
3358 }
3359 
3360 
3361 static struct luaL_Reg pattreg[] = {
3362   {"ptree", lp_printtree},
3363   {"pcode", lp_printcode},
3364   {"match", lp_match},
3365   {"B", lp_behind},
3366   {"V", lp_V},
3367   {"C", lp_simplecapture},
3368   {"Cc", lp_constcapture},
3369   {"Cmt", lp_matchtime},
3370   {"Cb", lp_backref},
3371   {"Carg", lp_argcapture},
3372   {"Cp", lp_poscapture},
3373   {"Cs", lp_substcapture},
3374   {"Ct", lp_tablecapture},
3375   {"Cf", lp_foldcapture},
3376   {"Cg", lp_groupcapture},
3377   {"P", lp_P},
3378   {"S", lp_set},
3379   {"R", lp_range},
3380   {"locale", lp_locale},
3381   {"version", lp_version},
3382   {"setmaxstack", lp_setmax},
3383   {"type", lp_type},
3384   {NULL, NULL}
3385 };
3386 
3387 
3388 static struct luaL_Reg metareg[] = {
3389   {"__mul", lp_seq},
3390   {"__add", lp_choice},
3391   {"__pow", lp_star},
3392   {"__gc", lp_gc},
3393   {"__len", lp_and},
3394   {"__div", lp_divcapture},
3395   {"__unm", lp_not},
3396   {"__sub", lp_sub},
3397   {NULL, NULL}
3398 };
3399 
3400 
3401 int luaopen_lpeg (lua_State *L);
luaopen_lpeg(lua_State * L)3402 int luaopen_lpeg (lua_State *L) {
3403   luaL_newmetatable(L, PATTERN_T);
3404   lua_pushnumber(L, MAXBACK);  /* initialize maximum backtracking */
3405   lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX);
3406   luaL_register(L, NULL, metareg);
3407   luaL_register(L, "lpeg", pattreg);
3408   lua_pushvalue(L, -1);
3409   lua_setfield(L, -3, "__index");
3410   return 1;
3411 }
3412 
3413 /* }====================================================== */
3414