1 /*
2 ** 2012-11-13
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 ** The code in this file implements a compact but reasonably
14 ** efficient regular-expression matcher for posix extended regular
15 ** expressions against UTF8 text.
16 **
17 ** This file is an SQLite extension.  It registers a single function
18 ** named "regexp(A,B)" where A is the regular expression and B is the
19 ** string to be matched.  By registering this function, SQLite will also
20 ** then implement the "B regexp A" operator.  Note that with the function
21 ** the regular expression comes first, but with the operator it comes
22 ** second.
23 **
24 **  The following regular expression syntax is supported:
25 **
26 **     X*      zero or more occurrences of X
27 **     X+      one or more occurrences of X
28 **     X?      zero or one occurrences of X
29 **     X{p,q}  between p and q occurrences of X
30 **     (X)     match X
31 **     X|Y     X or Y
32 **     ^X      X occurring at the beginning of the string
33 **     X$      X occurring at the end of the string
34 **     .       Match any single character
35 **     \c      Character c where c is one of \{}()[]|*+?.
36 **     \c      C-language escapes for c in afnrtv.  ex: \t or \n
37 **     \uXXXX  Where XXXX is exactly 4 hex digits, unicode value XXXX
38 **     \xXX    Where XX is exactly 2 hex digits, unicode value XX
39 **     [abc]   Any single character from the set abc
40 **     [^abc]  Any single character not in the set abc
41 **     [a-z]   Any single character in the range a-z
42 **     [^a-z]  Any single character not in the range a-z
43 **     \b      Word boundary
44 **     \w      Word character.  [A-Za-z0-9_]
45 **     \W      Non-word character
46 **     \d      Digit
47 **     \D      Non-digit
48 **     \s      Whitespace character
49 **     \S      Non-whitespace character
50 **
51 ** A nondeterministic finite automaton (NFA) is used for matching, so the
52 ** performance is bounded by O(N*M) where N is the size of the regular
53 ** expression and M is the size of the input string.  The matcher never
54 ** exhibits exponential behavior.  Note that the X{p,q} operator expands
55 ** to p copies of X following by q-p copies of X? and that the size of the
56 ** regular expression in the O(N*M) performance bound is computed after
57 ** this expansion.
58 */
59 #include <string.h>
60 #include <stdlib.h>
61 #include "sqlite3ext.h"
62 SQLITE_EXTENSION_INIT1
63 
64 /*
65 ** The following #defines change the names of some functions implemented in
66 ** this file to prevent name collisions with C-library functions of the
67 ** same name.
68 */
69 #define re_match   sqlite3re_match
70 #define re_compile sqlite3re_compile
71 #define re_free    sqlite3re_free
72 
73 /* The end-of-input character */
74 #define RE_EOF            0    /* End of input */
75 
76 /* The NFA is implemented as sequence of opcodes taken from the following
77 ** set.  Each opcode has a single integer argument.
78 */
79 #define RE_OP_MATCH       1    /* Match the one character in the argument */
80 #define RE_OP_ANY         2    /* Match any one character.  (Implements ".") */
81 #define RE_OP_ANYSTAR     3    /* Special optimized version of .* */
82 #define RE_OP_FORK        4    /* Continue to both next and opcode at iArg */
83 #define RE_OP_GOTO        5    /* Jump to opcode at iArg */
84 #define RE_OP_ACCEPT      6    /* Halt and indicate a successful match */
85 #define RE_OP_CC_INC      7    /* Beginning of a [...] character class */
86 #define RE_OP_CC_EXC      8    /* Beginning of a [^...] character class */
87 #define RE_OP_CC_VALUE    9    /* Single value in a character class */
88 #define RE_OP_CC_RANGE   10    /* Range of values in a character class */
89 #define RE_OP_WORD       11    /* Perl word character [A-Za-z0-9_] */
90 #define RE_OP_NOTWORD    12    /* Not a perl word character */
91 #define RE_OP_DIGIT      13    /* digit:  [0-9] */
92 #define RE_OP_NOTDIGIT   14    /* Not a digit */
93 #define RE_OP_SPACE      15    /* space:  [ \t\n\r\v\f] */
94 #define RE_OP_NOTSPACE   16    /* Not a digit */
95 #define RE_OP_BOUNDARY   17    /* Boundary between word and non-word */
96 
97 /* Each opcode is a "state" in the NFA */
98 typedef unsigned short ReStateNumber;
99 
100 /* Because this is an NFA and not a DFA, multiple states can be active at
101 ** once.  An instance of the following object records all active states in
102 ** the NFA.  The implementation is optimized for the common case where the
103 ** number of actives states is small.
104 */
105 typedef struct ReStateSet {
106   unsigned nState;            /* Number of current states */
107   ReStateNumber *aState;      /* Current states */
108 } ReStateSet;
109 
110 /* An input string read one character at a time.
111 */
112 typedef struct ReInput ReInput;
113 struct ReInput {
114   const unsigned char *z;  /* All text */
115   int i;                   /* Next byte to read */
116   int mx;                  /* EOF when i>=mx */
117 };
118 
119 /* A compiled NFA (or an NFA that is in the process of being compiled) is
120 ** an instance of the following object.
121 */
122 typedef struct ReCompiled ReCompiled;
123 struct ReCompiled {
124   ReInput sIn;                /* Regular expression text */
125   const char *zErr;           /* Error message to return */
126   char *aOp;                  /* Operators for the virtual machine */
127   int *aArg;                  /* Arguments to each operator */
128   unsigned (*xNextChar)(ReInput*);  /* Next character function */
129   unsigned char zInit[12];    /* Initial text to match */
130   int nInit;                  /* Number of characters in zInit */
131   unsigned nState;            /* Number of entries in aOp[] and aArg[] */
132   unsigned nAlloc;            /* Slots allocated for aOp[] and aArg[] */
133 };
134 
135 /* Add a state to the given state set if it is not already there */
re_add_state(ReStateSet * pSet,int newState)136 static void re_add_state(ReStateSet *pSet, int newState){
137   unsigned i;
138   for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
139   pSet->aState[pSet->nState++] = newState;
140 }
141 
142 /* Extract the next unicode character from *pzIn and return it.  Advance
143 ** *pzIn to the first byte past the end of the character returned.  To
144 ** be clear:  this routine converts utf8 to unicode.  This routine is
145 ** optimized for the common case where the next character is a single byte.
146 */
re_next_char(ReInput * p)147 static unsigned re_next_char(ReInput *p){
148   unsigned c;
149   if( p->i>=p->mx ) return 0;
150   c = p->z[p->i++];
151   if( c>=0x80 ){
152     if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
153       c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
154       if( c<0x80 ) c = 0xfffd;
155     }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
156            && (p->z[p->i+1]&0xc0)==0x80 ){
157       c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
158       p->i += 2;
159       if( c<=0x3ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
160     }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80
161            && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
162       c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
163                        | (p->z[p->i+2]&0x3f);
164       p->i += 3;
165       if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
166     }else{
167       c = 0xfffd;
168     }
169   }
170   return c;
171 }
re_next_char_nocase(ReInput * p)172 static unsigned re_next_char_nocase(ReInput *p){
173   unsigned c = re_next_char(p);
174   if( c>='A' && c<='Z' ) c += 'a' - 'A';
175   return c;
176 }
177 
178 /* Return true if c is a perl "word" character:  [A-Za-z0-9_] */
re_word_char(int c)179 static int re_word_char(int c){
180   return (c>='0' && c<='9') || (c>='a' && c<='z')
181       || (c>='A' && c<='Z') || c=='_';
182 }
183 
184 /* Return true if c is a "digit" character:  [0-9] */
re_digit_char(int c)185 static int re_digit_char(int c){
186   return (c>='0' && c<='9');
187 }
188 
189 /* Return true if c is a perl "space" character:  [ \t\r\n\v\f] */
re_space_char(int c)190 static int re_space_char(int c){
191   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
192 }
193 
194 /* Run a compiled regular expression on the zero-terminated input
195 ** string zIn[].  Return true on a match and false if there is no match.
196 */
re_match(ReCompiled * pRe,const unsigned char * zIn,int nIn)197 static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
198   ReStateSet aStateSet[2], *pThis, *pNext;
199   ReStateNumber aSpace[100];
200   ReStateNumber *pToFree;
201   unsigned int i = 0;
202   unsigned int iSwap = 0;
203   int c = RE_EOF+1;
204   int cPrev = 0;
205   int rc = 0;
206   ReInput in;
207 
208   in.z = zIn;
209   in.i = 0;
210   in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
211 
212   /* Look for the initial prefix match, if there is one. */
213   if( pRe->nInit ){
214     unsigned char x = pRe->zInit[0];
215     while( in.i+pRe->nInit<=in.mx
216      && (zIn[in.i]!=x ||
217          strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
218     ){
219       in.i++;
220     }
221     if( in.i+pRe->nInit>in.mx ) return 0;
222   }
223 
224   if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
225     pToFree = 0;
226     aStateSet[0].aState = aSpace;
227   }else{
228     pToFree = sqlite3_malloc( sizeof(ReStateNumber)*2*pRe->nState );
229     if( pToFree==0 ) return -1;
230     aStateSet[0].aState = pToFree;
231   }
232   aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
233   pNext = &aStateSet[1];
234   pNext->nState = 0;
235   re_add_state(pNext, 0);
236   while( c!=RE_EOF && pNext->nState>0 ){
237     cPrev = c;
238     c = pRe->xNextChar(&in);
239     pThis = pNext;
240     pNext = &aStateSet[iSwap];
241     iSwap = 1 - iSwap;
242     pNext->nState = 0;
243     for(i=0; i<pThis->nState; i++){
244       int x = pThis->aState[i];
245       switch( pRe->aOp[x] ){
246         case RE_OP_MATCH: {
247           if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
248           break;
249         }
250         case RE_OP_ANY: {
251           re_add_state(pNext, x+1);
252           break;
253         }
254         case RE_OP_WORD: {
255           if( re_word_char(c) ) re_add_state(pNext, x+1);
256           break;
257         }
258         case RE_OP_NOTWORD: {
259           if( !re_word_char(c) ) re_add_state(pNext, x+1);
260           break;
261         }
262         case RE_OP_DIGIT: {
263           if( re_digit_char(c) ) re_add_state(pNext, x+1);
264           break;
265         }
266         case RE_OP_NOTDIGIT: {
267           if( !re_digit_char(c) ) re_add_state(pNext, x+1);
268           break;
269         }
270         case RE_OP_SPACE: {
271           if( re_space_char(c) ) re_add_state(pNext, x+1);
272           break;
273         }
274         case RE_OP_NOTSPACE: {
275           if( !re_space_char(c) ) re_add_state(pNext, x+1);
276           break;
277         }
278         case RE_OP_BOUNDARY: {
279           if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
280           break;
281         }
282         case RE_OP_ANYSTAR: {
283           re_add_state(pNext, x);
284           re_add_state(pThis, x+1);
285           break;
286         }
287         case RE_OP_FORK: {
288           re_add_state(pThis, x+pRe->aArg[x]);
289           re_add_state(pThis, x+1);
290           break;
291         }
292         case RE_OP_GOTO: {
293           re_add_state(pThis, x+pRe->aArg[x]);
294           break;
295         }
296         case RE_OP_ACCEPT: {
297           rc = 1;
298           goto re_match_end;
299         }
300         case RE_OP_CC_INC:
301         case RE_OP_CC_EXC: {
302           int j = 1;
303           int n = pRe->aArg[x];
304           int hit = 0;
305           for(j=1; j>0 && j<n; j++){
306             if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
307               if( pRe->aArg[x+j]==c ){
308                 hit = 1;
309                 j = -1;
310               }
311             }else{
312               if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
313                 hit = 1;
314                 j = -1;
315               }else{
316                 j++;
317               }
318             }
319           }
320           if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
321           if( hit ) re_add_state(pNext, x+n);
322           break;
323         }
324       }
325     }
326   }
327   for(i=0; i<pNext->nState; i++){
328     if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }
329   }
330 re_match_end:
331   sqlite3_free(pToFree);
332   return rc;
333 }
334 
335 /* Resize the opcode and argument arrays for an RE under construction.
336 */
re_resize(ReCompiled * p,int N)337 static int re_resize(ReCompiled *p, int N){
338   char *aOp;
339   int *aArg;
340   aOp = sqlite3_realloc(p->aOp, N*sizeof(p->aOp[0]));
341   if( aOp==0 ) return 1;
342   p->aOp = aOp;
343   aArg = sqlite3_realloc(p->aArg, N*sizeof(p->aArg[0]));
344   if( aArg==0 ) return 1;
345   p->aArg = aArg;
346   p->nAlloc = N;
347   return 0;
348 }
349 
350 /* Insert a new opcode and argument into an RE under construction.  The
351 ** insertion point is just prior to existing opcode iBefore.
352 */
re_insert(ReCompiled * p,int iBefore,int op,int arg)353 static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
354   int i;
355   if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
356   for(i=p->nState; i>iBefore; i--){
357     p->aOp[i] = p->aOp[i-1];
358     p->aArg[i] = p->aArg[i-1];
359   }
360   p->nState++;
361   p->aOp[iBefore] = op;
362   p->aArg[iBefore] = arg;
363   return iBefore;
364 }
365 
366 /* Append a new opcode and argument to the end of the RE under construction.
367 */
re_append(ReCompiled * p,int op,int arg)368 static int re_append(ReCompiled *p, int op, int arg){
369   return re_insert(p, p->nState, op, arg);
370 }
371 
372 /* Make a copy of N opcodes starting at iStart onto the end of the RE
373 ** under construction.
374 */
re_copy(ReCompiled * p,int iStart,int N)375 static void re_copy(ReCompiled *p, int iStart, int N){
376   if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
377   memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
378   memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
379   p->nState += N;
380 }
381 
382 /* Return true if c is a hexadecimal digit character:  [0-9a-fA-F]
383 ** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c).  If
384 ** c is not a hex digit *pV is unchanged.
385 */
re_hex(int c,int * pV)386 static int re_hex(int c, int *pV){
387   if( c>='0' && c<='9' ){
388     c -= '0';
389   }else if( c>='a' && c<='f' ){
390     c -= 'a' - 10;
391   }else if( c>='A' && c<='F' ){
392     c -= 'A' - 10;
393   }else{
394     return 0;
395   }
396   *pV = (*pV)*16 + (c & 0xff);
397   return 1;
398 }
399 
400 /* A backslash character has been seen, read the next character and
401 ** return its interpretation.
402 */
re_esc_char(ReCompiled * p)403 static unsigned re_esc_char(ReCompiled *p){
404   static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
405   static const char zTrans[] = "\a\f\n\r\t\v";
406   int i, v = 0;
407   char c;
408   if( p->sIn.i>=p->sIn.mx ) return 0;
409   c = p->sIn.z[p->sIn.i];
410   if( c=='u' && p->sIn.i+4<p->sIn.mx ){
411     const unsigned char *zIn = p->sIn.z + p->sIn.i;
412     if( re_hex(zIn[1],&v)
413      && re_hex(zIn[2],&v)
414      && re_hex(zIn[3],&v)
415      && re_hex(zIn[4],&v)
416     ){
417       p->sIn.i += 5;
418       return v;
419     }
420   }
421   if( c=='x' && p->sIn.i+2<p->sIn.mx ){
422     const unsigned char *zIn = p->sIn.z + p->sIn.i;
423     if( re_hex(zIn[1],&v)
424      && re_hex(zIn[2],&v)
425     ){
426       p->sIn.i += 3;
427       return v;
428     }
429   }
430   for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
431   if( zEsc[i] ){
432     if( i<6 ) c = zTrans[i];
433     p->sIn.i++;
434   }else{
435     p->zErr = "unknown \\ escape";
436   }
437   return c;
438 }
439 
440 /* Forward declaration */
441 static const char *re_subcompile_string(ReCompiled*);
442 
443 /* Peek at the next byte of input */
rePeek(ReCompiled * p)444 static unsigned char rePeek(ReCompiled *p){
445   return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
446 }
447 
448 /* Compile RE text into a sequence of opcodes.  Continue up to the
449 ** first unmatched ")" character, then return.  If an error is found,
450 ** return a pointer to the error message string.
451 */
re_subcompile_re(ReCompiled * p)452 static const char *re_subcompile_re(ReCompiled *p){
453   const char *zErr;
454   int iStart, iEnd, iGoto;
455   iStart = p->nState;
456   zErr = re_subcompile_string(p);
457   if( zErr ) return zErr;
458   while( rePeek(p)=='|' ){
459     iEnd = p->nState;
460     re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
461     iGoto = re_append(p, RE_OP_GOTO, 0);
462     p->sIn.i++;
463     zErr = re_subcompile_string(p);
464     if( zErr ) return zErr;
465     p->aArg[iGoto] = p->nState - iGoto;
466   }
467   return 0;
468 }
469 
470 /* Compile an element of regular expression text (anything that can be
471 ** an operand to the "|" operator).  Return NULL on success or a pointer
472 ** to the error message if there is a problem.
473 */
re_subcompile_string(ReCompiled * p)474 static const char *re_subcompile_string(ReCompiled *p){
475   int iPrev = -1;
476   int iStart;
477   unsigned c;
478   const char *zErr;
479   while( (c = p->xNextChar(&p->sIn))!=0 ){
480     iStart = p->nState;
481     switch( c ){
482       case '|':
483       case '$':
484       case ')': {
485         p->sIn.i--;
486         return 0;
487       }
488       case '(': {
489         zErr = re_subcompile_re(p);
490         if( zErr ) return zErr;
491         if( rePeek(p)!=')' ) return "unmatched '('";
492         p->sIn.i++;
493         break;
494       }
495       case '.': {
496         if( rePeek(p)=='*' ){
497           re_append(p, RE_OP_ANYSTAR, 0);
498           p->sIn.i++;
499         }else{
500           re_append(p, RE_OP_ANY, 0);
501         }
502         break;
503       }
504       case '*': {
505         if( iPrev<0 ) return "'*' without operand";
506         re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
507         re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
508         break;
509       }
510       case '+': {
511         if( iPrev<0 ) return "'+' without operand";
512         re_append(p, RE_OP_FORK, iPrev - p->nState);
513         break;
514       }
515       case '?': {
516         if( iPrev<0 ) return "'?' without operand";
517         re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
518         break;
519       }
520       case '{': {
521         int m = 0, n = 0;
522         int sz, j;
523         if( iPrev<0 ) return "'{m,n}' without operand";
524         while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; }
525         n = m;
526         if( c==',' ){
527           p->sIn.i++;
528           n = 0;
529           while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; }
530         }
531         if( c!='}' ) return "unmatched '{'";
532         if( n>0 && n<m ) return "n less than m in '{m,n}'";
533         p->sIn.i++;
534         sz = p->nState - iPrev;
535         if( m==0 ){
536           if( n==0 ) return "both m and n are zero in '{m,n}'";
537           re_insert(p, iPrev, RE_OP_FORK, sz+1);
538           n--;
539         }else{
540           for(j=1; j<m; j++) re_copy(p, iPrev, sz);
541         }
542         for(j=m; j<n; j++){
543           re_append(p, RE_OP_FORK, sz+1);
544           re_copy(p, iPrev, sz);
545         }
546         if( n==0 && m>0 ){
547           re_append(p, RE_OP_FORK, -sz);
548         }
549         break;
550       }
551       case '[': {
552         int iFirst = p->nState;
553         if( rePeek(p)=='^' ){
554           re_append(p, RE_OP_CC_EXC, 0);
555           p->sIn.i++;
556         }else{
557           re_append(p, RE_OP_CC_INC, 0);
558         }
559         while( (c = p->xNextChar(&p->sIn))!=0 ){
560           if( c=='[' && rePeek(p)==':' ){
561             return "POSIX character classes not supported";
562           }
563           if( c=='\\' ) c = re_esc_char(p);
564           if( rePeek(p)=='-' ){
565             re_append(p, RE_OP_CC_RANGE, c);
566             p->sIn.i++;
567             c = p->xNextChar(&p->sIn);
568             if( c=='\\' ) c = re_esc_char(p);
569             re_append(p, RE_OP_CC_RANGE, c);
570           }else{
571             re_append(p, RE_OP_CC_VALUE, c);
572           }
573           if( rePeek(p)==']' ){ p->sIn.i++; break; }
574         }
575         if( c==0 ) return "unclosed '['";
576         p->aArg[iFirst] = p->nState - iFirst;
577         break;
578       }
579       case '\\': {
580         int specialOp = 0;
581         switch( rePeek(p) ){
582           case 'b': specialOp = RE_OP_BOUNDARY;   break;
583           case 'd': specialOp = RE_OP_DIGIT;      break;
584           case 'D': specialOp = RE_OP_NOTDIGIT;   break;
585           case 's': specialOp = RE_OP_SPACE;      break;
586           case 'S': specialOp = RE_OP_NOTSPACE;   break;
587           case 'w': specialOp = RE_OP_WORD;       break;
588           case 'W': specialOp = RE_OP_NOTWORD;    break;
589         }
590         if( specialOp ){
591           p->sIn.i++;
592           re_append(p, specialOp, 0);
593         }else{
594           c = re_esc_char(p);
595           re_append(p, RE_OP_MATCH, c);
596         }
597         break;
598       }
599       default: {
600         re_append(p, RE_OP_MATCH, c);
601         break;
602       }
603     }
604     iPrev = iStart;
605   }
606   return 0;
607 }
608 
609 /* Free and reclaim all the memory used by a previously compiled
610 ** regular expression.  Applications should invoke this routine once
611 ** for every call to re_compile() to avoid memory leaks.
612 */
re_free(ReCompiled * pRe)613 void re_free(ReCompiled *pRe){
614   if( pRe ){
615     sqlite3_free(pRe->aOp);
616     sqlite3_free(pRe->aArg);
617     sqlite3_free(pRe);
618   }
619 }
620 
621 /*
622 ** Compile a textual regular expression in zIn[] into a compiled regular
623 ** expression suitable for us by re_match() and return a pointer to the
624 ** compiled regular expression in *ppRe.  Return NULL on success or an
625 ** error message if something goes wrong.
626 */
re_compile(ReCompiled ** ppRe,const char * zIn,int noCase)627 const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
628   ReCompiled *pRe;
629   const char *zErr;
630   int i, j;
631 
632   *ppRe = 0;
633   pRe = sqlite3_malloc( sizeof(*pRe) );
634   if( pRe==0 ){
635     return "out of memory";
636   }
637   memset(pRe, 0, sizeof(*pRe));
638   pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
639   if( re_resize(pRe, 30) ){
640     re_free(pRe);
641     return "out of memory";
642   }
643   if( zIn[0]=='^' ){
644     zIn++;
645   }else{
646     re_append(pRe, RE_OP_ANYSTAR, 0);
647   }
648   pRe->sIn.z = (unsigned char*)zIn;
649   pRe->sIn.i = 0;
650   pRe->sIn.mx = (int)strlen(zIn);
651   zErr = re_subcompile_re(pRe);
652   if( zErr ){
653     re_free(pRe);
654     return zErr;
655   }
656   if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){
657     re_append(pRe, RE_OP_MATCH, RE_EOF);
658     re_append(pRe, RE_OP_ACCEPT, 0);
659     *ppRe = pRe;
660   }else if( pRe->sIn.i>=pRe->sIn.mx ){
661     re_append(pRe, RE_OP_ACCEPT, 0);
662     *ppRe = pRe;
663   }else{
664     re_free(pRe);
665     return "unrecognized character";
666   }
667 
668   /* The following is a performance optimization.  If the regex begins with
669   ** ".*" (if the input regex lacks an initial "^") and afterwards there are
670   ** one or more matching characters, enter those matching characters into
671   ** zInit[].  The re_match() routine can then search ahead in the input
672   ** string looking for the initial match without having to run the whole
673   ** regex engine over the string.  Do not worry able trying to match
674   ** unicode characters beyond plane 0 - those are very rare and this is
675   ** just an optimization. */
676   if( pRe->aOp[0]==RE_OP_ANYSTAR ){
677     for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
678       unsigned x = pRe->aArg[i];
679       if( x<=127 ){
680         pRe->zInit[j++] = x;
681       }else if( x<=0xfff ){
682         pRe->zInit[j++] = 0xc0 | (x>>6);
683         pRe->zInit[j++] = 0x80 | (x&0x3f);
684       }else if( x<=0xffff ){
685         pRe->zInit[j++] = 0xd0 | (x>>12);
686         pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
687         pRe->zInit[j++] = 0x80 | (x&0x3f);
688       }else{
689         break;
690       }
691     }
692     if( j>0 && pRe->zInit[j-1]==0 ) j--;
693     pRe->nInit = j;
694   }
695   return pRe->zErr;
696 }
697 
698 /*
699 ** Implementation of the regexp() SQL function.  This function implements
700 ** the build-in REGEXP operator.  The first argument to the function is the
701 ** pattern and the second argument is the string.  So, the SQL statements:
702 **
703 **       A REGEXP B
704 **
705 ** is implemented as regexp(B,A).
706 */
re_sql_func(sqlite3_context * context,int argc,sqlite3_value ** argv)707 static void re_sql_func(
708   sqlite3_context *context,
709   int argc,
710   sqlite3_value **argv
711 ){
712   ReCompiled *pRe;          /* Compiled regular expression */
713   const char *zPattern;     /* The regular expression */
714   const unsigned char *zStr;/* String being searched */
715   const char *zErr;         /* Compile error message */
716   int setAux = 0;           /* True to invoke sqlite3_set_auxdata() */
717 
718   pRe = sqlite3_get_auxdata(context, 0);
719   if( pRe==0 ){
720     zPattern = (const char*)sqlite3_value_text(argv[0]);
721     if( zPattern==0 ) return;
722     zErr = re_compile(&pRe, zPattern, 0);
723     if( zErr ){
724       re_free(pRe);
725       sqlite3_result_error(context, zErr, -1);
726       return;
727     }
728     if( pRe==0 ){
729       sqlite3_result_error_nomem(context);
730       return;
731     }
732     setAux = 1;
733   }
734   zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
735   if( zStr!=0 ){
736     sqlite3_result_int(context, re_match(pRe, zStr, -1));
737   }
738   if( setAux ){
739     sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
740   }
741 }
742 
743 /*
744 ** Invoke this routine to register the regexp() function with the
745 ** SQLite database connection.
746 */
747 #ifdef _WIN32
748 __declspec(dllexport)
749 #endif
sqlite3_regexp_init(sqlite3 * db,char ** pzErrMsg,const sqlite3_api_routines * pApi)750 int sqlite3_regexp_init(
751   sqlite3 *db,
752   char **pzErrMsg,
753   const sqlite3_api_routines *pApi
754 ){
755   int rc = SQLITE_OK;
756   SQLITE_EXTENSION_INIT2(pApi);
757   rc = sqlite3_create_function(db, "regexp", 2, SQLITE_UTF8, 0,
758                                  re_sql_func, 0, 0);
759   return rc;
760 }
761