1 /* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2  *
3  * This Source Code Form is subject to the terms of the Mozilla Public
4  * License, v. 2.0. If a copy of the MPL was not distributed with this
5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 
7 package org.mozilla.javascript.regexp;
8 
9 import org.mozilla.javascript.*;
10 
11 /**
12  *
13  */
14 public class RegExpImpl implements RegExpProxy {
15 
isRegExp(Scriptable obj)16     public boolean isRegExp(Scriptable obj) {
17         return obj instanceof NativeRegExp;
18     }
19 
compileRegExp(Context cx, String source, String flags)20     public Object compileRegExp(Context cx, String source, String flags)
21     {
22         return NativeRegExp.compileRE(cx, source, flags, false);
23     }
24 
wrapRegExp(Context cx, Scriptable scope, Object compiled)25     public Scriptable wrapRegExp(Context cx, Scriptable scope,
26                                  Object compiled)
27     {
28         return new NativeRegExp(scope, (RECompiled) compiled);
29     }
30 
action(Context cx, Scriptable scope, Scriptable thisObj, Object[] args, int actionType)31     public Object action(Context cx, Scriptable scope,
32                          Scriptable thisObj, Object[] args,
33                          int actionType)
34     {
35         GlobData data = new GlobData();
36         data.mode = actionType;
37 
38         switch (actionType) {
39           case RA_MATCH:
40             {
41                 Object rval;
42                 data.optarg = 1;
43                 rval = matchOrReplace(cx, scope, thisObj, args,
44                                       this, data, false);
45                 return data.arrayobj == null ? rval : data.arrayobj;
46             }
47 
48           case RA_SEARCH:
49             data.optarg = 1;
50             return matchOrReplace(cx, scope, thisObj, args,
51                                   this, data, false);
52 
53           case RA_REPLACE:
54             {
55                 Object arg1 = args.length < 2 ? Undefined.instance : args[1];
56                 String repstr = null;
57                 Function lambda = null;
58                 if (arg1 instanceof Function) {
59                     lambda = (Function) arg1;
60                 } else {
61                     repstr = ScriptRuntime.toString(arg1);
62                 }
63 
64                 data.optarg = 2;
65                 data.lambda = lambda;
66                 data.repstr = repstr;
67                 data.dollar = repstr == null ? -1 : repstr.indexOf('$');
68                 data.charBuf = null;
69                 data.leftIndex = 0;
70                 Object val = matchOrReplace(cx, scope, thisObj, args,
71                                             this, data, true);
72 
73                 if (data.charBuf == null) {
74                     if (data.global || val == null
75                         || !val.equals(Boolean.TRUE))
76                     {
77                         /* Didn't match even once. */
78                         return data.str;
79                     }
80                     SubString lc = this.leftContext;
81                     replace_glob(data, cx, scope, this, lc.index, lc.length);
82                 }
83                 SubString rc = this.rightContext;
84                 data.charBuf.append(rc.str, rc.index, rc.index + rc.length);
85                 return data.charBuf.toString();
86             }
87 
88           default:
89             throw Kit.codeBug();
90         }
91     }
92 
93     /**
94      * Analog of C match_or_replace.
95      */
96     private static Object matchOrReplace(Context cx, Scriptable scope,
97                                          Scriptable thisObj, Object[] args,
98                                          RegExpImpl reImpl,
99                                          GlobData data, boolean forceFlat)
100     {
101         NativeRegExp re;
102 
103         String str = ScriptRuntime.toString(thisObj);
104         data.str = str;
105         Scriptable topScope = ScriptableObject.getTopLevelScope(scope);
106 
107         if (args.length == 0) {
108             RECompiled compiled = NativeRegExp.compileRE(cx, "", "", false);
109             re = new NativeRegExp(topScope, compiled);
110         } else if (args[0] instanceof NativeRegExp) {
111             re = (NativeRegExp) args[0];
112         } else {
113             String src = ScriptRuntime.toString(args[0]);
114             String opt;
115             if (data.optarg < args.length) {
116                 args[0] = src;
117                 opt = ScriptRuntime.toString(args[data.optarg]);
118             } else {
119                 opt = null;
120             }
121             RECompiled compiled = NativeRegExp.compileRE(cx, src, opt, forceFlat);
122             re = new NativeRegExp(topScope, compiled);
123         }
124 
125         data.global = (re.getFlags() & NativeRegExp.JSREG_GLOB) != 0;
126         int[] indexp = { 0 };
127         Object result = null;
128         if (data.mode == RA_SEARCH) {
129             result = re.executeRegExp(cx, scope, reImpl,
130                                       str, indexp, NativeRegExp.TEST);
131             if (result != null && result.equals(Boolean.TRUE))
132                 result = Integer.valueOf(reImpl.leftContext.length);
133             else
134                 result = Integer.valueOf(-1);
135         } else if (data.global) {
136             re.lastIndex = 0;
137             for (int count = 0; indexp[0] <= str.length(); count++) {
138                 result = re.executeRegExp(cx, scope, reImpl,
139                                           str, indexp, NativeRegExp.TEST);
140                 if (result == null || !result.equals(Boolean.TRUE))
141                     break;
142                 if (data.mode == RA_MATCH) {
143                     match_glob(data, cx, scope, count, reImpl);
144                 } else {
145                     if (data.mode != RA_REPLACE) Kit.codeBug();
146                     SubString lastMatch = reImpl.lastMatch;
147                     int leftIndex = data.leftIndex;
148                     int leftlen = lastMatch.index - leftIndex;
149                     data.leftIndex = lastMatch.index + lastMatch.length;
150                     replace_glob(data, cx, scope, reImpl, leftIndex, leftlen);
151                 }
152                 if (reImpl.lastMatch.length == 0) {
153                     if (indexp[0] == str.length())
154                         break;
155                     indexp[0]++;
156                 }
157             }
158         } else {
159             result = re.executeRegExp(cx, scope, reImpl, str, indexp,
160                                       ((data.mode == RA_REPLACE)
161                                        ? NativeRegExp.TEST
162                                        : NativeRegExp.MATCH));
163         }
164 
165         return result;
166     }
167 
168 
169 
170     public int find_split(Context cx, Scriptable scope, String target,
171                           String separator, Scriptable reObj,
172                           int[] ip, int[] matchlen,
173                           boolean[] matched, String[][] parensp)
174     {
175         int i = ip[0];
176         int length = target.length();
177         int result;
178 
179         int version = cx.getLanguageVersion();
180         NativeRegExp re = (NativeRegExp) reObj;
181         again:
182         while (true) {  // imitating C label
183             /* JS1.2 deviated from Perl by never matching at end of string. */
184             int ipsave = ip[0]; // reuse ip to save object creation
185             ip[0] = i;
186             Object ret = re.executeRegExp(cx, scope, this, target, ip,
187                                           NativeRegExp.TEST);
188             if (ret != Boolean.TRUE) {
189                 // Mismatch: ensure our caller advances i past end of string.
190                 ip[0] = ipsave;
191                 matchlen[0] = 1;
192                 matched[0] = false;
193                 return length;
194             }
195             i = ip[0];
196             ip[0] = ipsave;
197             matched[0] = true;
198 
199             SubString sep = this.lastMatch;
200             matchlen[0] = sep.length;
201             if (matchlen[0] == 0) {
202                 /*
203                  * Empty string match: never split on an empty
204                  * match at the start of a find_split cycle.  Same
205                  * rule as for an empty global match in
206                  * match_or_replace.
207                  */
208                 if (i == ip[0]) {
209                     /*
210                      * "Bump-along" to avoid sticking at an empty
211                      * match, but don't bump past end of string --
212                      * our caller must do that by adding
213                      * sep->length to our return value.
214                      */
215                     if (i == length) {
216                         if (version == Context.VERSION_1_2) {
217                             matchlen[0] = 1;
218                             result = i;
219                         }
220                         else
221                             result = -1;
222                         break;
223                     }
224                     i++;
225                     continue again; // imitating C goto
226                 }
227             }
228             // PR_ASSERT((size_t)i >= sep->length);
229             result = i - matchlen[0];
230             break;
231         }
232         int size = (parens == null) ? 0 : parens.length;
233         parensp[0] = new String[size];
234         for (int num = 0; num < size; num++) {
235             SubString parsub = getParenSubString(num);
236             parensp[0][num] = parsub.toString();
237         }
238         return result;
239     }
240 
241     /**
242      * Analog of REGEXP_PAREN_SUBSTRING in C jsregexp.h.
243      * Assumes zero-based; i.e., for $3, i==2
244      */
245     SubString getParenSubString(int i)
246     {
247         if (parens != null && i < parens.length) {
248             SubString parsub = parens[i];
249             if (parsub != null) {
250                 return parsub;
251             }
252         }
253         return SubString.emptySubString;
254     }
255 
256     /*
257      * Analog of match_glob() in jsstr.c
258      */
259     private static void match_glob(GlobData mdata, Context cx,
260                                    Scriptable scope, int count,
261                                    RegExpImpl reImpl)
262     {
263         if (mdata.arrayobj == null) {
264             mdata.arrayobj = cx.newArray(scope, 0);
265         }
266         SubString matchsub = reImpl.lastMatch;
267         String matchstr = matchsub.toString();
268         mdata.arrayobj.put(count, mdata.arrayobj, matchstr);
269     }
270 
271     /*
272      * Analog of replace_glob() in jsstr.c
273      */
274     private static void replace_glob(GlobData rdata, Context cx,
275                                      Scriptable scope, RegExpImpl reImpl,
276                                      int leftIndex, int leftlen)
277     {
278         int replen;
279         String lambdaStr;
280         if (rdata.lambda != null) {
281             // invoke lambda function with args lastMatch, $1, $2, ... $n,
282             // leftContext.length, whole string.
283             SubString[] parens = reImpl.parens;
284             int parenCount = (parens == null) ? 0 : parens.length;
285             Object[] args = new Object[parenCount + 3];
286             args[0] = reImpl.lastMatch.toString();
287             for (int i=0; i < parenCount; i++) {
288                 SubString sub = parens[i];
289                 if (sub != null) {
290                     args[i+1] = sub.toString();
291                 } else {
292                     args[i+1] = Undefined.instance;
293                 }
294             }
295             args[parenCount+1] = Integer.valueOf(reImpl.leftContext.length);
296             args[parenCount+2] = rdata.str;
297             // This is a hack to prevent expose of reImpl data to
298             // JS function which can run new regexps modifing
299             // regexp that are used later by the engine.
300             // TODO: redesign is necessary
301             if (reImpl != ScriptRuntime.getRegExpProxy(cx)) Kit.codeBug();
302             RegExpImpl re2 = new RegExpImpl();
303             re2.multiline = reImpl.multiline;
304             re2.input = reImpl.input;
305             ScriptRuntime.setRegExpProxy(cx, re2);
306             try {
307                 Scriptable parent = ScriptableObject.getTopLevelScope(scope);
308                 Object result = rdata.lambda.call(cx, parent, parent, args);
309                 lambdaStr = ScriptRuntime.toString(result);
310             } finally {
311                 ScriptRuntime.setRegExpProxy(cx, reImpl);
312             }
313             replen = lambdaStr.length();
314         } else {
315             lambdaStr = null;
316             replen = rdata.repstr.length();
317             if (rdata.dollar >= 0) {
318                 int[] skip = new int[1];
319                 int dp = rdata.dollar;
320                 do {
321                     SubString sub = interpretDollar(cx, reImpl, rdata.repstr,
322                                                     dp, skip);
323                     if (sub != null) {
324                         replen += sub.length - skip[0];
325                         dp += skip[0];
326                     } else {
327                         ++dp;
328                     }
329                     dp = rdata.repstr.indexOf('$', dp);
330                 } while (dp >= 0);
331             }
332         }
333 
334         int growth = leftlen + replen + reImpl.rightContext.length;
335         StringBuilder charBuf = rdata.charBuf;
336         if (charBuf == null) {
337             charBuf = new StringBuilder(growth);
338             rdata.charBuf = charBuf;
339         } else {
340             charBuf.ensureCapacity(rdata.charBuf.length() + growth);
341         }
342 
343         charBuf.append(reImpl.leftContext.str, leftIndex, leftIndex + leftlen);
344         if (rdata.lambda != null) {
345             charBuf.append(lambdaStr);
346         } else {
347             do_replace(rdata, cx, reImpl);
348         }
349     }
350 
351     private static SubString interpretDollar(Context cx, RegExpImpl res,
352                                              String da, int dp, int[] skip)
353     {
354         char dc;
355         int num, tmp;
356 
357         if (da.charAt(dp) != '$') Kit.codeBug();
358 
359         /* Allow a real backslash (literal "\\") to escape "$1" etc. */
360         int version = cx.getLanguageVersion();
361         if (version != Context.VERSION_DEFAULT
362             && version <= Context.VERSION_1_4)
363         {
364             if (dp > 0 && da.charAt(dp - 1) == '\\')
365                 return null;
366         }
367         int daL = da.length();
368         if (dp + 1 >= daL)
369             return null;
370         /* Interpret all Perl match-induced dollar variables. */
371         dc = da.charAt(dp + 1);
372         if (NativeRegExp.isDigit(dc)) {
373             int cp;
374             if (version != Context.VERSION_DEFAULT
375                 && version <= Context.VERSION_1_4)
376             {
377                 if (dc == '0')
378                     return null;
379                 /* Check for overflow to avoid gobbling arbitrary decimal digits. */
380                 num = 0;
381                 cp = dp;
382                 while (++cp < daL && NativeRegExp.isDigit(dc = da.charAt(cp)))
383                 {
384                     tmp = 10 * num + (dc - '0');
385                     if (tmp < num)
386                         break;
387                     num = tmp;
388                 }
389             }
390             else {  /* ECMA 3, 1-9 or 01-99 */
391                 int parenCount = (res.parens == null) ? 0 : res.parens.length;
392                 num = dc - '0';
393                 if (num > parenCount)
394                     return null;
395                 cp = dp + 2;
396                 if ((dp + 2) < daL) {
397                     dc = da.charAt(dp + 2);
398                     if (NativeRegExp.isDigit(dc)) {
399                         tmp = 10 * num + (dc - '0');
400                         if (tmp <= parenCount) {
401                             cp++;
402                             num = tmp;
403                         }
404                     }
405                 }
406                 if (num == 0) return null;  /* $0 or $00 is not valid */
407             }
408             /* Adjust num from 1 $n-origin to 0 array-index-origin. */
409             num--;
410             skip[0] = cp - dp;
411             return res.getParenSubString(num);
412         }
413 
414         skip[0] = 2;
415         switch (dc) {
416           case '$':
417             return new SubString("$");
418           case '&':
419             return res.lastMatch;
420           case '+':
421             return res.lastParen;
422           case '`':
423             if (version == Context.VERSION_1_2) {
424                 /*
425                  * JS1.2 imitated the Perl4 bug where left context at each step
426                  * in an iterative use of a global regexp started from last match,
427                  * not from the start of the target string.  But Perl4 does start
428                  * $` at the beginning of the target string when it is used in a
429                  * substitution, so we emulate that special case here.
430                  */
431                 res.leftContext.index = 0;
432                 res.leftContext.length = res.lastMatch.index;
433             }
434             return res.leftContext;
435           case '\'':
436             return res.rightContext;
437         }
438         return null;
439     }
440 
441     /**
442      * Analog of do_replace in jsstr.c
443      */
444     private static void do_replace(GlobData rdata, Context cx,
445                                    RegExpImpl regExpImpl)
446     {
447         StringBuilder charBuf = rdata.charBuf;
448         int cp = 0;
449         String da = rdata.repstr;
450         int dp = rdata.dollar;
451         if (dp != -1) {
452             int[] skip = new int[1];
453             do {
454                 int len = dp - cp;
455                 charBuf.append(da.substring(cp, dp));
456                 cp = dp;
457                 SubString sub = interpretDollar(cx, regExpImpl, da,
458                                                 dp, skip);
459                 if (sub != null) {
460                     len = sub.length;
461                     if (len > 0) {
462                         charBuf.append(sub.str, sub.index, sub.index + len);
463                     }
464                     cp += skip[0];
465                     dp += skip[0];
466                 } else {
467                     ++dp;
468                 }
469                 dp = da.indexOf('$', dp);
470             } while (dp >= 0);
471         }
472         int daL = da.length();
473         if (daL > cp) {
474             charBuf.append(da.substring(cp, daL));
475         }
476     }
477 
478     /*
479      * See ECMA 15.5.4.8.  Modified to match JS 1.2 - optionally takes
480      * a limit argument and accepts a regular expression as the split
481      * argument.
482      */
483     public Object js_split(Context cx, Scriptable scope,
484                                    String target, Object[] args)
485     {
486         // create an empty Array to return;
487         Scriptable result = cx.newArray(scope, 0);
488 
489         // return an array consisting of the target if no separator given
490         // don't check against undefined, because we want
491         // 'fooundefinedbar'.split(void 0) to split to ['foo', 'bar']
492         if (args.length < 1) {
493             result.put(0, result, target);
494             return result;
495         }
496 
497         // Use the second argument as the split limit, if given.
498         boolean limited = (args.length > 1) && (args[1] != Undefined.instance);
499         long limit = 0;  // Initialize to avoid warning.
500         if (limited) {
501             /* Clamp limit between 0 and 1 + string length. */
502             limit = ScriptRuntime.toUint32(args[1]);
503             if (limit > target.length())
504                 limit = 1 + target.length();
505         }
506 
507         String separator = null;
508         int[] matchlen = new int[1];
509         Scriptable re = null;
510         RegExpProxy reProxy = null;
511         if (args[0] instanceof Scriptable) {
512             reProxy = ScriptRuntime.getRegExpProxy(cx);
513             if (reProxy != null) {
514                 Scriptable test = (Scriptable)args[0];
515                 if (reProxy.isRegExp(test)) {
516                     re = test;
517                 }
518             }
519         }
520         if (re == null) {
521             separator = ScriptRuntime.toString(args[0]);
522             matchlen[0] = separator.length();
523         }
524 
525         // split target with separator or re
526         int[] ip = { 0 };
527         int match;
528         int len = 0;
529         boolean[] matched = { false };
530         String[][] parens = { null };
531         int version = cx.getLanguageVersion();
532         while ((match = find_split(cx, scope, target, separator, version,
533                                    reProxy, re, ip, matchlen, matched, parens))
534                >= 0)
535         {
536             if ((limited && len >= limit) || (match > target.length()))
537                 break;
538 
539             String substr;
540             if (target.length() == 0)
541                 substr = target;
542             else
543                 substr = target.substring(ip[0], match);
544 
545             result.put(len, result, substr);
546             len++;
547         /*
548          * Imitate perl's feature of including parenthesized substrings
549          * that matched part of the delimiter in the new array, after the
550          * split substring that was delimited.
551          */
552             if (re != null && matched[0] == true) {
553                 int size = parens[0].length;
554                 for (int num = 0; num < size; num++) {
555                     if (limited && len >= limit)
556                         break;
557                     result.put(len, result, parens[0][num]);
558                     len++;
559                 }
560                 matched[0] = false;
561             }
562             ip[0] = match + matchlen[0];
563 
564             if (version < Context.VERSION_1_3
565                 && version != Context.VERSION_DEFAULT)
566             {
567         /*
568          * Deviate from ECMA to imitate Perl, which omits a final
569          * split unless a limit argument is given and big enough.
570          */
571                 if (!limited && ip[0] == target.length())
572                     break;
573             }
574         }
575         return result;
576     }
577 
578     /*
579      * Used by js_split to find the next split point in target,
580      * starting at offset ip and looking either for the given
581      * separator substring, or for the next re match.  ip and
582      * matchlen must be reference variables (assumed to be arrays of
583      * length 1) so they can be updated in the leading whitespace or
584      * re case.
585      *
586      * Return -1 on end of string, >= 0 for a valid index of the next
587      * separator occurrence if found, or the string length if no
588      * separator is found.
589      */
find_split(Context cx, Scriptable scope, String target, String separator, int version, RegExpProxy reProxy, Scriptable re, int[] ip, int[] matchlen, boolean[] matched, String[][] parensp)590     private static int find_split(Context cx, Scriptable scope, String target,
591                                   String separator, int version,
592                                   RegExpProxy reProxy, Scriptable re,
593                                   int[] ip, int[] matchlen, boolean[] matched,
594                                   String[][] parensp)
595     {
596         int i = ip[0];
597         int length = target.length();
598 
599         /*
600          * Perl4 special case for str.split(' '), only if the user has selected
601          * JavaScript1.2 explicitly.  Split on whitespace, and skip leading w/s.
602          * Strange but true, apparently modeled after awk.
603          */
604         if (version == Context.VERSION_1_2 &&
605             re == null && separator.length() == 1 && separator.charAt(0) == ' ')
606         {
607             /* Skip leading whitespace if at front of str. */
608             if (i == 0) {
609                 while (i < length && Character.isWhitespace(target.charAt(i)))
610                     i++;
611                 ip[0] = i;
612             }
613 
614             /* Don't delimit whitespace at end of string. */
615             if (i == length)
616                 return -1;
617 
618             /* Skip over the non-whitespace chars. */
619             while (i < length
620                    && !Character.isWhitespace(target.charAt(i)))
621                 i++;
622 
623             /* Now skip the next run of whitespace. */
624             int j = i;
625             while (j < length && Character.isWhitespace(target.charAt(j)))
626                 j++;
627 
628             /* Update matchlen to count delimiter chars. */
629             matchlen[0] = j - i;
630             return i;
631         }
632 
633         /*
634          * Stop if past end of string.  If at end of string, we will
635          * return target length, so that
636          *
637          *  "ab,".split(',') => new Array("ab", "")
638          *
639          * and the resulting array converts back to the string "ab,"
640          * for symmetry.  NB: This differs from perl, which drops the
641          * trailing empty substring if the LIMIT argument is omitted.
642          */
643         if (i > length)
644             return -1;
645 
646         /*
647          * Match a regular expression against the separator at or
648          * above index i.  Return -1 at end of string instead of
649          * trying for a match, so we don't get stuck in a loop.
650          */
651         if (re != null) {
652             return reProxy.find_split(cx, scope, target, separator, re,
653                                       ip, matchlen, matched, parensp);
654         }
655 
656         /*
657          * Deviate from ECMA by never splitting an empty string by any separator
658          * string into a non-empty array (an array of length 1 that contains the
659          * empty string).
660          */
661         if (version != Context.VERSION_DEFAULT && version < Context.VERSION_1_3
662             && length == 0)
663             return -1;
664 
665         /*
666          * Special case: if sep is the empty string, split str into
667          * one character substrings.  Let our caller worry about
668          * whether to split once at end of string into an empty
669          * substring.
670          *
671          * For 1.2 compatibility, at the end of the string, we return the length as
672          * the result, and set the separator length to 1 -- this allows the caller
673          * to include an additional null string at the end of the substring list.
674          */
675         if (separator.length() == 0) {
676             if (version == Context.VERSION_1_2) {
677                 if (i == length) {
678                     matchlen[0] = 1;
679                     return i;
680                 }
681                 return i + 1;
682             }
683             return (i == length) ? -1 : i + 1;
684         }
685 
686         /* Punt to j.l.s.indexOf; return target length if separator is
687          * not found.
688          */
689         if (ip[0] >= length)
690             return length;
691 
692         i = target.indexOf(separator, ip[0]);
693 
694         return (i != -1) ? i : length;
695     }
696 
697     protected String          input;         /* input string to match (perl $_, GC root) */
698     protected boolean         multiline;     /* whether input contains newlines (perl $*) */
699     protected SubString[]     parens;        /* Vector of SubString; last set of parens
700                                       matched (perl $1, $2) */
701     protected SubString       lastMatch;     /* last string matched (perl $&) */
702     protected SubString       lastParen;     /* last paren matched (perl $+) */
703     protected SubString       leftContext;   /* input to left of last match (perl $`) */
704     protected SubString       rightContext;  /* input to right of last match (perl $') */
705 }
706 
707 
708 final class GlobData
709 {
710     int      mode;      /* input: return index, match object, or void */
711     int      optarg;    /* input: index of optional flags argument */
712     boolean  global;    /* output: whether regexp was global */
713     String   str;       /* output: 'this' parameter object as string */
714 
715     // match-specific data
716 
717     Scriptable arrayobj;
718 
719     // replace-specific data
720 
721     Function      lambda;        /* replacement function object or null */
722     String        repstr;        /* replacement string */
723     int           dollar = -1;   /* -1 or index of first $ in repstr */
724     StringBuilder charBuf;       /* result characters, null initially */
725     int           leftIndex;     /* leftContext index, always 0 for JS1.2 */
726 }
727