1 /* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 2 * 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 package org.mozilla.javascript.regexp; 8 9 import org.mozilla.javascript.*; 10 11 /** 12 * 13 */ 14 public class RegExpImpl implements RegExpProxy { 15 isRegExp(Scriptable obj)16 public boolean isRegExp(Scriptable obj) { 17 return obj instanceof NativeRegExp; 18 } 19 compileRegExp(Context cx, String source, String flags)20 public Object compileRegExp(Context cx, String source, String flags) 21 { 22 return NativeRegExp.compileRE(cx, source, flags, false); 23 } 24 wrapRegExp(Context cx, Scriptable scope, Object compiled)25 public Scriptable wrapRegExp(Context cx, Scriptable scope, 26 Object compiled) 27 { 28 return new NativeRegExp(scope, (RECompiled) compiled); 29 } 30 action(Context cx, Scriptable scope, Scriptable thisObj, Object[] args, int actionType)31 public Object action(Context cx, Scriptable scope, 32 Scriptable thisObj, Object[] args, 33 int actionType) 34 { 35 GlobData data = new GlobData(); 36 data.mode = actionType; 37 38 switch (actionType) { 39 case RA_MATCH: 40 { 41 Object rval; 42 data.optarg = 1; 43 rval = matchOrReplace(cx, scope, thisObj, args, 44 this, data, false); 45 return data.arrayobj == null ? rval : data.arrayobj; 46 } 47 48 case RA_SEARCH: 49 data.optarg = 1; 50 return matchOrReplace(cx, scope, thisObj, args, 51 this, data, false); 52 53 case RA_REPLACE: 54 { 55 Object arg1 = args.length < 2 ? Undefined.instance : args[1]; 56 String repstr = null; 57 Function lambda = null; 58 if (arg1 instanceof Function) { 59 lambda = (Function) arg1; 60 } else { 61 repstr = ScriptRuntime.toString(arg1); 62 } 63 64 data.optarg = 2; 65 data.lambda = lambda; 66 data.repstr = repstr; 67 data.dollar = repstr == null ? -1 : repstr.indexOf('$'); 68 data.charBuf = null; 69 data.leftIndex = 0; 70 Object val = matchOrReplace(cx, scope, thisObj, args, 71 this, data, true); 72 73 if (data.charBuf == null) { 74 if (data.global || val == null 75 || !val.equals(Boolean.TRUE)) 76 { 77 /* Didn't match even once. */ 78 return data.str; 79 } 80 SubString lc = this.leftContext; 81 replace_glob(data, cx, scope, this, lc.index, lc.length); 82 } 83 SubString rc = this.rightContext; 84 data.charBuf.append(rc.str, rc.index, rc.index + rc.length); 85 return data.charBuf.toString(); 86 } 87 88 default: 89 throw Kit.codeBug(); 90 } 91 } 92 93 /** 94 * Analog of C match_or_replace. 95 */ 96 private static Object matchOrReplace(Context cx, Scriptable scope, 97 Scriptable thisObj, Object[] args, 98 RegExpImpl reImpl, 99 GlobData data, boolean forceFlat) 100 { 101 NativeRegExp re; 102 103 String str = ScriptRuntime.toString(thisObj); 104 data.str = str; 105 Scriptable topScope = ScriptableObject.getTopLevelScope(scope); 106 107 if (args.length == 0) { 108 RECompiled compiled = NativeRegExp.compileRE(cx, "", "", false); 109 re = new NativeRegExp(topScope, compiled); 110 } else if (args[0] instanceof NativeRegExp) { 111 re = (NativeRegExp) args[0]; 112 } else { 113 String src = ScriptRuntime.toString(args[0]); 114 String opt; 115 if (data.optarg < args.length) { 116 args[0] = src; 117 opt = ScriptRuntime.toString(args[data.optarg]); 118 } else { 119 opt = null; 120 } 121 RECompiled compiled = NativeRegExp.compileRE(cx, src, opt, forceFlat); 122 re = new NativeRegExp(topScope, compiled); 123 } 124 125 data.global = (re.getFlags() & NativeRegExp.JSREG_GLOB) != 0; 126 int[] indexp = { 0 }; 127 Object result = null; 128 if (data.mode == RA_SEARCH) { 129 result = re.executeRegExp(cx, scope, reImpl, 130 str, indexp, NativeRegExp.TEST); 131 if (result != null && result.equals(Boolean.TRUE)) 132 result = Integer.valueOf(reImpl.leftContext.length); 133 else 134 result = Integer.valueOf(-1); 135 } else if (data.global) { 136 re.lastIndex = 0; 137 for (int count = 0; indexp[0] <= str.length(); count++) { 138 result = re.executeRegExp(cx, scope, reImpl, 139 str, indexp, NativeRegExp.TEST); 140 if (result == null || !result.equals(Boolean.TRUE)) 141 break; 142 if (data.mode == RA_MATCH) { 143 match_glob(data, cx, scope, count, reImpl); 144 } else { 145 if (data.mode != RA_REPLACE) Kit.codeBug(); 146 SubString lastMatch = reImpl.lastMatch; 147 int leftIndex = data.leftIndex; 148 int leftlen = lastMatch.index - leftIndex; 149 data.leftIndex = lastMatch.index + lastMatch.length; 150 replace_glob(data, cx, scope, reImpl, leftIndex, leftlen); 151 } 152 if (reImpl.lastMatch.length == 0) { 153 if (indexp[0] == str.length()) 154 break; 155 indexp[0]++; 156 } 157 } 158 } else { 159 result = re.executeRegExp(cx, scope, reImpl, str, indexp, 160 ((data.mode == RA_REPLACE) 161 ? NativeRegExp.TEST 162 : NativeRegExp.MATCH)); 163 } 164 165 return result; 166 } 167 168 169 170 public int find_split(Context cx, Scriptable scope, String target, 171 String separator, Scriptable reObj, 172 int[] ip, int[] matchlen, 173 boolean[] matched, String[][] parensp) 174 { 175 int i = ip[0]; 176 int length = target.length(); 177 int result; 178 179 int version = cx.getLanguageVersion(); 180 NativeRegExp re = (NativeRegExp) reObj; 181 again: 182 while (true) { // imitating C label 183 /* JS1.2 deviated from Perl by never matching at end of string. */ 184 int ipsave = ip[0]; // reuse ip to save object creation 185 ip[0] = i; 186 Object ret = re.executeRegExp(cx, scope, this, target, ip, 187 NativeRegExp.TEST); 188 if (ret != Boolean.TRUE) { 189 // Mismatch: ensure our caller advances i past end of string. 190 ip[0] = ipsave; 191 matchlen[0] = 1; 192 matched[0] = false; 193 return length; 194 } 195 i = ip[0]; 196 ip[0] = ipsave; 197 matched[0] = true; 198 199 SubString sep = this.lastMatch; 200 matchlen[0] = sep.length; 201 if (matchlen[0] == 0) { 202 /* 203 * Empty string match: never split on an empty 204 * match at the start of a find_split cycle. Same 205 * rule as for an empty global match in 206 * match_or_replace. 207 */ 208 if (i == ip[0]) { 209 /* 210 * "Bump-along" to avoid sticking at an empty 211 * match, but don't bump past end of string -- 212 * our caller must do that by adding 213 * sep->length to our return value. 214 */ 215 if (i == length) { 216 if (version == Context.VERSION_1_2) { 217 matchlen[0] = 1; 218 result = i; 219 } 220 else 221 result = -1; 222 break; 223 } 224 i++; 225 continue again; // imitating C goto 226 } 227 } 228 // PR_ASSERT((size_t)i >= sep->length); 229 result = i - matchlen[0]; 230 break; 231 } 232 int size = (parens == null) ? 0 : parens.length; 233 parensp[0] = new String[size]; 234 for (int num = 0; num < size; num++) { 235 SubString parsub = getParenSubString(num); 236 parensp[0][num] = parsub.toString(); 237 } 238 return result; 239 } 240 241 /** 242 * Analog of REGEXP_PAREN_SUBSTRING in C jsregexp.h. 243 * Assumes zero-based; i.e., for $3, i==2 244 */ 245 SubString getParenSubString(int i) 246 { 247 if (parens != null && i < parens.length) { 248 SubString parsub = parens[i]; 249 if (parsub != null) { 250 return parsub; 251 } 252 } 253 return SubString.emptySubString; 254 } 255 256 /* 257 * Analog of match_glob() in jsstr.c 258 */ 259 private static void match_glob(GlobData mdata, Context cx, 260 Scriptable scope, int count, 261 RegExpImpl reImpl) 262 { 263 if (mdata.arrayobj == null) { 264 mdata.arrayobj = cx.newArray(scope, 0); 265 } 266 SubString matchsub = reImpl.lastMatch; 267 String matchstr = matchsub.toString(); 268 mdata.arrayobj.put(count, mdata.arrayobj, matchstr); 269 } 270 271 /* 272 * Analog of replace_glob() in jsstr.c 273 */ 274 private static void replace_glob(GlobData rdata, Context cx, 275 Scriptable scope, RegExpImpl reImpl, 276 int leftIndex, int leftlen) 277 { 278 int replen; 279 String lambdaStr; 280 if (rdata.lambda != null) { 281 // invoke lambda function with args lastMatch, $1, $2, ... $n, 282 // leftContext.length, whole string. 283 SubString[] parens = reImpl.parens; 284 int parenCount = (parens == null) ? 0 : parens.length; 285 Object[] args = new Object[parenCount + 3]; 286 args[0] = reImpl.lastMatch.toString(); 287 for (int i=0; i < parenCount; i++) { 288 SubString sub = parens[i]; 289 if (sub != null) { 290 args[i+1] = sub.toString(); 291 } else { 292 args[i+1] = Undefined.instance; 293 } 294 } 295 args[parenCount+1] = Integer.valueOf(reImpl.leftContext.length); 296 args[parenCount+2] = rdata.str; 297 // This is a hack to prevent expose of reImpl data to 298 // JS function which can run new regexps modifing 299 // regexp that are used later by the engine. 300 // TODO: redesign is necessary 301 if (reImpl != ScriptRuntime.getRegExpProxy(cx)) Kit.codeBug(); 302 RegExpImpl re2 = new RegExpImpl(); 303 re2.multiline = reImpl.multiline; 304 re2.input = reImpl.input; 305 ScriptRuntime.setRegExpProxy(cx, re2); 306 try { 307 Scriptable parent = ScriptableObject.getTopLevelScope(scope); 308 Object result = rdata.lambda.call(cx, parent, parent, args); 309 lambdaStr = ScriptRuntime.toString(result); 310 } finally { 311 ScriptRuntime.setRegExpProxy(cx, reImpl); 312 } 313 replen = lambdaStr.length(); 314 } else { 315 lambdaStr = null; 316 replen = rdata.repstr.length(); 317 if (rdata.dollar >= 0) { 318 int[] skip = new int[1]; 319 int dp = rdata.dollar; 320 do { 321 SubString sub = interpretDollar(cx, reImpl, rdata.repstr, 322 dp, skip); 323 if (sub != null) { 324 replen += sub.length - skip[0]; 325 dp += skip[0]; 326 } else { 327 ++dp; 328 } 329 dp = rdata.repstr.indexOf('$', dp); 330 } while (dp >= 0); 331 } 332 } 333 334 int growth = leftlen + replen + reImpl.rightContext.length; 335 StringBuilder charBuf = rdata.charBuf; 336 if (charBuf == null) { 337 charBuf = new StringBuilder(growth); 338 rdata.charBuf = charBuf; 339 } else { 340 charBuf.ensureCapacity(rdata.charBuf.length() + growth); 341 } 342 343 charBuf.append(reImpl.leftContext.str, leftIndex, leftIndex + leftlen); 344 if (rdata.lambda != null) { 345 charBuf.append(lambdaStr); 346 } else { 347 do_replace(rdata, cx, reImpl); 348 } 349 } 350 351 private static SubString interpretDollar(Context cx, RegExpImpl res, 352 String da, int dp, int[] skip) 353 { 354 char dc; 355 int num, tmp; 356 357 if (da.charAt(dp) != '$') Kit.codeBug(); 358 359 /* Allow a real backslash (literal "\\") to escape "$1" etc. */ 360 int version = cx.getLanguageVersion(); 361 if (version != Context.VERSION_DEFAULT 362 && version <= Context.VERSION_1_4) 363 { 364 if (dp > 0 && da.charAt(dp - 1) == '\\') 365 return null; 366 } 367 int daL = da.length(); 368 if (dp + 1 >= daL) 369 return null; 370 /* Interpret all Perl match-induced dollar variables. */ 371 dc = da.charAt(dp + 1); 372 if (NativeRegExp.isDigit(dc)) { 373 int cp; 374 if (version != Context.VERSION_DEFAULT 375 && version <= Context.VERSION_1_4) 376 { 377 if (dc == '0') 378 return null; 379 /* Check for overflow to avoid gobbling arbitrary decimal digits. */ 380 num = 0; 381 cp = dp; 382 while (++cp < daL && NativeRegExp.isDigit(dc = da.charAt(cp))) 383 { 384 tmp = 10 * num + (dc - '0'); 385 if (tmp < num) 386 break; 387 num = tmp; 388 } 389 } 390 else { /* ECMA 3, 1-9 or 01-99 */ 391 int parenCount = (res.parens == null) ? 0 : res.parens.length; 392 num = dc - '0'; 393 if (num > parenCount) 394 return null; 395 cp = dp + 2; 396 if ((dp + 2) < daL) { 397 dc = da.charAt(dp + 2); 398 if (NativeRegExp.isDigit(dc)) { 399 tmp = 10 * num + (dc - '0'); 400 if (tmp <= parenCount) { 401 cp++; 402 num = tmp; 403 } 404 } 405 } 406 if (num == 0) return null; /* $0 or $00 is not valid */ 407 } 408 /* Adjust num from 1 $n-origin to 0 array-index-origin. */ 409 num--; 410 skip[0] = cp - dp; 411 return res.getParenSubString(num); 412 } 413 414 skip[0] = 2; 415 switch (dc) { 416 case '$': 417 return new SubString("$"); 418 case '&': 419 return res.lastMatch; 420 case '+': 421 return res.lastParen; 422 case '`': 423 if (version == Context.VERSION_1_2) { 424 /* 425 * JS1.2 imitated the Perl4 bug where left context at each step 426 * in an iterative use of a global regexp started from last match, 427 * not from the start of the target string. But Perl4 does start 428 * $` at the beginning of the target string when it is used in a 429 * substitution, so we emulate that special case here. 430 */ 431 res.leftContext.index = 0; 432 res.leftContext.length = res.lastMatch.index; 433 } 434 return res.leftContext; 435 case '\'': 436 return res.rightContext; 437 } 438 return null; 439 } 440 441 /** 442 * Analog of do_replace in jsstr.c 443 */ 444 private static void do_replace(GlobData rdata, Context cx, 445 RegExpImpl regExpImpl) 446 { 447 StringBuilder charBuf = rdata.charBuf; 448 int cp = 0; 449 String da = rdata.repstr; 450 int dp = rdata.dollar; 451 if (dp != -1) { 452 int[] skip = new int[1]; 453 do { 454 int len = dp - cp; 455 charBuf.append(da.substring(cp, dp)); 456 cp = dp; 457 SubString sub = interpretDollar(cx, regExpImpl, da, 458 dp, skip); 459 if (sub != null) { 460 len = sub.length; 461 if (len > 0) { 462 charBuf.append(sub.str, sub.index, sub.index + len); 463 } 464 cp += skip[0]; 465 dp += skip[0]; 466 } else { 467 ++dp; 468 } 469 dp = da.indexOf('$', dp); 470 } while (dp >= 0); 471 } 472 int daL = da.length(); 473 if (daL > cp) { 474 charBuf.append(da.substring(cp, daL)); 475 } 476 } 477 478 /* 479 * See ECMA 15.5.4.8. Modified to match JS 1.2 - optionally takes 480 * a limit argument and accepts a regular expression as the split 481 * argument. 482 */ 483 public Object js_split(Context cx, Scriptable scope, 484 String target, Object[] args) 485 { 486 // create an empty Array to return; 487 Scriptable result = cx.newArray(scope, 0); 488 489 // return an array consisting of the target if no separator given 490 // don't check against undefined, because we want 491 // 'fooundefinedbar'.split(void 0) to split to ['foo', 'bar'] 492 if (args.length < 1) { 493 result.put(0, result, target); 494 return result; 495 } 496 497 // Use the second argument as the split limit, if given. 498 boolean limited = (args.length > 1) && (args[1] != Undefined.instance); 499 long limit = 0; // Initialize to avoid warning. 500 if (limited) { 501 /* Clamp limit between 0 and 1 + string length. */ 502 limit = ScriptRuntime.toUint32(args[1]); 503 if (limit > target.length()) 504 limit = 1 + target.length(); 505 } 506 507 String separator = null; 508 int[] matchlen = new int[1]; 509 Scriptable re = null; 510 RegExpProxy reProxy = null; 511 if (args[0] instanceof Scriptable) { 512 reProxy = ScriptRuntime.getRegExpProxy(cx); 513 if (reProxy != null) { 514 Scriptable test = (Scriptable)args[0]; 515 if (reProxy.isRegExp(test)) { 516 re = test; 517 } 518 } 519 } 520 if (re == null) { 521 separator = ScriptRuntime.toString(args[0]); 522 matchlen[0] = separator.length(); 523 } 524 525 // split target with separator or re 526 int[] ip = { 0 }; 527 int match; 528 int len = 0; 529 boolean[] matched = { false }; 530 String[][] parens = { null }; 531 int version = cx.getLanguageVersion(); 532 while ((match = find_split(cx, scope, target, separator, version, 533 reProxy, re, ip, matchlen, matched, parens)) 534 >= 0) 535 { 536 if ((limited && len >= limit) || (match > target.length())) 537 break; 538 539 String substr; 540 if (target.length() == 0) 541 substr = target; 542 else 543 substr = target.substring(ip[0], match); 544 545 result.put(len, result, substr); 546 len++; 547 /* 548 * Imitate perl's feature of including parenthesized substrings 549 * that matched part of the delimiter in the new array, after the 550 * split substring that was delimited. 551 */ 552 if (re != null && matched[0] == true) { 553 int size = parens[0].length; 554 for (int num = 0; num < size; num++) { 555 if (limited && len >= limit) 556 break; 557 result.put(len, result, parens[0][num]); 558 len++; 559 } 560 matched[0] = false; 561 } 562 ip[0] = match + matchlen[0]; 563 564 if (version < Context.VERSION_1_3 565 && version != Context.VERSION_DEFAULT) 566 { 567 /* 568 * Deviate from ECMA to imitate Perl, which omits a final 569 * split unless a limit argument is given and big enough. 570 */ 571 if (!limited && ip[0] == target.length()) 572 break; 573 } 574 } 575 return result; 576 } 577 578 /* 579 * Used by js_split to find the next split point in target, 580 * starting at offset ip and looking either for the given 581 * separator substring, or for the next re match. ip and 582 * matchlen must be reference variables (assumed to be arrays of 583 * length 1) so they can be updated in the leading whitespace or 584 * re case. 585 * 586 * Return -1 on end of string, >= 0 for a valid index of the next 587 * separator occurrence if found, or the string length if no 588 * separator is found. 589 */ find_split(Context cx, Scriptable scope, String target, String separator, int version, RegExpProxy reProxy, Scriptable re, int[] ip, int[] matchlen, boolean[] matched, String[][] parensp)590 private static int find_split(Context cx, Scriptable scope, String target, 591 String separator, int version, 592 RegExpProxy reProxy, Scriptable re, 593 int[] ip, int[] matchlen, boolean[] matched, 594 String[][] parensp) 595 { 596 int i = ip[0]; 597 int length = target.length(); 598 599 /* 600 * Perl4 special case for str.split(' '), only if the user has selected 601 * JavaScript1.2 explicitly. Split on whitespace, and skip leading w/s. 602 * Strange but true, apparently modeled after awk. 603 */ 604 if (version == Context.VERSION_1_2 && 605 re == null && separator.length() == 1 && separator.charAt(0) == ' ') 606 { 607 /* Skip leading whitespace if at front of str. */ 608 if (i == 0) { 609 while (i < length && Character.isWhitespace(target.charAt(i))) 610 i++; 611 ip[0] = i; 612 } 613 614 /* Don't delimit whitespace at end of string. */ 615 if (i == length) 616 return -1; 617 618 /* Skip over the non-whitespace chars. */ 619 while (i < length 620 && !Character.isWhitespace(target.charAt(i))) 621 i++; 622 623 /* Now skip the next run of whitespace. */ 624 int j = i; 625 while (j < length && Character.isWhitespace(target.charAt(j))) 626 j++; 627 628 /* Update matchlen to count delimiter chars. */ 629 matchlen[0] = j - i; 630 return i; 631 } 632 633 /* 634 * Stop if past end of string. If at end of string, we will 635 * return target length, so that 636 * 637 * "ab,".split(',') => new Array("ab", "") 638 * 639 * and the resulting array converts back to the string "ab," 640 * for symmetry. NB: This differs from perl, which drops the 641 * trailing empty substring if the LIMIT argument is omitted. 642 */ 643 if (i > length) 644 return -1; 645 646 /* 647 * Match a regular expression against the separator at or 648 * above index i. Return -1 at end of string instead of 649 * trying for a match, so we don't get stuck in a loop. 650 */ 651 if (re != null) { 652 return reProxy.find_split(cx, scope, target, separator, re, 653 ip, matchlen, matched, parensp); 654 } 655 656 /* 657 * Deviate from ECMA by never splitting an empty string by any separator 658 * string into a non-empty array (an array of length 1 that contains the 659 * empty string). 660 */ 661 if (version != Context.VERSION_DEFAULT && version < Context.VERSION_1_3 662 && length == 0) 663 return -1; 664 665 /* 666 * Special case: if sep is the empty string, split str into 667 * one character substrings. Let our caller worry about 668 * whether to split once at end of string into an empty 669 * substring. 670 * 671 * For 1.2 compatibility, at the end of the string, we return the length as 672 * the result, and set the separator length to 1 -- this allows the caller 673 * to include an additional null string at the end of the substring list. 674 */ 675 if (separator.length() == 0) { 676 if (version == Context.VERSION_1_2) { 677 if (i == length) { 678 matchlen[0] = 1; 679 return i; 680 } 681 return i + 1; 682 } 683 return (i == length) ? -1 : i + 1; 684 } 685 686 /* Punt to j.l.s.indexOf; return target length if separator is 687 * not found. 688 */ 689 if (ip[0] >= length) 690 return length; 691 692 i = target.indexOf(separator, ip[0]); 693 694 return (i != -1) ? i : length; 695 } 696 697 protected String input; /* input string to match (perl $_, GC root) */ 698 protected boolean multiline; /* whether input contains newlines (perl $*) */ 699 protected SubString[] parens; /* Vector of SubString; last set of parens 700 matched (perl $1, $2) */ 701 protected SubString lastMatch; /* last string matched (perl $&) */ 702 protected SubString lastParen; /* last paren matched (perl $+) */ 703 protected SubString leftContext; /* input to left of last match (perl $`) */ 704 protected SubString rightContext; /* input to right of last match (perl $') */ 705 } 706 707 708 final class GlobData 709 { 710 int mode; /* input: return index, match object, or void */ 711 int optarg; /* input: index of optional flags argument */ 712 boolean global; /* output: whether regexp was global */ 713 String str; /* output: 'this' parameter object as string */ 714 715 // match-specific data 716 717 Scriptable arrayobj; 718 719 // replace-specific data 720 721 Function lambda; /* replacement function object or null */ 722 String repstr; /* replacement string */ 723 int dollar = -1; /* -1 or index of first $ in repstr */ 724 StringBuilder charBuf; /* result characters, null initially */ 725 int leftIndex; /* leftContext index, always 0 for JS1.2 */ 726 } 727