1 /* Matcher.java -- Instance of a regular expression applied to a char sequence.
2    Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 
39 package java.util.regex;
40 
41 import gnu.java.lang.CPStringBuilder;
42 
43 import gnu.java.util.regex.CharIndexed;
44 import gnu.java.util.regex.RE;
45 import gnu.java.util.regex.REMatch;
46 
47 /**
48  * Instance of a regular expression applied to a char sequence.
49  *
50  * @since 1.4
51  */
52 public final class Matcher implements MatchResult
53 {
54   private Pattern pattern;
55   private CharSequence input;
56   // We use CharIndexed as an input object to the getMatch method in order
57   // that /\G/ (the end of the previous match) may work.  The information
58   // of the previous match is stored in the CharIndexed object.
59   private CharIndexed inputCharIndexed;
60   private int position;
61   private int appendPosition;
62   private REMatch match;
63 
64   /**
65    * The start of the region of the input on which to match.
66    */
67   private int regionStart;
68 
69   /**
70    * The end of the region of the input on which to match.
71    */
72   private int regionEnd;
73 
74   /**
75    * True if the match process should look beyond the
76    * region marked by regionStart to regionEnd when
77    * performing lookAhead, lookBehind and boundary
78    * matching.
79    */
80   private boolean transparentBounds;
81 
82   /**
83    * The flags that affect the anchoring bounds.
84    * If {@link #hasAnchoringBounds()} is {@code true},
85    * the match process will honour the
86    * anchoring bounds: ^, \A, \Z, \z and $.  If
87    * {@link #hasAnchoringBounds()} is {@code false},
88    * the anchors are ignored and appropriate flags,
89    * stored in this variable, are used to provide this
90    * behaviour.
91    */
92   private int anchoringBounds;
93 
Matcher(Pattern pattern, CharSequence input)94   Matcher(Pattern pattern, CharSequence input)
95   {
96     this.pattern = pattern;
97     this.input = input;
98     this.inputCharIndexed = RE.makeCharIndexed(input, 0);
99     regionStart = 0;
100     regionEnd = input.length();
101     transparentBounds = false;
102     anchoringBounds = 0;
103   }
104 
105   /**
106    * Changes the pattern used by the {@link Matcher} to
107    * the one specified.  Existing match information is lost,
108    * but the input and the matcher's position within it is
109    * retained.
110    *
111    * @param newPattern the new pattern to use.
112    * @return this matcher.
113    * @throws IllegalArgumentException if {@code newPattern} is
114    *                                  {@code null}.
115    * @since 1.5
116    */
usePattern(Pattern newPattern)117   public Matcher usePattern(Pattern newPattern)
118   {
119     if (newPattern == null)
120       throw new IllegalArgumentException("The new pattern was null.");
121     pattern = newPattern;
122     match = null;
123 
124     return this;
125   }
126 
127   /**
128    * @param sb The target string buffer
129    * @param replacement The replacement string
130    *
131    * @exception IllegalStateException If no match has yet been attempted,
132    * or if the previous match operation failed
133    * @exception IndexOutOfBoundsException If the replacement string refers
134    * to a capturing group that does not exist in the pattern
135    */
appendReplacement(StringBuffer sb, String replacement)136   public Matcher appendReplacement (StringBuffer sb, String replacement)
137     throws IllegalStateException
138   {
139     assertMatchOp();
140     sb.append(input.subSequence(appendPosition,
141                                 match.getStartIndex()).toString());
142     sb.append(RE.getReplacement(replacement, match,
143         RE.REG_REPLACE_USE_BACKSLASHESCAPE));
144     appendPosition = match.getEndIndex();
145     return this;
146   }
147 
148   /**
149    * @param sb The target string buffer
150    */
appendTail(StringBuffer sb)151   public StringBuffer appendTail (StringBuffer sb)
152   {
153     sb.append(input.subSequence(appendPosition, input.length()).toString());
154     return sb;
155   }
156 
157   /**
158    * @exception IllegalStateException If no match has yet been attempted,
159    * or if the previous match operation failed
160    */
end()161   public int end ()
162     throws IllegalStateException
163   {
164     assertMatchOp();
165     return match.getEndIndex();
166   }
167 
168   /**
169    * @param group The index of a capturing group in this matcher's pattern
170    *
171    * @exception IllegalStateException If no match has yet been attempted,
172    * or if the previous match operation failed
173    * @exception IndexOutOfBoundsException If the replacement string refers
174    * to a capturing group that does not exist in the pattern
175    */
end(int group)176   public int end (int group)
177     throws IllegalStateException
178   {
179     assertMatchOp();
180     return match.getEndIndex(group);
181   }
182 
find()183   public boolean find ()
184   {
185     boolean first = (match == null);
186     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
187       match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds);
188     else
189       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
190                                        position, anchoringBounds);
191     if (match != null)
192       {
193         int endIndex = match.getEndIndex();
194         // Is the match within input limits?
195         if (endIndex > input.length())
196           {
197             match = null;
198             return false;
199           }
200         // Are we stuck at the same position?
201         if (!first && endIndex == position)
202           {
203             match = null;
204             // Not at the end of the input yet?
205             if (position < input.length() - 1)
206               {
207                 position++;
208                 return find(position);
209               }
210             else
211               return false;
212           }
213         position = endIndex;
214         return true;
215       }
216     return false;
217   }
218 
219   /**
220    * @param start The index to start the new pattern matching
221    *
222    * @exception IndexOutOfBoundsException If the replacement string refers
223    * to a capturing group that does not exist in the pattern
224    */
find(int start)225   public boolean find (int start)
226   {
227     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
228       match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds);
229     else
230       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd),
231                                        start, anchoringBounds);
232     if (match != null)
233       {
234         position = match.getEndIndex();
235         return true;
236       }
237     return false;
238   }
239 
240   /**
241    * @exception IllegalStateException If no match has yet been attempted,
242    * or if the previous match operation failed
243    */
group()244   public String group ()
245   {
246     assertMatchOp();
247     return match.toString();
248   }
249 
250   /**
251    * @param group The index of a capturing group in this matcher's pattern
252    *
253    * @exception IllegalStateException If no match has yet been attempted,
254    * or if the previous match operation failed
255    * @exception IndexOutOfBoundsException If the replacement string refers
256    * to a capturing group that does not exist in the pattern
257    */
group(int group)258   public String group (int group)
259     throws IllegalStateException
260   {
261     assertMatchOp();
262     return match.toString(group);
263   }
264 
265   /**
266    * @param replacement The replacement string
267    */
replaceFirst(String replacement)268   public String replaceFirst (String replacement)
269   {
270     reset();
271     // Semantics might not quite match
272     return pattern.getRE().substitute(input, replacement, position,
273         RE.REG_REPLACE_USE_BACKSLASHESCAPE);
274   }
275 
276   /**
277    * @param replacement The replacement string
278    */
replaceAll(String replacement)279   public String replaceAll (String replacement)
280   {
281     reset();
282     return pattern.getRE().substituteAll(input, replacement, position,
283         RE.REG_REPLACE_USE_BACKSLASHESCAPE);
284   }
285 
groupCount()286   public int groupCount ()
287   {
288     return pattern.getRE().getNumSubs();
289   }
290 
lookingAt()291   public boolean lookingAt ()
292   {
293     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
294       match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
295                                        anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
296     else
297       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
298                                        anchoringBounds|RE.REG_FIX_STARTING_POSITION);
299     if (match != null)
300       {
301         if (match.getStartIndex() == 0)
302           {
303             position = match.getEndIndex();
304             return true;
305           }
306         match = null;
307       }
308     return false;
309   }
310 
311   /**
312    * Attempts to match the entire input sequence against the pattern.
313    *
314    * If the match succeeds then more information can be obtained via the
315    * start, end, and group methods.
316    *
317    * @see #start()
318    * @see #end()
319    * @see #group()
320    */
matches()321   public boolean matches ()
322   {
323     if (transparentBounds || (regionStart == 0 && regionEnd == input.length()))
324       match = pattern.getRE().getMatch(inputCharIndexed, regionStart,
325                                        anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX);
326     else
327       match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0,
328                                        anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION);
329     if (match != null)
330       {
331         if (match.getStartIndex() == 0)
332           {
333             position = match.getEndIndex();
334             if (position == input.length())
335                 return true;
336           }
337         match = null;
338       }
339     return false;
340   }
341 
342   /**
343    * Returns the Pattern that is interpreted by this Matcher
344    */
pattern()345   public Pattern pattern ()
346   {
347     return pattern;
348   }
349 
350   /**
351    * Resets the internal state of the matcher, including
352    * resetting the region to its default state of encompassing
353    * the whole input.  The state of {@link #hasTransparentBounds()}
354    * and {@link #hasAnchoringBounds()} are unaffected.
355    *
356    * @return a reference to this matcher.
357    * @see #regionStart()
358    * @see #regionEnd()
359    * @see #hasTransparentBounds()
360    * @see #hasAnchoringBounds()
361    */
reset()362   public Matcher reset ()
363   {
364     position = 0;
365     match = null;
366     regionStart = 0;
367     regionEnd = input.length();
368     appendPosition = 0;
369     return this;
370   }
371 
372   /**
373    * Resets the internal state of the matcher, including
374    * resetting the region to its default state of encompassing
375    * the whole input.  The state of {@link #hasTransparentBounds()}
376    * and {@link #hasAnchoringBounds()} are unaffected.
377    *
378    * @param input The new input character sequence.
379    * @return a reference to this matcher.
380    * @see #regionStart()
381    * @see #regionEnd()
382    * @see #hasTransparentBounds()
383    * @see #hasAnchoringBounds()
384    */
reset(CharSequence input)385   public Matcher reset (CharSequence input)
386   {
387     this.input = input;
388     this.inputCharIndexed = RE.makeCharIndexed(input, 0);
389     return reset();
390   }
391 
392   /**
393    * @return the index of a capturing group in this matcher's pattern
394    *
395    * @exception IllegalStateException If no match has yet been attempted,
396    * or if the previous match operation failed
397    */
start()398   public int start ()
399     throws IllegalStateException
400   {
401     assertMatchOp();
402     return match.getStartIndex();
403   }
404 
405   /**
406    * @param group The index of a capturing group in this matcher's pattern
407    *
408    * @exception IllegalStateException If no match has yet been attempted,
409    * or if the previous match operation failed
410    * @exception IndexOutOfBoundsException If the replacement string refers
411    * to a capturing group that does not exist in the pattern
412    */
start(int group)413   public int start (int group)
414     throws IllegalStateException
415   {
416     assertMatchOp();
417     return match.getStartIndex(group);
418   }
419 
420   /**
421    * @return True if and only if the matcher hit the end of input.
422    * @since 1.5
423    */
hitEnd()424   public boolean hitEnd()
425   {
426     return inputCharIndexed.hitEnd();
427   }
428 
429   /**
430    * @return A string expression of this matcher.
431    */
toString()432   public String toString()
433   {
434     CPStringBuilder sb = new CPStringBuilder();
435     sb.append(this.getClass().getName())
436       .append("[pattern=").append(pattern.pattern())
437       .append(" region=").append(regionStart).append(",").append(regionEnd)
438       .append(" anchoringBounds=").append(anchoringBounds == 0)
439       .append(" transparentBounds=").append(transparentBounds)
440       .append(" lastmatch=").append(match == null ? "" : match.toString())
441       .append("]");
442     return sb.toString();
443   }
444 
assertMatchOp()445   private void assertMatchOp()
446   {
447     if (match == null) throw new IllegalStateException();
448   }
449 
450   /**
451    * <p>
452    * Defines the region of the input on which to match.
453    * By default, the {@link Matcher} attempts to match
454    * the whole string (from 0 to the length of the input),
455    * but a region between {@code start} (inclusive) and
456    * {@code end} (exclusive) on which to match may instead
457    * be defined using this method.
458    * </p>
459    * <p>
460    * The behaviour of region matching is further affected
461    * by the use of transparent or opaque bounds (see
462    * {@link #useTransparentBounds(boolean)}) and whether or not
463    * anchors ({@code ^} and {@code $}) are in use
464    * (see {@link #useAnchoringBounds(boolean)}).  With transparent
465    * bounds, the matcher is aware of input outside the bounds
466    * set by this method, whereas, with opaque bounds (the default)
467    * only the input within the bounds is used.  The use of
468    * anchors are affected by this setting; with transparent
469    * bounds, anchors will match the beginning of the real input,
470    * while with opaque bounds they match the beginning of the
471    * region.  {@link #useAnchoringBounds(boolean)} can be used
472    * to turn on or off the matching of anchors.
473    * </p>
474    *
475    * @param start the start of the region (inclusive).
476    * @param end the end of the region (exclusive).
477    * @return a reference to this matcher.
478    * @throws IndexOutOfBoundsException if either {@code start} or
479    *                                   {@code end} are less than zero,
480    *                                   if either {@code start} or
481    *                                   {@code end} are greater than the
482    *                                   length of the input, or if
483    *                                   {@code start} is greater than
484    *                                   {@code end}.
485    * @see #regionStart()
486    * @see #regionEnd()
487    * @see #hasTransparentBounds()
488    * @see #useTransparentBounds(boolean)
489    * @see #hasAnchoringBounds()
490    * @see #useAnchoringBounds(boolean)
491    * @since 1.5
492    */
region(int start, int end)493   public Matcher region(int start, int end)
494   {
495     int length = input.length();
496     if (start < 0)
497       throw new IndexOutOfBoundsException("The start position was less than zero.");
498     if (start >= length)
499       throw new IndexOutOfBoundsException("The start position is after the end of the input.");
500     if (end < 0)
501       throw new IndexOutOfBoundsException("The end position was less than zero.");
502     if (end > length)
503       throw new IndexOutOfBoundsException("The end position is after the end of the input.");
504     if (start > end)
505       throw new IndexOutOfBoundsException("The start position is after the end position.");
506     reset();
507     regionStart = start;
508     regionEnd = end;
509     return this;
510   }
511 
512   /**
513    * The start of the region on which to perform matches (inclusive).
514    *
515    * @return the start index of the region.
516    * @see #region(int,int)
517    * #see #regionEnd()
518    * @since 1.5
519    */
regionStart()520   public int regionStart()
521   {
522     return regionStart;
523   }
524 
525   /**
526    * The end of the region on which to perform matches (exclusive).
527    *
528    * @return the end index of the region.
529    * @see #region(int,int)
530    * @see #regionStart()
531    * @since 1.5
532    */
regionEnd()533   public int regionEnd()
534   {
535     return regionEnd;
536   }
537 
538   /**
539    * Returns true if the bounds of the region marked by
540    * {@link #regionStart()} and {@link #regionEnd()} are
541    * transparent.  When these bounds are transparent, the
542    * matching process can look beyond them to perform
543    * lookahead, lookbehind and boundary matching operations.
544    * By default, the bounds are opaque.
545    *
546    * @return true if the bounds of the matching region are
547    *         transparent.
548    * @see #useTransparentBounds(boolean)
549    * @see #region(int,int)
550    * @see #regionStart()
551    * @see #regionEnd()
552    * @since 1.5
553    */
hasTransparentBounds()554   public boolean hasTransparentBounds()
555   {
556     return transparentBounds;
557   }
558 
559   /**
560    * Sets the transparency of the bounds of the region
561    * marked by {@link #regionStart()} and {@link #regionEnd()}.
562    * A value of {@code true} makes the bounds transparent,
563    * so the matcher can see beyond them to perform lookahead,
564    * lookbehind and boundary matching operations.  A value
565    * of {@code false} (the default) makes the bounds opaque,
566    * restricting the match to the input region denoted
567    * by {@link #regionStart()} and {@link #regionEnd()}.
568    *
569    * @param transparent true if the bounds should be transparent.
570    * @return a reference to this matcher.
571    * @see #hasTransparentBounds()
572    * @see #region(int,int)
573    * @see #regionStart()
574    * @see #regionEnd()
575    * @since 1.5
576    */
useTransparentBounds(boolean transparent)577   public Matcher useTransparentBounds(boolean transparent)
578   {
579     transparentBounds = transparent;
580     return this;
581   }
582 
583   /**
584    * Returns true if the matcher will honour the use of
585    * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z},
586    * {@code \z} and {@code $}.  By default, the anchors
587    * are used.  Note that the effect of the anchors is
588    * also affected by {@link #hasTransparentBounds()}.
589    *
590    * @return true if the matcher will attempt to match
591    *         the anchoring bounds.
592    * @see #useAnchoringBounds(boolean)
593    * @see #hasTransparentBounds()
594    * @since 1.5
595    */
hasAnchoringBounds()596   public boolean hasAnchoringBounds()
597   {
598     return anchoringBounds == 0;
599   }
600 
601   /**
602    * Enables or disables the use of the anchoring bounds:
603    * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and
604    * {@code $}. By default, their use is enabled.  When
605    * disabled, the matcher will not attempt to match
606    * the anchors.
607    *
608    * @param useAnchors true if anchoring bounds should be used.
609    * @return a reference to this matcher.
610    * @since 1.5
611    * @see #hasAnchoringBounds()
612    */
useAnchoringBounds(boolean useAnchors)613   public Matcher useAnchoringBounds(boolean useAnchors)
614   {
615     if (useAnchors)
616       anchoringBounds = 0;
617     else
618       anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL;
619     return this;
620   }
621 
622   /**
623    * Returns a read-only snapshot of the current state of
624    * the {@link Matcher} as a {@link MatchResult}.  Any
625    * subsequent changes to this instance are not reflected
626    * in the returned {@link MatchResult}.
627    *
628    * @return a {@link MatchResult} instance representing the
629    *         current state of the {@link Matcher}.
630    */
toMatchResult()631   public MatchResult toMatchResult()
632   {
633     Matcher snapshot = new Matcher(pattern, input);
634     if (match != null)
635       snapshot.match = (REMatch) match.clone();
636     return snapshot;
637   }
638 
639   /**
640    * Returns a literalized string of s where characters {@code $} and {@code
641    * \\} are escaped.
642    *
643    * @param s the string to literalize.
644    * @return the literalized string.
645    * @since 1.5
646    */
quoteReplacement(String s)647   public static String quoteReplacement(String s)
648   {
649     if (s == null)
650       throw new NullPointerException();
651     CPStringBuilder sb = new CPStringBuilder();
652     for (int i = 0; i < s.length(); i++)
653     {
654       char ch = s.charAt(i);
655       if (ch == '$' || ch == '\\')
656         sb.append('\\');
657       sb.append(ch);
658     }
659     return sb.toString();
660   }
661 
662 }
663