1 /*
2  * $Id: PatternMatcherInput.java,v 1.7 2003/11/07 20:16:25 dfs Exp $
3  *
4  * ====================================================================
5  * The Apache Software License, Version 1.1
6  *
7  * Copyright (c) 2000 The Apache Software Foundation.  All rights
8  * reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  *
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in
19  *    the documentation and/or other materials provided with the
20  *    distribution.
21  *
22  * 3. The end-user documentation included with the redistribution,
23  *    if any, must include the following acknowledgment:
24  *       "This product includes software developed by the
25  *        Apache Software Foundation (http://www.apache.org/)."
26  *    Alternately, this acknowledgment may appear in the software itself,
27  *    if and wherever such third-party acknowledgments normally appear.
28  *
29  * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
30  *    must not be used to endorse or promote products derived from this
31  *    software without prior written permission. For written
32  *    permission, please contact apache@apache.org.
33  *
34  * 5. Products derived from this software may not be called "Apache"
35  *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
36  *    name, without prior written permission of the Apache Software Foundation.
37  *
38  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49  * SUCH DAMAGE.
50  * ====================================================================
51  *
52  * This software consists of voluntary contributions made by many
53  * individuals on behalf of the Apache Software Foundation.  For more
54  * information on the Apache Software Foundation, please see
55  * <http://www.apache.org/>.
56  */
57 
58 
59 package org.apache.oro.text.regex;
60 
61 
62 /**
63  * The PatternMatcherInput class is used to preserve state across
64  * calls to the <code>contains()</code> methods of PatternMatcher instances.
65  * It is also used to specify that only a subregion of a string
66  * should be used as input when looking for a pattern match.  All that
67  * is meant by preserving state is that the end offset of the last match
68  * is remembered, so that the next match is performed from that point
69  * where the last match left off.  This offset can be accessed from
70  * the {@link #getCurrentOffset()} method and can be set with the
71  * {@link #setCurrentOffset(int)} method.
72  * <p>
73  * You would use a PatternMatcherInput object when you want to search for
74  * more than just the first occurrence of a pattern in a string, or when
75  * you only want to search a subregion of the string for a match.  An
76  * example of its most common use is:
77  * <blockquote><pre>
78  * PatternMatcher matcher;
79  * PatternCompiler compiler;
80  * Pattern pattern;
81  * PatternMatcherInput input;
82  * MatchResult result;
83  *
84  * compiler = new Perl5Compiler();
85  * matcher  = new Perl5Matcher();
86  *
87  * try {
88  *   pattern = compiler.compile(somePatternString);
89  * } catch(MalformedPatternException e) {
90  *   System.out.println("Bad pattern.");
91  *   System.out.println(e.getMessage());
92  *   return;
93  * }
94  *
95  * input   = new PatternMatcherInput(someStringInput);
96  *
97  * while(matcher.contains(input, pattern)) {
98  *   result = matcher.getMatch();
99  *   // Perform whatever processing on the result you want.
100  * }
101  * // Suppose we want to start searching from the beginning again with
102  * // a different pattern.
103  * // Just set the current offset to the begin offset.
104  * input.setCurrentOffset(input.getBeginOffset());
105  *
106  * // Second search omitted
107  *
108  * // Suppose we're done with this input, but want to search another string.
109  * // There's no need to create another PatternMatcherInput instance.
110  * // We can just use the setInput() method.
111  * input.setInput(aNewInputString);
112  *
113  * </pre></blockquote>
114  *
115  * @version @version@
116  * @since 1.0
117  * @see PatternMatcher
118  */
public final class PatternMatcherInput {
  /** The String passed to setInput(), or null when the input is a char[]. */
  String _originalStringInput;

  /** The char[] passed to setInput(), or null when the input is a String. */
  char[] _originalCharInput;

  /**
   * The array that matching is performed on.  For char[] input this is the
   * caller's own array; for String input it is a copy of the characters.
   */
  char[] _originalBuffer;

  /**
   * Reset to null whenever the input changes and never written elsewhere
   * in this class.  NOTE(review): presumably a lower-cased copy of the
   * buffer cached by matcher implementations in this package -- confirm
   * against the matcher code.
   */
  char[] _toLowerBuffer;

  /** Bounds of the input region (end is exclusive) and the search cursor. */
  int _beginOffset, _endOffset, _currentOffset;

  /** Offsets of the most recent match; -1 means no match is recorded. */
  int _matchBeginOffset = -1, _matchEndOffset = -1;

  /**
   * Creates a PatternMatcherInput object, associating a region of a String
   * as input to be used for pattern matching by PatternMatcher objects.
   * The characters of the String are copied into an internal buffer, and
   * the offsets refer to positions in that buffer.
   * The current offset of the PatternMatcherInput is set to the begin
   * offset of the region.
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @param begin  The offset into the String to use as the beginning of
   *               the input.
   * @param length The length of the region starting from the begin offset
   *               to use as the input for pattern matching purposes.
   * @exception NullPointerException If input is null.
   */
  public PatternMatcherInput(String input, int begin, int length) {
    setInput(input, begin, length);
  }

  /**
   * Like calling
   * <blockquote><pre>
   * PatternMatcherInput(input, 0, input.length());
   * </pre></blockquote>
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @exception NullPointerException If input is null.
   */
  public PatternMatcherInput(String input) {
    this(input, 0, input.length());
  }


  /**
   * Creates a PatternMatcherInput object, associating a region of a string
   * (represented as a char[]) as input
   * to be used for pattern matching by PatternMatcher objects.
   * A copy of the array is not made, therefore you should not modify
   * the array unless you know what you are doing.
   * The current offset of the PatternMatcherInput is set to the begin
   * offset of the region.
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @param begin  The offset into the char[] to use as the beginning of
   *               the input.
   * @param length The length of the region starting from the begin offset
   *               to use as the input for pattern matching purposes.
   */
  public PatternMatcherInput(char[] input, int begin, int length) {
    setInput(input, begin, length);
  }

  /**
   * Like calling:
   * <blockquote><pre>
   * PatternMatcherInput(input, 0, input.length);
   * </pre></blockquote>
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @exception NullPointerException If input is null.
   */
  public PatternMatcherInput(char[] input) {
    this(input, 0, input.length);
  }


  /**
   * @return The length of the region to be considered input for pattern
   *         matching purposes.  This is the end offset minus
   *         the begin offset.
   */
  public int length() {
    return (_endOffset - _beginOffset);
  }


  /**
   * Associates a region of a String as input
   * to be used for pattern matching by PatternMatcher objects.
   * The characters of the String are copied into an internal buffer.
   * The current offset of the PatternMatcherInput is set to the begin
   * offset of the region and any recorded match offsets are reset to -1.
   * <p>
   * If input is null the call fails before any state is modified, so the
   * previously associated input remains fully usable.
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @param begin  The offset into the String to use as the beginning of
   *               the input.
   * @param length The length of the region starting from the begin offset
   *               to use as the input for pattern matching purposes.
   * @exception NullPointerException If input is null.
   */
  public void setInput(String input, int begin, int length) {
    // Convert first: this is the only statement that can throw, and doing
    // it before touching any field keeps the object consistent on failure.
    char[] buffer = input.toCharArray();

    _originalStringInput = input;
    _originalCharInput   = null;
    _toLowerBuffer       = null;
    _originalBuffer      = buffer;
    __setRegion(begin, length);
  }

  /**
   * This method is identical to calling:
   * <blockquote><pre>
   * setInput(input, 0, input.length());
   * </pre></blockquote>
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @exception NullPointerException If input is null.
   */
  public void setInput(String input) {
    setInput(input, 0, input.length());
  }


  /**
   * Associates a region of a string (represented as a char[]) as input
   * to be used for pattern matching by PatternMatcher objects.
   * A copy of the array is not made, therefore you should not modify
   * the array unless you know what you are doing.
   * The current offset of the PatternMatcherInput is set to the begin
   * offset of the region and any recorded match offsets are reset to -1.
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @param begin  The offset into the char[] to use as the beginning of
   *               the input.
   * @param length The length of the region starting from the begin offset
   *               to use as the input for pattern matching purposes.
   */
  public void setInput(char[] input, int begin, int length) {
    _originalStringInput = null;
    _toLowerBuffer       = null;
    _originalCharInput   = input;
    _originalBuffer      = input;
    __setRegion(begin, length);
  }


  /**
   * This method is identical to calling:
   * <blockquote><pre>
   * setInput(input, 0, input.length);
   * </pre></blockquote>
   * <p>
   * @param input  The input to associate with the PatternMatcherInput.
   * @exception NullPointerException If input is null.
   */
  public void setInput(char[] input) {
    setInput(input, 0, input.length);
  }


  /**
   * Returns the character at a particular offset relative to the begin
   * offset of the input.  The offset is not checked against the end
   * offset of the region, only against the underlying buffer.
   * <p>
   * @param offset  The offset at which to fetch a character (relative to
   *                the begin offset).
   * @return The character at a particular offset.
   * @exception ArrayIndexOutOfBoundsException If begin offset plus offset
   *            does not fall within the underlying buffer.
   */
  public char charAt(int offset) {
    return _originalBuffer[_beginOffset + offset];
  }

  /**
   * Returns a new string that is a substring of the PatternMatcherInput
   * instance. The substring begins at the specified beginOffset relative
   * to the begin offset and extends to the specified endOffset - 1
   * relative to the begin offset of the PatternMatcherInput instance.
   * <p>
   * @param beginOffset  The offset relative to the begin offset of the
   *        PatternMatcherInput at which to start the substring (inclusive).
   * @param endOffset  The offset relative to the begin offset of the
   *        PatternMatcherInput at which to end the substring (exclusive).
   * @return The specified substring.
   * @exception IndexOutOfBoundsException If the requested range does not
   *        lie within the underlying buffer or endOffset is less than
   *        beginOffset.
   */
  public String substring(int beginOffset, int endOffset) {
    return new String(_originalBuffer, _beginOffset + beginOffset,
                      endOffset - beginOffset);
  }

  /**
   * Returns a new string that is a substring of the PatternMatcherInput
   * instance. The substring begins at the specified beginOffset relative
   * to the begin offset and extends to the end offset of the
   * PatternMatcherInput.
   * <p>
   * @param beginOffset  The offset relative to the begin offset of the
   *        PatternMatcherInput at which to start the substring.
   * @return The specified substring.
   * @exception IndexOutOfBoundsException If the requested range does not
   *            lie within the underlying buffer or starts past the end
   *            offset.
   */
  public String substring(int beginOffset) {
    int absoluteBegin = _beginOffset + beginOffset;
    return new String(_originalBuffer, absoluteBegin,
                      _endOffset - absoluteBegin);
  }


  /**
   * Retrieves the original input used to initialize the PatternMatcherInput
   * instance.  If a String was used, the String instance will be returned.
   * If a char[] was used, that same char[] instance will be returned.  This
   * violates data encapsulation and hiding principles, but it is a great
   * convenience for the programmer.
   * <p>
   * @return The String or char[] input used to initialize the
   *         PatternMatcherInput instance.
   */
  public Object getInput() {
    if(_originalStringInput != null) {
      return _originalStringInput;
    }
    return _originalCharInput;
  }

  /**
   * Retrieves the char[] buffer to be used as input by PatternMatcher
   * implementations to look for matches.  This array should be treated
   * as read only by the programmer.
   * <p>
   * @return The char[] buffer to be used as input by PatternMatcher
   *         implementations.
   */
  public char[] getBuffer() { return _originalBuffer; }

  /**
   * Returns whether or not the end of the input has been reached.
   * <p>
   * @return True if the current offset is greater than or equal to the
   *         end offset.
   */
  public boolean endOfInput() { return (_currentOffset >= _endOffset); }


  /**
   * @return The offset of the input that should be considered the start
   *         of the region to be considered as input by PatternMatcher
   *         methods.
   */
  public int getBeginOffset()   { return _beginOffset; }

  /**
   * @return The offset of the input that should be considered the end
   *         of the region to be considered as input by PatternMatcher
   *         methods.  This offset is actually 1 plus the last offset
   *         that is part of the input region.
   */
  public int getEndOffset()     { return _endOffset; }

  /**
   * @return The offset of the input that should be considered the current
   *         offset where PatternMatcher methods should start looking for
   *         matches.
   */
  public int getCurrentOffset() { return _currentOffset; }

  /**
   * Sets the offset of the input that should be considered the start
   * of the region to be considered as input by PatternMatcher
   * methods.  In other words, everything before this offset is ignored
   * by a PatternMatcher.  The value is stored as given; it is not
   * validated against the buffer.
   * <p>
   * @param offset  The offset to use as the beginning of the input.
   */
  public void setBeginOffset(int offset)   { _beginOffset = offset; }

  /**
   * Sets the offset of the input that should be considered the end
   * of the region to be considered as input by PatternMatcher
   * methods.  This offset is actually 1 plus the last offset
   * that is part of the input region.  The value is stored as given; it
   * is not validated against the buffer.
   * <p>
   * @param offset  The offset to use as the end of the input.
   */
  public void setEndOffset(int offset)     { _endOffset = offset; }

  /**
   * Sets the offset of the input that should be considered the current
   * offset where PatternMatcher methods should start looking for
   * matches.  Also resets all match offset information to -1.  By calling
   * this method, you invalidate all previous match information.  Therefore
   * a PatternMatcher implementation must call this method before setting
   * match offset information.
   * <p>
   * @param offset  The offset to use as the current offset.
   */
  public void setCurrentOffset(int offset) {
    _currentOffset = offset;
    setMatchOffsets(-1, -1);
  }

  /**
   * Returns the string representation of the input, where the input is
   * considered to start from the begin offset and end at the end offset.
   * <p>
   * @return The string representation of the input.
   */
  public String toString() {
    return new String(_originalBuffer, _beginOffset, length());
  }


  /**
   * A convenience method returning the part of the input occurring before
   * the last match found by a call to a Perl5Matcher
   * {@link Perl5Matcher#contains contains} method.
   * <p>
   * @return The input preceding a match.
   * @exception StringIndexOutOfBoundsException If no match is currently
   *            recorded (the match begin offset is negative).
   */
  public String preMatch() {
    __requireMatchOffset(_matchBeginOffset, "preMatch()");
    return new String(_originalBuffer, _beginOffset,
                      _matchBeginOffset - _beginOffset);
  }


  /**
   * A convenience method returning the part of the input occurring after
   * the last match found by a call to a Perl5Matcher
   * {@link Perl5Matcher#contains contains} method.
   * <p>
   * @return The input succeeding a contains() match.
   * @exception StringIndexOutOfBoundsException If no match is currently
   *            recorded (the match end offset is negative).
   */
  public String postMatch() {
    __requireMatchOffset(_matchEndOffset, "postMatch()");
    return new String(_originalBuffer, _matchEndOffset,
                      _endOffset - _matchEndOffset);
  }


  /**
   * A convenience method returning the part of the input corresponding
   * to the last match found by a call to a Perl5Matcher
   * {@link Perl5Matcher#contains contains} method.
   * The method is not called getMatch() so as not to confuse it
   * with Perl5Matcher's getMatch() which returns a MatchResult instance
   * and also for consistency with preMatch() and postMatch().
   * <p>
   * @return The input consisting of the match found by contains().
   * @exception StringIndexOutOfBoundsException If no match is currently
   *            recorded (the match begin offset is negative).
   */
  public String match() {
    __requireMatchOffset(_matchBeginOffset, "match()");
    return new String(_originalBuffer, _matchBeginOffset,
                      _matchEndOffset - _matchBeginOffset);
  }


  /**
   * This method is intended for use by PatternMatcher implementations.
   * It is necessary to record the location of the previous match so that
   * consecutive contains() matches involving null string matches are
   * properly handled.  If you are not implementing a PatternMatcher, forget
   * this method exists.  If you use it outside of its intended context, you
   * will only disrupt the stored state.
   * <p>
   * As a note, the preMatch(), postMatch(), and match() methods are provided
   * as conveniences because PatternMatcherInput must store match offset
   * information to completely preserve state for consecutive PatternMatcher
   * contains() matches.
   * <p>
   * @param matchBeginOffset  The begin offset of a match found by contains().
   * @param matchEndOffset    The end offset of a match found by contains().
   */
  public void setMatchOffsets(int matchBeginOffset, int matchEndOffset) {
    _matchBeginOffset = matchBeginOffset;
    _matchEndOffset   = matchEndOffset;
  }

  /**
   * Returns the offset marking the beginning of the match found by
   * contains().
   * <p>
   * @return The begin offset of a contains() match, or -1 if the match
   *         information has been reset.
   */
  public int getMatchBeginOffset()    { return _matchBeginOffset; }

  /**
   * Returns the offset marking the end of the match found by contains().
   * <p>
   * @return The end offset of a contains() match, or -1 if the match
   *         information has been reset.
   */
  public int getMatchEndOffset()      { return _matchEndOffset; }


  /**
   * Points the current, begin and end offsets at the region
   * [begin, begin + length) of the buffer.  Going through
   * setCurrentOffset() also resets the match offsets to -1.
   */
  private void __setRegion(int begin, int length) {
    setCurrentOffset(begin);
    setBeginOffset(begin);
    setEndOffset(_beginOffset + length);
  }

  /**
   * Fails with a descriptive message when a match offset is negative,
   * i.e. when no match is recorded.  StringIndexOutOfBoundsException is
   * used deliberately: it is the exception the String constructor raised
   * for these offsets before this check existed, so callers catching it
   * keep working.
   */
  private static void __requireMatchOffset(int offset, String caller) {
    if(offset < 0) {
      throw new StringIndexOutOfBoundsException(
        caller + ": no match has been recorded (match offset is "
        + offset + ")");
    }
  }
}
503