1 /*
2  * $Id: Util.java,v 1.15 2003/11/07 20:16:25 dfs Exp $
3  *
4  * ====================================================================
5  * The Apache Software License, Version 1.1
6  *
7  * Copyright (c) 2000-2002 The Apache Software Foundation.  All rights
8  * reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  *
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in
19  *    the documentation and/or other materials provided with the
20  *    distribution.
21  *
22  * 3. The end-user documentation included with the redistribution,
23  *    if any, must include the following acknowledgment:
24  *       "This product includes software developed by the
25  *        Apache Software Foundation (http://www.apache.org/)."
26  *    Alternately, this acknowledgment may appear in the software itself,
27  *    if and wherever such third-party acknowledgments normally appear.
28  *
29  * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
30  *    must not be used to endorse or promote products derived from this
31  *    software without prior written permission. For written
32  *    permission, please contact apache@apache.org.
33  *
34  * 5. Products derived from this software may not be called "Apache"
35  *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
36  *    name, without prior written permission of the Apache Software Foundation.
37  *
38  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49  * SUCH DAMAGE.
50  * ====================================================================
51  *
52  * This software consists of voluntary contributions made by many
53  * individuals on behalf of the Apache Software Foundation.  For more
54  * information on the Apache Software Foundation, please see
55  * <http://www.apache.org/>.
56  */
57 
58 
59 package org.apache.oro.text.regex;
60 
61 import java.util.*;
62 
63 /**
64  * The Util class is a holder for useful static utility methods that can
65  * be generically applied to Pattern and PatternMatcher instances.
66  * This class cannot and is not meant to be instantiated.
67  * The Util class currently contains versions of the split() and substitute()
68  * methods inspired by Perl's split function and <b>s</b> operation
69  * respectively, although they are implemented in such a way as not to
70  * rely on the Perl5 implementations of the OROMatcher packages regular
71  * expression interfaces.  They may operate on any interface implementations
72  * conforming to the OROMatcher API specification for the PatternMatcher,
73  * Pattern, and MatchResult interfaces. Future versions of the class may
74  * include additional utility methods.
75  * <p>
76  * A grep method is not included for two reasons:
77  * <ol>
78  *     <li> The details of reading a line at a time from an input stream
79  *          differ in JDK 1.0.2 and JDK 1.1, making it difficult to
80  *          retain compatibility across both Java releases.
 *     <li> Grep style processing is trivial for the programmer to implement
 *          in a while loop.  Rarely does anyone want to retrieve all
 *          occurrences of a pattern and then process them.  More often a
 *          programmer will retrieve pattern matches and process them as they
 *          are retrieved, which is more efficient than storing them all in a
 *          Vector and then accessing them.
87  * </ol>
88  *
89  * @version @version@
90  * @since 1.0
91  * @see Pattern
92  * @see PatternMatcher
93  */
94 public final class Util {
95   /**
96    * A constant passed to the {@link #substitute substitute()}
97    * methods indicating that all occurrences of a pattern should be
98    * substituted.
99    */
100   public static final int SUBSTITUTE_ALL = -1;
101 
102   /**
103    * A constant passed to the {@link #split split()} methods
104    * indicating that all occurrences of a pattern should be used to
105    * split a string.
106    */
107   public static final int SPLIT_ALL = 0;
108 
109   /**
110    * The default destructor for the Util class.  It is made private
111    * to prevent the instantiation of the class.
112    */
Util()113   private Util() { }
114 
115 
116   /**
117    * Splits up a <code>String</code> instance and stores results as a
118    * <code>List</code> of substrings numbering no more than a specified
119    * limit.  The string is split with a regular expression as the delimiter.
120    * The <b>limit</b> parameter essentially says to split the
121    * string only on at most the first <b>limit - 1</b> number of pattern
122    * occurences.
123    * <p>
124    * This method is inspired by the Perl split() function and behaves
125    * identically to it when used in conjunction with the Perl5Matcher and
126    * Perl5Pattern classes except for the following difference:
127    * <ul><p>
128    * In Perl, if the split expression contains parentheses, the split()
129    * method creates additional list elements from each of the matching
130    * subgroups in the pattern.  In other words:
131    * <ul><p>
132    * <code>split(list, "/([,-])/", "8-12,15,18", Util.SPLIT_ALL)</code></ul>
133    * <p> produces the list containing:
134    * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
135    * <p> The OROMatcher split method does not follow this behavior.  The
136    * following list would be produced by OROMatcher:
137    * <ul><p><code> { "8", "12",  "15", "18" } </code> </ul>
138    * <p> To obtain the Perl behavior, use
139    * {@link org.apache.oro.text.perl.Perl5Util#split}.
140    * </ul>
141    * <p>
142    * @param results A Collection to which the split results are appended.
143    *         After the method returns, it contains the substrings of the input
144    *         that occur between the regular expression delimiter occurences.
145    *         The input will not be split into any more substrings than the
146    *         specified <code>limit</code>.  A way of thinking of this is that
147    *         only the first <code>limit - 1</code> matches of the delimiting
148    *         regular expression will be used to split the input.
149    * @param matcher The regular expression matcher to execute the split.
150    * @param pattern The regular expression to use as a split delimiter.
151    * @param input   The <code>String</code> to split.
152    * @param limit  The limit on the number of resulting split elements.
153    *               Values <= 0 produce the same behavior as using the
154    *               <b>SPLIT_ALL</b> constant which causes the limit to be
155    *               ignored and splits to be performed on all occurrences of
156    *               the pattern.  You should use the <b>SPLIT_ALL</b> constant
157    *               to achieve this behavior instead of relying on the default
158    *               behavior associated with non-positive limit values.
159    * @since 2.0
160    */
split(Collection results, PatternMatcher matcher, Pattern pattern, String input, int limit)161   public static void split(Collection results, PatternMatcher matcher,
162 			   Pattern pattern, String input, int limit)
163   {
164     int beginOffset;
165     MatchResult currentResult;
166     PatternMatcherInput pinput;
167 
168     pinput = new PatternMatcherInput(input);
169     beginOffset = 0;
170 
171     while(--limit != 0 && matcher.contains(pinput, pattern)) {
172       currentResult = matcher.getMatch();
173       results.add(input.substring(beginOffset,
174 				  currentResult.beginOffset(0)));
175       beginOffset = currentResult.endOffset(0);
176     }
177 
178     results.add(input.substring(beginOffset, input.length()));
179   }
180 
181 
182   /**
183    * Splits up a <code>String</code> instance and stores results as a
184    * <code>Collection</code> of all its substrings using a regular expression
185    * as the delimiter.
186    * This method is inspired by the Perl split() function and behaves
187    * identically to it when used in conjunction with the Perl5Matcher and
188    * Perl5Pattern classes except for the following difference:
189    * <p>
190    * <ul>
191    * In Perl, if the split expression contains parentheses, the split()
192    * method creates additional list elements from each of the matching
193    * subgroups in the pattern.  In other words:
194    * <ul><p><code>split(list, "/([,-])/", "8-12,15,18")</code></ul>
195    * <p> produces the list containing:
196    * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
197    * <p> The OROMatcher split method does not follow this behavior.  The
198    * following list would be produced by OROMatcher:
199    * <ul><p><code> { "8", "12",  "15", "18" } </code> </ul>
200    * <p> To obtain the Perl behavior, use
201    * {@link org.apache.oro.text.perl.Perl5Util#split}.
202    * </ul>
203    * <p>
204    * This method is identical to calling:
205    * <blockquote><pre>
206    * split(matcher, pattern, input, Util.SPLIT_ALL);
207    * </pre></blockquote>
208    * <p>
209    * @param results A <code>Collection</code> to which all the substrings of
210    *         the input that occur between the regular expression delimiter
211    *         occurences are appended.
212    * @param matcher The regular expression matcher to execute the split.
213    * @param pattern The regular expression to use as a split delimiter.
214    * @param input   The <code>String</code> to split.
215    * @since 2.0
216    */
split(Collection results, PatternMatcher matcher, Pattern pattern, String input)217   public static void split(Collection results,  PatternMatcher matcher,
218 			   Pattern pattern, String input)
219   {
220     split(results, matcher, pattern, input, SPLIT_ALL);
221   }
222 
223   /**
224    * Splits up a <code>String</code> instance into strings contained in a
225    * <code>Vector</code> of size not greater than a specified limit.  The
226    * string is split with a regular expression as the delimiter.
227    * The <b>limit</b> parameter essentially says to split the
228    * string only on at most the first <b>limit - 1</b> number of pattern
229    * occurences.
230    * <p>
231    * This method is inspired by the Perl split() function and behaves
232    * identically to it when used in conjunction with the Perl5Matcher and
233    * Perl5Pattern classes except for the following difference:
234    * <ul><p>
235    * In Perl, if the split expression contains parentheses, the split()
236    * method creates additional list elements from each of the matching
237    * subgroups in the pattern.  In other words:
238    * <ul><p><code>split("/([,-])/", "8-12,15,18")</code></ul>
239    * <p> produces the Vector containing:
240    * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
241    * <p> The OROMatcher split method does not follow this behavior.  The
242    * following Vector would be produced by OROMatcher:
243    * <ul><p><code> { "8", "12",  "15", "18" } </code> </ul>
244    * <p> To obtain the Perl behavior, use
245    * {@link org.apache.oro.text.perl.Perl5Util#split}.
246    * </ul>
247    * <p>
248    * @deprecated Use
249    *  {@link #split(Collection, PatternMatcher, Pattern, String, int)} instead.
250    * @param matcher The regular expression matcher to execute the split.
251    * @param pattern The regular expression to use as a split delimiter.
252    * @param input  The <code>String</code> to split.
253    * @param limit  The limit on the size of the returned <code>Vector</code>.
254    *               Values <= 0 produce the same behavior as using the
255    *               <b>SPLIT_ALL</b> constant which causes the limit to be
256    *               ignored and splits to be performed on all occurrences of
257    *               the pattern.  You should use the <b>SPLIT_ALL</b> constant
258    *               to achieve this behavior instead of relying on the default
259    *               behavior associated with non-positive limit values.
260    * @return A <code>Vector</code> containing the substrings of the input
261    *         that occur between the regular expression delimiter occurences.
262    *         The input will not be split into any more substrings than the
263    *         specified <code>limit</code>.  A way of thinking of this is that
264    *         only the first <code>limit - 1</code> matches of the delimiting
265    *         regular expression will be used to split the input.
266    * @since 1.0
267    */
split(PatternMatcher matcher, Pattern pattern, String input, int limit)268   public static Vector split(PatternMatcher matcher, Pattern pattern,
269 			     String input, int limit)
270   {
271     Vector results = new Vector(20);
272 
273     split(results, matcher, pattern, input, limit);
274 
275     return results;
276   }
277 
278 
279   /**
280    * Splits up a <code>String</code> instance into a <code>Vector</code>
281    * of all its substrings using a regular expression as the delimiter.
282    * This method is inspired by the Perl split() function and behaves
283    * identically to it when used in conjunction with the Perl5Matcher and
284    * Perl5Pattern classes except for the following difference:
285    * <p>
286    * <ul>
287    * In Perl, if the split expression contains parentheses, the split()
288    * method creates additional list elements from each of the matching
289    * subgroups in the pattern.  In other words:
290    * <ul><p><code>split("/([,-])/", "8-12,15,18")</code></ul>
291    * <p> produces the Vector containing:
292    * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
293    * <p> The OROMatcher split method does not follow this behavior.  The
294    * following Vector would be produced by OROMatcher:
295    * <ul><p><code> { "8", "12",  "15", "18" } </code> </ul>
296    * <p> To obtain the Perl behavior, use
297    * {@link org.apache.oro.text.perl.Perl5Util#split}.
298    * </ul>
299    * <p>
300    * This method is identical to calling:
301    * <blockquote><pre>
302    * split(matcher, pattern, input, Util.SPLIT_ALL);
303    * </pre></blockquote>
304    * <p>
305    * @deprecated Use
306    * {@link #split(Collection, PatternMatcher, Pattern, String)} instead.
307    * @param matcher The regular expression matcher to execute the split.
308    * @param pattern The regular expression to use as a split delimiter.
309    * @param input   The <code>String</code> to split.
310    * @return A <code>Vector</code> containing all the substrings of the input
311    *         that occur between the regular expression delimiter occurences.
312    * @since 1.0
313    */
split( PatternMatcher matcher, Pattern pattern, String input)314   public static Vector split( PatternMatcher matcher, Pattern pattern,
315 			      String input)
316   {
317     return split(matcher, pattern, input, SPLIT_ALL);
318   }
319 
320 
321   /**
322    * Searches a string for a pattern and replaces the first occurrences
323    * of the pattern with a Substitution up to the number of
324    * substitutions specified by the <b>numSubs</b> parameter.  A
325    * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences
326    * of the pattern to be replaced.
327    * <p>
328    * @param matcher The regular expression matcher to execute the pattern
329    *                search.
330    * @param pattern The regular expression to search for and substitute
331    *                occurrences of.
332    * @param sub     The Substitution used to substitute pattern occurences.
333    * @param input   The <code>String</code> on which to perform substitutions.
334    * @param numSubs The number of substitutions to perform.  Only the
335    *                first <b> numSubs </b> patterns encountered are
336    *                substituted.  If you want to substitute all occurences
337    *                set this parameter to <b> SUBSTITUTE_ALL </b>.
338    * @return A String comprising the input string with the substitutions,
339    *         if any, made.  If no substitutions are made, the returned String
340    *         is the original input String.
341    * @since 1.0
342    */
substitute(PatternMatcher matcher, Pattern pattern, Substitution sub, String input, int numSubs)343   public static String substitute(PatternMatcher matcher, Pattern pattern,
344 				  Substitution sub, String input, int numSubs)
345   {
346     StringBuffer buffer = new StringBuffer(input.length());
347     PatternMatcherInput pinput = new PatternMatcherInput(input);
348 
349     // Users have indicated that they expect the result to be the
350     // original input string, rather than a copy, if no substitutions
351     // are performed,
352     if(substitute(buffer, matcher, pattern, sub, pinput, numSubs) != 0)
353       return buffer.toString();
354     return input;
355   }
356 
357   /**
358    * Searches a string for a pattern and substitutes only the first
359    * occurence of the pattern.
360    * <p>
361    * This method is identical to calling:
362    * <blockquote><pre>
363    * substitute(matcher, pattern, sub, input, 1);
364    * </pre></blockquote>
365    * <p>
366    * @param matcher The regular expression matcher to execute the pattern
367    *                search.
368    * @param pattern The regular expression to search for and substitute
369    *                occurrences of.
370    * @param sub     The Substitution used to substitute pattern occurences.
371    * @param input   The <code>String</code> on which to perform substitutions.
372    * @return A String comprising the input string with the substitutions,
373    *         if any, made.  If no substitutions are made, the returned String
374    *         is the original input String.
375    * @since 1.0
376    */
substitute(PatternMatcher matcher, Pattern pattern, Substitution sub, String input)377   public static String substitute(PatternMatcher matcher, Pattern pattern,
378 				  Substitution sub, String input)
379   {
380     return substitute(matcher, pattern, sub, input, 1);
381   }
382 
383   /**
384    * Searches a string for a pattern and replaces the first occurrences
385    * of the pattern with a Substitution up to the number of
386    * substitutions specified by the <b>numSubs</b> parameter.  A
387    * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences
388    * of the pattern to be replaced.  The number of substitutions made
389    * is returned.
390    * <p>
391    * @param result  The StringBuffer in which to store the result of the
392    *                substitutions.  The buffer is only appended to.
393    * @param matcher The regular expression matcher to execute the pattern
394    *                search.
395    * @param pattern The regular expression to search for and substitute
396    *                occurrences of.
397    * @param sub     The Substitution used to substitute pattern occurences.
398    * @param input   The input on which to perform substitutions.
399    * @param numSubs The number of substitutions to perform.  Only the
400    *                first <b> numSubs </b> patterns encountered are
401    *                substituted.  If you want to substitute all occurences
402    *                set this parameter to <b> SUBSTITUTE_ALL </b>.
403    * @return The number of substitutions made.
404    * @since 2.0.6
405    */
substitute(StringBuffer result, PatternMatcher matcher, Pattern pattern, Substitution sub, String input, int numSubs)406   public static int substitute(StringBuffer result,
407 			       PatternMatcher matcher, Pattern pattern,
408 			       Substitution sub, String input,
409 			       int numSubs)
410   {
411     PatternMatcherInput pinput = new PatternMatcherInput(input);
412     return substitute(result, matcher, pattern, sub, pinput, numSubs);
413   }
414 
415   /**
416    * Searches a string for a pattern and replaces the first occurrences
417    * of the pattern with a Substitution up to the number of
418    * substitutions specified by the <b>numSubs</b> parameter.  A
419    * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences
420    * of the pattern to be replaced.  The number of substitutions made
421    * is returned.
422    * <p>
423    * @param result  The StringBuffer in which to store the result of the
424    *                substitutions.  The buffer is only appended to.
425    * @param matcher The regular expression matcher to execute the pattern
426    *                search.
427    * @param pattern The regular expression to search for and substitute
428    *                occurrences of.
429    * @param sub     The Substitution used to substitute pattern occurences.
430    * @param input   The input on which to perform substitutions.
431    * @param numSubs The number of substitutions to perform.  Only the
432    *                first <b> numSubs </b> patterns encountered are
433    *                substituted.  If you want to substitute all occurences
434    *                set this parameter to <b> SUBSTITUTE_ALL </b>.
435    * @return The number of substitutions made.
436    * @since 2.0.3
437    */
substitute(StringBuffer result, PatternMatcher matcher, Pattern pattern, Substitution sub, PatternMatcherInput input, int numSubs)438   public static int substitute(StringBuffer result,
439 			       PatternMatcher matcher, Pattern pattern,
440 			       Substitution sub, PatternMatcherInput input,
441 			       int numSubs)
442   {
443     int beginOffset, subCount;
444     char[] inputBuffer;
445 
446     subCount    = 0;
447     beginOffset = input.getBeginOffset();
448     inputBuffer = input.getBuffer();
449 
450     // Must be != 0 because SUBSTITUTE_ALL is represented by -1.
451     // Do NOT change to numSubs > 0.
452     while(numSubs != 0 && matcher.contains(input, pattern)) {
453       --numSubs;
454       ++subCount;
455       result.append(inputBuffer, beginOffset,
456 		    input.getMatchBeginOffset() - beginOffset);
457       sub.appendSubstitution(result, matcher.getMatch(), subCount,
458 			     input, matcher, pattern);
459       beginOffset = input.getMatchEndOffset();
460     }
461 
462     result.append(inputBuffer, beginOffset, input.length() - beginOffset);
463     return subCount;
464   }
465 }
466