1 /* 2 * $Id: Util.java,v 1.15 2003/11/07 20:16:25 dfs Exp $ 3 * 4 * ==================================================================== 5 * The Apache Software License, Version 1.1 6 * 7 * Copyright (c) 2000-2002 The Apache Software Foundation. All rights 8 * reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in 19 * the documentation and/or other materials provided with the 20 * distribution. 21 * 22 * 3. The end-user documentation included with the redistribution, 23 * if any, must include the following acknowledgment: 24 * "This product includes software developed by the 25 * Apache Software Foundation (http://www.apache.org/)." 26 * Alternately, this acknowledgment may appear in the software itself, 27 * if and wherever such third-party acknowledgments normally appear. 28 * 29 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 30 * must not be used to endorse or promote products derived from this 31 * software without prior written permission. For written 32 * permission, please contact apache@apache.org. 33 * 34 * 5. Products derived from this software may not be called "Apache" 35 * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 36 * name, without prior written permission of the Apache Software Foundation. 37 * 38 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED 39 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 40 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 41 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR 42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 45 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 46 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 47 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 48 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 49 * SUCH DAMAGE. 50 * ==================================================================== 51 * 52 * This software consists of voluntary contributions made by many 53 * individuals on behalf of the Apache Software Foundation. For more 54 * information on the Apache Software Foundation, please see 55 * <http://www.apache.org/>. 56 */ 57 58 59 package org.apache.oro.text.regex; 60 61 import java.util.*; 62 63 /** 64 * The Util class is a holder for useful static utility methods that can 65 * be generically applied to Pattern and PatternMatcher instances. 66 * This class cannot and is not meant to be instantiated. 67 * The Util class currently contains versions of the split() and substitute() 68 * methods inspired by Perl's split function and <b>s</b> operation 69 * respectively, although they are implemented in such a way as not to 70 * rely on the Perl5 implementations of the OROMatcher packages regular 71 * expression interfaces. They may operate on any interface implementations 72 * conforming to the OROMatcher API specification for the PatternMatcher, 73 * Pattern, and MatchResult interfaces. Future versions of the class may 74 * include additional utility methods. 75 * <p> 76 * A grep method is not included for two reasons: 77 * <ol> 78 * <li> The details of reading a line at a time from an input stream 79 * differ in JDK 1.0.2 and JDK 1.1, making it difficult to 80 * retain compatibility across both Java releases. 81 * <li> Grep style processing is trivial for the programmer to implement 82 * in a while loop. Rarely does anyone want to retrieve all 83 * occurences of a pattern and then process them. More often a 84 * programmer will retrieve pattern matches and process them as they 85 * are retrieved, which is more efficient than storing them all in a 86 * Vector and then accessing them. 87 * </ol> 88 * 89 * @version @version@ 90 * @since 1.0 91 * @see Pattern 92 * @see PatternMatcher 93 */ 94 public final class Util { 95 /** 96 * A constant passed to the {@link #substitute substitute()} 97 * methods indicating that all occurrences of a pattern should be 98 * substituted. 99 */ 100 public static final int SUBSTITUTE_ALL = -1; 101 102 /** 103 * A constant passed to the {@link #split split()} methods 104 * indicating that all occurrences of a pattern should be used to 105 * split a string. 106 */ 107 public static final int SPLIT_ALL = 0; 108 109 /** 110 * The default destructor for the Util class. It is made private 111 * to prevent the instantiation of the class. 112 */ Util()113 private Util() { } 114 115 116 /** 117 * Splits up a <code>String</code> instance and stores results as a 118 * <code>List</code> of substrings numbering no more than a specified 119 * limit. The string is split with a regular expression as the delimiter. 120 * The <b>limit</b> parameter essentially says to split the 121 * string only on at most the first <b>limit - 1</b> number of pattern 122 * occurences. 123 * <p> 124 * This method is inspired by the Perl split() function and behaves 125 * identically to it when used in conjunction with the Perl5Matcher and 126 * Perl5Pattern classes except for the following difference: 127 * <ul><p> 128 * In Perl, if the split expression contains parentheses, the split() 129 * method creates additional list elements from each of the matching 130 * subgroups in the pattern. In other words: 131 * <ul><p> 132 * <code>split(list, "/([,-])/", "8-12,15,18", Util.SPLIT_ALL)</code></ul> 133 * <p> produces the list containing: 134 * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul> 135 * <p> The OROMatcher split method does not follow this behavior. The 136 * following list would be produced by OROMatcher: 137 * <ul><p><code> { "8", "12", "15", "18" } </code> </ul> 138 * <p> To obtain the Perl behavior, use 139 * {@link org.apache.oro.text.perl.Perl5Util#split}. 140 * </ul> 141 * <p> 142 * @param results A Collection to which the split results are appended. 143 * After the method returns, it contains the substrings of the input 144 * that occur between the regular expression delimiter occurences. 145 * The input will not be split into any more substrings than the 146 * specified <code>limit</code>. A way of thinking of this is that 147 * only the first <code>limit - 1</code> matches of the delimiting 148 * regular expression will be used to split the input. 149 * @param matcher The regular expression matcher to execute the split. 150 * @param pattern The regular expression to use as a split delimiter. 151 * @param input The <code>String</code> to split. 152 * @param limit The limit on the number of resulting split elements. 153 * Values <= 0 produce the same behavior as using the 154 * <b>SPLIT_ALL</b> constant which causes the limit to be 155 * ignored and splits to be performed on all occurrences of 156 * the pattern. You should use the <b>SPLIT_ALL</b> constant 157 * to achieve this behavior instead of relying on the default 158 * behavior associated with non-positive limit values. 159 * @since 2.0 160 */ split(Collection results, PatternMatcher matcher, Pattern pattern, String input, int limit)161 public static void split(Collection results, PatternMatcher matcher, 162 Pattern pattern, String input, int limit) 163 { 164 int beginOffset; 165 MatchResult currentResult; 166 PatternMatcherInput pinput; 167 168 pinput = new PatternMatcherInput(input); 169 beginOffset = 0; 170 171 while(--limit != 0 && matcher.contains(pinput, pattern)) { 172 currentResult = matcher.getMatch(); 173 results.add(input.substring(beginOffset, 174 currentResult.beginOffset(0))); 175 beginOffset = currentResult.endOffset(0); 176 } 177 178 results.add(input.substring(beginOffset, input.length())); 179 } 180 181 182 /** 183 * Splits up a <code>String</code> instance and stores results as a 184 * <code>Collection</code> of all its substrings using a regular expression 185 * as the delimiter. 186 * This method is inspired by the Perl split() function and behaves 187 * identically to it when used in conjunction with the Perl5Matcher and 188 * Perl5Pattern classes except for the following difference: 189 * <p> 190 * <ul> 191 * In Perl, if the split expression contains parentheses, the split() 192 * method creates additional list elements from each of the matching 193 * subgroups in the pattern. In other words: 194 * <ul><p><code>split(list, "/([,-])/", "8-12,15,18")</code></ul> 195 * <p> produces the list containing: 196 * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul> 197 * <p> The OROMatcher split method does not follow this behavior. The 198 * following list would be produced by OROMatcher: 199 * <ul><p><code> { "8", "12", "15", "18" } </code> </ul> 200 * <p> To obtain the Perl behavior, use 201 * {@link org.apache.oro.text.perl.Perl5Util#split}. 202 * </ul> 203 * <p> 204 * This method is identical to calling: 205 * <blockquote><pre> 206 * split(matcher, pattern, input, Util.SPLIT_ALL); 207 * </pre></blockquote> 208 * <p> 209 * @param results A <code>Collection</code> to which all the substrings of 210 * the input that occur between the regular expression delimiter 211 * occurences are appended. 212 * @param matcher The regular expression matcher to execute the split. 213 * @param pattern The regular expression to use as a split delimiter. 214 * @param input The <code>String</code> to split. 215 * @since 2.0 216 */ split(Collection results, PatternMatcher matcher, Pattern pattern, String input)217 public static void split(Collection results, PatternMatcher matcher, 218 Pattern pattern, String input) 219 { 220 split(results, matcher, pattern, input, SPLIT_ALL); 221 } 222 223 /** 224 * Splits up a <code>String</code> instance into strings contained in a 225 * <code>Vector</code> of size not greater than a specified limit. The 226 * string is split with a regular expression as the delimiter. 227 * The <b>limit</b> parameter essentially says to split the 228 * string only on at most the first <b>limit - 1</b> number of pattern 229 * occurences. 230 * <p> 231 * This method is inspired by the Perl split() function and behaves 232 * identically to it when used in conjunction with the Perl5Matcher and 233 * Perl5Pattern classes except for the following difference: 234 * <ul><p> 235 * In Perl, if the split expression contains parentheses, the split() 236 * method creates additional list elements from each of the matching 237 * subgroups in the pattern. In other words: 238 * <ul><p><code>split("/([,-])/", "8-12,15,18")</code></ul> 239 * <p> produces the Vector containing: 240 * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul> 241 * <p> The OROMatcher split method does not follow this behavior. The 242 * following Vector would be produced by OROMatcher: 243 * <ul><p><code> { "8", "12", "15", "18" } </code> </ul> 244 * <p> To obtain the Perl behavior, use 245 * {@link org.apache.oro.text.perl.Perl5Util#split}. 246 * </ul> 247 * <p> 248 * @deprecated Use 249 * {@link #split(Collection, PatternMatcher, Pattern, String, int)} instead. 250 * @param matcher The regular expression matcher to execute the split. 251 * @param pattern The regular expression to use as a split delimiter. 252 * @param input The <code>String</code> to split. 253 * @param limit The limit on the size of the returned <code>Vector</code>. 254 * Values <= 0 produce the same behavior as using the 255 * <b>SPLIT_ALL</b> constant which causes the limit to be 256 * ignored and splits to be performed on all occurrences of 257 * the pattern. You should use the <b>SPLIT_ALL</b> constant 258 * to achieve this behavior instead of relying on the default 259 * behavior associated with non-positive limit values. 260 * @return A <code>Vector</code> containing the substrings of the input 261 * that occur between the regular expression delimiter occurences. 262 * The input will not be split into any more substrings than the 263 * specified <code>limit</code>. A way of thinking of this is that 264 * only the first <code>limit - 1</code> matches of the delimiting 265 * regular expression will be used to split the input. 266 * @since 1.0 267 */ split(PatternMatcher matcher, Pattern pattern, String input, int limit)268 public static Vector split(PatternMatcher matcher, Pattern pattern, 269 String input, int limit) 270 { 271 Vector results = new Vector(20); 272 273 split(results, matcher, pattern, input, limit); 274 275 return results; 276 } 277 278 279 /** 280 * Splits up a <code>String</code> instance into a <code>Vector</code> 281 * of all its substrings using a regular expression as the delimiter. 282 * This method is inspired by the Perl split() function and behaves 283 * identically to it when used in conjunction with the Perl5Matcher and 284 * Perl5Pattern classes except for the following difference: 285 * <p> 286 * <ul> 287 * In Perl, if the split expression contains parentheses, the split() 288 * method creates additional list elements from each of the matching 289 * subgroups in the pattern. In other words: 290 * <ul><p><code>split("/([,-])/", "8-12,15,18")</code></ul> 291 * <p> produces the Vector containing: 292 * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul> 293 * <p> The OROMatcher split method does not follow this behavior. The 294 * following Vector would be produced by OROMatcher: 295 * <ul><p><code> { "8", "12", "15", "18" } </code> </ul> 296 * <p> To obtain the Perl behavior, use 297 * {@link org.apache.oro.text.perl.Perl5Util#split}. 298 * </ul> 299 * <p> 300 * This method is identical to calling: 301 * <blockquote><pre> 302 * split(matcher, pattern, input, Util.SPLIT_ALL); 303 * </pre></blockquote> 304 * <p> 305 * @deprecated Use 306 * {@link #split(Collection, PatternMatcher, Pattern, String)} instead. 307 * @param matcher The regular expression matcher to execute the split. 308 * @param pattern The regular expression to use as a split delimiter. 309 * @param input The <code>String</code> to split. 310 * @return A <code>Vector</code> containing all the substrings of the input 311 * that occur between the regular expression delimiter occurences. 312 * @since 1.0 313 */ split( PatternMatcher matcher, Pattern pattern, String input)314 public static Vector split( PatternMatcher matcher, Pattern pattern, 315 String input) 316 { 317 return split(matcher, pattern, input, SPLIT_ALL); 318 } 319 320 321 /** 322 * Searches a string for a pattern and replaces the first occurrences 323 * of the pattern with a Substitution up to the number of 324 * substitutions specified by the <b>numSubs</b> parameter. A 325 * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences 326 * of the pattern to be replaced. 327 * <p> 328 * @param matcher The regular expression matcher to execute the pattern 329 * search. 330 * @param pattern The regular expression to search for and substitute 331 * occurrences of. 332 * @param sub The Substitution used to substitute pattern occurences. 333 * @param input The <code>String</code> on which to perform substitutions. 334 * @param numSubs The number of substitutions to perform. Only the 335 * first <b> numSubs </b> patterns encountered are 336 * substituted. If you want to substitute all occurences 337 * set this parameter to <b> SUBSTITUTE_ALL </b>. 338 * @return A String comprising the input string with the substitutions, 339 * if any, made. If no substitutions are made, the returned String 340 * is the original input String. 341 * @since 1.0 342 */ substitute(PatternMatcher matcher, Pattern pattern, Substitution sub, String input, int numSubs)343 public static String substitute(PatternMatcher matcher, Pattern pattern, 344 Substitution sub, String input, int numSubs) 345 { 346 StringBuffer buffer = new StringBuffer(input.length()); 347 PatternMatcherInput pinput = new PatternMatcherInput(input); 348 349 // Users have indicated that they expect the result to be the 350 // original input string, rather than a copy, if no substitutions 351 // are performed, 352 if(substitute(buffer, matcher, pattern, sub, pinput, numSubs) != 0) 353 return buffer.toString(); 354 return input; 355 } 356 357 /** 358 * Searches a string for a pattern and substitutes only the first 359 * occurence of the pattern. 360 * <p> 361 * This method is identical to calling: 362 * <blockquote><pre> 363 * substitute(matcher, pattern, sub, input, 1); 364 * </pre></blockquote> 365 * <p> 366 * @param matcher The regular expression matcher to execute the pattern 367 * search. 368 * @param pattern The regular expression to search for and substitute 369 * occurrences of. 370 * @param sub The Substitution used to substitute pattern occurences. 371 * @param input The <code>String</code> on which to perform substitutions. 372 * @return A String comprising the input string with the substitutions, 373 * if any, made. If no substitutions are made, the returned String 374 * is the original input String. 375 * @since 1.0 376 */ substitute(PatternMatcher matcher, Pattern pattern, Substitution sub, String input)377 public static String substitute(PatternMatcher matcher, Pattern pattern, 378 Substitution sub, String input) 379 { 380 return substitute(matcher, pattern, sub, input, 1); 381 } 382 383 /** 384 * Searches a string for a pattern and replaces the first occurrences 385 * of the pattern with a Substitution up to the number of 386 * substitutions specified by the <b>numSubs</b> parameter. A 387 * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences 388 * of the pattern to be replaced. The number of substitutions made 389 * is returned. 390 * <p> 391 * @param result The StringBuffer in which to store the result of the 392 * substitutions. The buffer is only appended to. 393 * @param matcher The regular expression matcher to execute the pattern 394 * search. 395 * @param pattern The regular expression to search for and substitute 396 * occurrences of. 397 * @param sub The Substitution used to substitute pattern occurences. 398 * @param input The input on which to perform substitutions. 399 * @param numSubs The number of substitutions to perform. Only the 400 * first <b> numSubs </b> patterns encountered are 401 * substituted. If you want to substitute all occurences 402 * set this parameter to <b> SUBSTITUTE_ALL </b>. 403 * @return The number of substitutions made. 404 * @since 2.0.6 405 */ substitute(StringBuffer result, PatternMatcher matcher, Pattern pattern, Substitution sub, String input, int numSubs)406 public static int substitute(StringBuffer result, 407 PatternMatcher matcher, Pattern pattern, 408 Substitution sub, String input, 409 int numSubs) 410 { 411 PatternMatcherInput pinput = new PatternMatcherInput(input); 412 return substitute(result, matcher, pattern, sub, pinput, numSubs); 413 } 414 415 /** 416 * Searches a string for a pattern and replaces the first occurrences 417 * of the pattern with a Substitution up to the number of 418 * substitutions specified by the <b>numSubs</b> parameter. A 419 * <b>numSubs</b> value of <b>SUBSTITUTE_ALL</b> will cause all occurrences 420 * of the pattern to be replaced. The number of substitutions made 421 * is returned. 422 * <p> 423 * @param result The StringBuffer in which to store the result of the 424 * substitutions. The buffer is only appended to. 425 * @param matcher The regular expression matcher to execute the pattern 426 * search. 427 * @param pattern The regular expression to search for and substitute 428 * occurrences of. 429 * @param sub The Substitution used to substitute pattern occurences. 430 * @param input The input on which to perform substitutions. 431 * @param numSubs The number of substitutions to perform. Only the 432 * first <b> numSubs </b> patterns encountered are 433 * substituted. If you want to substitute all occurences 434 * set this parameter to <b> SUBSTITUTE_ALL </b>. 435 * @return The number of substitutions made. 436 * @since 2.0.3 437 */ substitute(StringBuffer result, PatternMatcher matcher, Pattern pattern, Substitution sub, PatternMatcherInput input, int numSubs)438 public static int substitute(StringBuffer result, 439 PatternMatcher matcher, Pattern pattern, 440 Substitution sub, PatternMatcherInput input, 441 int numSubs) 442 { 443 int beginOffset, subCount; 444 char[] inputBuffer; 445 446 subCount = 0; 447 beginOffset = input.getBeginOffset(); 448 inputBuffer = input.getBuffer(); 449 450 // Must be != 0 because SUBSTITUTE_ALL is represented by -1. 451 // Do NOT change to numSubs > 0. 452 while(numSubs != 0 && matcher.contains(input, pattern)) { 453 --numSubs; 454 ++subCount; 455 result.append(inputBuffer, beginOffset, 456 input.getMatchBeginOffset() - beginOffset); 457 sub.appendSubstitution(result, matcher.getMatch(), subCount, 458 input, matcher, pattern); 459 beginOffset = input.getMatchEndOffset(); 460 } 461 462 result.append(inputBuffer, beginOffset, input.length() - beginOffset); 463 return subCount; 464 } 465 } 466