1 /* 2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package com.sun.xml.internal.ws.encoding; 27 28 import javax.xml.ws.WebServiceException; 29 30 /** 31 * This class tokenizes RFC822 and MIME headers into the basic 32 * symbols specified by RFC822 and MIME. <p> 33 * 34 * This class handles folded headers (ie headers with embedded 35 * CRLF SPACE sequences). The folds are removed in the returned 36 * tokens. 37 * 38 * @version 1.9, 02/03/27 39 * @author John Mani 40 */ 41 42 class HeaderTokenizer { 43 44 /** 45 * The Token class represents tokens returned by the 46 * HeaderTokenizer. 47 */ 48 static class Token { 49 50 private int type; 51 private String value; 52 53 /** 54 * Token type indicating an ATOM. 55 */ 56 public static final int ATOM = -1; 57 58 /** 59 * Token type indicating a quoted string. The value 60 * field contains the string without the quotes. 61 */ 62 public static final int QUOTEDSTRING = -2; 63 64 /** 65 * Token type indicating a comment. The value field 66 * contains the comment string without the comment 67 * start and end symbols. 68 */ 69 public static final int COMMENT = -3; 70 71 /** 72 * Token type indicating end of input. 73 */ 74 public static final int EOF = -4; 75 76 /** 77 * Constructor. 78 * @param type Token type 79 * @param value Token value 80 */ Token(int type, String value)81 public Token(int type, String value) { 82 this.type = type; 83 this.value = value; 84 } 85 86 /** 87 * Return the type of the token. If the token represents a 88 * delimiter or a control character, the type is that character 89 * itself, converted to an integer. Otherwise, it's value is 90 * one of the following: 91 * <ul> 92 * <li><code>ATOM</code> A sequence of ASCII characters 93 * delimited by either SPACE, CTL, "(", <"> or the 94 * specified SPECIALS 95 * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters 96 * within quotes 97 * <li><code>COMMENT</code> A sequence of ASCII characters 98 * within "(" and ")". 99 * <li><code>EOF</code> End of header 100 * </ul> 101 */ getType()102 public int getType() { 103 return type; 104 } 105 106 /** 107 * Returns the value of the token just read. When the current 108 * token is a quoted string, this field contains the body of the 109 * string, without the quotes. When the current token is a comment, 110 * this field contains the body of the comment. 111 * 112 * @return token value 113 */ getValue()114 public String getValue() { 115 return value; 116 } 117 } 118 119 private String string; // the string to be tokenized 120 private boolean skipComments; // should comments be skipped ? 121 private String delimiters; // delimiter string 122 private int currentPos; // current parse position 123 private int maxPos; // string length 124 private int nextPos; // track start of next Token for next() 125 private int peekPos; // track start of next Token for peek() 126 127 /** 128 * RFC822 specials 129 */ 130 private final static String RFC822 = "()<>@,;:\\\"\t .[]"; 131 132 /** 133 * MIME specials 134 */ 135 final static String MIME = "()<>@,;:\\\"\t []/?="; 136 137 // The EOF Token 138 private final static Token EOFToken = new Token(Token.EOF, null); 139 140 /** 141 * Constructor that takes a rfc822 style header. 142 * 143 * @param header The rfc822 header to be tokenized 144 * @param delimiters Set of delimiter characters 145 * to be used to delimit ATOMS. These 146 * are usually <code>RFC822</code> or 147 * <code>MIME</code> 148 * @param skipComments If true, comments are skipped and 149 * not returned as tokens 150 */ HeaderTokenizer(String header, String delimiters, boolean skipComments)151 HeaderTokenizer(String header, String delimiters, 152 boolean skipComments) { 153 string = (header == null) ? "" : header; // paranoia ?! 154 this.skipComments = skipComments; 155 this.delimiters = delimiters; 156 currentPos = nextPos = peekPos = 0; 157 maxPos = string.length(); 158 } 159 160 /** 161 * Constructor. Comments are ignored and not returned as tokens 162 * 163 * @param header The header that is tokenized 164 * @param delimiters The delimiters to be used 165 */ HeaderTokenizer(String header, String delimiters)166 HeaderTokenizer(String header, String delimiters) { 167 this(header, delimiters, true); 168 } 169 170 /** 171 * Constructor. The RFC822 defined delimiters - RFC822 - are 172 * used to delimit ATOMS. Also comments are skipped and not 173 * returned as tokens 174 */ HeaderTokenizer(String header)175 HeaderTokenizer(String header) { 176 this(header, RFC822); 177 } 178 179 /** 180 * Parses the next token from this String. <p> 181 * 182 * Clients sit in a loop calling next() to parse successive 183 * tokens until an EOF Token is returned. 184 * 185 * @return the next Token 186 * @exception WebServiceException if the parse fails 187 */ next()188 Token next() throws WebServiceException { 189 Token tk; 190 191 currentPos = nextPos; // setup currentPos 192 tk = getNext(); 193 nextPos = peekPos = currentPos; // update currentPos and peekPos 194 return tk; 195 } 196 197 /** 198 * Peek at the next token, without actually removing the token 199 * from the parse stream. Invoking this method multiple times 200 * will return successive tokens, until <code>next()</code> is 201 * called. <p> 202 * 203 * @return the next Token 204 * @exception WebServiceException if the parse fails 205 */ peek()206 Token peek() throws WebServiceException { 207 Token tk; 208 209 currentPos = peekPos; // setup currentPos 210 tk = getNext(); 211 peekPos = currentPos; // update peekPos 212 return tk; 213 } 214 215 /** 216 * Return the rest of the Header. 217 * 218 * @return String rest of header. null is returned if we are 219 * already at end of header 220 */ getRemainder()221 String getRemainder() { 222 return string.substring(nextPos); 223 } 224 225 /* 226 * Return the next token starting from 'currentPos'. After the 227 * parse, 'currentPos' is updated to point to the start of the 228 * next token. 229 */ getNext()230 private Token getNext() throws WebServiceException { 231 // If we're already at end of string, return EOF 232 if (currentPos >= maxPos) 233 return EOFToken; 234 235 // Skip white-space, position currentPos beyond the space 236 if (skipWhiteSpace() == Token.EOF) 237 return EOFToken; 238 239 char c; 240 int start; 241 boolean filter = false; 242 243 c = string.charAt(currentPos); 244 245 // Check or Skip comments and position currentPos 246 // beyond the comment 247 while (c == '(') { 248 // Parsing comment .. 249 int nesting; 250 for (start = ++currentPos, nesting = 1; 251 nesting > 0 && currentPos < maxPos; 252 currentPos++) { 253 c = string.charAt(currentPos); 254 if (c == '\\') { // Escape sequence 255 currentPos++; // skip the escaped character 256 filter = true; 257 } else if (c == '\r') 258 filter = true; 259 else if (c == '(') 260 nesting++; 261 else if (c == ')') 262 nesting--; 263 } 264 if (nesting != 0) 265 throw new WebServiceException("Unbalanced comments"); 266 267 if (!skipComments) { 268 // Return the comment, if we are asked to. 269 // Note that the comment start & end markers are ignored. 270 String s; 271 if (filter) // need to go thru the token again. 272 s = filterToken(string, start, currentPos-1); 273 else 274 s = string.substring(start,currentPos-1); 275 276 return new Token(Token.COMMENT, s); 277 } 278 279 // Skip any whitespace after the comment. 280 if (skipWhiteSpace() == Token.EOF) 281 return EOFToken; 282 c = string.charAt(currentPos); 283 } 284 285 // Check for quoted-string and position currentPos 286 // beyond the terminating quote 287 if (c == '"') { 288 for (start = ++currentPos; currentPos < maxPos; currentPos++) { 289 c = string.charAt(currentPos); 290 if (c == '\\') { // Escape sequence 291 currentPos++; 292 filter = true; 293 } else if (c == '\r') 294 filter = true; 295 else if (c == '"') { 296 currentPos++; 297 String s; 298 299 if (filter) 300 s = filterToken(string, start, currentPos-1); 301 else 302 s = string.substring(start,currentPos-1); 303 304 return new Token(Token.QUOTEDSTRING, s); 305 } 306 } 307 throw new WebServiceException("Unbalanced quoted string"); 308 } 309 310 // Check for SPECIAL or CTL 311 if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) { 312 currentPos++; // re-position currentPos 313 char ch[] = new char[1]; 314 ch[0] = c; 315 return new Token((int)c, new String(ch)); 316 } 317 318 // Check for ATOM 319 for (start = currentPos; currentPos < maxPos; currentPos++) { 320 c = string.charAt(currentPos); 321 // ATOM is delimited by either SPACE, CTL, "(", <"> 322 // or the specified SPECIALS 323 if (c < 040 || c >= 0177 || c == '(' || c == ' ' || 324 c == '"' || delimiters.indexOf(c) >= 0) 325 break; 326 } 327 return new Token(Token.ATOM, string.substring(start, currentPos)); 328 } 329 330 // Skip SPACE, HT, CR and NL skipWhiteSpace()331 private int skipWhiteSpace() { 332 char c; 333 for (; currentPos < maxPos; currentPos++) 334 if (((c = string.charAt(currentPos)) != ' ') && 335 (c != '\t') && (c != '\r') && (c != '\n')) 336 return currentPos; 337 return Token.EOF; 338 } 339 340 /* Process escape sequences and embedded LWSPs from a comment or 341 * quoted string. 342 */ filterToken(String s, int start, int end)343 private static String filterToken(String s, int start, int end) { 344 StringBuffer sb = new StringBuffer(); 345 char c; 346 boolean gotEscape = false; 347 boolean gotCR = false; 348 349 for (int i = start; i < end; i++) { 350 c = s.charAt(i); 351 if (c == '\n' && gotCR) { 352 // This LF is part of an unescaped 353 // CRLF sequence (i.e, LWSP). Skip it. 354 gotCR = false; 355 continue; 356 } 357 358 gotCR = false; 359 if (!gotEscape) { 360 // Previous character was NOT '\' 361 if (c == '\\') // skip this character 362 gotEscape = true; 363 else if (c == '\r') // skip this character 364 gotCR = true; 365 else // append this character 366 sb.append(c); 367 } else { 368 // Previous character was '\'. So no need to 369 // bother with any special processing, just 370 // append this character 371 sb.append(c); 372 gotEscape = false; 373 } 374 } 375 return sb.toString(); 376 } 377 } 378