1 /*
2  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 package com.sun.xml.internal.ws.encoding;
27 
28 import javax.xml.ws.WebServiceException;
29 
30 /**
31  * This class tokenizes RFC822 and MIME headers into the basic
32  * symbols specified by RFC822 and MIME. <p>
33  *
 * This class handles folded headers (i.e. headers with embedded
35  * CRLF SPACE sequences). The folds are removed in the returned
36  * tokens.
37  *
38  * @version 1.9, 02/03/27
39  * @author  John Mani
40  */
41 
42 class HeaderTokenizer {
43 
44     /**
45      * The Token class represents tokens returned by the
46      * HeaderTokenizer.
47      */
48     static class Token {
49 
50         private int type;
51         private String value;
52 
53         /**
54          * Token type indicating an ATOM.
55          */
56         public static final int ATOM            = -1;
57 
58         /**
59          * Token type indicating a quoted string. The value
60          * field contains the string without the quotes.
61          */
62         public static final int QUOTEDSTRING    = -2;
63 
64         /**
65          * Token type indicating a comment. The value field
66          * contains the comment string without the comment
67          * start and end symbols.
68          */
69         public static final int COMMENT         = -3;
70 
71         /**
72          * Token type indicating end of input.
73          */
74         public static final int  EOF            = -4;
75 
76         /**
77          * Constructor.
78          * @param       type    Token type
79          * @param       value   Token value
80          */
Token(int type, String value)81         public Token(int type, String value) {
82              this.type = type;
83              this.value = value;
84         }
85 
86         /**
87          * Return the type of the token. If the token represents a
88          * delimiter or a control character, the type is that character
89          * itself, converted to an integer. Otherwise, it's value is
90          * one of the following:
91          * <ul>
92          * <li><code>ATOM</code> A sequence of ASCII characters
93          *      delimited by either SPACE, CTL, "(", <"> or the
94          *      specified SPECIALS
95          * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
96          *      within quotes
97          * <li><code>COMMENT</code> A sequence of ASCII characters
98          *      within "(" and ")".
99          * <li><code>EOF</code> End of header
100          * </ul>
101          */
getType()102         public int getType() {
103             return type;
104         }
105 
106         /**
107          * Returns the value of the token just read. When the current
108          * token is a quoted string, this field contains the body of the
109          * string, without the quotes. When the current token is a comment,
110          * this field contains the body of the comment.
111          *
112          * @return      token value
113          */
getValue()114         public String getValue() {
115             return value;
116         }
117     }
118 
119     private String string; // the string to be tokenized
120     private boolean skipComments; // should comments be skipped ?
121     private String delimiters; // delimiter string
122     private int currentPos; // current parse position
123     private int maxPos; // string length
124     private int nextPos; // track start of next Token for next()
125     private int peekPos; // track start of next Token for peek()
126 
127     /**
128      * RFC822 specials
129      */
130     private final static String RFC822 = "()<>@,;:\\\"\t .[]";
131 
132     /**
133      * MIME specials
134      */
135     final static String MIME = "()<>@,;:\\\"\t []/?=";
136 
137     // The EOF Token
138     private final static Token EOFToken = new Token(Token.EOF, null);
139 
140     /**
141      * Constructor that takes a rfc822 style header.
142      *
143      * @param   header  The rfc822 header to be tokenized
144      * @param   delimiters      Set of delimiter characters
145      *                          to be used to delimit ATOMS. These
146      *                          are usually <code>RFC822</code> or
147      *                          <code>MIME</code>
148      * @param   skipComments  If true, comments are skipped and
149      *                          not returned as tokens
150      */
HeaderTokenizer(String header, String delimiters, boolean skipComments)151     HeaderTokenizer(String header, String delimiters,
152                            boolean skipComments) {
153         string = (header == null) ? "" : header; // paranoia ?!
154         this.skipComments = skipComments;
155         this.delimiters = delimiters;
156         currentPos = nextPos = peekPos = 0;
157         maxPos = string.length();
158     }
159 
160     /**
161      * Constructor. Comments are ignored and not returned as tokens
162      *
163      * @param   header  The header that is tokenized
164      * @param   delimiters  The delimiters to be used
165      */
HeaderTokenizer(String header, String delimiters)166     HeaderTokenizer(String header, String delimiters) {
167             this(header, delimiters, true);
168     }
169 
170     /**
171      * Constructor. The RFC822 defined delimiters - RFC822 - are
172      * used to delimit ATOMS. Also comments are skipped and not
173      * returned as tokens
174      */
HeaderTokenizer(String header)175     HeaderTokenizer(String header)  {
176             this(header, RFC822);
177     }
178 
179     /**
180      * Parses the next token from this String. <p>
181      *
182      * Clients sit in a loop calling next() to parse successive
183      * tokens until an EOF Token is returned.
184      *
185      * @return          the next Token
186      * @exception WebServiceException if the parse fails
187      */
next()188     Token next() throws WebServiceException {
189         Token tk;
190 
191         currentPos = nextPos; // setup currentPos
192         tk = getNext();
193         nextPos = peekPos = currentPos; // update currentPos and peekPos
194         return tk;
195     }
196 
197     /**
198      * Peek at the next token, without actually removing the token
199      * from the parse stream. Invoking this method multiple times
200      * will return successive tokens, until <code>next()</code> is
201      * called. <p>
202      *
203      * @return          the next Token
204      * @exception       WebServiceException if the parse fails
205      */
peek()206     Token peek() throws WebServiceException {
207         Token tk;
208 
209         currentPos = peekPos; // setup currentPos
210         tk = getNext();
211         peekPos = currentPos; // update peekPos
212         return tk;
213     }
214 
215     /**
216      * Return the rest of the Header.
217      *
218      * @return String   rest of header. null is returned if we are
219      *                  already at end of header
220      */
getRemainder()221     String getRemainder() {
222             return string.substring(nextPos);
223     }
224 
225     /*
226      * Return the next token starting from 'currentPos'. After the
227      * parse, 'currentPos' is updated to point to the start of the
228      * next token.
229      */
getNext()230     private Token getNext() throws WebServiceException {
231         // If we're already at end of string, return EOF
232         if (currentPos >= maxPos)
233             return EOFToken;
234 
235         // Skip white-space, position currentPos beyond the space
236         if (skipWhiteSpace() == Token.EOF)
237             return EOFToken;
238 
239         char c;
240         int start;
241         boolean filter = false;
242 
243         c = string.charAt(currentPos);
244 
245         // Check or Skip comments and position currentPos
246         // beyond the comment
247         while (c == '(') {
248             // Parsing comment ..
249             int nesting;
250             for (start = ++currentPos, nesting = 1;
251              nesting > 0 && currentPos < maxPos;
252              currentPos++) {
253             c = string.charAt(currentPos);
254             if (c == '\\') {  // Escape sequence
255                 currentPos++; // skip the escaped character
256                 filter = true;
257             } else if (c == '\r')
258                 filter = true;
259             else if (c == '(')
260                 nesting++;
261             else if (c == ')')
262                 nesting--;
263             }
264             if (nesting != 0)
265             throw new WebServiceException("Unbalanced comments");
266 
267             if (!skipComments) {
268             // Return the comment, if we are asked to.
269             // Note that the comment start & end markers are ignored.
270             String s;
271             if (filter) // need to go thru the token again.
272                 s = filterToken(string, start, currentPos-1);
273             else
274                 s = string.substring(start,currentPos-1);
275 
276             return new Token(Token.COMMENT, s);
277             }
278 
279             // Skip any whitespace after the comment.
280             if (skipWhiteSpace() == Token.EOF)
281             return EOFToken;
282             c = string.charAt(currentPos);
283         }
284 
285         // Check for quoted-string and position currentPos
286         //  beyond the terminating quote
287         if (c == '"') {
288             for (start = ++currentPos; currentPos < maxPos; currentPos++) {
289             c = string.charAt(currentPos);
290             if (c == '\\') { // Escape sequence
291                 currentPos++;
292                 filter = true;
293             } else if (c == '\r')
294                 filter = true;
295             else if (c == '"') {
296                 currentPos++;
297                 String s;
298 
299                 if (filter)
300                 s = filterToken(string, start, currentPos-1);
301                 else
302                 s = string.substring(start,currentPos-1);
303 
304                 return new Token(Token.QUOTEDSTRING, s);
305             }
306             }
307             throw new WebServiceException("Unbalanced quoted string");
308         }
309 
310         // Check for SPECIAL or CTL
311         if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
312             currentPos++; // re-position currentPos
313             char ch[] = new char[1];
314             ch[0] = c;
315             return new Token((int)c, new String(ch));
316         }
317 
318         // Check for ATOM
319         for (start = currentPos; currentPos < maxPos; currentPos++) {
320             c = string.charAt(currentPos);
321             // ATOM is delimited by either SPACE, CTL, "(", <">
322             // or the specified SPECIALS
323             if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
324             c == '"' || delimiters.indexOf(c) >= 0)
325             break;
326         }
327         return new Token(Token.ATOM, string.substring(start, currentPos));
328         }
329 
330         // Skip SPACE, HT, CR and NL
skipWhiteSpace()331         private int skipWhiteSpace() {
332         char c;
333         for (; currentPos < maxPos; currentPos++)
334             if (((c = string.charAt(currentPos)) != ' ') &&
335             (c != '\t') && (c != '\r') && (c != '\n'))
336             return currentPos;
337         return Token.EOF;
338     }
339 
340     /* Process escape sequences and embedded LWSPs from a comment or
341      * quoted string.
342      */
filterToken(String s, int start, int end)343     private static String filterToken(String s, int start, int end) {
344         StringBuffer sb = new StringBuffer();
345         char c;
346         boolean gotEscape = false;
347         boolean gotCR = false;
348 
349         for (int i = start; i < end; i++) {
350             c = s.charAt(i);
351             if (c == '\n' && gotCR) {
352             // This LF is part of an unescaped
353             // CRLF sequence (i.e, LWSP). Skip it.
354             gotCR = false;
355             continue;
356             }
357 
358             gotCR = false;
359             if (!gotEscape) {
360             // Previous character was NOT '\'
361             if (c == '\\') // skip this character
362                 gotEscape = true;
363             else if (c == '\r') // skip this character
364                 gotCR = true;
365             else // append this character
366                 sb.append(c);
367             } else {
368             // Previous character was '\'. So no need to
369             // bother with any special processing, just
370             // append this character
371             sb.append(c);
372             gotEscape = false;
373             }
374         }
375         return sb.toString();
376     }
377 }
378