1 /*
2  * Copyright (C) 2005-2008 Jive Software. All rights reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package org.jivesoftware.openfire.nio;
18 
19 import java.nio.CharBuffer;
20 import java.nio.charset.Charset;
21 import java.nio.charset.CharsetDecoder;
22 import java.nio.charset.CodingErrorAction;
23 import java.util.ArrayList;
24 import java.util.List;
25 import java.util.Map;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28 
29 import org.apache.mina.core.buffer.IoBuffer;
30 import org.apache.mina.filter.codec.ProtocolDecoderException;
31 import org.jivesoftware.util.JiveGlobals;
32 import org.jivesoftware.util.PropertyEventDispatcher;
33 import org.jivesoftware.util.PropertyEventListener;
34 
/**
 * This is a lightweight XML parser.
 * It reads data from a channel and keeps collecting it for as long as data is
 * available in the channel.
 * When a message is complete you can retrieve the messages by invoking the method
 * getMsgs(), and you can invoke the method areThereMsgs() to find out whether at
 * least one message is present.
 *
 * @author Daniele Piras
 * @author Gaston Dombiak
 */
46 class XMLLightweightParser {
47 
48     private static final Pattern XML_HAS_CHARREF = Pattern.compile("&#(0*([0-9]+)|[xX]0*([0-9a-fA-F]+));");
49 
50     private static final String MAX_PROPERTY_NAME = "xmpp.parser.buffer.size";
51     private static int maxBufferSize;
52     // Chars that rappresent CDATA section start
53     protected static char[] CDATA_START = {'<', '!', '[', 'C', 'D', 'A', 'T', 'A', '['};
54     // Chars that rappresent CDATA section end
55     protected static char[] CDATA_END = {']', ']', '>'};
56 
57     // Buffer with all data retrieved
58     protected StringBuilder buffer = new StringBuilder();
59 
60     // ---- INTERNAL STATUS -------
61     // Initial status
62     protected static final int INIT = 0;
63     // Status used when the first tag name is retrieved
64     protected static final int HEAD = 2;
65     // Status used when robot is inside the xml and it looking for the tag conclusion
66     protected static final int INSIDE = 3;
67     // Status used when a '<' is found and try to find the conclusion tag.
68     protected static final int PRETAIL = 4;
69     // Status used when the ending tag is equal to the head tag
70     protected static final int TAIL = 5;
71     // Status used when robot is inside the main tag and found an '/' to check '/>'.
72     protected static final int VERIFY_CLOSE_TAG = 6;
73     //  Status used when you are inside a parameter
74     protected static final int INSIDE_PARAM_VALUE = 7;
75     //  Status used when you are inside a cdata section
76     protected static final int INSIDE_CDATA = 8;
77     // Status used when you are outside a tag/reading text
78     protected static final int OUTSIDE = 9;
79 
80     final String[] sstatus = {"INIT", "", "HEAD", "INSIDE", "PRETAIL", "TAIL", "VERIFY", "INSIDE_PARAM", "INSIDE_CDATA", "OUTSIDE"};
81 
82 
83     // Current robot status
84     protected int status = XMLLightweightParser.INIT;
85 
86     // Index to looking for a CDATA section start or end.
87     protected int cdataOffset = 0;
88 
89     // Number of chars that machs with the head tag. If the tailCount is equal to
90     // the head length so a close tag is found.
91     protected int tailCount = 0;
92     // Indicate the starting point in the buffer for the next message.
93     protected int startLastMsg = 0;
94     // Flag used to discover tag in the form <tag />.
95     protected boolean insideRootTag = false;
96     // Object conteining the head tag
97     protected StringBuilder head = new StringBuilder(16);
98     // List with all finished messages found.
99     protected List<String> msgs = new ArrayList<>();
100     private int depth = 0;
101 
102     protected boolean insideChildrenTag = false;
103 
104     CharsetDecoder encoder;
105 
106     static {
107         // Set default max buffer size to 1MB. If limit is reached then close connection
108         maxBufferSize = JiveGlobals.getIntProperty(MAX_PROPERTY_NAME, 1048576);
109         // Listen for changes to this property
PropertyEventDispatcher.addListener(new PropertyListener())110         PropertyEventDispatcher.addListener(new PropertyListener());
111     }
112 
XMLLightweightParser(Charset charset)113     public XMLLightweightParser(Charset charset) {
114         encoder = charset.newDecoder()
115             .onMalformedInput(CodingErrorAction.REPLACE)
116             .onUnmappableCharacter(CodingErrorAction.REPLACE);
117     }
118 
119     /*
120     * true if the parser has found some complete xml message.
121     */
areThereMsgs()122     public boolean areThereMsgs() {
123         return (msgs.size() > 0);
124     }
125 
126     /*
127     * @return an array with all messages found
128     */
getMsgs()129     public String[] getMsgs() {
130         String[] res = new String[msgs.size()];
131         for (int i = 0; i < res.length; i++) {
132             res[i] = msgs.get(i);
133         }
134         msgs.clear();
135         invalidateBuffer();
136         return res;
137     }
138 
139     /*
140     * Method use to re-initialize the buffer
141     */
invalidateBuffer()142     protected void invalidateBuffer() {
143         if (buffer.length() > 0) {
144             String str = buffer.substring(startLastMsg);
145             buffer.delete(0, buffer.length());
146             buffer.append(str);
147             buffer.trimToSize();
148         }
149         startLastMsg = 0;
150     }
151 
152 
153     /*
154     * Method that add a message to the list and reinit parser.
155     */
foundMsg(String msg)156     protected void foundMsg(String msg) throws XMLNotWellFormedException {
157         // Add message to the complete message list
158         if (msg != null) {
159             if (hasIllegalCharacterReferences(msg)) {
160                 buffer = null;
161                 throw new XMLNotWellFormedException("Illegal character reference found in: " + msg);
162             }
163             msgs.add(msg);
164         }
165         // Move the position into the buffer
166         status = XMLLightweightParser.INIT;
167         tailCount = 0;
168         cdataOffset = 0;
169         head.setLength(0);
170         insideRootTag = false;
171         insideChildrenTag = false;
172         depth = 0;
173     }
174 
175     /*
176     * Main reading method
177     */
read(IoBuffer byteBuffer)178     public void read(IoBuffer byteBuffer) throws Exception {
179         if (buffer == null) {
180             // exception was thrown before, avoid duplicate exception(s)
181             // "read" and discard remaining data
182             byteBuffer.position(byteBuffer.limit());
183             return;
184         }
185         invalidateBuffer();
186         // Check that the buffer is not bigger than 1 Megabyte. For security reasons
187         // we will abort parsing when 1 Mega of queued chars was found.
188         if (buffer.length() > maxBufferSize) {
189             // purge the local buffer / free memory
190             buffer = null;
191             // processing the exception takes quite long
192             final ProtocolDecoderException ex = new ProtocolDecoderException("Stopped parsing never ending stanza");
193             ex.setHexdump("(redacted hex dump of never ending stanza)");
194             throw ex;
195         }
196         CharBuffer charBuffer = CharBuffer.allocate(byteBuffer.capacity());
197         encoder.reset();
198         encoder.decode(byteBuffer.buf(), charBuffer, false);
199         char[] buf = new char[charBuffer.position()];
200         charBuffer.flip();
201         charBuffer.get(buf);
202         int readChar = buf.length;
203 
204         // Just return if nothing was read
205         if (readChar == 0) {
206             return;
207         }
208 
209         buffer.append(buf);
210 
211         // Robot.
212         char ch;
213         boolean isHighSurrogate = false;
214         for (int i = 0; i < readChar; i++) {
215             ch = buf[i];
216             if (ch < 0x20 && ch != 0x9 && ch != 0xA && ch != 0xD) {
217                  //Unicode characters in the range 0x0000-0x001F other than 9, A, and D are not allowed in XML
218                 buffer = null;
219                 throw new XMLNotWellFormedException("Character is invalid in: " + ch);
220             }
221             if (isHighSurrogate) {
222                 if (Character.isLowSurrogate(ch)) {
223                     // Everything is fine. Clean up traces for surrogates
224                     isHighSurrogate = false;
225                 }
226                 else {
227                     // Trigger error. Found high surrogate not followed by low surrogate
228                     buffer = null;
229                     throw new Exception("Found high surrogate not followed by low surrogate");
230                 }
231             }
232             else if (Character.isHighSurrogate(ch)) {
233                 isHighSurrogate = true;
234             }
235             else if (Character.isLowSurrogate(ch)) {
236                 // Trigger error. Found low surrogate char without a preceding high surrogate
237                 buffer = null;
238                 throw new Exception("Found low surrogate char without a preceding high surrogate");
239             }
240             if (status == XMLLightweightParser.TAIL) {
241                 // Looking for the close tag
242                 if (depth < 1 && ch == head.charAt(tailCount)) {
243                     tailCount++;
244                     if (tailCount == head.length()) {
245                         // Close stanza found!
246                         // Calculate the correct start,end position of the message into the buffer
247                         int end = buffer.length() - readChar + (i + 1);
248                         String msg = buffer.substring(startLastMsg, end);
249                         // Add message to the list
250                         foundMsg(msg);
251                         startLastMsg = end;
252                     }
253                 } else {
254                     tailCount = 0;
255                     status = XMLLightweightParser.INSIDE;
256                 }
257             } else if (status == XMLLightweightParser.PRETAIL) {
258                 if (ch == XMLLightweightParser.CDATA_START[cdataOffset]) {
259                     cdataOffset++;
260                     if (cdataOffset == XMLLightweightParser.CDATA_START.length) {
261                         status = XMLLightweightParser.INSIDE_CDATA;
262                         cdataOffset = 0;
263                         continue;
264                     }
265                 } else {
266                     cdataOffset = 0;
267                     status = XMLLightweightParser.INSIDE;
268                 }
269                 if (ch == '/') {
270                     status = XMLLightweightParser.TAIL;
271                     depth--;
272                 }
273                 else if (ch == '!') {
274                     // This is a <! (comment) so ignore it
275                     status = XMLLightweightParser.INSIDE;
276                 }
277                 else {
278                     depth++;
279                 }
280             } else if (status == XMLLightweightParser.VERIFY_CLOSE_TAG) {
281                 if (ch == '>') {
282                     depth--;
283                     status = XMLLightweightParser.OUTSIDE;
284                     if (depth < 1) {
285                         // Found a tag in the form <tag />
286                         int end = buffer.length() - readChar + (i + 1);
287                         String msg = buffer.substring(startLastMsg, end);
288                         // Add message to the list
289                         foundMsg(msg);
290                         startLastMsg = end;
291                     }
292                 } else if (ch == '<') {
293                     status = XMLLightweightParser.PRETAIL;
294                     insideChildrenTag = true;
295                 } else {
296                     status = XMLLightweightParser.INSIDE;
297                 }
298             } else if (status == XMLLightweightParser.INSIDE_PARAM_VALUE) {
299 
300                 if (ch == '"') {
301                     status = XMLLightweightParser.INSIDE;
302                 }
303             } else if (status == XMLLightweightParser.INSIDE_CDATA) {
304                 if (ch == XMLLightweightParser.CDATA_END[cdataOffset]) {
305                     cdataOffset++;
306                     if (cdataOffset == XMLLightweightParser.CDATA_END.length) {
307                         status = XMLLightweightParser.OUTSIDE;
308                         cdataOffset = 0;
309                     }
310                 } else if (cdataOffset == XMLLightweightParser.CDATA_END.length-1 && ch == XMLLightweightParser.CDATA_END[cdataOffset - 1]) {
311                     // if we are looking for the last CDATA_END char, and we instead found an extra ']'
312                     // char, leave cdataOffset as is and proceed to the next char. This could be a case
313                     // where the XML character data ends with multiple square braces. For Example ]]]>
314                 } else {
315                     cdataOffset = 0;
316                 }
317             } else if (status == XMLLightweightParser.INSIDE) {
318                 if (ch == XMLLightweightParser.CDATA_START[cdataOffset]) {
319                     cdataOffset++;
320                     if (cdataOffset == XMLLightweightParser.CDATA_START.length) {
321                         status = XMLLightweightParser.INSIDE_CDATA;
322                         cdataOffset = 0;
323                         continue;
324                     }
325                 } else {
326                     cdataOffset = 0;
327                     status = XMLLightweightParser.INSIDE;
328                 }
329                 if (ch == '"') {
330                     status = XMLLightweightParser.INSIDE_PARAM_VALUE;
331                 } else if (ch == '>') {
332                     status = XMLLightweightParser.OUTSIDE;
333                     if (insideRootTag && (head.length() == 14 || head.length() == 5 || head.length() == 13)) {
334                         final String headString = head.toString();
335                         if ("stream:stream>".equals(headString)
336                             || "?xml>".equals(headString)) {
337                             // Found closing stream:stream
338                             int end = buffer.length() - readChar + (i + 1);
339                             // Skip LF, CR and other "weird" characters that could appear
340                             while (startLastMsg < end && '<' != buffer.charAt(startLastMsg)) {
341                                 startLastMsg++;
342                             }
343                             String msg = buffer.substring(startLastMsg, end);
344                             foundMsg(msg);
345                             startLastMsg = end;
346                         }
347                     }
348                     insideRootTag = false;
349                 } else if (ch == '/') {
350                     status = XMLLightweightParser.VERIFY_CLOSE_TAG;
351                 }
352             } else if (status == XMLLightweightParser.HEAD) {
353                 if (Character.isWhitespace(ch) || ch == '>') {
354                     // Append > to head to allow searching </tag>
355                     head.append('>');
356                     if(ch == '>')
357                         status = XMLLightweightParser.OUTSIDE;
358                     else
359                         status = XMLLightweightParser.INSIDE;
360                     insideRootTag = true;
361                     insideChildrenTag = false;
362                     continue;
363                 }
364                 else if (ch == '/' && head.length() > 0) {
365                     status = XMLLightweightParser.VERIFY_CLOSE_TAG;
366                     depth--;
367                 }
368                 head.append(ch);
369 
370             } else if (status == XMLLightweightParser.INIT) {
371                 if (ch == '<') {
372                     status = XMLLightweightParser.HEAD;
373                     depth = 1;
374                 }
375                 else {
376                     startLastMsg++;
377                 }
378             } else if (status == XMLLightweightParser.OUTSIDE) {
379                 if (ch == '<') {
380                     status = XMLLightweightParser.PRETAIL;
381                     cdataOffset = 1;
382                     insideChildrenTag = true;
383                 }
384             }
385         }
386         if (head.length() == 15 || head.length() == 14) {
387             final String headString = head.toString();
388             if ("/stream:stream>".equals(headString)) {
389                 foundMsg("</stream:stream>");
390             }
391         }
392     }
393 
394     /**
395      * This method verifies if the provided argument contains at least one numeric character reference (
396      * <code>CharRef	   ::=   	'&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';</code>) for which the decimal or hexidecimal
397      * character value refers to an invalid XML 1.0 character.
398      *
399      * @param string
400      *            The input string
401      * @return {@code true} if the input string contains an invalid numeric character reference, {@code false}
402      *         otherwise.
403      * @see http://www.w3.org/TR/2008/REC-xml-20081126/#dt-charref
404      */
hasIllegalCharacterReferences(String string)405     public static boolean hasIllegalCharacterReferences(String string) {
406         // If there's no character reference, don't bother to do more specific checking.
407         final Matcher matcher = XML_HAS_CHARREF.matcher(string);
408 
409         while (matcher.find()) {
410             final String decValue = matcher.group(2);
411             if (decValue != null) {
412                 final int value = Integer.parseInt(decValue);
413                 if (!isLegalXmlCharacter(value)) {
414                     return true;
415                 } else {
416                     continue;
417                 }
418             }
419 
420             final String hexValue = matcher.group(3);
421             if (hexValue != null) {
422                 final int value = Integer.parseInt(hexValue, 16);
423                 if (!isLegalXmlCharacter(value)) {
424                     return true;
425                 } else {
426                     continue;
427                 }
428             }
429 
430             // This is bad. The XML_HAS_CHARREF expression should have a hit for either the decimal
431             // or the heximal notation.
432             throw new IllegalStateException(
433                     "An error occurred while searching for illegal character references in the value [" + string + "].");
434         }
435 
436         return false;
437     }
438 
439     /**
440      * Verifies if the codepoint value represents a valid character as defined in paragraph 2.2 of
441      * "Extensible Markup Language (XML) 1.0 (Fifth Edition)"
442      *
443      * @param value
444      *            the codepoint
445      * @return {@code true} if the codepoint is a valid charater per XML 1.0 definition, {@code false} otherwise.
446      * @see http://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char
447      */
isLegalXmlCharacter(int value)448     public static boolean isLegalXmlCharacter(int value) {
449         return value == 0x9 || value == 0xA || value == 0xD || (value >= 0x20 && value <= 0xD7FF)
450                 || (value >= 0xE000 && value <= 0xFFFD) || (value >= 0x10000 && value <= 0x10FFFF);
451     }
452 
453     private static class PropertyListener implements PropertyEventListener {
454         @Override
propertySet(String property, Map<String, Object> params)455         public void propertySet(String property, Map<String, Object> params) {
456             if (MAX_PROPERTY_NAME.equals(property)) {
457                 String value = (String) params.get("value");
458                 if (value != null) {
459                     maxBufferSize = Integer.parseInt(value);
460                 }
461             }
462         }
463 
464         @Override
propertyDeleted(String property, Map<String, Object> params)465         public void propertyDeleted(String property, Map<String, Object> params) {
466             if (MAX_PROPERTY_NAME.equals(property)) {
467                 // Use default value when none was specified
468                 maxBufferSize = 1048576;
469             }
470         }
471 
472         @Override
xmlPropertySet(String property, Map<String, Object> params)473         public void xmlPropertySet(String property, Map<String, Object> params) {
474             // Do nothing
475         }
476 
477         @Override
xmlPropertyDeleted(String property, Map<String, Object> params)478         public void xmlPropertyDeleted(String property, Map<String, Object> params) {
479             // Do nothing
480         }
481     }
482 }
483