1 /* 2 * Copyright (C) 2005-2008 Jive Software. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.jivesoftware.openfire.nio; 18 19 import java.nio.CharBuffer; 20 import java.nio.charset.Charset; 21 import java.nio.charset.CharsetDecoder; 22 import java.nio.charset.CodingErrorAction; 23 import java.util.ArrayList; 24 import java.util.List; 25 import java.util.Map; 26 import java.util.regex.Matcher; 27 import java.util.regex.Pattern; 28 29 import org.apache.mina.core.buffer.IoBuffer; 30 import org.apache.mina.filter.codec.ProtocolDecoderException; 31 import org.jivesoftware.util.JiveGlobals; 32 import org.jivesoftware.util.PropertyEventDispatcher; 33 import org.jivesoftware.util.PropertyEventListener; 34 35 /** 36 * This is a Light-Weight XML Parser. 37 * It read data from a channel and collect data until data are available in 38 * the channel. 39 * When a message is complete you can retrieve messages invoking the method 40 * getMsgs() and you can invoke the method areThereMsgs() to know if at least 41 * an message is presents. 42 * 43 * @author Daniele Piras 44 * @author Gaston Dombiak 45 */ 46 class XMLLightweightParser { 47 48 private static final Pattern XML_HAS_CHARREF = Pattern.compile("&#(0*([0-9]+)|[xX]0*([0-9a-fA-F]+));"); 49 50 private static final String MAX_PROPERTY_NAME = "xmpp.parser.buffer.size"; 51 private static int maxBufferSize; 52 // Chars that rappresent CDATA section start 53 protected static char[] CDATA_START = {'<', '!', '[', 'C', 'D', 'A', 'T', 'A', '['}; 54 // Chars that rappresent CDATA section end 55 protected static char[] CDATA_END = {']', ']', '>'}; 56 57 // Buffer with all data retrieved 58 protected StringBuilder buffer = new StringBuilder(); 59 60 // ---- INTERNAL STATUS ------- 61 // Initial status 62 protected static final int INIT = 0; 63 // Status used when the first tag name is retrieved 64 protected static final int HEAD = 2; 65 // Status used when robot is inside the xml and it looking for the tag conclusion 66 protected static final int INSIDE = 3; 67 // Status used when a '<' is found and try to find the conclusion tag. 68 protected static final int PRETAIL = 4; 69 // Status used when the ending tag is equal to the head tag 70 protected static final int TAIL = 5; 71 // Status used when robot is inside the main tag and found an '/' to check '/>'. 72 protected static final int VERIFY_CLOSE_TAG = 6; 73 // Status used when you are inside a parameter 74 protected static final int INSIDE_PARAM_VALUE = 7; 75 // Status used when you are inside a cdata section 76 protected static final int INSIDE_CDATA = 8; 77 // Status used when you are outside a tag/reading text 78 protected static final int OUTSIDE = 9; 79 80 final String[] sstatus = {"INIT", "", "HEAD", "INSIDE", "PRETAIL", "TAIL", "VERIFY", "INSIDE_PARAM", "INSIDE_CDATA", "OUTSIDE"}; 81 82 83 // Current robot status 84 protected int status = XMLLightweightParser.INIT; 85 86 // Index to looking for a CDATA section start or end. 87 protected int cdataOffset = 0; 88 89 // Number of chars that machs with the head tag. If the tailCount is equal to 90 // the head length so a close tag is found. 91 protected int tailCount = 0; 92 // Indicate the starting point in the buffer for the next message. 93 protected int startLastMsg = 0; 94 // Flag used to discover tag in the form <tag />. 95 protected boolean insideRootTag = false; 96 // Object conteining the head tag 97 protected StringBuilder head = new StringBuilder(16); 98 // List with all finished messages found. 99 protected List<String> msgs = new ArrayList<>(); 100 private int depth = 0; 101 102 protected boolean insideChildrenTag = false; 103 104 CharsetDecoder encoder; 105 106 static { 107 // Set default max buffer size to 1MB. If limit is reached then close connection 108 maxBufferSize = JiveGlobals.getIntProperty(MAX_PROPERTY_NAME, 1048576); 109 // Listen for changes to this property PropertyEventDispatcher.addListener(new PropertyListener())110 PropertyEventDispatcher.addListener(new PropertyListener()); 111 } 112 XMLLightweightParser(Charset charset)113 public XMLLightweightParser(Charset charset) { 114 encoder = charset.newDecoder() 115 .onMalformedInput(CodingErrorAction.REPLACE) 116 .onUnmappableCharacter(CodingErrorAction.REPLACE); 117 } 118 119 /* 120 * true if the parser has found some complete xml message. 121 */ areThereMsgs()122 public boolean areThereMsgs() { 123 return (msgs.size() > 0); 124 } 125 126 /* 127 * @return an array with all messages found 128 */ getMsgs()129 public String[] getMsgs() { 130 String[] res = new String[msgs.size()]; 131 for (int i = 0; i < res.length; i++) { 132 res[i] = msgs.get(i); 133 } 134 msgs.clear(); 135 invalidateBuffer(); 136 return res; 137 } 138 139 /* 140 * Method use to re-initialize the buffer 141 */ invalidateBuffer()142 protected void invalidateBuffer() { 143 if (buffer.length() > 0) { 144 String str = buffer.substring(startLastMsg); 145 buffer.delete(0, buffer.length()); 146 buffer.append(str); 147 buffer.trimToSize(); 148 } 149 startLastMsg = 0; 150 } 151 152 153 /* 154 * Method that add a message to the list and reinit parser. 155 */ foundMsg(String msg)156 protected void foundMsg(String msg) throws XMLNotWellFormedException { 157 // Add message to the complete message list 158 if (msg != null) { 159 if (hasIllegalCharacterReferences(msg)) { 160 buffer = null; 161 throw new XMLNotWellFormedException("Illegal character reference found in: " + msg); 162 } 163 msgs.add(msg); 164 } 165 // Move the position into the buffer 166 status = XMLLightweightParser.INIT; 167 tailCount = 0; 168 cdataOffset = 0; 169 head.setLength(0); 170 insideRootTag = false; 171 insideChildrenTag = false; 172 depth = 0; 173 } 174 175 /* 176 * Main reading method 177 */ read(IoBuffer byteBuffer)178 public void read(IoBuffer byteBuffer) throws Exception { 179 if (buffer == null) { 180 // exception was thrown before, avoid duplicate exception(s) 181 // "read" and discard remaining data 182 byteBuffer.position(byteBuffer.limit()); 183 return; 184 } 185 invalidateBuffer(); 186 // Check that the buffer is not bigger than 1 Megabyte. For security reasons 187 // we will abort parsing when 1 Mega of queued chars was found. 188 if (buffer.length() > maxBufferSize) { 189 // purge the local buffer / free memory 190 buffer = null; 191 // processing the exception takes quite long 192 final ProtocolDecoderException ex = new ProtocolDecoderException("Stopped parsing never ending stanza"); 193 ex.setHexdump("(redacted hex dump of never ending stanza)"); 194 throw ex; 195 } 196 CharBuffer charBuffer = CharBuffer.allocate(byteBuffer.capacity()); 197 encoder.reset(); 198 encoder.decode(byteBuffer.buf(), charBuffer, false); 199 char[] buf = new char[charBuffer.position()]; 200 charBuffer.flip(); 201 charBuffer.get(buf); 202 int readChar = buf.length; 203 204 // Just return if nothing was read 205 if (readChar == 0) { 206 return; 207 } 208 209 buffer.append(buf); 210 211 // Robot. 212 char ch; 213 boolean isHighSurrogate = false; 214 for (int i = 0; i < readChar; i++) { 215 ch = buf[i]; 216 if (ch < 0x20 && ch != 0x9 && ch != 0xA && ch != 0xD) { 217 //Unicode characters in the range 0x0000-0x001F other than 9, A, and D are not allowed in XML 218 buffer = null; 219 throw new XMLNotWellFormedException("Character is invalid in: " + ch); 220 } 221 if (isHighSurrogate) { 222 if (Character.isLowSurrogate(ch)) { 223 // Everything is fine. Clean up traces for surrogates 224 isHighSurrogate = false; 225 } 226 else { 227 // Trigger error. Found high surrogate not followed by low surrogate 228 buffer = null; 229 throw new Exception("Found high surrogate not followed by low surrogate"); 230 } 231 } 232 else if (Character.isHighSurrogate(ch)) { 233 isHighSurrogate = true; 234 } 235 else if (Character.isLowSurrogate(ch)) { 236 // Trigger error. Found low surrogate char without a preceding high surrogate 237 buffer = null; 238 throw new Exception("Found low surrogate char without a preceding high surrogate"); 239 } 240 if (status == XMLLightweightParser.TAIL) { 241 // Looking for the close tag 242 if (depth < 1 && ch == head.charAt(tailCount)) { 243 tailCount++; 244 if (tailCount == head.length()) { 245 // Close stanza found! 246 // Calculate the correct start,end position of the message into the buffer 247 int end = buffer.length() - readChar + (i + 1); 248 String msg = buffer.substring(startLastMsg, end); 249 // Add message to the list 250 foundMsg(msg); 251 startLastMsg = end; 252 } 253 } else { 254 tailCount = 0; 255 status = XMLLightweightParser.INSIDE; 256 } 257 } else if (status == XMLLightweightParser.PRETAIL) { 258 if (ch == XMLLightweightParser.CDATA_START[cdataOffset]) { 259 cdataOffset++; 260 if (cdataOffset == XMLLightweightParser.CDATA_START.length) { 261 status = XMLLightweightParser.INSIDE_CDATA; 262 cdataOffset = 0; 263 continue; 264 } 265 } else { 266 cdataOffset = 0; 267 status = XMLLightweightParser.INSIDE; 268 } 269 if (ch == '/') { 270 status = XMLLightweightParser.TAIL; 271 depth--; 272 } 273 else if (ch == '!') { 274 // This is a <! (comment) so ignore it 275 status = XMLLightweightParser.INSIDE; 276 } 277 else { 278 depth++; 279 } 280 } else if (status == XMLLightweightParser.VERIFY_CLOSE_TAG) { 281 if (ch == '>') { 282 depth--; 283 status = XMLLightweightParser.OUTSIDE; 284 if (depth < 1) { 285 // Found a tag in the form <tag /> 286 int end = buffer.length() - readChar + (i + 1); 287 String msg = buffer.substring(startLastMsg, end); 288 // Add message to the list 289 foundMsg(msg); 290 startLastMsg = end; 291 } 292 } else if (ch == '<') { 293 status = XMLLightweightParser.PRETAIL; 294 insideChildrenTag = true; 295 } else { 296 status = XMLLightweightParser.INSIDE; 297 } 298 } else if (status == XMLLightweightParser.INSIDE_PARAM_VALUE) { 299 300 if (ch == '"') { 301 status = XMLLightweightParser.INSIDE; 302 } 303 } else if (status == XMLLightweightParser.INSIDE_CDATA) { 304 if (ch == XMLLightweightParser.CDATA_END[cdataOffset]) { 305 cdataOffset++; 306 if (cdataOffset == XMLLightweightParser.CDATA_END.length) { 307 status = XMLLightweightParser.OUTSIDE; 308 cdataOffset = 0; 309 } 310 } else if (cdataOffset == XMLLightweightParser.CDATA_END.length-1 && ch == XMLLightweightParser.CDATA_END[cdataOffset - 1]) { 311 // if we are looking for the last CDATA_END char, and we instead found an extra ']' 312 // char, leave cdataOffset as is and proceed to the next char. This could be a case 313 // where the XML character data ends with multiple square braces. For Example ]]]> 314 } else { 315 cdataOffset = 0; 316 } 317 } else if (status == XMLLightweightParser.INSIDE) { 318 if (ch == XMLLightweightParser.CDATA_START[cdataOffset]) { 319 cdataOffset++; 320 if (cdataOffset == XMLLightweightParser.CDATA_START.length) { 321 status = XMLLightweightParser.INSIDE_CDATA; 322 cdataOffset = 0; 323 continue; 324 } 325 } else { 326 cdataOffset = 0; 327 status = XMLLightweightParser.INSIDE; 328 } 329 if (ch == '"') { 330 status = XMLLightweightParser.INSIDE_PARAM_VALUE; 331 } else if (ch == '>') { 332 status = XMLLightweightParser.OUTSIDE; 333 if (insideRootTag && (head.length() == 14 || head.length() == 5 || head.length() == 13)) { 334 final String headString = head.toString(); 335 if ("stream:stream>".equals(headString) 336 || "?xml>".equals(headString)) { 337 // Found closing stream:stream 338 int end = buffer.length() - readChar + (i + 1); 339 // Skip LF, CR and other "weird" characters that could appear 340 while (startLastMsg < end && '<' != buffer.charAt(startLastMsg)) { 341 startLastMsg++; 342 } 343 String msg = buffer.substring(startLastMsg, end); 344 foundMsg(msg); 345 startLastMsg = end; 346 } 347 } 348 insideRootTag = false; 349 } else if (ch == '/') { 350 status = XMLLightweightParser.VERIFY_CLOSE_TAG; 351 } 352 } else if (status == XMLLightweightParser.HEAD) { 353 if (Character.isWhitespace(ch) || ch == '>') { 354 // Append > to head to allow searching </tag> 355 head.append('>'); 356 if(ch == '>') 357 status = XMLLightweightParser.OUTSIDE; 358 else 359 status = XMLLightweightParser.INSIDE; 360 insideRootTag = true; 361 insideChildrenTag = false; 362 continue; 363 } 364 else if (ch == '/' && head.length() > 0) { 365 status = XMLLightweightParser.VERIFY_CLOSE_TAG; 366 depth--; 367 } 368 head.append(ch); 369 370 } else if (status == XMLLightweightParser.INIT) { 371 if (ch == '<') { 372 status = XMLLightweightParser.HEAD; 373 depth = 1; 374 } 375 else { 376 startLastMsg++; 377 } 378 } else if (status == XMLLightweightParser.OUTSIDE) { 379 if (ch == '<') { 380 status = XMLLightweightParser.PRETAIL; 381 cdataOffset = 1; 382 insideChildrenTag = true; 383 } 384 } 385 } 386 if (head.length() == 15 || head.length() == 14) { 387 final String headString = head.toString(); 388 if ("/stream:stream>".equals(headString)) { 389 foundMsg("</stream:stream>"); 390 } 391 } 392 } 393 394 /** 395 * This method verifies if the provided argument contains at least one numeric character reference ( 396 * <code>CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';</code>) for which the decimal or hexidecimal 397 * character value refers to an invalid XML 1.0 character. 398 * 399 * @param string 400 * The input string 401 * @return {@code true} if the input string contains an invalid numeric character reference, {@code false} 402 * otherwise. 403 * @see http://www.w3.org/TR/2008/REC-xml-20081126/#dt-charref 404 */ hasIllegalCharacterReferences(String string)405 public static boolean hasIllegalCharacterReferences(String string) { 406 // If there's no character reference, don't bother to do more specific checking. 407 final Matcher matcher = XML_HAS_CHARREF.matcher(string); 408 409 while (matcher.find()) { 410 final String decValue = matcher.group(2); 411 if (decValue != null) { 412 final int value = Integer.parseInt(decValue); 413 if (!isLegalXmlCharacter(value)) { 414 return true; 415 } else { 416 continue; 417 } 418 } 419 420 final String hexValue = matcher.group(3); 421 if (hexValue != null) { 422 final int value = Integer.parseInt(hexValue, 16); 423 if (!isLegalXmlCharacter(value)) { 424 return true; 425 } else { 426 continue; 427 } 428 } 429 430 // This is bad. The XML_HAS_CHARREF expression should have a hit for either the decimal 431 // or the heximal notation. 432 throw new IllegalStateException( 433 "An error occurred while searching for illegal character references in the value [" + string + "]."); 434 } 435 436 return false; 437 } 438 439 /** 440 * Verifies if the codepoint value represents a valid character as defined in paragraph 2.2 of 441 * "Extensible Markup Language (XML) 1.0 (Fifth Edition)" 442 * 443 * @param value 444 * the codepoint 445 * @return {@code true} if the codepoint is a valid charater per XML 1.0 definition, {@code false} otherwise. 446 * @see http://www.w3.org/TR/2008/REC-xml-20081126/#NT-Char 447 */ isLegalXmlCharacter(int value)448 public static boolean isLegalXmlCharacter(int value) { 449 return value == 0x9 || value == 0xA || value == 0xD || (value >= 0x20 && value <= 0xD7FF) 450 || (value >= 0xE000 && value <= 0xFFFD) || (value >= 0x10000 && value <= 0x10FFFF); 451 } 452 453 private static class PropertyListener implements PropertyEventListener { 454 @Override propertySet(String property, Map<String, Object> params)455 public void propertySet(String property, Map<String, Object> params) { 456 if (MAX_PROPERTY_NAME.equals(property)) { 457 String value = (String) params.get("value"); 458 if (value != null) { 459 maxBufferSize = Integer.parseInt(value); 460 } 461 } 462 } 463 464 @Override propertyDeleted(String property, Map<String, Object> params)465 public void propertyDeleted(String property, Map<String, Object> params) { 466 if (MAX_PROPERTY_NAME.equals(property)) { 467 // Use default value when none was specified 468 maxBufferSize = 1048576; 469 } 470 } 471 472 @Override xmlPropertySet(String property, Map<String, Object> params)473 public void xmlPropertySet(String property, Map<String, Object> params) { 474 // Do nothing 475 } 476 477 @Override xmlPropertyDeleted(String property, Map<String, Object> params)478 public void xmlPropertyDeleted(String property, Map<String, Object> params) { 479 // Do nothing 480 } 481 } 482 } 483