1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.utilities.io; 22 23 import de.lmu.ifi.dbs.elki.utilities.datastructures.BitsUtil; 24 25 /** 26 * Helper functionality for parsing. 27 * 28 * @author Erich Schubert 29 */ 30 public final class ParseUtil { 31 /** 32 * Private constructor. Static methods only. 33 */ ParseUtil()34 private ParseUtil() { 35 // Do not use. 36 } 37 38 /** 39 * Preallocated exceptions. 40 */ 41 public static final NumberFormatException EMPTY_STRING = new NumberFormatException("Parser called on an empty string.") { 42 private static final long serialVersionUID = 1L; 43 44 @Override 45 public synchronized Throwable fillInStackTrace() { 46 return this; 47 } 48 }; 49 50 /** 51 * Preallocated exceptions. 52 */ 53 public static final NumberFormatException EXPONENT_OVERFLOW = new NumberFormatException("Precision overflow for double exponent.") { 54 private static final long serialVersionUID = 1L; 55 56 @Override 57 public synchronized Throwable fillInStackTrace() { 58 return this; 59 } 60 }; 61 62 /** 63 * Preallocated exceptions. 64 */ 65 public static final NumberFormatException INVALID_EXPONENT = new NumberFormatException("Invalid exponent") { 66 private static final long serialVersionUID = 1L; 67 68 @Override 69 public synchronized Throwable fillInStackTrace() { 70 return this; 71 } 72 }; 73 74 /** 75 * Preallocated exceptions. 76 */ 77 public static final NumberFormatException TRAILING_CHARACTERS = new NumberFormatException("String sequence was not completely consumed.") { 78 private static final long serialVersionUID = 1L; 79 80 @Override 81 public synchronized Throwable fillInStackTrace() { 82 return this; 83 } 84 }; 85 86 /** 87 * Preallocated exceptions. 88 */ 89 public static final NumberFormatException PRECISION_OVERFLOW = new NumberFormatException("Precision overflow for long values.") { 90 private static final long serialVersionUID = 1L; 91 92 @Override 93 public synchronized Throwable fillInStackTrace() { 94 return this; 95 } 96 }; 97 98 /** 99 * Preallocated exceptions. 100 */ 101 public static final NumberFormatException NOT_A_NUMBER = new NumberFormatException("Number must start with a digit or dot.") { 102 private static final long serialVersionUID = 1L; 103 104 @Override 105 public synchronized Throwable fillInStackTrace() { 106 return this; 107 } 108 }; 109 110 /** 111 * Infinity pattern, with any capitalization 112 */ 113 private static final char[] INFINITY_PATTERN = { // 114 'I', 'n', 'f', 'i', 'n', 'i', 't', 'y', // 115 'i', 'N', 'F', 'I', 'N', 'I', 'T', 'Y' }; 116 117 /** Length of pattern */ 118 private static final int INFINITY_LENGTH = INFINITY_PATTERN.length >> 1; 119 120 /** 121 * Parse a double from a character sequence. 122 * 123 * In contrast to Javas {@link Double#parseDouble}, this will <em>not</em> 124 * create an object and thus is expected to put less load on the garbage 125 * collector. It will accept some more spellings of NaN and infinity, thus 126 * removing the need for checking for these independently. 127 * 128 * @param str String 129 * @param start Begin 130 * @param end End 131 * @return Double value 132 */ parseDouble(final byte[] str, final int start, final int end)133 public static double parseDouble(final byte[] str, final int start, final int end) { 134 if(start >= end) { 135 throw EMPTY_STRING; 136 } 137 // Current position and character. 138 int pos = start; 139 byte cur = str[pos]; 140 141 // Match for NaN spellings 142 if(matchNaN(str, cur, pos, end)) { 143 return Double.NaN; 144 } 145 // Match sign 146 boolean isNegative = (cur == '-'); 147 // Carefully consume the - character, update c and i: 148 if((isNegative || (cur == '+')) && (++pos < end)) { 149 cur = str[pos]; 150 } 151 if(matchInf(str, cur, pos, end)) { 152 return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; 153 } 154 155 // Begin parsing real numbers! 156 if(((cur < '0') || (cur > '9')) && (cur != '.')) { 157 throw NOT_A_NUMBER; 158 } 159 160 // Parse digits into a long, remember offset of decimal point. 161 long decimal = 0; 162 int decimalPoint = -1; 163 while(true) { 164 final int digit = cur - '0'; 165 if((digit >= 0) && (digit <= 9)) { 166 final long tmp = (decimal << 3) + (decimal << 1) + digit; 167 if(tmp >= decimal) { 168 decimal = tmp; // Otherwise, silently ignore the extra digits. 169 } 170 else if(++decimalPoint == 0) { // Because we ignored the digit 171 throw PRECISION_OVERFLOW; 172 } 173 } 174 else if((cur == '.') && (decimalPoint < 0)) { 175 decimalPoint = pos; 176 } 177 else { // No more digits, or a second dot. 178 break; 179 } 180 if(++pos >= end) { 181 break; 182 } 183 cur = str[pos]; 184 } 185 // We need the offset from the back for adjusting the exponent: 186 // Note that we need the current value of i! 187 decimalPoint = (decimalPoint >= 0) ? pos - decimalPoint - 1 : 0; 188 189 // Reads exponent. 190 int exp = 0; 191 if((pos + 1 < end) && ((cur == 'E') || (cur == 'e'))) { 192 cur = str[++pos]; 193 final boolean isNegativeExp = (cur == '-'); 194 if((isNegativeExp || (cur == '+')) && (++pos < end)) { 195 cur = str[pos]; 196 } 197 if((cur < '0') || (cur > '9')) { // At least one digit required. 198 throw INVALID_EXPONENT; 199 } 200 while(true) { 201 final int digit = cur - '0'; 202 if(digit < 0 || digit > 9) { 203 break; 204 } 205 exp = (exp << 3) + (exp << 1) + digit; 206 if(exp > Double.MAX_EXPONENT) { 207 throw EXPONENT_OVERFLOW; 208 } 209 if(++pos >= end) { 210 break; 211 } 212 cur = str[pos]; 213 } 214 exp = isNegativeExp ? -exp : exp; 215 } 216 // Adjust exponent by the offset of the dot in our long. 217 exp = decimalPoint > 0 ? (exp - decimalPoint) : exp; 218 if(pos != end) { 219 throw TRAILING_CHARACTERS; 220 } 221 222 return BitsUtil.lpow10(isNegative ? -decimal : decimal, exp); 223 } 224 225 /** 226 * Parse a double from a character sequence. 227 * 228 * In contrast to Javas {@link Double#parseDouble}, this will <em>not</em> 229 * create an object and thus is expected to put less load on the garbage 230 * collector. It will accept some more spellings of NaN and infinity, thus 231 * removing the need for checking for these independently. 232 * 233 * @param str String 234 * @return Double value 235 */ parseDouble(final CharSequence str)236 public static double parseDouble(final CharSequence str) { 237 return parseDouble(str, 0, str.length()); 238 } 239 240 /** 241 * Parse a double from a character sequence. 242 * 243 * In contrast to Javas {@link Double#parseDouble}, this will <em>not</em> 244 * create an object and thus is expected to put less load on the garbage 245 * collector. It will accept some more spellings of NaN and infinity, thus 246 * removing the need for checking for these independently. 247 * 248 * @param str String 249 * @param start Begin 250 * @param end End 251 * @return Double value 252 */ parseDouble(final CharSequence str, final int start, final int end)253 public static double parseDouble(final CharSequence str, final int start, final int end) { 254 if(start >= end) { 255 throw EMPTY_STRING; 256 } 257 // Current position and character. 258 int pos = start; 259 char cur = str.charAt(pos); 260 261 // Match for NaN spellings 262 if(matchNaN(str, cur, pos, end)) { 263 return Double.NaN; 264 } 265 // Match sign 266 boolean isNegative = (cur == '-'); 267 // Carefully consume the - character, update c and i: 268 if((isNegative || (cur == '+')) && (++pos < end)) { 269 cur = str.charAt(pos); 270 } 271 if(matchInf(str, cur, pos, end)) { 272 return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY; 273 } 274 275 // Begin parsing real numbers! 276 if(((cur < '0') || (cur > '9')) && (cur != '.')) { 277 throw NOT_A_NUMBER; 278 } 279 280 // Parse digits into a long, remember offset of decimal point. 281 long decimal = 0; 282 int decimalPoint = -1; 283 while(true) { 284 final int digit = cur - '0'; 285 if((digit >= 0) && (digit <= 9)) { 286 final long tmp = (decimal << 3) + (decimal << 1) + digit; 287 if(tmp >= decimal) { 288 decimal = tmp; // Otherwise, silently ignore the extra digits. 289 } 290 else if(++decimalPoint == 0) { // Because we ignored the digit 291 throw PRECISION_OVERFLOW; 292 } 293 } 294 else if((cur == '.') && (decimalPoint < 0)) { 295 decimalPoint = pos; 296 } 297 else { // No more digits, or a second dot. 298 break; 299 } 300 if(++pos >= end) { 301 break; 302 } 303 cur = str.charAt(pos); 304 } 305 // We need the offset from the back for adjusting the exponent: 306 // Note that we need the current value of i! 307 decimalPoint = (decimalPoint >= 0) ? pos - decimalPoint - 1 : 0; 308 309 // Reads exponent. 310 int exp = 0; 311 if((pos + 1 < end) && ((cur == 'E') || (cur == 'e'))) { 312 cur = str.charAt(++pos); 313 final boolean isNegativeExp = (cur == '-'); 314 if((isNegativeExp || (cur == '+')) && (++pos < end)) { 315 cur = str.charAt(pos); 316 } 317 if((cur < '0') || (cur > '9')) { // At least one digit required. 318 throw INVALID_EXPONENT; 319 } 320 while(true) { 321 final int digit = cur - '0'; 322 if(digit < 0 || digit > 9) { 323 break; 324 } 325 exp = (exp << 3) + (exp << 1) + digit; 326 if(exp > Double.MAX_EXPONENT) { 327 throw EXPONENT_OVERFLOW; 328 } 329 if(++pos >= end) { 330 break; 331 } 332 cur = str.charAt(pos); 333 } 334 exp = isNegativeExp ? -exp : exp; 335 } 336 // Adjust exponent by the offset of the dot in our long. 337 exp = decimalPoint > 0 ? (exp - decimalPoint) : exp; 338 if(pos != end) { 339 throw TRAILING_CHARACTERS; 340 } 341 342 return BitsUtil.lpow10(isNegative ? -decimal : decimal, exp); 343 } 344 345 /** 346 * Parse a long integer from a character sequence. 347 * 348 * @param str String 349 * @return Long value 350 */ parseLongBase10(final CharSequence str)351 public static long parseLongBase10(final CharSequence str) { 352 return parseLongBase10(str, 0, str.length()); 353 } 354 355 /** 356 * Parse a long integer from a character sequence. 357 * 358 * @param str String 359 * @param start Begin 360 * @param end End 361 * @return Long value 362 */ parseLongBase10(final CharSequence str, final int start, final int end)363 public static long parseLongBase10(final CharSequence str, final int start, final int end) { 364 // Current position and character. 365 int pos = start; 366 char cur = str.charAt(pos); 367 368 // Match sign 369 boolean isNegative = (cur == '-'); 370 // Carefully consume the - character, update c and i: 371 if((isNegative || (cur == '+')) && (++pos < end)) { 372 cur = str.charAt(pos); 373 } 374 375 // Begin parsing real numbers! 376 if((cur < '0') || (cur > '9')) { 377 throw NOT_A_NUMBER; 378 } 379 380 // Parse digits into a long, remember offset of decimal point. 381 long decimal = 0; 382 while(true) { 383 final int digit = cur - '0'; 384 if((digit >= 0) && (digit <= 9)) { 385 final long tmp = (decimal << 3) + (decimal << 1) + digit; 386 if(tmp < decimal) { 387 throw PRECISION_OVERFLOW; 388 } 389 decimal = tmp; 390 } 391 else { // No more digits, or a second dot. 392 break; 393 } 394 if(++pos < end) { 395 cur = str.charAt(pos); 396 } 397 else { 398 break; 399 } 400 } 401 if(pos != end) { 402 throw TRAILING_CHARACTERS; 403 } 404 405 return isNegative ? -decimal : decimal; 406 } 407 408 /** 409 * Parse an integer from a character sequence. 410 * 411 * @param str String 412 * @return Int value 413 */ parseIntBase10(final CharSequence str)414 public static int parseIntBase10(final CharSequence str) { 415 return parseIntBase10(str, 0, str.length()); 416 } 417 418 /** 419 * Parse an integer from a character sequence. 420 * 421 * @param str String 422 * @param start Begin 423 * @param end End 424 * @return int value 425 */ parseIntBase10(final CharSequence str, final int start, final int end)426 public static int parseIntBase10(final CharSequence str, final int start, final int end) { 427 // Current position and character. 428 int pos = start; 429 char cur = str.charAt(pos); 430 431 // Match sign 432 boolean isNegative = (cur == '-'); 433 // Carefully consume the - character, update c and i: 434 if((isNegative || (cur == '+')) && (++pos < end)) { 435 cur = str.charAt(pos); 436 } 437 438 // Begin parsing real numbers! 439 if((cur < '0') || (cur > '9')) { 440 throw NOT_A_NUMBER; 441 } 442 443 // Parse digits into a long, remember offset of decimal point. 444 int decimal = 0; 445 while(true) { 446 final int digit = cur - '0'; 447 if((digit >= 0) && (digit <= 9)) { 448 final int tmp = (decimal << 3) + (decimal << 1) + digit; 449 if(tmp < decimal) { 450 throw PRECISION_OVERFLOW; 451 } 452 decimal = tmp; 453 } 454 else { // No more digits, or a second dot. 455 break; 456 } 457 if(++pos < end) { 458 cur = str.charAt(pos); 459 } 460 else { 461 break; 462 } 463 } 464 if(pos != end) { 465 throw TRAILING_CHARACTERS; 466 } 467 468 return isNegative ? -decimal : decimal; 469 } 470 471 /** 472 * Match "inf", "infinity" in a number of different capitalizations. 473 * 474 * @param str String to match 475 * @param firstchar First character 476 * @param start Interval begin 477 * @param end Interval end 478 * @return {@code true} when infinity was recognized. 479 */ matchInf(byte[] str, byte firstchar, int start, int end)480 private static boolean matchInf(byte[] str, byte firstchar, int start, int end) { 481 final int len = end - start; 482 // The wonders of unicode. The infinity symbol \u221E is three bytes: 483 if(len == 3 && firstchar == -0x1E && str[start + 1] == -0x78 && str[start + 2] == -0x62) { 484 return true; 485 } 486 if((len != 3 && len != INFINITY_LENGTH) // 487 || (firstchar != 'I' && firstchar != 'i')) { 488 return false; 489 } 490 for(int i = 1, j = INFINITY_LENGTH + 1; i < INFINITY_LENGTH; i++, j++) { 491 final byte c = str[start + i]; 492 if(c != INFINITY_PATTERN[i] && c != INFINITY_PATTERN[j]) { 493 return false; 494 } 495 if(i == 2 && len == 3) { 496 return true; 497 } 498 } 499 return true; 500 } 501 502 /** 503 * Match "inf", "infinity" in a number of different capitalizations. 504 * 505 * @param str String to match 506 * @param firstchar First character 507 * @param start Interval begin 508 * @param end Interval end 509 * @return {@code true} when infinity was recognized. 510 */ matchInf(CharSequence str, char firstchar, int start, int end)511 private static boolean matchInf(CharSequence str, char firstchar, int start, int end) { 512 final int len = end - start; 513 // The wonders of unicode: infinity symbol 514 if(len == 1 && firstchar == '\u221E') { 515 return true; 516 } 517 if((len != 3 && len != INFINITY_LENGTH) // 518 || (firstchar != 'I' && firstchar != 'i')) { 519 return false; 520 } 521 for(int i = 1, j = INFINITY_LENGTH + 1; i < INFINITY_LENGTH; i++, j++) { 522 final char c = str.charAt(start + i); 523 if(c != INFINITY_PATTERN[i] && c != INFINITY_PATTERN[j]) { 524 return false; 525 } 526 if(i == 2 && len == 3) { 527 return true; 528 } 529 } 530 return true; 531 } 532 533 /** 534 * Match "NaN" in a number of different capitalizations. 535 * 536 * @param str String to match 537 * @param firstchar First character 538 * @param start Interval begin 539 * @param end Interval end 540 * @return {@code true} when NaN was recognized. 541 */ matchNaN(byte[] str, byte firstchar, int start, int end)542 private static boolean matchNaN(byte[] str, byte firstchar, int start, int end) { 543 final int len = end - start; 544 if(len < 2 || len > 3 || (firstchar != 'N' && firstchar != 'n')) { 545 return false; 546 } 547 final byte c1 = str[start + 1]; 548 if(c1 != 'a' && c1 != 'A') { 549 return false; 550 } 551 // Accept just "NA", too: 552 if(len == 2) { 553 return true; 554 } 555 final byte c2 = str[start + 2]; 556 return c2 == 'N' || c2 == 'n'; 557 } 558 559 /** 560 * Match "NaN" in a number of different capitalizations. 561 * 562 * @param str String to match 563 * @param firstchar First character 564 * @param start Interval begin 565 * @param end Interval end 566 * @return {@code true} when NaN was recognized. 567 */ matchNaN(CharSequence str, char firstchar, int start, int end)568 private static boolean matchNaN(CharSequence str, char firstchar, int start, int end) { 569 final int len = end - start; 570 if(len < 2 || len > 3 || (firstchar != 'N' && firstchar != 'n')) { 571 return false; 572 } 573 final char c1 = str.charAt(start + 1); 574 if(c1 != 'a' && c1 != 'A') { 575 return false; 576 } 577 // Accept just "NA", too: 578 if(len == 2) { 579 return true; 580 } 581 final char c2 = str.charAt(start + 2); 582 return c2 == 'N' || c2 == 'n'; 583 } 584 } 585