1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.utilities.io;
22 
23 import de.lmu.ifi.dbs.elki.utilities.datastructures.BitsUtil;
24 
25 /**
26  * Helper functionality for parsing.
27  *
28  * @author Erich Schubert
29  */
30 public final class ParseUtil {
31   /**
32    * Private constructor. Static methods only.
33    */
ParseUtil()34   private ParseUtil() {
35     // Do not use.
36   }
37 
38   /**
39    * Preallocated exceptions.
40    */
41   public static final NumberFormatException EMPTY_STRING = new NumberFormatException("Parser called on an empty string.") {
42     private static final long serialVersionUID = 1L;
43 
44     @Override
45     public synchronized Throwable fillInStackTrace() {
46       return this;
47     }
48   };
49 
50   /**
51    * Preallocated exceptions.
52    */
53   public static final NumberFormatException EXPONENT_OVERFLOW = new NumberFormatException("Precision overflow for double exponent.") {
54     private static final long serialVersionUID = 1L;
55 
56     @Override
57     public synchronized Throwable fillInStackTrace() {
58       return this;
59     }
60   };
61 
62   /**
63    * Preallocated exceptions.
64    */
65   public static final NumberFormatException INVALID_EXPONENT = new NumberFormatException("Invalid exponent") {
66     private static final long serialVersionUID = 1L;
67 
68     @Override
69     public synchronized Throwable fillInStackTrace() {
70       return this;
71     }
72   };
73 
74   /**
75    * Preallocated exceptions.
76    */
77   public static final NumberFormatException TRAILING_CHARACTERS = new NumberFormatException("String sequence was not completely consumed.") {
78     private static final long serialVersionUID = 1L;
79 
80     @Override
81     public synchronized Throwable fillInStackTrace() {
82       return this;
83     }
84   };
85 
86   /**
87    * Preallocated exceptions.
88    */
89   public static final NumberFormatException PRECISION_OVERFLOW = new NumberFormatException("Precision overflow for long values.") {
90     private static final long serialVersionUID = 1L;
91 
92     @Override
93     public synchronized Throwable fillInStackTrace() {
94       return this;
95     }
96   };
97 
98   /**
99    * Preallocated exceptions.
100    */
101   public static final NumberFormatException NOT_A_NUMBER = new NumberFormatException("Number must start with a digit or dot.") {
102     private static final long serialVersionUID = 1L;
103 
104     @Override
105     public synchronized Throwable fillInStackTrace() {
106       return this;
107     }
108   };
109 
110   /**
111    * Infinity pattern, with any capitalization
112    */
113   private static final char[] INFINITY_PATTERN = { //
114       'I', 'n', 'f', 'i', 'n', 'i', 't', 'y', //
115       'i', 'N', 'F', 'I', 'N', 'I', 'T', 'Y' };
116 
117   /** Length of pattern */
118   private static final int INFINITY_LENGTH = INFINITY_PATTERN.length >> 1;
119 
120   /**
121    * Parse a double from a character sequence.
122    *
123    * In contrast to Javas {@link Double#parseDouble}, this will <em>not</em>
124    * create an object and thus is expected to put less load on the garbage
125    * collector. It will accept some more spellings of NaN and infinity, thus
126    * removing the need for checking for these independently.
127    *
128    * @param str String
129    * @param start Begin
130    * @param end End
131    * @return Double value
132    */
parseDouble(final byte[] str, final int start, final int end)133   public static double parseDouble(final byte[] str, final int start, final int end) {
134     if(start >= end) {
135       throw EMPTY_STRING;
136     }
137     // Current position and character.
138     int pos = start;
139     byte cur = str[pos];
140 
141     // Match for NaN spellings
142     if(matchNaN(str, cur, pos, end)) {
143       return Double.NaN;
144     }
145     // Match sign
146     boolean isNegative = (cur == '-');
147     // Carefully consume the - character, update c and i:
148     if((isNegative || (cur == '+')) && (++pos < end)) {
149       cur = str[pos];
150     }
151     if(matchInf(str, cur, pos, end)) {
152       return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
153     }
154 
155     // Begin parsing real numbers!
156     if(((cur < '0') || (cur > '9')) && (cur != '.')) {
157       throw NOT_A_NUMBER;
158     }
159 
160     // Parse digits into a long, remember offset of decimal point.
161     long decimal = 0;
162     int decimalPoint = -1;
163     while(true) {
164       final int digit = cur - '0';
165       if((digit >= 0) && (digit <= 9)) {
166         final long tmp = (decimal << 3) + (decimal << 1) + digit;
167         if(tmp >= decimal) {
168           decimal = tmp; // Otherwise, silently ignore the extra digits.
169         }
170         else if(++decimalPoint == 0) { // Because we ignored the digit
171           throw PRECISION_OVERFLOW;
172         }
173       }
174       else if((cur == '.') && (decimalPoint < 0)) {
175         decimalPoint = pos;
176       }
177       else { // No more digits, or a second dot.
178         break;
179       }
180       if(++pos >= end) {
181         break;
182       }
183       cur = str[pos];
184     }
185     // We need the offset from the back for adjusting the exponent:
186     // Note that we need the current value of i!
187     decimalPoint = (decimalPoint >= 0) ? pos - decimalPoint - 1 : 0;
188 
189     // Reads exponent.
190     int exp = 0;
191     if((pos + 1 < end) && ((cur == 'E') || (cur == 'e'))) {
192       cur = str[++pos];
193       final boolean isNegativeExp = (cur == '-');
194       if((isNegativeExp || (cur == '+')) && (++pos < end)) {
195         cur = str[pos];
196       }
197       if((cur < '0') || (cur > '9')) { // At least one digit required.
198         throw INVALID_EXPONENT;
199       }
200       while(true) {
201         final int digit = cur - '0';
202         if(digit < 0 || digit > 9) {
203           break;
204         }
205         exp = (exp << 3) + (exp << 1) + digit;
206         if(exp > Double.MAX_EXPONENT) {
207           throw EXPONENT_OVERFLOW;
208         }
209         if(++pos >= end) {
210           break;
211         }
212         cur = str[pos];
213       }
214       exp = isNegativeExp ? -exp : exp;
215     }
216     // Adjust exponent by the offset of the dot in our long.
217     exp = decimalPoint > 0 ? (exp - decimalPoint) : exp;
218     if(pos != end) {
219       throw TRAILING_CHARACTERS;
220     }
221 
222     return BitsUtil.lpow10(isNegative ? -decimal : decimal, exp);
223   }
224 
225   /**
226    * Parse a double from a character sequence.
227    *
228    * In contrast to Javas {@link Double#parseDouble}, this will <em>not</em>
229    * create an object and thus is expected to put less load on the garbage
230    * collector. It will accept some more spellings of NaN and infinity, thus
231    * removing the need for checking for these independently.
232    *
233    * @param str String
234    * @return Double value
235    */
parseDouble(final CharSequence str)236   public static double parseDouble(final CharSequence str) {
237     return parseDouble(str, 0, str.length());
238   }
239 
240   /**
241    * Parse a double from a character sequence.
242    *
243    * In contrast to Javas {@link Double#parseDouble}, this will <em>not</em>
244    * create an object and thus is expected to put less load on the garbage
245    * collector. It will accept some more spellings of NaN and infinity, thus
246    * removing the need for checking for these independently.
247    *
248    * @param str String
249    * @param start Begin
250    * @param end End
251    * @return Double value
252    */
parseDouble(final CharSequence str, final int start, final int end)253   public static double parseDouble(final CharSequence str, final int start, final int end) {
254     if(start >= end) {
255       throw EMPTY_STRING;
256     }
257     // Current position and character.
258     int pos = start;
259     char cur = str.charAt(pos);
260 
261     // Match for NaN spellings
262     if(matchNaN(str, cur, pos, end)) {
263       return Double.NaN;
264     }
265     // Match sign
266     boolean isNegative = (cur == '-');
267     // Carefully consume the - character, update c and i:
268     if((isNegative || (cur == '+')) && (++pos < end)) {
269       cur = str.charAt(pos);
270     }
271     if(matchInf(str, cur, pos, end)) {
272       return isNegative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
273     }
274 
275     // Begin parsing real numbers!
276     if(((cur < '0') || (cur > '9')) && (cur != '.')) {
277       throw NOT_A_NUMBER;
278     }
279 
280     // Parse digits into a long, remember offset of decimal point.
281     long decimal = 0;
282     int decimalPoint = -1;
283     while(true) {
284       final int digit = cur - '0';
285       if((digit >= 0) && (digit <= 9)) {
286         final long tmp = (decimal << 3) + (decimal << 1) + digit;
287         if(tmp >= decimal) {
288           decimal = tmp; // Otherwise, silently ignore the extra digits.
289         }
290         else if(++decimalPoint == 0) { // Because we ignored the digit
291           throw PRECISION_OVERFLOW;
292         }
293       }
294       else if((cur == '.') && (decimalPoint < 0)) {
295         decimalPoint = pos;
296       }
297       else { // No more digits, or a second dot.
298         break;
299       }
300       if(++pos >= end) {
301         break;
302       }
303       cur = str.charAt(pos);
304     }
305     // We need the offset from the back for adjusting the exponent:
306     // Note that we need the current value of i!
307     decimalPoint = (decimalPoint >= 0) ? pos - decimalPoint - 1 : 0;
308 
309     // Reads exponent.
310     int exp = 0;
311     if((pos + 1 < end) && ((cur == 'E') || (cur == 'e'))) {
312       cur = str.charAt(++pos);
313       final boolean isNegativeExp = (cur == '-');
314       if((isNegativeExp || (cur == '+')) && (++pos < end)) {
315         cur = str.charAt(pos);
316       }
317       if((cur < '0') || (cur > '9')) { // At least one digit required.
318         throw INVALID_EXPONENT;
319       }
320       while(true) {
321         final int digit = cur - '0';
322         if(digit < 0 || digit > 9) {
323           break;
324         }
325         exp = (exp << 3) + (exp << 1) + digit;
326         if(exp > Double.MAX_EXPONENT) {
327           throw EXPONENT_OVERFLOW;
328         }
329         if(++pos >= end) {
330           break;
331         }
332         cur = str.charAt(pos);
333       }
334       exp = isNegativeExp ? -exp : exp;
335     }
336     // Adjust exponent by the offset of the dot in our long.
337     exp = decimalPoint > 0 ? (exp - decimalPoint) : exp;
338     if(pos != end) {
339       throw TRAILING_CHARACTERS;
340     }
341 
342     return BitsUtil.lpow10(isNegative ? -decimal : decimal, exp);
343   }
344 
345   /**
346    * Parse a long integer from a character sequence.
347    *
348    * @param str String
349    * @return Long value
350    */
parseLongBase10(final CharSequence str)351   public static long parseLongBase10(final CharSequence str) {
352     return parseLongBase10(str, 0, str.length());
353   }
354 
355   /**
356    * Parse a long integer from a character sequence.
357    *
358    * @param str String
359    * @param start Begin
360    * @param end End
361    * @return Long value
362    */
parseLongBase10(final CharSequence str, final int start, final int end)363   public static long parseLongBase10(final CharSequence str, final int start, final int end) {
364     // Current position and character.
365     int pos = start;
366     char cur = str.charAt(pos);
367 
368     // Match sign
369     boolean isNegative = (cur == '-');
370     // Carefully consume the - character, update c and i:
371     if((isNegative || (cur == '+')) && (++pos < end)) {
372       cur = str.charAt(pos);
373     }
374 
375     // Begin parsing real numbers!
376     if((cur < '0') || (cur > '9')) {
377       throw NOT_A_NUMBER;
378     }
379 
380     // Parse digits into a long, remember offset of decimal point.
381     long decimal = 0;
382     while(true) {
383       final int digit = cur - '0';
384       if((digit >= 0) && (digit <= 9)) {
385         final long tmp = (decimal << 3) + (decimal << 1) + digit;
386         if(tmp < decimal) {
387           throw PRECISION_OVERFLOW;
388         }
389         decimal = tmp;
390       }
391       else { // No more digits, or a second dot.
392         break;
393       }
394       if(++pos < end) {
395         cur = str.charAt(pos);
396       }
397       else {
398         break;
399       }
400     }
401     if(pos != end) {
402       throw TRAILING_CHARACTERS;
403     }
404 
405     return isNegative ? -decimal : decimal;
406   }
407 
408   /**
409    * Parse an integer from a character sequence.
410    *
411    * @param str String
412    * @return Int value
413    */
parseIntBase10(final CharSequence str)414   public static int parseIntBase10(final CharSequence str) {
415     return parseIntBase10(str, 0, str.length());
416   }
417 
418   /**
419    * Parse an integer from a character sequence.
420    *
421    * @param str String
422    * @param start Begin
423    * @param end End
424    * @return int value
425    */
parseIntBase10(final CharSequence str, final int start, final int end)426   public static int parseIntBase10(final CharSequence str, final int start, final int end) {
427     // Current position and character.
428     int pos = start;
429     char cur = str.charAt(pos);
430 
431     // Match sign
432     boolean isNegative = (cur == '-');
433     // Carefully consume the - character, update c and i:
434     if((isNegative || (cur == '+')) && (++pos < end)) {
435       cur = str.charAt(pos);
436     }
437 
438     // Begin parsing real numbers!
439     if((cur < '0') || (cur > '9')) {
440       throw NOT_A_NUMBER;
441     }
442 
443     // Parse digits into a long, remember offset of decimal point.
444     int decimal = 0;
445     while(true) {
446       final int digit = cur - '0';
447       if((digit >= 0) && (digit <= 9)) {
448         final int tmp = (decimal << 3) + (decimal << 1) + digit;
449         if(tmp < decimal) {
450           throw PRECISION_OVERFLOW;
451         }
452         decimal = tmp;
453       }
454       else { // No more digits, or a second dot.
455         break;
456       }
457       if(++pos < end) {
458         cur = str.charAt(pos);
459       }
460       else {
461         break;
462       }
463     }
464     if(pos != end) {
465       throw TRAILING_CHARACTERS;
466     }
467 
468     return isNegative ? -decimal : decimal;
469   }
470 
471   /**
472    * Match "inf", "infinity" in a number of different capitalizations.
473    *
474    * @param str String to match
475    * @param firstchar First character
476    * @param start Interval begin
477    * @param end Interval end
478    * @return {@code true} when infinity was recognized.
479    */
matchInf(byte[] str, byte firstchar, int start, int end)480   private static boolean matchInf(byte[] str, byte firstchar, int start, int end) {
481     final int len = end - start;
482     // The wonders of unicode. The infinity symbol \u221E is three bytes:
483     if(len == 3 && firstchar == -0x1E && str[start + 1] == -0x78 && str[start + 2] == -0x62) {
484       return true;
485     }
486     if((len != 3 && len != INFINITY_LENGTH) //
487         || (firstchar != 'I' && firstchar != 'i')) {
488       return false;
489     }
490     for(int i = 1, j = INFINITY_LENGTH + 1; i < INFINITY_LENGTH; i++, j++) {
491       final byte c = str[start + i];
492       if(c != INFINITY_PATTERN[i] && c != INFINITY_PATTERN[j]) {
493         return false;
494       }
495       if(i == 2 && len == 3) {
496         return true;
497       }
498     }
499     return true;
500   }
501 
502   /**
503    * Match "inf", "infinity" in a number of different capitalizations.
504    *
505    * @param str String to match
506    * @param firstchar First character
507    * @param start Interval begin
508    * @param end Interval end
509    * @return {@code true} when infinity was recognized.
510    */
matchInf(CharSequence str, char firstchar, int start, int end)511   private static boolean matchInf(CharSequence str, char firstchar, int start, int end) {
512     final int len = end - start;
513     // The wonders of unicode: infinity symbol
514     if(len == 1 && firstchar == '\u221E') {
515       return true;
516     }
517     if((len != 3 && len != INFINITY_LENGTH) //
518         || (firstchar != 'I' && firstchar != 'i')) {
519       return false;
520     }
521     for(int i = 1, j = INFINITY_LENGTH + 1; i < INFINITY_LENGTH; i++, j++) {
522       final char c = str.charAt(start + i);
523       if(c != INFINITY_PATTERN[i] && c != INFINITY_PATTERN[j]) {
524         return false;
525       }
526       if(i == 2 && len == 3) {
527         return true;
528       }
529     }
530     return true;
531   }
532 
533   /**
534    * Match "NaN" in a number of different capitalizations.
535    *
536    * @param str String to match
537    * @param firstchar First character
538    * @param start Interval begin
539    * @param end Interval end
540    * @return {@code true} when NaN was recognized.
541    */
matchNaN(byte[] str, byte firstchar, int start, int end)542   private static boolean matchNaN(byte[] str, byte firstchar, int start, int end) {
543     final int len = end - start;
544     if(len < 2 || len > 3 || (firstchar != 'N' && firstchar != 'n')) {
545       return false;
546     }
547     final byte c1 = str[start + 1];
548     if(c1 != 'a' && c1 != 'A') {
549       return false;
550     }
551     // Accept just "NA", too:
552     if(len == 2) {
553       return true;
554     }
555     final byte c2 = str[start + 2];
556     return c2 == 'N' || c2 == 'n';
557   }
558 
559   /**
560    * Match "NaN" in a number of different capitalizations.
561    *
562    * @param str String to match
563    * @param firstchar First character
564    * @param start Interval begin
565    * @param end Interval end
566    * @return {@code true} when NaN was recognized.
567    */
matchNaN(CharSequence str, char firstchar, int start, int end)568   private static boolean matchNaN(CharSequence str, char firstchar, int start, int end) {
569     final int len = end - start;
570     if(len < 2 || len > 3 || (firstchar != 'N' && firstchar != 'n')) {
571       return false;
572     }
573     final char c1 = str.charAt(start + 1);
574     if(c1 != 'a' && c1 != 'A') {
575       return false;
576     }
577     // Accept just "NA", too:
578     if(len == 2) {
579       return true;
580     }
581     final char c2 = str.charAt(start + 2);
582     return c2 == 'N' || c2 == 'n';
583   }
584 }
585