1 // Licensed under a 3-clause BSD style license - see LICENSE.rst
2 
3 #include "tokenizer.h"
4 
create_tokenizer(char delimiter,char comment,char quotechar,char expchar,int fill_extra_cols,int strip_whitespace_lines,int strip_whitespace_fields,int use_fast_converter)5 tokenizer_t *create_tokenizer(char delimiter, char comment, char quotechar, char expchar,
6                               int fill_extra_cols, int strip_whitespace_lines,
7                               int strip_whitespace_fields, int use_fast_converter)
8 {
9     // Create the tokenizer in memory
10     tokenizer_t *tokenizer = (tokenizer_t *) malloc(sizeof(tokenizer_t));
11 
12     // Initialize the tokenizer fields
13     tokenizer->source = NULL;
14     tokenizer->source_len = 0;
15     tokenizer->source_pos = 0;
16     tokenizer->delimiter = delimiter;
17     tokenizer->comment = comment;
18     tokenizer->quotechar = quotechar;
19     tokenizer->expchar = expchar;
20     tokenizer->newline = '\n';
21     tokenizer->output_cols = NULL;
22     tokenizer->col_ptrs = NULL;
23     tokenizer->output_len = NULL;
24     tokenizer->num_cols = 0;
25     tokenizer->num_rows = 0;
26     tokenizer->fill_extra_cols = fill_extra_cols;
27     tokenizer->state = START_LINE;
28     tokenizer->code = NO_ERROR;
29     tokenizer->iter_col = 0;
30     tokenizer->curr_pos = NULL;
31     tokenizer->strip_whitespace_lines = strip_whitespace_lines;
32     tokenizer->strip_whitespace_fields = strip_whitespace_fields;
33     tokenizer->use_fast_converter = use_fast_converter;
34     tokenizer->comment_lines = (char *) malloc(INITIAL_COMMENT_LEN);
35     tokenizer->comment_pos = 0;
36     tokenizer->comment_lines_len = 0;
37 
38     // This is a bit of a hack -- buf holds an empty string to represent
39     // empty field values
40     tokenizer->buf = calloc(2, sizeof(char));
41 
42     // By default both \n and \r are accepted as newline, unless one of
43     // them has also been specified as field delimiter
44     if (tokenizer->delimiter == '\n')
45         tokenizer->newline = '\r';
46 
47     return tokenizer;
48 }
49 
50 
/*
  Release the output of a previous tokenizing pass: every column string,
  the array holding them, the per-column write pointers and the per-column
  lengths. The three array pointers are reset to NULL afterwards so that the
  tokenizer can be reused. tokenizer->source is deliberately left alone; it
  is not owned by the tokenizer.
*/
void delete_data(tokenizer_t *tokenizer)
{
    int col;
    char **columns = tokenizer->output_cols;

    // Column strings only exist if the column array itself was allocated
    if (columns != NULL)
    {
        for (col = 0; col < tokenizer->num_cols; ++col)
            free(columns[col]);
    }

    free(columns);
    free(tokenizer->col_ptrs);
    free(tokenizer->output_len);

    // Clear the pointers so a later pass never touches freed memory
    tokenizer->output_cols = NULL;
    tokenizer->col_ptrs = NULL;
    tokenizer->output_len = NULL;
}
74 
75 
/*
  Destroy a tokenizer: release its output data, its comment buffer, its
  empty-field buffer and finally the tokenizer structure itself.
  Passing NULL is a no-op, mirroring free(NULL).
*/
void delete_tokenizer(tokenizer_t *tokenizer)
{
    if (tokenizer == NULL)
        return;

    delete_data(tokenizer);
    free(tokenizer->comment_lines);
    free(tokenizer->buf);
    free(tokenizer);
}
83 
84 
/*
  Double the capacity of the column string output_cols[index], zero the
  newly added half, and re-point col_ptrs[index] at the same offset within
  the (possibly moved) buffer.
*/
void resize_col(tokenizer_t *self, int index)
{
    // Remember how far into the column string col_ptrs[index] points.
    // size_t is used because long is only 32 bits wide on LLP64 platforms
    // and would truncate the offset for very large columns.
    const size_t offset = (size_t) (self->col_ptrs[index] - self->output_cols[index]);
    const size_t old_len = self->output_len[index];

    // Double the size of the column string
    // NOTE(review): a failed realloc() is not detected here; the function
    // has no way of reporting it to its callers
    self->output_cols[index] = (char *) realloc(self->output_cols[index],
                                                2 * old_len * sizeof(char));

    // Set the second (newly allocated) half of the column string to all zeros
    memset(self->output_cols[index] + old_len, 0, old_len * sizeof(char));

    self->output_len[index] = 2 * old_len;
    // realloc() might move the address in memory, so we have to move
    // col_ptrs[index] to an offset of the new address
    self->col_ptrs[index] = self->output_cols[index] + offset;
}
104 
105 
/*
  Grow the comment buffer to comment_pos + 1 bytes, i.e. just enough to
  store one more character at the current write position, zero the bytes
  between the previously recorded length and the new one, and record the
  new length in comment_lines_len.
*/
void resize_comments(tokenizer_t *self)
{
    char *grown = (char *) realloc(self->comment_lines, self->comment_pos + 1);

    self->comment_lines = grown;

    // Zero everything between the old recorded length and the new end
    memset(grown + self->comment_lines_len, 0,
           self->comment_pos + 1 - self->comment_lines_len);

    self->comment_lines_len = self->comment_pos + 1;
}
117 
118 /*
119   Resize the column string if necessary and then append c to the
120   end of the column string, incrementing the column position pointer.
121 */
push(tokenizer_t * self,char c,int col)122 static inline void push(tokenizer_t *self, char c, int col)
123 {
124     if (self->col_ptrs[col] - self->output_cols[col] >=
125         self->output_len[col])
126     {
127         resize_col(self, col);
128     }
129 
130     *self->col_ptrs[col]++ = c;
131 }
132 
133 
134 /*
135   Resize the comment string if necessary and then append c to the
136   end of the comment string.
137 */
push_comment(tokenizer_t * self,char c)138 static inline void push_comment(tokenizer_t *self, char c)
139 {
140     if (self->comment_pos >= self->comment_lines_len)
141     {
142         resize_comments(self);
143     }
144     self->comment_lines[self->comment_pos++] = c;
145 }
146 
147 
/*
  Terminate the comment line currently being collected with a null byte.
  If nothing was collected for it (the comment string is still empty, or the
  previous character is the terminator of the preceding comment), '\x01' is
  stored first as an empty-comment marker.
*/
static inline void end_comment(tokenizer_t *self)
{
    int nothing_stored;

    if (self->comment_pos == 0)
        nothing_stored = 1;
    else
        nothing_stored = (self->comment_lines[self->comment_pos - 1] == '\x00');

    // Signal empty comment by inserting \x01
    if (nothing_stored)
        push_comment(self, '\x01');

    push_comment(self, '\x00');
}
157 
158 
// Append c to the column currently being tokenized; expects `self` and
// `col` to be in scope at the point of use
#define PUSH(c) push(self, (c), col)


/* Set the state to START_FIELD and begin with the assumption that
   the field is entirely whitespace in order to handle the possibility
   that the comment character is found before any non-whitespace even
   if whitespace stripping is disabled.
   Expects `self` and `whitespace` to be in scope at the point of use.
   Wrapped in do/while (0) so that both assignments stay together when the
   macro is used as the body of an unbraced if/else or loop.
*/
#define BEGIN_FIELD()                           \
    do {                                        \
        self->state = START_FIELD;              \
        whitespace = 1;                         \
    } while (0)
170 
171 
172 /*
173   First, backtrack to eliminate trailing whitespace if strip_whitespace_fields
174   is true. If the field is empty, push '\x01' as a marker.
175   Append a null byte to the end of the column string as a field delimiting marker.
176   Increment the variable col if we are tokenizing data.
177 */
end_field(tokenizer_t * self,int * col,int header)178 static inline void end_field(tokenizer_t *self, int *col, int header)
179 {
180     if (self->strip_whitespace_fields &&
181             self->col_ptrs[*col] != self->output_cols[*col])
182     {
183         --self->col_ptrs[*col];
184         while (*self->col_ptrs[*col] == ' ' || *self->col_ptrs[*col] == '\t')
185         {
186             *self->col_ptrs[*col]-- = '\x00';
187         }
188         ++self->col_ptrs[*col];
189     }
190     if (self->col_ptrs[*col] == self->output_cols[*col] ||
191             self->col_ptrs[*col][-1] == '\x00')
192     {
193         push(self, '\x01', *col);
194     }
195     push(self, '\x00', *col);
196     if (!header) {
197         ++*col;
198     }
199 }
200 
201 
// Finish the current field; expects `self`, `col` and `header` to be in
// scope at the point of use
#define END_FIELD() end_field(self, &col, header)


// Set the error code to c for later retrieval and return c.
// Expects `self` to be in scope. The argument is parenthesized so that any
// expression can be passed safely; note that it is evaluated twice.
#define RETURN(c)                                               \
    do {                                                        \
        self->code = (c);                                       \
        return (c);                                             \
    } while (0)
211 
212 
213 /*
214   If we are tokenizing the header, end after the first line.
215   Handle the possibility of insufficient columns appropriately;
216   if fill_extra_cols=1, then append empty fields, but otherwise
217   return an error. Increment our row count and possibly end if
218   all the necessary rows have already been parsed.
219 */
end_line(tokenizer_t * self,int col,int header,int end,tokenizer_state * old_state)220 static inline int end_line(tokenizer_t *self, int col, int header, int end,
221                            tokenizer_state *old_state)
222 {
223     if (header)
224     {
225         ++self->source_pos;
226         RETURN(NO_ERROR);
227     }
228     else if (self->fill_extra_cols)
229     {
230         while (col < self->num_cols)
231         {
232                 PUSH('\x01');
233             END_FIELD();
234         }
235     }
236     else if (col < self->num_cols)
237     {
238         RETURN(NOT_ENOUGH_COLS);
239     }
240 
241     ++self->num_rows;
242     *old_state = START_LINE;
243 
244     if (end != -1 && self->num_rows == end)
245     {
246         ++self->source_pos;
247         RETURN(NO_ERROR);
248     }
249     return -1;
250 }
251 
252 
// Finish the current line via end_line(); if it returns anything other than
// -1, return the stored error code from the enclosing function.
// Expects `self`, `col`, `header`, `end` and `old_state` to be in scope.
// Wrapped in do/while (0) so that the embedded `if` cannot capture an `else`
// that follows the macro invocation.
#define END_LINE()                                                  \
    do {                                                            \
        if (end_line(self, col, header, end, &old_state) != -1)     \
            return self->code;                                      \
    } while (0)
254 
255 
/*
  Advance source_pos past `offset` lines of the input.

  Only lines that are not comment lines and contain at least one significant
  character are counted; spaces and tabs are insignificant when
  strip_whitespace_lines is set. A line is a comment line when its first
  significant character is the comment character. Unless `header` is set,
  the text of comment lines is collected in the comment string.

  If the input runs out first, INVALID_LINE is returned when `header` is set
  and NO_ERROR otherwise; NO_ERROR is also returned on success. The returned
  code is stored in self->code as well.
*/
int skip_lines(tokenizer_t *self, int offset, int header)
{
    int n_signif = 0;    // Significant characters seen on the current line
    int in_comment = 0;  // Whether the current line is a comment line
    int skipped = 0;     // Lines counted so far
    int is_blank;
    char ch;

    while (skipped < offset)
    {
        if (self->source_pos >= self->source_len)
        {
            if (header)
            {
                RETURN(INVALID_LINE); // header line is required
            }
            RETURN(NO_ERROR); // no data in input
        }

        ch = self->source[self->source_pos];
        is_blank = (ch == ' ' || ch == '\t');

        if ((ch == '\n' || ch == '\r') && ch != self->delimiter)
        {
            // Treat \r\n as a single line ending
            if (ch == '\r' && self->source_pos < self->source_len - 1 &&
                self->source[self->source_pos + 1] == '\n')
            {
                ++self->source_pos;
            }

            if (in_comment)
            {
                if (!header)
                    end_comment(self);
            }
            else if (n_signif > 0)
            {
                ++skipped;
            }

            // The next line starts out empty and non-commented
            n_signif = 0;
            in_comment = 0;
        }
        else if (is_blank && self->strip_whitespace_lines)
        {
            // Stripped whitespace is not significant, but it is still part
            // of a comment's text
            if (in_comment && !header)
                push_comment(self, ch);
        }
        else
        {
            if (n_signif == 0 && self->comment != 0 && ch == self->comment)
                in_comment = 1; // Comment line
            else if (in_comment && !header)
                push_comment(self, ch);

            // Significant character encountered
            ++n_signif;
        }

        ++self->source_pos;
    }

    RETURN(NO_ERROR);
}
311 
312 
/*
  Tokenize self->source, starting at self->source_pos, with a character
  driven state machine. The fields of each column are appended to that
  column's string in self->output_cols, each one terminated by a null byte
  (see end_field()). Output from a previous call is released first.

  end      -- number of rows to read; 0 returns right after the output
              structures have been allocated, -1 means no limit
  header   -- if nonzero, all fields are stored in a single column and
              tokenizing stops after the first line (see end_line())
  num_cols -- number of columns to allocate when header is zero

  Returns NO_ERROR/0 on success, or TOO_MANY_COLS / NOT_ENOUGH_COLS; the
  code passed to RETURN() is also stored in self->code.
*/
int tokenize(tokenizer_t *self, int end, int header, int num_cols)
{
    char c; // Input character
    int col = 0; // Current column ignoring possibly excluded columns
    tokenizer_state old_state = START_LINE; // Last state the tokenizer was in before CR mode
    int i = 0;
    int whitespace = 1;
    delete_data(self); // Clear old reading data
    self->num_rows = 0;
    // NOTE(review): this assumes the comment buffer still holds at least
    // INITIAL_COMMENT_LEN bytes; resize_comments() reallocates it to
    // comment_pos + 1 bytes, which may be fewer -- confirm with the callers
    self->comment_lines_len = INITIAL_COMMENT_LEN;

    if (header)
        self->num_cols = 1; // Store header output in one column
    else
        self->num_cols = num_cols;

    // Allocate memory for structures used during tokenization
    // NOTE(review): none of these allocations is checked for failure
    self->output_cols = (char **) malloc(self->num_cols * sizeof(char *));
    self->col_ptrs = (char **) malloc(self->num_cols * sizeof(char *));
    self->output_len = (size_t *) malloc(self->num_cols * sizeof(size_t));

    for (i = 0; i < self->num_cols; ++i)
    {
        self->output_cols[i] = (char *) calloc(1, INITIAL_COL_SIZE *
                                               sizeof(char));
        // Make each col_ptrs pointer point to the beginning of the
        // column string
        self->col_ptrs[i] = self->output_cols[i];
        self->output_len[i] = INITIAL_COL_SIZE;
    }

    if (end == 0)
        RETURN(NO_ERROR); // Don't read if end == 0

    self->state = START_LINE;

    // Loop until all of self->source has been read; one extra iteration is
    // made at source_pos == source_len with a synthetic newline so that the
    // last line is terminated even without a trailing line ending
    while (self->source_pos < self->source_len + 1)
    {
        if (self->source_pos == self->source_len)
            c = self->newline;
        else
            c = self->source[self->source_pos];

        // Treat \r like \n unless \r is the delimiter or is itself the
        // configured newline character
        if (c == '\r' && c != self->delimiter && c != self->newline)
            c = '\n';

        switch (self->state)
        {
        case START_LINE:
            if (c == self->newline)
                break;
            else if ((c == ' ' || c == '\t') && self->strip_whitespace_lines)
                break;
            else if (self->comment != 0 && c == self->comment)
            {
                // Comment line; ignore
                self->state = COMMENT;
                break;
            }
            // Initialize variables for the beginning of line parsing
            col = 0;
            BEGIN_FIELD();
            // Parse in mode START_FIELD
            // (deliberate fall through)

        case START_FIELD:
            // Strip whitespace before field begins
            if ((c == ' ' || c == '\t') && self->strip_whitespace_fields)
                break;
            else if (!self->strip_whitespace_lines && self->comment != 0 &&
                     c == self->comment)
            {
                // Comment line, not caught earlier because of no stripping
                self->state = COMMENT;
                break;
            }
            // Handle newline characters first
            else if (c == self->newline)
            {
                if (self->strip_whitespace_lines)
                {
                    // Move on if the delimiter is whitespace, e.g.
                    // '1 2 3   '->['1','2','3']
                    if (self->delimiter == ' ' || self->delimiter == '\t')
                        ;
                    // Register an empty field if non-whitespace delimiter,
                    // e.g. '1,2, '->['1','2','']
                    else
                    {
                        if (col >= self->num_cols)
                            RETURN(TOO_MANY_COLS);
                        END_FIELD();
                    }
                }

                else if (!self->strip_whitespace_lines)
                {
                    // In this case we don't want to left-strip the field,
                    // so we backtrack
                    size_t tmp = self->source_pos;
                    --self->source_pos;

                    // NOTE(review): if source_pos has an unsigned type, the
                    // `>= 0` test below is always true and the `== -1` test
                    // relies on wraparound -- confirm the declared type
                    while (self->source_pos >= 0 &&
                           self->source[self->source_pos] != self->delimiter
                           && self->source[self->source_pos] != '\n'
                           && self->source[self->source_pos] != '\r')
                    {
                        --self->source_pos;
                    }

                    // Backtracked to line beginning
                    if (self->source_pos == -1
                        || self->source[self->source_pos] == '\n'
                        || self->source[self->source_pos] == '\r')
                    {
                        self->source_pos = tmp;
                    }
                    else
                    {
                        ++self->source_pos;

                        if (self->source_pos == tmp)
                            // No whitespace, just an empty field
                            ;
                        else
                            while (self->source_pos < tmp)
                            {
                                // Append whitespace characters
                                PUSH(self->source[self->source_pos]);
                                ++self->source_pos;
                            }

                        if (col >= self->num_cols)
                            RETURN(TOO_MANY_COLS);
                        END_FIELD(); // Whitespace counts as a field
                    }
                }

                END_LINE();
                self->state = START_LINE;
                break;
            }

            // Before proceeding with a new field check column does not exceed
            // number defined in header or from auto-detect to avoid segfaults
            // such as https://github.com/astropy/astropy/issues/9922
            else if (col >= self->num_cols)
                RETURN(TOO_MANY_COLS);
            else if (c == self->delimiter) // Field ends before it begins
            {
                END_FIELD();
                BEGIN_FIELD();
                break;
            }
            else if (c == self->quotechar) // Start parsing quoted field
            {
                self->state = START_QUOTED_FIELD;
                break;
            }
            else // Valid field character, parse again in FIELD mode
                self->state = FIELD;
            // (deliberate fall through)

        case FIELD:
            if (self->comment != 0 && c == self->comment && whitespace && col == 0)
                // No whitespace stripping, but the comment char is found
                // before any data, e.g. '  # a b c'
                self->state = COMMENT;
            else if (c == self->delimiter && self->source_pos < self->source_len)
            {
                // End of field, look for new field
                END_FIELD();
                BEGIN_FIELD();
            }
            else if (c == self->newline)
            {
                // Line ending, stop parsing both field and line
                END_FIELD();
                END_LINE();
                self->state = START_LINE;
            }
            else
            {
                if (c != ' ' && c != '\t')
                    whitespace = 0; // Field is not all whitespace
                PUSH(c);
            }
            break;

        case START_QUOTED_FIELD:
            if ((c == ' ' || c == '\t') && self->strip_whitespace_fields)
            {
                // Ignore initial whitespace
                break;
            }
            else if (c == self->quotechar)
            {
                // Lookahead check for double quote inside quoted field,
                // e.g. """cd" => "cd
                if (self->source_pos < self->source_len - 1)
                {
                    if (self->source[self->source_pos + 1] == self->quotechar)
                    {
                        self->state = QUOTED_FIELD_DOUBLE_QUOTE;
                        PUSH(c);
                        break;
                    }
                }
                // Parse rest of field normally, e.g. ""c
                self->state = FIELD;
            }
            else
            {
                // Valid field character, parse again in QUOTED_FIELD mode
                self->state = QUOTED_FIELD;
            }
            // (deliberate fall through; the next case looks at self->state
            // to tell how it was entered)

        case QUOTED_FIELD_NEWLINE:
            if (self->state == QUOTED_FIELD)
                ; // fall through
            // Ignore initial whitespace if strip_whitespace_lines and
            // newlines regardless
            else if (((c == ' ' || c == '\t') && self->strip_whitespace_lines)
                     || c == self->newline)
                break;
            else if (c == self->quotechar)
            {
                self->state = FIELD;
                break;
            }
            else
            {
                // Once data begins, parse it as a normal quoted field
                self->state = QUOTED_FIELD;
            }
            // (deliberate fall through)

        case QUOTED_FIELD:
            if (c == self->quotechar)
            {
                // Lookahead check for double quote inside quoted field,
                // e.g. "ab""cd" => ab"cd
                if (self->source_pos < self->source_len - 1)
                {
                    if (self->source[self->source_pos + 1] == self->quotechar)
                    {
                        self->state = QUOTED_FIELD_DOUBLE_QUOTE;
                        PUSH(c);
                        break;
                    }
                }
                // Parse rest of field normally, e.g. "ab"c
                self->state = FIELD;
            }
            else if (c == self->newline)
                self->state = QUOTED_FIELD_NEWLINE;
            else
            {
                PUSH(c);
            }
            break;

        case QUOTED_FIELD_DOUBLE_QUOTE:
            // Ignore the second double quote from "ab""cd" and parse rest of
            // field normally as quoted field.
            self->state = QUOTED_FIELD;
            break;

        case COMMENT:
            // Comment text is only collected when tokenizing data
            if (c == self->newline)
            {
                self->state = START_LINE;
                if (!header)
                    end_comment(self);
            }
            else if (!header)
                push_comment(self, c);
            break; // Keep looping until we find a newline

        }

        ++self->source_pos;
    }

    RETURN(0);
}
597 
598 
/*
  Compare at most n characters of str1 and str2, ignoring case.

  Comparison stops after n characters, at the end of str1, or at the first
  pair of characters that differ once lowercased. Returns 0 if the compared
  parts are equal (always the case for n == 0), otherwise the difference
  between the first pair of differing lowercased characters.
*/
static int ascii_strncasecmp(const char *str1, const char *str2, size_t n)
{
    int char1 = 0;
    int char2 = 0;

    while (n-- > 0)
    {
        // tolower() is only defined for values representable as unsigned
        // char (and EOF), so convert before calling it
        char1 = tolower((unsigned char) *str1++);
        char2 = tolower((unsigned char) *str2++);

        if (char1 == '\0' || char1 != char2)
            break;
    }

    return (char1 - char2);
}
612 
613 
/*
  Convert the string str to a long using strtol() in base 10.

  self->code is set to CONVERSION_ERROR if str is not entirely consumed by
  the conversion (including when nothing is consumed), or to OVERFLOW_ERROR
  if strtol() reports ERANGE. self->code is left untouched on success.
  The value returned by strtol() is returned in every case.
*/
long str_to_long(tokenizer_t *self, char *str)
{
    char *stop = NULL;
    long value;

    errno = 0;
    value = strtol(str, &stop, 10);

    if (stop != str && *stop == '\0')
    {
        // Fully parsed; the only remaining failure is a range error
        if (errno == ERANGE)
            self->code = OVERFLOW_ERROR;
    }
    else
    {
        self->code = CONVERSION_ERROR;
    }

    return value;
}
628 
629 
/*
  Convert the string str to a double, using xstrtod() if use_fast_converter
  is set and strtod() otherwise.

  If the converter consumes the whole string, its value is returned and
  self->code is set to OVERFLOW_ERROR for ERANGE or CONVERSION_ERROR for
  EDOM. Otherwise the string is checked for an optionally signed,
  case-insensitive "nan" (optionally followed by a parenthesized payload,
  which is ignored), "inf" or "infinity". These are only accepted if nothing
  else follows; any other input sets self->code to CONVERSION_ERROR and
  returns 0.
*/
double str_to_double(tokenizer_t *self, char *str)
{
    char *tmp;
    double val;
    errno = 0;

    if (self->use_fast_converter)
        val = xstrtod(str, &tmp, '.', self->expchar, ',', 1);
    else
        val = strtod(str, &tmp);

    if (errno != EINVAL && tmp != str && *tmp == '\0')
    {
        // The whole string was consumed as a number
        if (errno == ERANGE)
        {
            self->code = OVERFLOW_ERROR;
        }
        else if (errno == EDOM)        // xstrtod signalling invalid exponents
        {
            self->code = CONVERSION_ERROR;
        }

        return val;
    }

    // Handle inf and nan values for xstrtod and platforms whose strtod
    // doesn't support this
    val = 1.0;
    tmp = str;

    if (*tmp == '+')
    {
        tmp++;
    }
    else if (*tmp == '-')
    {
        tmp++;
        val = -1.0;
    }

    if (0 == ascii_strncasecmp(tmp, "nan", 3))
    {
        tmp += 3;
        val = NAN;

        // Handle optional nan type specifier, e.g. "nan(0x1)"; this is ignored
        if (*tmp == '(')
        {
            char *rparen = tmp + 1;

            while (*rparen != '\0' && *rparen != ')')
                ++rparen;
            if (*rparen == ')')
                tmp = rparen + 1;
        }
    }
    else if (0 == ascii_strncasecmp(tmp, "inf", 3))
    {
        tmp += 3;
        if (0 == ascii_strncasecmp(tmp, "inity", 5))
        {
            tmp += 5;
        }
        val *= INFINITY;
    }
    else
    {
        // No NaN or inf found
        self->code = CONVERSION_ERROR;
        return 0;
    }

    // Reject trailing characters, so that e.g. "information" or "nanometer"
    // are not converted to inf or nan
    if (*tmp != '\0')
    {
        self->code = CONVERSION_ERROR;
        val = 0;
    }

    return val;
}
716 
717 // ---------------------------------------------------------------------------
718 // Implementation of xstrtod
719 
720 //
721 // strtod.c
722 //
723 // Convert string to double
724 //
725 // Copyright (C) 2002 Michael Ringgaard. All rights reserved.
726 //
727 // Redistribution and use in source and binary forms, with or without
728 // modification, are permitted provided that the following conditions
729 // are met:
730 //
731 // 1. Redistributions of source code must retain the above copyright
732 //    notice, this list of conditions and the following disclaimer.
733 // 2. Redistributions in binary form must reproduce the above copyright
734 //    notice, this list of conditions and the following disclaimer in the
735 //    documentation and/or other materials provided with the distribution.
736 // 3. Neither the name of the project nor the names of its contributors
737 //    may be used to endorse or promote products derived from this software
738 //    without specific prior written permission.
739 //
740 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
741 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
742 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
743 // ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
744 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
745 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
746 // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
747 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
748 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
749 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
750 // SUCH DAMAGE.
751 //
752 // -----------------------------------------------------------------------
753 // Modifications by Warren Weckesser, March 2011:
754 // * Rename strtod() to xstrtod().
755 // * Added decimal and sci arguments.
756 // * Skip trailing spaces.
757 // * Commented out the other functions.
758 // Modifications by Richard T Guy, August 2013:
759 // * Add tsep argument for thousands separator
760 // Modifications by Michael Mueller, August 2014:
761 // * Cache powers of 10 in memory to avoid rounding errors
762 // * Stop parsing decimals after 17 significant figures
763 // Modifications by Derek Homeier, August 2015:
764 // * Recognise alternative exponent characters passed in 'sci'; try automatic
765 //   detection of allowed Fortran formats with sci='A'
766 // * Require exactly 3 digits in exponent for Fortran-type format '8.7654+321'
767 // Modifications by Derek Homeier, September-December 2016:
768 // * Fixed some corner cases of very large or small exponents; proper return
769 // * do not increment num_digits until nonzero digit read in
770 //
771 
xstrtod(const char * str,char ** endptr,char decimal,char expchar,char tsep,int skip_trailing)772 double xstrtod(const char *str, char **endptr, char decimal,
773                char expchar, char tsep, int skip_trailing)
774 {
775     double number;
776     int exponent;
777     int negative;
778     char *p = (char *) str;
779     char exp;
780     char sci;
781     int num_digits;
782     int num_decimals;
783     int max_digits = 17;
784     int num_exp = 3;
785     int non_zero;
786     int n;
787     // Cache powers of 10 in memory
788     static double e[] = {1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10,
789                          1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20,
790                          1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30,
791                          1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40,
792                          1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50,
793                          1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60,
794                          1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70,
795                          1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80,
796                          1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90,
797                          1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100,
798                          1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110,
799                          1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120,
800                          1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130,
801                          1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140,
802                          1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150,
803                          1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160,
804                          1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170,
805                          1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180,
806                          1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190,
807                          1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200,
808                          1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210,
809                          1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220,
810                          1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230,
811                          1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240,
812                          1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250,
813                          1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260,
814                          1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270,
815                          1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280,
816                          1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290,
817                          1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300,
818                          1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308};
819     // Cache additional negative powers of 10
820     /* static double m[] = {1e-309, 1e-310, 1e-311, 1e-312, 1e-313, 1e-314,
821                          1e-315, 1e-316, 1e-317, 1e-318, 1e-319, 1e-320,
822                          1e-321, 1e-322, 1e-323}; */
823     errno = 0;
824 
825     // Skip leading whitespace
826     while (isspace(*p)) p++;
827 
828     // Handle optional sign
829     negative = 0;
830     switch (*p)
831     {
832     case '-': negative = 1; // Fall through to increment position
833     case '+': p++;
834     }
835 
836     // No numerical value following sign - make no conversion and return zero,
837     // resetting endptr to beginning of str (consistent with strtod behaviour)
838     // E.g. -1.e0 and -.0e1 are valid, -.e0 is not!
839     if (!(isdigit(*p) || (*p == decimal && isdigit(*(p + 1)))))
840     {
841         if (endptr) *endptr = (char *) str;
842         return 0e0;
843     }
844 
845     number = 0.;
846     exponent = 0;
847     num_digits = 0;
848     num_decimals = 0;
849     non_zero = 0;
850 
851     // Process string of digits
852     while (isdigit(*p))
853     {
854         if (num_digits < max_digits)
855         {
856             number = number * 10. + (*p - '0');
857             non_zero += (*p != '0');
858             if(non_zero) num_digits++;
859         }
860         else
861             ++exponent;
862 
863         p++;
864         p += (tsep != '\0' && *p == tsep);
865     }
866 
867     // Process decimal part
868     if (*p == decimal)
869     {
870         p++;
871 
872         while (num_digits < max_digits && isdigit(*p))
873         {
874             number = number * 10. + (*p - '0');
875             non_zero += (*p != '0');
876             if(non_zero) num_digits++;
877             num_decimals++;
878             p++;
879         }
880 
881         if (num_digits >= max_digits) // consume extra decimal digits
882             while (isdigit(*p))
883                 ++p;
884 
885         exponent -= num_decimals;
886     }
887 
888     // Exactly 0 - no precision loss/OverflowError
889     if (num_digits == 0) number = 0.0;
890 
891     // Correct for sign
892     if (negative) number = -number;
893 
894     // Process an exponent string
895     sci = toupper(expchar);
896     if (sci == 'A')
897     {
898         // check for possible Fortran exponential notations, including
899         // triple-digits with no character
900         exp = toupper(*p);
901         if (exp == 'E' || exp == 'D' || exp == 'Q' || *p == '+' || *p == '-')
902         {
903             // Handle optional sign
904             negative = 0;
905             switch (exp)
906             {
907             case '-':
908                 negative = 1;   // Fall through to increment pos
909             case '+':
910                 p++;
911                 break;
912             case 'E':
913             case 'D':
914             case 'Q':
915                 switch (*++p)
916                 {
917                 case '-':
918                     negative = 1;   // Fall through to increment pos
919                 case '+':
920                     p++;
921                 }
922             }
923 
924             // Process string of digits
925             n = 0;
926             while (isdigit(*p))
927             {
928                 n = n * 10 + (*p - '0');
929                 num_exp--;
930                 p++;
931             }
932             // Trigger error if not exactly three digits
933             if (num_exp != 0 && (exp == '+' || exp == '-'))
934             {
935                errno = EDOM;
936                number = 0.0;
937             }
938 
939             if (negative)
940                 exponent -= n;
941             else
942                 exponent += n;
943         }
944     }
945     else if (toupper(*p) == sci)
946     {
947         // Handle optional sign
948         negative = 0;
949         switch (*++p)
950         {
951         case '-':
952             negative = 1;   // Fall through to increment pos
953         case '+':
954             p++;
955         }
956 
957         // Process string of digits
958         n = 0;
959         while (isdigit(*p))
960         {
961             n = n * 10 + (*p - '0');
962             p++;
963         }
964 
965         if (negative)
966             exponent -= n;
967         else
968             exponent += n;
969     }
970 
971     // largest representable float64 is 1.7977e+308, closest to 0 ~4.94e-324,
    // but multiplying exponents in two steps gives slightly better precision
973     if (number != 0.0) {
974         if (exponent > 305)
975         {
976             if (exponent > 308)   // leading zeros already subtracted from exp
977                 number *= HUGE_VAL;
978             else
979             {
980                 number *= e[exponent-300];
981                 number *= 1.e300;
982             }
983         }
984         else if (exponent < -308) // subnormal
985         {
986             if (exponent < -616) // prevent invalid array access
987                 number = 0.;
988             else
989             {
990                 number /= e[-308-exponent];
991                 number *= 1.e-308;
992             }
993             // trigger warning if resolution is > ~1.e-15;
994             // strtod does so for |number| <~ 2.25e-308
995             // if (number > -4.94e-309 && number < 4.94e-309)
996             errno = ERANGE;
997         }
998         else if (exponent > 0)
999             number *= e[exponent];
1000         else if (exponent < 0)
1001             number /= e[-exponent];
1002 
1003         if (number >= HUGE_VAL || number <= -HUGE_VAL)
1004             errno = ERANGE;
1005     }
1006 
1007     if (skip_trailing) {
1008         // Skip trailing whitespace
1009         while (isspace(*p)) p++;
1010     }
1011 
1012     if (endptr) *endptr = p;
1013     return number;
1014 }
1015 
1016 
start_iteration(tokenizer_t * self,int col)1017 void start_iteration(tokenizer_t *self, int col)
1018 {
1019     // Begin looping over the column string with index col
1020     self->iter_col = col;
1021     // Start at the initial pointer position
1022     self->curr_pos = self->output_cols[col];
1023 }
1024 
1025 
next_field(tokenizer_t * self,int * size)1026 char *next_field(tokenizer_t *self, int *size)
1027 {
1028     char *tmp = self->curr_pos;
1029 
1030     // pass through the entire field until reaching the delimiter
1031     while (*self->curr_pos != '\x00')
1032     ++self->curr_pos;
1033 
1034     ++self->curr_pos; // next field begins after the delimiter
1035 
1036     if (*tmp == '\x01') // empty field; this is a hack
1037     {
1038         if (size)
1039             *size = 0;
1040         return self->buf;
1041     }
1042 
1043     else
1044     {
1045         if (size)
1046             *size = self->curr_pos - tmp - 1;
1047         return tmp;
1048     }
1049 }
1050 
1051 
get_line(char * ptr,size_t * len,size_t map_len)1052 char *get_line(char *ptr, size_t *len, size_t map_len)
1053 {
1054     size_t pos = 0;
1055 
1056     while (pos < map_len)
1057     {
1058         if (ptr[pos] == '\r')
1059         {
1060             *len = pos;
1061             // Windows line break (\r\n)
1062             if (pos != map_len - 1 && ptr[pos + 1] == '\n')
1063                 return ptr + pos + 2; // skip newline character
1064             else // Carriage return line break
1065                 return ptr + pos + 1;
1066         }
1067 
1068         else if (ptr[pos] == '\n')
1069         {
1070             *len = pos;
1071             return ptr + pos + 1;
1072         }
1073 
1074         ++pos;
1075     }
1076 
1077     // done with input
1078     return 0;
1079 }
1080 
1081 
reset_comments(tokenizer_t * self)1082 void reset_comments(tokenizer_t *self)
1083 {
1084     free(self->comment_lines);
1085     self->comment_pos = 0;
1086     self->comment_lines_len = INITIAL_COMMENT_LEN;
1087     self->comment_lines = (char *) malloc(INITIAL_COMMENT_LEN);
1088 }
1089