1 // Licensed under a 3-clause BSD style license - see LICENSE.rst
2
3 #include "tokenizer.h"
4
create_tokenizer(char delimiter,char comment,char quotechar,char expchar,int fill_extra_cols,int strip_whitespace_lines,int strip_whitespace_fields,int use_fast_converter)5 tokenizer_t *create_tokenizer(char delimiter, char comment, char quotechar, char expchar,
6 int fill_extra_cols, int strip_whitespace_lines,
7 int strip_whitespace_fields, int use_fast_converter)
8 {
9 // Create the tokenizer in memory
10 tokenizer_t *tokenizer = (tokenizer_t *) malloc(sizeof(tokenizer_t));
11
12 // Initialize the tokenizer fields
13 tokenizer->source = NULL;
14 tokenizer->source_len = 0;
15 tokenizer->source_pos = 0;
16 tokenizer->delimiter = delimiter;
17 tokenizer->comment = comment;
18 tokenizer->quotechar = quotechar;
19 tokenizer->expchar = expchar;
20 tokenizer->newline = '\n';
21 tokenizer->output_cols = NULL;
22 tokenizer->col_ptrs = NULL;
23 tokenizer->output_len = NULL;
24 tokenizer->num_cols = 0;
25 tokenizer->num_rows = 0;
26 tokenizer->fill_extra_cols = fill_extra_cols;
27 tokenizer->state = START_LINE;
28 tokenizer->code = NO_ERROR;
29 tokenizer->iter_col = 0;
30 tokenizer->curr_pos = NULL;
31 tokenizer->strip_whitespace_lines = strip_whitespace_lines;
32 tokenizer->strip_whitespace_fields = strip_whitespace_fields;
33 tokenizer->use_fast_converter = use_fast_converter;
34 tokenizer->comment_lines = (char *) malloc(INITIAL_COMMENT_LEN);
35 tokenizer->comment_pos = 0;
36 tokenizer->comment_lines_len = 0;
37
38 // This is a bit of a hack -- buf holds an empty string to represent
39 // empty field values
40 tokenizer->buf = calloc(2, sizeof(char));
41
42 // By default both \n and \r are accepted as newline, unless one of
43 // them has also been specified as field delimiter
44 if (tokenizer->delimiter == '\n')
45 tokenizer->newline = '\r';
46
47 return tokenizer;
48 }
49
50
/*
  Release the per-column output buffers together with the three arrays that
  index them, leaving the tokenizer ready for another tokenize() call.
*/
void delete_data(tokenizer_t *tokenizer)
{
    // tokenizer->source is intentionally left alone: it points to part of
    // an already freed Python object
    char **columns = tokenizer->output_cols;

    if (columns != NULL)
    {
        int idx = 0;

        while (idx < tokenizer->num_cols)
        {
            free(columns[idx]);
            ++idx;
        }
    }

    free(columns);
    free(tokenizer->col_ptrs);
    free(tokenizer->output_len);

    // Clear the pointers so freed memory is never reused when reading again
    tokenizer->output_len = NULL;
    tokenizer->col_ptrs = NULL;
    tokenizer->output_cols = NULL;
}
74
75
/*
  Destroy a tokenizer: free its column data, its comment buffer, the
  empty-field buffer and finally the tokenizer itself.

  Passing NULL is a no-op, mirroring free().
*/
void delete_tokenizer(tokenizer_t *tokenizer)
{
    if (tokenizer == NULL)
        return;

    delete_data(tokenizer);
    free(tokenizer->comment_lines);
    free(tokenizer->buf);
    free(tokenizer);
}
83
84
/*
  Double the capacity of the output string of column `index`, zero-filling
  the newly added half and re-basing col_ptrs[index] onto the (possibly
  moved) buffer.

  This function has no way to report failure to its caller, so an
  allocation failure or a size overflow terminates the process with
  abort() instead of continuing with an invalid buffer.
*/
void resize_col(tokenizer_t *self, int index)
{
    size_t old_len = self->output_len[index];
    // Offset of the write position inside the column string. The write
    // pointer never precedes the buffer start, so size_t is wide enough on
    // every platform (long is only 32 bits on LLP64).
    size_t offset = (size_t) (self->col_ptrs[index] - self->output_cols[index]);
    size_t new_len;
    char *grown;

    if (old_len > ((size_t) -1) / 2)
        abort(); // doubling would wrap around

    // A zero-length column still has to end up able to hold a character
    new_len = (old_len != 0) ? 2 * old_len : 1;

    // Keep the old pointer until realloc() is known to have succeeded
    grown = realloc(self->output_cols[index], new_len * sizeof(char));
    if (grown == NULL)
        abort();

    // Set the newly allocated part of the column string to all zeros
    memset(grown + old_len, 0, (new_len - old_len) * sizeof(char));

    self->output_cols[index] = grown;
    self->output_len[index] = new_len;
    // realloc() might move the address in memory, so col_ptrs[index] has to
    // be moved to the same offset from the new address
    self->col_ptrs[index] = grown + offset;
}
104
105
/*
  Grow the comment buffer so that it can hold the character at index
  comment_pos, i.e. to comment_pos + 1 bytes, zero-filling the added bytes.

  This function has no way to report failure to its caller, so an
  allocation failure terminates the process with abort() instead of
  continuing with an invalid buffer.
*/
void resize_comments(tokenizer_t *self)
{
    size_t old_len = (size_t) self->comment_lines_len;
    size_t new_len = (size_t) self->comment_pos + 1;
    // Keep the old pointer until realloc() is known to have succeeded
    char *grown = realloc(self->comment_lines, new_len * sizeof(char));

    if (grown == NULL)
        abort();

    // Zero the newly exposed tail. The guard keeps the length from
    // wrapping around if the recorded length is already larger.
    if (new_len > old_len)
        memset(grown + old_len, 0, (new_len - old_len) * sizeof(char));

    self->comment_lines = grown;
    self->comment_lines_len = self->comment_pos + 1;
}
117
118 /*
119 Resize the column string if necessary and then append c to the
120 end of the column string, incrementing the column position pointer.
121 */
push(tokenizer_t * self,char c,int col)122 static inline void push(tokenizer_t *self, char c, int col)
123 {
124 if (self->col_ptrs[col] - self->output_cols[col] >=
125 self->output_len[col])
126 {
127 resize_col(self, col);
128 }
129
130 *self->col_ptrs[col]++ = c;
131 }
132
133
134 /*
135 Resize the comment string if necessary and then append c to the
136 end of the comment string.
137 */
push_comment(tokenizer_t * self,char c)138 static inline void push_comment(tokenizer_t *self, char c)
139 {
140 if (self->comment_pos >= self->comment_lines_len)
141 {
142 resize_comments(self);
143 }
144 self->comment_lines[self->comment_pos++] = c;
145 }
146
147
/*
  Terminate the current comment line with a null byte. A comment with no
  content is marked with a single \x01 so that it is not lost.
*/
static inline void end_comment(tokenizer_t *self)
{
    // A comment is empty when nothing has been stored at all, or when the
    // previous stored byte is the terminator of the preceding comment
    int is_empty = (self->comment_pos == 0);

    if (!is_empty)
    {
        is_empty = (self->comment_lines[self->comment_pos - 1] == '\x00');
    }

    if (is_empty)
    {
        push_comment(self, '\x01');
    }

    push_comment(self, '\x00');
}
157
158
// Append character c to the column currently being tokenized. Expands to a
// push() call using the `self` and `col` variables of the expansion site.
#define PUSH(c) push(self, (c), col)


/* Set the state to START_FIELD and begin with the assumption that
   the field is entirely whitespace in order to handle the possibility
   that the comment character is found before any non-whitespace even
   if whitespace stripping is disabled.

   Uses the `self` and `whitespace` variables of the expansion site. The
   do { } while (0) wrapper makes the two assignments a single statement,
   so the macro stays correct under an unbraced if/else.
*/
#define BEGIN_FIELD()               \
    do {                            \
        self->state = START_FIELD;  \
        whitespace = 1;             \
    } while (0)
170
171
172 /*
173 First, backtrack to eliminate trailing whitespace if strip_whitespace_fields
174 is true. If the field is empty, push '\x01' as a marker.
175 Append a null byte to the end of the column string as a field delimiting marker.
176 Increment the variable col if we are tokenizing data.
177 */
end_field(tokenizer_t * self,int * col,int header)178 static inline void end_field(tokenizer_t *self, int *col, int header)
179 {
180 if (self->strip_whitespace_fields &&
181 self->col_ptrs[*col] != self->output_cols[*col])
182 {
183 --self->col_ptrs[*col];
184 while (*self->col_ptrs[*col] == ' ' || *self->col_ptrs[*col] == '\t')
185 {
186 *self->col_ptrs[*col]-- = '\x00';
187 }
188 ++self->col_ptrs[*col];
189 }
190 if (self->col_ptrs[*col] == self->output_cols[*col] ||
191 self->col_ptrs[*col][-1] == '\x00')
192 {
193 push(self, '\x01', *col);
194 }
195 push(self, '\x00', *col);
196 if (!header) {
197 ++*col;
198 }
199 }
200
201
// Finish the current field, using the `self`, `col` and `header` variables
// of the expansion site; `col` is passed by address and may be advanced.
#define END_FIELD() end_field(self, &col, header)


// Set the error code to c for later retrieval and return c.
// c is evaluated twice, so pass a constant or another side-effect-free
// expression.
#define RETURN(c)                   \
    do {                            \
        self->code = (c);           \
        return (c);                 \
    } while (0)
211
212
213 /*
214 If we are tokenizing the header, end after the first line.
215 Handle the possibility of insufficient columns appropriately;
216 if fill_extra_cols=1, then append empty fields, but otherwise
217 return an error. Increment our row count and possibly end if
218 all the necessary rows have already been parsed.
219 */
end_line(tokenizer_t * self,int col,int header,int end,tokenizer_state * old_state)220 static inline int end_line(tokenizer_t *self, int col, int header, int end,
221 tokenizer_state *old_state)
222 {
223 if (header)
224 {
225 ++self->source_pos;
226 RETURN(NO_ERROR);
227 }
228 else if (self->fill_extra_cols)
229 {
230 while (col < self->num_cols)
231 {
232 PUSH('\x01');
233 END_FIELD();
234 }
235 }
236 else if (col < self->num_cols)
237 {
238 RETURN(NOT_ENOUGH_COLS);
239 }
240
241 ++self->num_rows;
242 *old_state = START_LINE;
243
244 if (end != -1 && self->num_rows == end)
245 {
246 ++self->source_pos;
247 RETURN(NO_ERROR);
248 }
249 return -1;
250 }
251
252
// Finish the current line and leave the enclosing function with self->code
// when end_line() says tokenization is over. Uses the `self`, `col`,
// `header`, `end` and `old_state` variables of the expansion site. Wrapped
// in do { } while (0) so that a following `else` cannot bind to the
// macro's internal `if`.
#define END_LINE()                                                  \
    do {                                                            \
        if (end_line(self, col, header, end, &old_state) != -1)     \
            return self->code;                                      \
    } while (0)
254
255
/*
  Advance self->source_pos past `offset` lines, counting only lines that
  contain at least one significant character and are not comments.

  When header is zero, the text of comment lines met on the way is stored
  in the comment buffer. Reaching the end of the input first yields
  INVALID_LINE in header mode and NO_ERROR otherwise. The returned code is
  also stored in self->code.
*/
int skip_lines(tokenizer_t *self, int offset, int header)
{
    int significant = 0;    // significant characters seen on this line
    int in_comment = 0;     // nonzero while inside a comment line
    int skipped = 0;        // lines counted so far

    while (skipped < offset)
    {
        char ch;

        if (self->source_pos >= self->source_len)
        {
            if (header)
            {
                RETURN(INVALID_LINE); // header line is required
            }
            RETURN(NO_ERROR); // no data in input
        }

        ch = self->source[self->source_pos];

        if ((ch == '\n' || ch == '\r') && ch != self->delimiter)
        {
            // Treat \r\n as a single line break
            if (ch == '\r' && self->source_pos < self->source_len - 1 &&
                self->source[self->source_pos + 1] == '\n')
            {
                ++self->source_pos;
            }

            if (in_comment)
            {
                if (!header)
                    end_comment(self);
            }
            else if (significant > 0)
            {
                ++skipped;
            }

            // The next line starts out empty and non-commented
            in_comment = 0;
            significant = 0;
        }
        else
        {
            // Blanks only count as significant when lines are not stripped
            int counts = !self->strip_whitespace_lines ||
                         (ch != ' ' && ch != '\t');

            if (counts && !significant && self->comment != 0 &&
                ch == self->comment)
            {
                in_comment = 1; // comment line
            }
            else if (in_comment && !header)
            {
                push_comment(self, ch);
            }

            if (counts)
            {
                ++significant;
            }
        }

        ++self->source_pos;
    }

    RETURN(NO_ERROR);
}
311
312
/*
  Tokenize self->source from self->source_pos onwards into per-column
  strings (self->output_cols), in which every field is terminated by a
  null byte and an empty field is stored as the single byte \x01.

  end      -- stop after this many rows; -1 means no limit and 0 means
              nothing is read at all
  header   -- if nonzero, tokenize a single line and store all of its
              fields in one column
  num_cols -- number of columns expected in data mode (ignored in header
              mode)

  Any output of a previous call is discarded first. Returns an error code
  (NO_ERROR on success), which is also stored in self->code.
*/
int tokenize(tokenizer_t *self, int end, int header, int num_cols)
{
    char c; // Input character
    int col = 0; // Current column ignoring possibly excluded columns
    tokenizer_state old_state = START_LINE; // Last state the tokenizer was in before CR mode
    int i = 0;
    int whitespace = 1;
    delete_data(self); // Clear old reading data
    self->num_rows = 0;
    // NOTE(review): this overwrites the recorded capacity of comment_lines
    // without touching the buffer itself; confirm that callers always
    // reset the comment buffer (reset_comments) before tokenizing.
    self->comment_lines_len = INITIAL_COMMENT_LEN;

    if (header)
        self->num_cols = 1; // Store header output in one column
    else
        self->num_cols = num_cols;

    // Allocate memory for structures used during tokenization
    // NOTE(review): these allocations are not checked; no out-of-memory
    // error code is visible in this file to report a failure with.
    self->output_cols = (char **) malloc(self->num_cols * sizeof(char *));
    self->col_ptrs = (char **) malloc(self->num_cols * sizeof(char *));
    self->output_len = (size_t *) malloc(self->num_cols * sizeof(size_t));

    for (i = 0; i < self->num_cols; ++i)
    {
        self->output_cols[i] = (char *) calloc(1, INITIAL_COL_SIZE *
                                               sizeof(char));
        // Make each col_ptrs pointer point to the beginning of the
        // column string
        self->col_ptrs[i] = self->output_cols[i];
        self->output_len[i] = INITIAL_COL_SIZE;
    }

    if (end == 0)
        RETURN(NO_ERROR); // Don't read if end == 0

    self->state = START_LINE;

    // Loop until all of self->source has been read; one extra iteration
    // feeds a virtual newline so that an unterminated last line is ended
    while (self->source_pos < self->source_len + 1)
    {
        if (self->source_pos == self->source_len)
            c = self->newline;
        else
            c = self->source[self->source_pos];

        if (c == '\r' && c != self->delimiter && c != self->newline)
            c = '\n';

        switch (self->state)
        {
        case START_LINE:
            if (c == self->newline)
                break;
            else if ((c == ' ' || c == '\t') && self->strip_whitespace_lines)
                break;
            else if (self->comment != 0 && c == self->comment)
            {
                // Comment line; ignore
                self->state = COMMENT;
                break;
            }
            // Initialize variables for the beginning of line parsing
            col = 0;
            BEGIN_FIELD();
            // Parse in mode START_FIELD
            /* fallthrough */

        case START_FIELD:
            // Strip whitespace before field begins
            if ((c == ' ' || c == '\t') && self->strip_whitespace_fields)
                break;
            else if (!self->strip_whitespace_lines && self->comment != 0 &&
                     c == self->comment)
            {
                // Comment line, not caught earlier because of no stripping
                self->state = COMMENT;
                break;
            }
            // Handle newline characters first
            else if (c == self->newline)
            {
                if (self->strip_whitespace_lines)
                {
                    // Move on if the delimiter is whitespace, e.g.
                    // '1 2 3   '->['1','2','3']
                    // Register an empty field if non-whitespace delimiter,
                    // e.g. '1,2, '->['1','2','']
                    if (self->delimiter != ' ' && self->delimiter != '\t')
                    {
                        if (col >= self->num_cols)
                            RETURN(TOO_MANY_COLS);
                        END_FIELD();
                    }
                }
                else
                {
                    // In this case we don't want to left-strip the field,
                    // so we look back for the start of the skipped
                    // whitespace. The scan uses a local index that is
                    // bounded at 0, so it cannot run in front of the
                    // source buffer whatever the signedness of source_pos.
                    size_t line_end = self->source_pos;
                    size_t field_start = line_end;

                    while (field_start > 0
                           && self->source[field_start - 1] != self->delimiter
                           && self->source[field_start - 1] != '\n'
                           && self->source[field_start - 1] != '\r')
                    {
                        --field_start;
                    }

                    // Reaching the line beginning (start of input or a
                    // line break) means there is no pending field.
                    // Otherwise the scan stopped right after a delimiter.
                    if (field_start > 0
                        && self->source[field_start - 1] != '\n'
                        && self->source[field_start - 1] != '\r')
                    {
                        // Check the column before writing into it
                        if (col >= self->num_cols)
                            RETURN(TOO_MANY_COLS);

                        // Append whitespace characters (none if the field
                        // is simply empty)
                        for (; field_start < line_end; ++field_start)
                        {
                            PUSH(self->source[field_start]);
                        }

                        END_FIELD(); // Whitespace counts as a field
                    }
                }

                END_LINE();
                self->state = START_LINE;
                break;
            }

            // Before proceeding with a new field check column does not exceed
            // number defined in header or from auto-detect to avoid segfaults
            // such as https://github.com/astropy/astropy/issues/9922
            else if (col >= self->num_cols)
                RETURN(TOO_MANY_COLS);
            else if (c == self->delimiter) // Field ends before it begins
            {
                END_FIELD();
                BEGIN_FIELD();
                break;
            }
            else if (c == self->quotechar) // Start parsing quoted field
            {
                self->state = START_QUOTED_FIELD;
                break;
            }
            else // Valid field character, parse again in FIELD mode
                self->state = FIELD;
            /* fallthrough */

        case FIELD:
            if (self->comment != 0 && c == self->comment && whitespace && col == 0)
                // No whitespace stripping, but the comment char is found
                // before any data, e.g. '  # a b c'
                self->state = COMMENT;
            else if (c == self->delimiter && self->source_pos < self->source_len)
            {
                // End of field, look for new field
                END_FIELD();
                BEGIN_FIELD();
            }
            else if (c == self->newline)
            {
                // Line ending, stop parsing both field and line
                END_FIELD();
                END_LINE();
                self->state = START_LINE;
            }
            else
            {
                if (c != ' ' && c != '\t')
                    whitespace = 0; // Field is not all whitespace
                PUSH(c);
            }
            break;

        case START_QUOTED_FIELD:
            if ((c == ' ' || c == '\t') && self->strip_whitespace_fields)
            {
                // Ignore initial whitespace
                break;
            }
            else if (c == self->quotechar)
            {
                // Lookahead check for double quote inside quoted field,
                // e.g. """cd" => "cd
                if (self->source_pos < self->source_len - 1)
                {
                    if (self->source[self->source_pos + 1] == self->quotechar)
                    {
                        self->state = QUOTED_FIELD_DOUBLE_QUOTE;
                        PUSH(c);
                        break;
                    }
                }
                // Parse rest of field normally, e.g. ""c
                self->state = FIELD;
            }
            else
            {
                // Valid field character, parse again in QUOTED_FIELD mode
                self->state = QUOTED_FIELD;
            }
            /* fallthrough */

        case QUOTED_FIELD_NEWLINE:
            if (self->state == QUOTED_FIELD)
                ; // fall through
            // Ignore initial whitespace if strip_whitespace_lines and
            // newlines regardless
            else if (((c == ' ' || c == '\t') && self->strip_whitespace_lines)
                     || c == self->newline)
                break;
            else if (c == self->quotechar)
            {
                self->state = FIELD;
                break;
            }
            else
            {
                // Once data begins, parse it as a normal quoted field
                self->state = QUOTED_FIELD;
            }
            /* fallthrough */

        case QUOTED_FIELD:
            if (c == self->quotechar)
            {
                // Lookahead check for double quote inside quoted field,
                // e.g. "ab""cd" => ab"cd
                if (self->source_pos < self->source_len - 1)
                {
                    if (self->source[self->source_pos + 1] == self->quotechar)
                    {
                        self->state = QUOTED_FIELD_DOUBLE_QUOTE;
                        PUSH(c);
                        break;
                    }
                }
                // Parse rest of field normally, e.g. "ab"c
                self->state = FIELD;
            }
            else if (c == self->newline)
                self->state = QUOTED_FIELD_NEWLINE;
            else
            {
                PUSH(c);
            }
            break;

        case QUOTED_FIELD_DOUBLE_QUOTE:
            // Ignore the second double quote from "ab""cd" and parse rest of
            // field normally as quoted field.
            self->state = QUOTED_FIELD;
            break;

        case COMMENT:
            if (c == self->newline)
            {
                self->state = START_LINE;
                if (!header)
                    end_comment(self);
            }
            else if (!header)
                push_comment(self, c);
            break; // Keep looping until we find a newline

        default:
            // Any state not listed above consumes the character unchanged
            break;
        }

        ++self->source_pos;
    }

    RETURN(0);
}
597
598
/*
  Compare at most n characters of str1 and str2, ignoring the case of the
  ASCII letters A-Z only (independent of the current locale).

  The comparison stops early at the end of str1 or at the first
  difference. Returns zero if the compared characters are equal, otherwise
  the difference between the first differing pair (after lowering). With
  n == 0 nothing is compared and the result is 0.
*/
static int ascii_strncasecmp(const char *str1, const char *str2, size_t n)
{
    int char1 = 0;
    int char2 = 0;

    while (n-- > 0)
    {
        // Go through unsigned char so that bytes >= 0x80 stay positive
        char1 = (unsigned char) *str1++;
        char2 = (unsigned char) *str2++;

        if (char1 >= 'A' && char1 <= 'Z')
            char1 += 'a' - 'A';
        if (char2 >= 'A' && char2 <= 'Z')
            char2 += 'a' - 'A';

        if (char1 == '\0' || char1 != char2)
            break;
    }

    return char1 - char2;
}
612
613
/*
  Convert str to a long using base 10.

  self->code is set to CONVERSION_ERROR when str is not entirely a valid
  integer and to OVERFLOW_ERROR when the value is out of range; it is left
  untouched on success. The value produced by strtol() is returned in
  every case.
*/
long str_to_long(tokenizer_t *self, char *str)
{
    char *stop = NULL;
    long value;

    errno = 0;
    value = strtol(str, &stop, 10);

    if (stop != str && *stop == '\0')
    {
        // The whole string was consumed; only the range can be wrong
        if (errno == ERANGE)
            self->code = OVERFLOW_ERROR;
    }
    else
    {
        self->code = CONVERSION_ERROR;
    }

    return value;
}
628
629
/*
  Convert str to a double, using xstrtod() when self->use_fast_converter
  is set and strtod() otherwise.

  If the converter does not consume the whole string, an optionally signed
  "nan", "inf" or "infinity" (any case) is recognised here, to cover
  xstrtod and platforms whose strtod doesn't support these values.

  self->code is set to OVERFLOW_ERROR when the converter reports ERANGE and
  to CONVERSION_ERROR when the string is not a valid number; it is left
  untouched on success. Returns the converted value, or 0 when the string
  is neither a number nor one of the special values.
*/
double str_to_double(tokenizer_t *self, char *str)
{
    char *tmp;
    const char *rest;
    double val;
    double sign = 1.0;

    errno = 0;

    if (self->use_fast_converter)
        val = xstrtod(str, &tmp, '.', self->expchar, ',', 1);
    else
        val = strtod(str, &tmp);

    if (errno != EINVAL && tmp != str && *tmp == '\0')
    {
        // The whole string was consumed as a number
        if (errno == ERANGE)
        {
            self->code = OVERFLOW_ERROR;
        }
        else if (errno == EDOM) // xstrtod signalling invalid exponents
        {
            self->code = CONVERSION_ERROR;
        }

        return val;
    }

    // Not a plain number: look for inf and nan values
    rest = str;

    if (*rest == '+')
    {
        rest++;
    }
    else if (*rest == '-')
    {
        rest++;
        sign = -1.0;
    }

    if (0 == ascii_strncasecmp(rest, "nan", 3))
    {
        // Handle optional nan type specifier; this is ignored, so whatever
        // follows "nan" is deliberately not validated
        return NAN;
    }

    if (0 != ascii_strncasecmp(rest, "inf", 3))
    {
        // No NaN or inf found
        self->code = CONVERSION_ERROR;
        return 0;
    }

    rest += 3;
    if (0 == ascii_strncasecmp(rest, "inity", 5))
    {
        rest += 5;
    }

    // Trailing whitespace is tolerated, as it is for ordinary numbers in
    // the fast converter, but nothing else may follow: a word such as
    // "infinite" is not a number
    while (isspace((unsigned char) *rest))
    {
        rest++;
    }

    if (*rest != '\0')
    {
        self->code = CONVERSION_ERROR;
        return 0;
    }

    return sign * INFINITY;
}
716
717 // ---------------------------------------------------------------------------
718 // Implementation of xstrtod
719
720 //
721 // strtod.c
722 //
723 // Convert string to double
724 //
725 // Copyright (C) 2002 Michael Ringgaard. All rights reserved.
726 //
727 // Redistribution and use in source and binary forms, with or without
728 // modification, are permitted provided that the following conditions
729 // are met:
730 //
731 // 1. Redistributions of source code must retain the above copyright
732 // notice, this list of conditions and the following disclaimer.
733 // 2. Redistributions in binary form must reproduce the above copyright
734 // notice, this list of conditions and the following disclaimer in the
735 // documentation and/or other materials provided with the distribution.
736 // 3. Neither the name of the project nor the names of its contributors
737 // may be used to endorse or promote products derived from this software
738 // without specific prior written permission.
739 //
740 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
741 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
742 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
743 // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
744 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
745 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
746 // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
747 // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
748 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
749 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
750 // SUCH DAMAGE.
751 //
752 // -----------------------------------------------------------------------
753 // Modifications by Warren Weckesser, March 2011:
754 // * Rename strtod() to xstrtod().
755 // * Added decimal and sci arguments.
756 // * Skip trailing spaces.
757 // * Commented out the other functions.
758 // Modifications by Richard T Guy, August 2013:
759 // * Add tsep argument for thousands separator
760 // Modifications by Michael Mueller, August 2014:
761 // * Cache powers of 10 in memory to avoid rounding errors
762 // * Stop parsing decimals after 17 significant figures
763 // Modifications by Derek Homeier, August 2015:
764 // * Recognise alternative exponent characters passed in 'sci'; try automatic
765 // detection of allowed Fortran formats with sci='A'
766 // * Require exactly 3 digits in exponent for Fortran-type format '8.7654+321'
767 // Modifications by Derek Homeier, September-December 2016:
768 // * Fixed some corner cases of very large or small exponents; proper return
769 // * do not increment num_digits until nonzero digit read in
770 //
771
/*
  Convert the initial part of str to a double.

  str           -- null-terminated input; leading whitespace is skipped
  endptr        -- if not NULL, receives the position after the last
                   character consumed, or str itself when no conversion
                   could be made
  decimal       -- decimal point character
  expchar       -- exponent character, matched case-insensitively; 'A'
                   enables detection of Fortran style exponents
                   (E, D, Q, or a bare sign followed by exactly three
                   digits)
  tsep          -- thousands separator, skipped when it follows a digit of
                   the integer part; '\0' disables it
  skip_trailing -- if nonzero, trailing whitespace is consumed as well

  errno is reset to 0 on entry and set to ERANGE on overflow or underflow
  and to EDOM for a sign-only exponent without exactly three digits.
  Only the first 17 significant digits contribute to the result.
*/
double xstrtod(const char *str, char **endptr, char decimal,
               char expchar, char tsep, int skip_trailing)
{
    double number;
    int exponent;
    int negative;
    char *p = (char *) str;
    int exp_char;
    int sci;
    int num_digits;
    int num_decimals;
    int max_digits = 17;
    int num_exp = 3;
    int non_zero;
    int n;
    // Exponent values are accumulated only while below this bound, which
    // keeps n * 10 + 9 inside the range of int; anything this large is far
    // outside the range of a double anyway
    const int max_exp_value = 100000000;
    // Cache powers of 10 in memory
    static const double e[] = {1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10,
                               1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20,
                               1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30,
                               1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40,
                               1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50,
                               1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60,
                               1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70,
                               1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80,
                               1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90,
                               1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100,
                               1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110,
                               1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120,
                               1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130,
                               1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140,
                               1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150,
                               1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160,
                               1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170,
                               1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180,
                               1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190,
                               1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200,
                               1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210,
                               1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220,
                               1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230,
                               1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240,
                               1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250,
                               1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260,
                               1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270,
                               1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280,
                               1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290,
                               1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300,
                               1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308};
    errno = 0;

    // Skip leading whitespace. Characters are passed to the <ctype.h>
    // functions as unsigned char throughout, because a negative plain char
    // is not a valid argument for them.
    while (isspace((unsigned char) *p)) p++;

    // Handle optional sign
    negative = 0;
    switch (*p)
    {
    case '-':
        negative = 1;
        /* fallthrough to increment position */
    case '+':
        p++;
        break;
    default:
        break;
    }

    // No numerical value following sign - make no conversion and return zero,
    // resetting endptr to beginning of str (consistent with strtod behaviour)
    // E.g. -1.e0 and -.0e1 are valid, -.e0 is not!
    if (!(isdigit((unsigned char) *p) ||
          (*p == decimal && isdigit((unsigned char) *(p + 1)))))
    {
        if (endptr) *endptr = (char *) str;
        return 0e0;
    }

    number = 0.;
    exponent = 0;
    num_digits = 0;
    num_decimals = 0;
    non_zero = 0;

    // Process string of digits
    while (isdigit((unsigned char) *p))
    {
        if (num_digits < max_digits)
        {
            number = number * 10. + (*p - '0');
            non_zero += (*p != '0');
            if (non_zero) num_digits++;
        }
        else
            ++exponent;

        p++;
        p += (tsep != '\0' && *p == tsep);
    }

    // Process decimal part
    if (*p == decimal)
    {
        p++;

        while (num_digits < max_digits && isdigit((unsigned char) *p))
        {
            number = number * 10. + (*p - '0');
            non_zero += (*p != '0');
            if (non_zero) num_digits++;
            num_decimals++;
            p++;
        }

        if (num_digits >= max_digits) // consume extra decimal digits
            while (isdigit((unsigned char) *p))
                ++p;

        exponent -= num_decimals;
    }

    // Exactly 0 - no precision loss/OverflowError
    if (num_digits == 0) number = 0.0;

    // Correct for sign
    if (negative) number = -number;

    // Process an exponent string
    sci = toupper((unsigned char) expchar);
    if (sci == 'A')
    {
        // check for possible Fortran exponential notations, including
        // triple-digits with no character
        exp_char = toupper((unsigned char) *p);
        if (exp_char == 'E' || exp_char == 'D' || exp_char == 'Q' ||
            *p == '+' || *p == '-')
        {
            // Handle optional sign
            negative = 0;
            switch (exp_char)
            {
            case '-':
                negative = 1;
                /* fallthrough to increment pos */
            case '+':
                p++;
                break;
            case 'E':
            case 'D':
            case 'Q':
                switch (*++p)
                {
                case '-':
                    negative = 1;
                    /* fallthrough to increment pos */
                case '+':
                    p++;
                    break;
                default:
                    break;
                }
                break;
            default:
                break;
            }

            // Process string of digits
            n = 0;
            while (isdigit((unsigned char) *p))
            {
                if (n < max_exp_value)
                    n = n * 10 + (*p - '0');
                num_exp--;
                p++;
            }
            // Trigger error if not exactly three digits
            if (num_exp != 0 && (exp_char == '+' || exp_char == '-'))
            {
                errno = EDOM;
                number = 0.0;
            }

            if (negative)
                exponent -= n;
            else
                exponent += n;
        }
    }
    // A NUL expchar must not match the string terminator, otherwise the
    // scan would continue past the end of the string
    else if (sci != 0 && toupper((unsigned char) *p) == sci)
    {
        // Handle optional sign
        negative = 0;
        switch (*++p)
        {
        case '-':
            negative = 1;
            /* fallthrough to increment pos */
        case '+':
            p++;
            break;
        default:
            break;
        }

        // Process string of digits
        n = 0;
        while (isdigit((unsigned char) *p))
        {
            if (n < max_exp_value)
                n = n * 10 + (*p - '0');
            p++;
        }

        if (negative)
            exponent -= n;
        else
            exponent += n;
    }

    // largest representable float64 is 1.7977e+308, closest to 0 ~4.94e-324,
    // but multiplying exponents in two steps gives slightly better precision
    if (number != 0.0) {
        if (exponent > 305)
        {
            if (exponent > 308) // leading zeros already subtracted from exp
                number *= HUGE_VAL;
            else
            {
                number *= e[exponent - 300];
                number *= 1.e300;
            }
        }
        else if (exponent < -308) // subnormal
        {
            if (exponent < -616) // prevent invalid array access
                number = 0.;
            else
            {
                number /= e[-308 - exponent];
                number *= 1.e-308;
            }
            // Always flag this range; strtod does so for
            // |number| <~ 2.25e-308
            errno = ERANGE;
        }
        else if (exponent > 0)
            number *= e[exponent];
        else if (exponent < 0)
            number /= e[-exponent];

        if (number >= HUGE_VAL || number <= -HUGE_VAL)
            errno = ERANGE;
    }

    if (skip_trailing) {
        // Skip trailing whitespace
        while (isspace((unsigned char) *p)) p++;
    }

    if (endptr) *endptr = p;
    return number;
}
1015
1016
/*
  Prepare to iterate over the fields of column col: remember the column
  index and rewind the iteration position to the start of its string.
*/
void start_iteration(tokenizer_t *self, int col)
{
    char *column_start = self->output_cols[col];

    self->curr_pos = column_start;
    self->iter_col = col;
}
1024
1025
/*
  Return the field at the current iteration position and move the position
  to the start of the following field.

  If size is not NULL it receives the length of the field. A field stored
  as the empty marker \x01 is reported with length 0 and returned as
  self->buf instead of a pointer into the column string.
*/
char *next_field(tokenizer_t *self, int *size)
{
    char *field = self->curr_pos;
    // Fields are separated by null bytes, so the distance to the delimiter
    // is the string length of the field
    size_t length = strlen(field);
    int is_empty = (*field == '\x01'); // empty field; this is a hack

    // The next field begins right after the delimiter
    self->curr_pos = field + length + 1;

    if (size != NULL)
    {
        *size = is_empty ? 0 : (int) length;
    }

    return is_empty ? self->buf : field;
}
1050
1051
/*
  Find the end of the line that starts at ptr, looking at no more than
  map_len characters.

  When a line break (\n, \r or \r\n) is found, *len is set to the length
  of the line without the break and a pointer to the first character after
  the break is returned. When there is no line break within map_len
  characters, *len is left unchanged and a null pointer is returned.
*/
char *get_line(char *ptr, size_t *len, size_t map_len)
{
    size_t idx;

    for (idx = 0; idx < map_len; ++idx)
    {
        char ch = ptr[idx];

        if (ch != '\r' && ch != '\n')
            continue;

        *len = idx;

        // Windows line break (\r\n): skip the newline character as well
        if (ch == '\r' && idx + 1 != map_len && ptr[idx + 1] == '\n')
            return ptr + idx + 2;

        // Plain newline or lone carriage return
        return ptr + idx + 1;
    }

    // done with input
    return 0;
}
1080
1081
/*
  Discard all stored comment lines and start again with a fresh comment
  buffer of INITIAL_COMMENT_LEN bytes.

  If that buffer cannot be allocated, the recorded length is set to 0 so
  that the next pushed comment character requests memory again instead of
  being written through a null pointer.
*/
void reset_comments(tokenizer_t *self)
{
    free(self->comment_lines);
    self->comment_lines = malloc(INITIAL_COMMENT_LEN);
    self->comment_pos = 0;
    self->comment_lines_len =
        (self->comment_lines != NULL) ? INITIAL_COMMENT_LEN : 0;
}
1089