1 /* -*- Mode: c; c-basic-offset: 2 -*-
2 *
3 * sv.c - Parse separated-values (CSV, TSV) files
4 *
5 * Copyright (C) 2009-2014, David Beckett http://www.dajobe.org/
6 *
7 * This package is Free Software
8 *
9 * It is licensed under the following three licenses as alternatives:
10 * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
11 * 2. GNU General Public License (GPL) V2 or any newer version
12 * 3. Apache License, V2.0 or any newer version
13 *
14 * You may not use this file except in compliance with at least one of
15 * the above three licenses.
16 *
17 * See LICENSE.txt at the top of this package for the
18 * complete terms and further detail along with the license texts for
19 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
20 *
21 */
22
23
24 #ifdef SV_CONFIG
25 #include <sv_config.h>
26 #endif
27
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdarg.h>
31 #include <ctype.h>
32
33 #ifdef HAVE_STDLIB_H
34 #include <stdlib.h>
35 #endif
36
37 #include <sv.h>
38
/* Bit flags kept in the 'flags' member of struct sv_s */

/* save the first row as the headers */
#define SV_FLAGS_SAVE_HEADER      0x01
/* error out on bad data lines */
#define SV_FLAGS_BAD_DATA_ERROR   0x02
/* allow fields to be quoted */
#define SV_FLAGS_QUOTED_FIELDS    0x04
/* strip (non-separator) whitespace around fields */
#define SV_FLAGS_STRIP_WHITESPACE 0x08
47
48
49 struct sv_s {
50 /* field separator: '\t' or ',' */
51 char field_sep;
52
53 int line;
54
55 /* row callback */
56 void *callback_user_data;
57 sv_fields_callback header_callback;
58 sv_fields_callback data_callback;
59
60 /* current buffer */
61 char *buffer;
62 /* size allocated */
63 size_t size;
64 /* size used */
65 size_t len;
66
67 unsigned int fields_count;
68 char **fields;
69 size_t *fields_widths;
70
71 /* memory buffer used for constructing fields for user;
72 * array above 'fields' points into this
73 */
74 char* fields_buffer;
75 size_t fields_buffer_size;
76
77 /* first row is saved as headers */
78 char **headers;
79 size_t *headers_widths;
80
81 unsigned int flags;
82
83 /* error state */
84 sv_status_t status;
85
86 int bad_records;
87
88 char last_char;
89
90 char quote_char;
91
92 /* called with the line (before parsing) */
93 sv_line_callback line_callback;
94 };
95
96
97 /**
98 * sv_new:
99 * @user_data: user data to use for callbacks
100 * @header_callback: callback to receive headers (or NULL)
101 * @data_callback: callback to receive data rows (or NULL)
102 * @field_sep: field separator ',' or '\t'
103 *
104 * Constructor - create an SV object
105 *
106 * Return value: new SV object or NULL on failure.
107 */
108 sv*
sv_new(void * user_data,sv_fields_callback header_callback,sv_fields_callback data_callback,char field_sep)109 sv_new(void *user_data, sv_fields_callback header_callback,
110 sv_fields_callback data_callback,
111 char field_sep)
112 {
113 sv *t;
114
115 if(field_sep != '\t' && field_sep != ',')
116 return NULL;
117
118 t = (sv*)malloc(sizeof(*t));
119 if(!t)
120 return NULL;
121
122 t->field_sep = field_sep;
123
124 t->line = 1;
125
126 t->callback_user_data = user_data;
127 t->header_callback = header_callback;
128 t->data_callback = data_callback;
129
130 t->buffer = NULL;
131 t->size = 0;
132 t->len = 0;
133
134 t->fields_count = 0;
135 t->fields = NULL;
136 t->fields_widths = NULL;
137
138 t->fields_buffer = NULL;
139 t->fields_buffer_size = 0;
140
141 t->headers = NULL;
142 t->headers_widths = NULL;
143
144 /* default flags */
145 t->flags = SV_FLAGS_SAVE_HEADER | SV_FLAGS_QUOTED_FIELDS;
146
147 t->status = SV_STATUS_OK;
148
149 t->bad_records = 0;
150
151 t->last_char = '\0';
152
153 t->quote_char = '"';
154
155 t->line_callback = NULL;
156
157 return t;
158 }
159
160
161 static sv_status_t
sv_init_fields(sv * t)162 sv_init_fields(sv *t)
163 {
164 t->fields = (char**)malloc(sizeof(char*) * (t->fields_count+1));
165 if(!t->fields)
166 goto failed;
167
168 t->fields_widths = (size_t*)malloc(sizeof(size_t) * (t->fields_count+1));
169 if(!t->fields_widths)
170 goto failed;
171
172 t->headers = (char**)malloc(sizeof(char*) * (t->fields_count+1));
173 if(!t->headers)
174 goto failed;
175
176 t->headers_widths = (size_t*)malloc(sizeof(size_t) * (t->fields_count+1));
177 if(!t->headers_widths)
178 goto failed;
179
180 return SV_STATUS_OK;
181
182
183 failed:
184 if(t->fields) {
185 free(t->fields);
186 t->fields = NULL;
187 }
188
189 if(t->fields_widths) {
190 free(t->fields_widths);
191 t->fields_widths = NULL;
192 }
193
194 if(t->headers) {
195 free(t->headers);
196 t->headers = NULL;
197 }
198
199 return SV_STATUS_NO_MEMORY;
200 }
201
202
203 /**
204 * sv_free:
205 * @t: SV object
206 *
207 * Destructor: destroy an SV object
208 *
209 */
210 void
sv_free(sv * t)211 sv_free(sv *t)
212 {
213 if(!t)
214 return;
215
216 if(t->headers_widths)
217 free(t->headers_widths);
218 if(t->headers) {
219 unsigned int i;
220
221 for(i = 0; i < t->fields_count; i++)
222 free(t->headers[i]);
223 free(t->headers);
224 }
225
226
227 if(t->fields_buffer)
228 free(t->fields_buffer);
229
230 if(t->fields_widths)
231 free(t->fields_widths);
232 if(t->fields)
233 free(t->fields);
234 if(t->buffer)
235 free(t->buffer);
236
237 free(t);
238 }
239
240
241
242 /* Ensure fields buffer is big enough for len bytes total */
243 static sv_status_t
sv_ensure_fields_buffer_size(sv * t,size_t len)244 sv_ensure_fields_buffer_size(sv *t, size_t len)
245 {
246 char *nbuffer;
247 size_t nsize;
248
249 if(len < t->fields_buffer_size)
250 return SV_STATUS_OK;
251
252 nsize = len + 8;
253
254 #if defined(SV_DEBUG) && SV_DEBUG > 1
255 fprintf(stderr, "%d: Growing buffer from %d to %d bytes\n",
256 t->line, (int)t->fields_buffer_size, (int)nsize);
257 #endif
258
259 nbuffer = (char*)malloc(nsize + 1);
260 if(!nbuffer)
261 return SV_STATUS_NO_MEMORY;
262
263 if(t->fields_buffer)
264 free(t->fields_buffer);
265
266 t->fields_buffer = nbuffer;
267 t->fields_buffer_size = nsize;
268
269 return SV_STATUS_OK;
270 }
271
272
273
274 /* Ensure internal buffer is big enough for len more bytes */
275 static sv_status_t
sv_ensure_line_buffer_size(sv * t,size_t len)276 sv_ensure_line_buffer_size(sv *t, size_t len)
277 {
278 char *nbuffer;
279 size_t nsize;
280
281 if(t->len + len < t->size)
282 return SV_STATUS_OK;
283
284 nsize = (len + t->len) << 1;
285
286 nbuffer = (char*)malloc(nsize + 1);
287 if(!nbuffer)
288 return SV_STATUS_NO_MEMORY;
289
290 if(t->len)
291 memcpy(nbuffer, t->buffer, t->len);
292 nbuffer[t->len] = '\0';
293
294 if(t->buffer)
295 free(t->buffer);
296
297 t->buffer = nbuffer;
298 t->size = nsize;
299
300 return SV_STATUS_OK;
301 }
302
303
304 /**
305 * sv_get_line:
306 * @t: sv object
307 *
308 * Get current SV line number
309 *
310 * Return value: line number or <0 on failure
311 */
312 int
sv_get_line(sv * t)313 sv_get_line(sv *t)
314 {
315 if(!t)
316 return -1;
317
318 return t->line;
319 }
320
321
322 /**
323 * sv_get_header:
324 * @t: sv object
325 * @i: header index 0
326 * @width_p: pointer to store width (or NULL)
327 *
328 * Get an SV header with optional width
329 *
330 * Return value: shared pointer to header or NULL if out of range
331 */
332 const char*
sv_get_header(sv * t,unsigned int i,size_t * width_p)333 sv_get_header(sv *t, unsigned int i, size_t *width_p)
334 {
335 if(!t || !t->headers || i > t->fields_count)
336 return NULL;
337
338 if(width_p)
339 *width_p = t->headers_widths[i];
340
341 return (const char*)t->headers[i];
342 }
343
344
345 #if defined(SV_DEBUG) && SV_DEBUG > 1
/* Debug helper: write a labelled dump of at most the first 100 bytes
 * of buffer to fh, followed by "..." when the buffer was cut short.
 */
static void
sv_dump_buffer(FILE* fh, const char* label, const char* buffer, size_t len)
{
  const size_t max_shown = 100;
  size_t shown = (len > max_shown) ? max_shown : len;

  fprintf(fh, "%s (%zu bytes) >>>", label, len);

  fwrite(buffer, 1, shown, fh);
  if(shown < len)
    fputs("...", fh);

  fputs("<<<\n", fh);
}
359 #endif
360
361
362 static sv_status_t
sv_parse_line(sv * t,char * line,size_t len,unsigned int * field_count_p)363 sv_parse_line(sv *t, char *line, size_t len, unsigned int* field_count_p)
364 {
365 unsigned int column;
366 int field_width = 0;
367 int field_offset = 0;
368 char* current_field = NULL;
369 char* p = NULL;
370 char** fields = t->fields;
371 size_t* fields_widths = t->fields_widths;
372 sv_status_t status;
373 int field_is_quoted = 0;
374
375 #if defined(SV_DEBUG) && SV_DEBUG > 1
376 if(fields)
377 sv_dump_buffer(stderr, "(sv_parse_line): Parsing line", line, len);
378 #endif
379
380 status = sv_ensure_fields_buffer_size(t, len);
381 if(status)
382 return status;
383
384 if(fields) {
385 current_field = t->fields_buffer;
386 p = current_field;
387
388 if(!p)
389 return SV_STATUS_OK;
390 }
391
392 for(column = 0; 1; column++) {
393 int c = -1;
394 int field_ended = 0;
395 int expect_sep = 0;
396
397 if(column == len) {
398 field_ended = 1;
399 goto do_last;
400 }
401
402 c = line[column];
403
404 if(t->flags & SV_FLAGS_QUOTED_FIELDS) {
405 if(c == t->quote_char) {
406 if(!field_width && !field_is_quoted) {
407 field_is_quoted = 1;
408 #if defined(SV_DEBUG) && SV_DEBUG > 1
409 fprintf(stderr, "Field is quoted\n");
410 #endif
411 continue;
412 } else if(column < len && line[column+1] == t->quote_char) {
413 #if defined(SV_DEBUG) && SV_DEBUG > 1
414 fprintf(stderr, "Doubled quote %c absorbed\n", t->quote_char);
415 #endif
416 column++;
417 /* skip repeated quote - so it just replaces ""... with " */
418 goto skip;
419 } else if(column == len-1 || line[column+1] == t->field_sep) {
420 #if defined(SV_DEBUG) && SV_DEBUG > 1
421 fprintf(stderr, "Field ended on quote + sep\n");
422 #endif
423 field_ended = 1;
424 expect_sep = 1;
425 goto do_last;
426 }
427 }
428 }
429
430 if(!field_is_quoted && c == t->field_sep) {
431 #if defined(SV_DEBUG) && SV_DEBUG > 1
432 fprintf(stderr, "Field ended on sep\n");
433 #endif
434 field_ended = 1;
435 }
436
437 do_last:
438 if(field_ended) {
439 if(p)
440 *p++ = '\0';
441
442 if(fields) {
443
444 if(t->flags & SV_FLAGS_STRIP_WHITESPACE) {
445 /* Remove whitespace around a field */
446 while(field_width > 0 && isspace(current_field[0])) {
447 current_field++;
448 field_width--;
449 }
450
451 while(field_width > 0 && isspace(current_field[field_width - 1]))
452 field_width--;
453
454 current_field[field_width] = '\0';
455 }
456
457 if(expect_sep)
458 column++;
459
460 }
461
462 #if defined(SV_DEBUG) && SV_DEBUG > 1
463 if(fields) {
464 fprintf(stderr, " Field %d: %s (%d)\n", (int)field_offset, current_field, (int)field_width);
465 }
466 #endif
467 if(fields)
468 fields[field_offset] = current_field;
469 if(fields_widths)
470 fields_widths[field_offset] = field_width;
471
472 /* end loop when out of columns */
473 if(column == len)
474 break;
475
476 /* otherwise got a tab so reset for next field */
477 field_width = 0;
478 field_is_quoted = 0;
479
480 field_offset++;
481 current_field = p;
482
483 continue;
484 }
485
486 skip:
487 if(fields)
488 *p++ = c;
489 field_width++;
490 }
491
492
493 if(field_count_p)
494 *field_count_p = field_offset + 1;
495
496 return SV_STATUS_OK;
497 }
498
499
500 static sv_status_t
sv_parse_chunk_line(sv * t,size_t line_len,int has_nl)501 sv_parse_chunk_line(sv* t, size_t line_len, int has_nl)
502 {
503 size_t move_len = line_len;
504 sv_status_t status = SV_STATUS_OK;
505 unsigned int fields_count = 0;
506
507 if(!line_len)
508 goto skip_line;
509
510 if(t->line_callback) {
511 char c = t->buffer[line_len];
512
513 t->buffer[line_len] = '\0';
514 status = t->line_callback(t, t->callback_user_data, t->buffer, line_len);
515 t->buffer[line_len] = c;
516 if(status != SV_STATUS_OK)
517 return status;
518 }
519
520 if(!t->fields_count) {
521 /* First line in the file - calculate number of fields */
522 status = sv_parse_line(t, t->buffer, line_len, &t->fields_count);
523 if(status)
524 return status;
525
526 /* initialise arrays of size t->fields_count */
527 status = sv_init_fields(t);
528 if(status)
529 return status;
530 }
531
532 status = sv_parse_line(t, t->buffer, line_len, &fields_count);
533 if(status)
534 return status;
535
536 if(fields_count != t->fields_count) {
537 t->bad_records++;
538 if(t->flags & SV_FLAGS_BAD_DATA_ERROR) {
539 #if defined(SV_DEBUG) && SV_DEBUG > 1
540 fprintf(stderr, "Error in line %d: saw %d fields expected %d\n",
541 t->line, fields_count, t->fields_count);
542 #endif
543 status = SV_STATUS_LINE_FIELDS;
544 return status;
545 }
546 #if defined(SV_DEBUG) && SV_DEBUG > 1
547 fprintf(stderr, "Ignoring line %d: saw %d fields expected %d\n",
548 t->line, fields_count, t->fields_count);
549 #endif
550 /* Otherwise skip the line */
551 goto skip_line;
552 }
553
554 if(t->line == 1 && (t->flags & SV_FLAGS_SAVE_HEADER)) {
555 /* first line and header: turn fields into headers */
556 unsigned int i;
557
558 for(i = 0; i < t->fields_count; i++) {
559 char *s = (char*)malloc(t->fields_widths[i]+1);
560 if(!s) {
561 status = SV_STATUS_NO_MEMORY;
562 break;
563 }
564 memcpy(s, t->fields[i], t->fields_widths[i]+1);
565 t->headers[i] = s;
566 t->headers_widths[i] = t->fields_widths[i];
567 }
568
569 if(status == SV_STATUS_OK && t->header_callback) {
570 /* got header fields - return them to user */
571 status = t->header_callback(t, t->callback_user_data, t->headers,
572 t->headers_widths, t->fields_count);
573 }
574 } else {
575 /* data */
576
577 if(t->data_callback) {
578 /* got data fields - return them to user */
579 status = t->data_callback(t, t->callback_user_data, t->fields,
580 t->fields_widths, t->fields_count);
581 }
582 }
583
584 skip_line:
585
586 if(has_nl)
587 move_len++;
588
589 /* adjust buffer - remove 'line_len+1' bytes from start of buffer */
590 t->len -= move_len;
591
592 /* this is an overlapping move */
593 memmove(t->buffer, &t->buffer[move_len], t->len);
594
595 /* This is not needed: guaranteed above */
596 /* t->buffer[t->len] = '\0' */
597
598 t->line++;
599
600 return status;
601 }
602
603
604 /**
605 * sv_parse_chunk:
606 * @t: sv object
607 * @buffer: buffer to parse (or NULL)
608 * @len: length of @buffer (or 0)
609 *
610 * Parse a chunk of data
611 *
612 * Parsing ends if either @buffer is NULL or @len is 0
613 *
614 * Return value: #SV_STATUS_OK on success
615 */
616 sv_status_t
sv_parse_chunk(sv * t,char * buffer,size_t len)617 sv_parse_chunk(sv *t, char *buffer, size_t len)
618 {
619 size_t offset = 0;
620 sv_status_t status = SV_STATUS_OK;
621 /* End of input if either of these is NULL */
622 int is_end = (!buffer || !len);
623
624 if(!is_end) {
625 /* add new data to existing buffer */
626 status = sv_ensure_line_buffer_size(t, len);
627 if(status)
628 return status;
629
630 /* add new buffer */
631 memcpy(t->buffer + t->len, buffer, len);
632
633 /* always ensure it is NUL terminated even if input chunk was not */
634 t->len += len;
635 t->buffer[t->len] = '\0';
636 }
637
638 /* look for an end of line to do some work */
639 for(offset = 0; offset < t->len; offset++) {
640 char c = t->buffer[offset];
641
642 /* skip \n when just seen \r - i.e. \r\n or CR LF */
643 if(t->last_char == '\r' && c == '\n') {
644 #if defined(SV_DEBUG) && SV_DEBUG > 1
645 fprintf(stderr, "Skipping a \\n after \\r\n");
646 #endif
647
648 /* adjust buffer */
649 t->len -= 1;
650
651 /* this is an overlapping move */
652 memmove(t->buffer, &t->buffer[1], t->len);
653
654 t->last_char = '\0';
655 continue;
656 }
657
658 if(c != '\r' && c != '\n')
659 continue;
660
661 t->last_char = c;
662
663 #if defined(SV_DEBUG) && SV_DEBUG > 1
664 sv_dump_buffer(stderr, "Starting buffer", t->buffer, t->len);
665 #endif
666
667 /* found a line */
668 status = sv_parse_chunk_line(t, offset, 1);
669 if(status != SV_STATUS_OK)
670 break;
671
672 offset = -1; /* so for loop starts at 0 */
673 }
674
675 if(is_end && status == SV_STATUS_OK) {
676 /* If end of input and there is a non-empty buffer left, try to
677 * parse it all as the last line. It will NOT contain newlines.
678 */
679 if(t->len)
680 status = sv_parse_chunk_line(t, t->len, 0);
681 }
682
683 return status;
684 }
685
686
687 static sv_status_t
sv_set_option_vararg(sv * t,sv_option_t option,va_list arg)688 sv_set_option_vararg(sv* t, sv_option_t option, va_list arg)
689 {
690 sv_status_t status = SV_STATUS_OK;
691
692 switch(option) {
693 case SV_OPTION_SAVE_HEADER:
694 t->flags &= ~SV_FLAGS_SAVE_HEADER;
695 if(va_arg(arg, long))
696 t->flags |= SV_FLAGS_SAVE_HEADER;
697 break;
698
699 case SV_OPTION_BAD_DATA_ERROR:
700 t->flags &= ~SV_FLAGS_BAD_DATA_ERROR;
701 if(va_arg(arg, long))
702 t->flags |= SV_FLAGS_BAD_DATA_ERROR;
703 break;
704
705 case SV_OPTION_QUOTED_FIELDS:
706 t->flags &= ~SV_FLAGS_QUOTED_FIELDS;
707 if(va_arg(arg, long))
708 t->flags |= SV_FLAGS_QUOTED_FIELDS;
709 break;
710
711 case SV_OPTION_STRIP_WHITESPACE:
712 t->flags &= ~SV_FLAGS_STRIP_WHITESPACE;
713 if(va_arg(arg, long))
714 t->flags |= SV_FLAGS_STRIP_WHITESPACE;
715 break;
716
717 case SV_OPTION_QUOTE_CHAR:
718 if(1) {
719 int c = va_arg(arg, int);
720 if(c != t->field_sep)
721 t->quote_char = c;
722 }
723 break;
724
725 case SV_OPTION_LINE_CALLBACK:
726 if(1) {
727 sv_line_callback cb = (sv_line_callback)va_arg(arg, void*);
728 t->line_callback = cb;
729 }
730
731 default:
732 case SV_OPTION_NONE:
733 status = SV_STATUS_FAILED;
734 break;
735
736 }
737
738 return status;
739 }
740
741
742 /**
743 * sv_set_option:
744 * @t: sv object
745 * @option: option name
746 *
747 * Set an option value. The value varies in type dependent on the @option
748 *
749 * Return value: #SV_STATUS_FAILED if failed
750 */
751 sv_status_t
sv_set_option(sv * t,sv_option_t option,...)752 sv_set_option(sv *t, sv_option_t option, ...)
753 {
754 sv_status_t status;
755 va_list arg;
756
757 va_start(arg, option);
758 status = sv_set_option_vararg(t, option, arg);
759 va_end(arg);
760
761 return status;
762 }
763