1 /* -*- Mode: c; c-basic-offset: 2 -*-
2  *
3  * sv.c - Parse separated-values (CSV, TSV) files
4  *
5  * Copyright (C) 2009-2014, David Beckett http://www.dajobe.org/
6  *
7  * This package is Free Software
8  *
9  * It is licensed under the following three licenses as alternatives:
10  *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
11  *   2. GNU General Public License (GPL) V2 or any newer version
12  *   3. Apache License, V2.0 or any newer version
13  *
14  * You may not use this file except in compliance with at least one of
15  * the above three licenses.
16  *
17  * See LICENSE.txt at the top of this package for the
18  * complete terms and further detail along with the license texts for
19  * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
20  *
21  */
22 
23 
24 #ifdef SV_CONFIG
25 #include <sv_config.h>
26 #endif
27 
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdarg.h>
31 #include <ctype.h>
32 
33 #ifdef HAVE_STDLIB_H
34 #include <stdlib.h>
35 #endif
36 
37 #include <sv.h>
38 
/* bit flags kept in the 'flags' member of struct sv_s */
/* treat the first line as a header row: copy it and pass it to the
 * header callback instead of the data callback */
#define SV_FLAGS_SAVE_HEADER    (1<<0)
/* error out on bad data lines (lines whose field count differs from
 * the first line) instead of skipping them */
#define SV_FLAGS_BAD_DATA_ERROR (1<<1)
/* allow fields to be quoted */
#define SV_FLAGS_QUOTED_FIELDS  (1<<2)
/* strip (non-separator) whitespace around fields */
#define SV_FLAGS_STRIP_WHITESPACE  (1<<3)
47 
48 
49 struct sv_s {
50   /* field separator: '\t' or ',' */
51   char field_sep;
52 
53   int line;
54 
55   /* row callback */
56   void *callback_user_data;
57   sv_fields_callback header_callback;
58   sv_fields_callback data_callback;
59 
60   /* current buffer */
61   char *buffer;
62   /* size allocated */
63   size_t size;
64   /* size used */
65   size_t len;
66 
67   unsigned int fields_count;
68   char **fields;
69   size_t *fields_widths;
70 
71   /* memory buffer used for constructing fields for user;
72    * array above 'fields' points into this
73    */
74   char* fields_buffer;
75   size_t fields_buffer_size;
76 
77   /* first row is saved as headers */
78   char **headers;
79   size_t *headers_widths;
80 
81   unsigned int flags;
82 
83   /* error state */
84   sv_status_t status;
85 
86   int bad_records;
87 
88   char last_char;
89 
90   char quote_char;
91 
92   /* called with the line (before parsing) */
93   sv_line_callback line_callback;
94 };
95 
96 
97 /**
98  * sv_new:
99  * @user_data: user data to use for callbacks
100  * @header_callback: callback to receive headers (or NULL)
101  * @data_callback: callback to receive data rows (or NULL)
102  * @field_sep: field separator ',' or '\t'
103  *
104  * Constructor - create an SV object
105  *
106  * Return value: new SV object or NULL on failure.
107  */
108 sv*
sv_new(void * user_data,sv_fields_callback header_callback,sv_fields_callback data_callback,char field_sep)109 sv_new(void *user_data, sv_fields_callback header_callback,
110        sv_fields_callback data_callback,
111        char field_sep)
112 {
113   sv *t;
114 
115   if(field_sep != '\t' && field_sep != ',')
116     return NULL;
117 
118   t = (sv*)malloc(sizeof(*t));
119   if(!t)
120     return NULL;
121 
122   t->field_sep = field_sep;
123 
124   t->line = 1;
125 
126   t->callback_user_data = user_data;
127   t->header_callback = header_callback;
128   t->data_callback = data_callback;
129 
130   t->buffer = NULL;
131   t->size = 0;
132   t->len = 0;
133 
134   t->fields_count = 0;
135   t->fields = NULL;
136   t->fields_widths = NULL;
137 
138   t->fields_buffer = NULL;
139   t->fields_buffer_size = 0;
140 
141   t->headers = NULL;
142   t->headers_widths = NULL;
143 
144   /* default flags */
145   t->flags = SV_FLAGS_SAVE_HEADER | SV_FLAGS_QUOTED_FIELDS;
146 
147   t->status = SV_STATUS_OK;
148 
149   t->bad_records = 0;
150 
151   t->last_char = '\0';
152 
153   t->quote_char = '"';
154 
155   t->line_callback = NULL;
156 
157   return t;
158 }
159 
160 
161 static sv_status_t
sv_init_fields(sv * t)162 sv_init_fields(sv *t)
163 {
164   t->fields = (char**)malloc(sizeof(char*) * (t->fields_count+1));
165   if(!t->fields)
166     goto failed;
167 
168   t->fields_widths = (size_t*)malloc(sizeof(size_t) * (t->fields_count+1));
169   if(!t->fields_widths)
170     goto failed;
171 
172   t->headers = (char**)malloc(sizeof(char*) * (t->fields_count+1));
173   if(!t->headers)
174     goto failed;
175 
176   t->headers_widths = (size_t*)malloc(sizeof(size_t) * (t->fields_count+1));
177   if(!t->headers_widths)
178     goto failed;
179 
180   return SV_STATUS_OK;
181 
182 
183   failed:
184   if(t->fields) {
185     free(t->fields);
186     t->fields = NULL;
187   }
188 
189   if(t->fields_widths) {
190     free(t->fields_widths);
191     t->fields_widths = NULL;
192   }
193 
194   if(t->headers) {
195     free(t->headers);
196     t->headers = NULL;
197   }
198 
199   return SV_STATUS_NO_MEMORY;
200 }
201 
202 
203 /**
204  * sv_free:
205  * @t: SV object
206  *
207  * Destructor: destroy an SV object
208  *
209  */
210 void
sv_free(sv * t)211 sv_free(sv *t)
212 {
213   if(!t)
214     return;
215 
216   if(t->headers_widths)
217     free(t->headers_widths);
218   if(t->headers) {
219     unsigned int i;
220 
221     for(i = 0; i < t->fields_count; i++)
222       free(t->headers[i]);
223     free(t->headers);
224   }
225 
226 
227   if(t->fields_buffer)
228     free(t->fields_buffer);
229 
230   if(t->fields_widths)
231     free(t->fields_widths);
232   if(t->fields)
233     free(t->fields);
234   if(t->buffer)
235     free(t->buffer);
236 
237   free(t);
238 }
239 
240 
241 
242 /* Ensure fields buffer is big enough for len bytes total */
243 static sv_status_t
sv_ensure_fields_buffer_size(sv * t,size_t len)244 sv_ensure_fields_buffer_size(sv *t, size_t len)
245 {
246   char *nbuffer;
247   size_t nsize;
248 
249   if(len < t->fields_buffer_size)
250     return SV_STATUS_OK;
251 
252   nsize = len + 8;
253 
254 #if defined(SV_DEBUG) && SV_DEBUG > 1
255   fprintf(stderr, "%d: Growing buffer from %d to %d bytes\n",
256           t->line, (int)t->fields_buffer_size, (int)nsize);
257 #endif
258 
259   nbuffer = (char*)malloc(nsize + 1);
260   if(!nbuffer)
261     return SV_STATUS_NO_MEMORY;
262 
263   if(t->fields_buffer)
264     free(t->fields_buffer);
265 
266   t->fields_buffer = nbuffer;
267   t->fields_buffer_size = nsize;
268 
269   return SV_STATUS_OK;
270 }
271 
272 
273 
274 /* Ensure internal buffer is big enough for len more bytes */
275 static sv_status_t
sv_ensure_line_buffer_size(sv * t,size_t len)276 sv_ensure_line_buffer_size(sv *t, size_t len)
277 {
278   char *nbuffer;
279   size_t nsize;
280 
281   if(t->len + len < t->size)
282     return SV_STATUS_OK;
283 
284   nsize = (len + t->len) << 1;
285 
286   nbuffer = (char*)malloc(nsize + 1);
287   if(!nbuffer)
288     return SV_STATUS_NO_MEMORY;
289 
290   if(t->len)
291     memcpy(nbuffer, t->buffer, t->len);
292   nbuffer[t->len] = '\0';
293 
294   if(t->buffer)
295     free(t->buffer);
296 
297   t->buffer = nbuffer;
298   t->size = nsize;
299 
300   return SV_STATUS_OK;
301 }
302 
303 
304 /**
305  * sv_get_line:
306  * @t: sv object
307  *
308  * Get current SV line number
309  *
310  * Return value: line number or <0 on failure
311  */
312 int
sv_get_line(sv * t)313 sv_get_line(sv *t)
314 {
315   if(!t)
316     return -1;
317 
318   return t->line;
319 }
320 
321 
322 /**
323  * sv_get_header:
324  * @t: sv object
325  * @i: header index 0
326  * @width_p: pointer to store width (or NULL)
327  *
328  * Get an SV header with optional width
329  *
330  * Return value: shared pointer to header or NULL if out of range
331  */
332 const char*
sv_get_header(sv * t,unsigned int i,size_t * width_p)333 sv_get_header(sv *t, unsigned int i, size_t *width_p)
334 {
335   if(!t || !t->headers || i > t->fields_count)
336     return NULL;
337 
338   if(width_p)
339     *width_p = t->headers_widths[i];
340 
341   return (const char*)t->headers[i];
342 }
343 
344 
345 #if defined(SV_DEBUG) && SV_DEBUG > 1
/*
 * Debug helper: write a labelled preview of a buffer to fh.
 *
 * The output is the label, the full byte count, then at most the
 * first 100 bytes of buffer between ">>>" and "<<<".  "..." is added
 * when the buffer was longer than what was shown.
 */
static void
sv_dump_buffer(FILE* fh, const char* label, const char* buffer, size_t len)
{
  const size_t limit = 100;
  size_t shown = (len > limit) ? limit : len;

  fprintf(fh, "%s (%zu bytes) >>>", label, len);
  fwrite(buffer, 1, shown, fh);
  if(shown < len)
    fputs("...", fh);
  fputs("<<<\n", fh);
}
359 #endif
360 
361 
362 static sv_status_t
sv_parse_line(sv * t,char * line,size_t len,unsigned int * field_count_p)363 sv_parse_line(sv *t, char *line, size_t len,  unsigned int* field_count_p)
364 {
365   unsigned int column;
366   int field_width = 0;
367   int field_offset = 0;
368   char* current_field = NULL;
369   char* p = NULL;
370   char** fields = t->fields;
371   size_t* fields_widths = t->fields_widths;
372   sv_status_t status;
373   int field_is_quoted = 0;
374 
375 #if defined(SV_DEBUG) && SV_DEBUG > 1
376   if(fields)
377     sv_dump_buffer(stderr, "(sv_parse_line): Parsing line", line, len);
378 #endif
379 
380   status = sv_ensure_fields_buffer_size(t, len);
381   if(status)
382     return status;
383 
384   if(fields) {
385     current_field = t->fields_buffer;
386     p = current_field;
387 
388     if(!p)
389       return SV_STATUS_OK;
390   }
391 
392   for(column = 0; 1; column++) {
393     int c = -1;
394     int field_ended = 0;
395     int expect_sep = 0;
396 
397     if(column == len) {
398       field_ended = 1;
399       goto do_last;
400     }
401 
402     c = line[column];
403 
404     if(t->flags & SV_FLAGS_QUOTED_FIELDS) {
405       if(c == t->quote_char) {
406         if(!field_width && !field_is_quoted) {
407           field_is_quoted = 1;
408   #if defined(SV_DEBUG) && SV_DEBUG > 1
409           fprintf(stderr, "Field is quoted\n");
410   #endif
411           continue;
412         } else if(column < len && line[column+1] == t->quote_char) {
413   #if defined(SV_DEBUG) && SV_DEBUG > 1
414           fprintf(stderr, "Doubled quote %c absorbed\n", t->quote_char);
415   #endif
416           column++;
417           /* skip repeated quote - so it just replaces ""... with " */
418           goto skip;
419         } else if(column == len-1 || line[column+1] == t->field_sep) {
420   #if defined(SV_DEBUG) && SV_DEBUG > 1
421           fprintf(stderr, "Field ended on quote + sep\n");
422   #endif
423           field_ended = 1;
424           expect_sep = 1;
425           goto do_last;
426         }
427       }
428     }
429 
430     if(!field_is_quoted && c == t->field_sep) {
431 #if defined(SV_DEBUG) && SV_DEBUG > 1
432       fprintf(stderr, "Field ended on sep\n");
433 #endif
434       field_ended = 1;
435     }
436 
437     do_last:
438     if(field_ended) {
439       if(p)
440         *p++ = '\0';
441 
442       if(fields) {
443 
444         if(t->flags & SV_FLAGS_STRIP_WHITESPACE) {
445           /* Remove whitespace around a field */
446           while(field_width > 0 && isspace(current_field[0])) {
447             current_field++;
448             field_width--;
449           }
450 
451           while(field_width > 0 && isspace(current_field[field_width - 1]))
452             field_width--;
453 
454           current_field[field_width] = '\0';
455         }
456 
457         if(expect_sep)
458           column++;
459 
460       }
461 
462 #if defined(SV_DEBUG) && SV_DEBUG > 1
463       if(fields) {
464         fprintf(stderr, "  Field %d: %s (%d)\n", (int)field_offset, current_field, (int)field_width);
465       }
466 #endif
467       if(fields)
468         fields[field_offset] = current_field;
469       if(fields_widths)
470         fields_widths[field_offset] = field_width;
471 
472       /* end loop when out of columns */
473       if(column == len)
474         break;
475 
476       /* otherwise got a tab so reset for next field */
477       field_width = 0;
478       field_is_quoted = 0;
479 
480       field_offset++;
481       current_field = p;
482 
483       continue;
484     }
485 
486     skip:
487     if(fields)
488       *p++ = c;
489     field_width++;
490   }
491 
492 
493   if(field_count_p)
494     *field_count_p = field_offset + 1;
495 
496   return SV_STATUS_OK;
497 }
498 
499 
500 static sv_status_t
sv_parse_chunk_line(sv * t,size_t line_len,int has_nl)501 sv_parse_chunk_line(sv* t, size_t line_len, int has_nl)
502 {
503   size_t move_len = line_len;
504   sv_status_t status = SV_STATUS_OK;
505   unsigned int fields_count = 0;
506 
507   if(!line_len)
508     goto skip_line;
509 
510   if(t->line_callback) {
511     char c = t->buffer[line_len];
512 
513     t->buffer[line_len] = '\0';
514     status = t->line_callback(t, t->callback_user_data, t->buffer, line_len);
515     t->buffer[line_len] = c;
516     if(status != SV_STATUS_OK)
517       return status;
518   }
519 
520   if(!t->fields_count) {
521     /* First line in the file - calculate number of fields */
522     status = sv_parse_line(t, t->buffer, line_len, &t->fields_count);
523     if(status)
524       return status;
525 
526     /* initialise arrays of size t->fields_count */
527     status = sv_init_fields(t);
528     if(status)
529       return status;
530   }
531 
532   status = sv_parse_line(t, t->buffer, line_len, &fields_count);
533   if(status)
534     return status;
535 
536   if(fields_count != t->fields_count) {
537     t->bad_records++;
538     if(t->flags & SV_FLAGS_BAD_DATA_ERROR) {
539 #if defined(SV_DEBUG) && SV_DEBUG > 1
540       fprintf(stderr, "Error in line %d: saw %d fields expected %d\n",
541               t->line, fields_count, t->fields_count);
542 #endif
543       status = SV_STATUS_LINE_FIELDS;
544       return status;
545     }
546 #if defined(SV_DEBUG) && SV_DEBUG > 1
547     fprintf(stderr, "Ignoring line %d: saw %d fields expected %d\n",
548             t->line, fields_count, t->fields_count);
549 #endif
550     /* Otherwise skip the line */
551     goto skip_line;
552   }
553 
554   if(t->line == 1 && (t->flags & SV_FLAGS_SAVE_HEADER)) {
555     /* first line and header: turn fields into headers */
556     unsigned int i;
557 
558     for(i = 0; i < t->fields_count; i++) {
559       char *s = (char*)malloc(t->fields_widths[i]+1);
560       if(!s) {
561         status = SV_STATUS_NO_MEMORY;
562         break;
563       }
564       memcpy(s, t->fields[i], t->fields_widths[i]+1);
565       t->headers[i] = s;
566       t->headers_widths[i] = t->fields_widths[i];
567     }
568 
569     if(status == SV_STATUS_OK && t->header_callback) {
570       /* got header fields - return them to user */
571       status = t->header_callback(t, t->callback_user_data, t->headers,
572                                   t->headers_widths, t->fields_count);
573     }
574   } else {
575     /* data */
576 
577     if(t->data_callback) {
578       /* got data fields - return them to user */
579       status = t->data_callback(t, t->callback_user_data, t->fields,
580                                 t->fields_widths, t->fields_count);
581     }
582   }
583 
584   skip_line:
585 
586   if(has_nl)
587     move_len++;
588 
589   /* adjust buffer - remove 'line_len+1' bytes from start of buffer */
590   t->len -= move_len;
591 
592   /* this is an overlapping move */
593   memmove(t->buffer, &t->buffer[move_len], t->len);
594 
595   /* This is not needed: guaranteed above */
596   /* t->buffer[t->len] = '\0' */
597 
598   t->line++;
599 
600   return status;
601 }
602 
603 
604 /**
605  * sv_parse_chunk:
606  * @t: sv object
607  * @buffer: buffer to parse (or NULL)
608  * @len: length of @buffer (or 0)
609  *
610  * Parse a chunk of data
611  *
612  * Parsing ends if either @buffer is NULL or @len is 0
613  *
614  * Return value: #SV_STATUS_OK on success
615  */
616 sv_status_t
sv_parse_chunk(sv * t,char * buffer,size_t len)617 sv_parse_chunk(sv *t, char *buffer, size_t len)
618 {
619   size_t offset = 0;
620   sv_status_t status = SV_STATUS_OK;
621   /* End of input if either of these is NULL */
622   int is_end = (!buffer || !len);
623 
624   if(!is_end) {
625     /* add new data to existing buffer */
626     status = sv_ensure_line_buffer_size(t, len);
627     if(status)
628       return status;
629 
630     /* add new buffer */
631     memcpy(t->buffer + t->len, buffer, len);
632 
633     /* always ensure it is NUL terminated even if input chunk was not */
634     t->len += len;
635     t->buffer[t->len] = '\0';
636   }
637 
638   /* look for an end of line to do some work */
639   for(offset = 0; offset < t->len; offset++) {
640     char c = t->buffer[offset];
641 
642     /* skip \n when just seen \r - i.e. \r\n or CR LF */
643     if(t->last_char == '\r' && c == '\n') {
644 #if defined(SV_DEBUG) && SV_DEBUG > 1
645       fprintf(stderr, "Skipping a \\n after \\r\n");
646 #endif
647 
648       /* adjust buffer */
649       t->len -= 1;
650 
651       /* this is an overlapping move */
652       memmove(t->buffer, &t->buffer[1], t->len);
653 
654       t->last_char = '\0';
655       continue;
656     }
657 
658     if(c != '\r' && c != '\n')
659       continue;
660 
661     t->last_char = c;
662 
663 #if defined(SV_DEBUG) && SV_DEBUG > 1
664     sv_dump_buffer(stderr, "Starting buffer", t->buffer, t->len);
665 #endif
666 
667     /* found a line */
668     status = sv_parse_chunk_line(t, offset, 1);
669     if(status != SV_STATUS_OK)
670       break;
671 
672     offset = -1; /* so for loop starts at 0 */
673   }
674 
675   if(is_end && status == SV_STATUS_OK) {
676     /* If end of input and there is a non-empty buffer left, try to
677      * parse it all as the last line.  It will NOT contain newlines.
678      */
679     if(t->len)
680       status = sv_parse_chunk_line(t, t->len, 0);
681   }
682 
683   return status;
684 }
685 
686 
687 static sv_status_t
sv_set_option_vararg(sv * t,sv_option_t option,va_list arg)688 sv_set_option_vararg(sv* t, sv_option_t option, va_list arg)
689 {
690   sv_status_t status = SV_STATUS_OK;
691 
692   switch(option) {
693     case SV_OPTION_SAVE_HEADER:
694       t->flags &= ~SV_FLAGS_SAVE_HEADER;
695       if(va_arg(arg, long))
696         t->flags |= SV_FLAGS_SAVE_HEADER;
697       break;
698 
699     case SV_OPTION_BAD_DATA_ERROR:
700       t->flags &= ~SV_FLAGS_BAD_DATA_ERROR;
701       if(va_arg(arg, long))
702         t->flags |= SV_FLAGS_BAD_DATA_ERROR;
703       break;
704 
705     case SV_OPTION_QUOTED_FIELDS:
706       t->flags &= ~SV_FLAGS_QUOTED_FIELDS;
707       if(va_arg(arg, long))
708         t->flags |= SV_FLAGS_QUOTED_FIELDS;
709       break;
710 
711     case SV_OPTION_STRIP_WHITESPACE:
712       t->flags &= ~SV_FLAGS_STRIP_WHITESPACE;
713       if(va_arg(arg, long))
714         t->flags |= SV_FLAGS_STRIP_WHITESPACE;
715       break;
716 
717     case SV_OPTION_QUOTE_CHAR:
718       if(1) {
719         int c = va_arg(arg, int);
720         if(c != t->field_sep)
721           t->quote_char = c;
722       }
723       break;
724 
725     case SV_OPTION_LINE_CALLBACK:
726       if(1) {
727         sv_line_callback cb = (sv_line_callback)va_arg(arg, void*);
728         t->line_callback = cb;
729       }
730 
731     default:
732     case SV_OPTION_NONE:
733       status = SV_STATUS_FAILED;
734       break;
735 
736   }
737 
738   return status;
739 }
740 
741 
742 /**
743  * sv_set_option:
744  * @t: sv object
745  * @option: option name
746  *
747  * Set an option value.  The value varies in type dependent on the @option
748  *
749  * Return value: #SV_STATUS_FAILED if failed
750  */
751 sv_status_t
sv_set_option(sv * t,sv_option_t option,...)752 sv_set_option(sv *t, sv_option_t option, ...)
753 {
754   sv_status_t status;
755   va_list arg;
756 
757   va_start(arg, option);
758   status = sv_set_option_vararg(t, option, arg);
759   va_end(arg);
760 
761   return status;
762 }
763