1 /*
2 libcsv - parse and write csv data
3 Copyright (C) 2008  Robert Gamble
4 
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9 
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 Lesser General Public License for more details.
14 
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18 */
19 
20 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
21 #  include <stdint.h>
22 #else
23 #  define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
24 #endif
25 
26 #include "libcsv.h"
27 
/* Version identifier for this source */
#define VERSION "3.0.3"

/*
  Parser states (the values kept in the pstate member of struct csv_parser)
*/

/* There have not been any fields encountered for this row */
#define ROW_NOT_BEGUN           0

/* There have been fields but we are currently not in one */
#define FIELD_NOT_BEGUN         1

/* We are in a field */
#define FIELD_BEGUN             2

/* We encountered a quote character inside a quoted field: the field has
   either ended or the quote is literal */
#define FIELD_MIGHT_HAVE_ENDED  3

/* Default number of bytes by which the entry buffer is grown; csv_init
   stores this in blk_size */
#define MEM_BLK_SIZE 128
46 
/*
  SUBMIT_FIELD(p): finish the current field and pass it to the field callback.

  Besides the parser pointer p, the expansion uses these names, which must be
  in scope where the macro is used: quoted, spaces, entry_pos, pstate, cb1
  and data.

  - For a non-quoted field the trailing spaces counted in `spaces` are dropped.
  - With CSV_APPEND_NULL a '\0' is stored right after the field data.
  - With CSV_EMPTY_IS_NULL an empty, non-quoted field is reported as NULL.
  - Afterwards the state becomes FIELD_NOT_BEGUN and the per-field counters
    are cleared.

  The parameter is parenthesized everywhere so that any pointer expression
  (for example &parser) can be passed safely.
*/
#define SUBMIT_FIELD(p) \
  do { \
    if (!quoted) \
      entry_pos -= spaces; \
    if ((p)->options & CSV_APPEND_NULL) \
      (p)->entry_buf[entry_pos] = '\0'; \
    if (cb1 && ((p)->options & CSV_EMPTY_IS_NULL) && !quoted && entry_pos == 0) \
      cb1(NULL, entry_pos, data); \
    else if (cb1) \
      cb1((p)->entry_buf, entry_pos, data); \
    pstate = FIELD_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)
60 
/*
  SUBMIT_ROW(p, c): report the end of a row to the row callback (if one was
  given), passing c along, then return to the ROW_NOT_BEGUN state with the
  per-field counters cleared.  Uses cb2, data, pstate, entry_pos, quoted and
  spaces from the scope in which it is expanded; p itself is not used.
*/
#define SUBMIT_ROW(p, c) \
  do { \
    if (cb2 != NULL) \
      cb2((c), data); \
    entry_pos = 0; \
    quoted = 0; \
    spaces = 0; \
    pstate = ROW_NOT_BEGUN; \
  } while (0)
68 
/* SUBMIT_CHAR(p, c): store c at the current position of p's entry buffer and
   advance entry_pos (taken from the enclosing scope) by one */
#define SUBMIT_CHAR(p, c) (*((p)->entry_buf + entry_pos++) = (c))
70 
/* Messages handed out by csv_strerror(), indexed by status code */
static const char *csv_errors[] = {
  /* 0 */ "success",
  /* 1 */ "error parsing data while strict checking enabled",
  /* 2 */ "memory exhausted while increasing buffer size",
  /* 3 */ "data size too large",
  /* 4 */ "invalid status code"
};
76 
77 int
csv_error(struct csv_parser * p)78 csv_error(struct csv_parser *p)
79 {
80   /* Return the current status of the parser */
81   return p->status;
82 }
83 
84 const char *
csv_strerror(int status)85 csv_strerror(int status)
86 {
87   /* Return a textual description of status */
88   if (status >= CSV_EINVALID || status < 0)
89     return csv_errors[CSV_EINVALID];
90   else
91     return csv_errors[status];
92 }
93 
94 int
csv_get_opts(struct csv_parser * p)95 csv_get_opts(struct csv_parser *p)
96 {
97   /* Return the currently set options of parser */
98   if (p == NULL)
99     return -1;
100 
101   return p->options;
102 }
103 
104 int
csv_set_opts(struct csv_parser * p,unsigned char options)105 csv_set_opts(struct csv_parser *p, unsigned char options)
106 {
107   /* Set the options */
108   if (p == NULL)
109     return -1;
110 
111   p->options = options;
112   return 0;
113 }
114 
115 int
csv_init(struct csv_parser * p,unsigned char options)116 csv_init(struct csv_parser *p, unsigned char options)
117 {
118   /* Initialize a csv_parser object returns 0 on success, -1 on error */
119   if (p == NULL)
120     return -1;
121 
122   p->entry_buf = NULL;
123   p->pstate = ROW_NOT_BEGUN;
124   p->quoted = 0;
125   p->spaces = 0;
126   p->entry_pos = 0;
127   p->entry_size = 0;
128   p->status = 0;
129   p->options = options;
130   p->quote_char = CSV_QUOTE;
131   p->delim_char = CSV_COMMA;
132   p->is_space = NULL;
133   p->is_term = NULL;
134   p->blk_size = MEM_BLK_SIZE;
135   p->malloc_func = NULL;
136   p->realloc_func = realloc;
137   p->free_func = free;
138 
139   return 0;
140 }
141 
142 void
csv_free(struct csv_parser * p)143 csv_free(struct csv_parser *p)
144 {
145   /* Free the entry_buffer of csv_parser object */
146   if (p == NULL)
147     return;
148 
149   if (p->entry_buf)
150     p->free_func(p->entry_buf);
151 
152   p->entry_buf = NULL;
153   p->entry_size = 0;
154 
155   return;
156 }
157 
158 int
csv_fini(struct csv_parser * p,void (* cb1)(void *,size_t,void *),void (* cb2)(int c,void *),void * data)159 csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
160 {
161   /* Finalize parsing.  Needed, for example, when file does not end in a newline */
162   int quoted = p->quoted;
163   int pstate = p->pstate;
164   size_t spaces = p->spaces;
165   size_t entry_pos = p->entry_pos;
166 
167   if (p == NULL)
168     return -1;
169 
170 
171   if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) {
172     /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
173     p->status = CSV_EPARSE;
174     return -1;
175   }
176 
177   switch (p->pstate) {
178     case FIELD_MIGHT_HAVE_ENDED:
179       p->entry_pos -= p->spaces + 1;  /* get rid of spaces and original quote */
180       /* Fall-through */
181     case FIELD_NOT_BEGUN:
182     case FIELD_BEGUN:
183       quoted = p->quoted, pstate = p->pstate;
184       spaces = p->spaces, entry_pos = p->entry_pos;
185       SUBMIT_FIELD(p);
186       SUBMIT_ROW(p, -1);
187     case ROW_NOT_BEGUN: /* Already ended properly */
188       ;
189   }
190 
191   /* Reset parser */
192   p->spaces = p->quoted = p->entry_pos = p->status = 0;
193   p->pstate = ROW_NOT_BEGUN;
194 
195   return 0;
196 }
197 
198 void
csv_set_delim(struct csv_parser * p,unsigned char c)199 csv_set_delim(struct csv_parser *p, unsigned char c)
200 {
201   /* Set the delimiter */
202   if (p) p->delim_char = c;
203 }
204 
205 void
csv_set_quote(struct csv_parser * p,unsigned char c)206 csv_set_quote(struct csv_parser *p, unsigned char c)
207 {
208   /* Set the quote character */
209   if (p) p->quote_char = c;
210 }
211 
212 unsigned char
csv_get_delim(struct csv_parser * p)213 csv_get_delim(struct csv_parser *p)
214 {
215   /* Get the delimiter */
216   return p->delim_char;
217 }
218 
219 unsigned char
csv_get_quote(struct csv_parser * p)220 csv_get_quote(struct csv_parser *p)
221 {
222   /* Get the quote character */
223   return p->quote_char;
224 }
225 
226 void
csv_set_space_func(struct csv_parser * p,int (* f)(unsigned char))227 csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
228 {
229   /* Set the space function */
230   if (p) p->is_space = f;
231 }
232 
233 void
csv_set_term_func(struct csv_parser * p,int (* f)(unsigned char))234 csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
235 {
236   /* Set the term function */
237   if (p) p->is_term = f;
238 }
239 
240 void
csv_set_realloc_func(struct csv_parser * p,void * (* f)(void *,size_t))241 csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
242 {
243   /* Set the realloc function used to increase buffer size */
244   if (p && f) p->realloc_func = f;
245 }
246 
247 void
csv_set_free_func(struct csv_parser * p,void (* f)(void *))248 csv_set_free_func(struct csv_parser *p, void (*f)(void *))
249 {
250   /* Set the free function used to free the buffer */
251   if (p && f) p->free_func = f;
252 }
253 
254 void
csv_set_blk_size(struct csv_parser * p,size_t size)255 csv_set_blk_size(struct csv_parser *p, size_t size)
256 {
257   /* Set the block size used to increment buffer size */
258   if (p) p->blk_size = size;
259 }
260 
261 size_t
csv_get_buffer_size(struct csv_parser * p)262 csv_get_buffer_size(struct csv_parser *p)
263 {
264   /* Get the size of the entry buffer */
265   if (p)
266     return p->entry_size;
267   return 0;
268 }
269 
270 static int
csv_increase_buffer(struct csv_parser * p)271 csv_increase_buffer(struct csv_parser *p)
272 {
273   /* Increase the size of the entry buffer.  Attempt to increase size by
274    * p->blk_size, if this is larger than SIZE_MAX try to increase current
275    * buffer size to SIZE_MAX.  If allocation fails, try to allocate halve
276    * the size and try again until successful or increment size is zero.
277    */
278 
279   size_t to_add = p->blk_size;
280   void *vp;
281 
282   if ( p->entry_size >= SIZE_MAX - to_add )
283     to_add = SIZE_MAX - p->entry_size;
284 
285   if (!to_add) {
286     p->status = CSV_ETOOBIG;
287     return -1;
288   }
289 
290   while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) {
291     to_add /= 2;
292     if (!to_add) {
293       p->status = CSV_ENOMEM;
294       return -1;
295     }
296   }
297 
298   /* Update entry buffer pointer and entry_size if successful */
299   p->entry_buf = vp;
300   p->entry_size += to_add;
301   return 0;
302 }
303 
304 size_t
csv_parse(struct csv_parser * p,const void * s,size_t len,void (* cb1)(void *,size_t,void *),void (* cb2)(int c,void *),void * data)305 csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
306 {
307   unsigned const char *us = s;  /* Access input data as array of unsigned char */
308   unsigned char c;              /* The character we are currently processing */
309   size_t pos = 0;               /* The number of characters we have processed in this call */
310 
311   /* Store key fields into local variables for performance */
312   unsigned char delim = p->delim_char;
313   unsigned char quote = p->quote_char;
314   int (*is_space)(unsigned char) = p->is_space;
315   int (*is_term)(unsigned char) = p->is_term;
316   int quoted = p->quoted;
317   int pstate = p->pstate;
318   size_t spaces = p->spaces;
319   size_t entry_pos = p->entry_pos;
320 
321 
322   if (!p->entry_buf && pos < len) {
323     /* Buffer hasn't been allocated yet and len > 0 */
324     if (csv_increase_buffer(p) != 0) {
325       p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
326       return pos;
327     }
328   }
329 
330   while (pos < len) {
331     /* Check memory usage, increase buffer if necessary */
332     if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size) ) {
333       if (csv_increase_buffer(p) != 0) {
334         p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
335         return pos;
336       }
337     }
338 
339     c = us[pos++];
340 
341     switch (pstate) {
342       case ROW_NOT_BEGUN:
343       case FIELD_NOT_BEGUN:
344         if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c!=delim) { /* Space or Tab */
345           continue;
346         } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
347           if (pstate == FIELD_NOT_BEGUN) {
348             SUBMIT_FIELD(p);
349             SUBMIT_ROW(p, (unsigned char)c);
350           } else {  /* ROW_NOT_BEGUN */
351             /* Don't submit empty rows by default */
352             if (p->options & CSV_REPALL_NL) {
353               SUBMIT_ROW(p, (unsigned char)c);
354             }
355           }
356           continue;
357         } else if (c == delim) { /* Comma */
358           SUBMIT_FIELD(p);
359           break;
360         } else if (c == quote) { /* Quote */
361           pstate = FIELD_BEGUN;
362           quoted = 1;
363         } else {               /* Anything else */
364           pstate = FIELD_BEGUN;
365           quoted = 0;
366           SUBMIT_CHAR(p, c);
367         }
368         break;
369       case FIELD_BEGUN:
370         if (c == quote) {         /* Quote */
371           if (quoted) {
372             SUBMIT_CHAR(p, c);
373             pstate = FIELD_MIGHT_HAVE_ENDED;
374           } else {
375             /* STRICT ERROR - double quote inside non-quoted field */
376             if (p->options & CSV_STRICT) {
377               p->status = CSV_EPARSE;
378               p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
379               return pos-1;
380             }
381             SUBMIT_CHAR(p, c);
382             spaces = 0;
383           }
384         } else if (c == delim) {  /* Comma */
385           if (quoted) {
386             SUBMIT_CHAR(p, c);
387           } else {
388             SUBMIT_FIELD(p);
389           }
390         } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) {  /* Carriage Return or Line Feed */
391           if (!quoted) {
392             SUBMIT_FIELD(p);
393             SUBMIT_ROW(p, (unsigned char)c);
394           } else {
395             SUBMIT_CHAR(p, c);
396           }
397         } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
398             SUBMIT_CHAR(p, c);
399             spaces++;
400         } else {  /* Anything else */
401           SUBMIT_CHAR(p, c);
402           spaces = 0;
403         }
404         break;
405       case FIELD_MIGHT_HAVE_ENDED:
406         /* This only happens when a quote character is encountered in a quoted field */
407         if (c == delim) {  /* Comma */
408           entry_pos -= spaces + 1;  /* get rid of spaces and original quote */
409           SUBMIT_FIELD(p);
410         } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) {  /* Carriage Return or Line Feed */
411           entry_pos -= spaces + 1;  /* get rid of spaces and original quote */
412           SUBMIT_FIELD(p);
413           SUBMIT_ROW(p, (unsigned char)c);
414         } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) {  /* Space or Tab */
415           SUBMIT_CHAR(p, c);
416           spaces++;
417         } else if (c == quote) {  /* Quote */
418           if (spaces) {
419             /* STRICT ERROR - unescaped double quote */
420             if (p->options & CSV_STRICT) {
421               p->status = CSV_EPARSE;
422               p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
423               return pos-1;
424             }
425             spaces = 0;
426             SUBMIT_CHAR(p, c);
427           } else {
428             /* Two quotes in a row */
429             pstate = FIELD_BEGUN;
430           }
431         } else {  /* Anything else */
432           /* STRICT ERROR - unescaped double quote */
433           if (p->options & CSV_STRICT) {
434             p->status = CSV_EPARSE;
435             p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
436             return pos-1;
437           }
438           pstate = FIELD_BEGUN;
439           spaces = 0;
440           SUBMIT_CHAR(p, c);
441         }
442         break;
443      default:
444        break;
445     }
446   }
447   p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
448   return pos;
449 }
450 
size_t
csv_write(void *dest, size_t dest_size, const void *src, size_t src_size)
{
  /* Produce the quoted form of the src_size bytes at src: the data is
     wrapped in double quotes and every embedded double quote is doubled.
     At most dest_size bytes are stored in dest (nothing when dest is NULL);
     no terminating '\0' is added.

     Returns the number of bytes the complete result requires, capped at
     SIZE_MAX, or 0 when src is NULL. */
  const unsigned char *in = src;
  unsigned char *out = dest;
  size_t needed = 0;
  size_t i;

  if (src == NULL)
    return 0;

  if (dest == NULL)
    dest_size = 0;

  /* Opening quote */
  if (needed < dest_size)
    *out++ = '"';
  needed++;

  for (i = 0; i < src_size; i++) {
    const unsigned char ch = in[i];

    if (ch == '"') {
      /* An embedded quote is escaped by writing it twice */
      if (needed < dest_size)
        *out++ = '"';
      if (needed != SIZE_MAX)
        needed++;
    }

    if (needed < dest_size)
      *out++ = ch;
    if (needed != SIZE_MAX)
      needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    *out = '"';
  if (needed != SIZE_MAX)
    needed++;

  return needed;
}
487 
int
csv_fwrite(FILE *fp, const void *src, size_t src_size)
{
  /* Write the src_size bytes at src to fp as a quoted field: wrapped in
     double quotes, with every embedded double quote doubled.

     Returns 0 on success (and also when fp or src is NULL, in which case
     nothing is written), or EOF as soon as a write fails. */
  const unsigned char *cur = src;
  const unsigned char *end;

  if (fp == NULL || src == NULL)
    return 0;

  end = cur + src_size;

  if (fputc('"', fp) == EOF)
    return EOF;

  for (; cur != end; cur++) {
    /* An embedded quote is escaped by writing it twice */
    if (*cur == '"' && fputc('"', fp) == EOF)
      return EOF;

    if (fputc(*cur, fp) == EOF)
      return EOF;
  }

  return (fputc('"', fp) == EOF) ? EOF : 0;
}
516 
size_t
csv_write2(void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
{
  /* Produce the quoted form of the src_size bytes at src using the given
     quote character: the data is wrapped in quote characters and every
     embedded quote character is doubled.  At most dest_size bytes are stored
     in dest (nothing when dest is NULL); no terminating '\0' is added.

     Returns the number of bytes the complete result requires, capped at
     SIZE_MAX, or 0 when src is NULL. */
  const unsigned char *in = src;
  unsigned char *out = dest;
  size_t needed = 0;
  size_t i;

  if (src == NULL)
    return 0;

  if (dest == NULL)
    dest_size = 0;

  /* Opening quote */
  if (needed < dest_size)
    *out++ = quote;
  needed++;

  for (i = 0; i < src_size; i++) {
    const unsigned char ch = in[i];

    if (ch == quote) {
      /* An embedded quote character is escaped by writing it twice */
      if (needed < dest_size)
        *out++ = quote;
      if (needed != SIZE_MAX)
        needed++;
    }

    if (needed < dest_size)
      *out++ = ch;
    if (needed != SIZE_MAX)
      needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    *out = quote;
  if (needed != SIZE_MAX)
    needed++;

  return needed;
}
553 
int
csv_fwrite2(FILE *fp, const void *src, size_t src_size, unsigned char quote)
{
  /* Write the src_size bytes at src to fp as a quoted field using the given
     quote character: wrapped in quote characters, with every embedded quote
     character doubled.

     Returns 0 on success (and also when fp or src is NULL, in which case
     nothing is written), or EOF as soon as a write fails. */
  const unsigned char *cur = src;
  const unsigned char *end;

  if (fp == NULL || src == NULL)
    return 0;

  end = cur + src_size;

  if (fputc(quote, fp) == EOF)
    return EOF;

  for (; cur != end; cur++) {
    /* An embedded quote character is escaped by writing it twice */
    if (*cur == quote && fputc(quote, fp) == EOF)
      return EOF;

    if (fputc(*cur, fp) == EOF)
      return EOF;
  }

  return (fputc(quote, fp) == EOF) ? EOF : 0;
}
582