1 /*
2 libcsv - parse and write csv data
3 Copyright (C) 2007  Robert Gamble
4 
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9 
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 Lesser General Public License for more details.
14 
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18 */
19 
/*
 * SIZE_MAX comes from <stdint.h> on C99 and later.  The standard macro is
 * spelled __STDC_VERSION__ (two leading underscores); it is tested with
 * defined() first so pre-C99 compilers that do not define it take the
 * fallback path cleanly.  The fallback is only installed when no header has
 * already supplied SIZE_MAX, which avoids a macro redefinition.
 */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#  include <stdint.h>
#endif
#ifndef SIZE_MAX
#  define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
#endif
25 
26 #include "csv.h"
27 
/* Version string; presumably the release this file belongs to (TODO confirm
   where it is consumed, it is not referenced in the code visible here). */
#define VERSION "2.0.0"

/* Values stored in the parser's pstate member; see the explanation below. */
#define ROW_NOT_BEGUN           0
#define FIELD_NOT_BEGUN         1
#define FIELD_BEGUN             2
#define FIELD_MIGHT_HAVE_ENDED  3

/*
  Explanation of states
  ROW_NOT_BEGUN    No field has been encountered yet for the current row
  FIELD_NOT_BEGUN  At least one field of the row has been seen, but we are
                   currently between fields
  FIELD_BEGUN      We are inside a field (quoted or unquoted)
  FIELD_MIGHT_HAVE_ENDED
                   We encountered a quote character inside a quoted field;
                   either the field has ended or the quote is literal, and
                   the following character(s) decide which
*/

/* Initial size in bytes of the entry buffer allocated by csv_init, and the
   amount by which csv_increase_buffer first tries to grow it. */
#define MEM_BLK_SIZE 128
46 
/*
  The three macros below are not self-contained: they read and write local
  variables of the function they are expanded in (quoted, spaces, entry_pos,
  pstate) and use that function's cb1, cb2 and data parameters.
*/

/*
  SUBMIT_FIELD(p): finish the current field.
  For an unquoted field the trailing `spaces` characters are dropped from the
  length first.  If cb1 is non-NULL it is called with the entry buffer, the
  field length and the user data pointer.  Afterwards the state becomes
  FIELD_NOT_BEGUN and entry_pos, quoted and spaces are reset to 0.
*/
#define SUBMIT_FIELD(p) \
  do { \
   if (!quoted) \
     entry_pos -= spaces; \
   if (cb1) \
     cb1(p->entry_buf, entry_pos, data); \
   pstate = FIELD_NOT_BEGUN; \
   entry_pos = quoted = spaces = 0; \
 } while (0)

/*
  SUBMIT_ROW(p, c): finish the current row.
  If cb2 is non-NULL it is called with c (the character that ended the row,
  or whatever value the caller passes) and the user data pointer.  Afterwards
  the state becomes ROW_NOT_BEGUN and entry_pos, quoted and spaces are reset
  to 0.  The p argument is not used by the expansion.
*/
#define SUBMIT_ROW(p, c) \
  do { \
    if (cb2) \
      cb2(c, data); \
    pstate = ROW_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/*
  SUBMIT_CHAR(p, c): append c to the entry buffer and advance entry_pos.
  No bounds check is done here; csv_parse makes sure there is room for one
  more character at the top of every loop iteration.
*/
#define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c))
66 
/*
 * Messages handed out by csv_strerror(), indexed by status code.
 * csv_strerror() uses the entry at index CSV_EINVALID for any code that is
 * out of range.
 */
static char *csv_errors[] = {
  "success",
  "error parsing data while strict checking enabled",
  "memory exhausted while increasing buffer size",
  "data size too large",
  "invalid status code"
};
72 
73 int
74 csv_error(struct csv_parser *p)
75 {
76   return p->status;
77 }
78 
79 char *
80 csv_strerror(int status)
81 {
82   if (status >= CSV_EINVALID || status < 0)
83     return csv_errors[CSV_EINVALID];
84   else
85     return csv_errors[status];
86 }
87 
88 int
89 csv_opts(struct csv_parser *p, unsigned char options)
90 {
91   if (p == NULL)
92     return -1;
93 
94   p->options = options;
95   return 0;
96 }
97 
98 int
99 csv_init(struct csv_parser **p, unsigned char options)
100 {
101   /* Initialize a csv_parser object returns 0 on success, -1 on error */
102   if (p == NULL)
103     return -1;
104 
105   if ((*p = malloc(sizeof(struct csv_parser))) == NULL)
106     return -1;
107 
108   if ( ((*p)->entry_buf = malloc(MEM_BLK_SIZE)) == NULL ) {
109     free(*p);
110     return -1;
111   }
112   (*p)->pstate = ROW_NOT_BEGUN;
113   (*p)->quoted = 0;
114   (*p)->spaces = 0;
115   (*p)->entry_pos = 0;
116   (*p)->entry_size = MEM_BLK_SIZE;
117   (*p)->status = 0;
118   (*p)->options = options;
119   (*p)->quote_char = CSV_QUOTE;
120   (*p)->delim_char = CSV_COMMA;
121   (*p)->is_space = NULL;
122   (*p)->is_term = NULL;
123 
124   return 0;
125 }
126 
127 void
128 csv_free(struct csv_parser *p)
129 {
130   /* Free the entry_buffer and the csv_parser object */
131   if (p == NULL)
132     return;
133 
134   if (p->entry_buf)
135     free(p->entry_buf);
136 
137   free(p);
138   return;
139 }
140 
141 int
142 csv_fini(struct csv_parser *p, void (*cb1)(char *, size_t, void *), void (*cb2)(char c, void *), void *data)
143 {
144   /* Finalize parsing.  Needed, for example, when file does not end in a newline */
145   int quoted = p->quoted;
146   int pstate = p->pstate; /* This is used by the macros, but the compiler thinks it is not used. */
147   (void)pstate; /* Avoid the "set but not used" compiler warning. */
148   size_t spaces = p->spaces;
149   size_t entry_pos = p->entry_pos;
150 
151   if (p == NULL)
152     return -1;
153 
154 
155   if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) {
156     p->status = CSV_EPARSE;
157     return -1;
158   }
159 
160   switch (p->pstate) {
161     case FIELD_MIGHT_HAVE_ENDED:
162       p->entry_pos -= p->spaces + 1;  /* get rid of spaces and original quote */
163     case FIELD_NOT_BEGUN:
164     case FIELD_BEGUN:
165       quoted = p->quoted, pstate = p->pstate;
166       spaces = p->spaces, entry_pos = p->entry_pos;
167       SUBMIT_FIELD(p);
168       SUBMIT_ROW(p, 0);
169     case ROW_NOT_BEGUN: /* Already ended properly */
170       ;
171   }
172 
173   p->spaces = p->quoted = p->entry_pos = p->status = 0;
174   p->pstate = ROW_NOT_BEGUN;
175 
176   return 0;
177 }
178 
179 void
180 csv_set_delim(struct csv_parser *p, char c)
181 {
182   if (p) p->delim_char = c;
183 }
184 
185 void
186 csv_set_quote(struct csv_parser *p, char c)
187 {
188   if (p) p->quote_char = c;
189 }
190 
191 char
192 csv_get_delim(struct csv_parser *p)
193 {
194   return p->delim_char;
195 }
196 
197 char
198 csv_get_quote(struct csv_parser *p)
199 {
200   return p->quote_char;
201 }
202 
203 void
204 csv_set_space_func(struct csv_parser *p, int (*f)(char))
205 {
206   if (p) p->is_space = f;
207 }
208 
209 void
210 csv_set_term_func(struct csv_parser *p, int (*f)(char))
211 {
212   if (p) p->is_term = f;
213 }
214 
215 static int
216 csv_increase_buffer(struct csv_parser *p)
217 {
218   size_t to_add = MEM_BLK_SIZE;
219   void *vp;
220   while ( p->entry_size >= SIZE_MAX - to_add )
221     to_add /= 2;
222   if (!to_add) {
223     p->status = CSV_ETOOBIG;
224     return -1;
225   }
226   while ((vp = realloc(p->entry_buf, p->entry_size + to_add)) == NULL) {
227     to_add /= 2;
228     if (!to_add) {
229       p->status = CSV_ENOMEM;
230       return -1;
231     }
232   }
233   p->entry_buf = vp;
234   p->entry_size += to_add;
235   return 0;
236 }
237 
/*
  Parse len bytes starting at s.

  Completed fields are reported through cb1(buffer, length, data) and
  completed rows through cb2(terminating_char, data); either callback may be
  NULL, in which case that event is simply not reported (the SUBMIT_* macros
  check for NULL).  The buffer handed to cb1 is the parser's own entry buffer
  and is reused for the next field.

  The parser state is loaded from p into locals on entry and written back
  before every return, so a field or row interrupted by the end of s is
  carried over to the next call.

  Returns the number of bytes processed.  A value smaller than len means
  parsing stopped early: either the entry buffer could not be enlarged
  (status set by csv_increase_buffer) or a CSV_STRICT violation was found
  (status CSV_EPARSE; the offending byte is not counted in the return value).

  p is dereferenced without a NULL check.
*/
size_t
csv_parse(struct csv_parser *p, const char *s, size_t len, void (*cb1)(char *, size_t, void *), void (*cb2)(char c, void *), void *data)
{
  char c;  /* The character we are currently processing */
  size_t pos = 0;  /* The number of characters we have processed in this call */
  char delim = p->delim_char;
  char quote = p->quote_char;
  int (*is_space)(char) = p->is_space;
  int (*is_term)(char) = p->is_term;
  /* Working copies of the parser state; the SUBMIT_* macros operate on these
     names, and they are stored back into p before every return. */
  int quoted = p->quoted;
  int pstate = p->pstate;
  size_t spaces = p->spaces;
  size_t entry_pos = p->entry_pos;

  while (pos < len) {
    /* Make sure there is room for one more character: each iteration stores
       at most one character via SUBMIT_CHAR */
    if (entry_pos == p->entry_size)
      if (csv_increase_buffer(p) != 0) {
        p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
        return pos;
      }

    c = s[pos++];
    switch (pstate) {
      case ROW_NOT_BEGUN:
      case FIELD_NOT_BEGUN:
        /* Between fields: whitespace before a field is skipped */
        if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */
          continue;
        } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
          if (pstate == FIELD_NOT_BEGUN) {
            /* A row that already has fields ends with an empty last field */
            SUBMIT_FIELD(p);
            SUBMIT_ROW(p, c);
          } else {  /* ROW_NOT_BEGUN */
            /* Don't submit empty rows by default */
            if (p->options & CSV_REPALL_NL) {
              SUBMIT_ROW(p, c);
            }
          }
          continue;
        } else if (c == delim) { /* Comma */
          /* Delimiter with nothing before it: submit an empty field */
          SUBMIT_FIELD(p);
          break;
        } else if (c == quote) { /* Quote */
          /* Opening quote: starts a quoted field and is not stored */
          pstate = FIELD_BEGUN;
          quoted = 1;
        } else {               /* Anything else */
          pstate = FIELD_BEGUN;
          quoted = 0;
          SUBMIT_CHAR(p, c);
        }
        break;
      case FIELD_BEGUN:
        if (c == quote) {         /* Quote */
          if (quoted) {
            /* Store the quote for now; FIELD_MIGHT_HAVE_ENDED decides whether
               it closes the field (and is dropped) or is literal */
            SUBMIT_CHAR(p, c);
            pstate = FIELD_MIGHT_HAVE_ENDED;
          } else {
            /* STRICT ERROR - double quote inside non-quoted field */
            if (p->options & CSV_STRICT) {
              p->status = CSV_EPARSE;
              p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
              return pos-1;
            }
            SUBMIT_CHAR(p, c);
            spaces = 0;
          }
        } else if (c == delim) {  /* Comma */
          if (quoted) {
            /* Delimiters inside a quoted field are data */
            SUBMIT_CHAR(p, c);
          } else {
            SUBMIT_FIELD(p);
          }
        } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) {  /* Carriage Return or Line Feed */
          if (!quoted) {
            SUBMIT_FIELD(p);
            SUBMIT_ROW(p, c);
          } else {
            /* Row terminators inside a quoted field are data */
            SUBMIT_CHAR(p, c);
          }
        } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
            /* Stored, but counted so SUBMIT_FIELD can drop it if it turns out
               to be trailing whitespace */
            SUBMIT_CHAR(p, c);
            spaces++;
        } else {  /* Anything else */
          SUBMIT_CHAR(p, c);
          spaces = 0;
        }
        break;
      case FIELD_MIGHT_HAVE_ENDED:
        /* This only happens when a quote character is encountered in a quoted field */
        if (c == delim) {  /* Comma */
          entry_pos -= spaces + 1;  /* get rid of spaces and original quote */
          SUBMIT_FIELD(p);
        } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) {  /* Carriage Return or Line Feed */
          entry_pos -= spaces + 1;  /* get rid of spaces and original quote */
          SUBMIT_FIELD(p);
          SUBMIT_ROW(p, c);
        } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) {  /* Space or Tab */
          /* Whitespace after the possible closing quote: keep it in case the
             quote turns out to be literal, and count it so it can be removed */
          SUBMIT_CHAR(p, c);
          spaces++;
        } else if (c == quote) {  /* Quote */
          if (spaces) {
            /* STRICT ERROR - unescaped double quote */
            if (p->options & CSV_STRICT) {
              p->status = CSV_EPARSE;
              p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
              return pos-1;
            }
            spaces = 0;
            SUBMIT_CHAR(p, c);
          } else {
            /* Two quotes in a row: the first one, already stored, is the
               literal quote; this second one is not stored */
            pstate = FIELD_BEGUN;
          }
        } else {  /* Anything else */
          /* STRICT ERROR - unescaped double quote */
          if (p->options & CSV_STRICT) {
            p->status = CSV_EPARSE;
            p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
            return pos-1;
          }
          /* Non-strict: the earlier quote is kept as data and the field goes on */
          pstate = FIELD_BEGUN;
          spaces = 0;
          SUBMIT_CHAR(p, c);
        }
        break;
     default:
       break;
    }
  }
  /* Save the state so the next csv_parse/csv_fini call can continue */
  p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
  return pos;
}
370 
/*
 * Write src (src_size bytes) to dest as a double-quoted CSV field: the field
 * is wrapped in '"' and every '"' inside it is doubled.
 *
 * At most dest_size bytes are stored; no NUL terminator is written.  A NULL
 * dest is treated as a zero-sized destination, so the call only measures.
 *
 * Returns the number of bytes the complete output needs (saturating at
 * SIZE_MAX), which may be larger than dest_size.  Returns 0 if src is NULL.
 */
size_t
csv_write (char *dest, size_t dest_size, const char *src, size_t src_size)
{
  size_t needed = 0;  /* bytes required so far; also the next output index */
  size_t i;

  if (src == NULL)
    return 0;

  if (dest == NULL)
    dest_size = 0;

  /* Opening quote */
  if (needed < dest_size)
    dest[needed] = '"';
  needed++;

  for (i = 0; i < src_size; i++) {
    const char ch = src[i];

    if (ch == '"') {
      /* Escape an embedded quote by emitting it twice */
      if (needed < dest_size)
        dest[needed] = '"';
      if (needed != SIZE_MAX)
        needed++;
    }

    if (needed < dest_size)
      dest[needed] = ch;
    if (needed != SIZE_MAX)
      needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    dest[needed] = '"';
  if (needed != SIZE_MAX)
    needed++;

  return needed;
}
405 
/*
 * Write src (src_size bytes) to fp as a double-quoted CSV field: the field
 * is wrapped in '"' and every '"' inside it is doubled.
 *
 * Returns 0 on success, and also when fp or src is NULL (nothing is written
 * in that case).  Returns EOF as soon as fputc reports an error.
 */
int
csv_fwrite (FILE *fp, const char *src, size_t src_size)
{
  size_t i;

  if (fp == NULL || src == NULL)
    return 0;

  /* Opening quote */
  if (fputc('"', fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    const char ch = src[i];

    /* Escape an embedded quote by emitting it twice */
    if (ch == '"' && fputc('"', fp) == EOF)
      return EOF;

    if (fputc(ch, fp) == EOF)
      return EOF;
  }

  /* Closing quote */
  return (fputc('"', fp) == EOF) ? EOF : 0;
}
432 
/*
 * Same as csv_write, but with a caller-chosen quote character: src is
 * wrapped in `quote` and every occurrence of `quote` inside it is doubled.
 *
 * At most dest_size bytes are stored; no NUL terminator is written.  A NULL
 * dest is treated as a zero-sized destination, so the call only measures.
 *
 * Returns the number of bytes the complete output needs (saturating at
 * SIZE_MAX), which may be larger than dest_size.  Returns 0 if src is NULL.
 */
size_t
csv_write2 (char *dest, size_t dest_size, const char *src, size_t src_size, char quote)
{
  size_t needed = 0;  /* bytes required so far; also the next output index */
  size_t i;

  if (src == NULL)
    return 0;

  if (dest == NULL)
    dest_size = 0;

  /* Opening quote */
  if (needed < dest_size)
    dest[needed] = quote;
  needed++;

  for (i = 0; i < src_size; i++) {
    const char ch = src[i];

    if (ch == quote) {
      /* Escape an embedded quote character by emitting it twice */
      if (needed < dest_size)
        dest[needed] = quote;
      if (needed != SIZE_MAX)
        needed++;
    }

    if (needed < dest_size)
      dest[needed] = ch;
    if (needed != SIZE_MAX)
      needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    dest[needed] = quote;
  if (needed != SIZE_MAX)
    needed++;

  return needed;
}
467 
/*
 * Same as csv_fwrite, but with a caller-chosen quote character: src is
 * wrapped in `quote` and every occurrence of `quote` inside it is doubled.
 *
 * Returns 0 on success, and also when fp or src is NULL (nothing is written
 * in that case).  Returns EOF as soon as fputc reports an error.
 */
int
csv_fwrite2 (FILE *fp, const char *src, size_t src_size, char quote)
{
  size_t i;

  if (fp == NULL || src == NULL)
    return 0;

  /* Opening quote */
  if (fputc(quote, fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    const char ch = src[i];

    /* Escape an embedded quote character by emitting it twice */
    if (ch == quote && fputc(quote, fp) == EOF)
      return EOF;

    if (fputc(ch, fp) == EOF)
      return EOF;
  }

  /* Closing quote */
  return (fputc(quote, fp) == EOF) ? EOF : 0;
}
494