1 /*
2 libcsv - parse and write csv data
3 Copyright (C) 2008 Robert Gamble
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
20 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
21 # include <stdint.h>
22 #else
23 # define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
24 #endif
25
26 #include "libcsv.h"
27
/* Version string of this library source */
#define VERSION "3.0.3"

/* Values stored in the parser's pstate field (see the explanation below) */
#define ROW_NOT_BEGUN 0
#define FIELD_NOT_BEGUN 1
#define FIELD_BEGUN 2
#define FIELD_MIGHT_HAVE_ENDED 3

/*
  Explanation of states
  ROW_NOT_BEGUN           There have not been any fields encountered for this row
  FIELD_NOT_BEGUN         There have been fields but we are currently not in one
  FIELD_BEGUN             We are in a field
  FIELD_MIGHT_HAVE_ENDED  We encountered the quote character inside a quoted
                          field; either the field has ended or the quote is
                          literal (the next character decides which)
*/

/* Default number of bytes by which the entry buffer grows; csv_init()
   stores this in blk_size and csv_increase_buffer() uses it as the
   requested increment */
#define MEM_BLK_SIZE 128
46
/*
 * Helper macros for the parser routines.
 *
 * They deliberately operate on local variables of the expanding function,
 * which must therefore have these names in scope:
 *   quoted, spaces, entry_pos, pstate  - cached parser state
 *   cb1, cb2, data                     - field/row callbacks and user data
 *
 * Every use of a macro parameter is parenthesized so that any pointer
 * expression (for example &parser) may be passed for p.
 */

/*
 * SUBMIT_FIELD(p): finish the current field.
 *  - For an unquoted field, drop the trailing spaces counted in `spaces`.
 *  - If CSV_APPEND_NULL is set, store a '\0' right after the field data.
 *  - Invoke cb1 (if non-NULL) with the field; when CSV_EMPTY_IS_NULL is set
 *    and the field is unquoted and empty, cb1 receives NULL instead of the
 *    buffer pointer.
 *  - Move to FIELD_NOT_BEGUN and reset entry_pos, quoted and spaces.
 */
#define SUBMIT_FIELD(p) \
  do { \
    if (!quoted) \
      entry_pos -= spaces; \
    if ((p)->options & CSV_APPEND_NULL) \
      ((p)->entry_buf[entry_pos]) = '\0'; \
    if (cb1 && ((p)->options & CSV_EMPTY_IS_NULL) && !quoted && entry_pos == 0) \
      cb1(NULL, entry_pos, data); \
    else if (cb1) \
      cb1((p)->entry_buf, entry_pos, data); \
    pstate = FIELD_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/*
 * SUBMIT_ROW(p, c): finish the current row.
 * Invokes cb2 (if non-NULL) with c, then moves to ROW_NOT_BEGUN and resets
 * entry_pos, quoted and spaces.  The p argument is accepted for symmetry
 * with the other macros but is not used.
 */
#define SUBMIT_ROW(p, c) \
  do { \
    if (cb2) \
      cb2((c), data); \
    pstate = ROW_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/*
 * SUBMIT_CHAR(p, c): append c to the entry buffer and advance entry_pos.
 * No bounds check is done here; the caller must have ensured there is room.
 */
#define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c))
70
/* Descriptions of the parser status codes.  csv_strerror() indexes this
   table directly with the status value, so the order of entries matters;
   the last entry is the one returned for out-of-range codes. */
static const char *csv_errors[] = {
  "success",
  "error parsing data while strict checking enabled",
  "memory exhausted while increasing buffer size",
  "data size too large",
  "invalid status code"
};
76
77 int
csv_error(struct csv_parser * p)78 csv_error(struct csv_parser *p)
79 {
80 /* Return the current status of the parser */
81 return p->status;
82 }
83
84 const char *
csv_strerror(int status)85 csv_strerror(int status)
86 {
87 /* Return a textual description of status */
88 if (status >= CSV_EINVALID || status < 0)
89 return csv_errors[CSV_EINVALID];
90 else
91 return csv_errors[status];
92 }
93
94 int
csv_get_opts(struct csv_parser * p)95 csv_get_opts(struct csv_parser *p)
96 {
97 /* Return the currently set options of parser */
98 if (p == NULL)
99 return -1;
100
101 return p->options;
102 }
103
104 int
csv_set_opts(struct csv_parser * p,unsigned char options)105 csv_set_opts(struct csv_parser *p, unsigned char options)
106 {
107 /* Set the options */
108 if (p == NULL)
109 return -1;
110
111 p->options = options;
112 return 0;
113 }
114
115 int
csv_init(struct csv_parser * p,unsigned char options)116 csv_init(struct csv_parser *p, unsigned char options)
117 {
118 /* Initialize a csv_parser object returns 0 on success, -1 on error */
119 if (p == NULL)
120 return -1;
121
122 p->entry_buf = NULL;
123 p->pstate = ROW_NOT_BEGUN;
124 p->quoted = 0;
125 p->spaces = 0;
126 p->entry_pos = 0;
127 p->entry_size = 0;
128 p->status = 0;
129 p->options = options;
130 p->quote_char = CSV_QUOTE;
131 p->delim_char = CSV_COMMA;
132 p->is_space = NULL;
133 p->is_term = NULL;
134 p->blk_size = MEM_BLK_SIZE;
135 p->malloc_func = NULL;
136 p->realloc_func = realloc;
137 p->free_func = free;
138
139 return 0;
140 }
141
142 void
csv_free(struct csv_parser * p)143 csv_free(struct csv_parser *p)
144 {
145 /* Free the entry_buffer of csv_parser object */
146 if (p == NULL)
147 return;
148
149 if (p->entry_buf)
150 p->free_func(p->entry_buf);
151
152 p->entry_buf = NULL;
153 p->entry_size = 0;
154
155 return;
156 }
157
158 int
csv_fini(struct csv_parser * p,void (* cb1)(void *,size_t,void *),void (* cb2)(int c,void *),void * data)159 csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
160 {
161 /* Finalize parsing. Needed, for example, when file does not end in a newline */
162 int quoted = p->quoted;
163 int pstate = p->pstate;
164 size_t spaces = p->spaces;
165 size_t entry_pos = p->entry_pos;
166
167 if (p == NULL)
168 return -1;
169
170
171 if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) {
172 /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
173 p->status = CSV_EPARSE;
174 return -1;
175 }
176
177 switch (p->pstate) {
178 case FIELD_MIGHT_HAVE_ENDED:
179 p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */
180 /* Fall-through */
181 case FIELD_NOT_BEGUN:
182 case FIELD_BEGUN:
183 quoted = p->quoted, pstate = p->pstate;
184 spaces = p->spaces, entry_pos = p->entry_pos;
185 SUBMIT_FIELD(p);
186 SUBMIT_ROW(p, -1);
187 case ROW_NOT_BEGUN: /* Already ended properly */
188 ;
189 }
190
191 /* Reset parser */
192 p->spaces = p->quoted = p->entry_pos = p->status = 0;
193 p->pstate = ROW_NOT_BEGUN;
194
195 return 0;
196 }
197
198 void
csv_set_delim(struct csv_parser * p,unsigned char c)199 csv_set_delim(struct csv_parser *p, unsigned char c)
200 {
201 /* Set the delimiter */
202 if (p) p->delim_char = c;
203 }
204
205 void
csv_set_quote(struct csv_parser * p,unsigned char c)206 csv_set_quote(struct csv_parser *p, unsigned char c)
207 {
208 /* Set the quote character */
209 if (p) p->quote_char = c;
210 }
211
212 unsigned char
csv_get_delim(struct csv_parser * p)213 csv_get_delim(struct csv_parser *p)
214 {
215 /* Get the delimiter */
216 return p->delim_char;
217 }
218
219 unsigned char
csv_get_quote(struct csv_parser * p)220 csv_get_quote(struct csv_parser *p)
221 {
222 /* Get the quote character */
223 return p->quote_char;
224 }
225
226 void
csv_set_space_func(struct csv_parser * p,int (* f)(unsigned char))227 csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
228 {
229 /* Set the space function */
230 if (p) p->is_space = f;
231 }
232
233 void
csv_set_term_func(struct csv_parser * p,int (* f)(unsigned char))234 csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
235 {
236 /* Set the term function */
237 if (p) p->is_term = f;
238 }
239
240 void
csv_set_realloc_func(struct csv_parser * p,void * (* f)(void *,size_t))241 csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
242 {
243 /* Set the realloc function used to increase buffer size */
244 if (p && f) p->realloc_func = f;
245 }
246
247 void
csv_set_free_func(struct csv_parser * p,void (* f)(void *))248 csv_set_free_func(struct csv_parser *p, void (*f)(void *))
249 {
250 /* Set the free function used to free the buffer */
251 if (p && f) p->free_func = f;
252 }
253
254 void
csv_set_blk_size(struct csv_parser * p,size_t size)255 csv_set_blk_size(struct csv_parser *p, size_t size)
256 {
257 /* Set the block size used to increment buffer size */
258 if (p) p->blk_size = size;
259 }
260
261 size_t
csv_get_buffer_size(struct csv_parser * p)262 csv_get_buffer_size(struct csv_parser *p)
263 {
264 /* Get the size of the entry buffer */
265 if (p)
266 return p->entry_size;
267 return 0;
268 }
269
270 static int
csv_increase_buffer(struct csv_parser * p)271 csv_increase_buffer(struct csv_parser *p)
272 {
273 /* Increase the size of the entry buffer. Attempt to increase size by
274 * p->blk_size, if this is larger than SIZE_MAX try to increase current
275 * buffer size to SIZE_MAX. If allocation fails, try to allocate halve
276 * the size and try again until successful or increment size is zero.
277 */
278
279 size_t to_add = p->blk_size;
280 void *vp;
281
282 if ( p->entry_size >= SIZE_MAX - to_add )
283 to_add = SIZE_MAX - p->entry_size;
284
285 if (!to_add) {
286 p->status = CSV_ETOOBIG;
287 return -1;
288 }
289
290 while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) {
291 to_add /= 2;
292 if (!to_add) {
293 p->status = CSV_ENOMEM;
294 return -1;
295 }
296 }
297
298 /* Update entry buffer pointer and entry_size if successful */
299 p->entry_buf = vp;
300 p->entry_size += to_add;
301 return 0;
302 }
303
304 size_t
csv_parse(struct csv_parser * p,const void * s,size_t len,void (* cb1)(void *,size_t,void *),void (* cb2)(int c,void *),void * data)305 csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
306 {
307 unsigned const char *us = s; /* Access input data as array of unsigned char */
308 unsigned char c; /* The character we are currently processing */
309 size_t pos = 0; /* The number of characters we have processed in this call */
310
311 /* Store key fields into local variables for performance */
312 unsigned char delim = p->delim_char;
313 unsigned char quote = p->quote_char;
314 int (*is_space)(unsigned char) = p->is_space;
315 int (*is_term)(unsigned char) = p->is_term;
316 int quoted = p->quoted;
317 int pstate = p->pstate;
318 size_t spaces = p->spaces;
319 size_t entry_pos = p->entry_pos;
320
321
322 if (!p->entry_buf && pos < len) {
323 /* Buffer hasn't been allocated yet and len > 0 */
324 if (csv_increase_buffer(p) != 0) {
325 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
326 return pos;
327 }
328 }
329
330 while (pos < len) {
331 /* Check memory usage, increase buffer if necessary */
332 if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size) ) {
333 if (csv_increase_buffer(p) != 0) {
334 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
335 return pos;
336 }
337 }
338
339 c = us[pos++];
340
341 switch (pstate) {
342 case ROW_NOT_BEGUN:
343 case FIELD_NOT_BEGUN:
344 if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c!=delim) { /* Space or Tab */
345 continue;
346 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
347 if (pstate == FIELD_NOT_BEGUN) {
348 SUBMIT_FIELD(p);
349 SUBMIT_ROW(p, (unsigned char)c);
350 } else { /* ROW_NOT_BEGUN */
351 /* Don't submit empty rows by default */
352 if (p->options & CSV_REPALL_NL) {
353 SUBMIT_ROW(p, (unsigned char)c);
354 }
355 }
356 continue;
357 } else if (c == delim) { /* Comma */
358 SUBMIT_FIELD(p);
359 break;
360 } else if (c == quote) { /* Quote */
361 pstate = FIELD_BEGUN;
362 quoted = 1;
363 } else { /* Anything else */
364 pstate = FIELD_BEGUN;
365 quoted = 0;
366 SUBMIT_CHAR(p, c);
367 }
368 break;
369 case FIELD_BEGUN:
370 if (c == quote) { /* Quote */
371 if (quoted) {
372 SUBMIT_CHAR(p, c);
373 pstate = FIELD_MIGHT_HAVE_ENDED;
374 } else {
375 /* STRICT ERROR - double quote inside non-quoted field */
376 if (p->options & CSV_STRICT) {
377 p->status = CSV_EPARSE;
378 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
379 return pos-1;
380 }
381 SUBMIT_CHAR(p, c);
382 spaces = 0;
383 }
384 } else if (c == delim) { /* Comma */
385 if (quoted) {
386 SUBMIT_CHAR(p, c);
387 } else {
388 SUBMIT_FIELD(p);
389 }
390 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
391 if (!quoted) {
392 SUBMIT_FIELD(p);
393 SUBMIT_ROW(p, (unsigned char)c);
394 } else {
395 SUBMIT_CHAR(p, c);
396 }
397 } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
398 SUBMIT_CHAR(p, c);
399 spaces++;
400 } else { /* Anything else */
401 SUBMIT_CHAR(p, c);
402 spaces = 0;
403 }
404 break;
405 case FIELD_MIGHT_HAVE_ENDED:
406 /* This only happens when a quote character is encountered in a quoted field */
407 if (c == delim) { /* Comma */
408 entry_pos -= spaces + 1; /* get rid of spaces and original quote */
409 SUBMIT_FIELD(p);
410 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
411 entry_pos -= spaces + 1; /* get rid of spaces and original quote */
412 SUBMIT_FIELD(p);
413 SUBMIT_ROW(p, (unsigned char)c);
414 } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */
415 SUBMIT_CHAR(p, c);
416 spaces++;
417 } else if (c == quote) { /* Quote */
418 if (spaces) {
419 /* STRICT ERROR - unescaped double quote */
420 if (p->options & CSV_STRICT) {
421 p->status = CSV_EPARSE;
422 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
423 return pos-1;
424 }
425 spaces = 0;
426 SUBMIT_CHAR(p, c);
427 } else {
428 /* Two quotes in a row */
429 pstate = FIELD_BEGUN;
430 }
431 } else { /* Anything else */
432 /* STRICT ERROR - unescaped double quote */
433 if (p->options & CSV_STRICT) {
434 p->status = CSV_EPARSE;
435 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
436 return pos-1;
437 }
438 pstate = FIELD_BEGUN;
439 spaces = 0;
440 SUBMIT_CHAR(p, c);
441 }
442 break;
443 default:
444 break;
445 }
446 }
447 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
448 return pos;
449 }
450
451 size_t
csv_write(void * dest,size_t dest_size,const void * src,size_t src_size)452 csv_write (void *dest, size_t dest_size, const void *src, size_t src_size)
453 {
454 unsigned char *cdest = dest;
455 const unsigned char *csrc = src;
456 size_t chars = 0;
457
458 if (src == NULL)
459 return 0;
460
461 if (cdest == NULL)
462 dest_size = 0;
463
464 if (dest_size > 0)
465 *cdest++ = '"';
466 chars++;
467
468 while (src_size) {
469 if (*csrc == '"') {
470 if (dest_size > chars)
471 *cdest++ = '"';
472 if (chars < SIZE_MAX) chars++;
473 }
474 if (dest_size > chars)
475 *cdest++ = *csrc;
476 if (chars < SIZE_MAX) chars++;
477 src_size--;
478 csrc++;
479 }
480
481 if (dest_size > chars)
482 *cdest = '"';
483 if (chars < SIZE_MAX) chars++;
484
485 return chars;
486 }
487
488 int
csv_fwrite(FILE * fp,const void * src,size_t src_size)489 csv_fwrite (FILE *fp, const void *src, size_t src_size)
490 {
491 const unsigned char *csrc = src;
492
493 if (fp == NULL || src == NULL)
494 return 0;
495
496 if (fputc('"', fp) == EOF)
497 return EOF;
498
499 while (src_size) {
500 if (*csrc == '"') {
501 if (fputc('"', fp) == EOF)
502 return EOF;
503 }
504 if (fputc(*csrc, fp) == EOF)
505 return EOF;
506 src_size--;
507 csrc++;
508 }
509
510 if (fputc('"', fp) == EOF) {
511 return EOF;
512 }
513
514 return 0;
515 }
516
517 size_t
csv_write2(void * dest,size_t dest_size,const void * src,size_t src_size,unsigned char quote)518 csv_write2 (void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
519 {
520 unsigned char *cdest = dest;
521 const unsigned char *csrc = src;
522 size_t chars = 0;
523
524 if (src == NULL)
525 return 0;
526
527 if (dest == NULL)
528 dest_size = 0;
529
530 if (dest_size > 0)
531 *cdest++ = quote;
532 chars++;
533
534 while (src_size) {
535 if (*csrc == quote) {
536 if (dest_size > chars)
537 *cdest++ = quote;
538 if (chars < SIZE_MAX) chars++;
539 }
540 if (dest_size > chars)
541 *cdest++ = *csrc;
542 if (chars < SIZE_MAX) chars++;
543 src_size--;
544 csrc++;
545 }
546
547 if (dest_size > chars)
548 *cdest = quote;
549 if (chars < SIZE_MAX) chars++;
550
551 return chars;
552 }
553
554 int
csv_fwrite2(FILE * fp,const void * src,size_t src_size,unsigned char quote)555 csv_fwrite2 (FILE *fp, const void *src, size_t src_size, unsigned char quote)
556 {
557 const unsigned char *csrc = src;
558
559 if (fp == NULL || src == NULL)
560 return 0;
561
562 if (fputc(quote, fp) == EOF)
563 return EOF;
564
565 while (src_size) {
566 if (*csrc == quote) {
567 if (fputc(quote, fp) == EOF)
568 return EOF;
569 }
570 if (fputc(*csrc, fp) == EOF)
571 return EOF;
572 src_size--;
573 csrc++;
574 }
575
576 if (fputc(quote, fp) == EOF) {
577 return EOF;
578 }
579
580 return 0;
581 }
582