1 /* 2 libcsv - parse and write csv data 3 Copyright (C) 2007 Robert Gamble 4 5 This library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 This library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with this library; if not, write to the Free Software 17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 18 */ 19 20 #if ___STDC_VERSION__ >= 199901L 21 # include <stdint.h> 22 #else 23 # define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */ 24 #endif 25 26 #include "csv.h" 27 28 #define VERSION "2.0.0" 29 30 #define ROW_NOT_BEGUN 0 31 #define FIELD_NOT_BEGUN 1 32 #define FIELD_BEGUN 2 33 #define FIELD_MIGHT_HAVE_ENDED 3 34 35 /* 36 Explanation of states 37 ROW_NOT_BEGUN There have not been any fields encountered for this row 38 FIELD_NOT_BEGUN There have been fields but we are currently not in one 39 FIELD_BEGUN We are in a field 40 FIELD_MIGHT_HAVE_ENDED 41 We encountered a double quote inside a quoted field, the 42 field is either ended or the quote is literal 43 */ 44 45 #define MEM_BLK_SIZE 128 46 47 #define SUBMIT_FIELD(p) \ 48 do { \ 49 if (!quoted) \ 50 entry_pos -= spaces; \ 51 if (cb1) \ 52 cb1(p->entry_buf, entry_pos, data); \ 53 pstate = FIELD_NOT_BEGUN; \ 54 entry_pos = quoted = spaces = 0; \ 55 } while (0) 56 57 #define SUBMIT_ROW(p, c) \ 58 do { \ 59 if (cb2) \ 60 cb2(c, data); \ 61 pstate = ROW_NOT_BEGUN; \ 62 entry_pos = quoted = spaces = 0; \ 63 } while (0) 64 65 #define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c)) 66 67 static char *csv_errors[] = {"success", 68 "error parsing data while strict checking enabled", 69 "memory exhausted while increasing buffer size", 70 "data size too large", 71 "invalid status code"}; 72 73 int 74 csv_error(struct csv_parser *p) 75 { 76 return p->status; 77 } 78 79 char * 80 csv_strerror(int status) 81 { 82 if (status >= CSV_EINVALID || status < 0) 83 return csv_errors[CSV_EINVALID]; 84 else 85 return csv_errors[status]; 86 } 87 88 int 89 csv_opts(struct csv_parser *p, unsigned char options) 90 { 91 if (p == NULL) 92 return -1; 93 94 p->options = options; 95 return 0; 96 } 97 98 int 99 csv_init(struct csv_parser **p, unsigned char options) 100 { 101 /* Initialize a csv_parser object returns 0 on success, -1 on error */ 102 if (p == NULL) 103 return -1; 104 105 if ((*p = malloc(sizeof(struct csv_parser))) == NULL) 106 return -1; 107 108 if ( ((*p)->entry_buf = malloc(MEM_BLK_SIZE)) == NULL ) { 109 free(*p); 110 return -1; 111 } 112 (*p)->pstate = ROW_NOT_BEGUN; 113 (*p)->quoted = 0; 114 (*p)->spaces = 0; 115 (*p)->entry_pos = 0; 116 (*p)->entry_size = MEM_BLK_SIZE; 117 (*p)->status = 0; 118 (*p)->options = options; 119 (*p)->quote_char = CSV_QUOTE; 120 (*p)->delim_char = CSV_COMMA; 121 (*p)->is_space = NULL; 122 (*p)->is_term = NULL; 123 124 return 0; 125 } 126 127 void 128 csv_free(struct csv_parser *p) 129 { 130 /* Free the entry_buffer and the csv_parser object */ 131 if (p == NULL) 132 return; 133 134 if (p->entry_buf) 135 free(p->entry_buf); 136 137 free(p); 138 return; 139 } 140 141 int 142 csv_fini(struct csv_parser *p, void (*cb1)(char *, size_t, void *), void (*cb2)(char c, void *), void *data) 143 { 144 /* Finalize parsing. Needed, for example, when file does not end in a newline */ 145 int quoted = p->quoted; 146 int pstate = p->pstate; /* This is used by the macros, but the compiler thinks it is not used. */ 147 (void)pstate; /* Avoid the "set but not used" compiler warning. */ 148 size_t spaces = p->spaces; 149 size_t entry_pos = p->entry_pos; 150 151 if (p == NULL) 152 return -1; 153 154 155 if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) { 156 p->status = CSV_EPARSE; 157 return -1; 158 } 159 160 switch (p->pstate) { 161 case FIELD_MIGHT_HAVE_ENDED: 162 p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */ 163 case FIELD_NOT_BEGUN: 164 case FIELD_BEGUN: 165 quoted = p->quoted, pstate = p->pstate; 166 spaces = p->spaces, entry_pos = p->entry_pos; 167 SUBMIT_FIELD(p); 168 SUBMIT_ROW(p, 0); 169 case ROW_NOT_BEGUN: /* Already ended properly */ 170 ; 171 } 172 173 p->spaces = p->quoted = p->entry_pos = p->status = 0; 174 p->pstate = ROW_NOT_BEGUN; 175 176 return 0; 177 } 178 179 void 180 csv_set_delim(struct csv_parser *p, char c) 181 { 182 if (p) p->delim_char = c; 183 } 184 185 void 186 csv_set_quote(struct csv_parser *p, char c) 187 { 188 if (p) p->quote_char = c; 189 } 190 191 char 192 csv_get_delim(struct csv_parser *p) 193 { 194 return p->delim_char; 195 } 196 197 char 198 csv_get_quote(struct csv_parser *p) 199 { 200 return p->quote_char; 201 } 202 203 void 204 csv_set_space_func(struct csv_parser *p, int (*f)(char)) 205 { 206 if (p) p->is_space = f; 207 } 208 209 void 210 csv_set_term_func(struct csv_parser *p, int (*f)(char)) 211 { 212 if (p) p->is_term = f; 213 } 214 215 static int 216 csv_increase_buffer(struct csv_parser *p) 217 { 218 size_t to_add = MEM_BLK_SIZE; 219 void *vp; 220 while ( p->entry_size >= SIZE_MAX - to_add ) 221 to_add /= 2; 222 if (!to_add) { 223 p->status = CSV_ETOOBIG; 224 return -1; 225 } 226 while ((vp = realloc(p->entry_buf, p->entry_size + to_add)) == NULL) { 227 to_add /= 2; 228 if (!to_add) { 229 p->status = CSV_ENOMEM; 230 return -1; 231 } 232 } 233 p->entry_buf = vp; 234 p->entry_size += to_add; 235 return 0; 236 } 237 238 size_t 239 csv_parse(struct csv_parser *p, const char *s, size_t len, void (*cb1)(char *, size_t, void *), void (*cb2)(char c, void *), void *data) 240 { 241 char c; /* The character we are currently processing */ 242 size_t pos = 0; /* The number of characters we have processed in this call */ 243 char delim = p->delim_char; 244 char quote = p->quote_char; 245 int (*is_space)(char) = p->is_space; 246 int (*is_term)(char) = p->is_term; 247 int quoted = p->quoted; 248 int pstate = p->pstate; 249 size_t spaces = p->spaces; 250 size_t entry_pos = p->entry_pos; 251 252 while (pos < len) { 253 /* Check memory usage */ 254 if (entry_pos == p->entry_size) 255 if (csv_increase_buffer(p) != 0) { 256 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 257 return pos; 258 } 259 260 c = s[pos++]; 261 switch (pstate) { 262 case ROW_NOT_BEGUN: 263 case FIELD_NOT_BEGUN: 264 if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */ 265 continue; 266 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */ 267 if (pstate == FIELD_NOT_BEGUN) { 268 SUBMIT_FIELD(p); 269 SUBMIT_ROW(p, c); 270 } else { /* ROW_NOT_BEGUN */ 271 /* Don't submit empty rows by default */ 272 if (p->options & CSV_REPALL_NL) { 273 SUBMIT_ROW(p, c); 274 } 275 } 276 continue; 277 } else if (c == delim) { /* Comma */ 278 SUBMIT_FIELD(p); 279 break; 280 } else if (c == quote) { /* Quote */ 281 pstate = FIELD_BEGUN; 282 quoted = 1; 283 } else { /* Anything else */ 284 pstate = FIELD_BEGUN; 285 quoted = 0; 286 SUBMIT_CHAR(p, c); 287 } 288 break; 289 case FIELD_BEGUN: 290 if (c == quote) { /* Quote */ 291 if (quoted) { 292 SUBMIT_CHAR(p, c); 293 pstate = FIELD_MIGHT_HAVE_ENDED; 294 } else { 295 /* STRICT ERROR - double quote inside non-quoted field */ 296 if (p->options & CSV_STRICT) { 297 p->status = CSV_EPARSE; 298 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 299 return pos-1; 300 } 301 SUBMIT_CHAR(p, c); 302 spaces = 0; 303 } 304 } else if (c == delim) { /* Comma */ 305 if (quoted) { 306 SUBMIT_CHAR(p, c); 307 } else { 308 SUBMIT_FIELD(p); 309 } 310 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */ 311 if (!quoted) { 312 SUBMIT_FIELD(p); 313 SUBMIT_ROW(p, c); 314 } else { 315 SUBMIT_CHAR(p, c); 316 } 317 } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */ 318 SUBMIT_CHAR(p, c); 319 spaces++; 320 } else { /* Anything else */ 321 SUBMIT_CHAR(p, c); 322 spaces = 0; 323 } 324 break; 325 case FIELD_MIGHT_HAVE_ENDED: 326 /* This only happens when a quote character is encountered in a quoted field */ 327 if (c == delim) { /* Comma */ 328 entry_pos -= spaces + 1; /* get rid of spaces and original quote */ 329 SUBMIT_FIELD(p); 330 } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */ 331 entry_pos -= spaces + 1; /* get rid of spaces and original quote */ 332 SUBMIT_FIELD(p); 333 SUBMIT_ROW(p, c); 334 } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */ 335 SUBMIT_CHAR(p, c); 336 spaces++; 337 } else if (c == quote) { /* Quote */ 338 if (spaces) { 339 /* STRICT ERROR - unescaped double quote */ 340 if (p->options & CSV_STRICT) { 341 p->status = CSV_EPARSE; 342 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 343 return pos-1; 344 } 345 spaces = 0; 346 SUBMIT_CHAR(p, c); 347 } else { 348 /* Two quotes in a row */ 349 pstate = FIELD_BEGUN; 350 } 351 } else { /* Anything else */ 352 /* STRICT ERROR - unescaped double quote */ 353 if (p->options & CSV_STRICT) { 354 p->status = CSV_EPARSE; 355 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 356 return pos-1; 357 } 358 pstate = FIELD_BEGUN; 359 spaces = 0; 360 SUBMIT_CHAR(p, c); 361 } 362 break; 363 default: 364 break; 365 } 366 } 367 p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos; 368 return pos; 369 } 370 371 size_t 372 csv_write (char *dest, size_t dest_size, const char *src, size_t src_size) 373 { 374 size_t chars = 0; 375 376 if (src == NULL) 377 return 0; 378 379 if (dest == NULL) 380 dest_size = 0; 381 382 if (dest_size > 0) 383 *dest++ = '"'; 384 chars++; 385 386 while (src_size) { 387 if (*src == '"') { 388 if (dest_size > chars) 389 *dest++ = '"'; 390 if (chars < SIZE_MAX) chars++; 391 } 392 if (dest_size > chars) 393 *dest++ = *src; 394 if (chars < SIZE_MAX) chars++; 395 src_size--; 396 src++; 397 } 398 399 if (dest_size > chars) 400 *dest = '"'; 401 if (chars < SIZE_MAX) chars++; 402 403 return chars; 404 } 405 406 int 407 csv_fwrite (FILE *fp, const char *src, size_t src_size) 408 { 409 if (fp == NULL || src == NULL) 410 return 0; 411 412 if (fputc('"', fp) == EOF) 413 return EOF; 414 415 while (src_size) { 416 if (*src == '"') { 417 if (fputc('"', fp) == EOF) 418 return EOF; 419 } 420 if (fputc(*src, fp) == EOF) 421 return EOF; 422 src_size--; 423 src++; 424 } 425 426 if (fputc('"', fp) == EOF) { 427 return EOF; 428 } 429 430 return 0; 431 } 432 433 size_t 434 csv_write2 (char *dest, size_t dest_size, const char *src, size_t src_size, char quote) 435 { 436 size_t chars = 0; 437 438 if (src == NULL) 439 return 0; 440 441 if (dest == NULL) 442 dest_size = 0; 443 444 if (dest_size > 0) 445 *dest++ = quote; 446 chars++; 447 448 while (src_size) { 449 if (*src == quote) { 450 if (dest_size > chars) 451 *dest++ = quote; 452 if (chars < SIZE_MAX) chars++; 453 } 454 if (dest_size > chars) 455 *dest++ = *src; 456 if (chars < SIZE_MAX) chars++; 457 src_size--; 458 src++; 459 } 460 461 if (dest_size > chars) 462 *dest = quote; 463 if (chars < SIZE_MAX) chars++; 464 465 return chars; 466 } 467 468 int 469 csv_fwrite2 (FILE *fp, const char *src, size_t src_size, char quote) 470 { 471 if (fp == NULL || src == NULL) 472 return 0; 473 474 if (fputc(quote, fp) == EOF) 475 return EOF; 476 477 while (src_size) { 478 if (*src == quote) { 479 if (fputc(quote, fp) == EOF) 480 return EOF; 481 } 482 if (fputc(*src, fp) == EOF) 483 return EOF; 484 src_size--; 485 src++; 486 } 487 488 if (fputc(quote, fp) == EOF) { 489 return EOF; 490 } 491 492 return 0; 493 } 494