1 /* Copyright (c) 2007-2021 H.Merijn Brand. All rights reserved.
2 * Copyright (c) 1998-2001 Jochen Wiedmann. All rights reserved.
3 * This program is free software; you can redistribute it and/or
4 * modify it under the same terms as Perl itself.
5 */
6 #define PERL_NO_GET_CONTEXT
7 #include <EXTERN.h>
8 #include <perl.h>
9 #include <XSUB.h>
10 #define DPPP_PL_parser_NO_DUMMY
11 #define NEED_utf8_to_uvchr_buf
12 #define NEED_my_snprintf
13 #define NEED_pv_escape
14 #define NEED_pv_pretty
15 #ifndef PERLIO_F_UTF8
16 # define PERLIO_F_UTF8 0x00008000
17 # endif
18 #ifndef MAXINT
19 # define MAXINT ((int)(~(unsigned)0 >> 1))
20 # endif
21 #include "ppport.h"
22 #define is_utf8_sv(s) is_utf8_string ((U8 *)SvPV_nolen (s), SvCUR (s))
23
24 #define MAINT_DEBUG 0
25
26 #define BUFFER_SIZE 1024
27
28 #define CSV_XS_TYPE_WARN 1
29 #define CSV_XS_TYPE_PV 0
30 #define CSV_XS_TYPE_IV 1
31 #define CSV_XS_TYPE_NV 2
32
33 /* maximum length for EOL, SEP, and QUOTE - keep in sync with .pm */
34 #define MAX_ATTR_LEN 16
35
36 #define CSV_FLAGS_QUO 0x0001
37 #define CSV_FLAGS_BIN 0x0002
38 #define CSV_FLAGS_EIF 0x0004
39 #define CSV_FLAGS_MIS 0x0010
40
41 #define HOOK_ERROR 0x0001
42 #define HOOK_AFTER_PARSE 0x0002
43 #define HOOK_BEFORE_PRINT 0x0004
44
45 #ifdef __THW_370__
46 /* EBCDIC on os390 z/OS: IS_EBCDIC reads better than __THW_370__ */
47 #define IS_EBCDIC
48 #endif
49
50 #define CH_TAB '\t'
51 #define CH_NL '\n'
52 #define CH_CR '\r'
53 #define CH_SPACE ' '
54 #define CH_QUO '"'
55
56 #ifdef IS_EBCDIC
57 #define CH_DEL '\007'
58 static unsigned char ec, ebcdic2ascii[256] = {
59 0x00, 0x01, 0x02, 0x03, 0x9c, 0x09, 0x86, 0x7f,
60 0x97, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
61 0x10, 0x11, 0x12, 0x13, 0x9d, 0x0a, 0x08, 0x87,
62 0x18, 0x19, 0x92, 0x8f, 0x1c, 0x1d, 0x1e, 0x1f,
63 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1b,
64 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
65 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,
66 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
67 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,
68 0xe7, 0xf1, 0xa2, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
69 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef,
70 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
71 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5,
72 0xc7, 0xd1, 0xa6, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
73 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,
74 0xcc, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
75 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
76 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,
77 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
78 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
79 0xb5, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
80 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0x5b, 0xde, 0xae,
81 0xac, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc,
82 0xbd, 0xbe, 0xdd, 0xa8, 0xaf, 0x5d, 0xb4, 0xd7,
83 0x7b, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
84 /* v this 0xa0 really should be 0xad. Needed for UTF = binary */
85 0x48, 0x49, 0xa0, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
86 0x7d, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
87 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff,
88 0x5c, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
89 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
90 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
91 0x38, 0x39, 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x9f
92 };
93 #define is_csv_binary(ch) ((((ec = ebcdic2ascii[ch]) < 0x20 || ec >= 0x7f) && ch != CH_TAB) || ch == EOF)
94 #else
95 #define CH_DEL '\177'
96 #define is_csv_binary(ch) ((ch < CH_SPACE || ch >= CH_DEL) && ch != CH_TAB)
97 #endif
98 #define CH_EOLX 1215
99 #define CH_EOL *csv->eol
100 #define CH_SEPX 8888
101 #define CH_SEP *csv->sep
102 #define CH_QUOTEX 8889
103 #define CH_QUOTE *csv->quo
104
105 #define useIO_EOF 0x10
106
107 #define unless(expr) if (!(expr))
108
109 #define _is_reftype(f,x) \
110 (f && ((SvGMAGICAL (f) && mg_get (f)) || 1) && SvROK (f) && SvTYPE (SvRV (f)) == x)
111 #define _is_arrayref(f) _is_reftype (f, SVt_PVAV)
112 #define _is_hashref(f) _is_reftype (f, SVt_PVHV)
113 #define _is_coderef(f) _is_reftype (f, SVt_PVCV)
114
115 #define SvSetUndef(sv) sv_setpvn (sv, NULL, 0)
116 #define SvSetEmpty(sv) sv_setpvn_mg (sv, "", 0)
117
118 #define CSV_XS_SELF \
119 if (!self || !SvOK (self) || !SvROK (self) || \
120 SvTYPE (SvRV (self)) != SVt_PVHV) \
121 croak ("self is not a hash ref"); \
122 hv = (HV *)SvRV (self)
123
124 /* Keep in sync with .pm! */
125 #define CACHE_ID_quote_char 0
126 #define CACHE_ID_escape_char 1
127 #define CACHE_ID_sep_char 2
128 #define CACHE_ID_binary 3
129 #define CACHE_ID_keep_meta_info 4
130 #define CACHE_ID_always_quote 5
131 #define CACHE_ID_allow_loose_quotes 6
132 #define CACHE_ID_allow_loose_escapes 7
133 #define CACHE_ID_allow_unquoted_escape 8
134 #define CACHE_ID_allow_whitespace 9
135 #define CACHE_ID_blank_is_undef 10
136 #define CACHE_ID_sep 39
137 #define CACHE_ID_sep_len 38
138 #define CACHE_ID_eol 11
139 #define CACHE_ID_eol_len 12
140 #define CACHE_ID_eol_is_cr 13
141 #define CACHE_ID_quo 15
142 #define CACHE_ID_quo_len 16
143 #define CACHE_ID_verbatim 22
144 #define CACHE_ID_empty_is_undef 23
145 #define CACHE_ID_auto_diag 24
146 #define CACHE_ID_quote_space 25
147 #define CACHE_ID_quote_empty 37
148 #define CACHE_ID__is_bound 26
149 #define CACHE_ID__has_ahead 30
150 #define CACHE_ID_escape_null 31
151 #define CACHE_ID_quote_binary 32
152 #define CACHE_ID_diag_verbose 33
153 #define CACHE_ID_has_error_input 34
154 #define CACHE_ID_decode_utf8 35
155 #define CACHE_ID__has_hooks 36
156 #define CACHE_ID_formula 38
157 #define CACHE_ID_strict 42
158 #define CACHE_ID_skip_empty_rows 43
159 #define CACHE_ID_undef_str 46
160 #define CACHE_ID_comment_str 54
161 #define CACHE_ID_types 62
162
#define byte unsigned char
#define ulng unsigned long
/* Complete working state of one CSV object.
 *
 * A byte-for-byte image of this struct is kept in the object's "_CACHE"
 * entry and copied in and out with memcpy (), so the layout must not be
 * changed independently of the code that creates and reads that image.
 */
typedef struct {
    byte quote_char;
    byte escape_char;            /* 0 when escaping is disabled */
    byte fld_idx;
    byte binary;

    byte keep_meta_info;
    byte always_quote;
    byte useIO; /* Also used to indicate EOF */
    byte eol_is_cr;              /* eol is exactly one CR */

    byte allow_loose_quotes;
    byte allow_loose_escapes;
    byte allow_unquoted_escape;
    byte allow_whitespace;

    byte blank_is_undef;
    byte empty_is_undef;
    byte verbatim;
    byte auto_diag;              /* when set, SetDiag calls error_diag */

    byte quote_space;
    byte escape_null;
    byte quote_binary;
    byte first_safe_char;        /* 0x21 with quote_space, 0x20 without */

    byte diag_verbose;
    byte has_error_input;
    byte decode_utf8;
    byte has_hooks;              /* bitmask of HOOK_* values */

    byte quote_empty;
    byte formula;                /* action selector used by _formula */
    byte utf8;
    byte has_ahead;

    byte eolx;                   /* derived from eol/verbatim in SetupCsv */
    byte strict;
    short strict_n;

    byte skip_empty_rows;

    long is_bound;               /* from "_is_bound"; 0 when nothing is bound */
    ulng recno;

    byte * cache;                /* buffer of the "_CACHE" SV holding this image */

    SV * pself; /* PL_self, for error_diag */
    HV * self;
    SV * bound;                  /* the "_BOUND_COLUMNS" array reference */

    char * types;                /* string value of "_types", or NULL */

    byte eol_len;
    byte sep_len;                /* 0 for a single-byte separator */
    byte quo_len;                /* 0 for a single-byte quote */
    byte types_len;

    char * bptr;
    SV * tmp;
    byte undef_flg;              /* 3 when undef_str is flagged UTF-8, else 0 */
    byte * undef_str;
    byte * comment_str;
    int eol_pos;
    STRLEN size;
    STRLEN used;                 /* bytes pending in buffer / offset into bptr */
    byte eol[MAX_ATTR_LEN];
    byte sep[MAX_ATTR_LEN];
    byte quo[MAX_ATTR_LEN];
    char buffer[BUFFER_SIZE];    /* output staging area, flushed by Print */
} csv_t;
236
237 #define bool_opt_def(o,d) \
238 (((svp = hv_fetchs (self, o, FALSE)) && *svp) ? SvTRUE (*svp) : d)
239 #define bool_opt(o) bool_opt_def (o, 0)
240 #define num_opt_def(o,d) \
241 (((svp = hv_fetchs (self, o, FALSE)) && *svp) ? SvIV (*svp) : d)
242 #define num_opt(o) num_opt_def (o, 0)
243
/* One row of the diagnostics table: numeric code and its message. */
typedef struct {
    int xs_errno;
    char *xs_errstr;
} xs_error_t;

/* All known diagnostics.  The table ends with a sentinel row whose code is
 * 0 and whose message is empty; a lookup that does not find its code stops
 * on that row and therefore yields the empty message. */
static const xs_error_t xs_errors[] = {

    /* Generic errors */
    { 1000, "INI - constructor failed" },
    { 1001, "INI - sep_char is equal to quote_char or escape_char" },
    { 1002, "INI - allow_whitespace with escape_char or quote_char SP or TAB" },
    { 1003, "INI - \\r or \\n in main attr not allowed" },
    { 1004, "INI - callbacks should be undef or a hashref" },
    { 1005, "INI - EOL too long" },
    { 1006, "INI - SEP too long" },
    { 1007, "INI - QUOTE too long" },
    { 1008, "INI - SEP undefined" },

    { 1010, "INI - the header is empty" },
    { 1011, "INI - the header contains more than one valid separator" },
    { 1012, "INI - the header contains an empty field" },
    { 1013, "INI - the header contains non-unique fields" },
    { 1014, "INI - header called on undefined stream" },

    /* Syntax errors */
    { 1500, "PRM - Invalid/unsupported argument(s)" },
    { 1501, "PRM - The key attribute is passed as an unsupported type" },
    { 1502, "PRM - The value attribute is passed without the key attribute" },
    { 1503, "PRM - The value attribute is passed as an unsupported type" },

    /* Parse errors */
    { 2010, "ECR - QUO char inside quotes followed by CR not part of EOL" },
    { 2011, "ECR - Characters after end of quoted field" },
    { 2012, "EOF - End of data in parsing input stream" },
    { 2013, "ESP - Specification error for fragments RFC7111" },
    { 2014, "ENF - Inconsistent number of fields" },

    /* EIQ - Error Inside Quotes */
    { 2021, "EIQ - NL char inside quotes, binary off" },
    { 2022, "EIQ - CR char inside quotes, binary off" },
    { 2023, "EIQ - QUO character not allowed" },
    { 2024, "EIQ - EOF cannot be escaped, not even inside quotes" },
    { 2025, "EIQ - Loose unescaped escape" },
    { 2026, "EIQ - Binary character inside quoted field, binary off" },
    { 2027, "EIQ - Quoted field not terminated" },

    /* EIF - Error Inside Field */
    { 2030, "EIF - NL char inside unquoted verbatim, binary off" },
    { 2031, "EIF - CR char is first char of field, not part of EOL" },
    { 2032, "EIF - CR char inside unquoted, not part of EOL" },
    { 2034, "EIF - Loose unescaped quote" },
    { 2035, "EIF - Escaped EOF in unquoted field" },
    { 2036, "EIF - ESC error" },
    { 2037, "EIF - Binary character in unquoted field, binary off" },

    /* Combine errors */
    { 2110, "ECB - Binary character in Combine, binary off" },

    /* IO errors */
    { 2200, "EIO - print to IO failed. See errno" },

    /* Hash-Ref errors */
    { 3001, "EHR - Unsupported syntax for column_names ()" },
    { 3002, "EHR - getline_hr () called before column_names ()" },
    { 3003, "EHR - bind_columns () and column_names () fields count mismatch" },
    { 3004, "EHR - bind_columns () only accepts refs to scalars" },
    { 3006, "EHR - bind_columns () did not pass enough refs for parsed fields" },
    { 3007, "EHR - bind_columns needs refs to writable scalars" },
    { 3008, "EHR - unexpected error in bound fields" },
    { 3009, "EHR - print_hr () called before column_names ()" },
    { 3010, "EHR - print_hr () called with invalid arguments" },

    { 4001, "PRM - The key does not exist as field in the data" },

    { 5001, "PRM - The result does not match the output to append to" },
    { 5002, "PRM - Unsupported output" },

    /* Sentinel: must stay last */
    { 0, "" },
};
322
323 static int last_error = 0;
324 static SV *m_getline, *m_print;
325
326 #define is_EOL(c) (c == CH_EOLX)
327
/* True when c starts the separator at the current input position.
 *
 * Expects a csv_t pointer named `csv` in scope.  For a single-byte
 * separator (sep_len == 0) only the byte itself is compared.  For a longer
 * separator the remaining sep_len - 1 bytes at bptr + used must match as
 * well; on a match `used` is advanced past them and c is overwritten with
 * CH_SEPX, so c must be a modifiable lvalue.
 */
#define __is_SEPX(c) (c == CH_SEP && (csv->sep_len == 0 || (\
    csv->size - csv->used >= (STRLEN)csv->sep_len - 1 &&\
    !memcmp (csv->bptr + csv->used, csv->sep + 1, csv->sep_len - 1) &&\
    (csv->used += csv->sep_len - 1) &&\
    (c = CH_SEPX))))
#if MAINT_DEBUG > 1
/* Tracing variant of __is_SEPX: reports the outcome, and for multi-byte
 * separators the buffer state, on stderr together with the calling line.
 * NOTE(review): c is received by value here, so the CH_SEPX rewrite done
 * by __is_SEPX does not reach the caller in debug builds.
 */
static byte _is_SEPX (unsigned int c, csv_t *csv, int line) {
    unsigned int b = __is_SEPX (c);
    (void)fprintf (stderr, "# %4d - is_SEPX:\t%u (%d)\n", line, b, csv->sep_len);
    if (csv->sep_len)
        /* size and used are STRLEN: cast so they match the %lu conversion */
        (void)fprintf (stderr,
            "# len: %d, siz: %lu, usd: %lu, c: %03x, *sep: %03x\n",
            csv->sep_len, (unsigned long)csv->size, (unsigned long)csv->used,
            c, (unsigned int)CH_SEP);
    return (byte)b;
} /* _is_SEPX */
#define is_SEP(c) _is_SEPX (c, csv, __LINE__)
#else
#define is_SEP(c) __is_SEPX (c)
#endif
347
/* True when a quote is configured and c starts it at the current input
 * position.
 *
 * Expects a csv_t pointer named `csv` in scope.  For a single-byte quote
 * (quo_len == 0) only the byte itself is compared.  For a longer quote the
 * remaining quo_len - 1 bytes at bptr + used must match as well; on a match
 * `used` is advanced past them and c is overwritten with CH_QUOTEX, so c
 * must be a modifiable lvalue.
 */
#define __is_QUOTEX(c) (CH_QUOTE && c == CH_QUOTE && (csv->quo_len == 0 || (\
    csv->size - csv->used >= (STRLEN)csv->quo_len - 1 &&\
    !memcmp (csv->bptr + csv->used, csv->quo + 1, csv->quo_len - 1) &&\
    (csv->used += csv->quo_len - 1) &&\
    (c = CH_QUOTEX))))
#if MAINT_DEBUG > 1
/* Tracing variant of __is_QUOTEX: reports the outcome, and for multi-byte
 * quotes the buffer state, on stderr together with the calling line.
 * NOTE(review): c is received by value here, so the CH_QUOTEX rewrite done
 * by __is_QUOTEX does not reach the caller in debug builds.
 */
static byte _is_QUOTEX (unsigned int c, csv_t *csv, int line) {
    unsigned int b = __is_QUOTEX (c);
    (void)fprintf (stderr, "# %4d - is_QUOTEX:\t%u (%d)\n", line, b, csv->quo_len);

    if (csv->quo_len)
        /* size and used are STRLEN: cast so they match the %lu conversion */
        (void)fprintf (stderr,
            "# len: %d, siz: %lu, usd: %lu, c: %03x, *quo: %03x\n",
            csv->quo_len, (unsigned long)csv->size, (unsigned long)csv->used,
            c, (unsigned int)CH_QUOTE);
    return (byte)b;
} /* _is_QUOTEX */
#define is_QUOTE(c) _is_QUOTEX (c, csv, __LINE__)
#else
#define is_QUOTE(c) __is_QUOTEX (c)
#endif
368
369 #define is_whitespace(ch) \
370 ( (ch) != CH_SEP && \
371 (ch) != CH_QUOTE && \
372 (ch) != csv->escape_char && \
373 ( (ch) == CH_SPACE || \
374 (ch) == CH_TAB \
375 ) \
376 )
377
378 #define SvDiag(xse) cx_SvDiag (aTHX_ xse)
cx_SvDiag(pTHX_ int xse)379 static SV *cx_SvDiag (pTHX_ int xse) {
380 int i = 0;
381 SV *err;
382
383 while (xs_errors[i].xs_errno && xs_errors[i].xs_errno != xse) i++;
384 if ((err = newSVpv (xs_errors[i].xs_errstr, 0))) {
385 (void)SvUPGRADE (err, SVt_PVIV);
386 SvIV_set (err, xse);
387 SvIOK_on (err);
388 }
389 return (err);
390 } /* SvDiag */
391
392 /* This function should be altered to deal with the optional extra argument
393 * that holds the replacement message */
394 #define SetDiag(csv,xse) cx_SetDiag (aTHX_ csv, xse)
cx_SetDiag(pTHX_ csv_t * csv,int xse)395 static SV *cx_SetDiag (pTHX_ csv_t *csv, int xse) {
396 dSP;
397 SV *err = SvDiag (xse);
398 SV *pself = csv->pself;
399
400 last_error = xse;
401 (void)hv_store (csv->self, "_ERROR_DIAG", 11, err, 0);
402 if (xse == 0) {
403 (void)hv_store (csv->self, "_ERROR_POS", 10, newSViv (0), 0);
404 (void)hv_store (csv->self, "_ERROR_FLD", 10, newSViv (0), 0);
405 (void)hv_store (csv->self, "_ERROR_INPUT", 12, &PL_sv_undef, 0);
406 csv->has_error_input = 0;
407 }
408 if (xse == 2012) /* EOF */
409 (void)hv_store (csv->self, "_EOF", 4, &PL_sv_yes, 0);
410 if (csv->auto_diag) {
411 unless (_is_hashref (pself))
412 pself = newRV_inc ((SV *)csv->self);
413 ENTER;
414 PUSHMARK (SP);
415 XPUSHs (pself);
416 PUTBACK;
417 call_pv ("Text::CSV_XS::error_diag", G_VOID | G_DISCARD);
418 LEAVE;
419 unless (pself == csv->pself)
420 sv_free (pself);
421 }
422 return (err);
423 } /* SetDiag */
424
425 #define xs_cache_set(hv,idx,val) cx_xs_cache_set (aTHX_ hv, idx, val)
cx_xs_cache_set(pTHX_ HV * hv,int idx,SV * val)426 static void cx_xs_cache_set (pTHX_ HV *hv, int idx, SV *val) {
427 SV **svp;
428 byte *cache;
429
430 csv_t csvs;
431 csv_t *csv = &csvs;
432
433 IV iv;
434 byte bv;
435 char *cp = "\0";
436 STRLEN len = 0;
437
438 unless ((svp = hv_fetchs (hv, "_CACHE", FALSE)) && *svp)
439 return;
440
441 cache = (byte *)SvPV_nolen (*svp);
442 (void)memcpy (csv, cache, sizeof (csv_t));
443
444 if (SvPOK (val))
445 cp = SvPV (val, len);
446 if (SvIOK (val))
447 iv = SvIV (val);
448 else if (SvNOK (val)) /* Needed for 5.6.x but safe for 5.8.x+ */
449 iv = (IV)SvNV (val); /* uncoverable statement ancient perl required */
450 else
451 iv = *cp;
452 bv = (unsigned)iv & 0xff;
453
454 switch (idx) {
455
456 /* single char/byte */
457 case CACHE_ID_sep_char:
458 CH_SEP = *cp;
459 csv->sep_len = 0;
460 break;
461
462 case CACHE_ID_quote_char:
463 CH_QUOTE = *cp;
464 csv->quo_len = 0;
465 break;
466
467 case CACHE_ID_escape_char: csv->escape_char = *cp; break;
468
469 /* boolean/numeric */
470 case CACHE_ID_binary: csv->binary = bv; break;
471 case CACHE_ID_keep_meta_info: csv->keep_meta_info = bv; break;
472 case CACHE_ID_always_quote: csv->always_quote = bv; break;
473 case CACHE_ID_quote_empty: csv->quote_empty = bv; break;
474 case CACHE_ID_quote_space: csv->quote_space = bv; break;
475 case CACHE_ID_escape_null: csv->escape_null = bv; break;
476 case CACHE_ID_quote_binary: csv->quote_binary = bv; break;
477 case CACHE_ID_decode_utf8: csv->decode_utf8 = bv; break;
478 case CACHE_ID_allow_loose_escapes: csv->allow_loose_escapes = bv; break;
479 case CACHE_ID_allow_loose_quotes: csv->allow_loose_quotes = bv; break;
480 case CACHE_ID_allow_unquoted_escape: csv->allow_unquoted_escape = bv; break;
481 case CACHE_ID_allow_whitespace: csv->allow_whitespace = bv; break;
482 case CACHE_ID_blank_is_undef: csv->blank_is_undef = bv; break;
483 case CACHE_ID_empty_is_undef: csv->empty_is_undef = bv; break;
484 case CACHE_ID_formula: csv->formula = bv; break;
485 case CACHE_ID_strict: csv->strict = bv; break;
486 case CACHE_ID_verbatim: csv->verbatim = bv; break;
487 case CACHE_ID_skip_empty_rows: csv->skip_empty_rows = bv; break;
488 case CACHE_ID_auto_diag: csv->auto_diag = bv; break;
489 case CACHE_ID_diag_verbose: csv->diag_verbose = bv; break;
490 case CACHE_ID__has_ahead: csv->has_ahead = bv; break;
491 case CACHE_ID__has_hooks: csv->has_hooks = bv; break;
492 case CACHE_ID_has_error_input: csv->has_error_input = bv; break;
493
494 /* a 4-byte IV */
495 case CACHE_ID__is_bound: csv->is_bound = iv; break;
496
497 /* string */
498 case CACHE_ID_sep:
499 (void)memcpy (csv->sep, cp, len);
500 csv->sep_len = len == 1 ? 0 : len;
501 break;
502
503 case CACHE_ID_quo:
504 (void)memcpy (csv->quo, cp, len);
505 csv->quo_len = len == 1 ? 0 : len;
506 break;
507
508 case CACHE_ID_eol:
509 (void)memcpy (csv->eol, cp, len);
510 csv->eol_len = len;
511 csv->eol_is_cr = len == 1 && *cp == CH_CR ? 1 : 0;
512 break;
513
514 case CACHE_ID_undef_str:
515 if (*cp) {
516 csv->undef_str = (byte *)cp;
517 if (SvUTF8 (val))
518 csv->undef_flg = 3;
519 }
520 else {
521 csv->undef_str = NULL;
522 csv->undef_flg = 0;
523 }
524 break;
525
526 case CACHE_ID_comment_str:
527 csv->comment_str = *cp ? (byte *)cp : NULL;
528 break;
529
530 case CACHE_ID_types:
531 if (cp && len) {
532 csv->types = cp;
533 csv->types_len = len;
534 }
535 else {
536 csv->types = NULL;
537 csv->types_len = 0;
538 }
539 break;
540
541 default:
542 warn ("Unknown cache index %d ignored\n", idx);
543 }
544
545 csv->cache = cache;
546 (void)memcpy (cache, csv, sizeof (csv_t));
547 } /* cache_set */
548
549 #define _pretty_strl(csv) cx_pretty_str (aTHX_ csv, strlen (csv))
550 #define _pretty_str(csv,xse) cx_pretty_str (aTHX_ csv, xse)
cx_pretty_str(pTHX_ byte * s,STRLEN l)551 static char *cx_pretty_str (pTHX_ byte *s, STRLEN l) {
552 SV *dsv = sv_2mortal (newSVpvs (""));
553 return (pv_pretty (dsv, (char *)s, l, 0, NULL, NULL,
554 (PERL_PV_PRETTY_DUMP | PERL_PV_ESCAPE_UNI_DETECT)));
555 } /* _pretty_str */
556
557 #define _cache_show_byte(trim,c) \
558 warn (" %-21s %02x:%3d\n", trim, c, c)
559 #define _cache_show_char(trim,c) \
560 warn (" %-21s %02x:%s\n", trim, c, _pretty_str (&c, 1))
561 #define _cache_show_str(trim,l,str) \
562 warn (" %-21s %02d:%s\n", trim, l, _pretty_str (str, l))
563
564 #define xs_cache_diag(hv) cx_xs_cache_diag (aTHX_ hv)
cx_xs_cache_diag(pTHX_ HV * hv)565 static void cx_xs_cache_diag (pTHX_ HV *hv) {
566 SV **svp;
567 byte *cache;
568 csv_t csvs;
569 csv_t *csv = &csvs;
570
571 unless ((svp = hv_fetchs (hv, "_CACHE", FALSE)) && *svp) {
572 warn ("CACHE: invalid\n");
573 return;
574 }
575
576 cache = (byte *)SvPV_nolen (*svp);
577 (void)memcpy (csv, cache, sizeof (csv_t));
578 warn ("CACHE:\n");
579 _cache_show_char ("quote_char", CH_QUOTE);
580 _cache_show_char ("escape_char", csv->escape_char);
581 _cache_show_char ("sep_char", CH_SEP);
582 _cache_show_byte ("binary", csv->binary);
583 _cache_show_byte ("decode_utf8", csv->decode_utf8);
584
585 _cache_show_byte ("allow_loose_escapes", csv->allow_loose_escapes);
586 _cache_show_byte ("allow_loose_quotes", csv->allow_loose_quotes);
587 _cache_show_byte ("allow_unquoted_escape", csv->allow_unquoted_escape);
588 _cache_show_byte ("allow_whitespace", csv->allow_whitespace);
589 _cache_show_byte ("always_quote", csv->always_quote);
590 _cache_show_byte ("quote_empty", csv->quote_empty);
591 _cache_show_byte ("quote_space", csv->quote_space);
592 _cache_show_byte ("escape_null", csv->escape_null);
593 _cache_show_byte ("quote_binary", csv->quote_binary);
594 _cache_show_byte ("auto_diag", csv->auto_diag);
595 _cache_show_byte ("diag_verbose", csv->diag_verbose);
596 _cache_show_byte ("formula", csv->formula);
597 _cache_show_byte ("strict", csv->strict);
598 _cache_show_byte ("skip_empty_rows", csv->skip_empty_rows);
599 _cache_show_byte ("has_error_input", csv->has_error_input);
600 _cache_show_byte ("blank_is_undef", csv->blank_is_undef);
601 _cache_show_byte ("empty_is_undef", csv->empty_is_undef);
602 _cache_show_byte ("has_ahead", csv->has_ahead);
603 _cache_show_byte ("keep_meta_info", csv->keep_meta_info);
604 _cache_show_byte ("verbatim", csv->verbatim);
605
606 _cache_show_byte ("has_hooks", csv->has_hooks);
607 _cache_show_byte ("eol_is_cr", csv->eol_is_cr);
608 _cache_show_byte ("eol_len", csv->eol_len);
609 _cache_show_str ("eol", csv->eol_len, csv->eol);
610 _cache_show_byte ("sep_len", csv->sep_len);
611 if (csv->sep_len > 1)
612 _cache_show_str ("sep", csv->sep_len, csv->sep);
613 _cache_show_byte ("quo_len", csv->quo_len);
614 if (csv->quo_len > 1)
615 _cache_show_str ("quote", csv->quo_len, csv->quo);
616 if (csv->types_len)
617 _cache_show_str ("types", csv->types_len, (byte *)csv->types);
618 else
619 _cache_show_str ("types", 0, (byte *)"");
620
621 if (csv->bptr)
622 _cache_show_str ("bptr", (int)strlen (csv->bptr), (byte *)csv->bptr);
623 if (csv->tmp && SvPOK (csv->tmp)) {
624 char *s = SvPV_nolen (csv->tmp);
625 _cache_show_str ("tmp", (int)strlen (s), (byte *)s);
626 }
627 } /* xs_cache_diag */
628
629 #define set_eol_is_cr(csv) cx_set_eol_is_cr (aTHX_ csv)
cx_set_eol_is_cr(pTHX_ csv_t * csv)630 static void cx_set_eol_is_cr (pTHX_ csv_t *csv) {
631 csv->eol[0] = CH_CR;
632 csv->eol_is_cr = 1;
633 csv->eol_len = 1;
634 (void)memcpy (csv->cache, csv, sizeof (csv_t));
635
636 (void)hv_store (csv->self, "eol", 3, newSVpvn ((char *)csv->eol, 1), 0);
637 } /* set_eol_is_cr */
638
639 #define SetupCsv(csv,self,pself) cx_SetupCsv (aTHX_ csv, self, pself)
cx_SetupCsv(pTHX_ csv_t * csv,HV * self,SV * pself)640 static void cx_SetupCsv (pTHX_ csv_t *csv, HV *self, SV *pself) {
641 SV **svp;
642 STRLEN len;
643 char *ptr;
644
645 last_error = 0;
646
647 if ((svp = hv_fetchs (self, "_CACHE", FALSE)) && *svp) {
648 byte *cache = (byte *)SvPVX (*svp);
649 (void)memcpy (csv, cache, sizeof (csv_t));
650 }
651 else {
652 SV *sv_cache;
653
654 (void)memset (csv, 0, sizeof (csv_t)); /* Reset everything */
655
656 csv->self = self;
657 csv->pself = pself;
658
659 CH_SEP = ',';
660 if ((svp = hv_fetchs (self, "sep_char", FALSE)) && *svp && SvOK (*svp))
661 CH_SEP = *SvPV (*svp, len);
662 if ((svp = hv_fetchs (self, "sep", FALSE)) && *svp && SvOK (*svp)) {
663 ptr = SvPV (*svp, len);
664 (void)memcpy (csv->sep, ptr, len);
665 if (len > 1)
666 csv->sep_len = len;
667 }
668
669 CH_QUOTE = '"';
670 if ((svp = hv_fetchs (self, "quote_char", FALSE)) && *svp) {
671 if (SvOK (*svp)) {
672 ptr = SvPV (*svp, len);
673 CH_QUOTE = len ? *ptr : (char)0;
674 }
675 else
676 CH_QUOTE = (char)0;
677 }
678 if ((svp = hv_fetchs (self, "quote", FALSE)) && *svp && SvOK (*svp)) {
679 ptr = SvPV (*svp, len);
680 (void)memcpy (csv->quo, ptr, len);
681 if (len > 1)
682 csv->quo_len = len;
683 }
684
685 csv->escape_char = '"';
686 if ((svp = hv_fetchs (self, "escape_char", FALSE)) && *svp) {
687 if (SvOK (*svp)) {
688 ptr = SvPV (*svp, len);
689 csv->escape_char = len ? *ptr : (char)0;
690 }
691 else
692 csv->escape_char = (char)0;
693 }
694
695 if ((svp = hv_fetchs (self, "eol", FALSE)) && *svp && SvOK (*svp)) {
696 char *eol = SvPV (*svp, len);
697 (void)memcpy (csv->eol, eol, len);
698 csv->eol_len = len;
699 if (len == 1 && *csv->eol == CH_CR)
700 csv->eol_is_cr = 1;
701 }
702
703 csv->undef_flg = 0;
704 if ((svp = hv_fetchs (self, "undef_str", FALSE)) && *svp && SvOK (*svp)) {
705 /*if (sv && (SvOK (sv) || (
706 (SvGMAGICAL (sv) && (mg_get (sv), 1) && SvOK (sv))))) {*/
707 csv->undef_str = (byte *)SvPV_nolen (*svp);
708 if (SvUTF8 (*svp))
709 csv->undef_flg = 3;
710 }
711 else
712 csv->undef_str = NULL;
713
714 if ((svp = hv_fetchs (self, "comment_str", FALSE)) && *svp && SvOK (*svp))
715 csv->comment_str = (byte *)SvPV_nolen (*svp);
716 else
717 csv->comment_str = NULL;
718
719 if ((svp = hv_fetchs (self, "_types", FALSE)) && *svp && SvOK (*svp)) {
720 csv->types = SvPV (*svp, len);
721 csv->types_len = len;
722 }
723
724 if ((svp = hv_fetchs (self, "_is_bound", FALSE)) && *svp && SvOK (*svp))
725 csv->is_bound = SvIV (*svp);
726 if ((svp = hv_fetchs (self, "callbacks", FALSE)) && _is_hashref (*svp)) {
727 HV *cb = (HV *)SvRV (*svp);
728 if ((svp = hv_fetchs (cb, "after_parse", FALSE)) && _is_coderef (*svp))
729 csv->has_hooks |= HOOK_AFTER_PARSE;
730 if ((svp = hv_fetchs (cb, "before_print", FALSE)) && _is_coderef (*svp))
731 csv->has_hooks |= HOOK_BEFORE_PRINT;
732 }
733
734 csv->binary = bool_opt ("binary");
735 csv->decode_utf8 = bool_opt ("decode_utf8");
736 csv->always_quote = bool_opt ("always_quote");
737 csv->strict = bool_opt ("strict");
738 csv->skip_empty_rows = bool_opt ("skip_empty_rows");
739 csv->quote_empty = bool_opt ("quote_empty");
740 csv->quote_space = bool_opt_def ("quote_space", 1);
741 csv->escape_null = bool_opt_def ("escape_null", 1);
742 csv->quote_binary = bool_opt_def ("quote_binary", 1);
743 csv->allow_loose_quotes = bool_opt ("allow_loose_quotes");
744 csv->allow_loose_escapes = bool_opt ("allow_loose_escapes");
745 csv->allow_unquoted_escape = bool_opt ("allow_unquoted_escape");
746 csv->allow_whitespace = bool_opt ("allow_whitespace");
747 csv->blank_is_undef = bool_opt ("blank_is_undef");
748 csv->empty_is_undef = bool_opt ("empty_is_undef");
749 csv->verbatim = bool_opt ("verbatim");
750
751 csv->auto_diag = num_opt ("auto_diag");
752 csv->diag_verbose = num_opt ("diag_verbose");
753 csv->keep_meta_info = num_opt ("keep_meta_info");
754 csv->formula = num_opt ("formula");
755
756 unless (csv->escape_char) csv->escape_null = 0;
757
758 sv_cache = newSVpvn ((char *)csv, sizeof (csv_t));
759 csv->cache = (byte *)SvPVX (sv_cache);
760 SvREADONLY_on (sv_cache);
761
762 (void)memcpy (csv->cache, csv, sizeof (csv_t));
763
764 (void)hv_store (self, "_CACHE", 6, sv_cache, 0);
765 }
766
767 csv->utf8 = 0;
768 csv->size = 0;
769 csv->used = 0;
770
771 /* This is EBCDIC-safe, as it is used after translation */
772 csv->first_safe_char = csv->quote_space ? 0x21 : 0x20;
773
774 if (csv->is_bound) {
775 if ((svp = hv_fetchs (self, "_BOUND_COLUMNS", FALSE)) && _is_arrayref (*svp))
776 csv->bound = *svp;
777 else
778 csv->is_bound = 0;
779 }
780
781 csv->eol_pos = -1;
782 csv->eolx = csv->eol_len
783 ? csv->verbatim || csv->eol_len >= 2
784 ? 1
785 : csv->eol[0] == CH_CR || csv->eol[0] == CH_NL
786 ? 0
787 : 1
788 : 0;
789 if (csv->sep_len > 1 && is_utf8_string ((U8 *)(csv->sep), csv->sep_len))
790 csv->utf8 = 1;
791 if (csv->quo_len > 1 && is_utf8_string ((U8 *)(csv->quo), csv->quo_len))
792 csv->utf8 = 1;
793 } /* SetupCsv */
794
795 #define Print(csv,dst) cx_Print (aTHX_ csv, dst)
cx_Print(pTHX_ csv_t * csv,SV * dst)796 static int cx_Print (pTHX_ csv_t *csv, SV *dst) {
797 int result;
798 int keep = 0;
799
800 if (csv->useIO) {
801 SV *tmp = sv_2mortal (newSVpvn (csv->buffer, csv->used));
802 dSP;
803 PUSHMARK (sp);
804 EXTEND (sp, 2);
805 PUSHs ((dst));
806 if (csv->utf8) {
807 STRLEN len;
808 char *ptr;
809 int j;
810
811 ptr = SvPV (tmp, len);
812 while (len > 0 && !is_utf8_sv (tmp) && keep < 16) {
813 ptr[--len] = (char)0;
814 SvCUR_set (tmp, len);
815 keep++;
816 }
817 for (j = 0; j < keep; j++)
818 csv->buffer[j] = csv->buffer[csv->used - keep + j];
819 SvUTF8_on (tmp);
820 }
821 PUSHs (tmp);
822 PUTBACK;
823 result = call_sv (m_print, G_METHOD);
824 SPAGAIN;
825 if (result) {
826 result = POPi;
827 unless (result)
828 (void)SetDiag (csv, 2200);
829 }
830 PUTBACK;
831 }
832 else {
833 sv_catpvn (SvRV (dst), csv->buffer, csv->used);
834 result = TRUE;
835 }
836 if (csv->utf8 && !csv->useIO && csv->decode_utf8
837 && SvROK (dst) && is_utf8_sv (SvRV (dst)))
838 SvUTF8_on (SvRV (dst));
839 csv->used = keep;
840 return result;
841 } /* Print */
842
/* Append byte c to csv's output buffer, flushing it to dst through
 * Print () first when only one free byte is left.
 *
 * NOTE: this expands to a braced block containing `return FALSE`, which is
 * executed when Print () fails.  It can therefore only be used inside a
 * function where returning FALSE is valid.
 */
#define CSV_PUT(csv,dst,c) { \
    if ((csv)->used == sizeof ((csv)->buffer) - 1) { \
        unless (Print ((csv), (dst))) \
            return FALSE; \
        } \
    (csv)->buffer[(csv)->used++] = (c); \
    }
850
851 #define bound_field(csv,i,keep) cx_bound_field (aTHX_ csv, i, keep)
cx_bound_field(pTHX_ csv_t * csv,SSize_t i,int keep)852 static SV *cx_bound_field (pTHX_ csv_t *csv, SSize_t i, int keep) {
853 SV *sv = csv->bound;
854 AV *av;
855
856 /* fprintf (stderr, "# New bind %d/%d\n", i, csv->is_bound);\ */
857 if (i >= csv->is_bound) {
858 (void)SetDiag (csv, 3006);
859 return (NULL);
860 }
861
862 if (sv && SvROK (sv)) {
863 av = (AV *)(SvRV (sv));
864 /* fprintf (stderr, "# Bind %d/%d/%d\n", i, csv->is_bound, av_len (av)); */
865 sv = *av_fetch (av, i, FALSE);
866 if (sv && SvROK (sv)) {
867 sv = SvRV (sv);
868 if (keep)
869 return (sv);
870
871 unless (SvREADONLY (sv)) {
872 SvSetEmpty (sv);
873 return (sv);
874 }
875 }
876 }
877 (void)SetDiag (csv, 3008);
878 return (NULL);
879 } /* bound_field */
880
881 #define was_quoted(mf,idx) cx_was_quoted (aTHX_ mf, idx)
cx_was_quoted(pTHX_ AV * mf,int idx)882 static int cx_was_quoted (pTHX_ AV *mf, int idx) {
883 SV **x = av_fetch (mf, idx, FALSE);
884 return (x && SvIOK (*x) && SvIV (*x) & CSV_FLAGS_QUO ? 1 : 0);
885 } /* was_quoted */
886
887 #define _formula(csv,sv,len,f) cx_formula (aTHX_ csv, sv, len, f)
cx_formula(pTHX_ csv_t * csv,SV * sv,STRLEN * len,int f)888 static char *cx_formula (pTHX_ csv_t *csv, SV *sv, STRLEN *len, int f) {
889
890 int fa = csv->formula;
891
892 if (fa == 1) die ("Formulas are forbidden\n");
893 if (fa == 2) croak ("Formulas are forbidden\n");
894
895 if (fa == 3) {
896 char *ptr = SvPV_nolen (sv);
897 char rec[40];
898 char field[128];
899 SV **svp;
900
901 if (csv->recno) (void)sprintf (rec, " in record %lu", csv->recno + 1);
902 else *rec = (char)0;
903
904 *field = (char)0;
905 if ((svp = hv_fetchs (csv->self, "_COLUMN_NAMES", FALSE)) && _is_arrayref (*svp)) {
906 AV *avp = (AV *)SvRV (*svp);
907 if (avp && av_len (avp) >= (f - 1)) {
908 SV **fnm = av_fetch (avp, f - 1, FALSE);
909 if (fnm && *fnm && SvOK (*fnm))
910 (void)sprintf (field, " (column: '%.100s')", SvPV_nolen (*fnm));
911 }
912 }
913
914 warn ("Field %d%s%s contains formula '%s'\n", f, field, rec, ptr);
915 return ptr;
916 }
917
918 if (len) *len = 0;
919
920 if (fa == 4) {
921 unless (SvREADONLY (sv)) SvSetEmpty (sv);
922 return "";
923 }
924
925 if (fa == 5) {
926 unless (SvREADONLY (sv)) SvSetUndef (sv);
927 return NULL;
928 }
929
930 if (fa == 6) {
931 int result;
932 SV **svp = hv_fetchs (csv->self, "_FORMULA_CB", FALSE);
933 if (svp && _is_coderef (*svp)) {
934 dSP;
935 ENTER;
936 SAVE_DEFSV; /* local $_ */
937 DEFSV = sv;
938 PUSHMARK (SP);
939 PUTBACK;
940 result = call_sv (*svp, G_SCALAR);
941 SPAGAIN;
942 if (result)
943 sv_setsv (sv, POPs);
944 PUTBACK;
945 LEAVE;
946 }
947 return len ? SvPV (sv, *len) : SvPV_nolen (sv);
948 }
949
950 /* So far undefined behavior */
951 return NULL;
952 } /* _formula */
953
954 #define Combine(csv,dst,fields) cx_Combine (aTHX_ csv, dst, fields)
cx_Combine(pTHX_ csv_t * csv,SV * dst,AV * fields)955 static int cx_Combine (pTHX_ csv_t *csv, SV *dst, AV *fields) {
956 SSize_t i, n;
957 int bound = 0;
958 int aq = (int)csv->always_quote;
959 int qe = (int)csv->quote_empty;
960 int kmi = (int)csv->keep_meta_info;
961 AV *qm = NULL;
962
963 n = (IV)av_len (fields);
964 if (n < 0 && csv->is_bound) {
965 n = csv->is_bound - 1;
966 bound = 1;
967 }
968
969 if (kmi >= 10) {
970 SV **svp;
971 if ((svp = hv_fetchs (csv->self, "_FFLAGS", FALSE)) && _is_arrayref (*svp)) {
972 AV *avp = (AV *)SvRV (*svp);
973 if (avp && av_len (avp) >= n)
974 qm = avp;
975 }
976 }
977
978 for (i = 0; i <= n; i++) {
979 SV *sv;
980 STRLEN len = 0;
981 char *ptr = NULL;
982
983 if (i > 0) {
984 CSV_PUT (csv, dst, CH_SEP);
985 if (csv->sep_len) {
986 int x;
987 for (x = 1; x < (int)csv->sep_len; x++)
988 CSV_PUT (csv, dst, csv->sep[x]);
989 }
990 }
991
992 if (bound)
993 sv = bound_field (csv, i, 1);
994 else {
995 SV **svp = av_fetch (fields, i, FALSE);
996 sv = svp && *svp ? *svp : NULL;
997 }
998
999 if (sv && (SvOK (sv) || (
1000 (SvGMAGICAL (sv) && (mg_get (sv), 1) && SvOK (sv))))) {
1001
1002 int quoteMe;
1003
1004 ptr = SvPV (sv, len);
1005
1006 if (*ptr == '=' && csv->formula) {
1007 unless (ptr = _formula (csv, sv, &len, i))
1008 continue;
1009 }
1010 if (len == 0)
1011 quoteMe = aq ? 1 : qe ? 1 : qm ? was_quoted (qm, i) : 0;
1012 else {
1013
1014 if (SvUTF8 (sv)) {
1015 csv->utf8 = 1;
1016 csv->binary = 1;
1017 }
1018
1019 quoteMe = aq ? 1 : qm ? was_quoted (qm, i) : 0;
1020
1021 /* Do we need quoting? We do quote, if the user requested
1022 * (always_quote), if binary or blank characters are found
1023 * and if the string contains quote or escape characters.
1024 */
1025 if (!quoteMe &&
1026 ( quoteMe = (!SvIOK (sv) && !SvNOK (sv) && CH_QUOTE))) {
1027 char *ptr2;
1028 STRLEN l;
1029
1030 #if MAINT_DEBUG > 4
1031 (void)fprintf (stderr, "# Combine:\n");
1032 sv_dump (sv);
1033 #endif
1034 for (ptr2 = ptr, l = len; l; ++ptr2, --l) {
1035 byte c = *ptr2;
1036 #ifdef IS_EBCDIC
1037 byte x = ebcdic2ascii[c];
1038 #if MAINT_DEBUG > 4
1039 (void)fprintf (stderr, " %02x", x);
1040 #endif
1041 #else
1042 byte x = c;
1043 #endif
1044
1045 if ((CH_QUOTE && c == CH_QUOTE) ||
1046 (CH_SEP && c == CH_SEP) ||
1047 (csv->escape_char && c == csv->escape_char) ||
1048 (csv->quote_binary ? (x >= 0x7f && x <= 0xa0) ||
1049 x < csv->first_safe_char
1050 : c == CH_NL || c == CH_CR ||
1051 (csv->quote_space && (
1052 c == CH_SPACE || c == CH_TAB)))) {
1053 /* Binary character */
1054 break;
1055 }
1056 }
1057 #if defined(IS_EBCDIC) && MAINT_DEBUG > 4
1058 (void)fprintf (stderr, "\n");
1059 #endif
1060 quoteMe = (l > 0);
1061 }
1062 }
1063 if (quoteMe) {
1064 CSV_PUT (csv, dst, CH_QUOTE);
1065 if (csv->quo_len) {
1066 int x;
1067 for (x = 1; x < (int)csv->quo_len; x++)
1068 CSV_PUT (csv, dst, csv->quo[x]);
1069 }
1070 }
1071 while (len-- > 0) {
1072 char c = *ptr++;
1073 int e = 0;
1074
1075 if (!csv->binary && is_csv_binary (c)) {
1076 SvREFCNT_inc (sv);
1077 csv->has_error_input = 1;
1078 unless (hv_store (csv->self, "_ERROR_INPUT", 12, sv, 0))
1079 SvREFCNT_dec (sv); /* uncoverable statement memory fail */
1080 (void)SetDiag (csv, 2110);
1081 return FALSE;
1082 }
1083 if (CH_QUOTE && (byte)c == CH_QUOTE && (csv->quo_len == 0 ||
1084 memcmp (ptr, csv->quo +1, csv->quo_len - 1) == 0))
1085 e = 1;
1086 else
1087 if (c == csv->escape_char && csv->escape_char)
1088 e = 1;
1089 else
1090 if (c == (char)0 && csv->escape_null) {
1091 e = 1;
1092 c = '0';
1093 }
1094 if (e && csv->escape_char)
1095 CSV_PUT (csv, dst, csv->escape_char);
1096 CSV_PUT (csv, dst, c);
1097 }
1098 if (quoteMe) {
1099 CSV_PUT (csv, dst, CH_QUOTE);
1100 if (csv->quo_len) {
1101 int x;
1102 for (x = 1; x < (int)csv->quo_len; x++)
1103 CSV_PUT (csv, dst, csv->quo[x]);
1104 }
1105 }
1106 }
1107 else {
1108 if (csv->undef_str) {
1109 byte *ptr = csv->undef_str;
1110 STRLEN len = strlen ((char *)ptr);
1111
1112 if (csv->undef_flg) {
1113 csv->utf8 = 1;
1114 csv->binary = 1;
1115 }
1116
1117 while (len--)
1118 CSV_PUT (csv, dst, *ptr++);
1119 }
1120 }
1121 }
1122 if (csv->eol_len) {
1123 STRLEN len = csv->eol_len;
1124 byte *ptr = csv->eol;
1125
1126 while (len--)
1127 CSV_PUT (csv, dst, *ptr++);
1128 }
1129 if (csv->used)
1130 return Print (csv, dst);
1131 return TRUE;
1132 } /* Combine */
1133
1134 #define ParseError(csv,xse,pos) cx_ParseError (aTHX_ csv, xse, pos)
cx_ParseError(pTHX_ csv_t * csv,int xse,STRLEN pos)1135 static void cx_ParseError (pTHX_ csv_t *csv, int xse, STRLEN pos) {
1136 (void)hv_store (csv->self, "_ERROR_POS", 10, newSViv (pos), 0);
1137 (void)hv_store (csv->self, "_ERROR_FLD", 10, newSViv (csv->fld_idx), 0);
1138 if (csv->tmp) {
1139 csv->has_error_input = 1;
1140 if (hv_store (csv->self, "_ERROR_INPUT", 12, csv->tmp, 0))
1141 SvREFCNT_inc (csv->tmp);
1142 }
1143 (void)SetDiag (csv, xse);
1144 } /* ParseError */
1145
1146 #define CsvGet(csv,src) cx_CsvGet (aTHX_ csv, src)
cx_CsvGet(pTHX_ csv_t * csv,SV * src)1147 static int cx_CsvGet (pTHX_ csv_t *csv, SV *src) {
1148 unless (csv->useIO)
1149 return EOF;
1150
1151 if (csv->tmp && csv->eol_pos >= 0) {
1152 csv->eol_pos = -2;
1153 sv_setpvn (csv->tmp, (char *)csv->eol, csv->eol_len);
1154 csv->bptr = SvPV (csv->tmp, csv->size);
1155 csv->used = 0;
1156 return CH_EOLX;
1157 }
1158
1159 { STRLEN result;
1160 dSP;
1161
1162 PUSHMARK (sp);
1163 EXTEND (sp, 1);
1164 PUSHs (src);
1165 PUTBACK;
1166 result = call_sv (m_getline, G_METHOD);
1167 SPAGAIN;
1168 csv->eol_pos = -1;
1169 csv->tmp = result ? POPs : NULL;
1170 PUTBACK;
1171
1172 #if MAINT_DEBUG > 4
1173 (void)fprintf (stderr, "getline () returned:\n");
1174 sv_dump (csv->tmp);
1175 #endif
1176 }
1177 if (csv->tmp && SvOK (csv->tmp)) {
1178 STRLEN tmp_len;
1179 csv->bptr = SvPV (csv->tmp, tmp_len);
1180 csv->used = 0;
1181 csv->size = tmp_len;
1182 if (csv->eolx && csv->size >= csv->eol_len) {
1183 int i, match = 1;
1184 for (i = 1; i <= (int)csv->eol_len; i++) {
1185 unless (csv->bptr[csv->size - i] == csv->eol[csv->eol_len - i]) {
1186 match = 0;
1187 break;
1188 }
1189 }
1190 if (match) {
1191 #if MAINT_DEBUG > 4
1192 (void)fprintf (stderr, "# EOLX match, size: %d\n", csv->size);
1193 #endif
1194 csv->size -= csv->eol_len;
1195 unless (csv->verbatim)
1196 csv->eol_pos = csv->size;
1197 csv->bptr[csv->size] = (char)0;
1198 SvCUR_set (csv->tmp, csv->size);
1199 unless (csv->verbatim || csv->size)
1200 return CH_EOLX;
1201 }
1202 }
1203 if (SvUTF8 (csv->tmp)) csv->utf8 = 1;
1204 if (tmp_len)
1205 return ((byte)csv->bptr[csv->used++]);
1206 }
1207 csv->useIO |= useIO_EOF;
1208 return EOF;
1209 } /* CsvGet */
1210
/* Error exits for the parser. Both macros expand to the same statements and
 * use the `csv` and `sv` variables of the function they are expanded in:
 * unless columns are bound, one reference on the field under construction
 * (sv) is dropped; the error is recorded through ParseError () at position
 * csv->used - 1; and the ENCLOSING function returns FALSE.
 * The two names only document where the error was detected.
 */
#define ERROR_INSIDE_QUOTES(diag_code) {	\
    unless (csv->is_bound) SvREFCNT_dec (sv);	\
    ParseError (csv, diag_code, csv->used - 1);	\
    return FALSE;				\
    }
#define ERROR_INSIDE_FIELD(diag_code) {		\
    unless (csv->is_bound) SvREFCNT_dec (sv);	\
    ParseError (csv, diag_code, csv->used - 1);	\
    return FALSE;				\
    }
1221
/* Trace helpers used by the CSV_PUT_SV* and AV_PUSH macros. With
 * MAINT_DEBUG > 4 they print a line to stderr (PUSH_RPT also dumps `sv`);
 * otherwise they expand to nothing. PUT_RPT refers to a `c` and PUSH_RPT to
 * an `sv` that must be in scope where the macro is expanded. PUSH_RPT expands
 * to two statements, so it must not be used as the sole body of an unbraced
 * if/else.
 */
#if MAINT_DEBUG > 4
#define PUT_RPT       (void)fprintf (stderr, "# CSV_PUT  @ %4d: 0x%02x '%c'\n", __LINE__, c, isprint (c) ? c : '?')
#define PUT_SEPX_RPT1 (void)fprintf (stderr, "# PUT SEPX @ %4d\n", __LINE__)
#define PUT_SEPX_RPT2 (void)fprintf (stderr, "# Done putting SEPX\n")
#define PUT_QUOX_RPT1 (void)fprintf (stderr, "# PUT QUOX @ %4d\n", __LINE__)
#define PUT_QUOX_RPT2 (void)fprintf (stderr, "# Done putting QUOX\n")
#define PUT_EOLX_RPT1 (void)fprintf (stderr, "# PUT EOLX @ %4d\n", __LINE__)
#define PUT_EOLX_RPT2 (void)fprintf (stderr, "# Done putting EOLX\n")
#define PUSH_RPT      (void)fprintf (stderr, "# AV_PUSHd @ %4d\n", __LINE__); sv_dump (sv)
#else
#define PUT_RPT
#define PUT_SEPX_RPT1
#define PUT_SEPX_RPT2
#define PUT_QUOX_RPT1
#define PUT_QUOX_RPT2
#define PUT_EOLX_RPT1
#define PUT_EOLX_RPT2
#define PUSH_RPT
#endif
/* CSV_PUT_SV1 (c): append the single byte c to the field under construction.
 * Uses the `sv` and `len` variables of the enclosing function: the buffer is
 * grown to hold len + 2 bytes, c is written at the current end and the
 * length is bumped by one. No terminating NUL is written here.
 */
#define CSV_PUT_SV1(c) {			\
    len = SvCUR ((sv));				\
    SvGROW ((sv), len + 2);			\
    *SvEND ((sv)) = c;				\
    PUT_RPT;					\
    SvCUR_set ((sv), len + 1);			\
    }
/* CSV_PUT_SV (c): append c to the field, expanding the pseudo characters
 * CH_EOLX, CH_SEPX and CH_QUOTEX to the complete csv->eol, csv->sep and
 * csv->quo byte sequences. For CH_EOLX the pending-EOL state is cleared as
 * well (csv->eol_pos = -1, and csv->size = 0 when eol_pos was -2).
 * The expansion is a brace block, so call sites may omit the trailing
 * semicolon; several do, directly before `else` or `continue`.
 */
#define CSV_PUT_SV(c) {				\
    if (c == CH_EOLX) {				\
	int x; PUT_EOLX_RPT1;			\
	if (csv->eol_pos == -2)			\
	    csv->size = 0;			\
	for (x = 0; x < (int)csv->eol_len; x++)	\
	    CSV_PUT_SV1 (csv->eol[x]);		\
	csv->eol_pos = -1;			\
	PUT_EOLX_RPT2;				\
	}					\
    else if (c == CH_SEPX) {			\
	int x; PUT_SEPX_RPT1;			\
	for (x = 0; x < (int)csv->sep_len; x++)	\
	    CSV_PUT_SV1 (csv->sep[x]);		\
	PUT_SEPX_RPT2;				\
	}					\
    else if (c == CH_QUOTEX) {			\
	int x; PUT_QUOX_RPT1;			\
	for (x = 0; x < (int)csv->quo_len; x++)	\
	    CSV_PUT_SV1 (csv->quo[x]);		\
	PUT_QUOX_RPT2;				\
	}					\
    else					\
	CSV_PUT_SV1 (c);			\
    }
1273
/* CSV_GET1: the next input byte. Taken from the current buffer while
 * csv->used < csv->size, otherwise obtained through CsvGet (), which may
 * return EOF or CH_EOLX. Uses `csv` and `src` from the enclosing scope.
 */
#define CSV_GET1				\
    (csv->used < csv->size ? (byte)csv->bptr[csv->used++] : CsvGet (csv, src))

/* CSV_GET is what the parser uses. With MAINT_DEBUG > 3 it goes through
 * CSV_GET_ (), which traces the buffer state and the character read to
 * stderr, tagged with the calling source line l; otherwise it is plain
 * CSV_GET1.
 */
#if MAINT_DEBUG > 3
int CSV_GET_ (pTHX_ csv_t *csv, SV *src, int l) {
    int c;
    (void)fprintf (stderr, "# 1-CSV_GET @ %4d: (used: %d, size: %d, eol_pos: %d, eolx = %d)\n", l, csv->used, csv->size, csv->eol_pos, csv->eolx);
    c = CSV_GET1;
    (void)fprintf (stderr, "# 2-CSV_GET @ %4d: 0x%02x '%c'\n", l, c, isprint (c) ? c : '?');
    return (c);
    } /* CSV_GET_ */
#define CSV_GET CSV_GET_ (aTHX_ csv, src, __LINE__)
#else
#define CSV_GET CSV_GET1
#endif
1289
/* AV_PUSH: finish the field under construction and hand it over.
 * Uses csv, sv, f, fnum, fields, fflags and waitingForField from the
 * enclosing function. In order:
 *  - NUL-terminate the buffer and clear the UTF-8 flag;
 *  - if formula handling is on and the field starts with '=', pass it to
 *    _formula ();
 *  - an empty field becomes undef when empty_is_undef is set, or when
 *    blank_is_undef is set and the field was not quoted;
 *  - otherwise trailing whitespace is stripped from unquoted fields when
 *    allow_whitespace is set, and the UTF-8 flag is turned on for fields
 *    flagged binary when decode_utf8 is set and the data is (valid) UTF-8;
 *  - push the field onto `fields` unless columns are bound, and push the
 *    flags onto `fflags` when meta info is kept;
 *  - reset sv to NULL and waitingForField to 1 for the next field.
 */
#define AV_PUSH {						\
    *SvEND (sv) = (char)0;					\
    SvUTF8_off (sv);						\
    if (csv->formula && SvCUR (sv) && *(SvPV_nolen (sv)) == '=')	\
	(void)_formula (csv, sv, NULL, fnum);			\
    if (SvCUR (sv) == 0 && (					\
	    csv->empty_is_undef ||				\
	    (!(f & CSV_FLAGS_QUO) && csv->blank_is_undef)))	\
	SvSetUndef (sv);					\
    else {							\
	if (csv->allow_whitespace && ! (f & CSV_FLAGS_QUO))	\
	    strip_trail_whitespace (sv);			\
	if (f & CSV_FLAGS_BIN && csv->decode_utf8		\
			      && (csv->utf8 || is_utf8_sv (sv)))	\
	    SvUTF8_on (sv);					\
	}							\
    SvSETMAGIC (sv);						\
    unless (csv->is_bound) av_push (fields, sv);		\
    PUSH_RPT;							\
    sv = NULL;							\
    if (csv->keep_meta_info && fflags)				\
	av_push (fflags, newSViv (f));				\
    waitingForField = 1;					\
    }
1314
1315 #define strip_trail_whitespace(sv) cx_strip_trail_whitespace (aTHX_ sv)
cx_strip_trail_whitespace(pTHX_ SV * sv)1316 static void cx_strip_trail_whitespace (pTHX_ SV *sv) {
1317 STRLEN len;
1318 char *s = SvPV (sv, len);
1319 unless (s && len) return;
1320 while (s[len - 1] == CH_SPACE || s[len - 1] == CH_TAB)
1321 s[--len] = (char)0;
1322 SvCUR_set (sv, len);
1323 } /* strip_trail_whitespace */
1324
/* NewField: make sure there is a field SV to collect characters in.
 * Does nothing when `sv` is already set. Otherwise sv becomes the bound
 * column for index fnum (when columns are bound) or a fresh empty string;
 * fnum and csv->fld_idx are incremented and the field flags f are reset.
 * If no SV could be obtained the ENCLOSING function returns FALSE.
 * Uses csv, sv, f and fnum from the enclosing scope.
 */
#define NewField				\
    unless (sv) {				\
	if (csv->is_bound)			\
	    sv = bound_field (csv, fnum, 0);	\
	else					\
	    sv = newSVpvs ("");			\
	fnum++;					\
	unless (sv) return FALSE;		\
	f = 0; csv->fld_idx++;			\
	}
1335
#if MAINT_DEBUG
/* Debug aid: Parse () clears this buffer on entry and then stores the
 * characters it reads at indexes 0 .. 38. */
static char str_parsed[40];
#endif
1339
#if MAINT_DEBUG > 1
/* _sep_string (csv)
 *
 * Debug helper: returns a printable description of the separator. For a
 * multi-byte separator (csv->sep_len != 0) that is a list of two-digit hex
 * values, each followed by a space; otherwise it is CH_SEP shown as a
 * character and in hex.
 *
 * The result lives in a static buffer, so it stays valid after return but
 * is overwritten by the next call (not reentrant). Each separator byte takes
 * three characters, so MAX_ATTR_LEN (16) bytes need 49 of the 64 available.
 */
static char *_sep_string (csv_t *csv) {
    static char sep[64];
    if (csv->sep_len) {
        int x;
        for (x = 0; x < csv->sep_len; x++)
            (void)sprintf (sep + x * 3, "%02x ", csv->sep[x]);
    }
    else
        (void)sprintf (sep, "'%c' (0x%02x)", CH_SEP, CH_SEP);
    return sep;
} /* _sep_string */
#endif
1353
/* Parse (csv, src, fields, fflags)
 *
 * Parse one record. Characters are read through CSV_GET (from the current
 * buffer, refilled from src in IO mode) and collected per field in `sv`.
 * Completed fields are pushed onto `fields` (unless columns are bound) and,
 * when csv->keep_meta_info is set and fflags is given, the field's flag word
 * is pushed onto `fflags`.
 *
 * Returns TRUE when a record was completed and FALSE after an error was
 * recorded through ParseError () or SetDiag (). Note that the NewField,
 * ERROR_INSIDE_QUOTES and ERROR_INSIDE_FIELD macros contain `return FALSE`.
 *
 * Local state:
 *   c                current character (may be the pseudo char CH_EOLX)
 *   f                flag word of the current field (CSV_FLAGS_*)
 *   waitingForField  non-zero while no character of the field was consumed
 *   sv               field under construction, NULL between fields
 *   len              scratch variable written by the CSV_PUT_SV1 macro
 *   seenSomething    set once at least one character was read
 *   fnum             number of fields started in this record
 *   spl              incremented once per pass through the main loop
 *
 * Labels: `restart` re-dispatches on a (possibly replaced) c without reading
 * a new character; `EOLX` is the shared end-of-line handling.
 */
#define Parse(csv,src,fields,fflags)	cx_Parse (aTHX_ csv, src, fields, fflags)
static int cx_Parse (pTHX_ csv_t *csv, SV *src, AV *fields, AV *fflags) {
    int    c, f = 0;
    int    waitingForField = 1;
    SV    *sv = NULL;
    STRLEN len;
    int    seenSomething = FALSE;
    int    fnum = 0;
    int    spl = -1;
#if MAINT_DEBUG
    (void)memset (str_parsed, 0, 40);
#endif

    csv->fld_idx = 0;

    while ((c = CSV_GET) != EOF) {

        NewField;

        seenSomething = TRUE;
        spl++;
#if MAINT_DEBUG
        if (spl < 39) str_parsed[spl] = c;
#endif
restart:
#if MAINT_DEBUG > 9
        (void)fprintf (stderr, "# at restart: %d/%d/%03x pos %d = 0x%02x\n",
            waitingForField ? 1 : 0, sv ? 1 : 0, f, spl, c);
#endif
        if (is_SEP (c)) {
#if MAINT_DEBUG > 1
            (void)fprintf (stderr, "# %d/%d/%03x pos %d = SEP %s\t%s\n",
                waitingForField ? 1 : 0, sv ? 1 : 0, f, spl,
                _sep_string (csv), _pretty_strl (csv->bptr + csv->used));
#endif
            if (waitingForField) {
                /* ,1,"foo, 3",,bar,
                 * ^ ^
                 */
                /* separator right at the start of a field: empty field */
                if (csv->blank_is_undef || csv->empty_is_undef)
                    SvSetUndef (sv);
                else
                    SvSetEmpty (sv);
                unless (csv->is_bound)
                    av_push (fields, sv);
                sv = NULL;
                if (csv->keep_meta_info && fflags)
                    av_push (fflags, newSViv (f));
            }
            else
            if (f & CSV_FLAGS_QUO) {
                /* ,1,"foo, 3",,bar,
                 * ^
                 */
                /* separator inside quotes is data */
                CSV_PUT_SV (c)
            }
            else {
                /* ,1,"foo, 3",,bar,
                 * ^ ^ ^
                 */
                AV_PUSH;
            }
        } /* SEP char */
        else
        if (is_QUOTE (c)) {
#if MAINT_DEBUG > 1
            (void)fprintf (stderr, "# %d/%d/%03x pos %d = QUO '%c'\t\t%s\n",
                waitingForField ? 1 : 0, sv ? 1 : 0, f, spl, c,
                _pretty_strl (csv->bptr + csv->used));
#endif
            if (waitingForField) {
                /* ,1,"foo, 3",,bar,\r\n
                 * ^
                 */
                /* opening quote */
                f |= CSV_FLAGS_QUO;
                waitingForField = 0;
                continue;
            }

            if (f & CSV_FLAGS_QUO) {

                /* ,1,"foo, 3",,bar,\r\n
                 * ^
                 */

                /* quote inside a quoted field: look at the next character
                 * to tell a closing quote from an escaped one */
                int quoesc = 0;
                int c2 = CSV_GET;

                if (csv->allow_whitespace) {
                    /* , 1 , "foo, 3" , , bar , \r\n
                     * ^
                     */
                    while (is_whitespace (c2)) {
                        if (csv->allow_loose_quotes &&
                            !(csv->escape_char && c2 == csv->escape_char)) {
                            /* This feels like a brittle fix for RT115953, where
                             * ["foo "bar" baz"] got parsed as [foo "bar"baz]
                             * when both allow_whitespace and allow_loose_quotes
                             * are true and escape does not equal quote
                             */
                            CSV_PUT_SV (c);
                            c = c2;
                        }
                        c2 = CSV_GET;
                    }
                }

                if (is_SEP (c2)) {
                    /* ,1,"foo, 3",,bar,\r\n
                     * ^
                     */
                    AV_PUSH;
                    continue;
                }

                if (c2 == CH_NL || c2 == CH_EOLX) {
                    /* ,1,"foo, 3",,"bar"\n
                     * ^
                     */
                    AV_PUSH;
                    return TRUE;
                }

                /* ---
                 * if QUOTE eq ESCAPE
                 * AND ( c2 eq QUOTE 1,"abc""def",2
                 * OR c2 eq ESCAPE 1,"abc""def",2 (QUO eq ESC)
                 * OR c2 eq NULL ) 1,"abc"0def",2
                 * ---
                 */
                if (csv->escape_char && c == csv->escape_char) {

                    quoesc = 1;
                    if (c2 == '0') {
                        /* ,1,"foo, 3"056",,bar,\r\n
                         * ^
                         */
                        CSV_PUT_SV (0)
                        continue;
                    }

                    if (is_QUOTE (c2)) {
                        /* ,1,"foo, 3""56",,bar,\r\n
                         * ^
                         */
                        if (csv->utf8)
                            f |= CSV_FLAGS_BIN;
                        CSV_PUT_SV (c2)
                        continue;
                    }

                    if (csv->allow_loose_escapes && c2 != CH_CR) {
                        /* ,1,"foo, 3"56",,bar,\r\n
                         * ^
                         */
                        CSV_PUT_SV (c);
                        c = c2;
                        goto restart;
                    }
                }

                if (c2 == CH_CR) {
                    int c3;

                    if (csv->eol_is_cr) {
                        /* ,1,"foo, 3"\r
                         * ^
                         */
                        AV_PUSH;
                        return TRUE;
                    }

                    c3 = CSV_GET;

                    if (c3 == CH_NL) { /* \r is not optional before EOLX! */
                        /* ,1,"foo, 3"\r\n
                         * ^
                         */
                        AV_PUSH;
                        return TRUE;
                    }

                    if (csv->useIO && csv->eol_len == 0) {
                        if (c3 == CH_CR) { /* \r followed by an empty line */
                            /* ,1,"foo, 3"\r\r
                             * ^
                             */
                            set_eol_is_cr (csv);
                            goto EOLX;
                        }

                        if (!is_csv_binary (c3)) {
                            /* ,1,"foo\n 3",,"bar"\r
                             * baz,4
                             * ^
                             */
                            /* push c3 back: it belongs to the next record */
                            set_eol_is_cr (csv);
                            csv->used--;
                            csv->has_ahead++;
                            AV_PUSH;
                            return TRUE;
                        }
                    }

                    ParseError (csv, quoesc ? 2023 : 2010, csv->used - 2);
                    return FALSE;
                }

                if (c2 == EOF) {
                    /* ,1,"foo, 3"
                     * ^
                     */
                    AV_PUSH;
                    return TRUE;
                }

                if (csv->allow_loose_quotes && !quoesc) {
                    /* ,1,"foo, 3"456",,bar,\r\n
                     * ^
                     */
                    CSV_PUT_SV (c);
                    c = c2;
                    goto restart;
                }

                /* 1,"foo" ",3
                 * ^
                 */
                if (quoesc) {
                    csv->used--;
                    ERROR_INSIDE_QUOTES (2023);
                }

                ERROR_INSIDE_QUOTES (2011);
            }

            /* !waitingForField, !InsideQuotes */
            if (csv->allow_loose_quotes) { /* 1,foo "boo" d'uh,1 */
                f |= CSV_FLAGS_EIF; /* Mark as error-in-field */
                CSV_PUT_SV (c);
            }
            else
                ERROR_INSIDE_FIELD (2034);
        } /* QUO char */
        else
        if (c == csv->escape_char && csv->escape_char) {
#if MAINT_DEBUG > 1
            (void)fprintf (stderr, "# %d/%d/%03x pos %d = ESC '%c'\t%s\n",
                waitingForField ? 1 : 0, sv ? 1 : 0, f, spl, c,
                _pretty_strl (csv->bptr + csv->used));
#endif
            /* This means quote_char != escape_char */
            if (waitingForField) {
                waitingForField = 0;
                if (csv->allow_unquoted_escape) {
                    /* The escape character is the first character of an
                     * unquoted field */
                    /* ... get and store next character */
                    int c2 = CSV_GET;

                    SvSetEmpty (sv);

                    if (c2 == EOF) {
                        csv->used--;
                        ERROR_INSIDE_FIELD (2035);
                    }

                    if (c2 == '0')
                        CSV_PUT_SV (0)
                    else
                    if ( is_QUOTE (c2) || is_SEP (c2) ||
                         c2 == csv->escape_char || csv->allow_loose_escapes) {
                        if (csv->utf8)
                            f |= CSV_FLAGS_BIN;
                        CSV_PUT_SV (c2)
                    }
                    else {
                        csv->used--;
                        ERROR_INSIDE_QUOTES (2025);
                    }
                }
            }
            else
            if (f & CSV_FLAGS_QUO) {
                /* escape inside a quoted field */
                int c2 = CSV_GET;

                if (c2 == EOF) {
                    csv->used--;
                    ERROR_INSIDE_QUOTES (2024);
                }

                if (c2 == '0')
                    CSV_PUT_SV (0)
                else
                if ( is_QUOTE (c2) || is_SEP (c2) ||
                     c2 == csv->escape_char || csv->allow_loose_escapes) {
                    if (csv->utf8)
                        f |= CSV_FLAGS_BIN;
                    CSV_PUT_SV (c2)
                }
                else {
                    csv->used--;
                    ERROR_INSIDE_QUOTES (2025);
                }
            }
            else
            if (sv) {
                /* escape inside an unquoted field: take the next
                 * character literally */
                int c2 = CSV_GET;

                if (c2 == EOF) {
                    csv->used--;
                    ERROR_INSIDE_FIELD (2035);
                }

                CSV_PUT_SV (c2);
            }
            else
                ERROR_INSIDE_FIELD (2036); /* uncoverable statement I think there's no way to get here */
        } /* ESC char */
        else
        if (c == CH_NL || is_EOL (c)) {
EOLX:
#if MAINT_DEBUG > 1
            (void)fprintf (stderr, "# %d/%d/%03x pos %d = NL\t%s\n",
                waitingForField ? 1 : 0, sv ? 1 : 0, f, spl,
                _pretty_strl (csv->bptr + csv->used));
#endif
            /* empty line and skip_empty_rows: drop it and go on with the
             * next line, or stop when there is none */
            if (fnum == 1 && f == 0 && SvCUR (sv) == 0 && csv->skip_empty_rows) {
                csv->fld_idx = 0;
                c = CSV_GET;
                if (c == EOF) {
                    sv_free (sv);
                    sv = NULL;
                    waitingForField = 0;
                    break;
                }
                goto restart;
            }

            if (waitingForField) {
                /* ,1,"foo, 3",,bar,
                 * ^
                 */
                if (csv->blank_is_undef || csv->empty_is_undef)
                    SvSetUndef (sv);
                else
                    SvSetEmpty (sv);
                unless (csv->is_bound)
                    av_push (fields, sv);
                if (csv->keep_meta_info && fflags)
                    av_push (fflags, newSViv (f));
                return TRUE;
            }

            if (f & CSV_FLAGS_QUO) {
                /* ,1,"foo\n 3",,bar,
                 * ^
                 */
                f |= CSV_FLAGS_BIN;
                unless (csv->binary)
                    ERROR_INSIDE_QUOTES (2021);

                CSV_PUT_SV (c);
            }
            else
            if (csv->verbatim) {
                /* ,1,foo\n 3,,bar,
                 * This feature should be deprecated
                 */
                f |= CSV_FLAGS_BIN;
                unless (csv->binary)
                    ERROR_INSIDE_FIELD (2030);

                CSV_PUT_SV (c);
            }
            else {
                /* sep=,
                 * ^
                 */
                /* A first line in IO mode of the form "sep=..." sets the
                 * separator: the bytes after '=' are copied to csv->sep
                 * and parsing starts over with the next line. */
                if (csv->recno == 0 && csv->fld_idx == 1 && csv->useIO &&
                        (csv->bptr[0] == 's' || csv->bptr[0] == 'S') &&
                        (csv->bptr[1] == 'e' || csv->bptr[1] == 'E') &&
                        (csv->bptr[2] == 'p' || csv->bptr[2] == 'P') &&
                         csv->bptr[3] == '=') {
                    char *sep = csv->bptr + 4;
                    int lnu = csv->used - 5;
                    if (lnu <= MAX_ATTR_LEN) {
                        sep[lnu] = (char)0;
                        (void)memcpy (csv->sep, sep, lnu);
                        csv->sep_len = lnu == 1 ? 0 : lnu;
                        return Parse (csv, src, fields, fflags);
                    }
                }

                /* ,1,"foo\n 3",,bar
                 * ^
                 */
                AV_PUSH;
                return TRUE;
            }
        } /* CH_NL */
        else
        if (c == CH_CR && !(csv->verbatim)) {
#if MAINT_DEBUG > 1
            (void)fprintf (stderr, "# %d/%d/%03x pos %d = CR\n",
                waitingForField ? 1 : 0, sv ? 1 : 0, f, spl);
#endif
            if (waitingForField) {
                int c2;

                if (csv->eol_is_cr) {
                    /* ,1,"foo\n 3",,bar,\r
                     * ^
                     */
                    c = CH_NL;
                    goto EOLX;
                }

                c2 = CSV_GET;

                if (c2 == EOF) {
                    /* ,1,"foo\n 3",,bar,\r
                     * ^
                     */
                    c = EOF;

#if MAINT_DEBUG > 9
                    (void)fprintf (stderr, "# (%d) ... CR EOF 0x%x\n",
                        seenSomething, c);
#endif
                    unless (seenSomething)
                        break;
                    goto restart;
                }

                if (c2 == CH_NL) { /* \r is not optional before EOLX! */
                    /* ,1,"foo\n 3",,bar,\r\n
                     * ^
                     */
                    c = c2;
                    goto EOLX;
                }

                if (csv->useIO && csv->eol_len == 0) {
                    if (c2 == CH_CR) { /* \r followed by an empty line */
                        /* ,1,"foo\n 3",,bar,\r\r
                         * ^
                         */
                        set_eol_is_cr (csv);
                        goto EOLX;
                    }

                    waitingForField = 0;

                    if (!is_csv_binary (c2)) {
                        /* ,1,"foo\n 3",,bar,\r
                         * baz,4
                         * ^
                         */
                        /* push c2 back: it belongs to the next record */
                        set_eol_is_cr (csv);
                        csv->used--;
                        csv->has_ahead++;
                        if (fnum == 1 && f == 0 && SvCUR (sv) == 0 && csv->skip_empty_rows) {
                            csv->fld_idx = 0;
                            c = CSV_GET;
                            if (c == EOF) {
                                sv_free (sv);
                                sv = NULL;
                                waitingForField = 0;
                                break;
                            }
                            goto restart;
                        }
                        AV_PUSH;
                        return TRUE;
                    }
                }

                /* ,1,"foo\n 3",,bar,\r\t
                 * ^
                 */
                csv->used--;
                ERROR_INSIDE_FIELD (2031);
            }

            if (f & CSV_FLAGS_QUO) {
                /* ,1,"foo\r 3",,bar,\r\t
                 * ^
                 */
                f |= CSV_FLAGS_BIN;
                unless (csv->binary)
                    ERROR_INSIDE_QUOTES (2022);

                CSV_PUT_SV (c);
            }
            else {
                int c2;

                if (csv->eol_is_cr) {
                    /* ,1,"foo\n 3",,bar\r
                     * ^
                     */
                    goto EOLX;
                }

                c2 = CSV_GET;

                if (c2 == CH_NL) { /* \r is not optional before EOLX! */
                    /* ,1,"foo\n 3",,bar\r\n
                     * ^
                     */
                    goto EOLX;
                }

                if (csv->useIO && csv->eol_len == 0) {
                    if (!is_csv_binary (c2)
                        /* ,1,"foo\n 3",,bar\r
                         * baz,4
                         * ^
                         */
                        || c2 == CH_CR) {
                        /* ,1,"foo\n 3",,bar,\r\r
                         * ^
                         */
                        set_eol_is_cr (csv);
                        csv->used--;
                        csv->has_ahead++;
                        if (fnum == 1 && f == 0 && SvCUR (sv) == 0 && csv->skip_empty_rows) {
                            csv->fld_idx = 0;
                            c = CSV_GET;
                            if (c == EOF) {
                                sv_free (sv);
                                sv = NULL;
                                waitingForField = 0;
                                break;
                            }
                            goto restart;
                        }
                        AV_PUSH;
                        return TRUE;
                    }
                }

                /* ,1,"foo\n 3",,bar\r\t
                 * ^
                 */
                ERROR_INSIDE_FIELD (2032);
            }
        } /* CH_CR */
        else {
#if MAINT_DEBUG > 1
            (void)fprintf (stderr, "# %d/%d/%03x pos %d = CCC '%c'\t\t%s\n",
                waitingForField ? 1 : 0, sv ? 1 : 0, f, spl, c,
                _pretty_strl (csv->bptr + csv->used));
#endif
            /* Needed for non-IO parse, where EOL is not set during read */
            if (csv->eolx && c == CH_EOL &&
                    csv->size - csv->used >= (STRLEN)csv->eol_len - 1 &&
                    !memcmp (csv->bptr + csv->used, csv->eol + 1, csv->eol_len - 1) &&
                    (csv->used += csv->eol_len - 1)) {
                c = CH_EOLX;
#if MAINT_DEBUG > 5
                (void)fprintf (stderr, "# -> EOLX (0x%x)\n", c);
#endif
                goto EOLX;
            }

            if (waitingForField) {
                /* comment line: only recognised at the very first
                 * character (spl == 0, no flags set) */
                if (csv->comment_str && !f && !spl && c == *csv->comment_str) {
                    STRLEN cl = strlen ((char *)csv->comment_str);

#if MAINT_DEBUG > 5
                    (void)fprintf (stderr,
                        "COMMENT? cl = %d, size = %d, used = %d\n",
                        cl, csv->size, csv->used);
#endif
                    if (cl == 1 || (
                           (csv->size - csv->used >= cl - 1 &&
                             !memcmp (csv->bptr + csv->used, csv->comment_str + 1, cl - 1) &&
                             (csv->used += cl - 1)))) {
                        /* discard the rest of the buffer */
                        csv->used = csv->size;
                        csv->fld_idx = 0;
                        c = CSV_GET;
#if MAINT_DEBUG > 5
                        (void)fprintf (stderr, "# COMMENT, SKIPPED\n");
#endif
                        goto restart;
                    }
                }

                /* skip leading whitespace */
                if (csv->allow_whitespace && is_whitespace (c)) {
                    do {
                        c = CSV_GET;
#if MAINT_DEBUG > 5
                        (void)fprintf (stderr, "# WS next got (0x%x)\n", c);
#endif
                    } while (is_whitespace (c));
                    if (c == EOF)
                        break;
                    goto restart;
                }
                /* first data character of an unquoted field: handle it
                 * again now that the field has started */
                waitingForField = 0;
                goto restart;
            }

#if MAINT_DEBUG > 5
            (void)fprintf (stderr, "# %sc 0x%x is%s binary %s utf8\n",
                f & CSV_FLAGS_QUO ? "quoted " : "", c,
                is_csv_binary (c) ? "" : " not",
                csv->utf8 ? "is" : "not");
#endif
            if (f & CSV_FLAGS_QUO) {
                if (is_csv_binary (c)) {
                    f |= CSV_FLAGS_BIN;
                    unless (csv->binary || csv->utf8)
                        ERROR_INSIDE_QUOTES (2026);
                }
                CSV_PUT_SV (c);
            }
            else {
                if (is_csv_binary (c)) {
                    /* c can be EOF here after one of the `goto restart`
                     * paths above */
                    if (csv->useIO && c == EOF)
                        break;
                    f |= CSV_FLAGS_BIN;
                    unless (csv->binary || csv->utf8)
                        ERROR_INSIDE_FIELD (2037);
                }
                CSV_PUT_SV (c);
            }
        }

        /* continue */
        if (csv->verbatim && csv->useIO && csv->used == csv->size)
            break;
    }

    /* Input ended (or the loop was left) without an end-of-line. */
    if (waitingForField) {
        if (seenSomething || !csv->useIO) {
            NewField;
            if (csv->blank_is_undef || csv->empty_is_undef)
                SvSetUndef (sv);
            else
                SvSetEmpty (sv);
            unless (csv->is_bound)
                av_push (fields, sv);
            if (csv->keep_meta_info && fflags)
                av_push (fflags, newSViv (f));
            return TRUE;
        }

        (void)SetDiag (csv, 2012);
        return FALSE;
    }

    /* still inside quotes at end of input */
    if (f & CSV_FLAGS_QUO)
        ERROR_INSIDE_QUOTES (2027);

    if (sv)
        AV_PUSH;
    return TRUE;
} /* Parse */
2015
/* hook (hv, cb_name, av)
 *
 * Invoke the callback registered under cb_name in the "callbacks" hash of
 * the object hv, passing references to hv and av as its two arguments.
 *
 * Returns 0 when there is no callbacks hash, when no code reference is
 * registered under cb_name, or when the callback returned a reference to
 * the string "skip". Otherwise returns the number of values call_sv ()
 * reported for the call.
 */
static int hook (pTHX_ HV *hv, char *cb_name, AV *av) {
    SV **entry;
    HV  *callbacks;
    int  rc;

#if MAINT_DEBUG > 1
    (void)fprintf (stderr, "# HOOK %s %x\n", cb_name, av);
#endif
    entry = hv_fetchs (hv, "callbacks", FALSE);
    if (!(entry && _is_hashref (*entry)))
        return 0; /* uncoverable statement defensive programming */

    callbacks = (HV *)SvRV (*entry);
    entry     = hv_fetch (callbacks, cb_name, strlen (cb_name), FALSE);
    if (!(entry && _is_coderef (*entry)))
        return 0;

    {   dSP;

        ENTER;
        SAVETMPS;
        PUSHMARK (SP);
        mXPUSHs (newRV_inc ((SV *)hv));
        mXPUSHs (newRV_inc ((SV *)av));
        PUTBACK;

        rc = call_sv (*entry, G_SCALAR);

        SPAGAIN;
        if (rc) {
            SV *ret = POPs;
            /* a callback asks to drop the record by returning \"skip" */
            if (SvROK (ret)) {
                ret = SvRV (ret);
                if (ret && SvPOK (ret) &&
                        strcmp (SvPV_nolen (ret), "skip") == 0)
                    rc = 0;
            }
        }
        PUTBACK;
        FREETMPS;
        LEAVE;
    }
    return rc;
} /* hook */
2054
2055 #define c_xsParse(csv,hv,av,avf,src,useIO) cx_c_xsParse (aTHX_ csv, hv, av, avf, src, useIO)
cx_c_xsParse(pTHX_ csv_t csv,HV * hv,AV * av,AV * avf,SV * src,bool useIO)2056 static int cx_c_xsParse (pTHX_ csv_t csv, HV *hv, AV *av, AV *avf, SV *src, bool useIO) {
2057 int result, ahead = 0;
2058 SV *pos = NULL;
2059
2060 ENTER;
2061 if (csv.eolx || csv.eol_is_cr) {
2062 /* local $/ = $eol */
2063 SAVEGENERICSV (PL_rs);
2064 PL_rs = newSVpvn ((char *)csv.eol, csv.eol_len);
2065 }
2066
2067 if ((csv.useIO = useIO)) {
2068 csv.tmp = NULL;
2069
2070 if ((ahead = csv.has_ahead)) {
2071 SV **svp;
2072 if ((svp = hv_fetchs (hv, "_AHEAD", FALSE)) && *svp) {
2073 csv.bptr = SvPV (csv.tmp = *svp, csv.size);
2074 csv.used = 0;
2075 if (pos && SvIV (pos) > (IV)csv.size)
2076 sv_setiv (pos, SvIV (pos) - csv.size);
2077 }
2078 }
2079 }
2080 else {
2081 csv.tmp = src;
2082 csv.utf8 = SvUTF8 (src) ? 1 : 0;
2083 csv.bptr = SvPV (src, csv.size);
2084 }
2085 if (csv.has_error_input) {
2086 (void)hv_store (hv, "_ERROR_INPUT", 12, &PL_sv_undef, 0);
2087 csv.has_error_input = 0;
2088 }
2089
2090 result = Parse (&csv, src, av, avf);
2091 (void)hv_store (hv, "_RECNO", 6, newSViv (++csv.recno), 0);
2092 (void)hv_store (hv, "_EOF", 4, &PL_sv_no, 0);
2093
2094 if (csv.strict) {
2095 unless (csv.strict_n) csv.strict_n = (short)csv.fld_idx;
2096 if (csv.fld_idx != csv.strict_n) {
2097 unless (csv.useIO & useIO_EOF)
2098 ParseError (&csv, 2014, csv.used);
2099 if (last_error) /* an error callback can reset and accept */
2100 result = FALSE;
2101 }
2102 }
2103
2104 if (csv.useIO) {
2105 if (csv.tmp && csv.used < csv.size && csv.has_ahead) {
2106 SV *sv = newSVpvn (csv.bptr + csv.used, csv.size - csv.used);
2107 (void)hv_store (hv, "_AHEAD", 6, sv, 0);
2108 }
2109 else {
2110 csv.has_ahead = 0;
2111 if (csv.useIO & useIO_EOF)
2112 (void)hv_store (hv, "_EOF", 4, &PL_sv_yes, 0);
2113 }
2114 /* csv.cache[CACHE_ID__has_ahead] = csv.has_ahead; */
2115 (void)memcpy (csv.cache, &csv, sizeof (csv_t));
2116
2117 if (avf) {
2118 if (csv.keep_meta_info)
2119 (void)hv_store (hv, "_FFLAGS", 7, newRV_noinc ((SV *)avf), 0);
2120 else {
2121 av_undef (avf);
2122 sv_free ((SV *)avf);
2123 }
2124 }
2125 }
2126 else /* just copy the cache */
2127 (void)memcpy (csv.cache, &csv, sizeof (csv_t));
2128
2129 if (result && csv.types) {
2130 STRLEN i;
2131 STRLEN len = av_len (av);
2132 SV **svp;
2133
2134 for (i = 0; i <= len && i <= csv.types_len; i++) {
2135 if ((svp = av_fetch (av, i, FALSE)) && *svp && SvOK (*svp)) {
2136 switch (csv.types[i]) {
2137 case CSV_XS_TYPE_IV:
2138 #ifdef CSV_XS_TYPE_WARN
2139 sv_setiv (*svp, SvIV (*svp));
2140 #else
2141 if (SvTRUE (*svp))
2142 sv_setiv (*svp, SvIV (*svp));
2143 else
2144 sv_setiv (*svp, 0);
2145 #endif
2146 break;
2147
2148 case CSV_XS_TYPE_NV:
2149 #ifdef CSV_XS_TYPE_WARN
2150 sv_setnv (*svp, SvNV (*svp));
2151 #else
2152 if (SvTRUE (*svp))
2153 sv_setnv (*svp, SvNV (*svp));
2154 else
2155 sv_setnv (*svp, 0.0);
2156 #endif
2157 break;
2158
2159 default:
2160 break;
2161 }
2162 }
2163 }
2164 }
2165
2166 LEAVE;
2167
2168 return result;
2169 } /* c_xsParse */
2170
2171 #define xsParse(self,hv,av,avf,src,useIO) cx_xsParse (aTHX_ self, hv, av, avf, src, useIO)
cx_xsParse(pTHX_ SV * self,HV * hv,AV * av,AV * avf,SV * src,bool useIO)2172 static int cx_xsParse (pTHX_ SV *self, HV *hv, AV *av, AV *avf, SV *src, bool useIO) {
2173 csv_t csv;
2174 int state;
2175 SetupCsv (&csv, hv, self);
2176 state = c_xsParse (csv, hv, av, avf, src, useIO);
2177 if (state && csv.has_hooks & HOOK_AFTER_PARSE)
2178 (void)hook (aTHX_ hv, "after_parse", av);
2179 return (state || !last_error);
2180 } /* xsParse */
2181
2182 /* API also offers av_clear and av_undef, but they have more overhead */
2183 #define av_empty(av) cx_av_empty (aTHX_ av)
cx_av_empty(pTHX_ AV * av)2184 static void cx_av_empty (pTHX_ AV *av) {
2185 while (av_len (av) >= 0)
2186 sv_free (av_pop (av));
2187 } /* av_empty */
2188
2189 #define xsParse_all(self,hv,io,off,len) cx_xsParse_all (aTHX_ self, hv, io, off, len)
cx_xsParse_all(pTHX_ SV * self,HV * hv,SV * io,SV * off,SV * len)2190 static SV *cx_xsParse_all (pTHX_ SV *self, HV *hv, SV *io, SV *off, SV *len) {
2191 csv_t csv;
2192 int n = 0, skip = 0, length = MAXINT, tail = MAXINT;
2193 AV *avr = newAV ();
2194 AV *row = newAV ();
2195
2196 SetupCsv (&csv, hv, self);
2197
2198 if (SvIOK (off)) {
2199 skip = SvIV (off);
2200 if (skip < 0) {
2201 tail = -skip;
2202 skip = -1;
2203 }
2204 }
2205 if (SvIOK (len))
2206 length = SvIV (len);
2207
2208 while (c_xsParse (csv, hv, row, NULL, io, 1)) {
2209
2210 SetupCsv (&csv, hv, self);
2211
2212 if (skip > 0) {
2213 skip--;
2214 av_empty (row); /* re-use */
2215 continue;
2216 }
2217
2218 if (n++ >= tail) {
2219 SvREFCNT_dec (av_shift (avr));
2220 n--;
2221 }
2222
2223 if (csv.has_hooks & HOOK_AFTER_PARSE) {
2224 unless (hook (aTHX_ hv, "after_parse", row)) {
2225 av_empty (row); /* re-use */
2226 continue;
2227 }
2228 }
2229 av_push (avr, newRV_noinc ((SV *)row));
2230
2231 if (n >= length && skip >= 0)
2232 break; /* We have enough */
2233
2234 row = newAV ();
2235 }
2236 while (n > length) {
2237 SvREFCNT_dec (av_pop (avr));
2238 n--;
2239 }
2240
2241 return (SV *)sv_2mortal (newRV_noinc ((SV *)avr));
2242 } /* xsParse_all */
2243
2244 #define xsCombine(self,hv,av,io,useIO) cx_xsCombine (aTHX_ self, hv, av, io, useIO)
cx_xsCombine(pTHX_ SV * self,HV * hv,AV * av,SV * io,bool useIO)2245 static int cx_xsCombine (pTHX_ SV *self, HV *hv, AV *av, SV *io, bool useIO) {
2246 csv_t csv;
2247 int result;
2248 #if (PERL_BCDVERSION >= 0x5008000)
2249 SV *ors = PL_ors_sv;
2250 #endif
2251
2252 SetupCsv (&csv, hv, self);
2253 csv.useIO = useIO;
2254 #if (PERL_BCDVERSION >= 0x5008000)
2255 if (*csv.eol)
2256 PL_ors_sv = NULL;
2257 #endif
2258 if (useIO && csv.has_hooks & HOOK_BEFORE_PRINT)
2259 (void)hook (aTHX_ hv, "before_print", av);
2260 result = Combine (&csv, io, av);
2261 #if (PERL_BCDVERSION >= 0x5008000)
2262 PL_ors_sv = ors;
2263 #endif
2264 if (result && !useIO && csv.utf8)
2265 sv_utf8_upgrade (io);
2266 return result;
2267 } /* xsCombine */
2268
2269 MODULE = Text::CSV_XS PACKAGE = Text::CSV_XS
2270
2271 PROTOTYPES: DISABLE
2272
2273 BOOT:
2274 m_getline = newSVpvs ("getline");
2275 m_print = newSVpvs ("print");
2276 Perl_load_module (aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvs ("IO::Handle"), NULL, NULL, NULL);
2277
2278 void
SetDiag(self,xse,...)2279 SetDiag (self, xse, ...)
2280 SV *self
2281 int xse
2282
2283 PPCODE:
2284 HV *hv;
2285 csv_t csv;
2286
2287 if (SvOK (self) && SvROK (self)) {
2288 CSV_XS_SELF;
2289 SetupCsv (&csv, hv, self);
2290 ST (0) = SetDiag (&csv, xse);
2291 }
2292 else {
2293 last_error = xse;
2294 ST (0) = sv_2mortal (SvDiag (xse));
2295 }
2296
2297 if (xse && items > 1 && SvPOK (ST (2))) {
2298 sv_setpvn (ST (0), SvPVX (ST (2)), SvCUR (ST (2)));
2299 SvIOK_on (ST (0));
2300 }
2301
2302 XSRETURN (1);
2303 /* XS SetDiag */
2304
2305 void
2306 error_input (self)
2307 SV *self
2308
2309 PPCODE:
2310 if (self && SvOK (self) && SvROK (self) && SvTYPE (SvRV (self)) == SVt_PVHV) {
2311 HV *hv = (HV *)SvRV (self);
2312 SV **sv = hv_fetchs (hv, "_ERROR_INPUT", FALSE);
2313 if (SvOK (*sv))
2314 ST (0) = *sv;
2315 else
2316 ST (0) = newSV (0);
2317 }
2318 else
2319 ST (0) = newSV (0);
2320
2321 XSRETURN (1);
2322 /* XS error_input */
2323
2324 void
2325 Combine (self, dst, fields, useIO)
2326 SV *self
2327 SV *dst
2328 SV *fields
2329 bool useIO
2330
2331 PPCODE:
2332 HV *hv;
2333 AV *av;
2334
2335 CSV_XS_SELF;
2336 av = (AV *)SvRV (fields);
2337 ST (0) = xsCombine (self, hv, av, dst, useIO) ? &PL_sv_yes : &PL_sv_undef;
2338 XSRETURN (1);
2339 /* XS Combine */
2340
2341 void
2342 Parse (self, src, fields, fflags)
2343 SV *self
2344 SV *src
2345 SV *fields
2346 SV *fflags
2347
2348 PPCODE:
2349 HV *hv;
2350 AV *av;
2351 AV *avf;
2352
2353 CSV_XS_SELF;
2354 av = (AV *)SvRV (fields);
2355 avf = (AV *)SvRV (fflags);
2356
2357 ST (0) = xsParse (self, hv, av, avf, src, 0) ? &PL_sv_yes : &PL_sv_no;
2358 XSRETURN (1);
2359 /* XS Parse */
2360
2361 void
2362 print (self, io, fields)
2363 SV *self
2364 SV *io
2365 SV *fields
2366
2367 PPCODE:
2368 HV *hv;
2369 AV *av;
2370
2371 CSV_XS_SELF;
2372 if (fields == &PL_sv_undef)
2373 av = newAV ();
2374 else {
2375 unless (_is_arrayref (fields))
2376 croak ("Expected fields to be an array ref");
2377
2378 av = (AV *)SvRV (fields);
2379 }
2380
2381 ST (0) = xsCombine (self, hv, av, io, 1) ? &PL_sv_yes : &PL_sv_no;
2382 XSRETURN (1);
2383 /* XS print */
2384
2385 void
2386 getline (self, io)
2387 SV *self
2388 SV *io
2389
2390 PPCODE:
2391 HV *hv;
2392 AV *av;
2393 AV *avf;
2394
2395 CSV_XS_SELF;
2396 av = newAV ();
2397 avf = newAV ();
2398 ST (0) = xsParse (self, hv, av, avf, io, 1)
2399 ? sv_2mortal (newRV_noinc ((SV *)av))
2400 : &PL_sv_undef;
2401 XSRETURN (1);
2402 /* XS getline */
2403
2404 void
2405 getline_all (self, io, ...)
2406 SV *self
2407 SV *io
2408
2409 PPCODE:
2410 HV *hv;
2411 SV *offset, *length;
2412
2413 CSV_XS_SELF;
2414
2415 offset = items > 2 ? ST (2) : &PL_sv_undef;
2416 length = items > 3 ? ST (3) : &PL_sv_undef;
2417
2418 ST (0) = xsParse_all (self, hv, io, offset, length);
2419 XSRETURN (1);
2420 /* XS getline_all */
2421
2422 void
2423 _cache_set (self, idx, val)
2424 SV *self
2425 int idx
2426 SV *val
2427
2428 PPCODE:
2429 HV *hv;
2430
2431 CSV_XS_SELF;
2432 xs_cache_set (hv, idx, val);
2433 XSRETURN (1);
2434 /* XS _cache_set */
2435
2436 void
2437 _cache_diag (self)
2438 SV *self
2439
2440 PPCODE:
2441 HV *hv;
2442
2443 CSV_XS_SELF;
2444 xs_cache_diag (hv);
2445 XSRETURN (1);
2446 /* XS _cache_diag */
2447