package Text::CSV_XS;

# Copyright (c) 2007-2021 H.Merijn Brand. All rights reserved.
# Copyright (c) 1998-2001 Jochen Wiedmann. All rights reserved.
# Copyright (c) 1997 Alan Citterman. All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.

# HISTORY
#
# 0.24 -
#    H.Merijn Brand (h.m.brand@xs4all.nl)
# 0.10 - 0.23
#    Jochen Wiedmann <joe@ispsoft.de>
# Based on (the original) Text::CSV by:
#    Alan Citterman <alan@mfgrtl.com>

require 5.006001;

use strict;
use warnings;

require Exporter;
use XSLoader;
use Carp;

use vars qw( $VERSION @ISA @EXPORT_OK );
$VERSION   = "1.47";
@ISA       = qw( Exporter );
@EXPORT_OK = qw( csv );
XSLoader::load ("Text::CSV_XS", $VERSION);

# Numeric type codes. The types () method turns each entry of its list
# into a character with chr (), so these are the values to put there.
sub PV { 0 }
sub IV { 1 }
sub NV { 2 }

# Perls older than 5.8.2 get a no-op utf8::decode so the calls further
# down in this file do not fail on them.
if ($] < 5.008002) {
    no warnings "redefine";
    *utf8::decode = sub {};
}

# version
#
#   class/object method expecting no arguments and returning the version
#   number of Text::CSV_XS.  there are no side-effects.

sub version {
    return $VERSION;
} # version

# new
#
#   class/object method accepting an optional hash reference with
#   attributes and returning a reference to a newly created Text::CSV_XS
#   object, or nothing on failure.
# Default value for every attribute of a fresh object. Keys starting with
# an underscore (and ENCODING) hold object state and are not accepted as
# options by new ().
my %def_attr = (
    'eol'                   => '',
    'sep_char'              => ',',
    'quote_char'            => '"',
    'escape_char'           => '"',
    'binary'                => 0,
    'decode_utf8'           => 1,
    'auto_diag'             => 0,
    'diag_verbose'          => 0,
    'strict'                => 0,
    'blank_is_undef'        => 0,
    'empty_is_undef'        => 0,
    'allow_whitespace'      => 0,
    'allow_loose_quotes'    => 0,
    'allow_loose_escapes'   => 0,
    'allow_unquoted_escape' => 0,
    'always_quote'          => 0,
    'quote_empty'           => 0,
    'quote_space'           => 1,
    'quote_binary'          => 1,
    'escape_null'           => 1,
    'keep_meta_info'        => 0,
    'verbatim'              => 0,
    'formula'               => 0,
    'skip_empty_rows'       => 0,
    'undef_str'             => undef,
    'comment_str'           => undef,
    'types'                 => undef,
    'callbacks'             => undef,

    '_EOF'                  => "",
    '_RECNO'                => 0,
    '_STATUS'               => undef,
    '_FIELDS'               => undef,
    '_FFLAGS'               => undef,
    '_STRING'               => undef,
    '_ERROR_INPUT'          => undef,
    '_COLUMN_NAMES'         => undef,
    '_BOUND_COLUMNS'        => undef,
    '_AHEAD'                => undef,
    '_FORMULA_CB'           => undef,

    'ENCODING'              => undef,
);

# Alternative spellings accepted by new (), mapped to the canonical name.
my %attr_alias = (
    'quote_always' => "always_quote",
    'verbose_diag' => "diag_verbose",
    'quote_null'   => "escape_null",
    'escape'       => "escape_char",
    'comment'      => "comment_str",
);

# Diagnostic of the most recent new () call; error_diag () falls back to
# this when it is not called on an object.
my $last_new_err = Text::CSV_XS->SetDiag (0);
my $ebcdic       = ord ("A") == 0xC1; # Faster than $Config{'ebcdic'}

# _unhealthy_whitespace
#
#   Returns 1002 if $aw (the requested allow_whitespace value) is true and
#   the quote or escape character starts with a space or TAB, 0 otherwise.
#
# NOT a method: is also used before bless
sub _unhealthy_whitespace {
    my ($self, $aw) = @_;
    $aw or return 0; # no checks needed without allow_whitespace

    # A non-empty multi-character 'quote' takes precedence over 'quote_char'
    my $quo = $self->{'quote'};
    defined $quo && length ($quo) or $quo = $self->{'quote_char'};
    my $esc = $self->{'escape_char'};

    defined $quo && $quo =~ m/^[ \t]/ and return 1002;
    defined $esc && $esc =~ m/^[ \t]/ and return 1002;

    return 0;
} # _unhealthy_whitespace

# _check_sanity
#
#   Validates the combination of separator, quote, escape and eol stored
#   in $self (a plain or blessed hash). Returns 0 if all is well, or the
#   numeric error code of the first violated rule.
sub _check_sanity {
    my $self = shift;

    my $eol = $self->{'eol'};
    my $sep = $self->{'sep'};
    defined $sep && length ($sep) or $sep = $self->{'sep_char'};
    my $quo = $self->{'quote'};
    defined $quo && length ($quo) or $quo = $self->{'quote_char'};
    my $esc = $self->{'escape_char'};

#    use DP;::diag ("SEP: '", DPeek ($sep),
#                "', QUO: '", DPeek ($quo),
#                "', ESC: '", DPeek ($esc),"'");

    # sep_char should not be undefined (nor empty). The defined test keeps
    # new (sep_char => undef) from emitting an "uninitialized" warning
    # before the error is reported.
    defined $sep && $sep ne "" or return 1008;
    length ($sep) > 16         and return 1006;
    $sep =~ m/[\r\n]/          and return 1003;

    if (defined $quo) {
        $quo eq $sep           and return 1001;
        length ($quo) > 16     and return 1007;
        $quo =~ m/[\r\n]/      and return 1003;
    }
    if (defined $esc) {
        $esc eq $sep           and return 1001;
        $esc =~ m/[\r\n]/      and return 1003;
    }
    if (defined $eol) {
        length ($eol) > 16     and return 1005;
    }

    return _unhealthy_whitespace ($self, $self->{'allow_whitespace'});
} # _check_sanity

# known_attributes
#
#   Returns the sorted list of public attribute names: every key of the
#   defaults that does not start with an underscore, plus "sep" and "quote".
sub known_attributes {
    sort grep !m/^_/ => "sep", "quote", keys %def_attr;
} # known_attributes

# new
#
#   Constructor. Takes an optional hash reference of attributes and
#   returns the new object. Returns nothing when the argument is not a
#   hash reference, when an unknown attribute is passed, or when the
#   attributes fail _check_sanity (); the reason is then kept in
#   $last_new_err and is reported right away if auto_diag was requested.
sub new {
    # Pessimistic default: replaced by 0 once construction succeeded
    $last_new_err = Text::CSV_XS->SetDiag (1000,
        "usage: my \$csv = Text::CSV_XS->new ([{ option => value, ... }]);");

    my $proto = shift;
    my $class = ref $proto || $proto or  return;
    @_ > 0 && ref $_[0] ne "HASH"    and return;
    my $attr  = shift || {};

    # Normalize the keys: word-like keys are lower-cased, aliases resolved
    my %attr  = map {
        my $k = m/^[a-zA-Z]\w+$/ ? lc $_ : $_;
        exists $attr_alias{$k} and $k = $attr_alias{$k};
        ($k => $attr->{$_});
        } keys %{$attr};

    # 'sep' and 'quote' may be multi-byte. They are first stored under the
    # *_char name and split again after the attribute check below.
    my $sep_aliased = 0;
    if (exists $attr{'sep'}) {
        $attr{'sep_char'} = delete $attr{'sep'};
        $sep_aliased = 1;
    }
    my $quote_aliased = 0;
    if (exists $attr{'quote'}) {
        $attr{'quote_char'} = delete $attr{'quote'};
        $quote_aliased = 1;
    }
    exists $attr{'formula_handling'} and
        $attr{'formula'} = delete $attr{'formula_handling'};
    # formula needs the blessed object (it may hold a callback): set last
    my $attr_formula = delete $attr{'formula'};

    for (keys %attr) {
        if (m/^[a-z]/ && exists $def_attr{$_}) {
            # uncoverable condition false
            defined $attr{$_} && m/_char$/ and utf8::decode ($attr{$_});
            next;
        }
#       croak?
        $last_new_err = Text::CSV_XS->SetDiag (1000, "INI - Unknown attribute '$_'");
        $attr{'auto_diag'} and error_diag ();
        return;
    }
    if ($sep_aliased) {
        my @b = unpack "U0C*", $attr{'sep_char'};
        if (@b > 1) {
            # More than one byte: keep it in 'sep', park NUL in 'sep_char'
            $attr{'sep'}      = $attr{'sep_char'};
            $attr{'sep_char'} = "\0";
        }
        else {
            $attr{'sep'} = undef;
        }
    }
    if ($quote_aliased and defined $attr{'quote_char'}) {
        my @b = unpack "U0C*", $attr{'quote_char'};
        if (@b > 1) {
            $attr{'quote'}      = $attr{'quote_char'};
            $attr{'quote_char'} = "\0";
        }
        else {
            $attr{'quote'} = undef;
        }
    }

    my $self = { %def_attr, %attr };
    if (my $ec = _check_sanity ($self)) {
        $last_new_err = Text::CSV_XS->SetDiag ($ec);
        $attr{'auto_diag'} and error_diag ();
        return;
    }
    if (defined $self->{'callbacks'} && ref $self->{'callbacks'} ne "HASH") {
        carp ("The 'callbacks' attribute is set but is not a hash: ignored\n");
        $self->{'callbacks'} = undef;
    }

    $last_new_err = Text::CSV_XS->SetDiag (0);
    # Without an explicit eol, inherit the output record separator
    defined $\ && !exists $attr{'eol'} and $self->{'eol'} = $\;
    bless $self, $class;
    defined $self->{'types'} and $self->types ($self->{'types'});
    defined $attr_formula and $self->{'formula'} = _supported_formula ($self, $attr_formula);
    $self;
} # new

# Index of each attribute as passed to _cache_set ().
# Keep in sync with XS!
my %_cache_id = ( # Only expose what is accessed from within PM
    'quote_char'            =>  0,
    'escape_char'           =>  1,
    'sep_char'              =>  2,
    'sep'                   => 39, # 39 .. 55
    'binary'                =>  3,
    'keep_meta_info'        =>  4,
    'always_quote'          =>  5,
    'allow_loose_quotes'    =>  6,
    'allow_loose_escapes'   =>  7,
    'allow_unquoted_escape' =>  8,
    'allow_whitespace'      =>  9,
    'blank_is_undef'        => 10,
    'eol'                   => 11,
    'quote'                 => 15,
    'verbatim'              => 22,
    'empty_is_undef'        => 23,
    'auto_diag'             => 24,
    'diag_verbose'          => 33,
    'quote_space'           => 25,
    'quote_empty'           => 37,
    'quote_binary'          => 32,
    'escape_null'           => 31,
    'decode_utf8'           => 35,
    '_has_ahead'            => 30,
    '_has_hooks'            => 36,
    '_is_bound'             => 26, # 26 .. 29
    'formula'               => 38,
    'strict'                => 42,
    'skip_empty_rows'       => 43,
    'undef_str'             => 46,
    'comment_str'           => 54,
    'types'                 => 62,
);

# A `character'
#   Stores $val (utf8-decoded when defined) under $name, croaks if the
#   object then fails _check_sanity () and otherwise updates the cache.
sub _set_attr_C {
    my ($self, $name, $val, $ec) = @_;
    defined $val and utf8::decode ($val);
    $self->{$name} = $val;
    $ec = _check_sanity ($self) and croak ($self->SetDiag ($ec));
    $self->_cache_set ($_cache_id{$name}, $val);
} # _set_attr_C

# A flag
#   Stores $val under $name, with undef stored as 0, and updates the cache
#   with its numeric value.
sub _set_attr_X {
    my ($self, $name, $val) = @_;
    defined $val or $val = 0;
    $self->{$name} = $val;
    $self->_cache_set ($_cache_id{$name}, 0 + $val);
} # _set_attr_X

# A number
#   As _set_attr_X, but $val is stored as given (undef is not mapped to 0).
sub _set_attr_N {
    my ($self, $name, $val) = @_;
    $self->{$name} = $val;
    $self->_cache_set ($_cache_id{$name}, 0 + $val);
} # _set_attr_N

# Accessor methods.
#   It is unwise to change them halfway through a single file!
# quote_char
#
#   Get/set the single quote character. Setting it also clears the
#   multi-byte 'quote' slot in the cache.
sub quote_char {
    my $self = shift;
    if (@_) {
        $self->_set_attr_C ("quote_char", shift);
        $self->_cache_set ($_cache_id{'quote'}, "");
    }
    $self->{'quote_char'};
} # quote_char

# quote
#
#   Get/set the quotation sequence. A value of more than one byte (16 at
#   most, else croaks with 1007) is kept in 'quote' while 'quote_char' is
#   set to NUL; a shorter value is handed to quote_char () and 'quote' is
#   emptied. Returns 'quote' when non-empty, 'quote_char' otherwise.
sub quote {
    my $self = shift;
    if (@_) {
        my $quote = shift;
        defined $quote or $quote = "";
        utf8::decode ($quote);
        my @b = unpack "U0C*", $quote;
        if (@b > 1) {
            @b > 16 and croak ($self->SetDiag (1007));
            $self->quote_char ("\0");
        }
        else {
            $self->quote_char ($quote);
            $quote = "";
        }
        $self->{'quote'} = $quote;

        my $ec = _check_sanity ($self);
        $ec and croak ($self->SetDiag ($ec));

        $self->_cache_set ($_cache_id{'quote'}, $quote);
    }
    my $quote = $self->{'quote'};
    defined $quote && length ($quote) ? $quote : $self->{'quote_char'};
} # quote

# escape_char
#
#   Get/set the escape character. Setting it to undef or the empty string
#   also switches escape_null off.
sub escape_char {
    my $self = shift;
    if (@_) {
        my $ec = shift;
        $self->_set_attr_C ("escape_char", $ec);
        # Test for "no escape character" explicitly: a plain truth test
        # would also treat the valid character "0" as absent.
        defined $ec && length ($ec) or $self->_set_attr_X ("escape_null", 0);
    }
    $self->{'escape_char'};
} # escape_char

# sep_char
#
#   Get/set the single separator character. Setting it also clears the
#   multi-byte 'sep' slot in the cache.
sub sep_char {
    my $self = shift;
    if (@_) {
        $self->_set_attr_C ("sep_char", shift);
        $self->_cache_set ($_cache_id{'sep'}, "");
    }
    $self->{'sep_char'};
} # sep_char

# sep
#
#   Get/set the separation sequence; the multi-byte handling mirrors
#   quote () (croaks with 1006 when longer than 16 bytes). Returns 'sep'
#   when non-empty, 'sep_char' otherwise.
sub sep {
    my $self = shift;
    if (@_) {
        my $sep = shift;
        defined $sep or $sep = "";
        utf8::decode ($sep);
        my @b = unpack "U0C*", $sep;
        if (@b > 1) {
            @b > 16 and croak ($self->SetDiag (1006));
            $self->sep_char ("\0");
        }
        else {
            $self->sep_char ($sep);
            $sep = "";
        }
        $self->{'sep'} = $sep;

        my $ec = _check_sanity ($self);
        $ec and croak ($self->SetDiag ($ec));

        $self->_cache_set ($_cache_id{'sep'}, $sep);
    }
    my $sep = $self->{'sep'};
    defined $sep && length ($sep) ? $sep : $self->{'sep_char'};
} # sep

# eol
#
#   Get/set the end-of-line sequence. undef is stored as the empty string;
#   more than 16 characters croaks with 1005.
sub eol {
    my $self = shift;
    if (@_) {
        my $eol = shift;
        defined $eol or $eol = "";
        length ($eol) > 16 and croak ($self->SetDiag (1005));
        $self->{'eol'} = $eol;
        $self->_cache_set ($_cache_id{'eol'}, $eol);
    }
    $self->{'eol'};
} # eol

# The accessors below are plain flags: with an argument the flag is stored
# through _set_attr_X (), and the current value is always returned.

sub always_quote {
    my $self = shift;
    @_ and $self->_set_attr_X ("always_quote", shift);
    $self->{'always_quote'};
} # always_quote

sub quote_space {
    my $self = shift;
    @_ and $self->_set_attr_X ("quote_space", shift);
    $self->{'quote_space'};
} # quote_space

sub quote_empty {
    my $self = shift;
    @_ and $self->_set_attr_X ("quote_empty", shift);
    $self->{'quote_empty'};
} # quote_empty

sub escape_null {
    my $self = shift;
    @_ and $self->_set_attr_X ("escape_null", shift);
    $self->{'escape_null'};
} # escape_null
# Alias: same call frame and arguments as escape_null
sub quote_null { goto &escape_null; }

sub quote_binary {
    my $self = shift;
    @_ and $self->_set_attr_X ("quote_binary", shift);
    $self->{'quote_binary'};
} # quote_binary

sub binary {
    my $self = shift;
    @_ and $self->_set_attr_X ("binary", shift);
    $self->{'binary'};
} # binary

sub strict {
    my $self = shift;
    @_ and $self->_set_attr_X ("strict", shift);
    $self->{'strict'};
} # strict

sub skip_empty_rows {
    my $self = shift;
    @_ and $self->_set_attr_X ("skip_empty_rows", shift);
    $self->{'skip_empty_rows'};
} # skip_empty_rows

# _SetDiagInfo
#
#   Sets diagnostic $err and returns its message combined with the extra
#   text $msg. Pieces without three consecutive non-blank characters are
#   left out of the result.
sub _SetDiagInfo {
    my ($self, $err, $msg) = @_;
    $self->SetDiag ($err);
    my $em  = $self->error_diag ();
    # A purely numeric message carries no text: drop it, mark $msg with "# "
    $em =~ s/^\d+$// and $msg =~ s/^/# /;
    my $sep = $em =~ m/[;\n]$/ ? "\n\t" : ": ";
    join $sep => grep m/\S\S\S/ => $em, $msg;
} # _SetDiagInfo

# _supported_formula
#
#   Maps a formula-handling specification (number 0..6, one of the names
#   none/die/croak/diag/empty/undef/cb, or a code reference) to its number.
#   undef maps to 5, the empty string to 4. A code reference is stored in
#   _FORMULA_CB and maps to 6. Anything else croaks with 1500.
sub _supported_formula {
    my ($self, $f) = @_;
    defined $f or return 5;
    if ($self && $f && ref $f && ref $f eq "CODE") {
        $self->{'_FORMULA_CB'} = $f;
        return 6;
    }
    $f =~ m/^(?: 0 | none    )$/xi ? 0 :
    $f =~ m/^(?: 1 | die     )$/xi ? 1 :
    $f =~ m/^(?: 2 | croak   )$/xi ? 2 :
    $f =~ m/^(?: 3 | diag    )$/xi ? 3 :
    $f =~ m/^(?: 4 | empty | )$/xi ? 4 :
    $f =~ m/^(?: 5 | undef   )$/xi ? 5 :
    $f =~ m/^(?: 6 | cb      )$/xi ? 6 : do {
        $self ||= "Text::CSV_XS";
        croak ($self->_SetDiagInfo (1500, "formula-handling '$f' is not supported"));
        };
} # _supported_formula

# formula
#
#   Get/set the formula handling. Returns the name (none, die, croak, diag,
#   empty, undef or cb) of the current setting. A stored callback is
#   dropped whenever the setting is not 6 (cb).
sub formula {
    my $self = shift;
    @_ and $self->_set_attr_N ("formula", _supported_formula ($self, shift));
    $self->{'formula'} == 6 or $self->{'_FORMULA_CB'} = undef;
    [qw( none die croak diag empty undef cb )]->[_supported_formula ($self, $self->{'formula'})];
} # formula
sub formula_handling {
    my $self = shift;
    $self->formula (@_);
} # formula_handling

sub decode_utf8 {
    my $self = shift;
    @_ and $self->_set_attr_X ("decode_utf8", shift);
    $self->{'decode_utf8'};
} # decode_utf8

# _flag_value
#
#   NOT a method. Normalizes the value passed to a flag that also accepts
#   words: undef and "" become 0, anything starting with a digit is
#   returned unchanged, "false" (any case) becomes 0 and every other
#   string becomes 1.
sub _flag_value {
    my $v = shift;
    defined $v && $v ne "" or  return 0;
    $v =~ m/^[0-9]/        and return $v;
    return lc $v eq "false" ? 0 : 1; # true/truth = 1
} # _flag_value

# keep_meta_info
#
#   Get/set the keep_meta_info flag; the new value goes through
#   _flag_value () first.
sub keep_meta_info {
    my $self = shift;
    @_ and $self->_set_attr_X ("keep_meta_info", _flag_value (shift));
    $self->{'keep_meta_info'};
} # keep_meta_info

sub allow_loose_quotes {
    my $self = shift;
    @_ and $self->_set_attr_X ("allow_loose_quotes", shift);
    $self->{'allow_loose_quotes'};
} # allow_loose_quotes

sub allow_loose_escapes {
    my $self = shift;
    @_ and $self->_set_attr_X ("allow_loose_escapes", shift);
    $self->{'allow_loose_escapes'};
} # allow_loose_escapes

# allow_whitespace
#
#   Get/set the allow_whitespace flag. Croaks with 1002 when enabling it
#   while the quote or escape character starts with a space or TAB.
sub allow_whitespace {
    my $self = shift;
    if (@_) {
        my $aw = shift;
        _unhealthy_whitespace ($self, $aw) and
            croak ($self->SetDiag (1002));
        $self->_set_attr_X ("allow_whitespace", $aw);
    }
    $self->{'allow_whitespace'};
} # allow_whitespace

sub allow_unquoted_escape {
    my $self = shift;
    @_ and $self->_set_attr_X ("allow_unquoted_escape", shift);
    $self->{'allow_unquoted_escape'};
} # allow_unquoted_escape

sub blank_is_undef {
    my $self = shift;
    @_ and $self->_set_attr_X ("blank_is_undef", shift);
    $self->{'blank_is_undef'};
} # blank_is_undef

sub empty_is_undef {
    my $self = shift;
    @_ and $self->_set_attr_X ("empty_is_undef", shift);
    $self->{'empty_is_undef'};
} # empty_is_undef

sub verbatim {
    my $self = shift;
    @_ and $self->_set_attr_X ("verbatim", shift);
    $self->{'verbatim'};
} # verbatim

# undef_str
#
#   Get/set undef_str. A defined value is stored stringified; undef
#   clears it.
sub undef_str {
    my $self = shift;
    if (@_) {
        my $v = shift;
        $self->{'undef_str'} = defined $v ? "$v" : undef;
        $self->_cache_set ($_cache_id{'undef_str'}, $self->{'undef_str'});
    }
    $self->{'undef_str'};
} # undef_str

# comment_str
#
#   Get/set comment_str. A defined value is stored stringified; undef
#   clears it.
sub comment_str {
    my $self = shift;
    if (@_) {
        my $v = shift;
        $self->{'comment_str'} = defined $v ? "$v" : undef;
        $self->_cache_set ($_cache_id{'comment_str'}, $self->{'comment_str'});
    }
    $self->{'comment_str'};
} # comment_str

# auto_diag
#
#   Get/set the auto_diag level; the new value goes through
#   _flag_value () first.
sub auto_diag {
    my $self = shift;
    @_ and $self->_set_attr_X ("auto_diag", _flag_value (shift));
    $self->{'auto_diag'};
} # auto_diag

# diag_verbose
#
#   Get/set the diag_verbose level; the new value goes through
#   _flag_value () first.
sub diag_verbose {
    my $self = shift;
    @_ and $self->_set_attr_X ("diag_verbose", _flag_value (shift));
    $self->{'diag_verbose'};
} # diag_verbose

# status
#
#   object method returning the success or failure of the most recent
#   combine () or parse ().  there are no side-effects.

sub status {
    my $self = shift;
    return $self->{'_STATUS'};
} # status

# eof
#
#   object method returning the stored _EOF state.

sub eof {
    my $self = shift;
    return $self->{'_EOF'};
} # eof

# types
#
#   Without arguments returns the current types list. With a true
#   argument (an array reference of type codes) stores it together with a
#   packed string of chr () of each code; with a false argument removes
#   both and returns undef.
sub types {
    my $self = shift;
    if (@_) {
        if (my $types = shift) {
            $self->{'_types'} = join "", map { chr } @{$types};
            $self->{'types'}  = $types;
            $self->_cache_set ($_cache_id{'types'}, $self->{'_types'});
        }
        else {
            delete $self->{'types'};
            delete $self->{'_types'};
            $self->_cache_set ($_cache_id{'types'}, undef);
            undef;
        }
    }
    else {
        $self->{'types'};
    }
} # types

# callbacks
#
#   Get/set the callbacks. Accepts a single hash reference, an even-sized
#   list of name => code pairs, or a single undef to remove all callbacks.
#   Croaks with 1004 on undefined items, an odd-sized list, names not
#   matching [\w.]+ or values that are not code references.
sub callbacks {
    my $self = shift;
    if (@_) {
        my $cb;
        my $hf = 0x00; # bit mask of the hooks present, sent to the cache
        if (defined $_[0]) {
            grep { !defined } @_ and croak ($self->SetDiag (1004));
            $cb = @_ == 1 && ref $_[0] eq "HASH" ? shift
                : @_ % 2 == 0                    ? { @_ }
                : croak ($self->SetDiag (1004));
            foreach my $cbk (keys %{$cb}) {
                # A key cannot be a ref. That would be stored as the *string
                # 'SCALAR(0x1f3e710)' or 'ARRAY(0x1a5ae18)'
                $cbk =~ m/^[\w.]+$/ && ref $cb->{$cbk} eq "CODE" or
                    croak ($self->SetDiag (1004));
            }
            exists $cb->{'error'}        and $hf |= 0x01;
            exists $cb->{'after_parse'}  and $hf |= 0x02;
            exists $cb->{'before_print'} and $hf |= 0x04;
        }
        elsif (@_ > 1) {
            # (undef, whatever)
            croak ($self->SetDiag (1004));
        }
        $self->_set_attr_X ("_has_hooks", $hf);
        $self->{'callbacks'} = $cb;
    }
    $self->{'callbacks'};
} # callbacks

# error_diag
#
#   If (and only if) an error occurred, this function returns a code that
#   indicates the reason of failure
#
#   List context returns (code, message, position, record, field); scalar
#   context returns the diagnostic value itself. In void context the
#   error, if any and not 2012, is reported with warn, or with die when
#   auto_diag is above 1 or the caller runs under autodie. Called without
#   an object, the diagnostic of the last new () is used.

sub error_diag {
    my $self = shift;
    my @diag = (0 + $last_new_err, $last_new_err, 0, 0, 0);

    # Docs state to NEVER use UNIVERSAL::isa, because it will *never* call an
    # overridden isa method in any class. Well, that is exactly what I want here
    if ($self && ref $self and # Not a class method or direct call
         UNIVERSAL::isa ($self, __PACKAGE__) && exists $self->{'_ERROR_DIAG'}) {
        $diag[0] = 0 + $self->{'_ERROR_DIAG'};
        $diag[1] =     $self->{'_ERROR_DIAG'};
        $diag[2] = 1 + $self->{'_ERROR_POS'} if exists $self->{'_ERROR_POS'};
        $diag[3] =     $self->{'_RECNO'};
        $diag[4] =     $self->{'_ERROR_FLD'} if exists $self->{'_ERROR_FLD'};

        # An 'error' callback takes over the reporting entirely
        $diag[0] && $self->{'callbacks'} && $self->{'callbacks'}{'error'} and
            return $self->{'callbacks'}{'error'}->(@diag);
    }

    my $context = wantarray;
    unless (defined $context) { # Void context, auto-diag
        if ($diag[0] && $diag[0] != 2012) {
            my $msg = "# CSV_XS ERROR: $diag[0] - $diag[1] \@ rec $diag[3] pos $diag[2]\n";
            # s/$/.../ inserts in front of the trailing newline
            $diag[4] and $msg =~ s/$/ field $diag[4]/;

            unless ($self && ref $self) { # auto_diag
                # called without args in void context
                warn $msg;
                return;
            }

            $self->{'diag_verbose'} && $self->{'_ERROR_INPUT'} and
                $msg .= $self->{'_ERROR_INPUT'}."\n".
                    (" " x ($diag[2] - 1))."^\n";

            my $lvl = $self->{'auto_diag'};
            if ($lvl < 2) {
                # Inspect the compile-time hints of the frame two levels up
                my @c = caller (2);
                if (@c >= 11 && $c[10] && ref $c[10] eq "HASH") {
                    my $hints = $c[10];
                    (exists $hints->{'autodie'} && $hints->{'autodie'} or
                     exists $hints->{'guard Fatal'} &&
                    !exists $hints->{'no Fatal'}) and
                        $lvl++;
                    # Future releases of autodie will probably set $^H{autodie}
                    #  to "autodie @args", like "autodie :all" or "autodie open"
                    #  so we can/should check for "open" or "new"
                }
            }
            $lvl > 1 ? die $msg : warn $msg;
        }
        return;
    }
    return $context ? @diag : $diag[1];
} # error_diag

# record_number
#
#   object method returning the stored record counter (_RECNO).

sub record_number {
    my $self = shift;
    return $self->{'_RECNO'};
} # record_number

# string
#
#   object method returning the result of the most recent combine () or the
#   input to the most recent parse (), whichever is more recent.  there are
#   no side-effects.

sub string {
    my $self = shift;
    return ref $self->{'_STRING'} ? ${$self->{'_STRING'}} : undef;
} # string

# fields
#
#   object method returning the result of the most recent parse () or the
#   input to the most recent combine (), whichever is more recent.  there
#   are no side-effects.

sub fields {
    my $self = shift;
    return ref $self->{'_FIELDS'} ? @{$self->{'_FIELDS'}} : undef;
} # fields

# meta_info
#
#   object method returning the result of the most recent parse () or the
#   input to the most recent combine (), whichever is more recent.  there
#   are no side-effects. meta_info () returns (if available) some of the
#   field's properties

sub meta_info {
    my $self = shift;
    return ref $self->{'_FFLAGS'} ? @{$self->{'_FFLAGS'}} : undef;
} # meta_info

# is_quoted
#
#   Returns 1 or 0 for bit 0x0001 of the meta info of field $idx, or
#   nothing when there is no meta info or $idx is out of range.
sub is_quoted {
    my ($self, $idx) = @_;
    ref $self->{'_FFLAGS'} &&
        $idx >= 0 && $idx < @{$self->{'_FFLAGS'}} or return;
    $self->{'_FFLAGS'}[$idx] & 0x0001 ? 1 : 0;
} # is_quoted

# is_binary
#
#   As is_quoted (), for bit 0x0002.
sub is_binary {
    my ($self, $idx) = @_;
    ref $self->{'_FFLAGS'} &&
        $idx >= 0 && $idx < @{$self->{'_FFLAGS'}} or return;
    $self->{'_FFLAGS'}[$idx] & 0x0002 ? 1 : 0;
} # is_binary

# is_missing
#
#   Returns nothing for a negative $idx or without meta info, 1 when $idx
#   lies beyond the last field, else 1 or 0 for bit 0x0010 of the field.
sub is_missing {
    my ($self, $idx) = @_;
    $idx < 0 || !ref $self->{'_FFLAGS'} and return;
    $idx >= @{$self->{'_FFLAGS'}} and return 1;
    $self->{'_FFLAGS'}[$idx] & 0x0010 ? 1 : 0;
} # is_missing

# combine
#
#  Object method returning success or failure.  The given arguments are
#  combined into a single comma-separated value.  Failure can be the
#  result of no arguments or an argument containing an invalid character.
#  side-effects include:
#      setting status ()
#      setting fields ()
#      setting string ()
#      setting error_input ()

sub combine {
    my $self = shift;
    my $str  = "";
    $self->{'_FIELDS'} = \@_;
    $self->{'_STATUS'} = (@_ > 0) && $self->Combine (\$str, \@_, 0);
    $self->{'_STRING'} = \$str;
    $self->{'_STATUS'};
} # combine

# parse
#
#  Object method returning success or failure.  The given argument is
#  expected to be a valid comma-separated value.  Failure can be the
#  result of no arguments or an argument containing an invalid sequence
#  of characters.
#  Side-effects of parse () include:
#      setting status ()
#      setting fields ()
#      setting meta_info ()
#      setting string ()
#      setting error_input ()

sub parse {
    my ($self, $str) = @_;

    ref $str and croak ($self->SetDiag (1500));

    my $fields = [];
    my $fflags = [];
    $self->{'_STRING'} = \$str;
    if (defined $str && $self->Parse ($str, $fields, $fflags)) {
        $self->{'_FIELDS'} = $fields;
        $self->{'_FFLAGS'} = $fflags;
        $self->{'_STATUS'} = 1;
    }
    else {
        $self->{'_FIELDS'} = undef;
        $self->{'_FFLAGS'} = undef;
        $self->{'_STATUS'} = 0;
    }
    $self->{'_STATUS'};
} # parse

# column_names
#
#   Without arguments returns the current column names (empty list if
#   unset). A single undef clears them. Otherwise takes a list or a single
#   array reference of names and stores it, with undefined names replaced
#   by "\cAUNDEF\cA". Croaks with 3001 when a name is a reference and with
#   3003 when the count differs from the number of bound columns.
sub column_names {
    my ($self, @keys) = @_;
    @keys or
        return defined $self->{'_COLUMN_NAMES'} ? @{$self->{'_COLUMN_NAMES'}} : ();

    @keys == 1 && ! defined $keys[0] and
        return $self->{'_COLUMN_NAMES'} = undef;

    if (@keys == 1 && ref $keys[0] eq "ARRAY") {
        @keys = @{$keys[0]};
    }
    elsif (join "", map { defined $_ ? ref $_ : "" } @keys) {
        croak ($self->SetDiag (3001));
    }

    $self->{'_BOUND_COLUMNS'} && @keys != @{$self->{'_BOUND_COLUMNS'}} and
        croak ($self->SetDiag (3003));

    $self->{'_COLUMN_NAMES'} = [ map { defined $_ ? $_ : "\cAUNDEF\cA" } @keys ];
    @{$self->{'_COLUMN_NAMES'}};
} # column_names

# header
#
#   Reads the first line of $fh and parses it as the header line.
#   Optional arguments: an array reference of candidate separators and/or
#   a hash reference of options (detect_bom, set_column_names,
#   munge_column_names with alias munge, sep_set).
#   Detects the separator (1011 if more than one candidate occurs),
#   strips and acts on a byte order mark, honours a leading "sep=" line,
#   munges the names and, unless disabled, installs them through
#   column_names (). Croaks with 1010 on an empty header, 1012 on an empty
#   name and 1013 on duplicate names, and 1014 without a file handle.
#   Returns the names in list context, the object otherwise.
sub header {
    my ($self, $fh, @args) = @_;

    $fh or croak ($self->SetDiag (1014));

    my (@seps, %args);
    for (@args) {
        if (ref $_ eq "ARRAY") {
            push @seps, @{$_};
            next;
        }
        if (ref $_ eq "HASH") {
            %args = %{$_};
            next;
        }
        croak ('usage: $csv->header ($fh, [ seps ], { options })');
    }

    defined $args{'munge'} && !defined $args{'munge_column_names'} and
        $args{'munge_column_names'} = $args{'munge'}; # munge as alias
    defined $args{'detect_bom'}         or $args{'detect_bom'}         = 1;
    defined $args{'set_column_names'}   or $args{'set_column_names'}   = 1;
    defined $args{'munge_column_names'} or $args{'munge_column_names'} = "lc";

    # Reset any previous leftovers
    $self->{'_RECNO'}         = 0;
    $self->{'_AHEAD'}         = undef;
    $self->{'_COLUMN_NAMES'}  = undef if $args{'set_column_names'};
    $self->{'_BOUND_COLUMNS'} = undef if $args{'set_column_names'};

    if (defined $args{'sep_set'}) {
        ref $args{'sep_set'} eq "ARRAY" or
            croak ($self->_SetDiagInfo (1500, "sep_set should be an array ref"));
        @seps = @{$args{'sep_set'}};
    }

    $^O eq "MSWin32" and binmode $fh;
    my $hdr = <$fh>;
    # check if $hdr can be empty here, I don't think so
    defined $hdr && $hdr ne "" or croak ($self->SetDiag (1010));

    my %sep;
    @seps or @seps = (",", ";");
    foreach my $sep (@seps) {
        index ($hdr, $sep) >= 0 and $sep{$sep}++;
    }

    keys %sep >= 2 and croak ($self->SetDiag (1011));

    # With no candidate found this is a plain read of sep ()
    $self->sep (keys %sep);
    my $enc = "";
    if ($args{'detect_bom'}) { # UTF-7 is not supported
           if ($hdr =~ s/^\x00\x00\xfe\xff//) { $enc = "utf-32be"   }
        elsif ($hdr =~ s/^\xff\xfe\x00\x00//) { $enc = "utf-32le"   }
        elsif ($hdr =~ s/^\xfe\xff//)         { $enc = "utf-16be"   }
        elsif ($hdr =~ s/^\xff\xfe//)         { $enc = "utf-16le"   }
        elsif ($hdr =~ s/^\xef\xbb\xbf//)     { $enc = "utf-8"      }
        elsif ($hdr =~ s/^\xf7\x64\x4c//)     { $enc = "utf-1"      }
        elsif ($hdr =~ s/^\xdd\x73\x66\x73//) { $enc = "utf-ebcdic" }
        elsif ($hdr =~ s/^\x0e\xfe\xff//)     { $enc = "scsu"       }
        elsif ($hdr =~ s/^\xfb\xee\x28//)     { $enc = "bocu-1"     }
        elsif ($hdr =~ s/^\x84\x31\x95\x33//) { $enc = "gb-18030"   }
        elsif ($hdr =~ s/^\x{feff}//)         { $enc = ""           }

        $self->{'ENCODING'} = $enc ? uc $enc : undef;

        $hdr eq "" and croak ($self->SetDiag (1010));

        if ($enc) {
            $ebcdic && $enc eq "utf-ebcdic" and $enc = "";
            if ($enc =~ m/([13]).le$/) {
                # utf-16le / utf-32le: pad the header with 1 or 3 NUL bytes
                # and consume as many bytes from the stream
                my $l = 0 + $1;
                my $x;
                $hdr .= "\0" x $l;
                read $fh, $x, $l;
            }
            if ($enc) {
                if ($enc ne "utf-8") {
                    require Encode;
                    $hdr = Encode::decode ($enc, $hdr);
                }
                binmode $fh, ":encoding($enc)";
            }
        }
    }

    my ($ahead, $eol);
    if ($hdr and $hdr =~ s/\Asep=(\S)([\r\n]+)//i) { # Also look in xs:Parse
        $self->sep ($1);
        length $hdr or $hdr = <$fh>;
    }
    # The line read may hold more than the header (e.g. with bare CR line
    # endings): keep the first line, remember the rest as read-ahead
    if ($hdr =~ s/^([^\r\n]+)([\r\n]+)([^\r\n].+)\z/$1/s) {
        $eol   = $2;
        $ahead = $3;
    }

    my $hr = \$hdr; # Will cause croak on perl-5.6.x
    open my $h, "<", $hr or croak ($self->SetDiag (1010));

    my $row = $self->getline ($h) or croak ();
    close $h;

    if (   $args{'munge_column_names'} eq "lc") {
        $_ = lc for @{$row};
    }
    elsif ($args{'munge_column_names'} eq "uc") {
        $_ = uc for @{$row};
    }
    elsif ($args{'munge_column_names'} eq "db") {
        for (@{$row}) {
            s/\W+/_/g;
            s/^_+//;
            $_ = lc;
        }
    }

    if ($ahead) { # Must be after getline, which creates the cache
        $self->_cache_set ($_cache_id{'_has_ahead'}, 1);
        $self->{'_AHEAD'} = $ahead;
        $eol =~ m/^\r([^\n]|\z)/ and $self->eol ($eol);
    }

    my @hdr = @{$row};
    ref $args{'munge_column_names'} eq "CODE" and
        @hdr = map { $args{'munge_column_names'}->($_)       } @hdr;
    ref $args{'munge_column_names'} eq "HASH" and
        @hdr = map { $args{'munge_column_names'}->{$_} || $_ } @hdr;
    my %hdr; $hdr{$_}++ for @hdr;
    exists $hdr{""} and croak ($self->SetDiag (1012));
    unless (keys %hdr == @hdr) {
        croak ($self->_SetDiagInfo (1013, join ", " =>
            map { "$_ ($hdr{$_})" } grep { $hdr{$_} > 1 } keys %hdr));
    }
    $args{'set_column_names'} and $self->column_names (@hdr);
    wantarray ? @hdr : $self;
} # header

# bind_columns
#
#   Without arguments returns the bound references (undef if none). A
#   single undef removes the binding and the column names. Otherwise takes
#   a list of scalar references and stores it. Croaks with 3003 when the
#   count differs from the number of column names and with 3004 when an
#   item is not a scalar reference.
sub bind_columns {
    my ($self, @refs) = @_;
    @refs or
        return defined $self->{'_BOUND_COLUMNS'} ? @{$self->{'_BOUND_COLUMNS'}} : undef;

    if (@refs == 1 && ! defined $refs[0]) {
        $self->{'_COLUMN_NAMES'} = undef;
        return $self->{'_BOUND_COLUMNS'} = undef;
    }

    $self->{'_COLUMN_NAMES'} && @refs != @{$self->{'_COLUMN_NAMES'}} and
        croak ($self->SetDiag (3003));

    join "", map { ref $_ eq "SCALAR" ? "" : "*" } @refs and
        croak ($self->SetDiag (3004));

    $self->_set_attr_N ("_is_bound", scalar @refs);
    $self->{'_BOUND_COLUMNS'} = [ @refs ];
    @refs;
} # bind_columns

# getline_hr
#
#   Reads one record with getline (@args) and returns it as a reference to
#   a hash keyed on the column names, or nothing when getline () returns
#   false. Croaks with 3002 if no column names are set. When meta info is
#   kept, columns beyond the fields read are flagged 0x0010.
sub getline_hr {
    my ($self, @args) = @_;
    my %hr;
    $self->{'_COLUMN_NAMES'} or croak ($self->SetDiag (3002));
    my $fr = $self->getline (@args) or return;
    if (ref $self->{'_FFLAGS'}) { # missing
        $self->{'_FFLAGS'}[$_] = 0x0010
            for (@{$fr} ? $#{$fr} + 1 : 0) .. $#{$self->{'_COLUMN_NAMES'}};
        # A single undefined/empty field also counts as missing
        @{$fr} == 1 && (!defined $fr->[0] || $fr->[0] eq "") and
            $self->{'_FFLAGS'}[0] ||= 0x0010;
    }
    @hr{@{$self->{'_COLUMN_NAMES'}}} = @{$fr};
    \%hr;
} # getline_hr

# getline_hr_all
#
#   Returns a reference to a list with one hash reference, keyed on the
#   column names, for each record returned by getline_all (@args).
#   Croaks with 3002 if no column names are set.
sub getline_hr_all {
    my ($self, @args) = @_;
    $self->{'_COLUMN_NAMES'} or croak ($self->SetDiag (3002));
    my @cn = @{$self->{'_COLUMN_NAMES'}};
    [ map { my %h; @h{@cn} = @{$_}; \%h } @{$self->getline_all (@args)} ];
} # getline_hr_all

# say
#
#   As print (), but when no eol is set, $\ or else $/ is used as eol for
#   the duration of this call. Returns what print () returned.
sub say {
    my ($self, $io, @f) = @_;
    my $eol = $self->eol ();
    # eol can be undef when new () was given eol => undef. The setter
    # stores undef as "", so restoring "" below is equivalent.
    defined $eol or $eol = "";
    $eol eq "" and $self->eol ($\ || $/);
    # say ($fh, undef) does not propagate actual undef to print ()
    my $state = $self->print ($io, @f == 1 && !defined $f[0] ? undef : @f);
    $self->eol ($eol);
    return $state;
} # say

# print_hr
#
#   Prints the values of hash reference $hr to $io in the order of the
#   column names. Croaks with 3009 if no column names are set and with
#   3010 if $hr is not a hash reference.
sub print_hr {
    my ($self, $io, $hr) = @_;
    $self->{'_COLUMN_NAMES'} or croak ($self->SetDiag (3009));
    ref $hr eq "HASH"        or croak ($self->SetDiag (3010));
    $self->print ($io, [ map { $hr->{$_} } $self->column_names () ]);
} # print_hr

# fragment
#
#   Reads records from $io with getline () and returns a reference to a
#   list with the part selected by $spec:
#     row=R[-R|*][;...]   col=C[-C|*][;...]   cell=R,C[-R|*,C|*][;...]
#   Numbers are 1-based, "*" means "up to the end". Each result row is an
#   array reference, or a hash reference keyed on the column names when
#   those are set. Croaks with 2013 on an invalid specification.
sub fragment {
    my ($self, $io, $spec) = @_;

    my $qd = qr{\s* [0-9]+ \s* }x;              # digit
    my $qs = qr{\s* (?: [0-9]+ | \* ) \s*}x;    # digit or star
    my $qr = qr{$qd (?: - $qs )?}x;             # range
    my $qc = qr{$qr (?: ; $qr )*}x;             # list
    defined $spec && $spec =~ m{^ \s*
        \x23 ? \s*                              # optional leading #
        ( row | col | cell ) \s* =
        ( $qc                                   # for row and col
        | $qd , $qd (?: - $qs , $qs)?           # for cell (ranges)
          (?: ; $qd , $qd (?: - $qs , $qs)? )*  # and cell (range) lists
        ) \s* $}xi or croak ($self->SetDiag (2013));
    my ($type, $range) = (lc $1, $2);

    my @h = $self->column_names ();

    my @c;
    if ($type eq "cell") {
        my @spec;
        my $min_row;
        my $max_row = 0;
        for (split m/\s*;\s*/ => $range) {
            my ($tlr, $tlc, $brr, $brc) = (m{
                    ^ \s* ([0-9]+     ) \s* , \s* ([0-9]+     ) \s*
                (?: - \s* ([0-9]+ | \*) \s* , \s* ([0-9]+ | \*) \s* )?
                    $}x) or croak ($self->SetDiag (2013));
            defined $brr or ($brr, $brc) = ($tlr, $tlc);
            $tlr == 0 || $tlc == 0 ||
                ($brr ne "*" && ($brr == 0 || $brr < $tlr)) ||
                ($brc ne "*" && ($brc == 0 || $brc < $tlc))
                    and croak ($self->SetDiag (2013));
            $tlc--;
            $brc-- unless $brc eq "*";
            defined $min_row or $min_row = $tlr;
            $tlr < $min_row and $min_row = $tlr;
            # Once an open-ended range ("*") was seen, reading has to go on
            # to the end: a later bounded range must neither be compared
            # numerically with "*" nor lower the limit again.
            if ($max_row ne "*") {
                $brr eq "*" || $brr > $max_row and $max_row = $brr;
            }
            push @spec, [ $tlr, $tlc, $brr, $brc ];
        }
        my $r = 0;
        while (my $row = $self->getline ($io)) {
            ++$r < $min_row and next;
            my %row;
            foreach my $s (@spec) {
                my ($tlr, $tlc, $brr, $brc) = @{$s};
                $r <  $tlr || ($brr ne "*" && $r > $brr) and next;
                my $rr = $brc eq "*" ? $#{$row} : $brc;
                $row{$_} = $row->[$_] for $tlc .. $rr;
            }
            # Selected cells of this row, in column order
            push @c, [ @row{sort { $a <=> $b } keys %row } ];
            if (@h) {
                my %h; @h{@h} = @{$c[-1]};
                $c[-1] = \%h;
            }
            $max_row ne "*" && $r == $max_row and last;
        }
        return \@c;
    }

    # row or col
    my @r;      # $r[N] is true when row/col N is explicitly selected
    my $open;   # lowest start of any open-ended ("N-*") range
    for (split m/\s*;\s*/ => $range) {
        my ($from, $to) = m/^\s* ([0-9]+) (?: \s* - \s* ([0-9]+ | \* ))? \s* $/x
            or croak ($self->SetDiag (2013));
        $to ||= $from;
        if ($to eq "*") {
            $to = $from;
            !defined $open || $from < $open and $open = $from;
        }
        # $to cannot be <= 0 due to regex and ||=
        $from <= 0 || $to < $from and croak ($self->SetDiag (2013));
        $r[$_] = 1 for $from .. $to;
    }

    my $r = 0;
    if ($type eq "col") {
        # Columns are addressed 0-based from here on
        shift @r;
        defined $open and $open--;
    }
    $_ ||= 0 for @r;
    while (my $row = $self->getline ($io)) {
        $r++;
        if ($type eq "row") {
            # Everything from the start of an open range on is selected,
            # also when a bounded range reaches beyond that start
            if ((defined $open && $r >= $open) || $r[$r]) {
                push @c, $row;
                if (@h) {
                    my %h; @h{@h} = @{$c[-1]};
                    $c[-1] = \%h;
                }
            }
            next;
        }
        push @c, [ map { (defined $open && $_ >= $open) || $r[$_] ? $row->[$_] : () } 0..$#{$row} ];
        if (@h) {
            my %h; @h{@h} = @{$c[-1]};
            $c[-1] = \%h;
        }
    }

    return \@c;
} # fragment

my $csv_usage = q{usage: my $aoa = csv (in => $file);};

sub _csv_attr {
    my %attr = (@_ == 1 && ref $_[0] eq "HASH" ? %{$_[0]} : @_) or croak ();

    $attr{'binary'} = 1;

    my $enc = delete $attr{'enc'} || delete $attr{'encoding'} || "";
    $enc eq "auto" and ($attr{'detect_bom'}, $enc) = (1, "");
    my $stack = $enc =~ s/(:\w.*)// ? $1 : "";
    $enc =~ m/^[-\w.]+$/ and $enc = ":encoding($enc)";
    $enc .= $stack;

    my $fh;
    my $sink = 0;
    my $cls  = 0; # If I open a file, I have to close it
    my $in   = delete $attr{'in'}  || delete $attr{'file'} or croak ($csv_usage);
    my $out  = exists $attr{'out'} && !$attr{'out'} ? \"skip"
             : delete $attr{'out'} || delete $attr{'file'};

    ref $in eq "CODE" || ref $in eq "ARRAY" and $out ||= \*STDOUT;

    $in && $out && !ref $in && !ref $out and croak (join "\n" =>
        qq{Cannot use a string for both in and out.
Instead use:}, 1173 qq{ csv (in => csv (in => "$in"), out => "$out");\n}); 1174 1175 if ($out) { 1176 if (ref $out and ("ARRAY" eq ref $out or "HASH" eq ref $out)) { 1177 delete $attr{'out'}; 1178 $sink = 1; 1179 } 1180 elsif ((ref $out and "SCALAR" ne ref $out) or "GLOB" eq ref \$out) { 1181 $fh = $out; 1182 } 1183 elsif (ref $out and "SCALAR" eq ref $out and defined ${$out} and ${$out} eq "skip") { 1184 delete $attr{'out'}; 1185 $sink = 1; 1186 } 1187 else { 1188 open $fh, ">", $out or croak ("$out: $!"); 1189 $cls = 1; 1190 } 1191 if ($fh) { 1192 if ($enc) { 1193 binmode $fh, $enc; 1194 my $fn = fileno $fh; # This is a workaround for a bug in PerlIO::via::gzip 1195 } 1196 unless (defined $attr{'eol'}) { 1197 my @layers = eval { PerlIO::get_layers ($fh) }; 1198 $attr{'eol'} = (grep m/crlf/ => @layers) ? "\n" : "\r\n"; 1199 } 1200 } 1201 } 1202 1203 if ( ref $in eq "CODE" or ref $in eq "ARRAY") { 1204 # All done 1205 } 1206 elsif (ref $in eq "SCALAR") { 1207 # Strings with code points over 0xFF may not be mapped into in-memory file handles 1208 # "<$enc" does not change that :( 1209 open $fh, "<", $in or croak ("Cannot open from SCALAR using PerlIO"); 1210 $cls = 1; 1211 } 1212 elsif (ref $in or "GLOB" eq ref \$in) { 1213 if (!ref $in && $] < 5.008005) { 1214 $fh = \*{$in}; # uncoverable statement ancient perl version required 1215 } 1216 else { 1217 $fh = $in; 1218 } 1219 } 1220 else { 1221 open $fh, "<$enc", $in or croak ("$in: $!"); 1222 $cls = 1; 1223 } 1224 $fh || $sink or croak (qq{No valid source passed. 
"in" is required}); 1225 1226 my $hdrs = delete $attr{'headers'}; 1227 my $frag = delete $attr{'fragment'}; 1228 my $key = delete $attr{'key'}; 1229 my $val = delete $attr{'value'}; 1230 my $kh = delete $attr{'keep_headers'} || 1231 delete $attr{'keep_column_names'} || 1232 delete $attr{'kh'}; 1233 1234 my $cbai = delete $attr{'callbacks'}{'after_in'} || 1235 delete $attr{'after_in'} || 1236 delete $attr{'callbacks'}{'after_parse'} || 1237 delete $attr{'after_parse'}; 1238 my $cbbo = delete $attr{'callbacks'}{'before_out'} || 1239 delete $attr{'before_out'}; 1240 my $cboi = delete $attr{'callbacks'}{'on_in'} || 1241 delete $attr{'on_in'}; 1242 1243 my $hd_s = delete $attr{'sep_set'} || 1244 delete $attr{'seps'}; 1245 my $hd_b = delete $attr{'detect_bom'} || 1246 delete $attr{'bom'}; 1247 my $hd_m = delete $attr{'munge'} || 1248 delete $attr{'munge_column_names'}; 1249 my $hd_c = delete $attr{'set_column_names'}; 1250 1251 for ([ 'quo' => "quote" ], 1252 [ 'esc' => "escape" ], 1253 [ 'escape' => "escape_char" ], 1254 ) { 1255 my ($f, $t) = @{$_}; 1256 exists $attr{$f} and !exists $attr{$t} and $attr{$t} = delete $attr{$f}; 1257 } 1258 1259 my $fltr = delete $attr{'filter'}; 1260 my %fltr = ( 1261 'not_blank' => sub { @{$_[1]} > 1 or defined $_[1][0] && $_[1][0] ne "" }, 1262 'not_empty' => sub { grep { defined && $_ ne "" } @{$_[1]} }, 1263 'filled' => sub { grep { defined && m/\S/ } @{$_[1]} }, 1264 ); 1265 defined $fltr && !ref $fltr && exists $fltr{$fltr} and 1266 $fltr = { '0' => $fltr{$fltr} }; 1267 ref $fltr eq "CODE" and $fltr = { 0 => $fltr }; 1268 ref $fltr eq "HASH" or $fltr = undef; 1269 1270 my $form = delete $attr{'formula'}; 1271 1272 defined $attr{'auto_diag'} or $attr{'auto_diag'} = 1; 1273 defined $attr{'escape_null'} or $attr{'escape_null'} = 0; 1274 my $csv = delete $attr{'csv'} || Text::CSV_XS->new (\%attr) 1275 or croak ($last_new_err); 1276 defined $form and $csv->formula ($form); 1277 1278 return { 1279 'csv' => $csv, 1280 'attr' => { %attr }, 
1281 'fh' => $fh, 1282 'cls' => $cls, 1283 'in' => $in, 1284 'sink' => $sink, 1285 'out' => $out, 1286 'enc' => $enc, 1287 'hdrs' => $hdrs, 1288 'key' => $key, 1289 'val' => $val, 1290 'kh' => $kh, 1291 'frag' => $frag, 1292 'fltr' => $fltr, 1293 'cbai' => $cbai, 1294 'cbbo' => $cbbo, 1295 'cboi' => $cboi, 1296 'hd_s' => $hd_s, 1297 'hd_b' => $hd_b, 1298 'hd_m' => $hd_m, 1299 'hd_c' => $hd_c, 1300 }; 1301 } # _csv_attr 1302 1303sub csv { 1304 @_ && ref $_[0] eq __PACKAGE__ and splice @_, 0, 0, "csv"; 1305 @_ or croak ($csv_usage); 1306 1307 my $c = _csv_attr (@_); 1308 1309 my ($csv, $in, $fh, $hdrs) = @{$c}{qw( csv in fh hdrs )}; 1310 my %hdr; 1311 if (ref $hdrs eq "HASH") { 1312 %hdr = %{$hdrs}; 1313 $hdrs = "auto"; 1314 } 1315 1316 if ($c->{'out'} && !$c->{'sink'}) { 1317 if (ref $in eq "CODE") { 1318 my $hdr = 1; 1319 while (my $row = $in->($csv)) { 1320 if (ref $row eq "ARRAY") { 1321 $csv->print ($fh, $row); 1322 next; 1323 } 1324 if (ref $row eq "HASH") { 1325 if ($hdr) { 1326 $hdrs ||= [ map { $hdr{$_} || $_ } keys %{$row} ]; 1327 $csv->print ($fh, $hdrs); 1328 $hdr = 0; 1329 } 1330 $csv->print ($fh, [ @{$row}{@{$hdrs}} ]); 1331 } 1332 } 1333 } 1334 elsif (@{$in} == 0 or ref $in->[0] eq "ARRAY") { # aoa 1335 ref $hdrs and $csv->print ($fh, $hdrs); 1336 for (@{$in}) { 1337 $c->{'cboi'} and $c->{'cboi'}->($csv, $_); 1338 $c->{'cbbo'} and $c->{'cbbo'}->($csv, $_); 1339 $csv->print ($fh, $_); 1340 } 1341 } 1342 else { # aoh 1343 my @hdrs = ref $hdrs ? 
@{$hdrs} : keys %{$in->[0]}; 1344 defined $hdrs or $hdrs = "auto"; 1345 ref $hdrs || $hdrs eq "auto" and @hdrs and 1346 $csv->print ($fh, [ map { $hdr{$_} || $_ } @hdrs ]); 1347 for (@{$in}) { 1348 local %_; 1349 *_ = $_; 1350 $c->{'cboi'} and $c->{'cboi'}->($csv, $_); 1351 $c->{'cbbo'} and $c->{'cbbo'}->($csv, $_); 1352 $csv->print ($fh, [ @{$_}{@hdrs} ]); 1353 } 1354 } 1355 1356 $c->{'cls'} and close $fh; 1357 return 1; 1358 } 1359 1360 my @row1; 1361 if (defined $c->{'hd_s'} || defined $c->{'hd_b'} || defined $c->{'hd_m'} || defined $c->{'hd_c'}) { 1362 my %harg; 1363 defined $c->{'hd_s'} and $harg{'set_set'} = $c->{'hd_s'}; 1364 defined $c->{'hd_d'} and $harg{'detect_bom'} = $c->{'hd_b'}; 1365 defined $c->{'hd_m'} and $harg{'munge_column_names'} = $hdrs ? "none" : $c->{'hd_m'}; 1366 defined $c->{'hd_c'} and $harg{'set_column_names'} = $hdrs ? 0 : $c->{'hd_c'}; 1367 @row1 = $csv->header ($fh, \%harg); 1368 my @hdr = $csv->column_names (); 1369 @hdr and $hdrs ||= \@hdr; 1370 } 1371 1372 if ($c->{'kh'}) { 1373 ref $c->{'kh'} eq "ARRAY" or croak ($csv->SetDiag (1501)); 1374 $hdrs ||= "auto"; 1375 } 1376 1377 my $key = $c->{'key'}; 1378 if ($key) { 1379 !ref $key or ref $key eq "ARRAY" && @{$key} > 1 or croak ($csv->SetDiag (1501)); 1380 $hdrs ||= "auto"; 1381 } 1382 my $val = $c->{'val'}; 1383 if ($val) { 1384 $key or croak ($csv->SetDiag (1502)); 1385 !ref $val or ref $val eq "ARRAY" && @{$val} > 0 or croak ($csv->SetDiag (1503)); 1386 } 1387 1388 $c->{'fltr'} && grep m/\D/ => keys %{$c->{'fltr'}} and $hdrs ||= "auto"; 1389 if (defined $hdrs) { 1390 if (!ref $hdrs) { 1391 if ($hdrs eq "skip") { 1392 $csv->getline ($fh); # discard; 1393 } 1394 elsif ($hdrs eq "auto") { 1395 my $h = $csv->getline ($fh) or return; 1396 $hdrs = [ map { $hdr{$_} || $_ } @{$h} ]; 1397 } 1398 elsif ($hdrs eq "lc") { 1399 my $h = $csv->getline ($fh) or return; 1400 $hdrs = [ map { lc ($hdr{$_} || $_) } @{$h} ]; 1401 } 1402 elsif ($hdrs eq "uc") { 1403 my $h = $csv->getline ($fh) or 
return; 1404 $hdrs = [ map { uc ($hdr{$_} || $_) } @{$h} ]; 1405 } 1406 } 1407 elsif (ref $hdrs eq "CODE") { 1408 my $h = $csv->getline ($fh) or return; 1409 my $cr = $hdrs; 1410 $hdrs = [ map { $cr->($hdr{$_} || $_) } @{$h} ]; 1411 } 1412 $c->{'kh'} and $hdrs and @{$c->{'kh'}} = @{$hdrs}; 1413 } 1414 1415 if ($c->{'fltr'}) { 1416 my %f = %{$c->{'fltr'}}; 1417 # convert headers to index 1418 my @hdr; 1419 if (ref $hdrs) { 1420 @hdr = @{$hdrs}; 1421 for (0 .. $#hdr) { 1422 exists $f{$hdr[$_]} and $f{$_ + 1} = delete $f{$hdr[$_]}; 1423 } 1424 } 1425 $csv->callbacks ('after_parse' => sub { 1426 my ($CSV, $ROW) = @_; # lexical sub-variables in caps 1427 foreach my $FLD (sort keys %f) { 1428 local $_ = $ROW->[$FLD - 1]; 1429 local %_; 1430 @hdr and @_{@hdr} = @{$ROW}; 1431 $f{$FLD}->($CSV, $ROW) or return \"skip"; 1432 $ROW->[$FLD - 1] = $_; 1433 } 1434 }); 1435 } 1436 1437 my $frag = $c->{'frag'}; 1438 my $ref = ref $hdrs 1439 ? # aoh 1440 do { 1441 my @h = $csv->column_names ($hdrs); 1442 my %h; $h{$_}++ for @h; 1443 exists $h{""} and croak ($csv->SetDiag (1012)); 1444 unless (keys %h == @h) { 1445 croak ($csv->_SetDiagInfo (1013, join ", " => 1446 map { "$_ ($h{$_})" } grep { $h{$_} > 1 } keys %h)); 1447 } 1448 $frag ? $csv->fragment ($fh, $frag) : 1449 $key ? do { 1450 my ($k, $j, @f) = ref $key ? (undef, @{$key}) : ($key); 1451 if (my @mk = grep { !exists $h{$_} } grep { defined } $k, @f) { 1452 croak ($csv->_SetDiagInfo (4001, join ", " => @mk)); 1453 } 1454 +{ map { 1455 my $r = $_; 1456 my $K = defined $k ? $r->{$k} : join $j => @{$r}{@f}; 1457 ( $K => ( 1458 $val 1459 ? ref $val 1460 ? { map { $_ => $r->{$_} } @{$val} } 1461 : $r->{$val} 1462 : $r )); 1463 } @{$csv->getline_hr_all ($fh)} } 1464 } 1465 : $csv->getline_hr_all ($fh); 1466 } 1467 : # aoa 1468 $frag ? 
$csv->fragment ($fh, $frag) 1469 : $csv->getline_all ($fh); 1470 if ($ref) { 1471 @row1 && !$c->{'hd_c'} && !ref $hdrs and unshift @{$ref}, \@row1; 1472 } 1473 else { 1474 Text::CSV_XS->auto_diag (); 1475 } 1476 $c->{'cls'} and close $fh; 1477 if ($ref and $c->{'cbai'} || $c->{'cboi'}) { 1478 # Default is ARRAYref, but with key =>, you'll get a hashref 1479 foreach my $r (ref $ref eq "ARRAY" ? @{$ref} : values %{$ref}) { 1480 local %_; 1481 ref $r eq "HASH" and *_ = $r; 1482 $c->{'cbai'} and $c->{'cbai'}->($csv, $r); 1483 $c->{'cboi'} and $c->{'cboi'}->($csv, $r); 1484 } 1485 } 1486 1487 if ($c->{'sink'}) { 1488 my $ro = ref $c->{'out'} or return; 1489 1490 $ro eq "SCALAR" && ${$c->{'out'}} eq "skip" and 1491 return; 1492 1493 $ro eq ref $ref or 1494 croak ($csv->_SetDiagInfo (5001, "Output type mismatch")); 1495 1496 if ($ro eq "ARRAY") { 1497 if (@{$c->{'out'}} and @$ref and ref $c->{'out'}[0] eq ref $ref->[0]) { 1498 push @{$c->{'out'}} => @$ref; 1499 return $c->{'out'}; 1500 } 1501 croak ($csv->_SetDiagInfo (5001, "Output type mismatch")); 1502 } 1503 1504 if ($ro eq "HASH") { 1505 @{$c->{'out'}}{keys %{$ref}} = values %{$ref}; 1506 return $c->{'out'}; 1507 } 1508 1509 croak ($csv->_SetDiagInfo (5002, "Unsupported output type")); 1510 } 1511 1512 defined wantarray or 1513 return csv ( 1514 'in' => $ref, 1515 'headers' => $hdrs, 1516 %{$c->{'attr'}}, 1517 ); 1518 1519 return $ref; 1520 } # csv 1521 15221; 1523 1524__END__ 1525 1526=encoding utf-8 1527 1528=head1 NAME 1529 1530Text::CSV_XS - comma-separated values manipulation routines 1531 1532=head1 SYNOPSIS 1533 1534 # Functional interface 1535 use Text::CSV_XS qw( csv ); 1536 1537 # Read whole file in memory 1538 my $aoa = csv (in => "data.csv"); # as array of array 1539 my $aoh = csv (in => "data.csv", 1540 headers => "auto"); # as array of hash 1541 1542 # Write array of arrays as csv file 1543 csv (in => $aoa, out => "file.csv", sep_char=> ";"); 1544 1545 # Only show lines where "code" is odd 1546 csv (in 
=> "data.csv", filter => { code => sub { $_ % 2 }}); 1547 1548 1549 # Object interface 1550 use Text::CSV_XS; 1551 1552 my @rows; 1553 # Read/parse CSV 1554 my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); 1555 open my $fh, "<:encoding(utf8)", "test.csv" or die "test.csv: $!"; 1556 while (my $row = $csv->getline ($fh)) { 1557 $row->[2] =~ m/pattern/ or next; # 3rd field should match 1558 push @rows, $row; 1559 } 1560 close $fh; 1561 1562 # and write as CSV 1563 open $fh, ">:encoding(utf8)", "new.csv" or die "new.csv: $!"; 1564 $csv->say ($fh, $_) for @rows; 1565 close $fh or die "new.csv: $!"; 1566 1567=head1 DESCRIPTION 1568 1569Text::CSV_XS provides facilities for the composition and decomposition of 1570comma-separated values. An instance of the Text::CSV_XS class will combine 1571fields into a C<CSV> string and parse a C<CSV> string into fields. 1572 1573The module accepts either strings or files as input and support the use of 1574user-specified characters for delimiters, separators, and escapes. 1575 1576=head2 Embedded newlines 1577 1578B<Important Note>: The default behavior is to accept only ASCII characters 1579in the range from C<0x20> (space) to C<0x7E> (tilde). This means that the 1580fields can not contain newlines. If your data contains newlines embedded in 1581fields, or characters above C<0x7E> (tilde), or binary data, you B<I<must>> 1582set C<< binary => 1 >> in the call to L</new>. To cover the widest range of 1583parsing options, you will always want to set binary. 1584 1585But you still have the problem that you have to pass a correct line to the 1586L</parse> method, which is more complicated from the usual point of usage: 1587 1588 my $csv = Text::CSV_XS->new ({ binary => 1, eol => $/ }); 1589 while (<>) { # WRONG! 1590 $csv->parse ($_); 1591 my @fields = $csv->fields (); 1592 } 1593 1594this will break, as the C<while> might read broken lines: it does not care 1595about the quoting. 
If you need to support embedded newlines, the way to go 1596is to B<not> pass L<C<eol>|/eol> in the parser (it accepts C<\n>, C<\r>, 1597B<and> C<\r\n> by default) and then 1598 1599 my $csv = Text::CSV_XS->new ({ binary => 1 }); 1600 open my $fh, "<", $file or die "$file: $!"; 1601 while (my $row = $csv->getline ($fh)) { 1602 my @fields = @$row; 1603 } 1604 1605The old(er) way of using global file handles is still supported 1606 1607 while (my $row = $csv->getline (*ARGV)) { ... } 1608 1609=head2 Unicode 1610 1611Unicode is only tested to work with perl-5.8.2 and up. 1612 1613See also L</BOM>. 1614 1615The simplest way to ensure the correct encoding is used for in- and output 1616is by either setting layers on the filehandles, or setting the L</encoding> 1617argument for L</csv>. 1618 1619 open my $fh, "<:encoding(UTF-8)", "in.csv" or die "in.csv: $!"; 1620or 1621 my $aoa = csv (in => "in.csv", encoding => "UTF-8"); 1622 1623 open my $fh, ">:encoding(UTF-8)", "out.csv" or die "out.csv: $!"; 1624or 1625 csv (in => $aoa, out => "out.csv", encoding => "UTF-8"); 1626 1627On parsing (both for L</getline> and L</parse>), if the source is marked 1628being UTF8, then all fields that are marked binary will also be marked UTF8. 1629 1630On combining (L</print> and L</combine>): if any of the combining fields 1631was marked UTF8, the resulting string will be marked as UTF8. Note however 1632that all fields I<before> the first field marked UTF8 and contained 8-bit 1633characters that were not upgraded to UTF8, these will be C<bytes> in the 1634resulting string too, possibly causing unexpected errors. 
If you pass data 1635of different encoding, or you don't know if there is different encoding, 1636force it to be upgraded before you pass them on: 1637 1638 $csv->print ($fh, [ map { utf8::upgrade (my $x = $_); $x } @data ]); 1639 1640For complete control over encoding, please use L<Text::CSV::Encoded>: 1641 1642 use Text::CSV::Encoded; 1643 my $csv = Text::CSV::Encoded->new ({ 1644 encoding_in => "iso-8859-1", # the encoding comes into Perl 1645 encoding_out => "cp1252", # the encoding comes out of Perl 1646 }); 1647 1648 $csv = Text::CSV::Encoded->new ({ encoding => "utf8" }); 1649 # combine () and print () accept *literally* utf8 encoded data 1650 # parse () and getline () return *literally* utf8 encoded data 1651 1652 $csv = Text::CSV::Encoded->new ({ encoding => undef }); # default 1653 # combine () and print () accept UTF8 marked data 1654 # parse () and getline () return UTF8 marked data 1655 1656=head2 BOM 1657 1658BOM (or Byte Order Mark) handling is available only inside the L</header> 1659method. This method supports the following encodings: C<utf-8>, C<utf-1>, 1660C<utf-32be>, C<utf-32le>, C<utf-16be>, C<utf-16le>, C<utf-ebcdic>, C<scsu>, 1661C<bocu-1>, and C<gb-18030>. See L<Wikipedia|https://en.wikipedia.org/wiki/Byte_order_mark>. 1662 1663If a file has a BOM, the easiest way to deal with that is 1664 1665 my $aoh = csv (in => $file, detect_bom => 1); 1666 1667All records will be encoded based on the detected BOM. 1668 1669This implies a call to the L</header> method, which defaults to also set 1670the L</column_names>. So this is B<not> the same as 1671 1672 my $aoh = csv (in => $file, headers => "auto"); 1673 1674which only reads the first record to set L</column_names> but ignores any 1675meaning of possible present BOM. 
1676 1677=head1 SPECIFICATION 1678 1679While no formal specification for CSV exists, L<RFC 4180|https://datatracker.ietf.org/doc/html/rfc4180> 1680(I<1>) describes the common format and establishes C<text/csv> as the MIME 1681type registered with the IANA. L<RFC 7111|https://datatracker.ietf.org/doc/html/rfc7111> 1682(I<2>) adds fragments to CSV. 1683 1684Many informal documents exist that describe the C<CSV> format. L<"How To: 1685The Comma Separated Value (CSV) File Format"|http://creativyst.com/Doc/Articles/CSV/CSV01.shtml> 1686(I<3>) provides an overview of the C<CSV> format in the most widely used 1687applications and explains how it can best be used and supported. 1688 1689 1) https://datatracker.ietf.org/doc/html/rfc4180 1690 2) https://datatracker.ietf.org/doc/html/rfc7111 1691 3) http://creativyst.com/Doc/Articles/CSV/CSV01.shtml 1692 1693The basic rules are as follows: 1694 1695B<CSV> is a delimited data format that has fields/columns separated by the 1696comma character and records/rows separated by newlines. Fields that contain 1697a special character (comma, newline, or double quote), must be enclosed in 1698double quotes. However, if a line contains a single entry that is the empty 1699string, it may be enclosed in double quotes. If a field's value contains a 1700double quote character it is escaped by placing another double quote 1701character next to it. The C<CSV> file format does not require a specific 1702character encoding, byte order, or line terminator format. 1703 1704=over 2 1705 1706=item * 1707 1708Each record is a single line ended by a line feed (ASCII/C<LF>=C<0x0A>) or 1709a carriage return and line feed pair (ASCII/C<CRLF>=C<0x0D 0x0A>), however, 1710line-breaks may be embedded. 1711 1712=item * 1713 1714Fields are separated by commas. 1715 1716=item * 1717 1718Allowable characters within a C<CSV> field include C<0x09> (C<TAB>) and the 1719inclusive range of C<0x20> (space) through C<0x7E> (tilde). 
In binary mode 1720all characters are accepted, at least in quoted fields. 1721 1722=item * 1723 1724A field within C<CSV> must be surrounded by double-quotes to contain a 1725separator character (comma). 1726 1727=back 1728 1729Though this is the most clear and restrictive definition, Text::CSV_XS is 1730way more liberal than this, and allows extension: 1731 1732=over 2 1733 1734=item * 1735 1736Line termination by a single carriage return is accepted by default 1737 1738=item * 1739 1740The separation-, quote-, and escape- characters can be any ASCII character 1741in the range from C<0x20> (space) to C<0x7E> (tilde). Characters outside 1742this range may or may not work as expected. Multibyte characters, like UTF 1743C<U+060C> (ARABIC COMMA), C<U+FF0C> (FULLWIDTH COMMA), C<U+241B> (SYMBOL 1744FOR ESCAPE), C<U+2424> (SYMBOL FOR NEWLINE), C<U+FF02> (FULLWIDTH QUOTATION 1745MARK), and C<U+201C> (LEFT DOUBLE QUOTATION MARK) (to give some examples of 1746what might look promising) work for newer versions of perl for C<sep_char>, 1747and C<quote_char> but not for C<escape_char>. 1748 1749If you use perl-5.8.2 or higher these three attributes are utf8-decoded, to 1750increase the likelihood of success. This way C<U+00FE> will be allowed as a 1751quote character. 1752 1753=item * 1754 1755A field in C<CSV> must be surrounded by double-quotes to make an embedded 1756double-quote, represented by a pair of consecutive double-quotes, valid. In 1757binary mode you may additionally use the sequence C<"0> for representation 1758of a NULL byte. Using C<0x00> in binary mode is just as valid. 1759 1760=item * 1761 1762Several violations of the above specification may be lifted by passing some 1763options as attributes to the object constructor. 1764 1765=back 1766 1767=head1 METHODS 1768 1769=head2 version 1770X<version> 1771 1772(Class method) Returns the current module version. 1773 1774=head2 new 1775X<new> 1776 1777(Class method) Returns a new instance of class Text::CSV_XS.
The attributes 1778are described by the (optional) hash ref C<\%attr>. 1779 1780 my $csv = Text::CSV_XS->new ({ attributes ... }); 1781 1782The following attributes are available: 1783 1784=head3 eol 1785X<eol> 1786 1787 my $csv = Text::CSV_XS->new ({ eol => $/ }); 1788 $csv->eol (undef); 1789 my $eol = $csv->eol; 1790 1791The end-of-line string to add to rows for L</print> or the record separator 1792for L</getline>. 1793 1794When not passed in a B<parser> instance, the default behavior is to accept 1795C<\n>, C<\r>, and C<\r\n>, so it is probably safer to not specify C<eol> at 1796all. Passing C<undef> or the empty string behave the same. 1797 1798When not passed in a B<generating> instance, records are not terminated at 1799all, so it is probably wise to pass something you expect. A safe choice for 1800C<eol> on output is either C<$/> or C<\r\n>. 1801 1802Common values for C<eol> are C<"\012"> (C<\n> or Line Feed), C<"\015\012"> 1803(C<\r\n> or Carriage Return, Line Feed), and C<"\015"> (C<\r> or Carriage 1804Return). The L<C<eol>|/eol> attribute cannot exceed 7 (ASCII) characters. 1805 1806If both C<$/> and L<C<eol>|/eol> equal C<"\015">, parsing lines that end on 1807only a Carriage Return without Line Feed, will be L</parse>d correctly. 1808 1809=head3 sep_char 1810X<sep_char> 1811 1812 my $csv = Text::CSV_XS->new ({ sep_char => ";" }); 1813 $csv->sep_char (";"); 1814 my $c = $csv->sep_char; 1815 1816The char used to separate fields, by default a comma. (C<,>). Limited to a 1817single-byte character, usually in the range from C<0x20> (space) to C<0x7E> 1818(tilde). When longer sequences are required, use L<C<sep>|/sep>. 1819 1820The separation character can not be equal to the quote character or to the 1821escape character.
1822 1823See also L</CAVEATS> 1824 1825=head3 sep 1826X<sep> 1827 1828 my $csv = Text::CSV_XS->new ({ sep => "\N{FULLWIDTH COMMA}" }); 1829 $csv->sep (";"); 1830 my $sep = $csv->sep; 1831 1832The chars used to separate fields, by default undefined. Limited to 8 bytes. 1833 1834When set, overrules L<C<sep_char>|/sep_char>. If its length is one byte it 1835acts as an alias to L<C<sep_char>|/sep_char>. 1836 1837See also L</CAVEATS> 1838 1839=head3 quote_char 1840X<quote_char> 1841 1842 my $csv = Text::CSV_XS->new ({ quote_char => "'" }); 1843 $csv->quote_char (undef); 1844 my $c = $csv->quote_char; 1845 1846The character to quote fields containing blanks or binary data, by default 1847the double quote character (C<">). A value of undef suppresses quote chars 1848(for simple cases only). Limited to a single-byte character, usually in the 1849range from C<0x20> (space) to C<0x7E> (tilde). When longer sequences are 1850required, use L<C<quote>|/quote>. 1851 1852C<quote_char> can not be equal to L<C<sep_char>|/sep_char>. 1853 1854=head3 quote 1855X<quote> 1856 1857 my $csv = Text::CSV_XS->new ({ quote => "\N{FULLWIDTH QUOTATION MARK}" }); 1858 $csv->quote ("'"); 1859 my $quote = $csv->quote; 1860 1861The chars used to quote fields, by default undefined. Limited to 8 bytes. 1862 1863When set, overrules L<C<quote_char>|/quote_char>. If its length is one byte 1864it acts as an alias to L<C<quote_char>|/quote_char>. 1865 1866This method does not support C<undef>. Use L<C<quote_char>|/quote_char> to 1867disable quotation. 1868 1869See also L</CAVEATS> 1870 1871=head3 escape_char 1872X<escape_char> 1873 1874 my $csv = Text::CSV_XS->new ({ escape_char => "\\" }); 1875 $csv->escape_char (":"); 1876 my $c = $csv->escape_char; 1877 1878The character to escape certain characters inside quoted fields. This is 1879limited to a single-byte character, usually in the range from C<0x20> 1880(space) to C<0x7E> (tilde). 
1881 1882The C<escape_char> defaults to being the double-quote mark (C<">). In other 1883words the same as the default L<C<quote_char>|/quote_char>. This means that 1884doubling the quote mark in a field escapes it: 1885 1886 "foo","bar","Escape ""quote mark"" with two ""quote marks""","baz" 1887 1888If you change the L<C<quote_char>|/quote_char> without changing the 1889C<escape_char>, the C<escape_char> will still be the double-quote (C<">). 1890If instead you want to escape the L<C<quote_char>|/quote_char> by doubling 1891it you will need to also change the C<escape_char> to be the same as what 1892you have changed the L<C<quote_char>|/quote_char> to. 1893 1894Setting C<escape_char> to C<undef> or C<""> will disable escaping completely 1895and is greatly discouraged. This will also disable C<escape_null>. 1896 1897The escape character can not be equal to the separation character. 1898 1899=head3 binary 1900X<binary> 1901 1902 my $csv = Text::CSV_XS->new ({ binary => 1 }); 1903 $csv->binary (0); 1904 my $f = $csv->binary; 1905 1906If this attribute is C<1>, you may use binary characters in quoted fields, 1907including line feeds, carriage returns and C<NULL> bytes. (The latter could 1908be escaped as C<"0>.) By default this feature is off. 1909 1910If a string is marked UTF8, C<binary> will be turned on automatically when 1911binary characters other than C<CR> and C<NL> are encountered. Note that a 1912simple string like C<"\x{00a0}"> might still be binary, but not marked UTF8, 1913so setting C<< { binary => 1 } >> is still a wise option. 1914 1915=head3 strict 1916X<strict> 1917 1918 my $csv = Text::CSV_XS->new ({ strict => 1 }); 1919 $csv->strict (0); 1920 my $f = $csv->strict; 1921 1922If this attribute is set to C<1>, any row that parses to a different number 1923of fields than the previous row will cause the parser to throw error 2014.
1924 1925=head3 skip_empty_rows 1926X<skip_empty_rows> 1927 1928 my $csv = Text::CSV_XS->new ({ skip_empty_rows => 1 }); 1929 $csv->skip_empty_rows (0); 1930 my $f = $csv->skip_empty_rows; 1931 1932If this attribute is set to C<1>, any row that has an L</eol> immediately 1933following the start of line will be skipped. Default behavior is to return 1934one single empty field. 1935 1936This attribute is only used in parsing. 1937 1938=head3 formula_handling 1939 1940=head3 formula 1941X<formula_handling> 1942X<formula> 1943 1944 my $csv = Text::CSV_XS->new ({ formula => "none" }); 1945 $csv->formula ("none"); 1946 my $f = $csv->formula; 1947 1948This defines the behavior of fields containing I<formulas>. As formulas are 1949considered dangerous in spreadsheets, this attribute can define an optional 1950action to be taken if a field starts with an equal sign (C<=>). 1951 1952For purpose of code-readability, this can also be written as 1953 1954 my $csv = Text::CSV_XS->new ({ formula_handling => "none" }); 1955 $csv->formula_handling ("none"); 1956 my $f = $csv->formula_handling; 1957 1958Possible values for this attribute are 1959 1960=over 2 1961 1962=item none 1963 1964Take no specific action. This is the default. 1965 1966 $csv->formula ("none"); 1967 1968=item die 1969 1970Cause the process to C<die> whenever a leading C<=> is encountered. 1971 1972 $csv->formula ("die"); 1973 1974=item croak 1975 1976Cause the process to C<croak> whenever a leading C<=> is encountered. (See 1977L<Carp>) 1978 1979 $csv->formula ("croak"); 1980 1981=item diag 1982 1983Report position and content of the field whenever a leading C<=> is found. 1984The value of the field is unchanged. 1985 1986 $csv->formula ("diag"); 1987 1988=item empty 1989 1990Replace the content of fields that start with a C<=> with the empty string. 1991 1992 $csv->formula ("empty"); 1993 $csv->formula (""); 1994 1995=item undef 1996 1997Replace the content of fields that start with a C<=> with C<undef>. 
1998 1999 $csv->formula ("undef"); 2000 $csv->formula (undef); 2001 2002=item a callback 2003 2004Modify the content of fields that start with a C<=> with the return-value 2005of the callback. The original content of the field is available inside the 2006callback as C<$_>; 2007 2008 # Replace all formula's with 42 2009 $csv->formula (sub { 42; }); 2010 2011 # same as $csv->formula ("empty") but slower 2012 $csv->formula (sub { "" }); 2013 2014 # Allow =4+12 2015 $csv->formula (sub { s/^=(\d+\+\d+)$/$1/eer }); 2016 2017 # Allow more complex calculations 2018 $csv->formula (sub { eval { s{^=([-+*/0-9()]+)$}{$1}ee }; $_ }); 2019 2020=back 2021 2022All other values will give a warning and then fallback to C<diag>. 2023 2024=head3 decode_utf8 2025X<decode_utf8> 2026 2027 my $csv = Text::CSV_XS->new ({ decode_utf8 => 1 }); 2028 $csv->decode_utf8 (0); 2029 my $f = $csv->decode_utf8; 2030 2031This attribute defaults to TRUE. 2032 2033While I<parsing>, fields that are valid UTF-8, are automatically set to be 2034UTF-8, so that 2035 2036 $csv->parse ("\xC4\xA8\n"); 2037 2038results in 2039 2040 PV("\304\250"\0) [UTF8 "\x{128}"] 2041 2042Sometimes it might not be a desired action. To prevent those upgrades, set 2043this attribute to false, and the result will be 2044 2045 PV("\304\250"\0) 2046 2047=head3 auto_diag 2048X<auto_diag> 2049 2050 my $csv = Text::CSV_XS->new ({ auto_diag => 1 }); 2051 $csv->auto_diag (2); 2052 my $l = $csv->auto_diag; 2053 2054Setting this attribute to a number between C<1> and C<9> causes L</error_diag> 2055to be automatically called in void context upon errors. 2056 2057In case of error C<2012 - EOF>, this call will be void. 2058 2059If C<auto_diag> is set to a numeric value greater than C<1>, it will C<die> 2060on errors instead of C<warn>. If set to anything unrecognized, it will be 2061silently ignored.
2062 2063Future extensions to this feature will include more reliable auto-detection 2064of C<autodie> being active in the scope of which the error occurred which 2065will increment the value of C<auto_diag> with C<1> the moment the error is 2066detected. 2067 2068=head3 diag_verbose 2069X<diag_verbose> 2070 2071 my $csv = Text::CSV_XS->new ({ diag_verbose => 1 }); 2072 $csv->diag_verbose (2); 2073 my $l = $csv->diag_verbose; 2074 2075Set the verbosity of the output triggered by C<auto_diag>. Currently only 2076adds the current input-record-number (if known) to the diagnostic output 2077with an indication of the position of the error. 2078 2079=head3 blank_is_undef 2080X<blank_is_undef> 2081 2082 my $csv = Text::CSV_XS->new ({ blank_is_undef => 1 }); 2083 $csv->blank_is_undef (0); 2084 my $f = $csv->blank_is_undef; 2085 2086Under normal circumstances, C<CSV> data makes no distinction between quoted- 2087and unquoted empty fields. These both end up in an empty string field once 2088read, thus 2089 2090 1,"",," ",2 2091 2092is read as 2093 2094 ("1", "", "", " ", "2") 2095 2096When I<writing> C<CSV> files with either L<C<always_quote>|/always_quote> 2097or L<C<quote_empty>|/quote_empty> set, the unquoted I<empty> field is the 2098result of an undefined value. To enable this distinction when I<reading> 2099C<CSV> data, the C<blank_is_undef> attribute will cause unquoted empty 2100fields to be set to C<undef>, causing the above to be parsed as 2101 2102 ("1", "", undef, " ", "2") 2103 2104Note that this is specifically important when loading C<CSV> fields into a 2105database that allows C<NULL> values, as the perl equivalent for C<NULL> is 2106C<undef> in L<DBI> land. 
2107 2108=head3 empty_is_undef 2109X<empty_is_undef> 2110 2111 my $csv = Text::CSV_XS->new ({ empty_is_undef => 1 }); 2112 $csv->empty_is_undef (0); 2113 my $f = $csv->empty_is_undef; 2114 2115Going one step further than L<C<blank_is_undef>|/blank_is_undef>, this 2116attribute converts all empty fields to C<undef>, so 2117 2118 1,"",," ",2 2119 2120is read as 2121 2122 (1, undef, undef, " ", 2) 2123 2124Note that this affects only fields that are originally empty, not fields 2125that are empty after stripping allowed whitespace. YMMV. 2126 2127=head3 allow_whitespace 2128X<allow_whitespace> 2129 2130 my $csv = Text::CSV_XS->new ({ allow_whitespace => 1 }); 2131 $csv->allow_whitespace (0); 2132 my $f = $csv->allow_whitespace; 2133 2134When this option is set to true, the whitespace (C<TAB>'s and C<SPACE>'s) 2135surrounding the separation character is removed when parsing. If either 2136C<TAB> or C<SPACE> is one of the three characters L<C<sep_char>|/sep_char>, 2137L<C<quote_char>|/quote_char>, or L<C<escape_char>|/escape_char> it will not 2138be considered whitespace. 2139 2140Now lines like: 2141 2142 1 , "foo" , bar , 3 , zapp 2143 2144are parsed as valid C<CSV>, even though it violates the C<CSV> specs. 2145 2146Note that B<all> whitespace is stripped from both start and end of each 2147field. That would make it I<more> than a I<feature> to enable parsing bad 2148C<CSV> lines, as 2149 2150 1, 2.0, 3, ape , monkey 2151 2152will now be parsed as 2153 2154 ("1", "2.0", "3", "ape", "monkey") 2155 2156even if the original line was perfectly acceptable C<CSV>. 2157 2158=head3 allow_loose_quotes 2159X<allow_loose_quotes> 2160 2161 my $csv = Text::CSV_XS->new ({ allow_loose_quotes => 1 }); 2162 $csv->allow_loose_quotes (0); 2163 my $f = $csv->allow_loose_quotes; 2164 2165By default, parsing unquoted fields containing L<C<quote_char>|/quote_char> 2166characters like 2167 2168 1,foo "bar" baz,42 2169 2170would result in parse error 2034. 
Though it is still bad practice to allow 2171this format, we cannot help the fact that some vendors make their 2172applications spit out lines styled this way. 2173 2174If there is B<really> bad C<CSV> data, like 2175 2176 1,"foo "bar" baz",42 2177 2178or 2179 2180 1,""foo bar baz"",42 2181 2182there is a way to get this data-line parsed and leave the quotes inside the 2183quoted field as-is. This can be achieved by setting C<allow_loose_quotes> 2184B<AND> making sure that the L<C<escape_char>|/escape_char> is I<not> equal 2185to L<C<quote_char>|/quote_char>. 2186 2187=head3 allow_loose_escapes 2188X<allow_loose_escapes> 2189 2190 my $csv = Text::CSV_XS->new ({ allow_loose_escapes => 1 }); 2191 $csv->allow_loose_escapes (0); 2192 my $f = $csv->allow_loose_escapes; 2193 2194Parsing fields that have L<C<escape_char>|/escape_char> characters that 2195escape characters that do not need to be escaped, like: 2196 2197 my $csv = Text::CSV_XS->new ({ escape_char => "\\" }); 2198 $csv->parse (qq{1,"my bar\'s",baz,42}); 2199 2200would result in parse error 2025. Though it is bad practice to allow this 2201format, this attribute enables you to treat all escape character sequences 2202equally. 2203 2204=head3 allow_unquoted_escape 2205X<allow_unquoted_escape> 2206 2207 my $csv = Text::CSV_XS->new ({ allow_unquoted_escape => 1 }); 2208 $csv->allow_unquoted_escape (0); 2209 my $f = $csv->allow_unquoted_escape; 2210 2211A backward compatibility issue where L<C<escape_char>|/escape_char> differs 2212from L<C<quote_char>|/quote_char> prevents L<C<escape_char>|/escape_char> 2213from being in the first position of a field. If L<C<quote_char>|/quote_char> is 2214equal to the default C<"> and L<C<escape_char>|/escape_char> is set to C<\>, 2215this would be illegal: 2216 2217 1,\0,2 2218 2219Setting this attribute to C<1> might help to overcome issues with backward 2220compatibility and allow this style.
2221 2222=head3 always_quote 2223X<always_quote> 2224 2225 my $csv = Text::CSV_XS->new ({ always_quote => 1 }); 2226 $csv->always_quote (0); 2227 my $f = $csv->always_quote; 2228 2229By default the generated fields are quoted only if they I<need> to be. For 2230example, if they contain the separator character. If you set this attribute 2231to C<1> then I<all> defined fields will be quoted. (C<undef> fields are not 2232quoted, see L</blank_is_undef>). This makes it quite often easier to handle 2233exported data in external applications. (Poor creatures who would be better off 2234using Text::CSV_XS. :) 2235 2236=head3 quote_space 2237X<quote_space> 2238 2239 my $csv = Text::CSV_XS->new ({ quote_space => 1 }); 2240 $csv->quote_space (0); 2241 my $f = $csv->quote_space; 2242 2243By default, a space in a field would trigger quotation. As no rule exists 2244for this to be forced in C<CSV>, nor any for the opposite, the default is true 2245for safety. You can exclude the space from this trigger by setting this 2246attribute to 0. 2247 2248=head3 quote_empty 2249X<quote_empty> 2250 2251 my $csv = Text::CSV_XS->new ({ quote_empty => 1 }); 2252 $csv->quote_empty (0); 2253 my $f = $csv->quote_empty; 2254 2255By default the generated fields are quoted only if they I<need> to be. An 2256empty (defined) field does not need quotation. If you set this attribute to 2257C<1> then I<empty> defined fields will be quoted. (C<undef> fields are not 2258quoted, see L</blank_is_undef>). See also L<C<always_quote>|/always_quote>. 2259 2260=head3 quote_binary 2261X<quote_binary> 2262 2263 my $csv = Text::CSV_XS->new ({ quote_binary => 1 }); 2264 $csv->quote_binary (0); 2265 my $f = $csv->quote_binary; 2266 2267By default, all "unsafe" bytes inside a string cause the combined field to 2268be quoted. By setting this attribute to C<0>, you can disable that trigger 2269for bytes >= C<0x7F>.
2270 2271=head3 escape_null 2272X<escape_null> 2273X<quote_null> 2274 2275 my $csv = Text::CSV_XS->new ({ escape_null => 1 }); 2276 $csv->escape_null (0); 2277 my $f = $csv->escape_null; 2278 2279By default, a C<NULL> byte in a field would be escaped. This option enables 2280you to treat the C<NULL> byte as a simple binary character in binary mode 2281(the C<< { binary => 1 } >> is set). The default is true. You can prevent 2282C<NULL> escapes by setting this attribute to C<0>. 2283 2284When the C<escape_char> attribute is set to undefined, this attribute will 2285be set to false. 2286 2287The default setting will encode "=\x00=" as 2288 2289 "="0=" 2290 2291With C<escape_null> set to C<0>, this will result in 2292 2293 "=\x00=" 2294 2295The default when using the C<csv> function is C<false>. 2296 2297For backward compatibility reasons, the deprecated old name C<quote_null> 2298is still recognized. 2299 2300=head3 keep_meta_info 2301X<keep_meta_info> 2302 2303 my $csv = Text::CSV_XS->new ({ keep_meta_info => 1 }); 2304 $csv->keep_meta_info (0); 2305 my $f = $csv->keep_meta_info; 2306 2307By default, the parsing of input records is as simple and fast as possible. 2308However, some parsing information - like quotation of the original field - 2309is lost in that process. Setting this flag to true enables retrieving that 2310information after parsing with the methods L</meta_info>, L</is_quoted>, 2311and L</is_binary> described below. Default is false for performance. 2312 2313If you set this attribute to a value greater than 9, then you can control 2314output quotation style like it was used in the input of the last parsed 2315record (unless quotation was added because of other reasons).
2316 2317 my $csv = Text::CSV_XS->new ({ 2318 binary => 1, 2319 keep_meta_info => 1, 2320 quote_space => 0, 2321 }); 2322 2323 my $row = $csv->parse (q{1,,"", ," ",f,"g","h""h",help,"help"}); 2324 2325 $csv->print (*STDOUT, \@row); 2326 # 1,,, , ,f,g,"h""h",help,help 2327 $csv->keep_meta_info (11); 2328 $csv->print (*STDOUT, \@row); 2329 # 1,,"", ," ",f,"g","h""h",help,"help" 2330 2331=head3 undef_str 2332X<undef_str> 2333 2334 my $csv = Text::CSV_XS->new ({ undef_str => "\\N" }); 2335 $csv->undef_str (undef); 2336 my $s = $csv->undef_str; 2337 2338This attribute optionally defines the output of undefined fields. The value 2339passed is not changed at all, so if it needs quotation, the quotation needs 2340to be included in the value of the attribute. Use with caution, as passing 2341a value like C<",",,,,"""> will for sure mess up your output. The default 2342for this attribute is C<undef>, meaning no special treatment. 2343 2344This attribute is useful when exporting CSV data to be imported in custom 2345loaders, like for MySQL, that recognize special sequences for C<NULL> data. 2346 2347This attribute has no meaning when parsing CSV data. 2348 2349=head3 comment_str 2350X<comment_str> 2351 2352 my $csv = Text::CSV_XS->new ({ comment_str => "#" }); 2353 $csv->comment_str (undef); 2354 my $s = $csv->comment_str; 2355 2356This attribute optionally defines a string to be recognized as comment. If 2357this attribute is defined, all lines starting with this sequence will not 2358be parsed as CSV but skipped as comment. 2359 2360This attribute has no meaning when generating CSV. 2361 2362Comment strings that start with any of the special characters/sequences are 2363not supported (so it cannot start with any of L</sep_char>, L</quote_char>, 2364L</escape_char>, L</sep>, L</quote>, or L</eol>). 2365 2366For convenience, C<comment> is an alias for C<comment_str>. 
2367 2368=head3 verbatim 2369X<verbatim> 2370 2371 my $csv = Text::CSV_XS->new ({ verbatim => 1 }); 2372 $csv->verbatim (0); 2373 my $f = $csv->verbatim; 2374 2375This is a quite controversial attribute to set, but makes some hard things 2376possible. 2377 2378The rationale behind this attribute is to tell the parser that the normally 2379special characters newline (C<NL>) and Carriage Return (C<CR>) will not be 2380special when this flag is set, and be dealt with as being ordinary binary 2381characters. This will ease working with data with embedded newlines. 2382 2383When C<verbatim> is used with L</getline>, L</getline> auto-C<chomp>'s 2384every line. 2385 2386Imagine a file format like 2387 2388 M^^Hans^Janssen^Klas 2\n2A^Ja^11-06-2007#\r\n 2389 2390where, the line ending is a very specific C<"#\r\n">, and the sep_char is a 2391C<^> (caret). None of the fields is quoted, but embedded binary data is 2392likely to be present. With the specific line ending, this should not be too 2393hard to detect. 2394 2395By default, Text::CSV_XS' parse function is instructed to only know about 2396C<"\n"> and C<"\r"> to be legal line endings, and so has to deal with the 2397embedded newline as a real C<end-of-line>, so it can scan the next line if 2398binary is true, and the newline is inside a quoted field. With this option, 2399we tell L</parse> to parse the line as if C<"\n"> is just nothing more than 2400a binary character. 2401 2402For L</parse> this means that the parser has no more idea about line ending 2403and L</getline> C<chomp>s line endings on reading. 2404 2405=head3 types 2406 2407A set of column types; the attribute is immediately passed to the L</types> 2408method. 2409 2410=head3 callbacks 2411X<callbacks> 2412 2413See the L</Callbacks> section below. 
2414 2415=head3 accessors 2416 2417To sum it up, 2418 2419 $csv = Text::CSV_XS->new (); 2420 2421is equivalent to 2422 2423 $csv = Text::CSV_XS->new ({ 2424 eol => undef, # \r, \n, or \r\n 2425 sep_char => ',', 2426 sep => undef, 2427 quote_char => '"', 2428 quote => undef, 2429 escape_char => '"', 2430 binary => 0, 2431 decode_utf8 => 1, 2432 auto_diag => 0, 2433 diag_verbose => 0, 2434 blank_is_undef => 0, 2435 empty_is_undef => 0, 2436 allow_whitespace => 0, 2437 allow_loose_quotes => 0, 2438 allow_loose_escapes => 0, 2439 allow_unquoted_escape => 0, 2440 always_quote => 0, 2441 quote_empty => 0, 2442 quote_space => 1, 2443 escape_null => 1, 2444 quote_binary => 1, 2445 keep_meta_info => 0, 2446 strict => 0, 2447 skip_empty_rows => 0, 2448 formula => 0, 2449 verbatim => 0, 2450 undef_str => undef, 2451 comment_str => undef, 2452 types => undef, 2453 callbacks => undef, 2454 }); 2455 2456For all of the above mentioned flags, an accessor method is available where 2457you can inquire the current value, or change the value 2458 2459 my $quote = $csv->quote_char; 2460 $csv->binary (1); 2461 2462It is not wise to change these settings halfway through writing C<CSV> data 2463to a stream. If however you want to create a new stream using the available 2464C<CSV> object, there is no harm in changing them. 2465 2466If the L</new> constructor call fails, it returns C<undef>, and makes the 2467fail reason available through the L</error_diag> method. 2468 2469 $csv = Text::CSV_XS->new ({ ecs_char => 1 }) or 2470 die "".Text::CSV_XS->error_diag (); 2471 2472L</error_diag> will return a string like 2473 2474 "INI - Unknown attribute 'ecs_char'" 2475 2476=head2 known_attributes 2477X<known_attributes> 2478 2479 @attr = Text::CSV_XS->known_attributes; 2480 @attr = Text::CSV_XS::known_attributes; 2481 @attr = $csv->known_attributes; 2482 2483This method will return an ordered list of all the supported attributes as 2484described above. 
This can be useful for knowing what attributes are valid 2485in classes that use or extend Text::CSV_XS. 2486 2487=head2 print 2488X<print> 2489 2490 $status = $csv->print ($fh, $colref); 2491 2492Similar to L</combine> + L</string> + L</print>, but much more efficient. 2493It expects an array ref as input (not an array!) and the resulting string 2494is not really created, but immediately written to the C<$fh> object, 2495typically an IO handle or any other object that offers a L</print> method. 2496 2497For performance reasons C<print> does not create a result string, so all 2498L</string>, L</status>, L</fields>, and L</error_input> methods will return 2499undefined information after executing this method. 2500 2501If C<$colref> is C<undef> (explicit, not through a variable argument) and 2502L</bind_columns> was used to specify fields to be printed, it is possible 2503to make performance improvements, as otherwise data would have to be copied 2504as arguments to the method call: 2505 2506 $csv->bind_columns (\($foo, $bar)); 2507 $status = $csv->print ($fh, undef); 2508 2509A short benchmark 2510 2511 my @data = ("aa" .. "zz"); 2512 $csv->bind_columns (\(@data)); 2513 2514 $csv->print ($fh, [ @data ]); # 11800 recs/sec 2515 $csv->print ($fh, \@data ); # 57600 recs/sec 2516 $csv->print ($fh, undef ); # 48500 recs/sec 2517 2518=head2 say 2519X<say> 2520 2521 $status = $csv->say ($fh, $colref); 2522 2523Like L<C<print>|/print>, but L<C<eol>|/eol> defaults to C<$\>. 2524 2525=head2 print_hr 2526X<print_hr> 2527 2528 $csv->print_hr ($fh, $ref); 2529 2530Provides an easy way to print a C<$ref> (as fetched with L</getline_hr>) 2531provided the column names are set with L</column_names>. 
2532 2533It is just a wrapper method with basic parameter checks over 2534 2535 $csv->print ($fh, [ map { $ref->{$_} } $csv->column_names ]); 2536 2537=head2 combine 2538X<combine> 2539 2540 $status = $csv->combine (@fields); 2541 2542This method constructs a C<CSV> record from C<@fields>, returning success 2543or failure. Failure can result from lack of arguments or an argument that 2544contains an invalid character. Upon success, L</string> can be called to 2545retrieve the resultant C<CSV> string. Upon failure, the value returned by 2546L</string> is undefined and L</error_input> could be called to retrieve the 2547invalid argument. 2548 2549=head2 string 2550X<string> 2551 2552 $line = $csv->string (); 2553 2554This method returns the input to L</parse> or the resultant C<CSV> string 2555of L</combine>, whichever was called more recently. 2556 2557=head2 getline 2558X<getline> 2559 2560 $colref = $csv->getline ($fh); 2561 2562This is the counterpart to L</print>, as L</parse> is the counterpart to 2563L</combine>: it parses a row from the C<$fh> handle using the L</getline> 2564method associated with C<$fh> and parses this row into an array ref. This 2565array ref is returned by the function or C<undef> for failure. When C<$fh> 2566does not support C<getline>, you are likely to hit errors. 2567 2568When fields are bound with L</bind_columns> the return value is a reference 2569to an empty list. 2570 2571The L</string>, L</fields>, and L</status> methods are meaningless again. 2572 2573=head2 getline_all 2574X<getline_all> 2575 2576 $arrayref = $csv->getline_all ($fh); 2577 $arrayref = $csv->getline_all ($fh, $offset); 2578 $arrayref = $csv->getline_all ($fh, $offset, $length); 2579 2580This will return a reference to a list of L<getline ($fh)|/getline> results. 2581In this call, C<keep_meta_info> is disabled. If C<$offset> is negative, as 2582with C<splice>, only the last C<abs ($offset)> records of C<$fh> are taken 2583into consideration. 
2584 2585Given a CSV file with 10 lines: 2586 2587 lines call 2588 ----- --------------------------------------------------------- 2589 0..9 $csv->getline_all ($fh) # all 2590 0..9 $csv->getline_all ($fh, 0) # all 2591 8..9 $csv->getline_all ($fh, 8) # start at 8 2592 - $csv->getline_all ($fh, 0, 0) # start at 0 first 0 rows 2593 0..4 $csv->getline_all ($fh, 0, 5) # start at 0 first 5 rows 2594 4..5 $csv->getline_all ($fh, 4, 2) # start at 4 first 2 rows 2595 8..9 $csv->getline_all ($fh, -2) # last 2 rows 2596 6..7 $csv->getline_all ($fh, -4, 2) # first 2 of last 4 rows 2597 2598=head2 getline_hr 2599X<getline_hr> 2600 2601The L</getline_hr> and L</column_names> methods work together to allow you 2602to have rows returned as hashrefs. You must call L</column_names> first to 2603declare your column names. 2604 2605 $csv->column_names (qw( code name price description )); 2606 $hr = $csv->getline_hr ($fh); 2607 print "Price for $hr->{name} is $hr->{price} EUR\n"; 2608 2609L</getline_hr> will croak if called before L</column_names>. 2610 2611Note that L</getline_hr> creates a hashref for every row and will be much 2612slower than the combined use of L</bind_columns> and L</getline> but still 2613offering the same easy to use hashref inside the loop: 2614 2615 my @cols = @{$csv->getline ($fh)}; 2616 $csv->column_names (@cols); 2617 while (my $row = $csv->getline_hr ($fh)) { 2618 print $row->{price}; 2619 } 2620 2621Could easily be rewritten to the much faster: 2622 2623 my @cols = @{$csv->getline ($fh)}; 2624 my $row = {}; 2625 $csv->bind_columns (\@{$row}{@cols}); 2626 while ($csv->getline ($fh)) { 2627 print $row->{price}; 2628 } 2629 2630Your mileage may vary for the size of the data and the number of rows. 
With 2631perl-5.14.2 the comparison for a 100_000 line file with 14 columns: 2632 2633 Rate hashrefs getlines 2634 hashrefs 1.00/s -- -76% 2635 getlines 4.15/s 313% -- 2636 2637=head2 getline_hr_all 2638X<getline_hr_all> 2639 2640 $arrayref = $csv->getline_hr_all ($fh); 2641 $arrayref = $csv->getline_hr_all ($fh, $offset); 2642 $arrayref = $csv->getline_hr_all ($fh, $offset, $length); 2643 2644This will return a reference to a list of L<getline_hr ($fh)|/getline_hr> 2645results. In this call, L<C<keep_meta_info>|/keep_meta_info> is disabled. 2646 2647=head2 parse 2648X<parse> 2649 2650 $status = $csv->parse ($line); 2651 2652This method decomposes a C<CSV> string into fields, returning success or 2653failure. Failure can result from a lack of argument or the given C<CSV> 2654string is improperly formatted. Upon success, L</fields> can be called to 2655retrieve the decomposed fields. Upon failure calling L</fields> will return 2656undefined data and L</error_input> can be called to retrieve the invalid 2657argument. 2658 2659You may use the L</types> method for setting column types. See L</types>' 2660description below. 2661 2662The C<$line> argument is supposed to be a simple scalar. Everything else is 2663supposed to croak and set error 1500. 2664 2665=head2 fragment 2666X<fragment> 2667 2668This function tries to implement RFC7111 (URI Fragment Identifiers for the 2669text/csv Media Type) - https://datatracker.ietf.org/doc/html/rfc7111 2670 2671 my $AoA = $csv->fragment ($fh, $spec); 2672 2673In specifications, C<*> is used to specify the I<last> item, a dash (C<->) 2674to indicate a range. All indices are C<1>-based: the first row or column 2675has index C<1>. Selections can be combined with the semi-colon (C<;>). 2676 2677When using this method in combination with L</column_names>, the returned 2678reference will point to a list of hashes instead of a list of lists. 
A 2679disjointed cell-based combined selection might return rows with different 2680number of columns making the use of hashes unpredictable. 2681 2682 $csv->column_names ("Name", "Age"); 2683 my $AoH = $csv->fragment ($fh, "col=3;8"); 2684 2685If the L</after_parse> callback is active, it is also called on every line 2686parsed and skipped before the fragment. 2687 2688=over 2 2689 2690=item row 2691 2692 row=4 2693 row=5-7 2694 row=6-* 2695 row=1-2;4;6-* 2696 2697=item col 2698 2699 col=2 2700 col=1-3 2701 col=4-* 2702 col=1-2;4;7-* 2703 2704=item cell 2705 2706In cell-based selection, the comma (C<,>) is used to pair row and column 2707 2708 cell=4,1 2709 2710The range operator (C<->) using C<cell>s can be used to define top-left and 2711bottom-right C<cell> location 2712 2713 cell=3,1-4,6 2714 2715The C<*> is only allowed in the second part of a pair 2716 2717 cell=3,2-*,2 # row 3 till end, only column 2 2718 cell=3,2-3,* # column 2 till end, only row 3 2719 cell=3,2-*,* # strip row 1 and 2, and column 1 2720 2721Cells and cell ranges may be combined with C<;>, possibly resulting in rows 2722with different numbers of columns 2723 2724 cell=1,1-2,2;3,3-4,4;1,4;4,1 2725 2726Disjointed selections will only return selected cells. The cells that are 2727not specified will not be included in the returned set, not even as 2728C<undef>. As an example given a C<CSV> like 2729 2730 11,12,13,...19 2731 21,22,...28,29 2732 : : 2733 91,...97,98,99 2734 2735with C<cell=1,1-2,2;3,3-4,4;1,4;4,1> will return: 2736 2737 11,12,14 2738 21,22 2739 33,34 2740 41,43,44 2741 2742Overlapping cell-specs will return those cells only once, So 2743C<cell=1,1-3,3;2,2-4,4;2,3;4,2> will return: 2744 2745 11,12,13 2746 21,22,23,24 2747 31,32,33,34 2748 42,43,44 2749 2750=back 2751 2752L<RFC7111|https://datatracker.ietf.org/doc/html/rfc7111> does B<not> allow different 2753types of specs to be combined (either C<row> I<or> C<col> I<or> C<cell>). 
2754Passing an invalid fragment specification will croak and set error 2013. 2755 2756=head2 column_names 2757X<column_names> 2758 2759Set the "keys" that will be used in the L</getline_hr> calls. If no keys 2760(column names) are passed, it will return the current setting as a list. 2761 2762L</column_names> accepts a list of scalars (the column names) or a single 2763array_ref, so you can pass the return value from L</getline> too: 2764 2765 $csv->column_names ($csv->getline ($fh)); 2766 2767L</column_names> does B<no> checking on duplicates at all, which might lead 2768to unexpected results. Undefined entries will be replaced with the string 2769C<"\cAUNDEF\cA">, so 2770 2771 $csv->column_names (undef, "", "name", "name"); 2772 $hr = $csv->getline_hr ($fh); 2773 2774will set C<< $hr->{"\cAUNDEF\cA"} >> to the 1st field, C<< $hr->{""} >> to 2775the 2nd field, and C<< $hr->{name} >> to the 4th field, discarding the 3rd 2776field. 2777 2778L</column_names> croaks on invalid arguments. 2779 2780=head2 header 2781 2782This method does NOT work in perl-5.6.x. 2783 2784Parse the CSV header and set L<C<sep>|/sep>, column_names and encoding. 2785 2786 my @hdr = $csv->header ($fh); 2787 $csv->header ($fh, { sep_set => [ ";", ",", "|", "\t" ] }); 2788 $csv->header ($fh, { detect_bom => 1, munge_column_names => "lc" }); 2789 2790The first argument should be a file handle. 2791 2792This method resets some object properties, as it is supposed to be invoked 2793only once per file or stream. It will leave attributes C<column_names> and 2794C<bound_columns> alone if setting column names is disabled. Reading headers 2795on previously processed objects might fail on perl-5.8.0 and older.
2796 2797Assuming that the file opened for parsing has a header, and the header does 2798not contain problematic characters like embedded newlines, read the first 2799line from the open handle then auto-detect whether the header separates the 2800column names with a character from the allowed separator list. 2801 2802If any of the allowed separators matches, and none of the I<other> allowed 2803separators match, set L<C<sep>|/sep> to that separator for the current 2804CSV_XS instance and use it to parse the first line, map those to lowercase, 2805and use that to set the instance L</column_names>: 2806 2807 my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); 2808 open my $fh, "<", "file.csv"; 2809 binmode $fh; # for Windows 2810 $csv->header ($fh); 2811 while (my $row = $csv->getline_hr ($fh)) { 2812 ... 2813 } 2814 2815If the header is empty, contains more than one unique separator out of the 2816allowed set, contains empty fields, or contains identical fields (after 2817folding), it will croak with error 1010, 1011, 1012, or 1013 respectively. 2818 2819If the header contains embedded newlines or is not valid CSV in any other 2820way, this method will croak and leave the parse error untouched. 2821 2822A successful call to C<header> will always set the L<C<sep>|/sep> of the 2823C<$csv> object. This behavior can not be disabled. 2824 2825=head3 return value 2826 2827On error this method will croak. 2828 2829In list context, the headers will be returned whether they are used to set 2830L</column_names> or not. 2831 2832In scalar context, the instance itself is returned. B<Note>: the values as 2833found in the header will effectively be B<lost> if C<set_column_names> is 2834false. 2835 2836=head3 Options 2837 2838=over 2 2839 2840=item sep_set 2841X<sep_set> 2842 2843 $csv->header ($fh, { sep_set => [ ";", ",", "|", "\t" ] }); 2844 2845The list of legal separators defaults to C<[ ";", "," ]> and can be changed 2846by this option. 
As this is probably the most often used option, it can be 2847passed on its own as an unnamed argument: 2848 2849 $csv->header ($fh, [ ";", ",", "|", "\t", "::", "\x{2063}" ]); 2850 2851Multi-byte sequences are allowed, both multi-character and Unicode. See 2852L<C<sep>|/sep>. 2853 2854=item detect_bom 2855X<detect_bom> 2856 2857 $csv->header ($fh, { detect_bom => 1 }); 2858 2859The default behavior is to detect if the header line starts with a BOM. If 2860the header has a BOM, use that to set the encoding of C<$fh>. This default 2861behavior can be disabled by passing a false value to C<detect_bom>. 2862 2863Supported encodings from BOM are: UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, and 2864UTF-32LE. BOM also supports UTF-1, UTF-EBCDIC, SCSU, BOCU-1, and GB-18030 2865but L<Encode> does not (yet). UTF-7 is not supported. 2866 2867If a supported BOM was detected as start of the stream, it is stored in the 2868object attribute C<ENCODING>. 2869 2870 my $enc = $csv->{ENCODING}; 2871 2872The encoding is used with C<binmode> on C<$fh>. 2873 2874If the handle was opened in a (correct) encoding, this method will B<not> 2875alter the encoding, as it checks the leading B<bytes> of the first line. In 2876case the stream starts with a decoded BOM (C<U+FEFF>), C<{ENCODING}> will be 2877C<""> (empty) instead of the default C<undef>. 2878 2879=item munge_column_names 2880X<munge_column_names> 2881 2882This option offers the means to modify the column names into something that 2883is most useful to the application. The default is to map all column names 2884to lower case. 
2885 2886 $csv->header ($fh, { munge_column_names => "lc" }); 2887 2888The following values are available: 2889 2890 lc - lower case 2891 uc - upper case 2892 db - valid DB field names 2893 none - do not change 2894 \%hash - supply a mapping 2895 \&cb - supply a callback 2896 2897=over 2 2898 2899=item Lower case 2900 2901 $csv->header ($fh, { munge_column_names => "lc" }); 2902 2903The header is changed to all lower-case 2904 2905 $_ = lc; 2906 2907=item Upper case 2908 2909 $csv->header ($fh, { munge_column_names => "uc" }); 2910 2911The header is changed to all upper-case 2912 2913 $_ = uc; 2914 2915=item Literal 2916 2917 $csv->header ($fh, { munge_column_names => "none" }); 2918 2919=item Hash 2920 2921 $csv->header ($fh, { munge_column_names => { foo => "sombrero" }}); 2922 2923If a value does not exist, the original value is used unchanged. 2924 2925=item Database 2926 2927 $csv->header ($fh, { munge_column_names => "db" }); 2928 2929=over 2 2930 2931=item - 2932 2933lower-case 2934 2935=item - 2936 2937all sequences of non-word characters are replaced with an underscore 2938 2939=item - 2940 2941all leading underscores are removed 2942 2943=back 2944 2945 $_ = lc (s/\W+/_/gr =~ s/^_+//r); 2946 2947=item Callback 2948 2949 $csv->header ($fh, { munge_column_names => sub { fc } }); 2950 $csv->header ($fh, { munge_column_names => sub { "column_".$col++ } }); 2951 $csv->header ($fh, { munge_column_names => sub { lc (s/\W+/_/gr) } }); 2952 2953As this callback is called in a C<map>, you can use C<$_> directly. 2954 2955=back 2956 2957=item set_column_names 2958X<set_column_names> 2959 2960 $csv->header ($fh, { set_column_names => 1 }); 2961 2962The default is to set the instance's column names using L</column_names> if 2963the method is successful, so subsequent calls to L</getline_hr> can return 2964a hash. Disabling setting the header can be forced by using a false value for 2965this option.
2966 2967As described in L</return value> above, content is lost in scalar context. 2968 2969=back 2970 2971=head3 Validation 2972 2973When receiving CSV files from external sources, this method can be used to 2974protect against changes in the layout by restricting to known headers (and 2975typos in the header fields). 2976 2977 my %known = ( 2978 "record key" => "c_rec", 2979 "rec id" => "c_rec", 2980 "id_rec" => "c_rec", 2981 "kode" => "code", 2982 "code" => "code", 2983 "vaule" => "value", 2984 "value" => "value", 2985 ); 2986 my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); 2987 open my $fh, "<", $source or die "$source: $!"; 2988 $csv->header ($fh, { munge_column_names => sub { 2989 s/\s+$//; 2990 s/^\s+//; 2991 $known{lc $_} or die "Unknown column '$_' in $source"; 2992 }}); 2993 while (my $row = $csv->getline_hr ($fh)) { 2994 say join "\t", $row->{c_rec}, $row->{code}, $row->{value}; 2995 } 2996 2997=head2 bind_columns 2998X<bind_columns> 2999 3000Takes a list of scalar references to be used for output with L</print> or 3001to store in the fields fetched by L</getline>. When you do not pass enough 3002references to store the fetched fields in, L</getline> will fail with error 3003C<3006>. If you pass more than there are fields to return, the content of 3004the remaining references is left untouched. 3005 3006 $csv->bind_columns (\$code, \$name, \$price, \$description); 3007 while ($csv->getline ($fh)) { 3008 print "The price of a $name is \x{20ac} $price\n"; 3009 } 3010 3011To reset or clear all column binding, call L</bind_columns> with the single 3012argument C<undef>. This will also clear column names. 3013 3014 $csv->bind_columns (undef); 3015 3016If no arguments are passed at all, L</bind_columns> will return the list of 3017current bindings or C<undef> if no binds are active. 3018 3019Note that in parsing with C<bind_columns>, the fields are set on the fly. 
3020That implies that if the third field of a row causes an error (or this row 3021has just two fields where the previous row had more), the first two fields 3022already have been assigned the values of the current row, while the rest of 3023the fields will still hold the values of the previous row. If you want the 3024parser to fail in these cases, use the L<C<strict>|/strict> attribute. 3025 3026=head2 eof 3027X<eof> 3028 3029 $eof = $csv->eof (); 3030 3031If L</parse> or L</getline> was used with an IO stream, this method will 3032return true (1) if the last call hit end of file, otherwise it will return 3033false (''). This is useful to see the difference between a failure and end 3034of file. 3035 3036Note that if the parsing of the last line caused an error, C<eof> is still 3037true. That means that if you are I<not> using L</auto_diag>, an idiom like 3038 3039 while (my $row = $csv->getline ($fh)) { 3040 # ... 3041 } 3042 $csv->eof or $csv->error_diag; 3043 3044will I<not> report the error. You would have to change that to 3045 3046 while (my $row = $csv->getline ($fh)) { 3047 # ... 3048 } 3049 +$csv->error_diag and $csv->error_diag; 3050 3051=head2 types 3052X<types> 3053 3054 $csv->types (\@tref); 3055 3056This method is used to force that (all) columns are of a given type. For 3057example, if you have an integer column, two columns with doubles and a 3058string column, then you might do a 3059 3060 $csv->types ([Text::CSV_XS::IV (), 3061 Text::CSV_XS::NV (), 3062 Text::CSV_XS::NV (), 3063 Text::CSV_XS::PV ()]); 3064 3065Column types are used only for I<decoding> columns while parsing, in other 3066words by the L</parse> and L</getline> methods. 3067 3068You can unset column types by doing a 3069 3070 $csv->types (undef); 3071 3072or fetch the current type settings with 3073 3074 $types = $csv->types (); 3075 3076=over 4 3077 3078=item IV 3079X<IV> 3080 3081Set field type to integer. 3082 3083=item NV 3084X<NV> 3085 3086Set field type to numeric/float. 
3087 3088=item PV 3089X<PV> 3090 3091Set field type to string. 3092 3093=back 3094 3095=head2 fields 3096X<fields> 3097 3098 @columns = $csv->fields (); 3099 3100This method returns the input to L</combine> or the resultant decomposed 3101fields of a successful L</parse>, whichever was called more recently. 3102 3103Note that the return value is undefined after using L</getline>, which does 3104not fill the data structures returned by L</parse>. 3105 3106=head2 meta_info 3107X<meta_info> 3108 3109 @flags = $csv->meta_info (); 3110 3111This method returns the "flags" of the input to L</combine> or the flags of 3112the resultant decomposed fields of L</parse>, whichever was called more 3113recently. 3114 3115For each field, a meta_info field will hold flags that inform something 3116about the field returned by the L</fields> method or passed to the 3117L</combine> method. The flags are bit-wise-C<or>'d like: 3118 3119=over 2 3120 3121=item C< >0x0001 3122 3123The field was quoted. 3124 3125=item C< >0x0002 3126 3127The field was binary. 3128 3129=back 3130 3131See the C<is_***> methods below. 3132 3133=head2 is_quoted 3134X<is_quoted> 3135 3136 my $quoted = $csv->is_quoted ($column_idx); 3137 3138where C<$column_idx> is the (zero-based) index of the column in the last 3139result of L</parse>. 3140 3141This returns a true value if the data in the indicated column was enclosed 3142in L<C<quote_char>|/quote_char> quotes. This might be important for fields 3143where content C<,20070108,> is to be treated as a numeric value, and where 3144C<,"20070108",> is explicitly marked as character string data. 3145 3146This method is only valid when L</keep_meta_info> is set to a true value. 3147 3148=head2 is_binary 3149X<is_binary> 3150 3151 my $binary = $csv->is_binary ($column_idx); 3152 3153where C<$column_idx> is the (zero-based) index of the column in the last 3154result of L</parse>. 
3155 3156This returns a true value if the data in the indicated column contained any 3157byte in the range C<[\x00-\x08,\x10-\x1F,\x7F-\xFF]>. 3158 3159This method is only valid when L</keep_meta_info> is set to a true value. 3160 3161=head2 is_missing 3162X<is_missing> 3163 3164 my $missing = $csv->is_missing ($column_idx); 3165 3166where C<$column_idx> is the (zero-based) index of the column in the last 3167result of L</getline_hr>. 3168 3169 $csv->keep_meta_info (1); 3170 while (my $hr = $csv->getline_hr ($fh)) { 3171 $csv->is_missing (0) and next; # This was an empty line 3172 } 3173 3174When using L</getline_hr>, it is impossible to tell if the parsed fields 3175are C<undef> because they were not filled in the C<CSV> stream or because 3176they were not read at all, as B<all> the fields defined by L</column_names> 3177are set in the hash-ref. If you still need to know if all fields in each 3178row are provided, you should enable L<C<keep_meta_info>|/keep_meta_info> so 3179you can check the flags. 3180 3181If L<C<keep_meta_info>|/keep_meta_info> is C<false>, C<is_missing> will 3182always return C<undef>, regardless of C<$column_idx> being valid or not. If 3183this attribute is C<true> it will return either C<0> (the field is present) 3184or C<1> (the field is missing). 3185 3186A special case is the empty line. If the line is completely empty - after 3187dealing with the flags - this is still a valid CSV line: it is a record of 3188just one single empty field. However, if C<keep_meta_info> is set, invoking 3189C<is_missing> with index C<0> will now return true. 3190 3191=head2 status 3192X<status> 3193 3194 $status = $csv->status (); 3195 3196This method returns the status of the last invoked L</combine> or L</parse> 3197call. Status is success (true: C<1>) or failure (false: C<undef> or C<0>). 3198 3199Note that as this only keeps track of the status of above mentioned methods, 3200you are probably looking for L<C<error_diag>|/error_diag> instead.
3201 3202=head2 error_input 3203X<error_input> 3204 3205 $bad_argument = $csv->error_input (); 3206 3207This method returns the erroneous argument (if it exists) of L</combine> or 3208L</parse>, whichever was called more recently. If the last invocation was 3209successful, C<error_input> will return C<undef>. 3210 3211Depending on the type of error, it I<might> also hold the data for the last 3212error-input of L</getline>. 3213 3214=head2 error_diag 3215X<error_diag> 3216 3217 Text::CSV_XS->error_diag (); 3218 $csv->error_diag (); 3219 $error_code = 0 + $csv->error_diag (); 3220 $error_str = "" . $csv->error_diag (); 3221 ($cde, $str, $pos, $rec, $fld) = $csv->error_diag (); 3222 3223If (and only if) an error occurred, this function returns the diagnostics 3224of that error. 3225 3226If called in void context, this will print the internal error code and the 3227associated error message to STDERR. 3228 3229If called in list context, this will return the error code and the error 3230message in that order. If the last error was from parsing, the rest of the 3231values returned are a best guess at the location within the line that was 3232being parsed. Their values are 1-based. The position currently is the index of 3233the byte at which the parsing failed in the current record. It might change 3234to be the index of the current character in a later release. The record number is 3235the index of the record parsed by the csv instance. The field number is the 3236index of the field the parser thinks it is currently trying to parse. See 3237F<examples/csv-check> for how this can be used. 3238 3239If called in scalar context, it will return the diagnostics in a single 3240scalar, a-la C<$!>. It will contain the error code in numeric context, and 3241the diagnostics message in string context. 3242 3243When called as a class method or a direct function call, the diagnostics 3244are that of the last L</new> call.
3245 3246=head2 record_number 3247X<record_number> 3248 3249 $recno = $csv->record_number (); 3250 3251Returns the number of records parsed by this csv instance. This value should be more 3252accurate than C<$.> when embedded newlines come into play. Records written by 3253this instance are not counted. 3254 3255=head2 SetDiag 3256X<SetDiag> 3257 3258 $csv->SetDiag (0); 3259 3260Use to reset the diagnostics if you are dealing with errors. 3261 3262=head1 FUNCTIONS 3263 3264=head2 csv 3265X<csv> 3266 3267This function is not exported by default and should be explicitly requested: 3268 3269 use Text::CSV_XS qw( csv ); 3270 3271This is a high-level function that aims at simple (user) interfaces. This 3272can be used to read/parse a C<CSV> file or stream (the default behavior) or 3273to produce a file or write to a stream (define the C<out> attribute). It 3274returns an array- or hash-reference on parsing (or C<undef> on fail) or the 3275numeric value of L</error_diag> on writing. When this function fails you 3276can get to the error using the class call to L</error_diag>: 3277 3278 my $aoa = csv (in => "test.csv") or 3279 die Text::CSV_XS->error_diag; 3280 3281This function takes the arguments as key-value pairs. This can be passed as 3282a list or as an anonymous hash: 3283 3284 my $aoa = csv ( in => "test.csv", sep_char => ";"); 3285 my $aoh = csv ({ in => $fh, headers => "auto" }); 3286 3287The arguments passed consist of two parts: the arguments to L</csv> itself 3288and the optional attributes to the C<CSV> object used inside the function 3289as enumerated and explained in L</new>. 3290 3291If not overridden, the default options used for CSV are 3292 3293 auto_diag => 1 3294 escape_null => 0 3295 3296The option that is always set and cannot be altered is 3297 3298 binary => 1 3299 3300As this function will likely be used in one-liners, it allows C<quote> to 3301be abbreviated as C<quo>, and C<escape_char> to be abbreviated as C<esc> 3302or C<escape>.
3303 3304Alternative invocations: 3305 3306 my $aoa = Text::CSV_XS::csv (in => "file.csv"); 3307 3308 my $csv = Text::CSV_XS->new (); 3309 my $aoa = $csv->csv (in => "file.csv"); 3310 3311In the latter case, the object attributes are used from the existing object 3312and the attribute arguments in the function call are ignored: 3313 3314 my $csv = Text::CSV_XS->new ({ sep_char => ";" }); 3315 my $aoh = $csv->csv (in => "file.csv", bom => 1); 3316 3317will parse using C<;> as C<sep_char>, not C<,>. 3318 3319=head3 in 3320X<in> 3321 3322Used to specify the source. C<in> can be a file name (e.g. C<"file.csv">), 3323which will be opened for reading and closed when finished, a file handle 3324(e.g. C<$fh> or C<FH>), a reference to a glob (e.g. C<\*ARGV>), the glob 3325itself (e.g. C<*STDIN>), or a reference to a scalar (e.g. C<\q{1,2,"csv"}>). 3326 3327When used with L</out>, C<in> should be a reference to a CSV structure (AoA 3328or AoH) or a CODE-ref that returns an array-reference or a hash-reference. 3329The code-ref will be invoked with no arguments. 
3330 3331 my $aoa = csv (in => "file.csv"); 3332 3333 open my $fh, "<", "file.csv"; 3334 my $aoa = csv (in => $fh); 3335 3336 my $csv = [ [qw( Foo Bar )], [ 1, 2 ], [ 2, 3 ]]; 3337 my $err = csv (in => $csv, out => "file.csv"); 3338 3339If called in void context without the L</out> attribute, the resulting ref 3340will be used as input to a subsequent call to csv: 3341 3342 csv (in => "file.csv", filter => { 2 => sub { length > 2 }}) 3343 3344will be a shortcut to 3345 3346 csv (in => csv (in => "file.csv", filter => { 2 => sub { length > 2 }})) 3347 3348where, in the absence of the C<out> attribute, this is a shortcut to 3349 3350 csv (in => csv (in => "file.csv", filter => { 2 => sub { length > 2 }}), 3351 out => *STDOUT) 3352 3353=head3 out 3354X<out> 3355 3356 csv (in => $aoa, out => "file.csv"); 3357 csv (in => $aoa, out => $fh); 3358 csv (in => $aoa, out => STDOUT); 3359 csv (in => $aoa, out => *STDOUT); 3360 csv (in => $aoa, out => \*STDOUT); 3361 csv (in => $aoa, out => \my $data); 3362 csv (in => $aoa, out => undef); 3363 csv (in => $aoa, out => \"skip"); 3364 3365 csv (in => $fh, out => \@aoa); 3366 csv (in => $fh, out => \@aoh, bom => 1); 3367 csv (in => $fh, out => \%hsh, key => "key"); 3368 3369In output mode, the default CSV options when producing CSV are 3370 3371 eol => "\r\n" 3372 3373The L</fragment> attribute is ignored in output mode. 3374 3375C<out> can be a file name (e.g. C<"file.csv">), which will be opened for 3376writing and closed when finished, a file handle (e.g. C<$fh> or C<FH>), a 3377reference to a glob (e.g. C<\*STDOUT>), the glob itself (e.g. C<*STDOUT>), 3378or a reference to a scalar (e.g. C<\my $data>). 3379 3380 csv (in => sub { $sth->fetch }, out => "dump.csv"); 3381 csv (in => sub { $sth->fetchrow_hashref }, out => "dump.csv", 3382 headers => $sth->{NAME_lc}); 3383 3384When a code-ref is used for C<in>, the output is generated per invocation, 3385so no buffering is involved. 
This implies that there is no size restriction 3386on the number of records. The C<csv> function ends when the coderef returns 3387a false value. 3388 3389If C<out> is set to a reference of the literal string C<"skip">, the output 3390will be suppressed completely, which might be useful in combination with a 3391filter for side effects only. 3392 3393 my %cache; 3394 csv (in => "dump.csv", 3395 out => \"skip", 3396 on_in => sub { $cache{$_[1][1]}++ }); 3397 3398Currently, setting C<out> to any false value (C<undef>, C<"">, 0) will be 3399equivalent to C<\"skip">. 3400 3401If the C<in> argument points to something to parse, and the C<out> is set to 3402a reference to an C<ARRAY> or a C<HASH>, the output is appended to the data 3403in the existing reference. The result of the parse should match what exists 3404in the reference passed. This might come in handy when you have to parse a set 3405of files with similar content (like data stored per period) and you want to 3406collect that into a single data structure: 3407 3408 my %hash; 3409 csv (in => $_, out => \%hash, key => "id") for sort glob "foo-[0-9]*.csv"; 3410 3411 my @list; # List of arrays 3412 csv (in => $_, out => \@list) for sort glob "foo-[0-9]*.csv"; 3413 3414 my @list; # List of hashes 3415 csv (in => $_, out => \@list, bom => 1) for sort glob "foo-[0-9]*.csv"; 3416 3417=head3 encoding 3418X<encoding> 3419 3420If passed, it should be an encoding accepted by the C<:encoding()> option 3421to C<open>. There is no default value. This attribute does not work in perl 34225.6.x. C<encoding> can be abbreviated to C<enc> for ease of use in command 3423line invocations. 3424 3425If C<encoding> is set to the literal value C<"auto">, the method L</header> 3426will be invoked on the opened stream to check if there is a BOM and set the 3427encoding accordingly. This is equal to passing a true value in the option 3428L<C<detect_bom>|/detect_bom>.
3429 3430Encodings can be stacked, as supported by C<binmode>: 3431 3432 # Using PerlIO::via::gzip 3433 csv (in => \@csv, 3434 out => "test.csv:via.gz", 3435 encoding => ":via(gzip):encoding(utf-8)", 3436 ); 3437 $aoa = csv (in => "test.csv:via.gz", encoding => ":via(gzip)"); 3438 3439 # Using PerlIO::gzip 3440 csv (in => \@csv, 3441 out => "test.csv:gzip.gz", 3442 encoding => ":gzip:encoding(utf-8)", 3443 ); 3444 $aoa = csv (in => "test.csv:gzip.gz", encoding => ":gzip"); 3445 3446=head3 detect_bom 3447X<detect_bom> 3448 3449If C<detect_bom> is given, the method L</header> will be invoked on the 3450opened stream to check if there is a BOM and set the encoding accordingly. 3451 3452C<detect_bom> can be abbreviated to C<bom>. 3453 3454This is the same as setting L<C<encoding>|/encoding> to C<"auto">. 3455 3456Note that as the method L</header> is invoked, its default is to also set 3457the headers. 3458 3459=head3 headers 3460X<headers> 3461 3462If this attribute is not given, the default behavior is to produce an array 3463of arrays. 3464 3465If C<headers> is supplied, it should be an anonymous list of column names, 3466an anonymous hashref, a coderef, or a literal flag: C<auto>, C<lc>, C<uc>, 3467or C<skip>. 3468 3469=over 2 3470 3471=item skip 3472X<skip> 3473 3474When C<skip> is used, the header will not be included in the output. 3475 3476 my $aoa = csv (in => $fh, headers => "skip"); 3477 3478=item auto 3479X<auto> 3480 3481If C<auto> is used, the first line of the C<CSV> source will be read as the 3482list of field headers and used to produce an array of hashes. 3483 3484 my $aoh = csv (in => $fh, headers => "auto"); 3485 3486=item lc 3487X<lc> 3488 3489If C<lc> is used, the first line of the C<CSV> source will be read as the 3490list of field headers mapped to lower case and used to produce an array of 3491hashes. This is a variation of C<auto>.
3492 3493 my $aoh = csv (in => $fh, headers => "lc"); 3494 3495=item uc 3496X<uc> 3497 3498If C<uc> is used, the first line of the C<CSV> source will be read as the 3499list of field headers mapped to upper case and used to produce an array of 3500hashes. This is a variation of C<auto>. 3501 3502 my $aoh = csv (in => $fh, headers => "uc"); 3503 3504=item CODE 3505X<CODE> 3506 3507If a coderef is used, the first line of the C<CSV> source will be read as 3508the list of mangled field headers in which each field is passed as the only 3509argument to the coderef. This list is used to produce an array of hashes. 3510 3511 my $aoh = csv (in => $fh, 3512 headers => sub { lc ($_[0]) =~ s/kode/code/gr }); 3513 3514this example is a variation of using C<lc> where all occurrences of C<kode> 3515are replaced with C<code>. 3516 3517=item ARRAY 3518X<ARRAY> 3519 3520If C<headers> is an anonymous list, the entries in the list will be used 3521as field names. The first line is considered data instead of headers. 3522 3523 my $aoh = csv (in => $fh, headers => [qw( Foo Bar )]); 3524 csv (in => $aoa, out => $fh, headers => [qw( code description price )]); 3525 3526=item HASH 3527X<HASH> 3528 3529If C<headers> is a hash reference, this implies C<auto>, but header fields 3530that exist as key in the hashref will be replaced by the value for that 3531key. Given a CSV file like 3532 3533 post-kode,city,name,id number,fubble 3534 1234AA,Duckstad,Donald,13,"X313DF" 3535 3536using 3537 3538 csv (headers => { "post-kode" => "pc", "id number" => "ID" }, ... 3539 3540will return an entry like 3541 3542 { pc => "1234AA", 3543 city => "Duckstad", 3544 name => "Donald", 3545 ID => "13", 3546 fubble => "X313DF", 3547 } 3548 3549=back 3550 3551See also L<C<munge_column_names>|/munge_column_names> and 3552L<C<set_column_names>|/set_column_names>. 
3553 3554=head3 munge_column_names 3555X<munge_column_names> 3556 3557If C<munge_column_names> is set, the method L</header> is invoked on the 3558opened stream with all matching arguments to detect and set the headers. 3559 3560C<munge_column_names> can be abbreviated to C<munge>. 3561 3562=head3 key 3563X<key> 3564 3565If passed, will default L<C<headers>|/headers> to C<"auto"> and return a 3566hashref instead of an array of hashes. Allowed values are simple scalars or 3567array-references where the first element is the joiner and the rest are the 3568fields to join to combine the key. 3569 3570 my $ref = csv (in => "test.csv", key => "code"); 3571 my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ]); 3572 3573with test.csv like 3574 3575 code,product,price,color 3576 1,pc,850,gray 3577 2,keyboard,12,white 3578 3,mouse,5,black 3579 3580the first example will return 3581 3582 { 1 => { 3583 code => 1, 3584 color => 'gray', 3585 price => 850, 3586 product => 'pc' 3587 }, 3588 2 => { 3589 code => 2, 3590 color => 'white', 3591 price => 12, 3592 product => 'keyboard' 3593 }, 3594 3 => { 3595 code => 3, 3596 color => 'black', 3597 price => 5, 3598 product => 'mouse' 3599 } 3600 } 3601 3602the second example will return 3603 3604 { "1:gray" => { 3605 code => 1, 3606 color => 'gray', 3607 price => 850, 3608 product => 'pc' 3609 }, 3610 "2:white" => { 3611 code => 2, 3612 color => 'white', 3613 price => 12, 3614 product => 'keyboard' 3615 }, 3616 "3:black" => { 3617 code => 3, 3618 color => 'black', 3619 price => 5, 3620 product => 'mouse' 3621 } 3622 } 3623 3624The C<key> attribute can be combined with L<C<headers>|/headers> for C<CSV> 3625data that has no header line, like 3626 3627 my $ref = csv ( 3628 in => "foo.csv", 3629 headers => [qw( c_foo foo bar description stock )], 3630 key => "c_foo", 3631 ); 3632 3633=head3 value 3634X<value> 3635 3636Used to create key-value hashes. 3637 3638Only allowed when C<key> is valid.
A C<value> can be either a single column 3639label or an anonymous list of column labels. In the first case, the value 3640will be a simple scalar value, in the latter case, it will be a hashref. 3641 3642 my $ref = csv (in => "test.csv", key => "code", 3643 value => "price"); 3644 my $ref = csv (in => "test.csv", key => "code", 3645 value => [ "product", "price" ]); 3646 my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ], 3647 value => "price"); 3648 my $ref = csv (in => "test.csv", key => [ ":" => "code", "color" ], 3649 value => [ "product", "price" ]); 3650 3651with test.csv like 3652 3653 code,product,price,color 3654 1,pc,850,gray 3655 2,keyboard,12,white 3656 3,mouse,5,black 3657 3658the first example will return 3659 3660 { 1 => 850, 3661 2 => 12, 3662 3 => 5, 3663 } 3664 3665the second example will return 3666 3667 { 1 => { 3668 price => 850, 3669 product => 'pc' 3670 }, 3671 2 => { 3672 price => 12, 3673 product => 'keyboard' 3674 }, 3675 3 => { 3676 price => 5, 3677 product => 'mouse' 3678 } 3679 } 3680 3681the third example will return 3682 3683 { "1:gray" => 850, 3684 "2:white" => 12, 3685 "3:black" => 5, 3686 } 3687 3688the fourth example will return 3689 3690 { "1:gray" => { 3691 price => 850, 3692 product => 'pc' 3693 }, 3694 "2:white" => { 3695 price => 12, 3696 product => 'keyboard' 3697 }, 3698 "3:black" => { 3699 price => 5, 3700 product => 'mouse' 3701 } 3702 } 3703 3704=head3 keep_headers 3705X<keep_headers> 3706X<keep_column_names> 3707X<kh> 3708 3709When using hashes, keep the column names into the arrayref passed, so all 3710headers are available after the call in the original order. 3711 3712 my $aoh = csv (in => "file.csv", keep_headers => \my @hdr); 3713 3714This attribute can be abbreviated to C<kh> or passed as C<keep_column_names>. 3715 3716This attribute implies a default of C<auto> for the C<headers> attribute. 
3717 3718=head3 fragment 3719X<fragment> 3720 3721Only output the fragment as defined in the L</fragment> method. This option 3722is ignored when I<generating> C<CSV>. See L</out>. 3723 3724Combining all of them could give something like 3725 3726 use Text::CSV_XS qw( csv ); 3727 my $aoh = csv ( 3728 in => "test.txt", 3729 encoding => "utf-8", 3730 headers => "auto", 3731 sep_char => "|", 3732 fragment => "row=3;6-9;15-*", 3733 ); 3734 say $aoh->[15]{Foo}; 3735 3736=head3 sep_set 3737X<sep_set> 3738X<seps> 3739 3740If C<sep_set> is set, the method L</header> is invoked on the opened stream 3741to detect and set L<C<sep_char>|/sep_char> with the given set. 3742 3743C<sep_set> can be abbreviated to C<seps>. 3744 3745Note that as the L</header> method is invoked, its default is to also set 3746the headers. 3747 3748=head3 set_column_names 3749X<set_column_names> 3750 3751If C<set_column_names> is passed, the method L</header> is invoked on the 3752opened stream with all arguments meant for L</header>. 3753 3754If C<set_column_names> is passed as a false value, the content of the first 3755row is only preserved if the output is AoA: 3756 3757With an input-file like 3758 3759 bAr,foo 3760 1,2 3761 3,4,5 3762 3763This call 3764 3765 my $aoa = csv (in => $file, set_column_names => 0); 3766 3767will result in 3768 3769 [[ "bar", "foo" ], 3770 [ "1", "2" ], 3771 [ "3", "4", "5" ]] 3772 3773and 3774 3775 my $aoa = csv (in => $file, set_column_names => 0, munge => "none"); 3776 3777will result in 3778 3779 [[ "bAr", "foo" ], 3780 [ "1", "2" ], 3781 [ "3", "4", "5" ]] 3782 3783=head2 Callbacks 3784X<Callbacks> 3785 3786Callbacks enable actions triggered from the I<inside> of Text::CSV_XS. 3787 3788While most of what this enables can easily be done in an unrolled loop as 3789described in the L</SYNOPSIS> callbacks can be used to meet special demands 3790or enhance the L</csv> function. 
3791 3792=over 2 3793 3794=item error 3795X<error> 3796 3797 $csv->callbacks (error => sub { $csv->SetDiag (0) }); 3798 3799the C<error> callback is invoked when an error occurs, but I<only> when 3800L</auto_diag> is set to a true value. A callback is invoked with the values 3801returned by L</error_diag>: 3802 3803 my ($c, $s); 3804 3805 sub ignore3006 { 3806 my ($err, $msg, $pos, $recno, $fldno) = @_; 3807 if ($err == 3006) { 3808 # ignore this error 3809 ($c, $s) = (undef, undef); 3810 Text::CSV_XS->SetDiag (0); 3811 } 3812 # Any other error 3813 return; 3814 } # ignore3006 3815 3816 $csv->callbacks (error => \&ignore3006); 3817 $csv->bind_columns (\$c, \$s); 3818 while ($csv->getline ($fh)) { 3819 # Error 3006 will not stop the loop 3820 } 3821 3822=item after_parse 3823X<after_parse> 3824 3825 $csv->callbacks (after_parse => sub { push @{$_[1]}, "NEW" }); 3826 while (my $row = $csv->getline ($fh)) { 3827 $row->[-1] eq "NEW"; 3828 } 3829 3830This callback is invoked after parsing with L</getline> only if no error 3831occurred. The callback is invoked with two arguments: the current C<CSV> 3832parser object and an array reference to the fields parsed. 3833 3834The return code of the callback is ignored unless it is a reference to the 3835string "skip", in which case the record will be skipped in L</getline_all>. 
3836 3837 sub add_from_db { 3838 my ($csv, $row) = @_; 3839 $sth->execute ($row->[4]); 3840 push @$row, $sth->fetchrow_array; 3841 } # add_from_db 3842 3843 my $aoa = csv (in => "file.csv", callbacks => { 3844 after_parse => \&add_from_db }); 3845 3846This hook can be used for validation: 3847X<data_validation> 3848 3849=over 2 3850 3851=item FAIL 3852 3853Die if any of the records does not validate a rule: 3854 3855 after_parse => sub { 3856 $_[1][4] =~ m/^[0-9]{4}\s?[A-Z]{2}$/ or 3857 die "5th field does not have a valid Dutch zipcode"; 3858 } 3859 3860=item DEFAULT 3861 3862Replace invalid fields with a default value: 3863 3864 after_parse => sub { $_[1][2] =~ m/^\d+$/ or $_[1][2] = 0 } 3865 3866=item SKIP 3867 3868Skip records that have invalid fields (only applies to L</getline_all>): 3869 3870 after_parse => sub { $_[1][0] =~ m/^\d+$/ or return \"skip"; } 3871 3872=back 3873 3874=item before_print 3875X<before_print> 3876 3877 my $idx = 1; 3878 $csv->callbacks (before_print => sub { $_[1][0] = $idx++ }); 3879 $csv->print (*STDOUT, [ 0, $_ ]) for @members; 3880 3881This callback is invoked before printing with L</print> only if no error 3882occurred. The callback is invoked with two arguments: the current C<CSV> 3883parser object and an array reference to the fields passed. 3884 3885The return code of the callback is ignored. 3886 3887 sub max_4_fields { 3888 my ($csv, $row) = @_; 3889 @$row > 4 and splice @$row, 4; 3890 } # max_4_fields 3891 3892 csv (in => csv (in => "file.csv"), out => *STDOUT, 3893 callbacks => { before_print => \&max_4_fields }); 3894 3895This callback is not active for L</combine>. 3896 3897=back 3898 3899=head3 Callbacks for csv () 3900 3901The L</csv> allows for some callbacks that do not integrate in XS internals 3902but only feature the L</csv> function. 
3903 3904 csv (in => "file.csv", 3905 callbacks => { 3906 filter => { 6 => sub { $_ > 15 } }, # first 3907 after_parse => sub { say "AFTER PARSE"; }, # first 3908 after_in => sub { say "AFTER IN"; }, # second 3909 on_in => sub { say "ON IN"; }, # third 3910 }, 3911 ); 3912 3913 csv (in => $aoh, 3914 out => "file.csv", 3915 callbacks => { 3916 on_in => sub { say "ON IN"; }, # first 3917 before_out => sub { say "BEFORE OUT"; }, # second 3918 before_print => sub { say "BEFORE PRINT"; }, # third 3919 }, 3920 ); 3921 3922=over 2 3923 3924=item filter 3925X<filter> 3926 3927This callback can be used to filter records. It is called just after a new 3928record has been scanned. The callback accepts a: 3929 3930=over 2 3931 3932=item hashref 3933 3934The keys are the index to the row (the field name or field number, 1-based) 3935and the values are subs to return a true or false value. 3936 3937 csv (in => "file.csv", filter => { 3938 3 => sub { m/a/ }, # third field should contain an "a" 3939 5 => sub { length > 4 }, # length of the 5th field minimal 5 3940 }); 3941 3942 csv (in => "file.csv", filter => { foo => sub { $_ > 4 }}); 3943 3944If the keys to the filter hash contain any character that is not a digit it 3945will also implicitly set L</headers> to C<"auto"> unless L</headers> was 3946already passed as argument. When headers are active, returning an array of 3947hashes, the filter is not applicable to the header itself. 3948 3949All sub results should match, as in AND. 3950 3951The context of the callback sets C<$_> localized to the field indicated by 3952the filter. The two arguments are as with all other callbacks, so the other 3953fields in the current row can be seen: 3954 3955 filter => { 3 => sub { $_ > 100 ? 
$_[1][1] =~ m/A/ : $_[1][6] =~ m/B/ }} 3956 3957If the context is set to return a list of hashes (L</headers> is defined), 3958the current record will also be available in the localized C<%_>: 3959 3960 filter => { 3 => sub { $_ > 100 && $_{foo} =~ m/A/ && $_{bar} < 1000 }} 3961 3962If the filter is used to I<alter> the content by changing C<$_>, make sure 3963that the sub returns true in order not to have that record skipped: 3964 3965 filter => { 2 => sub { $_ = uc }} 3966 3967will upper-case the second field, and then skip it if the resulting content 3968evaluates to false. To always accept, end with truth: 3969 3970 filter => { 2 => sub { $_ = uc; 1 }} 3971 3972=item coderef 3973 3974 csv (in => "file.csv", filter => sub { $n++; 0; }); 3975 3976If the argument to C<filter> is a coderef, it is an alias or shortcut to a 3977filter on column 0: 3978 3979 csv (filter => sub { $n++; 0 }); 3980 3981is equal to 3982 3983 csv (filter => { 0 => sub { $n++; 0 }}); 3984 3985=item filter-name 3986 3987 csv (in => "file.csv", filter => "not_blank"); 3988 csv (in => "file.csv", filter => "not_empty"); 3989 csv (in => "file.csv", filter => "filled"); 3990 3991These are predefined filters. 3992 3993Given a file like (line numbers prefixed for doc purpose only): 3994 3995 1:1,2,3 3996 2: 3997 3:, 3998 4:"" 3999 5:,, 4000 6:, , 4001 7:"", 4002 8:" " 4003 9:4,5,6 4004 4005=over 2 4006 4007=item not_blank 4008 4009Filter out the blank lines 4010 4011This filter is a shortcut for 4012 4013 filter => { 0 => sub { @{$_[1]} > 1 or 4014 defined $_[1][0] && $_[1][0] ne "" } } 4015 4016Due to the implementation, it is currently impossible to also filter lines 4017that consist only of a quoted empty field. These lines are also considered 4018blank lines. 4019 4020With the given example, lines 2 and 4 will be skipped. 4021 4022=item not_empty 4023 4024Filter out lines where all the fields are empty.
4025 4026This filter is a shortcut for 4027 4028 filter => { 0 => sub { grep { defined && $_ ne "" } @{$_[1]} } } 4029 4030A space is not regarded as being empty, so given the example data, lines 2, 3, 40314, 5, and 7 are skipped. 4032 4033=item filled 4034 4035Filter out lines that have no visible data 4036 4037This filter is a shortcut for 4038 4039 filter => { 0 => sub { grep { defined && m/\S/ } @{$_[1]} } } 4040 4041This filter rejects all lines that do I<not> have at least one field that does 4042not evaluate to the empty string. 4043 4044With the given example data, this filter would skip lines 2 through 8. 4045 4046=back 4047 4048=back 4049 4050One could also use modules like L<Types::Standard>: 4051 4052 use Types::Standard -types; 4053 4054 my $type = Tuple[Str, Str, Int, Bool, Optional[Num]]; 4055 my $check = $type->compiled_check; 4056 4057 # filter with compiled check and warnings 4058 my $aoa = csv ( 4059 in => \$data, 4060 filter => { 4061 0 => sub { 4062 my $ok = $check->($_[1]) or 4063 warn $type->get_message ($_[1]), "\n"; 4064 return $ok; 4065 }, 4066 }, 4067 ); 4068 4069=item after_in 4070X<after_in> 4071 4072This callback is invoked for each record after all records have been parsed 4073but before returning the reference to the caller. The hook is invoked with 4074two arguments: the current C<CSV> parser object and a reference to the 4075record. The reference can be a reference to a HASH or a reference to an 4076ARRAY as determined by the arguments. 4077 4078This callback can also be passed as an attribute without the C<callbacks> 4079wrapper. 4080 4081=item before_out 4082X<before_out> 4083 4084This callback is invoked for each record before the record is printed. The 4085hook is invoked with two arguments: the current C<CSV> parser object and a 4086reference to the record. The reference can be a reference to a HASH or a 4087reference to an ARRAY as determined by the arguments.
4088 4089This callback can also be passed as an attribute without the C<callbacks> 4090wrapper. 4091 4092This callback makes the row available in C<%_> if the row is a hashref. In 4093this case C<%_> is writable and will change the original row. 4094 4095=item on_in 4096X<on_in> 4097 4098This callback acts exactly as the L</after_in> or the L</before_out> hooks. 4099 4100This callback can also be passed as an attribute without the C<callbacks> 4101wrapper. 4102 4103This callback makes the row available in C<%_> if the row is a hashref. In 4104this case C<%_> is writable and will change the original row. So e.g. with 4105 4106 my $aoh = csv ( 4107 in => \"foo\n1\n2\n", 4108 headers => "auto", 4109 on_in => sub { $_{bar} = 2; }, 4110 ); 4111 4112C<$aoh> will be: 4113 4114 [ { foo => 1, 4115 bar => 2, 4116 }, 4117 { foo => 2, 4118 bar => 2, 4119 } 4120 ] 4121 4122=item csv 4123 4124The I<function> L</csv> can also be called as a method or with an existing 4125Text::CSV_XS object. This could help if the function is to be invoked a lot 4126of times and the overhead of creating the object internally over and over 4127again would be prevented by passing an existing instance. 4128 4129 my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); 4130 4131 my $aoa = $csv->csv (in => $fh); 4132 my $aoa = csv (in => $fh, csv => $csv); 4133 4134both act the same. Running this 20000 times on a 20 lines CSV file, showed 4135a 53% speedup. 4136 4137=back 4138 4139=head1 INTERNALS 4140 4141=over 4 4142 4143=item Combine (...) 4144 4145=item Parse (...) 4146 4147=back 4148 4149The arguments to these internal functions are deliberately not described or 4150documented in order to enable the module authors to make changes when they 4151feel the need for it. Using them is highly discouraged as the API may 4152change in future releases.
4153 4154=head1 EXAMPLES 4155 4156=head2 Reading a CSV file line by line: 4157 4158 my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); 4159 open my $fh, "<", "file.csv" or die "file.csv: $!"; 4160 while (my $row = $csv->getline ($fh)) { 4161 # do something with @$row 4162 } 4163 close $fh or die "file.csv: $!"; 4164 4165or 4166 4167 my $aoa = csv (in => "file.csv", on_in => sub { 4168 # do something with %_ 4169 }); 4170 4171=head3 Reading only a single column 4172 4173 my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); 4174 open my $fh, "<", "file.csv" or die "file.csv: $!"; 4175 # get only the 4th column 4176 my @column = map { $_->[3] } @{$csv->getline_all ($fh)}; 4177 close $fh or die "file.csv: $!"; 4178 4179with L</csv>, you could do 4180 4181 my @column = map { $_->[0] } 4182 @{csv (in => "file.csv", fragment => "col=4")}; 4183 4184=head2 Parsing CSV strings: 4185 4186 my $csv = Text::CSV_XS->new ({ keep_meta_info => 1, binary => 1 }); 4187 4188 my $sample_input_string = 4189 qq{"I said, ""Hi!""",Yes,"",2.34,,"1.09","\x{20ac}",}; 4190 if ($csv->parse ($sample_input_string)) { 4191 my @field = $csv->fields; 4192 foreach my $col (0 .. $#field) { 4193 my $quo = $csv->is_quoted ($col) ? $csv->{quote_char} : ""; 4194 printf "%2d: %s%s%s\n", $col, $quo, $field[$col], $quo; 4195 } 4196 } 4197 else { 4198 print STDERR "parse () failed on argument: ", 4199 $csv->error_input, "\n"; 4200 $csv->error_diag (); 4201 } 4202 4203=head3 Parsing CSV from memory 4204 4205Given a complete CSV data-set in scalar C<$data>, generate a list of lists 4206to represent the rows and fields 4207 4208 # The data 4209 my $data = join "\r\n" => map { join "," => 0 .. 5 } 0 .. 
5; 4210 4211 # in a loop 4212 my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); 4213 open my $fh, "<", \$data; 4214 my @foo; 4215 while (my $row = $csv->getline ($fh)) { 4216 push @foo, $row; 4217 } 4218 close $fh; 4219 4220 # a single call 4221 my $foo = csv (in => \$data); 4222 4223=head2 Printing CSV data 4224 4225=head3 The fast way: using L</print> 4226 4227An example for creating C<CSV> files using the L</print> method: 4228 4229 my $csv = Text::CSV_XS->new ({ binary => 1, eol => $/ }); 4230 open my $fh, ">", "foo.csv" or die "foo.csv: $!"; 4231 for (1 .. 10) { 4232 $csv->print ($fh, [ $_, "$_" ]) or $csv->error_diag; 4233 } 4234 close $fh or die "foo.csv: $!"; 4235 4236=head3 The slow way: using L</combine> and L</string> 4237 4238or using the slower L</combine> and L</string> methods: 4239 4240 my $csv = Text::CSV_XS->new; 4241 4242 open my $csv_fh, ">", "hello.csv" or die "hello.csv: $!"; 4243 4244 my @sample_input_fields = ( 4245 'You said, "Hello!"', 5.67, 4246 '"Surely"', '', '3.14159'); 4247 if ($csv->combine (@sample_input_fields)) { 4248 print $csv_fh $csv->string, "\n"; 4249 } 4250 else { 4251 print "combine () failed on argument: ", 4252 $csv->error_input, "\n"; 4253 } 4254 close $csv_fh or die "hello.csv: $!"; 4255 4256=head3 Generating CSV into memory 4257 4258Format a data-set (C<@foo>) into a scalar value in memory (C<$data>): 4259 4260 # The data 4261 my @foo = map { [ 0 .. 5 ] } 0 ..
3; 4262 4263 # in a loop 4264 my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1, eol => "\r\n" }); 4265 open my $fh, ">", \my $data; 4266 $csv->print ($fh, $_) for @foo; 4267 close $fh; 4268 4269 # a single call 4270 csv (in => \@foo, out => \my $data); 4271 4272=head2 Rewriting CSV 4273 4274Rewrite C<CSV> files with C<;> as separator character to well-formed C<CSV>: 4275 4276 use Text::CSV_XS qw( csv ); 4277 csv (in => csv (in => "bad.csv", sep_char => ";"), out => *STDOUT); 4278 4279As C<STDOUT> is now default in L</csv>, a one-liner converting a UTF-16 CSV 4280file with BOM and TAB-separation to valid UTF-8 CSV could be: 4281 4282 $ perl -C3 -MText::CSV_XS=csv -we\ 4283 'csv(in=>"utf16tab.csv",encoding=>"utf16",sep=>"\t")' >utf8.csv 4284 4285=head2 Dumping database tables to CSV 4286 4287Dumping a database table can be as simple as this (TIMTOWTDI): 4288 4289 my $dbh = DBI->connect (...); 4290 my $sql = "select * from foo"; 4291 4292 # using your own loop 4293 open my $fh, ">", "foo.csv" or die "foo.csv: $!\n"; 4294 my $csv = Text::CSV_XS->new ({ binary => 1, eol => "\r\n" }); 4295 my $sth = $dbh->prepare ($sql); $sth->execute; 4296 $csv->print ($fh, $sth->{NAME_lc}); 4297 while (my $row = $sth->fetch) { 4298 $csv->print ($fh, $row); 4299 } 4300 4301 # using the csv function, all in memory 4302 csv (out => "foo.csv", in => $dbh->selectall_arrayref ($sql)); 4303 4304 # using the csv function, streaming with callbacks 4305 my $sth = $dbh->prepare ($sql); $sth->execute; 4306 csv (out => "foo.csv", in => sub { $sth->fetch }); 4307 csv (out => "foo.csv", in => sub { $sth->fetchrow_hashref }); 4308 4309Note that this does not discriminate between "empty" values and NULL-values 4310from the database, as both will be the same empty field in CSV. To enable 4311distinction between the two, use L<C<quote_empty>|/quote_empty>.
4312 4313 csv (out => "foo.csv", in => sub { $sth->fetch }, quote_empty => 1); 4314 4315If the database import utility supports special sequences to insert C<NULL> 4316values into the database, like MySQL/MariaDB supports C<\N>, use a filter 4317or a map 4318 4319 csv (out => "foo.csv", in => sub { $sth->fetch }, 4320 on_in => sub { $_ //= "\\N" for @{$_[1]} }); 4321 4322 while (my $row = $sth->fetch) { 4323 $csv->print ($fh, [ map { $_ // "\\N" } @$row ]); 4324 } 4325 4326Note that this will not work as expected when choosing the backslash (C<\>) 4327as C<escape_char>, as that will cause the C<\> to need to be escaped by yet 4328another C<\>, which will cause the field to need quotation and thus end 4329up as C<"\\N"> instead of C<\N>. See also L<C<undef_str>|/undef_str>. 4330 4331 csv (out => "foo.csv", in => sub { $sth->fetch }, undef_str => "\\N"); 4332 4333These special sequences are not recognized by Text::CSV_XS on parsing the 4334CSV generated like this, but map and filter are your friends again 4335 4336 while (my $row = $csv->getline ($fh)) { 4337 $sth->execute (map { $_ eq "\\N" ? undef : $_ } @$row); 4338 } 4339 4340 csv (in => "foo.csv", filter => { 1 => sub { 4341 $sth->execute (map { $_ eq "\\N" ? undef : $_ } @{$_[1]}); 0; }}); 4342 4343=head2 Converting CSV to JSON 4344 4345 use Text::CSV_XS qw( csv ); 4346 use JSON; # or Cpanel::JSON::XS for better performance 4347 4348 # AoA (no header interpretation) 4349 say encode_json (csv (in => "file.csv")); 4350 4351 # AoH (convert to structures) 4352 say encode_json (csv (in => "file.csv", bom => 1)); 4353 4354Yes, it is that simple. 4355 4356=head2 The examples folder 4357 4358For more extended examples, see the F<examples/> C<1> sub-directory in the 4359original distribution or the git repository C<2>. 4360 4361 1. https://github.com/Tux/Text-CSV_XS/tree/master/examples 4362 2.
https://github.com/Tux/Text-CSV_XS 4363 4364The following files can be found there: 4365 4366=over 2 4367 4368=item parser-xs.pl 4369X<parser-xs.pl> 4370 4371This can be used as a boilerplate to parse invalid C<CSV> and parse beyond 4372(expected) errors as an alternative to using the L</error> callback. 4373 4374 $ perl examples/parser-xs.pl bad.csv >good.csv 4375 4376=item csv-check 4377X<csv-check> 4378 4379This is a command-line tool that uses parser-xs.pl techniques to check the 4380C<CSV> file and report on its content. 4381 4382 $ csv-check files/utf8.csv 4383 Checked files/utf8.csv with csv-check 1.9 4384 using Text::CSV_XS 1.32 with perl 5.26.0 and Unicode 9.0.0 4385 OK: rows: 1, columns: 2 4386 sep = <,>, quo = <">, bin = <1>, eol = <"\n"> 4387 4388=item csv-split 4389X<csv-split> 4390 4391This command splits C<CSV> files into smaller files, keeping (part of) the 4392header. Options include maximum number of (data) rows per file and maximum 4393number of columns per file or a combination of the two. 4394 4395=item csv2xls 4396X<csv2xls> 4397 4398A script to convert C<CSV> to Microsoft Excel (C<XLS>). This requires extra 4399modules L<Date::Calc> and L<Spreadsheet::WriteExcel>. The converter accepts 4400various options and can produce UTF-8 compliant Excel files. 4401 4402=item csv2xlsx 4403X<csv2xlsx> 4404 4405A script to convert C<CSV> to Microsoft Excel (C<XLSX>). This requires the 4406modules L<Date::Calc> and L<Spreadsheet::Writer::XLSX>. The converter does 4407accept various options including merging several C<CSV> files into a single 4408Excel file. 4409 4410=item csvdiff 4411X<csvdiff> 4412 4413A script that provides colorized diff on sorted CSV files, assuming first 4414line is header and first field is the key. Output options include colorized 4415ANSI escape codes or HTML. 4416 4417 $ csvdiff --html --output=diff.html file1.csv file2.csv 4418 4419=item rewrite.pl 4420X<rewrite.pl> 4421 4422A script to rewrite (in)valid CSV into valid CSV files.
Script has options 4423to generate confusing CSV files or CSV files that conform to Dutch MS-Excel 4424exports (using C<;> as separation). 4425 4426Script - by default - honors BOM and auto-detects separation converting it 4427to default standard CSV with C<,> as separator. 4428 4429=back 4430 4431=head1 CAVEATS 4432 4433Text::CSV_XS is I<not> designed to detect the characters used to quote and 4434separate fields. The parsing is done using predefined (default) settings. 4435In the examples sub-directory, you can find scripts that demonstrate how 4436you could try to detect these characters yourself. 4437 4438=head2 Microsoft Excel 4439 4440The import/export from Microsoft Excel is a I<risky task>, according to the 4441documentation in C<Text::CSV::Separator>. Microsoft uses the system's list 4442separator defined in the regional settings, which happens to be a semicolon 4443for Dutch, German and Spanish (and probably some others as well). For the 4444English locale, the default is a comma. In Windows however, the user is 4445free to choose a predefined locale, and then change I<every> individual 4446setting in it, so checking the locale is no solution. 4447 4448As of version 1.17, a lone first line with just 4449 4450 sep=; 4451 4452will be recognized and honored when parsing with L</getline>. 4453 4454=head1 TODO 4455 4456=over 2 4457 4458=item More Errors & Warnings 4459 4460New extensions ought to be clear and concise in reporting what error has 4461occurred where and why, and maybe also offer a remedy to the problem. 4462 4463L</error_diag> is a (very) good start, but there is more work to be done in 4464this area. 4465 4466Basic calls should croak or warn on illegal parameters. Errors should be 4467documented. 
4468 4469=item setting meta info 4470 4471Future extensions might include extending the L</meta_info>, L</is_quoted>, 4472and L</is_binary> to accept setting these flags for fields, so you can 4473specify which fields are quoted in the L</combine>/L</string> combination. 4474 4475 $csv->meta_info (0, 1, 1, 3, 0, 0); 4476 $csv->is_quoted (3, 1); 4477 4478L<Metadata Vocabulary for Tabular Data|http://w3c.github.io/csvw/metadata/> 4479(a W3C editor's draft) could be an example for supporting more metadata. 4480 4481=item Parse the whole file at once 4482 4483Implement new methods or functions that enable parsing of a complete file 4484at once, returning a list of hashes. Possible extension to this could be to 4485enable a column selection on the call: 4486 4487 my @AoH = $csv->parse_file ($filename, { cols => [ 1, 4..8, 12 ]}); 4488 4489returning something like 4490 4491 [ { fields => [ 1, 2, "foo", 4.5, undef, "", 8 ], 4492 flags => [ ... ], 4493 }, 4494 { fields => [ ... ], 4495 . 4496 }, 4497 ] 4498 4499Note that the L</csv> function already supports most of this, but does not 4500return flags. L</getline_all> returns all rows for an open stream, but this 4501will not return flags either. L</fragment> can reduce the required rows 4502I<or> columns, but cannot combine them. 4503 4504=item Cookbook 4505 4506Write a document that has recipes for most known non-standard (and maybe 4507some standard) C<CSV> formats, including formats that use C<TAB>, C<;>, 4508C<|>, or other non-comma separators. 4509 4510Examples could be taken from W3C's L<CSV on the Web: Use Cases and 4511Requirements|http://w3c.github.io/csvw/use-cases-and-requirements/index.html> 4512 4513=item Steal 4514 4515Steal good new ideas and features from L<PapaParse|http://papaparse.com> or 4516L<csvkit|http://csvkit.readthedocs.org>. 4517 4518=item Raku support 4519 4520Raku support can be found L<here|https://github.com/Tux/CSV>. 
The interface 4521is richer in support than the Perl5 API, as Raku supports more types. 4522 4523The Raku version does not (yet) support pure binary CSV datasets. 4524 4525=back 4526 4527=head2 NOT TODO 4528 4529=over 2 4530 4531=item combined methods 4532 4533Requests for adding means (methods) that combine L</combine> and L</string> 4534in a single call will B<not> be honored (use L</print> instead). Likewise 4535for L</parse> and L</fields> (use L</getline> instead), given the problems 4536with embedded newlines. 4537 4538=back 4539 4540=head2 Release plan 4541 4542No guarantees, but this is what I had in mind some time ago: 4543 4544=over 2 4545 4546=item * 4547 4548DIAGNOSTICS section in pod to *describe* the errors (see below) 4549 4550=back 4551 4552=head1 EBCDIC 4553 4554Everything should now work on native EBCDIC systems. As the test does not 4555cover all possible codepoints and L<Encode> does not support C<utf-ebcdic>, 4556there is no guarantee that all handling of Unicode is done correctly. 4557 4558Opening C<EBCDIC> encoded files on C<ASCII>+ systems is likely to succeed 4559using Encode's C<cp37>, C<cp1047>, or C<posix-bc>: 4560 4561 open my $fh, "<:encoding(cp1047)", "ebcdic_file.csv" or die "..."; 4562 4563=head1 DIAGNOSTICS 4564 4565Still under construction ... 4566 4567If an error occurs, C<< $csv->error_diag >> can be used to get information 4568on the cause of the failure. Note that for speed reasons the internal value 4569is never cleared on success, so using the value returned by L</error_diag> 4570in normal cases - when no error occurred - may cause unexpected results. 4571 4572If the constructor failed, the cause can be found using L</error_diag> as a 4573class method, like C<< Text::CSV_XS->error_diag >>. 4574 4575The C<< $csv->error_diag >> method is automatically invoked upon error when 4576the constructor was called with L<C<auto_diag>|/auto_diag> set to C<1> or 4577C<2>, or when L<autodie> is in effect.
When set to C<1>, this will cause a 4578C<warn> with the error message, when set to C<2>, it will C<die>. C<2012 - 4579EOF> is excluded from L<C<auto_diag>|/auto_diag> reports. 4580 4581Errors can be (individually) caught using the L</error> callback. 4582 4583The errors as described below are available. I have tried to make the error 4584itself explanatory enough, but more descriptions will be added. For most of 4585these errors, the first three capitals describe the error category: 4586 4587=over 2 4588 4589=item * 4590INI 4591 4592Initialization error or option conflict. 4593 4594=item * 4595ECR 4596 4597Carriage-Return related parse error. 4598 4599=item * 4600EOF 4601 4602End-Of-File related parse error. 4603 4604=item * 4605EIQ 4606 4607Parse error inside quotation. 4608 4609=item * 4610EIF 4611 4612Parse error inside field. 4613 4614=item * 4615ECB 4616 4617Combine error. 4618 4619=item * 4620EHR 4621 4622HashRef parse related error. 4623 4624=back 4625 4626And below should be the complete list of error codes that can be returned: 4627 4628=over 2 4629 4630=item * 46311001 "INI - sep_char is equal to quote_char or escape_char" 4632X<1001> 4633 4634The L<separation character|/sep_char> cannot be equal to L<the quotation 4635character|/quote_char> or to L<the escape character|/escape_char>, as this 4636would invalidate all parsing rules. 4637 4638=item * 46391002 "INI - allow_whitespace with escape_char or quote_char SP or TAB" 4640X<1002> 4641 4642Using the L<C<allow_whitespace>|/allow_whitespace> attribute when either 4643L<C<quote_char>|/quote_char> or L<C<escape_char>|/escape_char> is equal to 4644C<SPACE> or C<TAB> is too ambiguous to allow. 4645 4646=item * 46471003 "INI - \r or \n in main attr not allowed" 4648X<1003> 4649 4650Using default L<C<eol>|/eol> characters in either L<C<sep_char>|/sep_char>, 4651L<C<quote_char>|/quote_char>, or L<C<escape_char>|/escape_char> is not 4652allowed. 
4653 4654=item * 46551004 "INI - callbacks should be undef or a hashref" 4656X<1004> 4657 4658The L<C<callbacks>|/Callbacks> attribute only allows one to be C<undef> or 4659a hash reference. 4660 4661=item * 46621005 "INI - EOL too long" 4663X<1005> 4664 4665The value passed for EOL is exceeding its maximum length (16). 4666 4667=item * 46681006 "INI - SEP too long" 4669X<1006> 4670 4671The value passed for SEP is exceeding its maximum length (16). 4672 4673=item * 46741007 "INI - QUOTE too long" 4675X<1007> 4676 4677The value passed for QUOTE is exceeding its maximum length (16). 4678 4679=item * 46801008 "INI - SEP undefined" 4681X<1008> 4682 4683The value passed for SEP should be defined and not empty. 4684 4685=item * 46861010 "INI - the header is empty" 4687X<1010> 4688 4689The header line parsed in the L</header> is empty. 4690 4691=item * 46921011 "INI - the header contains more than one valid separator" 4693X<1011> 4694 4695The header line parsed in the L</header> contains more than one (unique) 4696separator character out of the allowed set of separators. 4697 4698=item * 46991012 "INI - the header contains an empty field" 4700X<1012> 4701 4702The header line parsed in the L</header> contains an empty field. 4703 4704=item * 47051013 "INI - the header contains nun-unique fields" 4706X<1013> 4707 4708The header line parsed in the L</header> contains at least two identical 4709fields. 4710 4711=item * 47121014 "INI - header called on undefined stream" 4713X<1014> 4714 4715The header line cannot be parsed from an undefined source. 4716 4717=item * 47181500 "PRM - Invalid/unsupported argument(s)" 4719X<1500> 4720 4721Function or method called with invalid argument(s) or parameter(s). 4722 4723=item * 47241501 "PRM - The key attribute is passed as an unsupported type" 4725X<1501> 4726 4727The C<key> attribute is of an unsupported type. 
4728 4729=item * 47301502 "PRM - The value attribute is passed without the key attribute" 4731X<1502> 4732 4733The C<value> attribute is only allowed when a valid key is given. 4734 4735=item * 47361503 "PRM - The value attribute is passed as an unsupported type" 4737X<1503> 4738 4739The C<value> attribute is of an unsupported type. 4740 4741=item * 47422010 "ECR - QUO char inside quotes followed by CR not part of EOL" 4743X<2010> 4744 4745When L<C<eol>|/eol> has been set to anything but the default, like 4746C<"\r\t\n">, and the C<"\r"> is following the B<second> (closing) 4747L<C<quote_char>|/quote_char>, where the characters following the C<"\r"> do 4748not make up the L<C<eol>|/eol> sequence, this is an error. 4749 4750=item * 47512011 "ECR - Characters after end of quoted field" 4752X<2011> 4753 4754Sequences like C<1,foo,"bar"baz,22,1> are not allowed. C<"bar"> is a quoted 4755field and after the closing double-quote, there should be either a new-line 4756sequence or a separation character. 4757 4758=item * 47592012 "EOF - End of data in parsing input stream" 4760X<2012> 4761 4762Self-explaining. End-of-file while inside parsing a stream. Can happen only 4763when reading from streams with L</getline>, as using L</parse> is done on 4764strings that are not required to have a trailing L<C<eol>|/eol>. 4765 4766=item * 47672013 "INI - Specification error for fragments RFC7111" 4768X<2013> 4769 4770Invalid specification for URI L</fragment> specification. 4771 4772=item * 47732014 "ENF - Inconsistent number of fields" 4774X<2014> 4775 4776Inconsistent number of fields under strict parsing. 4777 4778=item * 47792021 "EIQ - NL char inside quotes, binary off" 4780X<2021> 4781 4782Sequences like C<1,"foo\nbar",22,1> are allowed only when the binary option 4783has been selected with the constructor. 
4784 4785=item * 47862022 "EIQ - CR char inside quotes, binary off" 4787X<2022> 4788 4789Sequences like C<1,"foo\rbar",22,1> are allowed only when the binary option 4790has been selected with the constructor. 4791 4792=item * 47932023 "EIQ - QUO character not allowed" 4794X<2023> 4795 4796Sequences like C<"foo "bar" baz",qu> and C<2023,",2008-04-05,"Foo, Bar",\n> 4797will cause this error. 4798 4799=item * 48002024 "EIQ - EOF cannot be escaped, not even inside quotes" 4801X<2024> 4802 4803The escape character is not allowed as last character in an input stream. 4804 4805=item * 48062025 "EIQ - Loose unescaped escape" 4807X<2025> 4808 4809An escape character should escape only characters that need escaping. 4810 4811Allowing the escape for other characters is possible with the attribute 4812L</allow_loose_escapes>. 4813 4814=item * 48152026 "EIQ - Binary character inside quoted field, binary off" 4816X<2026> 4817 4818Binary characters are not allowed by default. Exceptions are fields that 4819contain valid UTF-8, that will automatically be upgraded if the content is 4820valid UTF-8. Set L<C<binary>|/binary> to C<1> to accept binary data. 4821 4822=item * 48232027 "EIQ - Quoted field not terminated" 4824X<2027> 4825 4826When parsing a field that started with a quotation character, the field is 4827expected to be closed with a quotation character. When the parsed line is 4828exhausted before the quote is found, that field is not terminated. 
4829 4830=item * 48312030 "EIF - NL char inside unquoted verbatim, binary off" 4832X<2030> 4833 4834=item * 48352031 "EIF - CR char is first char of field, not part of EOL" 4836X<2031> 4837 4838=item * 48392032 "EIF - CR char inside unquoted, not part of EOL" 4840X<2032> 4841 4842=item * 48432034 "EIF - Loose unescaped quote" 4844X<2034> 4845 4846=item * 48472035 "EIF - Escaped EOF in unquoted field" 4848X<2035> 4849 4850=item * 48512036 "EIF - ESC error" 4852X<2036> 4853 4854=item * 48552037 "EIF - Binary character in unquoted field, binary off" 4856X<2037> 4857 4858=item * 48592110 "ECB - Binary character in Combine, binary off" 4860X<2110> 4861 4862=item * 48632200 "EIO - print to IO failed. See errno" 4864X<2200> 4865 4866=item * 48673001 "EHR - Unsupported syntax for column_names ()" 4868X<3001> 4869 4870=item * 48713002 "EHR - getline_hr () called before column_names ()" 4872X<3002> 4873 4874=item * 48753003 "EHR - bind_columns () and column_names () fields count mismatch" 4876X<3003> 4877 4878=item * 48793004 "EHR - bind_columns () only accepts refs to scalars" 4880X<3004> 4881 4882=item * 48833006 "EHR - bind_columns () did not pass enough refs for parsed fields" 4884X<3006> 4885 4886=item * 48873007 "EHR - bind_columns needs refs to writable scalars" 4888X<3007> 4889 4890=item * 48913008 "EHR - unexpected error in bound fields" 4892X<3008> 4893 4894=item * 48953009 "EHR - print_hr () called before column_names ()" 4896X<3009> 4897 4898=item * 48993010 "EHR - print_hr () called with invalid arguments" 4900X<3010> 4901 4902=back 4903 4904=head1 SEE ALSO 4905 4906L<IO::File>, L<IO::Handle>, L<IO::Wrap>, L<Text::CSV>, L<Text::CSV_PP>, 4907L<Text::CSV::Encoded>, L<Text::CSV::Separator>, L<Text::CSV::Slurp>, 4908L<Spreadsheet::CSV> and L<Spreadsheet::Read>, and of course L<perl>. 4909 4910If you are using Raku, have a look at C<Text::CSV> in the Raku ecosystem, 4911offering the same features. 
4912 4913=head3 non-perl 4914 4915A CSV parser in JavaScript, also used by L<W3C|http://www.w3.org>, is the 4916multi-threaded in-browser L<PapaParse|http://papaparse.com/>. 4917 4918L<csvkit|http://csvkit.readthedocs.org> is a python CSV parsing toolkit. 4919 4920=head1 AUTHOR 4921 4922Alan Citterman F<E<lt>alan@mfgrtl.comE<gt>> wrote the original Perl module. 4923Please don't send mail concerning Text::CSV_XS to Alan, who is not involved 4924in the C/XS part that is now the main part of the module. 4925 4926Jochen Wiedmann F<E<lt>joe@ispsoft.deE<gt>> rewrote the en- and decoding in 4927C by implementing a simple finite-state machine. He added variable quote, 4928escape and separator characters, the binary mode and the print and getline 4929methods. See F<ChangeLog> releases 0.10 through 0.23. 4930 4931H.Merijn Brand F<E<lt>h.m.brand@xs4all.nlE<gt>> cleaned up the code, added 4932the field flags methods, wrote the major part of the test suite, completed 4933the documentation, fixed most RT bugs, added all the allow flags and the 4934L</csv> function. See ChangeLog releases 0.25 and on. 4935 4936=head1 COPYRIGHT AND LICENSE 4937 4938 Copyright (C) 2007-2021 H.Merijn Brand. All rights reserved. 4939 Copyright (C) 1998-2001 Jochen Wiedmann. All rights reserved. 4940 Copyright (C) 1997 Alan Citterman. All rights reserved. 4941 4942This library is free software; you can redistribute and/or modify it under 4943the same terms as Perl itself. 4944 4945=cut 4946 4947=for elvis 4948:ex:se gw=75|color guide #ff0000: 4949 4950=cut 4951