1# csv.tcl -- 2# 3# Tcl implementations of CSV reader and writer 4# 5# Copyright (c) 2001 by Jeffrey Hobbs 6# Copyright (c) 2001-2013 by Andreas Kupries <andreas_kupries@users.sourceforge.net> 7# 8# See the file "license.terms" for information on usage and redistribution 9# of this file, and for a DISCLAIMER OF ALL WARRANTIES. 10# 11# RCS: @(#) $Id: csv.tcl,v 1.28 2011/11/23 02:22:10 andreas_kupries Exp $ 12 13package require Tcl 8.4 14package provide csv 0.8.1 15 16namespace eval ::csv { 17 namespace export join joinlist read2matrix read2queue report 18 namespace export split split2matrix split2queue writematrix writequeue 19} 20 21# ::csv::join -- 22# 23# Takes a list of values and generates a string in CSV format. 24# 25# Arguments: 26# values A list of the values to join 27# sepChar The separator character, defaults to comma 28# delChar The delimiter character, defaults to quote 29# delMode If set to 'always', values are always surrounded by delChar 30# 31# Results: 32# A string containing the values in CSV format. 33 34proc ::csv::join {values {sepChar ,} {delChar \"} {delMode auto}} { 35 set out "" 36 set sep {} 37 foreach val $values { 38 if {($delMode eq "always") || [string match "*\[${delChar}$sepChar\r\n\]*" $val]} { 39 append out $sep${delChar}[string map [list $delChar ${delChar}${delChar}] $val]${delChar} 40 } else { 41 append out $sep${val} 42 } 43 set sep $sepChar 44 } 45 return $out 46} 47 48# ::csv::joinlist -- 49# 50# Takes a list of lists of values and generates a string in CSV 51# format. Each item in the list is made into a single CSV 52# formatted record in the final string, the records being 53# separated by newlines. 54# 55# Arguments: 56# values A list of the lists of the values to join 57# sepChar The separator character, defaults to comma 58# delChar The delimiter character, defaults to quote 59# delMode If set to 'always', values are always surrounded by delChar 60# 61# Results: 62# A string containing the values in CSV format, the records 63# separated by newlines. 64 65proc ::csv::joinlist {values {sepChar ,} {delChar \"} {delMode auto}} { 66 set out "" 67 foreach record $values { 68 # note that this is ::csv::join 69 append out "[join $record $sepChar $delChar $delMode]\n" 70 } 71 return $out 72} 73 74# ::csv::joinmatrix -- 75# 76# Takes a matrix object following the API specified for the 77# struct::matrix package. Each row of the matrix is converted 78# into a single CSV formatted record in the final string, the 79# records being separated by newlines. 80# 81# Arguments: 82# matrix Matrix object command. 83# sepChar The separator character, defaults to comma 84# delChar The delimiter character, defaults to quote 85# delMode If set to 'always', values are always surrounded by delChar 86# 87# Results: 88# A string containing the values in CSV format, the records 89# separated by newlines. 90 91proc ::csv::joinmatrix {matrix {sepChar ,} {delChar \"} {delMode auto}} { 92 return [joinlist [$matrix get rect 0 0 end end] $sepChar $delChar $delMode] 93} 94 95# ::csv::iscomplete -- 96# 97# A predicate checking if the argument is a complete csv record. 98# 99# Arguments 100# data The (partial) csv record to check. 101# 102# Results: 103# A boolean flag indicating the completeness of the data. The 104# result is true if the data is complete. 105 106proc ::csv::iscomplete {data} { 107 expr {1 - [regexp -all \" $data] % 2} 108} 109 110# ::csv::read2matrix -- 111# 112# A wrapper around "Split2matrix" reading CSV formatted 113# lines from the specified channel and adding it to the given 114# matrix. 115# 116# Arguments: 117# m The matrix to add the read data too. 118# chan The channel to read from. 119# sepChar The separator character, defaults to comma 120# expand The expansion mode. The default is none 121# 122# Results: 123# A list of the values in 'line'. 124 125proc ::csv::read2matrix {args} { 126 # FR #481023 127 # See 'split2matrix' for the available expansion modes. 128 129 # Argument syntax: 130 # 131 #2) chan m 132 #3) chan m sepChar 133 #3) -alternate chan m 134 #4) -alternate chan m sepChar 135 #4) chan m sepChar expand 136 #5) -alternate chan m sepChar expand 137 138 set alternate 0 139 set sepChar , 140 set expand none 141 142 switch -exact -- [llength $args] { 143 2 { 144 foreach {chan m} $args break 145 } 146 3 { 147 foreach {a b c} $args break 148 if {[string equal $a "-alternate"]} { 149 set alternate 1 150 set chan $b 151 set m $c 152 } else { 153 set chan $a 154 set m $b 155 set sepChar $c 156 } 157 } 158 4 { 159 foreach {a b c d} $args break 160 if {[string equal $a "-alternate"]} { 161 set alternate 1 162 set chan $b 163 set m $c 164 set sepChar $d 165 } else { 166 set chan $a 167 set m $b 168 set sepChar $c 169 set expand $d 170 } 171 } 172 5 { 173 foreach {a b c d e} $args break 174 if {![string equal $a "-alternate"]} { 175 return -code error "wrong#args: Should be ?-alternate? chan m ?separator? ?expand?" 176 } 177 set alternate 1 178 179 set chan $b 180 set m $c 181 set sepChar $d 182 set expand $e 183 } 184 0 - 1 - 185 default { 186 return -code error "wrong#args: Should be ?-alternate? chan m ?separator? ?expand?" 187 } 188 } 189 190 if {[string length $sepChar] < 1} { 191 return -code error "illegal separator character \"$sepChar\", is empty" 192 } elseif {[string length $sepChar] > 1} { 193 return -code error "illegal separator character \"$sepChar\", is a string" 194 } 195 196 set data "" 197 while {![eof $chan]} { 198 if {[gets $chan line] < 0} {continue} 199 200 # Why skip empty lines? They may be in data. Except if the 201 # buffer is empty, i.e. we are between records. 202 if {$line == {} && $data == {}} {continue} 203 204 append data $line 205 if {![iscomplete $data]} { 206 # Odd number of quotes - must have embedded newline 207 append data \n 208 continue 209 } 210 211 Split2matrix $alternate $m $data $sepChar $expand 212 set data "" 213 } 214 return 215} 216 217# ::csv::read2queue -- 218# 219# A wrapper around "::csv::split2queue" reading CSV formatted 220# lines from the specified channel and adding it to the given 221# queue. 222# 223# Arguments: 224# q The queue to add the read data too. 225# chan The channel to read from. 226# sepChar The separator character, defaults to comma 227# 228# Results: 229# A list of the values in 'line'. 230 231proc ::csv::read2queue {args} { 232 # Argument syntax: 233 # 234 #2) chan q 235 #3) chan q sepChar 236 #3) -alternate chan q 237 #4) -alternate chan q sepChar 238 239 set alternate 0 240 set sepChar , 241 242 switch -exact -- [llength $args] { 243 2 { 244 foreach {chan q} $args break 245 } 246 3 { 247 foreach {a b c} $args break 248 if {[string equal $a "-alternate"]} { 249 set alternate 1 250 set chan $b 251 set q $c 252 } else { 253 set chan $a 254 set q $b 255 set sepChar $c 256 } 257 } 258 4 { 259 foreach {a b c d} $args break 260 if {![string equal $a "-alternate"]} { 261 return -code error "wrong#args: Should be ?-alternate? chan q ?separator?" 262 } 263 set alternate 1 264 set chan $b 265 set q $c 266 set sepChar $d 267 } 268 0 - 1 - 269 default { 270 return -code error "wrong#args: Should be ?-alternate? chan q ?separator?" 271 } 272 } 273 274 if {[string length $sepChar] < 1} { 275 return -code error "illegal separator character \"$sepChar\", is empty" 276 } elseif {[string length $sepChar] > 1} { 277 return -code error "illegal separator character \"$sepChar\", is a string" 278 } 279 280 set data "" 281 while {![eof $chan]} { 282 if {[gets $chan line] < 0} {continue} 283 284 # Why skip empty lines? They may be in data. Except if the 285 # buffer is empty, i.e. we are between records. 286 if {$line == {} && $data == {}} {continue} 287 288 append data $line 289 if {![iscomplete $data]} { 290 # Odd number of quotes - must have embedded newline 291 append data \n 292 continue 293 } 294 295 $q put [Split $alternate $data $sepChar] 296 set data "" 297 } 298 return 299} 300 301# ::csv::report -- 302# 303# A report command which can be used by the matrix methods 304# "format-via" and "format2chan-via". For the latter this 305# command delegates the work to "::csv::writematrix". "cmd" is 306# expected to be either "printmatrix" or 307# "printmatrix2channel". The channel argument, "chan", has to 308# be present for the latter and must not be present for the first. 309# 310# Arguments: 311# cmd Either 'printmatrix' or 'printmatrix2channel' 312# matrix The matrix to format. 313# args 0 (chan): The channel to write to 314# 315# Results: 316# None for 'printmatrix2channel', else the CSV formatted string. 317 318proc ::csv::report {cmd matrix args} { 319 switch -exact -- $cmd { 320 printmatrix { 321 if {[llength $args] > 0} { 322 return -code error "wrong # args:\ 323 ::csv::report printmatrix matrix" 324 } 325 return [joinlist [$matrix get rect 0 0 end end]] 326 } 327 printmatrix2channel { 328 if {[llength $args] != 1} { 329 return -code error "wrong # args:\ 330 ::csv::report printmatrix2channel matrix chan" 331 } 332 writematrix $matrix [lindex $args 0] 333 return "" 334 } 335 default { 336 return -code error "Unknown method $cmd" 337 } 338 } 339} 340 341# ::csv::split -- 342# 343# Split a string according to the rules for CSV processing. 344# This assumes that the string contains a single line of CSVs 345# 346# Arguments: 347# line The string to split 348# sepChar The separator character, defaults to comma 349# 350# Results: 351# A list of the values in 'line'. 352 353proc ::csv::split {args} { 354 # Argument syntax: 355 # 356 # (1) line 357 # (2) line sepChar 358 # (2) -alternate line 359 # (3) -alternate line sepChar 360 361 # (3) line sepChar delChar 362 # (4) -alternate line sepChar delChar 363 364 set alternate 0 365 set sepChar , 366 set delChar \" 367 368 switch -exact -- [llength $args] { 369 1 { 370 set line [lindex $args 0] 371 } 372 2 { 373 foreach {a b} $args break 374 if {[string equal $a "-alternate"]} { 375 set alternate 1 376 set line $b 377 } else { 378 set line $a 379 set sepChar $b 380 } 381 } 382 3 { 383 foreach {a b c} $args break 384 if {[string equal $a "-alternate"]} { 385 set alternate 1 386 set line $b 387 set sepChar $c 388 } else { 389 set line $a 390 set sepChar $b 391 set delChar $c 392 } 393 } 394 4 { 395 foreach {a b c d} $args break 396 if {![string equal $a "-alternate"]} { 397 return -code error "wrong#args: Should be ?-alternate? line ?separator? ?delimiter?" 398 } 399 set alternate 1 400 set line $b 401 set sepChar $c 402 set delChar $d 403 } 404 0 - 405 default { 406 return -code error "wrong#args: Should be ?-alternate? line ?separator? ?delimiter?" 407 } 408 } 409 410 if {[string length $sepChar] < 1} { 411 return -code error "illegal separator character ${delChar}$sepChar${delChar}, is empty" 412 } elseif {[string length $sepChar] > 1} { 413 return -code error "illegal separator character ${delChar}$sepChar${delChar}, is a string" 414 } 415 416 if {[string length $delChar] < 1} { 417 return -code error "illegal separator character \"$delChar\", is empty" 418 } elseif {[string length $delChar] > 1} { 419 return -code error "illegal separator character \"$delChar\", is a string" 420 } 421 422 return [Split $alternate $line $sepChar $delChar] 423} 424 425proc ::csv::Split {alternate line sepChar {delChar \"}} { 426 # Protect the sepchar from special interpretation by 427 # the regex calls below. 428 429 set sepRE \[\[.${sepChar}.]] 430 set delRE \[\[.${delChar}.]] 431 432 if {$alternate} { 433 # The alternate syntax requires a different parser. 434 # A variation of the string map / regsub parser for the 435 # regular syntax was tried but does not handle embedded 436 # doubled " well (testcase csv-91.3 was 'knownBug', sole 437 # one, still a bug). Now we just tokenize the input into 438 # the primary parts (sep char, "'s and the rest) and then 439 # use an explicitly coded state machine (DFA) to parse 440 # and convert token sequences. 441 442 ## puts 1->>$line<< 443 set line [string map [list \ 444 $sepChar \0$sepChar\0 \ 445 $delChar \0${delChar}\0 \ 446 ] $line] 447 448 ## puts 2->>$line<< 449 set line [string map [list \0\0 \0] $line] 450 regsub "^\0" $line {} line 451 regsub "\0$" $line {} line 452 453 ## puts 3->>$line<< 454 455 set val "" 456 set res "" 457 set state base 458 459 ## puts 4->>[::split $line \0] 460 foreach token [::split $line \0] { 461 462 ## puts "\t*= $state\t>>$token<<" 463 switch -exact -- $state { 464 base { 465 if {[string equal $token "${delChar}"]} { 466 set state qvalue 467 continue 468 } 469 if {[string equal $token $sepChar]} { 470 lappend res $val 471 set val "" 472 continue 473 } 474 append val $token 475 } 476 qvalue { 477 if {[string equal $token "${delChar}"]} { 478 # May end value, may be a doubled " 479 set state endordouble 480 continue 481 } 482 append val $token 483 } 484 endordouble { 485 if {[string equal $token "${delChar}"]} { 486 # Doubled ", append to current value 487 append val ${delChar} 488 set state qvalue 489 continue 490 } 491 # Last " was end of quoted value. Close it. 492 # We expect current as $sepChar 493 494 lappend res $val 495 set val "" 496 set state base 497 498 if {[string equal $token $sepChar]} {continue} 499 500 # Undoubled " in middle of text. Just assume that 501 # remainder is another qvalue. 502 set state qvalue 503 } 504 default { 505 return -code error "Internal error, illegal parsing state" 506 } 507 } 508 } 509 510 ## puts "/= $state\t>>$val<<" 511 512 lappend res $val 513 514 ## puts 5->>$res<< 515 return $res 516 } else { 517 regsub -- "$sepRE${delRE}${delRE}$" $line $sepChar\0${delChar}${delChar}\0 line 518 regsub -- "^${delRE}${delRE}$sepRE" $line \0${delChar}${delChar}\0$sepChar line 519 regsub -all -- {(^${delChar}|${delChar}$)} $line \0 line 520 521 set line [string map [list \ 522 $sepChar${delChar}${delChar}${delChar} $sepChar\0${delChar} \ 523 ${delChar}${delChar}${delChar}$sepChar ${delChar}\0$sepChar \ 524 ${delChar}${delChar} ${delChar} \ 525 ${delChar} \0 \ 526 ] $line] 527 528 set end 0 529 while {[regexp -indices -start $end -- {(\0)[^\0]*(\0)} $line \ 530 -> start end]} { 531 set start [lindex $start 0] 532 set end [lindex $end 0] 533 set range [string range $line $start $end] 534 if {[string first $sepChar $range] >= 0} { 535 set line [string replace $line $start $end \ 536 [string map [list $sepChar \1] $range]] 537 } 538 incr end 539 } 540 set line [string map [list $sepChar \0 \1 $sepChar \0 {} ] $line] 541 return [::split $line \0] 542 543 } 544} 545 546# ::csv::split2matrix -- 547# 548# Split a string according to the rules for CSV processing. 549# This assumes that the string contains a single line of CSVs. 550# The resulting list of values is appended to the specified 551# matrix, as a new row. The code assumes that the matrix provides 552# the same interface as the queue provided by the 'struct' 553# module of tcllib, "add row" in particular. 554# 555# Arguments: 556# m The matrix to write the resulting list to. 557# line The string to split 558# sepChar The separator character, defaults to comma 559# expand The expansion mode. The default is none 560# 561# Results: 562# A list of the values in 'line', written to 'q'. 563 564proc ::csv::split2matrix {args} { 565 # FR #481023 566 567 # Argument syntax: 568 # 569 #2) m line 570 #3) m line sepChar 571 #3) -alternate m line 572 #4) -alternate m line sepChar 573 #4) m line sepChar expand 574 #5) -alternate m line sepChar expand 575 576 set alternate 0 577 set sepChar , 578 set expand none 579 580 switch -exact -- [llength $args] { 581 2 { 582 foreach {m line} $args break 583 } 584 3 { 585 foreach {a b c} $args break 586 if {[string equal $a "-alternate"]} { 587 set alternate 1 588 set m $b 589 set line $c 590 } else { 591 set m $a 592 set line $b 593 set sepChar $c 594 } 595 } 596 4 { 597 foreach {a b c d} $args break 598 if {[string equal $a "-alternate"]} { 599 set alternate 1 600 set m $b 601 set line $c 602 set sepChar $d 603 } else { 604 set m $a 605 set line $b 606 set sepChar $c 607 set expand $d 608 } 609 } 610 4 { 611 foreach {a b c d e} $args break 612 if {![string equal $a "-alternate"]} { 613 return -code error "wrong#args: Should be ?-alternate? m line ?separator? ?expand?" 614 } 615 set alternate 1 616 617 set m $b 618 set line $c 619 set sepChar $d 620 set expand $e 621 } 622 0 - 1 - 623 default { 624 return -code error "wrong#args: Should be ?-alternate? m line ?separator? ?expand?" 625 } 626 } 627 628 if {[string length $sepChar] < 1} { 629 return -code error "illegal separator character \"$sepChar\", is empty" 630 } elseif {[string length $sepChar] > 1} { 631 return -code error "illegal separator character \"$sepChar\", is a string" 632 } 633 634 Split2matrix $alternate $m $line $sepChar $expand 635 return 636} 637 638proc ::csv::Split2matrix {alternate m line sepChar expand} { 639 set csv [Split $alternate $line $sepChar] 640 641 # Expansion modes 642 # - none : default, behaviour of original implementation. 643 # no expansion is done, lines are silently truncated 644 # to the number of columns in the matrix. 645 # 646 # - empty : A matrix without columns is expanded to the number 647 # of columns in the first line added to it. All 648 # following lines are handled as if "mode == none" 649 # was set. 650 # 651 # - auto : Full auto-mode. The matrix is expanded as needed to 652 # hold all columns of all lines. 653 654 switch -exact -- $expand { 655 none {} 656 empty { 657 if {[$m columns] == 0} { 658 $m add columns [llength $csv] 659 } 660 } 661 auto { 662 if {[$m columns] < [llength $csv]} { 663 $m add columns [expr {[llength $csv] - [$m columns]}] 664 } 665 } 666 } 667 $m add row $csv 668 return 669} 670 671# ::csv::split2queue -- 672# 673# Split a string according to the rules for CSV processing. 674# This assumes that the string contains a single line of CSVs. 675# The resulting list of values is appended to the specified 676# queue, as a single item. IOW each item in the queue represents 677# a single CSV record. The code assumes that the queue provides 678# the same interface as the queue provided by the 'struct' 679# module of tcllib, "put" in particular. 680# 681# Arguments: 682# q The queue to write the resulting list to. 683# line The string to split 684# sepChar The separator character, defaults to comma 685# 686# Results: 687# A list of the values in 'line', written to 'q'. 688 689proc ::csv::split2queue {args} { 690 # Argument syntax: 691 # 692 #2) q line 693 #3) q line sepChar 694 #3) -alternate q line 695 #4) -alternate q line sepChar 696 697 set alternate 0 698 set sepChar , 699 700 switch -exact -- [llength $args] { 701 2 { 702 foreach {q line} $args break 703 } 704 3 { 705 foreach {a b c} $args break 706 if {[string equal $a "-alternate"]} { 707 set alternate 1 708 set q $b 709 set line $c 710 } else { 711 set q $a 712 set line $b 713 set sepChar $c 714 } 715 } 716 4 { 717 foreach {a b c d} $args break 718 if {![string equal $a "-alternate"]} { 719 return -code error "wrong#args: Should be ?-alternate? q line ?separator?" 720 } 721 set alternate 1 722 723 set q $b 724 set line $c 725 set sepChar $d 726 } 727 0 - 1 - 728 default { 729 return -code error "wrong#args: Should be ?-alternate? q line ?separator?" 730 } 731 } 732 733 if {[string length $sepChar] < 1} { 734 return -code error "illegal separator character \"$sepChar\", is empty" 735 } elseif {[string length $sepChar] > 1} { 736 return -code error "illegal separator character \"$sepChar\", is a string" 737 } 738 739 $q put [Split $alternate $line $sepChar] 740 return 741} 742 743# ::csv::writematrix -- 744# 745# A wrapper around "::csv::join" taking the rows in a matrix and 746# writing them as CSV formatted lines into the channel. 747# 748# Arguments: 749# m The matrix to take the data to write from. 750# chan The channel to write into. 751# sepChar The separator character, defaults to comma 752# 753# Results: 754# None. 755 756proc ::csv::writematrix {m chan {sepChar ,} {delChar \"}} { 757 set n [$m rows] 758 for {set r 0} {$r < $n} {incr r} { 759 puts $chan [join [$m get row $r] $sepChar $delChar] 760 } 761 762 # Memory intensive alternative: 763 # puts $chan [joinlist [m get rect 0 0 end end] $sepChar $delChar] 764 return 765} 766 767# ::csv::writequeue -- 768# 769# A wrapper around "::csv::join" taking the rows in a queue and 770# writing them as CSV formatted lines into the channel. 771# 772# Arguments: 773# q The queue to take the data to write from. 774# chan The channel to write into. 775# sepChar The separator character, defaults to comma 776# 777# Results: 778# None. 779 780proc ::csv::writequeue {q chan {sepChar ,} {delChar \"}} { 781 while {[$q size] > 0} { 782 puts $chan [join [$q get] $sepChar $delChar] 783 } 784 785 # Memory intensive alternative: 786 # puts $chan [joinlist [$q get [$q size]] $sepChar $delChar] 787 return 788} 789 790