1# csv.tcl --
2#
3#	Tcl implementations of CSV reader and writer
4#
5# Copyright (c) 2001      by Jeffrey Hobbs
6# Copyright (c) 2001-2013 by Andreas Kupries <andreas_kupries@users.sourceforge.net>
7#
8# See the file "license.terms" for information on usage and redistribution
9# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
10#
11# RCS: @(#) $Id: csv.tcl,v 1.28 2011/11/23 02:22:10 andreas_kupries Exp $
12
13package require Tcl 8.4
14package provide csv 0.8.1
15
16namespace eval ::csv {
17    namespace export join joinlist read2matrix read2queue report
18    namespace export split split2matrix split2queue writematrix writequeue
19}
20
21# ::csv::join --
22#
23#	Takes a list of values and generates a string in CSV format.
24#
25# Arguments:
26#	values	A list of the values to join
27#	sepChar	The separator character, defaults to comma
28#	delChar	The delimiter character, defaults to quote
29#	delMode	If set to 'always', values are always surrounded by delChar
30#
31# Results:
32#	A string containing the values in CSV format.
33
34proc ::csv::join {values {sepChar ,} {delChar \"} {delMode auto}} {
35    set out ""
36    set sep {}
37    foreach val $values {
38	if {($delMode eq "always") || [string match "*\[${delChar}$sepChar\r\n\]*" $val]} {
39	    append out $sep${delChar}[string map [list $delChar ${delChar}${delChar}] $val]${delChar}
40	} else {
41	    append out $sep${val}
42	}
43	set sep $sepChar
44    }
45    return $out
46}
47
48# ::csv::joinlist --
49#
50#	Takes a list of lists of values and generates a string in CSV
51#	format. Each item in the list is made into a single CSV
52#	formatted record in the final string, the records being
53#	separated by newlines.
54#
55# Arguments:
56#	values	A list of the lists of the values to join
57#	sepChar	The separator character, defaults to comma
58#	delChar	The delimiter character, defaults to quote
59#	delMode	If set to 'always', values are always surrounded by delChar
60#
61# Results:
62#	A string containing the values in CSV format, the records
63#	separated by newlines.
64
65proc ::csv::joinlist {values {sepChar ,} {delChar \"} {delMode auto}} {
66    set out ""
67    foreach record $values {
68	# note that this is ::csv::join
69	append out "[join $record $sepChar $delChar $delMode]\n"
70    }
71    return $out
72}
73
74# ::csv::joinmatrix --
75#
76#	Takes a matrix object following the API specified for the
77#	struct::matrix package. Each row of the matrix is converted
78#	into a single CSV formatted record in the final string, the
79#	records being separated by newlines.
80#
81# Arguments:
82#	matrix	Matrix object command.
83#	sepChar	The separator character, defaults to comma
84#	delChar	The delimiter character, defaults to quote
85#	delMode	If set to 'always', values are always surrounded by delChar
86#
87# Results:
88#	A string containing the values in CSV format, the records
89#	separated by newlines.
90
91proc ::csv::joinmatrix {matrix {sepChar ,} {delChar \"} {delMode auto}} {
92    return [joinlist [$matrix get rect 0 0 end end] $sepChar $delChar $delMode]
93}
94
95# ::csv::iscomplete --
96#
97#	A predicate checking if the argument is a complete csv record.
98#
99# Arguments
100#	data	The (partial) csv record to check.
101#
102# Results:
103#	A boolean flag indicating the completeness of the data. The
104#	result is true if the data is complete.
105
106proc ::csv::iscomplete {data} {
107    expr {1 - [regexp -all \" $data] % 2}
108}
109
110# ::csv::read2matrix --
111#
112#	A wrapper around "Split2matrix" reading CSV formatted
113#	lines from the specified channel and adding it to the given
114#	matrix.
115#
116# Arguments:
117#	m		The matrix to add the read data too.
118#	chan		The channel to read from.
119#	sepChar		The separator character, defaults to comma
120#	expand		The expansion mode. The default is none
121#
122# Results:
123#	A list of the values in 'line'.
124
125proc ::csv::read2matrix {args} {
126    # FR #481023
127    # See 'split2matrix' for the available expansion modes.
128
129    # Argument syntax:
130    #
131    #2)            chan m
132    #3)            chan m sepChar
133    #3) -alternate chan m
134    #4) -alternate chan m sepChar
135    #4)            chan m sepChar expand
136    #5) -alternate chan m sepChar expand
137
138    set alternate 0
139    set sepChar   ,
140    set expand    none
141
142    switch -exact -- [llength $args] {
143	2 {
144	    foreach {chan m} $args break
145	}
146	3 {
147	    foreach {a b c} $args break
148	    if {[string equal $a "-alternate"]} {
149		set alternate 1
150		set chan      $b
151		set m         $c
152	    } else {
153		set chan    $a
154		set m       $b
155		set sepChar $c
156	    }
157	}
158	4 {
159	    foreach {a b c d} $args break
160	    if {[string equal $a "-alternate"]} {
161		set alternate 1
162		set chan      $b
163		set m         $c
164		set sepChar   $d
165	    } else {
166		set chan    $a
167		set m       $b
168		set sepChar $c
169		set expand  $d
170	    }
171	}
172	5 {
173	    foreach {a b c d e} $args break
174	    if {![string equal $a "-alternate"]} {
175		return -code error "wrong#args: Should be ?-alternate? chan m ?separator? ?expand?"
176	    }
177	    set alternate 1
178
179	    set chan    $b
180	    set m       $c
181	    set sepChar $d
182	    set expand  $e
183	}
184	0 - 1 -
185	default {
186	    return -code error "wrong#args: Should be ?-alternate? chan m ?separator? ?expand?"
187	}
188    }
189
190    if {[string length $sepChar] < 1} {
191	return -code error "illegal separator character \"$sepChar\", is empty"
192    } elseif {[string length $sepChar] > 1} {
193	return -code error "illegal separator character \"$sepChar\", is a string"
194    }
195
196    set data ""
197    while {![eof $chan]} {
198	if {[gets $chan line] < 0} {continue}
199
200	# Why skip empty lines? They may be in data. Except if the
201	# buffer is empty, i.e. we are between records.
202	if {$line == {} && $data == {}} {continue}
203
204       append data $line
205       if {![iscomplete $data]} {
206           # Odd number of quotes - must have embedded newline
207           append data \n
208           continue
209       }
210
211       Split2matrix $alternate $m $data $sepChar $expand
212       set data ""
213    }
214    return
215}
216
217# ::csv::read2queue --
218#
219#	A wrapper around "::csv::split2queue" reading CSV formatted
220#	lines from the specified channel and adding it to the given
221#	queue.
222#
223# Arguments:
224#	q		The queue to add the read data too.
225#	chan		The channel to read from.
226#	sepChar		The separator character, defaults to comma
227#
228# Results:
229#	A list of the values in 'line'.
230
231proc ::csv::read2queue {args} {
232    # Argument syntax:
233    #
234    #2)            chan q
235    #3)            chan q sepChar
236    #3) -alternate chan q
237    #4) -alternate chan q sepChar
238
239    set alternate 0
240    set sepChar   ,
241
242    switch -exact -- [llength $args] {
243	2 {
244	    foreach {chan q} $args break
245	}
246	3 {
247	    foreach {a b c} $args break
248	    if {[string equal $a "-alternate"]} {
249		set alternate 1
250		set chan      $b
251		set q         $c
252	    } else {
253		set chan    $a
254		set q       $b
255		set sepChar $c
256	    }
257	}
258	4 {
259	    foreach {a b c d} $args break
260	    if {![string equal $a "-alternate"]} {
261		return -code error "wrong#args: Should be ?-alternate? chan q ?separator?"
262	    }
263	    set alternate 1
264	    set chan    $b
265	    set q       $c
266	    set sepChar $d
267	}
268	0 - 1 -
269	default {
270	    return -code error "wrong#args: Should be ?-alternate? chan q ?separator?"
271	}
272    }
273
274    if {[string length $sepChar] < 1} {
275	return -code error "illegal separator character \"$sepChar\", is empty"
276    } elseif {[string length $sepChar] > 1} {
277	return -code error "illegal separator character \"$sepChar\", is a string"
278    }
279
280    set data ""
281    while {![eof $chan]} {
282	if {[gets $chan line] < 0} {continue}
283
284	# Why skip empty lines? They may be in data. Except if the
285	# buffer is empty, i.e. we are between records.
286	if {$line == {} && $data == {}} {continue}
287
288	append data $line
289	if {![iscomplete $data]} {
290	    # Odd number of quotes - must have embedded newline
291	    append data \n
292	    continue
293	}
294
295	$q put [Split $alternate $data $sepChar]
296	set data ""
297    }
298    return
299}
300
301# ::csv::report --
302#
303#	A report command which can be used by the matrix methods
304#	"format-via" and "format2chan-via". For the latter this
305#	command delegates the work to "::csv::writematrix". "cmd" is
306#	expected to be either "printmatrix" or
307#	"printmatrix2channel". The channel argument, "chan", has to
308#	be present for the latter and must not be present for the first.
309#
310# Arguments:
311#	cmd		Either 'printmatrix' or 'printmatrix2channel'
312#	matrix		The matrix to format.
313#	args		0 (chan): The channel to write to
314#
315# Results:
316#	None for 'printmatrix2channel', else the CSV formatted string.
317
318proc ::csv::report {cmd matrix args} {
319    switch -exact -- $cmd {
320	printmatrix {
321	    if {[llength $args] > 0} {
322		return -code error "wrong # args:\
323			::csv::report printmatrix matrix"
324	    }
325	    return [joinlist [$matrix get rect 0 0 end end]]
326	}
327	printmatrix2channel {
328	    if {[llength $args] != 1} {
329		return -code error "wrong # args:\
330			::csv::report printmatrix2channel matrix chan"
331	    }
332	    writematrix $matrix [lindex $args 0]
333	    return ""
334	}
335	default {
336	    return -code error "Unknown method $cmd"
337	}
338    }
339}
340
341# ::csv::split --
342#
343#	Split a string according to the rules for CSV processing.
344#	This assumes that the string contains a single line of CSVs
345#
346# Arguments:
347#	line		The string to split
348#	sepChar		The separator character, defaults to comma
349#
350# Results:
351#	A list of the values in 'line'.
352
353proc ::csv::split {args} {
354    # Argument syntax:
355    #
356    # (1)            line
357    # (2)            line sepChar
358    # (2) -alternate line
359    # (3) -alternate line sepChar
360
361    # (3)            line sepChar delChar
362    # (4) -alternate line sepChar delChar
363
364    set alternate 0
365    set sepChar   ,
366    set delChar   \"
367
368    switch -exact -- [llength $args] {
369	1 {
370	    set line [lindex $args 0]
371	}
372	2 {
373	    foreach {a b} $args break
374	    if {[string equal $a "-alternate"]} {
375		set alternate 1
376		set line     $b
377	    } else {
378		set line    $a
379		set sepChar $b
380	    }
381	}
382	3 {
383	    foreach {a b c} $args break
384	    if {[string equal $a "-alternate"]} {
385	        set alternate 1
386		set line    $b
387		set sepChar $c
388	    } else {
389		set line    $a
390		set sepChar $b
391		set delChar $c
392            }
393	}
394	4 {
395	    foreach {a b c d} $args break
396	    if {![string equal $a "-alternate"]} {
397		return -code error "wrong#args: Should be ?-alternate? line ?separator? ?delimiter?"
398	    }
399	    set alternate 1
400	    set line    $b
401	    set sepChar $c
402	    set delChar $d
403	}
404	0 -
405	default {
406	    return -code error "wrong#args: Should be ?-alternate? line ?separator? ?delimiter?"
407	}
408    }
409
410    if {[string length $sepChar] < 1} {
411	return -code error "illegal separator character ${delChar}$sepChar${delChar}, is empty"
412    } elseif {[string length $sepChar] > 1} {
413	return -code error "illegal separator character ${delChar}$sepChar${delChar}, is a string"
414    }
415
416    if {[string length $delChar] < 1} {
417	return -code error "illegal separator character \"$delChar\", is empty"
418    } elseif {[string length $delChar] > 1} {
419	return -code error "illegal separator character \"$delChar\", is a string"
420    }
421
422    return [Split $alternate $line $sepChar $delChar]
423}
424
425proc ::csv::Split {alternate line sepChar {delChar \"}} {
426    # Protect the sepchar from special interpretation by
427    # the regex calls below.
428
429    set sepRE \[\[.${sepChar}.]]
430    set delRE \[\[.${delChar}.]]
431
432    if {$alternate} {
433	# The alternate syntax requires a different parser.
434	# A variation of the string map / regsub parser for the
435	# regular syntax was tried but does not handle embedded
436	# doubled " well (testcase csv-91.3 was 'knownBug', sole
437	# one, still a bug). Now we just tokenize the input into
438	# the primary parts (sep char, "'s and the rest) and then
439	# use an explicitly coded state machine (DFA) to parse
440	# and convert token sequences.
441
442	## puts 1->>$line<<
443	set line [string map [list \
444		$sepChar \0$sepChar\0 \
445		$delChar \0${delChar}\0 \
446		] $line]
447
448	## puts 2->>$line<<
449	set line [string map [list \0\0 \0] $line]
450	regsub "^\0" $line {} line
451	regsub "\0$" $line {} line
452
453	## puts 3->>$line<<
454
455	set val ""
456	set res ""
457	set state base
458
459	## puts 4->>[::split $line \0]
460	foreach token [::split $line \0] {
461
462	    ## puts "\t*= $state\t>>$token<<"
463	    switch -exact -- $state {
464		base {
465		    if {[string equal $token "${delChar}"]} {
466			set state qvalue
467			continue
468		    }
469		    if {[string equal $token $sepChar]} {
470			lappend res $val
471			set val ""
472			continue
473		    }
474		    append val $token
475		}
476		qvalue {
477		    if {[string equal $token "${delChar}"]} {
478			# May end value, may be a doubled "
479			set state endordouble
480			continue
481		    }
482		    append val $token
483		}
484		endordouble {
485		    if {[string equal $token "${delChar}"]} {
486			# Doubled ", append to current value
487			append val ${delChar}
488			set state qvalue
489			continue
490		    }
491		    # Last " was end of quoted value. Close it.
492		    # We expect current as $sepChar
493
494		    lappend res $val
495		    set          val ""
496		    set state base
497
498		    if {[string equal $token $sepChar]} {continue}
499
500		    # Undoubled " in middle of text. Just assume that
501		    # remainder is another qvalue.
502		    set state qvalue
503		}
504		default {
505		    return -code error "Internal error, illegal parsing state"
506		}
507	    }
508	}
509
510	## puts "/= $state\t>>$val<<"
511
512	lappend res $val
513
514	## puts 5->>$res<<
515	return $res
516    } else {
517	regsub -- "$sepRE${delRE}${delRE}$" $line $sepChar\0${delChar}${delChar}\0 line
518	regsub -- "^${delRE}${delRE}$sepRE" $line \0${delChar}${delChar}\0$sepChar line
519	regsub -all -- {(^${delChar}|${delChar}$)} $line \0 line
520
521	set line [string map [list \
522		$sepChar${delChar}${delChar}${delChar} $sepChar\0${delChar} \
523		${delChar}${delChar}${delChar}$sepChar ${delChar}\0$sepChar \
524		${delChar}${delChar}           ${delChar} \
525		${delChar}             \0 \
526		] $line]
527
528	set end 0
529	while {[regexp -indices -start $end -- {(\0)[^\0]*(\0)} $line \
530		-> start end]} {
531	    set start [lindex $start 0]
532	    set end   [lindex $end 0]
533	    set range [string range $line $start $end]
534	    if {[string first $sepChar $range] >= 0} {
535		set line [string replace $line $start $end \
536			[string map [list $sepChar \1] $range]]
537	    }
538	    incr end
539	}
540	set line [string map [list $sepChar \0 \1 $sepChar \0 {} ] $line]
541	return [::split $line \0]
542
543    }
544}
545
546# ::csv::split2matrix --
547#
548#	Split a string according to the rules for CSV processing.
549#	This assumes that the string contains a single line of CSVs.
550#	The resulting list of values is appended to the specified
551#	matrix, as a new row. The code assumes that the matrix provides
552#	the same interface as the queue provided by the 'struct'
553#	module of tcllib, "add row" in particular.
554#
555# Arguments:
556#	m		The matrix to write the resulting list to.
557#	line		The string to split
558#	sepChar		The separator character, defaults to comma
559#	expand		The expansion mode. The default is none
560#
561# Results:
562#	A list of the values in 'line', written to 'q'.
563
564proc ::csv::split2matrix {args} {
565    # FR #481023
566
567    # Argument syntax:
568    #
569    #2)            m line
570    #3)            m line sepChar
571    #3) -alternate m line
572    #4) -alternate m line sepChar
573    #4)            m line sepChar expand
574    #5) -alternate m line sepChar expand
575
576    set alternate 0
577    set sepChar   ,
578    set expand    none
579
580    switch -exact -- [llength $args] {
581	2 {
582	    foreach {m line} $args break
583	}
584	3 {
585	    foreach {a b c} $args break
586	    if {[string equal $a "-alternate"]} {
587		set alternate 1
588		set m         $b
589		set line      $c
590	    } else {
591		set m       $a
592		set line    $b
593		set sepChar $c
594	    }
595	}
596	4 {
597	    foreach {a b c d} $args break
598	    if {[string equal $a "-alternate"]} {
599		set alternate 1
600		set m         $b
601		set line      $c
602		set sepChar   $d
603	    } else {
604		set m       $a
605		set line    $b
606		set sepChar $c
607		set expand  $d
608	    }
609	}
610	4 {
611	    foreach {a b c d e} $args break
612	    if {![string equal $a "-alternate"]} {
613		return -code error "wrong#args: Should be ?-alternate? m line ?separator? ?expand?"
614	    }
615	    set alternate 1
616
617	    set m       $b
618	    set line    $c
619	    set sepChar $d
620	    set expand  $e
621	}
622	0 - 1 -
623	default {
624	    return -code error "wrong#args: Should be ?-alternate? m line ?separator? ?expand?"
625	}
626    }
627
628    if {[string length $sepChar] < 1} {
629	return -code error "illegal separator character \"$sepChar\", is empty"
630    } elseif {[string length $sepChar] > 1} {
631	return -code error "illegal separator character \"$sepChar\", is a string"
632    }
633
634    Split2matrix $alternate $m $line $sepChar $expand
635    return
636}
637
638proc ::csv::Split2matrix {alternate m line sepChar expand} {
639    set csv [Split $alternate $line $sepChar]
640
641    # Expansion modes
642    # - none  : default, behaviour of original implementation.
643    #           no expansion is done, lines are silently truncated
644    #           to the number of columns in the matrix.
645    #
646    # - empty : A matrix without columns is expanded to the number
647    #           of columns in the first line added to it. All
648    #           following lines are handled as if "mode == none"
649    #           was set.
650    #
651    # - auto  : Full auto-mode. The matrix is expanded as needed to
652    #           hold all columns of all lines.
653
654    switch -exact -- $expand {
655	none {}
656	empty {
657	    if {[$m columns] == 0} {
658		$m add columns [llength $csv]
659	    }
660	}
661	auto {
662	    if {[$m columns] < [llength $csv]} {
663		$m add columns [expr {[llength $csv] - [$m columns]}]
664	    }
665	}
666    }
667    $m add row $csv
668    return
669}
670
671# ::csv::split2queue --
672#
673#	Split a string according to the rules for CSV processing.
674#	This assumes that the string contains a single line of CSVs.
675#	The resulting list of values is appended to the specified
676#	queue, as a single item. IOW each item in the queue represents
677#	a single CSV record. The code assumes that the queue provides
678#	the same interface as the queue provided by the 'struct'
679#	module of tcllib, "put" in particular.
680#
681# Arguments:
682#	q		The queue to write the resulting list to.
683#	line		The string to split
684#	sepChar		The separator character, defaults to comma
685#
686# Results:
687#	A list of the values in 'line', written to 'q'.
688
689proc ::csv::split2queue {args} {
690    # Argument syntax:
691    #
692    #2)            q line
693    #3)            q line sepChar
694    #3) -alternate q line
695    #4) -alternate q line sepChar
696
697    set alternate 0
698    set sepChar   ,
699
700    switch -exact -- [llength $args] {
701	2 {
702	    foreach {q line} $args break
703	}
704	3 {
705	    foreach {a b c} $args break
706	    if {[string equal $a "-alternate"]} {
707		set alternate 1
708		set q         $b
709		set line      $c
710	    } else {
711		set q       $a
712		set line    $b
713		set sepChar $c
714	    }
715	}
716	4 {
717	    foreach {a b c d} $args break
718	    if {![string equal $a "-alternate"]} {
719		return -code error "wrong#args: Should be ?-alternate? q line ?separator?"
720	    }
721	    set alternate 1
722
723	    set q       $b
724	    set line    $c
725	    set sepChar $d
726	}
727	0 - 1 -
728	default {
729	    return -code error "wrong#args: Should be ?-alternate? q line ?separator?"
730	}
731    }
732
733    if {[string length $sepChar] < 1} {
734	return -code error "illegal separator character \"$sepChar\", is empty"
735    } elseif {[string length $sepChar] > 1} {
736	return -code error "illegal separator character \"$sepChar\", is a string"
737    }
738
739    $q put [Split $alternate $line $sepChar]
740    return
741}
742
743# ::csv::writematrix --
744#
745#	A wrapper around "::csv::join" taking the rows in a matrix and
746#	writing them as CSV formatted lines into the channel.
747#
748# Arguments:
749#	m		The matrix to take the data to write from.
750#	chan		The channel to write into.
751#	sepChar		The separator character, defaults to comma
752#
753# Results:
754#	None.
755
756proc ::csv::writematrix {m chan {sepChar ,} {delChar \"}} {
757    set n [$m rows]
758    for {set r 0} {$r < $n} {incr r} {
759	puts $chan [join [$m get row $r] $sepChar $delChar]
760    }
761
762    # Memory intensive alternative:
763    # puts $chan [joinlist [m get rect 0 0 end end] $sepChar $delChar]
764    return
765}
766
767# ::csv::writequeue --
768#
769#	A wrapper around "::csv::join" taking the rows in a queue and
770#	writing them as CSV formatted lines into the channel.
771#
772# Arguments:
773#	q		The queue to take the data to write from.
774#	chan		The channel to write into.
775#	sepChar		The separator character, defaults to comma
776#
777# Results:
778#	None.
779
780proc ::csv::writequeue {q chan {sepChar ,} {delChar \"}} {
781    while {[$q size] > 0} {
782	puts $chan [join [$q get] $sepChar $delChar]
783    }
784
785    # Memory intensive alternative:
786    # puts $chan [joinlist [$q get [$q size]] $sepChar $delChar]
787    return
788}
789
790