1#!/pro/bin/perl
2
3# csv-check: Check validity of CSV file and report
4#	   (m)'20 [21 May 2020] Copyright H.M.Brand 2007-2021
5
6# This code requires the defined-or feature and PerlIO
7
8use 5.12.0;
9use warnings;
10
11use Data::Peek;
12use Encode qw( decode encode );
13
14our $VERSION = "2.05";	# 2020-05-21
15my $cmd = $0; $cmd =~ s{.*/}{};
16
17sub usage {
18    my $err = shift and select STDERR;
19    print <<"EOU";
20usage: $cmd [-s <sep>] [-q <quot>] [-e <esc>] [-u] [--pp] [file.csv]
21    -s S  --sep=S         use S as seperator char. Auto-detect, default = ','
22                          the string "tab" is allowed.
23    -q Q  --quo=Q         use Q as quote     char. Auto-detect, default = '"'
24                          the string "undef" will disable quotation.
25    -e E  --esc=E         use E as escape    char. Auto-detect, default = '"'
26                          the string "undef" will disable escapes.
27    -N    --nl            force EOL to \\n
28    -C    --cr            force EOL to \\r
29    -M    --crnl          force EOL to \\r\\n
30    -u    --utf-8         check if all fields are valid unicode
31    -E E  --enc=E         open file with encoding E
32    -h    --hdr           check with header (implies BOM)
33    -b    --bom           check with BOM (no header)
34    -f    --skip-formula  do not check formula's
35
36          --pp            use Text::CSV_PP instead (cross-check)
37
38    -A a  --attr=at:val   pass attributes to parser
39			    --at=val is also supported for know attributes
40    -L    --list-attr     list supported CSV attributes
41    -X    --list-changes  list attributes that changed from default
42EOU
43    exit $err;
44    } # usage
45
46use Getopt::Long qw(:config bundling passthrough);
47my $eol;
48GetOptions (
49    "help|?"		=> sub { usage (0); },
50    "V|version"		=> sub { say "$cmd [$VERSION]"; exit 0; },
51
52    "c|s|sep=s"		=> \(my  $sep   = ""),
53    "q|quo|quote=s"	=> \(my  $quo   = '"'),
54    "e|esc|escape=s"	=> \(my  $esc   = '"'),
55    "N|nl!"		=> sub { $eol   = "\n"; },
56    "C|cr!"		=> sub { $eol   = "\r"; },
57    "M|crnl!"		=> sub { $eol   = "\r\n"; },
58    "B|binary!"		=> \(my  $bin   = 1),
59
60    "u|utf|utf8|utf-8!"	=> \(my  $opt_u = 0),
61    "E|enc|encoding=s"	=> \(my  $enc),
62    "h|hdr|header!"	=> \(my  $opt_h = 0),
63    "b|bom!"		=> \(my  $opt_b = 0),
64    "f|skip-formula!"	=> \(my  $opt_f = 0),
65
66    "A|attr=s"		=> \ my  @opt_A,
67    "L|list-attr!"	=> \ my  $opt_L,
68    "X|list-changes!"	=> \ my  $opt_X,
69
70    "pp!"		=> \(my  $opt_p = 0),
71
72    "v|verbose:1"	=> \(my  $opt_v = 0),
73    ) or usage (1);
74$opt_X and $opt_L++;
75
76my  $csvmod = "Text::CSV_XS";
77if ($opt_p) {
78    require Text::CSV_PP;
79    $csvmod = "Text::CSV_PP";
80    }
81else {
82    require Text::CSV_XS;
83    }
84$csvmod->import ();
85
86binmode STDOUT, ":encoding(utf-8)";
87binmode STDERR, ":encoding(utf-8)";
88
89my $fn   = $ARGV[0] // "-";
90my @warn;
91
92my %csvarg = (
93    sep_char       => $sep eq "tab"   ? "\t"  : $sep,
94    quote_char     => $quo eq "undef" ? undef : $quo,
95    escape_char    => $esc eq "undef" ? undef : $esc,
96    eol            => $eol,
97    binary         => $bin,
98    keep_meta_info => 1,
99    auto_diag      => 1,
100    formula        => $opt_f ? "none" : "diag",
101    );
102{   my $p = $csvmod->new;
103    my %ka = map { $_ => $p->{$_} } grep m/^[a-z]/ => $p->known_attributes;
104    foreach my $i (reverse 0 .. $#ARGV) {
105	if ($ARGV[$i] =~ m/^--(no[-_])?+([-\w]+)(?:=(.*))?$/) {
106	    my ($attr, $val) = (lc $2 =~ tr/-/_/r, $3 // ($1 ? 0 : 1));
107	    if (exists $ka{$attr}) {
108		unshift @opt_A, "$attr:$val";
109		splice @ARGV, $i, 1;
110		}
111	    }
112	}
113    for (@opt_A) {
114	m/^([-\w]+)(?:[:=](.*))?/ or next;
115	my ($attr, $val) = (lc $1 =~ tr/-/_/r, $2 // 1);
116	exists $ka{$attr} or next;
117	$val eq "undef" || !length $val and $val = undef;	# -A escape_char:
118	$csvarg{$attr} = $val;
119	}
120    if ($opt_L) {
121	$csvarg{sep_char} ||= $ka{sep_char};
122	foreach my $attr (sort keys %ka) {
123	    $ka{$attr}     //= "(undef)";
124	    $csvarg{$attr} //= $ka{$attr};
125	    $opt_X and $csvarg{$attr} eq $ka{$attr} and next;
126	    printf "  %-21s : %s\n", $attr, $csvarg{$attr};
127	    }
128	exit 0;
129	}
130    }
131$opt_v > 1 and DDumper \%csvarg;
132
133my $data = do { local $/; <> } or die "No data to analyze\n";
134
135my ($rows, %cols, $s_eol) = (0);
136unless ($sep) { # No sep char passed, try to auto-detect;
137    my ($first_line) = ($data =~ m/\A(.*?)(?:\r\n|\n|\r)/);
138    $first_line ||= $data; # if no EOL at all, use whole set
139    $sep = $first_line =~ m/["\d],["\d,]/ ? ","  :
140	   $first_line =~ m/["\d];["\d;]/ ? ";"  :
141	   $first_line =~ m/["\d]\t["\d]/ ? "\t" :
142	   # If neither, then for unquoted strings
143	   $first_line =~ m/\w,[\w,]/     ? ","  :
144	   $first_line =~ m/\w;[\w;]/     ? ";"  :
145	   $first_line =~ m/\w\t[\w]/     ? "\t" : ",";
146    $data =~ m/([\r\n]+)\Z/ and $s_eol = DDisplay "$1";
147    $csvarg{sep_char} = $sep;
148    }
149
150my $csv = $csvmod->new (\%csvarg);
151$opt_v > 8 and DDumper $csv;
152
153$bin = 0; # Assume ASCII only
154
155sub done {
156    my $file = $ARGV // "STDIN";
157    (my $pv = "$]0000000") =~ s{^([0-9]+)\.([0-9]{3})([0-9]{3})[0-9]*}
158			       {sprintf "%d.%d.%d",$1,$2,$3}e;
159    my $uv = eval {
160	no warnings;
161	(my $cv = $]) =~ s/0+$//;
162	eval { require Unicode::UCD;     Unicode::UCD::UnicodeVersion () } ||
163	eval { require Module::CoreList; $Module::CoreList::version{$cv}{Unicode} };
164	} || "unknown";
165    say "Checked $file with $cmd $VERSION\nusing $csvmod @{[$csvmod->VERSION]} with perl $pv and Unicode $uv";
166    my @diag = $csv->error_diag;
167    my $line = $. // $csv->record_number // "?";
168    if ($diag[0] == 2012 && $csv->eof) {
169	my @coll = sort { $a <=> $b } keys %cols;
170	local $" = ", ";
171	my $cols = @coll == 1 ? $coll[0] : "(@coll)";
172	$s_eol //= $csv->eol || "--unknown--";
173	$s_eol =~ m/[\x00-\x1f]/ and $s_eol = DDisplay $s_eol;
174	say "OK: rows: $rows, columns: $cols";
175	say "    sep = <$sep>, quo = <$quo>, bin = <$bin>, eol = <$s_eol>";
176	say "    encoding = $csv->{ENCODING}" if $csv->{ENCODING};
177	if (@coll > 1) {
178	    say "multiple column lengths:";
179	    printf " %6d line%s with %4d field%s\n",
180		$cols{$_}, $cols{$_} == 1 ? " " : "s",
181		$_,        $_        == 1 ? ""  : "s"
182		    for @coll;
183	    }
184	$diag[0] = 0;
185	}
186    elsif ($diag[2]) {
187	say "$ARGV record $diag[3] at line $line/$diag[2] - $diag[0] - $diag[1]";
188	my $ep  = $diag[2] - 1; # diag[2] is 1-based
189	my $ei  = $csv->error_input;
190	if (defined $ei) {
191	    my $l = 0;
192	    my $s = "";
193	    eval { my $u = decode ("utf-8", $ei); $ei = $u };
194	    for (split m/([^ -~])/ => $ei) {
195		if (m/^[ -~]+$/) {
196		    $s .= $_;
197		    $l += length;
198		    next;
199		    }
200		if ($_ eq "\t") {
201		    $s .= "\\t";
202		    $ep > $l and $ep++;
203		    $l += 2;
204		    next;
205		    }
206		if ($_ eq "\n") {
207		    $s .= "\\n";
208		    $ep > $l and $ep++;
209		    $l += 2;
210		    next;
211		    }
212		if ($_ eq "\r") {
213		    $s .= "\\r";
214		    $ep > $l and $ep++;
215		    $l += 2;
216		    next;
217		    }
218		$s .= sprintf "\\x{%05x}", ord;
219		$ep > $l and $ep += 9 - length encode "utf-8", $_;
220		$l += 9;
221		}
222
223	    say "    |$s|"; #           2b06
224	    say "    |", " " x $ep, "\x{25b2}", " " x (length ($s) - $ep - 1), "|";
225	    }
226	}
227    else {
228	say "$ARGV line $line - $diag[1]";
229	}
230    print for @warn;
231    exit $diag[0];
232    } # done
233
234sub show {
235    say STDERR join ", " => map { "\x{231e}$_\x{231d}" } @_;
236    } # show
237
238sub stats {
239    my $r = shift;
240    $cols{scalar @$r}++;
241    grep { $_ & 0x0002 } $csv->meta_info and $bin = 1;
242    $opt_v > 2 and show (@$r);
243    if ($opt_u) {
244	my @r = @$r;
245	foreach my $x (0 .. $#r) {
246	    utf8::is_utf8 ($r[$x]) and next;
247
248	    local $SIG{__WARN__} = sub {
249		(my $msg = shift) =~ s{ at /\S+Encode.pm.*}{};
250		my @h = $csv->column_names;
251		push @warn, sprintf "Field %d%s in record %d - '%s'\t- %s",
252		    $x + 1, @h ? " (column: '$h[$x]')" : "", $rows,
253		    DPeek ($r[$x]), $msg;
254		};
255	    my $oct = decode ("utf-8", $r[$x], Encode::FB_WARN);
256	    }
257	}
258    } # stats
259
260my $mode = $enc ? "<:encoding($enc)" : "<";
261open my $fh, $mode, \$data or die "$fn: $!\n";
262if ($opt_h) {
263    $csv->header ($fh);
264    }
265elsif ($opt_b) {
266    my @hdr = $csv->header ($fh, { detect_bom => 1, set_column_names => 0 });
267    stats \@hdr;
268    }
269
270local $SIG{__WARN__} = sub { push @warn, @_; };
271while (my $row = $csv->getline ($fh)) {
272    $rows++;
273    stats $row;
274    }
275done;
276