1#!/pro/bin/perl 2 3# csv-check: Check validity of CSV file and report 4# (m)'20 [21 May 2020] Copyright H.M.Brand 2007-2021 5 6# This code requires the defined-or feature and PerlIO 7 8use 5.12.0; 9use warnings; 10 11use Data::Peek; 12use Encode qw( decode encode ); 13 14our $VERSION = "2.05"; # 2020-05-21 15my $cmd = $0; $cmd =~ s{.*/}{}; 16 17sub usage { 18 my $err = shift and select STDERR; 19 print <<"EOU"; 20usage: $cmd [-s <sep>] [-q <quot>] [-e <esc>] [-u] [--pp] [file.csv] 21 -s S --sep=S use S as seperator char. Auto-detect, default = ',' 22 the string "tab" is allowed. 23 -q Q --quo=Q use Q as quote char. Auto-detect, default = '"' 24 the string "undef" will disable quotation. 25 -e E --esc=E use E as escape char. Auto-detect, default = '"' 26 the string "undef" will disable escapes. 27 -N --nl force EOL to \\n 28 -C --cr force EOL to \\r 29 -M --crnl force EOL to \\r\\n 30 -u --utf-8 check if all fields are valid unicode 31 -E E --enc=E open file with encoding E 32 -h --hdr check with header (implies BOM) 33 -b --bom check with BOM (no header) 34 -f --skip-formula do not check formula's 35 36 --pp use Text::CSV_PP instead (cross-check) 37 38 -A a --attr=at:val pass attributes to parser 39 --at=val is also supported for know attributes 40 -L --list-attr list supported CSV attributes 41 -X --list-changes list attributes that changed from default 42EOU 43 exit $err; 44 } # usage 45 46use Getopt::Long qw(:config bundling passthrough); 47my $eol; 48GetOptions ( 49 "help|?" => sub { usage (0); }, 50 "V|version" => sub { say "$cmd [$VERSION]"; exit 0; }, 51 52 "c|s|sep=s" => \(my $sep = ""), 53 "q|quo|quote=s" => \(my $quo = '"'), 54 "e|esc|escape=s" => \(my $esc = '"'), 55 "N|nl!" => sub { $eol = "\n"; }, 56 "C|cr!" => sub { $eol = "\r"; }, 57 "M|crnl!" => sub { $eol = "\r\n"; }, 58 "B|binary!" => \(my $bin = 1), 59 60 "u|utf|utf8|utf-8!" => \(my $opt_u = 0), 61 "E|enc|encoding=s" => \(my $enc), 62 "h|hdr|header!" => \(my $opt_h = 0), 63 "b|bom!" => \(my $opt_b = 0), 64 "f|skip-formula!" => \(my $opt_f = 0), 65 66 "A|attr=s" => \ my @opt_A, 67 "L|list-attr!" => \ my $opt_L, 68 "X|list-changes!" => \ my $opt_X, 69 70 "pp!" => \(my $opt_p = 0), 71 72 "v|verbose:1" => \(my $opt_v = 0), 73 ) or usage (1); 74$opt_X and $opt_L++; 75 76my $csvmod = "Text::CSV_XS"; 77if ($opt_p) { 78 require Text::CSV_PP; 79 $csvmod = "Text::CSV_PP"; 80 } 81else { 82 require Text::CSV_XS; 83 } 84$csvmod->import (); 85 86binmode STDOUT, ":encoding(utf-8)"; 87binmode STDERR, ":encoding(utf-8)"; 88 89my $fn = $ARGV[0] // "-"; 90my @warn; 91 92my %csvarg = ( 93 sep_char => $sep eq "tab" ? "\t" : $sep, 94 quote_char => $quo eq "undef" ? undef : $quo, 95 escape_char => $esc eq "undef" ? undef : $esc, 96 eol => $eol, 97 binary => $bin, 98 keep_meta_info => 1, 99 auto_diag => 1, 100 formula => $opt_f ? "none" : "diag", 101 ); 102{ my $p = $csvmod->new; 103 my %ka = map { $_ => $p->{$_} } grep m/^[a-z]/ => $p->known_attributes; 104 foreach my $i (reverse 0 .. $#ARGV) { 105 if ($ARGV[$i] =~ m/^--(no[-_])?+([-\w]+)(?:=(.*))?$/) { 106 my ($attr, $val) = (lc $2 =~ tr/-/_/r, $3 // ($1 ? 0 : 1)); 107 if (exists $ka{$attr}) { 108 unshift @opt_A, "$attr:$val"; 109 splice @ARGV, $i, 1; 110 } 111 } 112 } 113 for (@opt_A) { 114 m/^([-\w]+)(?:[:=](.*))?/ or next; 115 my ($attr, $val) = (lc $1 =~ tr/-/_/r, $2 // 1); 116 exists $ka{$attr} or next; 117 $val eq "undef" || !length $val and $val = undef; # -A escape_char: 118 $csvarg{$attr} = $val; 119 } 120 if ($opt_L) { 121 $csvarg{sep_char} ||= $ka{sep_char}; 122 foreach my $attr (sort keys %ka) { 123 $ka{$attr} //= "(undef)"; 124 $csvarg{$attr} //= $ka{$attr}; 125 $opt_X and $csvarg{$attr} eq $ka{$attr} and next; 126 printf " %-21s : %s\n", $attr, $csvarg{$attr}; 127 } 128 exit 0; 129 } 130 } 131$opt_v > 1 and DDumper \%csvarg; 132 133my $data = do { local $/; <> } or die "No data to analyze\n"; 134 135my ($rows, %cols, $s_eol) = (0); 136unless ($sep) { # No sep char passed, try to auto-detect; 137 my ($first_line) = ($data =~ m/\A(.*?)(?:\r\n|\n|\r)/); 138 $first_line ||= $data; # if no EOL at all, use whole set 139 $sep = $first_line =~ m/["\d],["\d,]/ ? "," : 140 $first_line =~ m/["\d];["\d;]/ ? ";" : 141 $first_line =~ m/["\d]\t["\d]/ ? "\t" : 142 # If neither, then for unquoted strings 143 $first_line =~ m/\w,[\w,]/ ? "," : 144 $first_line =~ m/\w;[\w;]/ ? ";" : 145 $first_line =~ m/\w\t[\w]/ ? "\t" : ","; 146 $data =~ m/([\r\n]+)\Z/ and $s_eol = DDisplay "$1"; 147 $csvarg{sep_char} = $sep; 148 } 149 150my $csv = $csvmod->new (\%csvarg); 151$opt_v > 8 and DDumper $csv; 152 153$bin = 0; # Assume ASCII only 154 155sub done { 156 my $file = $ARGV // "STDIN"; 157 (my $pv = "$]0000000") =~ s{^([0-9]+)\.([0-9]{3})([0-9]{3})[0-9]*} 158 {sprintf "%d.%d.%d",$1,$2,$3}e; 159 my $uv = eval { 160 no warnings; 161 (my $cv = $]) =~ s/0+$//; 162 eval { require Unicode::UCD; Unicode::UCD::UnicodeVersion () } || 163 eval { require Module::CoreList; $Module::CoreList::version{$cv}{Unicode} }; 164 } || "unknown"; 165 say "Checked $file with $cmd $VERSION\nusing $csvmod @{[$csvmod->VERSION]} with perl $pv and Unicode $uv"; 166 my @diag = $csv->error_diag; 167 my $line = $. // $csv->record_number // "?"; 168 if ($diag[0] == 2012 && $csv->eof) { 169 my @coll = sort { $a <=> $b } keys %cols; 170 local $" = ", "; 171 my $cols = @coll == 1 ? $coll[0] : "(@coll)"; 172 $s_eol //= $csv->eol || "--unknown--"; 173 $s_eol =~ m/[\x00-\x1f]/ and $s_eol = DDisplay $s_eol; 174 say "OK: rows: $rows, columns: $cols"; 175 say " sep = <$sep>, quo = <$quo>, bin = <$bin>, eol = <$s_eol>"; 176 say " encoding = $csv->{ENCODING}" if $csv->{ENCODING}; 177 if (@coll > 1) { 178 say "multiple column lengths:"; 179 printf " %6d line%s with %4d field%s\n", 180 $cols{$_}, $cols{$_} == 1 ? " " : "s", 181 $_, $_ == 1 ? "" : "s" 182 for @coll; 183 } 184 $diag[0] = 0; 185 } 186 elsif ($diag[2]) { 187 say "$ARGV record $diag[3] at line $line/$diag[2] - $diag[0] - $diag[1]"; 188 my $ep = $diag[2] - 1; # diag[2] is 1-based 189 my $ei = $csv->error_input; 190 if (defined $ei) { 191 my $l = 0; 192 my $s = ""; 193 eval { my $u = decode ("utf-8", $ei); $ei = $u }; 194 for (split m/([^ -~])/ => $ei) { 195 if (m/^[ -~]+$/) { 196 $s .= $_; 197 $l += length; 198 next; 199 } 200 if ($_ eq "\t") { 201 $s .= "\\t"; 202 $ep > $l and $ep++; 203 $l += 2; 204 next; 205 } 206 if ($_ eq "\n") { 207 $s .= "\\n"; 208 $ep > $l and $ep++; 209 $l += 2; 210 next; 211 } 212 if ($_ eq "\r") { 213 $s .= "\\r"; 214 $ep > $l and $ep++; 215 $l += 2; 216 next; 217 } 218 $s .= sprintf "\\x{%05x}", ord; 219 $ep > $l and $ep += 9 - length encode "utf-8", $_; 220 $l += 9; 221 } 222 223 say " |$s|"; # 2b06 224 say " |", " " x $ep, "\x{25b2}", " " x (length ($s) - $ep - 1), "|"; 225 } 226 } 227 else { 228 say "$ARGV line $line - $diag[1]"; 229 } 230 print for @warn; 231 exit $diag[0]; 232 } # done 233 234sub show { 235 say STDERR join ", " => map { "\x{231e}$_\x{231d}" } @_; 236 } # show 237 238sub stats { 239 my $r = shift; 240 $cols{scalar @$r}++; 241 grep { $_ & 0x0002 } $csv->meta_info and $bin = 1; 242 $opt_v > 2 and show (@$r); 243 if ($opt_u) { 244 my @r = @$r; 245 foreach my $x (0 .. $#r) { 246 utf8::is_utf8 ($r[$x]) and next; 247 248 local $SIG{__WARN__} = sub { 249 (my $msg = shift) =~ s{ at /\S+Encode.pm.*}{}; 250 my @h = $csv->column_names; 251 push @warn, sprintf "Field %d%s in record %d - '%s'\t- %s", 252 $x + 1, @h ? " (column: '$h[$x]')" : "", $rows, 253 DPeek ($r[$x]), $msg; 254 }; 255 my $oct = decode ("utf-8", $r[$x], Encode::FB_WARN); 256 } 257 } 258 } # stats 259 260my $mode = $enc ? "<:encoding($enc)" : "<"; 261open my $fh, $mode, \$data or die "$fn: $!\n"; 262if ($opt_h) { 263 $csv->header ($fh); 264 } 265elsif ($opt_b) { 266 my @hdr = $csv->header ($fh, { detect_bom => 1, set_column_names => 0 }); 267 stats \@hdr; 268 } 269 270local $SIG{__WARN__} = sub { push @warn, @_; }; 271while (my $row = $csv->getline ($fh)) { 272 $rows++; 273 stats $row; 274 } 275done; 276