1package Spreadsheet::ReadSXC; 2 3use 5.006; 4use strict; 5use warnings; 6 7use Exporter 'import'; 8 9our @EXPORT_OK = qw(read_sxc read_sxc_fh read_xml_file read_xml_string); 10our $VERSION = '0.35'; 11 12use Archive::Zip ':ERROR_CODES'; 13use Carp qw(croak); 14use Spreadsheet::ParseODS; 15use PerlX::Maybe; 16 17our @CARP_NOT = qw(Spreadsheet::ParseODS); 18 19sub zip_error_handler {} 20 21sub read_sxc ($;$) { 22 my ($sxc_file, $options_ref) = @_; 23 #if( !$options_ref->{StrictErrors}) { 24 # -f $sxc_file && -s _ or return undef; 25 #}; 26 #open my $fh, '<', $sxc_file 27 # or croak "Couldn't open '$sxc_file': $!"; 28 #read_sxc_fh( $fh, $options_ref ); 29 _parse_xml( {}, $options_ref, $sxc_file ); 30} 31 32sub read_sxc_fh { 33 my( $stream, $options_ref ) = @_; 34 _parse_xml( {}, $options_ref, $stream ); 35} 36 37sub read_xml_file ($;$) { 38 my ($xml_file, $options_ref) = @_; 39 if( !$options_ref->{StrictErrors}) { 40 -f $xml_file && -s _ or return undef; 41 }; 42 _parse_xml({ method => 'parsefile', type => 'xml' }, $options_ref, $xml_file); 43} 44 45sub read_xml_string ($;$) { 46 my ($xml_string, $options_ref) = @_; 47 _parse_xml( { type => 'xml' }, $options_ref, \$xml_string ); 48} 49 50sub _parse_xml { 51 my ($internal_options, $options_ref, $xml_thing) = @_; 52 53 $options_ref ||= {}; 54 55 my $type; 56 if( $internal_options->{ type } ) { 57 $type = $internal_options->{ type }; 58 }; 59 60 my $line_sep = $options_ref->{ReplaceNewlineWith} || ""; 61 62 my $workbook; 63 my $ok = eval { 64 $workbook = Spreadsheet::ParseODS->new( 65 line_separator => $line_sep, 66 #%$options_ref, 67 )->parse( $xml_thing, 68 maybe inputtype => $type ); 69 1; 70 }; 71 if( my $err = $@ and $options_ref->{ StrictErrors } ) { 72 die "$@\n" 73 }; 74 return unless $workbook; 75 76 # Cut off trailing columns if any 77 # Down-convert from ::Cell to raw values, depending on the options 78 79 # This conversion is likely best done in an immediate callback to 80 # speed up things 81 my $res = {}; 82 for my $s ($workbook->worksheets) { 83 my $rs = $res->{ $s->label } = []; 84 for my $r ($s->row_min..$s->row_max) { 85 if( $options_ref->{DropHiddenRows} and $s->is_row_hidden( $r ) ) { 86 next; 87 }; 88 89 my $rowref; 90 for my $c ($s->col_min..$s->col_max) { 91 # Depending on what type we want, use ->value or ->unformatted 92 # depending on $options_ref->{ ... } 93 if( ! $rowref ) { 94 push @$rs, ($rowref = []); 95 }; 96 97 my $cell = $s->get_cell( $r,$c ); 98 if( $options_ref->{DropHiddenColumns} and $s->is_col_hidden($c)) { 99 next; 100 }; 101 102 my ($method,$type) = ('value',$cell->type); 103 $type ||= ''; 104 105 if( $options_ref->{StandardCurrency} and $type =~ qr/^(float|currency|percentage)/) { 106 $method = 'unformatted'; 107 }; 108 if( $options_ref->{StandardDate} and $type =~ qr/^(date)/) { 109 $method = 'unformatted'; 110 }; 111 if( $options_ref->{StandardTime} and $type =~ qr/^(time)/) { 112 $method = 'unformatted'; 113 }; 114 115 push @$rowref, $s->get_cell( $r,$c )->$method; 116 } 117 } 118 }; 119 120 if ( $options_ref->{OrderBySheet} ) { 121 return [ map { $res->{ $_->{label} } } $workbook->worksheets ] 122 } else { 123 return $res 124 } 125} 126 1271; 128 129__END__ 130=head1 NAME 131 132Spreadsheet::ReadSXC - Extract OpenOffice 1.x spreadsheet data 133 134=head1 NOTICE 135 136This is a legacy API wrapper. Most likely you want to look at 137L<Spreadsheet::ParseODS>, which implements an API more compatible with 138L<Spreadsheet::ParseXLSX>. That module is also the backend for this API. 139 140=head1 SYNOPSIS 141 142 use Spreadsheet::ReadSXC qw(read_sxc); 143 my $workbook_ref = read_sxc("/path/to/file.sxc"); 144 145 # Alternatively, unpack the .sxc file yourself and pass content.xml 146 147 use Spreadsheet::ReadSXC qw(read_xml_file); 148 my $workbook_ref = read_xml_file("/path/to/content.xml"); 149 150 151 # Alternatively, pass the XML string directly 152 153 use Spreadsheet::ReadSXC qw(read_xml_string); 154 use Archive::Zip; 155 my $zip = Archive::Zip->new("/path/to/file.sxc"); 156 my $content = $zip->contents('content.xml'); 157 my $workbook_ref = read_xml_string($content); 158 159 160 # Control the output through a hash of options (below are the defaults): 161 162 my %options = ( 163 ReplaceNewlineWith => "", 164 IncludeCoveredCells => 0, 165 DropHiddenRows => 0, 166 DropHiddenColumns => 0, 167 NoTruncate => 0, 168 StandardCurrency => 0, 169 StandardDate => 0, 170 StandardTime => 0, 171 OrderBySheet => 0, 172 StrictErrors => 0, 173 ); 174 my $workbook_ref = read_sxc("/path/to/file.sxc", \%options ); 175 176 177 # Iterate over every worksheet, row, and cell: 178 179 use Encode 'decode'; 180 181 foreach ( sort keys %$workbook_ref ) { 182 print "Worksheet ", $_, " contains ", $#{$$workbook_ref{$_}} + 1, " row(s):\n"; 183 foreach ( @{$$workbook_ref{$_}} ) { 184 foreach ( map { defined $_ ? $_ : '' } @{$_} ) { 185 my $str = decode('UTF-8', $_); 186 print " '$str'"; 187 } 188 print "\n"; 189 } 190 } 191 192 193 # Cell D2 of worksheet "Sheet1" 194 195 $cell = $$workbook_ref{"Sheet1"}[1][3]; 196 197 198 # Row 1 of worksheet "Sheet1": 199 200 @row = @{$$workbook_ref{"Sheet1"}[0]}; 201 202 203 # Worksheet "Sheet1": 204 205 @sheet = @{$$workbook_ref{"Sheet1"}}; 206 207 208 209=head1 DESCRIPTION 210 211Spreadsheet::ReadSXC extracts data from OpenOffice 1.x spreadsheet 212files (.sxc). It exports the function read_sxc() which takes a 213filename and an optional reference to a hash of options as 214arguments and returns a reference to a hash of references to 215two-dimensional arrays. The hash keys correspond to the names of 216worksheets in the OpenOffice workbook. The two-dimensional arrays 217correspond to rows and cells in the respective spreadsheets. If 218you don't like this because the order of sheets is not preserved 219in a hash, read on. The 'OrderBySheet' option provides an array 220of hashes instead. 221 222If you prefer to unpack the .sxc file yourself, you can use the 223function read_xml_file() instead and pass the path to content.xml 224as an argument. Or you can extract the XML string from content.xml 225and pass the string to the function read_xml_string(). Both 226functions also take a reference to a hash of options as an 227optional second argument. 228 229Spreadsheet::ReadSXC uses XML::Twig to parse the XML 230contained in .sxc files. Only the contents of text:p elements are 231returned, not the actual values of table:value attributes. For 232example, a cell might have a table:value-type attribute of 233"currency", a table:value attribute of "-1500.99" and a 234table:currency attribute of "USD". The text:p element would 235contain "-$1,500.99". This is the string which is returned by the 236read_sxc() function, not the value of -1500.99. 237 238Spreadsheet::ReadSXC was written with data import into an SQL 239database in mind. Therefore empty spreadsheet cells correspond to 240undef values in array rows. The example code above shows how to 241replace undef values with empty strings. 242 243If the .sxc file contains an empty spreadsheet its hash element will 244point to an empty array (unless you use the 'NoTruncate' option in 245which case it will point to an array of an array containing one 246undefined element). 247 248OpenOffice uses UTF-8 encoding. It depends on your environment how 249the data returned by the XML Parser is best handled: 250 251 use Unicode::String qw(latin1 utf8); 252 $unicode_string = utf8($$workbook_ref{"Sheet1"}[0][0])->as_string; 253 254 # this will not work for characters outside ISO-8859-1: 255 256 $latin1_string = utf8($$workbook_ref{"Sheet1"}[0][0])->latin1; 257 258Of course there are other modules than Unicode::String on CPAN that 259handle conversion between encodings. It's your choice. 260 261Table rows in .sxc files may have a "table:number-rows-repeated" 262attribute, which is often used for consecutive empty rows. When you 263format whole rows and/or columns in OpenOffice, it sets the numbers 264of rows in a worksheet to 32,000 and the number of columns to 256, even 265if only a few lower-numbered rows and cells actually contain data. 266Spreadsheet::ReadSXC truncates such sheets so that there are no empty 267rows after the last row containing data and no empty columns after the 268last column containing data (unless you use the 'NoTruncate' option). 269 270Still it is perfectly legal for an .sxc file to apply the 271"table:number-rows-repeated" attribute to rows that actually contain 272data (although I have only been able to produce such files manually, 273not through OpenOffice itself). To save on memory usage in these cases, 274Spreadsheet::ReadSXC does not copy rows by value, but by reference 275(remember that multi-dimensional arrays in Perl are really arrays of 276references to arrays). Therefore, if you change a value in one row, it 277is possible that you find the corresponding value in the next row 278changed, too: 279 280 $$workbook_ref{"Sheet1"}[0][0] = 'new string'; 281 print $$workbook_ref{"Sheet1"}[1][0]; 282 283As of version 0.20 the references returned by read_sxc() et al. remain 284valid after subsequent calls to the same function. In earlier versions, 285calling read_sxc() with a different file as the argument would change 286the data referenced by the original return value, so you had to 287derefence it before making another call. Thanks to H. Merijn Brand for 288fixing this. 289 290 291=head1 OPTIONS 292 293=over 4 294 295=item StrictErrors 296 297Turn on error reporting by using C<croak>. Otherwise, functions silently 298return C<undef> when errors are encountered. 299 300=item ReplaceNewlineWith 301 302By default, newlines within cells are ignored and all lines in a cell 303are concatenated to a single string which does not contain a newline. To 304keep the newline characters, use the following key/value pair in your 305hash of options: 306 307 ReplaceNewlineWith => "\n" 308 309However, you may replace newlines with any string you like. 310 311 312=item IncludeCoveredCells 313 314By default, the content of cells that are covered by other cells is 315ignored because you wouldn't see it in OpenOffice unless you unmerge 316the merged cells. To include covered cells in the data structure which 317is returned by parse_sxc(), use the following key/value pair in your 318hash of options: 319 320 IncludeCoveredCells => 1 321 322 323=item DropHiddenRows 324 325By default, hidden rows are included in the data structure returned by 326parse_sxc(). To drop those rows, use the following key/value pair in 327your hash of options: 328 329 DropHiddenRows => 1 330 331 332=item DropHiddenColumns 333 334By default, hidden columns are included in the data structure returned 335by parse_sxc(). To drop those rows, use the following key/value pair 336in your hash of options: 337 338 DropHiddenColumns => 1 339 340 341=item NoTruncate 342 343By default, the two-dimensional arrays that contain the data within 344each worksheet are truncated to get rid of empty rows below the last 345row containing data and empty columns beyond the last column 346containing data. If you prefer to keep those rows and columns, use the 347following key/value pair in your hash of options: 348 349 NoTruncate => 1 350 351 352=item StandardCurrency 353 354By default, cells are returned as formatted. If you prefer to 355obtain the value as contained in the table:value attribute, 356use the following key/value pair in your hash of options: 357 358 StandardCurrency => 1 359 360 361=item StandardDate 362 363By default, date cells are returned as formatted. If you prefer to 364obtain the date value as contained in the table:date-value attribute, 365use the following key/value pair in your hash of options: 366 367 StandardDate => 1 368 369 370=item StandardTime 371 372By default, time cells are returned as formatted. If you prefer to 373obtain the time value as contained in the table:time-value attribute, 374use the following key/value pair in your hash of options: 375 376 StandardTime => 1 377 378These options are a first step on the way to a different approach at 379reading data from .sxc files. There should be more options to read in 380values instead of the strings OpenOffice displays. It should give 381more flexibility in working with the data obtained from OpenOffice 382spreadsheets. 'float' and 'percentage' values could be next. 383'currency' is less obvious, though, as we need to consider both its 384value and the 'table:currency' attribute. Formulas and array formulas 385are yet another issue. I probably won't deal with this until I've 386given this module an object-oriented interface. 387 388 389=item OrderBySheet 390 391The disadvantage of storing worksheets by name in a hash is that the 392order of sheets is lost. If you prefer not to obtain such a hash, but 393an array of worksheets insted, use the following key/value pair in 394your hash of options: 395 396 OrderBySheet => 1 397 398Thus the read_sxc function will return an array of hashes, each of 399which will have two keys, "label" and "data". The value of "label" 400is the name of the sheet. The value of data is a reference to a 401two-dimensional array containing rows and columns of the worksheet: 402 403 my $worksheets_ref = read_sxc("/path/to/file.sxc"); 404 my $name_of_first_sheet = $$worksheets_ref[0]{label}; 405 my $first_cell_of_first_sheet = $$worksheets_ref[0]{data}[0][0]; 406 407 408=back 409 410=head1 FUNCTIONS 411 412=head2 read_sxc 413 414 my $workbook_ref = read_sxc("/path/to/file.sxc"); 415 416Reads an SXC or ODS file given a filename and returns the worksheets as a 417data structure. 418 419=head2 read_sxc_fh 420 421 open my $fh = 'example.ods'; 422 my $sheet = read_sxc_fh( $fh ); 423 424Reads an SXC or ODS file given a filehandle and returns the worksheets as a 425data structure. 426 427=head2 read_xml_file 428 429 my $workbook_ref = read_xml_file("/path/to/content.xml"); 430 431Reads an XML file from a SXC or ODS file returns the worksheets as a 432data structure. 433 434=head2 read_xml_string 435 436Parses an XML string and eturns the worksheets as a data structure. 437 438=head1 Reading an SXC file from an URL 439 440 use HTTP::Tiny; 441 use Spreadsheet::Read; 442 443 # Fetch data and return a filehandle to that data 444 sub fetch_url { 445 my( $url ) = @_; 446 my $ua = HTTP::Tiny->new; 447 my $res = $ua->get( $url ); 448 open my $fh, '<', \$res->{content}; 449 return $fh 450 } 451 my $fh = fetch_url('http://example.com/example.ods'); 452 my $sheet = read_sxc_fh( $fh ); 453 454=head1 BUG TRACKER 455 456Please report bugs in this module via the Github bug queue at 457L<https://github.com/Corion/Spreadsheet-ReadSXC/issues> 458 459=head1 SEE ALSO 460 461L<https://www.openoffice.org/xml/general.html> has extensive documentation 462of the OpenOffice 1.x XML file format (soon to be replaced by the 463OASIS file format (ODS), see L<http://docs.oasis-open.org/office/v1.2/OpenDocument-v1.2.pdf>). 464 465=head1 AUTHOR 466 467Christoph Terhechte, E<lt>terhechte@cpan.orgE<gt> 468 469=head1 MAINTAINER 470 471Max Maischein, L<mailto:corion@cpan.org> 472 473=head1 COPYRIGHT AND LICENSE 474 475Copyright 2005-2019 by Christoph Terhechte 476 477Copyright 2019- by Max Maischein 478 479This library is free software; you can redistribute it and/or modify 480it under the same terms as Perl itself. 481 482=cut 483