1package Spreadsheet::ReadSXC;
2
3use 5.006;
4use strict;
5use warnings;
6
7use Exporter 'import';
8
9our @EXPORT_OK = qw(read_sxc read_sxc_fh read_xml_file read_xml_string);
10our $VERSION = '0.35';
11
12use Archive::Zip ':ERROR_CODES';
13use Carp qw(croak);
14use Spreadsheet::ParseODS;
15use PerlX::Maybe;
16
17our @CARP_NOT = qw(Spreadsheet::ParseODS);
18
19sub zip_error_handler {}
20
21sub read_sxc ($;$) {
22    my ($sxc_file, $options_ref) = @_;
23    #if( !$options_ref->{StrictErrors}) {
24    #    -f $sxc_file && -s _ or return undef;
25    #};
26    #open my $fh, '<', $sxc_file
27    #    or croak "Couldn't open '$sxc_file': $!";
28    #read_sxc_fh( $fh, $options_ref );
29    _parse_xml( {}, $options_ref, $sxc_file );
30}
31
32sub read_sxc_fh {
33    my( $stream, $options_ref ) = @_;
34    _parse_xml( {},  $options_ref, $stream );
35}
36
37sub read_xml_file ($;$) {
38    my ($xml_file, $options_ref) = @_;
39    if( !$options_ref->{StrictErrors}) {
40        -f $xml_file && -s _ or return undef;
41    };
42    _parse_xml({ method => 'parsefile', type => 'xml' }, $options_ref, $xml_file);
43}
44
45sub read_xml_string ($;$) {
46    my ($xml_string, $options_ref) = @_;
47    _parse_xml( { type => 'xml' }, $options_ref, \$xml_string );
48}
49
50sub _parse_xml {
51    my ($internal_options, $options_ref, $xml_thing) = @_;
52
53    $options_ref ||= {};
54
55    my $type;
56    if( $internal_options->{ type } ) {
57        $type = $internal_options->{ type };
58    };
59
60    my $line_sep = $options_ref->{ReplaceNewlineWith} || "";
61
62    my $workbook;
63    my $ok = eval {
64        $workbook = Spreadsheet::ParseODS->new(
65                       line_separator => $line_sep,
66                        #%$options_ref,
67                    )->parse( $xml_thing,
68                              maybe inputtype => $type );
69        1;
70   };
71   if( my $err = $@ and $options_ref->{ StrictErrors } ) {
72       die "$@\n"
73   };
74   return unless $workbook;
75
76    # Cut off trailing columns if any
77    # Down-convert from ::Cell to raw values, depending on the options
78
79    # This conversion is likely best done in an immediate callback to
80    # speed up things
81    my $res = {};
82    for my $s ($workbook->worksheets) {
83        my $rs = $res->{ $s->label } = [];
84        for my $r ($s->row_min..$s->row_max) {
85            if( $options_ref->{DropHiddenRows} and $s->is_row_hidden( $r ) ) {
86                next;
87            };
88
89            my $rowref;
90            for my $c ($s->col_min..$s->col_max) {
91                # Depending on what type we want, use ->value or ->unformatted
92                # depending on $options_ref->{ ... }
93                if( ! $rowref ) {
94                    push @$rs, ($rowref = []);
95                };
96
97                my $cell = $s->get_cell( $r,$c );
98                if( $options_ref->{DropHiddenColumns} and $s->is_col_hidden($c)) {
99                    next;
100                };
101
102                my ($method,$type) = ('value',$cell->type);
103                $type ||= '';
104
105                if( $options_ref->{StandardCurrency} and $type =~ qr/^(float|currency|percentage)/) {
106                    $method = 'unformatted';
107                };
108                if( $options_ref->{StandardDate} and $type =~ qr/^(date)/) {
109                    $method = 'unformatted';
110                };
111                if( $options_ref->{StandardTime} and $type =~ qr/^(time)/) {
112                    $method = 'unformatted';
113                };
114
115                push @$rowref, $s->get_cell( $r,$c )->$method;
116            }
117        }
118    };
119
120    if ( $options_ref->{OrderBySheet} ) {
121        return [ map { $res->{ $_->{label} } } $workbook->worksheets ]
122    } else {
123        return $res
124    }
125}
126
1271;
128
129__END__
130=head1 NAME
131
132Spreadsheet::ReadSXC - Extract OpenOffice 1.x spreadsheet data
133
134=head1 NOTICE
135
136This is a legacy API wrapper. Most likely you want to look at
137L<Spreadsheet::ParseODS>, which implements an API more compatible with
138L<Spreadsheet::ParseXLSX>. That module is also the backend for this API.
139
140=head1 SYNOPSIS
141
142  use Spreadsheet::ReadSXC qw(read_sxc);
143  my $workbook_ref = read_sxc("/path/to/file.sxc");
144
145  # Alternatively, unpack the .sxc file yourself and pass content.xml
146
147  use Spreadsheet::ReadSXC qw(read_xml_file);
148  my $workbook_ref = read_xml_file("/path/to/content.xml");
149
150
151  # Alternatively, pass the XML string directly
152
153  use Spreadsheet::ReadSXC qw(read_xml_string);
154  use Archive::Zip;
155  my $zip = Archive::Zip->new("/path/to/file.sxc");
156  my $content = $zip->contents('content.xml');
157  my $workbook_ref = read_xml_string($content);
158
159
160  # Control the output through a hash of options (below are the defaults):
161
162  my %options = (
163    ReplaceNewlineWith  => "",
164    IncludeCoveredCells => 0,
165    DropHiddenRows      => 0,
166    DropHiddenColumns   => 0,
167    NoTruncate          => 0,
168    StandardCurrency    => 0,
169    StandardDate        => 0,
170    StandardTime        => 0,
171    OrderBySheet        => 0,
172    StrictErrors        => 0,
173  );
174  my $workbook_ref = read_sxc("/path/to/file.sxc", \%options );
175
176
177  # Iterate over every worksheet, row, and cell:
178
179  use Encode 'decode';
180
181  foreach ( sort keys %$workbook_ref ) {
182     print "Worksheet ", $_, " contains ", $#{$$workbook_ref{$_}} + 1, " row(s):\n";
183     foreach ( @{$$workbook_ref{$_}} ) {
184        foreach ( map { defined $_ ? $_ : '' } @{$_} ) {
185          my $str = decode('UTF-8', $_);
186          print " '$str'";
187        }
188        print "\n";
189     }
190  }
191
192
193  # Cell D2 of worksheet "Sheet1"
194
195  $cell = $$workbook_ref{"Sheet1"}[1][3];
196
197
198  # Row 1 of worksheet "Sheet1":
199
200  @row = @{$$workbook_ref{"Sheet1"}[0]};
201
202
203  # Worksheet "Sheet1":
204
205  @sheet = @{$$workbook_ref{"Sheet1"}};
206
207
208
209=head1 DESCRIPTION
210
211Spreadsheet::ReadSXC extracts data from OpenOffice 1.x spreadsheet
212files (.sxc). It exports the function read_sxc() which takes a
213filename and an optional reference to a hash of options as
214arguments and returns a reference to a hash of references to
215two-dimensional arrays. The hash keys correspond to the names of
216worksheets in the OpenOffice workbook. The two-dimensional arrays
217correspond to rows and cells in the respective spreadsheets. If
218you don't like this because the order of sheets is not preserved
219in a hash, read on. The 'OrderBySheet' option provides an array
220of hashes instead.
221
222If you prefer to unpack the .sxc file yourself, you can use the
223function read_xml_file() instead and pass the path to content.xml
224as an argument. Or you can extract the XML string from content.xml
225and pass the string to the function read_xml_string(). Both
226functions also take a reference to a hash of options as an
227optional second argument.
228
229Spreadsheet::ReadSXC uses XML::Twig to parse the XML
230contained in .sxc files. Only the contents of text:p elements are
231returned, not the actual values of table:value attributes. For
232example, a cell might have a table:value-type attribute of
233"currency", a table:value attribute of "-1500.99" and a
234table:currency attribute of "USD". The text:p element would
235contain "-$1,500.99". This is the string which is returned by the
236read_sxc() function, not the value of -1500.99.
237
238Spreadsheet::ReadSXC was written with data import into an SQL
239database in mind. Therefore empty spreadsheet cells correspond to
240undef values in array rows. The example code above shows how to
241replace undef values with empty strings.
242
243If the .sxc file contains an empty spreadsheet its hash element will
244point to an empty array (unless you use the 'NoTruncate' option in
245which case it will point to an array of an array containing one
246undefined element).
247
248OpenOffice uses UTF-8 encoding. It depends on your environment how
249the data returned by the XML Parser is best handled:
250
251  use Unicode::String qw(latin1 utf8);
252  $unicode_string = utf8($$workbook_ref{"Sheet1"}[0][0])->as_string;
253
254  # this will not work for characters outside ISO-8859-1:
255
256  $latin1_string = utf8($$workbook_ref{"Sheet1"}[0][0])->latin1;
257
258Of course there are other modules than Unicode::String on CPAN that
259handle conversion between encodings. It's your choice.
260
261Table rows in .sxc files may have a "table:number-rows-repeated"
262attribute, which is often used for consecutive empty rows. When you
263format whole rows and/or columns in OpenOffice, it sets the numbers
264of rows in a worksheet to 32,000 and the number of columns to 256, even
265if only a few lower-numbered rows and cells actually contain data.
266Spreadsheet::ReadSXC truncates such sheets so that there are no empty
267rows after the last row containing data and no empty columns after the
268last column containing data (unless you use the 'NoTruncate' option).
269
270Still it is perfectly legal for an .sxc file to apply the
271"table:number-rows-repeated" attribute to rows that actually contain
272data (although I have only been able to produce such files manually,
273not through OpenOffice itself). To save on memory usage in these cases,
274Spreadsheet::ReadSXC does not copy rows by value, but by reference
275(remember that multi-dimensional arrays in Perl are really arrays of
276references to arrays). Therefore, if you change a value in one row, it
277is possible that you find the corresponding value in the next row
278changed, too:
279
280  $$workbook_ref{"Sheet1"}[0][0] = 'new string';
281  print $$workbook_ref{"Sheet1"}[1][0];
282
283As of version 0.20 the references returned by read_sxc() et al. remain
284valid after subsequent calls to the same function. In earlier versions,
285calling read_sxc() with a different file as the argument would change
286the data referenced by the original return value, so you had to
287derefence it before making another call. Thanks to H. Merijn Brand for
288fixing this.
289
290
291=head1 OPTIONS
292
293=over 4
294
295=item StrictErrors
296
297Turn on error reporting by using C<croak>. Otherwise, functions silently
298return C<undef> when errors are encountered.
299
300=item ReplaceNewlineWith
301
302By default, newlines within cells are ignored and all lines in a cell
303are concatenated to a single string which does not contain a newline. To
304keep the newline characters, use the following key/value pair in your
305hash of options:
306
307  ReplaceNewlineWith => "\n"
308
309However, you may replace newlines with any string you like.
310
311
312=item IncludeCoveredCells
313
314By default, the content of cells that are covered by other cells is
315ignored because you wouldn't see it in OpenOffice unless you unmerge
316the merged cells. To include covered cells in the data structure which
317is returned by parse_sxc(), use the following key/value pair in your
318hash of options:
319
320  IncludeCoveredCells => 1
321
322
323=item DropHiddenRows
324
325By default, hidden rows are included in the data structure returned by
326parse_sxc(). To drop those rows, use the following key/value pair in
327your hash of options:
328
329  DropHiddenRows => 1
330
331
332=item DropHiddenColumns
333
334By default, hidden columns are included in the data structure returned
335by parse_sxc(). To drop those rows, use the following key/value pair
336in your hash of options:
337
338  DropHiddenColumns => 1
339
340
341=item NoTruncate
342
343By default, the two-dimensional arrays that contain the data within
344each worksheet are truncated to get rid of empty rows below the last
345row containing data and empty columns beyond the last column
346containing data. If you prefer to keep those rows and columns, use the
347following key/value pair in your hash of options:
348
349  NoTruncate => 1
350
351
352=item StandardCurrency
353
354By default, cells are returned as formatted. If you prefer to
355obtain the value as contained in the table:value attribute,
356use the following key/value pair in your hash of options:
357
358  StandardCurrency => 1
359
360
361=item StandardDate
362
363By default, date cells are returned as formatted. If you prefer to
364obtain the date value as contained in the table:date-value attribute,
365use the following key/value pair in your hash of options:
366
367  StandardDate => 1
368
369
370=item StandardTime
371
372By default, time cells are returned as formatted. If you prefer to
373obtain the time value as contained in the table:time-value attribute,
374use the following key/value pair in your hash of options:
375
376  StandardTime => 1
377
378These options are a first step on the way to a different approach at
379reading data from .sxc files. There should be more options to read in
380values instead of the strings OpenOffice displays. It should give
381more flexibility in working with the data obtained from OpenOffice
382spreadsheets. 'float' and 'percentage' values could be next.
383'currency' is less obvious, though, as we need to consider both its
384value and the 'table:currency' attribute. Formulas and array formulas
385are yet another issue. I probably won't deal with this until I've
386given this module an object-oriented interface.
387
388
389=item OrderBySheet
390
391The disadvantage of storing worksheets by name in a hash is that the
392order of sheets is lost. If you prefer not to obtain such a hash, but
393an array of worksheets insted, use the following key/value pair in
394your hash of options:
395
396  OrderBySheet => 1
397
398Thus the read_sxc function will return an array of hashes, each of
399which will have two keys, "label" and "data". The value of "label"
400is the name of the sheet. The value of data is a reference to a
401two-dimensional array containing rows and columns of the worksheet:
402
403  my $worksheets_ref = read_sxc("/path/to/file.sxc");
404  my $name_of_first_sheet = $$worksheets_ref[0]{label};
405  my $first_cell_of_first_sheet = $$worksheets_ref[0]{data}[0][0];
406
407
408=back
409
410=head1 FUNCTIONS
411
412=head2 read_sxc
413
414  my $workbook_ref = read_sxc("/path/to/file.sxc");
415
416Reads an SXC or ODS file given a filename and returns the worksheets as a
417data structure.
418
419=head2 read_sxc_fh
420
421    open my $fh = 'example.ods';
422    my $sheet = read_sxc_fh( $fh );
423
424Reads an SXC or ODS file given a filehandle and returns the worksheets as a
425data structure.
426
427=head2 read_xml_file
428
429  my $workbook_ref = read_xml_file("/path/to/content.xml");
430
431Reads an XML file from a SXC or ODS file returns the worksheets as a
432data structure.
433
434=head2 read_xml_string
435
436Parses an XML string and eturns the worksheets as a data structure.
437
438=head1 Reading an SXC file from an URL
439
440    use HTTP::Tiny;
441    use Spreadsheet::Read;
442
443    # Fetch data and return a filehandle to that data
444    sub fetch_url {
445        my( $url ) = @_;
446        my $ua = HTTP::Tiny->new;
447        my $res = $ua->get( $url );
448        open my $fh, '<', \$res->{content};
449        return $fh
450    }
451    my $fh = fetch_url('http://example.com/example.ods');
452    my $sheet = read_sxc_fh( $fh );
453
454=head1 BUG TRACKER
455
456Please report bugs in this module via the Github bug queue at
457L<https://github.com/Corion/Spreadsheet-ReadSXC/issues>
458
459=head1 SEE ALSO
460
461L<https://www.openoffice.org/xml/general.html> has extensive documentation
462of the OpenOffice 1.x XML file format (soon to be replaced by the
463OASIS file format (ODS), see L<http://docs.oasis-open.org/office/v1.2/OpenDocument-v1.2.pdf>).
464
465=head1 AUTHOR
466
467Christoph Terhechte, E<lt>terhechte@cpan.orgE<gt>
468
469=head1 MAINTAINER
470
471Max Maischein, L<mailto:corion@cpan.org>
472
473=head1 COPYRIGHT AND LICENSE
474
475Copyright 2005-2019 by Christoph Terhechte
476
477Copyright 2019- by Max Maischein
478
479This library is free software; you can redistribute it and/or modify
480it under the same terms as Perl itself.
481
482=cut
483