1package Spreadsheet::ParseXLSX;
2our $AUTHORITY = 'cpan:DOY';
3$Spreadsheet::ParseXLSX::VERSION = '0.27';
4use strict;
5use warnings;
6use 5.010;
7# ABSTRACT: parse XLSX files
8
9use Archive::Zip;
10use Graphics::ColorUtils 'rgb2hls', 'hls2rgb';
11use Scalar::Util 'openhandle';
12use Spreadsheet::ParseExcel 0.61;
13use XML::Twig;
14
15use Spreadsheet::ParseXLSX::Decryptor;
16
17
18
19sub new {
20    my $class = shift;
21    my (%args) = @_;
22
23    my $self = bless {}, $class;
24    $self->{Password} = $args{Password} if defined $args{Password};
25
26    return $self;
27}
28
29
30sub parse {
31    my $self = shift;
32    my ($file, $formatter) = @_;
33
34    my $zip = Archive::Zip->new;
35    my $workbook = Spreadsheet::ParseExcel::Workbook->new;
36
37    if ($self->_check_signature($file)) {
38        my $decrypted_file = Spreadsheet::ParseXLSX::Decryptor->open(
39            $file,
40            $self->{Password}
41        );
42        $file = $decrypted_file if $decrypted_file;
43    }
44
45    if (openhandle($file)) {
46        bless $file, 'IO::File' if ref($file) eq 'GLOB'; # sigh
47        my $fh = ref($file) eq 'File::Temp'
48            ? IO::File->new("<&=" . fileno($file))
49            : $file;
50        $zip->readFromFileHandle($fh) == Archive::Zip::AZ_OK
51            or die "Can't open filehandle as a zip file";
52        $workbook->{File} = undef;
53        $workbook->{__tempfile} = $file;
54    }
55    elsif (ref($file) eq 'SCALAR') {
56        open my $fh, '+<', $file
57            or die "Can't create filehandle from memory data";
58        $zip->readFromFileHandle($fh) == Archive::Zip::AZ_OK
59            or die "Can't open scalar ref as a zip file";
60        $workbook->{File} = undef;
61    }
62    elsif (!ref($file)) {
63        $zip->read($file) == Archive::Zip::AZ_OK
64            or die "Can't open file '$file' as a zip file";
65        $workbook->{File} = $file;
66    }
67    else {
68        die "Argument to 'new' must be a filename, open filehandle, or scalar ref";
69    }
70
71    return $self->_parse_workbook($zip, $workbook, $formatter);
72}
73
74sub _check_signature {
75    my $self = shift;
76    my ($file) = @_;
77
78    my $signature = '';
79    if (openhandle($file)) {
80        bless $file, 'IO::File' if ref($file) eq 'GLOB'; # sigh
81        $file->read($signature, 2);
82        $file->seek(-2, IO::File::SEEK_CUR);
83    }
84    elsif (ref($file) eq 'SCALAR') {
85        $signature = substr($$file, 0, 2);
86    }
87    elsif (!ref($file)) {
88        my $fh = IO::File->new($file, 'r');
89        $fh->read($signature, 2);
90        $fh->close;
91    }
92
93    return $signature eq "\xd0\xcf";
94}
95
96sub _parse_workbook {
97    my $self = shift;
98    my ($zip, $workbook, $formatter) = @_;
99
100    my $files = $self->_extract_files($zip);
101
102    my ($version)    = $files->{workbook}->find_nodes('//s:fileVersion');
103    my ($properties) = $files->{workbook}->find_nodes('//s:workbookPr');
104
105    if ($version) {
106        $workbook->{Version} = $version->att('appName')
107                             . ($version->att('lowestEdited')
108                                 ? ('-' . $version->att('lowestEdited'))
109                                 : (""));
110    }
111
112    $workbook->{Flg1904} = $self->_xml_boolean($properties->att('date1904'))
113        if $properties;
114
115    $workbook->{FmtClass} = $formatter || Spreadsheet::ParseExcel::FmtDefault->new;
116
117    my $themes = $self->_parse_themes((values %{ $files->{themes} })[0]); # XXX
118
119    $workbook->{Color} = $themes->{Color};
120
121    my $styles = $self->_parse_styles($workbook, $files->{styles});
122
123    $workbook->{Format}    = $styles->{Format};
124    $workbook->{FormatStr} = $styles->{FormatStr};
125    $workbook->{Font}      = $styles->{Font};
126
127    if ($files->{strings}) {
128        my %string_parse_data = $self->_parse_shared_strings(
129            $files->{strings},
130            $themes->{Color}
131        );
132        $workbook->{PkgStr} = $string_parse_data{PkgStr};
133        $workbook->{Rich}   = $string_parse_data{Rich};
134    }
135
136    # $workbook->{StandardWidth} = ...;
137
138    # $workbook->{Author} = ...;
139
140    # $workbook->{PrintArea} = ...;
141    # $workbook->{PrintTitle} = ...;
142
143    my @sheets = map {
144        my $idx = $_->att('rels:id');
145        if ($files->{sheets}{$idx}) {
146          my $sheet = Spreadsheet::ParseExcel::Worksheet->new(
147              Name     => $_->att('name'),
148              _Book    => $workbook,
149              _SheetNo => $idx,
150          );
151          $sheet->{SheetHidden} = 1 if defined $_->att('state') and $_->att('state') eq 'hidden';
152          $self->_parse_sheet($sheet, $files->{sheets}{$idx});
153          ($sheet)
154        } else {
155          ()
156        }
157    } $files->{workbook}->find_nodes('//s:sheets/s:sheet');
158
159    $workbook->{Worksheet}  = \@sheets;
160    $workbook->{SheetCount} = scalar(@sheets);
161
162    my ($node) = $files->{workbook}->find_nodes('//s:workbookView');
163    my $selected = $node ? $node->att('activeTab') : undef;
164    $workbook->{SelectedSheet} = defined($selected) ? 0+$selected : 0;
165
166    return $workbook;
167}
168
169sub _parse_sheet {
170    my $self = shift;
171    my ($sheet, $sheet_file) = @_;
172
173    $sheet->{MinRow} = 0;
174    $sheet->{MinCol} = 0;
175    $sheet->{MaxRow} = -1;
176    $sheet->{MaxCol} = -1;
177    $sheet->{Selection} = [ 0, 0 ];
178
179    my %merged_cells;
180
181    my @column_formats;
182    my @column_widths;
183    my @columns_hidden;
184    my @row_heights;
185    my @rows_hidden;
186
187    my $default_row_height   = 15;
188    my $default_column_width = 10;
189
190    my %cells;
191    my $row_idx = 0;
192
193    my $sheet_xml = $self->_new_twig(
194        twig_roots => {
195            #XXX need a fallback here, the dimension tag is optional
196            's:dimension' => sub {
197                my ($twig, $dimension) = @_;
198
199                my ($rmin, $cmin, $rmax, $cmax) = $self->_dimensions(
200                    $dimension->att('ref')
201                );
202
203                $sheet->{MinRow} = $rmin;
204                $sheet->{MinCol} = $cmin;
205                $sheet->{MaxRow} = $rmax ? $rmax : -1;
206                $sheet->{MaxCol} = $cmax ? $cmax : -1;
207
208                $twig->purge;
209            },
210
211            's:headerFooter' => sub {
212                my ($twig, $hf) = @_;
213
214                my ($helem, $felem) = map {
215                    $hf->first_child("s:$_")
216                } qw(oddHeader oddFooter);
217                $sheet->{Header} = $helem->text
218                    if $helem;
219                $sheet->{Footer} = $felem->text
220                    if $felem;
221
222                $twig->purge;
223            },
224
225            's:pageMargins' => sub {
226                my ($twig, $margin) = @_;
227                map {
228                    my $key = "\u${_}Margin";
229                    $sheet->{$key} = defined $margin->att($_)
230                                    ? $margin->att($_) : 0
231                } qw(left right top bottom header footer);
232
233                $twig->purge;
234            },
235
236            's:pageSetup' => sub {
237                my ($twig, $setup) = @_;
238                $sheet->{Scale} = defined $setup->att('scale')
239                                ? $setup->att('scale')
240                                : 100;
241                $sheet->{Landscape} = ($setup->att('orientation') || '') ne 'landscape';
242                $sheet->{PaperSize} = defined $setup->att('paperSize')
243                                    ? $setup->att('paperSize')
244                                    : 1;
245                $sheet->{PageStart} = $setup->att('firstPageNumber');
246                $sheet->{UsePage} = $self->_xml_boolean($setup->att('useFirstPageNumber'));
247                $sheet->{HorizontalDPI} = $setup->att('horizontalDpi');
248                $sheet->{VerticalDPI} = $setup->att('verticalDpi');
249
250                $twig->purge;
251            },
252
253            's:mergeCells/s:mergeCell' => sub {
254                my ( $twig, $merge_area ) = @_;
255
256                if (my $ref = $merge_area->att('ref')) {
257                    my ($topleft, $bottomright) = $ref =~ /([^:]+):([^:]+)/;
258
259                    my ($toprow, $leftcol)     = $self->_cell_to_row_col($topleft);
260                    my ($bottomrow, $rightcol) = $self->_cell_to_row_col($bottomright);
261
262                    push @{ $sheet->{MergedArea} }, [
263                        $toprow, $leftcol,
264                        $bottomrow, $rightcol,
265                    ];
266                    for my $row ($toprow .. $bottomrow) {
267                        for my $col ($leftcol .. $rightcol) {
268                            $merged_cells{"$row;$col"} = 1;
269                        }
270                    }
271                }
272
273                $twig->purge;
274            },
275
276            's:sheetFormatPr' => sub {
277                my ( $twig, $format ) = @_;
278
279                $default_row_height   = $format->att('defaultRowHeight')
280                  unless defined $default_row_height;
281                $default_column_width = $format->att('baseColWidth')
282                  unless defined $default_column_width;
283
284                $twig->purge;
285            },
286
287            's:col' => sub {
288                my ( $twig, $col ) = @_;
289
290                for my $colnum ($col->att('min')..$col->att('max')) {
291                    $column_widths[$colnum - 1] = $col->att('width');
292                    $column_formats[$colnum - 1] = $col->att('style');
293                    $columns_hidden[$colnum - 1] = $self->_xml_boolean($col->att('hidden'));
294                }
295
296                $twig->purge;
297            },
298
299            's:selection' => sub {
300                my ( $twig, $selection ) = @_;
301
302                if (my $cell = $selection->att('activeCell')) {
303                    $sheet->{Selection} = [ $self->_cell_to_row_col($cell) ];
304                }
305                elsif (my $range = $selection->att('sqref')) {
306                    my ($topleft, $bottomright) = $range =~ /([^:]+):([^:]+)/;
307                    $sheet->{Selection} = [
308                        $self->_cell_to_row_col($topleft),
309                        $self->_cell_to_row_col($bottomright),
310                    ];
311                }
312
313                $twig->purge;
314            },
315
316            's:sheetPr/s:tabColor' => sub {
317                my ( $twig, $tab_color ) = @_;
318
319                $sheet->{TabColor} = $self->_color($sheet->{_Book}{Color}, $tab_color);
320
321                $twig->purge;
322            },
323
324            's:sheetData/s:row' => sub {
325                my ( $twig, $row_elt ) = @_;
326                my $explicit_row_idx = $row_elt->att('r');
327                $row_idx = $explicit_row_idx - 1 if defined $explicit_row_idx;
328
329                $row_heights[$row_idx] = $row_elt->att('ht');
330                $rows_hidden[$row_idx] = $self->_xml_boolean($row_elt->att('hidden'));
331
332                my $col_idx = 0;
333                for my $cell ( $row_elt->children('s:c') ){
334                    my $loc = $cell->att('r');
335                    my ($row, $col);
336                    if ($loc) {
337                        ($row, $col) = $self->_cell_to_row_col($loc);
338                        if ($row != $row_idx) {
339                            warn "mismatched coords: got $loc for cell in row $row_idx";
340                        }
341                        $col_idx = $col - 1;
342                    }
343                    else {
344                        ($row, $col) = ($row_idx, $col_idx);
345                    }
346                    $sheet->{MaxRow} = $row
347                        if $sheet->{MaxRow} < $row;
348                    $sheet->{MaxCol} = $col
349                        if $sheet->{MaxCol} < $col;
350                    my $type = $cell->att('t') || 'n';
351                    my $val_xml;
352                    if ($type ne 'inlineStr') {
353                        $val_xml = $cell->first_child('s:v');
354                    }
355                    elsif (defined $cell->first_child('s:is')) {
356                        $val_xml = ($cell->find_nodes('.//s:t'))[0];
357                    }
358                    my $val = $val_xml ? $val_xml->text : undef;
359
360                    my $long_type;
361                    my $Rich;
362                    if (!defined($val)) {
363                        $long_type = 'Text';
364                        $val = '';
365                    }
366                    elsif ($type eq 's') {
367                        $long_type = 'Text';
368                        $Rich = $sheet->{_Book}{Rich}->{$val};
369                        $val  = $sheet->{_Book}{PkgStr}[$val];
370                    }
371                    elsif ($type eq 'n') {
372                        $long_type = 'Numeric';
373                        $val = defined($val) ? 0+$val : undef;
374                    }
375                    elsif ($type eq 'd') {
376                        $long_type = 'Date';
377                    }
378                    elsif ($type eq 'b') {
379                        $long_type = 'Text';
380                        $val = $val ? "TRUE" : "FALSE";
381                    }
382                    elsif ($type eq 'e') {
383                        $long_type = 'Text';
384                    }
385                    elsif ($type eq 'str' || $type eq 'inlineStr') {
386                        $long_type = 'Text';
387                    }
388                    else {
389                        die "unimplemented type $type"; # XXX
390                    }
391
392                    my $format_idx = $cell->att('s') || 0;
393                    my $format = $sheet->{_Book}{Format}[$format_idx];
394                    die "unknown format $format_idx" unless $format;
395
396                    # see the list of built-in formats below in _parse_styles
397                    # XXX probably should figure this out from the actual format string,
398                    # but that's not entirely trivial
399                    if (grep { $format->{FmtIdx} == $_ } 14..22, 45..47) {
400                        $long_type = 'Date';
401                    }
402
403                    my $formula = $cell->first_child('s:f');
404                    my $cell = Spreadsheet::ParseExcel::Cell->new(
405                        Val      => $val,
406                        Type     => $long_type,
407                        Merged   => undef, # fix up later
408                        Format   => $format,
409                        FormatNo => $format_idx,
410                        ($formula
411                            ? (Formula => $formula->text)
412                            : ()),
413                        Rich     => $Rich,
414                    );
415                    $cell->{_Value} = $sheet->{_Book}{FmtClass}->ValFmt(
416                        $cell, $sheet->{_Book}
417                    );
418                    $cells{"$row;$col"} = $cell;
419                    $sheet->{Cells}[$row][$col] = $cell;
420                    $col_idx++;
421                }
422
423                $twig->purge;
424                $row_idx++;
425            },
426        }
427    );
428
429    $sheet_xml->parse( $sheet_file );
430
431    for my $key (keys %merged_cells) {
432        $cells{$key}{Merged} = 1 if $cells{$key};
433    }
434
435    if ( ! $sheet->{Cells} ){
436        $sheet->{MaxRow} = $sheet->{MaxCol} = -1;
437    }
438
439    $sheet->{DefRowHeight} = 0+$default_row_height;
440    $sheet->{DefColWidth} = 0+$default_column_width;
441    $sheet->{RowHeight} = [
442        map { defined $_ ? 0+$_ : 0+$default_row_height } @row_heights
443    ];
444    $sheet->{RowHidden} = \@rows_hidden;
445    $sheet->{ColWidth} = [
446        map { defined $_ ? 0+$_ : 0+$default_column_width } @column_widths
447    ];
448    $sheet->{ColFmtNo} = \@column_formats;
449    $sheet->{ColHidden} = \@columns_hidden;
450
451}
452
453sub _get_text_and_rich_font_by_cell {
454    my $self = shift;
455    my ($si, $theme_colors) = @_;
456
457    # XXX
458    my %default_font_opts = (
459        Height         => 12,
460        Color          => '#000000',
461        Name           => '',
462        Bold           => 0,
463        Italic         => 0,
464        Underline      => 0,
465        UnderlineStyle => 0,
466        Strikeout      => 0,
467        Super          => 0,
468    );
469
470    my $string_text = '';
471    my @rich_font_by_cell;
472    my @nodes_r = $si->find_nodes('.//s:r');
473    if (@nodes_r > 0) {
474        for my $chunk (map { $_->children } @nodes_r) {
475            my $string_length = length($string_text);
476            if ($chunk->name eq 's:t') {
477                if (!@rich_font_by_cell) {
478                    push @rich_font_by_cell, [
479                        $string_length,
480                        Spreadsheet::ParseExcel::Font->new(%default_font_opts)
481                    ];
482                }
483                $string_text .= $chunk->text;
484            }
485            elsif ($chunk->name eq 's:rPr') {
486                my %format_text = %default_font_opts;
487                for my $node_format ($chunk->children) {
488                    if ($node_format->name eq 's:sz') {
489                        $format_text{Height} = $node_format->att('val');
490                    }
491                    elsif ($node_format->name eq 's:color') {
492                        $format_text{Color} = $self->_color(
493                            $theme_colors,
494                            $node_format
495                        );
496                    }
497                    elsif ($node_format->name eq 's:rFont') {
498                        $format_text{Name} = $node_format->att('val');
499                    }
500                    elsif ($node_format->name eq 's:b') {
501                        $format_text{Bold} = 1;
502                    }
503                    elsif ($node_format->name eq 's:i') {
504                        $format_text{Italic} = 1;
505                    }
506                    elsif ($node_format->name eq 's:u') {
507                        $format_text{Underline} = 1;
508                        if (defined $node_format->att('val')) {
509                            $format_text{UnderlineStyle} = 2;
510                        } else {
511                            $format_text{UnderlineStyle} = 1;
512                        }
513                    }
514                    elsif ($node_format->name eq 's:strike') {
515                        $format_text{Strikeout} = 1;
516                    }
517                    elsif ($node_format->name eq 's:vertAlign') {
518                        if ($node_format->att('val') eq 'superscript') {
519                            $format_text{Super} = 1;
520                        }
521                        elsif ($node_format->att('val') eq 'subscript') {
522                            $format_text{Super} = 2;
523                        }
524                    }
525                }
526                push @rich_font_by_cell, [
527                    $string_length,
528                    Spreadsheet::ParseExcel::Font->new(%format_text)
529                ];
530            }
531        }
532    }
533    else {
534        $string_text = join '', map { $_->text } $si->find_nodes('.//s:t');
535    }
536
537    return (
538        String => $string_text,
539        Rich => \@rich_font_by_cell,
540    );
541}
542
543sub _parse_shared_strings {
544    my $self = shift;
545    my ($strings, $theme_colors) = @_;
546
547    my $PkgStr = [];
548
549    my %richfonts;
550    if ($strings) {
551        my $xml = $self->_new_twig(
552            twig_handlers => {
553                's:si' => sub {
554                    my ( $twig, $si ) = @_;
555
556                    my %text_rich = $self->_get_text_and_rich_font_by_cell(
557                        $si,
558                        $theme_colors
559                    );
560                    $richfonts{scalar @$PkgStr} = $text_rich{Rich};
561                    push @$PkgStr, $text_rich{String};
562                    $twig->purge;
563                },
564            }
565        );
566        $xml->parse( $strings );
567    }
568    return (
569        Rich   => \%richfonts,
570        PkgStr => $PkgStr,
571    );
572}
573
574sub _parse_themes {
575    my $self = shift;
576    my ($themes) = @_;
577
578    return {} unless $themes;
579
580    my @color = map {
581        $_->name eq 'drawmain:sysClr' ? $_->att('lastClr') : $_->att('val')
582    } $themes->find_nodes('//drawmain:clrScheme/*/*');
583
584    # this shouldn't be necessary, but the documentation is wrong here
585    # see http://stackoverflow.com/questions/2760976/theme-confusion-in-spreadsheetml
586    ($color[0], $color[1]) = ($color[1], $color[0]);
587    ($color[2], $color[3]) = ($color[3], $color[2]);
588
589    return {
590        Color => \@color,
591    }
592}
593
594sub _parse_styles {
595    my $self = shift;
596    my ($workbook, $styles) = @_;
597
598    # these defaults are from
599    # http://social.msdn.microsoft.com/Forums/en-US/oxmlsdk/thread/e27aaf16-b900-4654-8210-83c5774a179c
600    my %default_format_str = (
601        0  => 'GENERAL',
602        1  => '0',
603        2  => '0.00',
604        3  => '#,##0',
605        4  => '#,##0.00',
606        5  => '$#,##0_);($#,##0)',
607        6  => '$#,##0_);[Red]($#,##0)',
608        7  => '$#,##0.00_);($#,##0.00)',
609        8  => '$#,##0.00_);[Red]($#,##0.00)',
610        9  => '0%',
611        10 => '0.00%',
612        11 => '0.00E+00',
613        12 => '# ?/?',
614        13 => '# ??/??',
615        14 => 'm/d/yyyy',
616        15 => 'd-mmm-yy',
617        16 => 'd-mmm',
618        17 => 'mmm-yy',
619        18 => 'h:mm AM/PM',
620        19 => 'h:mm:ss AM/PM',
621        20 => 'h:mm',
622        21 => 'h:mm:ss',
623        22 => 'm/d/yyyy h:mm',
624        37 => '#,##0_);(#,##0)',
625        38 => '#,##0_);[Red](#,##0)',
626        39 => '#,##0.00_);(#,##0.00)',
627        40 => '#,##0.00_);[Red](#,##0.00)',
628        45 => 'mm:ss',
629        46 => '[h]:mm:ss',
630        47 => 'mm:ss.0',
631        48 => '##0.0E+0',
632        49 => '@',
633    );
634
635    my %default_format_opts = (
636        IgnoreFont         => 1,
637        IgnoreFill         => 1,
638        IgnoreBorder       => 1,
639        IgnoreAlignment    => 1,
640        IgnoreNumberFormat => 1,
641        IgnoreProtection   => 1,
642        FontNo             => 0,
643        FmtIdx             => 0,
644        Lock               => 1,
645        Hidden             => 0,
646        AlignH             => 0,
647        Wrap               => 0,
648        AlignV             => 2,
649        Rotate             => 0,
650        Indent             => 0,
651        Shrink             => 0,
652        BdrStyle           => [0, 0, 0, 0],
653        BdrColor           => [undef, undef, undef, undef],
654        BdrDiag            => [0, 0, undef],
655        Fill               => [0, undef, undef],
656    );
657
658    if (!$styles) {
659        # XXX i guess?
660        my $font = Spreadsheet::ParseExcel::Font->new(
661            Height         => 12,
662            Color          => '#000000',
663            Name           => '',
664        );
665        my $format = Spreadsheet::ParseExcel::Format->new(
666            %default_format_opts,
667            Font => $font,
668        );
669
670        return {
671            FormatStr => \%default_format_str,
672            Font      => [ $font ],
673            Format    => [ $format ],
674        };
675    }
676
677    my %halign = (
678        center           => 2,
679        centerContinuous => 6,
680        distributed      => 7,
681        fill             => 4,
682        general          => 0,
683        justify          => 5,
684        left             => 1,
685        right            => 3,
686    );
687
688    my %valign = (
689        bottom      => 2,
690        center      => 1,
691        distributed => 4,
692        justify     => 3,
693        top         => 0,
694    );
695
696    my %border = (
697        dashDot          => 9,
698        dashDotDot       => 11,
699        dashed           => 3,
700        dotted           => 4,
701        double           => 6,
702        hair             => 7,
703        medium           => 2,
704        mediumDashDot    => 10,
705        mediumDashDotDot => 12,
706        mediumDashed     => 8,
707        none             => 0,
708        slantDashDot     => 13,
709        thick            => 5,
710        thin             => 1,
711    );
712
713    my %fill = (
714        darkDown        => 7,
715        darkGray        => 3,
716        darkGrid        => 9,
717        darkHorizontal  => 5,
718        darkTrellis     => 10,
719        darkUp          => 8,
720        darkVertical    => 6,
721        gray0625        => 18,
722        gray125         => 17,
723        lightDown       => 13,
724        lightGray       => 4,
725        lightGrid       => 15,
726        lightHorizontal => 11,
727        lightTrellis    => 16,
728        lightUp         => 14,
729        lightVertical   => 12,
730        mediumGray      => 2,
731        none            => 0,
732        solid           => 1,
733    );
734
735    my @fills = map {
736        my $pattern_type = $_->att('patternType');
737        [
738            ($pattern_type ? $fill{$pattern_type} : 0),
739            $self->_color($workbook->{Color}, $_->first_child('s:fgColor'), 1),
740            $self->_color($workbook->{Color}, $_->first_child('s:bgColor'), 1),
741        ]
742    } $styles->find_nodes('//s:fills/s:fill/s:patternFill');
743
744    my @borders = map {
745        my $border = $_;
746        my ($ddiag, $udiag) = map {
747            $self->_xml_boolean($border->att($_))
748        } qw(diagonalDown diagonalUp);
749        my %borderstyles = map {
750            my $e = $border->first_child("s:$_");
751            $_ => ($e ? $e->att('style') || 'none' : 'none')
752        } qw(left right top bottom diagonal);
753        my %bordercolors = map {
754            my $e = $border->first_child("s:$_");
755            $_ => ($e ? $e->first_child('s:color') : undef)
756        } qw(left right top bottom diagonal);
757        # XXX specs say "begin" and "end" rather than "left" and "right",
758        # but... that's not what seems to be in the file itself (sigh)
759        {
760            colors => [
761                map {
762                    $self->_color($workbook->{Color}, $bordercolors{$_})
763                } qw(left right top bottom)
764            ],
765            styles => [
766                map {
767                    $border{$borderstyles{$_}}
768                } qw(left right top bottom)
769            ],
770            diagonal => [
771                ( $ddiag &&  $udiag ? 3
772               :  $ddiag && !$udiag ? 2
773               : !$ddiag &&  $udiag ? 1
774               :                      0),
775                $border{$borderstyles{diagonal}},
776                $self->_color($workbook->{Color}, $bordercolors{diagonal}),
777            ],
778        }
779    } $styles->find_nodes('//s:borders/s:border');
780
781    my %format_str = (
782        %default_format_str,
783        (map {
784            $_->att('numFmtId') => $_->att('formatCode')
785        } $styles->find_nodes('//s:numFmts/s:numFmt')),
786    );
787
788    my @font = map {
789        my $vert = $_->first_child('s:vertAlign');
790        my $under = $_->first_child('s:u');
791        my $heightelem = $_->first_child('s:sz');
792        # XXX i guess 12 is okay?
793        my $height = 0+($heightelem ? $heightelem->att('val') : 12);
794        my $nameelem = $_->first_child('s:name');
795        my $name = $nameelem ? $nameelem->att('val') : '';
796        Spreadsheet::ParseExcel::Font->new(
797            Height         => $height,
798            # Attr           => $iAttr,
799            # XXX not sure if there's a better way to keep the indexing stuff
800            # intact rather than just going straight to #xxxxxx
801            # XXX also not sure what it means for the color tag to be missing,
802            # just assuming black for now
803            Color          => ($_->first_child('s:color')
804                ? $self->_color(
805                    $workbook->{Color},
806                    $_->first_child('s:color')
807                )
808                : '#000000'
809            ),
810            Super          => ($vert
811                ? ($vert->att('val') eq 'superscript' ? 1
812                 : $vert->att('val') eq 'subscript'   ? 2
813                 :                                      0)
814                : 0
815            ),
816            # XXX not sure what the single accounting and double accounting
817            # underline styles map to in xlsx. also need to map the new
818            # underline styles
819            UnderlineStyle => ($under
820                # XXX sometimes style xml files can contain just <u/> with no
821                # val attribute. i think this means single underline, but not
822                # sure
823                ? (!$under->att('val')            ? 1
824                 : $under->att('val') eq 'single' ? 1
825                 : $under->att('val') eq 'double' ? 2
826                 :                                  0)
827                : 0
828            ),
829            Name           => $name,
830
831            Bold      => $_->has_child('s:b') ? 1 : 0,
832            Italic    => $_->has_child('s:i') ? 1 : 0,
833            Underline => $_->has_child('s:u') ? 1 : 0,
834            Strikeout => $_->has_child('s:strike') ? 1 : 0,
835        )
836    } $styles->find_nodes('//s:fonts/s:font');
837
838    my @format = map {
839        my $xml_fmt = $_;
840        my $alignment  = $xml_fmt->first_child('s:alignment');
841        my $protection = $xml_fmt->first_child('s:protection');
842        my %ignore = map {
843            ("Ignore$_" => !$self->_xml_boolean($xml_fmt->att("apply$_")))
844        } qw(Font Fill Border Alignment NumberFormat Protection);
845        my %opts = (
846            %default_format_opts,
847            %ignore,
848        );
849
850        $opts{FmtIdx}   = 0+($xml_fmt->att('numFmtId')||0);
851        $opts{FontNo}   = 0+($xml_fmt->att('fontId')||0);
852        $opts{Font}     = $font[$opts{FontNo}];
853        $opts{Fill}     = $fills[$xml_fmt->att('fillId')||0];
854        $opts{BdrStyle} = $borders[$xml_fmt->att('borderId')||0]{styles};
855        $opts{BdrColor} = $borders[$xml_fmt->att('borderId')||0]{colors};
856        $opts{BdrDiag}  = $borders[$xml_fmt->att('borderId')||0]{diagonal};
857
858        if ($alignment) {
859            $opts{AlignH} = $halign{$alignment->att('horizontal') || 'general'};
860            $opts{Wrap}   = $self->_xml_boolean($alignment->att('wrapText'));
861            $opts{AlignV} = $valign{$alignment->att('vertical') || 'bottom'};
862            $opts{Rotate} = $alignment->att('textRotation');
863            $opts{Indent} = $alignment->att('indent');
864            $opts{Shrink} = $self->_xml_boolean($alignment->att('shrinkToFit'));
865            # JustLast => $iJustL,
866        }
867
868        if ($protection) {
869            $opts{Lock} = defined $protection->att('locked')
870                ? $self->_xml_boolean($protection->att('locked'))
871                : 1;
872            $opts{Hidden} = $self->_xml_boolean($protection->att('hidden'));
873        }
874
875        # Style    => $iStyle,
876        # Key123   => $i123,
877        # Merge   => $iMerge,
878        # ReadDir => $iReadDir,
879        Spreadsheet::ParseExcel::Format->new(%opts)
880    } $styles->find_nodes('//s:cellXfs/s:xf');
881
882    return {
883        FormatStr => \%format_str,
884        Font      => \@font,
885        Format    => \@format,
886    }
887}
888
889sub _extract_files {
890    my $self = shift;
891    my ($zip) = @_;
892
893    my $type_base =
894        'http://schemas.openxmlformats.org/officeDocument/2006/relationships';
895
896    my $rels = $self->_parse_xml(
897        $zip,
898        $self->_rels_for(''),
899    );
900    my $wb_name = ($rels->find_nodes(
901        qq<//packagerels:Relationship[\@Type="$type_base/officeDocument"]>
902    ))[0]->att('Target');
903    $wb_name =~ s{^/}{};
904    my $wb_xml = $self->_parse_xml($zip, $wb_name);
905
906    my $path_base = $self->_base_path_for($wb_name);
907    my $wb_rels = $self->_parse_xml(
908        $zip,
909        $self->_rels_for($wb_name),
910    );
911
912    my $get_path = sub {
913        my ($p) = @_;
914
915        return $p =~ s{^/}{}
916            ? $p
917            : $path_base . $p;
918    };
919
920    my ($strings_xml) = map {
921        $self->_zip_file_member($zip, $get_path->($_->att('Target')))
922    } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/sharedStrings"]>);
923
924    my ($styles_xml) = map {
925        $self->_parse_xml(
926            $zip,
927            $get_path->($_->att('Target'))
928        )
929    } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/styles"]>);
930
931    my %worksheet_xml = map {
932        ($_->att('Id') => $self->_zip_file_member($zip, $get_path->($_->att('Target'))))
933    } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/worksheet"]>);
934
935    my %themes_xml = map {
936        $_->att('Id') => $self->_parse_xml($zip, $get_path->($_->att('Target')))
937    } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/theme"]>);
938
939    return {
940        workbook => $wb_xml,
941        sheets   => \%worksheet_xml,
942        themes   => \%themes_xml,
943        ($styles_xml
944            ? (styles  => $styles_xml)
945            : ()),
946        ($strings_xml
947            ? (strings => $strings_xml)
948            : ()),
949    };
950}
951
952sub _parse_xml {
953    my $self = shift;
954    my ($zip, $subfile, $map_xmlns) = @_;
955
956    my $xml = $self->_new_twig;
957    $xml->parse($self->_zip_file_member($zip, $subfile));
958
959    return $xml;
960}
961
962sub _zip_file_member {
963    my $self = shift;
964    my ($zip, $name) = @_;
965
966    my @members = $zip->membersMatching(qr/^$name$/i);
967    die "no subfile named $name" unless @members;
968
969    return scalar $members[0]->contents;
970}
971
972sub _rels_for {
973    my $self = shift;
974    my ($file) = @_;
975
976    my @path = split '/', $file;
977    my $name = pop @path;
978    $name = '' unless defined $name;
979    push @path, '_rels';
980    push @path, "$name.rels";
981
982    return join '/', @path;
983}
984
985sub _base_path_for {
986    my $self = shift;
987    my ($file) = @_;
988
989    my @path = split '/', $file;
990    pop @path;
991
992    return join('/', @path) . '/';
993}
994
995sub _dimensions {
996    my $self = shift;
997    my ($dim) = @_;
998
999    my ($topleft, $bottomright) = split ':', $dim;
1000    $bottomright = $topleft unless defined $bottomright;
1001
1002    my ($rmin, $cmin) = $self->_cell_to_row_col($topleft);
1003    my ($rmax, $cmax) = $self->_cell_to_row_col($bottomright);
1004
1005    return ($rmin, $cmin, $rmax, $cmax);
1006}
1007
1008sub _cell_to_row_col {
1009    my $self = shift;
1010    my ($cell) = @_;
1011
1012    my ($col, $row) = $cell =~ /([A-Z]+)([0-9]+)/;
1013
1014    my $ncol = 0;
1015    for my $char (split //, $col) {
1016        $ncol *= 26;
1017        $ncol += ord($char) - ord('A') + 1;
1018    }
1019    $ncol = $ncol - 1;
1020
1021    my $nrow = $row - 1;
1022
1023    return ($nrow, $ncol);
1024}
1025
1026sub _xml_boolean {
1027    my $self = shift;
1028    my ($bool) = @_;
1029    return defined($bool) && ($bool eq 'true' || $bool eq '1');
1030}
1031
1032sub _color {
1033    my $self = shift;
1034    my ($colors, $color_node, $fill) = @_;
1035
1036    my $color;
1037    if ($color_node && !$self->_xml_boolean($color_node->att('auto'))) {
1038        if (defined $color_node->att('indexed')) {
1039            # see https://rt.cpan.org/Public/Bug/Display.html?id=93065
1040            if ($fill && $color_node->att('indexed') == 64) {
1041                return '#FFFFFF';
1042            }
1043            else {
1044                $color = '#' . Spreadsheet::ParseExcel->ColorIdxToRGB(
1045                    $color_node->att('indexed')
1046                );
1047            }
1048        }
1049        elsif (defined $color_node->att('rgb')) {
1050            $color = '#' . substr($color_node->att('rgb'), 2, 6);
1051        }
1052        elsif (defined $color_node->att('theme')) {
1053            my $theme = $colors->[$color_node->att('theme')];
1054            if (defined $theme) {
1055                $color = "#$theme";
1056            }
1057            else {
1058                return undef;
1059            }
1060        }
1061
1062        $color = $self->_apply_tint($color, $color_node->att('tint'))
1063            if $color_node->att('tint');
1064    }
1065
1066    return $color;
1067}
1068
1069sub _apply_tint {
1070    my $self = shift;
1071    my ($color, $tint) = @_;
1072
1073    my ($r, $g, $b) = map { oct("0x$_") } $color =~ /#(..)(..)(..)/;
1074    my ($h, $l, $s) = rgb2hls($r, $g, $b);
1075
1076    if ($tint < 0) {
1077        $l = $l * (1.0 + $tint);
1078    }
1079    else {
1080        $l = $l * (1.0 - $tint) + (1.0 - 1.0 * (1.0 - $tint));
1081    }
1082
1083    return scalar hls2rgb($h, $l, $s);
1084}
1085
1086sub _new_twig {
1087    my $self = shift;
1088    my %opts = @_;
1089
1090    return XML::Twig->new(
1091        map_xmlns => {
1092            'http://schemas.openxmlformats.org/spreadsheetml/2006/main' => 's',
1093            'http://schemas.openxmlformats.org/package/2006/relationships' => 'packagerels',
1094            'http://schemas.openxmlformats.org/officeDocument/2006/relationships' => 'rels',
1095            'http://schemas.openxmlformats.org/drawingml/2006/main' => 'drawmain',
1096        },
1097        keep_original_prefix => 1,
1098        %opts,
1099    );
1100}
1101
1102
11031;
1104
1105__END__
1106
1107=pod
1108
1109=encoding UTF-8
1110
1111=head1 NAME
1112
1113Spreadsheet::ParseXLSX - parse XLSX files
1114
1115=head1 VERSION
1116
1117version 0.27
1118
1119=head1 SYNOPSIS
1120
1121  use Spreadsheet::ParseXLSX;
1122
1123  my $parser = Spreadsheet::ParseXLSX->new;
1124  my $workbook = $parser->parse("file.xlsx");
1125  # see Spreadsheet::ParseExcel for further documentation
1126
1127=head1 DESCRIPTION
1128
1129This module is an adaptor for L<Spreadsheet::ParseExcel> that reads XLSX files.
1130For documentation about the various data that you can retrieve from these
1131classes, please see L<Spreadsheet::ParseExcel>,
1132L<Spreadsheet::ParseExcel::Workbook>, L<Spreadsheet::ParseExcel::Worksheet>,
1133and L<Spreadsheet::ParseExcel::Cell>.
1134
1135=head1 METHODS
1136
1137=head2 new(%opts)
1138
1139Returns a new parser instance. Takes a hash of parameters:
1140
1141=over 4
1142
1143=item Password
1144
1145Password to use for decrypting encrypted files.
1146
1147=back
1148
1149=head2 parse($file, $formatter)
1150
1151Parses an XLSX file. Parsing errors throw an exception. C<$file> can be either
1152a filename or an open filehandle. Returns a
1153L<Spreadsheet::ParseExcel::Workbook> instance containing the parsed data.
1154The C<$formatter> argument is an optional formatter class as described in L<Spreadsheet::ParseExcel>.
1155
1156=head1 INCOMPATIBILITIES
1157
1158This module returns data using classes from L<Spreadsheet::ParseExcel>, so for
1159the most part, it should just be a drop-in replacement. That said, there are a
1160couple areas where the data returned is intentionally different:
1161
1162=over 4
1163
1164=item Colors
1165
1166In Spreadsheet::ParseExcel, colors are represented by integers which index into
1167the color table, and you have to use
1168C<< Spreadsheet::ParseExcel->ColorIdxToRGB >> in order to get the actual value
1169out. In Spreadsheet::ParseXLSX, while the color table still exists, cells are
1170also allowed to specify their color directly rather than going through the
1171color table. In order to avoid confusion, I normalize all color values in
1172Spreadsheet::ParseXLSX to their string RGB format (C<"#0088ff">). This affects
1173the C<Fill>, C<BdrColor>, and C<BdrDiag> properties of formats, and the
1174C<Color> property of fonts. Note that the default color is represented by
1175C<undef> (the same thing that C<ColorIdxToRGB> would return).
1176
1177=item Formulas
1178
1179Spreadsheet::ParseExcel doesn't support formulas. Spreadsheet::ParseXLSX
1180provides basic formula support by returning the text of the formula as part of
1181the cell data. You can access it via C<< $cell->{Formula} >>. Note that the
1182restriction still holds that formula cell values aren't available unless they
1183were explicitly provided when the spreadsheet was written.
1184
1185=back
1186
1187=head1 BUGS
1188
1189=over 4
1190
1191=item Large spreadsheets may cause segfaults on perl 5.14 and earlier
1192
1193This module internally uses XML::Twig, which makes it potentially subject to
1194L<Bug #71636 for XML-Twig: Segfault with medium-sized document|https://rt.cpan.org/Public/Bug/Display.html?id=71636>
1195on perl versions 5.14 and below (the underlying bug with perl weak references
1196was fixed in perl 5.15.5). The larger and more complex the spreadsheet, the
1197more likely to be affected, but the actual size at which it segfaults is
1198platform dependent. On a 64-bit perl with 7.6gb memory, it was seen on
1199spreadsheets about 300mb and above. You can work around this adding
1200C<XML::Twig::_set_weakrefs(0)> to your code before parsing the spreadsheet,
1201although this may have other consequences such as memory leaks.
1202
1203=item Worksheets without the C<dimension> tag are not supported
1204
1205=item Intra-cell formatting is discarded
1206
1207=item Shared formulas are not supported
1208
1209Shared formula support will require an actual formula parser and quite a bit of
1210custom logic, since the only thing stored in the document is the formula for
1211the base cell - updating the cell references in the formulas in the rest of the
1212cells is handled by the application. Values for these cells are still handled
1213properly.
1214
1215=back
1216
1217In addition, there are still a few areas which are not yet implemented (the
1218XLSX spec is quite large). If you run into any of those, bug reports are quite
1219welcome.
1220
1221Please report any bugs to GitHub Issues at
1222L<https://github.com/doy/spreadsheet-parsexlsx/issues>.
1223
1224=head1 SEE ALSO
1225
1226L<Spreadsheet::ParseExcel>: The equivalent, for XLS files.
1227
1228L<Spreadsheet::XLSX>: An older, less robust and featureful implementation.
1229
1230=head1 SUPPORT
1231
1232You can find this documentation for this module with the perldoc command.
1233
1234    perldoc Spreadsheet::ParseXLSX
1235
1236You can also look for information at:
1237
1238=over 4
1239
1240=item * MetaCPAN
1241
1242L<https://metacpan.org/release/Spreadsheet-ParseXLSX>
1243
1244=item * RT: CPAN's request tracker
1245
1246L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Spreadsheet-ParseXLSX>
1247
1248=item * Github
1249
1250L<https://github.com/doy/spreadsheet-parsexlsx>
1251
1252=item * CPAN Ratings
1253
1254L<http://cpanratings.perl.org/d/Spreadsheet-ParseXLSX>
1255
1256=back
1257
1258=head1 SPONSORS
1259
1260Parts of this code were paid for by
1261
1262=over 4
1263
1264=item Socialflow L<http://socialflow.com>
1265
1266=back
1267
1268=head1 AUTHOR
1269
1270Jesse Luehrs <doy@tozt.net>
1271
1272=head1 COPYRIGHT AND LICENSE
1273
1274This software is Copyright (c) 2016 by Jesse Luehrs.
1275
1276This is free software, licensed under:
1277
1278  The MIT (X11) License
1279
1280=cut
1281