1package Spreadsheet::ParseXLSX; 2our $AUTHORITY = 'cpan:DOY'; 3$Spreadsheet::ParseXLSX::VERSION = '0.27'; 4use strict; 5use warnings; 6use 5.010; 7# ABSTRACT: parse XLSX files 8 9use Archive::Zip; 10use Graphics::ColorUtils 'rgb2hls', 'hls2rgb'; 11use Scalar::Util 'openhandle'; 12use Spreadsheet::ParseExcel 0.61; 13use XML::Twig; 14 15use Spreadsheet::ParseXLSX::Decryptor; 16 17 18 19sub new { 20 my $class = shift; 21 my (%args) = @_; 22 23 my $self = bless {}, $class; 24 $self->{Password} = $args{Password} if defined $args{Password}; 25 26 return $self; 27} 28 29 30sub parse { 31 my $self = shift; 32 my ($file, $formatter) = @_; 33 34 my $zip = Archive::Zip->new; 35 my $workbook = Spreadsheet::ParseExcel::Workbook->new; 36 37 if ($self->_check_signature($file)) { 38 my $decrypted_file = Spreadsheet::ParseXLSX::Decryptor->open( 39 $file, 40 $self->{Password} 41 ); 42 $file = $decrypted_file if $decrypted_file; 43 } 44 45 if (openhandle($file)) { 46 bless $file, 'IO::File' if ref($file) eq 'GLOB'; # sigh 47 my $fh = ref($file) eq 'File::Temp' 48 ? IO::File->new("<&=" . fileno($file)) 49 : $file; 50 $zip->readFromFileHandle($fh) == Archive::Zip::AZ_OK 51 or die "Can't open filehandle as a zip file"; 52 $workbook->{File} = undef; 53 $workbook->{__tempfile} = $file; 54 } 55 elsif (ref($file) eq 'SCALAR') { 56 open my $fh, '+<', $file 57 or die "Can't create filehandle from memory data"; 58 $zip->readFromFileHandle($fh) == Archive::Zip::AZ_OK 59 or die "Can't open scalar ref as a zip file"; 60 $workbook->{File} = undef; 61 } 62 elsif (!ref($file)) { 63 $zip->read($file) == Archive::Zip::AZ_OK 64 or die "Can't open file '$file' as a zip file"; 65 $workbook->{File} = $file; 66 } 67 else { 68 die "Argument to 'new' must be a filename, open filehandle, or scalar ref"; 69 } 70 71 return $self->_parse_workbook($zip, $workbook, $formatter); 72} 73 74sub _check_signature { 75 my $self = shift; 76 my ($file) = @_; 77 78 my $signature = ''; 79 if (openhandle($file)) { 80 bless $file, 'IO::File' if ref($file) eq 'GLOB'; # sigh 81 $file->read($signature, 2); 82 $file->seek(-2, IO::File::SEEK_CUR); 83 } 84 elsif (ref($file) eq 'SCALAR') { 85 $signature = substr($$file, 0, 2); 86 } 87 elsif (!ref($file)) { 88 my $fh = IO::File->new($file, 'r'); 89 $fh->read($signature, 2); 90 $fh->close; 91 } 92 93 return $signature eq "\xd0\xcf"; 94} 95 96sub _parse_workbook { 97 my $self = shift; 98 my ($zip, $workbook, $formatter) = @_; 99 100 my $files = $self->_extract_files($zip); 101 102 my ($version) = $files->{workbook}->find_nodes('//s:fileVersion'); 103 my ($properties) = $files->{workbook}->find_nodes('//s:workbookPr'); 104 105 if ($version) { 106 $workbook->{Version} = $version->att('appName') 107 . ($version->att('lowestEdited') 108 ? ('-' . $version->att('lowestEdited')) 109 : ("")); 110 } 111 112 $workbook->{Flg1904} = $self->_xml_boolean($properties->att('date1904')) 113 if $properties; 114 115 $workbook->{FmtClass} = $formatter || Spreadsheet::ParseExcel::FmtDefault->new; 116 117 my $themes = $self->_parse_themes((values %{ $files->{themes} })[0]); # XXX 118 119 $workbook->{Color} = $themes->{Color}; 120 121 my $styles = $self->_parse_styles($workbook, $files->{styles}); 122 123 $workbook->{Format} = $styles->{Format}; 124 $workbook->{FormatStr} = $styles->{FormatStr}; 125 $workbook->{Font} = $styles->{Font}; 126 127 if ($files->{strings}) { 128 my %string_parse_data = $self->_parse_shared_strings( 129 $files->{strings}, 130 $themes->{Color} 131 ); 132 $workbook->{PkgStr} = $string_parse_data{PkgStr}; 133 $workbook->{Rich} = $string_parse_data{Rich}; 134 } 135 136 # $workbook->{StandardWidth} = ...; 137 138 # $workbook->{Author} = ...; 139 140 # $workbook->{PrintArea} = ...; 141 # $workbook->{PrintTitle} = ...; 142 143 my @sheets = map { 144 my $idx = $_->att('rels:id'); 145 if ($files->{sheets}{$idx}) { 146 my $sheet = Spreadsheet::ParseExcel::Worksheet->new( 147 Name => $_->att('name'), 148 _Book => $workbook, 149 _SheetNo => $idx, 150 ); 151 $sheet->{SheetHidden} = 1 if defined $_->att('state') and $_->att('state') eq 'hidden'; 152 $self->_parse_sheet($sheet, $files->{sheets}{$idx}); 153 ($sheet) 154 } else { 155 () 156 } 157 } $files->{workbook}->find_nodes('//s:sheets/s:sheet'); 158 159 $workbook->{Worksheet} = \@sheets; 160 $workbook->{SheetCount} = scalar(@sheets); 161 162 my ($node) = $files->{workbook}->find_nodes('//s:workbookView'); 163 my $selected = $node ? $node->att('activeTab') : undef; 164 $workbook->{SelectedSheet} = defined($selected) ? 0+$selected : 0; 165 166 return $workbook; 167} 168 169sub _parse_sheet { 170 my $self = shift; 171 my ($sheet, $sheet_file) = @_; 172 173 $sheet->{MinRow} = 0; 174 $sheet->{MinCol} = 0; 175 $sheet->{MaxRow} = -1; 176 $sheet->{MaxCol} = -1; 177 $sheet->{Selection} = [ 0, 0 ]; 178 179 my %merged_cells; 180 181 my @column_formats; 182 my @column_widths; 183 my @columns_hidden; 184 my @row_heights; 185 my @rows_hidden; 186 187 my $default_row_height = 15; 188 my $default_column_width = 10; 189 190 my %cells; 191 my $row_idx = 0; 192 193 my $sheet_xml = $self->_new_twig( 194 twig_roots => { 195 #XXX need a fallback here, the dimension tag is optional 196 's:dimension' => sub { 197 my ($twig, $dimension) = @_; 198 199 my ($rmin, $cmin, $rmax, $cmax) = $self->_dimensions( 200 $dimension->att('ref') 201 ); 202 203 $sheet->{MinRow} = $rmin; 204 $sheet->{MinCol} = $cmin; 205 $sheet->{MaxRow} = $rmax ? $rmax : -1; 206 $sheet->{MaxCol} = $cmax ? $cmax : -1; 207 208 $twig->purge; 209 }, 210 211 's:headerFooter' => sub { 212 my ($twig, $hf) = @_; 213 214 my ($helem, $felem) = map { 215 $hf->first_child("s:$_") 216 } qw(oddHeader oddFooter); 217 $sheet->{Header} = $helem->text 218 if $helem; 219 $sheet->{Footer} = $felem->text 220 if $felem; 221 222 $twig->purge; 223 }, 224 225 's:pageMargins' => sub { 226 my ($twig, $margin) = @_; 227 map { 228 my $key = "\u${_}Margin"; 229 $sheet->{$key} = defined $margin->att($_) 230 ? $margin->att($_) : 0 231 } qw(left right top bottom header footer); 232 233 $twig->purge; 234 }, 235 236 's:pageSetup' => sub { 237 my ($twig, $setup) = @_; 238 $sheet->{Scale} = defined $setup->att('scale') 239 ? $setup->att('scale') 240 : 100; 241 $sheet->{Landscape} = ($setup->att('orientation') || '') ne 'landscape'; 242 $sheet->{PaperSize} = defined $setup->att('paperSize') 243 ? $setup->att('paperSize') 244 : 1; 245 $sheet->{PageStart} = $setup->att('firstPageNumber'); 246 $sheet->{UsePage} = $self->_xml_boolean($setup->att('useFirstPageNumber')); 247 $sheet->{HorizontalDPI} = $setup->att('horizontalDpi'); 248 $sheet->{VerticalDPI} = $setup->att('verticalDpi'); 249 250 $twig->purge; 251 }, 252 253 's:mergeCells/s:mergeCell' => sub { 254 my ( $twig, $merge_area ) = @_; 255 256 if (my $ref = $merge_area->att('ref')) { 257 my ($topleft, $bottomright) = $ref =~ /([^:]+):([^:]+)/; 258 259 my ($toprow, $leftcol) = $self->_cell_to_row_col($topleft); 260 my ($bottomrow, $rightcol) = $self->_cell_to_row_col($bottomright); 261 262 push @{ $sheet->{MergedArea} }, [ 263 $toprow, $leftcol, 264 $bottomrow, $rightcol, 265 ]; 266 for my $row ($toprow .. $bottomrow) { 267 for my $col ($leftcol .. $rightcol) { 268 $merged_cells{"$row;$col"} = 1; 269 } 270 } 271 } 272 273 $twig->purge; 274 }, 275 276 's:sheetFormatPr' => sub { 277 my ( $twig, $format ) = @_; 278 279 $default_row_height = $format->att('defaultRowHeight') 280 unless defined $default_row_height; 281 $default_column_width = $format->att('baseColWidth') 282 unless defined $default_column_width; 283 284 $twig->purge; 285 }, 286 287 's:col' => sub { 288 my ( $twig, $col ) = @_; 289 290 for my $colnum ($col->att('min')..$col->att('max')) { 291 $column_widths[$colnum - 1] = $col->att('width'); 292 $column_formats[$colnum - 1] = $col->att('style'); 293 $columns_hidden[$colnum - 1] = $self->_xml_boolean($col->att('hidden')); 294 } 295 296 $twig->purge; 297 }, 298 299 's:selection' => sub { 300 my ( $twig, $selection ) = @_; 301 302 if (my $cell = $selection->att('activeCell')) { 303 $sheet->{Selection} = [ $self->_cell_to_row_col($cell) ]; 304 } 305 elsif (my $range = $selection->att('sqref')) { 306 my ($topleft, $bottomright) = $range =~ /([^:]+):([^:]+)/; 307 $sheet->{Selection} = [ 308 $self->_cell_to_row_col($topleft), 309 $self->_cell_to_row_col($bottomright), 310 ]; 311 } 312 313 $twig->purge; 314 }, 315 316 's:sheetPr/s:tabColor' => sub { 317 my ( $twig, $tab_color ) = @_; 318 319 $sheet->{TabColor} = $self->_color($sheet->{_Book}{Color}, $tab_color); 320 321 $twig->purge; 322 }, 323 324 's:sheetData/s:row' => sub { 325 my ( $twig, $row_elt ) = @_; 326 my $explicit_row_idx = $row_elt->att('r'); 327 $row_idx = $explicit_row_idx - 1 if defined $explicit_row_idx; 328 329 $row_heights[$row_idx] = $row_elt->att('ht'); 330 $rows_hidden[$row_idx] = $self->_xml_boolean($row_elt->att('hidden')); 331 332 my $col_idx = 0; 333 for my $cell ( $row_elt->children('s:c') ){ 334 my $loc = $cell->att('r'); 335 my ($row, $col); 336 if ($loc) { 337 ($row, $col) = $self->_cell_to_row_col($loc); 338 if ($row != $row_idx) { 339 warn "mismatched coords: got $loc for cell in row $row_idx"; 340 } 341 $col_idx = $col - 1; 342 } 343 else { 344 ($row, $col) = ($row_idx, $col_idx); 345 } 346 $sheet->{MaxRow} = $row 347 if $sheet->{MaxRow} < $row; 348 $sheet->{MaxCol} = $col 349 if $sheet->{MaxCol} < $col; 350 my $type = $cell->att('t') || 'n'; 351 my $val_xml; 352 if ($type ne 'inlineStr') { 353 $val_xml = $cell->first_child('s:v'); 354 } 355 elsif (defined $cell->first_child('s:is')) { 356 $val_xml = ($cell->find_nodes('.//s:t'))[0]; 357 } 358 my $val = $val_xml ? $val_xml->text : undef; 359 360 my $long_type; 361 my $Rich; 362 if (!defined($val)) { 363 $long_type = 'Text'; 364 $val = ''; 365 } 366 elsif ($type eq 's') { 367 $long_type = 'Text'; 368 $Rich = $sheet->{_Book}{Rich}->{$val}; 369 $val = $sheet->{_Book}{PkgStr}[$val]; 370 } 371 elsif ($type eq 'n') { 372 $long_type = 'Numeric'; 373 $val = defined($val) ? 0+$val : undef; 374 } 375 elsif ($type eq 'd') { 376 $long_type = 'Date'; 377 } 378 elsif ($type eq 'b') { 379 $long_type = 'Text'; 380 $val = $val ? "TRUE" : "FALSE"; 381 } 382 elsif ($type eq 'e') { 383 $long_type = 'Text'; 384 } 385 elsif ($type eq 'str' || $type eq 'inlineStr') { 386 $long_type = 'Text'; 387 } 388 else { 389 die "unimplemented type $type"; # XXX 390 } 391 392 my $format_idx = $cell->att('s') || 0; 393 my $format = $sheet->{_Book}{Format}[$format_idx]; 394 die "unknown format $format_idx" unless $format; 395 396 # see the list of built-in formats below in _parse_styles 397 # XXX probably should figure this out from the actual format string, 398 # but that's not entirely trivial 399 if (grep { $format->{FmtIdx} == $_ } 14..22, 45..47) { 400 $long_type = 'Date'; 401 } 402 403 my $formula = $cell->first_child('s:f'); 404 my $cell = Spreadsheet::ParseExcel::Cell->new( 405 Val => $val, 406 Type => $long_type, 407 Merged => undef, # fix up later 408 Format => $format, 409 FormatNo => $format_idx, 410 ($formula 411 ? (Formula => $formula->text) 412 : ()), 413 Rich => $Rich, 414 ); 415 $cell->{_Value} = $sheet->{_Book}{FmtClass}->ValFmt( 416 $cell, $sheet->{_Book} 417 ); 418 $cells{"$row;$col"} = $cell; 419 $sheet->{Cells}[$row][$col] = $cell; 420 $col_idx++; 421 } 422 423 $twig->purge; 424 $row_idx++; 425 }, 426 } 427 ); 428 429 $sheet_xml->parse( $sheet_file ); 430 431 for my $key (keys %merged_cells) { 432 $cells{$key}{Merged} = 1 if $cells{$key}; 433 } 434 435 if ( ! $sheet->{Cells} ){ 436 $sheet->{MaxRow} = $sheet->{MaxCol} = -1; 437 } 438 439 $sheet->{DefRowHeight} = 0+$default_row_height; 440 $sheet->{DefColWidth} = 0+$default_column_width; 441 $sheet->{RowHeight} = [ 442 map { defined $_ ? 0+$_ : 0+$default_row_height } @row_heights 443 ]; 444 $sheet->{RowHidden} = \@rows_hidden; 445 $sheet->{ColWidth} = [ 446 map { defined $_ ? 0+$_ : 0+$default_column_width } @column_widths 447 ]; 448 $sheet->{ColFmtNo} = \@column_formats; 449 $sheet->{ColHidden} = \@columns_hidden; 450 451} 452 453sub _get_text_and_rich_font_by_cell { 454 my $self = shift; 455 my ($si, $theme_colors) = @_; 456 457 # XXX 458 my %default_font_opts = ( 459 Height => 12, 460 Color => '#000000', 461 Name => '', 462 Bold => 0, 463 Italic => 0, 464 Underline => 0, 465 UnderlineStyle => 0, 466 Strikeout => 0, 467 Super => 0, 468 ); 469 470 my $string_text = ''; 471 my @rich_font_by_cell; 472 my @nodes_r = $si->find_nodes('.//s:r'); 473 if (@nodes_r > 0) { 474 for my $chunk (map { $_->children } @nodes_r) { 475 my $string_length = length($string_text); 476 if ($chunk->name eq 's:t') { 477 if (!@rich_font_by_cell) { 478 push @rich_font_by_cell, [ 479 $string_length, 480 Spreadsheet::ParseExcel::Font->new(%default_font_opts) 481 ]; 482 } 483 $string_text .= $chunk->text; 484 } 485 elsif ($chunk->name eq 's:rPr') { 486 my %format_text = %default_font_opts; 487 for my $node_format ($chunk->children) { 488 if ($node_format->name eq 's:sz') { 489 $format_text{Height} = $node_format->att('val'); 490 } 491 elsif ($node_format->name eq 's:color') { 492 $format_text{Color} = $self->_color( 493 $theme_colors, 494 $node_format 495 ); 496 } 497 elsif ($node_format->name eq 's:rFont') { 498 $format_text{Name} = $node_format->att('val'); 499 } 500 elsif ($node_format->name eq 's:b') { 501 $format_text{Bold} = 1; 502 } 503 elsif ($node_format->name eq 's:i') { 504 $format_text{Italic} = 1; 505 } 506 elsif ($node_format->name eq 's:u') { 507 $format_text{Underline} = 1; 508 if (defined $node_format->att('val')) { 509 $format_text{UnderlineStyle} = 2; 510 } else { 511 $format_text{UnderlineStyle} = 1; 512 } 513 } 514 elsif ($node_format->name eq 's:strike') { 515 $format_text{Strikeout} = 1; 516 } 517 elsif ($node_format->name eq 's:vertAlign') { 518 if ($node_format->att('val') eq 'superscript') { 519 $format_text{Super} = 1; 520 } 521 elsif ($node_format->att('val') eq 'subscript') { 522 $format_text{Super} = 2; 523 } 524 } 525 } 526 push @rich_font_by_cell, [ 527 $string_length, 528 Spreadsheet::ParseExcel::Font->new(%format_text) 529 ]; 530 } 531 } 532 } 533 else { 534 $string_text = join '', map { $_->text } $si->find_nodes('.//s:t'); 535 } 536 537 return ( 538 String => $string_text, 539 Rich => \@rich_font_by_cell, 540 ); 541} 542 543sub _parse_shared_strings { 544 my $self = shift; 545 my ($strings, $theme_colors) = @_; 546 547 my $PkgStr = []; 548 549 my %richfonts; 550 if ($strings) { 551 my $xml = $self->_new_twig( 552 twig_handlers => { 553 's:si' => sub { 554 my ( $twig, $si ) = @_; 555 556 my %text_rich = $self->_get_text_and_rich_font_by_cell( 557 $si, 558 $theme_colors 559 ); 560 $richfonts{scalar @$PkgStr} = $text_rich{Rich}; 561 push @$PkgStr, $text_rich{String}; 562 $twig->purge; 563 }, 564 } 565 ); 566 $xml->parse( $strings ); 567 } 568 return ( 569 Rich => \%richfonts, 570 PkgStr => $PkgStr, 571 ); 572} 573 574sub _parse_themes { 575 my $self = shift; 576 my ($themes) = @_; 577 578 return {} unless $themes; 579 580 my @color = map { 581 $_->name eq 'drawmain:sysClr' ? $_->att('lastClr') : $_->att('val') 582 } $themes->find_nodes('//drawmain:clrScheme/*/*'); 583 584 # this shouldn't be necessary, but the documentation is wrong here 585 # see http://stackoverflow.com/questions/2760976/theme-confusion-in-spreadsheetml 586 ($color[0], $color[1]) = ($color[1], $color[0]); 587 ($color[2], $color[3]) = ($color[3], $color[2]); 588 589 return { 590 Color => \@color, 591 } 592} 593 594sub _parse_styles { 595 my $self = shift; 596 my ($workbook, $styles) = @_; 597 598 # these defaults are from 599 # http://social.msdn.microsoft.com/Forums/en-US/oxmlsdk/thread/e27aaf16-b900-4654-8210-83c5774a179c 600 my %default_format_str = ( 601 0 => 'GENERAL', 602 1 => '0', 603 2 => '0.00', 604 3 => '#,##0', 605 4 => '#,##0.00', 606 5 => '$#,##0_);($#,##0)', 607 6 => '$#,##0_);[Red]($#,##0)', 608 7 => '$#,##0.00_);($#,##0.00)', 609 8 => '$#,##0.00_);[Red]($#,##0.00)', 610 9 => '0%', 611 10 => '0.00%', 612 11 => '0.00E+00', 613 12 => '# ?/?', 614 13 => '# ??/??', 615 14 => 'm/d/yyyy', 616 15 => 'd-mmm-yy', 617 16 => 'd-mmm', 618 17 => 'mmm-yy', 619 18 => 'h:mm AM/PM', 620 19 => 'h:mm:ss AM/PM', 621 20 => 'h:mm', 622 21 => 'h:mm:ss', 623 22 => 'm/d/yyyy h:mm', 624 37 => '#,##0_);(#,##0)', 625 38 => '#,##0_);[Red](#,##0)', 626 39 => '#,##0.00_);(#,##0.00)', 627 40 => '#,##0.00_);[Red](#,##0.00)', 628 45 => 'mm:ss', 629 46 => '[h]:mm:ss', 630 47 => 'mm:ss.0', 631 48 => '##0.0E+0', 632 49 => '@', 633 ); 634 635 my %default_format_opts = ( 636 IgnoreFont => 1, 637 IgnoreFill => 1, 638 IgnoreBorder => 1, 639 IgnoreAlignment => 1, 640 IgnoreNumberFormat => 1, 641 IgnoreProtection => 1, 642 FontNo => 0, 643 FmtIdx => 0, 644 Lock => 1, 645 Hidden => 0, 646 AlignH => 0, 647 Wrap => 0, 648 AlignV => 2, 649 Rotate => 0, 650 Indent => 0, 651 Shrink => 0, 652 BdrStyle => [0, 0, 0, 0], 653 BdrColor => [undef, undef, undef, undef], 654 BdrDiag => [0, 0, undef], 655 Fill => [0, undef, undef], 656 ); 657 658 if (!$styles) { 659 # XXX i guess? 660 my $font = Spreadsheet::ParseExcel::Font->new( 661 Height => 12, 662 Color => '#000000', 663 Name => '', 664 ); 665 my $format = Spreadsheet::ParseExcel::Format->new( 666 %default_format_opts, 667 Font => $font, 668 ); 669 670 return { 671 FormatStr => \%default_format_str, 672 Font => [ $font ], 673 Format => [ $format ], 674 }; 675 } 676 677 my %halign = ( 678 center => 2, 679 centerContinuous => 6, 680 distributed => 7, 681 fill => 4, 682 general => 0, 683 justify => 5, 684 left => 1, 685 right => 3, 686 ); 687 688 my %valign = ( 689 bottom => 2, 690 center => 1, 691 distributed => 4, 692 justify => 3, 693 top => 0, 694 ); 695 696 my %border = ( 697 dashDot => 9, 698 dashDotDot => 11, 699 dashed => 3, 700 dotted => 4, 701 double => 6, 702 hair => 7, 703 medium => 2, 704 mediumDashDot => 10, 705 mediumDashDotDot => 12, 706 mediumDashed => 8, 707 none => 0, 708 slantDashDot => 13, 709 thick => 5, 710 thin => 1, 711 ); 712 713 my %fill = ( 714 darkDown => 7, 715 darkGray => 3, 716 darkGrid => 9, 717 darkHorizontal => 5, 718 darkTrellis => 10, 719 darkUp => 8, 720 darkVertical => 6, 721 gray0625 => 18, 722 gray125 => 17, 723 lightDown => 13, 724 lightGray => 4, 725 lightGrid => 15, 726 lightHorizontal => 11, 727 lightTrellis => 16, 728 lightUp => 14, 729 lightVertical => 12, 730 mediumGray => 2, 731 none => 0, 732 solid => 1, 733 ); 734 735 my @fills = map { 736 my $pattern_type = $_->att('patternType'); 737 [ 738 ($pattern_type ? $fill{$pattern_type} : 0), 739 $self->_color($workbook->{Color}, $_->first_child('s:fgColor'), 1), 740 $self->_color($workbook->{Color}, $_->first_child('s:bgColor'), 1), 741 ] 742 } $styles->find_nodes('//s:fills/s:fill/s:patternFill'); 743 744 my @borders = map { 745 my $border = $_; 746 my ($ddiag, $udiag) = map { 747 $self->_xml_boolean($border->att($_)) 748 } qw(diagonalDown diagonalUp); 749 my %borderstyles = map { 750 my $e = $border->first_child("s:$_"); 751 $_ => ($e ? $e->att('style') || 'none' : 'none') 752 } qw(left right top bottom diagonal); 753 my %bordercolors = map { 754 my $e = $border->first_child("s:$_"); 755 $_ => ($e ? $e->first_child('s:color') : undef) 756 } qw(left right top bottom diagonal); 757 # XXX specs say "begin" and "end" rather than "left" and "right", 758 # but... that's not what seems to be in the file itself (sigh) 759 { 760 colors => [ 761 map { 762 $self->_color($workbook->{Color}, $bordercolors{$_}) 763 } qw(left right top bottom) 764 ], 765 styles => [ 766 map { 767 $border{$borderstyles{$_}} 768 } qw(left right top bottom) 769 ], 770 diagonal => [ 771 ( $ddiag && $udiag ? 3 772 : $ddiag && !$udiag ? 2 773 : !$ddiag && $udiag ? 1 774 : 0), 775 $border{$borderstyles{diagonal}}, 776 $self->_color($workbook->{Color}, $bordercolors{diagonal}), 777 ], 778 } 779 } $styles->find_nodes('//s:borders/s:border'); 780 781 my %format_str = ( 782 %default_format_str, 783 (map { 784 $_->att('numFmtId') => $_->att('formatCode') 785 } $styles->find_nodes('//s:numFmts/s:numFmt')), 786 ); 787 788 my @font = map { 789 my $vert = $_->first_child('s:vertAlign'); 790 my $under = $_->first_child('s:u'); 791 my $heightelem = $_->first_child('s:sz'); 792 # XXX i guess 12 is okay? 793 my $height = 0+($heightelem ? $heightelem->att('val') : 12); 794 my $nameelem = $_->first_child('s:name'); 795 my $name = $nameelem ? $nameelem->att('val') : ''; 796 Spreadsheet::ParseExcel::Font->new( 797 Height => $height, 798 # Attr => $iAttr, 799 # XXX not sure if there's a better way to keep the indexing stuff 800 # intact rather than just going straight to #xxxxxx 801 # XXX also not sure what it means for the color tag to be missing, 802 # just assuming black for now 803 Color => ($_->first_child('s:color') 804 ? $self->_color( 805 $workbook->{Color}, 806 $_->first_child('s:color') 807 ) 808 : '#000000' 809 ), 810 Super => ($vert 811 ? ($vert->att('val') eq 'superscript' ? 1 812 : $vert->att('val') eq 'subscript' ? 2 813 : 0) 814 : 0 815 ), 816 # XXX not sure what the single accounting and double accounting 817 # underline styles map to in xlsx. also need to map the new 818 # underline styles 819 UnderlineStyle => ($under 820 # XXX sometimes style xml files can contain just <u/> with no 821 # val attribute. i think this means single underline, but not 822 # sure 823 ? (!$under->att('val') ? 1 824 : $under->att('val') eq 'single' ? 1 825 : $under->att('val') eq 'double' ? 2 826 : 0) 827 : 0 828 ), 829 Name => $name, 830 831 Bold => $_->has_child('s:b') ? 1 : 0, 832 Italic => $_->has_child('s:i') ? 1 : 0, 833 Underline => $_->has_child('s:u') ? 1 : 0, 834 Strikeout => $_->has_child('s:strike') ? 1 : 0, 835 ) 836 } $styles->find_nodes('//s:fonts/s:font'); 837 838 my @format = map { 839 my $xml_fmt = $_; 840 my $alignment = $xml_fmt->first_child('s:alignment'); 841 my $protection = $xml_fmt->first_child('s:protection'); 842 my %ignore = map { 843 ("Ignore$_" => !$self->_xml_boolean($xml_fmt->att("apply$_"))) 844 } qw(Font Fill Border Alignment NumberFormat Protection); 845 my %opts = ( 846 %default_format_opts, 847 %ignore, 848 ); 849 850 $opts{FmtIdx} = 0+($xml_fmt->att('numFmtId')||0); 851 $opts{FontNo} = 0+($xml_fmt->att('fontId')||0); 852 $opts{Font} = $font[$opts{FontNo}]; 853 $opts{Fill} = $fills[$xml_fmt->att('fillId')||0]; 854 $opts{BdrStyle} = $borders[$xml_fmt->att('borderId')||0]{styles}; 855 $opts{BdrColor} = $borders[$xml_fmt->att('borderId')||0]{colors}; 856 $opts{BdrDiag} = $borders[$xml_fmt->att('borderId')||0]{diagonal}; 857 858 if ($alignment) { 859 $opts{AlignH} = $halign{$alignment->att('horizontal') || 'general'}; 860 $opts{Wrap} = $self->_xml_boolean($alignment->att('wrapText')); 861 $opts{AlignV} = $valign{$alignment->att('vertical') || 'bottom'}; 862 $opts{Rotate} = $alignment->att('textRotation'); 863 $opts{Indent} = $alignment->att('indent'); 864 $opts{Shrink} = $self->_xml_boolean($alignment->att('shrinkToFit')); 865 # JustLast => $iJustL, 866 } 867 868 if ($protection) { 869 $opts{Lock} = defined $protection->att('locked') 870 ? $self->_xml_boolean($protection->att('locked')) 871 : 1; 872 $opts{Hidden} = $self->_xml_boolean($protection->att('hidden')); 873 } 874 875 # Style => $iStyle, 876 # Key123 => $i123, 877 # Merge => $iMerge, 878 # ReadDir => $iReadDir, 879 Spreadsheet::ParseExcel::Format->new(%opts) 880 } $styles->find_nodes('//s:cellXfs/s:xf'); 881 882 return { 883 FormatStr => \%format_str, 884 Font => \@font, 885 Format => \@format, 886 } 887} 888 889sub _extract_files { 890 my $self = shift; 891 my ($zip) = @_; 892 893 my $type_base = 894 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'; 895 896 my $rels = $self->_parse_xml( 897 $zip, 898 $self->_rels_for(''), 899 ); 900 my $wb_name = ($rels->find_nodes( 901 qq<//packagerels:Relationship[\@Type="$type_base/officeDocument"]> 902 ))[0]->att('Target'); 903 $wb_name =~ s{^/}{}; 904 my $wb_xml = $self->_parse_xml($zip, $wb_name); 905 906 my $path_base = $self->_base_path_for($wb_name); 907 my $wb_rels = $self->_parse_xml( 908 $zip, 909 $self->_rels_for($wb_name), 910 ); 911 912 my $get_path = sub { 913 my ($p) = @_; 914 915 return $p =~ s{^/}{} 916 ? $p 917 : $path_base . $p; 918 }; 919 920 my ($strings_xml) = map { 921 $self->_zip_file_member($zip, $get_path->($_->att('Target'))) 922 } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/sharedStrings"]>); 923 924 my ($styles_xml) = map { 925 $self->_parse_xml( 926 $zip, 927 $get_path->($_->att('Target')) 928 ) 929 } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/styles"]>); 930 931 my %worksheet_xml = map { 932 ($_->att('Id') => $self->_zip_file_member($zip, $get_path->($_->att('Target')))) 933 } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/worksheet"]>); 934 935 my %themes_xml = map { 936 $_->att('Id') => $self->_parse_xml($zip, $get_path->($_->att('Target'))) 937 } $wb_rels->find_nodes(qq<//packagerels:Relationship[\@Type="$type_base/theme"]>); 938 939 return { 940 workbook => $wb_xml, 941 sheets => \%worksheet_xml, 942 themes => \%themes_xml, 943 ($styles_xml 944 ? (styles => $styles_xml) 945 : ()), 946 ($strings_xml 947 ? (strings => $strings_xml) 948 : ()), 949 }; 950} 951 952sub _parse_xml { 953 my $self = shift; 954 my ($zip, $subfile, $map_xmlns) = @_; 955 956 my $xml = $self->_new_twig; 957 $xml->parse($self->_zip_file_member($zip, $subfile)); 958 959 return $xml; 960} 961 962sub _zip_file_member { 963 my $self = shift; 964 my ($zip, $name) = @_; 965 966 my @members = $zip->membersMatching(qr/^$name$/i); 967 die "no subfile named $name" unless @members; 968 969 return scalar $members[0]->contents; 970} 971 972sub _rels_for { 973 my $self = shift; 974 my ($file) = @_; 975 976 my @path = split '/', $file; 977 my $name = pop @path; 978 $name = '' unless defined $name; 979 push @path, '_rels'; 980 push @path, "$name.rels"; 981 982 return join '/', @path; 983} 984 985sub _base_path_for { 986 my $self = shift; 987 my ($file) = @_; 988 989 my @path = split '/', $file; 990 pop @path; 991 992 return join('/', @path) . '/'; 993} 994 995sub _dimensions { 996 my $self = shift; 997 my ($dim) = @_; 998 999 my ($topleft, $bottomright) = split ':', $dim; 1000 $bottomright = $topleft unless defined $bottomright; 1001 1002 my ($rmin, $cmin) = $self->_cell_to_row_col($topleft); 1003 my ($rmax, $cmax) = $self->_cell_to_row_col($bottomright); 1004 1005 return ($rmin, $cmin, $rmax, $cmax); 1006} 1007 1008sub _cell_to_row_col { 1009 my $self = shift; 1010 my ($cell) = @_; 1011 1012 my ($col, $row) = $cell =~ /([A-Z]+)([0-9]+)/; 1013 1014 my $ncol = 0; 1015 for my $char (split //, $col) { 1016 $ncol *= 26; 1017 $ncol += ord($char) - ord('A') + 1; 1018 } 1019 $ncol = $ncol - 1; 1020 1021 my $nrow = $row - 1; 1022 1023 return ($nrow, $ncol); 1024} 1025 1026sub _xml_boolean { 1027 my $self = shift; 1028 my ($bool) = @_; 1029 return defined($bool) && ($bool eq 'true' || $bool eq '1'); 1030} 1031 1032sub _color { 1033 my $self = shift; 1034 my ($colors, $color_node, $fill) = @_; 1035 1036 my $color; 1037 if ($color_node && !$self->_xml_boolean($color_node->att('auto'))) { 1038 if (defined $color_node->att('indexed')) { 1039 # see https://rt.cpan.org/Public/Bug/Display.html?id=93065 1040 if ($fill && $color_node->att('indexed') == 64) { 1041 return '#FFFFFF'; 1042 } 1043 else { 1044 $color = '#' . Spreadsheet::ParseExcel->ColorIdxToRGB( 1045 $color_node->att('indexed') 1046 ); 1047 } 1048 } 1049 elsif (defined $color_node->att('rgb')) { 1050 $color = '#' . substr($color_node->att('rgb'), 2, 6); 1051 } 1052 elsif (defined $color_node->att('theme')) { 1053 my $theme = $colors->[$color_node->att('theme')]; 1054 if (defined $theme) { 1055 $color = "#$theme"; 1056 } 1057 else { 1058 return undef; 1059 } 1060 } 1061 1062 $color = $self->_apply_tint($color, $color_node->att('tint')) 1063 if $color_node->att('tint'); 1064 } 1065 1066 return $color; 1067} 1068 1069sub _apply_tint { 1070 my $self = shift; 1071 my ($color, $tint) = @_; 1072 1073 my ($r, $g, $b) = map { oct("0x$_") } $color =~ /#(..)(..)(..)/; 1074 my ($h, $l, $s) = rgb2hls($r, $g, $b); 1075 1076 if ($tint < 0) { 1077 $l = $l * (1.0 + $tint); 1078 } 1079 else { 1080 $l = $l * (1.0 - $tint) + (1.0 - 1.0 * (1.0 - $tint)); 1081 } 1082 1083 return scalar hls2rgb($h, $l, $s); 1084} 1085 1086sub _new_twig { 1087 my $self = shift; 1088 my %opts = @_; 1089 1090 return XML::Twig->new( 1091 map_xmlns => { 1092 'http://schemas.openxmlformats.org/spreadsheetml/2006/main' => 's', 1093 'http://schemas.openxmlformats.org/package/2006/relationships' => 'packagerels', 1094 'http://schemas.openxmlformats.org/officeDocument/2006/relationships' => 'rels', 1095 'http://schemas.openxmlformats.org/drawingml/2006/main' => 'drawmain', 1096 }, 1097 keep_original_prefix => 1, 1098 %opts, 1099 ); 1100} 1101 1102 11031; 1104 1105__END__ 1106 1107=pod 1108 1109=encoding UTF-8 1110 1111=head1 NAME 1112 1113Spreadsheet::ParseXLSX - parse XLSX files 1114 1115=head1 VERSION 1116 1117version 0.27 1118 1119=head1 SYNOPSIS 1120 1121 use Spreadsheet::ParseXLSX; 1122 1123 my $parser = Spreadsheet::ParseXLSX->new; 1124 my $workbook = $parser->parse("file.xlsx"); 1125 # see Spreadsheet::ParseExcel for further documentation 1126 1127=head1 DESCRIPTION 1128 1129This module is an adaptor for L<Spreadsheet::ParseExcel> that reads XLSX files. 1130For documentation about the various data that you can retrieve from these 1131classes, please see L<Spreadsheet::ParseExcel>, 1132L<Spreadsheet::ParseExcel::Workbook>, L<Spreadsheet::ParseExcel::Worksheet>, 1133and L<Spreadsheet::ParseExcel::Cell>. 1134 1135=head1 METHODS 1136 1137=head2 new(%opts) 1138 1139Returns a new parser instance. Takes a hash of parameters: 1140 1141=over 4 1142 1143=item Password 1144 1145Password to use for decrypting encrypted files. 1146 1147=back 1148 1149=head2 parse($file, $formatter) 1150 1151Parses an XLSX file. Parsing errors throw an exception. C<$file> can be either 1152a filename or an open filehandle. Returns a 1153L<Spreadsheet::ParseExcel::Workbook> instance containing the parsed data. 1154The C<$formatter> argument is an optional formatter class as described in L<Spreadsheet::ParseExcel>. 1155 1156=head1 INCOMPATIBILITIES 1157 1158This module returns data using classes from L<Spreadsheet::ParseExcel>, so for 1159the most part, it should just be a drop-in replacement. That said, there are a 1160couple areas where the data returned is intentionally different: 1161 1162=over 4 1163 1164=item Colors 1165 1166In Spreadsheet::ParseExcel, colors are represented by integers which index into 1167the color table, and you have to use 1168C<< Spreadsheet::ParseExcel->ColorIdxToRGB >> in order to get the actual value 1169out. In Spreadsheet::ParseXLSX, while the color table still exists, cells are 1170also allowed to specify their color directly rather than going through the 1171color table. In order to avoid confusion, I normalize all color values in 1172Spreadsheet::ParseXLSX to their string RGB format (C<"#0088ff">). This affects 1173the C<Fill>, C<BdrColor>, and C<BdrDiag> properties of formats, and the 1174C<Color> property of fonts. Note that the default color is represented by 1175C<undef> (the same thing that C<ColorIdxToRGB> would return). 1176 1177=item Formulas 1178 1179Spreadsheet::ParseExcel doesn't support formulas. Spreadsheet::ParseXLSX 1180provides basic formula support by returning the text of the formula as part of 1181the cell data. You can access it via C<< $cell->{Formula} >>. Note that the 1182restriction still holds that formula cell values aren't available unless they 1183were explicitly provided when the spreadsheet was written. 1184 1185=back 1186 1187=head1 BUGS 1188 1189=over 4 1190 1191=item Large spreadsheets may cause segfaults on perl 5.14 and earlier 1192 1193This module internally uses XML::Twig, which makes it potentially subject to 1194L<Bug #71636 for XML-Twig: Segfault with medium-sized document|https://rt.cpan.org/Public/Bug/Display.html?id=71636> 1195on perl versions 5.14 and below (the underlying bug with perl weak references 1196was fixed in perl 5.15.5). The larger and more complex the spreadsheet, the 1197more likely to be affected, but the actual size at which it segfaults is 1198platform dependent. On a 64-bit perl with 7.6gb memory, it was seen on 1199spreadsheets about 300mb and above. You can work around this adding 1200C<XML::Twig::_set_weakrefs(0)> to your code before parsing the spreadsheet, 1201although this may have other consequences such as memory leaks. 1202 1203=item Worksheets without the C<dimension> tag are not supported 1204 1205=item Intra-cell formatting is discarded 1206 1207=item Shared formulas are not supported 1208 1209Shared formula support will require an actual formula parser and quite a bit of 1210custom logic, since the only thing stored in the document is the formula for 1211the base cell - updating the cell references in the formulas in the rest of the 1212cells is handled by the application. Values for these cells are still handled 1213properly. 1214 1215=back 1216 1217In addition, there are still a few areas which are not yet implemented (the 1218XLSX spec is quite large). If you run into any of those, bug reports are quite 1219welcome. 1220 1221Please report any bugs to GitHub Issues at 1222L<https://github.com/doy/spreadsheet-parsexlsx/issues>. 1223 1224=head1 SEE ALSO 1225 1226L<Spreadsheet::ParseExcel>: The equivalent, for XLS files. 1227 1228L<Spreadsheet::XLSX>: An older, less robust and featureful implementation. 1229 1230=head1 SUPPORT 1231 1232You can find this documentation for this module with the perldoc command. 1233 1234 perldoc Spreadsheet::ParseXLSX 1235 1236You can also look for information at: 1237 1238=over 4 1239 1240=item * MetaCPAN 1241 1242L<https://metacpan.org/release/Spreadsheet-ParseXLSX> 1243 1244=item * RT: CPAN's request tracker 1245 1246L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Spreadsheet-ParseXLSX> 1247 1248=item * Github 1249 1250L<https://github.com/doy/spreadsheet-parsexlsx> 1251 1252=item * CPAN Ratings 1253 1254L<http://cpanratings.perl.org/d/Spreadsheet-ParseXLSX> 1255 1256=back 1257 1258=head1 SPONSORS 1259 1260Parts of this code were paid for by 1261 1262=over 4 1263 1264=item Socialflow L<http://socialflow.com> 1265 1266=back 1267 1268=head1 AUTHOR 1269 1270Jesse Luehrs <doy@tozt.net> 1271 1272=head1 COPYRIGHT AND LICENSE 1273 1274This software is Copyright (c) 2016 by Jesse Luehrs. 1275 1276This is free software, licensed under: 1277 1278 The MIT (X11) License 1279 1280=cut 1281