1#------------------------------------------------------------------------------
2# File:         ZIP.pm
3#
4# Description:  Read ZIP archive meta information
5#
6# Revisions:    10/28/2007 - P. Harvey Created
7#
8# References:   1) http://www.pkware.com/documents/casestudies/APPNOTE.TXT
9#               2) http://www.cpanforum.com/threads/9046
10#               3) http://www.gzip.org/zlib/rfc-gzip.html
11#               4) http://DataCompression.info/ArchiveFormats/RAR202.txt
12#               5) https://jira.atlassian.com/browse/CONF-21706
13#               6) http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/indesign/cs55-docs/IDML/idml-specification.pdf
14#------------------------------------------------------------------------------
15
16package Image::ExifTool::ZIP;
17
18use strict;
19use vars qw($VERSION $warnString);
20use Image::ExifTool qw(:DataAccess :Utils);
21
22$VERSION = '1.26';
23
24sub WarnProc($) { $warnString = $_[0]; }
25
26# file types for recognized Open Document "mimetype" values
27my %openDocType = (
28    'application/vnd.oasis.opendocument.database'     => 'ODB', #5
29    'application/vnd.oasis.opendocument.chart'        => 'ODC', #5
30    'application/vnd.oasis.opendocument.formula'      => 'ODF', #5
31    'application/vnd.oasis.opendocument.graphics'     => 'ODG', #5
32    'application/vnd.oasis.opendocument.image'        => 'ODI', #5
33    'application/vnd.oasis.opendocument.presentation' => 'ODP',
34    'application/vnd.oasis.opendocument.spreadsheet'  => 'ODS',
35    'application/vnd.oasis.opendocument.text'         => 'ODT',
36    'application/vnd.adobe.indesign-idml-package'     => 'IDML', #6 (not open doc)
37    'application/epub+zip' => 'EPUB', #PH (not open doc)
38);
39
40# iWork file types based on names of files found in the zip archive
41my %iWorkFile = (
42    'Index/Slide.iwa' => 'KEY',
43    'Index/Tables/DataList.iwa' => 'NUMBERS',
44);
45
46my %iWorkType = (
47    NUMBERS => 'NUMBERS',
48    PAGES   => 'PAGES',
49    KEY     => 'KEY',
50    KTH     => 'KTH',
51    NMBTEMPLATE => 'NMBTEMPLATE',
52);
53
54# ZIP metadata blocks
55%Image::ExifTool::ZIP::Main = (
56    PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
57    GROUPS => { 2 => 'Other' },
58    FORMAT => 'int16u',
59    NOTES => q{
60        The following tags are extracted from ZIP archives.  ExifTool also extracts
61        additional meta information from compressed documents inside some ZIP-based
62        files such Office Open XML (DOCX, PPTX and XLSX), Open Document (ODB, ODC,
63        ODF, ODG, ODI, ODP, ODS and ODT), iWork (KEY, PAGES, NUMBERS), Capture One
64        Enhanced Image Package (EIP), Adobe InDesign Markup Language (IDML),
65        Electronic Publication (EPUB), and Sketch design files (SKETCH).  The
66        ExifTool family 3 groups may be used to organize ZIP tags by embedded
67        document number (ie. the exiftool C<-g3> option).
68    },
69    2 => 'ZipRequiredVersion',
70    3 => {
71        Name => 'ZipBitFlag',
72        PrintConv => '$val ? sprintf("0x%.4x",$val) : $val',
73    },
74    4 => {
75        Name => 'ZipCompression',
76        PrintConv => {
77            0 => 'None',
78            1 => 'Shrunk',
79            2 => 'Reduced with compression factor 1',
80            3 => 'Reduced with compression factor 2',
81            4 => 'Reduced with compression factor 3',
82            5 => 'Reduced with compression factor 4',
83            6 => 'Imploded',
84            7 => 'Tokenized',
85            8 => 'Deflated',
86            9 => 'Enhanced Deflate using Deflate64(tm)',
87           10 => 'Imploded (old IBM TERSE)',
88           12 => 'BZIP2',
89           14 => 'LZMA (EFS)',
90           18 => 'IBM TERSE (new)',
91           19 => 'IBM LZ77 z Architecture (PFS)',
92           96 => 'JPEG recompressed', #2
93           97 => 'WavPack compressed', #2
94           98 => 'PPMd version I, Rev 1',
95       },
96    },
97    5 => {
98        Name => 'ZipModifyDate',
99        Format => 'int32u',
100        Groups => { 2 => 'Time' },
101        ValueConv => sub {
102            my $val = shift;
103            return sprintf('%.4d:%.2d:%.2d %.2d:%.2d:%.2d',
104                ($val >> 25) + 1980, # year
105                ($val >> 21) & 0x0f, # month
106                ($val >> 16) & 0x1f, # day
107                ($val >> 11) & 0x1f, # hour
108                ($val >> 5)  & 0x3f, # minute
109                ($val        & 0x1f) * 2  # second
110            );
111        },
112        PrintConv => '$self->ConvertDateTime($val)',
113    },
114    7 => { Name => 'ZipCRC', Format => 'int32u', PrintConv => 'sprintf("0x%.8x",$val)' },
115    9 => { Name => 'ZipCompressedSize',    Format => 'int32u' },
116    11 => { Name => 'ZipUncompressedSize', Format => 'int32u' },
117    13 => {
118        Name => 'ZipFileNameLength',
119        # don't store a tag -- just extract the value for use with ZipFileName
120        Hidden => 1,
121        RawConv => '$$self{ZipFileNameLength} = $val; undef',
122    },
123    # 14 => 'ZipExtraFieldLength',
124    15 => {
125        Name => 'ZipFileName',
126        Format => 'string[$$self{ZipFileNameLength}]',
127    },
128);
129
130# GNU ZIP tags (ref 3)
131%Image::ExifTool::ZIP::GZIP = (
132    PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
133    GROUPS => { 2 => 'Other' },
134    NOTES => q{
135        These tags are extracted from GZIP (GNU ZIP) archives, but currently only
136        for the first file in the archive.
137    },
138    2 => {
139        Name => 'Compression',
140        PrintConv => {
141            8 => 'Deflated',
142        },
143    },
144    3 => {
145        Name => 'Flags',
146        PrintConv => { BITMASK => {
147            0 => 'Text',
148            1 => 'CRC16',
149            2 => 'ExtraFields',
150            3 => 'FileName',
151            4 => 'Comment',
152        }},
153    },
154    4 => {
155        Name => 'ModifyDate',
156        Format => 'int32u',
157        Groups => { 2 => 'Time' },
158        ValueConv => 'ConvertUnixTime($val,1)',
159        PrintConv => '$self->ConvertDateTime($val)',
160    },
161    8 => {
162        Name => 'ExtraFlags',
163        PrintConv => {
164            0 => '(none)',
165            2 => 'Maximum Compression',
166            4 => 'Fastest Algorithm',
167        },
168    },
169    9 => {
170        Name => 'OperatingSystem',
171        PrintConv => {
172            0 => 'FAT filesystem (MS-DOS, OS/2, NT/Win32)',
173            1 => 'Amiga',
174            2 => 'VMS (or OpenVMS)',
175            3 => 'Unix',
176            4 => 'VM/CMS',
177            5 => 'Atari TOS',
178            6 => 'HPFS filesystem (OS/2, NT)',
179            7 => 'Macintosh',
180            8 => 'Z-System',
181            9 => 'CP/M',
182            10 => 'TOPS-20',
183            11 => 'NTFS filesystem (NT)',
184            12 => 'QDOS',
185            13 => 'Acorn RISCOS',
186            255 => 'unknown',
187        },
188    },
189    10 => 'ArchivedFileName',
190    11 => 'Comment',
191);
192
193# RAR tags (ref 4)
194%Image::ExifTool::ZIP::RAR = (
195    PROCESS_PROC => \&Image::ExifTool::ProcessBinaryData,
196    GROUPS => { 2 => 'Other' },
197    NOTES => 'These tags are extracted from RAR archive files.',
198    0 => {
199        Name => 'CompressedSize',
200        Format => 'int32u',
201    },
202    4 => {
203        Name => 'UncompressedSize',
204        Format => 'int32u',
205    },
206    8 => {
207        Name => 'OperatingSystem',
208        PrintConv => {
209            0 => 'MS-DOS',
210            1 => 'OS/2',
211            2 => 'Win32',
212            3 => 'Unix',
213        },
214    },
215    13 => {
216        Name => 'ModifyDate',
217        Format => 'int32u',
218        Groups => { 2 => 'Time' },
219        ValueConv => sub {
220            my $val = shift;
221            return sprintf('%.4d:%.2d:%.2d %.2d:%.2d:%.2d',
222                ($val >> 25) + 1980, # year
223                ($val >> 21) & 0x0f, # month
224                ($val >> 16) & 0x1f, # day
225                ($val >> 11) & 0x1f, # hour
226                ($val >> 5)  & 0x3f, # minute
227                ($val        & 0x1f) * 2  # second
228            );
229        },
230        PrintConv => '$self->ConvertDateTime($val)',
231    },
232    18 => {
233        Name => 'PackingMethod',
234        PrintHex => 1,
235        PrintConv => {
236            0x30 => 'Stored',
237            0x31 => 'Fastest',
238            0x32 => 'Fast',
239            0x33 => 'Normal',
240            0x34 => 'Good Compression',
241            0x35 => 'Best Compression',
242        },
243    },
244    19 => {
245        Name => 'FileNameLength',
246        Format => 'int16u',
247        Hidden => 1,
248        RawConv => '$$self{FileNameLength} = $val; undef',
249    },
250    25 => {
251        Name => 'ArchivedFileName',
252        Format => 'string[$$self{FileNameLength}]',
253    },
254);
255
256#------------------------------------------------------------------------------
257# Extract information from a RAR file (ref 4)
258# Inputs: 0) ExifTool object reference, 1) dirInfo reference
259# Returns: 1 on success, 0 if this wasn't a valid RAR file
260sub ProcessRAR($$)
261{
262    my ($et, $dirInfo) = @_;
263    my $raf = $$dirInfo{RAF};
264    my ($flags, $buff);
265
266    return 0 unless $raf->Read($buff, 7) and $buff eq "Rar!\x1a\x07\0";
267
268    $et->SetFileType();
269    SetByteOrder('II');
270    my $tagTablePtr = GetTagTable('Image::ExifTool::ZIP::RAR');
271    my $docNum = 0;
272
273    for (;;) {
274        # read block header
275        $raf->Read($buff, 7) == 7 or last;
276        my ($type, $flags, $size) = unpack('xxCvv', $buff);
277        $size -= 7;
278        if ($flags & 0x8000) {
279            $raf->Read($buff, 4) == 4 or last;
280            $size += unpack('V',$buff) - 4;
281        }
282        last if $size < 0;
283        next unless $size;  # ignore blocks with no data
284        # don't try to read very large blocks unless LargeFileSupport is enabled
285        if ($size >= 0x80000000 and not $et->Options('LargeFileSupport')) {
286            $et->Warn('Large block encountered. Aborting.');
287            last;
288        }
289        # process the block
290        if ($type == 0x74) { # file block
291            # read maximum 4 KB from a file block
292            my $n = $size > 4096 ? 4096 : $size;
293            $raf->Read($buff, $n) == $n or last;
294            # add compressed size to start of data so we can extract it with the other tags
295            $buff = pack('V',$size) . $buff;
296            $$et{DOC_NUM} = ++$docNum;
297            $et->ProcessDirectory({ DataPt => \$buff }, $tagTablePtr);
298            $size -= $n;
299        } elsif ($type == 0x75 and $size > 6) { # comment block
300            $raf->Read($buff, $size) == $size or last;
301            # save comment, only if "Stored" (this is untested)
302            if (Get8u(\$buff, 3) == 0x30) {
303                $et->FoundTag('Comment', substr($buff, 6));
304            }
305            next;
306        }
307        # seek to the start of the next block
308        $raf->Seek($size, 1) or last if $size;
309    }
310    $$et{DOC_NUM} = 0;
311    if ($docNum > 1 and not $et->Options('Duplicates')) {
312        $et->Warn("Use the Duplicates option to extract tags for all $docNum files", 1);
313    }
314
315    return 1;
316}
317
318#------------------------------------------------------------------------------
319# Extract information from a GNU ZIP file (ref 3)
320# Inputs: 0) ExifTool object reference, 1) dirInfo reference
321# Returns: 1 on success, 0 if this wasn't a valid GZIP file
322sub ProcessGZIP($$)
323{
324    my ($et, $dirInfo) = @_;
325    my $raf = $$dirInfo{RAF};
326    my ($flags, $buff);
327
328    return 0 unless $raf->Read($buff, 10) and $buff =~ /^\x1f\x8b\x08/;
329
330    $et->SetFileType();
331    SetByteOrder('II');
332
333    my $tagTablePtr = GetTagTable('Image::ExifTool::ZIP::GZIP');
334    $et->HandleTag($tagTablePtr, 2, Get8u(\$buff, 2));
335    $et->HandleTag($tagTablePtr, 3, $flags = Get8u(\$buff, 3));
336    $et->HandleTag($tagTablePtr, 4, Get32u(\$buff, 4));
337    $et->HandleTag($tagTablePtr, 8, Get8u(\$buff, 8));
338    $et->HandleTag($tagTablePtr, 9, Get8u(\$buff, 9));
339
340    # extract file name and comment if they exist
341    if ($flags & 0x18) {
342        if ($flags & 0x04) {
343            # skip extra field
344            $raf->Read($buff, 2) == 2 or return 1;
345            my $len = Get16u(\$buff, 0);
346            $raf->Read($buff, $len) == $len or return 1;
347        }
348        $raf->Read($buff, 4096) or return 1;
349        my $pos = 0;
350        my $tagID;
351        # loop for ArchivedFileName (10) and Comment (11) tags
352        foreach $tagID (10, 11) {
353            my $mask = $tagID == 10 ? 0x08 : 0x10;
354            next unless $flags & $mask;
355            my $end = $buff =~ /\0/g ? pos($buff) - 1 : length($buff);
356            # (the doc specifies the string should be ISO 8859-1,
357            # but in OS X it seems to be UTF-8, so don't translate
358            # it because I could just as easily screw it up)
359            my $str = substr($buff, $pos, $end - $pos);
360            $et->HandleTag($tagTablePtr, $tagID, $str);
361            last if $end >= length $buff;
362            $pos = $end + 1;
363        }
364    }
365    return 1;
366}
367
368#------------------------------------------------------------------------------
369# Call HandleTags for attributes of an Archive::Zip member
370# Inputs: 0) ExifTool object ref, 1) member ref, 2) optional tag table ref
371sub HandleMember($$;$)
372{
373    my ($et, $member, $tagTablePtr) = @_;
374    $tagTablePtr or  $tagTablePtr = GetTagTable('Image::ExifTool::ZIP::Main');
375    $et->HandleTag($tagTablePtr, 2, $member->versionNeededToExtract());
376    $et->HandleTag($tagTablePtr, 3, $member->bitFlag());
377    $et->HandleTag($tagTablePtr, 4, $member->compressionMethod());
378    $et->HandleTag($tagTablePtr, 5, $member->lastModFileDateTime());
379    $et->HandleTag($tagTablePtr, 7, $member->crc32());
380    $et->HandleTag($tagTablePtr, 9, $member->compressedSize());
381    $et->HandleTag($tagTablePtr, 11, $member->uncompressedSize());
382    $et->HandleTag($tagTablePtr, 15, $member->fileName());
383}
384
385#------------------------------------------------------------------------------
386# Extract information from a ZIP file
387# Inputs: 0) ExifTool object reference, 1) dirInfo reference
388# Returns: 1 on success, 0 if this wasn't a valid ZIP file
389sub ProcessZIP($$)
390{
391    my ($et, $dirInfo) = @_;
392    my $raf = $$dirInfo{RAF};
393    my ($buff, $buf2, $zip);
394
395    return 0 unless $raf->Read($buff, 30) == 30 and $buff =~ /^PK\x03\x04/;
396
397    my $tagTablePtr = GetTagTable('Image::ExifTool::ZIP::Main');
398    my $docNum = 0;
399
400    # use Archive::Zip if available
401    for (;;) {
402        unless (eval { require Archive::Zip } and eval { require IO::File }) {
403            if ($$et{FILE_EXT} and $$et{FILE_EXT} ne 'ZIP') {
404                $et->Warn("Install Archive::Zip to decode compressed ZIP information");
405            }
406            last;
407        }
408        # Archive::Zip requires a seekable IO::File object
409        my $fh;
410        if ($raf->{TESTED} >= 0) {
411            unless (eval { require IO::File }) {
412                # (this shouldn't happen because IO::File is a prerequisite of Archive::Zip)
413                $et->Warn("Install IO::File to decode compressed ZIP information");
414                last;
415            }
416            $raf->Seek(0,0);
417            $fh = $raf->{FILE_PT};
418            bless $fh, 'IO::File';  # Archive::Zip expects an IO::File object
419        } elsif (eval { require IO::String }) {
420            # read the whole file into memory (what else can I do?)
421            $raf->Slurp();
422            $fh = new IO::String ${$raf->{BUFF_PT}};
423        } else {
424            my $type = $raf->{FILE_PT} ? 'pipe or socket' : 'scalar reference';
425            $et->Warn("Install IO::String to decode compressed ZIP information from a $type");
426            last;
427        }
428        $et->VPrint(1, "  --- using Archive::Zip ---\n");
429        $zip = new Archive::Zip;
430        # catch all warnings! (Archive::Zip is bad for this)
431        local $SIG{'__WARN__'} = \&WarnProc;
432        my $status = $zip->readFromFileHandle($fh);
433        if ($status eq '4' and $raf->{TESTED} >= 0 and eval { require IO::String } and
434            $raf->Seek(0,2) and $raf->Tell() < 100000000)
435        {
436            # try again, reading it ourself this time in an attempt to avoid
437            # a failed test with Perl 5.6.2 GNU/Linux 2.6.32-5-686 i686-linux-64int-ld
438            $raf->Seek(0,0);
439            $raf->Slurp();
440            $fh = new IO::String ${$raf->{BUFF_PT}};
441            $zip = new Archive::Zip;
442            $status = $zip->readFromFileHandle($fh);
443        }
444        if ($status) {
445            undef $zip;
446            my %err = ( 1=>'Stream end error', 3=>'Format error', 4=>'IO error' );
447            my $err = $err{$status} || "Error $status";
448            $et->Warn("$err reading ZIP file");
449            last;
450        }
451        $$dirInfo{ZIP} = $zip;
452
453        # check for an Office Open file (DOCX, etc)
454        # --> read '[Content_Types].xml' to determine the file type
455        my ($mime, @members);
456        my $cType = $zip->memberNamed('[Content_Types].xml');
457        if ($cType) {
458            ($buff, $status) = $zip->contents($cType);
459            if (not $status and (
460                # first look for the main document with the expected name
461                $buff =~ m{\sPartName\s*=\s*['"](?:/ppt/presentation.xml|/word/document.xml|/xl/workbook.xml)['"][^>]*\sContentType\s*=\s*(['"])([^"']+)\.main(\+xml)?\1} or
462                # then look for the main part
463                $buff =~ /<Override[^>]*\sPartName[^<]+\sContentType\s*=\s*(['"])([^"']+)\.main(\+xml)?\1/ or
464                # and if all else fails, use the default main
465                $buff =~ /ContentType\s*=\s*(['"])([^"']+)\.main(\+xml)?\1/))
466            {
467                $mime = $2;
468            }
469        }
470        # check for docProps if we couldn't find a MIME type
471        $mime or @members = $zip->membersMatching('^docProps/.*\.(xml|XML)$');
472        if ($mime or @members) {
473            $$dirInfo{MIME} = $mime;
474            require Image::ExifTool::OOXML;
475            Image::ExifTool::OOXML::ProcessDOCX($et, $dirInfo);
476            delete $$dirInfo{MIME};
477            last;
478        }
479
480        # check for an EIP file
481        @members = $zip->membersMatching('^CaptureOne/.*\.(cos|COS)$');
482        if (@members) {
483            require Image::ExifTool::CaptureOne;
484            Image::ExifTool::CaptureOne::ProcessEIP($et, $dirInfo);
485            last;
486        }
487
488        # check for an iWork file
489        @members = $zip->membersMatching('(?i)^(index\.(xml|apxl)|QuickLook/Thumbnail\.jpg|[^/]+\.(pages|numbers|key)/Index.(zip|xml|apxl))$');
490        if (@members) {
491            require Image::ExifTool::iWork;
492            Image::ExifTool::iWork::Process_iWork($et, $dirInfo);
493            last;
494        }
495
496        # check for an Open Document, IDML or EPUB file
497        my $mType = $zip->memberNamed('mimetype');
498        if ($mType) {
499            ($mime, $status) = $zip->contents($mType);
500            if (not $status and $mime =~ /([\x21-\xfe]+)/s) {
501                # clean up MIME type just in case (note that MIME is case insensitive)
502                $mime = lc $1;
503                $et->SetFileType($openDocType{$mime} || 'ZIP', $mime);
504                $et->Warn("Unrecognized MIMEType $mime") unless $openDocType{$mime};
505                # extract Open Document metadata from "meta.xml"
506                my $meta = $zip->memberNamed('meta.xml');
507                # IDML files have metadata in a different place (ref 6)
508                $meta or $meta = $zip->memberNamed('META-INF/metadata.xml');
509                if ($meta) {
510                    ($buff, $status) = $zip->contents($meta);
511                    unless ($status) {
512                        my %dirInfo = (
513                            DirName => 'XML',
514                            DataPt  => \$buff,
515                            DirLen  => length $buff,
516                            DataLen => length $buff,
517                        );
518                        # (avoid structure warnings when copying from XML)
519                        my $oldWarn = $$et{NO_STRUCT_WARN};
520                        $$et{NO_STRUCT_WARN} = 1;
521                        $et->ProcessDirectory(\%dirInfo, GetTagTable('Image::ExifTool::XMP::Main'));
522                        $$et{NO_STRUCT_WARN} = $oldWarn;
523                    }
524                }
525                # process rootfile of EPUB container if applicable
526                for (;;) {
527                    last if $meta and $mime ne 'application/epub+zip';
528                    my $container = $zip->memberNamed('META-INF/container.xml');
529                    ($buff, $status) = $zip->contents($container);
530                    last if $status;
531                    $buff =~ /<rootfile\s+[^>]*?\bfull-path=(['"])(.*?)\1/s or last;
532                    # load the rootfile data (OPF extension; contains XML metadata)
533                    my $meta2 = $zip->memberNamed($2) or last;
534                    $meta = $meta2;
535                    ($buff, $status) = $zip->contents($meta);
536                    last if $status;
537                    # use opf:event to generate more meaningful tag names for dc:date
538                    while ($buff =~ s{<dc:date opf:event="(\w+)">([^<]+)</dc:date>}{<dc:${1}Date>$2</dc:${1}Date>}s) {
539                        my $dcTable = GetTagTable('Image::ExifTool::XMP::dc');
540                        my $tag = "${1}Date";
541                        AddTagToTable($dcTable, $tag, {
542                            Name => ucfirst $tag,
543                            Groups => { 2 => 'Time' },
544                            List => 'Seq',
545                            %Image::ExifTool::XMP::dateTimeInfo
546                        }) unless $$dcTable{$tag};
547                    }
548                    my %dirInfo = (
549                        DataPt => \$buff,
550                        DirLen => length $buff,
551                        DataLen => length $buff,
552                        IgnoreProp => { 'package' => 1, metadata => 1 },
553                    );
554                    # (avoid structure warnings when copying from XML)
555                    my $oldWarn = $$et{NO_STRUCT_WARN};
556                    $$et{NO_STRUCT_WARN} = 1;
557                    $et->ProcessDirectory(\%dirInfo, GetTagTable('Image::ExifTool::XMP::XML'));
558                    $$et{NO_STRUCT_WARN} = $oldWarn;
559                    last;
560                }
561                if ($openDocType{$mime} or $meta) {
562                    # extract preview image(s) from "Thumbnails" directory if they exist
563                    my $type;
564                    my %tag = ( jpg => 'PreviewImage', png => 'PreviewPNG' );
565                    foreach $type ('jpg', 'png') {
566                        my $thumb = $zip->memberNamed("Thumbnails/thumbnail.$type");
567                        next unless $thumb;
568                        ($buff, $status) = $zip->contents($thumb);
569                        $et->FoundTag($tag{$type}, $buff) unless $status;
570                    }
571                    last;   # all done since we recognized the MIME type or found metadata
572                }
573                # continue on to list ZIP contents...
574            }
575        }
576
577        # otherwise just extract general ZIP information
578        $et->SetFileType();
579        @members = $zip->members();
580        my ($member, $iWorkType);
581        # special files to extract
582        my %extract = (
583            'meta.json' => 1,
584            'previews/preview.png' => 'PreviewPNG',
585            'preview.jpg' => 'PreviewImage', # (iWork 2013 files)
586            'preview-web.jpg' => 'OtherImage', # (iWork 2013 files)
587            'preview-micro.jpg' => 'ThumbnailImage', # (iWork 2013 files)
588            'QuickLook/Thumbnail.jpg' => 'ThumbnailImage', # (iWork 2009 files)
589            'QuickLook/Preview.pdf' => 'PreviewPDF', # (iWork 2009 files)
590        );
591        foreach $member (@members) {
592            $$et{DOC_NUM} = ++$docNum;
593            HandleMember($et, $member, $tagTablePtr);
594            my $file = $member->fileName();
595            # extract things from Sketch files
596            if ($extract{$file}) {
597                ($buff, $status) = $zip->contents($member);
598                $status and $et->Warn("Error extracting $file"), next;
599                if ($file eq 'meta.json') {
600                    $et->ExtractInfo(\$buff, { ReEntry => 1 });
601                    if ($$et{VALUE}{App} and $$et{VALUE}{App} =~ /sketch/i) {
602                        $et->OverrideFileType('SKETCH');
603                    }
604                } else {
605                    $et->FoundTag($extract{$file} => $buff);
606                }
607            } elsif ($file eq 'Index/Document.iwa' and not $iWorkType) {
608                my $type = $iWorkType{$$et{FILE_EXT} || ''};
609                $iWorkType = $type || 'PAGES';
610            } elsif ($iWorkFile{$file}) {
611                $iWorkType = $iWorkFile{$file};
612            }
613        }
614        $et->OverrideFileType($iWorkType) if $iWorkType;
615        last;
616    }
617    # all done if we processed this using Archive::Zip
618    if ($zip) {
619        delete $$dirInfo{ZIP};
620        delete $$et{DOC_NUM};
621        if ($docNum > 1 and not $et->Options('Duplicates')) {
622            $et->Warn("Use the Duplicates option to extract tags for all $docNum files", 1);
623        }
624        return 1;
625    }
626#
627# process the ZIP file by hand (funny, but this seems easier than using Archive::Zip)
628#
629    $et->VPrint(1, "  -- processing as binary data --\n");
630    $raf->Seek(30, 0);
631    $et->SetFileType();
632    SetByteOrder('II');
633
634    #  A.  Local file header:
635    #  local file header signature     0) 4 bytes  (0x04034b50)
636    #  version needed to extract       4) 2 bytes
637    #  general purpose bit flag        6) 2 bytes
638    #  compression method              8) 2 bytes
639    #  last mod file time             10) 2 bytes
640    #  last mod file date             12) 2 bytes
641    #  crc-32                         14) 4 bytes
642    #  compressed size                18) 4 bytes
643    #  uncompressed size              22) 4 bytes
644    #  file name length               26) 2 bytes
645    #  extra field length             28) 2 bytes
646    for (;;) {
647        my $len = Get16u(\$buff, 26) + Get16u(\$buff, 28);
648        $raf->Read($buf2, $len) == $len or last;
649
650        $$et{DOC_NUM} = ++$docNum;
651        $buff .= $buf2;
652        my %dirInfo = (
653            DataPt => \$buff,
654            DataPos => $raf->Tell() - 30 - $len,
655            DataLen => 30 + $len,
656            DirStart => 0,
657            DirLen => 30 + $len,
658        );
659        $et->ProcessDirectory(\%dirInfo, $tagTablePtr);
660        my $flags = Get16u(\$buff, 6);
661        if ($flags & 0x08) {
662            # we don't yet support skipping stream mode data
663            # (when this happens, the CRC, compressed size and uncompressed
664            #  sizes are set to 0 in the header.  Instead, they are stored
665            #  after the compressed data with an optional header of 0x08074b50)
666            $et->Warn('Stream mode data encountered, file list may be incomplete');
667            last;
668        }
669        $len = Get32u(\$buff, 18);      # file data length
670        $raf->Seek($len, 1) or last;    # skip file data
671        $raf->Read($buff, 30) == 30 and $buff =~ /^PK\x03\x04/ or last;
672    }
673    delete $$et{DOC_NUM};
674    if ($docNum > 1 and not $et->Options('Duplicates')) {
675        $et->Warn("Use the Duplicates option to extract tags for all $docNum files", 1);
676    }
677    return 1;
678}
679
6801;  # end
681
682__END__
683
684=head1 NAME
685
686Image::ExifTool::ZIP - Read ZIP archive meta information
687
688=head1 SYNOPSIS
689
690This module is used by Image::ExifTool
691
692=head1 DESCRIPTION
693
694This module contains definitions required by Image::ExifTool to extract meta
695information from ZIP, GZIP and RAR archives.  This includes ZIP-based file
696types like Office Open XML (DOCX, PPTX and XLSX), Open Document (ODB, ODC,
697ODF, ODG, ODI, ODP, ODS and ODT), iWork (KEY, PAGES, NUMBERS), Capture One
698Enhanced Image Package (EIP), Adobe InDesign Markup Language (IDML),
699Electronic Publication (EPUB), and Sketch design files (SKETCH).
700
701=head1 AUTHOR
702
703Copyright 2003-2021, Phil Harvey (philharvey66 at gmail.com)
704
705This library is free software; you can redistribute it and/or modify it
706under the same terms as Perl itself.
707
708=head1 REFERENCES
709
710=over 4
711
712=item L<http://www.pkware.com/documents/casestudies/APPNOTE.TXT>
713
714=item L<http://www.gzip.org/zlib/rfc-gzip.html>
715
716=item L<http://DataCompression.info/ArchiveFormats/RAR202.txt>
717
718=back
719
720=head1 SEE ALSO
721
722L<Image::ExifTool::TagNames/ZIP Tags>,
723L<Image::ExifTool::TagNames/OOXML Tags>,
724L<Image::ExifTool(3pm)|Image::ExifTool>
725
726=cut
727
728