1# File::MMagic
2#
3# $Id: MMagic.pm,v 1.26.4.5 2006-10-13 16:59:20 opengl2772 Exp $
4#
# This program is originated from file.kulp that is a production of The
# Unix Reconstruction Project.
7#    <http://language.perl.com/ppt/index.html>
8# Copyright 1999,2000,2001,2002 NOKUBI Takatsugu <knok@daionet.gr.jp>.
9#
10# This product includes software developed by the Apache Group
11# for use in the Apache HTTP server project (http://www.apache.org/).
12#
13# License for the program is followed the original software. The license is
14# below.
15#
16# This program is copyright by dkulp 1999.
17#
18# This program is free and open software. You may use, copy, modify, distribute
19# and sell this program (and any modified variants) in any way you wish,
20# provided you do not restrict others to do the same, except for the following
21# consideration.
22#
23#I read some of Ian F. Darwin's BSD C implementation, to
24#try to determine how some of this was done since the specification
25#is a little vague.  I don't believe that this perl version could
26#be construed as an "altered version", but I did grab the tokens for
27#identifying the hard-coded file types in names.h and copied some of
28#the man page.
29#
30#Here's his notice:
31#
32#  * Copyright (c) Ian F. Darwin, 1987.
33#  * Written by Ian F. Darwin.
34#  *
35#  * This software is not subject to any license of the American Telephone
36#  * and Telegraph Company or of the Regents of the University of California.
37#  *
38#  * Permission is granted to anyone to use this software for any purpose on
39#  * any computer system, and to alter it and redistribute it freely, subject
40#  * to the following restrictions:
41#  *
42#  * 1. The author is not responsible for the consequences of use of this
43#  *    software, no matter how awful, even if they arise from flaws in it.
44#  *
45#  * 2. The origin of this software must not be misrepresented, either by
46#  *    explicit claim or by omission.  Since few users ever read sources,
47#  *    credits must appear in the documentation.
48#  *
49#  * 3. Altered versions must be plainly marked as such, and must not be
50#  *    misrepresented as being the original software.  Since few users
51#  *    ever read sources, credits must appear in the documentation.
52#  *
53#  * 4. This notice may not be removed or altered.
54#
55# The following is the Apache License. This program contains the magic file
56# that derived from the Apache HTTP Server.
57#
58#  * Copyright (c) 1995-1999 The Apache Group.  All rights reserved.
59#  *
60#  * Redistribution and use in source and binary forms, with or without
61#  * modification, are permitted provided that the following conditions
62#  * are met:
63#  *
64#  * 1. Redistributions of source code must retain the above copyright
65#  *    notice, this list of conditions and the following disclaimer.
66#  *
67#  * 2. Redistributions in binary form must reproduce the above copyright
68#  *    notice, this list of conditions and the following disclaimer in
69#  *    the documentation and/or other materials provided with the
70#  *    distribution.
71#  *
72#  * 3. All advertising materials mentioning features or use of this
73#  *    software must display the following acknowledgment:
74#  *    "This product includes software developed by the Apache Group
75#  *    for use in the Apache HTTP server project (http://www.apache.org/)."
76#  *
77#  * 4. The names "Apache Server" and "Apache Group" must not be used to
78#  *    endorse or promote products derived from this software without
79#  *    prior written permission. For written permission, please contact
80#  *    apache@apache.org.
81#  *
82#  * 5. Products derived from this software may not be called "Apache"
83#  *    nor may "Apache" appear in their names without prior written
84#  *    permission of the Apache Group.
85#  *
86#  * 6. Redistributions of any form whatsoever must retain the following
87#  *    acknowledgment:
88#  *    "This product includes software developed by the Apache Group
89#  *    for use in the Apache HTTP server project (http://www.apache.org/)."
90#  *
91#  * THIS SOFTWARE IS PROVIDED BY THE APACHE GROUP ``AS IS'' AND ANY
92#  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
93#  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
94#  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE APACHE GROUP OR
95#  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
96#  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
97#  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
98#  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
99#  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
100#  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
101#  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
102#  * OF THE POSSIBILITY OF SUCH DAMAGE.
103
104package File::MMagic;
105
106=head1 NAME
107
108File::MMagic - Guess file type
109
110=head1 SYNOPSIS
111
112  use File::MMagic;
113  use FileHandle;
114
115  $mm = new File::MMagic; # use internal magic file
116  # $mm = File::MMagic->new('/etc/magic'); # use external magic file
117  # $mm = File::MMagic->new('/usr/share/etc/magic'); # if you use Debian
118  $res = $mm->checktype_filename("/somewhere/unknown/file");
119
120  $fh = new FileHandle "< /somewhere/unknown/file2";
121  $res = $mm->checktype_filehandle($fh);
122
123  $fh->read($data, 0x8564);
124  $res = $mm->checktype_contents($data);
125
126=head1 ABSTRACT
127
128This perl library uses perl5 objects to guess file type from filename
129and/or filehandle.
130
131=head1 DESCRIPTION
132
checktype_filename(), checktype_filehandle() and checktype_contents()
return a string containing the file type in MIME media type format.
135
136=head1 METHODS
137
138=over 4
139
140=item File::MMagic->new()
141
142=item File::MMagic->new( $filename )
143
144Initializes the module. If no filename is given, the magic numbers
145stored in File::MMagic are used.
146
147=item $mm->addSpecials
148
149If a filetype cannot be determined by magic numbers, extra checks are
150done based on extra regular expressions which can be defined here. The
151first argument should be the filetype, the remaining arguments should
152be one or more regular expressions.
153
154By default, checks are done for message/news, message/rfc822,
155text/html, text/x-roff.
156
157=item $mm->removeSpecials
158
159Removes special regular expressions. Specify one or more filetypes. If
160no filetypes are specified, all special regexps are removed.
161
162Returns a hash containing the removed entries.
163
164=item $mm->addFileExts
165
166If a filetype cannot be determined by magic numbers, extra checks can
167be done based on the file extension (actually, a regexp). Two
arguments should be given: the filename pattern and the corresponding
169filetype.
170
171By default, checks are done for application/x-compress,
172application/x-bzip2, application/x-gzip, text/html, text/plain
173
174=item $mm->removeFileExts
175
176Remove filename pattern checks. Specify one or more patterns. If no
177pattern is specified, all are removed.
178
179Returns a hash containing the removed entries.
180
181=item $mm->addMagicEntry
182
183Add a new magic entry in the object. The format is same as magic(5) file.
184
185  Ex.
  # Add an entry
  $mm->addMagicEntry("0\tstring\tabc\ttext/abc");
  # Add an entry with a sub entry
189  $mm->addMagicEntry("0\tstring\tdef\t");
190  $mm->addMagicEntry(">10\tstring\tghi\ttext/ghi");
191
192=item $mm->readMagicHandle
193
194=item $mm->checktype_filename
195
196=item $mm->checktype_magic
197
=item $mm->checktype_contents

=back
199
200=head1 COPYRIGHT
201
This program is originated from file.kulp that is a production of The
Unix Reconstruction Project.
204   <http://language.perl.com/ppt/index.html>
205Copyright (c) 1999 NOKUBI Takatsugu <knok@daionet.gr.jp>.
206
207There is no warranty for the program.
208
209This product includes software developed by the Apache Group
210for use in the Apache HTTP server project (http://www.apache.org/).
211
The license for this program follows that of the original software. The
license is below.
214
215This program is free and open software. You may use, copy, modify, distribute
216and sell this program (and any modified variants) in any way you wish,
217provided you do not restrict others to do the same, except for the following
218consideration.
219
220I read some of Ian F. Darwin's BSD C implementation, to
221try to determine how some of this was done since the specification
222is a little vague.  I don't believe that this perl version could
223be construed as an "altered version", but I did grab the tokens for
224identifying the hard-coded file types in names.h and copied some of
225the man page.
226
227Here's his notice:
228
229 * Copyright (c) Ian F. Darwin, 1987.
230 * Written by Ian F. Darwin.
231 *
232 * This software is not subject to any license of the American Telephone
233 * and Telegraph Company or of the Regents of the University of California.
234 *
235 * Permission is granted to anyone to use this software for any purpose on
236 * any computer system, and to alter it and redistribute it freely, subject
237 * to the following restrictions:
238 *
239 * 1. The author is not responsible for the consequences of use of this
240 *    software, no matter how awful, even if they arise from flaws in it.
241 *
242 * 2. The origin of this software must not be misrepresented, either by
243 *    explicit claim or by omission.  Since few users ever read sources,
244 *    credits must appear in the documentation.
245 *
246 * 3. Altered versions must be plainly marked as such, and must not be
247 *    misrepresented as being the original software.  Since few users
248 *    ever read sources, credits must appear in the documentation.
249 *
250 * 4. This notice may not be removed or altered.
251
252The following is the Apache License. This program contains the magic file
253that derived from the Apache HTTP Server.
254
255 * Copyright (c) 1995-1999 The Apache Group.  All rights reserved.
256 *
257 * Redistribution and use in source and binary forms, with or without
258 * modification, are permitted provided that the following conditions
259 * are met:
260 *
261 * 1. Redistributions of source code must retain the above copyright
262 *    notice, this list of conditions and the following disclaimer.
263 *
264 * 2. Redistributions in binary form must reproduce the above copyright
265 *    notice, this list of conditions and the following disclaimer in
266 *    the documentation and/or other materials provided with the
267 *    distribution.
268 *
269 * 3. All advertising materials mentioning features or use of this
270 *    software must display the following acknowledgment:
271 *    "This product includes software developed by the Apache Group
272 *    for use in the Apache HTTP server project (http://www.apache.org/)."
273 *
274 * 4. The names "Apache Server" and "Apache Group" must not be used to
275 *    endorse or promote products derived from this software without
276 *    prior written permission. For written permission, please contact
277 *    apache@apache.org.
278 *
279 * 5. Products derived from this software may not be called "Apache"
280 *    nor may "Apache" appear in their names without prior written
281 *    permission of the Apache Group.
282 *
283 * 6. Redistributions of any form whatsoever must retain the following
284 *    acknowledgment:
285 *    "This product includes software developed by the Apache Group
286 *    for use in the Apache HTTP server project (http://www.apache.org/)."
287 *
288 * THIS SOFTWARE IS PROVIDED BY THE APACHE GROUP ``AS IS'' AND ANY
289 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
290 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
291 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE APACHE GROUP OR
292 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
293 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
294 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
295 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
296 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
297 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
298 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
299 * OF THE POSSIBILITY OF SUCH DAMAGE.
300
301=cut
302
303use FileHandle;
304use strict;
305
306use vars qw(
307%TEMPLATES %ESC $VERSION
308$magicFile $checkMagic $followLinks $fileList
309$allowEightbit
310);
311
BEGIN {
    # Module version.
    $VERSION = "1.27";

    # When true, check_binary() does not count bytes 0x80-0xff as
    # "binary" characters.
    $allowEightbit = 1;

    # Letter escapes that may appear in a magic file, keyed by the
    # letter following the backslash.
    %ESC = (
        b => "\b",
        f => "\f",
        n => "\n",
        r => "\r",
        t => "\t",
    );

    # Maps a magic-file type name to [ unpack template, byte count ].
    # Where the template is itself an array of three letters, the value
    # is unpacked with the first, re-packed with the second and unpacked
    # again with the third (unsigned endian read coerced to signed).
    # 'string' has no template.
    %TEMPLATES = (
        # native-order integers
        byte     => [ 'c', 1 ],
        ubyte    => [ 'C', 1 ],
        char     => [ 'c', 1 ],
        uchar    => [ 'C', 1 ],
        short    => [ 's', 2 ],
        ushort   => [ 'S', 2 ],
        long     => [ 'l', 4 ],
        ulong    => [ 'L', 4 ],
        date     => [ 'l', 4 ],

        # big-endian
        ubeshort => [ 'n', 2 ],
        beshort  => [ [ 'n', 'S', 's' ], 2 ],
        ubelong  => [ 'N', 4 ],
        belong   => [ [ 'N', 'I', 'i' ], 4 ],
        bedate   => [ 'N', 4 ],

        # little-endian
        uleshort => [ 'v', 2 ],
        leshort  => [ [ 'v', 'S', 's' ], 2 ],
        ulelong  => [ 'V', 4 ],
        lelong   => [ [ 'V', 'I', 'i' ], 4 ],
        ledate   => [ 'V', 4 ],

        # strings are compared directly, no unpacking
        string   => undef,
    );
}
345
# Constructor.
#
#   File::MMagic->new()           - read magic entries from the module's
#                                   own DATA filehandle
#   File::MMagic->new($filename)  - read magic entries from $filename
#
# If the given file cannot be opened a warning is emitted and the object
# is still returned (with no magic entries loaded).
#
# Returns the new object, blessed into the invoking class so that
# subclasses get instances of their own class.
sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self  = {};
    $self->{MF}    = [];
    $self->{magic} = [];

    if (! @_) {
        my $fh = *File::MMagic::DATA{IO};
        binmode($fh);
        bless $fh, 'FileHandle' if ref $fh ne 'FileHandle';
        my $dataLoc;
        # code block to localise the no strict;, contribute by Simon Matthews
        {
            no strict 'refs';
            # Remember where the DATA section starts the first time
            # through so that later objects can rewind to it.
            my $instance = \${ "$class\::_instance" };
            $$instance = $fh->tell() unless $$instance;
            $dataLoc = $$instance;
        }

        $fh->seek($dataLoc, 0);
        readMagicHandle($self, $fh);
    } else {
        my $filename = shift;
        my $fh = FileHandle->new;
        # Pass the mode separately so the file name is taken literally
        # (no whitespace stripping, no mode or pipe characters honoured).
        if ($fh->open($filename, '<')) {
            binmode($fh);
            readMagicHandle($self, $fh);
        } else {
            warn __PACKAGE__ . " couldn't load specified file $filename";
        }
    }

    # from the BSD names.h, some tokens for hard-coded checks of
    # different texts.  This isn't rocket science.  It's prone to
    # failure so these checks are only a last resort.
    #
    # removeSpecials() can be used to remove those afterwards.
    $self->{SPECIALS} = {
        "message/rfc822" => [
            "^Received:",
            "^>From ",
            "^From ",
            "^To: ",
            "^Return-Path: ",
            "^Cc: ",
            "^X-Mailer: ",
        ],
        "message/news" => [
            "^Newsgroups: ",
            "^Path: ",
            "^X-Newsreader: ",
        ],
        "text/html" => [
            "<html[^>]*>",
            "<HTML[^>]*>",
            "<head[^>]*>",
            "<HEAD[^>]*>",
            "<body[^>]*>",
            "<BODY[^>]*>",
            "<title[^>]*>",
            "<TITLE[^>]*>",
            "<h1[^>]*>",
            "<H1[^>]*>",
        ],
        "text/x-roff" => [
            '^\\.\\\\"',
            "^\\.SH ",
            "^\\.PP ",
            "^\\.TH ",
            "^\\.BR ",
            "^\\.SS ",
            "^\\.TP ",
            "^\\.IR ",
        ],
    };

    # Filename patterns (matched case-insensitively by
    # checktype_byfilename) and the type each one maps to.
    $self->{FILEEXTS} = {
        '\.gz$'   => 'application/x-gzip',
        '\.bz2$'  => 'application/x-bzip2',
        '\.Z$'    => 'application/x-compress',
        '\.txt$'  => 'text/plain',
        '\.html$' => 'text/html',
        '\.htm$'  => 'text/html',
    };

    # Two-argument bless: honour the class new() was invoked on.
    bless($self, $class);
    return $self;
}
428
# Register (or replace) the list of regular expressions used as a last
# resort to recognise $mtype.  Every argument after the type is stored
# as one pattern.  Returns the object.
sub addSpecials {
    my ($self, $mtype, @patterns) = @_;
    $self->{SPECIALS}{"$mtype"} = \@patterns;
    return $self;
}
435
# Remove special regular expressions.
#
# Arguments: zero or more file types.  With no arguments every entry in
# $self->{SPECIALS} is removed.
#
# Returns a hash (type => removed pattern list) of the removed entries;
# a requested type that was not present maps to undef.
sub removeSpecials {
    my $self = shift;
    # A conditional expression keeps @_ in list context.  Writing
    # "(@_ or keys ...)" would evaluate @_ in scalar context and yield
    # the argument count instead of the arguments themselves.
    my @mtypes = @_ ? @_ : keys %{$self->{SPECIALS}};
    my %returnmtypes;
    foreach my $mtype (@mtypes) {
        $returnmtypes{"$mtype"} = delete $self->{SPECIALS}->{"$mtype"};
    }
    return %returnmtypes;
}
446
# Associate a filename pattern (a regular expression) with a file type
# for checktype_byfilename().  An existing entry for the same pattern is
# overwritten.  Returns the object.
sub addFileExts {
    my ($self, $filepat, $mtype) = @_;
    $self->{FILEEXTS}{"$filepat"} = $mtype;
    return $self;
}
454
# Remove filename pattern checks.
#
# Arguments: zero or more patterns (the keys used with addFileExts).
# With no arguments every entry in $self->{FILEEXTS} is removed.
#
# Returns a hash (pattern => removed type) of the removed entries; a
# requested pattern that was not present maps to undef.
sub removeFileExts {
    my $self = shift;
    # A conditional expression keeps @_ in list context.  Writing
    # "(@_ or keys ...)" would evaluate @_ in scalar context and yield
    # the argument count instead of the arguments themselves.
    my @filepats = @_ ? @_ : keys %{$self->{FILEEXTS}};
    my %returnfilepats;
    foreach my $filepat (@filepats) {
        $returnfilepats{"$filepat"} = delete $self->{FILEEXTS}->{"$filepat"};
    }
    return %returnfilepats;
}
465
# Add a magic entry, in magic(5) line format, to the front of the
# object's entry list.
#
# An entry that starts with one or more '>' is a sub entry: it is placed
# at the front of the subtest list of the most recently added entry at
# the level above (one level per '>').
#
# Entries are stored unparsed as [ raw line, -1, subtests ].  A parent
# that has already been evaluated has been expanded in place to the
# parsed field list, in which the subtest list is the last field, so the
# subtest list is always taken from the last element of the parent.
#
# If there is no parent entry at the required level a warning is emitted
# and nothing is added.
#
# Returns the object.
sub addMagicEntry {
    my $self = shift;
    my $entry = shift;

    my ($arrows) = $entry =~ /^(>+)/;

    # Top-level entry.
    if (! defined $arrows) {
        unshift @{$self->{magic}}, [$entry, -1, []];
        return $self;
    }

    # Walk down one level per '>' through the first entry of each list.
    my $siblings = $self->{magic};
    for my $level (1 .. length($arrows)) {
        my $parent = @$siblings ? $siblings->[0] : undef;
        my $subtests;
        if (ref($parent) eq 'ARRAY' && @$parent) {
            $subtests = $parent->[-1];
        }
        if (ref($subtests) ne 'ARRAY') {
            warn __PACKAGE__ . " no parent entry for sub entry $entry";
            return $self;
        }
        $siblings = $subtests;
    }

    unshift @{$siblings}, [$entry, -1, []];
    return $self;
}
485
# Attach an open magic-file handle to the object and read the first
# entry from it.
#
# $self->{MF} is reset to [ filehandle, undef, 0 ] before
# readMagicEntry() is called with the entry list and that state array.
# The result of readMagicEntry() is returned.
sub readMagicHandle {
    my ($self, $fh) = @_;
    @{ $self->{MF} }[0, 1, 2] = ($fh, undef, 0);
    return readMagicEntry($self->{magic}, $self->{MF});
}
494
# Not implemented.
496#
497#sub readMagicFile {
498#    my $self = shift;
499#    my $mfile = shift;
500#}
501
# Guess the type of the file named $file.
#
# Returns a type string:
#   "x-system/x-error; ..."  the file is not readable or cannot be opened
#   "x-system/x-unix; ..."   symbolic link, directory, pipe, socket,
#                            device, empty file, or executable text
#   otherwise                whatever checktype_filehandle() reports for
#                            the opened file
#
# The package variable $followLinks selects stat() over lstat() on
# platforms other than MSWin32.
sub checktype_filename {
    my $self = shift;
    my $file = shift;

    # the description line.  append info to this string
    my $desc;

    # 0) check permission
    if (! -r $file) {
        $desc .= " can't read `$file': Permission denied.";
        return "x-system/x-error; $desc";
    }

    # 1) check for various special files first
    if ($^O eq 'MSWin32') {
        stat($file);
    } else {
        if ($followLinks) { stat($file); } else { lstat($file); }
    }
    # the "_" tests below reuse the result of the stat/lstat just made
    if (! -f _  or -z _) {
        if ( $^O ne 'MSWin32' && !$followLinks && -l _ ) {
            $desc .= " symbolic link to ".readlink($file);
        }
        elsif ( -d _ ) { $desc .= " directory"; }
        elsif ( -p _ ) { $desc .= " named pipe"; }
        elsif ( -S _ ) { $desc .= " socket"; }
        elsif ( -b _ ) { $desc .= " block special file"; }
        elsif ( -c _ ) { $desc .= " character special file"; }
        elsif ( -z _ ) { $desc .= " empty"; }
        else { $desc .= " special"; }

        return "x-system/x-unix; $desc";
    }

    # Open with the mode given separately so the name is taken
    # literally: leading/trailing whitespace is kept and characters such
    # as '>' or '|' in the name cannot change how the file is opened.
    my $fh = FileHandle->new($file, '<')
        or return "x-system/x-error; $file: $!\n";

    binmode($fh); # for MSWin32

    # 2) check for script
    if (-x $file && -T _) {

        # Note, some magic files include elaborate attempts
        # to match #! header lines and return pretty responses
        # but this slows down matching and is unnecessary.
        my $line1 = <$fh>;
        if ($line1 =~ /^\#!\s*(\S+)/) {
            $desc .= " executable $1 script text";
        }
        else { $desc .= " commands text"; }

        $fh->close();

        return "x-system/x-unix; $desc";
    }

    my $out = checktype_filehandle($self, $fh, $desc);
    undef $fh;

    return $out;
}
570
# Guess the type of the data behind an open filehandle.
#
# Arguments: the filehandle and an optional description string to start
# from.  Each magic entry is tried in turn with magicMatch(); the first
# one that matches and leaves a non-empty description decides the type.
# If none does, up to 0x8564 bytes are read from the start of the handle
# and passed to checktype_data().  'text/plain' is the final fallback.
sub checktype_filehandle {
    my ($self, $fh, $desc) = @_;
    my $mtype;

    binmode($fh); # for MSWin32 architecture.

    # 3) walk the magic entries.  The list can grow while we walk it, so
    # the upper bound is re-evaluated on every pass.
    my $idx = 0;
    while ($idx <= $#{$self->{magic}}) {

        # a match only counts when it produced a description
        if (magicMatch($self->{magic}->[$idx], \$desc, $fh)
            && defined $desc && $desc ne '') {
            $mtype = $desc;
            last;
        }

        # on the last buffered entry, pull in another one from the magic
        # file (if it has more); readMagicEntry appends to the list.
        if ($idx == $#{$self->{magic}} && !$self->{MF}->[0]->eof()) {
            readMagicEntry($self->{magic}, $self->{MF});
        }

        $idx++;
    }

    # 4) no magic entry decided: look at the content itself
    unless (defined $mtype) {
        my $data;
        $fh->seek(0,0);
        $fh->read($data, 0x8564);
        $mtype = checktype_data($self, $data);
    }

    return defined $mtype ? $mtype : 'text/plain';
}
614
# Guess the type of a string of data.
#
# Empty data yields 'application/octet-stream'.  Otherwise the magic
# entries are consulted through checktype_magic(); if they give no
# answer checktype_data() is tried, and 'text/plain' is the final
# fallback.
sub checktype_contents {
    my ($self, $data) = @_;

    return 'application/octet-stream' if (length($data) <= 0);

    my $mtype = checktype_magic($self, $data);

    # 4) magic did not decide: text/binary and special-token checks
    $mtype = checktype_data($self, $data) unless defined $mtype;

    return defined $mtype ? $mtype : 'text/plain';
}
634
# Guess the type of a string of data using only the magic entries.
#
# Empty data yields 'application/octet-stream'.  Each entry is tried in
# turn with magicMatchStr(); the first one that matches and leaves a
# non-empty description is returned.  Returns undef when no entry
# decides.
sub checktype_magic {
    my ($self, $data) = @_;
    my $desc;
    my $mtype;

    return 'application/octet-stream' if (length($data) <= 0);

    # 3) walk the magic entries.  The list can grow while we walk it, so
    # the upper bound is re-evaluated on every pass.
    my $idx = 0;
    while ($idx <= $#{$self->{magic}}) {

        # a match only counts when it produced a description
        if (magicMatchStr($self->{magic}->[$idx], \$desc, $data)
            && defined $desc && $desc ne '') {
            $mtype = $desc;
            last;
        }

        # on the last buffered entry, pull in another one from the magic
        # file (if it has more); readMagicEntry appends to the list.
        if ($idx == $#{$self->{magic}} && !$self->{MF}->[0]->eof()) {
            readMagicEntry($self->{magic}, $self->{MF});
        }

        $idx++;
    }

    return $mtype;
}
666
# Last-resort content check on a string of data.
#
# Only the first 0x8564 bytes are examined.  Every pattern registered in
# $self->{SPECIALS} is matched (multi-line) against the data; for each
# type the smallest end-of-match position among its patterns is kept,
# and the type whose match ends earliest in the data wins.  If no
# pattern matches and check_binary() reports binary data,
# 'application/octet-stream' is returned.
#
# Returns undef for empty data or when nothing could be decided.
sub checktype_data {
    my ($self, $data) = @_;
    my $mtype;

    return undef if (length($data) <= 0);

    # truncate data
    $data = substr($data, 0, 0x8564);

    # at first, check SPECIALS
    # (in BSD's version, there's an effort to search from
    # more specific to less, but that is not done here.)
    my %val;
    foreach my $type (keys %{$self->{SPECIALS}}) {
        # end positions of the first match of each pattern that matched
        my @ends;
        foreach my $token (@{$self->{SPECIALS}->{$type}}) {
            pos($data) = 0;
            push @ends, pos($data) if $data =~ /$token/mg;
        }
        my ($earliest) = sort { $a <=> $b } @ends;
        $val{$type} = $earliest if $earliest;
    }

    # pick the type whose match ends earliest
    ($mtype) = sort { $val{$a} <=> $val{$b} } keys %val;

    if (! defined $mtype && check_binary($data)) {
        $mtype = "application/octet-stream";
    }

    return $mtype;
}
710
# Guess a type from the file name alone.
#
# The directory part (everything up to the last '/') is dropped and the
# remainder is matched, case-insensitively, against every pattern in
# $self->{FILEEXTS}.  A matching pattern sets the type unless the type
# already chosen contains a ';'.  Returns 'application/octet-stream'
# when nothing matches.
sub checktype_byfilename {
    my ($self, $fname) = @_;
    my $type;

    $fname =~ s/^.*\///;
    foreach my $regex (keys %{$self->{FILEEXTS}}) {
        next unless $fname =~ /$regex/i;
        # keep a type that already carries a parameter
        next if defined $type && $type =~ /;/;
        $type = $self->{FILEEXTS}->{$regex};
    }

    return defined $type ? $type : 'application/octet-stream';
}
727
# Decide whether a string of data looks binary.
#
# Control characters other than TAB, NL, CR and ESC are counted; when
# the package variable $allowEightbit is false, bytes 0x80-0xff are
# counted as well.  The data is binary when it is empty, or when the
# counted fraction exceeds 0.1 (eight-bit allowed) or 0.3 (otherwise).
#
# Returns 1 for binary, 0 otherwise.
sub check_binary {
    my ($data) = @_;
    my $len = length($data);

    my ($count, $limit);
    if ($allowEightbit) {
        # exclude TAB, ESC, nl, cr
        $count = ($data =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a\x1c-\x1f//);
        $limit = 0.1;
    } else {
        # exclude TAB, ESC, nl, cr; include high-bit bytes
        $count = ($data =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a\x1c-\x1f\x80-\xff//);
        $limit = 0.3;
    }

    return 1 if ($len <= 0); # no contents
    return (($count / $len) > $limit) ? 1 : 0;
}
742
# Load every remaining entry from the magic file into $self->{magic}
# and then hand the complete list to dumpMagic().
sub check_magic {
    my ($self) = @_;
    my $mf = $self->{MF};

    # keep reading until the magic file handle reports end of file
    readMagicEntry($self->{magic}, $mf) until $mf->[0]->eof();

    return dumpMagic($self->{magic});
}
751
752####### SUBROUTINES ###########
753
# Compare a magic item with the contents of a filehandle.
#
# Arguments:
#   $item   - array ref holding the item; either the unparsed form
#             [ raw line, line number, subtests ] or the parsed form
#             ( offtype, offset, numbytes, type, mask, op, testval,
#               template, message, subtests )
#   $p_desc - reference to the description string; the item's message is
#             appended to it on a match
#   $fh     - filehandle to test (undef when only checking the magic
#             file, in which case nothing ever matches)
#
# Returns 1 on a match, a false value otherwise.
#
# This is called recursively for the subtests of a matching item.
sub magicMatch {
    my ($item, $p_desc, $fh) = @_;

    # delayed evaluation.  if this is our first time considering
    # this item, then parse out its structure.  @$item is just the
    # raw string, line number, and subtests until we need the real info.
    # this saves time otherwise wasted parsing unused subtests.
    if (@$item == 3){
        my $tmp = readMagicLine(@$item);

        # the parse result could be undef if we ran into troubles while
        # reading the entry; check before dereferencing it.
        return unless defined($tmp);

        @$item = @$tmp;
    }

    # $fh is not be defined if -c.  that way we always return
    # false for every item which allows reading/checking the entire
    # magic file.
    return unless defined($fh);

    my ($offtype, $offset, $numbytes, $type, $mask, $op, $testval,
        $template, $message, $subtests) = @$item;

    # bytes from file
    my $data;

    # set to true if match
    my $match = 0;

    # offset = [ off1, sz, template, off2 ] for indirect offset
    if ($offtype == 1) {
        my ($off1, $sz, $ind_template, $off2) = @$offset;
        $fh->seek($off1,0) or return;
        if ($fh->read($data,$sz) != $sz) { return };
        $off2 += unpack($ind_template,$data);
        $fh->seek($off2,0) or return;
    }
    elsif ($offtype == 2) {
        # relative offsets from previous seek
        $fh->seek($offset,1) or return;
    }
    else {
        # absolute offset
        $fh->seek($offset,0) or return;
    }

    if ($type =~ /^string/) {
        # read the length of the match string unless the
        # comparison is '>' ($numbytes == 0), in which case
        # read to the next null or "\n". (that's what BSD's file does)
        if ($numbytes > 0) {
            if ($fh->read($data,$numbytes) != $numbytes) { return; }
        }
        else {
            my $ch = $fh->getc();
            while (defined($ch) && $ch ne "\0" && $ch ne "\n") {
                $data .= $ch;
                $ch = $fh->getc();
            }
        }

        # now do the comparison
        if ($op eq '=') {
            $match = ($data eq $testval);
        }
        elsif ($op eq '<') {
            $match = ($data lt $testval);
        }
        elsif ($op eq '>') {
            $match = ($data gt $testval);
        }
        # else bogus op, but don't die, just skip

        if ($checkMagic) {
            print STDERR "STRING: $data $op $testval => $match\n";
        }

    }
    else {
        #numeric

        # read up to 4 bytes
        if ($fh->read($data,$numbytes) != $numbytes) { return; }

        # If template is a ref to an array of 3 letters,
        # then this is an endian
        # number which must be first unpacked into an unsigned and then
        # coerced into a signed.  Is there a better way?
        if (ref($template)) {
            $data = unpack($$template[2],
                           pack($$template[1],
                                unpack($$template[0],$data)));
        }
        else {
            $data = unpack($template,$data);
        }

        # if mask
        if (defined($mask)) {
            $data &= $mask;
        }

        # Now do the check
        if ($op eq '=') {
            $match = ($data == $testval);
        }
        elsif ($op eq 'x') {
            $match = 1;
        }
        elsif ($op eq '!') {
            $match = ($data != $testval);
        }
        elsif ($op eq '&') {
            $match = (($data & $testval) == $testval);
        }
        elsif ($op eq '^') {
            $match = ((~$data & $testval) == $testval);
        }
        elsif ($op eq '<') {
            $match = ($data < $testval);
        }
        elsif ($op eq '>') {
            $match = ($data > $testval);
        }
        # else bogus entry that we're ignoring

        if ($checkMagic) {
            print STDERR "NUMERIC: $data $op $testval => $match\n";
        }

    }

    if ($match) {
        # it's pretty common to find "\b" in the message, but
        # sprintf doesn't insert a backspace, so a leading "\b" is
        # simply stripped before formatting.
        if ($message =~ s/^\\b//) {
            $$p_desc .= sprintf($message,$data);
        }
        else {
            $$p_desc .= sprintf($message,$data) if $message;
        }

        # every subtest is tried; their results only affect the
        # description, not the return value.
        my $subtest;
        foreach $subtest (@$subtests) {
            magicMatch($subtest,$p_desc,$fh);
        }

        return 1;
    }

}
912
# Compare a magic item with a string of data (the in-memory counterpart
# of magicMatch).
#
# Arguments:
#   $item   - array ref holding the item; either the unparsed form
#             [ raw line, line number, subtests ] or the parsed form
#             ( offtype, offset, numbytes, type, mask, op, testval,
#               template, message, subtests )
#   $p_desc - reference to the description string; the item's message is
#             appended to it on a match
#   $str    - the data to test
#
# Returns 1 on a match, a false value otherwise.  Items using relative
# offsets never match because there is no "previous seek" for a string.
#
# This is called recursively for the subtests of a matching item; each
# subtest is applied to the complete original string.
sub magicMatchStr {
    my ($item, $p_desc, $str) = @_;
    my $origstr = $str;

    # delayed evaluation.  if this is our first time considering
    # this item, then parse out its structure.  @$item is just the
    # raw string, line number, and subtests until we need the real info.
    # this saves time otherwise wasted parsing unused subtests.
    if (@$item == 3){
        my $tmp = readMagicLine(@$item);

        # the parse result could be undef if we ran into troubles while
        # reading the entry.
        return unless defined($tmp);

        @$item = @$tmp;
    }

    # nothing can match undefined or empty data
    return unless defined($str);
    return if ($str eq '');

    my ($offtype, $offset, $numbytes, $type, $mask, $op, $testval,
        $template, $message, $subtests) = @$item;
    return unless defined $op;

    # bytes from the string
    my $data;

    # set to true if match
    my $match = 0;

    # offset = [ off1, sz, template, off2 ] for indirect offset
    if ($offtype == 1) {
        my ($off1, $sz, $ind_template, $off2) = @$offset;

        # the pointer itself lives at $off1 and must be fully present
        return if (length($str) < $off1 + $sz);
        $data = substr($str, $off1, $sz);

        # the value found there is added to $off2 to give the real
        # offset, which must lie inside the data
        $off2 += unpack($ind_template,$data);
        return if ($off2 < 0 || length($str) < $off2);
        $str = substr($str, $off2);
    }
    elsif ($offtype == 2) {
        # can't handle relative offsets from previous seek
        return;
    }
    else {
        # absolute offset
        return if ($offset > length($str));
        $str = substr($str, $offset);
    }

    if ($type =~ /^string/) {
        # read the length of the match string unless the
        # comparison is '>' ($numbytes == 0), in which case
        # read to the next null or "\n". (that's what BSD's file does)
        if ($numbytes > 0) {
            # like a short read from a file: not enough data, no match
            return if (length($str) < $numbytes);
            $data = substr($str, 0, $numbytes);
        }
        else {
            ($data) = $str =~ /\A([^\0\n]*)/;
        }

        # now do the comparison
        if ($op eq '=') {
            $match = ($data eq $testval);
        }
        elsif ($op eq '<') {
            $match = ($data lt $testval);
        }
        elsif ($op eq '>') {
            $match = ($data gt $testval);
        }
        # else bogus op, but don't die, just skip

        if ($checkMagic) {
            print STDERR "STRING: $data $op $testval => $match\n";
        }

    }
    else {
        #numeric

        # take exactly the number of bytes this type needs, so that
        # short values close to the end of the data can still be tested
        return if (length($str) < $numbytes);
        $data = substr($str, 0, $numbytes);

        # If template is a ref to an array of 3 letters,
        # then this is an endian
        # number which must be first unpacked into an unsigned and then
        # coerced into a signed.  Is there a better way?
        if (ref($template)) {
            $data = unpack($$template[2],
                           pack($$template[1],
                                unpack($$template[0],$data)));
        }
        else {
            $data = unpack($template,$data);
        }

        # if mask
        if (defined($mask)) {
            $data &= $mask;
        }

        # Now do the check
        if ($op eq '=') {
            $match = ($data == $testval);
        }
        elsif ($op eq 'x') {
            $match = 1;
        }
        elsif ($op eq '!') {
            $match = ($data != $testval);
        }
        elsif ($op eq '&') {
            $match = (($data & $testval) == $testval);
        }
        elsif ($op eq '^') {
            $match = ((~$data & $testval) == $testval);
        }
        elsif ($op eq '<') {
            $match = ($data < $testval);
        }
        elsif ($op eq '>') {
            $match = ($data > $testval);
        }
        # else bogus entry that we're ignoring

        if ($checkMagic) {
            print STDERR "NUMERIC: $data $op $testval => $match\n";
        }

    }

    if ($match) {
        # it's pretty common to find "\b" in the message, but
        # sprintf doesn't insert a backspace, so a leading "\b" is
        # simply stripped before formatting.
        if ($message =~ s/^\\b//) {
            $$p_desc .= sprintf($message,$data);
        }
        else {
            $$p_desc .= sprintf($message,$data) if $message;
        }

        # every subtest is tried against the original, unshifted data;
        # their results only affect the description.
        my $subtest;
        foreach $subtest (@$subtests) {
            magicMatchStr($subtest,$p_desc,$origstr);
        }

        return 1;
    }

}
1071
1072# readMagicEntry($pa_magic, $MF, $depth)
1073#
1074# reads the next entry from the magic file and stores it as
1075# a ref to an array at the end of @$pa_magic.
1076#
1077# $MF = [ filehandle, last buffered line, line count ]
1078#
1079# This is called recursively with increasing $depth to read in sub-clauses
1080#
1081# returns the depth of the current buffered line.
1082#
sub readMagicEntry {
    # Reads one top-level entry (plus any nested '>' sub-entries) from the
    # magic file and appends it, still unparsed, to @$pa_magic.
    #
    #   $pa_magic - ref to the array that receives the entry
    #   $MF       - [ filehandle, last buffered line, line count ]
    #   $depth    - nesting depth being read (number of leading '>');
    #               treated as 0 when undefined
    #
    # Each stored entry is the temporary triple
    #   [ raw line, line number, subtests array ref ]
    # which is exactly the argument list readMagicLine() expects, so
    # parsing can be delayed until the entry is first evaluated.
    #
    # Returns the depth of the line left buffered in $$MF[1] when a line
    # belonging to another entry is met, and an empty return when a nested
    # group ends the file or pops more than one level.
    # NOTE(review): when the loop is left through 'last' at end of file the
    # sub falls off its end without an explicit return; that is kept as in
    # the original because the callers are not visible from here.
    my ($pa_magic, $MF, $depth) = @_;

    # <$$MF[0]> would be parsed as a glob() pattern, not a readline; the
    # angle operator only reads from a bareword or a simple scalar.
    my $magicFH = $$MF[0];

    # a ref to an array containing a magic line's components
    my ($entry, $line);

    $depth = 0 if (! defined $depth);

    $line = $$MF[1];		# buffered last line
    while (1) {
        $line = '' if (! defined $line);

        # skip comments and blank lines
        if ($line =~ /^\#/ || $line =~ /^\s*$/) {
            last if $magicFH->eof();
            $line = <$magicFH>;
            $$MF[2]++;
            next;
        }

        my ($thisDepth) = ($line =~ /^(>+)/);
        $thisDepth = '' if (! defined $thisDepth);

        if (length($thisDepth) > $depth) {
            $$MF[1] = $line;

            # call ourselves recursively.  will return the depth
            # of the entry following the nested group.
            if ((readMagicEntry($entry->[2], $MF, $depth+1) || 0) < $depth ||
                $$MF[0]->eof())
            {
                return;
            }
            $line = $$MF[1];
        }
        elsif (length($thisDepth) < $depth) {
            # line belongs to an enclosing level; hand it back.
            $$MF[1] = $line;
            return length($thisDepth);
        }
        elsif (defined $entry && @$entry) {
            # already have an entry.  this is not a continuation.
            # save this line for the next call and exit.
            # (the original test, defined(@$entry), no longer compiles
            # on perl 5.22 and later.)
            $$MF[1] = $line;
            return length($thisDepth);
        }
        else {
            # we're here if the number of '>' is the same as the
            # current depth and we haven't read a magic line yet.

            # create temp entry
            # later -- if we ever get around to evaluating this condition --
            # we'll replace @$entry with the results from readMagicLine.
            $entry = [ $line , $$MF[2], [] ];

            # add to list
            push(@$pa_magic,$entry);

            # read the next line
            last if $magicFH->eof();
            $line = <$magicFH>;
            $$MF[2]++;
        }
    }
}
1147
1148# readMagicLine($line, $line_num, $subtests)
1149#
1150# parses the match info out of $line.  Returns a reference to an array.
1151#
1152#  Format is:
1153#
# [ offtype, offset, bytes, type, mask, operator, testval, template, sprintf, subtests ]
#      0       1       2     3     4       5         6        7         8        9
1156#
1157# subtests is an array like @$pa_magic.
1158#
sub readMagicLine {
    # Parses one raw magic-file line into its components.
    #
    #   $line     - the raw line (leading '>' continuation markers allowed)
    #   $line_num - line number, used only in warnings
    #   $subtests - array ref of nested entries, passed through untouched
    #
    # Returns a ref to
    #   [ offtype, offset, numbytes, type, mask, operator, testval,
    #     template, message, subtests ]
    # or an empty return (after a warning, except for a bad string
    # operator when $checkMagic is off) when the line cannot be parsed.
    my ($line, $line_num, $subtests) = @_;

    my ($offtype, $offset, $numbytes, $type, $mask,
        $operator, $testval, $template, $message);

    # this would be easier if escaped whitespace wasn't allowed.

    # grab the offset and type.  offset can either be a decimal, oct,
    # or hex offset or an indirect offset specified in parenthesis
    # like (x[.[bsl]][+-][y]), or a relative offset specified by &.
    # offtype : 0 = absolute, 1 = indirect, 2 = relative
    if ($line =~ s/^>*([&\(]?[a-fA-Flsx\.\+\-\d]+\)?)\s+(\S+)\s+//) {
        ($offset, $type) = ($1, $2);

        if ($offset =~ /^\(/) {
            # indirect offset.
            $offtype = 1;

            # store as a reference [ offset1 size template offset2 ]
            my ($o1, $itype, $o2) =
              ($offset =~ /\((\d+)(\.[bsl])?([\+\-]?\d+)?\)/);
            unless (defined $o1) {
                warn "Bad indirect offset at line $line_num. '$offset'\n";
                return;
            }

            # both the type suffix and the displacement are optional;
            # give them explicit defaults instead of operating on undef.
            $itype = 'l' unless defined $itype;   # default to long
            $o2    = 0   unless defined $o2;

            $o1 = oct($o1) if $o1 =~ /^0/;
            $o2 = oct($o2) if $o2 =~ /^0/;

            $itype =~ s/\.//;
            $itype =~ tr/b/c/;    # itype will be template for unpack

            my $sz = $itype;      # number of bytes
            $sz =~ tr/csl/124/;

            $offset = [ $o1, $sz, $itype, int($o2) ];
        }
        elsif ($offset =~ /^&/) {
            # relative offset
            $offtype = 2;

            $offset = substr($offset, 1);
            $offset = oct($offset) if $offset =~ /^0/;
        }
        else {
            # normal absolute offset
            $offtype = 0;

            # convert if needed
            $offset = oct($offset) if $offset =~ /^0/;
        }
    }
    else {
        warn "Bad Offset/Type at line $line_num. '$line'\n";
        return;
    }

    # check for & operator on type
    if ($type =~ s/&(.*)//) {
        $mask = $1;

        # convert if needed
        $mask = oct($mask) if $mask =~ /^0/;
    }

    # check if type is valid
    if (!exists($TEMPLATES{$type}) && $type !~ /^string/) {
        warn "Invalid type '$type' at line $line_num\n";
        return;
    }

    # take everything after the first non-escaped space
    if ($line =~ s/([^\\])\s+(.*)/$1/) {
        $message = $2;
    }
    else {
        warn "Missing or invalid test condition or message at line $line_num\n";
        return;
    }

    # remove the return if it's still there
    $line =~ s/\n$//;

    # get the operator.  if 'x', must be alone.  default is '='.
    if ($line =~ s/^([><&^=!])//) {
        $operator = $1;
    }
    elsif ($line eq 'x') {
        $operator = 'x';
    }
    else {
        $operator = '=';
    }

    if ($type =~ /string/) {
        $testval = $line;

        # decode every backslash escape in a single left-to-right pass:
        # \xH or \xHH (hex), \O .. \OOO (octal), or a single char looked
        # up in %ESC.  One pass guarantees that a backslash produced by
        # an escape (e.g. \134) is never re-read as the start of another
        # escape, and the hex branch accepts all hex digits, not just 0-7.
        $testval =~ s{\\(x[0-9A-Fa-f]{1,2}|[0-7]{1,3}|.)}{
            my $esc = $1;
            $esc =~ /^(?:x.|[0-7])/ ? chr(oct($esc)) : ($ESC{$esc} || $esc);
        }eg;

        # put the number of bytes to read in numbytes.
        # '0' means read to \0 or \n.
        if ($operator =~ /[>x]/) {
            $numbytes = 0;
        }
        elsif ($operator =~ /[=<]/) {
            $numbytes = length($testval);
        }
        elsif ($operator eq '!') {
            # annoying special case.  ! operator only applies to numerics so
            # put it back.
            $testval = $operator . $testval;
            $numbytes = length($testval);
            $operator = '=';
        }
        else {
            # there's a bug in my magic file where there's
            # a line that says "0	string	^!<arc..." and the BSD
            # file program treats the argument like a numeric.  To minimize
            # hassles, complain about bad ops only if -c is set.
            warn "Invalid operator '$operator' for type 'string' at line $line_num.\n"
              if $checkMagic;
            return;
        }
    }
    else {
        # numeric
        if ($operator ne 'x') {
            # this conversion is very forgiving.  it's faster and
            # it doesn't complain about bugs in popular magic files,
            # but it will silently turn a string into zero.
            if ($line =~ /^0/) {
                $testval = oct($line);
            }
            else {
                $testval = int($line);
            }
        }

        ($template, $numbytes) = @{$TEMPLATES{$type}};

        # unset coercion of $unsigned unless we're doing order comparison
        if (ref($template)) {
            $template = $$template[0]
              unless $operator eq '>' || $operator eq '<';
        }
    }

    return [ $offtype, $offset, $numbytes, $type, $mask,
             $operator, $testval, $template, $message, $subtests ];
}
1314
1315# recursively write the magic file to stderr.  Numbers are written
1316# in decimal.
sub dumpMagic {
    # Recursively writes the magic entries in @$magic to STDERR, one line
    # per entry, prefixed with $depth '>' characters.
    #
    #   $magic - ref to an array of entries (defaults to an empty list)
    #   $depth - current nesting depth (defaults to 0)
    #
    # Entries still in their unparsed 3-element form are parsed in place
    # with readMagicLine() first; entries that fail to parse are skipped.
    my ($magic,$depth) = @_;
    $magic = [] unless defined $magic;
    $depth = 0 unless defined $depth;

    foreach my $entry (@$magic) {
        # guard before dereferencing, not after
        next if !defined($entry);

        # delayed evaluation.
        if (@$entry == 3) {
            my $parsed = readMagicLine(@$entry);

            # readMagicLine returns nothing for a bad line (it has
            # already warned); leave the raw entry alone and move on.
            next unless defined $parsed;
            @$entry = @$parsed;
        }

        my ($offtype, $offset, $numbytes, $type, $mask, $op, $testval,
            $template, $message, $subtests) = @$entry;

        print STDERR '>'x$depth;
        if ($offtype == 1) {
            # show the unpack template as a magic type letter, working
            # on a copy so the stored entry stays usable for matching.
            (my $letter = $offset->[2]) =~ tr/c/b/;
            print STDERR "($offset->[0].$letter$offset->[3])";
        }
        elsif ($offtype == 2) {
            print STDERR "&",$offset;
        }
        else {
            # offtype == 0
            print STDERR $offset;
        }
        print STDERR "\t",$type;
        if (defined $mask) { print STDERR "&",$mask; }

        # testval is undefined for the numeric 'x' operator
        print STDERR "\t",$op,(defined $testval ? $testval : ''),
                     "\t",$message,"\n";

        if ($subtests) {
            dumpMagic($subtests,$depth+1);
        }
    }
}
1356
13571;
1358__DATA__
1359# Magic data for mod_mime_magic Apache module (originally for file(1) command)
1360# The module is described in htdocs/manual/mod/mod_mime_magic.html
1361#
1362# The format is 4-5 columns:
1363#    Column #1: byte number to begin checking from, ">" indicates continuation
1364#    Column #2: type of data to match
1365#    Column #3: contents of data to match
1366#    Column #4: MIME type of result
1367#    Column #5: MIME encoding of result (optional)
1368
1369#------------------------------------------------------------------------------
1370# Localstuff:  file(1) magic for locally observed files
1371# Add any locally observed files here.
1372#
1373
# The following parameters are created for Namazu.
1375# <http://www.namazu.org/>
1376#
1377# 1999/08/13
1378#0	string		\<!--\ MHonArc		text/html; x-type=mhonarc
13790	string		BZh			application/x-bzip2
1380
# The following parameters are a local hack.
1382#
1383# 1999/09/09
1384# VRML (suggested by Masao Takaku)
13850	string		#VRML\ V1.0\ ascii	model/vrml
13860	string		#VRML\ V2.0\ utf8	model/vrml
1387
1388#------------------------------------------------------------------------------
1389# end local stuff
1390#------------------------------------------------------------------------------
1391
1392#------------------------------------------------------------------------------
1393# html:  file(1) magic for HTML (HyperText Markup Language) docs
1394#
1395# from Daniel Quinlan <quinlan@yggdrasil.com>
1396#
13970	string		\<!DOCTYPE\ HTML	text/html
13980	string		\<!DOCTYPE\ html	text/html
13990	string		\<HEAD		text/html
14000	string		\<head		text/html
14010	string		\<TITLE		text/html
14020	string		\<title		text/html
14030       string          \<html          text/html
14040       string          \<HTML          text/html
14050	string		\<!--		text/html
14060	string		\<h1		text/html
14070	string		\<H1		text/html
1408
1409#------------------------------------------------------------------------------
1410# mail.news:  file(1) magic for mail and news
1411#
1412# There are tests to ascmagic.c to cope with mail and news.
14130	string		Relay-Version: 	message/rfc822
14140	string		#!\ rnews	message/rfc822
14150	string		N#!\ rnews	message/rfc822
14160	string		Forward\ to 	message/rfc822
14170	string		Pipe\ to 	message/rfc822
14180	string		Return-Path:	message/rfc822
14190	string		Received:	message/rfc822
14200	string		Path:		message/news
14210	string		Xref:		message/news
14220	string		From:		message/rfc822
14230	string		Article 	message/news
1424
1425# Acrobat
1426# (due to clamen@cs.cmu.edu)
14270	string		%PDF-		application/pdf
1428
1429# ZIP archiver
14300		string	PK				application/x-zip
1431
1432#------------------------------------------------------------------------------
1433# msword: file(1) magic for MS Word files
1434#
1435# Contributor claims:
1436# Reversed-engineered MS Word magic numbers
1437#
1438
14390	string		\376\067\0\043			application/msword
1440#0	string		\320\317\021\340\241\261	application/msword
14410	string		\333\245-\0\0\0			application/msword
1442
1443#------------------------------------------------------------------------------
1444# Java
1445
14460	short		0xcafe
1447>2	short		0xbabe		application/java
1448
1449#------------------------------------------------------------------------------
1450# audio:  file(1) magic for sound formats
1451#
1452# from Jan Nicolai Langfeldt <janl@ifi.uio.no>,
1453#
1454
1455# Sun/NeXT audio data
14560	string		.snd
1457>12	belong		1		audio/basic
1458>12	belong		2		audio/basic
1459>12	belong		3		audio/basic
1460>12	belong		4		audio/basic
1461>12	belong		5		audio/basic
1462>12	belong		6		audio/basic
1463>12	belong		7		audio/basic
1464
1465>12	belong		23		audio/x-adpcm
1466
1467# DEC systems (e.g. DECstation 5000) use a variant of the Sun/NeXT format
1468# that uses little-endian encoding and has a different magic number
1469# (0x0064732E in little-endian encoding).
14700	lelong		0x0064732E
1471>12	lelong		1		audio/x-dec-basic
1472>12	lelong		2		audio/x-dec-basic
1473>12	lelong		3		audio/x-dec-basic
1474>12	lelong		4		audio/x-dec-basic
1475>12	lelong		5		audio/x-dec-basic
1476>12	lelong		6		audio/x-dec-basic
1477>12	lelong		7		audio/x-dec-basic
1478#                                       compressed (G.721 ADPCM)
1479>12	lelong		23		audio/x-dec-adpcm
1480
1481# Bytes 0-3 of AIFF, AIFF-C, & 8SVX audio files are "FORM"
1482#					AIFF audio data
14838	string		AIFF		audio/x-aiff
1484#					AIFF-C audio data
14858	string		AIFC		audio/x-aiff
1486#					IFF/8SVX audio data
14878	string		8SVX		audio/x-aiff
1488
1489# Creative Labs AUDIO stuff
1490#					Standard MIDI data
14910	string	MThd			audio/unknown
1492#>9 	byte	>0			(format %d)
1493#>11	byte	>1			using %d channels
1494#					Creative Music (CMF) data
14950	string	CTMF			audio/unknown
1496#					SoundBlaster instrument data
14970	string	SBI			audio/unknown
1498#					Creative Labs voice data
14990	string	Creative\ Voice\ File	audio/unknown
1500## is this next line right?  it came this way...
1501#>19	byte	0x1A
1502#>23	byte	>0			- version %d
1503#>22	byte	>0			\b.%d
1504
1505# [GRR 950115:  is this also Creative Labs?  Guessing that first line
1506#  should be string instead of unknown-endian long...]
1507#0	long		0x4e54524b	MultiTrack sound data
1508#0	string		NTRK		MultiTrack sound data
1509#>4	long		x		- version %ld
1510
1511# Microsoft WAVE format (*.wav)
1512# [GRR 950115:  probably all of the shorts and longs should be leshort/lelong]
1513#					Microsoft RIFF
1514#0	string		RIFF		audio/x-msvideo
15150	string		RIFF
1516#					- WAVE format
1517>8	string		WAVE		audio/x-wav
1518
1519#------------------------------------------------------------------------------
1520# c-lang:  file(1) magic for C programs or various scripts
1521#
1522
1523# XPM icons (Greg Roelofs, newt@uchicago.edu)
1524# ideally should go into "images", but entries below would tag XPM as C source
15250	string		/*\ XPM		image/x-xbm
1526
1527# this first will upset you if you're a PL/1 shop... (are there any left?)
1528# in which case rm it; ascmagic will catch real C programs
1529#					C or REXX program text
15300	string		/*		text/plain
1531#					C++ program text
15320	string		//		text/plain
1533
1534#------------------------------------------------------------------------------
1535# compress:  file(1) magic for pure-compression formats (no archives)
1536#
1537# compress, gzip, pack, compact, huf, squeeze, crunch, freeze, yabba, whap, etc.
1538#
1539# Formats for various forms of compressed data
1540# Formats for "compress" proper have been moved into "compress.c",
1541# because it tries to uncompress it to figure out what's inside.
1542
1543# standard unix compress
1544#0	string		\037\235	application/octet-stream	x-compress
15450	string		\037\235	application/x-compress
1546
1547# gzip (GNU zip, not to be confused with [Info-ZIP/PKWARE] zip archiver)
1548#0       string          \037\213        application/octet-stream	x-gzip
15490       string          \037\213        application/x-gzip
1550
1551# According to gzip.h, this is the correct byte order for packed data.
15520	string		\037\036	application/octet-stream
1553#
1554# This magic number is byte-order-independent.
1555#
15560	short		017437		application/octet-stream
1557
1558# XXX - why *two* entries for "compacted data", one of which is
1559# byte-order independent, and one of which is byte-order dependent?
1560#
1561# compacted data
15620	short		0x1fff		application/octet-stream
15630	string		\377\037	application/octet-stream
1564# huf output
15650	short		0145405		application/octet-stream
1566
1567# Squeeze and Crunch...
1568# These numbers were gleaned from the Unix versions of the programs to
1569# handle these formats.  Note that I can only uncrunch, not crunch, and
1570# I didn't have a crunched file handy, so the crunch number is untested.
1571#				Keith Waclena <keith@cerberus.uchicago.edu>
1572#0	leshort		0x76FF		squeezed data (CP/M, DOS)
1573#0	leshort		0x76FE		crunched data (CP/M, DOS)
1574
1575# Freeze
1576#0	string		\037\237	Frozen file 2.1
1577#0	string		\037\236	Frozen file 1.0 (or gzip 0.5)
1578
1579# lzh?
1580#0	string		\037\240	LZH compressed data
1581
1582#------------------------------------------------------------------------------
1583# frame:  file(1) magic for FrameMaker files
1584#
1585# This stuff came on a FrameMaker demo tape, most of which is
1586# copyright, but this file is "published" as witness the following:
1587#
15880	string		\<MakerFile	application/x-frame
15890	string		\<MIFFile	application/x-frame
15900	string		\<MakerDictionary	application/x-frame
15910	string		\<MakerScreenFon	application/x-frame
15920	string		\<MML		application/x-frame
15930	string		\<Book		application/x-frame
15940	string		\<Maker		application/x-frame
1595
1596#------------------------------------------------------------------------------
1597# images:  file(1) magic for image formats (see also "c-lang" for XPM bitmaps)
1598#
1599# originally from jef@helios.ee.lbl.gov (Jef Poskanzer),
1600# additions by janl@ifi.uio.no as well as others. Jan also suggested
1601# merging several one- and two-line files into here.
1602#
1603# XXX - byte order for GIF and TIFF fields?
1604# [GRR:  TIFF allows both byte orders; GIF is probably little-endian]
1605#
1606
1607# [GRR:  what the hell is this doing in here?]
1608#0	string		xbtoa		btoa'd file
1609
1610# PBMPLUS
1611#					PBM file
16120	string		P1		image/x-portable-bitmap
1613#					PGM file
16140	string		P2		image/x-portable-greymap
1615#					PPM file
16160	string		P3		image/x-portable-pixmap
1617#					PBM "rawbits" file
16180	string		P4		image/x-portable-bitmap
1619#					PGM "rawbits" file
16200	string		P5		image/x-portable-greymap
1621#					PPM "rawbits" file
16220	string		P6		image/x-portable-pixmap
1623
1624# NIFF (Navy Interchange File Format, a modification of TIFF)
1625# [GRR:  this *must* go before TIFF]
16260	string		IIN1		image/x-niff
1627
1628# TIFF and friends
1629#					TIFF file, big-endian
16300	string		MM		image/tiff
1631#					TIFF file, little-endian
16320	string		II		image/tiff
1633
1634# possible GIF replacements; none yet released!
1635# (Greg Roelofs, newt@uchicago.edu)
1636#
1637# GRR 950115:  this was mine ("Zip GIF"):
1638#					ZIF image (GIF+deflate alpha)
16390	string		GIF94z		image/unknown
1640#
1641# GRR 950115:  this is Jeremy Wohl's Free Graphics Format (better):
1642#					FGF image (GIF+deflate beta)
16430	string		FGF95a		image/unknown
1644#
1645# GRR 950115:  this is Thomas Boutell's Portable Bitmap Format proposal
1646# (best; not yet implemented):
1647#					PBF image (deflate compression)
16480	string		PBF		image/unknown
1649
1650# GIF
16510	string		GIF		image/gif
1652
1653# JPEG images
16540	beshort		0xffd8		image/jpeg
1655
1656# PC bitmaps (OS/2, Windoze BMP files)  (Greg Roelofs, newt@uchicago.edu)
16570	string		BM		image/bmp
1658#>14	byte		12		(OS/2 1.x format)
1659#>14	byte		64		(OS/2 2.x format)
1660#>14	byte		40		(Windows 3.x format)
1661#0	string		IC		icon
1662#0	string		PI		pointer
1663#0	string		CI		color icon
1664#0	string		CP		color pointer
1665#0	string		BA		bitmap array
1666
1667# PNG images
1668# Suggested by Jamie LeTual.
16690	string		\211PNG		image/png
1670
1671#------------------------------------------------------------------------------
1672# lisp:  file(1) magic for lisp programs
1673#
1674# various lisp types, from Daniel Quinlan (quinlan@yggdrasil.com)
16750	string	;;			text/plain
1676# Emacs 18 - this is always correct, but not very magical.
16770	string	\012(			application/x-elc
1678# Emacs 19
16790	string	;ELC\023\000\000\000	application/x-elc
1680
1681#------------------------------------------------------------------------------
1682# printer:  file(1) magic for printer-formatted files
1683#
1684
1685# PostScript
16860	string		%!		application/postscript
16870	string		\004%!		application/postscript
1688# EPS
1689# Jason's support for EPSF <jmaggard@timesdispatch.com>
169047 string  EPSF  image/eps
1691
1692#------------------------------------------------------------------------------
1693# sc:  file(1) magic for "sc" spreadsheet
1694#
169538	string		Spreadsheet	application/x-sc
1696
1697#------------------------------------------------------------------------------
1698# tex:  file(1) magic for TeX files
1699#
1700# XXX - needs byte-endian stuff (big-endian and little-endian DVI?)
1701#
1702# From <conklin@talisman.kaleida.com>
1703
1704# Although we may know the offset of certain text fields in TeX DVI
1705# and font files, we can't use them reliably because they are not
1706# zero terminated. [but we do anyway, christos]
17070	string		\367\002	application/x-dvi
1708#0	string		\367\203	TeX generic font data
1709#0	string		\367\131	TeX packed font data
1710#0	string		\367\312	TeX virtual font data
1711#0	string		This\ is\ TeX,	TeX transcript text
1712#0	string		This\ is\ METAFONT,	METAFONT transcript text
1713
1714# There is no way to detect TeX Font Metric (*.tfm) files without
1715# breaking them apart and reading the data.  The following patterns
1716# match most *.tfm files generated by METAFONT or afm2tfm.
1717#2	string		\000\021	TeX font metric data
1718#2	string		\000\022	TeX font metric data
1719#>34	string		>\0		(%s)
1720
1721# Texinfo and GNU Info, from Daniel Quinlan (quinlan@yggdrasil.com)
17220	string		\\input\ texinfo		text/x-texinfo
17230	string		This\ is\ Info\ file	text/x-info
1724#0	string		This\ is\ 				text/x-info
1725
1726# correct TeX magic for Linux (and maybe more)
1727# from Peter Tobias (tobias@server.et-inf.fho-emden.de)
1728#
17290	leshort		0x02f7		application/x-dvi
1730
1731# RTF - Rich Text Format
17320	string		{\\rtf		application/rtf
1733
1734#------------------------------------------------------------------------------
1735# animation:  file(1) magic for animation/movie formats
1736#
1737# animation formats, originally from vax@ccwf.cc.utexas.edu (VaX#n8)
1738#						MPEG file
17390	string		\000\000\001\263	video/mpeg
1740#
1741# The contributor claims:
1742#   I couldn't find a real magic number for these, however, this
1743#   -appears- to work.  Note that it might catch other files, too,
1744#   so BE CAREFUL!
1745#
1746# Note that title and author appear in the two 20-byte chunks
1747# at decimal offsets 2 and 22, respectively, but they are XOR'ed with
1748# 255 (hex FF)! DL format SUCKS BIG ROCKS.
1749#
1750#						DL file version 1 , medium format (160x100, 4 images/screen)
17510	byte		1			video/unknown
17520	byte		2			video/unknown
1753
1754#------------------------------------------------------------------------------
1755# ichitaro456: file(1) magic for Just System Word Processor Ichitaro
1756#
1757# Contributor kenzo-:
1758# Reversed-engineered JS Ichitaro magic numbers
1759#
1760
17610	string		DOC
1762>43	byte		0x14		application/ichitaro4
1763>144	string	JDASH		application/ichitaro4
1764
17650	string		DOC
1766>43	byte		0x15		application/ichitaro5
1767
17680	string		DOC
1769>43	byte		0x16		application/ichitaro6
1770
1771#------------------------------------------------------------------------------
1772# office97: file(1) magic for MicroSoft Office files
1773#
1774# Contributor kenzo-:
1775# Reversed-engineered MS Office magic numbers
1776#
1777
1778#0       string          \320\317\021\340\241\261\032\341
1779#>48     byte            0x1B            application/excel
1780
17812080	string	Microsoft\ Excel\ 5.0\ Worksheet	application/excel
17822114	string	Biff5								application/excel
1783
17840       string	\224\246\056	application/msword
1785
17860		belong	0x31be0000		application/msword
1787
17880		string	PO^Q`			application/msword
1789
17900	string		\320\317\021\340\241\261\032\341
1791>546	string	bjbj			application/msword
1792>546	string	jbjb			application/msword
1793
1794512		string	R\0o\0o\0t\0\ \0E\0n\0t\0r\0y	application/msword
1795
17962080	string	Microsoft\ Word\ 6.0\ Document	application/msword
17972080	string	Documento\ Microsoft\ Word\ 6	application/msword
17982112	string	MSWordDoc						application/msword
1799
1800#0	string		\320\317\021\340\241\261\032\341	application/powerpoint
18010	string		\320\317\021\340\241\261\032\341	application/msword
1802
1803#
1804# MPEG audio/video format
# Contributor: Peter Breton
1806#
1807
18080	belong		0x000001b3	video/mpeg
18090	belong		0x000001ba	video/mpeg
18100	beshort		&0xffe0		audio/mpeg
1811
1812#
1813# QuickTime format
# Contributor: Peter Breton
1815#
1816
18170	string		MOVI		video/quicktime
18184	string		moov		video/quicktime
18194	string		mdat		video/quicktime
1820
1821# WinNT/WinCE PE files (Warner Losh, imp@village.org)
1822#
1823128		string	PE\000\000		application/octet-stream
18240		string	PE\000\000		application/octet-stream
1825
1826# miscellaneous formats
18270		string	LZ				application/octet-stream
1828
1829# .EXE formats (Greg Roelofs, newt@uchicago.edu)
1830#
18310		string	MZ
1832>24		string	@				application/octet-stream
1833
18340		string	MZ
1835>30		string	Copyright\ 1989-1990\ PKWARE\ Inc.	application/x-zip
1836
18370		string	MZ
1838>30		string	PKLITE\ Copr.	application/x-zip
1839
18400		string	MZ
1841>36		string	LHa's\ SFX		application/x-lha
1842
18430		string	MZ
1844>36		string	LHA's\ SFX		application/x-lha
1845
18460		string	MZ				application/octet-stream
1847
1848# LHA archiver
18492		string	-lh
1850>6		string	-				application/x-lha
1851
1852# POSIX tar archives
1853257		string	ustar\0			application/x-tar
1854257		string	ustar\040\040\0	application/x-gtar
1855
1856# TNEF file
18570		lelong	0x223E9F78	application/ms-tnef
1858
1859# ARC archiver
18600	lelong&0x8080ffff	0x0000081a	application/x-arc
18610	lelong&0x8080ffff	0x0000091a	application/x-arc
18620	lelong&0x8080ffff	0x0000021a	application/x-arc
18630	lelong&0x8080ffff	0x0000031a	application/x-arc
18640	lelong&0x8080ffff	0x0000041a	application/x-arc
18650	lelong&0x8080ffff	0x0000061a	application/x-arc
1866# Zoo archiver
186720	lelong		0xfdc4a7dc	application/x-zoo
1868# ARJ archiver (jason@jarthur.Claremont.EDU)
18690	leshort		0xea60		application/x-arj
1870# RAR archiver (Greg Roelofs, newt@uchicago.edu)
18710	string		Rar!		application/x-rar
1872
1873