1# rtcore.tcl --
2#
3#	Runtime core for file type recognition engines written in pure Tcl.
4#
5# Copyright (c) 2016-2017 Poor Yorick     <tk.tcl.core.tcllib@pooryorick.com>
6# Copyright (c) 2004-2005 Colin McCormack <coldstore@users.sourceforge.net>
7# Copyright (c) 2005      Andreas Kupries <andreas_kupries@users.sourceforge.net>
8#
9# See the file "license.terms" for information on usage and redistribution
10# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
11#
12# RCS: @(#) $Id: rtcore.tcl,v 1.5 2005/09/28 04:51:19 andreas_kupries Exp $
13
14#####
15#
16# "mime type recognition in pure tcl"
17# http://wiki.tcl.tk/12526
18#
19# Tcl code harvested on:  10 Feb 2005, 04:06 GMT
20# Wiki page last updated: ???
21#
22#####
23
24#TODO  {
25#    {Required Functionality} {
26#	{implement full offset language} {
27#	    done
28#
29#	    by pooryorick
30#
31#	    time {2016 06}
32#	}
33#
34#	{implement pstring (pascal string, blerk)} {
35#	    done
36#
37#	    by pooryorick
38#
39#	    time {2016 06}
40#}
41#
42#	{implement regex form (blerk!)} {
43#	    done
44#
45#	    by pooryorick
46#
47#	    time {2016 06}
48#	}
49
50#	{implement string qualifiers} {
51#	    done
52#
53#	    by pooryorick
54#
55#	    time {2016 06}
56#	}
57#
58#	{finish implementing the indirect type}
59#
60#	{Maybe distinguish between binary and text tests, like file(n)}
61#
62#	{process and use strength directives}
63#
64#    }
65#}
66
67# ### ### ### ######### ######### #########
68## Requirements
69
70package require Tcl 8.5
71
72# ### ### ### ######### ######### #########
73## Implementation
74
75namespace eval ::fileutil::magic::rt {
76    # Configuration flag. (De)activate debugging output.
77    # This is done during initialization.
78    # Changes at runtime have no effect.
79
80    variable debug 0
81
82    # The maximum size of a substring to inspect from the file in question
83    variable maxstring 64
84
85    # The maximum length of any %s substitution in a resulting description is
86    variable maxpstring 64
87
88    variable regexdefaultlen 4096
89
90    # Runtime state.
91
92    variable cursor 0      ; # The current offset
93    variable fd     {}     ; # Channel to file under scrutiny
94    variable found 0       ; # Whether the last test produced a match
95    variable lfound {}     ; # For each level, whether a match was found
96    variable level 0
97    variable strbuf {}     ; # Input cache [*].
98    variable cache         ; # Cache of fetched and decoded numeric
99    array set cache {}	   ; # values.
100    variable result {}     ; # Accumulated recognition result.
101    variable extracted     ; # The value extracted for inspection
102    variable  last         ; # Behind last fetch locations,
103    array set last {}      ; # per nesting level.
104    variable weight 0      ; # The weight of the current part.
105                           ; # Basically string length of the contributing of
106			   ; # the potentially-matching part.
107
108    variable weighttotal 0 ; # The aggregate weight of the matching components of
109			   ; # the current test.
110
111    # [*] The vast majority of magic strings are in the first 4k of the file.
112
113    # Export APIs (full public, recognizer public)
114    namespace export open close file_start result
115    namespace export emit ext mime offset Nv N S Nvx Nx Sx L R I resultv U < >
116}
117
118# ### ### ### ######### ######### #########
119## Public API, general use.
120
121proc ::fileutil::magic::rt::> {} {
122    variable level
123    incr level
124}
125
126proc ::fileutil::magic::rt::< {} {
127    variable level
128    incr level -1
129}
130
131proc ::fileutil::magic::rt::classify {data} {
132    set bin_rx {[\x00-\x08\x0b\x0e-\x1f]}
133    if {[regexp $bin_rx $data] } {
134        return binary
135    } else {
136        return text
137    }
138}
139
140proc ::fileutil::magic::rt::mime value {
141    upvar 1 mime mime
142    set mime $value
143}
144
145proc ::fileutil::magic::rt::ext value {
146    upvar 1 ext ext
147    set ext $value
148}
149
150
151# open the file to be scanned
152proc ::fileutil::magic::rt::open {file} {
153    variable result {}
154    variable extracted {}
155    variable strbuf
156    variable fd
157    variable cache
158
159    set fd [::open $file]
160    ::fconfigure $fd -translation binary
161
162    # fill the string cache
163    set strbuf [::read $fd 4096]
164	set class [classify $strbuf]
165
166    # clear the fetch cache
167    catch {unset cache}
168    array set cache {}
169
170    return $fd
171}
172
173
174proc ::fileutil::magic::rt::close {} {
175    variable fd
176    ::close $fd
177    return
178}
179
180# mark the start of a magic file in debugging
181proc ::fileutil::magic::rt::file_start {name} {
182    ::fileutil::magic::rt::Debug {puts stderr "File: $name"}
183}
184
185
186# return the emitted result
187proc ::fileutil::magic::rt::result {{msg {}}} {
188    variable lfound {}
189    variable found
190    variable result
191    variable weight
192    variable weighttotal
193    if {$msg ne {}} {emit $msg}
194    set res [list $found $weighttotal $result]
195    set found 0
196    set weight 0
197    set weighttotal 0
198    set result {}
199    return -code return $res
200}
201
202proc ::fileutil::magic::rt::resultv {{msg {}}} {
203    try result on return result {
204	return $result
205    }
206}
207
208# ### ### ### ######### ######### #########
209## Public API, for use by a recognizer.
210
211# emit a description
212proc ::fileutil::magic::rt::emit msg {
213    variable found
214    variable lfound
215    variable level
216    variable maxpstring
217    variable extracted
218    variable result
219    variable weight
220    variable weighttotal
221    set found 1
222    dict set lfound $level 1
223    incr weighttotal $weight
224
225    #set map [list \
226    #    \\b "" \
227    #    %c [apply {extracted {
228    #        if {[catch {format %c $extracted} result]} {
229    #    	return {}
230    #        }
231    #        return $result
232
233    #    }} $extracted] \
234    #    %s  [string trim [string range $extracted 0 $maxpstring]] \
235    #    %ld $extracted \
236    #    %d  $extracted \
237    #]
238    #[::string map $map $msg]
239
240    # {to do} {Is only taking up to the first newline really a good general rule?}
241    regexp {\A[^\n\r]*} $extracted extracted2
242
243    regsub -all {\s+} $extracted2 { } extracted2
244
245    set arguments {}
246    set count [expr {[string length $msg] - [string length [
247	string map {% {}} $msg]]}]
248    for {set i 0} {$i < $count} {incr i} {
249	lappend arguments $extracted2
250    }
251    catch {set msg [format $msg {*}$arguments]}
252
253    # Assumption: [regexp] leaves $msg untouched if it fails
254    regexp {\A(\b|\\b)?(.*)$} $msg match b msg
255    if {$b ne {} && [llength $result]} {
256	lset result end [lindex $result end]$msg
257    } else {
258	lappend result $msg
259    }
260    return
261}
262
263proc ::fileutil::magic::rt::Nv {type offset compinvert mod mand} {
264    variable typemap
265    variable extracted
266    variable weight
267
268    # unpack the type characteristics
269    foreach {size scan} $typemap($type) break
270
271    # fetch the numeric field from the file
272    set extracted [Fetch $offset $size $scan]
273
274    if {$compinvert && $extracted ne {}} {
275	set extracted [expr ~$extracted]
276    }
277    if {$mod ne {} && $extracted ne {}} {
278	# there's a mask to be applied
279	set extracted [expr $extracted $mod $mand]
280    }
281
282    ::fileutil::magic::rt::Debug {puts stderr "NV $type $offset $mod: $extracted"}
283    set weight [string length $extracted]
284    return $extracted
285}
286
287proc ::fileutil::magic::rt::use {named file name} {
288    if [dict exists $named $file $name] {
289	set script [dict get $named $file $name]
290    } else {
291	dict for {file val} $named {
292	    if {[dict exists $val $name]} {
293		set script [dict get $val $name]
294		break
295	    }
296	}
297    }
298    if {![info exists script]} {
299	return -code error [list {name not found} $file $name]
300    }
301    return $script
302}
303
304# Numeric - get bytes of $type at $offset and $compare to $val
305# qual might be a mask
306proc ::fileutil::magic::rt::N {
307    type offset testinvert compinvert mod mand comp val} {
308    variable typemap
309    variable extracted
310    variable weight
311
312    # unpack the type characteristics
313    foreach {size scan} $typemap($type) break
314
315    # fetch the numeric field
316    set extracted [Fetch $offset $size $scan]
317    if {$extracted eq {}} {
318
319	# Rules like the following, from the jpeg file, imply that
320	# in the absence of an extracted value, a numerical value of
321	# 0 should be used
322
323	# From jpeg:
324	    ## Next, show thumbnail info, if it exists:
325	    #>>18    byte        !0      \b, thumbnail %dx
326	set extracted 0
327    }
328
329    # Would moving this before the fetch an optimisation ? The
330    # tradeoff is that we give up filling the cache, and it is unclear
331    # how often that value would be used. -- Profile!
332    if {$comp eq {x}} {
333	set weight 0
334	# anything matches - don't care
335	if {$testinvert} {
336	    return 0
337	} else {
338	    return 1
339	}
340    }
341
342    if {[string match $scan *me]} {
343	set data [me4 $data]
344	set scan I
345    }
346    # get value in binary form, then back to numeric
347    # this avoids problems with sign, as both values are
348    # [binary scan]-converted identically (see [treegen1])
349    binary scan [binary format $scan $val] $scan val
350
351    if {$compinvert && $extracted ne {}} {
352	set extracted [expr ~$extracted]
353    }
354
355    # perform comparison
356    if {$mod ne {}} {
357	# there's a mask to be applied
358	set extracted [expr $extracted $mod $mand]
359    }
360    switch $comp {
361	& {
362	    set c [expr {($extracted & $val) == $val}]
363	}
364	^ {
365	    set c [expr {($extracted & ~$val) == $extracted}]
366	}
367	== - != - < - > {
368	    set c [expr $extracted $comp $val]
369	}
370	default {
371	    #Should never reach this
372	    return -code error [list {unknown comparison operator} $comp]
373	}
374    }
375    # Do this last to minimize shimmering
376    set weight [string length $extracted]
377
378    ::fileutil::magic::rt::Debug {
379	puts stderr "numeric $type: $val $t$comp $extracted / $mod - $c"
380    }
381    if {$testinvert} {
382	set c [expr {!$c}]
383	return $c
384    } else {
385	return $c
386    }
387}
388
389proc ::fileutil::magic::rt::S {type offset testinvert mod mand comp val} {
390    variable cursor
391    variable extracted
392    variable fd
393    variable level
394    variable lfound
395    variable maxstring
396    variable regexdefaultlen
397    variable weight
398
399    # $compinvert is currently ignored for strings
400
401    set weight [string length $val]
402
403    switch $type {
404	pstring {
405	    set ptype B
406	    set vincluded 0
407	    # The last pstring type specifier wins
408	    foreach item $mod {
409		if {$item eq {J}} {
410		    set vincluded 1
411		} else {
412		    set ptype $item
413		}
414	    }
415	    lassign [dict get {B {b 1} H {S 2} h {s 2} L {I 4} l {i 4}} $ptype] scan slength
416	    set length [GetString $offset $slength]
417	    set offset $cursor
418	    binary scan $length ${scan}u length
419	    if {$vincluded} {
420		set length [expr {$length - $slength}]
421	    }
422	    set extracted [GetString $offset $length]
423	    set c [Smatch $val $comp $extracted $mod]
424	}
425	regex {
426	    if {$mand eq {}} {
427		set mand $regexdefaultlen
428	    }
429	    set extracted [GetString $offset $mand]
430	    if {[regexp $val $extracted match]} {
431		set weight [string length $match]
432	        set c 1
433	    } else {
434	        set c 0
435	    }
436	}
437	search {
438	    set limit $mand
439	    set extracted [GetString $offset $limit]
440	    if {[string first $val $extracted] >= 0} {
441		set weight [string length $val]
442		set c 1
443	    } else {
444		set c 0
445	    }
446	} default {
447	    # explicit "default" type, which is intended only to be used with
448	    # the "x" pattern
449	    set c [expr {[dict exists $lfound $level] ? ![dict get $lfound $level] : 1}]
450	} default {
451	    # get the string and compare it
452	    switch $type bestring16 - lestring16 {
453		set extracted [GetString $offset $maxstring]
454		set extracted [string range $extracted 0 1]
455		switch $type bestring16 {
456		    binary scan $extracted Su extracted
457		} lestring16 {
458		    binary scan $extracted Su extracted
459		}
460		set extracted [format %c $extracted]
461	    } default {
462		# If $val is 0, give [emit] something to work with .
463		if {$val eq  "\0"} {
464		    set extracted [GetString $offset $maxstring]
465		} else {
466		    set extracted [GetString $offset [string length $val]]
467		}
468	    }
469	    set c [Smatch $val $comp $extracted $mod]
470	}
471    }
472
473
474    ::fileutil::magic::rt::Debug {
475	puts "String '$val' $comp '$extracted' - $c"
476	if {$c} {
477	    puts "offset $offset - $extracted"
478	}
479    }
480    if {$testinvert} {
481	return [expr {!$c}]
482    } else {
483	return $c
484    }
485}
486
487proc ::fileutil::magic::rt::Smatch {val op string mod} {
488    variable weight
489    if {$op eq {x}} {
490	set weight 0
491	return 1
492    }
493
494    if {![string length $string] && $op in {eq == < <=}} {
495	if {$op in {eq == < <=}} {
496	    # Nothing matches an empty $string.
497	    return 0
498	}
499	return 1
500    }
501
502    if {$op eq {>} && [string length $val] > [string length $string]} {
503	return 1
504    }
505
506    # To preserve the semantics, the w operation must occur prior to the W
507    # operation (Assuming the interpretation that w makes all whitespace
508    # optional, relaxing the requirements of W) .
509    if {{w} in $mod} {
510	regsub -all {\s} $string[set string {}] {} string
511	regsub -all {\s} $val[set val {}] {} val
512    }
513
514    if {{W} in $mod} {
515	set blanklen [::tcl::mathfunc::max 0 {*}[
516	    lmap {_unused_ blanks} [regexp -all -indices -inline {(\s+)} $val] {
517	    expr {[lindex $blanks 1] - [lindex $blanks 0]}
518	}]]
519	if {![regexp "\s{$blanklen}" $string]} {
520	    ::fileutil::magic::rt::Debug {
521		puts "String '$val' $op '$string' - $c"
522		if {$c} {
523		    puts "offset $offset - $string"
524		}
525	    }
526	    return 0
527	}
528
529	regsub -all {\s+} $string[set string {}] { } string
530	regsub -all {\s+} $val[set val {}] { } val
531    }
532
533
534    if {{T} in $mod} {
535	set string [string trim $string[set string {}]]
536	set val [string tolower $val[set val {}]]
537    }
538
539    set string [string range $string  0 [string length $val]-1]
540
541    # The remaining code may assume that $string and $val have the same length
542    # .
543
544    set opnum [dict get {< -1 == 0 eq 0 != 0 ne 0 > 1} $op]
545
546    if {{c} in $mod || {C} in $mod} {
547	set res 1
548	if {{c} in $mod && {C} in $mod} {
549	    set string [string tolower $string[set string {}]]
550	    set val [string tolower $val[set val {}]]
551	} elseif {{c} in $mod} {
552	    foreach sc [split $string] vc [split $val] {
553		if {[string is lower $sc]} {
554		    set vc [string tolower $vc]
555		}
556		if {[::string compare $val $string] != $opnum} {
557		    set res 0
558		    break
559		}
560	    }
561	} elseif {{C} in $mode} {
562	    foreach vc [split $val] sc [split $string]  {
563		if {[string is upper $vc]} {
564		    set sc [string toupper $sc]
565		}
566		if {[::string compare $val $string] != $opnum} {
567		    set res 0
568		    break
569		}
570	    }
571	}
572    } else {
573	set res [expr {[::string compare $string $val] == $opnum}]
574    }
575    if {$op in {!= ne}} {
576	set res [expr {!$res}]
577    }
578    set weight [string length $val]
579    return $res
580}
581
582proc ::fileutil::magic::rt::Nvx {type offset compinvert mod mand} {
583    variable typemap
584    variable extracted
585    variable last
586    variable weight
587    variable level
588
589    # unpack the type characteristics
590    foreach {size scan} $typemap($type) break
591    set last($level) [expr {$offset + $size}]
592
593    set extracted [Nv $type $offset $compinvert $mod $mand]
594
595    ::fileutil::magic::rt::Debug {puts stderr "NVx $type $offset $extracted $mod $mand"}
596    return $extracted
597}
598
599# Numeric - get bytes of $type at $offset and $compare to $val
600# qual might be a mask
601proc ::fileutil::magic::rt::Nx {
602    type offset testinvert compinvert mod mand comp val} {
603
604    variable cursor
605    variable typemap
606    variable extracted
607    variable last
608    variable level
609    variable weight
610
611    set res [N $type $offset $testinvert $compinvert $mod $mand $comp $val]
612
613    ::fileutil::magic::rt::Debug {
614	puts stderr "Nx numeric $type: $val $comp $extracted / $qual - $c"
615    }
616    set last($level) $cursor
617    return $res
618}
619
620proc ::fileutil::magic::rt::Sx {
621    type offset testinvert mod mand comp val} {
622    variable cursor
623    variable extracted
624    variable fd
625    variable last
626    variable level
627    variable weight
628
629    set res [S $type $offset $testinvert $mod $mand $comp $val]
630    set last($level) $cursor
631    return $res
632}
633proc ::fileutil::magic::rt::L {newlevel} {
634    variable level $newlevel
635    # Regenerate level information in the calling context.
636    return
637}
638
639proc ::fileutil::magic::rt::I {offset it ioi ioo iir io} {
640    # Handling of base locations specified indirectly through the
641    # contents of the inspected file.
642    variable typemap
643    foreach {size scan} $typemap($it) break
644    if {$iir} {
645	# To do:  this can't be right.
646	set io [Fetch [expr $offset + $io] $size $scan]
647    }
648    set data [Fetch $offset $size $scan]
649
650    if {$ioi && [string is double -strict $data]} {
651	set data [expr {~$data}]
652    }
653    if {$ioo ne {} && [string is double -strict $data]} {
654	set data [expr $data $ioo $io]
655    }
656    if {![string is double -strict $data]} {
657	set data -1
658    }
659    return $data
660}
661
662proc ::fileutil::magic::rt::R base {
663    # Handling of base locations specified relative to the end of the
664    # last field one level above.
665
666    variable last   ; # Remembered locations.
667    variable level  ; # The level to get data from.
668    return [expr {$last([expr {$level-1}]) + $base}]
669}
670
671
672proc ::fileutil::magic::rt::U {file name} {
673    upvar named named
674    set script [use $named $file $name]
675    tailcall ::try $script
676}
677
678# ### ### ### ######### ######### #########
679## Internal. Retrieval of the data used in comparisons.
680
681# fetch and cache a numeric value from the file
682proc ::fileutil::magic::rt::Fetch {where what scan} {
683    variable cache
684    variable cursor
685    variable extracted
686    variable strbuf
687    variable fd
688
689    # Avoid [seek] errors
690    if {$where < 0} {
691	set where 0
692    }
693    # {to do} id3 length
694    if {![info exists cache($where,$what,$scan)]} {
695	::seek $fd $where
696	set data [::read $fd $what]
697	incr cursor [string length $data]
698	set extracted [rtscan $data $scan]
699	set cache($where,$what,$scan) [list $extracted $cursor]
700
701	# Optimization: If we got 4 bytes, i.e. long we implicitly
702	# know the short and byte data as well. Should put them into
703	# the cache. -- Profile: How often does such an overlap truly
704	# happen ?
705
706    } else {
707	lassign $cache($where,$what,$scan) extracted cursor
708    }
709    return $extracted
710}
711
712proc ::fileutil::magic::rt::rtscan {data scan} {
713    if {$scan eq {me}} {
714	set data [me4 $data]
715	set scan I
716    }
717    set numeric {}
718    binary scan $data $scan numeric
719    return $numeric
720}
721
722proc ::fileutil::magic::rt::me4 data {
723	binary scan $data a4 chars
724	set data [binary format a4 [lindex $chars 1] [
725	lindex $chars 0] [lindex $chars 3] [lindex $chars 2]]
726}
727
728proc ::fileutil::magic::rt::GetString {offset len} {
729    variable cursor
730    # We have the first 1k of the file cached
731    variable strbuf
732    variable fd
733
734    set end [expr {$offset + $len - 1}]
735    if {$end < 4096} {
736	# in the string cache, copy the requested part.
737	set string [::string range $strbuf $offset $end]
738    } else {
739	# an unusual one, move to the offset and read directly from
740	# the file.
741	::seek $fd $offset
742	set string [::read $fd $len]
743    }
744    set cursor [expr {$offset + [string length $string]}]
745    return $string
746}
747
748# ### ### ### ######### ######### #########
749## Internal, debugging.
750
751if {!$::fileutil::magic::rt::debug} {
752    # This procedure definition is optimized out of using code by the
753    # core bcc. It knows that neither argument checks are required,
754    # nor is anything done. So neither results, nor errors are
755    # possible, a true no-operation.
756    proc ::fileutil::magic::rt::Debug {args} {}
757
758} else {
759    proc ::fileutil::magic::rt::Debug {script} {
760	# Run the commands in the debug script. This usually generates
761	# some output. The uplevel is required to ensure the proper
762	# resolution of all variables found in the script.
763	uplevel 1 $script
764	return
765    }
766}
767
768# ### ### ### ######### ######### #########
769## Initialize constants
770
771namespace eval ::fileutil::magic::rt {
772    # maps magic typenames to field characteristics: size (#byte),
773    # binary scan format
774
775    variable typemap
776}
777
778proc ::fileutil::magic::rt::Init {} {
779    variable typemap
780    global tcl_platform
781
782    # Set the definitions for all types which have their endianess
783    # explicitly specified n their name.
784
785    array set typemap {
786	byte    {1 c}
787	beshort {2 S}
788	leshort {2 s}
789	bedouble {8 Q}
790	belong  {4 I}
791	lelong  {4 i}
792	bedate  {4 S}  ledate   {4 s}
793	beldate {4 I}  leldate  {4 i}
794	bedouble {8 Q}
795	beqdate {8 W}
796	beqldate {8 W}
797	bequad {8 W}
798	ledouble {8 q}
799	leqdate {8 w}
800	leqldate {8 w}
801	lequad {8 w}
802	lequad {8 w}
803	leqwdate {8 w}
804	medate  {4 me}
805	melong  {4 me}
806	meldate  {4 me}
807	lestring16 {2 s}
808	bestring16 {2 S}
809
810	long  {4 Q} date  {4 Q} ldate {4 Q}
811	short {2 Y} quad {8 W}
812    }
813
814    # Now set the definitions for the types without explicit
815    # endianess. They assume/use 'native' byteorder. We also put in
816    # special forms for the compiler, so that it can use short names
817    # for the native-endian types as well.
818
819    # generate short form names
820    foreach {n v} [array get typemap] {
821	foreach {len scan} $v break
822	#puts stderr "Adding $scan - [list $len $scan]"
823	set typemap($scan) [list $len $scan]
824    }
825
826    # The special Q and Y short forms are incorrect, correct now to
827    # use the proper native endianess.
828
829    # {to do} {Is ldate done correctly in the procedure?  What is its byte
830    # order anyway?  Native?}
831
832    if {$tcl_platform(byteOrder) eq "littleEndian"} {
833	array set typemap {Q {4 i} Y {2 s}
834	    short {2 s} long {4 i} quad {8 w}
835	}
836    } else {
837	array set typemap {Q {4 I} Y {2 S}
838	    short {2 S} long {4 I} quad {8 W}
839	}
840    }
841}
842
843::fileutil::magic::rt::Init
844# ### ### ### ######### ######### #########
845## Ready for use.
846
847package provide fileutil::magic::rt 2.0
848# EOF
849