1# rtcore.tcl -- 2# 3# Runtime core for file type recognition engines written in pure Tcl. 4# 5# Copyright (c) 2016-2017 Poor Yorick <tk.tcl.core.tcllib@pooryorick.com> 6# Copyright (c) 2004-2005 Colin McCormack <coldstore@users.sourceforge.net> 7# Copyright (c) 2005 Andreas Kupries <andreas_kupries@users.sourceforge.net> 8# 9# See the file "license.terms" for information on usage and redistribution 10# of this file, and for a DISCLAIMER OF ALL WARRANTIES. 11# 12# RCS: @(#) $Id: rtcore.tcl,v 1.5 2005/09/28 04:51:19 andreas_kupries Exp $ 13 14##### 15# 16# "mime type recognition in pure tcl" 17# http://wiki.tcl.tk/12526 18# 19# Tcl code harvested on: 10 Feb 2005, 04:06 GMT 20# Wiki page last updated: ??? 21# 22##### 23 24#TODO { 25# {Required Functionality} { 26# {implement full offset language} { 27# done 28# 29# by pooryorick 30# 31# time {2016 06} 32# } 33# 34# {implement pstring (pascal string, blerk)} { 35# done 36# 37# by pooryorick 38# 39# time {2016 06} 40#} 41# 42# {implement regex form (blerk!)} { 43# done 44# 45# by pooryorick 46# 47# time {2016 06} 48# } 49 50# {implement string qualifiers} { 51# done 52# 53# by pooryorick 54# 55# time {2016 06} 56# } 57# 58# {finish implementing the indirect type} 59# 60# {Maybe distinguish between binary and text tests, like file(n)} 61# 62# {process and use strength directives} 63# 64# } 65#} 66 67# ### ### ### ######### ######### ######### 68## Requirements 69 70package require Tcl 8.5 71 72# ### ### ### ######### ######### ######### 73## Implementation 74 75namespace eval ::fileutil::magic::rt { 76 # Configuration flag. (De)activate debugging output. 77 # This is done during initialization. 78 # Changes at runtime have no effect. 79 80 variable debug 0 81 82 # The maximum size of a substring to inspect from the file in question 83 variable maxstring 64 84 85 # The maximum length of any %s substitution in a resulting description is 86 variable maxpstring 64 87 88 variable regexdefaultlen 4096 89 90 # Runtime state. 91 92 variable cursor 0 ; # The current offset 93 variable fd {} ; # Channel to file under scrutiny 94 variable found 0 ; # Whether the last test produced a match 95 variable lfound {} ; # For each level, whether a match was found 96 variable level 0 97 variable strbuf {} ; # Input cache [*]. 98 variable cache ; # Cache of fetched and decoded numeric 99 array set cache {} ; # values. 100 variable result {} ; # Accumulated recognition result. 101 variable extracted ; # The value extracted for inspection 102 variable last ; # Behind last fetch locations, 103 array set last {} ; # per nesting level. 104 variable weight 0 ; # The weight of the current part. 105 ; # Basically string length of the contributing of 106 ; # the potentially-matching part. 107 108 variable weighttotal 0 ; # The aggregate weight of the matching components of 109 ; # the current test. 110 111 # [*] The vast majority of magic strings are in the first 4k of the file. 112 113 # Export APIs (full public, recognizer public) 114 namespace export open close file_start result 115 namespace export emit ext mime offset Nv N S Nvx Nx Sx L R I resultv U < > 116} 117 118# ### ### ### ######### ######### ######### 119## Public API, general use. 120 121proc ::fileutil::magic::rt::> {} { 122 variable level 123 incr level 124} 125 126proc ::fileutil::magic::rt::< {} { 127 variable level 128 incr level -1 129} 130 131proc ::fileutil::magic::rt::classify {data} { 132 set bin_rx {[\x00-\x08\x0b\x0e-\x1f]} 133 if {[regexp $bin_rx $data] } { 134 return binary 135 } else { 136 return text 137 } 138} 139 140proc ::fileutil::magic::rt::mime value { 141 upvar 1 mime mime 142 set mime $value 143} 144 145proc ::fileutil::magic::rt::ext value { 146 upvar 1 ext ext 147 set ext $value 148} 149 150 151# open the file to be scanned 152proc ::fileutil::magic::rt::open {file} { 153 variable result {} 154 variable extracted {} 155 variable strbuf 156 variable fd 157 variable cache 158 159 set fd [::open $file] 160 ::fconfigure $fd -translation binary 161 162 # fill the string cache 163 set strbuf [::read $fd 4096] 164 set class [classify $strbuf] 165 166 # clear the fetch cache 167 catch {unset cache} 168 array set cache {} 169 170 return $fd 171} 172 173 174proc ::fileutil::magic::rt::close {} { 175 variable fd 176 ::close $fd 177 return 178} 179 180# mark the start of a magic file in debugging 181proc ::fileutil::magic::rt::file_start {name} { 182 ::fileutil::magic::rt::Debug {puts stderr "File: $name"} 183} 184 185 186# return the emitted result 187proc ::fileutil::magic::rt::result {{msg {}}} { 188 variable lfound {} 189 variable found 190 variable result 191 variable weight 192 variable weighttotal 193 if {$msg ne {}} {emit $msg} 194 set res [list $found $weighttotal $result] 195 set found 0 196 set weight 0 197 set weighttotal 0 198 set result {} 199 return -code return $res 200} 201 202proc ::fileutil::magic::rt::resultv {{msg {}}} { 203 try result on return result { 204 return $result 205 } 206} 207 208# ### ### ### ######### ######### ######### 209## Public API, for use by a recognizer. 210 211# emit a description 212proc ::fileutil::magic::rt::emit msg { 213 variable found 214 variable lfound 215 variable level 216 variable maxpstring 217 variable extracted 218 variable result 219 variable weight 220 variable weighttotal 221 set found 1 222 dict set lfound $level 1 223 incr weighttotal $weight 224 225 #set map [list \ 226 # \\b "" \ 227 # %c [apply {extracted { 228 # if {[catch {format %c $extracted} result]} { 229 # return {} 230 # } 231 # return $result 232 233 # }} $extracted] \ 234 # %s [string trim [string range $extracted 0 $maxpstring]] \ 235 # %ld $extracted \ 236 # %d $extracted \ 237 #] 238 #[::string map $map $msg] 239 240 # {to do} {Is only taking up to the first newline really a good general rule?} 241 regexp {\A[^\n\r]*} $extracted extracted2 242 243 regsub -all {\s+} $extracted2 { } extracted2 244 245 set arguments {} 246 set count [expr {[string length $msg] - [string length [ 247 string map {% {}} $msg]]}] 248 for {set i 0} {$i < $count} {incr i} { 249 lappend arguments $extracted2 250 } 251 catch {set msg [format $msg {*}$arguments]} 252 253 # Assumption: [regexp] leaves $msg untouched if it fails 254 regexp {\A(\b|\\b)?(.*)$} $msg match b msg 255 if {$b ne {} && [llength $result]} { 256 lset result end [lindex $result end]$msg 257 } else { 258 lappend result $msg 259 } 260 return 261} 262 263proc ::fileutil::magic::rt::Nv {type offset compinvert mod mand} { 264 variable typemap 265 variable extracted 266 variable weight 267 268 # unpack the type characteristics 269 foreach {size scan} $typemap($type) break 270 271 # fetch the numeric field from the file 272 set extracted [Fetch $offset $size $scan] 273 274 if {$compinvert && $extracted ne {}} { 275 set extracted [expr ~$extracted] 276 } 277 if {$mod ne {} && $extracted ne {}} { 278 # there's a mask to be applied 279 set extracted [expr $extracted $mod $mand] 280 } 281 282 ::fileutil::magic::rt::Debug {puts stderr "NV $type $offset $mod: $extracted"} 283 set weight [string length $extracted] 284 return $extracted 285} 286 287proc ::fileutil::magic::rt::use {named file name} { 288 if [dict exists $named $file $name] { 289 set script [dict get $named $file $name] 290 } else { 291 dict for {file val} $named { 292 if {[dict exists $val $name]} { 293 set script [dict get $val $name] 294 break 295 } 296 } 297 } 298 if {![info exists script]} { 299 return -code error [list {name not found} $file $name] 300 } 301 return $script 302} 303 304# Numeric - get bytes of $type at $offset and $compare to $val 305# qual might be a mask 306proc ::fileutil::magic::rt::N { 307 type offset testinvert compinvert mod mand comp val} { 308 variable typemap 309 variable extracted 310 variable weight 311 312 # unpack the type characteristics 313 foreach {size scan} $typemap($type) break 314 315 # fetch the numeric field 316 set extracted [Fetch $offset $size $scan] 317 if {$extracted eq {}} { 318 319 # Rules like the following, from the jpeg file, imply that 320 # in the absence of an extracted value, a numerical value of 321 # 0 should be used 322 323 # From jpeg: 324 ## Next, show thumbnail info, if it exists: 325 #>>18 byte !0 \b, thumbnail %dx 326 set extracted 0 327 } 328 329 # Would moving this before the fetch an optimisation ? The 330 # tradeoff is that we give up filling the cache, and it is unclear 331 # how often that value would be used. -- Profile! 332 if {$comp eq {x}} { 333 set weight 0 334 # anything matches - don't care 335 if {$testinvert} { 336 return 0 337 } else { 338 return 1 339 } 340 } 341 342 if {[string match $scan *me]} { 343 set data [me4 $data] 344 set scan I 345 } 346 # get value in binary form, then back to numeric 347 # this avoids problems with sign, as both values are 348 # [binary scan]-converted identically (see [treegen1]) 349 binary scan [binary format $scan $val] $scan val 350 351 if {$compinvert && $extracted ne {}} { 352 set extracted [expr ~$extracted] 353 } 354 355 # perform comparison 356 if {$mod ne {}} { 357 # there's a mask to be applied 358 set extracted [expr $extracted $mod $mand] 359 } 360 switch $comp { 361 & { 362 set c [expr {($extracted & $val) == $val}] 363 } 364 ^ { 365 set c [expr {($extracted & ~$val) == $extracted}] 366 } 367 == - != - < - > { 368 set c [expr $extracted $comp $val] 369 } 370 default { 371 #Should never reach this 372 return -code error [list {unknown comparison operator} $comp] 373 } 374 } 375 # Do this last to minimize shimmering 376 set weight [string length $extracted] 377 378 ::fileutil::magic::rt::Debug { 379 puts stderr "numeric $type: $val $t$comp $extracted / $mod - $c" 380 } 381 if {$testinvert} { 382 set c [expr {!$c}] 383 return $c 384 } else { 385 return $c 386 } 387} 388 389proc ::fileutil::magic::rt::S {type offset testinvert mod mand comp val} { 390 variable cursor 391 variable extracted 392 variable fd 393 variable level 394 variable lfound 395 variable maxstring 396 variable regexdefaultlen 397 variable weight 398 399 # $compinvert is currently ignored for strings 400 401 set weight [string length $val] 402 403 switch $type { 404 pstring { 405 set ptype B 406 set vincluded 0 407 # The last pstring type specifier wins 408 foreach item $mod { 409 if {$item eq {J}} { 410 set vincluded 1 411 } else { 412 set ptype $item 413 } 414 } 415 lassign [dict get {B {b 1} H {S 2} h {s 2} L {I 4} l {i 4}} $ptype] scan slength 416 set length [GetString $offset $slength] 417 set offset $cursor 418 binary scan $length ${scan}u length 419 if {$vincluded} { 420 set length [expr {$length - $slength}] 421 } 422 set extracted [GetString $offset $length] 423 set c [Smatch $val $comp $extracted $mod] 424 } 425 regex { 426 if {$mand eq {}} { 427 set mand $regexdefaultlen 428 } 429 set extracted [GetString $offset $mand] 430 if {[regexp $val $extracted match]} { 431 set weight [string length $match] 432 set c 1 433 } else { 434 set c 0 435 } 436 } 437 search { 438 set limit $mand 439 set extracted [GetString $offset $limit] 440 if {[string first $val $extracted] >= 0} { 441 set weight [string length $val] 442 set c 1 443 } else { 444 set c 0 445 } 446 } default { 447 # explicit "default" type, which is intended only to be used with 448 # the "x" pattern 449 set c [expr {[dict exists $lfound $level] ? ![dict get $lfound $level] : 1}] 450 } default { 451 # get the string and compare it 452 switch $type bestring16 - lestring16 { 453 set extracted [GetString $offset $maxstring] 454 set extracted [string range $extracted 0 1] 455 switch $type bestring16 { 456 binary scan $extracted Su extracted 457 } lestring16 { 458 binary scan $extracted Su extracted 459 } 460 set extracted [format %c $extracted] 461 } default { 462 # If $val is 0, give [emit] something to work with . 463 if {$val eq "\0"} { 464 set extracted [GetString $offset $maxstring] 465 } else { 466 set extracted [GetString $offset [string length $val]] 467 } 468 } 469 set c [Smatch $val $comp $extracted $mod] 470 } 471 } 472 473 474 ::fileutil::magic::rt::Debug { 475 puts "String '$val' $comp '$extracted' - $c" 476 if {$c} { 477 puts "offset $offset - $extracted" 478 } 479 } 480 if {$testinvert} { 481 return [expr {!$c}] 482 } else { 483 return $c 484 } 485} 486 487proc ::fileutil::magic::rt::Smatch {val op string mod} { 488 variable weight 489 if {$op eq {x}} { 490 set weight 0 491 return 1 492 } 493 494 if {![string length $string] && $op in {eq == < <=}} { 495 if {$op in {eq == < <=}} { 496 # Nothing matches an empty $string. 497 return 0 498 } 499 return 1 500 } 501 502 if {$op eq {>} && [string length $val] > [string length $string]} { 503 return 1 504 } 505 506 # To preserve the semantics, the w operation must occur prior to the W 507 # operation (Assuming the interpretation that w makes all whitespace 508 # optional, relaxing the requirements of W) . 509 if {{w} in $mod} { 510 regsub -all {\s} $string[set string {}] {} string 511 regsub -all {\s} $val[set val {}] {} val 512 } 513 514 if {{W} in $mod} { 515 set blanklen [::tcl::mathfunc::max 0 {*}[ 516 lmap {_unused_ blanks} [regexp -all -indices -inline {(\s+)} $val] { 517 expr {[lindex $blanks 1] - [lindex $blanks 0]} 518 }]] 519 if {![regexp "\s{$blanklen}" $string]} { 520 ::fileutil::magic::rt::Debug { 521 puts "String '$val' $op '$string' - $c" 522 if {$c} { 523 puts "offset $offset - $string" 524 } 525 } 526 return 0 527 } 528 529 regsub -all {\s+} $string[set string {}] { } string 530 regsub -all {\s+} $val[set val {}] { } val 531 } 532 533 534 if {{T} in $mod} { 535 set string [string trim $string[set string {}]] 536 set val [string tolower $val[set val {}]] 537 } 538 539 set string [string range $string 0 [string length $val]-1] 540 541 # The remaining code may assume that $string and $val have the same length 542 # . 543 544 set opnum [dict get {< -1 == 0 eq 0 != 0 ne 0 > 1} $op] 545 546 if {{c} in $mod || {C} in $mod} { 547 set res 1 548 if {{c} in $mod && {C} in $mod} { 549 set string [string tolower $string[set string {}]] 550 set val [string tolower $val[set val {}]] 551 } elseif {{c} in $mod} { 552 foreach sc [split $string] vc [split $val] { 553 if {[string is lower $sc]} { 554 set vc [string tolower $vc] 555 } 556 if {[::string compare $val $string] != $opnum} { 557 set res 0 558 break 559 } 560 } 561 } elseif {{C} in $mode} { 562 foreach vc [split $val] sc [split $string] { 563 if {[string is upper $vc]} { 564 set sc [string toupper $sc] 565 } 566 if {[::string compare $val $string] != $opnum} { 567 set res 0 568 break 569 } 570 } 571 } 572 } else { 573 set res [expr {[::string compare $string $val] == $opnum}] 574 } 575 if {$op in {!= ne}} { 576 set res [expr {!$res}] 577 } 578 set weight [string length $val] 579 return $res 580} 581 582proc ::fileutil::magic::rt::Nvx {type offset compinvert mod mand} { 583 variable typemap 584 variable extracted 585 variable last 586 variable weight 587 variable level 588 589 # unpack the type characteristics 590 foreach {size scan} $typemap($type) break 591 set last($level) [expr {$offset + $size}] 592 593 set extracted [Nv $type $offset $compinvert $mod $mand] 594 595 ::fileutil::magic::rt::Debug {puts stderr "NVx $type $offset $extracted $mod $mand"} 596 return $extracted 597} 598 599# Numeric - get bytes of $type at $offset and $compare to $val 600# qual might be a mask 601proc ::fileutil::magic::rt::Nx { 602 type offset testinvert compinvert mod mand comp val} { 603 604 variable cursor 605 variable typemap 606 variable extracted 607 variable last 608 variable level 609 variable weight 610 611 set res [N $type $offset $testinvert $compinvert $mod $mand $comp $val] 612 613 ::fileutil::magic::rt::Debug { 614 puts stderr "Nx numeric $type: $val $comp $extracted / $qual - $c" 615 } 616 set last($level) $cursor 617 return $res 618} 619 620proc ::fileutil::magic::rt::Sx { 621 type offset testinvert mod mand comp val} { 622 variable cursor 623 variable extracted 624 variable fd 625 variable last 626 variable level 627 variable weight 628 629 set res [S $type $offset $testinvert $mod $mand $comp $val] 630 set last($level) $cursor 631 return $res 632} 633proc ::fileutil::magic::rt::L {newlevel} { 634 variable level $newlevel 635 # Regenerate level information in the calling context. 636 return 637} 638 639proc ::fileutil::magic::rt::I {offset it ioi ioo iir io} { 640 # Handling of base locations specified indirectly through the 641 # contents of the inspected file. 642 variable typemap 643 foreach {size scan} $typemap($it) break 644 if {$iir} { 645 # To do: this can't be right. 646 set io [Fetch [expr $offset + $io] $size $scan] 647 } 648 set data [Fetch $offset $size $scan] 649 650 if {$ioi && [string is double -strict $data]} { 651 set data [expr {~$data}] 652 } 653 if {$ioo ne {} && [string is double -strict $data]} { 654 set data [expr $data $ioo $io] 655 } 656 if {![string is double -strict $data]} { 657 set data -1 658 } 659 return $data 660} 661 662proc ::fileutil::magic::rt::R base { 663 # Handling of base locations specified relative to the end of the 664 # last field one level above. 665 666 variable last ; # Remembered locations. 667 variable level ; # The level to get data from. 668 return [expr {$last([expr {$level-1}]) + $base}] 669} 670 671 672proc ::fileutil::magic::rt::U {file name} { 673 upvar named named 674 set script [use $named $file $name] 675 tailcall ::try $script 676} 677 678# ### ### ### ######### ######### ######### 679## Internal. Retrieval of the data used in comparisons. 680 681# fetch and cache a numeric value from the file 682proc ::fileutil::magic::rt::Fetch {where what scan} { 683 variable cache 684 variable cursor 685 variable extracted 686 variable strbuf 687 variable fd 688 689 # Avoid [seek] errors 690 if {$where < 0} { 691 set where 0 692 } 693 # {to do} id3 length 694 if {![info exists cache($where,$what,$scan)]} { 695 ::seek $fd $where 696 set data [::read $fd $what] 697 incr cursor [string length $data] 698 set extracted [rtscan $data $scan] 699 set cache($where,$what,$scan) [list $extracted $cursor] 700 701 # Optimization: If we got 4 bytes, i.e. long we implicitly 702 # know the short and byte data as well. Should put them into 703 # the cache. -- Profile: How often does such an overlap truly 704 # happen ? 705 706 } else { 707 lassign $cache($where,$what,$scan) extracted cursor 708 } 709 return $extracted 710} 711 712proc ::fileutil::magic::rt::rtscan {data scan} { 713 if {$scan eq {me}} { 714 set data [me4 $data] 715 set scan I 716 } 717 set numeric {} 718 binary scan $data $scan numeric 719 return $numeric 720} 721 722proc ::fileutil::magic::rt::me4 data { 723 binary scan $data a4 chars 724 set data [binary format a4 [lindex $chars 1] [ 725 lindex $chars 0] [lindex $chars 3] [lindex $chars 2]] 726} 727 728proc ::fileutil::magic::rt::GetString {offset len} { 729 variable cursor 730 # We have the first 1k of the file cached 731 variable strbuf 732 variable fd 733 734 set end [expr {$offset + $len - 1}] 735 if {$end < 4096} { 736 # in the string cache, copy the requested part. 737 set string [::string range $strbuf $offset $end] 738 } else { 739 # an unusual one, move to the offset and read directly from 740 # the file. 741 ::seek $fd $offset 742 set string [::read $fd $len] 743 } 744 set cursor [expr {$offset + [string length $string]}] 745 return $string 746} 747 748# ### ### ### ######### ######### ######### 749## Internal, debugging. 750 751if {!$::fileutil::magic::rt::debug} { 752 # This procedure definition is optimized out of using code by the 753 # core bcc. It knows that neither argument checks are required, 754 # nor is anything done. So neither results, nor errors are 755 # possible, a true no-operation. 756 proc ::fileutil::magic::rt::Debug {args} {} 757 758} else { 759 proc ::fileutil::magic::rt::Debug {script} { 760 # Run the commands in the debug script. This usually generates 761 # some output. The uplevel is required to ensure the proper 762 # resolution of all variables found in the script. 763 uplevel 1 $script 764 return 765 } 766} 767 768# ### ### ### ######### ######### ######### 769## Initialize constants 770 771namespace eval ::fileutil::magic::rt { 772 # maps magic typenames to field characteristics: size (#byte), 773 # binary scan format 774 775 variable typemap 776} 777 778proc ::fileutil::magic::rt::Init {} { 779 variable typemap 780 global tcl_platform 781 782 # Set the definitions for all types which have their endianess 783 # explicitly specified n their name. 784 785 array set typemap { 786 byte {1 c} 787 beshort {2 S} 788 leshort {2 s} 789 bedouble {8 Q} 790 belong {4 I} 791 lelong {4 i} 792 bedate {4 S} ledate {4 s} 793 beldate {4 I} leldate {4 i} 794 bedouble {8 Q} 795 beqdate {8 W} 796 beqldate {8 W} 797 bequad {8 W} 798 ledouble {8 q} 799 leqdate {8 w} 800 leqldate {8 w} 801 lequad {8 w} 802 lequad {8 w} 803 leqwdate {8 w} 804 medate {4 me} 805 melong {4 me} 806 meldate {4 me} 807 lestring16 {2 s} 808 bestring16 {2 S} 809 810 long {4 Q} date {4 Q} ldate {4 Q} 811 short {2 Y} quad {8 W} 812 } 813 814 # Now set the definitions for the types without explicit 815 # endianess. They assume/use 'native' byteorder. We also put in 816 # special forms for the compiler, so that it can use short names 817 # for the native-endian types as well. 818 819 # generate short form names 820 foreach {n v} [array get typemap] { 821 foreach {len scan} $v break 822 #puts stderr "Adding $scan - [list $len $scan]" 823 set typemap($scan) [list $len $scan] 824 } 825 826 # The special Q and Y short forms are incorrect, correct now to 827 # use the proper native endianess. 828 829 # {to do} {Is ldate done correctly in the procedure? What is its byte 830 # order anyway? Native?} 831 832 if {$tcl_platform(byteOrder) eq "littleEndian"} { 833 array set typemap {Q {4 i} Y {2 s} 834 short {2 s} long {4 i} quad {8 w} 835 } 836 } else { 837 array set typemap {Q {4 I} Y {2 S} 838 short {2 S} long {4 I} quad {8 W} 839 } 840 } 841} 842 843::fileutil::magic::rt::Init 844# ### ### ### ######### ######### ######### 845## Ready for use. 846 847package provide fileutil::magic::rt 2.0 848# EOF 849