1<?php 2/************************************************************************************************************** 3 4 NAME 5 PdfToText.phpclass 6 7 DESCRIPTION 8 A class for extracting text from Pdf files. 9 Usage is very simple : just instantiate a PdfToText object, specifying an input filename, then use the 10 Text property to retrieve PDF textual contents : 11 12 $pdf = new PdfToText ( 'sample.pdf' ) ; 13 echo $pdf -> Text ; // or : echo ( string ) $pdf ; 14 15 Or : 16 17 $pdf = new PdfToText ( ) ; 18 // Modify any property here before loading the file ; for example : 19 // $pdf -> BlockSeparator = " " ; 20 $pdf -> Load ( 'sample.pdf' ) ; 21 echo $pdf -> Text ; 22 23 AUTHOR 24 Christian Vigh, 04/2016. 25 26 HISTORY 27 [Version : 1.6.7] [Date : 2017/05/31] [Author : CV] 28 . Added CID fonts 29 . Changed the way CID font maps are searched and handled 30 31 (...) 32 33 [Version : 1.0] [Date : 2016/04/16] [Author : CV] 34 Initial version. 35 36 **************************************************************************************************************/ 37 38 39/*============================================================================================================== 40 41 class PdfToTextException et al - 42 Implements an exception thrown when an error is encountered while decoding PDF files. 43 44 ==============================================================================================================*/ 45 46// PdfToText exception - 47// Base class for all other PdfToText exceptions. 48class PdfToTextException extends Exception 49 { 50 public static $IsObject = false ; 51 } ; 52 53 54// PdfToTextDecodingException - 55// Thrown when unexpected data is encountered while analyzing PDF contents. 56class PdfToTextDecodingException extends PdfToTextException 57 { 58 public function __construct ( $message, $object_id = false ) 59 { 60 $text = "Pdf decoding error" ; 61 62 if ( $object_id !== false ) 63 $text .= " (object #$object_id)" ; 64 65 $text .= " : $message" ; 66 67 parent::__construct ( $text ) ; 68 } 69 } 70 71 72// PdfToTextDecryptionException - 73// Thrown when something unexpected is encountered while processing encrypted data. 74class PdfToTextDecryptionException extends PdfToTextException 75 { 76 public function __construct ( $message, $object_id = false ) 77 { 78 $text = "Pdf decryption error" ; 79 80 if ( $object_id !== false ) 81 $text .= " (object #$object_id)" ; 82 83 $text .= " : $message" ; 84 85 parent::__construct ( $text ) ; 86 } 87 } 88 89 90// PdfToTextTimeoutException - 91// Thrown when the PDFOPT_ENFORCE_EXECUTION_TIME or PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME option is set, and 92// the script took longer than the allowed execution time limit. 93class PdfToTextTimeoutException extends PdfToTextException 94 { 95 // Set to true if the reason why the max execution time was reached because of too many invocations of the Load() method 96 // Set to false if the max execution time was reached by simply processing one PDF file 97 public $GlobalTimeout ; 98 99 public function __construct ( $message, $global, $php_setting, $class_setting ) 100 { 101 $text = "PdfToText max execution time reached " ; 102 103 if ( ! $global ) 104 $text .= "for one single file " ; 105 106 $text .= "(php limit = {$php_setting}s, class limit = {$class_setting}s) : $message" ; 107 108 $this -> GlobalTimeout = $global ; 109 110 parent::__construct ( $text ) ; 111 } 112 } 113 114 115// PdfToTextFormException - 116// Thrown if the xml template passed to the GetFormData() method contains an error. 117class PdfToTextFormException extends PdfToTextException 118 { 119 public function __construct ( $message ) 120 { 121 $text = "Pdf form template error" ; 122 123 $text .= " : $message" ; 124 125 parent::__construct ( $text ) ; 126 } 127 } 128 129 130// PdfToTextCaptureException - 131// Thrown if the xml template passed to the SetCaptures() method contains an error. 132class PdfToTextCaptureException extends PdfToTextException 133 { 134 public function __construct ( $message ) 135 { 136 $text = "Pdf capture template error" ; 137 138 $text .= " : $message" ; 139 140 parent::__construct ( $text ) ; 141 } 142 } 143 144 145 146/*============================================================================================================== 147 148 Custom error reporting functions. 149 150 ==============================================================================================================*/ 151if ( ! function_exists ( 'warning' ) ) 152 { 153 function warning ( $message ) 154 { 155 trigger_error ( $message, E_USER_WARNING ) ; 156 } 157 } 158 159 160if ( ! function_exists ( 'error' ) ) 161 { 162 function error ( $message ) 163 { 164 if ( is_string ( $message ) ) 165 trigger_error ( $message, E_USER_ERROR ) ; 166 else if ( is_a ( $message, '\Exception' ) ) 167 throw $message ; 168 } 169 } 170 171 172/*============================================================================================================== 173 174 Backward-compatibility issues. 175 176 ==============================================================================================================*/ 177 178// hex2bin - 179// This function appeared only in version 5.4.0 180if ( ! function_exists ( 'hex2bin' ) ) 181 { 182 function hex2bin ( $hexstring ) 183 { 184 $length = strlen ( $hexstring ) ; 185 $binstring = '' ; 186 $index = 0 ; 187 188 while ( $index < $length ) 189 { 190 $byte = substr ( $hexstring, $index, 2 ) ; 191 $ch = pack ( 'H*', $byte ) ; 192 $binstring .= $ch ; 193 194 $index += 2 ; 195 } 196 197 return ( $binstring ) ; 198 } 199 200 } 201 202 203/*============================================================================================================== 204 205 class PfObjectBase - 206 Base class for all PDF objects defined here. 207 208 ==============================================================================================================*/ 209abstract class PdfObjectBase // extends Object 210 { 211 // Possible encoding types for streams inside objects ; "unknown" means that the object contains no stream 212 const PDF_UNKNOWN_ENCODING = 0 ; // No stream decoding type could be identified 213 const PDF_ASCIIHEX_ENCODING = 1 ; // AsciiHex encoding - not tested 214 const PDF_ASCII85_ENCODING = 2 ; // Ascii85 encoding - not tested 215 const PDF_FLATE_ENCODING = 3 ; // Flate/deflate encoding 216 const PDF_TEXT_ENCODING = 4 ; // Stream data appears in clear text - no decoding required 217 const PDF_LZW_ENCODING = 5 ; // Not implemented yet 218 const PDF_RLE_ENCODING = 6 ; // Runtime length encoding ; not implemented yet 219 const PDF_DCT_ENCODING = 7 ; // JPEG images 220 const PDF_CCITT_FAX_ENCODING = 8 ; // CCITT Fax encoding - not implemented yet 221 const PDF_JBIG2_ENCODING = 9 ; // JBIG2 filter encoding (black/white) - not implemented yet 222 const PDF_JPX_ENCODING = 10 ; // JPEG2000 encoding - not implemented yet 223 224 // Regular expression used for recognizing references to a font (this list is far from being exhaustive, as it seems 225 // that you can specify almost everything - however, trying to recognize everything would require to develop a complete 226 // parser) 227 protected static $FontSpecifiers = ' 228 (/F \d+ (\.\d+)? ) | 229 (/R \d+) | 230 (/f-\d+-\d+) | 231 (/[CT]\d+_\d+) | 232 (/TT \d+) | 233 (/OPBaseFont \d+) | 234 (/OPSUFont \d+) | 235 (/[0-9a-zA-Z]) | 236 (/F\w+) | 237 (/[A-Za-z][A-Za-z0-9]* ( [\-+] [A-Za-z][A-Za-z0-9]* )) 238 ' ; 239 240 // Maps alien Unicode characters such as special spaces, letters with ligatures to their ascii string equivalent 241 protected static $UnicodeToSimpleAscii = false ; 242 243 244 /*-------------------------------------------------------------------------------------------------------------- 245 246 Constructor - 247 Performs static initializations such as the Unicode to Ascii table. 248 249 *-------------------------------------------------------------------------------------------------------------*/ 250 public function __construct ( ) 251 { 252 if ( self::$UnicodeToSimpleAscii === false ) 253 { 254 $charset_file = dirname ( __FILE__ ) . "/Maps/unicode-to-ansi.map" ; 255 include ( $charset_file ) ; 256 self::$UnicodeToSimpleAscii = ( isset ( $unicode_to_ansi ) ) ? $unicode_to_ansi : array ( ) ; 257 } 258 259 // parent::__construct ( ) ; 260 } 261 262 263 /*-------------------------------------------------------------------------------------------------------------- 264 265 NAME 266 CodePointToUtf8 - Encodes a Unicode codepoint to UTF8. 267 268 PROTOTYPE 269 $char = $this -> CodePointToUtf8 ( $code ) ; 270 271 DESCRIPTION 272 Encodes a Unicode codepoint to UTF8, trying to handle all possible cases. 273 274 PARAMETERS 275 $code (integer) - 276 Unicode code point to be translated. 277 278 RETURN VALUE 279 A string that contains the UTF8 bytes representing the Unicode code point. 280 281 *-------------------------------------------------------------------------------------------------------------*/ 282 protected function CodePointToUtf8 ( $code ) 283 { 284 if ( $code ) 285 { 286 $result = '' ; 287 288 while ( $code ) 289 { 290 $word = ( $code & 0xFFFF ) ; 291 292 if ( ! isset ( self::$UnicodeToSimpleAscii [ $word ] ) ) 293 { 294 $entity = "&#$word;" ; 295 $result .= mb_convert_encoding ( $entity, 'UTF-8', 'HTML-ENTITIES' ) . $result ; 296 } 297 else 298 $result .= self::$UnicodeToSimpleAscii [ $word ] ; 299 300 $code = ( integer ) ( $code / 0xFFFF ) ; // There is no unsigned right-shift operator in PHP... 301 } 302 303 return ( $result ) ; 304 } 305 // No translation is apparently possible : use a placeholder to signal this situation 306 else 307 { 308 if ( strpos ( PdfToText::$Utf8Placeholder, '%' ) === false ) 309 { 310 return ( PdfToText::$Utf8Placeholder ) ; 311 } 312 else 313 return ( sprintf ( PdfToText::$Utf8Placeholder, $code ) ) ; 314 } 315 } 316 317 318 /*-------------------------------------------------------------------------------------------------------------- 319 320 DecodeRawName - 321 Decodes a string that may contain constructs such as '#xy', where 'xy' are hex digits. 322 323 *-------------------------------------------------------------------------------------------------------------*/ 324 public static function DecodeRawName ( $str ) 325 { 326 return ( rawurldecode ( str_replace ( '#', '%', $str ) ) ) ; 327 } 328 329 330 /*-------------------------------------------------------------------------------------------------------------- 331 332 NAME 333 GetEncodingType - Gets an object encoding type. 334 335 PROTOTYPE 336 $type = $this -> GetEncodingType ( $object_id, $object_data ) ; 337 338 DESCRIPTION 339 When an object is a stream, returns its encoding type. 340 341 PARAMETERS 342 $object_id (integer) - 343 PDF object number. 344 345 $object_data (string) - 346 Object contents. 347 348 RETURN VALUE 349 Returns one of the following values : 350 351 - PdfToText::PDF_ASCIIHEX_ENCODING : 352 Hexadecimal encoding of the binary values. 353 Decoding algorithm was taken from the unknown contributor and not tested so far, since I 354 couldn't find a PDF file with such an encoding type. 355 356 - PdfToText::PDF_ASCII85_ENCODING : 357 Obscure encoding format. 358 Decoding algorithm was taken from the unknown contributor and not tested so far, since I 359 couldn't find a PDF file with such an encoding type. 360 361 - PdfToText::PDF_FLATE_ENCODING : 362 gzip/deflate encoding. 363 364 - PdfToText::PDF_TEXT_ENCODING : 365 Stream data is unencoded (ie, it is pure ascii). 366 367 - PdfToText::PDF_UNKNOWN_ENCODING : 368 The object data does not specify any encoding at all. It can happen on objects that do not have 369 a "stream" part. 370 371 - PdfToText::PDF_DCT_ENCODING : 372 a lossy filter based on the JPEG standard. 373 374 The following constants are defined but not yet implemented ; an exception will be thrown if they are 375 encountered somewhere in the PDF file : 376 377 - PDF_LZW_ENCODING : 378 a filter based on LZW Compression; it can use one of two groups of predictor functions for more 379 compact LZW compression : Predictor 2 from the TIFF 6.0 specification and predictors (filters) 380 from the PNG specification 381 382 - PDF_RLE_ENCODING : 383 a simple compression method for streams with repetitive data using the run-length encoding 384 algorithm and the image-specific filters. 385 386 PDF_CCITT_FAX_ENCODING : 387 a lossless bi-level (black/white) filter based on the Group 3 or Group 4 CCITT (ITU-T) fax 388 compression standard defined in ITU-T T.4 and T.6. 389 390 PDF_JBIG2_ENCODING : 391 a lossy or lossless bi-level (black/white) filter based on the JBIG2 standard, introduced in 392 PDF 1.4. 393 394 PDF_JPX_ENCODING : 395 a lossy or lossless filter based on the JPEG 2000 standard, introduced in PDF 1.5. 396 397 *-------------------------------------------------------------------------------------------------------------*/ 398 protected function GetEncodingType ( $object_id, $object_data ) 399 { 400 $status = preg_match ( '# / (?P<encoding> (ASCIIHexDecode) | (AHx) | (ASCII85Decode) | (A85) | (FlateDecode) | (Fl) | (DCTDecode) | (DCT) | ' . 401 '(LZWDecode) | (LZW) | (RunLengthDecode) | (RL) | (CCITTFaxDecode) | (CCF) | (JBIG2Decode) | (JPXDecode) ) \b #imsx', 402 $object_data, $match ) ; 403 404 if ( ! $status ) 405 return ( self::PDF_TEXT_ENCODING ) ; 406 407 switch ( strtolower ( $match [ 'encoding' ] ) ) 408 { 409 case 'asciihexdecode' : 410 case 'ahx' : return ( self::PDF_ASCIIHEX_ENCODING ) ; 411 412 case 'ascii85decode' : 413 case 'a85' : return ( self::PDF_ASCII85_ENCODING ) ; 414 415 case 'flatedecode' : 416 case 'fl' : return ( self::PDF_FLATE_ENCODING ) ; 417 418 case 'dctdecode' : 419 case 'dct' : return ( self::PDF_DCT_ENCODING ) ; 420 421 case 'lzwdecode' : 422 case 'lzw' : return ( self::PDF_LZW_ENCODING ) ; 423 424 case 'ccittfaxdecode' : 425 case 'ccf' : 426 427 case 'runlengthdecode' : 428 case 'rl' : 429 430 case 'jbig2decode' : 431 432 case 'jpxdecode' : 433 if ( PdfToText::$DEBUG > 1 ) 434 warning ( "Encoding type \"{$match [ 'encoding' ]}\" not yet implemented for pdf object #$object_id." ) ; 435 436 default : return ( self::PDF_UNKNOWN_ENCODING ) ; 437 } 438 } 439 440 441 /*-------------------------------------------------------------------------------------------------------------- 442 443 NAME 444 GetObjectReferences - Gets object references from a specified construct. 445 446 PROTOTYPE 447 $status = $this -> GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids ) ; 448 449 DESCRIPTION 450 Certain parameter specifications are followed by an object reference of the form : 451 x 0 R 452 but it can also be an array of references : 453 [x1 0 R x2 0 R ... xn 0 r] 454 Those kind of constructs can occur after parameters such as : /Pages, /Contents, /Kids... 455 This method extracts the object references found in such a construct. 456 457 PARAMETERS 458 $object_id (integer) - 459 Id of the object to be analyzed. 460 461 $object_data (string) - 462 Object contents. 463 464 $searched_string (string) - 465 String to be searched, that must be followed by an object or an array of object references. 466 This parameter can contain constructs used in regular expressions. Note however that the '#' 467 character must be escaped, since it is used as a delimiter in the regex that is applied on 468 object data. 469 470 $object_ids (array of integers) - 471 Returns on output the ids of the pdf object that have been found after the searched string. 472 473 RETURN VALUE 474 True if the searched string has been found and is followed by an object or array of object references, 475 false otherwise. 476 477 *-------------------------------------------------------------------------------------------------------------*/ 478 protected function GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids ) 479 { 480 $status = true ; 481 $object_ids = array ( ) ; 482 483 if ( preg_match ( "#$searched_string \s* \\[ (?P<objects> [^\]]+ ) \\]#ix", $object_data, $match ) ) 484 { 485 $object_list = $match [ 'objects' ] ; 486 487 if ( preg_match_all ( '/(?P<object> \d+) \s+ \d+ \s+ R/x', $object_list, $matches ) ) 488 { 489 foreach ( $matches [ 'object' ] as $id ) 490 $object_ids [] = ( integer ) $id ; 491 } 492 else 493 $status = false ; 494 } 495 else if ( preg_match ( "#$searched_string \s+ (?P<object> \d+) \s+ \d+ \s+ R#ix", $object_data, $match ) ) 496 { 497 $object_ids [] = ( integer ) $match [ 'object' ] ; 498 } 499 else 500 $status = false ; 501 502 return ( $status ) ; 503 } 504 505 506 /*-------------------------------------------------------------------------------------------------------------- 507 508 NAME 509 GetStringParameter - Retrieve a string flag value. 510 511 PROTOTYPE 512 $result = $this -> GetStringParameter ( $parameter, $object_data ) ; 513 514 DESCRIPTION 515 Retrieves the value of a string parameter ; for example : 516 517 /U (parameter value) 518 519 or : 520 521 /U <hexdigits> 522 523 PARAMETERS 524 $parameter (string) - 525 Parameter name. 526 527 $object_data (string) - 528 Object containing the parameter. 529 530 RETURN VALUE 531 The parameter value. 532 533 NOTES 534 description 535 536 *-------------------------------------------------------------------------------------------------------------*/ 537 protected function GetStringParameter ( $parameter, $object_data ) 538 { 539 if ( preg_match ( '#' . $parameter . ' \s* \( \s* (?P<value> [^)]+) \)#ix', $object_data, $match ) ) 540 $result = $this -> ProcessEscapedString ( $match [ 'value' ] ) ; 541 else if ( preg_match ( '#' . $parameter . ' \s* \< \s* (?P<value> [^>]+) \>#ix', $object_data, $match ) ) 542 { 543 $hexdigits = $match [ 'value' ] ; 544 $result = '' ; 545 546 for ( $i = 0, $count = strlen ( $hexdigits ) ; $i < $count ; $i += 2 ) 547 $result .= chr ( hexdec ( substr ( $hexdigits, $i, 2 ) ) ) ; 548 } 549 else 550 $result = '' ; 551 552 return ( $result ) ; 553 } 554 555 556 /*-------------------------------------------------------------------------------------------------------------- 557 558 GetUTCDate - 559 Reformats an Adobe UTC date to a format that can be understood by the strtotime() function. 560 Dates are specified in the following format : 561 D:20150521154000Z 562 D:20160707182114+02 563 with are both recognized by strtotime(). However, another format can be specified : 564 D:20160707182114+02'00' 565 which is not recognized by strtotime() so we have to get rid from the '00' part. 566 567 *-------------------------------------------------------------------------------------------------------------*/ 568 protected function GetUTCDate ( $date ) 569 { 570 if ( $date ) 571 { 572 if ( ( $date [0] == 'D' || $date [0] == 'd' ) && $date [1] == ':' ) 573 $date = substr ( $date, 2 ) ; 574 575 if ( ( $index = strpos ( $date, "'" ) ) !== false ) 576 $date = substr ( $date, 0, $index ) ; 577 } 578 579 return ( $date ) ; 580 } 581 582 583 /*-------------------------------------------------------------------------------------------------------------- 584 585 IsCharacterMap - 586 Checks if the specified text contents represent a character map definition or not. 587 588 *-------------------------------------------------------------------------------------------------------------*/ 589 protected function IsCharacterMap ( $decoded_data ) 590 { 591 // preg_match is faster than calling strpos several times 592 return ( preg_match ( '#(begincmap)|(beginbfrange)|(beginbfchar)|(/Differences)#ix', $decoded_data ) ) ; 593 } 594 595 596 /*-------------------------------------------------------------------------------------------------------------- 597 598 IsFont - 599 Checks if the current object contents specify a font declaration. 600 601 *-------------------------------------------------------------------------------------------------------------*/ 602 protected function IsFont ( $object_data ) 603 { 604 return 605 ( 606 stripos ( $object_data, '/BaseFont' ) !== false || 607 ( ! preg_match ( '#/Type \s* /FontDescriptor#ix', $object_data ) && 608 preg_match ( '#/Type \s* /Font#ix', $object_data ) ) 609 ) ; 610 } 611 612 613 /*-------------------------------------------------------------------------------------------------------------- 614 615 IsFormData - 616 Checks if the current object contents specify references to font data. 617 618 *-------------------------------------------------------------------------------------------------------------*/ 619 protected function IsFormData ( $object_data ) 620 { 621 return 622 ( 623 preg_match ( '#\bR \s* \( \s* datasets \s* \)#imsx', $object_data ) 624 ) ; 625 } 626 627 628 /*-------------------------------------------------------------------------------------------------------------- 629 630 IsFontMap - 631 Checks if the code contains things like : 632 <</F1 26 0 R/F2 22 0 R/F3 18 0 R>> 633 which maps font 1 (when specified with the /Fx instruction) to object 26, 2 to object 22 and 3 to 634 object 18, respectively, in the above example. 635 636 *-------------------------------------------------------------------------------------------------------------*/ 637 protected function IsFontMap ( $object_data ) 638 { 639 $object_data = self::UnescapeHexCharacters ( $object_data ) ; 640 641 if ( preg_match ( '#<< \s* ( ' . self::$FontSpecifiers . ' ) \s+ .* >>#imsx', $object_data ) ) 642 return ( true ) ; 643 else 644 return ( false ) ; 645 } 646 647 648 /*-------------------------------------------------------------------------------------------------------------- 649 650 IsImage - 651 Checks if the code contains things like : 652 /Subtype/Image 653 654 *-------------------------------------------------------------------------------------------------------------*/ 655 protected function IsImage ( $object_data ) 656 { 657 if ( preg_match ( '#/Subtype \s* /Image#msx', $object_data ) ) 658 return ( true ) ; 659 else 660 return ( false ) ; 661 } 662 663 664 /*-------------------------------------------------------------------------------------------------------------- 665 666 IsObjectStream - 667 Checks if the code contains an object stream (/Type/ObjStm) 668 /Subtype/Image 669 670 *-------------------------------------------------------------------------------------------------------------*/ 671 protected function IsObjectStream ( $object_data ) 672 { 673 if ( preg_match ( '#/Type \s* /ObjStm#isx', $object_data ) ) 674 return ( true ) ; 675 else 676 return ( false ) ; 677 } 678 679 680 /*-------------------------------------------------------------------------------------------------------------- 681 682 NAME 683 IsPageHeaderOrFooter - Check if the specified object contents denote a text stream. 684 685 PROTOTYPE 686 $status = $this -> IsPageHeaderOrFooter ( $stream_data ) ; 687 688 DESCRIPTION 689 Checks if the specified decoded stream contents denotes header or footer data. 690 691 PARAMETERS 692 $stream_data (string) - 693 Decoded stream contents. 694 695 *-------------------------------------------------------------------------------------------------------------*/ 696 protected function IsPageHeaderOrFooter ( $stream_data ) 697 { 698 if ( preg_match ( '#/Type \s* /Pagination \s* /Subtype \s*/((Header)|(Footer))#ix', $stream_data ) ) 699 return ( true ) ; 700 else if ( preg_match ( '#/Attached \s* \[ .*? /((Top)|(Bottom)) [^]]#ix', $stream_data ) ) 701 return ( true ) ; 702 else 703 return ( false ) ; 704 } 705 706 707 /*-------------------------------------------------------------------------------------------------------------- 708 709 NAME 710 IsText - Check if the specified object contents denote a text stream. 711 712 PROTOTYPE 713 $status = $this -> IsText ( $object_data, $decoded_stream_data ) ; 714 715 DESCRIPTION 716 Checks if the specified object contents denote a text stream. 717 718 PARAMETERS 719 $object_data (string) - 720 Object data, ie the contents located between the "obj" and "endobj" keywords. 721 722 $decoded_stream_data (string) - 723 The flags specified in the object data are not sufficient to be sure that we have a block of 724 drawing instructions. We must also check for certain common instructions to be present. 725 726 RETURN VALUE 727 True if the specified contents MAY be text contents, false otherwise. 728 729 NOTES 730 I do not consider this method as bullet-proof. There may arise some cases where non-text blocks can be 731 mistakenly considered as text blocks, so it is subject to evolve in the future. 732 733 *-------------------------------------------------------------------------------------------------------------*/ 734 protected function IsText ( $object_data, $decoded_stream_data ) 735 { 736 if ( preg_match ( '# / (Filter) | (Length) #ix', $object_data ) && 737 ! preg_match ( '# / (Type) | (Subtype) | (Length1) #ix', $object_data ) ) 738 { 739 if ( preg_match ( '/\\b(BT|Tf|Td|TJ|Tj|Tm|Do|cm)\\b/', $decoded_stream_data ) ) 740 return ( true ) ; 741 } 742 else if ( preg_match ( '/\\b(BT|Tf|Td|TJ|Tj|Tm|Do|cm)\\b/', $decoded_stream_data ) ) 743 return ( true ) ; 744 745 return ( false ) ; 746 } 747 748 749 /*-------------------------------------------------------------------------------------------------------------- 750 751 NAME 752 PregStrReplace - Replace string(s) using regular expression(s) 753 754 PROTOTYPE 755 $result = PdfToText::PregStrReplace ( $pattern, $replacement, $subject, $limit = -1, 756 &$match_count = null ) 757 758 DESCRIPTION 759 This function behaves like a mix of str_replace() and preg_replace() ; it allows to search for strings 760 using regular expressions, but the replacements are plain-text strings and no reference to a capture 761 specified in the regular expression will be interpreted. 762 This is useful when processing templates, which can contain constructs such as "\00" or "$", which are 763 interpreted by preg_replace() as references to captures. 764 765 The function has the same parameters as preg_replace(). 766 767 RETURN VALUE 768 Returns the substituted text. 769 770 *-------------------------------------------------------------------------------------------------------------*/ 771 public static function PregStrReplace ( $pattern, $replacement, $subject, $limit = -1, &$match_count = null ) 772 { 773 // Make sure that $pattern and $replacement become arrays of the same size 774 if ( is_array ( $pattern ) ) 775 { 776 if ( is_array ( $replacement ) ) 777 { 778 if ( count ( $pattern ) !== count ( $replacement ) ) 779 { 780 warning ( "The \$replacement parameter should have the same number of element as \$pattern." ) ; 781 return ( $subject ) ; 782 } 783 } 784 else 785 $replacement = array_fill ( $replacement, count ( $pattern ), $replacement ) ; 786 } 787 else 788 { 789 if ( is_array ( $replacement ) ) 790 { 791 warning ( "Expected string for the \$replacement parameter." ) ; 792 return ( $subject ) ; 793 } 794 795 $pattern = array ( $pattern ) ; 796 $replacement = array ( $replacement ) ; 797 } 798 799 // Upper limit 800 if ( $limit < 1 ) 801 $limit = PHP_INT_MAX ; 802 803 // Loop through each supplied pattern 804 $current_subject = $subject ; 805 $count = 0 ; 806 807 for ( $i = 0, $pattern_count = count ( $pattern ) ; $i < $pattern_count ; $i ++ ) 808 { 809 $regex = $pattern [$i] ; 810 811 // Get all matches for this pattern 812 if ( preg_match_all ( $regex, $current_subject, $matches, PREG_OFFSET_CAPTURE ) ) 813 { 814 $result = '' ; // Current output result 815 $last_offset = 0 ; 816 817 // Process each match 818 foreach ( $matches [0] as $match ) 819 { 820 $offset = ( integer ) $match [1] ; 821 822 // Append data from the last seen offset up to the current one 823 if ( $last_offset < $offset ) 824 $result .= substr ( $current_subject, $last_offset, $offset - $last_offset ) ; 825 826 // Append the replacement string for this match 827 $result .= $replacement [$i] ; 828 829 // Compute next offset in $current_subject 830 $last_offset = $offset + strlen ( $match [0] ) ; 831 832 // Limit checking 833 $count ++ ; 834 835 if ( $count > $limit ) 836 break 2 ; 837 } 838 839 // Append the last part of the subject that has not been matched by anything 840 $result .= substr ( $current_subject, $last_offset ) ; 841 842 // The current subject becomes the string that has been built in the steps above 843 $current_subject = $result ; 844 } 845 } 846 847 /// All done, return 848 return ( $current_subject ) ; 849 } 850 851 852 /*-------------------------------------------------------------------------------------------------------------- 853 854 NAME 855 ProcessEscapedCharacter - Interprets a character after a backslash in a string. 856 857 PROTOTYPE 858 $ch = $this -> ProcessEscapedCharacter ( $ch ) ; 859 860 DESCRIPTION 861 Interprets a character after a backslash in a string and returns the interpreted value. 862 863 PARAMETERS 864 $ch (char) - 865 Character to be escaped. 866 867 RETURN VALUE 868 The escaped character. 869 870 NOTES 871 This method does not process octal sequences. 872 873 *-------------------------------------------------------------------------------------------------------------*/ 874 protected function ProcessEscapedCharacter ( $ch ) 875 { 876 switch ( $ch ) 877 { 878 // Normally, only a few characters should be escaped... 879 case '(' : $newchar = "(" ; break ; 880 case ')' : $newchar = ")" ; break ; 881 case '[' : $newchar = "[" ; break ; 882 case ']' : $newchar = "]" ; break ; 883 case '\\' : $newchar = "\\" ; break ; 884 case 'n' : $newchar = "\n" ; break ; 885 case 'r' : $newchar = "\r" ; break ; 886 case 'f' : $newchar = "\f" ; break ; 887 case 't' : $newchar = "\t" ; break ; 888 case 'b' : $newchar = chr ( 8 ) ; break ; 889 case 'v' : $newchar = chr ( 11 ) ; break ; 890 891 // ... but should we consider that it is a heresy to escape other characters ? 892 // For the moment, no. 893 default : $newchar = $ch ; break ; 894 } 895 896 return ( $newchar ) ; 897 } 898 899 900 /*-------------------------------------------------------------------------------------------------------------- 901 902 NAME 903 ProcessEscapedString - Processes a string which can have escaped characters. 904 905 PROTOTYPE 906 $result = $this -> ProcessEscapedString ( $str, $process_octal_escapes = false ) ; 907 908 DESCRIPTION 909 Processes a string which may contain escape sequences. 910 911 PARAMETERS 912 $str (string) - 913 String to be processed. 914 915 $process_octal_escapes (boolean) - 916 When true, octal escape sequences such as \037 are processed. 917 918 RETURN VALUE 919 The processed input string. 920 921 *-------------------------------------------------------------------------------------------------------------*/ 922 protected function ProcessEscapedString ( $str, $process_octal_escapes = false ) 923 { 924 $length = strlen ( $str ) ; 925 $offset = 0 ; 926 $result = '' ; 927 $ord0 = ord ( '0' ) ; 928 929 while ( ( $backslash_index = strpos ( $str, '\\', $offset ) ) !== false ) 930 { 931 if ( $backslash_index + 1 < $length ) 932 { 933 $ch = $str [ ++ $backslash_index ] ; 934 935 if ( ! $process_octal_escapes ) 936 { 937 $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) . $this -> ProcessEscapedCharacter ( $ch ) ; 938 $offset = $backslash_index + 1 ; 939 } 940 else if ( $ch < '0' || $ch > '7' ) 941 { 942 $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) . $this -> ProcessEscapedCharacter ( $ch ) ; 943 $offset = $backslash_index + 1 ; 944 } 945 else 946 { 947 $result .= substr ( $str, $offset, $backslash_index - $offset - 1 ) ; 948 $ord = ord ( $ch ) - $ord0 ; 949 $count = 0 ; 950 $backslash_index ++ ; 951 952 while ( $backslash_index < $length && $count < 2 && 953 $str [ $backslash_index ] >= '0' && $str [ $backslash_index ] <= '7' ) 954 { 955 $ord = ( $ord * 8 ) + ( ord ( $str [ $backslash_index ++ ] ) - $ord0 ) ; 956 $count ++ ; 957 } 958 959 $result .= chr ( $ord ) ; 960 $offset = $backslash_index ; 961 } 962 } 963 else 964 break ; 965 } 966 967 $result .= substr ( $str, $offset ) ; 968 969 return ( $result ) ; 970 } 971 972 973 /*-------------------------------------------------------------------------------------------------------------- 974 975 NAME 976 Unescape - Processes escape sequences from the specified string. 977 978 PROTOTYPE 979 $value = $this -> Unescape ( $text ) ; 980 981 DESCRIPTION 982 Processes escape sequences within the specified text. The recognized escape sequences are like the 983 C-language ones : \b (backspace), \f (form feed), \r (carriage return), \n (newline), \t (tab). 984 All other characters prefixed by "\" are returned as is. 985 986 PARAMETERS 987 $text (string) - 988 Text to be unescaped. 989 990 RETURN VALUE 991 Returns the unescaped value of $text. 992 993 *-------------------------------------------------------------------------------------------------------------*/ 994 public static function Unescape ( $text ) 995 { 996 $length = strlen ( $text ) ; 997 $result = '' ; 998 $ord0 = ord ( 0 ) ; 999 1000 for ( $i = 0 ; $i < $length ; $i ++ ) 1001 { 1002 $ch = $text [$i] ; 1003 1004 if ( $ch == '\\' && isset ( $text [$i+1] ) ) 1005 { 1006 $nch = $text [++$i] ; 1007 1008 switch ( $nch ) 1009 { 1010 case 'b' : $result .= "\b" ; break ; 1011 case 't' : $result .= "\t" ; break ; 1012 case 'f' : $result .= "\f" ; break ; 1013 case 'r' : $result .= "\r" ; break ; 1014 case 'n' : $result .= "\n" ; break ; 1015 default : 1016 // Octal escape notation 1017 if ( $nch >= '0' && $nch <= '7' ) 1018 { 1019 $ord = ord ( $nch ) - $ord0 ; 1020 $digits = 1 ; 1021 $i ++ ; 1022 1023 while ( $i < $length && $digits < 3 && $text [$i] >= '0' && $text [$i] <= '7' ) 1024 { 1025 $ord = ( $ord * 8 ) + ord ( $text [$i] ) - $ord0 ; 1026 $i ++ ; 1027 $digits ++ ; 1028 } 1029 1030 $i -- ; // Count one character less since $i will be incremented at the end of the for() loop 1031 1032 $result .= chr ( $ord ) ; 1033 } 1034 else 1035 $result .= $nch ; 1036 } 1037 } 1038 else 1039 $result .= $ch ; 1040 } 1041 1042 return ( $result ) ; 1043 } 1044 1045 1046 /*-------------------------------------------------------------------------------------------------------------- 1047 1048 NAME 1049 UnescapeHexCharacters - Unescapes characters in the #xy notation. 1050 1051 PROTOTYPE 1052 $result = $this -> UnescapeHexCharacters ( $data ) ; 1053 1054 DESCRIPTION 1055 Some specifications contain hex characters specified as #xy. For the moment, I have met such a construct in 1056 font aliases such as : 1057 /C2#5F0 25 0 R 1058 where "#5F" stands for "_", giving : 1059 /C2_0 25 0 R 1060 Hope that such constructs do not happen in other places... 1061 1062 PARAMETERS 1063 $data (string) - 1064 String to be unescaped. 1065 1066 RETURN VALUE 1067 The input string with all the hex character representations replaced with their ascii equivalent. 1068 1069 *-------------------------------------------------------------------------------------------------------------*/ 1070 public static function UnescapeHexCharacters ( $data ) 1071 { 1072 if ( strpos ( $data, 'stream' ) === false && preg_match ( '/(?P<hex> \# [0-9a-f] [0-9a-f])/ix', $data ) ) 1073 { 1074 preg_match_all ( '/(?P<hex> \# [0-9a-f] [0-9a-f])/ix', $data, $matches ) ; 1075 1076 $searches = array ( ) ; 1077 $replacements = array ( ) ; 1078 1079 foreach ( $matches [ 'hex' ] as $hex ) 1080 { 1081 if ( ! isset ( $searches [ $hex ] ) ) 1082 { 1083 $searches [ $hex ] = $hex ; 1084 $replacements [] = chr ( hexdec ( substr ( $hex, 1 ) ) ) ; 1085 } 1086 1087 $data = str_replace ( $searches, $replacements, $data ) ; 1088 } 1089 } 1090 1091 return ( $data ) ; 1092 } 1093 1094 1095 /*-------------------------------------------------------------------------------------------------------------- 1096 1097 ValidatePhpName - 1098 Checks that the specified name (declared in the XML template) is a valid PHP name. 1099 1100 *-------------------------------------------------------------------------------------------------------------*/ 1101 public static function ValidatePhpName ( $name ) 1102 { 1103 $name = trim ( $name ) ; 1104 1105 if ( ! preg_match ( '/^ [a-z_][a-z0-9_]* $/ix', $name ) ) 1106 error ( new PdfToTextFormException ( "Invalid PHP name \"$name\"." ) ) ; 1107 1108 return ( $name ) ; 1109 } 1110 } 1111 1112 1113/*============================================================================================================== 1114 1115 PdfToText class - 1116 A class for extracting text from Pdf files. 1117 1118 ==============================================================================================================*/ 1119class PdfToText extends PdfObjectBase 1120 { 1121 // Current version of the class 1122 const VERSION = "1.6.7" ; 1123 1124 // Pdf processing options 1125 const PDFOPT_NONE = 0x00000000 ; // No extra option 1126 const PDFOPT_REPEAT_SEPARATOR = 0x00000001 ; // Repeats the Separator property if the offset between two text blocks (in array notation) 1127 // is greater than $this -> MinSpaceWidth 1128 const PDFOPT_GET_IMAGE_DATA = 0x00000002 ; // Retrieve raw image data in the $ths -> ImageData array 1129 const PDFOPT_DECODE_IMAGE_DATA = 0x00000004 ; // Creates a jpeg resource for each image 1130 const PDFOPT_IGNORE_TEXT_LEADING = 0x00000008 ; // Ignore text leading values 1131 const PDFOPT_NO_HYPHENATED_WORDS = 0x00000010 ; // Join hyphenated words that are split on two lines 1132 const PDFOPT_AUTOSAVE_IMAGES = 0x00000020 ; // Autosave images ; the ImageFileTemplate property will need to be defined 1133 const PDFOPT_ENFORCE_EXECUTION_TIME = 0x00000040 ; // Enforces the max_execution_time PHP setting when processing a file. A PdfTexterTimeoutException 1134 // will be thrown if processing of a single file reaches (time_limit - 1 second) by default 1135 // The MaxExecutionTime property can be set to modify this default value. 1136 const PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME = 0x00000080 ; // Same as PDFOPT_ENFORCE_EXECUTION_TIME, but for all calls to the Load() method of the PdfToText class 1137 // The MaxGlobalExecutionTime static property can be set to modify the default time limit 1138 const PDFOPT_IGNORE_HEADERS_AND_FOOTERS = 0x00000300 ; // Ignore headers and footers 1139 1140 const PDFOPT_RAW_LAYOUT = 0x00000000 ; // Layout rendering : raw (default) 1141 const PDFOPT_BASIC_LAYOUT = 0x00000400 ; // Layout rendering : basic 1142 1143 const PDFOPT_LAYOUT_MASK = 0x00000C00 ; // Mask to isolate the targeted layout 1144 1145 const PDFOPT_ENHANCED_STATISTICS = 0x00001000 ; // Compute statistics on PDF language instructions 1146 const PDFOPT_DEBUG_SHOW_COORDINATES = 0x00002000 ; // Include text coordinates ; implies the PDFOPT_BASIC_LAYOUT option 1147 // This option can be useful if you want to use capture areas and get information about 1148 // their coordinates 1149 const PDFOPT_CAPTURE = 0x00004000 ; // Indicates that the caller wants to capture some text and use the SetCaptures() method 1150 // It currently enables the PDFOPT_BASIC_LAYOUT option 1151 const PDFOPT_LOOSE_X_CAPTURE = 0x00008000 ; // Includes in captures text fragments whose dimensions may exceed the captured area dimensions 1152 const PDFOPT_LOOSE_Y_CAPTURE = 0x00010000 ; // (currently not used) 1153 1154 // When boolean true, outputs debug information about fonts, character maps and drawing contents. 1155 // When integer > 1, outputs additional information about other objects. 1156 public static $DEBUG = false ; 1157 1158 // Current filename 1159 public $Filename = false ; 1160 // Extracted text 1161 public $Text = '' ; 1162 // Document pages (array of strings) 1163 public $Pages = array ( ) ; 1164 // Document images (array of PdfImage objects) 1165 public $Images = array ( ) ; 1166 protected $ImageCount = 0 ; 1167 // Raw data for document images 1168 public $ImageData = array ( ) ; 1169 // ImageAutoSaveFileTemplate : 1170 // Template for the file names to be generated when extracting images, if the PDFOPT_AUTOSAVE_IMAGES has been specified. 1171 // Can contain any path, plus the following printf()-like modifiers : 1172 // . "%p" : Path of the original PDF file. 1173 // . "%f" : Filename part of the original PDF file. 1174 // . "%d" : A sequential number, starting from 1, used when generating filenames. The format can contains a width specifier, 1175 // such as "%3d", which will generate 3-digits sequential numbers left-filled with zeroes. 1176 // . "%s" : Image suffix, which will automatically based on the underlying image type. 1177 public $ImageAutoSaveFileTemplate = "%p/%f.%d.%s" ; 1178 // Auto-save image file format 1179 public $ImageAutoSaveFormat = IMG_JPEG ; 1180 // Auto-saved image file names 1181 public $AutoSavedImageFiles = array ( ) ; 1182 // Text chunk separator (used to separate blocks of text specified as an array notation) 1183 public $BlockSeparator = '' ; 1184 // Separator used to separate text groups where the offset value is less than -1000 thousands of character units 1185 // (eg : [(1)-1822(2)] will add a separator between the characters "1" and "2") 1186 // Note that such values are expressed in thousands of text units and subtracted from the current position. A 1187 // negative value means adding more space between the two text units it separates. 1188 public $Separator = ' ' ; 1189 // Separator to be used between pages in the $Text property 1190 public $PageSeparator = "\n" ; 1191 // Minimum value (in 1/1000 of text units) that separates two text chunks that can be considered as a real space 1192 public $MinSpaceWidth = 200 ; 1193 // Pdf options 1194 public $Options = self::PDFOPT_NONE ; 1195 // Maximum number of pages to extract from the PDF. A zero value means "extract everything" 1196 // If this number is negative, then the pages to be extract start from the last page. For example, a value of -2 1197 // extracts the last two pages 1198 public $MaxSelectedPages = false ; 1199 // Maximum number of images to be extracted. A value of zero means "extract everything". A non-zero value gives 1200 // the number of images to extract. 1201 public $MaxExtractedImages = false ; 1202 // Location of the CID tables directory 1203 public static $CIDTablesDirectory ; 1204 // Loacation of the Font metrics directory, for the Adobe standard 14 fonts 1205 public static $FontMetricsDirectory ; 1206 // Standard Adobe font names, and their corresponding file in $FontMetricsDirectory 1207 public static $AdobeStandardFontMetrics = array 1208 ( 1209 'courier' => 'courier.fm', 1210 'courier-bold' => 'courierb.fm', 1211 'courier-oblique' => 'courieri.fm', 1212 'courier-boldoblique' => 'courierbi.fm', 1213 'helvetica' => 'helvetica.fm', 1214 'helvetica-bold' => 'helveticab.fm', 1215 'helvetica-oblique' => 'helveticai.fm', 1216 'helvetica-boldoblique' => 'helveticabi.fm', 1217 'symbol' => 'symbol.fm', 1218 'times-roman' => 'times.fm', 1219 'times-bold' => 'timesb.fm', 1220 'times-bolditalic' => 'timesbi.fm', 1221 'times-italic' => 'timesi.fm', 1222 'zapfdingbats' => 'zapfdingbats.fm' 1223 ) ; 1224 // Author information 1225 public $Author = '' ; 1226 public $CreatorApplication = '' ; 1227 public $ProducerApplication = '' ; 1228 public $CreationDate = '' ; 1229 public $ModificationDate = '' ; 1230 public $Title = '' ; 1231 public $Subject = '' ; 1232 public $Keywords = '' ; 1233 protected $GotAuthorInformation = false ; 1234 // Unique and arbitrary file identifier, as specified in the PDF file 1235 // Well, in fact, there are two IDs, but the PDF specification does not mention the goal of the second one 1236 public $ID = '' ; 1237 public $ID2 = '' ; 1238 // End of line string 1239 public $EOL = PHP_EOL ; 1240 // String to be used when no Unicode translation is possible 1241 public static $Utf8Placeholder = '' ; 1242 // Information about memory consumption implied by the file currently being loaded 1243 public $MemoryUsage, 1244 $MemoryPeakUsage ; 1245 // Offset of the document start (%PDF-x.y) 1246 public $DocumentStartOffset ; 1247 // Debug statistics 1248 public $Statistics = array ( ) ; 1249 // Max execution time settings. A positive value means "don't exceed that number of seconds". 1250 // A negative value means "Don't exceed PHP setting max_execution_time - that number of seconds". If the result 1251 // is negative, then the default will be "max_execution_time - 1". 1252 // For those limits to be enforced, you need to specify either the PDFOPT_ENFORCE_EXECUTION_TIME or 1253 // PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME options, or both 1254 public $MaxExecutionTime = -1 ; 1255 public static $MaxGlobalExecutionTime = -1 ; 1256 // This property is expressed in percents ; it gives the extra percentage to add to the values computed by 1257 // the PdfTexterFont::GetStringWidth() method. 1258 // This is basically used when computing text positions and string lengths with the PDFOPT_BASIC_LAYOUT option : 1259 // the computed string length is shorter than its actual length (because of extra spacing determined by character 1260 // kerning in the font data). To determine whether two consecutive blocks of text should be separated by a space, 1261 // we empirically add this extra percentage to the computed string length. The default is -5%. 1262 public $ExtraTextWidth = -5 ; 1263 1264 // Marker stuff. The unprocessed marker list is a sequential array of markers, which will later be dispatched into 1265 // indexed arrays during their first reference 1266 protected $UnprocessedMarkerList = array ( 'font' => array ( ) ) ; 1267 protected $TextWithFontMarkers = array ( ) ; 1268 1269 // Internal variables used when the PDFOPT_ENFORCE_* options are specified 1270 protected static $PhpMaxExecutionTime ; 1271 protected static $GlobalExecutionStartTime ; 1272 protected static $AllowedGlobalExecutionTime ; 1273 protected $ExecutionStartTime ; 1274 protected $AllowedExecutionTime ; 1275 1276 // Font mappings 1277 protected $FontTable = false ; 1278 // Extra Adobe standard font mappings (for character names of the form "/axxx" for example) 1279 protected $AdobeExtraMappings = array ( ) ; 1280 // Page map object 1281 protected $PageMap ; 1282 // Page locations (start and end offsets) 1283 protected $PageLocations ; 1284 // Encryption data 1285 public $IsEncrypted = false ; 1286 protected $EncryptionData = false ; 1287 // A flag coming from the constructor options, telling if enhanced statistics are enabled 1288 protected $EnhancedStatistics ; 1289 1290 // Document text fragments, with their absolute (x,y) position, approximate width and height 1291 protected $DocumentFragments ; 1292 1293 // Form data 1294 protected $FormData ; 1295 protected $FormDataObjectNumbers ; 1296 protected $FormDataDefinitions ; 1297 protected $FormaDataObjects ; 1298 1299 // Capture data 1300 public $CaptureDefinitions ; 1301 protected $CaptureObject ; 1302 1303 // Indicates whether global static initializations have been made 1304 // This is mainly used for variables such as $Utf8PlaceHolder, which is initialized to a different value 1305 private static $StaticInitialized = false ; 1306 1307 // Drawing instructions that are to be ignored and removed from a text stream before processing, for performance 1308 // reasons (it is faster to call preg_replace() once to remove them than calling the __next_instruction() and 1309 // __next_token() methods to process an input stream containing such useless instructions) 1310 // This is an array of regular expressions where the following constructs are replaced at runtime during static 1311 // initialization : 1312 // %n - Will be replaced with a regex matching a decimal number. 1313 private static $IgnoredInstructionTemplatesLayout = array 1314 ( 1315 '%n{6} ( (c) ) \s+', 1316 '%n{4} ( (re) | (y) | (v) | (k) | (K) ) \s+', 1317 '%n{3} ( (scn) | (SCN) | (r) | (rg) | (RG) | (sc) | (SC) ) \s+', 1318 '%n{2} ( (m) | (l) ) \s+', 1319 '%n ( (w) | (M) | (g) | (G) | (J) | (j) | (d) | (i) | (sc) | (SC) | (Tc) | (Tw) | (scn) | (Tr) | (Tz) | (Ts) ) \s+', 1320 '\b ( (BDC) | (EMC) ) \s+', 1321 '\/( (Cs \d+) | (CS \d+) | (G[Ss] \d+) | (Fm \d+) | (Im \d+) | (PlacedGraphic) ) \s+ \w+ \s*', 1322 '\/( (Span) | (Artifact) | (Figure) | (P) ) \s* << .*? >> [ \t\r\n>]*', 1323 '\/ ( (PlacedGraphic) | (Artifact) ) \s+', 1324 '\d+ \s+ ( (scn) | (SCN) )', 1325 '\/MC \d+ \s+', 1326 '^ \s* [fhS] \r? \n', 1327 '^W \s+ n \r? \n', 1328 '(f | W) \* \s+', 1329 '^[fhnS] \s+', 1330 '-?0 (\. \d+)? \s+ T[cw]', 1331 '\bBI \s+ .*? \bID \s+ .*? \bEI', 1332 '\/ \w+ \s+ ( (cs) | (CS) | (ri) | (gs) )', 1333 // Hazardous replaces ? 1334 '( [Ww] \s+ ){3,}', 1335 ' \[\] \s+ [Shs] \s+' 1336 ) ; 1337 // Additional instructions to be stripped when no particular page layout has been requested 1338 private static $IgnoredInstructionTemplatesNoLayout = array 1339 ( 1340 '%n{6} ( (cm) ) \s+', 1341// '\b ( (BT) | (ET) ) \s+', 1342 '^ \s* [Qq] \r? \n', 1343 '^ \s* (\b [a-zA-Z] \s+)+', 1344 '\s* (\b [a-zA-Z] \s+)+$', 1345 '^[qQ] \s+', 1346 '^q \s+ [hfS] \n', 1347 '( [Qfhnq] \s+ ){2,}' 1348 ) ; 1349 // Replacement regular expressions for %something constructs specified in the $IgnoredInstructions array 1350 private static $ReplacementConstructs = array 1351 ( 1352 '%n' => '( [+\-]? ( ( [0-9]+ ( \. [0-9]* )? ) | ( \. [0-9]+ ) ) \s+ )' 1353 ) ; 1354 // The final regexes that are built during static initialization by the __build_ignored_instructions() method 1355 private static $IgnoredInstructionsNoLayout = array ( ) ; 1356 private static $IgnoredInstructionsLayout = array ( ) ; 1357 private $IgnoredInstructions = array ( ) ; 1358 1359 // Map id buffer - for avoiding unneccesary calls to GetFontByMapId 1360 private $MapIdBuffer = array ( ) ; 1361 1362 // Same for MapCharacter() 1363 private $CharacterMapBuffer = array ( ) ; 1364 1365 // Font objects buffer - used by __assemble_text_fragments() 1366 private $FontObjectsBuffer = array ( ) ; 1367 1368 // Regex used for removing hyphens - we have to take care of different line endings : "\n" for Unix, "\r\n" 1369 // for Windows, and "\r" for pure Mac files. 1370 // Note that we replace an hyphen followed by an end-of-line then by non-space characters with the non-space 1371 // characters, so the word gets joined on the same line. Spaces after the end of the word (on the next line) 1372 // are removed, in order for the next word to appear at the beginning of the second line. 1373 private static $RemoveHyphensRegex = '# 1374 ( 1375 - 1376 [ \t]* ( (\r\n) | \n | \r )+ [ \t\r\n]* 1377 ) 1378 ([^ \t\r\n]+) 1379 \s* 1380 #msx' ; 1381 1382 // A small list of Unicode character ranges that are related to languages written from right to left 1383 // For performance reasons, everythings is mapped to a range here, even if it includes codepoints that do not map to anything 1384 // (this class is not a Unicode codepoint validator, but a Pdf text extractor...) 1385 // The UTF-16 version is given as comments ; only the UTF-8 translation is used here 1386 // To be completed ! 1387 private static $RtlCharacters = array 1388 ( 1389 // This range represents the following languages : 1390 // - Hebrew (0590..05FF) 1391 // - Arabic (0600..06FF) 1392 // - Syriac (0700..074F) 1393 // - Supplement for Arabic (0750..077F) 1394 // - Thaana (0780..07BF) 1395 // - N'ko (07C0..07FF) 1396 // - Samaritan (0800..083F) 1397 // - Mandaic (0840..085F) 1398 // array ( 0x00590, 0x0085F ), 1399 // Hebrew supplement (I suppose ?) + other characters 1400 // array ( 0x0FB1D, 0x0FEFC ), 1401 // Mende kikakui 1402 // array ( 0x1E800, 0x1E8DF ), 1403 // Adlam 1404 // array ( 0x1E900, 0x1E95F ), 1405 // Others 1406 // array ( 0x10800, 0x10C48 ), 1407 // array ( 0x1EE00, 0x1EEBB ) 1408 "\xD6" => array ( array ( "\x90", "\xBF" ) ), 1409 "\xD7" => array ( array ( "\x80", "\xBF" ) ), 1410 "\xD8" => array ( array ( "\x80", "\xBF" ) ), 1411 "\xD9" => array ( array ( "\x80", "\xBF" ) ), 1412 "\xDA" => array ( array ( "\x80", "\xBF" ) ), 1413 "\xDB" => array ( array ( "\x80", "\xBF" ) ), 1414 "\xDC" => array ( array ( "\x80", "\xBF" ) ), 1415 "\xDD" => array ( array ( "\x80", "\xBF" ) ), 1416 "\xDE" => array ( array ( "\x80", "\xBF" ) ), 1417 "\xDF" => array ( array ( "\x80", "\xBF" ) ) 1418 /* 1419 "\xE0" => array 1420 ( 1421 array ( "\xA0\x80", "\xA0\xBF" ), 1422 array ( "\xA1\x80", "\xA1\x9F" ) 1423 ), 1424 "\xEF" => array 1425 ( 1426 array ( "\xAC\x9D", "\xAC\xBF" ), 1427 array ( "\xAD\x80", "\xAD\xBF" ), 1428 array ( "\xAE\x80", "\xAE\xBF" ), 1429 array ( "\xAF\x80", "\xAF\xBF" ), 1430 array ( "\xB0\x80", "\xB0\xBF" ), 1431 array ( "\xB1\x80", "\xB1\xBF" ), 1432 array ( "\xB2\x80", "\xB2\xBF" ), 1433 array ( "\xB3\x80", "\xB3\xBF" ), 1434 array ( "\xB4\x80", "\xB4\xBF" ), 1435 array ( "\xB5\x80", "\xB5\xBF" ), 1436 array ( "\xB6\x80", "\xB6\xBF" ), 1437 array ( "\xB7\x80", "\xB7\xBF" ), 1438 array ( "\xB8\x80", "\xB8\xBF" ), 1439 array ( "\xB9\x80", "\xB9\xBF" ), 1440 array ( "\xBA\x80", "\xBA\xBF" ), 1441 array ( "\xBB\x80", "\xBB\xBC" ) 1442 ) 1443 */ 1444 ) ; 1445 1446 // UTF-8 prefixes for RTL characters as keys, and number of characters that must follow the prefix as values 1447 private static $RtlCharacterPrefixLengths = array 1448 ( 1449 "\xD6" => 1, 1450 "\xD7" => 1, 1451 "\xD8" => 1, 1452 "\xD9" => 1, 1453 "\xDA" => 1, 1454 "\xDB" => 1, 1455 "\xDC" => 1, 1456 "\xDE" => 1, 1457 "\xDF" => 1 1458 /* 1459 "\xE0" => 2, 1460 "\xEF" => 2 1461 */ 1462 ) ; 1463 1464 // A string that contains all the RTL character prefixes above 1465 private static $RtlCharacterPrefixes ; 1466 1467 // As usual, caching a little bit the results of the IsRtlCharacter() method is welcome. Each item will have the value true if the 1468 // character is RTL, or false if LTR. 1469 private $RtlCharacterBuffer = array ( ) ; 1470 1471 // A subset of a character classification array that avoids too many calls to the ctype_* functions or too many 1472 // character comparisons. 1473 // This array is used only for highly sollicited parts of code 1474 const CTYPE_ALPHA = 0x01 ; // Letter 1475 const CTYPE_DIGIT = 0x02 ; // Digit 1476 const CTYPE_XDIGIT = 0x04 ; // Hex digit 1477 const CTYPE_ALNUM = 0x08 ; // Letter or digit 1478 const CTYPE_LOWER = 0x10 ; // Lower- or upper-case letters 1479 const CTYPE_UPPER = 0x20 ; 1480 1481 private static $CharacterClasses = false ; 1482 1483 // Stuff specific to the current PHP version 1484 private static $HasMemoryGetUsage ; 1485 private static $HasMemoryGetPeakUsage ; 1486 1487 1488 /*-------------------------------------------------------------------------------------------------------------- 1489 1490 CONSTRUCTOR 1491 $pdf = new PdfToText ( $filename = null, $options = PDFOPT_NONE ) ; 1492 1493 DESCRIPTION 1494 Builds a PdfToText object and optionally loads the specified file's contents. 1495 1496 PARAMETERS 1497 $filename (string) - 1498 Optional PDF filename whose text contents are to be extracted. 1499 1500 $options (integer) - 1501 A combination of PDFOPT_* flags. This can be any of the following : 1502 1503 - PDFOPT_REPEAT_SEPARATOR : 1504 Text constructs specified as an array are separated by an offset which is expressed as 1505 thousands of text units ; for example : 1506 1507 [(1)-2000(2)] 1508 1509 will be rendered as the text "1 2" ("1" and "2" being separated by two spaces) if the 1510 "Separator" property is set to a space (the default) and this flag is specified. 1511 When not specified, the text will be rendered as "1 2". 1512 1513 - PDFOPT_NONE : 1514 None of the above options will apply. 1515 1516 *-------------------------------------------------------------------------------------------------------------*/ 1517 public function __construct ( $filename = null, $options = self::PDFOPT_NONE, $user_password = false, $owner_password = false ) 1518 { 1519 // We need the mbstring PHP extension here... 1520 if ( ! function_exists ( 'mb_convert_encoding' ) ) 1521 error ( "You must enable the mbstring PHP extension to use this class." ) ; 1522 1523 // Perform static initializations if needed 1524 if ( ! self::$StaticInitialized ) 1525 { 1526 if ( self::$DEBUG ) 1527 { 1528 // In debug mode, initialize the utf8 placeholder only if it still set to its default value, the empty string 1529 if ( self::$Utf8Placeholder == '' ) 1530 self::$Utf8Placeholder = '[Unknown character : 0x%08X]' ; 1531 } 1532 1533 // Build the list of regular expressions from the list of ignored instruction templates 1534 self::__build_ignored_instructions ( ) ; 1535 1536 // Check if some functions are supported or not 1537 self::$HasMemoryGetUsage = function_exists ( 'memory_get_usage' ) ; 1538 self::$HasMemoryGetPeakUsage = function_exists ( 'memory_get_peak_usage' ) ; 1539 1540 // Location of the directory containing CID fonts 1541 self::$CIDTablesDirectory = dirname ( __FILE__ ) . DIRECTORY_SEPARATOR . 'CIDTables' ; 1542 self::$FontMetricsDirectory = dirname ( __FILE__ ) . DIRECTORY_SEPARATOR . 'FontMetrics' ; 1543 1544 // The string that contains all the Rtl character prefixes in UTF-8 - An optimization used by the __rtl_process() method 1545 self::$RtlCharacterPrefixes = implode ( '', array_keys ( self::$RtlCharacterPrefixLengths ) ) ; 1546 1547 // Build the character classes (used only for testing letters and digits) 1548 if ( self::$CharacterClasses === false ) 1549 { 1550 for ( $ord = 0 ; $ord < 256 ; $ord ++ ) 1551 { 1552 $ch = chr ( $ord ) ; 1553 1554 if ( $ch >= '0' && $ch <= '9' ) 1555 self::$CharacterClasses [ $ch ] = self::CTYPE_DIGIT | self::CTYPE_XDIGIT | self::CTYPE_ALNUM ; 1556 else if ( $ch >= 'A' && $ch <= 'Z' ) 1557 { 1558 self::$CharacterClasses [ $ch ] = self::CTYPE_ALPHA | self::CTYPE_UPPER | self::CTYPE_ALNUM ; 1559 1560 if ( $ch <= 'F' ) 1561 self::$CharacterClasses [ $ch ] |= self::CTYPE_XDIGIT ; 1562 } 1563 else if ( $ch >= 'a' && $ch <= 'z' ) 1564 { 1565 self::$CharacterClasses [ $ch ] = self::CTYPE_ALPHA | self::CTYPE_LOWER | self::CTYPE_ALNUM ; 1566 1567 if ( $ch <= 'f' ) 1568 self::$CharacterClasses [ $ch ] |= self::CTYPE_XDIGIT ; 1569 } 1570 else 1571 self::$CharacterClasses [ $ch ] = 0 ; 1572 } 1573 } 1574 1575 // Global execution time limit 1576 self::$PhpMaxExecutionTime = ( integer ) ini_get ( 'max_execution_time' ) ; 1577 1578 if ( ! self::$PhpMaxExecutionTime ) // Paranoia : default max script execution time to 120 seconds 1579 self::$PhpMaxExecutionTime = 120 ; 1580 1581 self::$GlobalExecutionStartTime = microtime ( true ) ; // Set the start of the first execution 1582 1583 if ( self::$MaxGlobalExecutionTime > 0 ) 1584 self::$AllowedGlobalExecutionTime = self::$MaxGlobalExecutionTime ; 1585 else 1586 self::$AllowedGlobalExecutionTime = self::$PhpMaxExecutionTime + self::$MaxGlobalExecutionTime ; 1587 1588 // Adjust in case of inconsistent values 1589 if ( self::$AllowedGlobalExecutionTime < 0 || self::$AllowedGlobalExecutionTime > self::$PhpMaxExecutionTime ) 1590 self::$AllowedGlobalExecutionTime = self::$PhpMaxExecutionTime - 1 ; 1591 1592 self::$StaticInitialized = true ; 1593 } 1594 1595 parent::__construct ( ) ; 1596 1597 $this -> Options = $options ; 1598 1599 if ( $filename ) 1600 $this -> Load ( $filename, $user_password, $owner_password ) ; 1601 } 1602 1603 1604 public function __tostring ( ) 1605 { return ( $this -> Text ) ; } 1606 1607 1608 /************************************************************************************************************** 1609 ************************************************************************************************************** 1610 ************************************************************************************************************** 1611 ****** ****** 1612 ****** ****** 1613 ****** PUBLIC METHODS ****** 1614 ****** ****** 1615 ****** ****** 1616 ************************************************************************************************************** 1617 ************************************************************************************************************** 1618 **************************************************************************************************************/ 1619 1620 /*-------------------------------------------------------------------------------------------------------------- 1621 1622 NAME 1623 Load - Loads text contents from a PDF file. 1624 LoadFromString - Loads PDF contents from a string. 1625 1626 PROTOTYPE 1627 $text = $pdf -> Load ( $filename, $user_password = false, $owner_password = false ) ; 1628 $text = $pdf -> LoadFromString ( $contents, $user_password = false, $owner_password = false ) ; 1629 1630 DESCRIPTION 1631 The Load() method extracts text contents from the specified PDF file. Once processed, text contents will 1632 be available through the "Text" property. 1633 The LoadFromString() method performs the same operation on PDF contents already loaded into memory. 1634 1635 PARAMETERS 1636 $filename (string) - 1637 Optional PDF filename whose text contents are to be extracted. 1638 1639 $contents (string) - 1640 String containing PDF contents. 1641 1642 $user_password (string) - 1643 User password used for decrypting PDF contents. 1644 1645 $owner_password (string) - 1646 Owner password. 1647 1648 *-------------------------------------------------------------------------------------------------------------*/ 1649 private $__memory_peak_usage_start, 1650 $__memory_usage_start ; 1651 1652 public function Load ( $filename, $user_password = false, $owner_password = false ) 1653 { 1654 $this -> __memory_usage_start = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ; 1655 $this -> __memory_peak_usage_start = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ; 1656 1657 // Check if the file exists, but only if the file is on a local filesystem 1658 if ( ! preg_match ( '#^ [^:]+ ://#ix', $filename ) && ! file_exists ( $filename ) ) 1659 error ( new PdfToTextDecodingException ( "File \"$filename\" does not exist." ) ) ; 1660 1661 // Load its contents 1662 $contents = @file_get_contents ( $filename, FILE_BINARY ) ; 1663 1664 if ( $contents === false ) 1665 error ( new PdfToTextDecodingException ( "Unable to open \"$filename\"." ) ) ; 1666 1667 return ( $this -> __load ( $filename, $contents, $user_password, $owner_password ) ) ; 1668 } 1669 1670 1671 public function LoadFromString ( $contents, $user_password = false, $owner_password = false ) 1672 { 1673 $this -> __memory_usage_start = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ; 1674 $this -> __memory_peak_usage_start = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ; 1675 1676 return ( $this -> __load ( '', $contents, $user_password, $owner_password ) ) ; 1677 } 1678 1679 1680 private function __load ( $filename, $contents, $user_password = false, $owner_password = false ) 1681 { 1682 // Search for the start of the document ("%PDF-x.y") 1683 $start_offset = strpos ( $contents, '%PDF' ) ; 1684 1685 if ( $start_offset === false ) // Not a pdf document ! 1686 error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ; 1687 else // May be a PDF document 1688 $this -> DocumentStartOffset = $start_offset ; 1689 1690 // Check that this is a PDF file with a valid version number 1691 if ( ! preg_match ( '/ %PDF- (?P<version> \d+ (\. \d+)*) /ix', $contents, $match, 0, $start_offset ) ) 1692 error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ; 1693 1694 $this -> PdfVersion = $match [ 'version' ] ; 1695 1696 // Initializations 1697 $this -> Text = '' ; 1698 $this -> FontTable = new PdfTexterFontTable ( ) ; 1699 $this -> Filename = realpath ( $filename ) ; 1700 $this -> Pages = array ( ) ; 1701 $this -> Images = array ( ) ; 1702 $this -> ImageData = array ( ) ; 1703 $this -> ImageCount = 0 ; 1704 $this -> AutoSavedImageFiles = array ( ) ; 1705 $this -> PageMap = new PdfTexterPageMap ( ) ; 1706 $this -> PageLocations = array ( ) ; 1707 $this -> Author = '' ; 1708 $this -> CreatorApplication = '' ; 1709 $this -> ProducerApplication = '' ; 1710 $this -> CreationDate = '' ; 1711 $this -> ModificationDate = '' ; 1712 $this -> Title = '' ; 1713 $this -> Subject = '' ; 1714 $this -> Keywords = '' ; 1715 $this -> GotAuthorInformation = false ; 1716 $this -> ID = '' ; 1717 $this -> ID2 = '' ; 1718 $this -> EncryptionData = false ; 1719 $this -> EnhancedStatistics = ( ( $this -> Options & self::PDFOPT_ENHANCED_STATISTICS ) != 0 ) ; 1720 1721 // Also reset cached information that may come from previous runs 1722 $this -> MapIdBuffer = array ( ) ; 1723 $this -> RtlCharacterBuffer = array ( ) ; 1724 $this -> CharacterMapBuffer = array ( ) ; 1725 $this -> FontObjectsBuffer = array ( ) ; 1726 $this -> FormData = array ( ) ; 1727 $this -> FormDataObjectNumbers = false ; 1728 $this -> FomDataDefinitions = array ( ) ; 1729 $this -> FormDataObjects = array ( ) ; 1730 $this -> CaptureDefinitions = false ; 1731 $this -> CaptureObject = false ; 1732 $this -> DocumentFragments = array ( ) ; 1733 1734 // Enable the PDFOPT_BASIC_LAYOUT option if the PDFOPT_CAPTURE flag is specified 1735 if ( $this -> Options & self::PDFOPT_CAPTURE ) 1736 $this -> Options |= self::PDFOPT_BASIC_LAYOUT ; 1737 1738 // Enable the PDFOPT_BASIC_LAYOUT_OPTION is PDFOPT_DEBUG_SHOW_COORDINATES is specified 1739 if ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES ) 1740 $this -> Options |= self::PDFOPT_BASIC_LAYOUT ; 1741 1742 // Page layout options needs more instructions to be retained - select the appropriate list of useless instructions 1743 if ( $this -> Options & self::PDFOPT_BASIC_LAYOUT ) 1744 $this -> IgnoredInstructions = self::$IgnoredInstructionsLayout ; 1745 else 1746 $this -> IgnoredInstructions = self::$IgnoredInstructionsNoLayout ; 1747 1748 1749 // Debug statistics 1750 $this -> Statistics = array 1751 ( 1752 'TextSize' => 0, // Total size of drawing instructions ("text" objects) 1753 'OptimizedTextSize' => 0, // Optimized text size, with useless instructions removed 1754 'Distributions' => array // Statistics about handled instructions distribution - Works only with the page layout option in debug mode 1755 ( 1756 'operand' => 0, 1757 'Tm' => 0, 1758 'Td' => 0, 1759 'TD' => 0, 1760 "'" => 0, 1761 'TJ' => 0, 1762 'Tj' => 0, 1763 'Tf' => 0, 1764 'TL' => 0, 1765 'T*' => 0, 1766 '(' => 0, 1767 '<' => 0, 1768 '[' => 0, 1769 'cm' => 0, 1770 'BT' => 0, 1771 'template' => 0, 1772 'ignored' => 0, 1773 'space' => 0 1774 ) 1775 ) ; 1776 1777 // Per-instance execution time limit 1778 $this -> ExecutionStartTime = microtime ( true ) ; 1779 1780 if ( $this -> MaxExecutionTime > 0 ) 1781 $this -> AllowedExecutionTime = $this -> MaxExecutionTime ; 1782 else 1783 $this -> AllowedExecutionTime = self::$PhpMaxExecutionTime + $this -> MaxExecutionTime ; 1784 1785 // Adjust in case of inconsistent values 1786 if ( $this -> AllowedExecutionTime < 0 || $this -> AllowedExecutionTime > self::$PhpMaxExecutionTime ) 1787 $this -> AllowedExecutionTime = self::$PhpMaxExecutionTime - 1 ; 1788 1789 // Systematically set the DECODE_IMAGE_DATA flag if the AUTOSAVE_IMAGES flag has been specified 1790 if ( $this -> Options & self::PDFOPT_AUTOSAVE_IMAGES ) 1791 $this -> Options |= self::PDFOPT_DECODE_IMAGE_DATA ; 1792 1793 // Systematically set the GET_IMAGE_DATA flag if DECODE_IMAGE_DATA is specified (debug mode only) 1794 if ( self::$DEBUG && $this -> Options & self::PDFOPT_DECODE_IMAGE_DATA ) 1795 $this -> Options |= self::PDFOPT_GET_IMAGE_DATA ; 1796 1797 // Since page layout options take 2 bits, but not all of the 4 possible values are allowed, make sure that an invalid 1798 // value will default to PDFOPT_RAW_LAYOUT value 1799 $layout_option = $this -> Options & self::PDFOPT_LAYOUT_MASK ; 1800 1801 if ( ! $layout_option === self::PDFOPT_RAW_LAYOUT && $layout_option !== self::PDFOPT_BASIC_LAYOUT ) 1802 { 1803 $layout_option = self::PDFOPT_RAW_LAYOUT ; 1804 $this -> Options = ( $this -> Options & ~self::PDFOPT_LAYOUT_MASK ) | self::PDFOPT_RAW_LAYOUT ; 1805 } 1806 1807 // Author information needs to be processed after, because it may reference objects that occur later in the PDF stream 1808 $author_information_object_id = false ; 1809 1810 // Extract pdf objects that are enclosed by the "obj" and "endobj" keywords 1811 $pdf_objects = array ( ) ; 1812 $contents_offset = $this -> DocumentStartOffset ; 1813 $contents_length = strlen ( $contents ) ; 1814 1815 1816 while ( $contents_offset < $contents_length && 1817 preg_match ( '/(?P<re> (?P<object_id> \d+) \s+ \d+ \s+ obj (?P<object> .*?) endobj )/imsx', $contents, $match, PREG_OFFSET_CAPTURE, $contents_offset ) ) 1818 { 1819 $object_number = $match [ 'object_id' ] [0] ; 1820 $object_data = $match [ 'object' ] [0] ; 1821 1822 // Handle the special case of object streams (compound objects) 1823 // They are not added in the $pdf_objects array, because they could be mistakenly processed as relevant information, 1824 // such as font definitions, etc. 1825 // Instead, only the objects they are embedding are stored in this array. 1826 if ( $this -> IsObjectStream ( $object_data ) ) 1827 { 1828 // Ignore ill-formed object streams 1829 if ( ( $object_stream_matches = $this -> DecodeObjectStream ( $object_number, $object_data ) ) !== false ) 1830 { 1831 // Add this list of objects to the list of known objects 1832 for ( $j = 0, $object_stream_count = count ( $object_stream_matches [ 'object_id' ] ) ; $j < $object_stream_count ; $j ++ ) 1833 $pdf_objects [ $object_stream_matches [ 'object_id' ] [$j] ] = $object_stream_matches [ 'object' ] [$j] ; 1834 } 1835 } 1836 // Normal (non-compound) object 1837 else 1838 $pdf_objects [ $object_number ] = $object_data ; 1839 1840 // Update current offset through PDF contents 1841 $contents_offset = $match [ 're' ] [1] + strlen ( $match [ 're' ] [0] ) ; 1842 } 1843 1844 // We put a particular attention in treating errors returned by preg_match_all() here, since we need to be really sure why stopped 1845 // to find further PDF objects in the supplied contents 1846 $preg_error = preg_last_error ( ) ; 1847 1848 switch ( $preg_error ) 1849 { 1850 case PREG_NO_ERROR : 1851 break ; 1852 1853 case PREG_INTERNAL_ERROR : 1854 error ( new PdfToTextDecodingException ( "PDF object extraction : the preg_match_all() function encountered an internal error." ) ) ; 1855 1856 case PREG_BACKTRACK_LIMIT_ERROR : 1857 error ( new PdfToTextDecodingException ( "PDF object extraction : backtrack limit reached (you may have to modify the pcre.backtrack_limit " . 1858 "setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.backtrack_limit' ) . ")." ) ) ; 1859 1860 case PREG_JIT_STACKLIMIT_ERROR : 1861 error ( new PdfToTextDecodingException ( "PDF object extraction : JIT stack limit reached (you may disable this feature by setting the pcre.jit " . 1862 "setting of your PHP.ini file to 0)." ) ) ; 1863 1864 case PREG_RECURSION_LIMIT_ERROR : 1865 error ( new PdfToTextDecodingException ( "PDF object extraction : recursion limit reached (you may have to modify the pcre.recursion_limit " . 1866 "setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.recursion_limit' ) . ")." ) ) ; 1867 1868 case PREG_BAD_UTF8_ERROR : 1869 error ( new PdfToTextDecodingException ( "PDF object extraction : bad UTF8 character encountered." ) ) ; 1870 1871 case PREG_BAD_UTF8_OFFSET_ERROR : 1872 error ( new PdfToTextDecodingException ( "PDF object extraction : the specified offset does not start at the beginning of a valid UTF8 codepoint." ) ) ; 1873 1874 default : 1875 error ( new PdfToTextDecodingException ( "PDF object extraction : unkown PREG error #$preg_error" ) ) ; 1876 } 1877 1878 1879 // Extract trailer information, which may contain the ID of an object specifying encryption flags 1880 $this -> GetTrailerInformation ( $contents, $pdf_objects ) ; 1881 unset ( $contents ) ; 1882 1883 // Character maps encountered so far 1884 $cmaps = array ( ) ; 1885 1886 // An array that will store object ids as keys and text contents as values 1887 $text = array ( ) ; 1888 1889 // Loop through the objects 1890 foreach ( $pdf_objects as $object_number => $object_data ) 1891 { 1892 // Some additional objects may be uncovered after processing (in an object containing compacted objects for example) 1893 // so add them to the list if necessary 1894 if ( ! isset ( $pdf_objects [ $object_number ] ) ) 1895 $pdf_objects [ $object_number ] = $object_data ; 1896 1897 // Try to catch information related to page mapping - but don't discard the object since it can contain additional information 1898 $this -> PageMap -> Peek ( $object_number, $object_data, $pdf_objects ) ; 1899 1900 // Check if the object contais authoring information - it can appear encoded or unencoded 1901 if ( ! $this -> GotAuthorInformation ) 1902 $author_information_object_id = $this -> PeekAuthorInformation ( $object_number, $object_data ) ; 1903 1904 // Also catch the object encoding type 1905 $type = $this -> GetEncodingType ( $object_number, $object_data ) ; 1906 $stream_match = null ; 1907 1908 if ( strpos ( $object_data, 'stream' ) === false || 1909 ! preg_match ( '#[^/] stream \s+ (?P<stream> .*?) endstream#imsx', $object_data, $stream_match ) ) 1910 { 1911 // Some font definitions are in clear text in an object, some are encoded in a stream within the object 1912 // We process here the unencoded ones 1913 if ( $this -> IsFont ( $object_data ) ) 1914 { 1915 $this -> FontTable -> Add ( $object_number, $object_data, $pdf_objects, $this -> AdobeExtraMappings ) ; 1916 continue ; 1917 } 1918 // Some character maps may also be in clear text 1919 else if ( $this -> IsCharacterMap ( $object_data ) ) 1920 { 1921 $cmap = PdfTexterCharacterMap::CreateInstance ( $object_number, $object_data, $this -> AdobeExtraMappings ) ; 1922 1923 if ( $cmap ) 1924 $cmaps [] = $cmap ; 1925 1926 continue ; 1927 } 1928 // Check if there is an association between font number and object number 1929 else if ( $this -> IsFontMap ( $object_data ) ) 1930 { 1931 $this -> FontTable -> AddFontMap ( $object_number, $object_data ) ; 1932 } 1933 // Retrieve form data if present 1934 else if ( $this -> IsFormData ( $object_data ) ) 1935 { 1936 $this -> RetrieveFormData ( $object_number, $object_data, $pdf_objects ) ; 1937 } 1938 // Ignore other objects that do not contain an encoded stream 1939 else 1940 { 1941 if ( self::$DEBUG > 1 ) 1942 echo "\n----------------------------------- UNSTREAMED #$object_number\n$object_data" ; 1943 1944 continue ; 1945 } 1946 } 1947 // Extract image data, if any 1948 else if ( $this -> IsImage ( $object_data ) ) 1949 { 1950 $this -> AddImage ( $object_number, $stream_match [ 'stream' ], $type, $object_data ) ; 1951 continue ; 1952 } 1953 // Check if there is an association between font number and object number 1954 else if ( $this -> IsFontMap ( $object_data ) ) 1955 { 1956 $this -> FontTable -> AddFontMap ( $object_number, $object_data ) ; 1957 1958 if ( ! $stream_match ) 1959 continue ; 1960 } 1961 1962 // Check if the stream contains data (yes, I have found a sample that had streams of length 0...) 1963 // In other words : ignore empty streams 1964 if ( stripos ( $object_data, '/Length 0' ) !== false ) 1965 continue ; 1966 1967 // Isolate stream data and try to find its encoding type 1968 if ( isset ( $stream_match [ 'stream' ] ) ) 1969 $stream_data = ltrim ( $stream_match [ 'stream' ], "\r\n" ) ; 1970 else 1971 continue ; 1972 1973 // Ignore this stream if the object does not contain an encoding type (/FLATEDECODE, /ASCIIHEX or /ASCII85) 1974 if ( $type == self::PDF_UNKNOWN_ENCODING ) 1975 { 1976 if ( self::$DEBUG > 1 ) 1977 echo "\n----------------------------------- UNENCODED #$object_number :\n$object_data" ; 1978 1979 continue ; 1980 } 1981 1982 // Decode the encoded stream 1983 $decoded_stream_data = $this -> DecodeData ( $object_number, $stream_data, $type, $object_data ) ; 1984 1985 // Second chance to peek author information, this time on a decoded stream data 1986 if ( ! $this -> GotAuthorInformation ) 1987 $author_information_object_id = $this -> PeekAuthorInformation ( $object_number, $decoded_stream_data ) ; 1988 1989 // Check for character maps 1990 if ( $this -> IsCharacterMap ( $decoded_stream_data ) ) 1991 { 1992 $cmap = PdfTexterCharacterMap::CreateInstance ( $object_number, $decoded_stream_data, $this -> AdobeExtraMappings ) ; 1993 1994 if ( $cmap ) 1995 $cmaps [] = $cmap ; 1996 } 1997 // Font definitions 1998 else if ( $this -> IsFont ( $decoded_stream_data ) ) 1999 { 2000 $this -> FontTable -> Add ( $object_number, $decoded_stream_data, $pdf_objects, $this -> AdobeExtraMappings ) ; 2001 } 2002 // Retrieve form data if present 2003 else if ( $this -> IsFormData ( $object_data ) ) 2004 { 2005 $this -> RetrieveFormData ( $object_number, $decoded_stream_data, $pdf_objects ) ; 2006 } 2007 // Plain text (well, in fact PDF drawing instructions) 2008 else if ( $this -> IsText ( $object_data, $decoded_stream_data ) ) 2009 { 2010 $text_data = false ; 2011 2012 // Check if we need to ignore page headers and footers 2013 if ( $this -> Options & self::PDFOPT_IGNORE_HEADERS_AND_FOOTERS ) 2014 { 2015 if ( ! $this -> IsPageHeaderOrFooter ( $decoded_stream_data ) ) 2016 { 2017 $text [ $object_number ] = 2018 $text_data = $decoded_stream_data ; 2019 } 2020 // However, they may be mixed with actual text contents so we need to separate them... 2021 else 2022 { 2023 $this -> ExtractTextData ( $object_number, $decoded_stream_data, $remainder, $header, $footer ) ; 2024 2025 // We still need to check again that the extracted text portion contains something useful 2026 if ( $this -> IsText ( $object_data, $remainder ) ) 2027 { 2028 $text [ $object_number ] = 2029 $text_data = $remainder ; 2030 } 2031 } 2032 } 2033 else 2034 { 2035 $text [ $object_number ] = 2036 $text_data = $decoded_stream_data ; 2037 } 2038 2039 2040 // The current object may be a text object that have been defined as an XObject in some other object 2041 // In this case, we have to keep it since it may be referenced by a /TPLx construct from within 2042 // another text object 2043 if ( $text_data ) 2044 $this -> PageMap -> AddTemplateObject ( $object_number, $text_data ) ; 2045 } 2046 // This may be here the opportunity to look into the $FormData property and replace object ids with their corresponding data 2047 else 2048 { 2049 $found = false ; 2050 2051 foreach ( $this -> FormData as &$form_entry ) 2052 { 2053 if ( is_integer ( $form_entry [ 'values' ] ) && $object_number == $form_entry [ 'values' ] ) 2054 { 2055 $form_entry [ 'values' ] = $decoded_stream_data ; 2056 $found = true ; 2057 } 2058 else if ( is_integer ( $form_entry [ 'form' ] ) && $object_number == $form_entry [ 'form' ] ) 2059 { 2060 $form_entry [ 'form' ] = $decoded_stream_data ; 2061 $found = true ; 2062 } 2063 } 2064 2065 if ( ! $found && self::$DEBUG > 1 ) 2066 echo "\n----------------------------------- UNRECOGNIZED #$object_number :\n$decoded_stream_data\n" ; 2067 } 2068 } 2069 2070 // Form data object numbers 2071 $this -> FormDataObjectNumbers = array_keys ( $this -> FormData ) ; 2072 2073 // Associate character maps with declared fonts 2074 foreach ( $cmaps as $cmap ) 2075 $this -> FontTable -> AddCharacterMap ( $cmap ) ; 2076 2077 // Current font defaults to -1, which means : take the first available font as the current one. 2078 // Sometimes it may happen that text drawing instructions do not set a font at all (PdfPro for example) 2079 $current_font = -1 ; 2080 2081 // Build the page catalog 2082 $this -> Pages = array ( ) ; 2083 $this -> PageMap -> MapObjects ( $text ) ; 2084 2085 // Add font mappings local to each page 2086 $mapped_fonts = $this -> PageMap -> GetMappedFonts ( ) ; 2087 $this -> FontTable -> AddPageFontMap ( $mapped_fonts ) ; 2088 2089 // Extract text from the collected text elements 2090 foreach ( $this -> PageMap -> Pages as $page_number => $page_objects ) 2091 { 2092 // Checks if this page is selected 2093 if ( ! $this -> IsPageSelected ( $page_number ) ) 2094 continue ; 2095 2096 $this -> Pages [ $page_number ] = '' ; 2097 2098 if ( $layout_option === self::PDFOPT_RAW_LAYOUT ) 2099 { 2100 foreach ( $page_objects as $page_object ) 2101 { 2102 if ( isset ( $text [ $page_object ] ) ) 2103 { 2104 $new_text = $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ; 2105 $object_text = $this -> ExtractText ( $page_number, $page_object, $new_text, $current_font ) ; 2106 $this -> Pages [ $page_number ] .= $object_text ; 2107 } 2108 else if ( self::$DEBUG > 1 ) 2109 echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ; 2110 } 2111 } 2112 // New style (basic) layout rendering 2113 else if ( $layout_option === self::PDFOPT_BASIC_LAYOUT ) 2114 { 2115 $page_fragments = array ( ) ; 2116 2117 foreach ( $page_objects as $page_object ) 2118 { 2119 if ( isset ( $text [ $page_object ] ) ) 2120 { 2121 $new_text = $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ; 2122 $this -> ExtractTextWithLayout ( $page_fragments, $page_number, $page_object, $new_text, $current_font ) ; 2123 } 2124 else if ( self::$DEBUG > 1 ) 2125 echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ; 2126 } 2127 2128 $this -> Pages [ $page_number ] = $this -> __assemble_text_fragments ( $page_number, $page_fragments, $page_width, $page_height ) ; 2129 2130 $this -> DocumentFragments [ $page_number ] = array 2131 ( 2132 'fragments' => $page_fragments, 2133 'page-width' => $page_width, 2134 'page_height' => $page_height 2135 ) ; 2136 } 2137 } 2138 2139 // Retrieve author information 2140 if ( $this -> GotAuthorInformation ) 2141 $this -> RetrieveAuthorInformation ( $author_information_object_id, $pdf_objects ) ; 2142 2143 // Build the page locations (ie, starting and ending offsets) 2144 $offset = 0 ; 2145 $page_separator = utf8_encode ( $this -> PageSeparator ) ; 2146 $page_separator_length = strlen ( $page_separator ) ; 2147 2148 foreach ( $this -> Pages as $page_number => &$page ) 2149 { 2150 // If hyphenated words are unwanted, then remove them 2151 if ( $this -> Options & self::PDFOPT_NO_HYPHENATED_WORDS ) 2152 $page = preg_replace ( self::$RemoveHyphensRegex, '$4$2', $page ) ; 2153 2154 $length = strlen ( $page ) ; 2155 $this -> PageLocations [ $page_number ] = array ( 'start' => $offset, 'end' => $offset + $length - 1 ) ; 2156 $offset += $length + $page_separator_length ; 2157 } 2158 2159 // And finally, the Text property 2160 $this -> Text = implode ( $page_separator, $this -> Pages ) ; 2161 2162 // Free memory 2163 $this -> MapIdBuffer = array ( ) ; 2164 $this -> RtlCharacterBuffer = array ( ) ; 2165 $this -> CharacterMapBuffer = array ( ) ; 2166 2167 // Compute memory occupied for this file 2168 $memory_usage_end = ( self::$HasMemoryGetUsage ) ? memory_get_usage ( true ) : 0 ; 2169 $memory_peak_usage_end = ( self::$HasMemoryGetPeakUsage ) ? memory_get_peak_usage ( true ) : 0 ; 2170 2171 $this -> MemoryUsage = $memory_usage_end - $this -> __memory_usage_start ; 2172 $this -> MemoryPeakUsage = $memory_peak_usage_end - $this -> __memory_peak_usage_start ; 2173 2174 // Adjust the "Distributions" statistics 2175 if ( $this -> Options & self::PDFOPT_ENHANCED_STATISTICS ) 2176 { 2177 $instruction_count = 0 ; 2178 $statistics = array ( ) ; 2179 2180 // Count the total number of instructions 2181 foreach ( $this -> Statistics [ 'Distributions' ] as $count ) 2182 $instruction_count += $count ; 2183 2184 // Now transform the Distributions entries into an associative array containing the instruction counts 2185 // ('count') and their relative percentage 2186 foreach ( $this -> Statistics [ 'Distributions' ] as $name => $count ) 2187 { 2188 if ( $instruction_count ) 2189 $percent = round ( ( 100.0 / $instruction_count ) * $count, 2 ) ; 2190 else 2191 $percent = 0 ; 2192 2193 $statistics [ $name ] = array 2194 ( 2195 'instruction' => $name, 2196 'count' => $count, 2197 'percent' => $percent 2198 ) ; 2199 } 2200 2201 // Set the new 'Distributions' array and sort it by instruction count in reverse order 2202 $this -> Statistics [ 'Distributions' ] = $statistics ; 2203 uksort ( $this -> Statistics [ 'Distributions' ], array ( $this, '__sort_distributions' ) ) ; 2204 } 2205 2206 // All done, return 2207 return ( $this -> Text ) ; 2208 } 2209 2210 2211 public function __sort_distributions ( $a, $b ) 2212 { return ( $this -> Statistics [ 'Distributions' ] [$b] [ 'count' ] - $this -> Statistics [ 'Distributions' ] [$a] [ 'count' ] ) ; } 2213 2214 2215 2216 /*-------------------------------------------------------------------------------------------------------------- 2217 2218 NAME 2219 AddAdobeExtraMappings - Adds extra mappings for standard Adobe fonts. 2220 2221 PROTOTYPE 2222 $pdf -> AddAdobeExtraMappings ( $mappings ) ; 2223 2224 DESCRIPTION 2225 Adobe supports 4 predefined fonts : standard, Mac, WinAnsi and PDF). All the characters in these fonts 2226 are identified by a character time, a little bit like HTML entities ; for example, 'one' will be the 2227 character '1', 'acircumflex' will be '�', etc. 2228 There are thousands of character names defined by Adobe (see https://mupdf.com/docs/browse/source/pdf/pdf-glyphlist.h.html). 2229 Some of them are not in this list ; this is the case for example of the 'ax' character names, where 'x' 2230 is a decimal number. When such a character is specified in a /Differences array, then there is somewhere 2231 a CharProc[] array giving an object id for each of those characters. 2232 The referenced object(s) in turn contain drawing instructions to draw the glyph. At no point you could 2233 guess what is the corresponding Unicode character for this glyph, since the information is not contained 2234 in the PDF file. 2235 The AddAdobeExtraMappings() method allows you to specify such correspondences. Specify an array as the 2236 $mappings parameter, whose keys are the Adobe character name (for example, "a127") and values the 2237 corresponding Unicode values (see the description of the $mappings parameter for more information). 2238 2239 PARAMETERS 2240 $mappings (associative array) - 2241 Associative array whose keys are Adobe character names. The array values can take several forms : 2242 - A character 2243 - An integer value 2244 - An array of up to four character or integer values. 2245 Internally, every specified value is converted to an array of four integer values, one for 2246 each of the standard Adobe character sets (Standard, Mac, WinAnsi and PDF). The following 2247 rules apply : 2248 - If the input value is a single character, the output array corrsponding the Adobe character 2249 name will be a set of 4 elements corresponding to the ordinal value of the supplied 2250 character. 2251 - If the input value is an integer, the output array will be a set of 4 identical values 2252 - If the input value is an array : 2253 . Arrays with less that 4 elements will be padded, using the last array item for padding 2254 . Arrays with more than 4 elements will be silently truncated 2255 . Each array value can either be a character or a numeric value. 2256 2257 NOTES 2258 In this current implementation, the method applies the mappings to ALL Adobe default fonts. That is, 2259 you cannot have one mapping for one Adobe font referenced in the PDF file, then a second mapping for 2260 a second Adobe font, etc. 2261 2262 *-------------------------------------------------------------------------------------------------------------*/ 2263 public function AddAdobeExtraMappings ( $mappings ) 2264 { 2265 // Loop through each mapping 2266 foreach ( $mappings as $key => $value ) 2267 { 2268 // Character value : we retain its ordinal value as the 4 values of the output array 2269 if ( is_string ( $value ) ) 2270 { 2271 $ord = ord ( $value ) ; 2272 $items = array ( $ord, $ord, $ord, $ord ) ; 2273 } 2274 // Numeric value : the output array will contain 4 times the supplied value 2275 else if ( is_numeric ( $value ) ) 2276 { 2277 $value = ( integer ) $value ; 2278 $items = array ( $value, $value, $value, $value ) ; 2279 } 2280 // Array value : make sure we will have an output array of 4 values 2281 else if ( is_array ( $value ) ) 2282 { 2283 $items = array ( ) ; 2284 2285 // Collect the supplied values, converting characters to their ordinal values if necessary 2286 for ( $i = 0, $count = count ( $value ) ; $i < $count && $i < 4 ; $i ++ ) 2287 { 2288 $code = $value [$i] ; 2289 2290 if ( is_string ( $code ) ) 2291 $items [] = ord ( $code ) ; 2292 else 2293 $items [] = ( integer ) $code ; 2294 } 2295 2296 // Ensure that we have 4 values ; fill the missing ones with the last seen value if necessary 2297 $count = count ( $items ) ; 2298 2299 if ( ! $count ) 2300 error ( new PdfToTextException ( "Adobe extra mapping \"$key\" has no values." ) ) ; 2301 2302 $last_value = $items [ $count - 1 ] ; 2303 2304 for ( $i = $count ; $i < 4 ; $i ++ ) 2305 $items [] = $last_value ; 2306 } 2307 else 2308 error ( new PdfToTextException ( "Invalid value \"$value\" for Adobe extra mapping \"$key\"." ) ) ; 2309 2310 // Add this current mapping to the Adobe extra mappings array 2311 $this -> AdobeExtraMappings [ $key ] = $items ; 2312 } 2313 } 2314 2315 2316 /*-------------------------------------------------------------------------------------------------------------- 2317 2318 NAME 2319 GetPageFromOffset - Returns a page number from a text offset. 2320 2321 PROTOTYPE 2322 $offset = $pdf -> GetPageFromOffset ( $offset ) ; 2323 2324 DESCRIPTION 2325 Given a byte offset in the Text property, returns its page number in the pdf document. 2326 2327 PARAMETERS 2328 $offset (integer) - 2329 Offset, in the Text property, whose page number is to be retrieved. 2330 2331 RETURN VALUE 2332 Returns a page number in the pdf document, or false if the specified offset does not exist. 2333 2334 *-------------------------------------------------------------------------------------------------------------*/ 2335 public function GetPageFromOffset ( $offset ) 2336 { 2337 if ( $offset === false ) 2338 return ( false ) ; 2339 2340 foreach ( $this -> PageLocations as $page => $location ) 2341 { 2342 if ( $offset >= $location [ 'start' ] && $offset <= $location [ 'end' ] ) 2343 return ( $page ) ; 2344 } 2345 2346 return ( false ) ; 2347 } 2348 2349 2350 /*-------------------------------------------------------------------------------------------------------------- 2351 2352 NAME 2353 text_strpos, text_stripos - Search for an occurrence of a string. 2354 2355 PROTOTYPE 2356 $result = $pdf -> text_strpos ( $search, $start = 0 ) ; 2357 $result = $pdf -> text_stripos ( $search, $start = 0 ) ; 2358 2359 DESCRIPTION 2360 These methods behave as the strpos/stripos PHP functions, except that : 2361 - They operate on the text contents of the pdf file (Text property) 2362 - They return an array containing the page number and text offset. $result [0] will be set to the page 2363 number of the searched text, and $result [1] to its offset in the Text property 2364 2365 PARAMETERS 2366 $search (string) - 2367 String to be searched. 2368 2369 $start (integer) - 2370 Start offset in the pdf text contents. 2371 2372 RETURN VALUE 2373 Returns an array of two values containing the page number and text offset if the searched string has 2374 been found, or false otherwise. 2375 2376 *-------------------------------------------------------------------------------------------------------------*/ 2377 public function text_strpos ( $search, $start = 0 ) 2378 { 2379 $offset = mb_strpos ( $this -> Text, $search, $start, 'UTF-8' ) ; 2380 2381 if ( $offset !== false ) 2382 return ( array ( $this -> GetPageFromOffset ( $offset ), $offset ) ) ; 2383 2384 return ( false ) ; 2385 } 2386 2387 2388 public function text_stripos ( $search, $start = 0 ) 2389 { 2390 $offset = mb_stripos ( $this -> Text, $search, $start, 'UTF-8' ) ; 2391 2392 if ( $offset !== false ) 2393 return ( array ( $this -> GetPageFromOffset ( $offset ), $offset ) ) ; 2394 2395 return ( false ) ; 2396 } 2397 2398 2399 2400 2401 /*-------------------------------------------------------------------------------------------------------------- 2402 2403 NAME 2404 document_strpos, document_stripos - Search for all occurrences of a string. 2405 2406 PROTOTYPE 2407 $result = $pdf -> document_strpos ( $search, $group_by_page = false ) ; 2408 $result = $pdf -> document_stripos ( $search, $group_by_page = false ) ; 2409 2410 DESCRIPTION 2411 Searches for ALL occurrences of a given string in the pdf document. The value of the $group_by_page 2412 parameter determines how the results are returned : 2413 - When true, the returned value will be an associative array whose keys will be page numbers and values 2414 arrays of offset of the found string within the page 2415 - When false, the returned value will be an array of arrays containing two entries : the page number 2416 and the text offset. 2417 2418 For example, if a pdf document contains the string "here" at character offset 100 and 200 in page 1, and 2419 position 157 in page 3, the returned value will be : 2420 - When $group_by_page is false : 2421 [ [ 1, 100 ], [ 1, 200 ], [ 3, 157 ] ] 2422 - When $group_by_page is true : 2423 [ 1 => [ 100, 200 ], 3 => [ 157 ] ] 2424 2425 PARAMETERS 2426 $search (string) - 2427 String to be searched. 2428 2429 $group_by_page (boolean) - 2430 Indicates whether the found offsets should be grouped by page number or not. 2431 2432 RETURN VALUE 2433 Returns an array of page numbers/character offsets (see Description above) or false if the specified 2434 string does not appear in the document. 2435 2436 *-------------------------------------------------------------------------------------------------------------*/ 2437 public function document_strpos ( $text, $group_by_page = false ) 2438 { 2439 $length = strlen ( $text ) ; 2440 2441 if ( ! $length ) 2442 return ( false ) ; 2443 2444 $result = array ( ) ; 2445 $index = 0 ; 2446 2447 while ( ( $index = mb_strpos ( $this -> Text, $text, $index, 'UTF-8' ) ) !== false ) 2448 { 2449 $page = $this -> GetPageFromOffset ( $index ) ; 2450 2451 if ( $group_by_page ) 2452 $result [ $page ] [] = $index ; 2453 else 2454 $result [] = array ( $page, $index ) ; 2455 2456 $index += $length ; 2457 } 2458 2459 return ( $result ) ; 2460 } 2461 2462 2463 public function document_stripos ( $text, $group_by_page = false ) 2464 { 2465 $length = strlen ( $text ) ; 2466 2467 if ( ! $length ) 2468 return ( false ) ; 2469 2470 $result = array ( ) ; 2471 $index = 0 ; 2472 2473 while ( ( $index = mb_stripos ( $this -> Text, $text, $index, 'UTF-8' ) ) !== false ) 2474 { 2475 $page = $this -> GetPageFromOffset ( $index ) ; 2476 2477 if ( $group_by_page ) 2478 $result [ $page ] [] = $index ; 2479 else 2480 $result [] = array ( $page, $index ) ; 2481 2482 $index += $length ; 2483 } 2484 2485 return ( $result ) ; 2486 } 2487 2488 2489 /*-------------------------------------------------------------------------------------------------------------- 2490 2491 NAME 2492 text_match, document_match - Search string using regular expressions. 2493 2494 PROTOTYPE 2495 $status = $pdf -> text_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ; 2496 $status = $pdf -> document_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ; 2497 2498 DESCRIPTION 2499 text_match() calls the preg_match() PHP function on the pdf text contents, to locate the first occurrence 2500 of text that matches the specified regular expression. 2501 document_match() calls the preg_match_all() function to locate all occurrences that match the specified 2502 regular expression. 2503 Note that both methods add the PREG_OFFSET_CAPTURE flag when calling preg_match/preg_match_all so you 2504 should be aware that all captured results are an array containing the following entries : 2505 - Item [0] is the captured string 2506 - Item [1] is its text offset 2507 - The text_match() and document_match() methods add an extra array item (index 2), which contains the 2508 page number where the matched text resides 2509 2510 PARAMETERS 2511 $pattern (string) - 2512 Regular expression to be searched. 2513 2514 $match (any) - 2515 Output captures. See preg_match/preg_match_all. 2516 2517 $flags (integer) - 2518 PCRE flags. See preg_match/preg_match_all. 2519 2520 $offset (integer) - 2521 Start offset. See preg_match/preg_match_all. 2522 2523 RETURN VALUE 2524 Returns the number of matched occurrences, or false if the specified regular expression is invalid. 2525 2526 *-------------------------------------------------------------------------------------------------------------*/ 2527 public function text_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) 2528 { 2529 $local_match = null ; 2530 $status = preg_match ( $pattern, $this -> Text, $local_match, $flags | PREG_OFFSET_CAPTURE, $offset ) ; 2531 2532 if ( $status ) 2533 { 2534 foreach ( $local_match as &$entry ) 2535 $entry [2] = $this -> GetPageFromOffset ( $entry [1] ) ; 2536 2537 $match = $local_match ; 2538 } 2539 2540 return ( $status ) ; 2541 } 2542 2543 2544 public function document_match ( $pattern, &$matches = null, $flags = 0, $offset = 0 ) 2545 { 2546 $local_matches = null ; 2547 $status = preg_match_all ( $pattern, $this -> Text, $local_matches, $flags | PREG_OFFSET_CAPTURE, $offset ) ; 2548 2549 if ( $status ) 2550 { 2551 foreach ( $local_matches as &$entry ) 2552 { 2553 foreach ( $entry as &$subentry ) 2554 $subentry [2] = $this -> GetPageFromOffset ( $subentry [1] ) ; 2555 } 2556 2557 $matches = $local_matches ; 2558 } 2559 2560 return ( $status ) ; 2561 } 2562 2563 2564 /*-------------------------------------------------------------------------------------------------------------- 2565 2566 HasFormData - 2567 Returns true if the PDF file contains form data or not. 2568 2569 *-------------------------------------------------------------------------------------------------------------*/ 2570 public function HasFormData ( ) 2571 { 2572 return ( count ( $this -> FormData ) > 0 ) ; 2573 } 2574 2575 2576 /*-------------------------------------------------------------------------------------------------------------- 2577 2578 GetFormCount - 2579 Returns the number of top-level forms contained in the PDF file. 2580 2581 *-------------------------------------------------------------------------------------------------------------*/ 2582 public function GetFormCount ( ) 2583 { 2584 return ( count ( $this -> FormData ) ) ; 2585 } 2586 2587 2588 /*-------------------------------------------------------------------------------------------------------------- 2589 2590 NAME 2591 GetFormData - Returns form data, if any 2592 2593 PROTOTYPE 2594 $object = $pdf -> GetFormData ( $template = null, $form_index = 0 ) ; 2595 2596 DESCRIPTION 2597 Retrieves form data if present. 2598 2599 PARAMETERS 2600 $template (string) - 2601 An XML file describing form data using human-readable names for field values. 2602 If not specified, the inline form definitions will be used, together with the field names 2603 specified in the PDF file. 2604 2605 $form_index (integer) - 2606 Form index in the PDF file. So far, I really don't know if a PDF file can have multiple forms. 2607 2608 RETURN VALUE 2609 An object derived from the PdfToTextFormData class. 2610 2611 *-------------------------------------------------------------------------------------------------------------*/ 2612 public function GetFormData ( $template = null, $form_index = 0 ) 2613 { 2614 if ( isset ( $this -> FormDataObjects [ $form_index ] ) ) 2615 return ( $this -> FormDataObjects [ $form_index ] ) ; 2616 2617 if ( $form_index > count ( $this -> FormDataObjectNumbers ) ) 2618 error ( new PdfToTextFormException ( "Invalid form index #$form_index." ) ) ; 2619 2620 $form_data = $this -> FormData [ $this -> FormDataObjectNumbers [ $form_index ] ] ; 2621 2622 if ( $template ) 2623 { 2624 if ( ! file_exists ( $template ) ) 2625 error ( new PdfToTextFormException ( "Form data template file \"$template\" not found." ) ) ; 2626 2627 $xml_data = file_get_contents ( $template ) ; 2628 $definitions = new PdfToTextFormDefinitions ( $xml_data, $form_data [ 'form' ] ) ; ; 2629 } 2630 else 2631 { 2632 $definitions = new PdfToTextFormDefinitions ( null, $form_data [ 'form' ] ) ; 2633 } 2634 2635 $object = $definitions [ $form_index ] -> GetFormDataFromPdfObject ( $form_data [ 'values' ] ) ; 2636 2637 $this -> FormDataDefinitions [] = $definitions ; 2638 $this -> FormDataObjects [] = $object ; 2639 2640 return ( $object ) ; 2641 } 2642 2643 2644 /*-------------------------------------------------------------------------------------------------------------- 2645 2646 NAME 2647 MarkTextLike - Marks output text. 2648 2649 PROTOTYPE 2650 $pdf -> MarkTextLike ( $regex, $marker_start, $marker_end ) ; 2651 2652 DESCRIPTION 2653 Sometimes it may be convenient, when you want to extract only a portion of text, to say : "I want to 2654 extract text between this title and this title". The MarkTextLike() method provides some support for 2655 such a task. Imagine you have documents that have the same structure, all starting with an "Introduction" 2656 title : 2657 2658 Introduction 2659 ... 2660 some text 2661 ... 2662 Some other title 2663 ... 2664 2665 By calling the MarkTextLike() method such as in the example below : 2666 2667 $pdf -> MarkTextLike ( '/\bIntroduction\b/', '<M>', '</M' ) ; 2668 2669 then you will get as output : 2670 2671 <M>Introduction</M> 2672 ... 2673 some text 2674 ... 2675 <M>Some other title</M> 2676 2677 Adding such markers in the output will allow you to easily extract the text between the chapters 2678 "Introduction" and "Some other title", using a regular expression. 2679 2680 The font name used for the first string matched by the specified regular expression will be searched 2681 later to add markers around all the text portions using this font. 2682 2683 2684 PARAMETERS 2685 $regex (string) - 2686 A regular expression to match the text to be matched. Subsequent portions of text using the 2687 same font will be surrounded by the marker start/end strings. 2688 2689 $marker_start, $marker_end (string) - 2690 Markers to surround the string when a match is found. 2691 2692 *-------------------------------------------------------------------------------------------------------------*/ 2693 public function MarkTextLike ( $regex, $marker_start, $marker_end ) 2694 { 2695 $this -> UnprocessedMarkerList [ 'font' ] [] = array 2696 ( 2697 'regex' => $regex, 2698 'start' => $marker_start, 2699 'end' => $marker_end 2700 ) ; 2701 } 2702 2703 2704 /*-------------------------------------------------------------------------------------------------------------- 2705 2706 NAME 2707 SetCaptures, SetCapturesFromString - Defines document parts to be captured. 2708 2709 PROTOTYPE 2710 $pdf -> SetCaptures ( $xml_file ) ; 2711 $pdf -> SetCapturesFromString ( $xml_data ) ; 2712 2713 DESCRIPTION 2714 Defines document parts to be captured. 2715 SetCaptures() takes the definitions for the areas to be captured from an XML file, while 2716 SetCapturesFromString() takes them from a string representing xml capture definitions. 2717 2718 NOTES 2719 - See file README.md for an explanation on the format of the XML capture definition file. 2720 - The SetCaptures() methods must be called before the Load() method. 2721 2722 *-------------------------------------------------------------------------------------------------------------*/ 2723 public function SetCaptures ( $xml_file ) 2724 { 2725 if ( ! file_exists ( $xml_file ) ) 2726 error ( new PdfToTextException ( "File \"$xml_file\" does not exist." ) ) ; 2727 2728 $xml_data = file_get_contents ( $xml_file ) ; 2729 2730 $this -> SetCapturesFromString ( $xml_data ) ; 2731 2732 } 2733 2734 2735 public function SetCapturesFromString ( $xml_data ) 2736 { 2737 // Setting capture areas implies having the PDFOPT_BASIC_LAYOUT option 2738 $this -> Options |= self::PDFOPT_BASIC_LAYOUT ; 2739 2740 $this -> CaptureDefinitions = new PdfToTextCaptureDefinitions ( $xml_data ) ; 2741 } 2742 2743 2744 /*-------------------------------------------------------------------------------------------------------------- 2745 2746 NAME 2747 GetCaptures - Returns captured data. 2748 2749 PROTOTYPE 2750 $object = $pdf -> GetCaptures ( $full = false ) ; 2751 2752 PARAMETERS 2753 $full (boolean) - 2754 When true, the whole captures, togethers with their definitions, are returned. When false, 2755 only a basic object containing the capture names and their values is returned. 2756 2757 DESCRIPTION 2758 Returns the object that contains captured data. 2759 2760 RETURN VALUE 2761 An object of type PdfToTextCaptures, or false if an error occurred. 2762 2763 *-------------------------------------------------------------------------------------------------------------*/ 2764 public function GetCaptures ( $full = false ) 2765 { 2766 if ( ! $this -> CaptureObject ) 2767 { 2768 $this -> CaptureDefinitions -> SetPageCount ( count ( $this -> Pages ) ) ; 2769 $this -> CaptureObject = $this -> CaptureDefinitions -> GetCapturedObject ( $this -> DocumentFragments ) ; 2770 } 2771 2772 if ( $full ) 2773 return ( $this -> CaptureObject ) ; 2774 else 2775 return ( $this -> CaptureObject -> ToCaptures ( ) ) ; 2776 } 2777 2778 2779 /************************************************************************************************************** 2780 ************************************************************************************************************** 2781 ************************************************************************************************************** 2782 ****** ****** 2783 ****** ****** 2784 ****** INTERNAL METHODS ****** 2785 ****** ****** 2786 ****** ****** 2787 ************************************************************************************************************** 2788 ************************************************************************************************************** 2789 **************************************************************************************************************/ 2790 2791 /*-------------------------------------------------------------------------------------------------------------- 2792 2793 NAME 2794 AddImage - Adds an image from the PDF stream to the current object. 2795 2796 PROTOTYPE 2797 $this -> AddImage ( $object_id, $stream_data, $type, $object_data ) ; 2798 2799 DESCRIPTION 2800 Adds an image from the PDF stream to the current object. 2801 If the PDFOPT_GET_IMAGE_DATA flag is enabled, image data will be added to the ImageData property. 2802 If the PDFOPT_DECODE_IMAGE_DATA flag is enabled, a jpeg resource will be created and added into the 2803 Images array property. 2804 2805 PARAMETERS 2806 $object_id (integer) - 2807 Pdf object id. 2808 2809 $stream_data (string) - 2810 Contents of the unprocessed stream data containing the image. 2811 2812 $type (integer) - 2813 One of the PdfToText::PDF_*_ENCODING constants. 2814 2815 *-------------------------------------------------------------------------------------------------------------*/ 2816 protected function AddImage ( $object_id, $stream_data, $type, $object_data ) 2817 { 2818 2819 if ( self::$DEBUG && $this -> Options & self::PDFOPT_GET_IMAGE_DATA ) 2820 { 2821 switch ( $type ) 2822 { 2823 case self::PDF_DCT_ENCODING : 2824 $this -> ImageData = array ( 'type' => 'jpeg', 'data' => $stream_data ) ; 2825 break ; 2826 } 2827 2828 } 2829 2830 2831 if ( $this -> Options & self::PDFOPT_DECODE_IMAGE_DATA && 2832 ( ! $this -> MaxExtractedImages || $this -> ImageCount < $this -> MaxExtractedImages ) ) 2833 { 2834 $image = $this -> DecodeImage ( $object_id, $stream_data, $type, $object_data, $this -> Options & self::PDFOPT_AUTOSAVE_IMAGES ) ; 2835 2836 if ( $image !== false ) 2837 { 2838 $this -> ImageCount ++ ; 2839 2840 // When the PDFOPT_AUTOSAVE_IMAGES flag is set, we simply use a template filename to generate a real output filename 2841 // then save the image to that file. The memory is freed after that. 2842 if ( $this -> Options & self::PDFOPT_AUTOSAVE_IMAGES ) 2843 { 2844 $output_filename = $this -> __get_output_image_filename ( ) ; 2845 2846 $image -> SaveAs ( $output_filename, $this -> ImageAutoSaveFormat ) ; 2847 unset ( $image ) ; 2848 2849 $this -> AutoSavedImageFiles [] = $output_filename ; 2850 } 2851 // Otherwise, simply store the image data into memory 2852 else 2853 $this -> Images [] = $image ; 2854 } 2855 } 2856 } 2857 2858 2859 /*-------------------------------------------------------------------------------------------------------------- 2860 2861 NAME 2862 DecodeData - Decodes stream data. 2863 2864 PROTOTYPE 2865 $data = $this -> DecodeData ( $object_id, $stream_data, $type ) ; 2866 2867 DESCRIPTION 2868 Decodes stream data (binary data located between the "stream" and "enstream" directives) according to the 2869 specified encoding type, given in the surrounding object parameters. 2870 2871 PARAMETERS 2872 $object_id (integer) - 2873 Id of the object containing the data. 2874 2875 $stream_data (string) - 2876 Contents of the binary stream. 2877 2878 $type (integer) - 2879 One of the PDF_*_ENCODING constants, as returned by the GetEncodingType() method. 2880 2881 RETURN VALUE 2882 Returns the decoded stream data. 2883 2884 *-------------------------------------------------------------------------------------------------------------*/ 2885 protected function DecodeData ( $object_id, $stream_data, $type, $object_data ) 2886 { 2887 $decoded_stream_data = '' ; 2888 2889 switch ( $type ) 2890 { 2891 case self::PDF_FLATE_ENCODING : 2892 // Objects in password-protected Pdf files SHOULD be encrypted ; however, it happens that we may encounter normal, 2893 // unencrypted ones. This is why we always try to gzuncompress them first then, if failed, try to decrypt them 2894 $decoded_stream_data = @gzuncompress ( $stream_data ) ; 2895 2896 if ( $decoded_stream_data === false ) 2897 { 2898 if ( $this -> IsEncrypted ) 2899 { 2900 $decoded_stream_data = $this -> EncryptionData -> Decrypt ( $object_id, $stream_data ) ; 2901 2902 if ( $decoded_stream_data === false ) 2903 { 2904 if ( self::$DEBUG > 1 ) 2905 warning ( new PdfToTextDecodingException ( "Unable to decrypt object contents.", $object_id ) ) ; 2906 } 2907 } 2908 else if ( self::$DEBUG > 1 ) 2909 warning ( new PdfToTextDecodingException ( "Invalid gzip data.", $object_id ) ) ; 2910 } 2911 2912 break ; 2913 2914 case self::PDF_LZW_ENCODING : 2915 $decoded_stream_data = $this -> __decode_lzw ( $stream_data ) ; 2916 break ; 2917 2918 case self::PDF_ASCIIHEX_ENCODING : 2919 $decoded_stream_data = $this -> __decode_ascii_hex ( $stream_data ) ; 2920 break ; 2921 2922 case self::PDF_ASCII85_ENCODING : 2923 $decoded_stream_data = $this -> __decode_ascii_85 ( $stream_data ) ; 2924 2925 // Dumbly check if this could not be gzipped data after decoding (normally, the object flags should also specify 2926 // the /FlateDecode flag) 2927 if ( $decoded_stream_data !== false && ( $result = @gzuncompress ( $decoded_stream_data ) ) !== false ) 2928 $decoded_stream_data = $result ; 2929 2930 break ; 2931 2932 case self::PDF_TEXT_ENCODING : 2933 $decoded_stream_data = $stream_data ; 2934 break ; 2935 } 2936 2937 return ( $decoded_stream_data ) ; 2938 } 2939 2940 2941 // __decode_lzw - 2942 // Decoding function for LZW encrypted data. This function is largely inspired by the TCPDF one but has been rewritten 2943 // for a performance gain of 30-35%. 2944 private function __decode_lzw ( $data ) 2945 { 2946 // The initial dictionary contains 256 entries where each index is equal to its character representation 2947 static $InitialDictionary = array 2948 ( 2949 "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\x09", "\x0A", "\x0B", "\x0C", "\x0D", "\x0E", "\x0F", 2950 "\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1A", "\x1B", "\x1C", "\x1D", "\x1E", "\x1F", 2951 "\x20", "\x21", "\x22", "\x23", "\x24", "\x25", "\x26", "\x27", "\x28", "\x29", "\x2A", "\x2B", "\x2C", "\x2D", "\x2E", "\x2F", 2952 "\x30", "\x31", "\x32", "\x33", "\x34", "\x35", "\x36", "\x37", "\x38", "\x39", "\x3A", "\x3B", "\x3C", "\x3D", "\x3E", "\x3F", 2953 "\x40", "\x41", "\x42", "\x43", "\x44", "\x45", "\x46", "\x47", "\x48", "\x49", "\x4A", "\x4B", "\x4C", "\x4D", "\x4E", "\x4F", 2954 "\x50", "\x51", "\x52", "\x53", "\x54", "\x55", "\x56", "\x57", "\x58", "\x59", "\x5A", "\x5B", "\x5C", "\x5D", "\x5E", "\x5F", 2955 "\x60", "\x61", "\x62", "\x63", "\x64", "\x65", "\x66", "\x67", "\x68", "\x69", "\x6A", "\x6B", "\x6C", "\x6D", "\x6E", "\x6F", 2956 "\x70", "\x71", "\x72", "\x73", "\x74", "\x75", "\x76", "\x77", "\x78", "\x79", "\x7A", "\x7B", "\x7C", "\x7D", "\x7E", "\x7F", 2957 "\x80", "\x81", "\x82", "\x83", "\x84", "\x85", "\x86", "\x87", "\x88", "\x89", "\x8A", "\x8B", "\x8C", "\x8D", "\x8E", "\x8F", 2958 "\x90", "\x91", "\x92", "\x93", "\x94", "\x95", "\x96", "\x97", "\x98", "\x99", "\x9A", "\x9B", "\x9C", "\x9D", "\x9E", "\x9F", 2959 "\xA0", "\xA1", "\xA2", "\xA3", "\xA4", "\xA5", "\xA6", "\xA7", "\xA8", "\xA9", "\xAA", "\xAB", "\xAC", "\xAD", "\xAE", "\xAF", 2960 "\xB0", "\xB1", "\xB2", "\xB3", "\xB4", "\xB5", "\xB6", "\xB7", "\xB8", "\xB9", "\xBA", "\xBB", "\xBC", "\xBD", "\xBE", "\xBF", 2961 "\xC0", "\xC1", "\xC2", "\xC3", "\xC4", "\xC5", "\xC6", "\xC7", "\xC8", "\xC9", "\xCA", "\xCB", "\xCC", "\xCD", "\xCE", "\xCF", 2962 "\xD0", "\xD1", "\xD2", "\xD3", "\xD4", "\xD5", "\xD6", "\xD7", "\xD8", "\xD9", "\xDA", "\xDB", "\xDC", "\xDD", "\xDE", "\xDF", 2963 "\xE0", "\xE1", "\xE2", "\xE3", "\xE4", "\xE5", "\xE6", "\xE7", "\xE8", "\xE9", "\xEA", "\xEB", "\xEC", "\xED", "\xEE", "\xEF", 2964 "\xF0", "\xF1", "\xF2", "\xF3", "\xF4", "\xF5", "\xF6", "\xF7", "\xF8", "\xF9", "\xFA", "\xFB", "\xFC", "\xFD", "\xFE", "\xFF" 2965 ) ; 2966 2967 // Dictionary lengths - when we reach one of the values specified as the key, we have to set the bit length to the corresponding value 2968 static $DictionaryLengths = array 2969 ( 2970 511 => 10, 2971 1023 => 11, 2972 2047 => 12 2973 ) ; 2974 2975 // Decoded string to be returned 2976 $result = '' ; 2977 2978 // Convert string to binary string 2979 $bit_string = '' ; 2980 $data_length = strlen ( $data ) ; 2981 2982 for ( $i = 0 ; $i < $data_length ; $i ++ ) 2983 $bit_string .= sprintf ( '%08b', ord ( $data[$i] ) ) ; 2984 2985 $data_length *= 8 ; 2986 2987 // Initialize dictionary 2988 $bit_length = 9 ; 2989 $dictionary_index = 258 ; 2990 $dictionary = $InitialDictionary ; 2991 2992 // Previous value 2993 $previous_index = 0 ; 2994 2995 // Start index in bit string 2996 $start_index = 0 ; 2997 2998 // Until we encounter the EOD marker (257), read $bit_length bits 2999 while ( ( $start_index < $data_length ) && ( ( $index = bindec ( substr ( $bit_string, $start_index, $bit_length ) ) ) !== 257 ) ) 3000 { 3001 // Move to next bit position 3002 $start_index += $bit_length ; 3003 3004 if ( $index !== 256 && $previous_index !== 256 ) 3005 { 3006 // Check if index exists in the dictionary and remember it 3007 if ( $index < $dictionary_index ) 3008 { 3009 $result .= $dictionary [ $index ] ; 3010 $dictionary_value = $dictionary [ $previous_index ] . $dictionary [ $index ] [0] ; 3011 $previous_index = $index ; 3012 } 3013 // Index does not exist - add it to the dictionary 3014 else 3015 { 3016 $dictionary_value = $dictionary [ $previous_index ] . $dictionary [ $previous_index ] [0] ; 3017 $result .= $dictionary_value ; 3018 } 3019 3020 // Update dictionary 3021 $dictionary [ $dictionary_index ++ ] = $dictionary_value ; 3022 3023 // Change bit length whenever we reach an index limit 3024 if ( isset ( $DictionaryLengths [ $dictionary_index ] ) ) 3025 $bit_length = $DictionaryLengths [ $dictionary_index ] ; 3026 } 3027 // Clear table marker 3028 else if ( $index === 256) 3029 { 3030 // Reset dictionary and bit length 3031 // Reset dictionary and bit length 3032 $bit_length = 9 ; 3033 $dictionary_index = 258 ; 3034 $previous_index = 256 ; 3035 $dictionary = $InitialDictionary ; 3036 } 3037 // First entry 3038 else // $previous_index === 256 3039 { 3040 // first entry 3041 $result .= $dictionary [ $index ] ; 3042 $previous_index = $index ; 3043 } 3044 } 3045 3046 // All done, return 3047 return ( $result ) ; 3048 } 3049 3050 3051 // __decode_ascii_hex - 3052 // Decoder for /AsciiHexDecode streams. 3053 private function __decode_ascii_hex ( $input ) 3054 { 3055 $output = "" ; 3056 $is_odd = true ; 3057 $is_comment = false ; 3058 3059 for ( $i = 0, $codeHigh = -1 ; $i < strlen ( $input ) && $input [ $i ] != '>' ; $i++ ) 3060 { 3061 $c = $input [ $i ] ; 3062 3063 if ( $is_comment ) 3064 { 3065 if ( $c == '\r' || $c == '\n' ) 3066 $is_comment = false ; 3067 3068 continue; 3069 } 3070 3071 switch ( $c ) 3072 { 3073 case '\0' : 3074 case '\t' : 3075 case '\r' : 3076 case '\f' : 3077 case '\n' : 3078 case ' ' : 3079 break ; 3080 3081 case '%' : 3082 $is_comment = true ; 3083 break ; 3084 3085 default : 3086 $code = hexdec ( $c ) ; 3087 3088 if ( $code === 0 && $c != '0' ) 3089 return ( '' ) ; 3090 3091 if ( $is_odd ) 3092 $codeHigh = $code ; 3093 else 3094 $output .= chr ( ( $codeHigh << 4 ) | $code ) ; 3095 3096 $is_odd = ! $is_odd ; 3097 break ; 3098 } 3099 } 3100 3101 if ( $input [ $i ] != '>' ) 3102 return ( '' ) ; 3103 3104 if ( $is_odd ) 3105 $output .= chr ( $codeHigh << 4 ) ; 3106 3107 return ( $output ) ; 3108 } 3109 3110 3111 // __decode_ascii_85 - 3112 // Decoder for /Ascii85Decode streams. 3113 private function __decode_ascii_85 ( $data ) 3114 { 3115 // Ordinal value of the first character used in Ascii85 encoding 3116 static $first_ord = 33 ; 3117 // "A 'z' in the input data means "sequence of 4 nuls" 3118 static $z_exception = "\0\0\0\0" ; 3119 // Powers of 85, from 4 to 0 3120 static $exp85 = array ( 52200625, 614125, 7225, 85, 1 ) ; 3121 3122 // Ignore empty data 3123 if ( $data === '' ) 3124 return ( false ) ; 3125 3126 $data_length = strlen ( $data ) ; 3127 $ords = array ( ) ; 3128 $ord_count = 0 ; 3129 $result = '' ; 3130 3131 // Paranoia : Ascii85 data may start with '<~' (but it always end with '~>'). Anyway, we must start past this construct if present 3132 if ( $data [0] == '<' && $data [1] == '~' ) 3133 $start = 2 ; 3134 else 3135 $start = 0 ; 3136 3137 // Loop through nput characters 3138 for ( $i = $start ; $i < $data_length && $data [$i] != '~' ; $i ++ ) 3139 { 3140 $ch = $data [$i] ; 3141 3142 // Most common case : current character is in the range of the Ascii85 encoding ('!'..'u') 3143 if ( $ch >= '!' && $ch <= 'u' ) 3144 $ords [ $ord_count ++ ] = ord ( $ch ) - $first_ord ; 3145 // 'z' is replaced with a sequence of null bytes 3146 else if ( $ch == 'z' && ! $ord_count ) 3147 $result .= $z_exception ; 3148 // Spaces are ignored 3149 else if ( $ch !== "\0" && $ch !== "\t" && $ch !== ' ' && $ch !== "\r" && $ch !== "\n" && $ch !== "\f" ) 3150 continue ; 3151 // Other characters : corrupted data... 3152 else 3153 return ( false ) ; 3154 3155 // We have collected 5 characters in base 85 : convert their 32-bits value to base 2 (3 characters) 3156 if ( $ord_count == 5 ) 3157 { 3158 $ord_count = 0 ; 3159 3160 for ( $sum = 0, $j = 0 ; $j < 5 ; $j ++ ) 3161 $sum = ( $sum * 85 ) + $ords [ $j ] ; 3162 3163 for ( $j = 3 ; $j >= 0 ; $j -- ) 3164 $result .= chr ( $sum >> ( $j * 8 ) ) ; 3165 } 3166 } 3167 3168 // A last processing for the potential remaining bytes 3169 // Notes : this situation has never been tested 3170 if ( $ord_count ) 3171 { 3172 for ( $i = 0, $sum = 0 ; $i < $ord_count ; $i++ ) 3173 $sum += ( $ords [ $i ] + ( $i == $ord_count - 1 ) ) * $exp85 [$i] ; 3174 3175 for ( $i = 0 ; $i < $ord_count - 1 ; $i++ ) 3176 $result .= chr ( $sum >> ( ( 3 - $i ) * 8 ) ) ; 3177 } 3178 3179 // All done, return 3180 return ( $result ) ; 3181 } 3182 3183 3184 /*-------------------------------------------------------------------------------------------------------------- 3185 3186 NAME 3187 DecodeImage - Returns decoded image contents. 3188 3189 PROTOTYPE 3190 TBC 3191 3192 DESCRIPTION 3193 description 3194 3195 PARAMETERS 3196 $object_id (integer) - 3197 Pdf object number. 3198 3199 $stream_data (string) - 3200 Object data. 3201 3202 $type (integer) - 3203 One of the PdfToText::PDF_*_ENCODING constants. 3204 3205 $autosave (boolean) - 3206 When autosave is selected, images will not be decoded into memory unless they have a format 3207 different from JPEG. This is intended to save memory. 3208 3209 RETURN VALUE 3210 Returns an object of type PdfIMage, or false if the image encoding type is not currently supported. 3211 3212 *-------------------------------------------------------------------------------------------------------------*/ 3213 protected function DecodeImage ( $object_id, $stream_data, $type, $object_data, $autosave ) 3214 { 3215 switch ( $type ) 3216 { 3217 // Normal JPEG image 3218 case self::PDF_DCT_ENCODING : 3219 return ( new PdfJpegImage ( $stream_data, $autosave ) ) ; 3220 3221 // CCITT fax image 3222 case self::PDF_CCITT_FAX_ENCODING : 3223 return ( new PdfFaxImage ( $stream_data ) ) ; 3224 3225 // For now, I have not found enough information to be able to decode image data in an inflated stream... 3226 // In some cases, however, this is JPEG data 3227 case self::PDF_FLATE_ENCODING : 3228 $image = PdfInlinedImage::CreateInstance ( $stream_data, $object_data, $autosave ) ; 3229 3230 if ( $image ) 3231 return ( $image ) ; 3232 3233 break ; 3234 3235 default : 3236 return ( false ) ; 3237 } 3238 3239 return ( false ) ; 3240 } 3241 3242 3243 /*-------------------------------------------------------------------------------------------------------------- 3244 3245 NAME 3246 DecodeObjectStream - Decodes an object stream. 3247 3248 PROTOTYPE 3249 $array = $this -> DecodeObjectStream ( $object_id, $object_data ) ; 3250 3251 DESCRIPTION 3252 Decodes an object stream. An object stream is yet another PDF object type that contains itself several 3253 objects not defined using the "x y obj ... endobj" syntax. 3254 As far as I understood, object streams data is contained within stream/endstream delimiters, and is 3255 gzipped. 3256 Object streams start with a set of object id/offset pairs separated by a space ; catenated object data 3257 immediately follows the last space ; for example : 3258 3259 1167 0 1168 114 <</DA(/Helv 0 Tf 0 g )/DR<</Encoding<</PDFDocEncoding 1096 0 R>>/Font<</Helv 1094 0 R/ZaDb 1095 0 R>>>>/Fields[]>>[/ICCBased 1156 0 R] 3260 3261 The above example specifies two objects : 3262 . Object #1167, which starts at offset 0 and ends before the second object, at offset #113 in 3263 the data. The contents are : 3264 <</DA(/Helv 0 Tf 0 g )/DR<</Encoding<</PDFDocEncoding 1096 0 R>>/Font<</Helv 1094 0 R/ZaDb 1095 0 R>>>>/Fields[]>> 3265 . Object #1168, which starts at offset #114 and continues until the end of the object stream. 3266 It contains the following data : 3267 [/ICCBased 1156 0 R] 3268 3269 PARAMETERS 3270 $object_id (integer) - 3271 Pdf object number. 3272 3273 $object_data (string) - 3274 Object data. 3275 3276 RETURN VALUE 3277 Returns false if any error occurred (mainly for syntax reasons). 3278 Otherwise, returns an associative array containing the following elements : 3279 - object_id : 3280 Array of all the object ids contained in the object stream. 3281 - object : 3282 Array of corresponding object data. 3283 3284 The reason for this format is that it is identical to the array returned by the preg_match() function 3285 used in the Load() method for finding objects in a PDF file (ie, a regex that matches "x y oj/endobj" 3286 constructs). 3287 3288 *-------------------------------------------------------------------------------------------------------------*/ 3289 protected function DecodeObjectStream ( $object_id, $object_data ) 3290 { 3291 // Extract gzipped data for this object 3292 if ( preg_match ( '#[^/] stream ( (\r? \n) | \r ) (?P<stream> .*?) endstream#imsx', $object_data, $stream_match ) ) 3293 { 3294 $stream_data = $stream_match [ 'stream' ] ; 3295 $type = $this -> GetEncodingType ( $object_id, $object_data ) ; 3296 $decoded_data = $this -> DecodeData ( $object_id, $stream_data, $type, $object_data ) ; 3297 3298 if ( self::$DEBUG > 1 ) 3299 echo "\n----------------------------------- OBJSTREAM #$object_id\n$decoded_data" ; 3300 } 3301 // Stay prepared to find one day a sample declared as an object stream but not having gzipped data delimited by stream/endstream tags 3302 else 3303 { 3304 if ( self::$DEBUG > 1 ) 3305 error ( new PdfToTextDecodingException ( "Found object stream without gzipped data", $object_id ) ) ; 3306 3307 return ( false ) ; 3308 } 3309 3310 // Object streams data start with a series of object id/offset pairs. The offset is absolute to the first character 3311 // after the last space of these series. 3312 // Note : on Windows platforms, the default stack size is 1Mb. The following regular expression will make Apache crash in most cases, 3313 // so you have to enable the following lines in your http.ini file to set a stack size of 8Mb, as for Unix systems : 3314 // Include conf/extra/httpd-mpm.conf 3315 // ThreadStackSize 8388608 3316 if ( ! preg_match ( '/^ \s* (?P<series> (\d+ \s* )+ )/x', $decoded_data, $series_match ) ) 3317 { 3318 if ( self::$DEBUG > 1 ) 3319 error ( new PdfToTextDecodingException ( "Object stream does not start with integer object id/offset pairs.", $object_id ) ) ; 3320 3321 return ( false ) ; 3322 } 3323 3324 // Extract the series of object id/offset pairs and the stream object data 3325 $series = explode ( ' ', rtrim ( preg_replace ( '/\s+/', ' ', $series_match [ 'series' ] ) ) ) ; 3326 $data = substr ( $decoded_data, strlen ( $series_match [ 'series' ] ) ) ; 3327 3328 // $series should contain an even number of values 3329 if ( count ( $series ) % 2 ) 3330 { 3331 if ( self::$DEBUG ) 3332 warning ( new PdfToTextDecodingException ( "Object stream should start with an even number of integer values.", $object_id ) ) ; 3333 3334 array_pop ( $series ) ; 3335 } 3336 3337 // Extract every individual object 3338 $objects = array ( 'object_id' => array ( ), 'object' => array ( ) ) ; 3339 3340 for ( $i = 0, $count = count ( $series ) ; $i < $count ; $i += 2 ) 3341 { 3342 $object_id = ( integer ) $series [$i] ; 3343 $offset = ( integer ) $series [$i+1] ; 3344 3345 // If there is a "next" object, extract only a substring within the object stream contents 3346 if ( isset ( $series [ $i + 3 ] ) ) 3347 $object_contents = substr ( $data, $offset, $series [ $i + 3 ] - $offset ) ; 3348 // Otherwise, extract everything until the end 3349 else 3350 $object_contents = substr ( $data, $offset ) ; 3351 3352 $objects [ 'object_id'] [] = $object_id ; 3353 $objects [ 'object' ] [] = $object_contents ; 3354 } 3355 3356 return ( $objects ) ; 3357 } 3358 3359 3360 /*-------------------------------------------------------------------------------------------------------------- 3361 3362 NAME 3363 ExtractTextData - Extracts text, header & footer information from a text object. 3364 3365 PROTOTYPE 3366 $this -> ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer ) ; 3367 3368 DESCRIPTION 3369 Extracts text, header & footer information from a text object. The extracted text contents will be 3370 stripped from any header/footer information. 3371 3372 PARAMETERS 3373 $text (string) - 3374 Variable that will receive text contents. 3375 3376 $header, $footer (string) - 3377 Variables that will receive header and footer information. 3378 3379 *-------------------------------------------------------------------------------------------------------------*/ 3380 protected function ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer ) 3381 { 3382 // Normally, a header or footer is introduced with a construct like : 3383 // << /Type /Pagination ... [/Bottom] ... >> (or [/Top] 3384 // The initial regular expression was : 3385 // << .*? \[ \s* / (?P<location> (Bottom) | (Top) ) \s* \] .*? >> \s* BDC .*? EMC 3386 // (the data contained between the BDC and EMC instructions are text-drawing instructions). 3387 // However, this expression revealed to be too greedy and captured too much data ; in the following example : 3388 // <</MCID 0>> ...(several kb of drawing instructions)... << ... [/Bottom] ... >> BDC (other drawing instructions for the page footer) EMC 3389 // everything was captured, from the initial "<<M/MCID 0>>" to the final "EMC", which caused regular page contents to be interpreted as page bottom 3390 // contents. 3391 // The ".*?" in the regex has been replaced with "[^>]*?", which works better. However, it will fail to recognize header/footer contents if 3392 // the header/footer declaration contains a nested construct , such as : 3393 // << /Type /Pagination ... [/Bottom] ... << (some nested contents) >> ... >> (or [/Top] 3394 // Let's wait for the case to happen one day... 3395 static $header_or_footer_re = '# 3396 (?P<contents> 3397 << [^>]*? \[ \s* / (?P<location> (Bottom) | (Top) ) \s* \] [^>]*? >> \s* 3398 BDC .*? EMC 3399 ) 3400 #imsx' ; 3401 3402 $header = 3403 $footer = 3404 $text = '' ; 3405 3406 if ( preg_match_all ( $header_or_footer_re, $stream_contents, $matches, PREG_OFFSET_CAPTURE ) ) 3407 { 3408 for ( $i = 0, $count = count ( $matches [ 'contents' ] ) ; $i < $count ; $i ++ ) 3409 { 3410 if ( ! strcasecmp ( $matches [ 'location' ] [$i] [0], 'Bottom' ) ) 3411 $footer = $matches [ 'contents' ] [$i] [0] ; 3412 else 3413 $header = $matches [ 'contents' ] [$i] [0] ; 3414 } 3415 3416 $text = preg_replace ( $header_or_footer_re, '', $stream_contents ) ; 3417 } 3418 else 3419 $text = $stream_contents ; 3420 } 3421 3422 3423 /*-------------------------------------------------------------------------------------------------------------- 3424 3425 NAME 3426 ExtractText - extracts text from a pdf stream. 3427 3428 PROTOTYPE 3429 $text = $this -> ExtractText ( $page_number, $object_id, $data, &$current_font ) ; 3430 3431 DESCRIPTION 3432 Extracts text from decoded stream contents. 3433 3434 PARAMETERS 3435 $page_number (integer) - 3436 �Page number that contains the text to be extracted. 3437 3438 $object_id (integer) - 3439 Object id of this text block. 3440 3441 $data (string) - 3442 Stream contents. 3443 3444 $current_font (integer) - 3445 Id of the current font, which should be found in the $this->FontTable property, if anything 3446 went ok. 3447 This parameter is required, since text blocks may not specify a new font resource id and reuse 3448 the one that waas set before. 3449 3450 RETURN VALUE 3451 Returns the decoded text. 3452 3453 NOTES 3454 The PDF language can be seen as a stack-driven language ; for example, the instruction defining a text 3455 matrix ( "Tm" ) expects 6 floating-point values from the stack : 3456 3457 0 0 0 0 x y Tm 3458 3459 It can also specify specific operators, such as /Rx, which sets font number "x" to be the current font, 3460 or even "<< >>" constructs that we can ignore during our process of extracting textual data. 3461 Actually, we only want to handle a very small subset of the Adobe drawing language ; These are : 3462 - "Tm" instructions, that specify, among others, the x and y coordinates of the next text to be output 3463 - "/R" instructions, that specify which font is to be used for the next text output. This is useful 3464 only if the font has an associated character map. 3465 - "/F", same as "/R", but use a font map id instead of a direct object id. 3466 - Text, specified either using a single notation ( "(sometext)" ) or the array notation 3467 ( "[(...)d1(...)d2...(...)]" ), which allows for specifying inter-character spacing. 3468 - "Tf" instructions, that specifies the font size. This is to be able to compute approximately the 3469 number of empty lines between two successive Y coordinates in "Tm" instructions 3470 - "TL" instructions, that define the text leading to be used by "T*" 3471 3472 This is why I choosed to decompose the process of text extraction into three steps : 3473 - The first one, the lowest-level step, is a tokenizer that extracts individual elements, such as "Tm", 3474 "TJ", "/Rx" or "510.77". This is handled by the __next_token() method. 3475 - The second one, __next_instruction(), collects tokens. It pushes every floating-point value onto the 3476 stack, until an instruction is met. 3477 - The third one, ExtractText(), processes data returned by __next_instruction(), and actually performs 3478 the (restricted) parsing of text drawing instructions. 3479 3480 *-------------------------------------------------------------------------------------------------------------*/ 3481 protected function ExtractText ( $page_number, $object_id, $data, &$current_font ) 3482 { 3483 $new_data = $this -> __strip_useless_instructions ( $data ) ; 3484 3485 if ( self::$DEBUG ) 3486 { 3487 echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ; 3488 echo $data ; 3489 echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ; 3490 echo $new_data ; 3491 } 3492 3493 $data = $new_data ; 3494 3495 // Index into the specified block of text-drawing instructions 3496 $data_index = 0 ; 3497 3498 $data_length = strlen ( $data ) ; // Data length 3499 $result = '' ; // Resulting string 3500 3501 // Y-coordinate of the last seen "Tm" instruction 3502 $last_goto_y = 0 ; 3503 $last_goto_x = 0 ; 3504 3505 // Y-coordinate of the last seen "Td" or "TD" relative positioning instruction 3506 $last_relative_goto_y = 0 ; 3507 3508 // When true, the current text should be output on the same line as the preceding one 3509 $use_same_line = false ; 3510 3511 // Instruction preceding the current one 3512 $last_instruction = true ; 3513 3514 // Current font size 3515 $current_font_size = 0 ; 3516 3517 // Active template 3518 $current_template = '' ; 3519 3520 // Various pre-computed variables 3521 $separator_length = strlen ( $this -> Separator ) ; 3522 3523 // Current font map width, in bytes, plus a flag saying whether the current font is mapped or not 3524 $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ; 3525 3526 // Extra newlines to add before the current text 3527 $extra_newlines = 0 ; 3528 3529 // Text leading used by T* 3530 $text_leading = 0 ; 3531 3532 // Set to true if a separator needs to be inserted 3533 $needs_separator = false ; 3534 3535 // A flag to tell if we should "forget" the last instruction 3536 $discard_last_instruction = false ; 3537 3538 // A flag that tells whether the Separator and BlockSeparator properties are identical 3539 $same_separators = ( $this -> Separator == $this -> BlockSeparator ) ; 3540 3541 // Instruction count (used for handling execution timeouts) 3542 $instruction_count = 0 ; 3543 3544 // Unprocessed markers 3545 $unprocessed_marker_count = count ( $this -> UnprocessedMarkerList [ 'font' ] ) ; 3546 3547 // Loop through instructions 3548 while ( ( $instruction = $this -> __next_instruction ( $page_number, $data, $data_length, $data_index, $current_template ) ) !== false ) 3549 { 3550 $fragment = '' ; 3551 3552 $instruction_count ++ ; 3553 3554 // Timeout handling - don't test for every instruction processed 3555 if ( ! ( $instruction_count % 100 ) ) 3556 { 3557 // Global timeout handling 3558 if ( $this -> Options & self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME ) 3559 { 3560 $now = microtime ( true ) ; 3561 3562 if ( $now - self::$GlobalExecutionStartTime > self::$MaxGlobalExecutionTime ) 3563 error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ; 3564 } 3565 3566 // Per-instance timeout handling 3567 if ( $this -> Options & self::PDFOPT_ENFORCE_EXECUTION_TIME ) 3568 { 3569 $now = microtime ( true ) ; 3570 3571 if ( $now - $this -> ExecutionStartTime > $this -> MaxExecutionTime ) 3572 error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ; 3573 } 3574 } 3575 3576 // Character position after the current instruction 3577 $data_index = $instruction [ 'next' ] ; 3578 3579 // Process current instruction 3580 switch ( $instruction [ 'instruction' ] ) 3581 { 3582 // Raw text (enclosed by parentheses) or array text (enclosed within square brackets) 3583 // is returned as a single instruction 3584 case 'text' : 3585 // Empty arrays of text may be encountered - ignore them 3586 if ( ! count ( $instruction [ 'values' ] ) ) 3587 break ; 3588 3589 // Check if we have to insert a newline 3590 if ( ! $use_same_line ) 3591 { 3592 $fragment .= $this -> EOL ; 3593 $needs_separator = false ; 3594 } 3595 // Roughly simulate spacing between lines by inserting newline characters 3596 else if ( $extra_newlines > 0 ) 3597 { 3598 $fragment .= str_repeat ( $this -> EOL, $extra_newlines ) ; 3599 $extra_newlines = 0 ; 3600 $needs_separator = false ; 3601 } 3602 else 3603 $needs_separator = true ; 3604 3605 // Add a separator if necessary 3606 if ( $needs_separator ) 3607 { 3608 // If the Separator and BlockSeparator properties are the same (and not empty), only add a block separator if 3609 // the current result does not end with it 3610 if ( $same_separators ) 3611 { 3612 if ( $this -> Separator != '' && substr ( $fragment, - $separator_length ) != $this -> BlockSeparator ) 3613 $fragment .= $this -> BlockSeparator ; 3614 } 3615 else 3616 $fragment .= $this -> BlockSeparator ; 3617 } 3618 3619 $needs_separator = true ; 3620 $value_index = 0 ; 3621 3622 // Fonts having character maps will require some special processing 3623 if ( $current_font_mapped ) 3624 { 3625 // Loop through each text value 3626 foreach ( $instruction [ 'values' ] as $text ) 3627 { 3628 $is_hex = ( $text [0] == '<' ) ; 3629 $length = strlen ( $text ) - 1 ; 3630 $handled = false ; 3631 3632 // Characters are encoded within angle brackets ( "<>" ). 3633 // Note that several characters can be specified within the same angle brackets, so we have to take 3634 // into account the width we detected in the begincodespancerange construct 3635 if ( $is_hex ) 3636 { 3637 for ( $i = 1 ; $i < $length ; $i += $current_font_map_width ) 3638 { 3639 $value = substr ( $text, $i, $current_font_map_width ) ; 3640 $ch = hexdec ( $value ) ; 3641 3642 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) ) 3643 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ; 3644 else if ( $current_font == -1 ) 3645 { 3646 $newchar = chr ( $ch ) ; 3647 } 3648 else 3649 { 3650 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch ) ; 3651 $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ; 3652 } 3653 3654 $fragment .= $newchar ; 3655 } 3656 3657 $handled = true ; 3658 } 3659 // Yes ! double-byte codes can also be specified as plain text within parentheses ! 3660 // However, we have to be really careful here ; the sequence : 3661 // (Be) 3662 // can mean the string "Be" or the Unicode character 0x4265 ('B' = 0x42, 'e' = 0x65) 3663 // We first look if the character map contains an entry for Unicode codepoint 0x4265 ; 3664 // if not, then we have to consider that it is regular text to be taken one character by 3665 // one character. In this case, we fall back to the "if ( ! $handled )" condition 3666 else if ( $current_font_map_width == 4 ) 3667 { 3668 $temp_result = '' ; 3669 3670 for ( $i = 1 ; $i < $length ; $i ++ ) 3671 { 3672 // Each character in the pair may be a backslash, which escapes the next character so we must skip it 3673 // This code needs to be reviewed ; the same code is duplicated to handle escaped characters in octal notation 3674 if ( $text [$i] != '\\' ) 3675 $ch1 = $text [$i] ; 3676 else 3677 { 3678 $i ++ ; 3679 3680 if ( $text [$i] < '0' || $text [$i] > '7' ) 3681 $ch1 = $this -> ProcessEscapedCharacter ( $text [$i] ) ; 3682 else 3683 { 3684 $oct = '' ; 3685 $digit_count = 0 ; 3686 3687 while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 ) 3688 { 3689 $oct .= $text [$i ++] ; 3690 $digit_count ++ ; 3691 } 3692 3693 $ch1 = chr ( octdec ( $oct ) ) ; 3694 $i -- ; 3695 } 3696 } 3697 3698 $i ++ ; 3699 3700 if ( $text [$i] != '\\' ) 3701 $ch2 = $text [$i] ; 3702 else 3703 { 3704 $i ++ ; 3705 3706 if ( $text [$i] < '0' || $text [$i] > '7' ) 3707 $ch2 = $this -> ProcessEscapedCharacter ( $text [$i] ) ; 3708 else 3709 { 3710 $oct = '' ; 3711 $digit_count = 0 ; 3712 3713 while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 ) 3714 { 3715 $oct .= $text [$i ++] ; 3716 $digit_count ++ ; 3717 } 3718 3719 $ch2 = chr ( octdec ( $oct ) ) ; 3720 $i -- ; 3721 } 3722 } 3723 3724 // Build the 2-bytes character code 3725 $ch = ( ord ( $ch1 ) << 8 ) | ord ( $ch2 ) ; 3726 3727 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) ) 3728 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ; 3729 else 3730 { 3731 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch, true ) ; 3732 $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ; 3733 } 3734 3735 // Yes !!! for characters encoded with two bytes, we can find the following construct : 3736 // 0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")" 3737 // which must be expanded as : (Car) 3738 // We have here the escape sequences "\(" and "\)", but the backslash is encoded on two bytes 3739 // (although the MSB is nul), while the escaped character is encoded on 1 byte. waiting 3740 // for the next quirk to happen... 3741 if ( $newchar == '\\' && isset ( $text [ $i + 2 ] ) ) 3742 { 3743 $newchar = $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] ) ; 3744 $i ++ ; // this time we processed 3 bytes, not 2 3745 } 3746 3747 $temp_result .= $newchar ; 3748 } 3749 3750 // Happens only if we were unable to translate a character using the current character map 3751 $fragment .= $temp_result ; 3752 $handled = true ; 3753 } 3754 3755 // Character strings within parentheses. 3756 // For every text value, use the character map table for substitutions 3757 if ( ! $handled ) 3758 { 3759 for ( $i = 1 ; $i < $length ; $i ++ ) 3760 { 3761 $ch = $text [$i] ; 3762 3763 // Set to true to optimize calls to MapCharacters 3764 // Currently does not work with pobox@dizy.sk/infoma.pdf (a few characters differ) 3765 $use_map_buffer = false ; 3766 3767 // ... but don't forget to handle escape sequences "\n" and "\r" for characters 3768 // 10 and 13 3769 if ( $ch == '\\' ) 3770 { 3771 $ch = $text [++$i] ; 3772 3773 // Escaped character 3774 if ( $ch < '0' || $ch > '7' ) 3775 $ch = $this -> ProcessEscapedCharacter ( $ch ) ; 3776 // However, an octal form can also be specified ; in this case we have to take into account 3777 // the character width for the current font (if the character width is 4 hex digits, then we 3778 // will encounter constructs such as "\000\077"). 3779 // The method used here is dirty : we build a regex to match octal character representations on a substring 3780 // of the text 3781 else 3782 { 3783 $width = $current_font_map_width / 2 ; // Convert to byte count 3784 $subtext = substr ( $text, $i - 1 ) ; 3785 $regex = "#^ (\\\\ [0-7]{3}){1,$width} #imsx" ; 3786 3787 $status = preg_match ( $regex, $subtext, $octal_matches ) ; 3788 3789 if ( $status ) 3790 { 3791 $octal_values = explode ( '\\', substr ( $octal_matches [0], 1 ) ) ; 3792 $ord = 0 ; 3793 3794 foreach ( $octal_values as $octal_value ) 3795 $ord = ( $ord << 8 ) + octdec ( $octal_value ) ; 3796 3797 $ch = chr ( $ord ) ; 3798 $i += strlen ( $octal_matches [0] ) - 2 ; 3799 } 3800 } 3801 3802 $use_map_buffer = false ; 3803 } 3804 3805 // Add substituted character to the output result 3806 $ord = ord ( $ch ) ; 3807 3808 if ( ! $use_map_buffer ) 3809 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ; 3810 else 3811 { 3812 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) ) 3813 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ; 3814 else 3815 { 3816 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ; 3817 $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ; 3818 } 3819 } 3820 3821 $fragment .= $newchar ; 3822 } 3823 } 3824 3825 // Handle offsets between blocks of characters 3826 if ( isset ( $instruction [ 'offsets' ] [ $value_index ] ) && 3827 - ( $instruction [ 'offsets' ] [ $value_index ] ) > $this -> MinSpaceWidth ) 3828 $fragment .= $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ; 3829 3830 $value_index ++ ; 3831 } 3832 } 3833 // For fonts having no associated character map, we simply encode the string in UTF8 3834 // after the C-like escape sequences have been processed 3835 // Note that <xxxx> constructs can be encountered here, so we have to process them as well 3836 else 3837 { 3838 foreach ( $instruction [ 'values' ] as $text ) 3839 { 3840 $is_hex = ( $text [0] == '<' ) ; 3841 $length = strlen ( $text ) - 1 ; 3842 3843 // Some text within parentheses may have a backslash followed by a newline, to indicate some continuation line. 3844 // Example : 3845 // (this is a sentence \ 3846 // continued on the next line) 3847 // Funny isn't it ? so remove such constructs because we don't care 3848 $text = str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text ) ; 3849 3850 // Characters are encoded within angle brackets ( "<>" ) 3851 if ( $is_hex ) 3852 { 3853 for ( $i = 1 ; $i < $length ; $i += 2 ) 3854 { 3855 $ch = hexdec ( substr ( $text, $i, 2 ) ) ; 3856 3857 $fragment .= $this -> CodePointToUtf8 ( $ch ) ; 3858 } 3859 } 3860 // Characters are plain text 3861 else 3862 { 3863 $text = self::Unescape ( $text ) ; 3864 3865 for ( $i = 1, $length = strlen ( $text ) - 1 ; $i < $length ; $i ++ ) 3866 { 3867 $ch = $text [$i] ; 3868 $ord = ord ( $ch ) ; 3869 3870 if ( $ord < 127 ) 3871 $newchar = $ch ; 3872 else 3873 { 3874 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) ) 3875 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ; 3876 else 3877 { 3878 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ; 3879 $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ; 3880 } 3881 } 3882 3883 $fragment .= $newchar ; 3884 } 3885 } 3886 3887 // Handle offsets between blocks of characters 3888 if ( isset ( $instruction [ 'offsets' ] [ $value_index ] ) && 3889 abs ( $instruction [ 'offsets' ] [ $value_index ] ) > $this -> MinSpaceWidth ) 3890 $fragment .= $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ; 3891 3892 $value_index ++ ; 3893 } 3894 } 3895 3896 // Process the markers which do not have an associated font yet - this will be done by matching 3897 // the current text fragment against one of the regular expressions defined. 3898 // If a match occurs, then all the subsequent text fragment using the same font will be put markers 3899 for ( $j = 0 ; $j < $unprocessed_marker_count ; $j ++ ) 3900 { 3901 $marker = $this -> UnprocessedMarkerList [ 'font' ] [$j] ; 3902 3903 if ( preg_match ( $marker [ 'regex' ], trim ( $fragment ) ) ) 3904 { 3905 $this -> TextWithFontMarkers [ $current_font ] = array 3906 ( 3907 'font' => $current_font, 3908 'height' => $current_font_size, 3909 'regex' => $marker [ 'regex' ], 3910 'start' => $marker [ 'start' ], 3911 'end' => $marker [ 'end' ] 3912 ) ; 3913 3914 $unprocessed_marker_count -- ; 3915 unset ( $this -> UnprocessedMarkerList [ 'font' ] [$j] ) ; 3916 3917 break ; 3918 } 3919 } 3920 3921 // Check if we need to add markers around this text fragment 3922 if ( isset ( $this -> TextWithFontMarkers [ $current_font ] ) && 3923 $this -> TextWithFontMarkers [ $current_font ] [ 'height' ] == $current_font_size ) 3924 { 3925 $fragment = $this -> TextWithFontMarkers [ $current_font ] [ 'start' ] . 3926 $fragment . 3927 $this -> TextWithFontMarkers [ $current_font ] [ 'end' ] ; 3928 } 3929 3930 $result .= $fragment ; 3931 3932 break ; 3933 3934 // An "nl" instruction means TJ, Tj, T* or "'" 3935 case 'nl' : 3936 if ( ! $instruction [ 'conditional' ] ) 3937 { 3938 if ( $instruction [ 'leading' ] && $text_leading && $current_font_size ) 3939 { 3940 $count = ( integer ) ( ( $text_leading - $current_font_size ) / $current_font_size ) ; 3941 3942 if ( ! $count ) 3943 $count = 1 ; 3944 } 3945 else 3946 $count = 1 ; 3947 3948 $extra = str_repeat ( PHP_EOL, $count ) ; 3949 $result .= $extra ; 3950 $needs_separator = false ; 3951 $last_goto_y -= ( $count * $text_leading ) ; // Approximation on y-coord change 3952 $last_relative_goto_y = 0 ; 3953 } 3954 3955 break ; 3956 3957 // "Tm", "Td" or "TD" : Output text on the same line, if the "y" coordinates are equal 3958 case 'goto' : 3959 // Some text is positioned using 'Tm' instructions ; however they can be immediatley followed by 'Td' instructions 3960 // which give a relative positioning ; so consider that the last instruction wins 3961 if ( $instruction [ 'relative' ] ) 3962 { 3963 // Try to put a separator if the x coordinate is non-zero 3964 //if ( $instruction [ 'x' ] - $last_goto_x >= $current_font_size ) 3965 // $result .= $this -> Separator ; 3966 3967 $discard_last_instruction = true ; 3968 $extra_newlines = 0 ; 3969 $use_same_line = ( ( $last_relative_goto_y - abs ( $instruction [ 'y' ] ) ) <= $current_font_size ) ; 3970 $last_relative_goto_y = abs ( $instruction [ 'y' ] ) ; 3971 $last_goto_x = $instruction [ 'x' ] ; 3972 3973 if ( - $instruction [ 'y' ] > $current_font_size ) 3974 { 3975 $use_same_line = false ; 3976 3977 if ( $last_relative_goto_y ) 3978 $extra_newlines = ( integer ) ( $current_font_size / $last_relative_goto_y ) ; 3979 else 3980 $extra_newlines = 0 ; 3981 } 3982 else if ( ! $instruction [ 'y' ] ) 3983 { 3984 $use_same_line = true ; 3985 $extra_newlines = 0 ; 3986 } 3987 3988 break ; 3989 } 3990 else 3991 $last_relative_goto_y = 0 ; 3992 3993 $y = $last_goto_y + $last_relative_goto_y ; 3994 3995 if ( $instruction [ 'y' ] == $y || abs ( $instruction [ 'y' ] - $y ) < $current_font_size ) 3996 { 3997 $use_same_line = true ; 3998 $extra_newlines = 0 ; 3999 } 4000 else 4001 { 4002 // Compute the number of newlines we have to insert between the current and the next lines 4003 if ( $current_font_size ) 4004 $extra_newlines = ( integer ) ( ( $y - $instruction [ 'y' ] - $current_font_size ) / $current_font_size ) ; 4005 4006 $use_same_line = ( $last_goto_y == 0 ) ; 4007 } 4008 4009 $last_goto_y = $instruction [ 'y' ] ; 4010 break ; 4011 4012 // Set font size 4013 case 'fontsize' : 4014 $current_font_size = $instruction [ 'size' ] ; 4015 break ; 4016 4017 // "/Rx" : sets the current font 4018 case 'resource' : 4019 $current_font = $instruction [ 'resource' ] ; 4020 4021 $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ; 4022 break ; 4023 4024 // "/TPLx" : references a template, which can contain additional font aliases 4025 case 'template' : 4026 if ( $this -> PageMap -> IsValidXObjectName ( $instruction [ 'token' ] ) ) 4027 $current_template = $instruction [ 'token' ] ; 4028 4029 break ; 4030 4031 // 'TL' : text leading to be used for the next "T*" in the flow 4032 case 'leading' : 4033 if ( ! ( $this -> Options & self::PDFOPT_IGNORE_TEXT_LEADING ) ) 4034 $text_leading = $instruction [ 'size' ] ; 4035 4036 break ; 4037 4038 4039 // 'ET' : we have to reset a few things here 4040 case 'ET' : 4041 $current_font = -1 ; 4042 $current_font_map_width = 2 ; 4043 break ; 4044 } 4045 4046 // Remember last instruction - this will help us into determining whether we should put the next text 4047 // on the current or following line 4048 if ( ! $discard_last_instruction ) 4049 $last_instruction = $instruction ; 4050 4051 $discard_last_instruction = false ; 4052 } 4053 4054 return ( $this -> __rtl_process ( $result ) ) ; 4055 } 4056 4057 4058 4059 // __next_instruction - 4060 // Retrieves the next instruction from the drawing text block. 4061 private function __next_instruction ( $page_number, $data, $data_length, $index, $current_template ) 4062 { 4063 static $last_instruction = false ; 4064 4065 $ch = '' ; 4066 4067 // Constructs such as 4068 if ( $last_instruction ) 4069 { 4070 $result = $last_instruction ; 4071 $last_instruction = false ; 4072 4073 return ( $result ) ; 4074 } 4075 4076 // Whether we should compute enhanced statistics 4077 $enhanced_statistics = $this -> EnhancedStatistics ; 4078 4079 // Holds the floating-point values encountered so far 4080 $number_stack = array ( ) ; 4081 4082 // Loop through the stream of tokens 4083 while ( ( $part = $this -> __next_token ( $page_number, $data, $data_length, $index ) ) !== false ) 4084 { 4085 $token = $part [0] ; 4086 $next_index = $part [1] ; 4087 4088 // Floating-point number : push it onto the stack 4089 if ( ( $token [0] >= '0' && $token [0] <= '9' ) || $token [0] == '-' || $token [0] == '+' || $token [0] == '.' ) 4090 { 4091 $number_stack [] = $token ; 4092 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ; 4093 } 4094 // 'Tm' instruction : return a "goto" instruction with the x and y coordinates 4095 else if ( $token == 'Tm' ) 4096 { 4097 $x = $number_stack [4] ; 4098 $y = $number_stack [5] ; 4099 4100 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ; 4101 4102 return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => false, 'token' => $token ) ) ; 4103 } 4104 // 'Td' or 'TD' instructions : return a goto instruction with the x and y coordinates (1st and 2nd args) 4105 else if ( $token == 'Td' || $token == 'TD' ) 4106 { 4107 $x = $number_stack [0] ; 4108 $y = $number_stack [1] ; 4109 4110 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ $token ] ++ ; 4111 4112 return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => true, 'token' => $token ) ) ; 4113 } 4114 // Output text "'" instruction, with conditional newline 4115 else if ( $token [0] == "'" ) 4116 { 4117 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ; 4118 4119 return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ; 4120 } 4121 // Same as above 4122 else if ( $token == 'TJ' || $token == 'Tj' ) 4123 { 4124 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ $token ] ++ ; 4125 4126 return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ; 4127 } 4128 // Set font size 4129 else if ( $token == 'Tf' ) 4130 { 4131 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ; 4132 4133 return ( array ( 'instruction' => 'fontsize', 'next' => $next_index, 'size' => $number_stack [0], 'token' => $token ) ) ; 4134 } 4135 // Text leading (spacing used by T*) 4136 else if ( $token == 'TL' ) 4137 { 4138 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ; 4139 4140 return ( array ( 'instruction' => 'leading', 'next' => $next_index, 'size' => $number_stack [0], 'token' => $token ) ) ; 4141 } 4142 // Position to next line 4143 else if ( $token == 'T*' ) 4144 { 4145 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ; 4146 4147 return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => true ) ) ; 4148 } 4149 // Draw object ("Do"). To prevent different text shapes to appear on the same line, we return a "newline" instruction 4150 // here. Note that the shape position is not taken into account here, and shapes will be processed in the order they 4151 // appear in the pdf file (which is likely to be different from their position on a graphic screen). 4152 else if ( $token == 'Do' ) 4153 { 4154 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ; 4155 4156 return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => false, 'token' => $token ) ) ; 4157 } 4158 // Raw text output 4159 else if ( $token [0] == '(' ) 4160 { 4161 $next_part = $this -> __next_token ( $page_number, $data, $data_length, $next_index, $enhanced_statistics ) ; 4162 $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ; 4163 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '(' ] ++ ; 4164 4165 if ( $next_part [0] == "'" ) 4166 { 4167 $last_instruction = $instruction ; 4168 return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ; 4169 } 4170 else 4171 return ( $instruction ) ; 4172 } 4173 // Hex digits within angle brackets 4174 else if ( $token [0] == '<' ) 4175 { 4176 $ch = $token [1] ; 4177 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '<' ] ++ ; 4178 $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ; 4179 4180 if ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM ) 4181 { 4182 $next_part = $this -> __next_token ( $page_number, $data, $data_length, $next_index ) ; 4183 $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ; 4184 4185 if ( $next_part [0] == "'" ) 4186 { 4187 $last_instruction = $instruction ; 4188 return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ; 4189 } 4190 else 4191 return ( $instruction ) ; 4192 } 4193 } 4194 // Text specified as an array of individual raw text elements, and individual interspaces between characters 4195 else if ( $token [0] == '[' ) 4196 { 4197 $values = $this -> __extract_chars_from_array ( $token ) ; 4198 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '[' ] ++ ; 4199 $instruction = array ( 'instruction' => 'text', 'next' => $next_index, 'values' => $values [0], 'offsets' => $values [1], 'token' => $token ) ; 4200 4201 return ( $instruction ) ; 4202 } 4203 // Token starts with a slash : maybe a font specification 4204 else if ( preg_match ( '#^ ( ' . self::$FontSpecifiers . ' ) #ix', $token ) ) 4205 { 4206 $key = "$page_number:$current_template:$token" ; 4207 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ; 4208 4209 if ( isset ( $this -> MapIdBuffer [ $key ] ) ) 4210 $id = $this -> MapIdBuffer [ $key ] ; 4211 else 4212 { 4213 $id = $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $token ) ; 4214 4215 $this -> MapIdBuffer [ $key ] = $id ; 4216 } 4217 4218 return ( array ( 'instruction' => 'resource', 'next' => $next_index, 'resource' => $id, 'token' => $token ) ) ; 4219 } 4220 // Template reference, such as /TPL1. Each reference has initially been replaced by !PDFTOTEXT_TEMPLATE_TPLx during substitution 4221 // by ProcessTemplateReferences(), because templates not only specify text to be replaced, but also font aliases 4222 // -and this is the place where we catch font aliases in this case 4223 else if ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P<template> \w+) /ix', $token, $match ) ) 4224 { 4225 $current_template = '/' . $match [ 'template' ] ; 4226 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'template' ] ++ ; 4227 4228 return ( array ( 'instruction' => 'template', 'next' => $next_index, 'token' => $current_template ) ) ; 4229 } 4230 // Others, only counted for statistics 4231 else if ( $token === 'cm' ) 4232 { 4233 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'cm' ] ++ ; 4234 } 4235 else if ( $token === 'BT' ) 4236 { 4237 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'BT' ] ++ ; 4238 4239 return ( array ( 'instruction' => 'BT', 'next' => $next_index, 'token' => $token ) ) ; 4240 } 4241 else if ( $token == 'ET' ) // Nothing special to count here 4242 { 4243 return ( array ( 'instruction' => 'ET', 'next' => $next_index, 'token' => $token ) ) ; 4244 } 4245 // Other instructions : we're not that much interested in them, so clear the number stack and consider 4246 // that the current parameters, floating-point values, have been processed 4247 else 4248 { 4249 $number_stack = array ( ) ; 4250 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ; 4251 } 4252 4253 $index = $next_index ; 4254 } 4255 4256 // End of input 4257 return ( false ) ; 4258 } 4259 4260 4261 // __next_token : 4262 // Retrieves the next token from the drawing instructions stream. 4263 private function __next_token ( $page_number, $data, $data_length, $index ) 4264 { 4265 // Skip spaces 4266 $count = 0 ; 4267 4268 while ( $index < $data_length && ( $data [ $index ] == ' ' || $data [ $index ] == "\t" || $data [ $index ] == "\r" || $data [ $index ] == "\n" ) ) 4269 { 4270 $index ++ ; 4271 $count ++ ; 4272 } 4273 4274 $enhanced_statistics = $this -> EnhancedStatistics ; 4275 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'space' ] += $count ; 4276 4277 // End of input 4278 if ( $index >= $data_length ) 4279 return ( false ) ; 4280 4281 // The current character will tell us what to do 4282 $ch = $data [ $index ] ; 4283 $ch2 = '' ; 4284 4285 switch ( $ch ) 4286 { 4287 // Opening square bracket : we have to find the closing one, taking care of escape sequences 4288 // that can also specify a square bracket, such as "\]" 4289 case "[" : 4290 $pos = $index + 1 ; 4291 $parent = 0 ; 4292 $angle = 0 ; 4293 $result = $ch ; 4294 4295 while ( $pos < $data_length ) 4296 { 4297 $nch = $data [ $pos ++ ] ; 4298 4299 switch ( $nch ) 4300 { 4301 case '(' : 4302 $parent ++ ; 4303 $result .= $nch ; 4304 break ; 4305 4306 case ')' : 4307 $parent -- ; 4308 $result .= $nch ; 4309 break ; 4310 4311 case '<' : 4312 // Although the array notation can contain hex digits between angle brackets, we have to 4313 // take care that we do not have an angle bracket between two parentheses such as : 4314 // [ (<) ... ] 4315 if ( ! $parent ) 4316 $angle ++ ; 4317 4318 $result .= $nch ; 4319 break ; 4320 4321 case '>' : 4322 if ( ! $parent ) 4323 $angle -- ; 4324 4325 $result .= $nch ; 4326 break ; 4327 4328 case '\\' : 4329 $result .= $nch . $data [ $pos ++ ] ; 4330 break ; 4331 4332 case ']' : 4333 $result .= ']' ; 4334 4335 if ( ! $parent ) 4336 break 2 ; 4337 else 4338 break ; 4339 4340 case "\n" : 4341 case "\r" : 4342 break ; 4343 4344 default : 4345 $result .= $nch ; 4346 } 4347 } 4348 4349 return ( array ( $result, $pos ) ) ; 4350 4351 // Parenthesis : Again, we have to find the closing parenthesis, taking care of escape sequences 4352 // such as "\)" 4353 case "(" : 4354 $pos = $index + 1 ; 4355 $result = $ch ; 4356 4357 while ( $pos < $data_length ) 4358 { 4359 $nch = $data [ $pos ++ ] ; 4360 4361 if ( $nch == '\\' ) 4362 { 4363 $after = $data [ $pos ] ; 4364 4365 // Character references specified as \xyz, where "xyz" are octal digits 4366 if ( $after >= '0' && $after <= '7' ) 4367 { 4368 $result .= $nch ; 4369 4370 while ( $data [ $pos ] >= '0' && $data [ $pos ] <= '7' ) 4371 $result .= $data [ $pos ++ ] ; 4372 } 4373 // Regular character escapes 4374 else 4375 $result .= $nch . $data [ $pos ++ ] ; 4376 } 4377 else if ( $nch == ')' ) 4378 { 4379 $result .= ')' ; 4380 break ; 4381 } 4382 else 4383 $result .= $nch ; 4384 } 4385 4386 return ( array ( $result, $pos ) ) ; 4387 4388 // A construction of the form : "<< something >>", or a unicode character 4389 case '<' : 4390 if ( ! isset ( $data [ $index + 1 ] ) ) 4391 return ( false ) ; 4392 4393 if ( $data [ $index + 1 ] == '<' ) 4394 { 4395 $pos = strpos ( $data, '>>', $index + 2 ) ; 4396 4397 if ( $pos === false ) 4398 return ( false ) ; 4399 4400 return ( array ( substr ( $data, $index, $pos - $index + 2 ), $pos + 2 ) ) ; 4401 } 4402 else 4403 { 4404 $pos = strpos ( $data, '>', $index + 2 ) ; 4405 4406 if ( $pos === false ) 4407 return ( false ) ; 4408 4409 // There can be spaces and newlines inside a series of hex digits, so remove them... 4410 $result = preg_replace ( '/\s+/', '', substr ( $data, $index, $pos - $index + 1 ) ) ; 4411 4412 return ( array ( $result, $pos + 1 ) ) ; 4413 } 4414 4415 // Tick character : consider it as a keyword, in the same way as the "TJ" or "Tj" keywords 4416 case "'" : 4417 return ( array ( "'", $index + 1 ) ) ; 4418 4419 // Other cases : this may be either a floating-point number or a keyword 4420 default : 4421 $index ++ ; 4422 $value = $ch ; 4423 4424 if ( isset ( $data [ $index ] ) ) 4425 { 4426 if ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_DIGIT ) || 4427 $ch == '-' || $ch == '+' || $ch == '.' ) 4428 { 4429 while ( $index < $data_length && 4430 ( ( self::$CharacterClasses [ $data [ $index ] ] & self::CTYPE_DIGIT ) || 4431 $data [ $index ] == '.' ) ) 4432 $value .= $data [ $index ++ ] ; 4433 } 4434 else if ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALPHA ) || 4435 $ch == '/' || $ch == '!' ) 4436 { 4437 $ch = $data [ $index ] ; 4438 4439 while ( $index < $data_length && 4440 ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM ) || 4441 $ch == '*' || $ch == '-' || $ch == '_' || $ch == '.' || $ch == '+' ) ) 4442 { 4443 $value .= $ch ; 4444 $index ++ ; 4445 4446 if ( isset ( $data [ $index ] ) ) 4447 $ch = $data [ $index ] ; 4448 } 4449 } 4450 } 4451 4452 return ( array ( $value, $index ) ) ; 4453 } 4454 } 4455 4456 4457 /*-------------------------------------------------------------------------------------------------------------- 4458 4459 NAME 4460 ExtractTextWithLayout - Extracts text, trying to render the page layout. 4461 4462 $text = $this -> ExtractTextWithLayout ( $page_number, $object_id, $data, &$current_font ) ; 4463 4464 DESCRIPTION 4465 Extracts text from decoded stream contents, trying to render the layout. 4466 4467 PARAMETERS 4468 $page_number (integer) - 4469 �Page number that contains the text to be extracted. 4470 4471 $object_id (integer) - 4472 Object id of this text block. 4473 4474 $data (string) - 4475 Stream contents. 4476 4477 $current_font (integer) - 4478 Id of the current font, which should be found in the $this->FontTable property, if anything 4479 went ok. 4480 This parameter is required, since text blocks may not specify a new font resource id and reuse 4481 the one that waas set before. 4482 4483 RETURN VALUE 4484 Returns the decoded text. 4485 4486 *-------------------------------------------------------------------------------------------------------------*/ 4487 protected function ExtractTextWithLayout ( &$page_fragments, $page_number, $object_id, $data, &$current_font ) 4488 { 4489 // Characters that can start a numeric operand 4490 static $numeric_starts = array 4491 ( 4492 '+' => true, '-' => true, '.' => true, '0' => true, '1' => true, '2' => true, '3' => true, '4' => true, 4493 '5' => true, '6' => true, '7' => true, '8' => true, '9' => true 4494 ) ; 4495 // Initial (default) transformation matrix. To reflect the PDF specifications, we will keep it as a 6 elements array : 4496 // [ sx tx ty sy x y ] 4497 // (although tx and ty are not useful here, since they affect the graphic orientation of the text) 4498 // sx and sy are scaling parameters, actually a multiplier for the x and y parameters. We only keep 4499 static $IdentityMatrix = array ( 1, 0, 0, 1, 0, 0 ) ; 4500 4501 // Remove useless instructions 4502 $new_data = $this -> __strip_useless_instructions ( $data ) ; 4503 4504 if ( self::$DEBUG ) 4505 { 4506 echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ; 4507 echo $data ; 4508 echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ; 4509 echo $new_data ; 4510 } 4511 4512 $data = $new_data ; 4513 $data_length = strlen ( $data ) ; // Data length 4514 4515 $page_fragment_count = count ( $page_fragments ) ; 4516 4517 // Index into the specified block of text-drawing instructions 4518 $data_index = 0 ; 4519 4520 // Text matrices 4521 $CTM = 4522 $Tm = $IdentityMatrix ; 4523 4524 // Nesting level of BT..ET instructions (Begin text/End text) - they are not nestable but be prepared to meet buggy PDFs 4525 $BT_nesting_level = 0 ; 4526 4527 // Current font data 4528 $current_font_height = 0 ; 4529 4530 // Current font map width, in bytes, plus a flag saying whether the current font is mapped or not 4531 $current_template = '' ; 4532 $current_font_name = '' ; 4533 $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ; 4534 4535 // Operand stack 4536 $operand_stack = array ( ) ; 4537 4538 // Number of tokens processed so far 4539 $token_count = 0 ; 4540 4541 // Page attributes 4542 $page_attributes = $this -> PageMap -> PageAttributes [ $page_number ] ; 4543 4544 // Graphics context stack - well, we only store here the current transformation matrix 4545 $graphic_stack = array ( ) ; 4546 $graphic_stack_size = 0 ; 4547 4548 // Global/local execution time measurements 4549 $tokens_between_timechecks = 1000 ; 4550 $enforce_global_execution_time = $this -> Options & self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME ; 4551 $enforce_local_execution_time = $this -> Options & self::PDFOPT_ENFORCE_EXECUTION_TIME ; 4552 $enforce_execution_time = $enforce_global_execution_time | $enforce_local_execution_time ; 4553 4554 // Whether we should compute enhanced statistics 4555 $enhanced_statistics = $this -> EnhancedStatistics ; 4556 4557 // Whether we should show debug coordinates 4558 $show_debug_coordinates = ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES ) ; 4559 4560 // Text leading value set by the TL instruction 4561 $text_leading = 0.0 ; 4562 4563 // Loop through the stream of tokens 4564 while ( $this -> __next_token_ex ( $page_number, $data, $data_length, $data_index, $token, $next_index ) !== false ) 4565 { 4566 $token_start = $token [0] ; 4567 $token_count ++ ; 4568 $length = $next_index - $data_index - 1 ; 4569 4570 // Check if we need to enforce execution time checking, to prevent PHP from terminating our script without any hope 4571 // of catching the error 4572 if ( $enforce_execution_time && ! ( $token_count % $tokens_between_timechecks ) ) 4573 { 4574 if ( $enforce_global_execution_time ) 4575 { 4576 $now = microtime ( true ) ; 4577 4578 if ( $now - self::$GlobalExecutionStartTime > self::$MaxGlobalExecutionTime ) 4579 error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ; 4580 } 4581 4582 // Per-instance timeout handling 4583 if ( $enforce_local_execution_time ) 4584 { 4585 $now = microtime ( true ) ; 4586 4587 if ( $now - $this -> ExecutionStartTime > $this -> MaxExecutionTime ) 4588 error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ; 4589 } 4590 } 4591 4592 /**************************************************************************************************************** 4593 4594 The order of the testings is important for maximum performance : put the most common cases first. 4595 A study on over 1000 PDF files has shown the following : 4596 4597 - Instruction operands appear 24.5 million times 4598 - Tx instructions (including Tf, Tm, ', ", etc.) : 24M 4599 - (), <> and [] constructs for drawing text : 17M 4600 - Other : peanuts... 4601 - Ignored instructions : 0.5M (these are the instructions without interest for text extraction and that 4602 could not be removed by the __strip_useless_instructions() method). 4603 4604 Of course, white spaces appear more than 100M times between instructions. However, it gets hard to remove 4605 most of them without compromising the result of __strip_useless_instructions. 4606 4607 ***************************************************************************************************************/ 4608 // Numeric or flag for an instruction 4609 if ( $token_start == '/' || isset ( $numeric_starts [ $token_start ] ) ) 4610 { 4611 $operand_stack [] = $token ; 4612 4613 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ; 4614 } 4615 // A 2-characters "Tx" or a 1-character quote/doublequote instruction 4616 else if ( ( $length === 2 && $token_start === 'T' ) || ( $length === 1 && ( $token_start === "'" || $token_start === '"' ) ) ) 4617 { 4618 switch ( ( $length === 1 ) ? $token [0] : $token [1] ) 4619 { 4620 // Tj instruction 4621 case 'j' : 4622 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tj' ] ++ ; 4623 break ; 4624 4625 // Tm instruction 4626 case 'm' : 4627 $Tm [0] = ( double ) $operand_stack [0] ; 4628 $Tm [1] = ( double ) $operand_stack [1] ; 4629 $Tm [2] = ( double ) $operand_stack [2] ; 4630 $Tm [3] = ( double ) $operand_stack [3] ; 4631 $Tm [4] = ( double ) $operand_stack [4] ; 4632 $Tm [5] = ( double ) $operand_stack [5] ; 4633 4634 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ; 4635 break ; 4636 4637 // Tf instruction 4638 case 'f' : 4639 $current_font_name = $operand_stack [0] ; 4640 $key = "$page_number:$current_template:$current_font_name" ; 4641 4642 // We have to map a font specifier (such /TT0, C0-1, etc.) into an object id. 4643 // Check first if we already met this font 4644 if ( isset ( $this -> MapIdBuffer [ $key ] ) ) 4645 $current_font = $this -> MapIdBuffer [ $key ] ; 4646 // Otherwise retrieve its corresponding object number and put it in our font cache 4647 else 4648 { 4649 $current_font = $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $current_font_name ) ; 4650 4651 $this -> MapIdBuffer [ $key ] = $current_font ; 4652 } 4653 4654 $current_font_height = ( double ) $operand_stack [1] ; 4655 $this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ; 4656 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ; 4657 break ; 4658 4659 // Td instruction 4660 case 'd' : 4661 $Tm [4] += ( double ) $operand_stack [0] * abs ( $Tm [0] ) ; 4662 $Tm [5] += ( double ) $operand_stack [1] * abs ( $Tm [3] ) ; 4663 4664 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'Td' ] ++ ; 4665 break ; 4666 4667 // TJ instruction 4668 case 'J' : 4669 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TJ' ] ++ ; 4670 break ; 4671 4672 // TD instruction 4673 case 'D' : 4674 $Tm [4] += ( double ) $operand_stack [0] * $Tm [0] ; 4675 $Tm [5] += ( double ) $operand_stack [1] * $Tm [3] ; 4676 $text_leading -= $Tm [5] ; 4677 4678 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TD' ] ++ ; 4679 break ; 4680 4681 // T* instruction 4682 case '*' : 4683 $Tm [4] = 0.0 ; 4684 $Tm [5] -= $text_leading ; //$current_font_height ; 4685 4686 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ; 4687 break ; 4688 4689 // TL instruction - Set text leading. Currently not used. 4690 case 'L' : 4691 $text_leading = ( double ) $operand_stack [0] ; 4692 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ; 4693 break ; 4694 4695 // ' instruction : go to next line and display text 4696 case "'" : 4697 // Update the coordinates of the last text block found so far 4698 $page_fragments [ $page_fragment_count - 1 ] [ 'x' ] += $text_leading ; 4699 $offset = $current_font_height * abs ( $Tm [3] ) ; 4700 $page_fragments [ $page_fragment_count - 1 ] [ 'y' ] -= $offset ; 4701 4702 // And don't forget to update the y coordinate of the current transformation matrix 4703 $Tm [5] -= $offset ; 4704 4705 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ; 4706 break ; 4707 4708 // "'" instruction 4709 case '"' : 4710 if ( self::$DEBUG ) 4711 warning ( "Instruction $token not yet implemented." ) ; 4712 4713 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '"' ] ++ ; 4714 break ; 4715 4716 // Other : ignore them 4717 default : 4718 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ; 4719 } 4720 4721 $operand_stack = array ( ) ; 4722 } 4723 // cm instruction 4724 else if ( $token == 'cm' ) 4725 { 4726 $a = ( double ) $operand_stack [0] ; 4727 $b = ( double ) $operand_stack [1] ; 4728 $c = ( double ) $operand_stack [2] ; 4729 $d = ( double ) $operand_stack [3] ; 4730 $e = ( double ) $operand_stack [4] ; 4731 $f = ( double ) $operand_stack [5] ; 4732 4733 $CTM = array ( $a, $b, $c, $d, $e, $f ) ; 4734 $operand_stack = array ( ) ; 4735 4736 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'cm' ] ++ ; 4737 } 4738 // q/Q instructions (save/restore graphic context) 4739 else if ( $token === 'q' ) 4740 { 4741 $graphic_stack [ $graphic_stack_size ++ ] = array ( $CTM, $Tm ) ; 4742 $operand_stack = array ( ) ; 4743 } 4744 else if ( $token === 'Q' ) 4745 { 4746 if ( $graphic_stack_size ) 4747 list ( $CTM, $Tm ) = $graphic_stack [ -- $graphic_stack_size ] ; 4748 else if ( self::$DEBUG ) 4749 warning ( "Tried to restore graphics context from an empty stack." ) ; 4750 4751 $operand_stack = array ( ) ; 4752 } 4753 // Text array in the [...] notation. Well, in fact, even non-array constructs are returned as an array by the 4754 // __next_token() function, for the sake of simplicity 4755 else if ( $token_start === '[' ) 4756 { 4757 $text = $this -> __decode_text ( $token, $current_font, $current_font_mapped, $current_font_map_width ) ; 4758 4759 if ( $text !== '' ) 4760 { 4761 $r = $this -> __matrix_multiply ( $Tm, $CTM, $page_attributes [ 'width' ], $page_attributes [ 'height' ] ) ; 4762 $fragment = array 4763 ( 4764 'x' => ( $r [4] < 0 ) ? 0.0 : $r [4], 4765 'y' => ( $r [5] < 0 ) ? 0.0 : $r [5], 4766 'page' => $page_number, 4767 'template' => $current_template, 4768 'font' => $current_font_name, 4769 'font-height' => abs ( $current_font_height * $Tm [3] ), 4770 'text' => $text, 4771 ) ; 4772 4773 // Add debug information when needed 4774 if ( self::$DEBUG ) 4775 { 4776 $fragment = array_merge 4777 ( 4778 $fragment, 4779 array 4780 ( 4781 'CTM' => $CTM, 4782 'Tm' => $Tm, 4783 'New Tm' => $r, 4784 'Real font height' => $current_font_height, 4785 'Page width' => $page_attributes [ 'width' ], 4786 'Page height' => $page_attributes ['height' ] 4787 ) 4788 ) ; 4789 } 4790 4791 // Add this text fragment to the list 4792 $page_fragments [] = $fragment ; 4793 $page_fragment_count ++ ; 4794 4795 $operand_stack = array ( ) ; 4796 } 4797 } 4798 // BT instruction 4799 else if ( $token == 'BT' ) 4800 { 4801 $BT_nesting_level ++ ; 4802 $operand_stack = array ( ) ; 4803 $graphic_stack [ $graphic_stack_size ++ ] = array ( $CTM, $Tm ) ; 4804 4805 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'BT' ] ++ ; 4806 } 4807 // ET instruction 4808 else if ( $token == 'ET' ) 4809 { 4810 if ( $BT_nesting_level ) 4811 { 4812 $BT_nesting_level -- ; 4813 4814 if ( ! $BT_nesting_level && $graphic_stack_size ) 4815 { 4816 list ( $CTM, $Tm ) = $graphic_stack [ -- $graphic_stack_size ] ; 4817 } 4818 4819 } 4820 4821 $operand_stack = array ( ) ; 4822 } 4823 // Template (substituted in __next_token) 4824 else if ( $token_start === '!' ) 4825 { 4826 if ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P<template> \w+) /ix', $token, $match ) ) 4827 { 4828 $name = '/' . $match [ 'template' ] ; 4829 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'template' ] ++ ; 4830 4831 if ( $this -> PageMap -> IsValidXObjectName ( $name ) ) 4832 $current_template = $name ; 4833 } 4834 else 4835 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ; 4836 4837 $operand_stack = array ( ) ; 4838 } 4839 // Other instructions 4840 else 4841 { 4842 $operand_stack = array ( ) ; 4843 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ; 4844 } 4845 4846 // Update current index in instruction stream 4847 $data_index = $next_index ; 4848 } 4849 } 4850 4851 4852 // __matrix_multiply - 4853 // Multiplies matrix $ma by $mb. 4854 // PDF transformation matrices are 3x3 matrices containing the following values : 4855 // 4856 // | sx rx 0 | 4857 // | ry sy 0 | 4858 // | tx ty 1 | 4859 // 4860 // However, we do not care about the 3rd column, which is always hardcoded. Transformation 4861 // matrices here are implemented 6-elements arrays : 4862 // 4863 // [ sx, rx, ry, tx, ty ] 4864 private function __matrix_multiply ( $ma, $mb, $page_width, $page_height ) 4865 { 4866 // Scaling text is only appropriate for rendering graphics ; in our case, we just have to render 4867 // basic text without any consideration about its width or height ; so adjust the sx/sy parameters 4868 // accordingly 4869 $scale_1x = ( $ma [0] > 0 ) ? 1 : -1 ; 4870 $scale_1y = ( $ma [3] > 0 ) ? 1 : -1 ; 4871 $scale_2x = ( $mb [0] > 0 ) ? 1 : -1 ; 4872 $scale_2y = ( $mb [3] > 0 ) ? 1 : -1 ; 4873 4874 // Perform the matrix multiplication 4875 $r = array ( ) ; 4876 $r [0] = ( $scale_1x * $scale_2x ) + ( $ma [1] * $mb [2] ) ; 4877 $r [1] = ( $scale_1x * $mb [1] ) + ( $ma [1] * $scale_2y ) ; 4878 $r [2] = ( $scale_1y * $scale_2x ) + ( $scale_1y * $mb [2] ) ; 4879 $r [3] = ( $scale_1y * $mb [1] ) + ( $scale_1y* $scale_2y ) ; 4880 $r [4] = ( $ma [4] * $scale_2x ) + ( $ma [5] * $mb [2] ) + $mb [4] ; 4881 $r [5] = ( $ma [4] * $mb [1] ) + ( $ma [5] * $scale_2y ) + $mb [5] ; 4882 4883 // Negative x/y values are expressed relative to the page width/height (???) 4884 if ( $r [0] < 0 ) 4885 $r [4] = abs ( $r [4] ) ;//$page_width - $r [4] ; 4886 4887 if ( $r [3] < 0 ) 4888 $r [5] = abs ( $r [5] ) ; //$page_height - $r [5] ; 4889 4890 return ( $r ) ; 4891 } 4892 4893 4894 // __next_token_ex : 4895 // Reviewed version of __next_token, adapted to ExtractTextWithLayout. 4896 // Both functions will be unified when this one will be stabilized. 4897 private function __next_token_ex ( $page_number, $data, $data_length, $index, &$token, &$next_index ) 4898 { 4899 // Skip spaces 4900 $count = 0 ; 4901 4902 while ( $index < $data_length && ( $data [ $index ] == ' ' || $data [ $index ] == "\t" || $data [ $index ] == "\r" || $data [ $index ] == "\n" ) ) 4903 { 4904 $index ++ ; 4905 $count ++ ; 4906 } 4907 4908 $enhanced_statistics = $this -> EnhancedStatistics ; 4909 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ 'space' ] += $count ; 4910 4911 // End of input 4912 if ( $index >= $data_length ) 4913 return ( false ) ; 4914 4915 // The current character will tell us what to do 4916 $ch = $data [ $index ] ; 4917 4918 switch ( $ch ) 4919 { 4920 // Opening square bracket : we have to find the closing one, taking care of escape sequences 4921 // that can also specify a square bracket, such as "\]" 4922 case "[" : 4923 $next_index = $index + 1 ; 4924 $parent = 0 ; 4925 $angle = 0 ; 4926 $token = '[' ; 4927 4928 while ( $next_index < $data_length ) 4929 { 4930 $nch = $data [ $next_index ++ ] ; 4931 4932 switch ( $nch ) 4933 { 4934 case '(' : 4935 $parent ++ ; 4936 $token .= $nch ; 4937 break ; 4938 4939 case ')' : 4940 $parent -- ; 4941 $token .= $nch ; 4942 break ; 4943 4944 case '<' : 4945 // Although the array notation can contain hex digits between angle brackets, we have to 4946 // take care that we do not have an angle bracket between two parentheses such as : 4947 // [ (<) ... ] 4948 if ( ! $parent ) 4949 $angle ++ ; 4950 4951 $token .= $nch ; 4952 break ; 4953 4954 case '>' : 4955 if ( ! $parent ) 4956 $angle -- ; 4957 4958 $token .= $nch ; 4959 break ; 4960 4961 case '\\' : 4962 $token .= $nch . $data [ $next_index ++ ] ; 4963 break ; 4964 4965 case ']' : 4966 $token .= ']' ; 4967 4968 if ( ! $parent ) 4969 break 2 ; 4970 else 4971 break ; 4972 4973 case "\n" : 4974 case "\r" : 4975 break ; 4976 4977 default : 4978 $token .= $nch ; 4979 } 4980 } 4981 4982 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '[' ] ++ ; 4983 4984 return ( true ) ; 4985 4986 // Parenthesis : Again, we have to find the closing parenthesis, taking care of escape sequences 4987 // such as "\)" 4988 case "(" : 4989 $next_index = $index + 1 ; 4990 $token = '[' . $ch ; 4991 4992 while ( $next_index < $data_length ) 4993 { 4994 $nch = $data [ $next_index ++ ] ; 4995 4996 if ( $nch === '\\' ) 4997 { 4998 $after = $data [ $next_index ] ; 4999 5000 // Character references specified as \xyz, where "xyz" are octal digits 5001 if ( $after >= '0' && $after <= '7' ) 5002 { 5003 $token .= $nch ; 5004 5005 while ( $data [ $next_index ] >= '0' && $data [ $next_index ] <= '7' ) 5006 $token .= $data [ $next_index ++ ] ; 5007 } 5008 // Regular character escapes 5009 else 5010 $token .= $nch . $data [ $next_index ++ ] ; 5011 } 5012 else if ( $nch === ')' ) 5013 { 5014 $token .= ')' ; 5015 break ; 5016 } 5017 else 5018 $token .= $nch ; 5019 } 5020 5021 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '(' ] ++ ; 5022 $token .= ']' ; 5023 5024 return ( true ) ; 5025 5026 // A construction of the form : "<< something >>", or a unicode character 5027 case '<' : 5028 if ( isset ( $data [ $index + 1 ] ) ) 5029 { 5030 if ( $data [ $index + 1 ] === '<' ) 5031 { 5032 $next_index = strpos ( $data, '>>', $index + 2 ) ; 5033 5034 if ( $next_index === false ) 5035 return ( false ) ; 5036 5037 $token = substr ( $data, $index, $next_index - $index + 2 ) ; 5038 $next_index += 2 ; 5039 5040 return ( true ) ; 5041 } 5042 else 5043 { 5044 $next_index = strpos ( $data, '>', $index + 2 ) ; 5045 5046 if ( $next_index === false ) 5047 return ( false ) ; 5048 5049 $enhanced_statistics && $this -> Statistics [ 'Distributions' ] [ '<' ] ++ ; 5050 5051 // There can be spaces and newlines inside a series of hex digits, so remove them... 5052 $result = preg_replace ( '/\s+/', '', substr ( $data, $index, $next_index - $index + 1 ) ) ; 5053 5054 $token = "[$result]" ; 5055 $next_index ++ ; 5056 5057 return ( true ) ; 5058 } 5059 } 5060 else 5061 return ( false ) ; 5062 5063 // Tick character : consider it as a keyword, in the same way as the "TJ" or "Tj" keywords 5064 case "'" : 5065 case '"' : 5066 $token = $ch ; 5067 $next_index += 2 ; 5068 5069 return ( true ) ; 5070 5071 // Other cases : this may be either a floating-point number or a keyword 5072 default : 5073 $next_index = ++ $index ; 5074 $token = $ch ; 5075 5076 if ( isset ( $data [ $next_index ] ) ) 5077 { 5078 if ( ( $ch >= '0' && $ch <= '9' ) || $ch == '-' || $ch == '+' || $ch == '.' ) 5079 { 5080 while ( $next_index < $data_length && 5081 ( ( $data [ $next_index ] >= '0' && $data [ $next_index ] <= '9' ) || 5082 $data [ $next_index ] === '-' || $data [ $next_index ] === '+' || $data [ $next_index ] === '.' ) ) 5083 $token .= $data [ $next_index ++ ] ; 5084 } 5085 else if ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALPHA ) || 5086 $ch == '/' || $ch == '!' ) 5087 { 5088 $ch = $data [ $next_index ] ; 5089 5090 while ( $next_index < $data_length && 5091 ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM ) || 5092 $ch == '*' || $ch == '-' || $ch == '_' || $ch == '.' || $ch == '+' ) ) 5093 { 5094 $token .= $ch ; 5095 $next_index ++ ; 5096 5097 if ( isset ( $data [ $next_index ] ) ) 5098 $ch = $data [ $next_index ] ; 5099 } 5100 } 5101 } 5102 5103 return ( true ) ; 5104 } 5105 } 5106 5107 5108 // __decode_text - 5109 // Text decoding function when the PDFOPT_BASIC_LAYOUT flag is specified. 5110 private function __decode_text ( $data, $current_font, $current_font_mapped, $current_font_map_width ) 5111 { 5112 list ( $text_values, $offsets ) = $this -> __extract_chars_from_array ( $data ) ; 5113 $value_index = 0 ; 5114 $result = '' ; 5115 5116 // Fonts having character maps will require some special processing 5117 if ( $current_font_mapped ) 5118 { 5119 // Loop through each text value 5120 foreach ( $text_values as $text ) 5121 { 5122 $is_hex = ( $text [0] == '<' ) ; 5123 $length = strlen ( $text ) - 1 ; 5124 $handled = false ; 5125 5126 // Characters are encoded within angle brackets ( "<>" ). 5127 // Note that several characters can be specified within the same angle brackets, so we have to take 5128 // into account the width we detected in the begincodespancerange construct 5129 if ( $is_hex ) 5130 { 5131 for ( $i = 1 ; $i < $length ; $i += $current_font_map_width ) 5132 { 5133 $value = substr ( $text, $i, $current_font_map_width ) ; 5134 $ch = hexdec ( $value ) ; 5135 5136 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) ) 5137 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ; 5138 else 5139 { 5140 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch ) ; 5141 $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ; 5142 } 5143 5144 $result .= $newchar ; 5145 } 5146 5147 $handled = true ; 5148 } 5149 // Yes ! double-byte codes can also be specified as plain text within parentheses ! 5150 // However, we have to be really careful here ; the sequence : 5151 // (Be) 5152 // can mean the string "Be" or the Unicode character 0x4265 ('B' = 0x42, 'e' = 0x65) 5153 // We first look if the character map contains an entry for Unicode codepoint 0x4265 ; 5154 // if not, then we have to consider that it is regular text to be taken one character by 5155 // one character. In this case, we fall back to the "if ( ! $handled )" condition 5156 else if ( $current_font_map_width == 4 ) 5157 { 5158 $temp_result = '' ; 5159 5160 for ( $i = 1 ; $i < $length ; $i ++ ) 5161 { 5162 // Each character in the pair may be a backslash, which escapes the next character so we must skip it 5163 // This code needs to be reviewed ; the same code is duplicated to handle escaped characters in octal notation 5164 if ( $text [$i] != '\\' ) 5165 $ch1 = $text [$i] ; 5166 else 5167 { 5168 $i ++ ; 5169 5170 if ( $text [$i] < '0' || $text [$i] > '7' ) 5171 $ch1 = $this -> ProcessEscapedCharacter ( $text [$i] ) ; 5172 else 5173 { 5174 $oct = '' ; 5175 $digit_count = 0 ; 5176 5177 while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 ) 5178 { 5179 $oct .= $text [$i ++] ; 5180 $digit_count ++ ; 5181 } 5182 5183 $ch1 = chr ( octdec ( $oct ) ) ; 5184 $i -- ; 5185 } 5186 } 5187 5188 $i ++ ; 5189 5190 if ( $text [$i] != '\\' ) 5191 $ch2 = $text [$i] ; 5192 else 5193 { 5194 $i ++ ; 5195 5196 if ( $text [$i] < '0' || $text [$i] > '7' ) 5197 $ch2 = $this -> ProcessEscapedCharacter ( $text [$i] ) ; 5198 else 5199 { 5200 $oct = '' ; 5201 $digit_count = 0 ; 5202 5203 while ( $i < $length && $text [$i] >= '0' && $text [$i] <= '7' && $digit_count < 3 ) 5204 { 5205 $oct .= $text [$i ++] ; 5206 $digit_count ++ ; 5207 } 5208 5209 $ch2 = chr ( octdec ( $oct ) ) ; 5210 $i -- ; 5211 } 5212 } 5213 5214 // Build the 2-bytes character code 5215 $ch = ( ord ( $ch1 ) << 8 ) | ord ( $ch2 ) ; 5216 5217 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) ) 5218 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ; 5219 else 5220 { 5221 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ch, true ) ; 5222 $this -> CharacterMapBuffer [ $current_font ] [ $ch ] = $newchar ; 5223 } 5224 5225 // Yes !!! for characters encoded with two bytes, we can find the following construct : 5226 // 0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")" 5227 // which must be expanded as : (Car) 5228 // We have here the escape sequences "\(" and "\)", but the backslash is encoded on two bytes 5229 // (although the MSB is nul), while the escaped character is encoded on 1 byte. waiting 5230 // for the next quirk to happen... 5231 if ( $newchar == '\\' ) 5232 { 5233 $newchar = $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] ) ; 5234 $i ++ ; // this time we processed 3 bytes, not 2 5235 } 5236 5237 $temp_result .= $newchar ; 5238 } 5239 5240 // Happens only if we were unable to translate a character using the current character map 5241 $result .= $temp_result ; 5242 $handled = true ; 5243 } 5244 5245 // Character strings within parentheses. 5246 // For every text value, use the character map table for substitutions 5247 if ( ! $handled ) 5248 { 5249 for ( $i = 1 ; $i < $length ; $i ++ ) 5250 { 5251 $ch = $text [$i] ; 5252 5253 // Set to true to optimize calls to MapCharacters 5254 // Currently does not work with pobox@dizy.sk/infoma.pdf (a few characters differ) 5255 $use_map_buffer = false ; 5256 5257 // ... but don't forget to handle escape sequences "\n" and "\r" for characters 5258 // 10 and 13 5259 if ( $ch == '\\' ) 5260 { 5261 $ch = $text [++$i] ; 5262 5263 // Escaped character 5264 if ( $ch < '0' || $ch > '7' ) 5265 $ch = $this -> ProcessEscapedCharacter ( $ch ) ; 5266 // However, an octal form can also be specified ; in this case we have to take into account 5267 // the character width for the current font (if the character width is 4 hex digits, then we 5268 // will encounter constructs such as "\000\077"). 5269 // The method used here is dirty : we build a regex to match octal character representations on a substring 5270 // of the text 5271 else 5272 { 5273 $width = $current_font_map_width / 2 ; // Convert to byte count 5274 $subtext = substr ( $text, $i - 1 ) ; 5275 $regex = "#^ (\\\\ [0-7]{3}){1,$width} #imsx" ; 5276 5277 $status = preg_match ( $regex, $subtext, $octal_matches ) ; 5278 5279 if ( $status ) 5280 { 5281 $octal_values = explode ( '\\', substr ( $octal_matches [0], 1 ) ) ; 5282 $ord = 0 ; 5283 5284 foreach ( $octal_values as $octal_value ) 5285 $ord = ( $ord << 8 ) + octdec ( $octal_value ) ; 5286 5287 $ch = chr ( $ord ) ; 5288 $i += strlen ( $octal_matches [0] ) - 2 ; 5289 } 5290 } 5291 5292 $use_map_buffer = false ; 5293 } 5294 5295 // Add substituted character to the output result 5296 $ord = ord ( $ch ) ; 5297 5298 if ( ! $use_map_buffer ) 5299 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ; 5300 else 5301 { 5302 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) ) 5303 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ; 5304 else 5305 { 5306 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ; 5307 $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ; 5308 } 5309 } 5310 5311 $result .= $newchar ; 5312 } 5313 } 5314 5315 // Handle offsets between blocks of characters 5316 if ( isset ( $offsets [ $value_index ] ) && 5317 - ( $offsets [ $value_index ] ) > $this -> MinSpaceWidth ) 5318 $result .= $this -> __get_character_padding ( $offsets [ $value_index ] ) ; 5319 5320 $value_index ++ ; 5321 } 5322 } 5323 // For fonts having no associated character map, we simply encode the string in UTF8 5324 // after the C-like escape sequences have been processed 5325 // Note that <xxxx> constructs can be encountered here, so we have to process them as well 5326 else 5327 { 5328 foreach ( $text_values as $text ) 5329 { 5330 $is_hex = ( $text [0] == '<' ) ; 5331 $length = strlen ( $text ) - 1 ; 5332 5333 // Some text within parentheses may have a backslash followed by a newline, to indicate some continuation line. 5334 // Example : 5335 // (this is a sentence \ 5336 // continued on the next line) 5337 // Funny isn't it ? so remove such constructs because we don't care 5338 $text = str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text ) ; 5339 5340 // Characters are encoded within angle brackets ( "<>" ) 5341 if ( $is_hex ) 5342 { 5343 for ( $i = 1 ; $i < $length ; $i += 2 ) 5344 { 5345 $ch = hexdec ( substr ( $text, $i, 2 ) ) ; 5346 5347 $result .= $this -> CodePointToUtf8 ( $ch ) ; 5348 } 5349 } 5350 // Characters are plain text 5351 else 5352 { 5353 $text = self::Unescape ( $text ) ; 5354 5355 for ( $i = 1, $length = strlen ( $text ) - 1 ; $i < $length ; $i ++ ) 5356 { 5357 $ch = $text [$i] ; 5358 $ord = ord ( $ch ) ; 5359 5360 if ( $ord < 127 ) 5361 $newchar = $ch ; 5362 else 5363 { 5364 if ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) ) 5365 $newchar = $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ; 5366 else 5367 { 5368 $newchar = $this -> FontTable -> MapCharacter ( $current_font, $ord ) ; 5369 $this -> CharacterMapBuffer [ $current_font ] [ $ord ] = $newchar ; 5370 } 5371 } 5372 5373 $result .= $newchar ; 5374 } 5375 } 5376 5377 // Handle offsets between blocks of characters 5378 if ( isset ( $offsets [ $value_index ] ) && 5379 abs ( $offsets [ $value_index ] ) > $this -> MinSpaceWidth ) 5380 $result .= $this -> __get_character_padding ( $offsets [ $value_index ] ) ; 5381 5382 $value_index ++ ; 5383 } 5384 } 5385 5386 // All done, return 5387 return ( $result ) ; 5388 } 5389 5390 5391 // __assemble_text_fragments - 5392 // Assembles text fragments collected by the ExtractTextWithLayout function. 5393 private function __assemble_text_fragments ( $page_number, &$fragments, &$page_width, &$page_height ) 5394 { 5395 $fragment_count = count ( $fragments ) ; 5396 5397 // No fragment no cry... 5398 if ( ! $fragment_count ) 5399 return ( '' ) ; 5400 5401 // Compute the width of each fragment 5402 foreach ( $fragments as &$fragment ) 5403 $this -> __compute_fragment_width ( $fragment ) ; 5404 5405 // Sort the fragments and group them by line 5406 usort ( $fragments, array ( $this, '__sort_page_fragments' ) ) ; 5407 $line_fragments = $this -> __group_line_fragments ( $fragments ) ; 5408 5409 // Retrieve the page attributes 5410 $page_attributes = $this -> PageMap -> PageAttributes [ $page_number ] ; 5411 5412 // Some buggy PDF do not specify page width or page height so, during the processing of text fragments, 5413 // page width & height will be set to the largest x/y coordinate 5414 if ( isset ( $page_attributes [ 'width' ] ) && $page_attributes [ 'width' ] ) 5415 $page_width = $page_attributes [ 'width' ] ; 5416 else 5417 { 5418 $page_width = 0 ; 5419 5420 foreach ( $fragments as $fragment ) 5421 { 5422 $end_x = $fragment [ 'x' ] + $fragment [ 'width' ] ; 5423 5424 if ( $end_x > $page_width ) 5425 $page_width = $end_x ; 5426 } 5427 } 5428 5429 if ( isset ( $page_attributes [ 'height' ] ) && $page_attributes [ 'height' ] ) 5430 $page_height = $page_attributes [ 'height' ] ; 5431 else 5432 $page_height = $fragments [0] [ 'y' ] ; 5433 5434 // Block separator 5435 $separator = ( $this -> BlockSeparator ) ? $this -> BlockSeparator : ' ' ; 5436 5437 // Unprocessed marker count 5438 $unprocessed_marker_count = count ( $this -> UnprocessedMarkerList [ 'font' ] ) ; 5439 5440 // Add page information if the PDFOPT_DEBUG_SHOW_COORDINATES option has been specified 5441 if ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES ) 5442 $result = "[Page : $page_number, width = $page_width, height = $page_height]" . $this -> EOL ; 5443 else 5444 $result = '' ; 5445 5446 // Loop through each line of fragments 5447 for ( $i = 0, $line_count = count ( $line_fragments ) ; $i < $line_count ; $i ++ ) 5448 { 5449 $current_x = 0 ; 5450 5451 // Loop through each fragment of the current line 5452 for ( $j = 0, $fragment_count = count ( $line_fragments [$i] ) ; $j < $fragment_count ; $j ++ ) 5453 { 5454 $fragment = $line_fragments [$i] [$j] ; 5455 5456 // Process the markers which do not have an associated font yet - this will be done by matching 5457 // the current text fragment against one of the regular expressions defined. 5458 // If a match occurs, then all the subsequent text fragment using the same font will be put markers 5459 for ( $k = 0 ; $k < $unprocessed_marker_count ; $k ++ ) 5460 { 5461 $marker = $this -> UnprocessedMarkerList [ 'font' ] [$k] ; 5462 5463 if ( preg_match ( $marker [ 'regex' ], $fragment [ 'text' ] ) ) 5464 { 5465 $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] = array 5466 ( 5467 'font' => $fragment [ 'font' ], 5468 'height' => $fragment [ 'font-height' ], 5469 'regex' => $marker [ 'regex' ], 5470 'start' => $marker [ 'start' ], 5471 'end' => $marker [ 'end' ] 5472 ) ; 5473 5474 $unprocessed_marker_count -- ; 5475 unset ( $this -> UnprocessedMarkerList [ 'font' ] [$k] ) ; 5476 5477 break ; 5478 } 5479 } 5480 5481 // Add debug info if needed 5482 if ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES ) 5483 $result .= $this -> __debug_get_coordinates ( $fragment ) ; 5484 5485 // Add a separator between two fragments, if needed 5486 if ( $j ) 5487 { 5488 if ( $current_x < floor ( $fragment [ 'x' ] ) ) // Accept small rounding errors 5489 $result .= $separator ; 5490 } 5491 5492 // Check if we need to add markers around this text fragment 5493 if ( isset ( $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] ) && 5494 $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'height' ] == $fragment [ 'font-height' ] ) 5495 { 5496 $fragment_text = $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'start' ] . 5497 $fragment [ 'text' ] . 5498 $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'end' ] ; 5499 } 5500 else 5501 $fragment_text = $fragment [ 'text' ] ; 5502 5503 // Add the current fragment to the result 5504 $result .= $fragment_text ; 5505 5506 // Update current x-position 5507 $current_x = $fragment [ 'x' ] + $fragment [ 'width' ] ; 5508 } 5509 5510 // Add a line break between each line 5511 $result .= $this -> EOL ; 5512 } 5513 5514 // All done, return 5515 return ( $result ) ; 5516 } 5517 5518 5519 // __sort_page_fragments - 5520 // Sorts page fragments by their (y,x) coordinates. 5521 public function __sort_page_fragments ( $a, $b ) 5522 { 5523 $xa = $a [ 'x' ] ; 5524 $ya = $a [ 'y' ] ; 5525 $xb = $b [ 'x' ] ; 5526 $yb = $b [ 'y' ] ; 5527 5528 if ( $ya !== $yb ) 5529 return ( $yb - $ya ) ; 5530 else 5531 return ( $xa - $xb ) ; 5532 } 5533 5534 5535 // __sort_line_fragments - 5536 // Sorts fragments per line. 5537 public function __sort_line_fragments ( $a, $b ) 5538 { 5539 return ( $a [ 'x' ] - $b [ 'x' ] ) ; 5540 } 5541 5542 5543 // __group_line_fragments - 5544 // Groups page fragments per line, allowing a certain variation in the y-position. 5545 private function __group_line_fragments ( $fragments ) 5546 { 5547 $result = array ( ) ; 5548 $fragment_count = count ( $fragments ) ; 5549 $last_y_coordinate = $fragments [0] [ 'y' ] ; 5550 $current_fragments = array ( $fragments [0] ) ; 5551 5552 for ( $i = 1 ; $i < $fragment_count ; $i ++ ) 5553 { 5554 $fragment = $fragments [$i] ; 5555 5556 if ( $fragment [ 'y' ] + $fragment [ 'font-height' ] >= $last_y_coordinate ) 5557 $current_fragments [] = $fragment ; 5558 else 5559 { 5560 $last_y_coordinate = $fragment [ 'y' ] ; 5561 usort ( $current_fragments, array ( $this, '__sort_line_fragments' ) ) ; 5562 $result [] = $current_fragments ; 5563 $current_fragments = array ( $fragment ) ; 5564 } 5565 } 5566 5567 if ( count ( $current_fragments ) ) 5568 { 5569 usort ( $current_fragments, array ( $this, '__sort_line_fragments' ) ) ; 5570 $result [] = $current_fragments ; 5571 } 5572 5573 return ( $result ) ; 5574 } 5575 5576 5577 // __compute_fragment_width - 5578 // Compute the width of the specified text fragment and add the width entry accordingly. 5579 // Returns the font object associated with this fragment 5580 private function __compute_fragment_width ( &$fragment ) 5581 { 5582 // To avoid repeated calls to the PdfTexterFontTable::GetFontObject() method, we are buffering them in the FontObjectsBuffer property. 5583 $object_reference = $fragment [ 'page' ] . ':' . $fragment [ 'template' ] . ':' . $fragment [ 'font' ] ; 5584 5585 if ( isset ( $this -> FontObjectsBuffer [ $object_reference ] ) ) 5586 $font_object = $this -> FontObjectsBuffer [ $object_reference ] ; 5587 else 5588 { 5589 $font_object = $this -> FontTable -> GetFontObject ( $fragment [ 'page' ], $fragment [ 'template' ], $fragment [ 'font' ] ) ; 5590 $this -> FontObjectsBuffer [ $object_reference ] = $font_object ; 5591 } 5592 5593 // The width of the previous text fragment will be computed only if its associated font contains character widths information 5594 $fragment [ 'width' ] = ( $font_object ) ? $font_object -> GetStringWidth ( $fragment [ 'text' ], $this -> ExtraTextWidth ) : 0 ; 5595 5596 // Return the font object 5597 return ( $font_object ) ; 5598 } 5599 5600 5601 // __debug_get_coordinates - 5602 // Returns the coordinates of the specified text fragment, in debug mode. 5603 private function __debug_get_coordinates ( $fragment ) 5604 { 5605 return ( "\n[x:" . round ( $fragment [ 'x' ], 3 ) . ', y:' . round ( $fragment [ 'y' ], 3 ) . 5606 ", w: " . round ( $fragment [ 'width' ], 3 ) . ", h:" . round ( $fragment [ 'font-height' ], 3 ) . ", font:" . $fragment [ 'font' ] . "]" ) ; 5607 } 5608 5609 5610 /*-------------------------------------------------------------------------------------------------------------- 5611 5612 NAME 5613 GetTrailerInformation - Retrieves trailer information. 5614 5615 PROTOTYPE 5616 $this -> GetTrailerInformation ( $contents ) ; 5617 5618 DESCRIPTION 5619 Retrieves trailer information : 5620 - Unique file ID 5621 - Id of the object containing encryption data, if the PDF file is encrypted 5622 - Encryption data 5623 5624 PARAMETERS 5625 $contents (string) - 5626 PDF file contents. 5627 5628 *-------------------------------------------------------------------------------------------------------------*/ 5629 protected function GetTrailerInformation ( $contents, $pdf_objects ) 5630 { 5631 // Be paranoid : check if there is trailer information 5632 if ( ! preg_match ( '/trailer \s* << (?P<trailer> .+?) >>/imsx', $contents, $trailer_match ) ) 5633 return ; 5634 5635 $trailer_data = $trailer_match [ 'trailer' ] ; 5636 5637 // Get the unique file id from the trailer data 5638 static $id_regex = '# 5639 /ID \s* \[ \s* 5640 < (?P<id1> [^>]+) > 5641 \s* 5642 < (?P<id2> [^>]+) > 5643 \s* \] 5644 #imsx' ; 5645 5646 if ( preg_match ( $id_regex, $trailer_data, $id_match ) ) 5647 { 5648 $this -> ID = $id_match [ 'id1' ] ; 5649 $this -> ID2 = $id_match [ 'id2' ] ; 5650 } 5651 5652 // If there is an object describing encryption data, get its number (/Encrypt flag) 5653 if ( ! preg_match ( '#/Encrypt \s+ (?P<object> \d+)#ix', $trailer_data, $encrypt_match ) ) 5654 return ; 5655 5656 $encrypt_object_id = $encrypt_match [ 'object' ] ; 5657 5658 if ( ! isset ( $pdf_objects [ $encrypt_object_id ] ) ) 5659 { 5660 if ( self::$DEBUG ) 5661 error ( new PdfToTextDecodingException ( "Object #$encrypt_object_id, which should contain encryption data, is missing." ) ) ; 5662 5663 return ; 5664 } 5665 5666 // Parse encryption information 5667 $this -> EncryptionData = PdfEncryptionData::GetInstance ( $this -> ID, $encrypt_object_id, $pdf_objects [ $encrypt_object_id ] ) ; 5668 $this -> IsEncrypted = ( $this -> EncryptionData !== false ) ; 5669 } 5670 5671 5672 // __build_ignored_instructions : 5673 // Takes the template regular expressions from the self::$IgnoredInstructionsTemplates, replace each string with the contents 5674 // of the self::$ReplacementConstructs array, and sets the self::$IgnoredInstructions to a regular expression that is able to 5675 // match the Postscript instructions to be removed from any text stream. 5676 private function __build_ignored_instructions ( ) 5677 { 5678 $searches = array_keys ( self::$ReplacementConstructs ) ; 5679 $replacements = array_values ( self::$ReplacementConstructs ) ; 5680 5681 foreach ( self::$IgnoredInstructionTemplatesLayout as $template ) 5682 { 5683 $template = '/' . str_replace ( $searches, $replacements, $template ) . '/msx' ; 5684 5685 self::$IgnoredInstructionsLayout [] = $template ; 5686 self::$IgnoredInstructionsNoLayout [] = $template ; 5687 } 5688 5689 foreach ( self::$IgnoredInstructionTemplatesNoLayout as $template ) 5690 { 5691 $template = '/' . str_replace ( $searches, $replacements, $template ) . '/msx' ; 5692 5693 self::$IgnoredInstructionsNoLayout [] = $template ; 5694 } 5695 } 5696 5697 5698 // __convert_utf16 : 5699 // Some strings found in a pdf file can be encoded in UTF16 (author information, for example). 5700 // When this is the case, the string is converted to UTF8. 5701 private function __convert_utf16 ( $text ) 5702 { 5703 if ( isset ( $text [0] ) && isset ( $text [1] ) ) 5704 { 5705 $b1 = ord ( $text [0] ) ; 5706 $b2 = ord ( $text [1] ) ; 5707 5708 if ( ( $b1 == 0xFE && $b2 == 0xFF ) || ( $b1 == 0xFF && $b2 == 0xFE ) ) 5709 $text = mb_convert_encoding ( $text, 'UTF-8', 'UTF-16' ) ; 5710 } 5711 5712 return ( $text ) ; 5713 } 5714 5715 5716 // __extract_chars_from_array - 5717 // Extracts characters enclosed either within parentheses (character codes) or angle brackets (hex value) 5718 // from an array. 5719 // Example : 5720 // 5721 // [<0D>-40<02>-36<03>-39<0E>-36<0F>-36<0B>-37<10>-37<10>-35(abc)] 5722 // 5723 // will return an array having the following entries : 5724 // 5725 // <0D>, <02>, <03>, <0E>, <0F>, <0B>, <10>, <10>, (abc) 5726 private function __extract_chars_from_array ( $array ) 5727 { 5728 $length = strlen ( $array ) - 1 ; 5729 $result = array ( ) ; 5730 $offsets = array ( ) ; 5731 5732 for ( $i = 1 ; $i < $length ; $i ++ ) // Start with character right after the opening bracket 5733 { 5734 $ch = $array [$i] ; 5735 5736 if ( $ch == '(' ) 5737 $endch = ')' ; 5738 else if ( $ch == '<' ) 5739 $endch = '>' ; 5740 else 5741 { 5742 $value = '' ; 5743 5744 while ( $i < $length && ( ( $array [$i] >= '0' && $array [$i] <= '9' ) || 5745 $array [$i] == '-' || $array [$i] == '+' || $array [$i] == '.' ) ) 5746 $value .= $array [$i++] ; 5747 5748 $offsets [] = ( double ) $value ; 5749 5750 if ( $value !== '' ) 5751 $i -- ; 5752 5753 continue ; 5754 } 5755 5756 $char = $ch ; 5757 $i ++ ; 5758 5759 while ( $i < $length && $array [$i] != $endch ) 5760 { 5761 if ( $array [$i] == '\\' ) 5762 $char .= '\\' . $array [++$i] ; 5763 else 5764 { 5765 $char .= $array [$i] ; 5766 5767 if ( $array [$i] == $endch ) 5768 break ; 5769 } 5770 5771 $i ++ ; 5772 } 5773 5774 $result [] = $char . $endch ; 5775 } 5776 5777 return ( array ( $result, $offsets ) ) ; 5778 } 5779 5780 5781 // __extract_chars_from_block - 5782 // Extracts characters from a text block (enclosed in parentheses). 5783 // Returns an array of character ordinals if the $as_array parameter is true, or a string if false. 5784 private function __extract_chars_from_block ( $text, $start_index = false, $length = false, $as_array = false ) 5785 { 5786 if ( $as_array ) 5787 $result = array ( ) ; 5788 else 5789 $result = '' ; 5790 5791 if ( $start_index === false ) 5792 $start_index = 0 ; 5793 5794 if ( $length === false ) 5795 $length = strlen ( $text ) ; 5796 5797 $ord0 = ord ( '0' ) ; 5798 5799 for ( $i = $start_index ; $i < $length ; $i ++ ) 5800 { 5801 $ch = $text [$i] ; 5802 5803 if ( $ch == '\\' ) 5804 { 5805 if ( isset ( $text [ $i + 1 ] ) ) 5806 { 5807 $ch2 = $text [ ++$i ] ; 5808 5809 switch ( $ch2 ) 5810 { 5811 case 'n' : $ch = "\n" ; break ; 5812 case 'r' : $ch = "\r" ; break ; 5813 case 't' : $ch = "\t" ; break ; 5814 case 'f' : $ch = "\f" ; break ; 5815 case 'v' : $ch = "\v" ; break ; 5816 5817 default : 5818 if ( $ch2 >= '0' && $ch2 <= '7' ) 5819 { 5820 $ord = $ch2 - $ord0 ; 5821 $i ++ ; 5822 5823 while ( isset ( $text [$i] ) && $text [$i] >= '0' && $text [$i] <= '7' ) 5824 { 5825 $ord = ( $ord * 8 ) + ord ( $text [$i] ) - $ord0 ; 5826 $i ++ ; 5827 } 5828 5829 $ch = chr ( $ord ) ; 5830 $i -- ; 5831 } 5832 else 5833 $ch = $ch2 ; 5834 5835 } 5836 } 5837 } 5838 5839 if ( $as_array ) 5840 $result [] = ord ( $ch ) ; 5841 else 5842 $result .= $ch ; 5843 } 5844 5845 return ( $result ) ; 5846 } 5847 5848 5849 // __get_character_padding : 5850 // If the offset specified between two character groups in an array notation for displaying text is less 5851 // than -MinSpaceWidth thousands of text units, 5852 private function __get_character_padding ( $char_offset ) 5853 { 5854 if ( $char_offset <= - $this -> MinSpaceWidth ) 5855 { 5856 if ( $this -> Options & self::PDFOPT_REPEAT_SEPARATOR ) 5857 { 5858 // If the MinSpaceWidth property is less than 1000 (text units), consider it has the value 1000 5859 // so that an exuberant number of spaces will not be repeated 5860 $space_width = ( $this -> MinSpaceWidth < 1000 ) ? 1000 : $this -> MinSpaceWidth ; 5861 5862 $repeat_count = abs ( round ( $char_offset / $space_width, 0 ) ) ; 5863 5864 if ( $repeat_count ) 5865 $padding = str_repeat ( $this -> Separator, $repeat_count ) ; 5866 else 5867 $padding = $this -> Separator ; 5868 } 5869 else 5870 $padding = $this -> Separator ; 5871 5872 return ( utf8_encode ( self::Unescape ( $padding ) ) ) ; 5873 } 5874 else 5875 return ( '' ) ; 5876 } 5877 5878 5879 // __get_output_image_filename - 5880 // Returns a real filename based on a template supplied by the AutoSaveImageFileTemplate property. 5881 private function __get_output_image_filename ( ) 5882 { 5883 static $suffixes = array 5884 ( 5885 IMG_JPEG => 'jpg', 5886 IMG_JPG => 'jpg', 5887 IMG_GIF => 'gif', 5888 IMG_PNG => 'png', 5889 IMG_WBMP => 'wbmp', 5890 IMG_XPM => 'xpm' 5891 ) ; 5892 5893 $template = $this -> ImageAutoSaveFileTemplate ; 5894 $length = strlen ( $template ) ; 5895 $parts = pathinfo ( $this -> Filename ) ; 5896 5897 if ( ! isset ( $parts [ 'filename' ] ) ) // for PHP versions < 5.2 5898 { 5899 $index = strpos ( $parts [ 'basename' ], '.' ) ; 5900 5901 if ( $index === false ) 5902 $parts [ 'filename' ] = $parts [ 'basename' ] ; 5903 else 5904 $parts [ 'filename' ] = substr ( $parts [ 'basename' ], $index ) ; 5905 } 5906 5907 $searches = array ( ) ; 5908 $replacements = array ( ) ; 5909 5910 // Search for each construct starting with '%' 5911 for ( $i = 0 ; $i < $length ; $i ++ ) 5912 { 5913 if ( $template [$i] != '%' || $i + 1 >= $length ) 5914 continue ; 5915 5916 $ch = $template [ ++ $i ] ; 5917 5918 // Percent sign found : check the character after 5919 switch ( $ch ) 5920 { 5921 // "%%" : Replace it with a single percent 5922 case '%' : 5923 $searches [] = '%%' ; 5924 $replacements [] = '%' ; 5925 break ; 5926 5927 // "%p" : Path of the original PDF file 5928 case 'p' : 5929 $searches [] = '%p' ; 5930 $replacements [] = $parts [ 'dirname' ] ; 5931 break ; 5932 5933 // "%f" : Filename part of the original PDF file, without its suffix 5934 case 'f' : 5935 $searches [] = '%f' ; 5936 $replacements [] = $parts [ 'filename' ] ; 5937 break ; 5938 5939 // "%s" : Output image file suffix, determined by the ImageAutoSaveFormat property 5940 case 's' : 5941 if ( isset ( $suffixes [ $this -> ImageAutoSaveFormat ] ) ) 5942 { 5943 $searches [] = '%s' ; 5944 $replacements [] = $suffixes [ $this -> ImageAutoSaveFormat ] ; 5945 } 5946 else 5947 { 5948 $searches [] = '%s' ; 5949 $replacements [] = 'unknown' ; 5950 } 5951 5952 break ; 5953 5954 // Other : may be either "%d", or "%xd", where "x" are digits expression the width of the final sequential index 5955 default : 5956 $width = 0 ; 5957 $chars = '' ; 5958 5959 if ( ctype_digit ( $ch ) ) 5960 { 5961 do 5962 { 5963 $width = ( $width * 10 ) + ord ( $ch ) - ord ( '0' ) ; 5964 $chars .= $ch ; 5965 $i ++ ; 5966 } while ( $i < $length && ctype_digit ( $ch = $template [$i] ) ) ; 5967 5968 if ( $template [$i] == 'd' ) 5969 { 5970 $searches [] = '%' . $chars . 'd' ; 5971 $replacements [] = sprintf ( "%0{$width}d", $this -> ImageCount ) ; 5972 } 5973 } 5974 else 5975 { 5976 $searches [] = '%d' ; 5977 $replacements [] = $this -> ImageCount ; 5978 } 5979 } 5980 } 5981 5982 // Perform the replacements 5983 if ( count ( $searches ) ) 5984 $result = str_replace ( $searches, $replacements, $template ) ; 5985 else 5986 $result = $template ; 5987 5988 // All done, return 5989 return ( $result ) ; 5990 } 5991 5992 5993 // __rtl_process - 5994 // Processes the contents of a page when it contains characters belonging to an RTL language. 5995 private function __rtl_process ( $text ) 5996 { 5997 $length = strlen ( $text ) ; 5998 $pos = strcspn ( $text, self::$RtlCharacterPrefixes ) ; 5999 6000 // The text does not contain any of the UTF-8 prefixes that may introduce RTL contents : 6001 // simply return it as is 6002 if ( $pos == $length || $text [$pos] === "\x00" ) 6003 return ( $text ) ; 6004 6005 // Extract each individual line, and get rid of carriage returns if any 6006 $lines = explode ( "\n", str_replace ( "\r", '', $text ) ) ; 6007 $new_lines = array ( ) ; 6008 6009 // Loop through lines 6010 foreach ( $lines as $line ) 6011 { 6012 // Check if the current line contains potential RTL characters 6013 $pos = strcspn ( $line, self::$RtlCharacterPrefixes ) ; 6014 $length = strlen ( $line ) ; 6015 6016 // If not, simply store it as is 6017 if ( $pos == $length ) 6018 { 6019 $new_lines [] = $line ; 6020 continue ; 6021 } 6022 6023 // Otherwise, it gets a little bit more complicated ; we have : 6024 // - To process each series of RTL characters and put them in reverse order 6025 // - Mark spaces and punctuation as "RTL separators", without reversing them (ie, a string like " ." remains " .", not ". ") 6026 // - Other sequences of non-RTL characters must be preserved as is and are not subject to reordering 6027 // The reordering sequence will be described later. For the moment, the $words array is used to store arrays of two elements : 6028 // - The first one is a boolean indicating whether it concerns RTL characters (true) or not (false) 6029 // - The second one is the string itself 6030 $words = array ( ) ; 6031 6032 // Start of the string is not an RTL sequence ; we can add it to our $words array 6033 if ( $pos ) 6034 { 6035 $word = substr ( $line, 0, $pos ) ; 6036 $words [] = array ( $this -> __is_rtl_separator ( $word ), $word ) ; 6037 } 6038 6039 $in_rtl = true ; 6040 6041 // Loop through remaining characters of the current line 6042 while ( $pos < $length ) 6043 { 6044 // Character at the current position may be RTL character 6045 if ( $in_rtl ) 6046 { 6047 6048 $rtl_text = '' ; 6049 $rtl_char = '' ; 6050 $rtl_char_length = 0 ; 6051 $found_rtl = false ; 6052 6053 // Collect all the consecutive RTL characters, which represent a word, and put the letters in reverse order 6054 while ( $pos < $length && $this -> __is_rtl_character ( $line, $pos, $rtl_char, $rtl_char_length ) ) 6055 { 6056 $rtl_text = $rtl_char . $rtl_text ; 6057 $pos += $rtl_char_length ; 6058 $found_rtl = true ; 6059 } 6060 6061 // ... but make sure that we found a valid RTL sequence 6062 if ( $found_rtl ) 6063 $words [] = array ( true, $rtl_text ) ; 6064 else 6065 $words [] = array ( false, $line [ $pos ++ ] ) ; 6066 6067 // For now, we are no more in a series of RTL characters 6068 $in_rtl = false ; 6069 } 6070 // Non-RTL characters : collect them until either the end of the current line or the next RTL character 6071 else 6072 { 6073 $next_pos = $pos + strcspn ( $line, self::$RtlCharacterPrefixes, $pos ) ; 6074 6075 if ( $next_pos >= $length ) 6076 { 6077 $word = substr ( $line, $pos ) ; 6078 break ; 6079 } 6080 else 6081 { 6082 $word = substr ( $line, $pos, $next_pos - $pos ) ; 6083 $pos = $next_pos ; 6084 $in_rtl = true ; 6085 } 6086 6087 // Don't forget to make the distinction between a sequence of spaces and punctuations, and a real 6088 // piece of text. Space/punctuation strings surrounded by RTL words will be interverted 6089 $words [] = array ( $this -> __is_rtl_separator ( $word ), $word ) ; 6090 } 6091 } 6092 6093 // Now we have an array, $words, whose first entry of each element indicates whether the second entry is an RTL string 6094 // or not (this includes strings that contain only spaces and punctuation). 6095 // We have to gather all the consecutive array items whose first entry is true, then invert their order. 6096 // Non-RTL strings are not affected by this process. 6097 $stacked_rtl_words = array ( ) ; 6098 $new_words = array ( ) ; 6099 6100 foreach ( $words as $word ) 6101 { 6102 // RTL word : put it onto the stack 6103 if ( $word [0] ) 6104 $stacked_rtl_words [] = $word [1] ; 6105 // Non-RTL word : add it as is to the output array, $new_words 6106 else 6107 { 6108 // But if RTL words were stacked before, invert them and add them to the output array 6109 if ( count ( $stacked_rtl_words ) ) 6110 { 6111 $new_words = array_merge ( $new_words, array_reverse ( $stacked_rtl_words ) ) ; 6112 $stacked_rtl_words = array ( ) ; 6113 } 6114 6115 $new_words [] = $word [1] ; 6116 } 6117 } 6118 6119 // Process any remaining RTL words that may have been stacked and not yet processed 6120 if ( count ( $stacked_rtl_words ) ) 6121 $new_words = array_merge ( $new_words, array_reverse ( $stacked_rtl_words ) ) ; 6122 6123 // That's ok, we have processed one more line 6124 $new_lines [] = implode ( '', $new_words ) ; 6125 } 6126 6127 // All done, return a catenation of all the lines processed so far 6128 $result = implode ( "\n", $new_lines ) ; 6129 6130 return ( $result ) ; 6131 } 6132 6133 6134 // __is_rtl_character - 6135 // Checks if the sequence starting at $pos in string $text is a character belonging to an RTL language. 6136 // If yes, returns true and sets $rtl_char to the UTF8 string sequence for that character, and $rtl_char_length 6137 // to the length of this string. 6138 // If no, returns false. 6139 private function __is_rtl_character ( $text, $pos, &$rtl_char, &$rtl_char_length ) 6140 { 6141 $ch = $text [ $pos ] ; 6142 6143 // Check that the current character is the start of a potential UTF8 RTL sequence 6144 if ( isset ( self::$RtlCharacterPrefixLengths [ $ch ] ) ) 6145 { 6146 // Get the number of characters that are expected after the sequence 6147 $length_after = self::$RtlCharacterPrefixLengths [ $ch ] ; 6148 6149 // Get the sequence after the UTF8 prefix 6150 $codes_after = substr ( $text, $pos + 1, $length_after ) ; 6151 6152 // Search through $RtlCharacters, which contains arrays of ranges related to the UTF8 character prefix 6153 foreach ( self::$RtlCharacters [ $ch ] as $range ) 6154 { 6155 if ( strcmp ( $range [0], $codes_after ) <= 0 && 6156 strcmp ( $range [1], $codes_after ) >= 0 ) 6157 { 6158 $rtl_char = $ch . $codes_after ; 6159 $rtl_char_length = $length_after + 1 ; 6160 6161 return ( true ) ; 6162 } 6163 } 6164 6165 return ( false ) ; 6166 } 6167 else 6168 return ( false ) ; 6169 } 6170 6171 6172 // __is_rtl_separator - 6173 // RTL words are separated by spaces and punctuation signs that are specified as LTR characters. 6174 // However, such sequences, which are separators between words, must be considered as being part 6175 // of an RTL sequence of words and therefore be reversed with them. 6176 // This function helps to determine if the supplied string is simply a sequence of spaces and 6177 // punctuation (a word separator) or plain text, that must keep its position in the line. 6178 private function __is_rtl_separator ( $text ) 6179 { 6180 static $known_separators = array ( ) ; 6181 static $separators = " \t,.;:/!-_=+" ; 6182 6183 if ( isset ( $known_separators [ $text ] ) ) 6184 return ( true ) ; 6185 6186 for ( $i = 0, $length = strlen ( $text ) ; $i < $length ; $i ++ ) 6187 { 6188 if ( strpos ( $separators, $text [$i] ) === false ) 6189 return ( false ) ; 6190 } 6191 6192 $known_separators [ $text ] = true ; 6193 6194 return ( true ) ; 6195 } 6196 6197 6198 // __strip_useless_instructions : 6199 // Removes from a text stream all the Postscript instructions that are not meaningful for text extraction 6200 // (these are mainly shape drawing instructions). 6201 private function __strip_useless_instructions ( $data ) 6202 { 6203 $result = preg_replace ( $this -> IgnoredInstructions, ' ', $data ) ; 6204 6205 $this -> Statistics [ 'TextSize' ] += strlen ( $data ) ; 6206 $this -> Statistics [ 'OptimizedTextSize' ] += strlen ( $result ) ; 6207 6208 return ( $result ) ; 6209 } 6210 6211 6212 /*-------------------------------------------------------------------------------------------------------------- 6213 6214 NAME 6215 IsPageSelected - Checks if a page is selected for output. 6216 6217 PROTOTYPE 6218 $status = $this -> IsPageSelected ( $page ) ; 6219 6220 DESCRIPTION 6221 Checks if the specified page is to be selected for output. 6222 6223 PARAMETERS 6224 $page (integer) - 6225 Page to be checked. 6226 6227 RETURN VALUE 6228 True if the page is to be selected for output, false otherwise. 6229 6230 *-------------------------------------------------------------------------------------------------------------*/ 6231 protected function IsPageSelected ( $page ) 6232 { 6233 if ( ! $this -> MaxSelectedPages ) 6234 return ( true ) ; 6235 6236 if ( $this -> MaxSelectedPages > 0 ) 6237 return ( $page <= $this -> MaxSelectedPages ) ; 6238 6239 // MaxSelectedPages < 0 6240 return ( $page > count ( $this -> PageMap -> Pages ) + $this -> MaxSelectedPages ) ; 6241 } 6242 6243 6244 /*-------------------------------------------------------------------------------------------------------------- 6245 6246 NAME 6247 PeekAuthorInformation - Gets author information from the specified object data. 6248 6249 PROTOTYPE 6250 $this -> PeekAuthorInformation ( $object_id, $object_data ) ; 6251 6252 DESCRIPTION 6253 Try to check if the specified object data contains author information (ie, the /Author, /Creator, 6254 /Producer, /ModDate, /CreationDate keywords) and sets the corresponding properties accordingly. 6255 6256 PARAMETERS 6257 $object_id (integer) - 6258 Object id of this text block. 6259 6260 $object_data (string) - 6261 Stream contents. 6262 6263 *-------------------------------------------------------------------------------------------------------------*/ 6264 protected function PeekAuthorInformation ( $object_id, $object_data ) 6265 { 6266 if ( ( strpos ( $object_data, '/Author' ) !== false || strpos ( $object_data, '/CreationDate' ) !== false ) ) 6267 { 6268 $this -> GotAuthorInformation = true ; 6269 return ( $object_id ) ; 6270 } 6271 else 6272 return ( false ) ; 6273 } 6274 6275 6276 /*-------------------------------------------------------------------------------------------------------------- 6277 6278 NAME 6279 RetrieveAuthorInformation - Extracts author information 6280 6281 PROTOTYPE 6282 $this -> RetriveAuthorInformation ( $object_id, $pdf_objects ) ; 6283 6284 DESCRIPTION 6285 Extracts the author information. Handles the case where flag values refer to existing objects. 6286 6287 PARAMETERS 6288 $object_id (integer) - 6289 Id of the object containing the author information. 6290 6291 $pdf_objects (array) - 6292 Array whose keys are the PDF object ids, and values their corresponding contents. 6293 6294 *-------------------------------------------------------------------------------------------------------------*/ 6295 protected function RetrieveAuthorInformation ( $object_id, $pdf_objects ) 6296 { 6297 static $re = '# 6298 (?P<info> 6299 / 6300 (?P<keyword> (Author) | (Creator) | (Producer) | (Title) | (CreationDate) | (ModDate) | (Keywords) | (Subject) ) 6301 \s* 6302 (?P<opening> [(<]) 6303 ) 6304 #imsx' ; 6305 static $object_re = '# 6306 (?P<info> 6307 / 6308 (?P<keyword> (Author) | (Creator) | (Producer) | (Title) | (CreationDate) | (ModDate) | (Keywords) | (Subject) ) 6309 \s* 6310 (?P<object_ref> 6311 (?P<object> \d+) 6312 \s+ 6313 \d+ 6314 \s+ 6315 R 6316 ) 6317 ) 6318 #imsx' ; 6319 6320 // Retrieve the object data corresponding to the specified object id 6321 $object_data = $pdf_objects [ $object_id ] ; 6322 6323 // Pre-process flags whose values refer to existing objects 6324 if ( preg_match_all ( $object_re, $object_data, $object_matches ) ) 6325 { 6326 $searches = array ( ) ; 6327 $replacements = array ( ) ; 6328 6329 for ( $i = 0, $count = count ( $object_matches [ 'keyword' ] ) ; $i < $count ; $i ++ ) 6330 { 6331 $searches [] = $object_matches [ 'object_ref' ] [$i] ; 6332 6333 // Some buggy PDF may reference author information objects that do not exist 6334 $replacements [] = isset ( $pdf_objects [ $object_matches [ 'object' ] [$i] ] ) ? 6335 trim ( $pdf_objects [ $object_matches [ 'object' ] [$i] ] ) : '' ; 6336 } 6337 6338 $object_data = str_replace ( $searches, $replacements, $object_data ) ; 6339 } 6340 6341 6342 // To execute faster, run the regular expression only if the object data contains a /Author keyword 6343 if ( preg_match_all ( $re, $object_data, $matches, PREG_OFFSET_CAPTURE ) ) 6344 { 6345 for ( $i = 0, $count = count ( $matches [ 'keyword' ] ) ; $i < $count ; $i ++ ) 6346 { 6347 $keyword = $matches [ 'keyword' ] [$i] [0] ; 6348 $opening = $matches [ 'opening' ] [$i] [0] ; 6349 $start_index = $matches [ 'info' ] [$i] [1] + strlen ( $matches [ 'info' ] [$i] [0] ) ; 6350 6351 // Text between parentheses : the text is written as is 6352 if ( $opening == '(' ) 6353 { 6354 $parent_level = 1 ; 6355 6356 // Since the parameter value can contain any character, including "\" or "(", we will have to find the real closing 6357 // parenthesis 6358 $value = '' ; 6359 6360 for ( $j = $start_index, $object_length = strlen ( $object_data ) ; $j < $object_length ; $j ++ ) 6361 { 6362 if ( $object_data [$j] == '\\' ) 6363 $value .= '\\' . $object_data [++$j] ; 6364 else if ( $object_data [$j] == '(' ) 6365 { 6366 $value .= '(' ; 6367 $parent_level ++ ; 6368 } 6369 else if ( $object_data [$j] == ')' ) 6370 { 6371 $parent_level -- ; 6372 6373 if ( ! $parent_level ) 6374 break ; 6375 else 6376 $value .= ')' ; 6377 } 6378 else 6379 $value .= $object_data [$j] ; 6380 } 6381 } 6382 // Text within angle brackets, written as hex digits 6383 else 6384 { 6385 $end_index = strpos ( $object_data, '>', $start_index ) ; 6386 $hexdigits = substr ( $object_data, $start_index, $end_index - $start_index ) ; 6387 $value = hex2bin ( str_replace ( array ( "\n", "\r", "\t" ), '', $hexdigits ) ) ; 6388 } 6389 6390 $value = $this -> __convert_utf16 ( $this -> __extract_chars_from_block ( $value ) ) ; 6391 6392 switch ( strtolower ( $keyword ) ) 6393 { 6394 case 'author' : $this -> Author = $value ; break ; 6395 case 'creator' : $this -> CreatorApplication = $value ; break ; 6396 case 'producer' : $this -> ProducerApplication = $value ; break ; 6397 case 'title' : $this -> Title = $value ; break ; 6398 case 'keywords' : $this -> Keywords = $value ; break ; 6399 case 'subject' : $this -> Subject = $value ; break ; 6400 case 'creationdate' : $this -> CreationDate = $this -> GetUTCDate ( $value ) ; break ; 6401 case 'moddate' : $this -> ModificationDate = $this -> GetUTCDate ( $value ) ; break ; 6402 } 6403 } 6404 6405 if ( self::$DEBUG ) 6406 { 6407 echo "\n----------------------------------- AUTHOR INFORMATION\n" ; 6408 echo ( "Author : " . $this -> Author . "\n" ) ; 6409 echo ( "Creator application : " . $this -> CreatorApplication . "\n" ) ; 6410 echo ( "Producer application : " . $this -> ProducerApplication . "\n" ) ; 6411 echo ( "Title : " . $this -> Title . "\n" ) ; 6412 echo ( "Subject : " . $this -> Subject . "\n" ) ; 6413 echo ( "Keywords : " . $this -> Keywords . "\n" ) ; 6414 echo ( "Creation date : " . $this -> CreationDate . "\n" ) ; 6415 echo ( "Modification date : " . $this -> ModificationDate . "\n" ) ; 6416 } 6417 } 6418 } 6419 6420 6421 /*-------------------------------------------------------------------------------------------------------------- 6422 6423 NAME 6424 RetrieveFormData - Retrieves raw form data 6425 6426 PROTOTYPE 6427 $this -> RetrieveFormData ( $object_id, $object_data ) ; 6428 6429 DESCRIPTION 6430 Retrieves raw form data (form definition and field values definition). 6431 6432 PARAMETERS 6433 $object_id (integer) - 6434 Id of the object containing the author information. 6435 6436 $object_data (string) - 6437 Object contents. 6438 6439 $pdf_objects (array) - 6440 Array whose keys are the PDF object ids, and values their corresponding contents. 6441 6442 NOTES 6443 This function only memorizes the contents of form data definitions. The actual data will be processed 6444 only if the GetFormData() function is called. 6445 6446 *-------------------------------------------------------------------------------------------------------------*/ 6447 protected function RetrieveFormData ( $object_id, $object_data, $pdf_objects ) 6448 { 6449 // Retrieve the object that contains the field values 6450 preg_match ( '#\b R \s* \( \s* datasets \s* \) \s* (?P<object> \d+) \s+ \d+ \s+ R#imsx', $object_data, $field_match ) ; 6451 $field_object = $field_match [ 'object' ] ; 6452 6453 if ( ! isset ( $pdf_objects [ $field_object ] ) ) 6454 { 6455 if ( self::$DEBUG ) 6456 warning ( "Field definitions object #$field_object not found in object #$object_id." ) ; 6457 6458 return ; 6459 } 6460 6461 // Retrieve the object that contains the form definition 6462 preg_match ( '#\b R \s* \( \s* form \s* \) \s* (?P<object> \d+) \s+ \d+ \s+ R#imsx', $object_data, $form_match ) ; 6463 $form_object = $form_match [ 'object' ] ; 6464 6465 if ( ! isset ( $pdf_objects [ $form_object ] ) ) 6466 { 6467 if ( self::$DEBUG ) 6468 warning ( "Form definitions object #$form_object not found in object #$object_id." ) ; 6469 6470 return ; 6471 } 6472 // Add this entry to form data information 6473 $this -> FormData [ $object_id ] = array 6474 ( 6475 'values' => ( integer ) $field_object, 6476 'form' => ( integer ) $form_object 6477 ) ; 6478 } 6479 6480 6481 } 6482 6483 6484/************************************************************************************************************** 6485 ************************************************************************************************************** 6486 ************************************************************************************************************** 6487 ****** ****** 6488 ****** ****** 6489 ****** FONT TABLE MANAGEMENT ****** 6490 ****** ****** 6491 ****** ****** 6492 ************************************************************************************************************** 6493 ************************************************************************************************************** 6494 **************************************************************************************************************/ 6495 6496/*============================================================================================================== 6497 6498 PdfTexterFontTable class - 6499 The PdfTexterFontTable class is not supposed to be used outside the context of the PdfToText class. 6500 Its purposes are to hold a list of font definitions taken from a pdf document, along with their 6501 associated character mapping tables, if any. 6502 This is why no provision has been made to design this class a a general purpose class ; its utility 6503 exists only in the scope of the PdfToText class. 6504 6505 ==============================================================================================================*/ 6506class PdfTexterFontTable extends PdfObjectBase 6507 { 6508 // Font table 6509 public $Fonts = array ( ) ; 6510 private $DefaultFont = false ; 6511 // Font mapping between a font number and an object number 6512 private $FontMap = array ( ) ; 6513 // A character map buffer is used to store results from previous calls to the MapCharacter() method of the 6514 // FontTable object. It dramatically reduces the number of calls needed, from one call for each character 6515 // defined in the pdf stream, to one call on each DISTINCT character defined in the PDF stream. 6516 // As an example, imagine a PDF file that contains 200K characters, but only 150 distinct ones. The 6517 // MapCharacter method will be called 150 times, instead of 200 000... 6518 private $CharacterMapBuffer = array ( ) ; 6519 6520 6521 // Constructor - 6522 // Well, does not do anything special 6523 public function __construct ( ) 6524 { 6525 parent::__construct ( ) ; 6526 } 6527 6528 6529 // Add - 6530 // Adds the current font declaration to the font table. Handles special cases where font id is not 6531 // given by the object id, but rather by <</Rx...>> constructs 6532 public function Add ( $object_id, $font_definition, $pdf_objects, $extra_mappings ) 6533 { 6534 if ( PdfToText::$DEBUG ) 6535 { 6536 echo "\n----------------------------------- FONT #$object_id\n" ; 6537 echo $font_definition ; 6538 } 6539 6540 $font_type = PdfTexterFont::FONT_ENCODING_STANDARD ; 6541 $cmap_id = 0 ; 6542 $secondary_cmap_id = 0 ; 6543 $font_variant = false ; 6544 6545 // Font resource id specification 6546 if ( preg_match ( '#<< \s* (?P<rscdefs> /R\d+ .*) >>#ix', $font_definition, $match ) ) 6547 { 6548 $resource_definitions = $match [ 'rscdefs' ] ; 6549 6550 preg_match_all ( '#/R (?P<font_id> \d+) #ix', $resource_definitions, $id_matches ) ; 6551 preg_match_all ( '#/ToUnicode \s* (?P<cmap_id> \d+)#ix', $resource_definitions, $cmap_matches ) ; 6552 6553 $count = count ( $id_matches [ 'font_id' ] ) ; 6554 6555 for ( $i = 0 ; $i < $count ; $i ++ ) 6556 { 6557 $font_id = $id_matches [ 'font_id' ] [$i] ; 6558 $cmap_id = $cmap_matches [ 'cmap_id' ] [$i] ; 6559 6560 $this -> Fonts [ $font_id ] = new PdfTexterFont ( $font_id, $cmap_id, PdfTexterFont::FONT_ENCODING_UNICODE_MAP, $extra_mappings ) ; 6561 } 6562 6563 return ; 6564 } 6565 // Experimental implementation of CID fonts 6566 else if ( preg_match ( '#/(Base)?Encoding \s* /Identity-H#ix', $font_definition ) ) 6567 { 6568 if ( preg_match ( '#/BaseFont \s* /(?P<font> [^\s/]+)#ix', $font_definition, $match ) ) 6569 $font_variant = $match [ 'font' ] ; 6570 6571 $font_type = PdfTexterFont::FONT_ENCODING_CID_IDENTITY_H ; 6572 } 6573 // Font has an associated Unicode map (using the /ToUnicode keyword) 6574 else if ( preg_match ( '#/ToUnicode \s* (?P<cmap> \d+)#ix', $font_definition, $match ) ) 6575 { 6576 $cmap_id = $match [ 'cmap' ] ; 6577 $font_type = PdfTexterFont::FONT_ENCODING_UNICODE_MAP ; 6578 6579 if ( preg_match ( '#/Encoding \s* (?P<cmap> \d+)#ix', $font_definition, $secondary_match ) ) 6580 $secondary_cmap_id = $secondary_match [ 'cmap' ] ; 6581 } 6582 // Font has an associated character map (using a cmap id) 6583 else if ( preg_match ( '#/Encoding \s* (?P<cmap> \d+) \s+ \d+ #ix', $font_definition, $match ) ) 6584 { 6585 $cmap_id = $match [ 'cmap' ] ; 6586 $font_type = PdfTexterFont::FONT_ENCODING_PDF_MAP ; 6587 } 6588 // Font uses the Windows Ansi encoding 6589 else if ( preg_match ( '#/(Base)?Encoding \s* /WinAnsiEncoding#ix', $font_definition ) ) 6590 { 6591 $font_type = PdfTexterFont::FONT_ENCODING_WINANSI ; 6592 6593 if ( preg_match ( '# /BaseFont \s* / [a-z0-9_]+ \+ [a-z0-9_]+? Cyr #imsx', $font_definition ) ) 6594 $font_type |= PdfTexterFont::FONT_VARIANT_ISO8859_5 ; 6595 } 6596 // Font uses the Mac Roman encoding 6597 else if ( preg_match ( '#/(Base)?Encoding \s* /MacRomanEncoding#ix', $font_definition ) ) 6598 $font_type = PdfTexterFont::FONT_ENCODING_MAC_ROMAN ; 6599 6600 $this -> Fonts [ $object_id ] = new PdfTexterFont ( $object_id, $cmap_id, $font_type, $secondary_cmap_id, $pdf_objects, $extra_mappings, $font_variant ) ; 6601 6602 // Arbitrarily set the default font to the first font encountered in the pdf file 6603 if ( $this -> DefaultFont === false ) 6604 { 6605 reset ( $this -> Fonts ) ; 6606 $this -> DefaultFont = key ( $this -> Fonts ) ; 6607 } 6608 } 6609 6610 6611 // AddFontMap - 6612 // Process things like : 6613 // <</F1 26 0 R/F2 22 0 R/F3 18 0 R>> 6614 // which maps font 1 (when specified with the /Fx instruction) to object 26, 6615 // 2 to object 22 and 3 to object 18, respectively, in the above example. 6616 // Found also a strange way of specifying a font mapping : 6617 // <</f-0-0 5 0 R etc. 6618 // And yet another one : 6619 // <</C0_0 5 0 R 6620 public function AddFontMap ( $object_id, $object_data ) 6621 { 6622 $object_data = self::UnescapeHexCharacters ( $object_data ) ; 6623 6624 // The same object can hold different notations for font associations 6625 if ( preg_match_all ( '# (?P<font> ' . self::$FontSpecifiers . ' ) \s+ (?P<object> \d+) #imsx', $object_data, $matches ) ) 6626 { 6627 for ( $i = 0, $count = count ( $matches [ 'font' ] ) ; $i < $count ; $i ++ ) 6628 { 6629 $font = $matches [ 'font' ] [$i] ; 6630 $object = $matches [ 'object' ] [$i] ; 6631 6632 $this -> FontMap [ $font ] = $object ; 6633 } 6634 } 6635 } 6636 6637 6638 // AddPageFontMap - 6639 // Adds font aliases to the current font map, in the form : "page:xobject:font". 6640 // The associated value is the font object itself. 6641 public function AddPageFontMap ( $map ) 6642 { 6643 foreach ( $map as $map_entry ) 6644 { 6645 $this -> FontMap [ $map_entry [ 'page' ] . ':' . $map_entry [ 'xobject-name' ] . ':' . $map_entry [ 'font-name' ] ] = $map_entry [ 'object' ] ; 6646 } 6647 } 6648 6649 6650 // AddCharacterMap - 6651 // Associates a character map to a font declaration that referenced it. 6652 public function AddCharacterMap ( $cmap ) 6653 { 6654 $status = false ; 6655 6656 // We loop through all fonts, since the same character map can be referenced by several font definitions 6657 foreach ( $this -> Fonts as $font ) 6658 { 6659 if ( $font -> CharacterMapId == $cmap -> ObjectId ) 6660 { 6661 $font -> CharacterMap = $cmap ; 6662 $status = true ; 6663 } 6664 else if ( $font -> SecondaryCharacterMapId == $cmap -> ObjectId ) 6665 { 6666 $cmap -> Secondary = true ; 6667 $font -> SecondaryCharacterMap = $cmap ; 6668 $status = true ; 6669 } 6670 } 6671 6672 return ( $status ) ; 6673 } 6674 6675 6676 // GetFontAttributes - 6677 // Gets the specified font width in hex digits and whether the font has a character map or not. 6678 public function GetFontAttributes ( $page_number, $template, $font, &$font_map_width, &$font_mapped ) 6679 { 6680 // Font considered as global to the document 6681 if ( isset ( $this -> Fonts [ $font ] ) ) 6682 $key = $font ; 6683 // Font not found : try to use the first one declared in the document 6684 else 6685 { 6686 reset ( $this -> Fonts ) ; 6687 $key = key ( $this -> Fonts ) ; 6688 } 6689 6690 // Font has an associated character map 6691 if ( $key && $this -> Fonts [ $key ] -> CharacterMap ) 6692 { 6693 $font_map_width = $this -> Fonts [ $key ] -> CharacterMap -> HexCharWidth ; 6694 $font_mapped = true ; 6695 6696 return ( true ) ; 6697 } 6698 // No character map : characters are specified as two hex digits 6699 else 6700 { 6701 $font_map_width = 2 ; 6702 $font_mapped = false ; 6703 6704 return ( false ) ; 6705 } 6706 } 6707 6708 6709 // GetFontByMapId - 6710 // Returns the font id (object id) associated with the specified mapped id. 6711 public function GetFontByMapId ( $page_number, $template, $id ) 6712 { 6713 if ( isset ( $this -> FontMap [ "$page_number:$template:$id" ] ) ) 6714 $font_object = $this -> FontMap [ "$page_number:$template:$id" ] ; 6715 else if ( isset ( $this -> FontMap [ $id ] ) ) 6716 $font_object = $this -> FontMap [ $id ] ; 6717 else 6718 $font_object = -1 ; 6719 6720 return ( $font_object ) ; 6721 } 6722 6723 6724 // GetFontObject - 6725 // Returns the PdfTexterFont object for the given page, template and font id (in the form of "/something") 6726 public function GetFontObject ( $page_number, $template, $id ) 6727 { 6728 if ( isset ( $this -> FontMap [ "$page_number:$template:$id" ] ) ) 6729 $font_object = $this -> FontMap [ "$page_number:$template:$id" ] ; 6730 else if ( isset ( $this -> FontMap [ $id ] ) ) 6731 $font_object = $this -> FontMap [ $id ] ; 6732 else 6733 return ( false ) ; 6734 6735 if ( isset ( $this -> Fonts [ $font_object ] ) ) 6736 return ( $this -> Fonts [ $font_object ] ) ; 6737 else 6738 return ( false ) ; 6739 } 6740 6741 6742 // MapCharacter - 6743 // Returns the character associated to the specified one. 6744 public function MapCharacter ( $font, $ch, $return_false_on_failure = false ) 6745 { 6746 if ( isset ( $this -> CharacterMapBuffer [ $font ] [ $ch ] ) ) 6747 return ( $this -> CharacterMapBuffer [ $font ] [ $ch ] ) ; 6748 6749 // Use the first declared font as the default font, if none defined 6750 if ( $font == -1 ) 6751 $font = $this -> DefaultFont ; 6752 6753 $cache = true ; 6754 6755 if ( isset ( $this -> Fonts [ $font ] ) ) 6756 { 6757 $font_object = $this -> Fonts [ $font ] ; 6758 6759 $code = $font_object -> MapCharacter ( $ch, $return_false_on_failure ) ; 6760 6761 if ( $font_object -> CharacterMap ) 6762 $cache = $font_object -> CharacterMap -> Cache ; 6763 } 6764 else 6765 { 6766 $code = $this -> CodePointToUtf8 ( $ch ) ; 6767 } 6768 6769 if ( $cache ) 6770 $this -> CharacterMapBuffer [ $font ] [ $ch ] = $code ; 6771 6772 return ( $code ) ; 6773 } 6774 } 6775 6776 6777/************************************************************************************************************** 6778 ************************************************************************************************************** 6779 ************************************************************************************************************** 6780 ****** ****** 6781 ****** ****** 6782 ****** FONT MANAGEMENT ****** 6783 ****** ****** 6784 ****** ****** 6785 ************************************************************************************************************** 6786 ************************************************************************************************************** 6787 **************************************************************************************************************/ 6788 6789/*============================================================================================================== 6790 6791 PdfTexterFont class - 6792 The PdfTexterFont class is not supposed to be used outside the context of the PdfToText class. 6793 It holds an optional character mapping table associted with this font. 6794 No provision has been made to design this class a a general purpose class ; its utility exists only in 6795 the scope of the PdfToText class. 6796 6797 ==============================================================================================================*/ 6798class PdfTexterFont extends PdfObjectBase 6799 { 6800 // Font encoding types, for fonts that are neither associated with a Unicode character map nor a PDF character map 6801 const FONT_ENCODING_STANDARD = 0 ; // No character map, use the standard character set 6802 const FONT_ENCODING_WINANSI = 1 ; // No character map, use the Windows Ansi character set 6803 const FONT_ENCODING_MAC_ROMAN = 2 ; // No character map, use the MAC OS Roman character set 6804 const FONT_ENCODING_UNICODE_MAP = 3 ; // Font has an associated unicode character map 6805 const FONT_ENCODING_PDF_MAP = 4 ; // Font has an associated PDF character map 6806 const FONT_ENCODING_CID_IDENTITY_H = 5 ; // CID font : IDENTITY-H 6807 6808 // Font variants 6809 const FONT_VARIANT_STANDARD = 0x0000 ; 6810 const FONT_VARIANT_ISO8859_5 = 0x1000 ; // Cyrillic 6811 6812 const FONT_VARIANT_MASK = 0xF000 ; 6813 const FONT_VARIANT_SHIFT = 12 ; 6814 6815 // Font resource id (may be an object id, overridden by <</Rx...>> constructs 6816 public $Id ; 6817 // Font type and variant 6818 public $FontType ; 6819 public $FontVariant ; 6820 // Character map id, specified by the /ToUnicode flag 6821 public $CharacterMapId ; 6822 // Secondary character map id, specified by the /Encoding flag and that can contain a /Differences flag 6823 public $SecondaryCharacterMapId ; 6824 // Optional character map, that may be set by the PdfToText::Load method just before processing text drawing blocks 6825 public $CharacterMap = null ; 6826 public $SecondaryCharacterMap = null ; 6827 // Character widths 6828 public $CharacterWidths = array ( ) ; 6829 // Default character width, if not present in the $CharacterWidths array 6830 public $DefaultWidth = 0 ; 6831 private $GotWidthInformation = false ; 6832 // A buffer for remembering character widths 6833 protected $CharacterWidthsBuffer = array ( ) ; 6834 6835 6836 // Constructor - 6837 // Builds a PdfTexterFont object, using its resource id and optional character map id. 6838 public function __construct ( $resource_id, $cmap_id, $font_type, $secondary_cmap_id = null, $pdf_objects = null, $extra_mappings = null, $font_variant = false ) 6839 { 6840 6841 parent::__construct ( ) ; 6842 6843 $this -> Id = $resource_id ; 6844 $this -> CharacterMapId = $cmap_id ; 6845 $this -> SecondaryCharacterMapId = $secondary_cmap_id ; 6846 $this -> FontType = $font_type & ~self::FONT_VARIANT_MASK ; 6847 $this -> FontVariant = ( $font_type >> self::FONT_VARIANT_SHIFT ) & 0x0F ; 6848 6849 // Instantiate the appropriate character map for this font 6850 switch ( $this -> FontType ) 6851 { 6852 case self::FONT_ENCODING_WINANSI : 6853 $this -> CharacterMap = new PdfTexterAdobeWinAnsiMap ( $resource_id, $this -> FontVariant ) ; 6854 break ; 6855 6856 case self::FONT_ENCODING_MAC_ROMAN : 6857 $this -> CharacterMap = new PdfTexterAdobeMacRomanMap ( $resource_id, $this -> FontVariant ) ; 6858 break ; 6859 6860 case self::FONT_ENCODING_CID_IDENTITY_H : 6861 $this -> CharacterMap = new PdfTexterIdentityHCIDMap ( $resource_id, $font_variant ) ; 6862 break ; 6863 6864 case self::FONT_ENCODING_PDF_MAP : 6865 $this -> CharacterMap = new PdfTexterEncodingMap ( $cmap_id, $pdf_objects [ $cmap_id ], $extra_mappings ) ; 6866 break ; 6867 6868 case self::FONT_ENCODING_UNICODE_MAP : 6869 break ; 6870 6871 case self::FONT_ENCODING_STANDARD : 6872 break ; 6873 6874 default : 6875 if ( PdfToText::$DEBUG ) 6876 warning ( "Unknown font type #$font_type found for object #$resource_id, character map #$cmap_id." ) ; 6877 } 6878 6879 // Get font data ; include font descriptor information if present 6880 $font_data = $pdf_objects [ $resource_id ] ; 6881 6882 if ( preg_match ( '/FontDescriptor \s+ (?P<id> \d+) \s+ \d+ \s+ R/imsx', $font_data, $match ) ) 6883 { 6884 $descriptor_id = $match [ 'id' ] ; 6885 6886 // Don't care about searching this in that object, or that in this object - simply catenate the font descriptor 6887 // with the font definition 6888 if ( isset ( $pdf_objects [ $descriptor_id ] ) ) 6889 $font_data .= $pdf_objects [ $descriptor_id ] ; 6890 } 6891 6892 // Type1 fonts belong to the Adobe 14 standard fonts available. Information about the character widths is never embedded in the PDF 6893 // file, but must be taken from external data (in the FontMetrics directory). 6894 if ( preg_match ( '#/SubType \s* /Type1#ix', $font_data ) ) 6895 { 6896 preg_match ( '#/BaseFont \s* / ([\w]+ \+)? (?P<font> [^\s\[</]+)#ix', $font_data, $match ) ; 6897 $font_name = $match [ 'font' ] ; 6898 $lc_font_name = strtolower ( $font_name ) ; 6899 6900 // Do that only if a font metrics file exists... 6901 if ( isset ( PdfToText::$AdobeStandardFontMetrics [ $lc_font_name ] ) ) 6902 { 6903 $metrics_file = PdfToText::$FontMetricsDirectory . '/' . PdfToText::$AdobeStandardFontMetrics [ $lc_font_name ] ; 6904 6905 if ( file_exists ( $metrics_file ) ) 6906 { 6907 include ( $metrics_file ) ; 6908 6909 if ( isset ( $charwidths ) ) 6910 { 6911 // Build the CharacterWidths table 6912 foreach ( $charwidths as $char => $width ) 6913 $this -> CharacterWidths [ chr ( $char ) ] = ( double ) $width ; 6914 6915 $this -> GotWidthInformation = true ; 6916 } 6917 } 6918 } 6919 } 6920 6921 // Retrieve the character widths for this font. This means : 6922 // - Retrieving the /FirstChar, /LastChar and /Widths entries from the font definition. /Widths is an array of individual character 6923 // widths, between the /FirstChar and /LastChar entries. A value of zero in this array means "Use the default width"... 6924 // - ... which is given by the /MissingWidth parameter, normally put in the font descriptor whose object id is given by the 6925 // /FontDescriptor entry of the font definition 6926 // Well, to be considered, given the number of buggy PDFs around the world, we won't care about the /LastChar entry and we won't 6927 // check whether the /Widths array contains (LastChar - FirstChar + 1) integer values... 6928 // Get the entries 6929 $first_char = false ; 6930 $widths = false ; 6931 $missing_width = false ; 6932 6933 if ( preg_match ( '#/FirstChar \s+ (?P<char> \d+)#imsx', $font_data, $match ) ) 6934 $first_char = $match [ 'char' ] ; 6935 6936 if ( preg_match ( '#/Widths \s* \[ (?P<widths> [^\]]+) \]#imsx', $font_data, $match ) ) 6937 $widths = $match [ 'widths' ] ; 6938 6939 if ( preg_match ( '#/MissingWidth \s+ (?P<missing> \d+)#imsx', $font_data, $match ) ) 6940 $missing_width = $match [ 'missing' ] ; 6941 6942 // It would not make sense if one of the two entries /FirstChar and /Widths was missing 6943 // So ensure they are all there (note that /MissingWidths can be absent) 6944 if ( $first_char !== false && $widths ) 6945 { 6946 if ( $missing_width !== false ) 6947 $this -> DefaultWidth = ( double ) $missing_width ; 6948 6949 // Here comes a really tricky part : 6950 // - The PDF file can contain CharProcs (example names : /a0, /a1, etc.) for which we have no 6951 // Unicode equivalent 6952 // - The caller may have called the AddAdobeExtraMappings method, to providing a mapping between 6953 // those char codes (/a0, /a1, etc.) and a Unicode equivalent 6954 // - Each "charproc" listed in the /Differences array as a specific code, such as : 6955 // [0/a1/a2/a3...] 6956 // which maps /a1 to code 0, /a2 to code 1, and so on 6957 // - However, the GetStringWidth() method provides real Unicode characters 6958 // Consequently, we have to map each CharProc character (/a1, /a2, etc.) to the Unicode value 6959 // that may have been specified using the AddAdobeExtraMappings() method. 6960 // The first step below collects the name list of CharProcs. 6961 $charprocs = false ; 6962 6963 if ( isset ( $this -> CharacterMap -> Encodings ) && 6964 preg_match ( '# /CharProcs \s* << (?P<list> .*?) >>#imsx', $font_data, $match ) ) 6965 { 6966 preg_match_all ( '#/ (?P<char> \w+) \s+ \d+ \s+ \d+ \s+ R#msx', $match [ 'list' ], $char_matches ) ; 6967 6968 $charprocs = array_flip ( $char_matches [ 'char' ] ) ; 6969 } 6970 6971 // The /FontMatrix entry defines the scaling to be used for the character widths (among other things) 6972 if ( preg_match ( '#/FontMatrix \s* \[ \s* (?P<multiplier> \d+)#imsx', $font_data, $match ) ) 6973 $multiplier = 1000 * ( double ) $match [ 'multiplier' ] ; 6974 else 6975 $multiplier = 1 ; 6976 6977 $widths = trim ( preg_replace ( '/\s+/', ' ', $widths ) ) ; 6978 $widths = explode ( ' ', $widths ) ; 6979 6980 for ( $i = 0, $count = count ( $widths ) ; $i < $count ; $i ++ ) 6981 { 6982 $value = ( double ) trim ( $widths [$i] ) ; 6983 $chr_index = $first_char + $i ; 6984 6985 // Tricky thing part 2 : 6986 if ( $charprocs ) 6987 { 6988 // If one of the CharProc characters is listed in the /Differences array then... 6989 if ( isset ( $this -> CharacterMap -> DifferencesByPosition [ $chr_index ] ) ) 6990 { 6991 $chname = $this -> CharacterMap -> DifferencesByPosition [ $chr_index ] ; 6992 6993 // ... if this CharProcs character is defined in the encoding table (possibly because 6994 // it was complemeted through a call to the AddAdobeExtraMappings() method), then we 6995 // will use its Unicode counterpart instead of the character ID coming from the 6996 // /Differences array) 6997 if ( isset ( $charprocs [ $chname ] ) && isset ( $this -> CharacterMap -> Encodings [ $chname ] ) ) 6998 $chr_index = $this -> CharacterMap -> Encodings [ $chname ] [2] ; 6999 } 7000 } 7001 7002 $this -> CharacterWidths [ chr ( $chr_index ) ] = ( $value ) ? ( $value * $multiplier ) : $this -> DefaultWidth ; 7003 } 7004 7005 $this -> GotWidthInformation = true ; 7006 } 7007 } 7008 7009 7010 // MapCharacter - 7011 // Returns the substitution string value for the specified character, if the current font has an 7012 // associated character map, or the original character encoded in utf8, if not. 7013 public function MapCharacter ( $ch, $return_false_on_failure = false ) 7014 { 7015 if ( $this -> CharacterMap ) 7016 { 7017 // Character is defined in the character map ; check if it has been overridden by a /Differences array in 7018 // a secondary character map 7019 if ( isset ( $this -> CharacterMap [ $ch ] ) ) 7020 { 7021 // Since a /ToUnicode map can have an associated /Encoding map with a /Differences list, this is the right place 7022 // to perform the translation (ie, the final Unicode codepoint is impacted by the /Differences list) 7023 if ( ! $this -> SecondaryCharacterMap ) // Most common case first ! 7024 { 7025 $code = $this -> CharacterMap [ $ch ] ; 7026 } 7027 else 7028 { 7029 if ( isset ( $this -> SecondaryCharacterMap [ $ch ] ) ) 7030 $code = $this -> SecondaryCharacterMap [ $ch ] ; 7031 else 7032 $code = $this -> CharacterMap [ $ch ] ; 7033 } 7034 7035 return ( $code ) ; 7036 } 7037 // On the contrary, the character may not be defined in the main character map but may exist in the secondary cmap 7038 else if ( $this -> SecondaryCharacterMap && isset ( $this -> SecondaryCharacterMap [ $ch ] ) ) 7039 { 7040 $code = $this -> SecondaryCharacterMap [ $ch ] ; 7041 7042 return ( $code ) ; 7043 } 7044 } 7045 7046 if ( $return_false_on_failure ) 7047 return ( false ) ; 7048 7049 return ( $this -> CodePointToUtf8 ( $ch ) ) ; 7050 } 7051 7052 7053 /*-------------------------------------------------------------------------------------------------------------- 7054 7055 NAME 7056 GetStringWidth - Returns the length of a string, in 1/100 of points 7057 7058 PROTOTYPE 7059 $width = $font -> GetStringWidth ( $text, $extra_percent ) ; 7060 7061 DESCRIPTION 7062 Returns the length of a string, in 1/100 of points. 7063 7064 PARAMETERS 7065 $text (string) - 7066 String whose length is to be measured. 7067 7068 $extra_percent (double) - 7069 Extra percentage to be added to the computed width. 7070 7071 RETURN VALUE 7072 Returns the length of the specified string in 1/1000 of text points, or 0 if the font does not 7073 contain any character width information. 7074 7075 *-------------------------------------------------------------------------------------------------------------*/ 7076 public function GetStringWidth ( $text, $extra_percent ) 7077 { 7078 // No width information 7079 if ( ! $this -> GotWidthInformation ) 7080 return ( false ) ; 7081 7082 $width = 0 ; 7083 7084 // Compute the width of each individual character - use a character width buffer to avoid 7085 // repeating the same tests again and again for characters whose width has already been processed 7086 for ( $i = 0, $length = strlen ( $text ) ; $i < $length ; $i ++ ) 7087 { 7088 $ch = $text [$i] ; 7089 7090 // Character already in the Widths buffer - Simply retrieve its value 7091 if ( isset ( $this -> CharacterWidthsBuffer [ $ch ] ) ) 7092 { 7093 $width += $this -> CharacterWidthsBuffer [ $ch ] ; 7094 } 7095 // New character - The width comes either from the CharacterWidths array if an entry is defined 7096 // for this character, or from the default width property. 7097 else 7098 { 7099 if ( isset ( $this -> CharacterWidths [ $ch ] ) ) 7100 { 7101 $width += $this -> CharacterWidths [ $ch ] ; 7102 $this -> CharacterWidthsBuffer [ $ch ] = $this -> CharacterWidths [ $ch ] ; 7103 } 7104 else 7105 { 7106 $width += $this -> DefaultWidth ; 7107 $this -> CharacterWidthsBuffer [ $ch ] = $this -> DefaultWidth ; 7108 } 7109 } 7110 } 7111 7112 // The computed width is actually longer/smaller than its actual width. Adjust by the percentage specified 7113 // by the ExtraTextWidth property 7114 $divisor = 100 - $extra_percent ; 7115 7116 if ( $divisor < 50 ) // Arbitrarily fix a limit 7117 $divisor = 50 ; 7118 7119 // All done, return 7120 return ( $width / $divisor ) ; 7121 } 7122 } 7123 7124 7125/*============================================================================================================== 7126 7127 PdfTexterCharacterMap - 7128 The PdfTexterFont class is not supposed to be used outside the context of the PdfToText class. 7129 Describes a character map. 7130 No provision has been made to design this class a a general purpose class ; its utility exists only in 7131 the scope of the PdfToText class. 7132 7133 ==============================================================================================================*/ 7134abstract class PdfTexterCharacterMap extends PdfObjectBase 7135 implements ArrayAccess, Countable 7136 { 7137 // Object id of the character map 7138 public $ObjectId ; 7139 // Number of hex digits in a character represented in hexadecimal notation 7140 public $HexCharWidth ; 7141 // Set to true if the values returned by the array access operator can safely be cached 7142 public $Cache = false ; 7143 7144 7145 7146 public function __construct ( $object_id ) 7147 { 7148 parent::__construct ( ) ; 7149 $this -> ObjectId = $object_id ; 7150 } 7151 7152 7153 /*-------------------------------------------------------------------------------------------------------------- 7154 7155 CreateInstance - 7156 Creates a PdfTexterCharacterMap instance of the correct type. 7157 7158 *-------------------------------------------------------------------------------------------------------------*/ 7159 public static function CreateInstance ( $object_id, $definitions, $extra_mappings ) 7160 { 7161 if ( preg_match ( '# (begincmap) | (beginbfchar) | (beginbfrange) #ix', $definitions ) ) 7162 return ( new PdfTexterUnicodeMap ( $object_id, $definitions ) ) ; 7163 else if ( stripos ( $definitions, '/Differences' ) !== false ) 7164 return ( new PdfTexterEncodingMap ( $object_id, $definitions, $extra_mappings ) ) ; 7165 else 7166 return ( false ) ; 7167 } 7168 7169 7170 7171 /*-------------------------------------------------------------------------------------------------------------- 7172 7173 Interface implementations. 7174 7175 *-------------------------------------------------------------------------------------------------------------*/ 7176 public function offsetSet ( $offset, $value ) 7177 { error ( new PdfToTextDecodingException ( "Unsupported operation." ) ) ; } 7178 7179 public function offsetUnset ( $offset ) 7180 { error ( new PdfToTextDecodingException ( "Unsupported operation." ) ) ; } 7181 } 7182 7183 7184 7185/*============================================================================================================== 7186 7187 PdfTexterUnicodeMap - 7188 A class for fonts having a character map specified with the /ToUnicode parameter. 7189 7190 ==============================================================================================================*/ 7191class PdfTexterUnicodeMap extends PdfTexterCharacterMap 7192 { 7193 // Id of the character map (specified by the /Rx flag) 7194 public $Id ; 7195 // Character substitution table, using the beginbfrange/endbfrange notation 7196 // Only constructs of the form : 7197 // <low> <high> <start> 7198 // are stored in this table. Constructs of the form : 7199 // <x> <y> [ <subst_x> <subst_x+1> ... <subst_y> ] 7200 // are stored in the $DirectMap array, because it is conceptually the same thing in the end as a character substitution being 7201 // defined with the beginbfchar/endbfchar construct. 7202 // Note that a dichotomic search in $RangeMap will be performed for each character reference not yet seen in the pdf flow. 7203 // Once the substitution character has been found, it will be added to the $DirectMap array for later faster access. 7204 // The reason for this optimization is that some pdf files can contain beginbfrange/endbfrange constructs that may seem useless, 7205 // except for validation purposes (ie, validating the fact that a character reference really belongs to the character map). 7206 // However, such constructs can lead to thousands of character substitutions ; consider the following example, that comes 7207 // from a sample I received : 7208 // beginbfrange 7209 // <1000> <1FFFF> <1000> 7210 // <2000> <2FFFF> <2000> 7211 // ... 7212 // <A000> <AFFFF> <A0000> 7213 // ... 7214 // endbfrange 7215 // By naively storing a one-to-one character relationship in an associative array, such as : 7216 // $array [ 0x1000 ] = 0x1000 ; 7217 // $array [ 0x1001 ] = 0x1001 ; 7218 // .. 7219 // $array [ 0x1FFF ] = 0x1FFF ; 7220 // etc. 7221 // you may arrive to a situation where the array becomes so big that it exhausts all of the available memory. 7222 // This is why the ranges are stored as is and a dichotomic search is performed to go faster. 7223 // Since it is useless to use this method to search the same character twice, when it has been found once, the 7224 // substitution pair will be put in the $DirectMap array for subsequent accesses (there is little probability that a PDF 7225 // file contains so much different characters, unless you are processing the whole Unicode table itself ! - but in this 7226 // case, you will simply have to adjust the value of the memory_limit setting in your php.ini file. Consider that I am 7227 // not a magician...). 7228 protected $RangeMap = array ( ) ; 7229 private $RangeCount = 0 ; // Avoid unnecessary calls to the count() function 7230 private $RangeMin = PHP_INT_MAX, // Min and max values of the character ranges 7231 $RangeMax = -1 ; 7232 // Character substitution table for tables using the beginbfchar notation 7233 protected $DirectMap = array ( ) ; 7234 7235 7236 // Constructor - 7237 // Analyzes the text contents of a CMAP and extracts mappings from the beginbfchar/endbfchar and 7238 // beginbfrange/endbfrange constructs. 7239 public function __construct ( $object_id, $definitions ) 7240 { 7241 parent::__construct ( $object_id ) ; 7242 7243 if ( PdfToText::$DEBUG ) 7244 { 7245 echo "\n----------------------------------- UNICODE CMAP #$object_id\n" ; 7246 echo $definitions; 7247 } 7248 7249 // Retrieve the cmap id, if any 7250 preg_match ( '# /CMapName \s* /R (?P<num> \d+) #ix', $definitions, $match ) ; 7251 $this -> Id = isset ( $match [ 'num' ] ) ? $match [ 'num' ] : -1 ; 7252 7253 // Get the codespace range, which will give us the width of a character specified in hexadecimal notation 7254 preg_match ( '# begincodespacerange \s+ <\s* (?P<low> [0-9a-f]+) \s*> \s* <\s* (?P<high> [0-9a-f]+) \s*> \s*endcodespacerange #ix', $definitions, $match ) ; 7255 7256 if ( isset ( $match [ 'low' ] ) ) 7257 $this -> HexCharWidth = max ( strlen ( $match [ 'low' ] ), strlen ( $match [ 'high' ] ) ) ; 7258 else 7259 $this -> HexCharWidth = 0 ; 7260 7261 $max_found_char_width = 0 ; 7262 7263 // Process beginbfchar/endbfchar constructs 7264 if ( preg_match_all ( '/ beginbfchar \s* (?P<chars> .*?) endbfchar /imsx', $definitions, $char_matches ) ) 7265 { 7266 foreach ( $char_matches [ 'chars' ] as $char_list ) 7267 { 7268 // beginbfchar / endbfchar constructs can behave as a kind of beginfbfrange/endbfrange ; example : 7269 // <21> <0009 0020 000d> 7270 // means : 7271 // . Map character #21 to #0009 7272 // . Map character #22 to #0020 7273 // . Map character #23 to #000D 7274 // There is no clue in the Adobe PDF specification that a single character could be mapped to a range. 7275 // The normal constructs would be : 7276 // <21> <0009> 7277 // <22> <0020> 7278 // <23> <0000D> 7279 preg_match_all ( '/< \s* (?P<item> .*?) \s* >/msx', $char_list, $item_matches ) ; 7280 7281 for ( $i = 0, $item_count = count ( $item_matches [ 'item' ] ) ; $i < $item_count ; $i += 2 ) 7282 { 7283 $char = hexdec ( $item_matches [ 'item' ] [$i] ) ; 7284 $char_width = strlen ( $item_matches [ 'item' ] [$i] ) ; 7285 $map = explode ( ' ', preg_replace ( '/\s+/', ' ', $item_matches [ 'item' ] [ $i + 1 ] ) ) ; 7286 7287 if ( $char_width > $max_found_char_width ) 7288 $max_found_char_width = $char_width ; 7289 7290 for ( $j = 0, $map_count = count ( $map ) ; $j < $map_count ; $j ++ ) 7291 { 7292 $subst = hexdec ( $map [$j] ) ; 7293 7294 // Check for this very special, not really document feature which maps CIDs to a non-existing Unicode character 7295 // (but it still corresponds to something...) 7296 if ( isset ( PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ) ) 7297 $subst = PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ; 7298 7299 $this -> DirectMap [ $char + $j ] = $subst ; 7300 } 7301 } 7302 7303 } 7304 } 7305 7306 // Process beginbfrange/endbfrange constructs 7307 if ( preg_match_all ( '/ beginbfrange \s* (?P<ranges> .*?) endbfrange /imsx', $definitions, $range_matches ) ) 7308 { 7309 foreach ( $range_matches [ 'ranges' ] as $range_list ) 7310 { 7311 $start_index = 0 ; 7312 7313 // There are two forms of syntax in a beginbfrange..endbfrange construct 7314 // 1) "<x> <y> <z>", which maps character ids x through y to z through (z+y-x) 7315 // 2) "<x> <y> [<a1> <a2> ... <an>]", which maps character x to a1, x+1 to a2, up to y, which is mapped to an 7316 // All the values are hex digits. 7317 // We will loop through the range definitions by first identifying the <x> and <y>, and the character that follows 7318 // them, which is either a "<" for notation 1), or a "[" for notation 2). 7319 while ( preg_match ( '# < \s* (?P<from> [0-9a-f]+) \s* > \s* < \s* (?P<to> [0-9a-f]+) \s* > \s* (?P<nextchar> .) #imsx', 7320 $range_list, $range_match, PREG_OFFSET_CAPTURE, $start_index ) ) 7321 { 7322 $from = hexdec ( $range_match [ 'from' ] [0] ) ; 7323 $to = hexdec ( $range_match [ 'to' ] [0] ) ; 7324 $next_char = $range_match [ 'nextchar' ] [0] ; 7325 $next_char_index = $range_match [ 'nextchar' ] [1] ; 7326 $char_width = strlen ( $range_match [ 'from' ] [0] ) ; 7327 7328 if ( $char_width > $max_found_char_width ) 7329 $max_found_char_width = $char_width ; 7330 7331 // Form 1) : catch the third hex value after <x> and <y> 7332 if ( $next_char == '<' ) 7333 { 7334 if ( preg_match ( '/ \s* (?P<start> [0-9a-f]+) (?P<tail> \s* > \s*) /imsx', $range_list, $start_match, PREG_OFFSET_CAPTURE, $next_char_index + 1 ) ) 7335 { 7336 $subst = hexdec ( $start_match [ 'start' ] [0] ) ; 7337 7338 // Check for this very special, not really document feature which maps CIDs to a non-existing Unicode character 7339 // (but it still corresponds to something...) 7340 if ( isset ( PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ) ) 7341 $subst = PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ; 7342 7343 // Don't create a range if <x> and <y> are the same 7344 if ( $from != $to ) 7345 { 7346 $this -> RangeMap [] = array ( $from, $to, $subst ) ; 7347 7348 // Adjust min and max values for the ranges stored in this character map - to avoid unnecessary testing 7349 if ( $from < $this -> RangeMin ) 7350 $this -> RangeMin = $from ; 7351 7352 if ( $to > $this -> RangeMax ) 7353 $this -> RangeMax = $to ; 7354 } 7355 else 7356 $this -> DirectMap [ $from ] = $subst ; 7357 7358 $start_index = $start_match [ 'tail' ] [1] + 1 ; 7359 } 7360 else 7361 error ( "Character range $from..$to not followed by an hexadecimal value in Unicode map #$object_id." ) ; 7362 } 7363 // Form 2) : catch all the hex values between square brackets after <x> and <y> 7364 else if ( $next_char == '[' ) 7365 { 7366 if ( preg_match ( '/ (?P<values> [\s<>0-9a-f]+ ) (?P<tail> \] \s*)/imsx', $range_list, $array_match, PREG_OFFSET_CAPTURE, $next_char_index + 1 ) ) 7367 { 7368 preg_match_all ( '/ < \s* (?P<num> [0-9a-f]+) \s* > /imsx', $array_match [ 'values' ] [0], $array_values ) ; 7369 7370 for ( $i = $from, $count = 0 ; $i <= $to ; $i ++, $count ++ ) 7371 $this -> DirectMap [$i] = hexdec ( $array_values [ 'num' ] [ $count ] ) ; 7372 7373 $start_index = $array_match [ 'tail' ] [1] + 1 ; 7374 } 7375 else 7376 error ( "Character range $from..$to not followed by an array of hexadecimal values in Unicode map #$object_id." ) ; 7377 } 7378 else 7379 { 7380 error ( "Unexpected character '$next_char' in Unicode map #$object_id." ) ; 7381 $start_index = $range_match [ 'nextchar' ] [1] + 1 ; 7382 } 7383 } 7384 } 7385 7386 // Sort the ranges by their starting offsets 7387 $this -> RangeCount = count ( $this -> RangeMap ) ; 7388 7389 if ( $this -> RangeCount > 1 ) 7390 { 7391 usort ( $this -> RangeMap, array ( $this, '__rangemap_cmpfunc' ) ) ; 7392 } 7393 } 7394 7395 if ( $max_found_char_width && $max_found_char_width != $this -> HexCharWidth ) 7396 { 7397 if ( PdfToText::$DEBUG ) 7398 warning ( "Character map #$object_id : specified code width ({$this -> HexCharWidth}) differs from actual width ($max_found_char_width)." ) ; 7399 7400 $this -> HexCharWidth = $max_found_char_width ; 7401 } 7402 } 7403 7404 7405 public function __rangemap_cmpfunc ( $a, $b ) 7406 { return ( $a [0] - $b [0] ) ; } 7407 7408 7409 /*-------------------------------------------------------------------------------------------------------------- 7410 7411 Interface implementations. 7412 7413 *-------------------------------------------------------------------------------------------------------------*/ 7414 public function count ( ) 7415 { return ( count ( $this -> DirectMap ) ) ; } 7416 7417 7418 public function offsetExists ( $offset ) 7419 { return ( $this -> offsetGetSafe ( $offset ) !== false ) ; } 7420 7421 7422 public function offsetGetSafe ( $offset, $translate = true ) 7423 { 7424 // Return value 7425 $code = false ; 7426 7427 // Character already has an entry (character reference => subtituted character) 7428 if ( isset ( $this -> DirectMap [ $offset ] ) ) 7429 { 7430 $code = ( $translate ) ? $this -> CodePointToUtf8 ( $this -> DirectMap [ $offset ] ) : $this -> DirectMap [ $offset ] ; 7431 } 7432 // Character does not has a direct entry ; have a look in the character ranges defined for this map 7433 else if ( $this -> RangeCount && $offset >= $this -> RangeMin && $offset <= $this -> RangeMax ) 7434 { 7435 $low = 0 ; 7436 $high = count ( $this -> RangeMap ) - 1 ; 7437 $result = false ; 7438 7439 // Use a dichotomic search through character ranges 7440 while ( $low <= $high ) 7441 { 7442 $middle = ( $low + $high ) >> 1 ; 7443 7444 if ( $offset < $this -> RangeMap [ $middle ] [0] ) 7445 $high = $middle - 1 ; 7446 else if ( $offset > $this -> RangeMap [ $middle ] [1] ) 7447 $low = $middle + 1 ; 7448 else 7449 { 7450 $result = $this -> RangeMap [ $middle ] [2] + $offset - $this -> RangeMap [ $middle ] [0] ; 7451 break ; 7452 } 7453 } 7454 7455 // Once a character has been found in the ranges defined by this character map, store it in the DirectMap property 7456 // so that it will be directly retrieved during subsequent accesses 7457 if ( $result !== false ) 7458 { 7459 $code = ( $translate ) ? $this -> CodePointToUtf8 ( $result ) : $result ; 7460 $this -> DirectMap [ $offset ] = $result ; 7461 } 7462 } 7463 7464 // All done, return 7465 return ( $code ) ; 7466 } 7467 7468 7469 public function offsetGet ( $offset ) 7470 { 7471 $code = $this -> offsetGetSafe ( $offset ) ; 7472 7473 if ( $code === false ) 7474 $code = $this -> CodePointToUtf8 ( $offset ) ; 7475 7476 return ( $code ) ; 7477 } 7478 } 7479 7480 7481/*============================================================================================================== 7482 7483 PdfTexterEncodingMap - 7484 A class for fonts having a character map specified with the /Encoding parameter. 7485 7486 ==============================================================================================================*/ 7487class PdfTexterEncodingMap extends PdfTexterCharacterMap 7488 { 7489 // Possible encodings (there is a 5th one, MacExpertEncoding, but used for "expert fonts" ; no need to deal 7490 // with it here since we only want to extract text) 7491 // Note that the values of these constants are direct indices to the second dimension of the $Encodings table 7492 const PDF_STANDARD_ENCODING = 0 ; 7493 const PDF_MAC_ROMAN_ENCODING = 1 ; 7494 const PDF_WIN_ANSI_ENCODING = 2 ; 7495 const PDF_DOC_ENCODING = 3 ; 7496 7497 // Correspondance between an encoding name and its corresponding character in the 7498 // following format : Standard, Mac, Windows, Pdf 7499 private static $GlobalEncodings = false ; 7500 public $Encodings ; 7501 // Encoding type (one of the PDF_*_ENCODING constants) 7502 public $Encoding ; 7503 // Indicates whether this character map is a secondary one used for Unicode maps ; this must be set at 7504 // a higher level by the PdfTexterFont because at the time a character map is instantiated, we do not know 7505 // yet whether it will be a primary (normal) map, or a map secondary to an existing Unicode map 7506 public $Secondary ; 7507 // Differences array (a character substitution table to the standard encodings) 7508 public $Map = array ( ) ; 7509 // A secondary map for the Differences array, which only contains the differences ; this is used 7510 // for Unicode fonts that also have an associated /Differences parameter, which should not include the 7511 // whole standard Adobe character map but only the differences of encodings 7512 public $SecondaryMap = array ( ) ; 7513 // Differences by position number 7514 public $DifferencesByPosition = array ( ) ; 7515 7516 7517 // Constructor - 7518 // Analyzes the text contents of a CMAP and extracts mappings from the beginbfchar/endbfchar and 7519 // beginbfrange/endbfrange constructs. 7520 public function __construct ( $object_id, $definitions, $extra_mappings ) 7521 { 7522 // Ignore character variants whose names end with these suffixes 7523 static $IgnoredVariants = array 7524 ( 7525 '/\.scalt$/', 7526 '/\.sc$/', 7527 '/\.fitted$/', 7528 '/\.oldstyle$/', 7529 '/\.taboldstyle$/', 7530 '/\.alt$/', 7531 '/alt$/', 7532 ) ; 7533 7534 parent::__construct ( $object_id ) ; 7535 7536 // Load the default Adobe character sets, if not already done 7537 if ( self::$GlobalEncodings === false ) 7538 { 7539 $charset_file = dirname ( __FILE__ ) . '/Maps/adobe-charsets.map' ; 7540 include ( $charset_file ) ; 7541 self::$GlobalEncodings = ( isset ( $adobe_charsets ) ) ? $adobe_charsets : array ( ) ; 7542 } 7543 7544 $this -> Encodings = array_merge ( self::$GlobalEncodings, $extra_mappings ) ; 7545 7546 // Fonts using default Adobe character sets and hexadecimal representations are one-byte long 7547 $this -> HexCharWidth = 2 ; 7548 7549 if ( PdfToText::$DEBUG ) 7550 { 7551 echo "\n----------------------------------- ENCODING CMAP #$object_id\n" ; 7552 echo $definitions; 7553 } 7554 7555 // Retrieve text encoding 7556 preg_match ( '# / (?P<encoding> (WinAnsiEncoding) | (PDFDocEncoding) | (MacRomanEncoding) | (StandardEncoding) ) #ix', 7557 $definitions, $encoding_match ) ; 7558 7559 if ( ! isset ( $encoding_match [ 'encoding' ] ) ) 7560 $encoding_match [ 'encoding' ] = 'WinAnsiEncoding' ; 7561 7562 switch ( strtolower ( $encoding_match [ 'encoding' ] ) ) 7563 { 7564 case 'pdfdocencoding' : $this -> Encoding = self::PDF_DOC_ENCODING ; break ; 7565 case 'macromanencoding' : $this -> Encoding = self::PDF_MAC_ROMAN_ENCODING ; break ; 7566 case 'standardencoding' : $this -> Encoding = self::PDF_STANDARD_ENCODING ; break ; 7567 case 'winansiencoding' : 7568 default : $this -> Encoding = self::PDF_WIN_ANSI_ENCODING ; 7569 } 7570 7571 // Build a virgin character map using the detected encoding 7572 foreach ( $this -> Encodings as $code_array ) 7573 { 7574 $char = $code_array [ $this -> Encoding ] ; 7575 $this -> Map [ $char ] = $char ; 7576 } 7577 7578 // Extract the Differences array 7579 preg_match ( '/ \[ \s* (?P<contents> [^\]]*?) \s* \] /x', $definitions, $match ) ; 7580 7581 if ( ! isset ( $match [ 'contents' ] ) ) 7582 return ; 7583 7584 $data = trim ( preg_replace ( '/\s+(\d+)/', '/$1', $match [ 'contents' ] ) ) ; 7585 $items = explode ( '/', $data ) ; 7586 $index = 0 ; 7587 7588 for ( $i = 0, $item_count = count ( $items ) ; $i < $item_count ; $i ++ ) 7589 { 7590 $item = PdfToText::DecodeRawName ( trim ( $items [$i] ) ) ; 7591 7592 // Integer value : index of next character in map 7593 if ( is_numeric ( $item ) ) 7594 $index = ( integer ) $item ; 7595 // String value : a character name, as defined by Adobe 7596 else 7597 { 7598 // Remove variant part of the character name 7599 $item = preg_replace ( $IgnoredVariants, '', trim ( $item ) ) ; 7600 7601 // Keyword (character name) exists in the encoding table 7602 if ( isset ( $this -> Encodings [ $item ] ) ) 7603 { 7604 $this -> Map [ $index ] = 7605 $this -> SecondaryMap [ $index ] = $this -> Encodings [ $item ] [ $this -> Encoding ] ; 7606 } 7607 // Not defined ; check if this is the "/gxx" notation, where "xx" is a number 7608 else if ( preg_match ( '/g (?P<value> \d+)/x', $item, $match ) ) 7609 { 7610 $value = ( integer ) $match [ 'value' ] ; 7611 7612 // In my current state of investigations, the /g notation has the following characteristics : 7613 // - The value 29 must be added to the number after the "/g" string (why ???) 7614 // - The value after the "/g" string can be greater than 255, meaning that it could be Unicode codepoint 7615 // This has to be carefully watched before revision 7616 $value += 29 ; 7617 7618 $this -> Map [ $index ] = 7619 $this -> SecondaryMap [ $index ] = $value ; 7620 } 7621 // Some characters can be specified by the "/uni" prefix followed by a sequence of hex digits, 7622 // which is not described by the PDF specifications. This sequence gives a Unicode code point. 7623 else if ( preg_match ( '/uni (?P<value> [0-9a-f]+)/ix', $item, $match ) ) 7624 { 7625 $value = hexdec ( $match [ 'value' ] ) ; 7626 7627 $this -> Map [ $index ] = 7628 $this -> SecondaryMap [ $index ] = ( integer ) $value ; 7629 } 7630 // Otherwise, put a quotation mark instead 7631 else 7632 { 7633 if ( PdfToText::$DEBUG ) 7634 warning ( "Unknown character name found in a /Differences[] array : [$item]" ) ; 7635 7636 $this -> Map [ $index ] = 7637 $this -> SecondaryMap [ $index ] = ord ( '?' ) ; 7638 } 7639 7640 $this -> DifferencesByPosition [ $index ] = $item ; 7641 7642 $index ++ ; 7643 } 7644 } 7645 } 7646 7647 7648 /*-------------------------------------------------------------------------------------------------------------- 7649 7650 Interface implementations. 7651 7652 *-------------------------------------------------------------------------------------------------------------*/ 7653 public function count ( ) 7654 { return ( count ( $this -> Map ) ) ; } 7655 7656 7657 public function offsetExists ( $offset ) 7658 { 7659 return ( ( ! $this -> Secondary ) ? 7660 isset ( $this -> Map [ $offset ] ) : 7661 isset ( $this -> SecondaryMap [ $offset ] ) ) ; 7662 } 7663 7664 7665 public function offsetGet ( $offset ) 7666 { 7667 if ( ! $this -> Secondary ) 7668 { 7669 if ( isset ( $this -> Map [ $offset ] ) ) 7670 $ord = $this -> Map [ $offset ] ; 7671 else 7672 $ord = $offset ; 7673 7674 // Check for final character translations (concerns only a few number of characters) 7675 if ( $this -> Encoding == self::PDF_WIN_ANSI_ENCODING && isset ( PdfTexterAdobeWinAnsiMap::$WinAnsiCharacterMap [0] [ $ord ] ) ) 7676 $ord = PdfTexterAdobeWinAnsiMap::$WinAnsiCharacterMap [0] [ $ord ] ; 7677 else if ( $this -> Encoding == self::PDF_MAC_ROMAN_ENCODING && isset ( PdfTexterAdobeMacRomanMap::$MacRomanCharacterMap [0] [ $ord ] ) ) 7678 $ord = PdfTexterAdobeMacRomanMap::$MacRomanCharacterMap [0] [ $ord ] ; 7679 // As far as I have been able to see, the values expressed by the /Differences tag were the only ones used within the 7680 // Pdf document ; however, handle the case where some characters do not belong to the characters listed by /Differences, 7681 // and use the official Adobe encoding maps when necessary 7682 else if ( isset ( $this -> Encodings [ $ord ] [ $this -> Encoding ] ) ) 7683 $ord = $this -> Encodings [ $ord ] [ $this -> Encoding ] ; 7684 7685 $result = $this -> CodePointToUtf8 ( $ord ) ; 7686 } 7687 else if ( isset ( $this -> SecondaryMap [ $offset ] ) ) 7688 { 7689 $ord = $this -> SecondaryMap [ $offset ] ; 7690 $result = $this -> CodePointToUtf8 ( $ord ) ; 7691 } 7692 else 7693 $result = false ; 7694 7695 return ( $result ) ; 7696 } 7697 } 7698 7699 7700/************************************************************************************************************** 7701 ************************************************************************************************************** 7702 ************************************************************************************************************** 7703 ****** ****** 7704 ****** ****** 7705 ****** CHARACTER MAP MANAGEMENT ****** 7706 ****** ****** 7707 ****** ****** 7708 ************************************************************************************************************** 7709 ************************************************************************************************************** 7710 **************************************************************************************************************/ 7711 7712/*============================================================================================================== 7713 7714 class PdfTexterAdobeMap - 7715 Abstract class to handle Adobe-specific fonts. 7716 7717 ==============================================================================================================*/ 7718abstract class PdfTexterAdobeMap extends PdfTexterCharacterMap 7719 { 7720 // Font variant ; one of the PdfTexterFont::FONT_VARIANT_* constants 7721 public $Variant ; 7722 // To be declared by derived classes : 7723 public $Map ; 7724 7725 7726 public function __construct ( $object_id, $font_variant, $map ) 7727 { 7728 parent::__construct ( $object_id ) ; 7729 7730 $this -> HexCharWidth = 2 ; 7731 $this -> Variant = $font_variant ; 7732 $this -> Map = $map ; 7733 7734 if ( ! isset ( $map [ $font_variant ] ) ) 7735 error ( new PdfToTextDecodingException ( "Undefined font variant #$font_variant." ) ) ; 7736 } 7737 7738 7739 /*-------------------------------------------------------------------------------------------------------------- 7740 7741 Interface implementations. 7742 7743 *-------------------------------------------------------------------------------------------------------------*/ 7744 public function count ( ) 7745 { return ( count ( $this -> $Map [ $this -> Variant ] ) ) ; } 7746 7747 7748 public function offsetExists ( $offset ) 7749 { return ( isset ( $this -> Map [ $this-> Variant ] [ $offset ] ) ) ; } 7750 7751 7752 public function offsetGet ( $offset ) 7753 { 7754 if ( isset ( $this -> Map [ $this-> Variant ] [ $offset ] ) ) 7755 $ord = $this -> Map [ $this -> Variant ] [ $offset ] ; 7756 else 7757 $ord = $offset ; 7758 7759 return ( $this -> CodePointToUtf8 ( $ord ) ) ; 7760 } 7761 } 7762 7763 7764/*============================================================================================================== 7765 7766 class PdfTexterAdobeWinAnsiMap - 7767 Abstract class to handle Adobe-specific Win Ansi fonts. 7768 7769 ==============================================================================================================*/ 7770class PdfTexterAdobeWinAnsiMap extends PdfTexterAdobeMap 7771 { 7772 // Windows Ansi mapping to Unicode. Only substitutions that have no direct equivalent are listed here 7773 // Source : https://msdn.microsoft.com/en-us/goglobal/cc305145.aspx 7774 // Only characters from 0x80 to 0x9F have no direct translation 7775 public static $WinAnsiCharacterMap = array 7776 ( 7777 // Normal WinAnsi mapping 7778 0 => array 7779 ( 7780 0x80 => 0x20AC, 7781 0x82 => 0x201A, 7782 0x83 => 0x0192, 7783 0x84 => 0x201E, 7784 0x85 => 0x2026, 7785 0x86 => 0x2020, 7786 0x87 => 0x2021, 7787 0x88 => 0x02C6, 7788 0x89 => 0x2030, 7789 0x8A => 0x0160, 7790 0x8B => 0x2039, 7791 0x8C => 0x0152, 7792 0x8E => 0x017D, 7793 0x91 => 0x2018, 7794 0x92 => 0x2019, 7795 0x93 => 0x201C, 7796 0x94 => 0x201D, 7797 0x95 => 0x2022, 7798 0x96 => 0x2013, 7799 0x97 => 0x2014, 7800 0x98 => 0x02DC, 7801 0x99 => 0x2122, 7802 0x9A => 0x0161, 7803 0x9B => 0x203A, 7804 0x9C => 0x0153, 7805 0x9E => 0x017E, 7806 0x9F => 0x0178 7807 ), 7808 // Cyrillic (IS08859-5) 7809 1 => array 7810 ( 7811 0x93 => 0x0022, // Quotes 7812 0x94 => 0x0022, 7813 0xC0 => 0x0410, 7814 0xC1 => 0x0411, 7815 0xC2 => 0x0412, 7816 0xC3 => 0x0413, 7817 0xC4 => 0x0414, 7818 0xC5 => 0x0415, 7819 0xC6 => 0x0416, 7820 0xC7 => 0x0417, 7821 0xC8 => 0x0418, 7822 0xC9 => 0x0419, 7823 0xCA => 0x041A, 7824 0xCB => 0x041B, 7825 0xCC => 0x041C, 7826 0xCD => 0x041D, 7827 0xCE => 0x041E, 7828 0xCF => 0x041F, 7829 0xD0 => 0x0420, 7830 0xD1 => 0x0421, 7831 0xD2 => 0x0422, 7832 0xD3 => 0x0423, 7833 0xD4 => 0x0424, 7834 0xD5 => 0x0425, 7835 0xD6 => 0x0426, 7836 0xD7 => 0x0427, 7837 0xD8 => 0x0428, 7838 0xD9 => 0x0429, 7839 0xDA => 0x042A, 7840 0xDB => 0x042B, 7841 0xDC => 0x042C, 7842 0xDD => 0x042D, 7843 0xDE => 0x042E, 7844 0xDF => 0x042F, 7845 0xE0 => 0x0430, 7846 0xE1 => 0x0431, 7847 0xE2 => 0x0432, 7848 0xE3 => 0x0433, 7849 0xE4 => 0x0434, 7850 0xE5 => 0x0435, 7851 0xE6 => 0x0436, 7852 0xE7 => 0x0437, 7853 0xE8 => 0x0438, 7854 0xE9 => 0x0439, 7855 0xEA => 0x043A, 7856 0xEB => 0x043B, 7857 0xEC => 0x043C, 7858 0xED => 0x043D, 7859 0xEE => 0x043E, 7860 0xEF => 0x043F, 7861 0xF0 => 0x0440, 7862 0xF1 => 0x0441, 7863 0xF2 => 0x0442, 7864 0xF3 => 0x0443, 7865 0xF4 => 0x0444, 7866 0xF5 => 0x0445, 7867 0xF6 => 0x0446, 7868 0xF7 => 0x0447, 7869 0xF8 => 0x0448, 7870 0xF9 => 0x0449, 7871 0xFA => 0x044A, 7872 0xFB => 0x044B, 7873 0xFC => 0x044C, 7874 0xFD => 0x044D, 7875 0xFE => 0x044E, 7876 0xFF => 0x044F 7877 ) 7878 ) ; 7879 7880 public function __construct ( $object_id, $font_variant ) 7881 { 7882 parent::__construct ( $object_id, $font_variant, self::$WinAnsiCharacterMap ) ; 7883 } 7884 } 7885 7886 7887/*============================================================================================================== 7888 7889 class PdfTexterAdobeMacRomanMap - 7890 Abstract class to handle Adobe-specific Mac Roman fonts. 7891 7892 ==============================================================================================================*/ 7893class PdfTexterAdobeMacRomanMap extends PdfTexterAdobeMap 7894 { 7895 // Mac roman to Unicode encoding 7896 // Source : ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT 7897 public static $MacRomanCharacterMap = array 7898 ( 7899 0 => array 7900 ( 7901 0x80 => 0x00C4, # LATIN CAPITAL LETTER A WITH DIAERESIS 7902 0x81 => 0x00C5, # LATIN CAPITAL LETTER A WITH RING ABOVE 7903 0x82 => 0x00C7, # LATIN CAPITAL LETTER C WITH CEDILLA 7904 0x83 => 0x00C9, # LATIN CAPITAL LETTER E WITH ACUTE 7905 0x84 => 0x00D1, # LATIN CAPITAL LETTER N WITH TILDE 7906 0x85 => 0x00D6, # LATIN CAPITAL LETTER O WITH DIAERESIS 7907 0x86 => 0x00DC, # LATIN CAPITAL LETTER U WITH DIAERESIS 7908 0x87 => 0x00E1, # LATIN SMALL LETTER A WITH ACUTE 7909 0x88 => 0x00E0, # LATIN SMALL LETTER A WITH GRAVE 7910 0x89 => 0x00E2, # LATIN SMALL LETTER A WITH CIRCUMFLEX 7911 0x8A => 0x00E4, # LATIN SMALL LETTER A WITH DIAERESIS 7912 0x8B => 0x00E3, # LATIN SMALL LETTER A WITH TILDE 7913 0x8C => 0x00E5, # LATIN SMALL LETTER A WITH RING ABOVE 7914 0x8D => 0x00E7, # LATIN SMALL LETTER C WITH CEDILLA 7915 0x8E => 0x00E9, # LATIN SMALL LETTER E WITH ACUTE 7916 0x8F => 0x00E8, # LATIN SMALL LETTER E WITH GRAVE 7917 0x90 => 0x00EA, # LATIN SMALL LETTER E WITH CIRCUMFLEX 7918 0x91 => 0x00EB, # LATIN SMALL LETTER E WITH DIAERESIS 7919 0x92 => 0x00ED, # LATIN SMALL LETTER I WITH ACUTE 7920 0x93 => 0x00EC, # LATIN SMALL LETTER I WITH GRAVE 7921 0x94 => 0x00EE, # LATIN SMALL LETTER I WITH CIRCUMFLEX 7922 0x95 => 0x00EF, # LATIN SMALL LETTER I WITH DIAERESIS 7923 0x96 => 0x00F1, # LATIN SMALL LETTER N WITH TILDE 7924 0x97 => 0x00F3, # LATIN SMALL LETTER O WITH ACUTE 7925 0x98 => 0x00F2, # LATIN SMALL LETTER O WITH GRAVE 7926 0x99 => 0x00F4, # LATIN SMALL LETTER O WITH CIRCUMFLEX 7927 0x9A => 0x00F6, # LATIN SMALL LETTER O WITH DIAERESIS 7928 0x9B => 0x00F5, # LATIN SMALL LETTER O WITH TILDE 7929 0x9C => 0x00FA, # LATIN SMALL LETTER U WITH ACUTE 7930 0x9D => 0x00F9, # LATIN SMALL LETTER U WITH GRAVE 7931 0x9E => 0x00FB, # LATIN SMALL LETTER U WITH CIRCUMFLEX 7932 0x9F => 0x00FC, # LATIN SMALL LETTER U WITH DIAERESIS 7933 0xA0 => 0x2020, # DAGGER 7934 0xA1 => 0x00B0, # DEGREE SIGN 7935 0xA2 => 0x00A2, # CENT SIGN 7936 0xA3 => 0x00A3, # POUND SIGN 7937 0xA4 => 0x00A7, # SECTION SIGN 7938 0xA5 => 0x2022, # BULLET 7939 0xA6 => 0x00B6, # PILCROW SIGN 7940 0xA7 => 0x00DF, # LATIN SMALL LETTER SHARP S 7941 0xA8 => 0x00AE, # REGISTERED SIGN 7942 0xA9 => 0x00A9, # COPYRIGHT SIGN 7943 0xAA => 0x2122, # TRADE MARK SIGN 7944 0xAB => 0x00B4, # ACUTE ACCENT 7945 0xAC => 0x00A8, # DIAERESIS 7946 0xAD => 0x2260, # NOT EQUAL TO 7947 0xAE => 0x00C6, # LATIN CAPITAL LETTER AE 7948 0xAF => 0x00D8, # LATIN CAPITAL LETTER O WITH STROKE 7949 0xB0 => 0x221E, # INFINITY 7950 0xB1 => 0x00B1, # PLUS-MINUS SIGN 7951 0xB2 => 0x2264, # LESS-THAN OR EQUAL TO 7952 0xB3 => 0x2265, # GREATER-THAN OR EQUAL TO 7953 0xB4 => 0x00A5, # YEN SIGN 7954 0xB5 => 0x00B5, # MICRO SIGN 7955 0xB6 => 0x2202, # PARTIAL DIFFERENTIAL 7956 0xB7 => 0x2211, # N-ARY SUMMATION 7957 0xB8 => 0x220F, # N-ARY PRODUCT 7958 0xB9 => 0x03C0, # GREEK SMALL LETTER PI 7959 0xBA => 0x222B, # INTEGRAL 7960 0xBB => 0x00AA, # FEMININE ORDINAL INDICATOR 7961 0xBC => 0x00BA, # MASCULINE ORDINAL INDICATOR 7962 0xBD => 0x03A9, # GREEK CAPITAL LETTER OMEGA 7963 0xBE => 0x00E6, # LATIN SMALL LETTER AE 7964 0xBF => 0x00F8, # LATIN SMALL LETTER O WITH STROKE 7965 0xC0 => 0x00BF, # INVERTED QUESTION MARK 7966 0xC1 => 0x00A1, # INVERTED EXCLAMATION MARK 7967 0xC2 => 0x00AC, # NOT SIGN 7968 0xC3 => 0x221A, # SQUARE ROOT 7969 0xC4 => 0x0192, # LATIN SMALL LETTER F WITH HOOK 7970 0xC5 => 0x2248, # ALMOST EQUAL TO 7971 0xC6 => 0x2206, # INCREMENT 7972 0xC7 => 0x00AB, # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK 7973 0xC8 => 0x00BB, # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK 7974 0xC9 => 0x2026, # HORIZONTAL ELLIPSIS 7975 0xCA => 0x00A0, # NO-BREAK SPACE 7976 0xCB => 0x00C0, # LATIN CAPITAL LETTER A WITH GRAVE 7977 0xCC => 0x00C3, # LATIN CAPITAL LETTER A WITH TILDE 7978 0xCD => 0x00D5, # LATIN CAPITAL LETTER O WITH TILDE 7979 0xCE => 0x0152, # LATIN CAPITAL LIGATURE OE 7980 0xCF => 0x0153, # LATIN SMALL LIGATURE OE 7981 0xD0 => 0x2013, # EN DASH 7982 0xD1 => 0x2014, # EM DASH 7983 0xD2 => 0x201C, # LEFT DOUBLE QUOTATION MARK 7984 0xD3 => 0x201D, # RIGHT DOUBLE QUOTATION MARK 7985 0xD4 => 0x2018, # LEFT SINGLE QUOTATION MARK 7986 0xD5 => 0x2019, # RIGHT SINGLE QUOTATION MARK 7987 0xD6 => 0x00F7, # DIVISION SIGN 7988 0xD7 => 0x25CA, # LOZENGE 7989 0xD8 => 0x00FF, # LATIN SMALL LETTER Y WITH DIAERESIS 7990 0xD9 => 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS 7991 0xDA => 0x2044, # FRACTION SLASH 7992 0xDB => 0x20AC, # EURO SIGN 7993 0xDC => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 7994 0xDD => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 7995 0xDE => 0xFB01, # LATIN SMALL LIGATURE FI 7996 0xDF => 0xFB02, # LATIN SMALL LIGATURE FL 7997 0xE0 => 0x2021, # DOUBLE DAGGER 7998 0xE1 => 0x00B7, # MIDDLE DOT 7999 0xE2 => 0x201A, # SINGLE LOW-9 QUOTATION MARK 8000 0xE3 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK 8001 0xE4 => 0x2030, # PER MILLE SIGN 8002 0xE5 => 0x00C2, # LATIN CAPITAL LETTER A WITH CIRCUMFLEX 8003 0xE6 => 0x00CA, # LATIN CAPITAL LETTER E WITH CIRCUMFLEX 8004 0xE7 => 0x00C1, # LATIN CAPITAL LETTER A WITH ACUTE 8005 0xE8 => 0x00CB, # LATIN CAPITAL LETTER E WITH DIAERESIS 8006 0xE9 => 0x00C8, # LATIN CAPITAL LETTER E WITH GRAVE 8007 0xEA => 0x00CD, # LATIN CAPITAL LETTER I WITH ACUTE 8008 0xEB => 0x00CE, # LATIN CAPITAL LETTER I WITH CIRCUMFLEX 8009 0xEC => 0x00CF, # LATIN CAPITAL LETTER I WITH DIAERESIS 8010 0xED => 0x00CC, # LATIN CAPITAL LETTER I WITH GRAVE 8011 0xEE => 0x00D3, # LATIN CAPITAL LETTER O WITH ACUTE 8012 0xEF => 0x00D4, # LATIN CAPITAL LETTER O WITH CIRCUMFLEX 8013 0xF0 => 0xF8FF, # Apple logo 8014 0xF1 => 0x00D2, # LATIN CAPITAL LETTER O WITH GRAVE 8015 0xF2 => 0x00DA, # LATIN CAPITAL LETTER U WITH ACUTE 8016 0xF3 => 0x00DB, # LATIN CAPITAL LETTER U WITH CIRCUMFLEX 8017 0xF4 => 0x00D9, # LATIN CAPITAL LETTER U WITH GRAVE 8018 0xF5 => 0x0131, # LATIN SMALL LETTER DOTLESS I 8019 0xF6 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT 8020 0xF7 => 0x02DC, # SMALL TILDE 8021 0xF8 => 0x00AF, # MACRON 8022 0xF9 => 0x02D8, # BREVE 8023 0xFA => 0x02D9, # DOT ABOVE 8024 0xFB => 0x02DA, # RING ABOVE 8025 0xFC => 0x00B8, # CEDILLA 8026 0xFD => 0x02DD, # DOUBLE ACUTE ACCENT 8027 0xFE => 0x02DB, # OGONEK 8028 0xFF => 0x02C7 # CARON 8029 ) 8030 ) ; 8031 8032 8033 public function __construct ( $object_id, $font_variant ) 8034 { 8035 parent::__construct ( $object_id, $font_variant, self::$MacRomanCharacterMap ) ; 8036 } 8037 } 8038 8039 8040/*============================================================================================================== 8041 8042 class PdfTexterAdobeUndocumentedUnicodeMap - 8043 Sometimes, Unicode maps translate character ids to something in the range 0xF000..0xF0FF (or maybe more). 8044 These mapped characters do not correspond to anything else in Unicode, but rather to a special character 8045 set. 8046 This class is not meant to be instantiated by anything here, but rather used for its $Map property. 8047 Note that the $Map array is not complete. 8048 8049 ==============================================================================================================*/ 8050class PdfTexterAdobeUndocumentedUnicodeMap extends PdfTexterAdobeMap 8051 { 8052 public static $UnicodeMap = array 8053 ( 8054 0xF0F0 => 0x30, // '0' through '9' 8055 0xF0EF => 0x31, 8056 0xF0EE => 0x32, 8057 0xF0ED => 0x33, 8058 0xF0EC => 0x34, 8059 0xF0EB => 0x35, 8060 0xF0EA => 0x36, 8061 0xF0E9 => 0x37, 8062 0xF0E8 => 0x38, 8063 0xF0E7 => 0x39, 8064 0xF0DF => 0x41, // 'A' through 'Z' 8065 0xF0DE => 0x42, 8066 0xF0DD => 0x43, 8067 0xF0DC => 0x44, 8068 0xF0DB => 0x45, 8069 0xF0DA => 0x46, 8070 0xF0D9 => 0x47, 8071 0xF0D8 => 0x48, 8072 0xF0D7 => 0x49, 8073 0xF0D6 => 0x4A, 8074 0xF0D5 => 0x4B, 8075 0xF0D4 => 0x4C, 8076 0xF0D3 => 0x4D, 8077 0xF0D2 => 0x4E, 8078 0xF0D1 => 0x4F, 8079 0xF0D0 => 0x50, 8080 0xF0CF => 0x51, 8081 0xF0CE => 0x52, 8082 0xF0CD => 0x53, 8083 0xF0CC => 0x54, 8084 0xF0CB => 0x55, 8085 0xF0CA => 0x56, 8086 0xF0C9 => 0x57, 8087 0xF0C8 => 0x58, 8088 0xF0C7 => 0x59, 8089 0xF0C6 => 0x5A, 8090 0xF0BF => 0x61, // 'a' through 'z' 8091 0xF0BE => 0x62, 8092 0xF0BD => 0x63, 8093 0xF0BC => 0x64, 8094 0xF0BB => 0x65, 8095 0xF0BA => 0x66, 8096 0xF0B9 => 0x67, 8097 0xF0B8 => 0x68, 8098 0xF0B7 => 0x69, 8099 0xF0B6 => 0x6A, 8100 0xF0B5 => 0x6B, 8101 0xF0B4 => 0x6C, 8102 0xF0B3 => 0x6D, 8103 0xF0B2 => 0x6E, 8104 0xF0B1 => 0x6F, 8105 0xF0B0 => 0x70, 8106 0xF0AF => 0x71, 8107 0xF0AE => 0x72, 8108 0xF0AD => 0x73, 8109 0xF0AC => 0x74, 8110 0xF0AB => 0x75, 8111 0xF0AA => 0x76, 8112 0xF0A9 => 0x77, 8113 0xF0A8 => 0x78, 8114 0xF0A7 => 0x79, 8115 0xF0A6 => 0x7A, 8116 0xF0F1 => 0x2F, // '/' 8117 0xF0E6 => 0x3A, // ':' 8118 0xF0F3 => 0x2D, // '-' 8119 0xF0F8 => 0x28, // '(' 8120 0xF0F7 => 0x29, // ')' 8121 0xF0F2 => 0x2E, // '.' 8122 0xF020 => 0x20, // Space 8123 0xF0F9 => 0x27, // "'" 8124 0xF037 => 0xE9, // é 8125 0xF038 => 0xE8, // è 8126 ) ; 8127 8128 8129 8130 public function __construct ( $object_id, $font_variant ) 8131 { 8132 parent::__construct ( $object_id, $font_variant, self::$UnicodeMap ) ; 8133 } 8134 } 8135 8136 8137/*============================================================================================================== 8138 8139 PdfTexterCIDMap - 8140 A class for mapping (or trying to...) CID fonts. 8141 8142 ==============================================================================================================*/ 8143abstract class PdfTexterCIDMap extends PdfTexterCharacterMap 8144 { 8145 // CID maps are associative arrays whose keys are the font CID (currently expressed as a numeric value) and 8146 // whose values are the corresponding UTF8 representation. The following special values can also be used to 8147 // initialize certain entries : 8148 // UNKNOWN_CID : 8149 // Indicates that the corresponding CID has no known UTF8 counterpart. When the PdfToText::$DEBUG variable 8150 // is true, every character in this case will be replaced with the string : "[UID: abcd]", where "abcd" is 8151 // the hex representation of the CID. This way, new CID tables can be built using this information. 8152 const UNKNOWN_CID = -1 ; 8153 // ALT_CID : 8154 // Sorry, this will remain undocumented so far and will be highligh subject to change, since it is dating 8155 // from my first interpretation of CID fonts, which is probably wrong. 8156 const ALT_CID = -2 ; 8157 8158 8159 // CID font map file ; the file is a PHP script that must contain an array of the form : 8160 // $map = array 8161 // ( 8162 // 'plain' => array 8163 // ( 8164 // $cid1 => $utf1, 8165 // ... 8166 // ) 8167 // ) ; 8168 protected $MapFile ; 8169 // Map, loaded into memry 8170 protected $Map ; 8171 // Map cache - the interest is to avoid unnecessary includes 8172 private static $CachedMaps = array ( ) ; 8173 8174 // Related to the first experimentatl implementation of CID fonts 8175 private $LastAltOffset = false ; 8176 8177 8178 /*-------------------------------------------------------------------------------------------------------------- 8179 8180 Constructor - 8181 Loads the specified map. 8182 If the map files contains a definition such as : 8183 8184 $map = 'IDENTITY-H-GQJGLM.cid' ; 8185 8186 then the specified map will be loaded instead (ony one ndirection is supported). 8187 8188 *-------------------------------------------------------------------------------------------------------------*/ 8189 public function __construct ( $object_id, $map_name, $font_variant ) 8190 { 8191 // Initialize parent objects 8192 parent::__construct ( $object_id ) ; 8193 $this -> HexCharWidth = 4 ; // So far, CIDs are 2-bytes long 8194 8195 // Since alternate characters can be apparently prefixed by 0x0000 or 0x0001, two calls to the array access operator 8196 // will be needed to retrieve the exact character in such cases 8197 // This is why we have to tell the upper layers not to cache the results 8198 $this -> Cache = false ; 8199 8200 $map_index = "$map_name:$font_variant" ; 8201 8202 // If this font has already been loaded somewhere, then reuse its information 8203 if ( isset ( self::$CachedMaps [ $map_index] ) ) 8204 { 8205 $map = self::$CachedMaps [ $map_index ] [ 'map' ] ; 8206 $file = self::$CachedMaps [ $map_index ] [ 'file' ] ; 8207 } 8208 // Otherwise, 8209 else 8210 { 8211 $file = $this -> __get_cid_file ( $map_name, $font_variant ) ; 8212 8213 // No CID map found : CID numbers will be mapped as is 8214 if ( ! file_exists ( $file ) ) 8215 { 8216 if ( PdfToText::$DEBUG ) 8217 warning ( new PdfToTextDecodingException ( "Could not find CID table \"$map_name\" in directory \"" . PdfToText::$CIDTablesDirectory . "\"." ) ) ; 8218 } 8219 // Otherwise, load the CID map 8220 else 8221 { 8222 include ( $file ) ; 8223 8224 if ( isset ( $map ) ) 8225 { 8226 // We authorize one CID map to contain the name of another CID map file, instead of the map itself 8227 if ( is_string ( $map ) ) 8228 { 8229 $file = PdfToText::$CIDTablesDirectory . "/$map" ; 8230 include ( $file ) ; 8231 } 8232 8233 if ( isset ( $map ) ) 8234 self::$CachedMaps [ $map_index ] = array ( 'file' => $file, 'map' => $map ) ; 8235 } 8236 else if ( PdfToText::$DEBUG ) 8237 warning ( new PdfToTextDecodingException ( "CID \"$file\" does not contain any definition." ) ) ; 8238 } 8239 } 8240 8241 // Save map info for this CID font 8242 $this -> MapFile = $file ; 8243 $this -> Map = ( isset ( $map ) ) ? $map : array ( ) ; 8244 } 8245 8246 8247 /*-------------------------------------------------------------------------------------------------------------- 8248 8249 __get_cid_file - 8250 Searches in the CIDTables directory for the CID map that best matches the specified map name (usually, 8251 IDENTITY-H) and the optional font variant. 8252 8253 If a font variant has been specified, like "ABCD+Italic-Arial", then the CID tables directory will be 8254 searched for the following files, in the following order : 8255 - IDENTITY-H-ABCD+Italic-Arial.cid 8256 - IDENTITY-H-ABCD+Italic.cid 8257 - IDENTITY-H-ABCD.cid 8258 - If none found, then IDENTITY-H-empty.cid will be used and a warning will be issued in debug mode. 8259 8260 *-------------------------------------------------------------------------------------------------------------*/ 8261 private function __get_cid_file ( $map_name, $font_variant ) 8262 { 8263 $files = array ( ) ; 8264 8265 // Search for font variants, if any 8266 if ( $font_variant ) 8267 { 8268 if ( preg_match ( '/^ (?P<name> [a-z_][a-z_0-9]*) (?P<rest> [\-+] .*) $/imsx' , $font_variant, $match ) ) 8269 { 8270 $basename = '-' . $match [ 'name' ] ; 8271 8272 if ( preg_match_all ( '/ (?P<sep> [\-+]) (?P<name> [^\-+]+) /ix', $match [ 'rest' ], $other_matches ) ) 8273 { 8274 for ( $i = count ( $other_matches [ 'name' ] ) - 1 ; $i >= 0 ; $i -- ) 8275 { 8276 $new_file = $basename ; 8277 8278 for ( $j = 0 ; $j < $i ; $j ++ ) 8279 $new_file .= $other_matches [ 'sep' ] [$i] . $other_matches [ 'name' ] [$i] ; 8280 8281 $files [] = array ( PdfToText::$CIDTablesDirectory . "/$map_name$new_file.cid", 'standard' ) ; 8282 } 8283 } 8284 } 8285 8286 // Last one will be the empty CID font 8287 $files [] = array ( PdfToText::$CIDTablesDirectory . "/IDENTITY-H-empty.cid", 'empty' ) ; 8288 } 8289 8290 // Add the specified map file 8291 $files [] = array ( PdfToText::$CIDTablesDirectory . "/$map_name.cid", 'default' ) ; 8292 8293 // The first existing file in the list should be the appropriate one 8294 foreach ( $files as $file ) 8295 { 8296 if ( file_exists ( $file [0] ) ) 8297 { 8298 if ( PdfToText::$DEBUG ) 8299 { 8300 if ( $file [1] === 'empty' ) 8301 warning ( new PdfToTextDecodingException ( "Using empty IDENTITY-H definition for map \"$map_name\", variant \"$font_variant\"." ) ) ; 8302 else if ( $file [1] === 'default' ) 8303 warning ( new PdfToTextDecodingException ( "Using default IDENTITY-H definition for map \"$map_name\"." ) ) ; 8304 } 8305 8306 return ( $file [0] ) ; 8307 } 8308 } 8309 8310 // No CID font found 8311 return ( false ) ; 8312 } 8313 8314 8315 /*-------------------------------------------------------------------------------------------------------------- 8316 8317 Interface implementations. 8318 8319 *-------------------------------------------------------------------------------------------------------------*/ 8320 public function count ( ) 8321 { return ( count ( $this -> Map ) ) ; } 8322 8323 8324 public function offsetExists ( $offset ) 8325 { return ( isset ( $this -> Map [ 'plain' ] [ $offset ] ) ) ; } 8326 8327 8328 public function offsetGet ( $offset ) 8329 { 8330 if ( isset ( $this -> Map [ 'plain' ] [ $offset ] ) ) 8331 { 8332 $ch = $this -> Map [ 'plain' ] [ $offset ] ; 8333 8334 switch ( $ch ) 8335 { 8336 case self::UNKNOWN_CID : 8337 if ( PdfToText::$DEBUG ) 8338 echo ( '[UID:' . sprintf ( '%04x', $offset ) . "]" ) ; 8339 8340 $this -> LastAltOffset = false ; 8341 8342 if ( ! PdfToText::$DEBUG ) 8343 return ( '' ) ; 8344 else 8345 return ( '[UID:' . sprintf ( '%04x', $offset ) . "]" ) ; 8346 8347 case self::ALT_CID : 8348 $this -> LastAltOffset = ( integer ) $offset ; 8349 8350 return ( '' ) ; 8351 8352 default : 8353 if ( $this -> LastAltOffset === false ) 8354 return ( $ch ) ; 8355 8356 if ( isset ( $this -> Map [ 'alt' ] [ $this -> LastAltOffset ] [ $offset ] ) ) 8357 { 8358 $ch2 = $this -> Map [ 'alt' ] [ $this -> LastAltOffset ] [ $offset ] ; 8359 8360 if ( $ch2 == self::UNKNOWN_CID ) 8361 { 8362 if ( PdfToText::$DEBUG ) 8363 { 8364 echo ( "[CID{$this -> LastAltOffset}:" . sprintf ( '%04x', $offset ) . "]" ) ; 8365 8366 $ch2 = "[CID{$this -> LastAltOffset}: $offset]" ; 8367 } 8368 } 8369 } 8370 else 8371 $ch2 = '' ; 8372 8373 $this -> LastAltOffset = false ; 8374 8375 return ( $ch2 ) ; 8376 } 8377 } 8378 else 8379 { 8380 $this -> LastAltOffset = false ; 8381 8382 return ( '' ) ; 8383 } 8384 } 8385 } 8386 8387 8388 8389/*============================================================================================================== 8390 8391 PdfTexterIdentityHCIDMap - 8392 A class for mapping IDENTITY-H CID fonts (or trying to...). 8393 8394 ==============================================================================================================*/ 8395class PdfTexterIdentityHCIDMap extends PdfTexterCIDMap 8396 { 8397 public function __construct ( $object_id, $font_variant ) 8398 { 8399 parent::__construct ( $object_id, 'IDENTITY-H', $font_variant ) ; 8400 } 8401 } 8402 8403 8404 8405/*============================================================================================================== 8406 8407 PdfTexterPageMap - 8408 A class for detecting page objects mappings and retrieving page number for a specified object. 8409 There is a quadruple level of indirection here : 8410 8411 - The first level contains a /Type /Catalog parameter, with a /Pages one that references an object which 8412 contains a /Count and /Kids. I don't know yet if the /Pages parameter can reference more than one 8413 object using the array notation. However, the class is designed to handle such situations. 8414 - The object containing the /Kids parameter references objects who, in turn, lists the objects contained 8415 into one single page. 8416 - Each object referenced in /Kids has a /Type/Page parameter, together with /Contents, which lists the 8417 objects of the current page. 8418 8419 Object references are of the form : "x y R", where "x" is the object number. 8420 8421 Of course, anything can be in any order, otherwise it would not be funny ! Consider the following 8422 example : 8423 8424 (1) 5 0 obj 8425 << ... /Pages 1 0 R ... >> 8426 endobj 8427 8428 (2) 1 0 obj 8429 << ... /Count 1 /Kids[6 0 R] ... /Type/Pages ... >> 8430 endobj 8431 8432 (3) 6 0 obj 8433 << ... /Type/Page ... /Parent 1 0 R ... /Contents [10 0 R 11 0 R ... x 0 R] 8434 endobj 8435 8436 Object #5 says that object #1 contains the list of page contents (in this example, there is only one page, 8437 referenced by object #6). 8438 Object #6 says that the objects #10, #11 through #x are contained into the same page. 8439 The quadruple indirection comes when you are handling one of the objects referenced in object #6 and you 8440 need to retrieve their page number... 8441 8442 Of course, you cannot rely on the fact that all objects appear in logical order. 8443 8444 And, of course #2, there may be no page catalog at all ! in such cases, objects containing drawing 8445 instructions will have to be considered as a single page, whose number will be sequential. 8446 8447 And, of course #3, as this is the case with the official PDF 1.7 Reference from Adobe, there can be a 8448 reference to a non-existing object which was meant to contain the /Kids parameter (!). In this case, 8449 taking the ordinal number of objects of type (3) gives the page number minus one. 8450 8451 One mystery is that the PDF 1.7 Reference file contains 1310 pages but only 1309 are recognized here... 8452 8453 ==============================================================================================================*/ 8454class PdfTexterPageMap extends PdfObjectBase 8455 { 8456 // Page contents are (normally) first described by a catalog 8457 // Although there should be only one entry for that, this property is defined as an array, as you need to really 8458 // become paranoid when handling pdf contents... 8459 protected $PageCatalogs = array ( ) ; 8460 // Entries that describe which page contains which text objects. Of course, these can be nested otherwise it would not be funny ! 8461 protected $PageKids = array ( ) ; 8462 // Terminal entries : they directly give the ids of the objects belonging to a page 8463 public $PageContents = array ( ) ; 8464 // Note that all the above arrays are indexed by object id and filled with the data collected by calling the Peek() Method... 8465 8466 // Objects that could be referenced from other text objects as XObjects, using the /TPLx notation 8467 protected $TemplateObjects = array ( ) ; 8468 8469 // Once the Peek() method has collected page contents & object information, the MapCatalog() method is called to create this array 8470 // which contains page numbers as keys, and the list of objects contained in this page as values 8471 public $Pages = array ( ) ; 8472 // Holds page attributes 8473 public $PageAttributes = array ( ) ; 8474 8475 // Resource mappings can either refer to an object (/Resources 2 0 R) or to inline mappings (/Resources << ... >>) 8476 // The same object can be referenced by many /Resources parameters throughout the pdf file, so its important to keep 8477 // the analyzed mappings in a cache, so that later references will reuse the results of the first one 8478 private $ResourceMappingCache = array ( ) ; 8479 // List of XObject names - Used by the IsValidTemplate() function 8480 private $XObjectNames = array ( ) ; 8481 8482 8483 /*-------------------------------------------------------------------------------------------------------------- 8484 8485 CONSTRUCTOR 8486 Creates a PdfTexterPageMap object. Actually, nothing significant is perfomed here, as this class' goal 8487 is to be used internally by PdfTexter. 8488 8489 *-------------------------------------------------------------------------------------------------------------*/ 8490 public function __construct ( ) 8491 { 8492 parent::__construct ( ) ; 8493 } 8494 8495 8496 /*-------------------------------------------------------------------------------------------------------------- 8497 8498 NAME 8499 AddTemplateObject - Adds an object that could be referenced as a template/ 8500 8501 PROTOTYPE 8502 $pagemap -> AddTemplateObject ( $object_id, $object_text_data ) ; 8503 8504 DESCRIPTION 8505 Adds an object that may be referenced as a template from another text object, using the /TPLx notation. 8506 8507 PARAMETERS 8508 $object_id (integer) - 8509 Id of the object that may contain a resource mapping entry. 8510 8511 $object_data (string) - 8512 Object contents. 8513 8514 *-------------------------------------------------------------------------------------------------------------*/ 8515 public function AddTemplateObject ( $object_id, $object_text_data ) 8516 { 8517 $this -> TemplateObjects [ $object_id ] = $object_text_data ; 8518 } 8519 8520 8521 /*-------------------------------------------------------------------------------------------------------------- 8522 8523 NAME 8524 GetResourceMappings - Gets resource mappings specified after a /Resources parameter. 8525 8526 PROTOTYPE 8527 $result = $this -> GetResourceMappings ( $object_id, $object_data, $parameter, $pdf_object_list ) ; 8528 8529 DESCRIPTION 8530 Most of the time, objects containing a page description (/Type /Page) also contain a /Resources parameter, 8531 which may be followed by one of the following constructs : 8532 - A reference to an object, such as : 8533 /Resources 2 0 R 8534 - Or an inline set of parameters, such as font or xobject mappings : 8535 /Resources << /Font<</F1 10 0 R ...>> /XObject <</Im0 27 0 R ...>> 8536 This method extracts alias/object mappings for the parameter specified by $parameter (it can be for 8537 example 'Font' or 'Xobject') and returns these mappings as an associative array. 8538 8539 PARAMETERS 8540 $object_id (integer) - 8541 Id of the object that may contain a resource mapping entry. 8542 8543 $object_data (string) - 8544 Object contents. 8545 8546 $parameter (string) - 8547 Parameter defining resource mapping, for example /Font or /XObject. 8548 8549 $pdf_object_list (associative array) - 8550 Array of object id/object data associations, for all objects defined in the pdf file. 8551 8552 RETURN VALUE 8553 The list of resource mappings for the specified parameter, as an associative array, whose keys are the 8554 resource aliases and values are the corresponding object ids. 8555 The method returns an empty array if the specified object does not contain resource mappings or does 8556 not contain the specified parameter. 8557 8558 *-------------------------------------------------------------------------------------------------------------*/ 8559 protected function GetResourceMappings ( $object_id, $object_data, $parameter, $pdf_object_list ) 8560 { 8561 // The /Resources parameter refers to an existing PDF object 8562 if ( preg_match ( '#/Resources \s* (?P<object_id> \d+) \s+ \d+ \s+ R#ix', $object_data, $match ) ) 8563 { 8564 // Return the cached result if the same object has previously been referenced by a /Resources parameter 8565 if ( isset ( $this -> ResourceMappingCache [ $object_id ] [ $parameter ] ) ) 8566 return ( $this -> ResourceMappingCache [ $object_id ] [ $parameter ] ) ; 8567 8568 // Check that the object that is referred to exists 8569 if ( isset ( $pdf_object_list [ $match [ 'object_id' ] ] ) ) 8570 $data = $pdf_object_list [ $match [ 'object_id' ] ] ; 8571 else 8572 return ( array ( ) ) ; 8573 8574 $is_object = true ; // to tell that we need to put the results in cache for later use 8575 } 8576 // The /Resources parameter is followed by inline mappings 8577 else if ( preg_match ( '#/Resources \s* <#ix', $object_data, $match, PREG_OFFSET_CAPTURE ) ) 8578 { 8579 $data = substr ( $object_data, $match [0] [1] + strlen ( $match [0] [0] ) - 1 ) ; 8580 $is_object = false ; 8581 } 8582 else 8583 return ( array ( ) ) ; 8584 8585 // Whatever we will be analyzing (an object contents or inline contents following the /Resources parameter), 8586 // the text will be enclosed within double angle brackets (<< ... >>) 8587 8588 // A small kludge for /XObject which specify an object reference ("15 0 R") instead of XObjects mappings 8589 // ("<< ...>>" ) 8590 if ( $parameter == '/XObject' && preg_match ( '#/XObject \s+ (?P<obj> \d+) \s+ \d+ \s+ R#ix', $data, $match ) ) 8591 { 8592 $data = '/XObject ' . $pdf_object_list [ $match [ 'obj' ] ] ; 8593 } 8594 8595 if ( preg_match ( "#$parameter \s* << \s* (?P<mappings> .*?) \s* >>#imsx", $data, $match ) ) 8596 { 8597 preg_match_all ( '# (?P<mapping> / [^\s]+) \s+ (?P<object_id> \d+) \s+ \d+ \s+ R#ix', $match [ 'mappings' ], $matches ) ; 8598 8599 $mappings = array ( ) ; 8600 8601 // Mapping extraction loop 8602 for ( $i = 0, $count = count ( $matches [ 'object_id' ] ) ; $i < $count ; $i ++ ) 8603 $mappings [ $matches [ 'mapping' ] [$i] ] = $matches [ 'object_id' ] [$i] ; 8604 8605 // Put results for referenced objects in cache 8606 if ( $is_object ) 8607 $this -> ResourceMappingCache [ $object_id ] [ $parameter ] = $mappings ; 8608 8609 return ( $mappings ) ; 8610 } 8611 else 8612 return ( array ( ) ) ; 8613 } 8614 8615 8616 /*-------------------------------------------------------------------------------------------------------------- 8617 8618 NAME 8619 Peek - Peeks page information from a pdf object. 8620 8621 PROTOTYPE 8622 $pagemap -> Peek ( ) ; 8623 8624 DESCRIPTION 8625 Retrieves page information which can be of type (1), (2) or (3), as described in the class comments. 8626 8627 PARAMETERS 8628 $object_id (integer) - 8629 Id of the current pdf object. 8630 8631 $object_data (string) - 8632 Pdf object contents. 8633 8634 $pdf_objects (associative array) - 8635 Objects defined in the pdf file, as an associative array whose keys are object numbers and 8636 values object data. 8637 This parameter is used for /Type/Page objects which have a /Resource parameter that references 8638 an existing object instead of providing font mappings and other XObject mappings inline, 8639 enclosed within double angle brackets (<< /Font ... >>). 8640 8641 *-------------------------------------------------------------------------------------------------------------*/ 8642 public function Peek ( $object_id, $object_data, $pdf_objects ) 8643 { 8644 // Page catalog (/Type/Catalog and /Pages x 0 R) 8645 if ( preg_match ( '#/Type \s* /Catalog#ix', $object_data ) && $this -> GetObjectReferences ( $object_id, $object_data, '/Pages', $references ) ) 8646 $this -> PageCatalogs = array_merge ( $this -> PageCatalogs, $references ) ; 8647 // Object listing the object numbers that give the list of objects contained in a single page (/Types/Pages and /Count x /Kids[x1 0 R ... xn 0 R] 8648 else if ( preg_match ( '#/Type \s* /Pages#ix', $object_data ) ) 8649 { 8650 if ( $this -> GetObjectReferences ( $object_id, $object_data, '/Kids', $references ) ) 8651 { 8652 // Sometimes, a reference can be the one of an object that contains the real reference ; in the following example, 8653 // the actual page contents are not in object 4, but in object 5 8654 // /Kids 4 0 R 8655 // ... 8656 // 4 0 obj 8657 // [5 0 R] 8658 // endobj 8659 $new_references = array ( ) ; 8660 8661 foreach ( $references as $reference ) 8662 { 8663 if ( ! isset ( $pdf_objects [ $reference ] ) || 8664 ! preg_match ( '/^ \s* (?P<ref> \[ [^]]+ \]) \s*$/imsx', $pdf_objects [ $reference ], $match ) ) 8665 { 8666 $new_references [] = $reference ; 8667 } 8668 else 8669 { 8670 $this -> GetObjectReferences ( $reference, $pdf_objects [ $reference ], '', $sub_references ) ; 8671 $new_references = array_merge ( $new_references, $sub_references ) ; 8672 } 8673 8674 } 8675 8676 // Get kid count (knowing that sometimes, it is missing...) 8677 preg_match ( '#/Count \s+ (?P<count> \d+)#ix', $object_data, $match ) ; 8678 $page_count = ( isset ( $match [ 'count' ] ) ) ? ( integer ) $match [ 'count' ] : false ; 8679 8680 // Get parent object id 8681 preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ; 8682 $parent = ( isset ( $match [ 'parent' ] ) ) ? ( integer ) $match [ 'parent' ] : false ; 8683 8684 $this -> PageKids [ $object_id ] = array 8685 ( 8686 'object' => $object_id, 8687 'parent' => $parent, 8688 'count' => $page_count, 8689 'kids' => $new_references 8690 ) ; 8691 } 8692 } 8693 // Object listing the other objects that are contained in this page (/Type/Page and /Contents[x1 0 R ... xn 0 R] 8694 else if ( preg_match ( '#/Type \s* /Page\b#ix', $object_data ) ) 8695 { 8696 if ( $this -> GetObjectReferences ( $object_id, $object_data, '/Contents', $references ) ) 8697 { 8698 preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ; 8699 $parent = ( isset ( $match [ 'parent' ] ) ) ? (integer) $match [ 'parent' ] : false ; 8700 $fonts = $this -> GetResourceMappings ( $object_id, $object_data, '/Font', $pdf_objects ) ; 8701 $xobjects = $this -> GetResourceMappings ( $object_id, $object_data, '/XObject', $pdf_objects ) ; 8702 8703 // Find the width and height of the page (/Mediabox parameter) 8704 if ( preg_match ( '#/MediaBox \s* \[ \s* (?P<x1> \d+) \s+ (?P<y1> \d+) \s+ (?P<x2> \d+) \s+ (?P<y2> \d+) \s* \]#imsx', $object_data, $match ) ) 8705 { 8706 $width = ( double ) ( $match [ 'x2' ] - $match [ 'x1' ] + 1 ) ; 8707 $height = ( double ) ( $match [ 'y2' ] - $match [ 'y1' ] + 1 ) ; 8708 } 8709 // Otherwise, fix an arbitrary width and length (but this should never happen, because all pdf files are correct, isn't it?) 8710 else 8711 { 8712 $width = 595 ; 8713 $height = 850 ; 8714 } 8715 8716 // Yes ! some /Contents parameters may designate another object which contains references to the real text contents 8717 // in the form : [x 0 R y 0 R etc.], so we have to dig into it... 8718 $new_references = array ( ) ; 8719 8720 foreach ( $references as $reference ) 8721 { 8722 // We just need to check that the object contains something like : 8723 // [x 0 R y 0 R ...] 8724 // and nothing more 8725 if ( isset ( $pdf_objects [ $reference ] ) && preg_match ( '#^\s* \[ [^]]+ \]#x', $pdf_objects [ $reference ] ) && 8726 $this -> GetObjectReferences ( $reference, $pdf_objects [ $reference ], '', $nested_references ) ) 8727 $new_references = array_merge ( $new_references, $nested_references ) ; 8728 else 8729 $new_references [] = $reference ; 8730 } 8731 8732 $this -> PageContents [ $object_id ] = array 8733 ( 8734 'object' => $object_id, 8735 'parent' => $parent, 8736 'contents' => $new_references, 8737 'fonts' => $fonts, 8738 'xobjects' => $xobjects, 8739 'width' => $width, 8740 'height' => $height 8741 ) ; 8742 } 8743 } 8744 // None of the above, but object contains /Xobject's and maybe more... 8745 else if ( preg_match ( '#/Type \s* /XObject\b#ix', $object_data ) ) 8746 { 8747 preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ; 8748 $parent = ( isset ( $match [ 'parent' ] ) ) ? (integer) $match [ 'parent' ] : false ; 8749 $fonts = $this -> GetResourceMappings ( $object_id, $object_data, '/Font', $pdf_objects ) ; 8750 $xobjects = $this -> GetResourceMappings ( $object_id, $object_data, '/XObject', $pdf_objects ) ; 8751 8752 $this -> GetObjectReferences ( $object_id, $object_data, '/Contents', $references ) ; 8753 8754 $this -> PageContents [ $object_id ] = array 8755 ( 8756 'object' => $object_id, 8757 'parent' => $parent, 8758 'contents' => $references, 8759 'fonts' => $fonts, 8760 'xobjects' => $xobjects 8761 ) ; 8762 } 8763 } 8764 8765 8766 /*-------------------------------------------------------------------------------------------------------------- 8767 8768 NAME 8769 ProcessTemplateReferences - Replace template references with actual text contents. 8770 8771 PROTOTYPE 8772 $text = $pagemap -> ReplaceTemplateReferences ( $page_number, $text_data ) ; 8773 8774 DESCRIPTION 8775 Replaces template references of the form "/TPLx Do" with the actual text contents. 8776 8777 PARAMETERS 8778 $page_number (integer) - 8779 Page number of the object that contains the supplied object data. 8780 8781 $text_data (string) 8782 Text drawing instructions that are to be processed. 8783 8784 RETURN VALUE 8785 Returns the original text, where all template references have been replaced with the contents of the 8786 object they refer to. 8787 8788 *-------------------------------------------------------------------------------------------------------------*/ 8789 public function ProcessTemplateReferences ( $page_number, $text_data ) 8790 { 8791 // Many paranoid checks in this piece of code... 8792 if ( isset ( $this -> Pages [ $page_number ] ) ) 8793 { 8794 // Loop through the PageContents array to find which one(s) may be subject to template reference replacements 8795 foreach ( $this -> PageContents as $page_contents ) 8796 { 8797 // If the current object relates to the specified page number, AND it has xobjects, then the supplied text data 8798 // may contain template reference of the form : /TPLx. 8799 // In this case, we replace such a reference with the actual contents of the object they refer to 8800 if ( isset ( $page_contents [ 'page' ] ) && $page_contents [ 'page' ] == $page_number && count ( $page_contents [ 'xobjects' ] ) ) 8801 { 8802 $template_searches = array ( ) ; 8803 $template_replacements = array ( ) ; 8804 8805 $this -> __get_replacements ( $page_contents, $template_searches, $template_replacements ) ; 8806 $text_data = self::PregStrReplace ( $template_searches, $template_replacements, $text_data ) ; 8807 } 8808 } 8809 } 8810 8811 return ( $text_data ) ; 8812 } 8813 8814 8815 // __get_replacements - 8816 // Recursively gets the search/replacement strings for template references. 8817 private function __get_replacements ( $page_contents, &$searches, &$replacements, $objects_seen = array ( ) ) 8818 { 8819 foreach ( $page_contents [ 'xobjects' ] as $template_name => $template_object ) 8820 { 8821 if ( isset ( $this -> TemplateObjects [ $template_object ] ) && ! isset ( $objects_seen [ $template_object ] ) ) 8822 { 8823 $template = $this -> TemplateObjects [ $template_object ] ; 8824 $searches [] = '#(' . $template_name . ' \s+ Do\b )#msx' ; 8825 $replacements [] = '!PDFTOTEXT_TEMPLATE_' . substr ( $template_name, 1 ) . ' ' . $template ; 8826 $objects_seen [ $template_object ] = $template_object ; 8827 8828 if ( isset ( $this -> PageContents [ $template_object ] ) ) 8829 $this -> __get_replacements ( $this -> PageContents [ $template_object ], $searches, $replacements, $objects_seen ) ; 8830 } 8831 } 8832 } 8833 8834 8835 8836 /*-------------------------------------------------------------------------------------------------------------- 8837 8838 NAME 8839 MapObjects - Builds a correspondance between object and page numbers. 8840 8841 PROTOTYPE 8842 $pagemap -> MapObjects ( ) ; 8843 8844 DESCRIPTION 8845 Builds a correspondance between object and page numbers. The page number corresponding to an object id 8846 will after that be available using the array notation. 8847 8848 NOTES 8849 This method behaves as if there could be more than one page catalog in the same file, but I've not yet 8850 encountered this case. 8851 8852 *-------------------------------------------------------------------------------------------------------------*/ 8853 public function MapObjects ( $objects ) 8854 { 8855 $kid_count = count ( $this -> PageKids ) ; 8856 8857 // PDF files created short after the birth of Earth may have neither a page catalog nor page contents descriptions 8858 if ( ! count ( $this -> PageCatalogs ) ) 8859 { 8860 // Later, during Pleistocen, references to page kids started to appear... 8861 if ( $kid_count ) 8862 { 8863 foreach ( array_keys ( $this -> PageKids ) as $catalog ) 8864 $this -> MapKids ( $catalog, $current_page ) ; 8865 } 8866 else 8867 $this -> Pages [1] = array_keys ( $objects ) ; 8868 } 8869 // This is the ideal situation : there is a catalog that allows us to gather indirectly all page data 8870 else 8871 { 8872 $current_page = 1 ; 8873 8874 foreach ( $this -> PageCatalogs as $catalog ) 8875 { 8876 if ( isset ( $this -> PageKids [ $catalog ] ) ) 8877 $this -> MapKids ( $catalog, $current_page ) ; 8878 // Well, almost ideal : it may happen that the page catalog refers to a non-existing object : 8879 // in this case, we behave the same as if there were no page catalog at all : group everything 8880 // onto one page 8881 else 8882 $this -> Pages [1] = array_keys ( $objects ) ; 8883 } 8884 } 8885 } 8886 8887 8888 /*-------------------------------------------------------------------------------------------------------------- 8889 8890 NAME 8891 MapKids - Establishes a correspondance between page kids and a current page number. 8892 8893 PROTOTYPE 8894 $pagemap -> MapObjects ( $catalog, &$page ) ; 8895 8896 DESCRIPTION 8897 Tries to assign a page number to all page description objects that have been collected by the Peek() 8898 method. 8899 Also creates the Pages associative array, whose keys are page numbers and whose values are the ids of 8900 the objects that the page contains. 8901 8902 EXAMPLE 8903 The following example gives an overview of a possible layout for page catalogs ; it describes which 8904 objects contain what. 8905 Lines starting with "#x", where "x" is a number, stands for a PDF object definition, which will start 8906 with "x 0 obj" in the PDF file. 8907 Whenever numbers are referenced (other than those prefixed with a "#"), it means "reference to the 8908 specified object. 8909 For example, "54" will refer to object #54, and will be given as "54 0 R" in the PDF file. 8910 The numbers at the beginning of each line are just "step numbers", which will be referenced in the 8911 explanations after the example : 8912 8913 (01) #1 : /Type/Catalog /Pages 54 8914 (02) -> #54 : /Type/Pages /Kids[3 28 32 58] /Count 5 8915 (03) -> #3 : /Type/Page /Parent 54 /Contents[26] 8916 (04) -> #26 : page contents 8917 (05) -> #28 : /Type/Page /Parent 54 /Contents[30 100 101 102 103 104] 8918 (06) -> #30 : page contents 8919 (07) -> #32 : /Type/Page /Parent 54 /Contents[34] 8920 (08) -> #34 : page contents 8921 (09) -> #58 : /Type/Pages /Parent 54 /Count 2 /Kids[36 40] 8922 (10) -> #36 : /Type/Page /Parent 58 /Contents[38] 8923 (11) -> #38 : page contents 8924 (12) -> #40 : /Type/Page /Parent 58 /Contents[42] 8925 (13) -> #42 : page contents 8926 8927 Explanations : 8928 (01) Object #1 contains the page catalog ; it states that a further description of the page 8929 contents is given by object #54. 8930 Note that it could reference multiple page descriptions, such as : /Pages [54 68 99...] 8931 (although I did not met the case so far) 8932 (02) Object #54 in turn says that it as "kids", described by objects #3, #28, #32 and #58. It 8933 also says that it has 5 pages (/Count parameter) ; but wait... the /Kids parameter references 8934 4 objects while the /Count parameter states that we have 5 pages : what happens ? we will 8935 discover it in the explanations below. 8936 (03) Object #3 states that it is aimed for page description (/Type/Page) ; the page contents 8937 will be found in object #26, specified after the /Contents parameter. Note that here again, 8938 multiple objects could be referenced by the /Contents parameter but, in our case, there is 8939 only one, 26. Object #3 also says that its parent object (in the page catalog) is object 8940 #54, defined in (01). 8941 Since this is the first page we met, it will have page number 1. 8942 (04) ... object #26 contains the Postscript instructions to draw page #1 8943 (05) Object #28 has the same type as #3 ; its page contents can be located in object #30 (06) 8944 The same applies for object #32 (07), whose page contents are given by object #34 (08). 8945 So, (05) and (07) will be pages 2 and 3, respectively. 8946 (09) Now, it starts to become interesting : object #58 does not directly lead to an object 8947 containing Postscript instructions as did objects #3, #28 and #32 whose parent is #54, but 8948 to yet another page catalog which contains 2 pages (/Count 2), described by objects #36 and 8949 #40. It's not located at the same position as object #54 in the hierarchy, so it shows that 8950 page content descriptions can be recursively nested. 8951 (10) Object #36 says that we will find the page contents in object #38 (which will be page 4) 8952 (12) ... and object #40 says that we will find the page contents in object #42 (and our final 8953 page, 5) 8954 8955 *-------------------------------------------------------------------------------------------------------------*/ 8956 protected function MapKids ( $catalog, &$page ) 8957 { 8958 if ( ! isset ( $this -> PageKids [ $catalog ] ) ) 8959 return ; 8960 8961 $entry = $this -> PageKids [ $catalog ] ; 8962 8963 // The PDF file contains an object containing a /Type/Pages/Kids[] construct, specified by another object containing a 8964 // /Type/Catalog/Pages construct : we will rely on its contents to find which page contains what 8965 if ( isset ( $this -> PageContents [ $entry [ 'kids' ] [0] ] ) ) 8966 { 8967 foreach ( $entry [ 'kids' ] as $item ) 8968 { 8969 // Some objects given by a /Page /Contents[] construct do not directly lead to an object describing PDF contents, 8970 // but rather to an object containing in turn a /Pages /Kids[] construct ; this adds a level of indirection, and 8971 // we have to recursively process it 8972 if ( isset ( $this -> PageKids [ $item ] ) ) 8973 { 8974 $this -> MapKids ( $item, $page ) ; 8975 } 8976 // The referenced object actually defines page contents (no indirection) 8977 else 8978 { 8979 $this -> PageContents [ $item ] [ 'page' ] = $page ; 8980 $this -> Pages [ $page ] = ( isset ( $this -> PageContents [ $item ] [ 'contents' ] ) ) ? 8981 $this -> PageContents [ $item ] [ 'contents' ] : array ( ) ; 8982 if ( isset ( $this -> PageContents [ $item ] [ 'width' ] ) ) 8983 { 8984 $this -> PageAttributes [ $page ] = array 8985 ( 8986 'width' => $this -> PageContents [ $item ] [ 'width' ], 8987 'height' => $this -> PageContents [ $item ] [ 'height' ] 8988 ) ; 8989 } 8990 8991 $page ++ ; 8992 } 8993 } 8994 } 8995 // No page catalog at all : consider everything is on the same page (this class does not use the WheresMyCrystalBall trait) 8996 else 8997 { 8998 foreach ( $entry [ 'kids' ] as $kid ) 8999 $this -> MapKids ( $kid, $page ) ; 9000 } 9001 } 9002 9003 9004 /*-------------------------------------------------------------------------------------------------------------- 9005 9006 NAME 9007 GetMappedFonts - Retrieves the mapped fonts per page 9008 9009 PROTOTYPE 9010 $array = $pagemap -> GetMappedFonts ( ) ; 9011 9012 DESCRIPTION 9013 Gets the mapped fonts, per page. XObjects are traversed, to retrieved additional font aliases defined 9014 by them. 9015 This function is used by the PdfTexter class to add additional entries to the FontMap object, 9016 ensuring that each reference to a font remains local to a page. 9017 9018 RETURN VALUE 9019 Returns an array of associative arrays which have the following entries : 9020 - 'page' : 9021 Page number. 9022 - 'xobject-name' : 9023 XObject name, that can define further font aliases. This entry is set to the empty string for 9024 global font aliases. 9025 - 'font-name' : 9026 Font name (eg, "/F1", "/C1_0", etc.). 9027 - 'object' : 9028 Object defining the font attributes, such as character map, etc. 9029 9030 *-------------------------------------------------------------------------------------------------------------*/ 9031 public function GetMappedFonts ( ) 9032 { 9033 $mapped_fonts = array ( ) ; 9034 $current_page = 0 ; 9035 9036 foreach ( $this -> PageCatalogs as $catalog ) 9037 { 9038 if ( ! isset ( $this -> PageKids [ $catalog ] ) ) 9039 continue ; 9040 9041 foreach ( $this -> PageKids [ $catalog ] [ 'kids' ] as $page_object ) 9042 { 9043 $current_page ++ ; 9044 9045 if ( isset ( $this -> PageContents [ $page_object ] ) ) 9046 { 9047 $page_contents = $this -> PageContents [ $page_object ] ; 9048 $associations = array ( ) ; 9049 9050 if ( isset ( $page_contents [ 'fonts' ] ) ) 9051 { 9052 foreach ( $page_contents [ 'fonts' ] as $font_name => $font_object ) 9053 { 9054 $mapped_fonts [] = array 9055 ( 9056 'page' => $current_page, 9057 'xobject-name' => '', 9058 'font-name' => $font_name, 9059 'object' => $font_object 9060 ) ; 9061 9062 $associations [ ":$font_name" ] = $font_object ; 9063 9064 $this -> __map_recursive ( $current_page, $page_contents [ 'xobjects' ], $mapped_fonts, $associations ) ; 9065 } 9066 } 9067 } 9068 } 9069 } 9070 9071 return ( $mapped_fonts ) ; 9072 } 9073 9074 9075 // __map_recursive - 9076 // Recursively collects font aliases for XObjects. 9077 private function __map_recursive ( $page_number, $xobjects, &$mapped_fonts, &$associations ) 9078 { 9079 foreach ( $xobjects as $xobject_name => $xobject_value ) 9080 { 9081 if ( isset ( $this -> PageContents [ $xobject_value ] ) ) 9082 { 9083 foreach ( $this -> PageContents [ $xobject_value ] [ 'fonts' ] as $font_name => $font_object ) 9084 { 9085 if ( ! isset ( $associations [ "$xobject_name:$font_name" ] ) ) 9086 { 9087 $mapped_fonts [] = array 9088 ( 9089 'page' => $page_number, 9090 'xobject-name' => $xobject_name, 9091 'font-name' => $font_name, 9092 'object' => $font_object 9093 ) ; 9094 9095 $associations [ "$xobject_name:$font_name" ] = $font_object ; 9096 } 9097 } 9098 9099 $this -> XObjectNames [ $xobject_name ] = 1 ; 9100 $this -> __map_recursive ( $page_number, $this -> PageContents [ $xobject_value ] [ 'xobjects' ], $mapped_fonts, $associations ) ; 9101 } 9102 } 9103 } 9104 9105 9106 9107 /*-------------------------------------------------------------------------------------------------------------- 9108 9109 NAME 9110 IsValidXObject - Checks if the specified object is a valid XObject. 9111 9112 PROTOTYPE 9113 $status = $pagemap -> IsValidXObjectName ( $name ) ; 9114 9115 DESCRIPTION 9116 Checks if the specified name is a valid XObject defining its own set of font aliases. 9117 9118 PARAMETERS 9119 $name (string) - 9120 Name of the XObject to be checked. 9121 9122 RETURN VALUE 9123 Returns true if the specified XObject exists and defines its own set of font aliases, false otherwise. 9124 9125 *-------------------------------------------------------------------------------------------------------------*/ 9126 public function IsValidXObjectName ( $name ) 9127 { return ( isset ( $this -> XObjectNames [ $name ] ) ) ; } 9128 } 9129 9130 9131/************************************************************************************************************** 9132 ************************************************************************************************************** 9133 ************************************************************************************************************** 9134 ****** ****** 9135 ****** ****** 9136 ****** IMAGE MANAGEMENT ****** 9137 ****** ****** 9138 ****** ****** 9139 ************************************************************************************************************** 9140 ************************************************************************************************************** 9141 **************************************************************************************************************/ 9142 9143/*============================================================================================================== 9144 9145 class PdfImage - 9146 Holds image data coming from pdf. 9147 9148 ==============================================================================================================*/ 9149abstract class PdfImage extends PdfObjectBase 9150 { 9151 // Image resource that can be used to process image data, using the php imagexxx() functions 9152 public $ImageResource = false ; 9153 // Original image data 9154 protected $ImageData ; 9155 // Tells if the image resource has been created - false when the autosave feature is on and the image is pure JPEG data 9156 protected $NoResourceCreated ; 9157 9158 9159 /*-------------------------------------------------------------------------------------------------------------- 9160 9161 CONSTRUCTOR 9162 Creates a PdfImage object with a resource that can be used with imagexxx() php functions. 9163 9164 *-------------------------------------------------------------------------------------------------------------*/ 9165 public function __construct ( $image_data, $no_resource_created = false ) 9166 { 9167 $this -> ImageData = $image_data ; 9168 $this -> NoResourceCreated = $no_resource_created ; 9169 9170 if ( ! $no_resource_created ) 9171 $this -> ImageResource = $this -> CreateImageResource ( $image_data ) ; 9172 } 9173 9174 9175 /*-------------------------------------------------------------------------------------------------------------- 9176 9177 DESTRUCTOR 9178 Destroys the associated image resource. 9179 9180 *-------------------------------------------------------------------------------------------------------------*/ 9181 public function __destruct ( ) 9182 { 9183 $this -> DestroyImageResource ( ) ; 9184 } 9185 9186 9187 /*-------------------------------------------------------------------------------------------------------------- 9188 9189 NAME 9190 CreateImageResource - creates an image resource from the supplied image data. 9191 9192 PROTOTYPE 9193 $resource = $this -> CreateImageResource ( $data ) ; 9194 9195 DESCRIPTION 9196 Creates an image resource from the supplied image data. 9197 Whatever the input format, the internal format will be the one used by the gd library. 9198 9199 PARAMETERS 9200 $data (string) - 9201 Image data. 9202 9203 *-------------------------------------------------------------------------------------------------------------*/ 9204 abstract protected function CreateImageResource ( $image_data ) ; 9205 9206 9207 /*-------------------------------------------------------------------------------------------------------------- 9208 9209 NAME 9210 DestroyImageResource - Destroys the allocated image resource. 9211 9212 PROTOTYPE 9213 $this -> DestroyImageResource ( ) ; 9214 9215 DESCRIPTION 9216 Destroys the allocated image resource, using the libgd imagedestroy() function. This method can be 9217 overridden by derived class if the underlying image resource does not come from the gd lib. 9218 9219 *-------------------------------------------------------------------------------------------------------------*/ 9220 protected function DestroyImageResource ( ) 9221 { 9222 if ( $this -> ImageResource ) 9223 imagedestroy ( $this -> ImageResource ) ; 9224 } 9225 9226 9227 /*-------------------------------------------------------------------------------------------------------------- 9228 9229 NAME 9230 SaveAs - Saves the current image to a file. 9231 9232 PROTOTYPE 9233 $pdfimage -> SaveAs ( $output_file, $image_type = IMG_JPEG ) ; 9234 9235 DESCRIPTION 9236 Saves the current image resource to the specified output file, in the specified format. 9237 9238 PARAMETERS 9239 $output_file (string) - 9240 Output filename. 9241 9242 $image_type (integer) - 9243 Output format. Can be any of the predefined php constants IMG_*. 9244 9245 *-------------------------------------------------------------------------------------------------------------*/ 9246 public function SaveAs ( $output_file, $image_type = IMG_JPEG ) 9247 { 9248 if ( ! $this -> ImageResource ) 9249 { 9250 if ( $this -> NoResourceCreated && $image_type == IMG_JPEG ) 9251 file_put_contents ( $output_file, $this -> ImageData ) ; 9252 else if ( PdfToText::$DEBUG ) 9253 warning ( new PdfToTextDecodingException ( "No image resource allocated." ) ) ; 9254 9255 return ; 9256 } 9257 9258 $image_types = imagetypes ( ) ; 9259 9260 switch ( $image_type ) 9261 { 9262 case IMG_JPEG : 9263 case IMG_JPG : 9264 if ( ! ( $image_types & IMG_JPEG ) && ! ( $image_types & IMG_JPG ) ) 9265 error ( new PdfToTextDecodingException ( "Your current PHP version does not support JPG images." ) ) ; 9266 9267 imagejpeg ( $this -> ImageResource, $output_file, 100 ) ; 9268 break ; 9269 9270 case IMG_GIF : 9271 if ( ! ( $image_types & IMG_GIF ) ) 9272 error ( new PdfToTextDecodingException ( "Your current PHP version does not support GIF images." ) ) ; 9273 9274 imagegif ( $this -> ImageResource, $output_file ) ; 9275 break ; 9276 9277 case IMG_PNG : 9278 if ( ! ( $image_types & IMG_PNG ) ) 9279 error ( new PdfToTextDecodingException ( "Your current PHP version does not support PNG images." ) ) ; 9280 9281 imagepng ( $this -> ImageResource, $output_file, 0 ) ; 9282 break ; 9283 9284 case IMG_WBMP : 9285 if ( ! ( $image_types & IMG_WBMP ) ) 9286 error ( new PdfToTextDecodingException ( "Your current PHP version does not support WBMP images." ) ) ; 9287 9288 imagewbmp ( $this -> ImageResource, $output_file ) ; 9289 break ; 9290 9291 case IMG_XPM : 9292 if ( ! ( $image_types & IMG_XPM ) ) 9293 error ( new PdfToTextDecodingException ( "Your current PHP version does not support XPM images." ) ) ; 9294 9295 imagexbm ( $this -> ImageResource, $output_file ) ; 9296 break ; 9297 9298 default : 9299 error ( new PdfToTextDecodingException ( "Unknown image type #$image_type." ) ) ; 9300 } 9301 } 9302 9303 9304 public function Output ( ) 9305 { 9306 $this -> SaveAs ( null ) ; 9307 } 9308 } 9309 9310 9311 9312/*============================================================================================================== 9313 9314 class PdfJpegImage - 9315 Handles encoded JPG images. 9316 9317 ==============================================================================================================*/ 9318class PdfJpegImage extends PdfImage 9319 { 9320 public function __construct ( $image_data, $autosave ) 9321 { 9322 parent::__construct ( $image_data, $autosave ) ; 9323 } 9324 9325 9326 protected function CreateImageResource ( $image_data ) 9327 { 9328 return ( imagecreatefromstring ( $image_data ) ) ; 9329 } 9330 } 9331 9332 9333/*============================================================================================================== 9334 9335 class PdfInlinedImage - 9336 Decodes raw image data in objects having the /FlateDecode flag. 9337 9338 ==============================================================================================================*/ 9339class PdfInlinedImage extends PdfImage 9340 { 9341 // Supported color schemes 9342 const COLOR_SCHEME_RGB = 1 ; 9343 const COLOR_SCHEME_CMYK = 2 ; 9344 const COLOR_SCHEME_GRAY = 3 ; 9345 9346 // Color scheme names, for debugging only 9347 private static $DecoderNames = array 9348 ( 9349 self::COLOR_SCHEME_RGB => 'RGB', 9350 self::COLOR_SCHEME_CMYK => 'CMYK', 9351 self::COLOR_SCHEME_GRAY => 'Gray' 9352 ) ; 9353 9354 // Currently implemented image decoders 9355 private static $Decoders = array 9356 ( 9357 self::COLOR_SCHEME_RGB => array 9358 ( 9359 8 => '__decode_rgb8' 9360 ), 9361 self::COLOR_SCHEME_GRAY => array 9362 ( 9363 8 => '__decode_gray8' 9364 ), 9365 self::COLOR_SCHEME_CMYK => array 9366 ( 9367 8 => '__decode_cmyk8' 9368 ), 9369 ) ; 9370 9371 // Image width and height 9372 public $Width, 9373 $Height ; 9374 // Color scheme 9375 public $ColorScheme ; 9376 // Number of bits per color component 9377 public $BitsPerComponent ; 9378 // Decoding function, varying upon the supplied image type 9379 public $DecodingFunction = false ; 9380 9381 9382 /*-------------------------------------------------------------------------------------------------------------- 9383 9384 NAME 9385 Constructor - Builds an image from the supplied data. 9386 9387 PROTOTYPE 9388 $image = new PdfInlinedImage ( $image_data, $width, $height, $bits_per_component, $color_scheme ) ; 9389 9390 DESCRIPTION 9391 Builds an image from the supplied data. Checks that the image flags are supported. 9392 9393 PARAMETERS 9394 $image_data (string) - 9395 Uncompressed image data. 9396 9397 $width (integer) - 9398 Image width, in pixels. 9399 9400 $height (integer) - 9401 Image height, in pixels. 9402 9403 $bits_per_components (integer) - 9404 Number of bits per color component. 9405 9406 $color_scheme (integer) - 9407 One of the COLOR_SCHEME_* constants, specifying the initial data format. 9408 9409 NOTES 9410 Processed images are always converted to JPEG format. 9411 9412 *-------------------------------------------------------------------------------------------------------------*/ 9413 public function __construct ( $image_data, $width, $height, $bits_per_component, $color_scheme ) 9414 { 9415 $this -> Width = $width ; 9416 $this -> Height = $height ; 9417 $this -> BitsPerComponent = $bits_per_component ; 9418 $this -> ColorScheme = $color_scheme ; 9419 9420 // Check that we have a decoding function for the supplied parameters 9421 if ( isset ( self::$Decoders [ $color_scheme ] ) ) 9422 { 9423 if ( isset ( self::$Decoders [ $color_scheme ] [ $bits_per_component ] ) ) 9424 $this -> DecodingFunction = self::$Decoders [ $color_scheme ] [ $bits_per_component ] ; 9425 else 9426 error ( new PdfToTextDecodingException ( "No decoding function has been implemented for image objects having the " . 9427 self::$DecoderNames [ $color_scheme ] . " color scheme with $bits_per_component bits per color component." ) ) ; 9428 } 9429 else 9430 error ( new PdfToTextDecodingException ( "Unknown color scheme $color_scheme." ) ) ; 9431 9432 parent::__construct ( $image_data ) ; 9433 } 9434 9435 9436 /*-------------------------------------------------------------------------------------------------------------- 9437 9438 NAME 9439 CreateInstance - Creates an appropriate instance of a PdfImage class. 9440 9441 PROTOTYPE 9442 $image = PdfInlinedImage ( $stream_data, $object_data ) ; 9443 9444 DESCRIPTION 9445 Creates an instance of either : 9446 - A PdfJpegImage class, if the image specifications in $object_data indicate that the compressed stream 9447 contents are only JPEG data 9448 - A PdfInlinedImage class, if the image specifications state that the compressed stream data contain 9449 only color values. 9450 9451 The class currently supports (in $stream_data) : 9452 - Pure JPEG contents 9453 - RGB values 9454 - CMYK values 9455 - Gray scale values (in the current version, the resulting image does not correctly reproduce the 9456 initial colors, if interpolation is to be used). 9457 9458 PARAMETERS 9459 $stream_data (string) - 9460 Compressed image data. 9461 9462 $object_data (string) - 9463 Object containing the stream data. 9464 9465 RETURN VALUE 9466 Returns : 9467 - A PdfJpegImage object, if the stream data contains only pure JPEG contents 9468 - A PdfInlinedImage object, in other cases. 9469 - False if the supplied image data is not currently supported. 9470 9471 *-------------------------------------------------------------------------------------------------------------*/ 9472 public static function CreateInstance ( $stream_data, $object_data, $autosave ) 9473 { 9474 // Remove stream data from the supplied object data, to speed up the searches below 9475 $index = strpos ( $object_data, 'stream' ) ; 9476 9477 if ( $index !== false ) 9478 $object_data = substr ( $object_data, 0, $index ) ; 9479 9480 // Uncompress stream data 9481 $image_data = gzuncompress ( $stream_data ) ; 9482 9483 // The /DCTDecode flag indicates JPEG contents - returns a PdfJpegImage object 9484 if ( stripos ( $object_data, '/DCTDecode' ) ) 9485 return ( new PdfJpegImage ( $image_data, $autosave ) ) ; 9486 9487 // Get the image width & height 9488 $match = null ; 9489 preg_match ( '#/Width \s+ (?P<value> \d+)#ix', $object_data, $match ) ; 9490 $width = ( integer ) $match [ 'value' ] ; 9491 9492 $match = null ; 9493 preg_match ( '#/Height \s+ (?P<value> \d+)#ix', $object_data, $match ) ; 9494 $height = ( integer ) $match [ 'value' ] ; 9495 9496 // Get the number of bits per color component 9497 $match = null ; 9498 preg_match ( '#/BitsPerComponent \s+ (?P<value> \d+)#ix', $object_data, $match ) ; 9499 $bits_per_component = ( integer ) $match [ 'value' ] ; 9500 9501 // Get the target color space 9502 // Sometimes, this refers to an object in the PDF file, which can also be embedded in a compound object 9503 // We don't handle such cases for now 9504 $match = null ; 9505 preg_match ( '#/ColorSpace \s* / (?P<value> \w+)#ix', $object_data, $match ) ; 9506 9507 if ( ! isset ( $match [ 'value' ] ) ) 9508 return ( false ) ; 9509 9510 $color_space_name = $match [ 'value' ] ; 9511 9512 // Check that we are able to handle the specified color space 9513 switch ( strtolower ( $color_space_name ) ) 9514 { 9515 case 'devicergb' : 9516 $color_space = self::COLOR_SCHEME_RGB ; 9517 break ; 9518 9519 case 'devicegray' : 9520 $color_space = self::COLOR_SCHEME_GRAY ; 9521 break ; 9522 9523 case 'devicecmyk' : 9524 $color_space = self::COLOR_SCHEME_CMYK ; 9525 break ; 9526 9527 default : 9528 if ( PdfToText::$DEBUG ) 9529 warning ( new PdfToTextDecodingException ( "Unsupported color space \"$color_space_name\"." ) ) ; 9530 9531 return ( false ) ; 9532 } 9533 9534 // Also check that we can handle the specified number of bits per component 9535 switch ( $bits_per_component ) 9536 { 9537 case 8 : 9538 break ; 9539 9540 default : 9541 if ( PdfToText::$DEBUG ) 9542 warning ( new PdfToTextDecodingException ( "Unsupported bits per component : $bits_per_component." ) ) ; 9543 9544 return ( false ) ; 9545 } 9546 9547 // All done, return a PdfInlinedImage object 9548 return ( new PdfInlinedImage ( $image_data, $width, $height, $bits_per_component, $color_space ) ) ; 9549 } 9550 9551 9552 /*-------------------------------------------------------------------------------------------------------------- 9553 9554 NAME 9555 CreateImageResource - Creates the image resource. 9556 9557 PROTOTYPE 9558 $resource = $image -> CreateImageResource ( $image_data ) ; 9559 9560 DESCRIPTION 9561 Creates a GD image according to the supplied image data, and the parameters supplied to the class 9562 constructor. 9563 9564 PARAMETERS 9565 $image_data (string) - 9566 Image to be decoded. 9567 9568 RETURN VALUE 9569 Returns a GD graphics resource in true color, or false if there is currently no implemented decoding 9570 function for this kind of images. 9571 9572 *-------------------------------------------------------------------------------------------------------------*/ 9573 protected function CreateImageResource ( $image_data ) 9574 { 9575 $decoder = $this -> DecodingFunction ; 9576 9577 if ( $decoder ) 9578 return ( $this -> $decoder ( $image_data ) ) ; 9579 else 9580 return ( false ) ; 9581 } 9582 9583 9584 /*-------------------------------------------------------------------------------------------------------------- 9585 9586 Decoding functions. 9587 9588 *-------------------------------------------------------------------------------------------------------------*/ 9589 9590 // __decode_rgb8 - 9591 // Decodes image data consisting of 8-bits RGB values (one byte for each color component). 9592 private function __decode_rgb8 ( $data ) 9593 { 9594 $data_length = strlen ( $data ) ; 9595 $colors = array ( ) ; 9596 $width = $this -> Width ; 9597 $height = $this -> Height ; 9598 $image = imagecreatetruecolor ( $width, $height ) ; 9599 9600 for ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i + 3 <= $data_length ; $i += 3, $pixel_x ++ ) 9601 { 9602 $red = ord ( $data [$i] ) ; 9603 $green = ord ( $data [$i+1] ) ; 9604 $blue = ord ( $data [$i+2] ) ; 9605 9606 $color = ( $red << 16 ) | ( $green << 8 ) | ( $blue ) ; 9607 9608 if ( isset ( $colors [ $color ] ) ) 9609 $pixel_color = $colors [ $color ] ; 9610 else 9611 { 9612 $pixel_color = imagecolorallocate ( $image, $red, $green, $blue ) ; 9613 $colors [ $color ] = $pixel_color ; 9614 } 9615 9616 if ( $pixel_x >= $width ) 9617 { 9618 $pixel_x = 0 ; 9619 $pixel_y ++ ; 9620 } 9621 9622 imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ; 9623 } 9624 9625 return ( $image ) ; 9626 } 9627 9628 9629 // __decode_cmyk8 - 9630 // Decodes image data consisting of 8-bits CMYK values (one byte for each color component). 9631 private function __decode_cmyk8 ( $data ) 9632 { 9633 $data_length = strlen ( $data ) ; 9634 $colors = array ( ) ; 9635 $width = $this -> Width ; 9636 $height = $this -> Height ; 9637 $image = imagecreatetruecolor ( $width, $height ) ; 9638 9639 for ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i + 4 <= $data_length ; $i += 4, $pixel_x ++ ) 9640 { 9641 $cyan = ord ( $data [$i] ) ; 9642 $magenta = ord ( $data [$i+1] ) ; 9643 $yellow = ord ( $data [$i+2] ) ; 9644 $black = ord ( $data [$i+3] ) ; 9645 9646 $color = ( $cyan << 24 ) | ( $magenta << 16 ) | ( $yellow << 8 ) | ( $black ) ; 9647 9648 if ( isset ( $colors [ $color ] ) ) 9649 $pixel_color = $colors [ $color ] ; 9650 else 9651 { 9652 $rgb = $this -> __convert_cmyk_to_rgb ( $cyan, $magenta, $yellow, $black ) ; 9653 $pixel_color = imagecolorallocate ( $image, $rgb [0], $rgb [1], $rgb [2] ) ; 9654 $colors [ $color ] = $pixel_color ; 9655 } 9656 9657 if ( $pixel_x >= $width ) 9658 { 9659 $pixel_x = 0 ; 9660 $pixel_y ++ ; 9661 } 9662 9663 imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ; 9664 } 9665 9666 return ( $image ) ; 9667 } 9668 9669 9670 // __decode_gray8 - 9671 // Decodes image data consisting of 8-bits gray values. 9672 private function __decode_gray8 ( $data ) 9673 { 9674 $data_length = strlen ( $data ) ; 9675 $colors = array ( ) ; 9676 $width = $this -> Width ; 9677 $height = $this -> Height ; 9678 $image = imagecreatetruecolor ( $width, $height ) ; 9679 9680 for ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i < $data_length ; $i ++, $pixel_x ++ ) 9681 { 9682 $color = ord ( $data [$i] ) ; 9683 9684 if ( isset ( $colors [ $color ] ) ) 9685 $pixel_color = $colors [ $color ] ; 9686 else 9687 { 9688 $pixel_color = imagecolorallocate ( $image, $color, $color, $color ) ; 9689 $colors [ $color ] = $pixel_color ; 9690 } 9691 9692 if ( $pixel_x >= $width ) 9693 { 9694 $pixel_x = 0 ; 9695 $pixel_y ++ ; 9696 } 9697 9698 imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ; 9699 } 9700 9701 return ( $image ) ; 9702 } 9703 9704 9705 /*-------------------------------------------------------------------------------------------------------------- 9706 9707 Support functions. 9708 9709 *-------------------------------------------------------------------------------------------------------------*/ 9710 9711 // __convert_cmyk_to_rgb - 9712 // Converts CMYK color value to RGB. 9713 private function __convert_cmyk_to_rgb ( $C, $M, $Y, $K ) 9714 { 9715 if ( $C > 1 || $M > 1 || $Y > 1 || $K > 1 ) 9716 { 9717 $C /= 100.0 ; 9718 $M /= 100.0 ; 9719 $Y /= 100.0 ; 9720 $K /= 100.0 ; 9721 } 9722 9723 $R = ( 1 - $C * ( 1 - $K ) - $K ) * 256 ; 9724 $G = ( 1 - $M * ( 1 - $K ) - $K ) * 256 ; 9725 $B = ( 1 - $Y * ( 1 - $K ) - $K ) * 256 ; 9726 9727 $result = array ( round ( $R ), round ( $G ), round ( $B ) ) ; 9728 9729 return ( $result ) ; 9730 } 9731 } 9732 9733 9734/*============================================================================================================== 9735 9736 class PdfFaxImage - 9737 Handles encoded CCITT Fax images. 9738 9739 ==============================================================================================================*/ 9740class PdfFaxImage extends PdfImage 9741 { 9742 public function __construct ( $image_data ) 9743 { 9744 parent::__construct ( $image_data ) ; 9745 } 9746 9747 9748 protected function CreateImageResource ( $image_data ) 9749 { 9750 warning ( new PdfToTextDecodingException ( "Decoding of CCITT Fax image format is not yet implemented." ) ) ; 9751 //return ( imagecreatefromstring ( $image_data ) ) ; 9752 } 9753 } 9754 9755 9756/************************************************************************************************************** 9757 ************************************************************************************************************** 9758 ************************************************************************************************************** 9759 ****** ****** 9760 ****** ****** 9761 ****** ENCRYPTION MANAGEMENT ****** 9762 ****** ****** 9763 ****** ****** 9764 ************************************************************************************************************** 9765 ************************************************************************************************************** 9766 **************************************************************************************************************/ 9767 9768/*============================================================================================================== 9769 9770 class EncryptionData - 9771 Holds encryption data and allows for decryption. 9772 9773 ==============================================================================================================*/ 9774class PdfEncryptionData extends PdfObjectBase 9775 { 9776 // Encryption modes 9777 const PDFMODE_UNKNOWN = 0 ; 9778 const PDFMODE_STANDARD = 1 ; 9779 9780 // Encryption algorithms 9781 const PDFCRYPT_ALGORITHM_RC4 = 0 ; 9782 const PDFCRYPT_ALGORITHM_AES = 1 ; 9783 const PDFCRYPT_ALGORITHM_AES256 = 2 ; 9784 9785 // A 32-bytes hardcoded padding used when computing encryption keys 9786 const PDF_ENCRYPTION_PADDING = "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A" ; 9787 9788 // Permission bits for encrypted files. Comments come from the PDF specification 9789 const PDFPERM_PRINT = 0x0004 ; // bit 3 : 9790 // (Revision 2) Print the document. 9791 // (Revision 3 or greater) Print the document (possibly not at the highest quality level, 9792 // depending on whether bit 12 is also set). 9793 const PDFPERM_MODIFY = 0x0008 ; // bit 4 : 9794 // Modify the contents of the document by operations other than those controlled by bits 6, 9, and 11. 9795 const PDFPERM_COPY = 0x0010 ; // bit 5 : 9796 // (Revision 2) Copy or otherwise extract text and graphics from the document, including extracting text 9797 // and graphics (in support of accessibility to users with disabilities or for other purposes). 9798 // (Revision 3 or greater) Copy or otherwise extract text and graphics from the document by operations 9799 // other than that controlled by bit 10. 9800 const PDFPERM_MODIFY_EXTRA = 0x0020 ; // bit 6 : 9801 // Add or modify text annotations, fill in interactive form fields, and, if bit 4 is also set, 9802 // create or modify interactive form fields (including signature fields). 9803 const PDFPERM_FILL_FORM = 0x0100 ; // bit 9 : 9804 // (Revision 3 or greater) Fill in existing interactive form fields (including signature fields), 9805 // even if bit 6 is clear. 9806 const PDFPERM_EXTRACT = 0x0200 ; // bit 10 : 9807 // (Revision 3 or greater) Fill in existing interactive form fields (including signature fields), 9808 // even if bit 6 is clear. 9809 const PDFPERM_ASSEMBLE = 0x0400 ; // bit 11 : 9810 // (Revision 3 or greater) Assemble the document (insert, rotate, or delete pages and create bookmarks 9811 // or thumbnail images), even if bit 4 is clear. 9812 const PDFPERM_HIGH_QUALITY_PRINT = 0x0800 ; // bit 12 : 9813 // (Revision 3 or greater) Print the document to a representation from which a faithful digital copy of 9814 // the PDF content could be generated. When this bit is clear (and bit 3 is set), printing is limited to 9815 // a low-level representation of the appearance, possibly of degraded quality. 9816 9817 public $FileId ; // File ID, as specified by the /ID flag 9818 public $ObjectId ; // Object id and text contents 9819 private $ObjectData ; 9820 public $Mode ; // Encryption mode - currently, only the "Standard" keyword is accepted 9821 public $EncryptionAlgorithm ; // Encryption algorithm - one of the PDFCRYPT_* constants 9822 public $AlgorithmVersion, // Encryption algorithm version & revision 9823 $AlgorithmRevision ; 9824 public $Flags ; // Protection flags, when an owner password has been specified - one of the PDFPERM_* constants 9825 public $KeyLength ; // Encryption key length 9826 public $UserKey, // User and owner password keys 9827 $OwnerKey ; 9828 public $UserEncryptionString, // Not sure yet of the real usage of these ones 9829 $OwnerEncryptionString ; 9830 public $EncryptMetadata ; // True if metadata is also encrypted 9831 public $FileKeyLength ; // Key length / 5 9832 9833 protected $Decrypter ; // Decrypter object 9834 9835 private $UnsupportedEncryptionAlgorithm = false ; // True if the encryption algorithm used in the PDF file is not yet supported 9836 9837 9838 /************************************************************************************************************** 9839 9840 NAME 9841 Constructor 9842 9843 PROTOTYPE 9844 obj = new PdfEncryptionData ( $mode, $object_id, $object_data ) ; 9845 9846 DESCRIPTION 9847 Creates an instance of a PdfEncryptionData class, using the information parsed from the supplied object 9848 data. 9849 9850 PARAMETERS 9851 $mode (integer) - 9852 One of the PDFMODE_* constants. 9853 9854 $object_id (integer) - 9855 Id of the object containing enryption parameters. 9856 9857 $object_data (string) - 9858 Encryption parameters. 9859 9860 AUTHOR 9861 Christian Vigh, 03/2017. 9862 9863 HISTORY 9864 [Version : 1.0] [Date : 2017-03-14] [Author : CV] 9865 Initial version. 9866 9867 **************************************************************************************************************/ 9868 public function __construct ( $file_id, $mode, $object_id, $object_data ) 9869 { 9870 $this -> FileId = $file_id ; 9871 $this -> ObjectId = $object_id ; 9872 $this -> ObjectData = $object_data ; 9873 $this -> Mode = $mode ; 9874 9875 // Encryption algorithm version & revision 9876 preg_match ( '#/V \s+ (?P<value> \d+)#ix', $object_data, $algorithm_match ) ; 9877 $this -> AlgorithmVersion = ( integer ) $algorithm_match [ 'value' ] ; 9878 9879 preg_match ( '#/R \s+ (?P<value> \d+)#ix', $object_data, $algorithm_revision_match ) ; 9880 $this -> AlgorithmRevision = ( integer ) $algorithm_revision_match [ 'value' ] ; 9881 9882 // Encryption flags 9883 preg_match ( '#/P \s+ (?P<value> \-? \d+)#ix', $object_data, $flags_match ) ; 9884 $this -> Flags = ( integer) $flags_match [ 'value' ] ; 9885 9886 // Key length (40 bits, if not specified) 9887 if ( preg_match ( '#/Length \s+ (?P<value> \d+)#ix', $object_data, $key_length_match ) ) 9888 $this -> KeyLength = $key_length_match [ 'value' ] ; 9889 else 9890 $this -> KeyLength = 40 ; 9891 9892 // Owner and user passwords 9893 $this -> UserKey = $this -> GetStringParameter ( '/U', $object_data ) ; 9894 $this -> OwnerKey = $this -> GetStringParameter ( '/O', $object_data ) ; 9895 9896 // Owner and user encryption strings 9897 $this -> UserEncryptionString = $this -> GetStringParameter ( '/UE', $object_data ) ; 9898 $this -> OwnerEncryptionString = $this -> GetStringParameter ( '/OE', $object_data ) ; 9899 9900 // EncryptMetadata flag 9901 if ( preg_match ( '# /EncryptMetadata (?P<value> (true) | (1) | (false) | (0) )#imsx', $object_data, $encryption_match ) ) 9902 { 9903 if ( ! strcasecmp ( $encryption_match [ 'value' ], 'true' ) || ! strcasecmp ( $encryption_match [ 'value' ], 'false' ) ) 9904 $this -> EncryptMetadata = true ; 9905 else 9906 $this -> EncryptMetadata = false ; 9907 } 9908 else 9909 $this -> EncryptMetadata = false ; 9910 9911 // Now, try to determine the encryption algorithm to be used 9912 $user_key_length = strlen ( $this -> UserKey ) ; 9913 $owner_key_length = strlen ( $this -> OwnerKey ) ; 9914 $user_encryption_string_length = strlen ( $this -> UserEncryptionString ) ; 9915 $owner_encryption_string_length = strlen ( $this -> OwnerEncryptionString ) ; 9916 9917 $error_unhandled_version = false ; 9918 $error_unhandled_revision = false ; 9919 9920 switch ( $this -> AlgorithmVersion ) 9921 { 9922 case 1 : 9923 switch ( $this -> AlgorithmRevision ) 9924 { 9925 case 2 : 9926 if ( $user_key_length != 32 && $owner_key_length != 32 ) 9927 { 9928 if ( PdfToText::$DEBUG ) 9929 error ( new PdfToTextDecryptionException ( "Invalid user and/or owner key length ($user_key_length/$owner_key_length)", $object_id ) ) ; 9930 } 9931 9932 $this -> EncryptionAlgorithm = self::PDFCRYPT_ALGORITHM_RC4 ; 9933 $this -> FileKeyLength = 5 ; 9934 break ; 9935 9936 default : 9937 $error_unhandled_revision = true ; 9938 } 9939 break ; 9940 9941 default : 9942 $error_unhandled_version = true ; 9943 } 9944 9945 // Report unsupported versions/revisions 9946 if ( $error_unhandled_version || $error_unhandled_revision ) 9947 { 9948 if ( PdfToText::$DEBUG ) 9949 error ( new PdfToTextDecryptionException ( "Unsupported encryption algorithm version {$this -> AlgorithmVersion} revision {$this -> AlgorithmRevision}.", 9950 $object_id ) ) ; 9951 9952 $this -> UnSupportedEncryptionAlgorithm = true ; 9953 9954 return ; 9955 } 9956 9957 // Build the object key 9958 $this -> Decrypter = PdfDecryptionAlgorithm::GetInstance ( $this ) ; 9959 9960 if ( $this -> Decrypter === false ) 9961 { 9962 if ( PdfToText::$DEBUG ) 9963 warning ( new PdfToTextDecryptionException ( "Unsupported encryption algorithm #{$this -> EncryptionAlgorithm}, " . 9964 "version {$this -> AlgorithmVersion} revision {$this -> AlgorithmRevision}.", 9965 $object_id ) ) ; 9966 9967 $this -> UnsupportedEncryptionAlgorithm = true ; 9968 9969 return ; 9970 } 9971 //dump ( $this ) ; 9972 } 9973 9974 9975 /*-------------------------------------------------------------------------------------------------------------- 9976 9977 NAME 9978 GetInstance - Creates an instance of a PdfEncryptionData object. 9979 9980 PROTOTYPE 9981 $obj = PdfEncryptionData::GetInstance ( $object_id, $object_data ) ; 9982 9983 DESCRIPTION 9984 Returns an instance of encryption data 9985 9986 *-------------------------------------------------------------------------------------------------------------*/ 9987 public static function GetInstance ( $file_id, $object_id, $object_data ) 9988 { 9989 // Encryption mode 9990 if ( ! preg_match ( '#/Filter \s* / (?P<mode> \w+)#ix', $object_data, $object_data_match ) ) 9991 return (false ) ; 9992 9993 switch ( strtolower ( $object_data_match [ 'mode' ] ) ) 9994 { 9995 case 'standard' : 9996 $mode = self::PDFMODE_STANDARD ; 9997 break ; 9998 9999 default : 10000 if ( self::$DEBUG > 1 ) 10001 error ( new PdfToTextDecodingException ( "Unhandled encryption mode '{$object_data [ 'mode' ]}'", $object_id ) ) ; 10002 10003 return ( false ) ; 10004 10005 } 10006 10007 // Basic checks have been performed, return an instance of encryption data 10008 return ( new PdfEncryptionData ( $file_id, $mode, $object_id, $object_data ) ) ; 10009 } 10010 10011 10012 /*-------------------------------------------------------------------------------------------------------------- 10013 10014 NAME 10015 Decrypt - Decrypts object data. 10016 10017 PROTOTYPE 10018 $data = $this -> Decrypt ( $object_id, $object_data ) ; 10019 10020 DESCRIPTION 10021 Decrypts object data, when the PDF file is password-protected. 10022 10023 PARAMETERS 10024 $object_id (integer) - 10025 Pdf object number. 10026 10027 $object_data (string) - 10028 Object data. 10029 10030 RETURN VALUE 10031 Returns the decrypted object data, or false if the encrypted object could not be decrypted. 10032 10033 *-------------------------------------------------------------------------------------------------------------*/ 10034 public function Decrypt ( $object_id, $object_data ) 10035 { 10036 if ( $this -> UnsupportedEncryptionAlgorithm ) 10037 return ( false ) ; 10038 10039 return ( false ) ; 10040 //return ( $this -> Decrypter -> Decrypt ( $object_data ) ) ; 10041 //return ( "BT (coucou)Tj ET" ) ; 10042 } 10043 } 10044 10045 10046/*============================================================================================================== 10047 10048 class PdfDecryptionAlgorithm - 10049 Base class for algorithm decrypters. 10050 10051 ==============================================================================================================*/ 10052abstract class PdfDecryptionAlgorithm //extends Object 10053 { 10054 protected $EncryptionData ; 10055 protected $ObjectKey ; 10056 protected $ObjectKeyBytes ; 10057 protected $ObjectKeyLength ; 10058 10059 10060 public function __construct ( $encryption_data ) 10061 { 10062 $this -> EncryptionData = $encryption_data ; 10063 10064 $objkey = '' ; 10065 10066 for ( $i = 0 ; $i < $this -> EncryptionData -> FileKeyLength ; $i ++ ) 10067 $objkey .= $this -> EncryptionData -> FileId [$i] ; 10068 10069 $objkey .= chr ( ( $this -> EncryptionData -> ObjectId ) & 0xFF ) ; 10070 $objkey .= chr ( ( $this -> EncryptionData -> ObjectId >> 8 ) & 0xFF ) ; 10071 $objkey .= chr ( ( $this -> EncryptionData -> ObjectId >> 16 ) & 0xFF ) ; 10072 $objkey .= chr ( 0 ) ; // obj generation number & 0xFF 10073 $objkey .= chr ( 0 ) ; // obj generation number >> 8 & 0xFF 10074 10075 $md5 = md5 ( $objkey, true ) ; 10076 $this -> ObjectKey = $md5 ; 10077 $this -> ObjectKeyLength = 16 ; 10078 10079 $this -> ObjectKeyBytes = array ( ) ; 10080 10081 for ( $i = 0 ; $i < $this -> ObjectKeyLength ; $i ++ ) 10082 $this -> ObjectKeyBytes [] = ord ( $this -> ObjectKey [$i] ) ; 10083 } 10084 10085 10086 public static function GetInstance ( $encryption_data ) 10087 { 10088 switch ( $encryption_data -> EncryptionAlgorithm ) 10089 { 10090 case PdfEncryptionData::PDFCRYPT_ALGORITHM_RC4 : 10091 return ( new PdfRC4DecryptionAlgorithm ( $encryption_data ) ) ; 10092 10093 default : 10094 return ( false ) ; 10095 } 10096 } 10097 10098 10099 abstract public function Reset ( ) ; 10100 abstract public function Decrypt ( $data ) ; 10101 10102 } 10103 10104 10105/*============================================================================================================== 10106 10107 class PdfRC4DecryptionAlgorithm - 10108 A decrypter class for RC4 encoding. 10109 10110 ==============================================================================================================*/ 10111class PdfRC4DecryptionAlgorithm extends PdfDecryptionAlgorithm 10112 { 10113 private static $InitialState = false ; 10114 protected $State ; 10115 10116 10117 public function __construct ( $encryption_data ) 10118 { 10119 parent::__construct ( $encryption_data ) ; 10120 10121 if ( self::$InitialState === false ) 10122 self::$InitialState = range ( 0, 255 ) ; 10123 } 10124 10125 10126 public function Reset ( ) 10127 { 10128 $this -> State = self::$InitialState ; 10129 $index1 = 10130 $index2 = 0 ; 10131 10132 for ( $i = 0 ; $i < 256 ; $i ++ ) 10133 { 10134 $index2 = ( $this -> ObjectKeyBytes [ $index1 ] + $this -> State [$i] + $index2 ) & 0xFF ; 10135 10136 // Swap elements $index2 and $i from $State 10137 $x = $this -> State [$i] ; 10138 $this -> State [$i] = $this -> State [ $index2 ] ; 10139 $this -> State [ $index2 ] = $x ; 10140 10141 $index1 = ( $index1 + 1 ) % $this -> ObjectKeyLength ; 10142 } 10143 } 10144 10145 10146 public function Decrypt ( $data ) 10147 { 10148 $this -> Reset ( ) ; 10149 $length = strlen ( $data ) ; 10150 $x = 0 ; 10151 $y = 0 ; 10152 $result = '' ; 10153 10154 for ( $i = 0 ; $i < $length ; $i ++ ) 10155 { 10156 $ord = ord ( $data [$i] ) ; 10157 $x = ( $x + 1 ) & 0xFF ; 10158 $y = ( $this -> State [$x] + $y ) & 0xFF ; 10159 10160 $tx = $this -> State [$x] ; 10161 $ty = $this -> State [$y] ; 10162 10163 $this -> State [$x] = $ty ; 10164 $this -> State [$y] = $tx ; 10165 10166 $new_ord = $ord ^ $this -> State [ ( $tx + $ty ) & 0xFF ] ; 10167 $result .= chr ( $new_ord ) ; 10168 } 10169 10170 return ( $result ) ; 10171 } 10172 } 10173 10174 /* 10175static Guchar rc4DecryptByte(Guchar *state, Guchar *x, Guchar *y, Guchar c) { 10176 Guchar x1, y1, tx, ty; 10177 10178 x1 = *x = (*x + 1) % 256; 10179 y1 = *y = (state[*x] + *y) % 256; 10180 tx = state[x1]; 10181 ty = state[y1]; 10182 state[x1] = ty; 10183 state[y1] = tx; 10184 return c ^ state[(tx + ty) % 256]; 10185} 10186*/ 10187 10188 10189/************************************************************************************************************** 10190 ************************************************************************************************************** 10191 ************************************************************************************************************** 10192 ****** ****** 10193 ****** ****** 10194 ****** FORM DATA MANAGEMENT ****** 10195 ****** ****** 10196 ****** ****** 10197 ************************************************************************************************************** 10198 ************************************************************************************************************** 10199 **************************************************************************************************************/ 10200 10201 10202/*============================================================================================================== 10203 10204 class PdfToTextFormDefinitions - 10205 Analyzes a template XML file that describes PDF form data and maps PDF field names to human-readable 10206 names. 10207 The GetFormData() returns an object containing the mapped properties with their respective values. 10208 10209 ==============================================================================================================*/ 10210class PdftoTextFormDefinitions // extends Object 10211 implements ArrayAccess, Countable, IteratorAggregate 10212 { 10213 static private $ClassDefinitionCount = 0 ; 10214 10215 // Class name, as specified in the XML template 10216 protected $ClassName ; 10217 // Form definitions (a template may contain several versions of the same for definition) 10218 protected $Definitions ; 10219 // Form definitions coming from the PDF file 10220 protected $PdfDefinitions ; 10221 10222 10223 /*-------------------------------------------------------------------------------------------------------------- 10224 10225 Constructor - 10226 Parses the supplied XML template. 10227 10228 *-------------------------------------------------------------------------------------------------------------*/ 10229 public function __construct ( $xml_data, $pdf_xml_data ) 10230 { 10231 // Get PDF XML form data definitions 10232 $this -> __get_pdf_form_definitions ( $pdf_xml_data ) ; 10233 10234 // Create XML data from scratch, if none specified 10235 if ( ! $xml_data ) 10236 $xml_data = $this -> __create_default_xml_data ( $this -> PdfDefinitions ) ; 10237 10238 // Decode XML the hard way, without XSD 10239 $xml = simplexml_load_string ( $xml_data ) ; 10240 $root_entry = $xml -> getName ( ) ; 10241 $definitions = array ( ) ; 10242 $class_name = "PdfFormData" ; 10243 10244 if ( strcasecmp ( $root_entry, "forms" ) ) 10245 error ( new PdfToTextFormException ( "Root entry must be <forms>, <$root_entry> was found." ) ) ; 10246 10247 // Get the attribute values of the <forms> tag 10248 foreach ( $xml -> attributes ( ) as $attribute_name => $attribute_value ) 10249 { 10250 switch ( strtolower ( $attribute_name ) ) 10251 { 10252 case 'class' : 10253 $class_name = ( string ) $attribute_value ; 10254 10255 if ( class_exists ( $class_name, false ) ) 10256 error ( new PdfToTextFormException ( "Class \"$class_name\" specified in XML template already exists." ) ) ; 10257 10258 break ; 10259 10260 default : 10261 error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <forms> tag." ) ) ; 10262 } 10263 } 10264 10265 // Don't know if it will be useful, but try to avoid class name collisions by appending a sequential number if necessary 10266 if ( class_exists ( $class_name, false ) ) 10267 { 10268 self::$ClassDefinitionCount ++ ; 10269 $class_name .= '_' . self::$ClassDefinitionCount ; 10270 } 10271 10272 // Loop through each child <form> entry 10273 foreach ( $xml -> children ( ) as $child ) 10274 { 10275 $child_name = $child -> getName ( ) ; 10276 10277 switch ( strtolower ( $child_name ) ) 10278 { 10279 case 'form' : 10280 $definitions [] = new PdfToTextFormDefinition ( $class_name, $child, $this -> PdfDefinitions ) ; 10281 break ; 10282 10283 default : 10284 error ( new PdfToTextFormException ( "Invalid tag <$child_name>." ) ) ; 10285 } 10286 } 10287 10288 // Ensure that there is at least one form definition 10289 if ( ! count ( $definitions ) ) 10290 error ( new PdfToTextFormException ( "No <form> definition found." ) ) ; 10291 10292 // Save to properties 10293 $this -> ClassName = $class_name ; 10294 $this -> Definitions = $definitions ; 10295 } 10296 10297 10298 /*-------------------------------------------------------------------------------------------------------------- 10299 10300 Internal methods. 10301 10302 *-------------------------------------------------------------------------------------------------------------*/ 10303 10304 // __get_pdf_form_definitions - 10305 // Retrieves the form field definitions coming from the PDF file. 10306 private function __get_pdf_form_definitions ( $pdf_data ) 10307 { 10308 preg_match_all ( '#(?P<field> <field .*? </field \s* >)#imsx', $pdf_data, $matches ) ; 10309 10310 foreach ( $matches [ 'field' ] as $field ) 10311 { 10312 $xml_field = simplexml_load_string ( $field ) ; 10313 10314 foreach ( $xml_field -> attributes ( ) as $attribute_name => $attribute_value ) 10315 { 10316 switch ( strtolower ( $attribute_name ) ) 10317 { 10318 case 'name' : 10319 $field_name = ( string ) $attribute_value ; 10320 10321 if ( isset ( $this -> PdfDefinitions [ $field_name ] ) ) 10322 $this -> PdfDefinitions [ $field_name ] [ 'occurrences' ] ++ ; 10323 else 10324 { 10325 $this -> PdfDefinitions [ $field_name ] = array 10326 ( 10327 'name' => $field_name, 10328 'occurrences' => 1 10329 ) ; 10330 } 10331 10332 break ; 10333 } 10334 } 10335 } 10336 } 10337 10338 10339 // __create_default_xml_data - 10340 // When no XML template has been specified, creates a default one based of the form definitions located in the PDF file. 10341 private function __create_default_xml_data ( $pdf_definitions ) 10342 { 10343 $result = "<forms>" . PHP_EOL . 10344 "\t<form version=\"1.0\">" . PHP_EOL ; 10345 10346 foreach ( $pdf_definitions as $name => $field ) 10347 { 10348 $name = str_replace ( '-', '_', $name ) ; // Just in case of 10349 $result .= "\t\t<field name=\"$name\" form-field=\"$name\" type=\"string\"/>" . PHP_EOL ; 10350 } 10351 10352 $result .= "\t</form>" . PHP_EOL . 10353 "</forms>" . PHP_EOL ; 10354 10355 return ( $result ) ; 10356 } 10357 10358 10359 /*-------------------------------------------------------------------------------------------------------------- 10360 10361 Interfaces implementations to retrieve form definitions. 10362 10363 *-------------------------------------------------------------------------------------------------------------*/ 10364 public function count ( ) 10365 { return ( count ( $this - Definitions ) ) ; } 10366 10367 10368 public function getIterator ( ) 10369 { return ( new ArrayIterator ( $this -> Definitions ) ) ; } 10370 10371 10372 public function offsetExists ( $offset ) 10373 { return ( $offset >= 0 && $offset < count ( $this -> Definitions ) ) ; } 10374 10375 10376 public function offsetGet ( $offset ) 10377 { return ( $this -> Definitions [ $offset ] ) ; } 10378 10379 10380 public function offsetSet ( $offset, $value ) 10381 { error ( new PdfToTextException ( "Unsupported operation." ) ) ; } 10382 10383 10384 public function offsetunset ( $offset ) 10385 { error ( new PdfToTextException ( "Unsupported operation." ) ) ; } 10386 } 10387 10388 10389/*============================================================================================================== 10390 10391 class PdfToTextFormDefinition - 10392 Holds the description of a form inside a form XML template. 10393 10394 ==============================================================================================================*/ 10395class PdfToTextFormDefinition // extends Object 10396 { 10397 // Class of the object returned by GetFormData( ) 10398 public $ClassName ; 10399 10400 // Form version 10401 public $Version ; 10402 10403 // Field definitions 10404 public $FieldDefinitions = array ( ) ; 10405 10406 // Field groups (ie, fields that are the results of the concatenation of several form fields) 10407 public $Groups = array ( ) ; 10408 10409 // Pdf field definitions 10410 public $PdfDefinitions ; 10411 10412 // Class definition in PHP, whose instance will be returned by GetFormData() 10413 private $ClassDefinition = false ; 10414 10415 // Direct access to field definitions either through their template name or PDF name 10416 private $FieldDefinitionsByName = array ( ) ; 10417 private $FieldDefinitionsByPdfName = array ( ) ; 10418 10419 10420 /*-------------------------------------------------------------------------------------------------------------- 10421 10422 Constructor - 10423 Analyze the contents of an XML template form definition. 10424 10425 *-------------------------------------------------------------------------------------------------------------*/ 10426 public function __construct ( $class_name, $form_definition, $pdf_definitions ) 10427 { 10428 $this -> ClassName = $class_name ; 10429 $this -> PdfDefinitions = $pdf_definitions ; 10430 $field_count = 0 ; 10431 10432 // Get <form> tag attributes 10433 foreach ( $form_definition -> attributes ( ) as $attribute_name => $attribute_value ) 10434 { 10435 switch ( strtolower ( $attribute_name ) ) 10436 { 10437 case 'version' : 10438 $this -> Version = ( string ) $attribute_value ; 10439 break ; 10440 10441 default : 10442 error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <form> tag." ) ) ; 10443 } 10444 } 10445 10446 // Loop through subtags 10447 foreach ( $form_definition -> children ( ) as $child ) 10448 { 10449 $tag_name = $child -> getName ( ) ; 10450 10451 // Check subtags 10452 switch ( strtolower ( $tag_name ) ) 10453 { 10454 // <group> : 10455 // A group is used to create a property that is the concatenation of several existing properties. 10456 case 'group' : 10457 $fields = array ( ) ; 10458 $separator = '' ; 10459 $name = false ; 10460 10461 // Loop through attribute names 10462 foreach ( $child -> attributes ( ) as $attribute_name => $attribute_value ) 10463 { 10464 switch ( $attribute_name ) 10465 { 10466 // "name" attribute" : 10467 // The name of the property, as it will appear in the output object. 10468 case 'name' : 10469 $name = PdfToTextObjectBase::ValidatePhpName ( ( string ) $attribute_value ) ; 10470 break ; 10471 10472 // "separator" attribute : 10473 // Separator to be used when concatenating the underlying properties. 10474 case 'separator' : 10475 $separator = ( string ) $attribute_value ; 10476 break ; 10477 10478 // "fields" : 10479 // A list of comma-separated field names, whose values will be concatenated together 10480 // using the specified separator. 10481 case 'fields' : 10482 $items = explode ( ',', ( string ) $attribute_value ) ; 10483 10484 if ( ! count ( $items ) ) 10485 error ( new PdfToTextFormException ( "Empty \"fields\" attribute in <group> tag." ) ) ; 10486 10487 foreach ( $items as $item ) 10488 $fields [] = PdfToTextObjectBase::ValidatePhpName ( $item ) ; 10489 10490 break ; 10491 10492 // Other attribute names : not allowed 10493 default : 10494 error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <group> tag." ) ) ; 10495 } 10496 } 10497 10498 // Check that at least one field has been specified 10499 if ( ! count ( $fields ) ) 10500 error ( new PdfToTextFormException ( "Empty \"fields\" attribute in <group> tag." ) ) ; 10501 10502 // Check that the mandatory property name has been specified 10503 if ( ! $name ) 10504 error ( new PdfToTextFormException ( "The \"name\" attribute is mandatory in <group> tag." ) ) ; 10505 10506 // Add this new grouped property to the list of existing groups 10507 $this -> Groups [] = array 10508 ( 10509 'name' => $name, 10510 'separator' => $separator, 10511 'fields' => $fields 10512 ) ; 10513 10514 break ; 10515 10516 // <field> : 10517 // Field definition. 10518 case 'field' : 10519 $field_def = new PdfToTextFormFieldDefinition ( $child ) ; 10520 $this -> FieldDefinitions [] = $field_def ; 10521 $this -> FieldDefinitionsByName [ $field_def -> Name ] = 10522 $this -> FieldDefinitionsByPdfName [ $field_def -> PdfName ] = $field_count ; 10523 $field_count ++ ; 10524 break ; 10525 10526 // Don't allow other attribute names 10527 default : 10528 error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <form> definition." ) ) ; 10529 } 10530 } 10531 10532 // Check that everything is ok (ie, that there is no duplicate fields) 10533 $this -> __paranoid_checks ( ) ; 10534 } 10535 10536 10537 /*-------------------------------------------------------------------------------------------------------------- 10538 10539 NAME 10540 GetClassDefinition - Returns the class definition for the urrent form. 10541 10542 PROTOTYPE 10543 $def = $form_def -> GetClassDefinition ( ) ; 10544 10545 DESCRIPTION 10546 Returns a string containing the PHP class definition that will contain the properties defined in the XML 10547 form template. 10548 10549 RETURN VALUE 10550 Returns a string containing the PHP class definition for the current form. 10551 10552 *-------------------------------------------------------------------------------------------------------------*/ 10553 public function GetClassDefinition ( ) 10554 { 10555 // Return the existing definition, if this method has been called more than once 10556 if ( $this -> ClassDefinition ) 10557 return ( $this -> ClassDefinition ) ; 10558 10559 $class_def = "// Class " . $this -> ClassName . " : " . $this -> Version . PHP_EOL . 10560 "class {$this -> ClassName}\t\textends PdfToTextFormData" . PHP_EOL . 10561 " {" . PHP_EOL ; 10562 10563 // Get the maximum width of constant and field names 10564 $max_width = 0 ; 10565 10566 foreach ( $this -> FieldDefinitions as $def ) 10567 { 10568 $length1 = strlen ( $def -> Name ) ; 10569 $length2 = strlen ( $def -> PdfName ) ; 10570 10571 if ( $length1 > $max_width || $length2 > $max_width ) 10572 $max_width = max ( $length1, $length2 ) ; 10573 10574 foreach ( $def -> Constants as $constant ) 10575 { 10576 $length = strlen ( $constant [ 'name' ] ) ; 10577 10578 if ( $length > $max_width ) 10579 $max_width = $length ; 10580 } 10581 } 10582 10583 // First, write out the constant definitions 10584 $all_constants = array ( ) ; 10585 10586 foreach ( $this -> FieldDefinitions as $def ) 10587 { 10588 foreach ( $def -> Constants as $constant ) 10589 { 10590 $name = $constant [ 'name' ] ; 10591 $value = $constant [ 'value' ] ; 10592 10593 if ( isset ( $all_constants [ $name ] ) ) 10594 { 10595 if ( $all_constants [ $name ] != $value ) 10596 error ( new PdfToTextFormException ( "Constant \"$name\" is defined more than once with different values." ) ) ; 10597 } 10598 else 10599 { 10600 $all_constants [ $name ] = $value ; 10601 10602 if ( ! is_numeric ( $value ) ) 10603 $value = '"' . addslashes ( $value ) . '"' ; 10604 10605 $class_def .= "\tconst\t" . str_pad ( $name, $max_width, " ", STR_PAD_RIGHT ) . "\t = $value ; " . PHP_EOL ; 10606 } 10607 } 10608 } 10609 10610 $class_def .= PHP_EOL . PHP_EOL ; 10611 10612 // Then write property definitions 10613 foreach ( $this -> FieldDefinitions as $def ) 10614 { 10615 $class_def .= "\t/** @formdata */" . PHP_EOL . 10616 "\tprotected\t\t\${$def -> Name} ;" . PHP_EOL ; 10617 } 10618 10619 $class_def .= PHP_EOL . PHP_EOL ; 10620 10621 // And finally, grouped properties 10622 foreach ( $this -> Groups as $group ) 10623 { 10624 $class_def .= "\t/**" . PHP_EOL . 10625 "\t\t@formdata" . PHP_EOL . 10626 "\t\t@group(" . implode ( ',', $group [ 'fields' ] ) . ')' . PHP_EOL . 10627 "\t\t@separator(" . str_replace ( ')', '\)', $group [ 'separator' ] ) . ')' . PHP_EOL . 10628 "\t */" . PHP_EOL . 10629 "\tprotected\t\t\${$group [ 'name' ]} ;" . PHP_EOL .PHP_EOL ; 10630 } 10631 10632 // Constructor 10633 $class_def .= PHP_EOL . PHP_EOL . 10634 "\t// Class constructor" . PHP_EOL . 10635 "\tpublic function __construct ( )" . PHP_EOL . 10636 "\t {" . PHP_EOL . 10637 "\t\tparent::__construct ( ) ;" . PHP_EOL . 10638 "\t }" . PHP_EOL ; 10639 10640 $class_def .= " }" . PHP_EOL ; 10641 10642 // Save the definition, if a second call occurs 10643 $this -> ClassDefinition = $class_def ; 10644 10645 // All done, return 10646 return ( $class_def ) ; 10647 } 10648 10649 10650 /*-------------------------------------------------------------------------------------------------------------- 10651 10652 NAME 10653 GetFormData - Returns a form data object containing properties mapped to the form data. 10654 10655 PROTOTYPE 10656 $object = $form_def -> GetFormData ( $fields ) ; 10657 10658 DESCRIPTION 10659 Returns an object containing properties mapped to actual form data. 10660 10661 PARAMETERS 10662 $fields (array) - 10663 An associative array whoses keys are the PDF form field names, and values their values as stored 10664 in the PDF file. 10665 10666 RETURN VALUE 10667 Returns an object of the class, as defined by the template specified to PdfToTextFormDefinitions 10668 class constructor. 10669 10670 *-------------------------------------------------------------------------------------------------------------*/ 10671 public function GetFormData ( $fields = array ( ) ) 10672 { 10673 if ( ! class_exists ( $this -> ClassName, false ) ) 10674 { 10675 $class_def = $this -> GetClassDefinition ( ) ; 10676 eval ( $class_def ) ; 10677 } 10678 10679 $class_name = $this -> ClassName ; 10680 $object = new $class_name ( ) ; 10681 10682 foreach ( $fields as $name => $value ) 10683 { 10684 if ( isset ( $this -> FieldDefinitionsByPdfName [ $name ] ) ) 10685 { 10686 $property = $this -> FieldDefinitions [ $this -> FieldDefinitionsByPdfName [ $name ] ] -> Name ; 10687 $object -> $property = $this -> __process_field_value ( $value ) ; 10688 } 10689 } 10690 10691 return ( $object ) ; 10692 } 10693 10694 10695 // __process_field_values - 10696 // Translates html entities and removes carriage returns (which are apparently used for multiline field) to 10697 // replace them with newlines. 10698 private function __process_field_value ( $value ) 10699 { 10700 $value = html_entity_decode ( $value ) ; 10701 $result = '' ; 10702 10703 for ( $i = 0, $length = strlen ( $value ) ; $i < $length ; $i ++ ) 10704 { 10705 if ( $value [$i] !== "\r" ) 10706 $result .= $value [$i] ; 10707 else 10708 { 10709 if ( isset ( $value [ $i + 1 ] ) ) 10710 { 10711 if ( $value [ $i + 1 ] !== "\n" ) 10712 $result .= "\n" ; 10713 } 10714 else 10715 $result .= "\n" ; 10716 } 10717 } 10718 10719 return ( $result ) ; 10720 } 10721 10722 10723 /*-------------------------------------------------------------------------------------------------------------- 10724 10725 NAME 10726 GetformDataFromPdfObject - Same as GetFormData(), except that it operates on XML data. 10727 10728 PROTOTYPE 10729 $object = $pdf -> GetFormDataFromPdfObject ( $pdf_data ) ; 10730 10731 DESCRIPTION 10732 Behaves the same as GetFormData(), except that it takes as input the XML contents of a PDF object. 10733 10734 PARAMETERS 10735 $pdf_data (string) - 10736 XML data coming from the PDF file. 10737 10738 RETURN VALUE 10739 Returns an object of the class, as defined by the template specified to PdfToTextFormDefinitions 10740 class constructor. 10741 10742 *-------------------------------------------------------------------------------------------------------------*/ 10743 protected function GetFormDataFromPdfObject ( $pdf_data ) 10744 { 10745 // simplexml_ functions do not like tags that contain a colon - replace them with a dash 10746 $pdf_data = preg_replace ( '/(<[^:]+?)(:)/', '$1-', $pdf_data ) ; 10747 10748 // Load the xml data 10749 $xml = simplexml_load_string ( $pdf_data ) ; 10750 10751 // Get the form field values 10752 $fields = array ( ) ; 10753 10754 $this -> __get_pdfform_data ( $fields, $xml ) ; 10755 10756 // Return the object 10757 return ( $this -> GetFormData ( $fields ) ) ; 10758 } 10759 10760 10761 // __getpdfform_data - 10762 // Retrieve the form field values from the specified PDF object, specified as XML 10763 private function __get_pdfform_data ( &$fields, $xml ) 10764 { 10765 $tag_name = $xml -> getName ( ) ; 10766 10767 if ( isset ( $this -> PdfDefinitions [ $tag_name ] ) ) 10768 $fields [ $tag_name ] = ( string ) $xml ; 10769 else 10770 { 10771 foreach ( $xml -> children ( ) as $child ) 10772 { 10773 $this -> __get_pdfform_data ( $fields, $child ) ; 10774 } 10775 } 10776 } 10777 10778 10779 // __paranoid_checks - 10780 // Checks for several kinds of inconsistencies in the supplied XML template. 10781 private function __paranoid_checks ( ) 10782 { 10783 // Check that field names, PDF field names and constant names are unique 10784 $names = array ( ) ; 10785 $pdf_names = array ( ) ; 10786 $constant_names = array ( ) ; 10787 10788 foreach ( $this -> FieldDefinitions as $def ) 10789 { 10790 if ( ! isset ( $this -> PdfDefinitions [ $def -> PdfName ] ) ) 10791 error ( new PdfToTextFormException ( "Field \"{$def -> PdfName}\" is not defined in the PDF file." ) ) ; 10792 10793 if ( isset ( $names [ $def -> Name ] ) ) 10794 error ( new PdfToTextFormException ( "Field \"{$def -> Name}\" is defined more than once." ) ) ; 10795 10796 $names [ $def -> Name ] = true ; 10797 10798 if ( isset ( $pdf_names [ $def -> PdfName ] ) ) 10799 error ( new PdfToTextFormException ( "PDF Field \"{$def -> PdfName}\" is referenced more than once." ) ) ; 10800 10801 $pdf_names [ $def -> PdfName ] = true ; 10802 10803 foreach ( $def -> Constants as $constant ) 10804 { 10805 $constant_name = $constant [ 'name' ] ; 10806 10807 if ( isset ( $constant_names [ $constant_name ] ) && $constant_names [ $constant_name ] != $constant [ 'value' ] ) 10808 error ( new PdfToTextFormException ( "Constant \"$constant_name\" is defined more than once with different values." ) ) ; 10809 10810 $constant_names [ $constant_name ] = $constant [ 'value' ] ; 10811 } 10812 } 10813 10814 // Check that group names are unique and that the fields they are referencing exist 10815 $group_names = array ( ) ; 10816 10817 foreach ( $this -> Groups as $group ) 10818 { 10819 if ( isset ( $group_names [ $group [ 'name' ] ] ) ) 10820 error ( new PdfToTextFormException ( "Group \"{$group [ 'name' ]}\" is defined more than once." ) ) ; 10821 10822 if ( isset ( $names [ $group [ 'name' ] ] ) ) 10823 error ( new PdfToTextFormException ( "Group \"{$group [ 'name' ]}\" has the same name as an existing field." ) ) ; 10824 10825 foreach ( $group [ 'fields' ] as $field_name ) 10826 { 10827 if ( ! isset ( $names [ $field_name ] ) ) 10828 error ( new PdfToTextFormException ( "Field \"$field_name\" of group \"{$group [ 'name' ]}\" does not exist." ) ) ; 10829 } 10830 } 10831 } 10832 } 10833 10834 10835/*============================================================================================================== 10836 10837 class PdfToTextFormFieldDefinition - 10838 Contains an XML template form field definition. 10839 10840 ==============================================================================================================*/ 10841class PdfToTextFormFieldDefinition // extends Object 10842 { 10843 // Supported field types 10844 const TYPE_STRING = 1 ; // String 10845 const TYPE_CHOICE = 2 ; // Choice (must have <constant> subtags) 10846 10847 // Official name (as it will appear in the class based on the XML template) 10848 public $Name = false ; 10849 // Field name, as specified in the input PDF file 10850 public $PdfName = false ; 10851 // Field type 10852 public $Type = self::TYPE_STRING ; 10853 // Available constant values for this field when the "type" attribute has the value "choice" 10854 public $Constants = array ( ) ; 10855 10856 10857 /*-------------------------------------------------------------------------------------------------------------- 10858 10859 Constructor - 10860 Builds the field definition object. 10861 10862 *-------------------------------------------------------------------------------------------------------------*/ 10863 public function __construct ( $field_node ) 10864 { 10865 // Loop through attributes 10866 foreach ( $field_node -> attributes ( ) as $attribute_name => $attribute_value ) 10867 { 10868 switch ( strtolower ( $attribute_name ) ) 10869 { 10870 // "name" attribute : 10871 // Specifies the field name as it will appear in the output class. Must be a valid PHP name. 10872 case 'name' : 10873 $this -> Name = PdfToTextFormDefinition::ValidatePhpName ( ( string ) $attribute_value ) ; 10874 break ; 10875 10876 // "form-field" attribute : 10877 // Corresponding field name in the input PDF form. 10878 case 'form-field' : 10879 $this -> PdfName = ( string ) $attribute_value ; 10880 break ; 10881 10882 // "type" : 10883 // Field type. Can be either : 10884 // - "string" : 10885 // The field value can be any type of string. 10886 // - "choice" : 10887 // The field value has one of the values defined by the <case> or <default> subtags. 10888 case 'type' : 10889 switch ( strtolower ( ( string ) $attribute_value ) ) 10890 { 10891 case 'string' : 10892 $this -> Type = self::TYPE_STRING ; 10893 break ; 10894 10895 case 'choice' : 10896 $this -> Type = self::TYPE_CHOICE ; 10897 break ; 10898 10899 default : 10900 error ( new PdfToTextFormException ( "Invalid value \"$attribute_value\" for the \"$attribute_name\" attribute of the <field> tag." ) ) ; 10901 } 10902 } 10903 } 10904 10905 // The "name" and "form-field" attributes are mandatory 10906 if ( ! $this -> Name ) 10907 error ( new PdfToTextFormException ( "The \"name\" attribute is mandatory for the <field> tag." ) ) ; 10908 10909 if ( ! $this -> PdfName ) 10910 error ( new PdfToTextFormException ( "The \"form-field\" attribute is mandatory for the <field> tag." ) ) ; 10911 10912 // For "type=choice" entries, we have to look for <case> or <default> subtags 10913 if ( $this -> Type === self::TYPE_CHOICE ) 10914 { 10915 foreach ( $field_node -> children ( ) as $child ) 10916 { 10917 $tag_name = $child -> getName ( ) ; 10918 $lcname = strtolower ( $tag_name ) ; 10919 $is_default = false ; 10920 10921 switch ( $lcname ) 10922 { 10923 // Default value to be used when no PDF field value matches the defined constants 10924 case 'default' : 10925 $is_default = true ; 10926 10927 // "case" attribute : 10928 // Maps a value to constant name that will be defined in the generated class. 10929 case 'case' : 10930 $constant_value = "" ; 10931 $constant_name = false ; 10932 10933 // Retrieve attributes 10934 foreach ( $child -> attributes ( ) as $attribute_name => $attribute_value ) 10935 { 10936 switch ( strtolower ( $attribute_name ) ) 10937 { 10938 // "value" attribute : 10939 // PDF form field value. 10940 case 'value' : 10941 $constant_value = ( string ) $attribute_value ; 10942 break ; 10943 10944 // "constant" attribute : 10945 // Associated constant. 10946 case 'constant' : 10947 $constant_name = PdfToTextFormDefinition::ValidatePhpName ( ( string ) $attribute_value ) ; 10948 break ; 10949 10950 // Bail out if any unrecognized attribute has been specified 10951 default : 10952 error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <field> definition." ) ) ; 10953 } 10954 } 10955 10956 // Each <case> entry must have a "constant" attribute 10957 if ( $constant_value === false && ! $is_default ) 10958 error ( new PdfToTextFormException ( "Missing constant value in <case> tag." ) ) ; 10959 10960 if ( $constant_name === false ) 10961 error ( new PdfToTextFormException ( "Attribute \"constant-name\" is required for <$tag_name> tag." ) ) ; 10962 10963 // Add this to the list of existing constants 10964 $this -> Constants [] = array 10965 ( 10966 'name' => $constant_name, 10967 'value' => $constant_value, 10968 'default' => $is_default 10969 ) ; 10970 10971 break ; 10972 10973 // Check for unrecognized tags 10974 default : 10975 error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <field> definition." ) ) ; 10976 } 10977 } 10978 } 10979 } 10980 } 10981 10982 10983/*============================================================================================================== 10984 10985 class PdfToTextFormData - 10986 Base class for all Pdf form templates data. 10987 10988 ==============================================================================================================*/ 10989class PdfToTextFormData // extends Object 10990 { 10991 // Doc comments provide information about form data fields (mainly to handle grouped field values) 10992 // The $__Properties array gives information about the form data fields themselves 10993 private $__Properties = array ( ) ; 10994 10995 10996 /*-------------------------------------------------------------------------------------------------------------- 10997 10998 Constructor - 10999 Retrieve information about the derived class properties, which are specified by the derived class 11000 generated on the fly. 11001 11002 *-------------------------------------------------------------------------------------------------------------*/ 11003 public function __construct ( ) 11004 { 11005 // Get class properties 11006 $reflection = new ReflectionClass ( $this ) ; 11007 $properties = $reflection -> getProperties ( ) ; 11008 11009 // Loop through class properties 11010 foreach ( $properties as $property ) 11011 { 11012 $propname = $property -> getName ( ) ; 11013 $doc_comment = $property -> getDocComment ( ) ; 11014 11015 $fields = false ; 11016 $separator = false ; 11017 11018 // A doc comment may indicate either : 11019 // - A form data field (@formdata) 11020 // - A grouped field ; in this case, we will have the following tags : 11021 // . @formdata 11022 // . @group(field_list) : list of fields grouped for this property 11023 // . @separator(string) : a separator used when catenating grouped fields 11024 if ( $doc_comment ) 11025 { 11026 // The @formdata tag must be present 11027 if ( strpos ( $doc_comment, '@formdata' ) === false ) 11028 continue ; 11029 11030 // @group(fields) pattern 11031 if ( preg_match ( '/group \s* \( \s* (?P<fields> [^)]+) \)/imsx', $doc_comment, $match ) ) 11032 { 11033 $items = explode ( ',', $match [ 'fields' ] ) ; 11034 $fields = array ( ) ; 11035 11036 foreach ( $items as $item ) 11037 $fields [] = $item ; 11038 } 11039 11040 // @separator(string) pattern 11041 if ( preg_match ( '/separator \s* \( \s* (?P<separator> ( (\\\)) | (.) )+ \) /imsx', $doc_comment, $match ) ) 11042 { 11043 $separator = stripslashes ( $match [ 'separator' ]) ; 11044 } 11045 } 11046 // Ignore non-formdata properties 11047 else 11048 continue ; 11049 11050 // Property belongs to the form - add it to the list of available properties 11051 $this -> __Properties [ $propname ] = array 11052 ( 11053 'name' => $propname, 11054 'fields' => $fields, 11055 'separator' => $separator 11056 ) ; 11057 } 11058 } 11059 11060 11061 /*-------------------------------------------------------------------------------------------------------------- 11062 11063 __get - 11064 Returns the underlying property value for this PDF data field. 11065 *-------------------------------------------------------------------------------------------------------------*/ 11066 public function __get ( $member ) 11067 { 11068 if ( ! isset ( $this -> __Properties [ $member ] ) ) 11069 warning ( new PdfToTextFormException ( "Undefined property \"$member\"." ) ) ; 11070 11071 return ( $this -> $member ) ; 11072 } 11073 11074 11075 /*-------------------------------------------------------------------------------------------------------------- 11076 11077 __set - 11078 Sets the underlying property value for this PDF data field. 11079 When the property is a compound one, sets individual members as well. 11080 11081 *-------------------------------------------------------------------------------------------------------------*/ 11082 public function __set ( $member, $value ) 11083 { 11084 // Property exists : some special processing will be needed 11085 if ( isset ( $this -> __Properties [ $member ] ) ) 11086 { 11087 $prop_entry = $this -> __Properties [ $member ] ; 11088 11089 // Non-compound property 11090 if ( ! $prop_entry [ 'fields' ] ) 11091 { 11092 $this -> $member = $value ; 11093 11094 // However, we have to check that this property belongs to a compound property and change 11095 // the compound property valu accordingly 11096 foreach ( $this -> __Properties as $name => $property ) 11097 { 11098 if ( $property [ 'fields' ] ) 11099 { 11100 if ( in_array ( $member, $property [ 'fields' ] ) ) 11101 { 11102 $values = array ( ) ; 11103 11104 foreach ( $property [ 'fields' ] as $value ) 11105 $values [] = $this -> $value ; 11106 11107 // Change compound property value accordingly, using the specified separator 11108 $this -> $name = implode ( $property [ 'separator' ], $values ) ; 11109 } 11110 } 11111 } 11112 } 11113 // Compound property : we will have to explode it in separate parts, using the compound property separator, 11114 // then set individual property values 11115 else 11116 { 11117 $values = explode ( $prop_entry [ 'separator' ], $value ) ; 11118 $value_count = count ( $values ) ; 11119 $field_count = count ( $prop_entry [ 'fields' ] ) ; 11120 11121 if ( $value_count < $field_count ) 11122 error ( new PdfToTextFormException ( "Not enough value parts specified for the \"$member\" property ($value)." ) ) ; 11123 else if ( $value_count > $field_count ) 11124 error ( new PdfToTextFormException ( "Too much value parts specified for the \"$member\" property ($value)." ) ) ; 11125 11126 $this -> $member = $value ; 11127 11128 for ( $i = 0 ; $i < $value_count ; $i ++ ) 11129 { 11130 $sub_member = $prop_entry [ 'fields' ] [$i] ; 11131 $this -> $sub_member = $values [$i] ; 11132 } 11133 } 11134 } 11135 // Property does not exist : let PHP act as the default way 11136 else 11137 $this -> $member = $value ; 11138 } 11139 } 11140 11141 11142/************************************************************************************************************** 11143 ************************************************************************************************************** 11144 ************************************************************************************************************** 11145 ****** ****** 11146 ****** ****** 11147 ****** CAPTURE DEFINITION MANAGEMENT ****** 11148 ****** (none of the classes listed here are meant to be instantiated outside this file) ****** 11149 ****** ****** 11150 ****** ****** 11151 ************************************************************************************************************** 11152 ************************************************************************************************************** 11153 **************************************************************************************************************/ 11154 11155/*============================================================================================================== 11156 11157 class PdfToTextCaptureDefinitions - 11158 Holds text capture definitions, whose XML data has been supplied to the PdfToText::SetCapture() method. 11159 11160 ==============================================================================================================*/ 11161class PdfToTextCaptureDefinitions // extends Object 11162 implements ArrayAccess, Countable, Iterator 11163 { 11164 // Shape definitions - The actual objects populating this array depend on the definitions supplied 11165 // (rectangle, etc.) 11166 protected $ShapeDefinitions = array ( ) ; 11167 11168 // Shape field names - used for iteration 11169 private $ShapeNames ; 11170 11171 // Page count 11172 private $PageCount = false ; 11173 11174 11175 /*-------------------------------------------------------------------------------------------------------------- 11176 11177 CONSTRUCTOR - 11178 Analyzes the XML data defining the areas to be captured. 11179 11180 *-------------------------------------------------------------------------------------------------------------*/ 11181 public function __construct ( $xml_data ) 11182 { 11183 $xml = simplexml_load_string ( $xml_data ) ; 11184 $root_entry = $xml -> getName ( ) ; 11185 11186 // Root tag must be <captures> 11187 if ( strcasecmp ( $root_entry, "captures" ) ) 11188 error ( new PdfToTextCaptureException ( "Root entry must be <captures>, <$root_entry> was found." ) ) ; 11189 11190 // Process the child nodes 11191 foreach ( $xml -> children ( ) as $child ) 11192 { 11193 $tag_name = $child -> getName ( ) ; 11194 11195 switch ( strtolower ( $tag_name ) ) 11196 { 11197 // <rectangle> : 11198 // An rectangle whose dimensions are given in the <page> subtags. 11199 case 'rectangle' : 11200 $shape_object = new PdfToTextCaptureRectangleDefinition ( $child ) ; 11201 break ; 11202 11203 // <columns> : 11204 // A definition of columns and their applicable pages. 11205 case 'lines' : 11206 $shape_object = new PdfToTextCaptureLinesDefinition ( $child ) ; 11207 break ; 11208 11209 // Complain if an unknown tag is found 11210 default : 11211 error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <captures>." ) ) ; 11212 } 11213 11214 // Shape names must be unique within the definitinos 11215 if ( isset ( $this -> ShapeDefinitions [ $shape_object -> Name ] ) ) 11216 error ( new PdfToTextCaptureLinesDefinition ( "The shape named \"{$shape_object -> Name}\" has been defined more than once." ) ) ; 11217 else 11218 $this -> ShapeDefinitions [ $shape_object -> Name ] = $shape_object ; 11219 } 11220 11221 // Build an array of shape names for the iterator interface 11222 $this -> ShapeNames = array_keys ( $this -> ShapeDefinitions ) ; 11223 } 11224 11225 11226 /*-------------------------------------------------------------------------------------------------------------- 11227 11228 NAME 11229 GetCapturedObject - Creates an object reflecting the captured data. 11230 11231 PROTOTYPE 11232 $captures = $capture_definitions -> GetCapturedObject ( $document_fragments ) ; 11233 11234 DESCRIPTION 11235 Returns an object of type PdfToTextCapturedData,containing the data that has been captured, based on 11236 the capture definitions. 11237 11238 PARAMETERS 11239 $document_fragments (type) - 11240 Document text fragments collected during the text layout rendering process. 11241 11242 RETURN VALUE 11243 An object of type PdfToTextCaptures, cntaining the captured data. 11244 11245 *-------------------------------------------------------------------------------------------------------------*/ 11246 public function GetCapturedObject ( $document_fragments ) 11247 { 11248 $captures = array ( ) ; 11249 11250 foreach ( $this -> ShapeDefinitions as $shape ) 11251 { 11252 $capture = $shape -> ExtractAreas ( $document_fragments ) ; 11253 11254 foreach ( $capture as $page => $items ) 11255 { 11256 $captures [ $page ] [] = $items ; 11257 } 11258 } 11259 11260 $captured_object = new PdfToTextCaptures ( $captures ) ; 11261 11262 return ( $captured_object ) ; 11263 } 11264 11265 11266 /*-------------------------------------------------------------------------------------------------------------- 11267 11268 NAME 11269 SetPageCount - Defines the total number of pages in the document. 11270 11271 PROTOTYPE 11272 $shape -> SetPageCount ( $count ) ; 11273 11274 DESCRIPTION 11275 At the time when XML definitions are processed, the total number of pages in the document is not yet 11276 known. Moreover, page ranges or page numbers can be expressed relative to the last page of the 11277 document (for example : 1..$-1, which means "from the first page to the last page - 1). 11278 Setting the page count once it is known allows to process the expressions specified in the "number" 11279 attribute of the <pages> tag so that the expressions are transformed into actual page numbers. 11280 11281 PARAMETERS 11282 $count (integer) - 11283 Number of pages in the document. 11284 11285 *-------------------------------------------------------------------------------------------------------------*/ 11286 public function SetPageCount ( $count ) 11287 { 11288 $this -> PageCount = $count ; 11289 11290 foreach ( $this -> ShapeDefinitions as $def ) 11291 { 11292 $def -> SetPageCount ( $count ) ; 11293 } 11294 } 11295 11296 11297 /*-------------------------------------------------------------------------------------------------------------- 11298 11299 NAME 11300 GetNodeAttributes - Retrieves an XML node's attributes. 11301 11302 PROTOTYPE 11303 $result = PdfToTextCaptureDefinitions::GetNodeAttributes ( $node, $attributes ) ; 11304 11305 DESCRIPTION 11306 Retrieves the attributes defined for the specified XML node. 11307 11308 PARAMETERS 11309 $node (SimpleXMLElement) - 11310 Node whose attributes are to be extracted. 11311 11312 $attributes (associative array) - 11313 Associative array whose keys are the attribute names and whose values define a boolean 11314 indicating whether the attribute is mandatory or not. 11315 11316 RETURN VALUE 11317 Returns an associative whose key are the attribute names and whose values are the attribute values, 11318 specified as a string. 11319 For optional unspecified attributes, the value will be boolean false. 11320 11321 NOTES 11322 The method throws an exception if the node contains an unknown attribute, or if a mandatory attribute 11323 is missing. 11324 11325 *-------------------------------------------------------------------------------------------------------------*/ 11326 public static function GetNodeAttributes ( $node, $attributes ) 11327 { 11328 $tag_name = $node -> getName ( ) ; 11329 11330 // Build the initial value for the resulting array 11331 $result = array ( ) ; 11332 11333 foreach ( array_keys ( $attributes ) as $name ) 11334 $result [ $name ] = false ; 11335 11336 // Loop through node attributes 11337 foreach ( $node -> attributes ( ) as $attribute_name => $attribute_value ) 11338 { 11339 $attribute_name = strtolower ( $attribute_name ) ; 11340 11341 // Check that the attributes exists ; if yes, add it to the resulting array 11342 if ( isset ( $attributes [ $attribute_name ] ) ) 11343 $result [ $attribute_name ] = ( string ) $attribute_value ; 11344 // Otherwise, throw an exception 11345 else 11346 error ( new PdfToTextCaptureLinesDefinition ( "Undefined attribute \"$attribute_name\" for node <$tag_name>." ) ) ; 11347 } 11348 11349 // Check that all mandatory attributes have been specified 11350 foreach ( $attributes as $attribute_name => $mandatory ) 11351 { 11352 if ( $mandatory && $result [ $attribute_name ] === false ) 11353 error ( new PdfToTextCaptureLinesDefinition ( "Undefined attribute \"$attribute_name\" for node <$tag_name>." ) ) ; 11354 } 11355 11356 // All done, return 11357 return ( $result ) ; 11358 } 11359 11360 11361 /*-------------------------------------------------------------------------------------------------------------- 11362 11363 NAME 11364 GetBooleanAttribute - Returns a boolean value associated to a string. 11365 11366 PROTOTYPE 11367 $bool = PdfToTextCaptureDefinitions::GetBooleanValue ( $value ) ; 11368 11369 DESCRIPTION 11370 Returns a boolean value corresponding to a boolean specified as a string. 11371 11372 PARAMETERS 11373 $value (string) - 11374 A boolean value represented as a string. 11375 The strings 'true', 'yes', 'on' and '1' will be interpreted as boolean true. 11376 The strings 'false', 'no', 'off' and '0' will be interpreted as boolean false. 11377 11378 RETURN VALUE 11379 The boolean value corresponding to the specified string. 11380 11381 NOTES 11382 An exception is thrown if the supplied string is incorrect. 11383 11384 *-------------------------------------------------------------------------------------------------------------*/ 11385 public static function GetBooleanAttribute ( $value ) 11386 { 11387 $lcvalue = strtolower ( $value ) ; 11388 11389 if ( $lcvalue === 'true' || $lcvalue === 'on' || $lcvalue === 'yes' || $lcvalue === '1' || $value === true ) 11390 return ( true ) ; 11391 else if ( $lcvalue === 'false' || $lcvalue === 'off' || $lcvalue === 'no' || $lcvalue === '0' || $value === false ) 11392 return( false ) ; 11393 else 11394 error ( new PdfToTextCaptureLinesDefinition ( "Invalid boolean value \"$value\"." ) ) ; 11395 } 11396 11397 11398 /*-------------------------------------------------------------------------------------------------------------- 11399 11400 Interfaces implementations. 11401 11402 *-------------------------------------------------------------------------------------------------------------*/ 11403 11404 // Countable interface 11405 public function count ( ) 11406 { return ( count ( $this -> ShapeDefinitions ) ) ; } 11407 11408 11409 // ArrayAccess interface 11410 public function offsetExists ( $offset ) 11411 { return ( isset ( $this -> ShapeDefinitions [ $offset ] ) ) ; } 11412 11413 11414 public function offsetGet ( $offset ) 11415 { return ( $this -> ShapeDefinitions [ $offset ] ) ; } 11416 11417 11418 public function offsetSet ( $offset, $value ) 11419 { error ( new PdfToTextException ( "Unsupported operation" ) ) ; } 11420 11421 11422 public function offsetunset ( $offset ) 11423 { error ( new PdfToTextException ( "Unsupported operation" ) ) ; } 11424 11425 11426 // Iterator interface - 11427 // Iteration is made through shape names, which are supplied by the $ShapeNames property 11428 private $__iterator_index = 0 ; 11429 11430 public function rewind ( ) 11431 { $this -> __iterator_index = 0 ; } 11432 11433 public function valid ( ) 11434 { return ( $this -> __iterator_index >= 0 && $this -> __iterator_index < count ( $this -> ShapeNames ) ) ; } 11435 11436 public function key ( ) 11437 { return ( $this -> ShapeNames [ $this -> __iterator_index ] ) ; } 11438 11439 public function next ( ) 11440 { $this -> __iterator_index ++ ; } 11441 11442 public function current ( ) 11443 { return ( $this -> ShapeDefinitions [ $this -> ShapeNames [ $this -> __iterator_index ] ] ) ; } 11444 } 11445 11446 11447/*============================================================================================================== 11448 11449 class PdfToTextCaptureShapeDefinition - 11450 Base class for capturing shapes. 11451 11452 ==============================================================================================================*/ 11453abstract class PdfToTextCaptureShapeDefinition //extends Object 11454 { 11455 const SHAPE_RECTANGLE = 1 ; 11456 const SHAPE_COLUMN = 2 ; 11457 const SHAPE_LINE = 3 ; 11458 11459 // Capture name 11460 public $Name ; 11461 // Capture type - one of the SHAPE_* constants, assigned by derived classes. 11462 public $Type ; 11463 // Applicable pages for this capture 11464 public $ApplicablePages ; 11465 // Areas per page for this shape 11466 public $Areas = array ( ) ; 11467 // Separator used when multiple elements are covered by the same shape 11468 public $Separator = " " ; 11469 11470 11471 /*-------------------------------------------------------------------------------------------------------------- 11472 11473 Constructor - 11474 Initializes the base capture class. 11475 11476 *-------------------------------------------------------------------------------------------------------------*/ 11477 public function __construct ( $type ) 11478 { 11479 $this -> Type = $type ; 11480 $this -> ApplicablePages = new PdfToTextCaptureApplicablePages ( ) ; 11481 } 11482 11483 11484 /*-------------------------------------------------------------------------------------------------------------- 11485 11486 SetPageCount - 11487 Sets the page count, so that all the applicable pages can be determined. 11488 Derived classes can implement this function if some additional work is needed. 11489 11490 *-------------------------------------------------------------------------------------------------------------*/ 11491 public function SetPageCount ( $count ) 11492 { 11493 $this -> ApplicablePages -> SetPageCount ( $count ) ; 11494 } 11495 11496 11497 /*-------------------------------------------------------------------------------------------------------------- 11498 11499 GetFragmentData - 11500 Extracts data from a text fragment (text + coordinates). 11501 11502 *-------------------------------------------------------------------------------------------------------------*/ 11503 protected function GetFragmentData ( $fragment, &$text, &$left, &$top, &$right, &$bottom ) 11504 { 11505 $left = ( double ) $fragment [ 'x' ] ; 11506 $top = ( double ) $fragment [ 'y' ] ; 11507 $right = $left + ( double ) $fragment [ 'width' ] - 1 ; 11508 $bottom = $top - ( double ) $fragment [ 'font-height' ] ; 11509 $text = $fragment [ 'text' ] ; 11510 } 11511 11512 11513 /*-------------------------------------------------------------------------------------------------------------- 11514 11515 GetAttributes - 11516 Retrieves the attributes of the given XML node. Processes the following attributes, which are common to 11517 all shapes : 11518 - Name 11519 - Separator 11520 11521 *-------------------------------------------------------------------------------------------------------------*/ 11522 protected function GetAttributes ( $node, $attributes = array ( ) ) 11523 { 11524 $attributes = array_merge ( $attributes, array ( 'name' => true, 'separator' => false ) ) ; 11525 $shape_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes ( $node, $attributes ) ; 11526 $this -> Name = $shape_attributes [ 'name' ] ; 11527 11528 if ( $shape_attributes [ 'separator' ] !== false ) 11529 $this -> Separator = PdfToText::Unescape ( $shape_attributes [ 'separator' ] ) ; 11530 11531 return ( $shape_attributes ) ; 11532 } 11533 11534 11535 /*-------------------------------------------------------------------------------------------------------------- 11536 11537 ExtractAreas - 11538 Extracts text contents from the document fragments. 11539 11540 *-------------------------------------------------------------------------------------------------------------*/ 11541 public abstract function ExtractAreas ( $document_fragments ) ; 11542 } 11543 11544 11545/*============================================================================================================== 11546 11547 class PdfToTextCaptureRectangleDefinition - 11548 A shape for capturing text in rectangle areas. 11549 11550 ==============================================================================================================*/ 11551class PdfToTextCaptureRectangleDefinition extends PdfToTextCaptureShapeDefinition 11552 { 11553 /*-------------------------------------------------------------------------------------------------------------- 11554 11555 CONSTRUCTOR - 11556 Analyzes the contents of a <rectangle> XML node, which contains <page> child node giving the 11557 applicable pages and the rectangle dimensions. 11558 11559 *-------------------------------------------------------------------------------------------------------------*/ 11560 public function __construct ( $node ) 11561 { 11562 parent::__construct ( self::SHAPE_RECTANGLE ) ; 11563 11564 $this -> GetAttributes ( $node ) ; 11565 11566 // Loop through node's children 11567 foreach ( $node -> children ( ) as $child ) 11568 { 11569 $tag_name = $child -> getName ( ) ; 11570 11571 switch ( strtolower ( $tag_name ) ) 11572 { 11573 // <page> tag : applicable page(s) 11574 case 'page' : 11575 // Retrieve the specified attributes 11576 $page_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes 11577 ( 11578 $child, 11579 array 11580 ( 11581 'number' => true, 11582 'left' => true, 11583 'right' => false, 11584 'top' => true, 11585 'bottom' => false, 11586 'width' => false, 11587 'height' => false 11588 ) 11589 ) ; 11590 11591 $page_number = $page_attributes [ 'number' ] ; 11592 11593 // Add this page to the list of applicable pages for this shape 11594 $this -> ApplicablePages -> Add ( $page_number, $page_attributes ) ; 11595 11596 break ; 11597 11598 // Other tag : throw an exception 11599 default : 11600 error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <rectangle>." ) ) ; 11601 } 11602 } 11603 } 11604 11605 11606 /*-------------------------------------------------------------------------------------------------------------- 11607 11608 ExtractAreas - 11609 Extracts text contents from the document fragments. 11610 11611 *-------------------------------------------------------------------------------------------------------------*/ 11612 public function ExtractAreas ( $document_fragments ) 11613 { 11614 $result = array ( ) ; 11615 11616 // Loop through document fragments 11617 foreach ( $document_fragments as $page => $page_contents ) 11618 { 11619 $fragments = $page_contents [ 'fragments' ] ; 11620 11621 // Ignore pages that are not applicable 11622 if ( ! isset ( $this -> ApplicablePages -> PageMap [ $page ] ) ) 11623 continue ; 11624 11625 // Loop through each text fragment of the page 11626 foreach ( $fragments as $fragment ) 11627 { 11628 $this -> GetFragmentData ( $fragment, $text, $left, $top, $right, $bottom ) ; 11629 11630 // Only handle text fragments that are within the specified area 11631 if ( $this -> Areas [ $page ] -> Contains ( $left, $top, $right, $bottom ) ) 11632 { 11633 // Normally, rectangle shapes are used to capture a single line... 11634 if ( ! isset ( $result [ $page ] ) ) 11635 $result [ $page ] = new PdfToTextCapturedRectangle ( $page, $this -> Name, $text, $left, $top, $right, $bottom, $this ) ; 11636 // ... but you can also use them to capture multiple lines ; in this case, the "separator" attribute of the <rectangle> tag will 11637 // be used to separate items 11638 else 11639 { 11640 $existing_area = $result [ $page ] ; 11641 11642 $existing_area -> Top = max ( $existing_area -> Top , $top ) ; 11643 $existing_area -> Bottom = min ( $existing_area -> Bottom, $bottom ) ; 11644 $existing_area -> Left = min ( $existing_area -> Left , $left ) ; 11645 $existing_area -> Right = max ( $existing_area -> Right , $right ) ; 11646 $existing_area -> Text .= $this -> Separator . $text ; 11647 } 11648 } 11649 } 11650 } 11651 11652 11653 // Provide empty values for pages which did not capture a rectangle shape 11654 $added_missing_pages = false ; 11655 11656 foreach ( $this -> ApplicablePages as $page => $applicable ) 11657 { 11658 if ( ! isset ( $result [ $page ] ) ) 11659 { 11660 $result [ $page ] = new PdfToTextCapturedRectangle ( $page, $this -> Name, '', 0, 0, 0, 0, $this ) ; 11661 $added_missing_pages = true ; 11662 } 11663 } 11664 11665 if ( $added_missing_pages ) // Sort by page number if empty values were added 11666 ksort ( $result ) ; 11667 11668 // All done, return 11669 return ( $result ) ; 11670 } 11671 11672 11673 /*-------------------------------------------------------------------------------------------------------------- 11674 11675 SetPageCount - 11676 Ensures that an Area is created for each related page. 11677 11678 *-------------------------------------------------------------------------------------------------------------*/ 11679 public function SetPageCount ( $count ) 11680 { 11681 parent::SetPageCount ( $count ) ; 11682 11683 // Create a rectangle area for each page concerned - this can only be done when the number of pages is known 11684 // (and the ApplicablePages object updated accordingly) 11685 foreach ( $this -> ApplicablePages -> ExtraPageMapData as $page => $data ) 11686 $this -> Areas [ $page ] = new PdfToTextCaptureArea ( $data ) ; 11687 } 11688 } 11689 11690 11691/*============================================================================================================== 11692 11693 class PdfToTextCaptureLinesDefinition - 11694 A shape for capturing text in rectangle areas. 11695 11696 ==============================================================================================================*/ 11697class PdfToTextCaptureLinesDefinition extends PdfToTextCaptureShapeDefinition 11698 { 11699 // Column areas 11700 public $Columns = array ( ) ; 11701 // Top and bottom lines 11702 public $Tops = array ( ) ; 11703 public $Bottoms = array ( ) ; 11704 // Column names 11705 private $ColumnNames = array ( ) ; 11706 11707 11708 /*-------------------------------------------------------------------------------------------------------------- 11709 11710 CONSTRUCTOR - 11711 Analyzes the contents of a <columns> XML node, which contains <page> nodes giving a part of the column 11712 dimensions, and <column> nodes which specify the name of the column and the remaining coordinates, 11713 such as "left" or "width" 11714 11715 *-------------------------------------------------------------------------------------------------------------*/ 11716 public function __construct ( $node ) 11717 { 11718 parent::__construct ( self::SHAPE_COLUMN ) ; 11719 11720 $shape_attributes = $this -> GetAttributes ( $node, array ( 'default' => false ) ) ; 11721 $column_default = ( $shape_attributes [ 'default' ] ) ? $shape_attributes [ 'default' ] : '' ; 11722 11723 // Loop through node's children 11724 foreach ( $node -> children ( ) as $child ) 11725 { 11726 $tag_name = $child -> getName ( ) ; 11727 11728 switch ( strtolower ( $tag_name ) ) 11729 { 11730 // <page> tag 11731 case 'page' : 11732 // Retrieve the specified attributes 11733 $page_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes 11734 ( 11735 $child, 11736 array 11737 ( 11738 'number' => true, 11739 'top' => true, 11740 'height' => true, 11741 'bottom' => false 11742 ) 11743 ) ; 11744 11745 // We have to store the y-coordinate of the first and last lines, to determine until which 11746 // position we have to check for column contents. 11747 // The "top" and "bottom" attributes of the <page> tag actually determine the top and bottom 11748 // y-coordinates where to search for columns. However, we will have to rename the "bottom" 11749 // attribute to "column-bottom", in order for it not to be mistaken with actual column rectangle 11750 // (only the "height" attribute of the <page> tag gives the height of a line) 11751 $page_attributes [ 'column-top' ] = $page_attributes [ 'top' ] ; 11752 $page_attributes [ 'column-bottom' ] = ( double ) $page_attributes [ 'bottom' ] ; 11753 unset ( $page_attributes [ 'bottom' ] ) ; 11754 11755 // Add this page to the list of applicable pages for this shape 11756 $this -> ApplicablePages -> Add ( $page_attributes [ 'number' ], $page_attributes ) ; 11757 11758 break ; 11759 11760 // <column> tag : 11761 case 'column' : 11762 $column_attributes = PdfToTextCaptureDefinitions::GetNodeAttributes 11763 ( 11764 $child, 11765 array 11766 ( 11767 'name' => true, 11768 'left' => false, 11769 'right' => false, 11770 'width' => false, 11771 'default' => false 11772 ) 11773 ) ; 11774 11775 $column_name = $column_attributes [ 'name' ] ; 11776 11777 // Build the final default value, if any one is specified ; the following special constructs are processed : 11778 // - "%c" : 11779 // Replaced by the column name. 11780 // - "%n" : 11781 // Replaced by the column index (starting from zero). 11782 if ( ! $column_attributes [ 'default' ] ) 11783 $column_attributes [ 'default' ] = $column_default ; 11784 11785 $substitutes = array 11786 ( 11787 '%c' => $column_name, 11788 '%n' => count ( $this -> Columns ) 11789 ) ; 11790 11791 $column_attributes [ 'default' ] = str_replace 11792 ( 11793 array_keys ( $substitutes ), 11794 array_values ( $substitutes ), 11795 $column_attributes [ 'default' ] 11796 ) ; 11797 11798 // Add the column definition to this object 11799 if ( ! isset ( $this -> Columns [ $column_name ] ) ) 11800 { 11801 $this -> Columns [ $column_attributes [ 'name' ] ] = $column_attributes ; 11802 $this -> ColumnNames [] = $column_attributes [ 'name' ] ; 11803 } 11804 else 11805 error ( new PdfToTextCaptureException ( "Column \"$column_name\" is defined more than once." ) ) ; 11806 11807 break ; 11808 11809 // Other tag : throw an exception 11810 default : 11811 error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <rectangle>." ) ) ; 11812 } 11813 } 11814 } 11815 11816 11817 /*-------------------------------------------------------------------------------------------------------------- 11818 11819 ExtractAreas - 11820 Extracts text contents from the document fragments. 11821 11822 *-------------------------------------------------------------------------------------------------------------*/ 11823 public function ExtractAreas ( $document_fragments ) 11824 { 11825 $result = array ( ) ; 11826 11827 // Loop through each page of document fragments 11828 foreach ( $document_fragments as $page => $page_contents ) 11829 { 11830 $fragments = $page_contents [ 'fragments' ] ; 11831 11832 // Ignore this page if not included in the <columns> definition 11833 if ( ! isset ( $this -> ApplicablePages -> PageMap [ $page ] ) ) 11834 continue ; 11835 11836 // <columns> definition only gives the location of the first line of each column, together 11837 // with its height. 11838 // We will build as many new column areas as can fit on one page 11839 $this_page_areas = $this -> Areas [ $page ] ; 11840 $column_areas = array ( ) ; 11841 11842 for ( $i = 0, $count = count ( $this_page_areas ) ; $i < $count ; $i ++ ) 11843 { 11844 // For now, duplicate the existing column areas - they will represent the 1st line of columns 11845 $this_page_area = $this_page_areas [$i] ; 11846 $new_area = clone ( $this_page_area ) ; 11847 $column_areas [0] [] = $new_area ; 11848 $line_height = $new_area -> Height ; 11849 $current_top = $new_area -> Top - $line_height ; 11850 $current_line = 0 ; 11851 11852 // Then build new column areas for each successive lines 11853 while ( $current_top - $line_height >= 0 ) 11854 { 11855 $current_line ++ ; 11856 $new_area = clone ( $new_area ) ; 11857 $new_area -> Top -= $line_height ; 11858 $new_area -> Bottom -= $line_height ; 11859 11860 $column_areas [ $current_line ] [] = $new_area ; 11861 $current_top -= $line_height ; 11862 } 11863 } 11864 11865 // Now extract the columns, line per line, from the current page's text fragments 11866 $found_lines = array ( ) ; 11867 11868 foreach ( $fragments as $fragment ) 11869 { 11870 $this -> GetFragmentData ( $fragment, $text, $left, $top, $right, $bottom ) ; 11871 11872 // Loop through each line of column areas, built from the above step 11873 foreach ( $column_areas as $line => $column_areas_per_name ) 11874 { 11875 $index = 0 ; // Column index 11876 11877 // Process each column area 11878 foreach ( $column_areas_per_name as $column_area ) 11879 { 11880 // ... but only do something if the current column area is contained in the current fragment 11881 if ( $column_area -> Contains ( $left, $top, $right, $bottom ) ) 11882 { 11883 // The normal usage will be to capture one-line columns... 11884 if ( ! isset ( $found_lines [ $line ] [ $column_area -> Name ] ) ) 11885 { 11886 $found_lines [ $line ] [ $column_area -> Name ] = 11887 new PdfToTextCapturedColumn ( $page, $column_area -> Name, $text, 11888 $left, $top, $right, $bottom, $this ) ; 11889 } 11890 // ... but you can also use them to capture multiple lines ; in this case, the "separator" attribute of the <lines> or 11891 // <column> tag will be used to separate items 11892 else 11893 { 11894 $existing_area = $found_lines [ $line ] [ $column_area -> Name ] ; 11895 11896 $existing_area -> Top = max ( $existing_area -> Top , $column_area -> Top ) ; 11897 $existing_area -> Bottom = min ( $existing_area -> Bottom, $column_area -> Bottom ) ; 11898 $existing_area -> Left = min ( $existing_area -> Left , $column_area -> Left ) ; 11899 $existing_area -> Right = max ( $existing_area -> Right , $column_area -> Right ) ; 11900 $existing_area -> Text .= $this -> Separator . $text ; 11901 } 11902 } 11903 11904 $index ++ ; 11905 } 11906 } 11907 } 11908 11909 // A final pass to provide default values for empty columns (usually, column values that are not represented in the PDF file) 11910 // Also get the surrounding box for the whole line 11911 $final_lines = array ( ) ; 11912 11913 foreach ( $found_lines as $line => $columns_line ) 11914 { 11915 foreach ( $this -> ColumnNames as $column_name ) 11916 { 11917 if ( ! isset ( $columns_line [ $column_name ] ) ) 11918 { 11919 $columns_line [ $column_name ] = 11920 new PdfToTextCapturedColumn ( $page, $column_name, $this -> Columns [ $column_name ] [ 'default' ], 0, 0, 0, 0, $this ) ; 11921 } 11922 } 11923 11924 // Get the (left,top) coordinates of the line 11925 $line_left = $found_lines [ $line ] [ $this -> ColumnNames [0] ] -> Left ; 11926 $line_top = $found_lines [ $line ] [ $this -> ColumnNames [0] ] -> Top ; 11927 11928 // Get the (right,bottom) coordinates - we have to find the last column whose value is not a default value 11929 // (and therefore, has a non-zero Right coordinate) 11930 $last = count ( $this -> ColumnNames ) - 1 ; 11931 $line_right = 0 ; 11932 $line_bottom = 0 ; 11933 11934 while ( $last >= 0 && ! $columns_line [ $this -> ColumnNames [ $last ] ] -> Right ) 11935 $last -- ; 11936 11937 if ( $last > 0 ) 11938 { 11939 $line_right = $columns_line [ $this -> ColumnNames [ $last ] ] -> Right ; 11940 $line_bottom = $columns_line [ $this -> ColumnNames [ $last ] ] -> Bottom ; 11941 } 11942 11943 // Create a CaptureLine entry 11944 $final_lines [] = new PdfToTextCapturedLine ( $page, $this -> Name, $columns_line, $line_left, $line_top, $line_right, $line_bottom, $this ) ; 11945 } 11946 11947 // The result for this page will be a CapturedLines object 11948 $result [ $page ] = new PdfToTextCapturedLines ( $this -> Name, $page, $final_lines ) ; 11949 } 11950 11951 // All done, return 11952 return ( $result ) ; 11953 } 11954 11955 11956 /*-------------------------------------------------------------------------------------------------------------- 11957 11958 SetPageCount - 11959 Extracts text contents from the document fragments. 11960 11961 *-------------------------------------------------------------------------------------------------------------*/ 11962 public function SetPageCount ( $count ) 11963 { 11964 parent::SetPageCount ( $count ) ; 11965 11966 foreach ( $this -> ApplicablePages as $page => $applicable ) 11967 { 11968 if ( ! $applicable ) 11969 continue ; 11970 11971 foreach ( $this -> Columns as $column ) 11972 { 11973 if ( ! isset ( $this -> Tops [ $page ] ) ) 11974 { 11975 $this -> Tops [ $page ] = ( double ) $this -> ApplicablePages -> ExtraPageMapData [ $page ] [ 'column-top' ] ; 11976 $this -> Bottoms [ $page ] = ( double ) $this -> ApplicablePages -> ExtraPageMapData [ $page ] [ 'column-bottom' ] ; 11977 } 11978 11979 $area = new PdfToTextCaptureArea ( $column, $this -> ApplicablePages -> ExtraPageMapData [ $page ], $column [ 'name' ] ) ; 11980 11981 $this -> Areas [ $page ] [] = $area ; 11982 } 11983 } 11984 } 11985 11986 11987 /*-------------------------------------------------------------------------------------------------------------- 11988 11989 Support functions. 11990 11991 *-------------------------------------------------------------------------------------------------------------*/ 11992 } 11993 11994 11995/*============================================================================================================== 11996 11997 class PdfToTextCaptureApplicablePages - 11998 Holds a list of applicable pages given by the "number" attribute of <page> tags. 11999 12000 ==============================================================================================================*/ 12001class PdfToTextCaptureApplicablePages //extends Object 12002 implements ArrayAccess, Countable, Iterator 12003 { 12004 // Ranges of pages, as given by the "number" attribute of the <page> tag. Since a page number expression 12005 // can refer to the last page ("$"), and the total number of pages in the document is not yet known at the 12006 // time of object instantiation, we have to store all the page ranges as is. 12007 protected $PageRanges = array ( ) ; 12008 12009 // Once the SetPageCount() method has been called (ie, once the total number of pages in the document is 12010 // known), then a PageMap is built ; each key is the page number, indicating whether the page applies or not. 12011 public $PageMap = array ( ) ; 12012 12013 // Extra data associated, this time, with each page in PageMap 12014 public $ExtraPageMapData = array ( ) ; 12015 12016 // Page count - set by the SetPageCount() method 12017 public $PageCount = false ; 12018 12019 12020 /*-------------------------------------------------------------------------------------------------------------- 12021 12022 CONSTRUCTOR 12023 Initializes the object. 12024 12025 *-------------------------------------------------------------------------------------------------------------*/ 12026 public function __construct ( ) 12027 { 12028 } 12029 12030 12031 /*-------------------------------------------------------------------------------------------------------------- 12032 12033 NAME 12034 Add - Add a page number(s) definition. 12035 12036 PROTOTYPE 12037 $applicable_pages -> Add ( $page_number ) ; 12038 12039 DESCRIPTION 12040 Add the page number(s) specified by the "number" attribute of the <pages> tag to the list of applicable 12041 pages. 12042 12043 PARAMETERS 12044 $page_number (string) - 12045 A string defining which pages are applicable. This can be a single page number : 12046 12047 <page number="1" .../> 12048 12049 or a comma-separated list of pages : 12050 12051 <page number="1, 2, 10" .../> 12052 12053 or range(s) of pages : 12054 12055 <page number="1..10, 12..20" .../> 12056 12057 The special "$" character means "last page" ; thus the following example : 12058 12059 <page number="1, $-9..$" .../> 12060 12061 means : "applicable pages are 1, plus the last ten pages f the document". 12062 12063 *-------------------------------------------------------------------------------------------------------------*/ 12064 public function Add ( $page_number, $extra_data = false ) 12065 { 12066 $this -> __parse_page_numbers ( $page_number, $extra_data ) ; 12067 } 12068 12069 12070 /*-------------------------------------------------------------------------------------------------------------- 12071 12072 NAME 12073 SetPageCount - Sets the total number of pages in the document. 12074 12075 PROTOTYPE 12076 $applicable_pages -> SetPageCount ( $count ) ; 12077 12078 DESCRIPTION 12079 Sets the total number of pages in the document and builds a map of which pages are applicable or not. 12080 12081 PARAMETERS 12082 $count (integer) - 12083 Total number of pages in the document. 12084 12085 *-------------------------------------------------------------------------------------------------------------*/ 12086 public function SetPageCount ( $count ) 12087 { 12088 $this -> PageCount = $count ; 12089 $this -> PageMap = array ( ) ; 12090 12091 // Loop through the page ranges - every single value in the ranges has been converted to an integer ; 12092 // the other ones, built as expressions (using "$" for example) are processed here to give the actual 12093 // page number 12094 foreach ( $this -> PageRanges as $range ) 12095 { 12096 $low = $range [0] ; 12097 $high = $range [1] ; 12098 12099 // Translate expression to an actual value for the low and high parts of the range, if not already integers 12100 if ( ! is_integer ( $low ) ) 12101 $low = $this -> __check_expression ( $low, $count ) ; 12102 12103 if ( ! is_integer ( $high ) ) 12104 $high = $this -> __check_expression ( $high, $count ) ; 12105 12106 // Expressions using "$" may lead to negative values - adjust them 12107 if ( $low < 1 ) 12108 { 12109 if ( $high < 1 ) 12110 $high = 1 ; 12111 12112 $low = 1 ; 12113 } 12114 12115 // Check that the range is consistent 12116 if ( $low > $high ) 12117 error ( new PdfToTextCaptureException ( "Low value ($low) must be less or equal to high value ($high) " . 12118 "in page range specification \"{$range [0]}..{$range [1]}\"." ) ) ; 12119 12120 // Ignore ranges where the 'low' value is higher than the number of pages in the document 12121 if ( $low > $count ) 12122 { 12123 warning ( new PdfToTextCaptureException ( "Low value ($low) is greater than page count ($count) " . 12124 "in page range specification \"{$range [0]}..{$range [1]}\"." ) ) ; 12125 continue ; 12126 } 12127 12128 // Normalize the 'high' value, so that it's not bigger than the number of pages in the document 12129 if ( $high > $count ) 12130 $high = $count ; 12131 12132 // Complement the page map using this range 12133 for ( $i = $low ; $i <= $high ; $i ++ ) 12134 { 12135 $this -> PageMap [$i] = true ; 12136 $this -> ExtraPageMapData [$i] = $range [2] ; 12137 } 12138 } 12139 } 12140 12141 12142 /*-------------------------------------------------------------------------------------------------------------- 12143 12144 Interfaces implementations. 12145 12146 *-------------------------------------------------------------------------------------------------------------*/ 12147 12148 // Countable interface 12149 public function count ( ) 12150 { return ( count ( $this -> PageMap ) ) ; } 12151 12152 12153 // Array access interface 12154 public function offsetExists ( $offset ) 12155 { return ( isset ( $this -> PageMap [ $offset ] ) ) ; } 12156 12157 12158 public function offsetGet ( $offset ) 12159 { return ( ( isset ( $this -> PageMap [ $offset ] ) ) ? true : false ) ; } 12160 12161 12162 public function offsetSet ( $offset, $value ) 12163 { error ( new PdfToTextException ( "Unsupported operation" ) ) ; } 12164 12165 12166 public function offsetunset ( $offset ) 12167 { error ( new PdfToTextException ( "Unsupported operation" ) ) ; } 12168 12169 12170 // Iterator interface 12171 private $__iterator_value = 1 ; 12172 12173 public function rewind ( ) 12174 { $this -> __iterator_value = 1 ; } 12175 12176 12177 public function valid ( ) 12178 { return ( $this -> __iterator_value >= 1 && $this -> __iterator_value <= $this -> PageCount ) ; } 12179 12180 12181 public function key ( ) 12182 { return ( $this -> __iterator_value ) ; } 12183 12184 12185 public function next ( ) 12186 { $this -> __iterator_value ++ ; } 12187 12188 12189 public function current ( ) 12190 { return ( ( isset ( $this -> PageMap [ $this -> __iterator_value ] ) ) ? true : false ) ; } 12191 12192 12193 /*-------------------------------------------------------------------------------------------------------------- 12194 12195 Helper functions. 12196 12197 *-------------------------------------------------------------------------------------------------------------*/ 12198 12199 // __parse_page_numbers - 12200 // Performs a first pass on the value of the "number" attribute of the <page> tag. Transforms range expressions 12201 // when possible to integers ; keep the expression string intact when either the low or high value of a range 12202 // is itself an expression, probably using the "$" (page count) character. 12203 private function __parse_page_numbers ( $text, $extra_data ) 12204 { 12205 $ranges = explode ( ',', $text ) ; 12206 12207 // Loop through comma-separated ranges 12208 foreach ( $ranges as $range ) 12209 { 12210 $items = explode ( '..', $range ) ; 12211 12212 // Check if current item is a range 12213 switch ( count ( $items ) ) 12214 { 12215 // If not a range (ie, a single value) then make a range using that value 12216 // (low and high range values will be the same) 12217 case 1 : 12218 if ( is_numeric ( $items [0] ) ) 12219 $low = $high = ( integer ) $items [0] ; 12220 else 12221 $low = $high = trim ( $items [0] ) ; 12222 12223 break ; 12224 12225 // If range, store the low and high values 12226 case 2 : 12227 $low = ( is_numeric ( $items [0] ) ) ? ( integer ) $items [0] : trim ( $items [0] ) ; 12228 $high = ( is_numeric ( $items [1] ) ) ? ( integer ) $items [1] : trim ( $items [1] ) ; 12229 break ; 12230 12231 // Other cases : throw an exception 12232 default : 12233 error ( new PdfToTextCaptureException ( "Invalid page range specification \"$range\"." ) ) ; 12234 } 12235 12236 // If the low or high range value is an expression, check at this stage that it is correct 12237 if ( is_string ( $low ) && $this -> __check_expression ( $low ) === false ) 12238 error ( new PdfToTextCaptureException ( "Invalid expression \"$low\" in page range specification \"$range\"." ) ) ; 12239 12240 if ( is_string ( $high ) && $this -> __check_expression ( $high ) === false ) 12241 error ( new PdfToTextCaptureException ( "Invalid expression \"$high\" in page range specification \"$range\"." ) ) ; 12242 12243 // Add the page range and the extra data 12244 $this -> PageRanges [] = array ( $low, $high, $extra_data ) ; 12245 } 12246 } 12247 12248 12249 // __check_expression - 12250 // Checks that a syntactically correct 12251 private function __check_expression ( $str, $count = 1 ) 12252 { 12253 $new_str = str_replace ( '$', $count, $str ) ; 12254 $value = @eval ( "return ( $new_str ) ;" ) ; 12255 12256 return ( $value ) ; 12257 } 12258 } 12259 12260 12261/*============================================================================================================== 12262 12263 class PdfToTextCaptureArea - 12264 A capture area describes a rectangle, either by its top, left, right and bottom coordinates, or by 12265 its top/left coordinates, and its width and height. 12266 12267 ==============================================================================================================*/ 12268class PdfToTextCaptureArea //extends Object 12269 { 12270 // List of authorzed keyword for defining the rectangle dimensions 12271 static private $Keys = array ( 'left', 'top', 'right', 'bottom', 'width', 'height' ) ; 12272 12273 // Rectangle dimensions 12274 private $Left = false, 12275 $Top = false, 12276 $Right = false, 12277 $Bottom = false ; 12278 12279 // Area name (for internal purposes) 12280 public $Name ; 12281 12282 12283 /*-------------------------------------------------------------------------------------------------------------- 12284 12285 NAME 12286 Constructor 12287 12288 PROTOTYPE 12289 $area = new PdfToTextCaptureArea ( $area, $default_area = null, $name = '' ) ; 12290 12291 DESCRIPTION 12292 Initialize an area (a rectangle) using the supplied coordinates 12293 12294 PARAMETERS 12295 $area (array) - 12296 An associative array that may contain the following entries : 12297 12298 - 'left' (double) : 12299 Left x-coordinate (mandatory). 12300 12301 - 'top' (double) : 12302 Top y-coordinate (mandatory). 12303 12304 - 'right (double) : 12305 Right x-coordinate. 12306 12307 - 'bottom' (double) : 12308 Bottom y-coordinate. 12309 12310 - 'width' (double) : 12311 Width of the rectangle, starting from 'left'. 12312 12313 - 'height' (double) : 12314 Height of the rectangle, starting from 'top'. 12315 12316 Either the 'right' or 'width' entries must be specified. This is the same for the 'bottom' and 12317 'height' entries. 12318 12319 $default_area (array) - 12320 An array that can be used to supply default values when absent from $area. 12321 12322 $name (string) - 12323 An optional name for this area. This information is not used by the class. 12324 12325 NOTES 12326 Coordinate (0,0) is located at the left bottom of the page. 12327 12328 *-------------------------------------------------------------------------------------------------------------*/ 12329 public function __construct ( $area, $default_area = null, $name = '' ) 12330 { 12331 $left = 12332 $top = 12333 $right = 12334 $bottom = 12335 $width = 12336 $height = false ; 12337 12338 // Retrieve each entry that allows to specify a coordinate component, using $default_area if needed 12339 foreach ( self::$Keys as $key ) 12340 { 12341 if ( isset ( $area [ $key ] ) ) 12342 { 12343 if ( $area [ $key ] === false ) 12344 { 12345 if ( isset ( $default_area [ $key ] ) ) 12346 $$key = $default_area [ $key ] ; 12347 else 12348 $$key = false ; 12349 } 12350 else 12351 $$key = $area [ $key ] ; 12352 } 12353 else if ( isset ( $default_area [ $key ] ) ) 12354 $$key = $default_area [ $key ] ; 12355 } 12356 12357 // Check for mandatory coordinates 12358 if ( $left === false ) 12359 error ( new PdfToTextCaptureException ( "Attribute \"left\" is mandatory." ) ); 12360 else 12361 $left = ( double ) $left ; 12362 12363 if ( $top === false ) 12364 error ( new PdfToTextCaptureException ( "Attribute \"top\" is mandatory." ) ) ; 12365 else 12366 $top = ( double ) $top ; 12367 12368 // Either the 'right' or 'width' entries are required 12369 if ( $right === false ) 12370 { 12371 if ( $width === false ) 12372 error ( new PdfToTextCaptureException ( "Either the \"right\" or the \"width\" attribute must be specified." ) ) ; 12373 else 12374 $right = $left + ( double ) $width - 1 ; 12375 } 12376 else 12377 $right = ( double ) $right ; 12378 12379 // Same for 'bottom' and 'height' 12380 if ( $bottom === false ) 12381 { 12382 if ( $height === false ) 12383 error ( new PdfToTextCaptureException ( "Either the \"bottom\" or the \"height\" attribute must be specified." ) ) ; 12384 else 12385 $bottom = $top - ( double ) $height + 1 ; 12386 } 12387 else 12388 $bottom = ( double ) $bottom ; 12389 12390 // All done, we have the coordinates we wanted 12391 $this -> Left = $left ; 12392 $this -> Right = $right ; 12393 $this -> Top = $top ; 12394 $this -> Bottom = $bottom ; 12395 12396 $this -> Name = $name ; 12397 } 12398 12399 12400 /*-------------------------------------------------------------------------------------------------------------- 12401 12402 NAME 12403 __get, __set - Implement the Width and Height properties. 12404 12405 *-------------------------------------------------------------------------------------------------------------*/ 12406 public function __get ( $member ) 12407 { 12408 switch ( $member ) 12409 { 12410 case 'Left' : 12411 case 'Top' : 12412 case 'Right' : 12413 case 'Bottom' : 12414 return ( $this -> $member ) ; 12415 12416 case 'Width' : 12417 return ( $this -> Right - $this -> Left + 1 ) ; 12418 12419 case 'Height' : 12420 return ( $this -> Top - $this -> Bottom + 1 ) ; 12421 12422 default : 12423 trigger_error ( "Undefined property \"$member\"." ) ; 12424 } 12425 } 12426 12427 12428 public function __set ( $member, $value ) 12429 { 12430 $value = ( double ) $value ; 12431 12432 switch ( $member ) 12433 { 12434 case 'Top' : 12435 case 'Left' : 12436 case 'Right' : 12437 case 'Bottom' : 12438 $this -> $member = $value ; 12439 break ; 12440 12441 case 'Width' : 12442 $this -> Right = $this -> Left + $value - 1 ; 12443 break ; 12444 12445 case 'Height' : 12446 $this -> Bottom = $this -> Top - $value + 1 ; 12447 break ; 12448 12449 default : 12450 trigger_error ( "Undefined property \"$member\"." ) ; 12451 } 12452 } 12453 12454 12455 /*-------------------------------------------------------------------------------------------------------------- 12456 12457 NAME 12458 Contains - Check if this area contains the specified rectangle. 12459 12460 *-------------------------------------------------------------------------------------------------------------*/ 12461 public function Contains ( $left, $top, $right, $bottom ) 12462 { 12463 if ( $left >= $this -> Left && $right <= $this -> Right && 12464 $top <= $this -> Top && $bottom >= $this -> Bottom ) 12465 return ( true ) ; 12466 else 12467 return ( false ) ; 12468 } 12469 } 12470 12471 12472 12473/************************************************************************************************************** 12474 ************************************************************************************************************** 12475 ************************************************************************************************************** 12476 ****** ****** 12477 ****** ****** 12478 ****** CAPTURED TEXT MANAGEMENT ****** 12479 ****** (none of the classes listed here are meant to be instantiated outside this file) ****** 12480 ****** ****** 12481 ****** ****** 12482 ************************************************************************************************************** 12483 ************************************************************************************************************** 12484 **************************************************************************************************************/ 12485 12486 /*============================================================================================================== 12487 12488 class PdfToTextCapturedText - 12489 Base class for captured text enclosed by shapes. 12490 12491 ==============================================================================================================*/ 12492 abstract class PdfToTextCapturedText //extends Object 12493 { 12494 // Shape name (as specified by the "name" attribute of the <rectangle> or <lines> tags, for example) 12495 public $Name ; 12496 // Number of the page where the text was found (starts from 1) 12497 public $Page ; 12498 // Shape type (one of the PfToTextCaptureShape::SHAPE_* constants) 12499 public $Type ; 12500 // Shape definition object (not really used, but in case of...) 12501 private $ShapeDefinition ; 12502 // Captured text 12503 public $Text ; 12504 // Surrounding rectangle in the PDF file 12505 public $Left, 12506 $Top, 12507 $Right, 12508 $Bottom ; 12509 12510 12511 12512 /*-------------------------------------------------------------------------------------------------------------- 12513 12514 Constructor - 12515 Initializes a captured text object, whatever the original shape. 12516 12517 *-------------------------------------------------------------------------------------------------------------*/ 12518 public function __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) 12519 { 12520 $this -> Name = $name ; 12521 $this -> Page = $page ; 12522 $this -> ShapeDefinition = $definition ; 12523 $this -> Text = $text ; 12524 $this -> Left = $left ; 12525 $this -> Top = $top ; 12526 $this -> Right = $right ; 12527 $this -> Bottom = $bottom ; 12528 $this -> Type = $definition -> Type ; 12529 } 12530 } 12531 12532 12533 /*============================================================================================================== 12534 12535 class PdfToTextCapturedRectangle - 12536 Implements a text captured by a rectangle shape. 12537 12538 ==============================================================================================================*/ 12539class PdfToTextCapturedRectangle extends PdfToTextCapturedText 12540 { 12541 public function __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) 12542 { 12543 parent::__construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) ; 12544 } 12545 12546 12547 public function __tostring ( ) 12548 { return ( $this -> Text ) ; } 12549 } 12550 12551 12552 /*============================================================================================================== 12553 12554 class PdfToTextCapturedColumn - 12555 Implements a text captured by a lines/column shape. 12556 Actually behaves like the PdfToTextCapturedRectangle class 12557 12558 ==============================================================================================================*/ 12559class PdfToTextCapturedColumn extends PdfToTextCapturedText 12560 { 12561 public function __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) 12562 { 12563 parent::__construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) ; 12564 } 12565 12566 12567 public function __tostring ( ) 12568 { return ( $this -> Text ) ; } 12569 } 12570 12571 12572 /*============================================================================================================== 12573 12574 class PdfToTextCapturedLine - 12575 Implements a text captured by a lines shape. 12576 12577 ==============================================================================================================*/ 12578class PdfToTextCapturedLine extends PdfToTextCapturedText 12579 implements ArrayAccess, Countable, IteratorAggregate 12580 { 12581 // Column objects 12582 public $Columns ; 12583 // Array of column names, to allow access by either index or column name 12584 private $ColumnsByNames = array ( ) ; 12585 12586 12587 /*-------------------------------------------------------------------------------------------------------------- 12588 12589 Constructor - 12590 Builds a Line object based on the supplied columns. 12591 Also builds the Text property, which contains the columns text separated by the separator string 12592 specified in the XML definition. 12593 12594 *-------------------------------------------------------------------------------------------------------------*/ 12595 public function __construct ( $page, $name, $columns, $left, $top, $right, $bottom, $definition ) 12596 { 12597 // Although the Columns property is most likely to be used, build a text representation of the whole ine 12598 $text = array ( ) ; 12599 $count = 0 ; 12600 12601 foreach ( $columns as $column ) 12602 { 12603 $text [] = $column -> Text ; 12604 $this -> ColumnsByNames [ $column -> Name ] = $count ++ ; 12605 } 12606 12607 // Provide this information to the parent constructor 12608 parent::__construct ( $page, $name, implode ( $definition -> Separator, $text ), $left, $top, $right, $bottom, $definition ) ; 12609 12610 // Store the column definitions 12611 $this -> Columns = $columns ; 12612 } 12613 12614 12615 /*-------------------------------------------------------------------------------------------------------------- 12616 12617 __get - 12618 Returns access to a column by its name. 12619 12620 *-------------------------------------------------------------------------------------------------------------*/ 12621 public function __get ( $member ) 12622 { 12623 if ( isset ( $this -> ColumnsByNames [ $member ] ) ) 12624 return ( $this -> Columns [ $this -> ColumnsByNames [ $offset ] ] ) ; 12625 else 12626 trigger_error ( "Undefined property \"$member\"." ) ; 12627 } 12628 12629 12630 /*-------------------------------------------------------------------------------------------------------------- 12631 12632 Interfaces implementations. 12633 12634 *-------------------------------------------------------------------------------------------------------------*/ 12635 public function count ( ) 12636 { return ( $this -> Columns ) ; } 12637 12638 12639 public function getIterator ( ) 12640 { return ( new ArrayIterator ( $this -> Columns ) ) ; } 12641 12642 12643 public function offsetExists ( $offset ) 12644 { 12645 if ( is_numeric ( $offset ) ) 12646 return ( $offset >= 0 && $offset < count ( $this -> Columns ) ) ; 12647 else 12648 return ( isset ( $this -> ColumnsByNames [ $offset ] ) ) ; 12649 } 12650 12651 12652 public function offsetGet ( $offset ) 12653 { 12654 if ( is_numeric ( $offset ) ) 12655 return ( $this -> Columns [ $offset ] ) ; 12656 else 12657 return ( $this -> Columns [ $this -> ColumnsByNames [ $offset ] ] ) ; 12658 } 12659 12660 12661 public function offsetSet ( $offset, $value ) 12662 { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; } 12663 12664 12665 public function offsetUnset ( $offset ) 12666 { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; } 12667 } 12668 12669 12670 /*============================================================================================================== 12671 12672 class PdfToTextCapturedLines - 12673 Implements a set of lines. 12674 12675 ==============================================================================================================*/ 12676class PdfToTextCapturedLines //extends Object 12677 implements ArrayAccess, Countable, IteratorAggregate 12678 { 12679 // Capture name, as specified by the "name" attribute of the <lines> tag 12680 public $Name ; 12681 // Page number of the capture 12682 public $Page ; 12683 // Captured lines 12684 public $Lines ; 12685 // Content type (mimics a little bit the PdfToTextCapturedText class) 12686 public $Type = PdfToTextCaptureShapeDefinition::SHAPE_LINE ; 12687 12688 12689 /*-------------------------------------------------------------------------------------------------------------- 12690 12691 Constructor - 12692 Instantiates a PdfToTextCapturedLines object. 12693 12694 *-------------------------------------------------------------------------------------------------------------*/ 12695 public function __construct ( $name, $page, $lines ) 12696 { 12697 $this -> Name = $name ; 12698 $this -> Page = $page ; 12699 $this -> Lines = $lines ; 12700 } 12701 12702 12703 /*-------------------------------------------------------------------------------------------------------------- 12704 12705 Interfaces implementations. 12706 12707 *-------------------------------------------------------------------------------------------------------------*/ 12708 public function count ( ) 12709 { return ( $this -> Lines ) ; } 12710 12711 12712 public function getIterator ( ) 12713 { return ( new ArrayIterator ( $this -> Lines ) ) ; } 12714 12715 12716 public function offsetExists ( $offset ) 12717 { return ( $offset >= 0 && $offset < count ( $this -> Lines ) ) ; } 12718 12719 12720 public function offsetGet ( $offset ) 12721 { return ( $this -> Captures [ $offset ] ) ; } 12722 12723 12724 public function offsetSet ( $offset, $value ) 12725 { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; } 12726 12727 12728 public function offsetUnset ( $offset ) 12729 { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; } 12730 } 12731 12732 12733/************************************************************************************************************** 12734 ************************************************************************************************************** 12735 ************************************************************************************************************** 12736 ****** ****** 12737 ****** ****** 12738 ****** CAPTURE INTERFACE FOR THE DEVELOPER ****** 12739 ****** (none of the classes listed here are meant to be instantiated outside this file) ****** 12740 ****** ****** 12741 ****** ****** 12742 ************************************************************************************************************** 12743 ************************************************************************************************************** 12744 **************************************************************************************************************/ 12745 12746/*============================================================================================================== 12747 12748 class PdfToTextCaptures - 12749 Represents all the areas in a PDF file captured by the supplied XML definitions. 12750 12751 ==============================================================================================================*/ 12752class PdfToTextCaptures //extends Object 12753 { 12754 // Captured objects - May not exactly reflect the PdfToTextCapture*Shape classes 12755 private $CapturedObjects ; 12756 // Allows faster access by capture name 12757 private $ObjectsByName = array ( ) ; 12758 12759 12760 /*-------------------------------------------------------------------------------------------------------------- 12761 12762 Constructor - 12763 Instantiates a PdfToTextCaptures object. 12764 12765 *-------------------------------------------------------------------------------------------------------------*/ 12766 public function __construct ( $captures ) 12767 { 12768 $this -> CapturedObjects = $captures ; 12769 12770 // Build an array of objects indexed by their names 12771 foreach ( $captures as $page => $shapes ) 12772 { 12773 foreach ( $shapes as $shape ) 12774 $this -> ObjectsByName [ $shape -> Name ] [] = $shape ; 12775 } 12776 } 12777 12778 12779 /*-------------------------------------------------------------------------------------------------------------- 12780 12781 ToCaptures - 12782 Returns a simplified view of captured objects, with only name/value pairs. 12783 12784 *-------------------------------------------------------------------------------------------------------------*/ 12785 public function ToCaptures ( ) 12786 { 12787 $result = new stdClass ( ) ; 12788 12789 foreach ( $this -> CapturedObjects as $page => $captures ) 12790 { 12791 foreach ( $captures as $capture ) 12792 { 12793 switch ( $capture -> Type ) 12794 { 12795 case PdfToTextCaptureShapeDefinition::SHAPE_RECTANGLE : 12796 $name = $capture -> Name ; 12797 $value = $capture -> Text ; 12798 $result -> {$name} [ $page ] = $value ; 12799 break ; 12800 12801 case PdfToTextCaptureShapeDefinition::SHAPE_LINE : 12802 $name = $capture -> Name ; 12803 12804 if ( ! isset ( $result -> {$name} ) ) 12805 $result -> {$name} = array ( ) ; 12806 12807 foreach ( $capture as $line ) 12808 { 12809 $columns = new stdClass ; 12810 12811 foreach ( $line as $column ) 12812 { 12813 $column_name = $column -> Name ; 12814 $column_value = $column -> Text ; 12815 $columns -> {$column_name} = $column_value ; 12816 } 12817 12818 $result -> {$name} [] = $columns ; 12819 } 12820 } 12821 } 12822 } 12823 12824 return ( $result ) ; 12825 } 12826 12827 12828 /*-------------------------------------------------------------------------------------------------------------- 12829 12830 __get - 12831 Retrieves the captured objects by their name, as specified in the XML definition. 12832 12833 *-------------------------------------------------------------------------------------------------------------*/ 12834 public function __get ( $member ) 12835 { 12836 $fieldname = "__capture_{$member}__" ; 12837 12838 if ( ! isset ( $this -> $fieldname ) ) 12839 { 12840 if ( ! isset ( $this -> ObjectsByName [ $member ] ) ) 12841 error ( new PdfToTextException ( "Undefined property \"$member\"." ) ) ; 12842 12843 $this -> $fieldname = $this -> GetCaptureInstance ( $member ) ; 12844 } 12845 12846 return ( $this -> $fieldname ) ; 12847 } 12848 12849 12850 /*-------------------------------------------------------------------------------------------------------------- 12851 12852 GetCapturedObjectsByName - 12853 Returns an associative array of the captured shapes, indexed by their name. 12854 12855 *-------------------------------------------------------------------------------------------------------------*/ 12856 public function GetCapturedObjectsByName ( ) 12857 { return ( $this -> ObjectsByName ) ; } 12858 12859 12860 /*-------------------------------------------------------------------------------------------------------------- 12861 12862 GetCaptureInstance - 12863 Returns an object inheriting from the PdfToTextCapture class, that wraps the capture results. 12864 12865 *-------------------------------------------------------------------------------------------------------------*/ 12866 protected function GetCaptureInstance ( $fieldname ) 12867 { 12868 switch ( $this -> ObjectsByName [ $fieldname ] [0] -> Type ) 12869 { 12870 case PdfToTextCaptureShapeDefinition::SHAPE_RECTANGLE : 12871 return ( new PdfToTextRectangleCapture ( $this -> ObjectsByName [ $fieldname ] ) ) ; 12872 12873 case PdfToTextCaptureShapeDefinition::SHAPE_LINE : 12874 return ( new PdfToTextLinesCapture ( $this -> ObjectsByName [ $fieldname ] ) ) ; 12875 12876 default : 12877 error ( new PdfToTextCaptureException ( "Unhandled shape type " . $this -> ObjectsByName [ $fieldname ] [0] -> Type . "." ) ) ; 12878 } 12879 } 12880 12881 12882 } 12883 12884 12885/*============================================================================================================== 12886 12887 class PdfToTextCapture - 12888 Base class for all capture classes accessible to the caller. 12889 12890 ==============================================================================================================*/ 12891class PdfToTextCapture //extends Object 12892 implements ArrayAccess, Countable, IteratorAggregate 12893 { 12894 protected $Captures ; 12895 12896 12897 /*-------------------------------------------------------------------------------------------------------------- 12898 12899 Constructor - 12900 Instantiates a PdfToTextCapture object. 12901 12902 *-------------------------------------------------------------------------------------------------------------*/ 12903 public function __construct ( $objects ) 12904 { 12905 //parent::__construct ( ) ; 12906 12907 $this -> Captures = $objects ; 12908 } 12909 12910 12911 /*-------------------------------------------------------------------------------------------------------------- 12912 12913 Interfaces implementations. 12914 12915 *-------------------------------------------------------------------------------------------------------------*/ 12916 public function count ( ) 12917 { return ( $this -> Captures ) ; } 12918 12919 12920 public function getIterator ( ) 12921 { return ( new ArrayIterator ( $this -> Captures ) ) ; } 12922 12923 12924 public function offsetExists ( $offset ) 12925 { return ( $offset >= 0 && $offset < count ( $this -> Captures ) ) ; } 12926 12927 12928 public function offsetGet ( $offset ) 12929 { return ( $this -> Captures [ $offset ] ) ; } 12930 12931 12932 public function offsetSet ( $offset, $value ) 12933 { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; } 12934 12935 12936 public function offsetUnset ( $offset ) 12937 { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; } 12938 12939 } 12940 12941 12942/*============================================================================================================== 12943 12944 class PdfToTextLinesCapture - 12945 Represents a lines capture, without indexation to their page number. 12946 12947 ==============================================================================================================*/ 12948class PdfToTextLinesCapture extends PdfToTextCapture 12949 { 12950 /*-------------------------------------------------------------------------------------------------------------- 12951 12952 Constructor - 12953 "flattens" the supplied object list, by removing the PdfToTextCapturedLines class level, so that lines 12954 can be iterated whatever their page number is. 12955 12956 *-------------------------------------------------------------------------------------------------------------*/ 12957 public function __construct ( $objects ) 12958 { 12959 $new_objects = array ( ) ; 12960 12961 foreach ( $objects as $object ) 12962 { 12963 foreach ( $object as $line ) 12964 $new_objects [] = $line ; 12965 } 12966 12967 parent::__construct ( $new_objects ) ; 12968 } 12969 } 12970 12971 12972/*============================================================================================================== 12973 12974 class PdfToTextRectangleCapture - 12975 Implements a rectangle capture, from the caller point of view. 12976 12977 ==============================================================================================================*/ 12978class PdfToTextRectangleCapture extends PdfToTextCapture 12979 { 12980 /*-------------------------------------------------------------------------------------------------------------- 12981 12982 Constructor - 12983 Builds an object array indexed by page number. 12984 12985 *-------------------------------------------------------------------------------------------------------------*/ 12986 public function __construct ( $objects ) 12987 { 12988 $new_objects = array ( ) ; 12989 12990 foreach ( $objects as $object ) 12991 $new_objects [ $object -> Page ] = $object ; 12992 12993 parent::__construct ( $new_objects ) ; 12994 } 12995 } 12996 12997