1<?php
2/**************************************************************************************************************
3
4    NAME
5	PdfToText.phpclass
6
7    DESCRIPTION
8    	A class for extracting text from Pdf files.
9	Usage is very simple : just instantiate a PdfToText object, specifying an input filename, then use the
10	Text property to retrieve PDF textual contents :
11
12		$pdf	=  new PdfToText ( 'sample.pdf' ) ;
13		echo $pdf -> Text ;		// or : echo ( string ) $pdf ;
14
15	Or :
16
17		$pdf	=  new PdfToText ( ) ;
18		// Modify any property here before loading the file ; for example :
19		// $pdf -> BlockSeparator = " " ;
20		$pdf -> Load ( 'sample.pdf' ) ;
21		echo $pdf -> Text ;
22
23    AUTHOR
24        Christian Vigh, 04/2016.
25
26    HISTORY
27    [Version : 1.6.7]	[Date : 2017/05/31]     [Author : CV]
28	. Added CID fonts
29	. Changed the way CID font maps are searched and handled
30
31    (...)
32
33    [Version : 1.0]	[Date : 2016/04/16]     [Author : CV]
34        Initial version.
35
36 **************************************************************************************************************/
37
38
39/*==============================================================================================================
40
41    class PdfToTextException et al -
42        Implements an exception thrown when an error is encountered while decoding PDF files.
43
44  ==============================================================================================================*/
45
46// PdfToText exception -
47//	Base class for all other PdfToText exceptions.
class PdfToTextException extends Exception
{
	// Static flag inherited by every exception class derived from this one ;
	// it is declared as false and this class never reads or changes it.
	public static $IsObject = false;
}
52
53
54// PdfToTextDecodingException -
55//	Thrown when unexpected data is encountered while analyzing PDF contents.
class PdfToTextDecodingException extends PdfToTextException
{
	/**
	 * Builds the exception message "Pdf decoding error[ (object #<id>)] : <message>".
	 *
	 * @param string $message    Description of the decoding problem.
	 * @param mixed  $object_id  Id of the PDF object being decoded ; the "(object #...)" part
	 *                           is left out when this value is strictly false.
	 */
	public function __construct($message, $object_id = false)
	{
		$location = ($object_id === false) ? '' : " (object #$object_id)";

		parent::__construct("Pdf decoding error" . $location . " : $message");
	}
}
70
71
72// PdfToTextDecryptionException -
73//	Thrown when something unexpected is encountered while processing encrypted data.
class PdfToTextDecryptionException extends PdfToTextException
{
	/**
	 * Builds the exception message "Pdf decryption error[ (object #<id>)] : <message>".
	 *
	 * @param string $message    Description of the decryption problem.
	 * @param mixed  $object_id  Id of the PDF object being processed ; the "(object #...)" part
	 *                           is left out when this value is strictly false.
	 */
	public function __construct($message, $object_id = false)
	{
		$location = ($object_id === false) ? '' : " (object #$object_id)";

		parent::__construct("Pdf decryption error" . $location . " : $message");
	}
}
88
89
90// PdfToTextTimeoutException -
91//	Thrown when the PDFOPT_ENFORCE_EXECUTION_TIME or PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME option is set, and
92//	the script took longer than the allowed execution time limit.
class PdfToTextTimeoutException extends PdfToTextException
{
	// True when the time limit was reached because of too many invocations of the Load() method,
	// false when it was reached while processing one single PDF file.
	public $GlobalTimeout;

	/**
	 * Builds the message
	 *	"PdfToText max execution time reached [for one single file ](php limit = Xs, class limit = Ys) : <message>"
	 * and records the $global flag in the GlobalTimeout property.
	 *
	 * @param string $message        Additional description appended to the message.
	 * @param bool   $global         Stored as is in GlobalTimeout ; when falsy, "for one single file " is inserted.
	 * @param mixed  $php_setting    Value reported as the "php limit", in seconds.
	 * @param mixed  $class_setting  Value reported as the "class limit", in seconds.
	 */
	public function __construct($message, $global, $php_setting, $class_setting)
	{
		$this->GlobalTimeout = $global;

		$scope = $global ? '' : "for one single file ";

		parent::__construct(
			"PdfToText max execution time reached " . $scope .
			"(php limit = {$php_setting}s, class limit = {$class_setting}s) : $message"
		);
	}
}
113
114
115// PdfToTextFormException -
116//	Thrown if the xml template passed to the GetFormData() method contains an error.
class PdfToTextFormException extends PdfToTextException
{
	/**
	 * Prefixes the supplied message with "Pdf form template error : ".
	 *
	 * @param string $message  Description of the template error.
	 */
	public function __construct($message)
	{
		parent::__construct("Pdf form template error" . " : $message");
	}
}
128
129
130// PdfToTextCaptureException -
131//	Thrown if the xml template passed to the SetCaptures() method contains an error.
class PdfToTextCaptureException extends PdfToTextException
{
	/**
	 * Prefixes the supplied message with "Pdf capture template error : ".
	 *
	 * @param string $message  Description of the template error.
	 */
	public function __construct($message)
	{
		parent::__construct("Pdf capture template error" . " : $message");
	}
}
143
144
145
146/*==============================================================================================================
147
148        Custom error reporting functions.
149
150  ==============================================================================================================*/
if (function_exists('warning') === false) {
	// Raises $message as a user-level warning (E_USER_WARNING).
	// Only defined when the host application has not already supplied a warning() function.
	function warning($message)
	{
		trigger_error($message, E_USER_WARNING);
	}
}
158
159
if (! function_exists('error')) {
	/**
	 * Reports a failure. Only defined when the host application has not already supplied error().
	 *
	 *  - string                  : raised through trigger_error() with the E_USER_ERROR level ;
	 *  - Exception / Throwable   : thrown as is. Throwable covers PHP 7+ Error objects, which are
	 *                              not Exception instances and were previously silently ignored ;
	 *  - anything else           : ignored.
	 *
	 * @param string|Exception|Throwable $message  Message to report or object to throw.
	 */
	function error($message)
	{
		if (is_string($message)) {
			trigger_error($message, E_USER_ERROR);
			return;
		}

		// "instanceof" against a class that does not exist (Throwable on PHP 5) simply yields false
		if ($message instanceof Exception || $message instanceof Throwable) {
			throw $message;
		}
	}
}
170
171
172/*==============================================================================================================
173
174        Backward-compatibility issues.
175
176  ==============================================================================================================*/
177
178// hex2bin -
179//	This function appeared only in version 5.4.0
if (function_exists('hex2bin') === false) {
	// Fallback used when the runtime does not provide hex2bin().
	// Each pair of hex digits becomes one byte, high nibble first ; pack() applies that
	// conversion to the whole string in a single call.
	function hex2bin($hexstring)
	{
		return (pack('H*', $hexstring));
	}
}
201
202
203/*==============================================================================================================
204
    class PdfObjectBase -
206        Base class for all PDF objects defined here.
207
208  ==============================================================================================================*/
209abstract class  PdfObjectBase		// extends  Object
210   {
211	// Possible encoding types for streams inside objects ; "unknown" means that the object contains no stream
212	const 	PDF_UNKNOWN_ENCODING 		=   0 ;		// No stream decoding type could be identified
213	const 	PDF_ASCIIHEX_ENCODING 		=   1 ;		// AsciiHex encoding - not tested
214	const 	PDF_ASCII85_ENCODING		=   2 ;		// Ascii85 encoding - not tested
215	const 	PDF_FLATE_ENCODING		=   3 ;		// Flate/deflate encoding
216	const	PDF_TEXT_ENCODING		=   4 ;		// Stream data appears in clear text - no decoding required
217	const	PDF_LZW_ENCODING		=   5 ;		// Not implemented yet
218	const	PDF_RLE_ENCODING		=   6 ;		// Runtime length encoding ; not implemented yet
219	const	PDF_DCT_ENCODING		=   7 ;		// JPEG images
220	const	PDF_CCITT_FAX_ENCODING		=   8 ;		// CCITT Fax encoding - not implemented yet
221	const	PDF_JBIG2_ENCODING		=   9 ;		// JBIG2 filter encoding (black/white) - not implemented yet
222	const	PDF_JPX_ENCODING		=  10 ;		// JPEG2000 encoding - not implemented yet
223
224	// Regular expression used for recognizing references to a font (this list is far from being exhaustive, as it seems
225	// that you can specify almost everything - however, trying to recognize everything would require to develop a complete
226	// parser)
227	protected static	$FontSpecifiers		=  '
228		(/F \d+ (\.\d+)? )						|
229		(/R \d+)							|
230		(/f-\d+-\d+)							|
231		(/[CT]\d+_\d+)							|
232		(/TT \d+)							|
233		(/OPBaseFont \d+)						|
234		(/OPSUFont \d+)							|
235		(/[0-9a-zA-Z])							|
236		(/F\w+)								|
237		(/[A-Za-z][A-Za-z0-9]* ( [\-+] [A-Za-z][A-Za-z0-9]* ))
238		' ;
239
240	// Maps alien Unicode characters such as special spaces, letters with ligatures to their ascii string equivalent
241	protected static	$UnicodeToSimpleAscii	=  false ;
242
243
244 	/*--------------------------------------------------------------------------------------------------------------
245
246	    Constructor -
247		Performs static initializations such as the Unicode to Ascii table.
248
249	 *-------------------------------------------------------------------------------------------------------------*/
public function __construct ( )
{
	// The translation table lives in a static property ; once it is no longer false,
	// there is nothing left to initialize.
	if (self::$UnicodeToSimpleAscii !== false)
		return;

	// The map file is included in the local scope and is expected to define $unicode_to_ansi there
	$charset_file = dirname(__FILE__) . "/Maps/unicode-to-ansi.map";
	include ($charset_file);

	// Fall back to an empty table when the include did not define the variable
	if (isset($unicode_to_ansi))
		self::$UnicodeToSimpleAscii = $unicode_to_ansi;
	else
		self::$UnicodeToSimpleAscii = array();
}
261
262
263 	/*--------------------------------------------------------------------------------------------------------------
264
265	    NAME
266	        CodePointToUtf8 - Encodes a Unicode codepoint to UTF8.
267
268	    PROTOTYPE
269	        $char	=  $this -> CodePointToUtf8 ( $code ) ;
270
271	    DESCRIPTION
272	        Encodes a Unicode codepoint to UTF8, trying to handle all possible cases.
273
274	    PARAMETERS
275	        $code (integer) -
276	                Unicode code point to be translated.
277
278	    RETURN VALUE
279	        A string that contains the UTF8 bytes representing the Unicode code point.
280
281	 *-------------------------------------------------------------------------------------------------------------*/
protected function CodePointToUtf8 ( $code )
{
	// A zero/empty code cannot be translated : return the configured placeholder.
	// The placeholder is treated as a sprintf() format only when it contains a '%'.
	if (! $code) {
		if (strpos(PdfToText::$Utf8Placeholder, '%') === false)
			return (PdfToText::$Utf8Placeholder);

		return (sprintf(PdfToText::$Utf8Placeholder, $code));
	}

	// $code is consumed as a sequence of 16-bit words, least significant word first.
	// Each translated word is PREPENDED so that the most significant word ends up first in the
	// output. (The previous code appended "<char> . $result" to $result, which repeated the
	// characters already produced, and divided by 0xFFFF instead of 0x10000, which mis-split
	// the words.)
	$result = '';

	while ($code) {
		$word = ($code & 0xFFFF);

		if (isset(self::$UnicodeToSimpleAscii[$word]))
			// Word has a replacement string in the Unicode-to-Ascii table
			$piece = self::$UnicodeToSimpleAscii[$word];
		else
			// NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated as of PHP 8.2 ;
			// kept here to preserve the exact conversion results.
			$piece = mb_convert_encoding("&#$word;", 'UTF-8', 'HTML-ENTITIES');

		$result = $piece . $result;

		// Drop the 16 bits just processed. A division is used rather than ">>" so that the loop
		// also terminates for negative values (an arithmetic shift would stick at -1).
		$code = (integer) ($code / 0x10000);
	}

	return ($result);
}
316
317
318	/*--------------------------------------------------------------------------------------------------------------
319
320	    DecodeRawName -
321		Decodes a string that may contain constructs such as '#xy', where 'xy' are hex digits.
322
323	 *-------------------------------------------------------------------------------------------------------------*/
public static function DecodeRawName ( $str )
{
	// Only a '#' followed by exactly two hex digits is an escape sequence. The previous
	// implementation turned every '#' into '%' and url-decoded the result, which also decoded
	// literal "%xx" sequences and corrupted a '#' not followed by two hex digits into '%'.
	$pieces = preg_split('/#([0-9A-Fa-f]{2})/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);

	if ($pieces === false)
		return ($str);

	// With one capturing group, the pieces alternate : plain text at even indexes,
	// captured hex digit pairs at odd indexes.
	$decoded = '';

	foreach ($pieces as $index => $piece)
		$decoded .= ($index & 1) ? chr(hexdec($piece)) : $piece;

	return ($decoded);
}
328
329
330	/*--------------------------------------------------------------------------------------------------------------
331
332	    NAME
333	        GetEncodingType - Gets an object encoding type.
334
335	    PROTOTYPE
336	        $type	=  $this -> GetEncodingType ( $object_id, $object_data ) ;
337
338	    DESCRIPTION
339	        When an object is a stream, returns its encoding type.
340
341	    PARAMETERS
342		$object_id (integer) -
343			PDF object number.
344
345	        $object_data (string) -
346	                Object contents.
347
348	    RETURN VALUE
349	        Returns one of the following values :
350
351		- PdfToText::PDF_ASCIIHEX_ENCODING :
352			Hexadecimal encoding of the binary values.
353			Decoding algorithm was taken from the unknown contributor and not tested so far, since I
354			couldn't find a PDF file with such an encoding type.
355
356		- PdfToText::PDF_ASCII85_ENCODING :
357			Obscure encoding format.
358			Decoding algorithm was taken from the unknown contributor and not tested so far, since I
359			couldn't find a PDF file with such an encoding type.
360
361		- PdfToText::PDF_FLATE_ENCODING :
362			gzip/deflate encoding.
363
364		- PdfToText::PDF_TEXT_ENCODING :
365			Stream data is unencoded (ie, it is pure ascii).
366
367		- PdfToText::PDF_UNKNOWN_ENCODING :
368			The object data does not specify any encoding at all. It can happen on objects that do not have
369			a "stream" part.
370
371		- PdfToText::PDF_DCT_ENCODING :
372			a lossy filter based on the JPEG standard.
373
374		The following constants are defined but not yet implemented ; an exception will be thrown if they are
375		encountered somewhere in the PDF file :
376
377		- PDF_LZW_ENCODING :
378			a filter based on LZW Compression; it can use one of two groups of predictor functions for more
379			compact LZW compression : Predictor 2 from the TIFF 6.0 specification and predictors (filters)
380			from the PNG specification
381
382		- PDF_RLE_ENCODING :
383			a simple compression method for streams with repetitive data using the run-length encoding
384			algorithm and the image-specific filters.
385
386		PDF_CCITT_FAX_ENCODING :
387			a lossless bi-level (black/white) filter based on the Group 3 or Group 4 CCITT (ITU-T) fax
388			compression standard defined in ITU-T T.4 and T.6.
389
390		PDF_JBIG2_ENCODING :
391			a lossy or lossless bi-level (black/white) filter based on the JBIG2 standard, introduced in
392			PDF 1.4.
393
394		PDF_JPX_ENCODING :
395			a lossy or lossless filter based on the JPEG 2000 standard, introduced in PDF 1.5.
396
397	 *-------------------------------------------------------------------------------------------------------------*/
protected function GetEncodingType ( $object_id, $object_data )
{
	// Looks for a "/<filter name>" anywhere in the object data (long or abbreviated form, any case)
	$filter_regex = '# / (?P<encoding> (ASCIIHexDecode) | (AHx) | (ASCII85Decode) | (A85) | (FlateDecode) | (Fl) | (DCTDecode) | (DCT) | ' .
			'(LZWDecode) | (LZW) | (RunLengthDecode) | (RL) | (CCITTFaxDecode) | (CCF) | (JBIG2Decode) | (JPXDecode) ) \b #imsx';

	// No filter name found : the data is reported as clear text
	if (! preg_match($filter_regex, $object_data, $found))
		return (self::PDF_TEXT_ENCODING);

	$name = strtolower($found['encoding']);

	// Filter names that map directly to an encoding constant
	$known = array(
		'asciihexdecode'	=>  self::PDF_ASCIIHEX_ENCODING,
		'ahx'			=>  self::PDF_ASCIIHEX_ENCODING,
		'ascii85decode'		=>  self::PDF_ASCII85_ENCODING,
		'a85'			=>  self::PDF_ASCII85_ENCODING,
		'flatedecode'		=>  self::PDF_FLATE_ENCODING,
		'fl'			=>  self::PDF_FLATE_ENCODING,
		'dctdecode'		=>  self::PDF_DCT_ENCODING,
		'dct'			=>  self::PDF_DCT_ENCODING,
		'lzwdecode'		=>  self::PDF_LZW_ENCODING,
		'lzw'			=>  self::PDF_LZW_ENCODING
	);

	if (isset($known[$name]))
		return ($known[$name]);

	// Recognized but unimplemented filters : optionally warn, then report an unknown encoding
	$unimplemented = array('ccittfaxdecode', 'ccf', 'runlengthdecode', 'rl', 'jbig2decode', 'jpxdecode');

	if (in_array($name, $unimplemented, true) && PdfToText::$DEBUG > 1)
		warning("Encoding type \"{$found['encoding']}\" not yet implemented for pdf object #$object_id.");

	return (self::PDF_UNKNOWN_ENCODING);
}
439
440
441	/*--------------------------------------------------------------------------------------------------------------
442
443	    NAME
444	        GetObjectReferences - Gets object references from a specified construct.
445
446	    PROTOTYPE
447	        $status		=  $this -> GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids ) ;
448
449	    DESCRIPTION
450	        Certain parameter specifications are followed by an object reference of the form :
451			x 0 R
452		but it can also be an array of references :
			[x1 0 R x2 0 R ... xn 0 R]
		These kinds of constructs can occur after parameters such as : /Pages, /Contents, /Kids...
455		This method extracts the object references found in such a construct.
456
457	    PARAMETERS
458	        $object_id (integer) -
459	                Id of the object to be analyzed.
460
461		$object_data (string) -
462			Object contents.
463
464		$searched_string (string) -
465			String to be searched, that must be followed by an object or an array of object references.
466			This parameter can contain constructs used in regular expressions. Note however that the '#'
467			character must be escaped, since it is used as a delimiter in the regex that is applied on
468			object data.
469
470		$object_ids (array of integers) -
471			Returns on output the ids of the pdf object that have been found after the searched string.
472
473	    RETURN VALUE
474	        True if the searched string has been found and is followed by an object or array of object references,
475		false otherwise.
476
477	 *-------------------------------------------------------------------------------------------------------------*/
protected function GetObjectReferences ( $object_id, $object_data, $searched_string, &$object_ids )
{
	// The output array is always reset, whatever the outcome
	$object_ids = array();

	// Form 1 : "<searched string> [ x1 0 R x2 0 R ... ]"
	if (preg_match("#$searched_string \s* \\[ (?P<objects> [^\]]+ ) \\]#ix", $object_data, $found)) {
		// Brackets found but no "x y R" reference inside them : report a failure
		if (! preg_match_all('/(?P<object> \d+) \s+ \d+ \s+ R/x', $found['objects'], $references))
			return (false);

		$object_ids = array_map('intval', $references['object']);

		return (true);
	}

	// Form 2 : "<searched string> x 0 R"
	if (preg_match("#$searched_string \s+ (?P<object> \d+) \s+ \d+ \s+ R#ix", $object_data, $found)) {
		$object_ids[] = (integer) $found['object'];

		return (true);
	}

	// Searched string absent, or not followed by any object reference
	return (false);
}
504
505
506	/*--------------------------------------------------------------------------------------------------------------
507
508	    NAME
509	        GetStringParameter - Retrieve a string flag value.
510
511	    PROTOTYPE
512	        $result		=  $this -> GetStringParameter ( $parameter, $object_data ) ;
513
514	    DESCRIPTION
515	        Retrieves the value of a string parameter ; for example :
516
517			/U (parameter value)
518
519		or :
520
521			/U <hexdigits>
522
523	    PARAMETERS
524	        $parameter (string) -
525	                Parameter name.
526
527		$object_data (string) -
528			Object containing the parameter.
529
530	    RETURN VALUE
531	        The parameter value.
532
533	    NOTES
534	        description
535
536	 *-------------------------------------------------------------------------------------------------------------*/
protected function GetStringParameter ( $parameter, $object_data )
{
	// Literal string form : /Param (value)
	// A backslash escapes the character that follows it, so an escaped "\)" inside the value
	// does not terminate the string (the previous pattern stopped at the first ')' whether it
	// was escaped or not, truncating such values).
	if (preg_match('#' . $parameter . ' \s* \( \s* (?P<value> (?: \\\\ [\s\S] | [^\\\\)] )+ ) \)#ix', $object_data, $match))
		return ($this->ProcessEscapedString($match['value']));

	// Hexadecimal string form : /Param <hexdigits>
	if (preg_match('#' . $parameter . ' \s* \< \s* (?P<value> [^>]+) \>#ix', $object_data, $match)) {
		// Whitespace between hex digits is not significant, and a missing final digit
		// stands for 0 (previously "<414>" produced chr(0x04) instead of chr(0x40)).
		$hexdigits = preg_replace('/\s+/', '', $match['value']);

		if (strlen($hexdigits) % 2)
			$hexdigits .= '0';

		$result = '';

		for ($i = 0, $count = strlen($hexdigits); $i < $count; $i += 2)
			$result .= chr(hexdec(substr($hexdigits, $i, 2)));

		return ($result);
	}

	// Parameter not found, or not followed by a string value
	return ('');
}
554
555
556	/*--------------------------------------------------------------------------------------------------------------
557
558	    GetUTCDate -
559	        Reformats an Adobe UTC date to a format that can be understood by the strtotime() function.
560		Dates are specified in the following format :
561			D:20150521154000Z
562			D:20160707182114+02
		which are both recognized by strtotime(). However, another format can be specified :
			D:20160707182114+02'00'
		which is not recognized by strtotime(), so we have to get rid of the '00' part.
566
567	 *-------------------------------------------------------------------------------------------------------------*/
protected function GetUTCDate ( $date )
{
	// Empty values are returned untouched
	if (! $date)
		return ($date);

	// Strip a leading "D:" (or "d:"). strncasecmp() is used instead of reading $date[0] and
	// $date[1], which raised an "uninitialized string offset" warning on a one-character
	// value such as "D".
	if (strncasecmp($date, 'D:', 2) === 0)
		$date = substr($date, 2);

	// Cut everything from the first single quote on : "+02'00'" becomes "+02"
	$quote_index = strpos($date, "'");

	if ($quote_index !== false)
		$date = substr($date, 0, $quote_index);

	return ($date);
}
581
582
583	/*--------------------------------------------------------------------------------------------------------------
584
585	    IsCharacterMap -
586	        Checks if the specified text contents represent a character map definition or not.
587
588	 *-------------------------------------------------------------------------------------------------------------*/
protected function IsCharacterMap ( $decoded_data )
{
	// A single case-insensitive alternation over the four markers ; the preg_match()
	// result is handed back unchanged.
	$markers = '#(begincmap)|(beginbfrange)|(beginbfchar)|(/Differences)#ix';
	$found   = preg_match($markers, $decoded_data);

	return ($found);
}
594
595
596	/*--------------------------------------------------------------------------------------------------------------
597
598	    IsFont -
599		Checks if the current object contents specify a font declaration.
600
601	 *-------------------------------------------------------------------------------------------------------------*/
protected function IsFont ( $object_data )
{
	// Any mention of /BaseFont (in any case) is enough
	if (stripos($object_data, '/BaseFont') !== false)
		return (true);

	// "/Type /FontDescriptor" also begins with "/Type /Font", so it has to be ruled out first
	if (preg_match('#/Type \s* /FontDescriptor#ix', $object_data))
		return (false);

	return ((bool) preg_match('#/Type \s* /Font#ix', $object_data));
}
611
612
613	/*--------------------------------------------------------------------------------------------------------------
614
615	    IsFormData -
		Checks if the current object contents specify references to form data.
617
618	 *-------------------------------------------------------------------------------------------------------------*/
protected function IsFormData ( $object_data )
{
	// Matches an "R" (as a separate word start) followed by "(datasets)", with optional
	// whitespace around and inside the parentheses, in any case ; the preg_match() result
	// is returned as is.
	$datasets_entry = '#\bR \s* \( \s* datasets \s* \)#imsx';

	return (preg_match($datasets_entry, $object_data));
}
626
627
628	/*--------------------------------------------------------------------------------------------------------------
629
630	    IsFontMap -
631		Checks if the code contains things like :
632			<</F1 26 0 R/F2 22 0 R/F3 18 0 R>>
633		which maps font 1 (when specified with the /Fx instruction) to object 26, 2 to object 22 and 3 to
634		object 18, respectively, in the above example.
635
636	 *-------------------------------------------------------------------------------------------------------------*/
protected function IsFontMap ( $object_data )
{
	// The search is performed on the data returned by UnescapeHexCharacters()
	$unescaped = self::UnescapeHexCharacters($object_data);

	// A dictionary opening ("<<") immediately followed by one of the known font specifiers,
	// some whitespace, then anything up to a ">>"
	$pattern = '#<< \s* ( ' . self::$FontSpecifiers . ' ) \s+ .* >>#imsx';

	return ((bool) preg_match($pattern, $unescaped));
}
646
647
648	/*--------------------------------------------------------------------------------------------------------------
649
650	    IsImage -
651		Checks if the code contains things like :
652			/Subtype/Image
653
654	 *-------------------------------------------------------------------------------------------------------------*/
protected function IsImage ( $object_data )
{
	// Case-sensitive search for "/Subtype /Image" (whitespace between the two names is optional)
	return ((bool) preg_match('#/Subtype \s* /Image#msx', $object_data));
}
662
663
664	/*--------------------------------------------------------------------------------------------------------------
665
666	    IsObjectStream -
		Checks if the code contains an object stream declaration :
			/Type/ObjStm
669
670	 *-------------------------------------------------------------------------------------------------------------*/
protected function IsObjectStream ( $object_data )
{
	// Case-insensitive search for "/Type /ObjStm" (whitespace between the two names is optional)
	return ((bool) preg_match('#/Type \s* /ObjStm#isx', $object_data));
}
678
679
680	/*--------------------------------------------------------------------------------------------------------------
681
682	    NAME
	        IsPageHeaderOrFooter - Check if the specified stream contents denote page header or footer data.
684
685	    PROTOTYPE
686	        $status		=  $this -> IsPageHeaderOrFooter ( $stream_data ) ;
687
688	    DESCRIPTION
689	        Checks if the specified decoded stream contents denotes header or footer data.
690
691	    PARAMETERS
692	        $stream_data (string) -
693	                Decoded stream contents.
694
695	 *-------------------------------------------------------------------------------------------------------------*/
protected function IsPageHeaderOrFooter ( $stream_data )
{
	// Explicit declaration : /Type /Pagination /Subtype /Header (or /Footer)
	if (preg_match('#/Type \s* /Pagination \s* /Subtype \s*/((Header)|(Footer))#ix', $stream_data))
		return (true);

	// An /Attached array that names /Top or /Bottom before its closing bracket.
	// The previous pattern required one extra non-']' character after the name, so the
	// most compact form, "/Attached [/Top]", was never recognized.
	if (preg_match('#/Attached \s* \[ [^]]* /((Top)|(Bottom)) \b#ix', $stream_data))
		return (true);

	return (false);
}
705
706
707	/*--------------------------------------------------------------------------------------------------------------
708
709	    NAME
710	        IsText - Check if the specified object contents denote a text stream.
711
712	    PROTOTYPE
713	        $status		=  $this -> IsText ( $object_data, $decoded_stream_data ) ;
714
715	    DESCRIPTION
716	        Checks if the specified object contents denote a text stream.
717
718	    PARAMETERS
719	        $object_data (string) -
720	                Object data, ie the contents located between the "obj" and "endobj" keywords.
721
722	        $decoded_stream_data (string) -
723	        	The flags specified in the object data are not sufficient to be sure that we have a block of
724	        	drawing instructions. We must also check for certain common instructions to be present.
725
726	    RETURN VALUE
727	        True if the specified contents MAY be text contents, false otherwise.
728
729	    NOTES
730		I do not consider this method as bullet-proof. There may arise some cases where non-text blocks can be
731		mistakenly considered as text blocks, so it is subject to evolve in the future.
732
733	 *-------------------------------------------------------------------------------------------------------------*/
protected function IsText ( $object_data, $decoded_stream_data )
{
	// The stream is considered as possible text when it contains at least one of these
	// operators as a separate word : BT, Tf, Td, TJ, Tj, Tm, Do, cm.
	//
	// The previous implementation first tested $object_data for /Filter, /Length, /Type, /Subtype
	// and /Length1, but then ran this very same operator check in both branches of the test ;
	// the flags therefore never influenced the result and their two regex scans have been removed.
	// $object_data is kept in the signature for the callers.
	return ((bool) preg_match('/\\b(BT|Tf|Td|TJ|Tj|Tm|Do|cm)\\b/', $decoded_stream_data));
}
747
748
749	/*--------------------------------------------------------------------------------------------------------------
750
751	    NAME
752	        PregStrReplace - Replace string(s) using regular expression(s)
753
754	    PROTOTYPE
755	        $result		=  PdfToText::PregStrReplace ( $pattern, $replacement, $subject, $limit = -1,
756						&$match_count = null )
757
758	    DESCRIPTION
759	        This function behaves like a mix of str_replace() and preg_replace() ; it allows to search for strings
760		using regular expressions, but the replacements are plain-text strings and no reference to a capture
761		specified in the regular expression will be interpreted.
762		This is useful when processing templates, which can contain constructs such as "\00" or "$", which are
763		interpreted by preg_replace() as references to captures.
764
765		The function has the same parameters as preg_replace().
766
767	    RETURN VALUE
768	        Returns the substituted text.
769
770	 *-------------------------------------------------------------------------------------------------------------*/
public static function PregStrReplace ( $pattern, $replacement, $subject, $limit = -1, &$match_count = null )
{
	// Fixes over the previous implementation :
	// - a single replacement string combined with an array of patterns called array_fill() with
	//   the replacement as start index ;
	// - reaching $limit left the loops before the partial result was stored, so NO replacement
	//   was applied for that pattern (and one replacement too many was counted) ;
	// - $match_count was never set.
	//
	// $limit applies to each pattern, as it does for preg_replace() ; a value below 1 means
	// "no limit". $match_count receives the total number of replacements performed.
	$match_count = 0;

	// Make sure that $pattern and $replacement become lists of the same size
	if (is_array($pattern)) {
		$pattern = array_values($pattern);

		if (is_array($replacement)) {
			if (count($pattern) !== count($replacement)) {
				warning("The \$replacement parameter should have the same number of element as \$pattern.");
				return ($subject);
			}

			$replacement = array_values($replacement);
		}
		else
			// Same replacement string for every pattern
			$replacement = count($pattern) ? array_fill(0, count($pattern), $replacement) : array();
	}
	else {
		if (is_array($replacement)) {
			warning("Expected string for the \$replacement parameter.");
			return ($subject);
		}

		$pattern     = array($pattern);
		$replacement = array($replacement);
	}

	if ($limit < 1)
		$limit = PHP_INT_MAX;

	// Each pattern is applied to the output of the previous one
	$current_subject = $subject;

	foreach ($pattern as $i => $regex) {
		if (! preg_match_all($regex, $current_subject, $matches, PREG_OFFSET_CAPTURE))
			continue;

		$result      = '';
		$last_offset = 0;
		$replaced    = 0;

		foreach ($matches[0] as $match) {
			// Limit reached : the remaining matches are left as they are
			if ($replaced >= $limit)
				break;

			$offset = (integer) $match[1];

			// Copy the unmatched text located between the previous match and this one
			if ($last_offset < $offset)
				$result .= substr($current_subject, $last_offset, $offset - $last_offset);

			// The replacement is inserted verbatim : "$1", "\0" and the like are not interpreted
			$result      .= $replacement[$i];
			$last_offset  = $offset + strlen($match[0]);
			$replaced ++;
		}

		// Copy whatever follows the last replaced match
		$result          .= substr($current_subject, $last_offset);
		$current_subject  = $result;
		$match_count     += $replaced;
	}

	return ($current_subject);
}
850
851
852	/*--------------------------------------------------------------------------------------------------------------
853
854	    NAME
855	        ProcessEscapedCharacter - Interprets a character after a backslash in a string.
856
857	    PROTOTYPE
858	        $ch		=  $this -> ProcessEscapedCharacter ( $ch ) ;
859
860	    DESCRIPTION
861	        Interprets a character after a backslash in a string and returns the interpreted value.
862
863	    PARAMETERS
864	        $ch (char) -
865	                Character to be escaped.
866
867	    RETURN VALUE
868	        The escaped character.
869
870	    NOTES
871		This method does not process octal sequences.
872
873	 *-------------------------------------------------------------------------------------------------------------*/
protected function ProcessEscapedCharacter ( $ch )
{
	// Letters that stand for a control character when they follow a backslash.
	// $ch is expected to be a one-character string.
	$control_characters = array(
		'n'	=>  "\n",
		'r'	=>  "\r",
		'f'	=>  "\f",
		't'	=>  "\t",
		'b'	=>  "\x08",
		'v'	=>  "\x0B"
	);

	if (isset($control_characters[$ch]))
		return ($control_characters[$ch]);

	// Every other character - parentheses, brackets and the backslash included - stands for itself
	return ($ch);
}
898
899
900	/*--------------------------------------------------------------------------------------------------------------
901
902	    NAME
903	        ProcessEscapedString - Processes a string which can have escaped characters.
904
905	    PROTOTYPE
906	        $result		=  $this -> ProcessEscapedString ( $str, $process_octal_escapes = false ) ;
907
908	    DESCRIPTION
909	        Processes a string which may contain escape sequences.
910
911	    PARAMETERS
912	        $str (string) -
913	                String to be processed.
914
915		$process_octal_escapes (boolean) -
916			When true, octal escape sequences such as \037 are processed.
917
918	    RETURN VALUE
919	        The processed input string.
920
921	 *-------------------------------------------------------------------------------------------------------------*/
922	protected function  ProcessEscapedString ( $str, $process_octal_escapes = false )
923	   {
924		$length		=  strlen ( $str ) ;
925		$offset		=  0 ;
926		$result		=  '' ;
927		$ord0		=  ord ( '0' ) ;
928
929		while  ( ( $backslash_index = strpos ( $str, '\\', $offset ) )  !==  false )
930		   {
931			if  ( $backslash_index + 1  <  $length )
932			   {
933				$ch		 =  $str [ ++ $backslash_index ] ;
934
935				if  ( ! $process_octal_escapes )
936				   {
937					$result		.=  substr ( $str, $offset, $backslash_index - $offset - 1 ) . $this -> ProcessEscapedCharacter ( $ch ) ;
938					$offset		 =  $backslash_index + 1 ;
939				    }
940				else if  ( $ch  <  '0'  ||  $ch  >  '7' )
941				   {
942					$result		.=  substr ( $str, $offset, $backslash_index - $offset - 1 ) . $this -> ProcessEscapedCharacter ( $ch ) ;
943					$offset		 =  $backslash_index + 1 ;
944				    }
945				else
946				   {
947					$result		.=  substr ( $str, $offset, $backslash_index - $offset - 1 ) ;
948					$ord		 =  ord ( $ch ) - $ord0 ;
949					$count		 =  0 ;
950					$backslash_index ++ ;
951
952					while  ( $backslash_index  <  $length  &&  $count  <  2  &&
953							$str [ $backslash_index ]  >=  '0'  &&  $str [ $backslash_index ]  <=  '7' )
954					   {
955						$ord	=  ( $ord * 8 ) + ( ord ( $str [ $backslash_index ++ ] ) - $ord0 ) ;
956						$count ++ ;
957					    }
958
959					$result		.=  chr ( $ord ) ;
960					$offset		 =  $backslash_index ;
961				    }
962			    }
963			else
964				break ;
965		    }
966
967		$result		.=  substr ( $str, $offset ) ;
968
969		return ( $result ) ;
970	    }
971
972
973	/*--------------------------------------------------------------------------------------------------------------
974
975	    NAME
976	        Unescape - Processes escape sequences from the specified string.
977
978	    PROTOTYPE
979	        $value	=  $this -> Unescape ( $text ) ;
980
981	    DESCRIPTION
	        Processes escape sequences within the specified text. The recognized escape sequences are like the
		C-language ones : \b (backspace), \f (form feed), \r (carriage return), \n (newline), \t (tab), as well
		as octal sequences of one to three digits (such as \037).
		All other characters prefixed by "\" are returned as is.
985
986	    PARAMETERS
987	        $text (string) -
988	                Text to be unescaped.
989
990	    RETURN VALUE
991	        Returns the unescaped value of $text.
992
993	 *-------------------------------------------------------------------------------------------------------------*/
994	public static function  Unescape ( $text )
995	   {
996		$length 	=  strlen ( $text ) ;
997		$result 	=  '' ;
998		$ord0		=  ord ( 0 ) ;
999
1000		for  ( $i = 0 ; $i  <  $length ; $i ++ )
1001		   {
1002		   	$ch 	=  $text [$i] ;
1003
1004			if  ( $ch  ==  '\\'  &&  isset ( $text [$i+1] ) )
1005			   {
1006				$nch 	=  $text [++$i] ;
1007
1008				switch  ( $nch )
1009				   {
1010				   	case 	'b' 	:  $result .=  "\b" ; break ;
1011				   	case 	't' 	:  $result .=  "\t" ; break ;
1012				   	case 	'f' 	:  $result .=  "\f" ; break ;
1013				   	case 	'r' 	:  $result .=  "\r" ; break ;
1014				   	case 	'n' 	:  $result .=  "\n" ; break ;
1015				   	default 	:
1016						// Octal escape notation
1017						if  ( $nch  >=  '0'  &&  $nch  <=  '7' )
1018						   {
1019							$ord		=  ord ( $nch ) - $ord0 ;
1020							$digits		=  1 ;
1021							$i ++ ;
1022
1023							while  ( $i  <  $length  &&  $digits  <  3  &&  $text [$i]  >=  '0'  &&  $text [$i]  <=  '7' )
1024							   {
1025								$ord	=  ( $ord * 8 ) + ord ( $text [$i] ) - $ord0 ;
1026								$i ++ ;
1027								$digits ++ ;
1028							    }
1029
1030							$i -- ;		// Count one character less since $i will be incremented at the end of the for() loop
1031
1032							$result .= chr ( $ord ) ;
1033						    }
1034						else
1035							$result .=  $nch ;
1036				    }
1037			    }
1038			else
1039				$result 	.=  $ch ;
1040		    }
1041
1042		return ( $result ) ;
1043	    }
1044
1045
1046	/*--------------------------------------------------------------------------------------------------------------
1047
1048	    NAME
1049	        UnescapeHexCharacters - Unescapes characters in the #xy notation.
1050
1051	    PROTOTYPE
1052	        $result		=  $this -> UnescapeHexCharacters ( $data ) ;
1053
1054	    DESCRIPTION
1055		Some specifications contain hex characters specified as #xy. For the moment, I have met such a construct in
1056		font aliases such as :
1057			/C2#5F0 25 0 R
1058		where "#5F" stands for "_", giving :
1059			/C2_0 25 0 R
1060		Hope that such constructs do not happen in other places...
1061
1062	    PARAMETERS
1063	        $data (string) -
1064	                String to be unescaped.
1065
1066	    RETURN VALUE
1067	        The input string with all the hex character representations replaced with their ascii equivalent.
1068
1069	 *-------------------------------------------------------------------------------------------------------------*/
1070	public static function  UnescapeHexCharacters ( $data )
1071	   {
1072		if  ( strpos ( $data, 'stream' )  ===  false  &&  preg_match ( '/(?P<hex> \# [0-9a-f] [0-9a-f])/ix', $data ) )
1073		    {
1074			preg_match_all ( '/(?P<hex> \# [0-9a-f] [0-9a-f])/ix', $data, $matches ) ;
1075
1076			$searches		=   array ( ) ;
1077			$replacements		=   array ( ) ;
1078
1079			foreach  ( $matches [ 'hex' ]  as  $hex )
1080			   {
1081				if  ( ! isset ( $searches [ $hex ] ) )
1082				   {
1083					$searches [ $hex ]	=  $hex ;
1084					$replacements []	=  chr ( hexdec ( substr ( $hex, 1 ) ) ) ;
1085				    }
1086
1087				$data	=  str_replace ( $searches, $replacements, $data ) ;
1088			    }
1089		     }
1090
1091		return ( $data ) ;
1092	    }
1093
1094
1095	/*--------------------------------------------------------------------------------------------------------------
1096
1097	    ValidatePhpName -
1098		Checks that the specified name (declared in the XML template) is a valid PHP name.
1099
1100	 *-------------------------------------------------------------------------------------------------------------*/
1101	public static function  ValidatePhpName ( $name )
1102	   {
1103		$name	=  trim ( $name ) ;
1104
1105		if  ( ! preg_match ( '/^ [a-z_][a-z0-9_]* $/ix', $name ) )
1106			error ( new PdfToTextFormException ( "Invalid PHP name \"$name\"." ) ) ;
1107
1108		return ( $name ) ;
1109	    }
1110    }
1111
1112
1113/*==============================================================================================================
1114
1115    PdfToText class -
1116	A class for extracting text from Pdf files.
1117
1118 ==============================================================================================================*/
1119class  PdfToText 	extends PdfObjectBase
1120   {
1121	// Current version of the class
1122	const		VERSION					=  "1.6.7" ;
1123
1124	// Pdf processing options
1125	const		PDFOPT_NONE				=  0x00000000 ;		// No extra option
1126	const		PDFOPT_REPEAT_SEPARATOR			=  0x00000001 ;		// Repeats the Separator property if the offset between two text blocks (in array notation)
1127											// is greater than $this -> MinSpaceWidth
1128	const		PDFOPT_GET_IMAGE_DATA			=  0x00000002 ;		// Retrieve raw image data in the $ths -> ImageData array
1129	const		PDFOPT_DECODE_IMAGE_DATA		=  0x00000004 ;		// Creates a jpeg resource for each image
1130	const		PDFOPT_IGNORE_TEXT_LEADING		=  0x00000008 ;		// Ignore text leading values
1131	const		PDFOPT_NO_HYPHENATED_WORDS		=  0x00000010 ;		// Join hyphenated words that are split on two lines
1132	const		PDFOPT_AUTOSAVE_IMAGES			=  0x00000020 ;		// Autosave images ; the ImageFileTemplate property will need to be defined
1133	const		PDFOPT_ENFORCE_EXECUTION_TIME		=  0x00000040 ;		// Enforces the max_execution_time PHP setting when processing a file. A PdfTexterTimeoutException
1134											// will be thrown if processing of a single file reaches (time_limit - 1 second) by default
1135											// The MaxExecutionTime property can be set to modify this default value.
1136	const		PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME	=  0x00000080 ;		// Same as PDFOPT_ENFORCE_EXECUTION_TIME, but for all calls to the Load() method of the PdfToText class
1137											// The MaxGlobalExecutionTime static property can be set to modify the default time limit
1138	const		PDFOPT_IGNORE_HEADERS_AND_FOOTERS	=  0x00000300 ;		// Ignore headers and footers
1139
1140	const		PDFOPT_RAW_LAYOUT			=  0x00000000 ;		// Layout rendering : raw (default)
1141	const		PDFOPT_BASIC_LAYOUT			=  0x00000400 ;		// Layout rendering : basic
1142
1143	const		PDFOPT_LAYOUT_MASK			=  0x00000C00 ;		// Mask to isolate the targeted layout
1144
1145	const		PDFOPT_ENHANCED_STATISTICS		=  0x00001000 ;		// Compute statistics on PDF language instructions
1146	const		PDFOPT_DEBUG_SHOW_COORDINATES		=  0x00002000 ;		// Include text coordinates ; implies the PDFOPT_BASIC_LAYOUT option
1147											// This option can be useful if you want to use capture areas and get information about
1148											// their coordinates
1149	const		PDFOPT_CAPTURE				=  0x00004000 ;		// Indicates that the caller wants to capture some text and use the SetCaptures() method
1150											// It currently enables the PDFOPT_BASIC_LAYOUT option
1151	const		PDFOPT_LOOSE_X_CAPTURE			=  0x00008000 ;		// Includes in captures text fragments whose dimensions may exceed the captured area dimensions
1152	const		PDFOPT_LOOSE_Y_CAPTURE			=  0x00010000 ;		// (currently not used)
1153
1154	// When boolean true, outputs debug information about fonts, character maps and drawing contents.
1155	// When integer > 1, outputs additional information about other objects.
1156	public static 		$DEBUG 			=  false ;
1157
1158	// Current filename
1159	public 			$Filename 			=  false ;
1160	// Extracted text
1161	public			$Text				=  '' ;
1162	// Document pages (array of strings)
1163	public			$Pages				=  array ( ) ;
1164	// Document images (array of PdfImage objects)
1165	public			$Images				=  array ( ) ;
1166	protected		$ImageCount			=  0 ;
1167	// Raw data for document images
1168	public			$ImageData			=  array ( ) ;
1169	// ImageAutoSaveFileTemplate :
1170	//	Template for the file names to be generated when extracting images, if the PDFOPT_AUTOSAVE_IMAGES has been specified.
1171	//	Can contain any path, plus the following printf()-like modifiers :
1172	//	. "%p" : Path of the original PDF file.
1173	//	. "%f" : Filename part of the original PDF file.
	//	. "%d" : A sequential number, starting from 1, used when generating filenames. The format can contain a width specifier,
	//		 such as "%3d", which will generate 3-digit sequential numbers left-filled with zeroes.
	//	. "%s" : Image suffix, which will automatically be based on the underlying image type.
1177	public			$ImageAutoSaveFileTemplate	=   "%p/%f.%d.%s" ;
1178	// Auto-save image file format
1179	public			$ImageAutoSaveFormat		=  IMG_JPEG ;
1180	// Auto-saved image file names
1181	public			$AutoSavedImageFiles		=  array ( ) ;
1182	// Text chunk separator (used to separate blocks of text specified as an array notation)
1183	public			$BlockSeparator			=  '' ;
1184	// Separator used to separate text groups where the offset value is less than -1000 thousands of character units
1185	// (eg : [(1)-1822(2)] will add a separator between the characters "1" and "2")
1186	// Note that such values are expressed in thousands of text units and subtracted from the current position. A
1187	// negative value means adding more space between the two text units it separates.
1188	public			$Separator			=  ' ' ;
1189	// Separator to be used between pages in the $Text property
1190	public			$PageSeparator			=  "\n" ;
1191	// Minimum value (in 1/1000 of text units) that separates two text chunks that can be considered as a real space
1192	public			$MinSpaceWidth			=  200 ;
1193	// Pdf options
1194	public			$Options			=  self::PDFOPT_NONE ;
1195	// Maximum number of pages to extract from the PDF. A zero value means "extract everything"
1196	// If this number is negative, then the pages to be extract start from the last page. For example, a value of -2
1197	// extracts the last two pages
1198	public			$MaxSelectedPages		=  false ;
1199	// Maximum number of images to be extracted. A value of zero means "extract everything". A non-zero value gives
1200	// the number of images to extract.
1201	public			$MaxExtractedImages		=  false ;
1202	// Location of the CID tables directory
1203	public static		$CIDTablesDirectory ;
	// Location of the Font metrics directory, for the Adobe standard 14 fonts
1205	public static		$FontMetricsDirectory ;
1206	// Standard Adobe font names, and their corresponding file in $FontMetricsDirectory
1207	public static		$AdobeStandardFontMetrics	=  array
1208	   (
1209		'courier'		=>  'courier.fm',
1210		'courier-bold'		=>  'courierb.fm',
1211		'courier-oblique'	=>  'courieri.fm',
1212		'courier-boldoblique'	=>  'courierbi.fm',
1213		'helvetica'		=>  'helvetica.fm',
1214		'helvetica-bold'	=>  'helveticab.fm',
1215		'helvetica-oblique'	=>  'helveticai.fm',
1216		'helvetica-boldoblique'	=>  'helveticabi.fm',
1217		'symbol'		=>  'symbol.fm',
1218		'times-roman'		=>  'times.fm',
1219		'times-bold'		=>  'timesb.fm',
1220		'times-bolditalic'	=>  'timesbi.fm',
1221		'times-italic'		=>  'timesi.fm',
1222		'zapfdingbats'		=>  'zapfdingbats.fm'
1223	    ) ;
1224	// Author information
1225	public			$Author				=  '' ;
1226	public			$CreatorApplication		=  '' ;
1227	public			$ProducerApplication		=  '' ;
1228	public			$CreationDate			=  '' ;
1229	public			$ModificationDate		=  '' ;
1230	public			$Title				=  '' ;
1231	public			$Subject			=  '' ;
1232	public			$Keywords			=  '' ;
1233	protected		$GotAuthorInformation		=  false ;
1234	// Unique and arbitrary file identifier, as specified in the PDF file
1235	// Well, in fact, there are two IDs, but the PDF specification does not mention the goal of the second one
1236	public			$ID				=  '' ;
1237	public			$ID2				=  '' ;
1238	// End of line string
1239	public			$EOL				=  PHP_EOL ;
1240	// String to be used when no Unicode translation is possible
1241	public static		$Utf8Placeholder		=  '' ;
1242	// Information about memory consumption implied by the file currently being loaded
1243	public			$MemoryUsage,
1244				$MemoryPeakUsage ;
1245	// Offset of the document start (%PDF-x.y)
1246	public			$DocumentStartOffset ;
1247	// Debug statistics
1248	public		$Statistics			=  array ( ) ;
1249	// Max execution time settings. A positive value means "don't exceed that number of seconds".
1250	// A negative value means "Don't exceed PHP setting max_execution_time - that number of seconds". If the result
1251	// is negative, then the default will be "max_execution_time - 1".
1252	// For those limits to be enforced, you need to specify either the PDFOPT_ENFORCE_EXECUTION_TIME or
1253	// PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME options, or both
1254	public			$MaxExecutionTime		=  -1 ;
1255	public static		$MaxGlobalExecutionTime		=  -1 ;
1256	// This property is expressed in percents ; it gives the extra percentage to add to the values computed by
1257	// the PdfTexterFont::GetStringWidth() method.
1258	// This is basically used when computing text positions and string lengths with the PDFOPT_BASIC_LAYOUT option :
1259	// the computed string length is shorter than its actual length (because of extra spacing determined by character
1260	// kerning in the font data). To determine whether two consecutive blocks of text should be separated by a space,
1261	// we empirically add this extra percentage to the computed string length. The default is -5%.
1262	public			$ExtraTextWidth			=  -5 ;
1263
1264	// Marker stuff. The unprocessed marker list is a sequential array of markers, which will later be dispatched into
1265	// indexed arrays during their first reference
1266	protected		$UnprocessedMarkerList		=  array ( 'font' => array ( ) ) ;
1267	protected		$TextWithFontMarkers		=  array ( ) ;
1268
1269	// Internal variables used when the PDFOPT_ENFORCE_* options are specified
1270	protected static	$PhpMaxExecutionTime ;
1271	protected static	$GlobalExecutionStartTime ;
1272	protected static	$AllowedGlobalExecutionTime ;
1273	protected		$ExecutionStartTime ;
1274	protected		$AllowedExecutionTime ;
1275
1276	// Font mappings
1277	protected 		$FontTable			=  false ;
1278	// Extra Adobe standard font mappings (for character names of the form "/axxx" for example)
1279	protected		$AdobeExtraMappings		=  array ( ) ;
1280	// Page map object
1281	protected		$PageMap ;
1282	// Page locations (start and end offsets)
1283	protected		$PageLocations ;
1284	// Encryption data
1285	public			$IsEncrypted			=  false ;
1286	protected		$EncryptionData			=  false ;
1287	// A flag coming from the constructor options, telling if enhanced statistics are enabled
1288	protected		$EnhancedStatistics ;
1289
1290	// Document text fragments, with their absolute (x,y) position, approximate width and height
1291	protected		$DocumentFragments ;
1292
1293	// Form data
1294	protected		$FormData ;
1295	protected		$FormDataObjectNumbers ;
1296	protected		$FormDataDefinitions ;
1297	protected		$FormaDataObjects ;
1298
1299	// Capture data
1300	public			$CaptureDefinitions ;
1301	protected		$CaptureObject ;
1302
	// Indicates whether global static initializations have been made
	// This is mainly used for variables such as $Utf8Placeholder, which is initialized to a different value when
	// debug mode is enabled
1305	private static		$StaticInitialized		=  false ;
1306
1307	// Drawing instructions that are to be ignored and removed from a text stream before processing, for performance
1308	// reasons (it is faster to call preg_replace() once to remove them than calling the __next_instruction() and
1309	// __next_token() methods to process an input stream containing such useless instructions)
1310	// This is an array of regular expressions where the following constructs are replaced at runtime during static
1311	// initialization :
1312	// %n - Will be replaced with a regex matching a decimal number.
1313	private static		$IgnoredInstructionTemplatesLayout	=  array
1314	   (
1315		'%n{6} ( (c) ) \s+',
1316		'%n{4} ( (re) | (y) | (v) | (k) | (K) ) \s+',
1317		'%n{3} ( (scn) | (SCN) | (r) | (rg) | (RG) | (sc) | (SC) ) \s+',
1318		'%n{2} ( (m) | (l) ) \s+',
1319		'%n ( (w) | (M) | (g) | (G) | (J) | (j) | (d) | (i) | (sc) | (SC) | (Tc) | (Tw) | (scn) | (Tr) | (Tz) | (Ts) ) \s+',
1320		'\b ( (BDC) | (EMC) ) \s+',
1321		'\/( (Cs \d+) | (CS \d+) | (G[Ss] \d+) | (Fm \d+) | (Im \d+) | (PlacedGraphic) ) \s+ \w+ \s*',
1322		'\/( (Span) | (Artifact) | (Figure) | (P) ) \s* << .*? >> [ \t\r\n>]*',
1323		'\/ ( (PlacedGraphic) | (Artifact) ) \s+',
1324		'\d+ \s+ ( (scn) | (SCN) )',
1325		'\/MC \d+ \s+',
1326		 '^ \s* [fhS] \r? \n',
1327		 '^W \s+ n \r? \n',
1328		 '(f | W) \* \s+',
1329		 '^[fhnS] \s+',
1330		 '-?0 (\. \d+)? \s+ T[cw]',
1331		 '\bBI \s+ .*? \bID \s+ .*? \bEI',
1332		 '\/ \w+ \s+ ( (cs) | (CS) | (ri) | (gs) )',
1333		 // Hazardous replaces ?
1334		 '( [Ww] \s+ ){3,}',
1335		 ' \[\] \s+ [Shs] \s+'
1336	    ) ;
1337	// Additional instructions to be stripped when no particular page layout has been requested
1338	private static		$IgnoredInstructionTemplatesNoLayout	=  array
1339	   (
1340		'%n{6} ( (cm) ) \s+',
1341//		'\b ( (BT) | (ET) ) \s+',
1342		 '^ \s* [Qq] \r? \n',
1343		 '^ \s* (\b [a-zA-Z] \s+)+',
1344		 '\s* (\b [a-zA-Z] \s+)+$',
1345		 '^[qQ] \s+',
1346		 '^q \s+ [hfS] \n',
1347		 '( [Qfhnq] \s+ ){2,}'
1348	    ) ;
1349	// Replacement regular expressions for %something constructs specified in the $IgnoredInstructions array
1350	private static		$ReplacementConstructs		=  array
1351	    (
1352		'%n'	=>  '( [+\-]? ( ( [0-9]+ ( \. [0-9]* )? ) | ( \. [0-9]+ ) ) \s+ )'
1353	     ) ;
1354	// The final regexes that are built during static initialization by the __build_ignored_instructions() method
1355	private static		$IgnoredInstructionsNoLayout	=  array ( ) ;
1356	private static		$IgnoredInstructionsLayout	=  array ( ) ;
1357	private			$IgnoredInstructions		=  array ( ) ;
1358
	// Map id buffer - for avoiding unnecessary calls to GetFontByMapId
1360	private			$MapIdBuffer			=  array ( ) ;
1361
1362	// Same for MapCharacter()
1363	private			$CharacterMapBuffer		=  array ( ) ;
1364
1365	// Font objects buffer - used by __assemble_text_fragments()
1366	private			$FontObjectsBuffer		=  array ( ) ;
1367
1368	// Regex used for removing hyphens - we have to take care of different line endings : "\n" for Unix, "\r\n"
1369	// for Windows, and "\r" for pure Mac files.
1370	// Note that we replace an hyphen followed by an end-of-line then by non-space characters with the non-space
1371	// characters, so the word gets joined on the same line. Spaces after the end of the word (on the next line)
1372	// are removed, in order for the next word to appear at the beginning of the second line.
1373	private static		$RemoveHyphensRegex		=  '#
1374									(
1375										  -
1376										  [ \t]* ( (\r\n) | \n | \r )+ [ \t\r\n]*
1377									 )
1378									([^ \t\r\n]+)
1379									\s*
1380								    #msx' ;
1381
1382	// A small list of Unicode character ranges that are related to languages written from right to left
	// For performance reasons, everything is mapped to a range here, even if it includes codepoints that do not map to anything
1384	// (this class is not a Unicode codepoint validator, but a Pdf text extractor...)
1385	// The UTF-16 version is given as comments ; only the UTF-8 translation is used here
1386	// To be completed !
1387	private static		$RtlCharacters			=  array
1388	   (
1389		// This range represents the following languages :
1390		// - Hebrew			(0590..05FF)
1391		// - Arabic			(0600..06FF)
1392		// - Syriac			(0700..074F)
1393		// - Supplement for Arabic	(0750..077F)
1394		// - Thaana			(0780..07BF)
1395		// - N'ko			(07C0..07FF)
1396		// - Samaritan			(0800..083F)
1397		// - Mandaic			(0840..085F)
1398		//	array ( 0x00590, 0x0085F ),
1399		// Hebrew supplement (I suppose ?) + other characters
1400		//	array ( 0x0FB1D, 0x0FEFC ),
1401		// Mende kikakui
1402		//	array ( 0x1E800, 0x1E8DF ),
1403		// Adlam
1404		//	array ( 0x1E900, 0x1E95F ),
1405		// Others
1406		//	 array ( 0x10800, 0x10C48 ),
1407		//	 array ( 0x1EE00, 0x1EEBB )
1408		"\xD6"		=>  array ( array ( "\x90", "\xBF" ) ),
1409		"\xD7"		=>  array ( array ( "\x80", "\xBF" ) ),
1410		"\xD8"		=>  array ( array ( "\x80", "\xBF" ) ),
1411		"\xD9"		=>  array ( array ( "\x80", "\xBF" ) ),
1412		"\xDA"		=>  array ( array ( "\x80", "\xBF" ) ),
1413		"\xDB"		=>  array ( array ( "\x80", "\xBF" ) ),
1414		"\xDC"		=>  array ( array ( "\x80", "\xBF" ) ),
1415		"\xDD"		=>  array ( array ( "\x80", "\xBF" ) ),
1416		"\xDE"		=>  array ( array ( "\x80", "\xBF" ) ),
1417		"\xDF"		=>  array ( array ( "\x80", "\xBF" ) )
1418		/*
1419		"\xE0"		=>  array
1420		   (
1421			array ( "\xA0\x80", "\xA0\xBF" ),
1422			array ( "\xA1\x80", "\xA1\x9F" )
1423		    ),
1424		"\xEF"		=>  array
1425		   (
1426			array ( "\xAC\x9D", "\xAC\xBF" ),
1427			array ( "\xAD\x80", "\xAD\xBF" ),
1428			array ( "\xAE\x80", "\xAE\xBF" ),
1429			array ( "\xAF\x80", "\xAF\xBF" ),
1430			array ( "\xB0\x80", "\xB0\xBF" ),
1431			array ( "\xB1\x80", "\xB1\xBF" ),
1432			array ( "\xB2\x80", "\xB2\xBF" ),
1433			array ( "\xB3\x80", "\xB3\xBF" ),
1434			array ( "\xB4\x80", "\xB4\xBF" ),
1435			array ( "\xB5\x80", "\xB5\xBF" ),
1436			array ( "\xB6\x80", "\xB6\xBF" ),
1437			array ( "\xB7\x80", "\xB7\xBF" ),
1438			array ( "\xB8\x80", "\xB8\xBF" ),
1439			array ( "\xB9\x80", "\xB9\xBF" ),
1440			array ( "\xBA\x80", "\xBA\xBF" ),
1441			array ( "\xBB\x80", "\xBB\xBC" )
1442		    )
1443		    */
1444	    ) ;
1445
1446	// UTF-8 prefixes for RTL characters as keys, and number of characters that must follow the prefix as values
1447	private static		$RtlCharacterPrefixLengths	=  array
1448	   (
1449		"\xD6"		=>  1,
1450		"\xD7"		=>  1,
1451		"\xD8"		=>  1,
1452		"\xD9"		=>  1,
1453		"\xDA"		=>  1,
1454		"\xDB"		=>  1,
1455		"\xDC"		=>  1,
1456		"\xDE"		=>  1,
1457		"\xDF"		=>  1
1458		/*
1459		"\xE0"		=>  2,
1460		"\xEF"		=>  2
1461		*/
1462	    ) ;
1463
1464	// A string that contains all the RTL character prefixes above
1465	private static		$RtlCharacterPrefixes ;
1466
1467	// As usual, caching a little bit the results of the IsRtlCharacter() method is welcome. Each item will have the value true if the
1468	// character is RTL, or false if LTR.
1469	private			$RtlCharacterBuffer		=  array ( ) ;
1470
1471	// A subset of a character classification array that avoids too many calls to the ctype_* functions or too many
1472	// character comparisons.
	// This array is used only for heavily used parts of the code
1474	const	CTYPE_ALPHA		=  0x01 ;		// Letter
1475	const	CTYPE_DIGIT		=  0x02 ;		// Digit
1476	const	CTYPE_XDIGIT		=  0x04 ;		// Hex digit
1477	const	CTYPE_ALNUM		=  0x08 ;		// Letter or digit
1478	const	CTYPE_LOWER		=  0x10 ;		// Lower- or upper-case letters
1479	const	CTYPE_UPPER		=  0x20 ;
1480
1481	private static		$CharacterClasses		=  false ;
1482
1483	// Stuff specific to the current PHP version
1484	private static		$HasMemoryGetUsage ;
1485	private static		$HasMemoryGetPeakUsage ;
1486
1487
1488	/*--------------------------------------------------------------------------------------------------------------
1489
1490	    CONSTRUCTOR
	        $pdf	=  new PdfToText ( $filename = null, $options = PDFOPT_NONE, $user_password = false, $owner_password = false ) ;
1492
1493	    DESCRIPTION
1494	        Builds a PdfToText object and optionally loads the specified file's contents.
1495
1496	    PARAMETERS
1497	        $filename (string) -
1498	                Optional PDF filename whose text contents are to be extracted.
1499
1500		$options (integer) -
1501			A combination of PDFOPT_* flags. This can be any of the following :
1502
1503			- PDFOPT_REPEAT_SEPARATOR :
1504				Text constructs specified as an array are separated by an offset which is expressed as
1505				thousands of text units ; for example :
1506
1507					[(1)-2000(2)]
1508
1509				will be rendered as the text "1  2" ("1" and "2" being separated by two spaces) if the
1510				"Separator" property is set to a space (the default) and this flag is specified.
1511				When not specified, the text will be rendered as "1 2".
1512
1513			- PDFOPT_NONE :
1514				None of the above options will apply.
1515
1516	 *-------------------------------------------------------------------------------------------------------------*/
1517	public function  __construct ( $filename = null, $options = self::PDFOPT_NONE, $user_password = false, $owner_password = false )
1518	   {
1519		// We need the mbstring PHP extension here...
1520		if  ( ! function_exists ( 'mb_convert_encoding' ) )
1521			error ( "You must enable the mbstring PHP extension to use this class." ) ;
1522
1523		// Perform static initializations if needed
1524		if  ( ! self::$StaticInitialized )
1525		   {
1526			if  ( self::$DEBUG )
1527			   {
1528				// In debug mode, initialize the utf8 placeholder only if it still set to its default value, the empty string
1529				if  ( self::$Utf8Placeholder  ==  '' )
1530					self::$Utf8Placeholder	=  '[Unknown character : 0x%08X]' ;
1531			    }
1532
1533			// Build the list of regular expressions from the list of ignored instruction templates
1534			self::__build_ignored_instructions (  ) ;
1535
1536			// Check if some functions are supported or not
1537			self::$HasMemoryGetUsage	=  function_exists ( 'memory_get_usage' ) ;
1538			self::$HasMemoryGetPeakUsage	=  function_exists ( 'memory_get_peak_usage' ) ;
1539
1540			// Location of the directory containing CID fonts
1541			self::$CIDTablesDirectory	=  dirname ( __FILE__ ) . DIRECTORY_SEPARATOR . 'CIDTables' ;
1542			self::$FontMetricsDirectory	=  dirname ( __FILE__ ) . DIRECTORY_SEPARATOR . 'FontMetrics' ;
1543
1544			// The string that contains all the Rtl character prefixes in UTF-8 - An optimization used by the __rtl_process() method
1545			self::$RtlCharacterPrefixes	=  implode ( '', array_keys ( self::$RtlCharacterPrefixLengths ) ) ;
1546
1547			// Build the character classes (used only for testing letters and digits)
1548			if  ( self::$CharacterClasses  ===  false )
1549			   {
1550				for  ( $ord = 0 ; $ord  <  256 ; $ord ++ )
1551				   {
1552					$ch	=  chr ( $ord ) ;
1553
1554					if  ( $ch  >=  '0'  &&  $ch  <=  '9' )
1555						self::$CharacterClasses [ $ch ]		=  self::CTYPE_DIGIT | self::CTYPE_XDIGIT | self::CTYPE_ALNUM ;
1556					else if  ( $ch  >=  'A'  &&  $ch  <=  'Z' )
1557					   {
1558						self::$CharacterClasses [ $ch ]		=  self::CTYPE_ALPHA | self::CTYPE_UPPER | self::CTYPE_ALNUM ;
1559
1560						if  ( $ch  <=  'F' )
1561							self::$CharacterClasses [ $ch ]		|=  self::CTYPE_XDIGIT ;
1562					    }
1563					else if  ( $ch  >=  'a'  &&  $ch  <=  'z' )
1564					   {
1565						self::$CharacterClasses [ $ch ]		=  self::CTYPE_ALPHA | self::CTYPE_LOWER | self::CTYPE_ALNUM ;
1566
1567						if  ( $ch  <=  'f' )
1568							self::$CharacterClasses [ $ch ]		|=  self::CTYPE_XDIGIT ;
1569					    }
1570					else
1571						self::$CharacterClasses [ $ch ]		=  0 ;
1572				    }
1573			    }
1574
1575			// Global execution time limit
1576			self::$PhpMaxExecutionTime	=  ( integer ) ini_get ( 'max_execution_time' ) ;
1577
1578			if  ( ! self::$PhpMaxExecutionTime )					// Paranoia : default max script execution time to 120 seconds
1579				self::$PhpMaxExecutionTime	=  120 ;
1580
1581			self::$GlobalExecutionStartTime		=  microtime ( true ) ;		// Set the start of the first execution
1582
1583			if  ( self::$MaxGlobalExecutionTime  >  0 )
1584				self::$AllowedGlobalExecutionTime	=  self::$MaxGlobalExecutionTime ;
1585			else
1586				self::$AllowedGlobalExecutionTime	=  self::$PhpMaxExecutionTime + self::$MaxGlobalExecutionTime ;
1587
1588			// Adjust in case of inconsistent values
1589			if  ( self::$AllowedGlobalExecutionTime  <  0  ||  self::$AllowedGlobalExecutionTime  >  self::$PhpMaxExecutionTime )
1590				self::$AllowedGlobalExecutionTime	=  self::$PhpMaxExecutionTime - 1 ;
1591
1592			self::$StaticInitialized	=  true ;
1593		    }
1594
1595		parent::__construct ( ) ;
1596
1597		$this -> Options		=  $options ;
1598
1599		if  ( $filename )
1600			$this -> Load ( $filename, $user_password, $owner_password ) ;
1601	    }
1602
1603
1604	public function  __tostring ( )
1605	   { return ( $this -> Text ) ; }
1606
1607
1608	/**************************************************************************************************************
1609	 **************************************************************************************************************
1610	 **************************************************************************************************************
1611	 ******                                                                                                  ******
1612	 ******                                                                                                  ******
1613	 ******                                          PUBLIC METHODS                                          ******
1614	 ******                                                                                                  ******
1615	 ******                                                                                                  ******
1616	 **************************************************************************************************************
1617	 **************************************************************************************************************
1618	 **************************************************************************************************************/
1619
1620	/*--------------------------------------------------------------------------------------------------------------
1621
1622	    NAME
1623	        Load		- Loads text contents from a PDF file.
1624		LoadFromString	- Loads PDF contents from a string.
1625
1626	    PROTOTYPE
1627	        $text	=  $pdf -> Load ( $filename, $user_password = false, $owner_password = false ) ;
1628	        $text	=  $pdf -> LoadFromString ( $contents, $user_password = false, $owner_password = false ) ;
1629
1630	    DESCRIPTION
1631	        The Load() method extracts text contents from the specified PDF file. Once processed, text contents will
1632		be available through the "Text" property.
1633		The LoadFromString() method performs the same operation on PDF contents already loaded into memory.
1634
1635	    PARAMETERS
	        $filename (string) -
	                Name of the PDF file whose text contents are to be extracted (required by Load()).
1638
1639		$contents (string) -
1640			String containing PDF contents.
1641
1642		$user_password (string) -
1643			User password used for decrypting PDF contents.
1644
1645		$owner_password (string) -
1646			Owner password.
1647
1648	 *-------------------------------------------------------------------------------------------------------------*/
1649	private		$__memory_peak_usage_start,
1650			$__memory_usage_start ;
1651
1652	public function  Load  ( $filename, $user_password = false, $owner_password = false )
1653	   {
1654		$this -> __memory_usage_start		=  ( self::$HasMemoryGetUsage     ) ?  memory_get_usage      ( true ) : 0 ;
1655		$this -> __memory_peak_usage_start	=  ( self::$HasMemoryGetPeakUsage ) ?  memory_get_peak_usage ( true ) : 0 ;
1656
1657		// Check if the file exists, but only if the file is on a local filesystem
1658		if  ( ! preg_match ( '#^ [^:]+ ://#ix', $filename )  && ! file_exists ( $filename ) )
1659			error ( new  PdfToTextDecodingException ( "File \"$filename\" does not exist." ) ) ;
1660
1661		// Load its contents
1662		$contents 	=  @file_get_contents ( $filename, FILE_BINARY ) ;
1663
1664		if  ( $contents  ===  false )
1665			error ( new  PdfToTextDecodingException ( "Unable to open \"$filename\"." ) ) ;
1666
1667		return ( $this -> __load ( $filename, $contents, $user_password, $owner_password ) ) ;
1668	    }
1669
1670
1671	public function  LoadFromString ( $contents, $user_password = false, $owner_password = false )
1672	   {
1673		$this -> __memory_usage_start		=  ( self::$HasMemoryGetUsage     ) ?  memory_get_usage      ( true ) : 0 ;
1674		$this -> __memory_peak_usage_start	=  ( self::$HasMemoryGetPeakUsage ) ?  memory_get_peak_usage ( true ) : 0 ;
1675
1676		return ( $this -> __load ( '', $contents, $user_password, $owner_password ) ) ;
1677	    }
1678
1679
1680	private function  __load ( $filename, $contents, $user_password = false, $owner_password = false )
1681	   {
1682		// Search for the start of the document ("%PDF-x.y")
1683		$start_offset	=  strpos ( $contents, '%PDF' ) ;
1684
1685		if  ( $start_offset  ===  false )		// Not a pdf document !
1686			error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ;
1687		else						// May be a PDF document
1688			$this -> DocumentStartOffset	=  $start_offset ;
1689
1690		// Check that this is a PDF file with a valid version number
1691		if  ( ! preg_match ( '/ %PDF- (?P<version> \d+ (\. \d+)*) /ix', $contents, $match, 0, $start_offset ) )
1692			error ( new PdfToTextDecodingException ( "File \"$filename\" is not a valid PDF file." ) ) ;
1693
1694		$this -> PdfVersion 		=  $match [ 'version' ] ;
1695
1696		// Initializations
1697		$this -> Text 				=  '' ;
1698		$this -> FontTable 			=  new PdfTexterFontTable ( ) ;
1699		$this -> Filename 			=  realpath ( $filename ) ;
1700		$this -> Pages				=  array ( ) ;
1701		$this -> Images				=  array ( ) ;
1702		$this -> ImageData			=  array ( ) ;
1703		$this -> ImageCount			=  0 ;
1704		$this -> AutoSavedImageFiles		=  array ( ) ;
1705		$this -> PageMap			=  new PdfTexterPageMap ( ) ;
1706		$this -> PageLocations			=  array ( ) ;
1707		$this -> Author				=  '' ;
1708		$this -> CreatorApplication		=  '' ;
1709		$this -> ProducerApplication		=  '' ;
1710		$this -> CreationDate			=  '' ;
1711		$this -> ModificationDate		=  '' ;
1712		$this -> Title				=  '' ;
1713		$this -> Subject			=  '' ;
1714		$this -> Keywords			=  '' ;
1715		$this -> GotAuthorInformation		=  false ;
1716		$this -> ID				=  '' ;
1717		$this -> ID2				=  '' ;
1718		$this -> EncryptionData			=  false ;
1719		$this -> EnhancedStatistics		=  ( ( $this -> Options  &  self::PDFOPT_ENHANCED_STATISTICS )  !=  0 ) ;
1720
1721		// Also reset cached information that may come from previous runs
1722		$this -> MapIdBuffer			=  array ( ) ;
1723		$this -> RtlCharacterBuffer		=  array ( ) ;
1724		$this -> CharacterMapBuffer		=  array ( ) ;
1725		$this -> FontObjectsBuffer		=  array ( ) ;
1726		$this -> FormData			=  array ( ) ;
1727		$this -> FormDataObjectNumbers		=  false ;
1728		$this -> FomDataDefinitions		=  array ( ) ;
1729		$this -> FormDataObjects		=  array ( ) ;
1730		$this -> CaptureDefinitions		=  false ;
1731		$this -> CaptureObject			=  false ;
1732		$this -> DocumentFragments		=  array ( ) ;
1733
1734		// Enable the PDFOPT_BASIC_LAYOUT option if the PDFOPT_CAPTURE flag is specified
1735		if  ( $this -> Options & self::PDFOPT_CAPTURE )
1736			$this -> Options	|=  self::PDFOPT_BASIC_LAYOUT ;
1737
1738		// Enable the PDFOPT_BASIC_LAYOUT_OPTION is PDFOPT_DEBUG_SHOW_COORDINATES is specified
1739		if  ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES )
1740			$this -> Options	|=  self::PDFOPT_BASIC_LAYOUT ;
1741
1742		// Page layout options needs more instructions to be retained - select the appropriate list of useless instructions
1743		if  ( $this -> Options  &  self::PDFOPT_BASIC_LAYOUT )
1744			$this -> IgnoredInstructions	=  self::$IgnoredInstructionsLayout ;
1745		else
1746			$this -> IgnoredInstructions	=  self::$IgnoredInstructionsNoLayout ;
1747
1748
1749		// Debug statistics
1750		$this -> Statistics			=  array
1751		   (
1752			'TextSize'			=>  0,				// Total size of drawing instructions ("text" objects)
1753			'OptimizedTextSize'		=>  0,				// Optimized text size, with useless instructions removed
1754			'Distributions'			=>  array			// Statistics about handled instructions distribution - Works only with the page layout option in debug mode
1755			   (
1756				'operand'	=>  0,
1757				'Tm'		=>  0,
1758				'Td'		=>  0,
1759				'TD'		=>  0,
1760				"'"		=>  0,
1761				'TJ'		=>  0,
1762				'Tj'		=>  0,
1763				'Tf'		=>  0,
1764				'TL'		=>  0,
1765				'T*'		=>  0,
1766				'('		=>  0,
1767				'<'		=>  0,
1768				'['		=>  0,
1769				'cm'		=>  0,
1770				'BT'		=>  0,
1771				'template'	=>  0,
1772				'ignored'	=>  0,
1773				'space'		=>  0
1774			    )
1775		    ) ;
1776
1777		// Per-instance execution time limit
1778		$this -> ExecutionStartTime		=  microtime ( true ) ;
1779
1780		if  ( $this -> MaxExecutionTime  >  0 )
1781			$this -> AllowedExecutionTime		=  $this -> MaxExecutionTime ;
1782		else
1783			$this -> AllowedExecutionTime		=  self::$PhpMaxExecutionTime + $this -> MaxExecutionTime ;
1784
1785		// Adjust in case of inconsistent values
1786		if  ( $this -> AllowedExecutionTime  <  0  ||  $this -> AllowedExecutionTime  >  self::$PhpMaxExecutionTime )
1787			$this -> AllowedExecutionTime		=  self::$PhpMaxExecutionTime - 1 ;
1788
1789		// Systematically set the DECODE_IMAGE_DATA flag if the AUTOSAVE_IMAGES flag has been specified
1790		if  ( $this -> Options  &  self::PDFOPT_AUTOSAVE_IMAGES )
1791			$this -> Options	|=  self::PDFOPT_DECODE_IMAGE_DATA ;
1792
1793		// Systematically set the GET_IMAGE_DATA flag if DECODE_IMAGE_DATA is specified (debug mode only)
1794		if  ( self::$DEBUG  &&  $this -> Options  &  self::PDFOPT_DECODE_IMAGE_DATA )
1795			$this -> Options	|=  self::PDFOPT_GET_IMAGE_DATA ;
1796
1797		// Since page layout options take 2 bits, but not all of the 4 possible values are allowed, make sure that an invalid
1798		// value will default to PDFOPT_RAW_LAYOUT value
1799		$layout_option		=  $this -> Options & self::PDFOPT_LAYOUT_MASK ;
1800
1801		if  ( ! $layout_option  ===  self::PDFOPT_RAW_LAYOUT  &&  $layout_option  !==  self::PDFOPT_BASIC_LAYOUT )
1802		   {
1803			$layout_option		=  self::PDFOPT_RAW_LAYOUT ;
1804			$this -> Options	=  ( $this -> Options & ~self::PDFOPT_LAYOUT_MASK ) | self::PDFOPT_RAW_LAYOUT ;
1805		    }
1806
1807		// Author information needs to be processed after, because it may reference objects that occur later in the PDF stream
1808		$author_information_object_id		=  false ;
1809
1810		// Extract pdf objects that are enclosed by the "obj" and "endobj" keywords
1811		$pdf_objects		=  array ( ) ;
1812		$contents_offset	=  $this -> DocumentStartOffset ;
1813		$contents_length	=  strlen ( $contents ) ;
1814
1815
1816		while  ( $contents_offset  <  $contents_length  &&
1817				preg_match ( '/(?P<re> (?P<object_id> \d+) \s+ \d+ \s+ obj (?P<object> .*?) endobj )/imsx', $contents, $match, PREG_OFFSET_CAPTURE, $contents_offset ) )
1818		   {
1819			$object_number		=  $match [ 'object_id' ] [0] ;
1820			$object_data		=  $match [ 'object' ] [0] ;
1821
1822			// Handle the special case of object streams (compound objects)
1823			// They are not added in the $pdf_objects array, because they could be mistakenly processed as relevant information,
1824			// such as font definitions, etc.
1825			// Instead, only the objects they are embedding are stored in this array.
1826			if  ( $this -> IsObjectStream ( $object_data ) )
1827			   {
1828				// Ignore ill-formed object streams
1829				if  ( ( $object_stream_matches = $this -> DecodeObjectStream ( $object_number, $object_data ) )  !==  false )
1830				   {
1831					// Add this list of objects to the list of known objects
1832					for  ( $j = 0, $object_stream_count = count ( $object_stream_matches [ 'object_id' ] ) ; $j  <  $object_stream_count ; $j ++ )
1833						$pdf_objects [ $object_stream_matches [ 'object_id' ] [$j] ]	=  $object_stream_matches [ 'object' ] [$j] ;
1834				    }
1835			    }
1836			// Normal (non-compound) object
1837			else
1838				$pdf_objects [ $object_number ]	=  $object_data ;
1839
1840			// Update current offset through PDF contents
1841			$contents_offset	=  $match [ 're' ] [1] + strlen ( $match [ 're' ] [0] ) ;
1842		    }
1843
1844		// We put a particular attention in treating errors returned by preg_match_all() here, since we need to be really sure why stopped
1845		// to find further PDF objects in the supplied contents
1846		$preg_error	=  preg_last_error ( ) ;
1847
1848		switch  ( $preg_error )
1849		   {
1850			case  PREG_NO_ERROR :
1851				break ;
1852
1853			case  PREG_INTERNAL_ERROR :
1854				error ( new PdfToTextDecodingException ( "PDF object extraction : the preg_match_all() function encountered an internal error." ) ) ;
1855
1856			case  PREG_BACKTRACK_LIMIT_ERROR :
1857				error ( new PdfToTextDecodingException ( "PDF object extraction : backtrack limit reached (you may have to modify the pcre.backtrack_limit " .
1858						"setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.backtrack_limit' ) . ")." ) ) ;
1859
1860			case  PREG_JIT_STACKLIMIT_ERROR :
1861				error ( new PdfToTextDecodingException ( "PDF object extraction : JIT stack limit reached (you may disable this feature by setting the pcre.jit " .
1862						"setting of your PHP.ini file to 0)." ) ) ;
1863
1864			case  PREG_RECURSION_LIMIT_ERROR :
1865				error ( new PdfToTextDecodingException ( "PDF object extraction : recursion limit reached (you may have to modify the pcre.recursion_limit " .
1866						"setting of your PHP.ini file, which is currently set to " . ini_get ( 'pcre.recursion_limit' ) . ")." ) ) ;
1867
1868			case  PREG_BAD_UTF8_ERROR :
1869				error ( new PdfToTextDecodingException ( "PDF object extraction : bad UTF8 character encountered." ) ) ;
1870
1871			case PREG_BAD_UTF8_OFFSET_ERROR :
1872				error ( new PdfToTextDecodingException ( "PDF object extraction : the specified offset does not start at the beginning of a valid UTF8 codepoint." ) ) ;
1873
1874			default :
1875				error ( new PdfToTextDecodingException ( "PDF object extraction : unkown PREG error #$preg_error" ) ) ;
1876		    }
1877
1878
1879		// Extract trailer information, which may contain the ID of an object specifying encryption flags
1880		$this -> GetTrailerInformation ( $contents, $pdf_objects ) ;
1881		unset ( $contents ) ;
1882
1883		// Character maps encountered so far
1884		$cmaps			=  array ( ) ;
1885
1886		// An array that will store object ids as keys and text contents as values
1887		$text			=  array ( ) ;
1888
1889		// Loop through the objects
1890		foreach  ( $pdf_objects  as  $object_number => $object_data )
1891		   {
1892			// Some additional objects may be uncovered after processing (in an object containing compacted objects for example)
1893			// so add them to the list if necessary
1894			if  ( ! isset ( $pdf_objects [ $object_number ] ) )
1895				$pdf_objects [ $object_number ]		=  $object_data ;
1896
1897			// Try to catch information related to page mapping - but don't discard the object since it can contain additional information
1898			$this -> PageMap -> Peek ( $object_number, $object_data, $pdf_objects ) ;
1899
1900			// Check if the object contais authoring information - it can appear encoded or unencoded
1901			if  ( ! $this -> GotAuthorInformation )
1902				$author_information_object_id	=  $this -> PeekAuthorInformation ( $object_number, $object_data ) ;
1903
1904			// Also catch the object encoding type
1905			$type 		=  $this -> GetEncodingType ( $object_number, $object_data ) ;
1906			$stream_match	=  null ;
1907
1908			if  ( strpos ( $object_data, 'stream' )  ===  false  ||
1909					! preg_match ( '#[^/] stream \s+ (?P<stream> .*?) endstream#imsx', $object_data, $stream_match ) )
1910			   {
1911				// Some font definitions are in clear text in an object, some are encoded in a stream within the object
1912				// We process here the unencoded ones
1913				if  ( $this -> IsFont ( $object_data ) )
1914				   {
1915					$this -> FontTable -> Add ( $object_number, $object_data, $pdf_objects, $this -> AdobeExtraMappings ) ;
1916					continue ;
1917				    }
1918				// Some character maps may also be in clear text
1919				else if  ( $this -> IsCharacterMap ( $object_data ) )
1920				    {
1921					$cmap	=  PdfTexterCharacterMap::CreateInstance ( $object_number, $object_data, $this -> AdobeExtraMappings ) ;
1922
1923					if  ( $cmap )
1924						$cmaps [] 	=  $cmap ;
1925
1926					continue ;
1927				    }
1928				// Check if there is an association between font number and object number
1929				else if  ( $this -> IsFontMap ( $object_data ) )
1930				   {
1931					$this -> FontTable -> AddFontMap ( $object_number, $object_data ) ;
1932				    }
1933				// Retrieve form data if present
1934				else if  ( $this -> IsFormData ( $object_data ) )
1935				   {
1936					$this -> RetrieveFormData ( $object_number, $object_data, $pdf_objects ) ;
1937				    }
1938				// Ignore other objects that do not contain an encoded stream
1939				else
1940				   {
1941					if  ( self::$DEBUG  >  1 )
1942						echo "\n----------------------------------- UNSTREAMED #$object_number\n$object_data" ;
1943
1944					continue ;
1945				    }
1946			    }
1947			// Extract image data, if any
1948			else if  ( $this -> IsImage ( $object_data ) )
1949			   {
1950				$this -> AddImage ( $object_number, $stream_match [ 'stream' ], $type, $object_data ) ;
1951				continue ;
1952			    }
1953			// Check if there is an association between font number and object number
1954			else if  ( $this -> IsFontMap ( $object_data ) )
1955			   {
1956				$this -> FontTable -> AddFontMap ( $object_number, $object_data ) ;
1957
1958				if  ( ! $stream_match )
1959					continue ;
1960			    }
1961
1962			// Check if the stream contains data (yes, I have found a sample that had streams of length 0...)
1963			// In other words : ignore empty streams
1964			if  ( stripos ( $object_data, '/Length 0' )  !==  false )
1965				continue ;
1966
1967			// Isolate stream data and try to find its encoding type
1968			if  ( isset  ( $stream_match [ 'stream' ] ) )
1969				$stream_data 		=  ltrim ( $stream_match [ 'stream' ], "\r\n" ) ;
1970			else
1971				continue ;
1972
1973			// Ignore this stream if the object does not contain an encoding type (/FLATEDECODE, /ASCIIHEX or /ASCII85)
1974			if  ( $type  ==  self::PDF_UNKNOWN_ENCODING )
1975			   {
1976				if  ( self::$DEBUG  >  1 )
1977					echo "\n----------------------------------- UNENCODED #$object_number :\n$object_data" ;
1978
1979				continue ;
1980			    }
1981
1982			// Decode the encoded stream
1983			$decoded_stream_data 	=  $this -> DecodeData ( $object_number, $stream_data, $type, $object_data ) ;
1984
1985			// Second chance to peek author information, this time on a decoded stream data
1986			if  ( ! $this -> GotAuthorInformation )
1987				$author_information_object_id	=  $this -> PeekAuthorInformation ( $object_number, $decoded_stream_data ) ;
1988
1989			// Check for character maps
1990			if  ( $this -> IsCharacterMap ( $decoded_stream_data ) )
1991			   {
1992				$cmap	=  PdfTexterCharacterMap::CreateInstance ( $object_number, $decoded_stream_data, $this -> AdobeExtraMappings ) ;
1993
1994				if  ( $cmap )
1995					$cmaps [] 	=  $cmap ;
1996			   }
1997			// Font definitions
1998			else if  ( $this -> IsFont ( $decoded_stream_data ) )
1999			   {
2000				$this -> FontTable -> Add ( $object_number, $decoded_stream_data, $pdf_objects, $this -> AdobeExtraMappings ) ;
2001			    }
2002			// Retrieve form data if present
2003			else if  ( $this -> IsFormData ( $object_data ) )
2004			   {
2005				$this -> RetrieveFormData ( $object_number, $decoded_stream_data, $pdf_objects ) ;
2006			    }
2007			// Plain text (well, in fact PDF drawing instructions)
2008			else if  ( $this -> IsText ( $object_data, $decoded_stream_data ) )
2009			   {
2010				$text_data	=  false ;
2011
2012				// Check if we need to ignore page headers and footers
2013				if  ( $this -> Options  &  self::PDFOPT_IGNORE_HEADERS_AND_FOOTERS )
2014				   {
2015					if  ( ! $this -> IsPageHeaderOrFooter ( $decoded_stream_data ) )
2016					   {
2017						$text [ $object_number ]	=
2018						$text_data			=  $decoded_stream_data ;
2019					    }
2020					// However, they may be mixed with actual text contents so we need to separate them...
2021					else
2022					   {
2023						$this -> ExtractTextData ( $object_number, $decoded_stream_data, $remainder, $header, $footer ) ;
2024
2025						// We still need to check again that the extracted text portion contains something useful
2026						if  ( $this -> IsText ( $object_data, $remainder ) )
2027						   {
2028							$text [ $object_number ]	=
2029							$text_data			=  $remainder ;
2030						    }
2031					    }
2032				    }
2033				else
2034				   {
2035					$text [ $object_number ]	=
2036					$text_data			=  $decoded_stream_data ;
2037				    }
2038
2039
2040				// The current object may be a text object that have been defined as an XObject in some other object
2041				// In this case, we have to keep it since it may be referenced by a /TPLx construct from within
2042				// another text object
2043				if  ( $text_data )
2044					$this -> PageMap -> AddTemplateObject ( $object_number, $text_data ) ;
2045			    }
2046			// This may be here the opportunity to look into the $FormData property and replace object ids with their corresponding data
2047			else
2048			   {
2049				$found		=  false ;
2050
2051				foreach  ( $this -> FormData  as  &$form_entry )
2052				   {
2053					if  ( is_integer ( $form_entry [ 'values' ] )  &&  $object_number  ==  $form_entry [ 'values' ] )
2054					   {
2055						$form_entry [ 'values' ]	=  $decoded_stream_data ;
2056						$found				=  true ;
2057					    }
2058					else if  ( is_integer ( $form_entry [ 'form' ] )  &&  $object_number  ==  $form_entry [ 'form' ] )
2059					   {
2060						$form_entry [ 'form' ]	=  $decoded_stream_data ;
2061						$found				=  true ;
2062					    }
2063				    }
2064
2065				if  ( ! $found  &&  self::$DEBUG  >  1 )
2066					echo "\n----------------------------------- UNRECOGNIZED #$object_number :\n$decoded_stream_data\n" ;
2067			    }
2068		    }
2069
2070		// Form data object numbers
2071		$this -> FormDataObjectNumbers	=  array_keys ( $this -> FormData ) ;
2072
2073		// Associate character maps with declared fonts
2074		foreach  ( $cmaps  as  $cmap )
2075			$this -> FontTable -> AddCharacterMap ( $cmap ) ;
2076
2077		// Current font defaults to -1, which means : take the first available font as the current one.
2078		// Sometimes it may happen that text drawing instructions do not set a font at all (PdfPro for example)
2079		$current_font		=  -1 ;
2080
2081		// Build the page catalog
2082		$this -> Pages	=  array ( ) ;
2083		$this -> PageMap -> MapObjects ( $text ) ;
2084
2085		// Add font mappings local to each page
2086		$mapped_fonts	=  $this -> PageMap -> GetMappedFonts ( ) ;
2087		$this -> FontTable -> AddPageFontMap ( $mapped_fonts ) ;
2088
2089		// Extract text from the collected text elements
2090		foreach ( $this -> PageMap -> Pages as  $page_number => $page_objects )
2091		   {
2092			// Checks if this page is selected
2093			if  ( ! $this -> IsPageSelected ( $page_number ) )
2094				continue ;
2095
2096			$this -> Pages [ $page_number ]		=  '' ;
2097
2098			if  ( $layout_option  ===  self::PDFOPT_RAW_LAYOUT )
2099			   {
2100				foreach  ( $page_objects  as  $page_object )
2101				   {
2102					if  ( isset ( $text [ $page_object ] ) )
2103					   {
2104						$new_text				 =  $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ;
2105						$object_text				 =  $this -> ExtractText ( $page_number, $page_object, $new_text, $current_font ) ;
2106						$this -> Pages [ $page_number ]		.=  $object_text ;
2107					    }
2108					else if  ( self::$DEBUG  >  1 )
2109						echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ;
2110				    }
2111			     }
2112			// New style (basic) layout rendering
2113			else if  ( $layout_option  ===  self::PDFOPT_BASIC_LAYOUT )
2114			   {
2115				$page_fragments		=  array ( ) ;
2116
2117				foreach  ( $page_objects  as  $page_object )
2118				   {
2119					if  ( isset ( $text [ $page_object ] ) )
2120					   {
2121						$new_text				 =  $this -> PageMap -> ProcessTemplateReferences ( $page_number, $text [ $page_object ] ) ;
2122						$this -> ExtractTextWithLayout ( $page_fragments, $page_number, $page_object, $new_text, $current_font ) ;
2123					    }
2124					else if  ( self::$DEBUG  >  1 )
2125						echo "\n----------------------------------- MISSING OBJECT #$page_object for page #$page_number\n" ;
2126				    }
2127
2128				$this -> Pages [ $page_number ]			=  $this -> __assemble_text_fragments ( $page_number, $page_fragments, $page_width, $page_height ) ;
2129
2130				$this -> DocumentFragments [ $page_number ]	=  array
2131				   (
2132					'fragments'		=>  $page_fragments,
2133					'page-width'		=>  $page_width,
2134					'page_height'		=>  $page_height
2135				    ) ;
2136			    }
2137		    }
2138
2139		// Retrieve author information
2140		if  ( $this -> GotAuthorInformation )
2141			$this -> RetrieveAuthorInformation ( $author_information_object_id, $pdf_objects ) ;
2142
2143		// Build the page locations (ie, starting and ending offsets)
2144		$offset			=  0 ;
2145		$page_separator		=  utf8_encode ( $this -> PageSeparator ) ;
2146		$page_separator_length	=  strlen ( $page_separator ) ;
2147
2148		foreach  ( $this -> Pages  as  $page_number => &$page )
2149		   {
2150			// If hyphenated words are unwanted, then remove them
2151			if  ( $this -> Options &  self::PDFOPT_NO_HYPHENATED_WORDS )
2152				$page	=  preg_replace ( self::$RemoveHyphensRegex, '$4$2', $page ) ;
2153
2154			$length					 =  strlen ( $page ) ;
2155			$this -> PageLocations [ $page_number ]	 =  array ( 'start' => $offset, 'end' => $offset + $length - 1 ) ;
2156			$offset					+=  $length + $page_separator_length ;
2157		    }
2158
2159		// And finally, the Text property
2160		$this -> Text	=  implode ( $page_separator, $this -> Pages ) ;
2161
2162		// Free memory
2163		$this -> MapIdBuffer			=  array ( ) ;
2164		$this -> RtlCharacterBuffer		=  array ( ) ;
2165		$this -> CharacterMapBuffer		=  array ( ) ;
2166
2167		// Compute memory occupied for this file
2168		$memory_usage_end		=  ( self::$HasMemoryGetUsage     ) ?  memory_get_usage      ( true ) : 0 ;
2169		$memory_peak_usage_end		=  ( self::$HasMemoryGetPeakUsage ) ?  memory_get_peak_usage ( true ) : 0 ;
2170
2171		$this -> MemoryUsage		=  $memory_usage_end      - $this -> __memory_usage_start ;
2172		$this -> MemoryPeakUsage	=  $memory_peak_usage_end - $this -> __memory_peak_usage_start ;
2173
2174		// Adjust the "Distributions" statistics
2175		if  ( $this -> Options  &  self::PDFOPT_ENHANCED_STATISTICS )
2176		   {
2177			$instruction_count		=  0 ;
2178			$statistics			=  array ( ) ;
2179
2180			// Count the total number of instructions
2181			foreach  ( $this -> Statistics [ 'Distributions' ]  as  $count )
2182				$instruction_count  +=  $count ;
2183
2184			// Now transform the Distributions entries into an associative array containing the instruction counts
2185			// ('count') and their relative percentage
2186			foreach  ( $this -> Statistics [ 'Distributions' ]  as  $name => $count )
2187			   {
2188				if  ( $instruction_count )
2189					$percent	=  round ( ( 100.0 / $instruction_count ) * $count, 2 ) ;
2190				else
2191					$percent	=  0 ;
2192
2193				$statistics [ $name ]	=  array
2194				   (
2195					'instruction'		=>  $name,
2196					'count'			=>  $count,
2197					'percent'		=>  $percent
2198				    ) ;
2199			    }
2200
2201			// Set the new 'Distributions' array and sort it by instruction count in reverse order
2202			$this -> Statistics [ 'Distributions' ]		=  $statistics ;
2203			uksort ( $this -> Statistics [ 'Distributions' ], array ( $this, '__sort_distributions' ) ) ;
2204		    }
2205
2206		// All done, return
2207		return ( $this -> Text ) ;
2208	    }
2209
2210
2211	public function  __sort_distributions ( $a, $b )
2212	   { return ( $this -> Statistics [ 'Distributions' ] [$b] [ 'count' ] - $this -> Statistics [ 'Distributions' ] [$a] [ 'count' ] ) ; }
2213
2214
2215
2216	/*--------------------------------------------------------------------------------------------------------------
2217
2218	    NAME
2219	        AddAdobeExtraMappings - Adds extra mappings for standard Adobe fonts.
2220
2221	    PROTOTYPE
2222	        $pdf -> AddAdobeExtraMappings ( $mappings ) ;
2223
2224	    DESCRIPTION
	        Adobe supports 4 predefined fonts (Standard, Mac, WinAnsi and PDF). All the characters in these fonts
		are identified by a character name, a little bit like HTML entities ; for example, 'one' will be the
		character '1', 'acircumflex' will be 'â', etc.
2228		There are thousands of character names defined by Adobe (see https://mupdf.com/docs/browse/source/pdf/pdf-glyphlist.h.html).
2229		Some of them are not in this list ; this is the case for example of the 'ax' character names, where 'x'
2230		is a decimal number. When such a character is specified in a /Differences array, then there is somewhere
2231		a CharProc[] array giving an object id for each of those characters.
2232		The referenced object(s) in turn contain drawing instructions to draw the glyph. At no point you could
2233		guess what is the corresponding Unicode character for this glyph, since the information is not contained
2234		in the PDF file.
2235		The AddAdobeExtraMappings() method allows you to specify such correspondences. Specify an array as the
2236		$mappings parameter, whose keys are the Adobe character name (for example, "a127") and values the
2237		corresponding Unicode values (see the description of the $mappings parameter for more information).
2238
2239	    PARAMETERS
2240	        $mappings (associative array) -
2241	                Associative array whose keys are Adobe character names. The array values can take several forms :
2242			- A character
2243			- An integer value
2244			- An array of up to four character or integer values.
2245			Internally, every specified value is converted to an array of four integer values, one for
2246			each of the standard Adobe character sets (Standard, Mac, WinAnsi and PDF). The following
2247			rules apply :
			- If the input value is a single character, the output array corresponding to the Adobe
			  character name will be a set of 4 elements, each one being the ordinal value of the
			  supplied character.
2251			- If the input value is an integer, the output array will be a set of 4 identical values
2252			- If the input value is an array :
			  . Arrays with fewer than 4 elements will be padded, using the last array item for padding
2254			  . Arrays with more than 4 elements will be silently truncated
2255			  . Each array value can either be a character or a numeric value.
2256
2257	    NOTES
2258	        In this current implementation, the method applies the mappings to ALL Adobe default fonts. That is,
2259		you cannot have one mapping for one Adobe font referenced in the PDF file, then a second mapping for
2260		a second Adobe font, etc.
2261
2262	 *-------------------------------------------------------------------------------------------------------------*/
2263	public function  AddAdobeExtraMappings ( $mappings )
2264	   {
2265		// Loop through each mapping
2266		foreach  ( $mappings  as  $key => $value )
2267		   {
2268			// Character value : we retain its ordinal value as the 4 values of the output array
2269			if  ( is_string ( $value ) )
2270			   {
2271				$ord		=  ord ( $value ) ;
2272				$items		=  array ( $ord, $ord, $ord, $ord ) ;
2273			    }
2274			// Numeric value : the output array will contain 4 times the supplied value
2275			else if  ( is_numeric ( $value ) )
2276			   {
2277				$value		=  ( integer ) $value ;
2278				$items		=  array ( $value, $value, $value, $value ) ;
2279			    }
2280			// Array value : make sure we will have an output array of 4 values
2281			else if  ( is_array ( $value ) )
2282			   {
2283				$items		=  array ( ) ;
2284
2285				// Collect the supplied values, converting characters to their ordinal values if necessary
2286				for  ( $i = 0, $count = count ( $value ) ;  $i  <  $count  &&  $i  <  4 ; $i ++ )
2287				   {
2288					$code	=  $value [$i] ;
2289
2290					if  ( is_string ( $code ) )
2291						$items []	=  ord ( $code ) ;
2292					else
2293						$items []	=  ( integer ) $code ;
2294				    }
2295
2296				// Ensure that we have 4 values ; fill the missing ones with the last seen value if necessary
2297				$count		=  count ( $items ) ;
2298
2299				if  ( ! $count )
2300					error ( new PdfToTextException ( "Adobe extra mapping \"$key\" has no values." ) ) ;
2301
2302				$last_value		=  $items [ $count - 1 ] ;
2303
2304				for  ( $i = $count ; $i  <  4 ; $i ++ )
2305					$items []	=  $last_value ;
2306			    }
2307			else
2308				error ( new PdfToTextException ( "Invalid value \"$value\" for Adobe extra mapping \"$key\"." ) ) ;
2309
2310			// Add this current mapping to the Adobe extra mappings array
2311			$this -> AdobeExtraMappings [ $key ]	=  $items ;
2312		    }
2313	    }
2314
2315
2316	/*--------------------------------------------------------------------------------------------------------------
2317
2318	    NAME
2319	        GetPageFromOffset - Returns a page number from a text offset.
2320
2321	    PROTOTYPE
	        $page		=  $pdf -> GetPageFromOffset ( $offset ) ;
2323
2324	    DESCRIPTION
2325	        Given a byte offset in the Text property, returns its page number in the pdf document.
2326
2327	    PARAMETERS
2328	        $offset (integer) -
2329	                Offset, in the Text property, whose page number is to be retrieved.
2330
2331	    RETURN VALUE
2332	        Returns a page number in the pdf document, or false if the specified offset does not exist.
2333
2334	 *-------------------------------------------------------------------------------------------------------------*/
2335	public function  GetPageFromOffset ( $offset )
2336	   {
2337		if  ( $offset  ===  false )
2338			return ( false ) ;
2339
2340		foreach  ( $this -> PageLocations  as  $page => $location )
2341		   {
2342			if  ( $offset  >=  $location [ 'start' ]  &&  $offset  <=  $location [ 'end' ] )
2343				return ( $page ) ;
2344		    }
2345
2346		return ( false ) ;
2347	    }
2348
2349
2350	/*--------------------------------------------------------------------------------------------------------------
2351
2352	    NAME
2353	        text_strpos, text_stripos - Search for an occurrence of a string.
2354
2355	    PROTOTYPE
2356	        $result		=  $pdf -> text_strpos  ( $search, $start = 0 ) ;
2357	        $result		=  $pdf -> text_stripos ( $search, $start = 0 ) ;
2358
2359	    DESCRIPTION
2360	        These methods behave as the strpos/stripos PHP functions, except that :
2361		- They operate on the text contents of the pdf file (Text property)
2362		- They return an array containing the page number and text offset. $result [0] will be set to the page
2363		  number of the searched text, and $result [1] to its offset in the Text property
2364
2365	    PARAMETERS
2366	        $search (string) -
2367	                String to be searched.
2368
2369		$start (integer) -
2370			Start offset in the pdf text contents.
2371
2372	    RETURN VALUE
2373	        Returns an array of two values containing the page number and text offset if the searched string has
2374		been found, or false otherwise.
2375
2376	 *-------------------------------------------------------------------------------------------------------------*/
2377	public function  text_strpos ( $search, $start = 0 )
2378	   {
2379		$offset		=  mb_strpos ( $this -> Text, $search, $start, 'UTF-8' ) ;
2380
2381		if  ( $offset  !==  false )
2382			return ( array ( $this -> GetPageFromOffset ( $offset ), $offset ) ) ;
2383
2384		return ( false ) ;
2385	    }
2386
2387
2388	public function  text_stripos ( $search, $start = 0 )
2389	   {
2390		$offset		=  mb_stripos ( $this -> Text, $search, $start, 'UTF-8' ) ;
2391
2392		if  ( $offset  !==  false )
2393			return ( array ( $this -> GetPageFromOffset ( $offset ), $offset ) ) ;
2394
2395		return ( false ) ;
2396	    }
2397
2398
2399
2400
2401	/*--------------------------------------------------------------------------------------------------------------
2402
2403	    NAME
2404	        document_strpos, document_stripos - Search for all occurrences of a string.
2405
2406	    PROTOTYPE
2407	        $result		=  $pdf -> document_strpos  ( $search, $group_by_page = false ) ;
2408	        $result		=  $pdf -> document_stripos ( $search, $group_by_page = false ) ;
2409
2410	    DESCRIPTION
2411		Searches for ALL occurrences of a given string in the pdf document. The value of the $group_by_page
2412		parameter determines how the results are returned :
2413		- When true, the returned value will be an associative array whose keys will be page numbers and values
2414		  arrays of offset of the found string within the page
2415		- When false, the returned value will be an array of arrays containing two entries : the page number
2416		  and the text offset.
2417
2418		For example, if a pdf document contains the string "here" at character offset 100 and 200 in page 1, and
2419		position 157 in page 3, the returned value will be :
2420		- When $group_by_page is false :
2421			[ [ 1, 100 ], [ 1, 200 ], [ 3, 157 ] ]
2422		- When $group_by_page is true :
2423			[ 1 => [ 100, 200 ], 3 => [ 157 ] ]
2424
2425	    PARAMETERS
2426	        $search (string) -
2427	                String to be searched.
2428
2429		$group_by_page (boolean) -
2430			Indicates whether the found offsets should be grouped by page number or not.
2431
2432	    RETURN VALUE
	        Returns an array of page numbers/character offsets (see Description above), which is empty when the
		string does not appear in the document, or false if the searched string is empty.
2435
2436	 *-------------------------------------------------------------------------------------------------------------*/
2437	public function  document_strpos ( $text, $group_by_page = false )
2438	   {
2439		$length		=  strlen ( $text ) ;
2440
2441		if  ( ! $length )
2442			return ( false ) ;
2443
2444		$result		=  array ( ) ;
2445		$index		=  0 ;
2446
2447		while ( ( $index =  mb_strpos ( $this -> Text, $text, $index, 'UTF-8' ) )  !==  false )
2448		   {
2449			$page	=  $this -> GetPageFromOffset ( $index ) ;
2450
2451			if  ( $group_by_page )
2452				$result [ $page ] []	=  $index ;
2453			else
2454				$result []		=  array ( $page, $index ) ;
2455
2456			$index	+=  $length ;
2457		    }
2458
2459		return ( $result ) ;
2460	    }
2461
2462
2463	public function  document_stripos ( $text, $group_by_page = false )
2464	   {
2465		$length		=  strlen ( $text ) ;
2466
2467		if  ( ! $length )
2468			return ( false ) ;
2469
2470		$result		=  array ( ) ;
2471		$index		=  0 ;
2472
2473		while ( ( $index =  mb_stripos ( $this -> Text, $text, $index, 'UTF-8' ) )  !==  false )
2474		   {
2475			$page	=  $this -> GetPageFromOffset ( $index ) ;
2476
2477			if  ( $group_by_page )
2478				$result [ $page ] []	=  $index ;
2479			else
2480				$result []		=  array ( $page, $index ) ;
2481
2482			$index	+=  $length ;
2483		    }
2484
2485		return ( $result ) ;
2486	    }
2487
2488
2489	/*--------------------------------------------------------------------------------------------------------------
2490
2491	    NAME
2492	        text_match, document_match - Search string using regular expressions.
2493
2494	    PROTOTYPE
2495	        $status		=  $pdf -> text_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ;
2496	        $status		=  $pdf -> document_match ( $pattern, &$match = null, $flags = 0, $offset = 0 ) ;
2497
2498	    DESCRIPTION
2499	        text_match() calls the preg_match() PHP function on the pdf text contents, to locate the first occurrence
2500		of text that matches the specified regular expression.
2501		document_match() calls the preg_match_all() function to locate all occurrences that match the specified
2502		regular expression.
2503		Note that both methods add the PREG_OFFSET_CAPTURE flag when calling preg_match/preg_match_all so you
2504		should be aware that all captured results are an array containing the following entries :
2505		- Item [0] is the captured string
2506		- Item [1] is its text offset
2507		- The text_match() and document_match() methods add an extra array item (index 2), which contains the
2508		  page number where the matched text resides
2509
2510	    PARAMETERS
2511	        $pattern (string) -
2512	                Regular expression to be searched.
2513
2514		$match (any) -
2515			Output captures. See preg_match/preg_match_all.
2516
2517		$flags (integer) -
2518			PCRE flags. See preg_match/preg_match_all.
2519
2520		$offset (integer) -
2521			Start offset. See preg_match/preg_match_all.
2522
2523	    RETURN VALUE
2524	        Returns the number of matched occurrences, or false if the specified regular expression is invalid.
2525
2526	 *-------------------------------------------------------------------------------------------------------------*/
2527	public function  text_match ( $pattern, &$match = null, $flags = 0, $offset = 0 )
2528	   {
2529		$local_match	=  null ;
2530		$status		=  preg_match ( $pattern, $this -> Text, $local_match, $flags | PREG_OFFSET_CAPTURE, $offset ) ;
2531
2532		if  ( $status )
2533		   {
2534			foreach  ( $local_match  as  &$entry )
2535				$entry [2]	=  $this -> GetPageFromOffset ( $entry [1] ) ;
2536
2537			$match	=  $local_match ;
2538		    }
2539
2540		return ( $status ) ;
2541	    }
2542
2543
2544	public function  document_match ( $pattern, &$matches = null, $flags = 0, $offset = 0 )
2545	   {
2546		$local_matches	=  null ;
2547		$status		=  preg_match_all ( $pattern, $this -> Text, $local_matches, $flags | PREG_OFFSET_CAPTURE, $offset ) ;
2548
2549		if  ( $status )
2550		   {
2551			foreach  ( $local_matches  as  &$entry )
2552			   {
2553				foreach  ( $entry  as  &$subentry )
2554				$subentry [2]	=  $this -> GetPageFromOffset ( $subentry [1] ) ;
2555			    }
2556
2557			$matches	=  $local_matches ;
2558		    }
2559
2560		return ( $status ) ;
2561	    }
2562
2563
2564	/*--------------------------------------------------------------------------------------------------------------
2565
2566	    HasFormData -
		Returns true if the PDF file contains form data, false otherwise.
2568
2569	 *-------------------------------------------------------------------------------------------------------------*/
2570	public function  HasFormData ( )
2571	   {
2572		return ( count ( $this -> FormData )  >  0 ) ;
2573	    }
2574
2575
2576	/*--------------------------------------------------------------------------------------------------------------
2577
2578	    GetFormCount -
2579		Returns the number of top-level forms contained in the PDF file.
2580
2581	 *-------------------------------------------------------------------------------------------------------------*/
2582	public function  GetFormCount ( )
2583	   {
2584		return ( count ( $this -> FormData ) ) ;
2585	    }
2586
2587
2588	/*--------------------------------------------------------------------------------------------------------------
2589
2590	    NAME
2591	        GetFormData - Returns form data, if any
2592
2593	    PROTOTYPE
2594	        $object		=  $pdf -> GetFormData ( $template = null, $form_index = 0 ) ;
2595
2596	    DESCRIPTION
2597	        Retrieves form data if present.
2598
2599	    PARAMETERS
2600	        $template (string) -
2601	                An XML file describing form data using human-readable names for field values.
2602			If not specified, the inline form definitions will be used, together with the field names
2603			specified in the PDF file.
2604
2605		$form_index (integer) -
2606			Form index in the PDF file. So far, I really don't know if a PDF file can have multiple forms.
2607
2608	    RETURN VALUE
2609	        An object derived from the PdfToTextFormData class.
2610
2611	 *-------------------------------------------------------------------------------------------------------------*/
2612	public function  GetFormData ( $template = null, $form_index = 0 )
2613	   {
2614		if  ( isset ( $this -> FormDataObjects [ $form_index ] ) )
2615			return ( $this -> FormDataObjects [ $form_index ] ) ;
2616
2617		if  ( $form_index  >  count ( $this -> FormDataObjectNumbers ) )
2618			error ( new PdfToTextFormException ( "Invalid form index #$form_index." ) ) ;
2619
2620		$form_data	=  $this -> FormData [ $this -> FormDataObjectNumbers [ $form_index ] ] ;
2621
2622		if  ( $template )
2623		   {
2624			if  ( ! file_exists ( $template ) )
2625				error ( new PdfToTextFormException ( "Form data template file \"$template\" not found." ) ) ;
2626
2627			$xml_data	=  file_get_contents ( $template ) ;
2628			$definitions	=  new PdfToTextFormDefinitions ( $xml_data, $form_data [ 'form' ] ) ; ;
2629		    }
2630		else
2631		   {
2632			$definitions	=  new PdfToTextFormDefinitions ( null, $form_data [ 'form' ] ) ;
2633		    }
2634
2635		$object		=  $definitions [ $form_index ]	-> GetFormDataFromPdfObject ( $form_data [ 'values' ] ) ;
2636
2637		$this -> FormDataDefinitions []		=  $definitions ;
2638		$this -> FormDataObjects []		=  $object ;
2639
2640		return ( $object ) ;
2641	    }
2642
2643
2644	/*--------------------------------------------------------------------------------------------------------------
2645
2646	    NAME
2647	        MarkTextLike - Marks output text.
2648
2649	    PROTOTYPE
2650	        $pdf -> MarkTextLike ( $regex, $marker_start, $marker_end ) ;
2651
2652	    DESCRIPTION
2653	        Sometimes it may be convenient, when you want to extract only a portion of text, to say : "I want to
2654		extract text between this title and this title". The MarkTextLike() method provides some support for
2655		such a task. Imagine you have documents that have the same structure, all starting with an "Introduction"
2656		title :
2657
2658			Introduction
2659				...
2660				some text
2661				...
2662			Some other title
2663				...
2664
2665		By calling the MarkTextLike() method such as in the example below :
2666
			$pdf -> MarkTextLike ( '/\bIntroduction\b/', '<M>', '</M>' ) ;
2668
2669		then you will get as output :
2670
2671			<M>Introduction</M>
2672				...
2673				some text
2674				...
2675			<M>Some other title</M>
2676
2677		Adding such markers in the output will allow you to easily extract the text between the chapters
2678		"Introduction" and "Some other title", using a regular expression.
2679
2680		The font name used for the first string matched by the specified regular expression will be searched
2681		later to add markers around all the text portions using this font.
2682
2683
2684	    PARAMETERS
2685	        $regex (string) -
2686	                A regular expression to match the text to be matched. Subsequent portions of text using the
2687			same font will be surrounded by the marker start/end strings.
2688
2689		$marker_start, $marker_end (string) -
2690			Markers to surround the string when a match is found.
2691
2692	 *-------------------------------------------------------------------------------------------------------------*/
2693	public function  MarkTextLike ( $regex, $marker_start, $marker_end )
2694	   {
2695		$this -> UnprocessedMarkerList [ 'font' ] []	=  array
2696		   (
2697			'regex'		=>  $regex,
2698			'start'		=>  $marker_start,
2699			'end'		=>  $marker_end
2700		    ) ;
2701	    }
2702
2703
2704	/*--------------------------------------------------------------------------------------------------------------
2705
2706	    NAME
2707	        SetCaptures, SetCapturesFromString - Defines document parts to be captured.
2708
2709	    PROTOTYPE
2710	        $pdf -> SetCaptures ( $xml_file ) ;
2711		$pdf -> SetCapturesFromString ( $xml_data ) ;
2712
2713	    DESCRIPTION
2714	        Defines document parts to be captured.
2715		SetCaptures() takes the definitions for the areas to be captured from an XML file, while
2716		SetCapturesFromString() takes them from a string representing xml capture definitions.
2717
2718	    NOTES
2719	        - See file README.md for an explanation on the format of the XML capture definition file.
2720		- The SetCaptures() methods must be called before the Load() method.
2721
2722	 *-------------------------------------------------------------------------------------------------------------*/
2723	public function  SetCaptures ( $xml_file )
2724	   {
2725		if  ( ! file_exists ( $xml_file ) )
2726			error ( new PdfToTextException ( "File \"$xml_file\" does not exist." ) ) ;
2727
2728		$xml_data	=  file_get_contents ( $xml_file ) ;
2729
2730		$this -> SetCapturesFromString ( $xml_data )  ;
2731
2732	    }
2733
2734
2735	public function  SetCapturesFromString ( $xml_data )
2736	   {
2737		// Setting capture areas implies having the PDFOPT_BASIC_LAYOUT option
2738		$this -> Options	|=  self::PDFOPT_BASIC_LAYOUT ;
2739
2740		$this -> CaptureDefinitions		=  new PdfToTextCaptureDefinitions ( $xml_data ) ;
2741	    }
2742
2743
2744	/*--------------------------------------------------------------------------------------------------------------
2745
2746	    NAME
2747	        GetCaptures - Returns captured data.
2748
2749	    PROTOTYPE
2750	        $object		=  $pdf -> GetCaptures ( $full = false ) ;
2751
2752	    PARAMETERS
2753		$full (boolean) -
			When true, the whole captures, together with their definitions, are returned. When false,
2755			only a basic object containing the capture names and their values is returned.
2756
2757	    DESCRIPTION
2758	        Returns the object that contains captured data.
2759
2760	    RETURN VALUE
2761	        An object of type PdfToTextCaptures, or false if an error occurred.
2762
2763	 *-------------------------------------------------------------------------------------------------------------*/
2764	public function  GetCaptures ( $full = false )
2765	   {
2766		if  ( ! $this -> CaptureObject )
2767		   {
2768			$this -> CaptureDefinitions -> SetPageCount ( count ( $this -> Pages ) ) ;
2769			$this -> CaptureObject	=  $this -> CaptureDefinitions -> GetCapturedObject ( $this -> DocumentFragments ) ;
2770		    }
2771
2772		if  ( $full )
2773			return ( $this -> CaptureObject ) ;
2774		else
2775			return ( $this -> CaptureObject -> ToCaptures ( ) ) ;
2776	    }
2777
2778
2779	/**************************************************************************************************************
2780	 **************************************************************************************************************
2781	 **************************************************************************************************************
2782	 ******                                                                                                  ******
2783	 ******                                                                                                  ******
2784	 ******                                         INTERNAL METHODS                                         ******
2785	 ******                                                                                                  ******
2786	 ******                                                                                                  ******
2787	 **************************************************************************************************************
2788	 **************************************************************************************************************
2789	 **************************************************************************************************************/
2790
2791	/*--------------------------------------------------------------------------------------------------------------
2792
2793	    NAME
2794	        AddImage - Adds an image from the PDF stream to the current object.
2795
2796	    PROTOTYPE
2797	        $this -> AddImage ( $object_id, $stream_data, $type, $object_data ) ;
2798
2799	    DESCRIPTION
2800	        Adds an image from the PDF stream to the current object.
2801		If the PDFOPT_GET_IMAGE_DATA flag is enabled, image data will be added to the ImageData property.
2802		If the PDFOPT_DECODE_IMAGE_DATA flag is enabled, a jpeg resource will be created and added into the
2803		Images array property.
2804
2805	    PARAMETERS
2806	        $object_id (integer) -
2807	                Pdf object id.
2808
2809		$stream_data (string) -
2810			Contents of the unprocessed stream data containing the image.
2811
2812		$type (integer) -
2813			One of the PdfToText::PDF_*_ENCODING constants.
2814
2815	 *-------------------------------------------------------------------------------------------------------------*/
2816	protected function  AddImage ( $object_id, $stream_data, $type, $object_data )
2817	   {
2818
2819		if  ( self::$DEBUG  &&   $this -> Options  &  self::PDFOPT_GET_IMAGE_DATA )
2820		    {
2821			switch  ( $type )
2822			   {
2823				case	self::PDF_DCT_ENCODING :
2824					$this -> ImageData	=  array ( 'type' => 'jpeg', 'data' => $stream_data ) ;
2825					break ;
2826			    }
2827
2828		     }
2829
2830
2831		if  ( $this -> Options  &  self::PDFOPT_DECODE_IMAGE_DATA  &&
2832			( ! $this -> MaxExtractedImages  ||  $this -> ImageCount  <  $this -> MaxExtractedImages ) )
2833		   {
2834			$image	=  $this -> DecodeImage ( $object_id, $stream_data, $type, $object_data, $this -> Options  &  self::PDFOPT_AUTOSAVE_IMAGES ) ;
2835
2836			if  ( $image  !==  false )
2837			   {
2838				$this -> ImageCount ++ ;
2839
2840				// When the PDFOPT_AUTOSAVE_IMAGES flag is set, we simply use a template filename to generate a real output filename
2841				// then save the image to that file. The memory is freed after that.
2842				if  ( $this -> Options  &  self::PDFOPT_AUTOSAVE_IMAGES )
2843				   {
2844					$output_filename 			=  $this -> __get_output_image_filename ( ) ;
2845
2846					$image -> SaveAs ( $output_filename, $this -> ImageAutoSaveFormat ) ;
2847					unset ( $image ) ;
2848
2849					$this -> AutoSavedImageFiles []		=  $output_filename ;
2850				    }
2851				// Otherwise, simply store the image data into memory
2852				else
2853					$this -> Images []	=  $image ;
2854			    }
2855		    }
2856	    }
2857
2858
2859	/*--------------------------------------------------------------------------------------------------------------
2860
2861	    NAME
2862	        DecodeData - Decodes stream data.
2863
2864	    PROTOTYPE
	        $data	=  $this -> DecodeData ( $object_id, $stream_data, $type, $object_data ) ;
2866
2867	    DESCRIPTION
	        Decodes stream data (binary data located between the "stream" and "endstream" directives) according to the
2869		specified encoding type, given in the surrounding object parameters.
2870
2871	    PARAMETERS
2872		$object_id (integer) -
2873			Id of the object containing the data.
2874
2875	        $stream_data (string) -
2876	                Contents of the binary stream.
2877
2878		$type (integer) -
2879			One of the PDF_*_ENCODING constants, as returned by the GetEncodingType() method.
2880
2881	    RETURN VALUE
2882	        Returns the decoded stream data.
2883
2884	 *-------------------------------------------------------------------------------------------------------------*/
2885	protected function  DecodeData ( $object_id, $stream_data, $type, $object_data )
2886	   {
2887		$decoded_stream_data 	=  '' ;
2888
2889		switch  ( $type )
2890		   {
2891		   	case 	self::PDF_FLATE_ENCODING :
2892				// Objects in password-protected Pdf files SHOULD be encrypted ; however, it happens that we may encounter normal,
2893				// unencrypted ones. This is why we always try to gzuncompress them first then, if failed, try to decrypt them
2894		   		$decoded_stream_data 	=  @gzuncompress ( $stream_data ) ;
2895
2896				if  ( $decoded_stream_data  ===  false )
2897				   {
2898					if  ( $this -> IsEncrypted )
2899					   {
2900						$decoded_stream_data	=  $this -> EncryptionData -> Decrypt ( $object_id, $stream_data ) ;
2901
2902						if  ( $decoded_stream_data  ===  false )
2903						   {
2904							if  ( self::$DEBUG  >  1 )
2905								warning ( new PdfToTextDecodingException ( "Unable to decrypt object contents.", $object_id ) ) ;
2906						    }
2907					    }
2908					else if  ( self::$DEBUG  >  1 )
2909						warning ( new PdfToTextDecodingException ( "Invalid gzip data.", $object_id ) ) ;
2910				    }
2911
2912		   		break ;
2913
2914			case	self::PDF_LZW_ENCODING :
2915				$decoded_stream_data	=  $this -> __decode_lzw ( $stream_data ) ;
2916				break ;
2917
2918		   	case 	self::PDF_ASCIIHEX_ENCODING :
2919		   		$decoded_stream_data 	=  $this -> __decode_ascii_hex ( $stream_data ) ;
2920		   		break ;
2921
2922			case 	self::PDF_ASCII85_ENCODING :
2923				$decoded_stream_data 	=  $this -> __decode_ascii_85 ( $stream_data ) ;
2924
2925				// Dumbly check if this could not be gzipped data after decoding (normally, the object flags should also specify
2926				// the /FlateDecode flag)
2927				if  ( $decoded_stream_data  !==  false  &&  ( $result = @gzuncompress ( $decoded_stream_data ) )  !==  false )
2928					$decoded_stream_data  =  $result ;
2929
2930				break ;
2931
2932			case	self::PDF_TEXT_ENCODING :
2933				$decoded_stream_data	=  $stream_data ;
2934				break ;
2935		    }
2936
2937		return ( $decoded_stream_data ) ;
2938	    }
2939
2940
	// __decode_lzw -
	//	Decoding function for LZW-compressed data. This function is largely inspired by the TCPDF one but has been
	//	rewritten for a performance gain of 30-35%.
2944	private function   __decode_lzw ( $data )
2945	   {
2946		// The initial dictionary contains 256 entries where each index is equal to its character representation
2947		static $InitialDictionary      =  array
2948		   (
2949			"\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", "\x09", "\x0A", "\x0B", "\x0C", "\x0D", "\x0E", "\x0F",
2950			"\x10", "\x11", "\x12", "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1A", "\x1B", "\x1C", "\x1D", "\x1E", "\x1F",
2951			"\x20", "\x21", "\x22", "\x23", "\x24", "\x25", "\x26", "\x27", "\x28", "\x29", "\x2A", "\x2B", "\x2C", "\x2D", "\x2E", "\x2F",
2952			"\x30", "\x31", "\x32", "\x33", "\x34", "\x35", "\x36", "\x37", "\x38", "\x39", "\x3A", "\x3B", "\x3C", "\x3D", "\x3E", "\x3F",
2953			"\x40", "\x41", "\x42", "\x43", "\x44", "\x45", "\x46", "\x47", "\x48", "\x49", "\x4A", "\x4B", "\x4C", "\x4D", "\x4E", "\x4F",
2954			"\x50", "\x51", "\x52", "\x53", "\x54", "\x55", "\x56", "\x57", "\x58", "\x59", "\x5A", "\x5B", "\x5C", "\x5D", "\x5E", "\x5F",
2955			"\x60", "\x61", "\x62", "\x63", "\x64", "\x65", "\x66", "\x67", "\x68", "\x69", "\x6A", "\x6B", "\x6C", "\x6D", "\x6E", "\x6F",
2956			"\x70", "\x71", "\x72", "\x73", "\x74", "\x75", "\x76", "\x77", "\x78", "\x79", "\x7A", "\x7B", "\x7C", "\x7D", "\x7E", "\x7F",
2957			"\x80", "\x81", "\x82", "\x83", "\x84", "\x85", "\x86", "\x87", "\x88", "\x89", "\x8A", "\x8B", "\x8C", "\x8D", "\x8E", "\x8F",
2958			"\x90", "\x91", "\x92", "\x93", "\x94", "\x95", "\x96", "\x97", "\x98", "\x99", "\x9A", "\x9B", "\x9C", "\x9D", "\x9E", "\x9F",
2959			"\xA0", "\xA1", "\xA2", "\xA3", "\xA4", "\xA5", "\xA6", "\xA7", "\xA8", "\xA9", "\xAA", "\xAB", "\xAC", "\xAD", "\xAE", "\xAF",
2960			"\xB0", "\xB1", "\xB2", "\xB3", "\xB4", "\xB5", "\xB6", "\xB7", "\xB8", "\xB9", "\xBA", "\xBB", "\xBC", "\xBD", "\xBE", "\xBF",
2961			"\xC0", "\xC1", "\xC2", "\xC3", "\xC4", "\xC5", "\xC6", "\xC7", "\xC8", "\xC9", "\xCA", "\xCB", "\xCC", "\xCD", "\xCE", "\xCF",
2962			"\xD0", "\xD1", "\xD2", "\xD3", "\xD4", "\xD5", "\xD6", "\xD7", "\xD8", "\xD9", "\xDA", "\xDB", "\xDC", "\xDD", "\xDE", "\xDF",
2963			"\xE0", "\xE1", "\xE2", "\xE3", "\xE4", "\xE5", "\xE6", "\xE7", "\xE8", "\xE9", "\xEA", "\xEB", "\xEC", "\xED", "\xEE", "\xEF",
2964			"\xF0", "\xF1", "\xF2", "\xF3", "\xF4", "\xF5", "\xF6", "\xF7", "\xF8", "\xF9", "\xFA", "\xFB", "\xFC", "\xFD", "\xFE", "\xFF"
2965		    ) ;
2966
2967		// Dictionary lengths - when we reach one of the values specified as the key, we have to set the bit length to the corresponding value
2968		static  $DictionaryLengths	=  array
2969		   (
2970			511		=>  10,
2971			1023		=>  11,
2972			2047		=>  12
2973		    ) ;
2974
2975		// Decoded string to be returned
2976		$result		=  '' ;
2977
2978		// Convert string to binary string
2979		$bit_string	=  '' ;
2980		$data_length	=  strlen ( $data ) ;
2981
2982		for  ( $i = 0 ; $i  <  $data_length ; $i ++ )
2983			$bit_string	.=  sprintf ( '%08b', ord ( $data[$i] ) ) ;
2984
2985		$data_length	*=  8 ;
2986
2987		// Initialize dictionary
2988		$bit_length		=  9 ;
2989		$dictionary_index	=  258 ;
2990		$dictionary		=  $InitialDictionary ;
2991
2992		// Previous value
2993		$previous_index		=  0 ;
2994
2995		// Start index in bit string
2996		$start_index		=  0 ;
2997
2998		// Until we encounter the EOD marker (257), read $bit_length bits
2999		while  ( ( $start_index  <  $data_length )  &&  ( ( $index = bindec ( substr ( $bit_string, $start_index, $bit_length ) ) )  !==  257 ) )
3000		   {
3001			// Move to next bit position
3002			$start_index	+=  $bit_length ;
3003
3004			if  ( $index  !==  256  &&  $previous_index  !==  256 )
3005			    {
3006				// Check if index exists in the dictionary and remember it
3007				if  ( $index  <  $dictionary_index )
3008				   {
3009					$result			.=  $dictionary [ $index ] ;
3010					$dictionary_value	 =  $dictionary [ $previous_index ] . $dictionary [ $index ] [0] ;
3011					$previous_index		 =  $index ;
3012				    }
3013				// Index does not exist - add it to the dictionary
3014				else
3015				   {
3016					$dictionary_value	 =  $dictionary [ $previous_index ] . $dictionary [ $previous_index ] [0] ;
3017					$result			.=  $dictionary_value ;
3018				    }
3019
3020				// Update dictionary
3021				$dictionary [ $dictionary_index ++ ]	=  $dictionary_value ;
3022
3023				// Change bit length whenever we reach an index limit
3024				if  ( isset ( $DictionaryLengths [ $dictionary_index ] ) )
3025					$bit_length	=  $DictionaryLengths [ $dictionary_index ] ;
3026			    }
3027			// Clear table marker
3028			else if  ( $index  ===  256)
3029			   {
3030				// Reset dictionary and bit length
3031				// Reset dictionary and bit length
3032				$bit_length		=  9 ;
3033				$dictionary_index	=  258 ;
3034				$previous_index		=  256 ;
3035				$dictionary		=  $InitialDictionary ;
3036			     }
3037			// First entry
3038			else	// $previous_index  === 256
3039			   {
3040				// first entry
3041				$result		.=  $dictionary [ $index ] ;
3042				$previous_index  =  $index ;
3043			    }
3044		    }
3045
3046		// All done, return
3047		return ( $result ) ;
3048	   }
3049
3050
3051	// __decode_ascii_hex -
3052	//	Decoder for /AsciiHexDecode streams.
3053	private function __decode_ascii_hex ( $input )
3054	    {
3055	    	$output 	=  "" ;
3056	    	$is_odd 		=  true ;
3057	    	$is_comment 	=  false ;
3058
3059	    	for  ( $i = 0, $codeHigh =  -1 ; $i  <  strlen ( $input )  &&  $input [ $i ]  !=  '>' ; $i++ )
3060	    	   {
3061	    		$c 	=  $input [ $i ] ;
3062
3063	    		if  ( $is_comment )
3064	    		   {
3065	    			if   ( $c  ==  '\r'  ||  $c  ==  '\n' )
3066	    				$is_comment 	=  false ;
3067
3068	    			continue;
3069	    		    }
3070
3071	    		switch  ( $c )
3072	    		   {
3073	    			case  '\0' :
3074	    			case  '\t' :
3075	    			case  '\r' :
3076	    			case  '\f' :
3077	    			case  '\n' :
3078	    			case  ' '  :
3079	    				break ;
3080
3081	    			case '%' :
3082	    				$is_comment 	=  true ;
3083	    				break ;
3084
3085	    			default :
3086	    				$code 	=  hexdec ( $c ) ;
3087
3088	    				if  ( $code  ===  0  &&  $c  !=  '0' )
3089	    					return ( '' ) ;
3090
3091	    				if  ( $is_odd )
3092	    					$codeHigh 	 =  $code ;
3093					else
3094	    					$output 	.=  chr ( ( $codeHigh << 4 ) | $code ) ;
3095
3096	    				$is_odd 	=  ! $is_odd ;
3097	    				break ;
3098	    		    }
3099	    	    }
3100
3101	    	if  ( $input [ $i ]  !=  '>' )
3102	    		return ( '' ) ;
3103
3104	    	if  ( $is_odd )
3105	    		$output 	.=  chr ( $codeHigh << 4 ) ;
3106
3107	    	return ( $output ) ;
3108	    }
3109
3110
3111	// __decode_ascii_85 -
3112	//	Decoder for /Ascii85Decode streams.
3113	private function  __decode_ascii_85 ( $data )
3114	   {
3115		// Ordinal value of the first character used in Ascii85 encoding
3116		static	$first_ord	=  33 ;
3117		// "A 'z' in the input data means "sequence of 4 nuls"
3118		static	$z_exception	=  "\0\0\0\0" ;
3119		// Powers of 85, from 4 to 0
3120		static	$exp85		=  array ( 52200625, 614125, 7225, 85, 1 ) ;
3121
3122		// Ignore empty data
3123		if  ( $data  ===  '' )
3124			return ( false ) ;
3125
3126		$data_length	=  strlen ( $data ) ;
3127		$ords		=  array ( ) ;
3128		$ord_count	=  0 ;
3129		$result		=  '' ;
3130
3131		// Paranoia : Ascii85 data may start with '<~' (but it always end with '~>'). Anyway, we must start past this construct if present
3132		if  ( $data [0]  ==  '<'  &&  $data [1]  ==  '~' )
3133			$start	=  2 ;
3134		else
3135			$start	=  0 ;
3136
3137		// Loop through nput characters
3138		for  ( $i = $start ; $i  <  $data_length  &&  $data [$i]  !=  '~' ; $i ++ )
3139		   {
3140			$ch	=  $data [$i] ;
3141
3142			// Most common case : current character is in the range of the Ascii85 encoding ('!'..'u')
3143			if  ( $ch  >=  '!'  &&  $ch  <=  'u' )
3144				$ords [ $ord_count ++ ]		=  ord ( $ch ) - $first_ord ;
3145			// 'z' is replaced with a sequence of null bytes
3146			else if  ( $ch  ==  'z'  &&  ! $ord_count )
3147				$result		.=  $z_exception ;
3148			// Spaces are ignored
3149			else if  ( $ch  !==  "\0"  &&  $ch  !==  "\t"  &&  $ch  !==  ' '  &&  $ch  !==  "\r"  &&  $ch  !==  "\n"  &&  $ch  !==  "\f" )
3150				continue ;
3151			// Other characters : corrupted data...
3152			else
3153				return ( false ) ;
3154
3155			// We have collected 5 characters in base 85 : convert their 32-bits value to base 2 (3 characters)
3156			if  ( $ord_count  ==  5 )
3157			   {
3158				$ord_count	=  0 ;
3159
3160    				for  ( $sum = 0, $j = 0  ; $j  <  5  ; $j ++ )
3161    					$sum 	=  ( $sum * 85 ) + $ords [ $j ] ;
3162
3163    				for ( $j = 3  ; $j  >=  0  ; $j -- )
3164    					$result 	.=  chr ( $sum >> ( $j * 8 ) ) ;
3165			    }
3166		    }
3167
3168		// A last processing for the potential remaining bytes
3169		// Notes : this situation has never been tested
3170		if  ( $ord_count )
3171    		   {
3172    			for  ( $i = 0, $sum = 0  ; $i  <  $ord_count  ; $i++ )
3173    				$sum 	+= ( $ords [ $i ] + ( $i == $ord_count - 1 ) ) * $exp85 [$i] ;
3174
3175    			for  ( $i = 0  ; $i  <  $ord_count - 1  ; $i++ )
3176    				$result 	.=  chr ( $sum >> ( ( 3 - $i ) * 8 ) ) ;
3177    		    }
3178
3179		// All done, return
3180		return ( $result ) ;
3181	    }
3182
3183
3184	/*--------------------------------------------------------------------------------------------------------------
3185
3186	    NAME
3187	        DecodeImage - Returns decoded image contents.
3188
3189	    PROTOTYPE
3190	        TBC
3191
3192	    DESCRIPTION
3193	        description
3194
3195	    PARAMETERS
3196	        $object_id (integer) -
3197	                Pdf object number.
3198
3199		$stream_data (string) -
3200			Object data.
3201
3202		$type (integer) -
3203			One of the PdfToText::PDF_*_ENCODING constants.
3204
3205		$autosave (boolean) -
3206			When autosave is selected, images will not be decoded into memory unless they have a format
3207			different from JPEG. This is intended to save memory.
3208
3209	    RETURN VALUE
3210	        Returns an object of type PdfIMage, or false if the image encoding type is not currently supported.
3211
3212	 *-------------------------------------------------------------------------------------------------------------*/
3213	protected function  DecodeImage ( $object_id, $stream_data, $type, $object_data, $autosave )
3214	   {
3215		switch  ( $type )
3216		   {
3217			// Normal JPEG image
3218			case	self::PDF_DCT_ENCODING :
3219				return ( new PdfJpegImage ( $stream_data, $autosave ) ) ;
3220
3221			// CCITT fax image
3222			case	self::PDF_CCITT_FAX_ENCODING :
3223				return ( new PdfFaxImage ( $stream_data ) ) ;
3224
3225			// For now, I have not found enough information to be able to decode image data in an inflated stream...
3226			// In some cases, however, this is JPEG data
3227			case	self::PDF_FLATE_ENCODING :
3228				$image		=  PdfInlinedImage::CreateInstance ( $stream_data, $object_data, $autosave ) ;
3229
3230				if  ( $image )
3231					return ( $image ) ;
3232
3233				break ;
3234
3235			default :
3236				return ( false ) ;
3237		    }
3238
3239		return ( false ) ;
3240	    }
3241
3242
3243	/*--------------------------------------------------------------------------------------------------------------
3244
3245	    NAME
3246	        DecodeObjectStream - Decodes an object stream.
3247
3248	    PROTOTYPE
3249	        $array	=  $this -> DecodeObjectStream ( $object_id, $object_data ) ;
3250
3251	    DESCRIPTION
3252	        Decodes an object stream. An object stream is yet another PDF object type that contains itself several
3253		objects not defined using the "x y obj ... endobj" syntax.
3254		As far as I understood, object streams data is contained within stream/endstream delimiters, and is
3255		gzipped.
3256		Object streams start with a set of object id/offset pairs separated by a space ; catenated object data
3257		immediately follows the last space ; for example :
3258
3259			1167 0 1168 114 <</DA(/Helv 0 Tf 0 g )/DR<</Encoding<</PDFDocEncoding 1096 0 R>>/Font<</Helv 1094 0 R/ZaDb 1095 0 R>>>>/Fields[]>>[/ICCBased 1156 0 R]
3260
3261		The above example specifies two objects :
3262			. Object #1167, which starts at offset 0 and ends before the second object, at offset #113 in
3263			  the data. The contents are :
3264				<</DA(/Helv 0 Tf 0 g )/DR<</Encoding<</PDFDocEncoding 1096 0 R>>/Font<</Helv 1094 0 R/ZaDb 1095 0 R>>>>/Fields[]>>
3265			. Object #1168, which starts at offset #114 and continues until the end of the object stream.
3266			  It contains the following data :
3267				[/ICCBased 1156 0 R]
3268
3269	    PARAMETERS
3270	        $object_id (integer) -
3271	                Pdf object number.
3272
3273		$object_data (string) -
3274			Object data.
3275
3276	    RETURN VALUE
3277	        Returns false if any error occurred (mainly for syntax reasons).
3278		Otherwise, returns an associative array containing the following elements :
3279		- object_id :
3280			Array of all the object ids contained in the object stream.
3281		- object :
3282			Array of corresponding object data.
3283
3284		The reason for this format is that it is identical to the array returned by the preg_match() function
3285		used in the Load() method for finding objects in a PDF file (ie, a regex that matches "x y oj/endobj"
3286		constructs).
3287
3288	 *-------------------------------------------------------------------------------------------------------------*/
3289	protected function  DecodeObjectStream ( $object_id, $object_data )
3290	   {
3291		// Extract gzipped data for this object
3292		if  ( preg_match ( '#[^/] stream ( (\r? \n) | \r ) (?P<stream> .*?) endstream#imsx', $object_data, $stream_match ) )
3293		    {
3294			$stream_data	=  $stream_match [ 'stream' ] ;
3295			$type 		=  $this -> GetEncodingType ( $object_id, $object_data ) ;
3296			$decoded_data	=  $this -> DecodeData ( $object_id, $stream_data, $type, $object_data ) ;
3297
3298			if  ( self::$DEBUG  >  1 )
3299				echo "\n----------------------------------- OBJSTREAM #$object_id\n$decoded_data" ;
3300		      }
3301		// Stay prepared to find one day a sample declared as an object stream but not having gzipped data delimited by stream/endstream tags
3302		else
3303		   {
3304			if  ( self::$DEBUG  >  1 )
3305				error ( new PdfToTextDecodingException ( "Found object stream without gzipped data", $object_id ) ) ;
3306
3307			return ( false ) ;
3308		    }
3309
3310		// Object streams data start with a series of object id/offset pairs. The offset is absolute to the first character
3311		// after the last space of these series.
3312		// Note : on Windows platforms, the default stack size is 1Mb. The following regular expression will make Apache crash in most cases,
3313		// so you have to enable the following lines in your http.ini file to set a stack size of 8Mb, as for Unix systems :
3314		//	Include conf/extra/httpd-mpm.conf
3315		//	ThreadStackSize 8388608
3316		if  ( ! preg_match ( '/^ \s* (?P<series> (\d+ \s* )+ )/x', $decoded_data, $series_match ) )
3317		   {
3318			if  ( self::$DEBUG  >  1 )
3319				error ( new PdfToTextDecodingException ( "Object stream does not start with integer object id/offset pairs.", $object_id ) ) ;
3320
3321			return ( false ) ;
3322		    }
3323
3324		// Extract the series of object id/offset pairs and the stream object data
3325		$series		=  explode ( ' ', rtrim ( preg_replace ( '/\s+/', ' ', $series_match [ 'series' ] ) ) ) ;
3326		$data		=  substr ( $decoded_data, strlen ( $series_match [ 'series' ] ) ) ;
3327
3328		// $series should contain an even number of values
3329		if  ( count ( $series ) % 2 )
3330		   {
3331			if  ( self::$DEBUG )
3332				warning ( new PdfToTextDecodingException ( "Object stream should start with an even number of integer values.", $object_id ) ) ;
3333
3334			array_pop ( $series ) ;
3335		    }
3336
3337		// Extract every individual object
3338		$objects	=  array ( 'object_id' => array ( ), 'object' => array ( ) ) ;
3339
3340		for  ( $i = 0, $count = count ( $series ) ; $i  <  $count ; $i += 2 )
3341		   {
3342			$object_id	=  ( integer ) $series [$i] ;
3343			$offset		=  ( integer ) $series [$i+1] ;
3344
3345			// If there is a "next" object, extract only a substring within the object stream contents
3346			if  ( isset ( $series [ $i + 3 ] ) )
3347				$object_contents	=  substr ( $data, $offset, $series [ $i + 3 ] - $offset ) ;
3348			// Otherwise, extract everything until the end
3349			else
3350				$object_contents	=  substr ( $data, $offset ) ;
3351
3352			$objects [ 'object_id'] []	=  $object_id ;
3353			$objects [ 'object'   ] []	=  $object_contents ;
3354		    }
3355
3356		return ( $objects ) ;
3357	    }
3358
3359
3360	/*--------------------------------------------------------------------------------------------------------------
3361
3362	    NAME
3363	        ExtractTextData - Extracts text, header & footer information from a text object.
3364
3365	    PROTOTYPE
3366	        $this -> ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer ) ;
3367
3368	    DESCRIPTION
3369	        Extracts text, header & footer information from a text object. The extracted text contents will be
3370		stripped from any header/footer information.
3371
3372	    PARAMETERS
3373	        $text (string) -
3374	                Variable that will receive text contents.
3375
3376		$header, $footer (string) -
3377			Variables that will receive header and footer information.
3378
3379	 *-------------------------------------------------------------------------------------------------------------*/
3380	protected function  ExtractTextData ( $object_id, $stream_contents, &$text, &$header, &$footer )
3381	   {
3382		// Normally, a header or footer is introduced with a construct like :
3383		//	<< /Type /Pagination ... [/Bottom] ... >> (or [/Top]
3384		// The initial regular expression was :
3385		//	<< .*? \[ \s* / (?P<location> (Bottom) | (Top) ) \s* \] .*? >> \s* BDC .*? EMC
3386		// (the data contained between the BDC and EMC instructions are text-drawing instructions).
3387		// However, this expression revealed to be too greedy and captured too much data ; in the following example :
3388		//	<</MCID 0>> ...(several kb of drawing instructions)... << ... [/Bottom] ... >> BDC (other drawing instructions for the page footer) EMC
3389		// everything was captured, from the initial "<<M/MCID 0>>" to the final "EMC", which caused regular page contents to be interpreted as page bottom
3390		// contents.
3391		// The ".*?" in the regex has been replaced with "[^>]*?", which works better. However, it will fail to recognize header/footer contents if
3392		// the header/footer declaration contains a nested construct , such as :
3393		//	<< /Type /Pagination ... [/Bottom] ... << (some nested contents) >> ... >> (or [/Top]
3394		// Let's wait for the case to happen one day...
3395		static		$header_or_footer_re	=  '#
3396								(?P<contents>
3397									<< [^>]*? \[ \s* / (?P<location> (Bottom) | (Top) ) \s* \] [^>]*? >> \s*
3398									BDC .*? EMC
3399								 )
3400							    #imsx' ;
3401
3402		$header		=
3403		$footer		=
3404		$text		=  '' ;
3405
3406		if  ( preg_match_all ( $header_or_footer_re, $stream_contents, $matches, PREG_OFFSET_CAPTURE ) )
3407		   {
3408			for  ( $i = 0, $count = count ( $matches [ 'contents' ] ) ; $i  <  $count ; $i ++ )
3409			   {
3410				if  ( ! strcasecmp ( $matches [ 'location' ] [$i] [0], 'Bottom' ) )
3411					$footer		=  $matches [ 'contents' ] [$i] [0] ;
3412				else
3413					$header		=  $matches [ 'contents' ] [$i] [0] ;
3414			    }
3415
3416			$text	=  preg_replace ( $header_or_footer_re, '', $stream_contents ) ;
3417		    }
3418		else
3419			$text	=  $stream_contents ;
3420	    }
3421
3422
3423	/*--------------------------------------------------------------------------------------------------------------
3424
3425	    NAME
3426		ExtractText - extracts text from a pdf stream.
3427
3428	    PROTOTYPE
3429		$text 	=  $this -> ExtractText ( $page_number, $object_id, $data, &$current_font ) ;
3430
3431	    DESCRIPTION
3432	        Extracts text from decoded stream contents.
3433
3434	    PARAMETERS
3435		$page_number (integer) -
			Page number that contains the text to be extracted.
3437
3438	    	$object_id (integer) -
3439	    		Object id of this text block.
3440
3441	    	$data (string) -
3442	    		Stream contents.
3443
3444		$current_font (integer) -
3445			Id of the current font, which should be found in the $this->FontTable property, if anything
3446			went ok.
3447			This parameter is required, since text blocks may not specify a new font resource id and reuse
			the one that was set before.
3449
3450	    RETURN VALUE
3451		Returns the decoded text.
3452
3453	    NOTES
3454		The PDF language can be seen as a stack-driven language  ; for example, the instruction defining a text
3455		matrix ( "Tm" ) expects 6 floating-point values from the stack :
3456
3457			0 0 0 0 x y Tm
3458
3459		It can also specify specific operators, such as /Rx, which sets font number "x" to be the current font,
3460		or even "<< >>" constructs that we can ignore during our process of extracting textual data.
3461		Actually, we only want to handle a very small subset of the Adobe drawing language ; These are :
3462		- "Tm" instructions, that specify, among others, the x and y coordinates of the next text to be output
3463		- "/R" instructions, that specify which font is to be used for the next text output. This is useful
3464		  only if the font has an associated character map.
3465		- "/F", same as "/R", but use a font map id instead of a direct object id.
3466		- Text, specified either using a single notation ( "(sometext)" ) or the array notation
3467		  ( "[(...)d1(...)d2...(...)]" ), which allows for specifying inter-character spacing.
3468		 - "Tf" instructions, that specifies the font size. This is to be able to compute approximately the
3469		   number of empty lines between two successive Y coordinates in "Tm" instructions
3470		 - "TL" instructions, that define the text leading to be used by "T*"
3471
		This is why I chose to decompose the process of text extraction into three steps :
3473		- The first one, the lowest-level step, is a tokenizer that extracts individual elements, such as "Tm",
3474		  "TJ", "/Rx" or "510.77". This is handled by the __next_token() method.
3475		- The second one, __next_instruction(), collects tokens. It pushes every floating-point value onto the
3476		  stack, until an instruction is met.
3477		- The third one, ExtractText(), processes data returned by __next_instruction(), and actually performs
3478		  the (restricted) parsing of text drawing instructions.
3479
3480	 *-------------------------------------------------------------------------------------------------------------*/
3481	protected function  ExtractText ( $page_number, $object_id, $data, &$current_font )
3482	   {
3483		$new_data	=  $this -> __strip_useless_instructions ( $data ) ;
3484
3485		if  ( self::$DEBUG )
3486		   {
3487			echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
3488			echo $data ;
3489			echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
3490			echo $new_data ;
3491		    }
3492
3493		$data		=  $new_data ;
3494
3495		// Index into the specified block of text-drawing instructions
3496		$data_index 			=  0 ;
3497
3498		$data_length 			=  strlen ( $data ) ;		// Data length
3499		$result 			=  '' ;				// Resulting string
3500
3501		// Y-coordinate of the last seen "Tm" instruction
3502		$last_goto_y 			=  0 ;
3503		$last_goto_x			=  0 ;
3504
3505		// Y-coordinate of the last seen "Td" or "TD" relative positioning instruction
3506		$last_relative_goto_y		=  0 ;
3507
3508		// When true, the current text should be output on the same line as the preceding one
3509		$use_same_line 			=  false ;
3510
3511		// Instruction preceding the current one
3512		$last_instruction 		=  true ;
3513
3514		// Current font size
3515		$current_font_size 		=  0 ;
3516
3517		// Active template
3518		$current_template		=  '' ;
3519
3520		// Various pre-computed variables
3521		$separator_length		=  strlen ( $this -> Separator ) ;
3522
3523		// Current font map width, in bytes, plus a flag saying whether the current font is mapped or not
3524		$this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
3525
3526		// Extra newlines to add before the current text
3527		$extra_newlines 		=  0 ;
3528
3529		// Text leading used by T*
3530		$text_leading 			=  0 ;
3531
3532		// Set to true if a separator needs to be inserted
3533		$needs_separator		=  false ;
3534
3535		// A flag to tell if we should "forget" the last instruction
3536		$discard_last_instruction	=  false ;
3537
3538		// A flag that tells whether the Separator and BlockSeparator properties are identical
3539		$same_separators		=  ( $this -> Separator  ==  $this -> BlockSeparator ) ;
3540
3541		// Instruction count (used for handling execution timeouts)
3542		$instruction_count		=  0 ;
3543
3544		// Unprocessed markers
3545		$unprocessed_marker_count	=  count ( $this -> UnprocessedMarkerList [ 'font' ] ) ;
3546
3547		// Loop through instructions
3548		while  ( ( $instruction =  $this -> __next_instruction ( $page_number, $data, $data_length, $data_index, $current_template ) )  !==  false )
3549		   {
3550			$fragment	=  '' ;
3551
3552			$instruction_count ++ ;
3553
3554			// Timeout handling - don't test for every instruction processed
3555			if  ( ! ( $instruction_count % 100 ) )
3556			   {
3557				// Global timeout handling
3558				if  ( $this -> Options  &  self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME )
3559				   {
3560					$now	=  microtime ( true ) ;
3561
3562					if  ( $now - self::$GlobalExecutionStartTime  >  self::$MaxGlobalExecutionTime )
3563						error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ;
3564				    }
3565
3566				// Per-instance timeout handling
3567				if  ( $this -> Options  &  self::PDFOPT_ENFORCE_EXECUTION_TIME )
3568				   {
3569					$now	=  microtime ( true ) ;
3570
3571					if  ( $now - $this -> ExecutionStartTime  >  $this -> MaxExecutionTime )
3572						error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ;
3573				    }
3574			    }
3575
3576			// Character position after the current instruction
3577			$data_index 	=  $instruction [ 'next' ] ;
3578
3579			// Process current instruction
3580			switch  ( $instruction [ 'instruction' ] )
3581			   {
3582				// Raw text (enclosed by parentheses) or array text (enclosed within square brackets)
3583				// is returned as a single instruction
3584			   	case 	'text' :
3585					// Empty arrays of text may be encountered - ignore them
3586					if  ( ! count ( $instruction [ 'values' ] ) )
3587						break ;
3588
3589					// Check if we have to insert a newline
3590			   		if ( ! $use_same_line )
3591					   {
3592			   			$fragment 		.=  $this -> EOL ;
3593						$needs_separator	 =  false ;
3594					    }
3595			   		// Roughly simulate spacing between lines by inserting newline characters
3596			   		else if  ( $extra_newlines  > 0 )
3597			   		   {
3598			   			$fragment 		.=  str_repeat ( $this -> EOL, $extra_newlines ) ;
3599			   			$extra_newlines		 =  0 ;
3600						$needs_separator	 =  false ;
3601			   		    }
3602					else
3603						$needs_separator	=  true ;
3604
3605					// Add a separator if necessary
3606					if  ( $needs_separator )
3607					   {
3608						// If the Separator and BlockSeparator properties are the same (and not empty), only add a block separator if
3609						// the current result does not end with it
3610						if  ( $same_separators )
3611						   {
3612							if  ( $this -> Separator  !=  ''  &&  substr ( $fragment, - $separator_length )  !=  $this -> BlockSeparator )
3613								$fragment		.=  $this -> BlockSeparator ;
3614						    }
3615						else
3616							$fragment		.=  $this -> BlockSeparator ;
3617					    }
3618
3619					$needs_separator	=  true ;
3620					$value_index		=  0 ;
3621
3622					// Fonts having character maps will require some special processing
3623					if  ( $current_font_mapped )
3624					   {
3625					   	// Loop through each text value
3626			   			foreach  ( $instruction [ 'values' ]  as  $text )
3627			   			   {
3628			   		   		$is_hex 	=  ( $text [0]  ==  '<' ) ;
3629			   			   	$length 	=  strlen ( $text ) - 1 ;
3630							$handled	=  false ;
3631
3632			   			   	// Characters are encoded within angle brackets ( "<>" ).
3633							// Note that several characters can be specified within the same angle brackets, so we have to take
3634							// into account the width we detected in the begincodespancerange construct
3635			   			   	if  ( $is_hex )
3636			   			   	   {
3637			   			   	   	for  ( $i = 1 ; $i  <  $length ; $i += $current_font_map_width )
3638			   			   	   	   {
3639									$value		 =  substr ( $text, $i, $current_font_map_width ) ;
3640			   			   	   	   	$ch 		 =  hexdec ( $value ) ;
3641
3642									if  ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
3643										$newchar	=  $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
3644									else if  ( $current_font  ==  -1 )
3645									   {
3646										$newchar	=  chr ( $ch ) ;
3647									    }
3648									else
3649									   {
3650										$newchar	 =  $this -> FontTable -> MapCharacter ( $current_font, $ch ) ;
3651										$this -> CharacterMapBuffer [ $current_font ] [ $ch ]		=  $newchar ;
3652									    }
3653
3654			   			   			$fragment		.=  $newchar ;
3655			   			   	   	    }
3656
3657								$handled	 =  true ;
3658			   			   	    }
3659							// Yes ! double-byte codes can also be specified as plain text within parentheses !
3660							// However, we have to be really careful here ; the sequence :
3661							//	(Be)
3662							// can mean the string "Be" or the Unicode character 0x4265 ('B' = 0x42, 'e' = 0x65)
3663							// We first look if the character map contains an entry for Unicode codepoint 0x4265 ;
3664							// if not, then we have to consider that it is regular text to be taken one character by
3665							// one character. In this case, we fall back to the "if ( ! $handled )" condition
3666							else if  ( $current_font_map_width  ==  4  )
3667							   {
3668								$temp_result		=  '' ;
3669
3670								for  ( $i = 1 ; $i  <  $length ; $i ++ )
3671								   {
3672									// Each character in the pair may be a backslash, which escapes the next character so we must skip it
3673									// This code needs to be reviewed ; the same code is duplicated to handle escaped characters in octal notation
3674									if  ( $text [$i]  !=  '\\' )
3675										$ch1	=  $text [$i] ;
3676									else
3677									   {
3678										$i ++ ;
3679
3680										if  ( $text [$i]  <  '0'  ||  $text [$i]  >  '7' )
3681											$ch1	=  $this -> ProcessEscapedCharacter ( $text [$i] ) ;
3682										else
3683										   {
3684											$oct		=  '' ;
3685											$digit_count	=  0 ;
3686
3687											while  ( $i  <  $length  &&  $text [$i]  >=  '0'  &&  $text [$i]  <=  '7'  &&  $digit_count  <  3 )
3688											   {
3689												$oct	.=  $text [$i ++] ;
3690												$digit_count ++ ;
3691											    }
3692
3693											$ch1	=  chr ( octdec ( $oct ) ) ;
3694											$i -- ;
3695										    }
3696									    }
3697
3698									$i ++ ;
3699
3700									if  ( $text [$i]  != '\\' )
3701										$ch2	=  $text [$i] ;
3702									else
3703									   {
3704										$i ++ ;
3705
3706										if  ( $text [$i]  <  '0'  ||  $text [$i]  >  '7' )
3707											$ch2	=  $this -> ProcessEscapedCharacter ( $text [$i] ) ;
3708										else
3709										   {
3710											$oct		=  '' ;
3711											$digit_count	=  0 ;
3712
3713											while  ( $i  <  $length  &&  $text [$i]  >=  '0'  &&  $text [$i]  <=  '7'  &&  $digit_count  <  3 )
3714											   {
3715												$oct	.=  $text [$i ++] ;
3716												$digit_count ++ ;
3717											    }
3718
3719											$ch2	=  chr ( octdec ( $oct ) ) ;
3720											$i -- ;
3721										    }
3722									    }
3723
3724									// Build the 2-bytes character code
3725									$ch		=  ( ord ( $ch1 )  <<  8 )  |  ord ( $ch2 ) ;
3726
3727									if  ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
3728										$newchar	=  $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
3729									else
3730									   {
3731										$newchar	=  $this -> FontTable -> MapCharacter ( $current_font, $ch, true ) ;
3732										$this -> CharacterMapBuffer [ $current_font ] [ $ch ]		=  $newchar ;
3733									    }
3734
3735									// Yes !!! for characters encoded with two bytes, we can find the following construct :
3736									//	0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")"
3737									// which must be expanded as : (Car)
3738									// We have here the escape sequences "\(" and "\)", but the backslash is encoded on two bytes
3739									// (although the MSB is nul), while the escaped character is encoded on 1 byte. waiting
3740									// for the next quirk to happen...
3741									if  ( $newchar  ==  '\\'  &&  isset ( $text [ $i + 2 ] ) )
3742									   {
3743										$newchar		=  $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] ) ;
3744										$i ++ ;		// this time we processed 3 bytes, not 2
3745									    }
3746
3747									$temp_result		.=  $newchar ;
3748								    }
3749
3750								// Happens only if we were unable to translate a character using the current character map
3751								$fragment		.=  $temp_result ;
3752								$handled	 =  true ;
3753							    }
3754
3755							// Character strings within parentheses.
3756							// For every text value, use the character map table for substitutions
3757							if  ( ! $handled )
3758							   {
3759				   		   		for  ( $i = 1 ; $i  <  $length ; $i ++ )
3760				   		   		   {
3761				   		   			$ch 		=  $text [$i] ;
3762
3763									// Set to true to optimize calls to MapCharacters
3764									// Currently does not work with pobox@dizy.sk/infoma.pdf (a few characters differ)
3765									$use_map_buffer	=  false ;
3766
3767									// ... but don't forget to handle escape sequences "\n" and "\r" for characters
3768									// 10 and 13
3769				   		   			if  ( $ch  ==  '\\' )
3770				   		   			   {
3771				   		   				$ch 	=  $text [++$i] ;
3772
3773										// Escaped character
3774										if  ( $ch  <  '0'  ||  $ch  >  '7' )
3775											$ch		=  $this -> ProcessEscapedCharacter ( $ch ) ;
3776										// However, an octal form can also be specified ; in this case we have to take into account
3777										// the character width for the current font (if the character width is 4 hex digits, then we
3778										// will encounter constructs such as "\000\077").
3779										// The method used here is dirty : we build a regex to match octal character representations on a substring
3780										// of the text
3781										else
3782										   {
3783											$width		=  $current_font_map_width / 2 ;	// Convert to byte count
3784											$subtext	=  substr ( $text, $i - 1 ) ;
3785											$regex		=  "#^ (\\\\ [0-7]{3}){1,$width} #imsx" ;
3786
3787											$status		=  preg_match ( $regex, $subtext, $octal_matches ) ;
3788
3789											if  ( $status )
3790											   {
3791												$octal_values	=  explode ( '\\', substr ( $octal_matches [0], 1 ) ) ;
3792												$ord		=  0 ;
3793
3794												foreach  ( $octal_values  as  $octal_value )
3795													$ord	=  ( $ord  <<  8 ) + octdec ( $octal_value ) ;
3796
3797												$ch	 =  chr ( $ord ) ;
3798												$i	+=  strlen ( $octal_matches [0] ) - 2 ;
3799											    }
3800										    }
3801
3802										$use_map_buffer		=  false ;
3803				   		   			    }
3804
3805									// Add substituted character to the output result
3806									$ord		 =  ord ( $ch ) ;
3807
3808									if  ( ! $use_map_buffer )
3809										$newchar	 =  $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
3810									else
3811									   {
3812										if  ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
3813											$newchar	=  $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
3814										else
3815										   {
3816											$newchar	 =  $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
3817											$this -> CharacterMapBuffer [ $current_font ] [ $ord ]	=  $newchar ;
3818										    }
3819									    }
3820
3821									$fragment		.=  $newchar ;
3822				   		   		    }
3823							    }
3824
3825							// Handle offsets between blocks of characters
3826							if  ( isset ( $instruction [ 'offsets' ] [ $value_index ] )  &&
3827									- ( $instruction [ 'offsets' ] [ $value_index ] )  >  $this -> MinSpaceWidth )
3828								$fragment		.=  $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ;
3829
3830							$value_index ++ ;
3831			   		   	    }
3832			   		    }
3833					// For fonts having no associated character map, we simply encode the string in UTF8
3834					// after the C-like escape sequences have been processed
3835					// Note that <xxxx> constructs can be encountered here, so we have to process them as well
3836			   		else
3837			   		   {
3838			   			foreach  ( $instruction [ 'values' ]  as  $text )
3839			   			   {
3840			   			   	$is_hex 	=  ( $text [0]  ==  '<' ) ;
3841			   			   	$length 	=  strlen ( $text ) - 1 ;
3842
3843							// Some text within parentheses may have a backslash followed by a newline, to indicate some continuation line.
3844							// Example :
3845							//	(this is a sentence \
3846							//	 continued on the next line)
3847							// Funny isn't it ? so remove such constructs because we don't care
3848							$text		=  str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text ) ;
3849
3850			   			   	// Characters are encoded within angle brackets ( "<>" )
3851			   			   	if  ( $is_hex )
3852			   			   	   {
3853			   			   	   	for  ( $i = 1 ; $i  <  $length ; $i += 2 )
3854			   			   	   	   {
3855			   			   	   	   	$ch 	=  hexdec ( substr ( $text, $i, 2 ) ) ;
3856
3857			   			   			$fragment .=  $this -> CodePointToUtf8 ( $ch ) ;
3858			   			   	   	    }
3859			   			   	    }
3860							// Characters are plain text
3861			   			   	else
3862							   {
3863								$text	=  self::Unescape ( $text ) ;
3864
3865								for  ( $i = 1, $length = strlen ( $text ) - 1 ; $i  <  $length ; $i ++ )
3866								   {
3867									$ch	=  $text [$i] ;
3868									$ord	=  ord ( $ch ) ;
3869
3870									if  ( $ord  <  127 )
3871										$newchar	=  $ch ;
3872									else
3873									   {
3874										if  ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
3875											$newchar	=  $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
3876										else
3877										   {
3878											$newchar	=  $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
3879											$this -> CharacterMapBuffer [ $current_font ] [ $ord ]	=  $newchar ;
3880										    }
3881									    }
3882
3883									$fragment		.=  $newchar ;
3884								    }
3885							    }
3886
3887							// Handle offsets between blocks of characters
3888							if  ( isset ( $instruction [ 'offsets' ] [ $value_index ] )  &&
3889									abs ( $instruction [ 'offsets' ] [ $value_index ] )  >  $this -> MinSpaceWidth )
3890								$fragment		.=  $this -> __get_character_padding ( $instruction [ 'offsets' ] [ $value_index ] ) ;
3891
3892							$value_index ++ ;
3893			   			   }
3894			   		    }
3895
3896					// Process the markers which do not have an associated font yet - this will be done by matching
3897					// the current text fragment against one of the regular expressions defined.
3898					// If a match occurs, then all the subsequent text fragment using the same font will be put markers
3899					for  ( $j = 0 ; $j  <  $unprocessed_marker_count ; $j ++ )
3900					   {
3901						$marker			=  $this -> UnprocessedMarkerList [ 'font' ] [$j] ;
3902
3903						if  ( preg_match ( $marker [ 'regex' ], trim ( $fragment ) ) )
3904						   {
3905							$this -> TextWithFontMarkers [ $current_font ]	=  array
3906							   (
3907								'font'		=>  $current_font,
3908								'height'	=>  $current_font_size,
3909								'regex'		=>  $marker   [ 'regex' ],
3910								'start'		=>  $marker   [ 'start' ],
3911								'end'		=>  $marker   [ 'end'   ]
3912							    ) ;
3913
3914							$unprocessed_marker_count -- ;
3915							unset ( $this -> UnprocessedMarkerList [ 'font' ] [$j] ) ;
3916
3917							break ;
3918						    }
3919					    }
3920
3921					// Check if we need to add markers around this text fragment
3922					if  ( isset ( $this -> TextWithFontMarkers [ $current_font ] )  &&
3923							$this -> TextWithFontMarkers [ $current_font ] [ 'height' ]  ==  $current_font_size )
3924					    {
3925						$fragment	=  $this -> TextWithFontMarkers [ $current_font ] [ 'start' ] .
3926								   $fragment .
3927								   $this -> TextWithFontMarkers [ $current_font ] [ 'end' ] ;
3928					     }
3929
3930					$result		.=  $fragment ;
3931
3932					break ;
3933
3934				// An "nl" instruction means TJ, Tj, T* or "'"
3935			   	case 	'nl' :
3936			   		if  ( ! $instruction [ 'conditional' ] )
3937			   		   {
3938			   		   	if  ( $instruction [ 'leading' ]  &&  $text_leading  &&  $current_font_size )
3939			   		   	   {
3940			   		   		$count 	=  ( integer ) ( ( $text_leading - $current_font_size ) / $current_font_size ) ;
3941
3942			   		   		if  ( ! $count )
3943			   		   			$count 	=  1 ;
3944			   		   	    }
3945			   		   	else
3946			   		   		$count 	=  1 ;
3947
3948		   		   		$extra			 =  str_repeat ( PHP_EOL, $count ) ;
3949			   			$result 		.=  $extra ;
3950						$needs_separator	 =  false ;
3951						$last_goto_y 		-=  ( $count * $text_leading ) ;	// Approximation on y-coord change
3952						$last_relative_goto_y	 =  0 ;
3953			   		    }
3954
3955			   		break ;
3956
3957				// "Tm", "Td" or "TD" : Output text on the same line, if the "y" coordinates are equal
3958			   	case 	'goto' :
3959					// Some text is positioned using 'Tm' instructions ; however they can be immediatley followed by 'Td' instructions
3960					// which give a relative positioning ; so consider that the last instruction wins
3961					if  ( $instruction [ 'relative' ] )
3962					   {
3963						// Try to put a separator if the x coordinate is non-zero
3964						//if  ( $instruction [ 'x' ] - $last_goto_x  >=  $current_font_size )
3965						//	$result		.=  $this -> Separator ;
3966
3967						$discard_last_instruction	=  true ;
3968						$extra_newlines			=  0 ;
3969						$use_same_line			=  ( ( $last_relative_goto_y - abs ( $instruction  [ 'y' ] ) )  <=  $current_font_size ) ;
3970						$last_relative_goto_y		=  abs ( $instruction [ 'y' ] ) ;
3971						$last_goto_x			=  $instruction [ 'x' ] ;
3972
3973						if  ( - $instruction [ 'y' ]  >  $current_font_size )
3974						   {
3975							$use_same_line		=  false ;
3976
3977							if  ( $last_relative_goto_y )
3978								$extra_newlines		=  ( integer ) ( $current_font_size / $last_relative_goto_y ) ;
3979							else
3980								$extra_newlines		=  0 ;
3981						    }
3982						else if  ( ! $instruction [ 'y' ] )
3983						   {
3984							$use_same_line		=  true ;
3985							$extra_newlines		=  0 ;
3986						    }
3987
3988						break ;
3989					    }
3990					else
3991						$last_relative_goto_y	=  0 ;
3992
3993					$y	=  $last_goto_y + $last_relative_goto_y ;
3994
3995			   		if  ( $instruction [ 'y' ]  ==  $y  ||  abs ( $instruction [ 'y' ] - $y )  <  $current_font_size )
3996			   		   {
3997			   			$use_same_line 		=  true ;
3998			   			$extra_newlines 	=  0 ;
3999			   		    }
4000					else
4001					   {
4002					   	// Compute the number of newlines we have to insert between the current and the next lines
4003					   	if  ( $current_font_size )
4004					   		$extra_newlines =  ( integer ) ( ( $y - $instruction [ 'y' ] - $current_font_size ) / $current_font_size ) ;
4005
4006						$use_same_line 		=  ( $last_goto_y  ==  0 ) ;
4007					    }
4008
4009					$last_goto_y 		=  $instruction [ 'y' ] ;
4010			   		break ;
4011
4012				// Set font size
4013			   	case 	'fontsize' :
4014			   		$current_font_size 	=  $instruction [ 'size' ] ;
4015			   		break ;
4016
4017				// "/Rx" : sets the current font
4018			   	case 	'resource' :
4019			   		$current_font 		=  $instruction [ 'resource' ] ;
4020
4021					$this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
4022			   		break ;
4023
4024				// "/TPLx" : references a template, which can contain additional font aliases
4025				case	'template' :
4026					if  ( $this -> PageMap -> IsValidXObjectName ( $instruction [ 'token' ] ) )
4027						$current_template	=  $instruction [ 'token' ] ;
4028
4029					break ;
4030
4031			   	// 'TL' : text leading to be used for the next "T*" in the flow
4032			   	case	'leading' :
4033					if  ( ! ( $this -> Options & self::PDFOPT_IGNORE_TEXT_LEADING ) )
4034			   			$text_leading 		=  $instruction [ 'size' ] ;
4035
4036			   		break ;
4037
4038
4039				// 'ET' : we have to reset a few things here
4040				case	'ET' :
4041					$current_font			=  -1 ;
4042					$current_font_map_width		=  2 ;
4043					break ;
4044			    }
4045
4046			// Remember last instruction - this will help us into determining whether we should put the next text
4047			// on the current or following line
4048			if  ( ! $discard_last_instruction )
4049				$last_instruction 	=  $instruction ;
4050
4051			$discard_last_instruction	=  false ;
4052		    }
4053
4054		return ( $this -> __rtl_process ( $result ) ) ;
4055	    }
4056
4057
4058
4059	// __next_instruction -
4060	//	Retrieves the next instruction from the drawing text block.
4061	private function  __next_instruction ( $page_number, $data, $data_length, $index, $current_template )
4062	   {
4063		static 	$last_instruction 	=  false ;
4064
4065		$ch	=  '' ;
4066
4067		// Constructs such as
4068		if  ( $last_instruction )
4069		   {
4070			$result 		=  $last_instruction ;
4071			$last_instruction	=  false ;
4072
4073			return ( $result ) ;
4074		    }
4075
4076		// Whether we should compute enhanced statistics
4077		$enhanced_statistics		=  $this -> EnhancedStatistics ;
4078
4079		// Holds the floating-point values encountered so far
4080		$number_stack 	=  array ( ) ;
4081
4082		// Loop through the stream of tokens
4083		while  ( ( $part = $this -> __next_token ( $page_number, $data, $data_length, $index ) )  !==  false )
4084		   {
4085			$token 		=  $part [0] ;
4086			$next_index 	=  $part [1] ;
4087
4088			// Floating-point number : push it onto the stack
4089			if  ( ( $token [0]  >=  '0'  &&  $token [0]  <=  '9' )  ||  $token [0]  ==  '-'  ||  $token [0]  ==  '+'  ||  $token [0]  ==  '.' )
4090			   {
4091				$number_stack []	=  $token ;
4092				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;
4093			    }
4094			// 'Tm' instruction : return a "goto" instruction with the x and y coordinates
4095			else if  ( $token  ==  'Tm' )
4096			   {
4097				$x 	=  $number_stack [4] ;
4098				$y 	=  $number_stack [5] ;
4099
4100				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ;
4101
4102				return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => false, 'token' => $token ) ) ;
4103			    }
4104			// 'Td' or 'TD' instructions : return a goto instruction with the x and y coordinates (1st and 2nd args)
4105			else if  ( $token  ==  'Td'  ||  $token  ==  'TD' )
4106			   {
4107				$x 		=  $number_stack [0] ;
4108				$y 		=  $number_stack [1] ;
4109
4110				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ $token ] ++ ;
4111
4112				return ( array ( 'instruction' => 'goto', 'next' => $next_index, 'x' => $x, 'y' => $y, 'relative' => true, 'token' => $token ) ) ;
4113			    }
4114			// Output text "'" instruction, with conditional newline
4115			else if  ( $token [0]  ==  "'" )
4116			   {
4117				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ;
4118
4119				return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ;
4120			    }
4121			// Same as above
4122			else if  ( $token  ==  'TJ'  ||  $token  ==  'Tj' )
4123			   {
4124				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ $token ] ++ ;
4125
4126				return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => false, 'token' => $token ) ) ;
4127			    }
4128			// Set font size
4129			else if  ( $token  ==  'Tf' )
4130			    {
4131				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ;
4132
4133				return ( array ( 'instruction' => 'fontsize', 'next' => $next_index, 'size' => $number_stack [0], 'token' => $token ) ) ;
4134			     }
4135			// Text leading (spacing used by T*)
4136			else if  ( $token  ==  'TL' )
4137			   {
4138				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ;
4139
4140				return ( array ( 'instruction' => 'leading', 'next' => $next_index, 'size' => $number_stack [0], 'token' => $token ) ) ;
4141			    }
4142			// Position to next line
4143			else if  ( $token  ==  'T*' )
4144			    {
4145				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ;
4146
4147				return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => true, 'leading' => true ) ) ;
4148			     }
4149			// Draw object ("Do"). To prevent different text shapes to appear on the same line, we return a "newline" instruction
4150			// here. Note that the shape position is not taken into account here, and shapes will be processed in the order they
4151			// appear in the pdf file (which is likely to be different from their position on a graphic screen).
4152			else if  ( $token  ==  'Do' )
4153			   {
4154				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
4155
4156				return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => false, 'token' => $token ) ) ;
4157			    }
4158			// Raw text output
4159			else if  ( $token [0]  ==  '(' )
4160			   {
4161			   	$next_part 	=  $this -> __next_token ( $page_number, $data, $data_length, $next_index, $enhanced_statistics ) ;
4162			   	$instruction	=  array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ;
4163				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '(' ] ++ ;
4164
4165			   	if  ( $next_part [0]  ==  "'" )
4166			   	   {
4167			   	   	$last_instruction  	=  $instruction ;
4168			   	   	return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ;
4169			   	   }
4170			   	else
4171					return ( $instruction ) ;
4172			    }
4173			// Hex digits within angle brackets
4174		   	else if  ( $token [0]  ==  '<'  )
4175			   {
4176				$ch		=  $token [1] ;
4177				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '<' ] ++ ;
4178			   	$instruction	=  array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ;
4179
4180				if  ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM )
4181				   {
4182			   		$next_part 	=  $this -> __next_token ( $page_number, $data, $data_length, $next_index ) ;
4183			   		$instruction	=  array ( 'instruction' => 'text', 'next' => $next_index, 'values' => array ( $token ), 'token' => $token ) ;
4184
4185			   		if  ( $next_part [0]  ==  "'" )
4186			   		   {
4187			   	   		$last_instruction  	=  $instruction ;
4188			   	   		return ( array ( 'instruction' => 'nl', 'next' => $next_index, 'conditional' => false, 'leading' => true, 'token' => $token ) ) ;
4189			   		   }
4190			   		else
4191						return ( $instruction ) ;
4192				    }
4193			    }
4194			    // Text specified as an array of individual raw text elements, and individual interspaces between characters
4195			else if  ( $token [0]  ==  '[' )
4196			   {
4197				$values 	=  $this -> __extract_chars_from_array ( $token ) ;
4198				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '[' ] ++ ;
4199				$instruction 	=  array ( 'instruction' => 'text', 'next' => $next_index, 'values' => $values [0], 'offsets' => $values [1], 'token' => $token ) ;
4200
4201				return ( $instruction ) ;
4202			    }
4203			// Token starts with a slash : maybe a font specification
4204			else if  ( preg_match ( '#^ ( ' . self::$FontSpecifiers . ' ) #ix', $token ) )
4205			   {
4206				$key	=  "$page_number:$current_template:$token" ;
4207				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;
4208
4209				if  ( isset ( $this -> MapIdBuffer [ $key ] ) )
4210					$id	=   $this -> MapIdBuffer [ $key ] ;
4211				else
4212				   {
4213					$id 	=  $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $token ) ;
4214
4215					$this -> MapIdBuffer [ $key ]	=  $id ;
4216				    }
4217
4218				return ( array ( 'instruction' => 'resource', 'next' => $next_index, 'resource' => $id, 'token' => $token ) ) ;
4219			    }
4220			// Template reference, such as /TPL1. Each reference has initially been replaced by !PDFTOTEXT_TEMPLATE_TPLx during substitution
4221			// by ProcessTemplateReferences(), because templates not only specify text to be replaced, but also font aliases
4222			// -and this is the place where we catch font aliases in this case
4223			else if  ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P<template> \w+) /ix', $token, $match ) )
4224			   {
4225				$current_template	=  '/' . $match [ 'template' ] ;
4226				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'template' ] ++ ;
4227
4228				return ( array ( 'instruction' => 'template', 'next' => $next_index, 'token' => $current_template ) ) ;
4229			    }
4230			// Others, only counted for statistics
4231			else if  ( $token  ===  'cm' )
4232			   {
4233				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'cm' ] ++ ;
4234			    }
4235			else if  ( $token  ===  'BT' )
4236			   {
4237				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'BT' ] ++ ;
4238
4239				return ( array ( 'instruction' => 'BT', 'next' => $next_index, 'token' => $token ) ) ;
4240			    }
4241			else if  ( $token  ==  'ET' )	// Nothing special to count here
4242			   {
4243				return ( array ( 'instruction' => 'ET', 'next' => $next_index, 'token' => $token ) ) ;
4244			    }
4245			// Other instructions : we're not that much interested in them, so clear the number stack and consider
4246			// that the current parameters, floating-point values, have been processed
4247			else
4248			   {
4249				$number_stack 	=  array ( ) ;
4250				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
4251			    }
4252
4253			$index 		=  $next_index ;
4254		    }
4255
4256		// End of input
4257		return ( false ) ;
4258	    }
4259
4260
4261	// __next_token :
4262	//	Retrieves the next token from the drawing instructions stream.
4263	private function  __next_token ( $page_number, $data, $data_length, $index )
4264	   {
4265		// Skip spaces
4266		$count		=  0 ;
4267
4268		while  ( $index  <  $data_length  &&  ( $data [ $index ]  ==  ' '  ||  $data [ $index ]  ==  "\t"  ||  $data [ $index ]  ==  "\r"  ||  $data [ $index ]  ==  "\n" ) )
4269		   {
4270			$index ++ ;
4271			$count ++ ;
4272		    }
4273
4274		$enhanced_statistics	=  $this -> EnhancedStatistics ;
4275		$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'space' ]	+=  $count ;
4276
4277		// End of input
4278		if  ( $index  >=  $data_length )
4279			return ( false ) ;
4280
4281		// The current character will tell us what to do
4282		$ch 	=  $data [ $index ] ;
4283		$ch2	=  '' ;
4284
4285		switch ( $ch )
4286		   {
4287			// Opening square bracket : we have to find the closing one, taking care of escape sequences
4288			// that can also specify a square bracket, such as "\]"
4289		   	case 	"[" :
4290		   		$pos 		=  $index + 1 ;
4291		   		$parent 	=  0 ;
4292		   		$angle 		=  0 ;
4293		   		$result		=  $ch ;
4294
4295		   		while  ( $pos  <  $data_length )
4296		   		   {
4297		   			$nch 	=  $data [ $pos ++ ] ;
4298
4299		   			switch  ( $nch )
4300		   			   {
4301		   			   	case 	'(' :
4302		   			   		$parent ++ ;
4303		   			   		$result 	.=  $nch ;
4304		   			   		break ;
4305
4306		   			   	case 	')' :
4307		   			   		$parent -- ;
4308		   			   		$result 	.=  $nch ;
4309		   			   		break ;
4310
4311		   			   	case 	'<' :
4312							// Although the array notation can contain hex digits between angle brackets, we have to
4313							// take care that we do not have an angle bracket between two parentheses such as :
4314							// [ (<) ... ]
4315							if  ( ! $parent )
4316		   			   			$angle ++ ;
4317
4318		   			   		$result 	.=  $nch ;
4319		   			   		break ;
4320
4321		   			   	case 	'>' :
4322							if  ( ! $parent )
4323		   			   			$angle -- ;
4324
4325		   			   		$result 	.=  $nch ;
4326		   			   		break ;
4327
4328		   			   	case 	'\\' :
4329		   					$result 	.=  $nch . $data [ $pos ++ ] ;
4330		   					break ;
4331
4332		   			   	case 	']' :
4333		   					$result 	.=  ']' ;
4334
4335		   					if  ( ! $parent  )
4336		   						break  2 ;
4337		   					else
4338		   						break ;
4339
4340						case	"\n" :
4341						case	"\r" :
4342							break ;
4343
4344		   			   	default :
4345		   			   		$result 	.=  $nch ;
4346		   			    }
4347		   		    }
4348
4349		   		return ( array ( $result, $pos ) ) ;
4350
4351			// Parenthesis : Again, we have to find the closing parenthesis, taking care of escape sequences
4352			// such as "\)"
4353		   	case 	"(" :
4354		   		$pos 		=  $index + 1 ;
4355		   		$result		=  $ch ;
4356
4357		   		while  ( $pos  <  $data_length )
4358		   		   {
4359		   			$nch 	=  $data [ $pos ++ ] ;
4360
4361		   			if  ( $nch  ==  '\\' )
4362					   {
4363						$after		 =  $data [ $pos ] ;
4364
4365						// Character references specified as \xyz, where "xyz" are octal digits
4366						if  ( $after  >=  '0'  &&  $after  <=  '7' )
4367						   {
4368							$result		.=  $nch ;
4369
4370							while  ( $data [ $pos ]  >=  '0'  &&  $data [ $pos ]  <=  '7' )
4371								$result		.=  $data [ $pos ++ ] ;
4372						    }
4373						// Regular character escapes
4374						else
4375		   					$result 	.=  $nch . $data [ $pos ++ ] ;
4376					    }
4377		   			else if  ( $nch  ==  ')' )
4378		   			   {
4379		   				$result 	.=  ')' ;
4380		   				break ;
4381		   			    }
4382		   			else
4383		   				$result 	.=  $nch ;
4384		   		   }
4385
4386		   		return ( array ( $result, $pos ) ) ;
4387
4388			// A construction of the form : "<< something >>", or a unicode character
4389		   	case 	'<' :
4390				if  ( ! isset ( $data [ $index + 1 ] ) )
4391					return ( false ) ;
4392
4393		   		if (  $data [ $index + 1 ]  ==  '<' )
4394		   		   {
4395		   		   	$pos 	=  strpos ( $data, '>>', $index + 2 ) ;
4396
4397		   			if  ( $pos  ===  false )
4398		   				return ( false ) ;
4399
4400		   			return ( array ( substr ( $data, $index, $pos - $index + 2 ), $pos + 2 ) ) ;
4401		   		    }
4402		   		else
4403		   		   {
4404		   		   	$pos 	=  strpos ( $data, '>', $index + 2 ) ;
4405
4406		   			if  ( $pos  ===  false )
4407		   				return ( false ) ;
4408
4409					// There can be spaces and newlines inside a series of hex digits, so remove them...
4410					$result			=  preg_replace ( '/\s+/', '', substr ( $data, $index, $pos - $index + 1 ) ) ;
4411
4412		   			return ( array ( $result, $pos + 1 ) ) ;
4413		   		   }
4414
4415			// Tick character : consider it as a keyword, in the same way as the "TJ" or "Tj" keywords
4416		   	case 	"'" :
4417		   		return ( array ( "'", $index + 1 ) ) ;
4418
4419			// Other cases : this may be either a floating-point number or a keyword
4420		   	default :
4421		   		$index ++ ;
4422		   		$value 	=  $ch ;
4423
4424				if  ( isset ( $data [ $index ] ) )
4425				   {
4426		   			if ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_DIGIT )  ||
4427							$ch  ==  '-'  ||  $ch  ==  '+'  ||  $ch  ==  '.' )
4428		   			   {
4429		   				while  ( $index  <  $data_length  &&
4430		   						( ( self::$CharacterClasses [ $data [ $index ] ] & self::CTYPE_DIGIT )  ||
4431									$data [ $index ]  ==  '.' ) )
4432		   					$value 	.=  $data [ $index ++ ] ;
4433		   			    }
4434		   			else if  ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALPHA )  ||
4435							$ch  ==  '/'  ||  $ch  ==  '!' )
4436		   			   {
4437						$ch	=  $data [ $index ] ;
4438
4439						while  ( $index  <  $data_length  &&
4440							( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM )  ||
4441								$ch  ==  '*'  ||  $ch  ==  '-'  ||  $ch  ==  '_'  ||  $ch  ==  '.'  ||  $ch  ==  '+' ) )
4442						   {
4443							$value 	.=  $ch ;
4444							$index ++ ;
4445
4446							if  ( isset ( $data [ $index ] ) )
4447								$ch	=  $data [ $index ] ;
4448						    }
4449		   			    }
4450				    }
4451
4452		   		return ( array ( $value, $index ) ) ;
4453		    }
4454	    }
4455
4456
4457	/*--------------------------------------------------------------------------------------------------------------
4458
4459	    NAME
4460	        ExtractTextWithLayout - Extracts text, trying to render the page layout.
4461
		$text 	=  $this -> ExtractTextWithLayout ( &$page_fragments, $page_number, $object_id, $data, &$current_font ) ;
4463
4464	    DESCRIPTION
4465	        Extracts text from decoded stream contents, trying to render the layout.
4466
4467	    PARAMETERS
4468		$page_number (integer) -
			Page number that contains the text to be extracted.
4470
4471	    	$object_id (integer) -
4472	    		Object id of this text block.
4473
4474	    	$data (string) -
4475	    		Stream contents.
4476
4477		$current_font (integer) -
4478			Id of the current font, which should be found in the $this->FontTable property, if anything
4479			went ok.
4480			This parameter is required, since text blocks may not specify a new font resource id and reuse
			the one that was set before.
4482
4483	    RETURN VALUE
4484		Returns the decoded text.
4485
4486	 *-------------------------------------------------------------------------------------------------------------*/
4487	protected function  ExtractTextWithLayout ( &$page_fragments, $page_number, $object_id, $data, &$current_font )
4488	   {
4489		// Characters that can start a numeric operand
4490		static		$numeric_starts		=  array
4491		   (
4492			'+' => true, '-' => true, '.' => true, '0' => true, '1' => true, '2' => true, '3' => true, '4' => true,
4493			'5' => true, '6' => true, '7' => true, '8' => true, '9' => true
4494		    ) ;
4495		// Initial (default) transformation matrix. To reflect the PDF specifications, we will keep it as a 6 elements array :
4496		//	[ sx tx ty sy x y ]
4497		// (although tx and ty are not useful here, since they affect the graphic orientation of the text)
4498		// sx and sy are scaling parameters, actually a multiplier for the x and y parameters. We only keep
4499		static		$IdentityMatrix		=  array ( 1, 0, 0, 1, 0, 0 ) ;
4500
4501		// Remove useless instructions
4502		$new_data	=  $this -> __strip_useless_instructions ( $data ) ;
4503
4504		if  ( self::$DEBUG )
4505		   {
4506			echo "\n----------------------------------- TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
4507			echo $data ;
4508			echo "\n----------------------------------- OPTIMIZED TEXT #$object_id (size = " . strlen ( $data ) . " bytes, new size = " . strlen ( $new_data ) . " bytes)\n" ;
4509			echo $new_data ;
4510		    }
4511
4512		$data					=  $new_data ;
4513		$data_length 				=  strlen ( $data ) ;		// Data length
4514
4515		$page_fragment_count			=  count ( $page_fragments ) ;
4516
4517		// Index into the specified block of text-drawing instructions
4518		$data_index 				=  0 ;
4519
4520		// Text matrices
4521		$CTM					=
4522		$Tm					=  $IdentityMatrix ;
4523
4524		// Nesting level of BT..ET instructions (Begin text/End text) - they are not nestable but be prepared to meet buggy PDFs
4525		$BT_nesting_level			=  0 ;
4526
4527		// Current font data
4528		$current_font_height			=  0 ;
4529
4530		// Current font map width, in bytes, plus a flag saying whether the current font is mapped or not
4531		$current_template	=  '' ;
4532		$current_font_name	=  '' ;
4533		$this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
4534
4535		// Operand stack
4536		$operand_stack				=  array ( ) ;
4537
4538		// Number of tokens processed so far
4539		$token_count				=  0 ;
4540
4541		// Page attributes
4542		$page_attributes			=  $this -> PageMap -> PageAttributes [ $page_number ] ;
4543
4544		// Graphics context stack - well, we only store here the current transformation matrix
4545		$graphic_stack				=  array ( ) ;
4546		$graphic_stack_size			=  0 ;
4547
4548		// Global/local execution time measurements
4549		$tokens_between_timechecks	=  1000 ;
4550		$enforce_global_execution_time	=  $this -> Options  &  self::PDFOPT_ENFORCE_GLOBAL_EXECUTION_TIME ;
4551		$enforce_local_execution_time	=  $this -> Options  &  self::PDFOPT_ENFORCE_EXECUTION_TIME ;
4552		$enforce_execution_time		=  $enforce_global_execution_time | $enforce_local_execution_time ;
4553
4554		// Whether we should compute enhanced statistics
4555		$enhanced_statistics		=  $this -> EnhancedStatistics ;
4556
4557		// Whether we should show debug coordinates
4558		$show_debug_coordinates		=  ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES ) ;
4559
4560		// Text leading value set by the TL instruction
4561		$text_leading			=  0.0 ;
4562
4563		// Loop through the stream of tokens
4564		while  ( $this -> __next_token_ex ( $page_number, $data, $data_length, $data_index, $token, $next_index )  !==  false )
4565		   {
4566			$token_start	=  $token [0] ;
4567			$token_count ++ ;
4568			$length		=  $next_index - $data_index - 1 ;
4569
4570			// Check if we need to enforce execution time checking, to prevent PHP from terminating our script without any hope
4571			// of catching the error
4572			if  ( $enforce_execution_time  &&  ! ( $token_count % $tokens_between_timechecks ) )
4573			   {
4574				if  ( $enforce_global_execution_time )
4575				   {
4576					$now	=  microtime ( true ) ;
4577
4578					if  ( $now - self::$GlobalExecutionStartTime  >  self::$MaxGlobalExecutionTime )
4579						error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", true, self::$PhpMaxExecutionTime, self::$MaxGlobalExecutionTime ) ) ;
4580				    }
4581
4582				// Per-instance timeout handling
4583				if  ( $enforce_local_execution_time )
4584				   {
4585					$now	=  microtime ( true ) ;
4586
4587					if  ( $now - $this -> ExecutionStartTime  >  $this -> MaxExecutionTime )
4588						error ( new PdfToTextTimeoutException ( "file {$this -> Filename}", false, self::$PhpMaxExecutionTime, $this -> MaxExecutionTime ) ) ;
4589				    }
4590			    }
4591
4592			/****************************************************************************************************************
4593
4594				The order of the testings is important for maximum performance : put the most common cases first.
4595				A study on over 1000 PDF files has shown the following :
4596
4597				- Instruction operands appear 24.5 million times
4598				- Tx instructions (including Tf, Tm, ', ", etc.) : 24M
4599				- (), <> and [] constructs for drawing text : 17M
4600				- Other : peanuts...
4601				- Ignored instructions : 0.5M (these are the instructions without interest for text extraction and that
4602				 could not be removed by the __strip_useless_instructions() method).
4603
4604				Of course, white spaces appear more than 100M times between instructions. However, it gets hard to remove
4605				most of them without compromising the result of __strip_useless_instructions.
4606
4607			 ***************************************************************************************************************/
4608			// Numeric or flag for an instruction
4609			if  ( $token_start  ==  '/'  ||  isset ( $numeric_starts [ $token_start ] ) )
4610			   {
4611				$operand_stack []		=  $token ;
4612
4613				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'operand' ] ++ ;
4614			    }
4615			// A 2-characters "Tx" or a 1-character quote/doublequote instruction
4616			else if  ( ( $length  ===  2   &&  $token_start  ===  'T' )  ||  ( $length  ===  1  &&  ( $token_start  ===  "'"  ||  $token_start  ===  '"' ) ) )
4617			   {
4618				switch  ( ( $length  ===  1 ) ?  $token [0] : $token [1] )
4619				   {
4620					// Tj instruction
4621					case	'j' :
4622						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Tj' ] ++ ;
4623						break ;
4624
4625					// Tm instruction
4626					case	'm' :
4627						$Tm [0]			=  ( double ) $operand_stack [0] ;
4628						$Tm [1]			=  ( double ) $operand_stack [1] ;
4629						$Tm [2]			=  ( double ) $operand_stack [2] ;
4630						$Tm [3]			=  ( double ) $operand_stack [3] ;
4631						$Tm [4]			=  ( double ) $operand_stack [4] ;
4632						$Tm [5]			=  ( double ) $operand_stack [5] ;
4633
4634						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Tm' ] ++ ;
4635						break ;
4636
4637					// Tf instruction
4638					case	'f' :
4639						$current_font_name	=  $operand_stack [0] ;
4640						$key			=  "$page_number:$current_template:$current_font_name" ;
4641
4642						// We have to map a font specifier (such /TT0, C0-1, etc.) into an object id.
4643						// Check first if we already met this font
4644						if  ( isset ( $this -> MapIdBuffer [ $key ] ) )
4645							$current_font	=   $this -> MapIdBuffer [ $key ] ;
4646						// Otherwise retrieve its corresponding object number and put it in our font cache
4647						else
4648						   {
4649							$current_font 	=  $this -> FontTable -> GetFontByMapId ( $page_number, $current_template, $current_font_name ) ;
4650
4651							$this -> MapIdBuffer [ $key ]	=  $current_font ;
4652						    }
4653
4654						$current_font_height	=  ( double ) $operand_stack [1] ;
4655						$this -> FontTable -> GetFontAttributes ( $page_number, $current_template, $current_font, $current_font_map_width, $current_font_mapped ) ;
4656						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Tf' ] ++ ;
4657						break ;
4658
4659					// Td instruction
4660					case	'd' :
4661						$Tm [4]		+=  ( double ) $operand_stack [0] * abs ( $Tm [0] ) ;
4662						$Tm [5]		+=  ( double ) $operand_stack [1] * abs ( $Tm [3] ) ;
4663
4664						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'Td' ] ++ ;
4665						break ;
4666
4667					// TJ instruction
4668					case	'J' :
4669						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'TJ' ] ++ ;
4670						break ;
4671
4672					// TD instruction
4673					case	'D' :
4674						$Tm [4]		 +=  ( double ) $operand_stack [0] * $Tm [0] ;
4675						$Tm [5]		 +=  ( double ) $operand_stack [1] * $Tm [3] ;
4676						$text_leading	 -=  $Tm [5] ;
4677
4678						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'TD' ] ++ ;
4679						break ;
4680
4681					// T* instruction
4682					case	'*' :
4683						$Tm [4]		 =  0.0 ;
4684						$Tm [5]		-=  $text_leading ; //$current_font_height ;
4685
4686						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'T*' ] ++ ;
4687						break ;
4688
4689					// TL instruction - Set text leading. Currently not used.
4690					case	'L' :
4691						$text_leading		=  ( double ) $operand_stack [0] ;
4692						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'TL' ] ++ ;
4693						break ;
4694
4695					// ' instruction : go to next line and display text
4696					case	"'" :
4697						// Update the coordinates of the last text block found so far
4698						$page_fragments [ $page_fragment_count - 1 ] [ 'x' ]		+=  $text_leading ;
4699						$offset								 =  $current_font_height * abs ( $Tm [3] ) ;
4700						$page_fragments [ $page_fragment_count - 1 ] [ 'y' ]		-=  $offset ;
4701
4702						// And don't forget to update the y coordinate of the current transformation matrix
4703						$Tm [5]								-=  $offset ;
4704
4705						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ "'" ] ++ ;
4706						break ;
4707
4708					// "'" instruction
4709					case	'"' :
4710						if  ( self::$DEBUG )
4711							warning ( "Instruction $token not yet implemented." ) ;
4712
4713						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '"' ] ++ ;
4714						break ;
4715
4716					// Other : ignore them
4717					default :
4718						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
4719				    }
4720
4721				$operand_stack		=  array ( ) ;
4722			    }
4723			// cm instruction
4724			else if  ( $token  ==  'cm' )
4725			   {
4726				$a		=  ( double ) $operand_stack [0] ;
4727				$b		=  ( double ) $operand_stack [1] ;
4728				$c		=  ( double ) $operand_stack [2] ;
4729				$d		=  ( double ) $operand_stack [3] ;
4730				$e		=  ( double ) $operand_stack [4] ;
4731				$f		=  ( double ) $operand_stack [5] ;
4732
4733				$CTM		=  array ( $a, $b, $c, $d, $e, $f ) ;
4734				$operand_stack	=  array ( ) ;
4735
4736				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'cm' ] ++ ;
4737			    }
4738			// q/Q instructions (save/restore graphic context)
4739			else if  ( $token  ===  'q'  )
4740			   {
4741				$graphic_stack [ $graphic_stack_size ++ ]	=  array ( $CTM, $Tm ) ;
4742				$operand_stack					=  array ( ) ;
4743			    }
4744			else if  ( $token  ===  'Q' )
4745			    {
4746				if  ( $graphic_stack_size )
4747					list ( $CTM, $Tm )				=  $graphic_stack [ -- $graphic_stack_size ] ;
4748				else if  ( self::$DEBUG )
4749					warning ( "Tried to restore graphics context from an empty stack." ) ;
4750
4751				$operand_stack					=  array ( ) ;
4752			    }
4753			// Text array in the [...] notation. Well, in fact, even non-array constructs are returned as an array by the
4754			// __next_token() function, for the sake of simplicity
4755			else if  ( $token_start  ===  '[' )
4756			   {
4757				$text			=  $this -> __decode_text ( $token, $current_font, $current_font_mapped, $current_font_map_width ) ;
4758
4759				if  ( $text  !==  '' )
4760				   {
4761					$r		=  $this -> __matrix_multiply ( $Tm, $CTM, $page_attributes [ 'width' ], $page_attributes [ 'height' ] ) ;
4762					$fragment	=  array
4763					   (
4764						'x'		=>  ( $r [4]  <  0 ) ?  0.0 : $r [4],
4765						'y'		=>  ( $r [5]  <  0 ) ?  0.0 : $r [5],
4766						'page'		=>  $page_number,
4767						'template'	=>  $current_template,
4768						'font'		=>  $current_font_name,
4769						'font-height'	=>  abs ( $current_font_height * $Tm [3] ),
4770						'text'		=>  $text,
4771					    ) ;
4772
4773					// Add debug information when needed
4774					if  ( self::$DEBUG )
4775					   {
4776						$fragment	=  array_merge
4777						   (
4778							$fragment,
4779							array
4780							   (
4781								'CTM'			=>  $CTM,
4782								'Tm'			=>  $Tm,
4783								'New Tm'		=>  $r,
4784								'Real font height'	=>  $current_font_height,
4785								'Page width'		=>  $page_attributes [ 'width' ],
4786								'Page height'		=>  $page_attributes ['height' ]
4787							    )
4788						    ) ;
4789					    }
4790
4791					// Add this text fragment to the list
4792					$page_fragments []	=  $fragment ;
4793					$page_fragment_count ++ ;
4794
4795					$operand_stack		=  array ( ) ;
4796				    }
4797			    }
4798			// BT instruction
4799			else if  ( $token  ==  'BT' )
4800			   {
4801				$BT_nesting_level ++ ;
4802				$operand_stack					=  array ( ) ;
4803				$graphic_stack [ $graphic_stack_size ++ ]	=  array ( $CTM, $Tm ) ;
4804
4805				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'BT' ] ++ ;
4806			    }
4807			// ET instruction
4808			else if  ( $token  ==  'ET' )
4809			   {
4810				if  ( $BT_nesting_level )
4811				   {
4812					$BT_nesting_level -- ;
4813
4814					if  ( ! $BT_nesting_level  &&  $graphic_stack_size )
4815					   {
4816						list ( $CTM, $Tm )	=  $graphic_stack [ -- $graphic_stack_size ] ;
4817					    }
4818
4819				    }
4820
4821				$operand_stack		=  array ( ) ;
4822			    }
4823			// Template (substituted in __next_token)
4824			else if  ( $token_start  ===  '!' )
4825			   {
4826				if  ( preg_match ( '/ !PDFTOTEXT_TEMPLATE_ (?P<template> \w+) /ix', $token, $match ) )
4827				   {
4828					$name	=  '/' . $match [ 'template' ] ;
4829					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'template' ] ++ ;
4830
4831					if  ( $this -> PageMap -> IsValidXObjectName ( $name ) )
4832						$current_template	=  $name ;
4833				    }
4834				else
4835					$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
4836
4837				$operand_stack	=  array ( ) ;
4838			    }
4839			// Other instructions
4840			else
4841			   {
4842				$operand_stack		=  array ( ) ;
4843				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'ignored' ] ++ ;
4844			    }
4845
4846			// Update current index in instruction stream
4847			$data_index	=  $next_index ;
4848		    }
4849	    }
4850
4851
4852	// __matrix_multiply -
4853	//	Multiplies matrix $ma by $mb.
4854	//	PDF transformation matrices are 3x3 matrices containing the following values :
4855	//
4856	//		| sx rx 0 |
4857	//		| ry sy 0 |
4858	//		| tx ty 1 |
4859	//
	//	However, we do not care about the 3rd column, which is always hardcoded. Transformation
	//	matrices here are implemented as 6-element arrays :
	//
	//		[ sx, rx, ry, sy, tx, ty ]
4864	private function  __matrix_multiply ( $ma, $mb, $page_width, $page_height )
4865	   {
4866		// Scaling text is only appropriate for rendering graphics ; in our case, we just have to render
4867		// basic text without any consideration about its width or height ; so adjust the sx/sy parameters
4868		// accordingly
4869		$scale_1x	=  ( $ma [0]  >  0 ) ?  1 : -1 ;
4870		$scale_1y	=  ( $ma [3]  >  0 ) ?  1 : -1 ;
4871		$scale_2x	=  ( $mb [0]  >  0 ) ?  1 : -1 ;
4872		$scale_2y	=  ( $mb [3]  >  0 ) ?  1 : -1 ;
4873
4874		// Perform the matrix multiplication
4875		$r	=  array ( ) ;
4876		$r [0]	=  ( $scale_1x * $scale_2x ) + ( $ma [1] * $mb [2]   ) ;
4877		$r [1]	=  ( $scale_1x * $mb [1]   ) + ( $ma [1] * $scale_2y ) ;
4878		$r [2]	=  ( $scale_1y * $scale_2x ) + ( $scale_1y * $mb [2]   ) ;
4879		$r [3]	=  ( $scale_1y * $mb [1]   ) + ( $scale_1y* $scale_2y ) ;
4880		$r [4]	=  ( $ma [4]   * $scale_2x ) + ( $ma [5] * $mb [2]   ) + $mb [4] ;
4881		$r [5]	=  ( $ma [4]   * $mb [1]   ) + ( $ma [5] * $scale_2y ) + $mb [5] ;
4882
4883		// Negative x/y values are expressed relative to the page width/height (???)
4884		if  ( $r [0]  <  0 )
4885			$r [4]	=  abs ( $r [4] ) ;//$page_width - $r [4] ;
4886
4887		if  ( $r [3]  <  0 )
4888			$r [5]	=  abs ( $r [5] ) ; //$page_height - $r [5] ;
4889
4890		return ( $r ) ;
4891	    }
4892
4893
4894	// __next_token_ex :
4895	//	Reviewed version of __next_token, adapted to ExtractTextWithLayout.
4896	//	Both functions will be unified when this one will be stabilized.
4897	private function  __next_token_ex ( $page_number, $data, $data_length, $index, &$token, &$next_index )
4898	   {
4899		// Skip spaces
4900		$count		=  0 ;
4901
4902		while  ( $index  <  $data_length  &&  ( $data [ $index ]  ==  ' '  ||  $data [ $index ]  ==  "\t"  ||  $data [ $index ]  ==  "\r"  ||  $data [ $index ]  ==  "\n" ) )
4903		   {
4904			$index ++ ;
4905			$count ++ ;
4906		    }
4907
4908		$enhanced_statistics	=  $this -> EnhancedStatistics ;
4909		$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ 'space' ]	+=  $count ;
4910
4911		// End of input
4912		if  ( $index  >=  $data_length )
4913			return ( false ) ;
4914
4915		// The current character will tell us what to do
4916		$ch 	=  $data [ $index ] ;
4917
4918		switch ( $ch )
4919		   {
4920			// Opening square bracket : we have to find the closing one, taking care of escape sequences
4921			// that can also specify a square bracket, such as "\]"
4922		   	case 	"[" :
4923		   		$next_index 	=  $index + 1 ;
4924		   		$parent 	=  0 ;
4925		   		$angle 		=  0 ;
4926		   		$token		=  '[' ;
4927
4928		   		while  ( $next_index  <  $data_length )
4929		   		   {
4930		   			$nch 	=  $data [ $next_index ++ ] ;
4931
4932		   			switch  ( $nch )
4933		   			   {
4934		   			   	case 	'(' :
4935		   			   		$parent ++ ;
4936		   			   		$token 	.=  $nch ;
4937		   			   		break ;
4938
4939		   			   	case 	')' :
4940		   			   		$parent -- ;
4941		   			   		$token 	.=  $nch ;
4942		   			   		break ;
4943
4944		   			   	case 	'<' :
4945							// Although the array notation can contain hex digits between angle brackets, we have to
4946							// take care that we do not have an angle bracket between two parentheses such as :
4947							// [ (<) ... ]
4948							if  ( ! $parent )
4949		   			   			$angle ++ ;
4950
4951		   			   		$token 	.=  $nch ;
4952		   			   		break ;
4953
4954		   			   	case 	'>' :
4955							if  ( ! $parent )
4956		   			   			$angle -- ;
4957
4958		   			   		$token 	.=  $nch ;
4959		   			   		break ;
4960
4961		   			   	case 	'\\' :
4962		   					$token 	.=  $nch . $data [ $next_index ++ ] ;
4963		   					break ;
4964
4965		   			   	case 	']' :
4966		   					$token 	.=  ']' ;
4967
4968		   					if  ( ! $parent  )
4969		   						break  2 ;
4970		   					else
4971		   						break ;
4972
4973						case	"\n" :
4974						case	"\r" :
4975							break ;
4976
4977		   			   	default :
4978		   			   		$token 	.=  $nch ;
4979		   			    }
4980		   		    }
4981
4982				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '[' ] ++ ;
4983
4984		   		return ( true ) ;
4985
4986			// Parenthesis : Again, we have to find the closing parenthesis, taking care of escape sequences
4987			// such as "\)"
4988		   	case 	"(" :
4989		   		$next_index 	=  $index + 1 ;
4990		   		$token		=  '[' . $ch ;
4991
4992		   		while  ( $next_index  <  $data_length )
4993		   		   {
4994		   			$nch 	=  $data [ $next_index ++ ] ;
4995
4996		   			if  ( $nch  ===  '\\' )
4997					   {
4998						$after		 =  $data [ $next_index ] ;
4999
5000						// Character references specified as \xyz, where "xyz" are octal digits
5001						if  ( $after  >=  '0'  &&  $after  <=  '7' )
5002						   {
5003							$token		.=  $nch ;
5004
5005							while  ( $data [ $next_index ]  >=  '0'  &&  $data [ $next_index ]  <=  '7' )
5006								$token		.=  $data [ $next_index ++ ] ;
5007						    }
5008						// Regular character escapes
5009						else
5010		   					$token 	.=  $nch . $data [ $next_index ++ ] ;
5011					    }
5012		   			else if  ( $nch  ===  ')' )
5013		   			   {
5014		   				$token 	.=  ')' ;
5015		   				break ;
5016		   			    }
5017		   			else
5018		   				$token 	.=  $nch ;
5019		   		   }
5020
5021				$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '(' ] ++ ;
5022				$token	.=  ']' ;
5023
5024		   		return ( true ) ;
5025
5026			// A construction of the form : "<< something >>", or a unicode character
5027		   	case 	'<' :
5028				if  ( isset ( $data [ $index + 1 ] ) )
5029				   {
5030		   			if (  $data [ $index + 1 ]  ===  '<' )
5031		   			   {
5032		   		   		$next_index 	=  strpos ( $data, '>>', $index + 2 ) ;
5033
5034		   				if  ( $next_index  ===  false )
5035		   					return ( false ) ;
5036
5037						$token		=  substr ( $data, $index, $next_index - $index + 2 ) ;
5038						$next_index    +=  2 ;
5039
5040						return ( true ) ;
5041		   			    }
5042		   			else
5043		   			   {
5044		   		   		$next_index 	=  strpos ( $data, '>', $index + 2 ) ;
5045
5046		   				if  ( $next_index  ===  false )
5047		   					return ( false ) ;
5048
5049						$enhanced_statistics  &&  $this -> Statistics [ 'Distributions' ] [ '<' ] ++ ;
5050
5051						// There can be spaces and newlines inside a series of hex digits, so remove them...
5052						$result			=  preg_replace ( '/\s+/', '', substr ( $data, $index, $next_index - $index + 1 ) ) ;
5053
5054						$token		=  "[$result]" ;
5055						$next_index ++ ;
5056
5057		   				return ( true ) ;
5058		   			   }
5059				    }
5060				else
5061					return ( false ) ;
5062
5063			// Tick character : consider it as a keyword, in the same way as the "TJ" or "Tj" keywords
5064		   	case 	"'" :
5065			case	'"' :
5066		   		$token		 =  $ch ;
5067				$next_index	+=  2 ;
5068
5069				return ( true ) ;
5070
5071			// Other cases : this may be either a floating-point number or a keyword
5072		   	default :
5073		   		$next_index	=  ++ $index ;
5074		   		$token 		=  $ch ;
5075
5076				if  ( isset ( $data [ $next_index ] ) )
5077				   {
5078		   			if ( ( $ch  >=  '0'  &&  $ch  <=  '9' )  ||  $ch  ==  '-'  ||  $ch  ==  '+'  ||  $ch  ==  '.' )
5079		   			   {
5080		   				while  ( $next_index  <  $data_length  &&
5081		   						( ( $data [ $next_index ]  >=  '0'  &&  $data [ $next_index ]  <=  '9' )  ||
5082									$data [ $next_index ]  ===  '-'  ||  $data [ $next_index ]  ===  '+'  ||  $data [ $next_index ]  ===  '.' ) )
5083		   					$token 	.=  $data [ $next_index ++ ] ;
5084		   			    }
5085		   			else if  ( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALPHA )  ||
5086							$ch  ==  '/'  ||  $ch  ==  '!' )
5087		   			   {
5088						$ch	=  $data [ $next_index ] ;
5089
5090						while  ( $next_index  <  $data_length  &&
5091							( ( self::$CharacterClasses [ $ch ] & self::CTYPE_ALNUM )  ||
5092								$ch  ==  '*'  ||  $ch  ==  '-'  ||  $ch  ==  '_'  ||  $ch  ==  '.'  ||  $ch  ==  '+' ) )
5093						   {
5094							$token 	.=  $ch ;
5095							$next_index ++ ;
5096
5097							if  ( isset ( $data [ $next_index ] ) )
5098								$ch	=  $data [ $next_index ] ;
5099						    }
5100		   			    }
5101				    }
5102
5103		   		return ( true ) ;
5104		    }
5105	    }
5106
5107
5108	// __decode_text -
5109	//	Text decoding function when the PDFOPT_BASIC_LAYOUT flag is specified.
	private function  __decode_text ( $data, $current_font, $current_font_mapped, $current_font_map_width )
	   {
		// Decodes the text items of an array-notation token and returns them as a single string.
		//
		// Parameters :
		//	$data				- token to decode, passed as is to __extract_chars_from_array()
		//	$current_font			- font identifier, used as a key for the character map buffer and
		//					  for calls to FontTable -> MapCharacter()
		//	$current_font_mapped		- true when the font has a character map
		//	$current_font_map_width		- width, in hex digits, of a character code for this font
		//
		// __extract_chars_from_array() supplies two parallel lists : the text values (each one still
		// enclosed in its "<>" or "()" delimiters) and the offsets found between them.
		list ( $text_values, $offsets ) 	=  $this -> __extract_chars_from_array ( $data ) ;
		$value_index				=  0 ;
		$result					=  '' ;

		// Fonts having character maps will require some special processing
		if  ( $current_font_mapped )
		   {
			// Loop through each text value
			foreach  ( $text_values  as  $text )
			   {
				$is_hex 	=  ( $text [0]  ==  '<' ) ;
				$length 	=  strlen ( $text ) - 1 ;	// Index of the closing delimiter
				$handled	=  false ;

				// Characters are encoded within angle brackets ( "<>" ).
				// Note that several characters can be specified within the same angle brackets, so we have to take
				// into account the width we detected in the begincodespacerange construct
				if  ( $is_hex )
				   {
					for  ( $i = 1 ; $i  <  $length ; $i += $current_font_map_width )
					   {
						$value		 =  substr ( $text, $i, $current_font_map_width ) ;
						$ch 		 =  hexdec ( $value ) ;

						// Mapped characters are cached per font and per character code
						if  ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
							$newchar	=  $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
						else
						   {
							$newchar	 =  $this -> FontTable -> MapCharacter ( $current_font, $ch ) ;
							$this -> CharacterMapBuffer [ $current_font ] [ $ch ]		=  $newchar ;
						    }

						$result		.=  $newchar ;
					    }

					$handled	 =  true ;
				    }
				// Yes ! double-byte codes can also be specified as plain text within parentheses !
				// However, we have to be really careful here ; the sequence :
				//	(Be)
				// can mean the string "Be" or the Unicode character 0x4265 ('B' = 0x42, 'e' = 0x65)
				// NOTE(review): the original comment announced a fallback to the "if ( ! $handled )" block
				// below when the character map has no entry for the 2-byte code, but this branch
				// unconditionally sets $handled to true - confirm which behavior is intended
				else if  ( $current_font_map_width  ==  4  )
				   {
					$temp_result		=  '' ;

					for  ( $i = 1 ; $i  <  $length ; $i ++ )
					   {
						// Each character in the pair may be a backslash, which escapes the next character so we must skip it
						// This code needs to be reviewed ; the same code is duplicated to handle escaped characters in octal notation
						// First byte of the pair
						if  ( $text [$i]  !=  '\\' )
							$ch1	=  $text [$i] ;
						else
						   {
							$i ++ ;

							if  ( $text [$i]  <  '0'  ||  $text [$i]  >  '7' )
								$ch1	=  $this -> ProcessEscapedCharacter ( $text [$i] ) ;
							else
							   {
								// Octal notation : collect up to 3 octal digits
								$oct		=  '' ;
								$digit_count	=  0 ;

								while  ( $i  <  $length  &&  $text [$i]  >=  '0'  &&  $text [$i]  <=  '7'  &&  $digit_count  <  3 )
								   {
									$oct	.=  $text [$i ++] ;
									$digit_count ++ ;
								    }

								$ch1	=  chr ( octdec ( $oct ) ) ;
								$i -- ;		// Step back onto the last octal digit
							    }
						    }

						$i ++ ;

						// Second byte of the pair, decoded the same way
						if  ( $text [$i]  != '\\' )
							$ch2	=  $text [$i] ;
						else
						   {
							$i ++ ;

							if  ( $text [$i]  <  '0'  ||  $text [$i]  >  '7' )
								$ch2	=  $this -> ProcessEscapedCharacter ( $text [$i] ) ;
							else
							   {
								$oct		=  '' ;
								$digit_count	=  0 ;

								while  ( $i  <  $length  &&  $text [$i]  >=  '0'  &&  $text [$i]  <=  '7'  &&  $digit_count  <  3 )
								   {
									$oct	.=  $text [$i ++] ;
									$digit_count ++ ;
								    }

								$ch2	=  chr ( octdec ( $oct ) ) ;
								$i -- ;
							    }
						    }

						// Build the 2-bytes character code
						$ch		=  ( ord ( $ch1 )  <<  8 )  |  ord ( $ch2 ) ;

						if  ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ) )
							$newchar	=  $this -> CharacterMapBuffer [ $current_font ] [ $ch ] ;
						else
						   {
							$newchar	=  $this -> FontTable -> MapCharacter ( $current_font, $ch, true ) ;
							$this -> CharacterMapBuffer [ $current_font ] [ $ch ]		=  $newchar ;
						    }

						// Yes !!! for characters encoded with two bytes, we can find the following construct :
						//	0x00 "\" "(" 0x00 "C" 0x00 "a" 0x00 "r" 0x00 "\" ")"
						// which must be expanded as : (Car)
						// We have here the escape sequences "\(" and "\)", but the backslash is encoded on two bytes
						// (although the MSB is nul), while the escaped character is encoded on 1 byte. waiting
						// for the next quirk to happen...
						// NOTE(review): the escaped character is read at offset $i + 2 while $i is only advanced
						// by one here - confirm that this offset is the intended one
						if  ( $newchar  ==  '\\' )
						   {
							$newchar		=  $this -> ProcessEscapedCharacter ( $text [ $i + 2 ] ) ;
							$i ++ ;		// this time we processed 3 bytes, not 2
						    }

						$temp_result		.=  $newchar ;
					    }

					// Append what has been decoded for this text value
					$result		.=  $temp_result ;
					$handled	 =  true ;
				    }

				// Character strings within parentheses.
				// For every text value, use the character map table for substitutions
				if  ( ! $handled )
				   {
					for  ( $i = 1 ; $i  <  $length ; $i ++ )
					   {
						$ch 		=  $text [$i] ;

						// Set to true to optimize calls to MapCharacters
						// Currently does not work with pobox@dizy.sk/infoma.pdf (a few characters differ)
						$use_map_buffer	=  false ;

						// ... but don't forget to handle escape sequences "\n" and "\r" for characters
						// 10 and 13
						if  ( $ch  ==  '\\' )
						   {
							$ch 	=  $text [++$i] ;

							// Escaped character
							if  ( $ch  <  '0'  ||  $ch  >  '7' )
								$ch		=  $this -> ProcessEscapedCharacter ( $ch ) ;
							// However, an octal form can also be specified ; in this case we have to take into account
							// the character width for the current font (if the character width is 4 hex digits, then we
							// will encounter constructs such as "\000\077").
							// The method used here is dirty : we build a regex to match octal character representations on a substring
							// of the text
							else
							   {
								$width		=  $current_font_map_width / 2 ;	// Convert to byte count
								$subtext	=  substr ( $text, $i - 1 ) ;		// Start from the backslash
								$regex		=  "#^ (\\\\ [0-7]{3}){1,$width} #imsx" ;

								$status		=  preg_match ( $regex, $subtext, $octal_matches ) ;

								if  ( $status )
								   {
									$octal_values	=  explode ( '\\', substr ( $octal_matches [0], 1 ) ) ;
									$ord		=  0 ;

									// Combine the octal values into a single code, most significant byte first
									foreach  ( $octal_values  as  $octal_value )
										$ord	=  ( $ord  <<  8 ) + octdec ( $octal_value ) ;

									$ch	 =  chr ( $ord ) ;
									// Position $i on the last character of the matched sequence
									$i	+=  strlen ( $octal_matches [0] ) - 2 ;
								    }
							    }

							$use_map_buffer		=  false ;
						    }

						// Add substituted character to the output result
						$ord		 =  ord ( $ch ) ;

						if  ( ! $use_map_buffer )
							$newchar	 =  $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
						else
						   {
							if  ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
								$newchar	=  $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
							else
							   {
								$newchar	 =  $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
								$this -> CharacterMapBuffer [ $current_font ] [ $ord ]	=  $newchar ;
							    }
						    }

						$result		.=  $newchar ;
					    }
				    }

				// Handle offsets between blocks of characters : padding is added only when the negated
				// offset exceeds the minimum space width
				// NOTE(review): the unmapped-font branch below tests abs() of the offset instead - confirm
				// whether the difference is intentional
				if  ( isset ( $offsets [ $value_index ] )  &&
						- ( $offsets [ $value_index ] )  >  $this -> MinSpaceWidth )
					$result		.=  $this -> __get_character_padding ( $offsets [ $value_index ] ) ;

				$value_index ++ ;
			    }
		    }
		// For fonts having no associated character map, we simply encode the string in UTF8
		// after the C-like escape sequences have been processed
		// Note that <xxxx> constructs can be encountered here, so we have to process them as well
		else
		   {
			foreach  ( $text_values  as  $text )
			   {
				$is_hex 	=  ( $text [0]  ==  '<' ) ;
				$length 	=  strlen ( $text ) - 1 ;

				// Some text within parentheses may have a backslash followed by a newline, to indicate some continuation line.
				// Example :
				//	(this is a sentence \
				//	 continued on the next line)
				// Funny isn't it ? so remove such constructs because we don't care
				$text		=  str_replace ( array ( "\\\r\n", "\\\r", "\\\n" ), '', $text ) ;

				// Characters are encoded within angle brackets ( "<>" ) ; they are read two hex digits at a time
				if  ( $is_hex )
				   {
					for  ( $i = 1 ; $i  <  $length ; $i += 2 )
					   {
						$ch 	=  hexdec ( substr ( $text, $i, 2 ) ) ;

						$result .=  $this -> CodePointToUtf8 ( $ch ) ;
					    }
				    }
				// Characters are plain text
				else
				   {
					$text	=  self::Unescape ( $text ) ;

					// The length is computed again here, since the text has been modified since the first computation
					for  ( $i = 1, $length = strlen ( $text ) - 1 ; $i  <  $length ; $i ++ )
					   {
						$ch	=  $text [$i] ;
						$ord	=  ord ( $ch ) ;

						// Characters below 127 are kept as is ; the other ones go through the font table
						if  ( $ord  <  127 )
							$newchar	=  $ch ;
						else
						   {
							if  ( isset ( $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ) )
								$newchar	=  $this -> CharacterMapBuffer [ $current_font ] [ $ord ] ;
							else
							   {
								$newchar	=  $this -> FontTable -> MapCharacter ( $current_font, $ord ) ;
								$this -> CharacterMapBuffer [ $current_font ] [ $ord ]	=  $newchar ;
							    }
						    }

						$result		.=  $newchar ;
					    }
				    }

				// Handle offsets between blocks of characters
				if  ( isset ( $offsets [ $value_index ] )  &&
						abs ( $offsets [ $value_index ] )  >  $this -> MinSpaceWidth )
					$result		.=  $this -> __get_character_padding ( $offsets [ $value_index ] ) ;

				$value_index ++ ;
			   }
		    }

		// All done, return
		return ( $result ) ;
	    }
5389
5390
5391	// __assemble_text_fragments -
5392	//	Assembles text fragments collected by the ExtractTextWithLayout function.
5393	private function  __assemble_text_fragments ( $page_number, &$fragments, &$page_width, &$page_height )
5394	   {
5395		$fragment_count		=  count ( $fragments ) ;
5396
5397		// No fragment no cry...
5398		if  ( ! $fragment_count )
5399			return ( '' ) ;
5400
5401		// Compute the width of each fragment
5402		foreach  ( $fragments  as  &$fragment )
5403			$this -> __compute_fragment_width ( $fragment ) ;
5404
5405		// Sort the fragments and group them by line
5406		usort ( $fragments, array ( $this, '__sort_page_fragments' ) ) ;
5407		$line_fragments		=  $this -> __group_line_fragments ( $fragments ) ;
5408
5409		// Retrieve the page attributes
5410		$page_attributes	=  $this -> PageMap -> PageAttributes [ $page_number ] ;
5411
5412		// Some buggy PDF do not specify page width or page height so, during the processing of text fragments,
5413		// page width & height will be set to the largest x/y coordinate
5414		if  ( isset ( $page_attributes [ 'width' ] )  &&  $page_attributes [ 'width' ] )
5415			$page_width		=  $page_attributes [ 'width' ] ;
5416		else
5417		   {
5418			$page_width		=  0 ;
5419
5420			foreach  ( $fragments  as  $fragment )
5421			   {
5422				$end_x	=  $fragment [ 'x' ] + $fragment [ 'width' ] ;
5423
5424				if  ( $end_x  >  $page_width )
5425					$page_width	=  $end_x ;
5426			    }
5427		    }
5428
5429		if  ( isset ( $page_attributes [ 'height' ] )  &&  $page_attributes [ 'height' ] )
5430			$page_height		=  $page_attributes [ 'height' ] ;
5431		else
5432			$page_height		=  $fragments [0] [ 'y' ] ;
5433
5434		// Block separator
5435		$separator			=  ( $this -> BlockSeparator ) ?  $this -> BlockSeparator : ' ' ;
5436
5437		// Unprocessed marker count
5438		$unprocessed_marker_count	=  count ( $this -> UnprocessedMarkerList [ 'font' ] ) ;
5439
5440		// Add page information if the PDFOPT_DEBUG_SHOW_COORDINATES option has been specified
5441		if  ( $this -> Options  &  self::PDFOPT_DEBUG_SHOW_COORDINATES )
5442			$result		=  "[Page : $page_number, width = $page_width, height = $page_height]" . $this -> EOL ;
5443		else
5444			$result			=  '' ;
5445
5446		// Loop through each line of fragments
5447		for  ( $i = 0, $line_count = count ( $line_fragments ) ; $i  <  $line_count ; $i ++ )
5448		   {
5449			$current_x	=  0 ;
5450
5451			// Loop through each fragment of the current line
5452			for  ( $j = 0, $fragment_count = count ( $line_fragments [$i] ) ; $j  <  $fragment_count ; $j ++ )
5453			   {
5454				$fragment	=  $line_fragments [$i] [$j] ;
5455
5456				// Process the markers which do not have an associated font yet - this will be done by matching
5457				// the current text fragment against one of the regular expressions defined.
5458				// If a match occurs, then all the subsequent text fragment using the same font will be put markers
5459				for  ( $k = 0 ; $k  <  $unprocessed_marker_count ; $k ++ )
5460					{
5461					$marker			=  $this -> UnprocessedMarkerList [ 'font' ] [$k] ;
5462
5463					if  ( preg_match ( $marker [ 'regex' ], $fragment [ 'text' ] ) )
5464					   {
5465						$this -> TextWithFontMarkers [ $fragment [ 'font' ] ]	=  array
5466							(
5467							'font'		=>  $fragment [ 'font'  ],
5468							'height'	=>  $fragment [ 'font-height' ],
5469							'regex'		=>  $marker   [ 'regex' ],
5470							'start'		=>  $marker   [ 'start' ],
5471							'end'		=>  $marker   [ 'end'   ]
5472							) ;
5473
5474						$unprocessed_marker_count -- ;
5475						unset ( $this -> UnprocessedMarkerList [ 'font' ] [$k] ) ;
5476
5477						break ;
5478					    }
5479					}
5480
5481				// Add debug info if needed
5482				if  ( $this -> Options & self::PDFOPT_DEBUG_SHOW_COORDINATES )
5483					$result		.=  $this -> __debug_get_coordinates ( $fragment ) ;
5484
5485				// Add a separator between two fragments, if needed
5486				if  ( $j )
5487				   {
5488					if  ( $current_x  <  floor ( $fragment [ 'x' ] ) )	// Accept small rounding errors
5489						$result		.=  $separator ;
5490				    }
5491
5492				// Check if we need to add markers around this text fragment
5493				if  ( isset ( $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] )  &&
5494						$this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'height' ]  ==  $fragment [ 'font-height' ] )
5495				    {
5496					$fragment_text	=  $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'start' ] .
5497							   $fragment [ 'text' ] .
5498							   $this -> TextWithFontMarkers [ $fragment [ 'font' ] ] [ 'end' ] ;
5499				     }
5500				else
5501					$fragment_text  =  $fragment [ 'text' ] ;
5502
5503				// Add the current fragment to the result
5504				$result		.=  $fragment_text ;
5505
5506				// Update current x-position
5507				$current_x	=  $fragment [ 'x' ] + $fragment [ 'width' ] ;
5508			    }
5509
5510			// Add a line break between each line
5511			$result		.=  $this -> EOL ;
5512		    }
5513
5514		// All done, return
5515		return ( $result ) ;
5516	    }
5517
5518
5519	// __sort_page_fragments -
5520	//	Sorts page fragments by their (y,x) coordinates.
5521	public function  __sort_page_fragments ( $a, $b )
5522	   {
5523		$xa			=  $a [ 'x' ] ;
5524		$ya			=  $a [ 'y' ] ;
5525		$xb			=  $b [ 'x' ] ;
5526		$yb			=  $b [ 'y' ] ;
5527
5528		if  ( $ya  !==  $yb )
5529			return ( $yb - $ya ) ;
5530		else
5531			return ( $xa - $xb ) ;
5532	    }
5533
5534
5535	// __sort_line_fragments -
5536	//	Sorts fragments per line.
5537	public function  __sort_line_fragments ( $a, $b )
5538	   {
5539		return ( $a [ 'x' ] - $b [ 'x' ] ) ;
5540	    }
5541
5542
5543	// __group_line_fragments -
5544	//	Groups page fragments per line, allowing a certain variation in the y-position.
5545	private function  __group_line_fragments ( $fragments )
5546	   {
5547		$result			=  array ( ) ;
5548		$fragment_count		=  count ( $fragments ) ;
5549		$last_y_coordinate	=  $fragments [0] [ 'y' ] ;
5550		$current_fragments	=  array ( $fragments [0] ) ;
5551
5552		for  ( $i = 1 ; $i  <  $fragment_count ; $i ++ )
5553		   {
5554			$fragment	=  $fragments [$i] ;
5555
5556			if  ( $fragment [ 'y' ] + $fragment [ 'font-height' ]  >=  $last_y_coordinate )
5557				$current_fragments []	=  $fragment ;
5558			else
5559			   {
5560				$last_y_coordinate	=  $fragment [ 'y' ] ;
5561				usort ( $current_fragments, array ( $this, '__sort_line_fragments' ) ) ;
5562				$result	[]		=  $current_fragments ;
5563				$current_fragments	=  array ( $fragment ) ;
5564			    }
5565		    }
5566
5567		if  ( count ( $current_fragments ) )
5568		   {
5569			usort ( $current_fragments, array ( $this, '__sort_line_fragments' ) ) ;
5570			$result []	=  $current_fragments ;
5571		    }
5572
5573		return ( $result ) ;
5574	    }
5575
5576
5577	// __compute_fragment_width -
5578	//	Compute the width of the specified text fragment and add the width entry accordingly.
5579	//	Returns the font object associated with this fragment
5580	private function  __compute_fragment_width ( &$fragment )
5581	   {
5582		// To avoid repeated calls to the PdfTexterFontTable::GetFontObject() method, we are buffering them in the FontObjectsBuffer property.
5583		$object_reference	=  $fragment [ 'page' ] . ':' . $fragment [ 'template' ] . ':' . $fragment [ 'font' ] ;
5584
5585		if  ( isset ( $this -> FontObjectsBuffer [ $object_reference ] ) )
5586			$font_object	=  $this -> FontObjectsBuffer [ $object_reference ] ;
5587		else
5588		   {
5589			$font_object		=  $this -> FontTable -> GetFontObject ( $fragment [ 'page' ], $fragment [ 'template' ], $fragment [ 'font' ] ) ;
5590			$this -> FontObjectsBuffer [ $object_reference ]	=  $font_object ;
5591		    }
5592
5593		// The width of the previous text fragment will be computed only if its associated font contains character widths information
5594		$fragment [ 'width' ]		=  ( $font_object ) ?  $font_object -> GetStringWidth ( $fragment [ 'text' ], $this -> ExtraTextWidth ) : 0 ;
5595
5596		// Return the font object
5597		return ( $font_object ) ;
5598	    }
5599
5600
5601	// __debug_get_coordinates -
5602	//	Returns the coordinates of the specified text fragment, in debug mode.
5603	private function  __debug_get_coordinates ( $fragment )
5604	   {
5605		return ( "\n[x:" . round ( $fragment [ 'x' ], 3 ) . ', y:' . round ( $fragment [ 'y' ], 3 ) .
5606			 ", w: " . round ( $fragment [ 'width' ], 3 ) . ", h:" . round ( $fragment [ 'font-height' ], 3 ) . ", font:" . $fragment [ 'font' ] . "]" ) ;
5607	    }
5608
5609
5610	/*--------------------------------------------------------------------------------------------------------------
5611
5612	    NAME
5613	        GetTrailerInformation - Retrieves trailer information.
5614
5615	    PROTOTYPE
	        $this -> GetTrailerInformation ( $contents, $pdf_objects ) ;
5617
5618	    DESCRIPTION
5619	        Retrieves trailer information :
5620		- Unique file ID
5621		- Id of the object containing encryption data, if the PDF file is encrypted
5622		- Encryption data
5623
	    PARAMETERS
	        $contents (string) -
	                PDF file contents.

	        $pdf_objects (array) -
	                Array whose keys are the PDF object ids, and values their corresponding contents.
5627
5628	 *-------------------------------------------------------------------------------------------------------------*/
5629	protected function  GetTrailerInformation ( $contents, $pdf_objects )
5630	   {
5631		// Be paranoid : check if there is trailer information
5632		if  ( ! preg_match ( '/trailer \s* << (?P<trailer> .+?) >>/imsx', $contents, $trailer_match ) )
5633			return ;
5634
5635		$trailer_data	=  $trailer_match [ 'trailer' ] ;
5636
5637		// Get the unique file id from the trailer data
5638		static		$id_regex	=  '#
5639							/ID \s* \[ \s*
5640							< (?P<id1> [^>]+) >
5641							\s*
5642							< (?P<id2> [^>]+) >
5643							\s* \]
5644						    #imsx' ;
5645
5646		if  ( preg_match ( $id_regex, $trailer_data, $id_match ) )
5647		   {
5648			$this -> ID	=  $id_match [ 'id1' ] ;
5649			$this -> ID2	=  $id_match [ 'id2' ] ;
5650		    }
5651
5652		// If there is an object describing encryption data, get its number (/Encrypt flag)
5653		if (  ! preg_match ( '#/Encrypt \s+ (?P<object> \d+)#ix', $trailer_data, $encrypt_match ) )
5654			return ;
5655
5656		$encrypt_object_id	=  $encrypt_match [ 'object' ] ;
5657
5658		if  ( ! isset ( $pdf_objects [ $encrypt_object_id ] ) )
5659		   {
5660			if  ( self::$DEBUG )
5661				error ( new PdfToTextDecodingException ( "Object #$encrypt_object_id, which should contain encryption data, is missing." ) ) ;
5662
5663			return ;
5664		    }
5665
5666		// Parse encryption information
5667		$this -> EncryptionData		=  PdfEncryptionData::GetInstance ( $this -> ID, $encrypt_object_id, $pdf_objects [ $encrypt_object_id ] ) ;
5668		$this -> IsEncrypted		=  ( $this -> EncryptionData  !==  false ) ;
5669	    }
5670
5671
	// __build_ignored_instructions :
	//	Takes the template regular expressions from self::$IgnoredInstructionTemplatesLayout and
	//	self::$IgnoredInstructionTemplatesNoLayout, replaces each string found in the keys of the self::$ReplacementConstructs
	//	array with its corresponding value, and appends the resulting regular expressions to self::$IgnoredInstructionsLayout
	//	and self::$IgnoredInstructionsNoLayout. These expressions match the Postscript instructions to be removed from any text stream.
5676	private function  __build_ignored_instructions ( )
5677	   {
5678		$searches	=  array_keys   ( self::$ReplacementConstructs ) ;
5679		$replacements	=  array_values ( self::$ReplacementConstructs ) ;
5680
5681		foreach  ( self::$IgnoredInstructionTemplatesLayout  as  $template )
5682		   {
5683			$template	=  '/' . str_replace ( $searches, $replacements, $template ) . '/msx' ;
5684
5685			self::$IgnoredInstructionsLayout []	=  $template ;
5686			self::$IgnoredInstructionsNoLayout []	=  $template ;
5687		    }
5688
5689		foreach  ( self::$IgnoredInstructionTemplatesNoLayout  as  $template )
5690		   {
5691			$template	=  '/' . str_replace ( $searches, $replacements, $template ) . '/msx' ;
5692
5693			self::$IgnoredInstructionsNoLayout []	=  $template ;
5694		    }
5695	    }
5696
5697
5698	// __convert_utf16 :
5699	//	Some strings found in a pdf file can be encoded in UTF16 (author information, for example).
5700	//	When this is the case, the string is converted to UTF8.
5701	private function  __convert_utf16 ( $text )
5702	   {
5703		if  ( isset ( $text [0] )  &&  isset ( $text [1] ) )
5704		   {
5705			$b1	=  ord ( $text [0] ) ;
5706			$b2	=  ord ( $text [1] ) ;
5707
5708			if  ( ( $b1  ==  0xFE  &&  $b2  ==  0xFF )  ||  ( $b1  ==  0xFF  &&  $b2  ==  0xFE ) )
5709				$text	=  mb_convert_encoding ( $text, 'UTF-8', 'UTF-16' ) ;
5710		    }
5711
5712		return ( $text ) ;
5713	    }
5714
5715
5716	// __extract_chars_from_array -
5717	//	Extracts characters enclosed either within parentheses (character codes) or angle brackets (hex value)
5718	//	from an array.
5719	//	Example :
5720	//
5721	//		[<0D>-40<02>-36<03>-39<0E>-36<0F>-36<0B>-37<10>-37<10>-35(abc)]
5722	//
5723	// 	will return an array having the following entries :
5724	//
5725	//		<0D>, <02>, <03>, <0E>, <0F>, <0B>, <10>, <10>, (abc)
	// Returns an array of two items :
	// - Item 0 : the list of extracted strings, each one including its enclosing delimiters ("(...)" or "<...>")
	// - Item 1 : the list of numeric values found between the strings, converted to doubles
	// The first and last characters of $array (the enclosing brackets in the example above) are not scanned.
	private function  __extract_chars_from_array ( $array )
	   {
		// $length is the index of the last character, which is excluded from the scan
		$length 	=  strlen ( $array ) - 1 ;
		$result 	=  array ( ) ;
		$offsets	=  array ( ) ;

		for  ( $i = 1 ; $i  <  $length ; $i ++ )	// Start with character right after the opening bracket
		   {
		   	$ch 	=  $array [$i] ;

			// Determine the closing delimiter that matches the opening one
			if  ( $ch  ==  '(' )
				$endch 	=  ')' ;
			else if  ( $ch  ==  '<' )
				$endch 	=  '>' ;
			// Anything else : collect a run of digits, signs and decimal points
			else
			   {
				$value	=  '' ;

				while  ( $i  <  $length  &&  ( ( $array [$i]  >=  '0'  &&  $array [$i]  <=  '9' )  ||
						$array [$i]  ==  '-'  ||  $array [$i]  ==  '+'  ||  $array [$i]  ==  '.' ) )
					$value	.=  $array [$i++] ;

				// Note that an entry is added even when no numeric character was collected (a space, for example) :
				// the empty string is then converted to 0
				$offsets []	=  ( double ) $value ;

				// The loop above stopped one character past the number : step back, since the "for" loop
				// will increment $i again
				if  ( $value  !==  '' )
					$i -- ;

				continue ;
			    }

			// Collect the string contents, starting with the opening delimiter
			$char 	=  $ch ;
			$i ++ ;

			while  ( $i  <  $length  &&  $array [$i]  !=  $endch )
			   {
				// A backslash and the character that follows it are copied as is, so that an escaped
				// closing delimiter does not end the string
			   	if  ( $array [$i]  ==  '\\' )
			   		$char 	.=  '\\' . $array [++$i] ;
				else
				   {
					$char 	.=  $array [$i] ;

					// This test cannot succeed, since the loop condition already excludes the closing delimiter
					if  ( $array [$i]  ==  $endch )
						break ;
				    }

				$i ++ ;
			   }

			// The closing delimiter is always appended, even if the end of the array was reached before finding it
			$result [] 	 =  $char . $endch ;
		    }

		return ( array ( $result, $offsets ) ) ;
	    }
5779
5780
5781	// __extract_chars_from_block -
5782	//	Extracts characters from a text block (enclosed in parentheses).
5783	//	Returns an array of character ordinals if the $as_array parameter is true, or a string if false.
5784	private function  __extract_chars_from_block ( $text, $start_index = false, $length = false, $as_array = false )
5785	   {
5786		if  ( $as_array )
5787			$result		=  array ( ) ;
5788		else
5789			$result		=  '' ;
5790
5791		if  ( $start_index  ===  false )
5792			$start_index	=  0 ;
5793
5794		if  ( $length  ===  false )
5795			$length		=  strlen ( $text ) ;
5796
5797		$ord0	=  ord ( '0' ) ;
5798
5799		for  ( $i = $start_index ; $i  <  $length ; $i ++ )
5800		   {
5801			$ch	=  $text [$i] ;
5802
5803			if  ( $ch  ==  '\\' )
5804			   {
5805				if  ( isset ( $text [ $i + 1 ] ) )
5806				   {
5807					$ch2	=  $text [ ++$i ] ;
5808
5809					switch  ( $ch2 )
5810					   {
5811						case  'n' :  $ch =  "\n" ; break ;
5812						case  'r' :  $ch =  "\r" ; break ;
5813						case  't' :  $ch =  "\t" ; break ;
5814						case  'f' :  $ch =  "\f" ; break ;
5815						case  'v' :  $ch =  "\v" ; break ;
5816
5817						default :
5818							if  ( $ch2  >=  '0'  &&  $ch2  <=  '7' )
5819							   {
5820								$ord	=  $ch2 - $ord0 ;
5821								$i ++ ;
5822
5823								while  ( isset ( $text [$i] )  &&  $text [$i]  >=  '0'  &&  $text [$i]  <=  '7' )
5824								   {
5825									$ord	=  ( $ord * 8 ) + ord ( $text [$i] ) - $ord0 ;
5826									$i ++ ;
5827								    }
5828
5829								$ch	=  chr ( $ord ) ;
5830								$i -- ;
5831							    }
5832							else
5833								$ch	=  $ch2 ;
5834
5835					    }
5836				    }
5837			    }
5838
5839			if  ( $as_array )
5840				$result []	 =  ord ( $ch ) ;
5841			else
5842				$result		.=  $ch ;
5843		    }
5844
5845		return ( $result ) ;
5846	    }
5847
5848
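	// __get_character_padding :
	//	If the offset specified between two character groups in an array notation for displaying text is less
	//	than or equal to -MinSpaceWidth thousands of text units, returns the separator string to be inserted between
	//	them (repeated if the PDFOPT_REPEAT_SEPARATOR option is set). Otherwise, returns an empty string.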
5852	private function  __get_character_padding ( $char_offset )
5853	   {
5854		if  ( $char_offset  <=  - $this -> MinSpaceWidth )
5855		   {
5856			if  ( $this -> Options & self::PDFOPT_REPEAT_SEPARATOR )
5857			   {
5858				// If the MinSpaceWidth property is less than 1000 (text units), consider it has the value 1000
5859				// so that an exuberant number of spaces will not be repeated
5860				$space_width	=  ( $this -> MinSpaceWidth  <  1000 ) ?  1000 :  $this -> MinSpaceWidth ;
5861
5862				$repeat_count	=  abs ( round ( $char_offset / $space_width, 0 ) ) ;
5863
5864				if  ( $repeat_count )
5865					$padding	=  str_repeat ( $this -> Separator, $repeat_count ) ;
5866				else
5867					$padding	=  $this -> Separator ;
5868				}
5869			else
5870				$padding	=  $this -> Separator ;
5871
5872			return ( utf8_encode ( self::Unescape ( $padding ) ) ) ;
5873		    }
5874		else
5875			return ( '' ) ;
5876	    }
5877
5878
	// __get_output_image_filename -
	//	Returns a real filename based on a template supplied by the ImageAutoSaveFileTemplate property.
5881	private function  __get_output_image_filename ( )
5882	   {
5883		static		$suffixes	=  array
5884		   (
5885			IMG_JPEG		=>  'jpg',
5886			IMG_JPG			=>  'jpg',
5887			IMG_GIF			=>  'gif',
5888			IMG_PNG			=>  'png',
5889			IMG_WBMP		=>  'wbmp',
5890			IMG_XPM			=>  'xpm'
5891		    ) ;
5892
5893		$template	=  $this -> ImageAutoSaveFileTemplate ;
5894		$length		=  strlen ( $template ) ;
5895		$parts		=  pathinfo ( $this -> Filename ) ;
5896
5897		if  ( ! isset ( $parts [ 'filename' ] ) )	// for PHP versions < 5.2
5898		   {
5899			$index		=  strpos ( $parts [ 'basename' ], '.' ) ;
5900
5901			if  ( $index  ===  false )
5902				$parts [ 'filename' ]	=  $parts [ 'basename' ] ;
5903			else
5904				$parts [ 'filename' ]	=  substr ( $parts [ 'basename' ], $index ) ;
5905		    }
5906
5907		$searches	=  array ( ) ;
5908		$replacements	=  array ( ) ;
5909
5910		// Search for each construct starting with '%'
5911		for  ( $i = 0 ; $i  <  $length ; $i ++ )
5912		   {
5913			if  ( $template [$i]  !=  '%'  ||  $i + 1  >=  $length )
5914				continue ;
5915
5916			$ch	=  $template [ ++ $i ] ;
5917
5918			// Percent sign found : check the character after
5919			switch  ( $ch )
5920			   {
5921				// "%%" : Replace it with a single percent
5922				case	'%' :
5923					$searches []		=  '%%' ;
5924					$replacements []	=  '%' ;
5925					break ;
5926
5927				// "%p" : Path of the original PDF file
5928				case	'p' :
5929					$searches []		=  '%p' ;
5930					$replacements []	=  $parts [ 'dirname' ] ;
5931					break ;
5932
5933				// "%f" : Filename part of the original PDF file, without its suffix
5934				case	'f' :
5935					$searches []		=  '%f' ;
5936					$replacements []	=  $parts [ 'filename' ] ;
5937					break ;
5938
5939				// "%s" : Output image file suffix, determined by the ImageAutoSaveFormat property
5940				case	's' :
5941					if  ( isset ( $suffixes [ $this -> ImageAutoSaveFormat ] ) )
5942					   {
5943						$searches []		=  '%s' ;
5944						$replacements []	=  $suffixes [ $this -> ImageAutoSaveFormat ] ;
5945					    }
5946					else
5947					   {
5948						$searches []		=  '%s' ;
5949						$replacements []	=  'unknown' ;
5950					    }
5951
5952					break ;
5953
5954				// Other : may be either "%d", or "%xd", where "x" are digits expression the width of the final sequential index
5955				default :
5956					$width	=  0 ;
5957					$chars	=  '' ;
5958
5959					if  ( ctype_digit ( $ch ) )
5960					   {
5961						do
5962						   {
5963							$width	 =  ( $width * 10 ) + ord ( $ch ) - ord ( '0' ) ;
5964							$chars  .=  $ch ;
5965							$i ++ ;
5966						    }  while  ( $i  <  $length  &&  ctype_digit ( $ch = $template [$i] ) ) ;
5967
5968						if  ( $template [$i]  ==  'd' )
5969						   {
5970							$searches []		=  '%' . $chars . 'd' ;
5971							$replacements []	=  sprintf ( "%0{$width}d", $this -> ImageCount ) ;
5972						    }
5973					    }
5974					else
5975					   {
5976						$searches []		=  '%d' ;
5977						$replacements []	=  $this -> ImageCount ;
5978					    }
5979			    }
5980		    }
5981
5982		// Perform the replacements
5983		if  ( count ( $searches ) )
5984			$result		=  str_replace ( $searches, $replacements, $template ) ;
5985		else
5986			$result		=  $template ;
5987
5988		// All done, return
5989		return ( $result ) ;
5990	    }
5991
5992
5993	// __rtl_process -
5994	//	Processes the contents of a page when it contains characters belonging to an RTL language.
5995	private function  __rtl_process ( $text )
5996	   {
5997		$length		=  strlen ( $text ) ;
5998		$pos		=  strcspn ( $text, self::$RtlCharacterPrefixes ) ;
5999
6000		// The text does not contain any of the UTF-8 prefixes that may introduce RTL contents :
6001		// simply return it as is
6002		if   ( $pos  ==  $length  ||  $text [$pos]  ===  "\x00" )
6003			return ( $text ) ;
6004
6005		// Extract each individual line, and get rid of carriage returns if any
6006		$lines		=  explode ( "\n", str_replace ( "\r", '', $text ) ) ;
6007		$new_lines	=  array ( ) ;
6008
6009		// Loop through lines
6010		foreach  ( $lines  as  $line )
6011		   {
6012			// Check if the current line contains potential RTL characters
6013			$pos		=  strcspn ( $line, self::$RtlCharacterPrefixes ) ;
6014			$length		=  strlen ( $line ) ;
6015
6016			// If not, simply store it as is
6017			if  ( $pos  ==  $length )
6018			   {
6019				$new_lines []	=  $line ;
6020				continue ;
6021			    }
6022
6023			// Otherwise, it gets a little bit more complicated ; we have :
6024			// - To process each series of RTL characters and put them in reverse order
6025			// - Mark spaces and punctuation as "RTL separators", without reversing them (ie, a string like " ." remains " .", not ". ")
6026			// - Other sequences of non-RTL characters must be preserved as is and are not subject to reordering
6027			// The reordering sequence will be described later. For the moment, the $words array is used to store arrays of two elements :
6028			// - The first one is a boolean indicating whether it concerns RTL characters (true) or not (false)
6029			// - The second one is the string itself
6030			$words		=  array ( ) ;
6031
6032			// Start of the string is not an RTL sequence ; we can add it to our $words array
6033			if  ( $pos )
6034			   {
6035				$word		=  substr ( $line, 0, $pos ) ;
6036				$words []	=  array ( $this -> __is_rtl_separator ( $word ), $word ) ;
6037			    }
6038
6039			$in_rtl		=  true ;
6040
6041			// Loop through remaining characters of the current line
6042			while  ( $pos  <  $length )
6043			   {
6044				// Character at the current position may be RTL character
6045				if  ( $in_rtl )
6046				   {
6047
6048					$rtl_text		=  '' ;
6049					$rtl_char		=  '' ;
6050					$rtl_char_length	=  0 ;
6051					$found_rtl		=  false ;
6052
6053					// Collect all the consecutive RTL characters, which represent a word, and put the letters in reverse order
6054					while  ( $pos  <  $length  &&  $this -> __is_rtl_character ( $line, $pos, $rtl_char, $rtl_char_length ) )
6055					   {
6056						$rtl_text	 =  $rtl_char . $rtl_text ;
6057						$pos		+=  $rtl_char_length ;
6058						$found_rtl	 =  true ;
6059					    }
6060
6061					// ... but make sure that we found a valid RTL sequence
6062					if  ( $found_rtl )
6063						$words []	 =  array ( true, $rtl_text ) ;
6064					else
6065						$words []	 =  array ( false, $line [ $pos ++ ] ) ;
6066
6067					// For now, we are no more in a series of RTL characters
6068					$in_rtl		=  false ;
6069				    }
6070				// Non-RTL characters : collect them until either the end of the current line or the next RTL character
6071				else
6072				   {
6073					$next_pos	=  $pos + strcspn ( $line, self::$RtlCharacterPrefixes, $pos ) ;
6074
6075					if  ( $next_pos  >=  $length )
6076					   {
6077						$word		=  substr ( $line, $pos ) ;
6078						break ;
6079					    }
6080					else
6081					   {
6082						$word		=  substr ( $line, $pos, $next_pos - $pos ) ;
6083						$pos		=  $next_pos ;
6084						$in_rtl		=  true ;
6085					    }
6086
6087					// Don't forget to make the distinction between a sequence of spaces and punctuations, and a real
6088					// piece of text. Space/punctuation strings surrounded by RTL words will be interverted
6089					$words []		=  array ( $this -> __is_rtl_separator ( $word ), $word ) ;
6090				    }
6091			    }
6092
6093			// Now we have an array, $words, whose first entry of each element indicates whether the second entry is an RTL string
6094			// or not (this includes strings that contain only spaces and punctuation).
6095			// We have to gather all the consecutive array items whose first entry is true, then invert their order.
6096			// Non-RTL strings are not affected by this process.
6097			$stacked_rtl_words	=  array ( ) ;
6098			$new_words		=  array ( ) ;
6099
6100			foreach  ( $words  as  $word )
6101			   {
6102				// RTL word : put it onto the stack
6103				if  ( $word [0] )
6104					$stacked_rtl_words []	=  $word [1] ;
6105				// Non-RTL word : add it as is to the output array, $new_words
6106				else
6107				   {
6108					// But if RTL words were stacked before, invert them and add them to the output array
6109					if  ( count ( $stacked_rtl_words ) )
6110					   {
6111						$new_words		=  array_merge ( $new_words, array_reverse ( $stacked_rtl_words ) ) ;
6112						$stacked_rtl_words	=  array ( ) ;
6113					    }
6114
6115					$new_words []	=  $word [1] ;
6116				    }
6117			    }
6118
6119			// Process any remaining RTL words that may have been stacked and not yet processed
6120			if  ( count ( $stacked_rtl_words ) )
6121				$new_words		=  array_merge ( $new_words, array_reverse ( $stacked_rtl_words ) ) ;
6122
6123			// That's ok, we have processed one more line
6124			$new_lines []	=  implode ( '', $new_words ) ;
6125		    }
6126
6127		// All done, return a catenation of all the lines processed so far
6128		$result		=  implode ( "\n", $new_lines ) ;
6129
6130		return ( $result ) ;
6131	    }
6132
6133
6134	// __is_rtl_character -
6135	//	Checks if the sequence starting at $pos in string $text is a character belonging to an RTL language.
6136	//	If yes, returns true and sets $rtl_char to the UTF8 string sequence for that character, and $rtl_char_length
6137	//	to the length of this string.
6138	//	If no, returns false.
6139	private function  __is_rtl_character ( $text, $pos, &$rtl_char, &$rtl_char_length )
6140	   {
6141		$ch	=  $text [ $pos ] ;
6142
6143		// Check that the current character is the start of a potential UTF8 RTL sequence
6144		if  ( isset  ( self::$RtlCharacterPrefixLengths [ $ch ] ) )
6145		   {
6146			// Get the number of characters that are expected after the sequence
6147			$length_after	=  self::$RtlCharacterPrefixLengths [ $ch ] ;
6148
6149			// Get the sequence after the UTF8 prefix
6150			$codes_after	=  substr ( $text, $pos + 1, $length_after ) ;
6151
6152			// Search through $RtlCharacters, which contains arrays of ranges related to the UTF8 character prefix
6153			foreach  ( self::$RtlCharacters [ $ch ]  as  $range )
6154			   {
6155				if  ( strcmp ( $range [0], $codes_after )  <=  0  &&
6156				      strcmp ( $range [1], $codes_after )  >=  0 )
6157				   {
6158					$rtl_char		=  $ch . $codes_after ;
6159					$rtl_char_length	=  $length_after + 1 ;
6160
6161					return ( true ) ;
6162				}
6163			    }
6164
6165			return ( false ) ;
6166		    }
6167		else
6168			return ( false ) ;
6169	    }
6170
6171
6172	// __is_rtl_separator -
6173	//	RTL words are separated by spaces and punctuation signs that are specified as LTR characters.
6174	//	However, such sequences, which are separators between words, must be considered as being part
6175	//	of an RTL sequence of words and therefore be reversed with them.
6176	//	This function helps to determine if the supplied string is simply a sequence of spaces and
6177	//	punctuation (a word separator) or plain text, that must keep its position in the line.
6178	private function  __is_rtl_separator ( $text )
6179	   {
6180		static		$known_separators	=  array ( ) ;
6181		static		$separators		=  " \t,.;:/!-_=+" ;
6182
6183		if  ( isset ( $known_separators [ $text ] ) )
6184			return ( true ) ;
6185
6186		for  ( $i = 0, $length = strlen ( $text ) ; $i  <  $length ; $i ++ )
6187		   {
6188			if  ( strpos ( $separators, $text [$i] )  ===  false )
6189				return ( false ) ;
6190		    }
6191
6192		$known_separators [ $text ]	=  true ;
6193
6194		return ( true ) ;
6195	    }
6196
6197
6198	// __strip_useless_instructions :
6199	//	Removes from a text stream all the Postscript instructions that are not meaningful for text extraction
6200	//	(these are mainly shape drawing instructions).
6201	private function  __strip_useless_instructions ( $data )
6202	   {
6203		$result		=  preg_replace ( $this -> IgnoredInstructions, ' ', $data ) ;
6204
6205		$this -> Statistics [ 'TextSize' ]		+=  strlen ( $data ) ;
6206		$this -> Statistics [ 'OptimizedTextSize' ]	+=  strlen ( $result ) ;
6207
6208		return ( $result ) ;
6209	    }
6210
6211
6212	/*--------------------------------------------------------------------------------------------------------------
6213
6214	    NAME
6215	        IsPageSelected - Checks if a page is selected for output.
6216
6217	    PROTOTYPE
6218	        $status		=  $this -> IsPageSelected ( $page ) ;
6219
6220	    DESCRIPTION
6221	        Checks if the specified page is to be selected for output.
6222
6223	    PARAMETERS
6224	        $page (integer) -
6225	                Page to be checked.
6226
6227	    RETURN VALUE
6228	        True if the page is to be selected for output, false otherwise.
6229
6230	 *-------------------------------------------------------------------------------------------------------------*/
6231	protected function  IsPageSelected ( $page )
6232	   {
6233		if  ( ! $this -> MaxSelectedPages )
6234			return ( true ) ;
6235
6236		if  ( $this -> MaxSelectedPages  >  0 )
6237			return  ( $page  <=  $this -> MaxSelectedPages ) ;
6238
6239		// MaxSelectedPages  <  0
6240		return ( $page  >  count ( $this -> PageMap -> Pages ) + $this -> MaxSelectedPages ) ;
6241	    }
6242
6243
6244	/*--------------------------------------------------------------------------------------------------------------
6245
6246	    NAME
6247	        PeekAuthorInformation - Gets author information from the specified object data.
6248
6249	    PROTOTYPE
6250	        $this -> PeekAuthorInformation ( $object_id, $object_data ) ;
6251
6252	    DESCRIPTION
6253	        Try to check if the specified object data contains author information (ie, the /Author, /Creator,
6254		/Producer, /ModDate, /CreationDate keywords) and sets the corresponding properties accordingly.
6255
6256	    PARAMETERS
6257	    	$object_id (integer) -
6258	    		Object id of this text block.
6259
6260	    	$object_data (string) -
6261	    		Stream contents.
6262
6263	 *-------------------------------------------------------------------------------------------------------------*/
6264	protected function  PeekAuthorInformation ( $object_id, $object_data )
6265	   {
6266		if  ( ( strpos  ( $object_data, '/Author' )  !==  false  ||  strpos ( $object_data, '/CreationDate' )  !==  false ) )
6267		   {
6268			$this -> GotAuthorInformation	=  true ;
6269			return ( $object_id ) ;
6270		    }
6271		else
6272			return ( false ) ;
6273	    }
6274
6275
6276	/*--------------------------------------------------------------------------------------------------------------
6277
6278	    NAME
6279	        RetrieveAuthorInformation - Extracts author information
6280
6281	    PROTOTYPE
	        $this -> RetrieveAuthorInformation ( $object_id, $pdf_objects ) ;
6283
6284	    DESCRIPTION
6285	        Extracts the author information. Handles the case where flag values refer to existing objects.
6286
6287	    PARAMETERS
6288	        $object_id (integer) -
6289	                Id of the object containing the author information.
6290
6291		$pdf_objects (array) -
6292			Array whose keys are the PDF object ids, and values their corresponding contents.
6293
6294	 *-------------------------------------------------------------------------------------------------------------*/
6295	protected function  RetrieveAuthorInformation ( $object_id, $pdf_objects )
6296	   {
6297		static		$re		=  '#
6298							(?P<info>
6299								/
6300								(?P<keyword> (Author) | (Creator) | (Producer) | (Title) | (CreationDate) | (ModDate) | (Keywords) | (Subject) )
6301								\s*
6302								(?P<opening> [(<])
6303							)
6304						    #imsx' ;
6305		static		$object_re	=  '#
6306							(?P<info>
6307								/
6308								(?P<keyword> (Author) | (Creator) | (Producer) | (Title) | (CreationDate) | (ModDate) | (Keywords) | (Subject) )
6309								\s*
6310								(?P<object_ref>
6311									(?P<object> \d+)
6312									\s+
6313									\d+
6314									\s+
6315									R
6316								 )
6317							)
6318						    #imsx' ;
6319
6320		// Retrieve the object data corresponding to the specified object id
6321		$object_data	=  $pdf_objects [ $object_id ] ;
6322
6323		// Pre-process flags whose values refer to existing objects
6324		if  ( preg_match_all ( $object_re, $object_data, $object_matches ) )
6325		   {
6326			$searches		=  array ( ) ;
6327			$replacements		=  array ( ) ;
6328
6329			for  ( $i = 0, $count = count ( $object_matches [ 'keyword' ] ) ; $i  <  $count ; $i ++ )
6330			   {
6331				$searches []		=  $object_matches [ 'object_ref' ] [$i] ;
6332
6333				// Some buggy PDF may reference author information objects that do not exist
6334				$replacements []	=  isset ( $pdf_objects [ $object_matches [ 'object' ] [$i] ] ) ?
6335								trim ( $pdf_objects [ $object_matches [ 'object' ] [$i] ] ) : '' ;
6336			    }
6337
6338			$object_data	=  str_replace ( $searches, $replacements, $object_data ) ;
6339		    }
6340
6341
6342		// To execute faster, run the regular expression only if the object data contains a /Author keyword
6343		if  ( preg_match_all ( $re, $object_data, $matches, PREG_OFFSET_CAPTURE ) )
6344		   {
6345			for  ( $i = 0, $count = count ( $matches [ 'keyword' ] ) ; $i  <  $count ; $i ++ )
6346			   {
6347				$keyword	=  $matches [ 'keyword' ] [$i] [0] ;
6348				$opening	=  $matches [ 'opening' ] [$i] [0] ;
6349				$start_index	=  $matches [ 'info'    ] [$i] [1] + strlen ( $matches [ 'info' ] [$i] [0] ) ;
6350
6351				// Text between parentheses : the text is written as is
6352				if  ( $opening  ==  '(' )
6353				   {
6354					$parent_level	=  1 ;
6355
6356					// Since the parameter value can contain any character, including "\" or "(", we will have to find the real closing
6357					// parenthesis
6358					$value		=  '' ;
6359
6360					for  ( $j = $start_index, $object_length = strlen ( $object_data ) ; $j  <  $object_length ; $j ++ )
6361					   {
6362						if  ( $object_data [$j]  ==  '\\' )
6363							$value	.=  '\\' . $object_data [++$j] ;
6364						else if  ( $object_data [$j]  ==  '(' )
6365						   {
6366							$value	.=  '(' ;
6367							$parent_level ++ ;
6368						    }
6369						else if  ( $object_data [$j]  ==  ')' )
6370						   {
6371							$parent_level -- ;
6372
6373							if  ( ! $parent_level )
6374								break ;
6375							else
6376								$value	.=  ')' ;
6377						    }
6378						else
6379							$value  .=  $object_data [$j] ;
6380					    }
6381				     }
6382				// Text within angle brackets, written as hex digits
6383				else
6384				   {
6385					$end_index	=  strpos ( $object_data, '>', $start_index ) ;
6386					$hexdigits	=  substr ( $object_data, $start_index, $end_index - $start_index ) ;
6387					$value		=  hex2bin ( str_replace ( array ( "\n", "\r", "\t" ), '', $hexdigits ) ) ;
6388				    }
6389
6390				$value		=   $this -> __convert_utf16 ( $this -> __extract_chars_from_block ( $value ) ) ;
6391
6392				switch ( strtolower ( $keyword ) )
6393				   {
6394					case  'author'		:  $this -> Author			=  $value ; break ;
6395					case  'creator'		:  $this -> CreatorApplication		=  $value ; break ;
6396					case  'producer'	:  $this -> ProducerApplication		=  $value ; break ;
6397					case  'title'		:  $this -> Title			=  $value ; break ;
6398					case  'keywords'	:  $this -> Keywords			=  $value ; break ;
6399					case  'subject'		:  $this -> Subject			=  $value ; break ;
6400					case  'creationdate'	:  $this -> CreationDate		=  $this -> GetUTCDate ( $value ) ; break ;
6401					case  'moddate'		:  $this -> ModificationDate		=  $this -> GetUTCDate ( $value ) ; break ;
6402				    }
6403			    }
6404
6405			if  ( self::$DEBUG )
6406			   {
6407		   		echo "\n----------------------------------- AUTHOR INFORMATION\n" ;
6408				echo ( "Author               : " . $this -> Author . "\n" ) ;
6409				echo ( "Creator application  : " . $this -> CreatorApplication . "\n" ) ;
6410				echo ( "Producer application : " . $this -> ProducerApplication . "\n" ) ;
6411				echo ( "Title                : " . $this -> Title . "\n" ) ;
6412				echo ( "Subject              : " . $this -> Subject . "\n" ) ;
6413				echo ( "Keywords             : " . $this -> Keywords . "\n" ) ;
6414				echo ( "Creation date        : " . $this -> CreationDate . "\n" ) ;
6415				echo ( "Modification date    : " . $this -> ModificationDate . "\n" ) ;
6416			    }
6417		    }
6418	    }
6419
6420
6421	/*--------------------------------------------------------------------------------------------------------------
6422
6423	    NAME
6424	        RetrieveFormData - Retrieves raw form data
6425
6426	    PROTOTYPE
	        $this -> RetrieveFormData ( $object_id, $object_data, $pdf_objects ) ;
6428
6429	    DESCRIPTION
6430	        Retrieves raw form data (form definition and field values definition).
6431
6432	    PARAMETERS
	        $object_id (integer) -
	                Id of the object that references the form definition and the field values.
6435
6436		$object_data (string) -
6437			Object contents.
6438
6439		$pdf_objects (array) -
6440			Array whose keys are the PDF object ids, and values their corresponding contents.
6441
6442	    NOTES
6443	        This function only memorizes the contents of form data definitions. The actual data will be processed
6444		only if the GetFormData() function is called.
6445
6446	 *-------------------------------------------------------------------------------------------------------------*/
6447	protected function  RetrieveFormData ( $object_id, $object_data, $pdf_objects )
6448	   {
6449		// Retrieve the object that contains the field values
6450		preg_match ( '#\b R \s* \( \s* datasets \s* \) \s* (?P<object> \d+) \s+ \d+ \s+ R#imsx', $object_data, $field_match ) ;
6451		$field_object		=  $field_match [ 'object' ] ;
6452
6453		if  ( ! isset ( $pdf_objects [ $field_object ] ) )
6454		   {
6455			if  ( self::$DEBUG )
6456				warning ( "Field definitions object #$field_object not found in object #$object_id." ) ;
6457
6458			return ;
6459		    }
6460
6461		// Retrieve the object that contains the form definition
6462		preg_match ( '#\b R \s* \( \s* form \s* \) \s* (?P<object> \d+) \s+ \d+ \s+ R#imsx', $object_data, $form_match ) ;
6463		$form_object		=  $form_match [ 'object' ] ;
6464
6465		if  ( ! isset ( $pdf_objects [ $form_object ] ) )
6466		   {
6467			if  ( self::$DEBUG )
6468				warning ( "Form definitions object #$form_object not found in object #$object_id." ) ;
6469
6470			return ;
6471		    }
6472		// Add this entry to form data information
6473		$this -> FormData [ $object_id ]	=  array
6474		   (
6475			'values'	=>  ( integer ) $field_object,
6476			'form'		=>  ( integer ) $form_object
6477		    ) ;
6478	    }
6479
6480
6481    }
6482
6483
6484/**************************************************************************************************************
6485 **************************************************************************************************************
6486 **************************************************************************************************************
6487 ******                                                                                                  ******
6488 ******                                                                                                  ******
6489 ******                                      FONT TABLE MANAGEMENT                                       ******
6490 ******                                                                                                  ******
6491 ******                                                                                                  ******
6492 **************************************************************************************************************
6493 **************************************************************************************************************
6494 **************************************************************************************************************/
6495
6496/*==============================================================================================================
6497
6498    PdfTexterFontTable class -
6499        The PdfTexterFontTable class is not supposed to be used outside the context of the PdfToText class.
6500	Its purposes are to hold a list of font definitions taken from a pdf document, along with their
6501	associated character mapping tables, if any.
	This is why no provision has been made to design this class as a general purpose class ; its utility
6503	exists only in the scope of the PdfToText class.
6504
6505  ==============================================================================================================*/
class 	PdfTexterFontTable 	extends PdfObjectBase
   {
	// Font table ; keys are font ids (either an object id, or the number found in a /Rx resource name)
	public		$Fonts		=  array ( ) ;
	// Key, in the $Fonts array, of the font to be used when font -1 is requested ; false until set by Add()
	private		$DefaultFont	=  false ;
	// Font mapping between a font name (either alone or in the form "page:xobject:font") and an object number
	private 	$FontMap 	=  array ( ) ;
	// A character map buffer is used to store results from previous calls to the MapCharacter() method of the
	// FontTable object. It dramatically reduces the number of calls needed, from one call for each character
	// defined in the pdf stream, to one call on each DISTINCT character defined in the PDF stream.
	// As an example, imagine a PDF file that contains 200K characters, but only 150 distinct ones. The
	// MapCharacter method will be called 150 times, instead of 200 000...
	private		$CharacterMapBuffer		=  array ( ) ;


	// Constructor -
	//	Well, does not do anything special
	public function  __construct ( )
	   {
		parent::__construct ( ) ;
	    }


	// Add -
	//	Adds the current font declaration to the font table. Handles special cases where font id is not
	//	given by the object id, but rather by <</Rx...>> constructs.
	//	$object_id is the id of the object holding $font_definition ; $pdf_objects and $extra_mappings are
	//	simply handed over to the PdfTexterFont constructor.
	public function  Add ( $object_id, $font_definition, $pdf_objects, $extra_mappings )
	   {
		if  ( PdfToText::$DEBUG )
		   {
			echo "\n----------------------------------- FONT #$object_id\n" ;
			echo $font_definition ;
		    }

		$font_type		=  PdfTexterFont::FONT_ENCODING_STANDARD ;
		$cmap_id		=  0 ;
		$secondary_cmap_id	=  0 ;
		$font_variant		=  false ;

		// Font resource id specification
		if  ( preg_match ( '#<< \s* (?P<rscdefs> /R\d+ .*) >>#ix', $font_definition, $match ) )
		   {
			$resource_definitions	=  $match [ 'rscdefs' ] ;

			preg_match_all ( '#/R (?P<font_id> \d+) #ix', $resource_definitions, $id_matches ) ;
			preg_match_all ( '#/ToUnicode \s* (?P<cmap_id> \d+)#ix', $resource_definitions, $cmap_matches ) ;

			$count		=  count ( $id_matches [ 'font_id' ] ) ;

			for  ( $i = 0 ;  $i  <  $count ; $i ++ )
			   {
				$font_id	=  $id_matches [ 'font_id' ] [$i] ;

				// There may be fewer /ToUnicode entries than /Rx entries ; fall back to the same
				// "no character map" value (0) as the one used for the other kinds of fonts
				$cmap_id	=  isset ( $cmap_matches [ 'cmap_id' ] [$i] ) ?  $cmap_matches [ 'cmap_id' ] [$i] : 0 ;

				// The extra mappings must be given as the 6th argument : the 4th one is the secondary
				// character map id. No object list is supplied, since $font_id is not an object id here.
				$this -> Fonts [ $font_id ]	=  new  PdfTexterFont ( $font_id, $cmap_id, PdfTexterFont::FONT_ENCODING_UNICODE_MAP,
											0, null, $extra_mappings ) ;
			    }

			return ;
		    }
		// Experimental implementation of CID fonts
		else if  ( preg_match ( '#/(Base)?Encoding \s* /Identity-H#ix', $font_definition ) )
		   {
			if  ( preg_match ( '#/BaseFont \s* /(?P<font> [^\s/]+)#ix', $font_definition, $match ) )
				$font_variant	=  $match [ 'font' ] ;

			$font_type	=  PdfTexterFont::FONT_ENCODING_CID_IDENTITY_H ;
		    }
		// Font has an associated Unicode map (using the /ToUnicode keyword)
		else if  ( preg_match ( '#/ToUnicode \s* (?P<cmap> \d+)#ix', $font_definition, $match ) )
		   {
			$cmap_id	=  $match [ 'cmap' ] ;
			$font_type	=  PdfTexterFont::FONT_ENCODING_UNICODE_MAP ;

			if  ( preg_match ( '#/Encoding \s* (?P<cmap> \d+)#ix', $font_definition, $secondary_match ) )
				$secondary_cmap_id	=  $secondary_match [ 'cmap' ] ;
		    }
		// Font has an associated character map (using a cmap id)
		else if  ( preg_match ( '#/Encoding \s* (?P<cmap> \d+) \s+ \d+ #ix', $font_definition, $match ) )
		   {
			$cmap_id 	=  $match [ 'cmap' ] ;
			$font_type	=  PdfTexterFont::FONT_ENCODING_PDF_MAP ;
		    }
		// Font uses the Windows Ansi encoding
		else if  ( preg_match ( '#/(Base)?Encoding \s* /WinAnsiEncoding#ix', $font_definition ) )
		   {
			$font_type	=  PdfTexterFont::FONT_ENCODING_WINANSI ;

			// A base font name of the form "xxx+yyyCyr" selects the Cyrillic variant
			if  ( preg_match ( '# /BaseFont \s* / [a-z0-9_]+ \+ [a-z0-9_]+? Cyr #imsx', $font_definition ) )
				$font_type	|=  PdfTexterFont::FONT_VARIANT_ISO8859_5 ;
		    }
		// Font uses the Mac Roman encoding
		else if  ( preg_match ( '#/(Base)?Encoding \s* /MacRomanEncoding#ix', $font_definition ) )
			$font_type	=  PdfTexterFont::FONT_ENCODING_MAC_ROMAN ;

		$this -> Fonts [ $object_id ]	=  new  PdfTexterFont ( $object_id, $cmap_id, $font_type, $secondary_cmap_id, $pdf_objects, $extra_mappings, $font_variant ) ;

		// Arbitrarily set the default font to the first font encountered in the pdf file
		if  ( $this -> DefaultFont  ===  false )
		   {
			reset ( $this -> Fonts ) ;
			$this -> DefaultFont	=  key ( $this -> Fonts ) ;
		    }
	    }


	// AddFontMap -
	//	Process things like :
	//		<</F1 26 0 R/F2 22 0 R/F3 18 0 R>>
	//	which maps font 1 (when specified with the /Fx instruction) to object 26,
	//	2 to object 22 and 3 to object 18, respectively, in the above example.
	//	Found also a strange way of specifying a font mapping :
	//		<</f-0-0 5 0 R etc.
	//	And yet another one :
	//		<</C0_0 5 0 R
	//	The $object_id parameter is not used.
	public function  AddFontMap ( $object_id, $object_data )
	   {
		$object_data	=  self::UnescapeHexCharacters ( $object_data ) ;

		// The same object can hold different notations for font associations
		if  ( preg_match_all ( '# (?P<font> ' . self::$FontSpecifiers . ' ) \s+ (?P<object> \d+) #imsx', $object_data, $matches ) )
		   {
			foreach  ( $matches [ 'font' ]  as  $index => $font )
				$this -> FontMap [ $font ]	=  $matches [ 'object' ] [ $index ] ;
		    }
	    }


	// AddPageFontMap -
	//	Adds font aliases to the current font map, in the form : "page:xobject:font".
	//	The associated value is the 'object' entry of each item of the supplied map.
	public function  AddPageFontMap ( $map )
	   {
		foreach  ( $map  as  $map_entry )
		   {
			$alias	=  $map_entry [ 'page' ] . ':' . $map_entry [ 'xobject-name' ] . ':' . $map_entry [ 'font-name' ] ;

			$this -> FontMap [ $alias ]	=  $map_entry [ 'object' ] ;
		    }
	    }


	// AddCharacterMap -
	//	Associates a character map to the font declarations that referenced it, either as their main
	//	character map or as their secondary one.
	//	Returns true if at least one font referenced the character map, false otherwise.
	public function  AddCharacterMap ( $cmap )
	   {
		$status		=  false ;

		// We loop through all fonts, since the same character map can be referenced by several font definitions
		foreach  ( $this -> Fonts  as  $font )
		   {
			if  ( $font -> CharacterMapId  ==  $cmap -> ObjectId )
			   {
				$font -> CharacterMap	=  $cmap ;
				$status			=  true ;
			    }
			else if  ( $font -> SecondaryCharacterMapId  ==  $cmap -> ObjectId )
			   {
				$cmap -> Secondary		=  true ;
				$font -> SecondaryCharacterMap	=  $cmap ;
				$status				=  true ;
			    }
		    }

		return ( $status ) ;
	    }


	// GetFontAttributes -
	//	Gets the specified font width in hex digits and whether the font has a character map or not.
	//	$font_map_width and $font_mapped are output parameters ; the return value is the same as the one
	//	stored into $font_mapped. The $page_number and $template parameters are not used.
	public function  GetFontAttributes ( $page_number, $template, $font, &$font_map_width, &$font_mapped )
	   {
		// Font considered as global to the document
		if  ( isset ( $this -> Fonts [ $font ] ) )
			$key	=  $font ;
		// Font not found : try to use the first one declared in the document
		else
		   {
			reset ( $this -> Fonts ) ;
			$key	=  key ( $this -> Fonts ) ;
		    }

		// Font has an associated character map
		if  ( $key  &&  $this -> Fonts [ $key ] -> CharacterMap )
		   {
			$font_map_width		=  $this -> Fonts [ $key ] -> CharacterMap -> HexCharWidth ;
			$font_mapped		=  true ;

			return ( true ) ;
		    }

		// No character map : characters are specified as two hex digits
		$font_map_width		=  2 ;
		$font_mapped		=  false ;

		return ( false ) ;
	    }


	// GetFontByMapId -
	//	Returns the font id (object id) associated with the specified mapped id, or -1 if there is none.
	//	The page-specific alias ("page:template:id") takes precedence over the document-wide one.
	public function  GetFontByMapId ( $page_number, $template, $id )
	   {
		$page_key	=  "$page_number:$template:$id" ;

		if  ( isset ( $this -> FontMap [ $page_key ] ) )
			return ( $this -> FontMap [ $page_key ] ) ;

		if  ( isset ( $this -> FontMap [ $id ] ) )
			return ( $this -> FontMap [ $id ] ) ;

		return ( -1 ) ;
	    }


	// GetFontObject -
	//	Returns the PdfTexterFont object for the given page, template and font id (in the form of "/something"),
	//	or false if either the font id or the object it designates is unknown.
	public function  GetFontObject ( $page_number, $template, $id )
	   {
		$page_key	=  "$page_number:$template:$id" ;

		if  ( isset ( $this -> FontMap [ $page_key ] ) )
			$font_object	=  $this -> FontMap [ $page_key ] ;
		else if  ( isset ( $this -> FontMap [ $id ] ) )
			$font_object	=  $this -> FontMap [ $id ] ;
		else
			return ( false ) ;

		if  ( isset ( $this -> Fonts [ $font_object ] ) )
			return ( $this -> Fonts [ $font_object ] ) ;

		return ( false ) ;
	    }


	// MapCharacter -
	//	Returns the character associated to the specified one, for the specified font (-1 meaning "the
	//	default font"). When the font is unknown, the character is simply converted to UTF8.
	//	When $return_false_on_failure is true, false can be returned instead of the UTF8 conversion (see
	//	PdfTexterFont::MapCharacter).
	public function  MapCharacter ( $font, $ch, $return_false_on_failure = false )
	   {
		// Use the first declared font as the default font, if none defined. This has to be done BEFORE
		// searching the buffer, because buffer entries are stored under the real font id.
		if  ( $font  ==  -1 )
			$font	=  $this -> DefaultFont ;

		if  ( isset ( $this -> CharacterMapBuffer [ $font ] [ $ch ] ) )
			return ( $this -> CharacterMapBuffer [ $font ] [ $ch ] ) ;

		$cache	=  true ;

		if  ( isset  ( $this -> Fonts [ $font ] ) )
		   {
			$font_object	=  $this -> Fonts [ $font ] ;

			$code	=  $font_object -> MapCharacter ( $ch, $return_false_on_failure ) ;

			// The character map itself tells whether its results can be buffered
			if  ( $font_object -> CharacterMap )
				$cache	=  $font_object -> CharacterMap -> Cache ;
		    }
		else
		   {
			$code	=  $this -> CodePointToUtf8 ( $ch ) ;
		    }

		// Never buffer a failure : it would be returned later to callers that did not ask for it (ie, that
		// did not set the $return_false_on_failure parameter and expect a string)
		if  ( $cache  &&  $code  !==  false )
			$this -> CharacterMapBuffer [ $font ] [ $ch ]	=  $code ;

		return ( $code ) ;
	    }
    }
6775
6776
6777/**************************************************************************************************************
6778 **************************************************************************************************************
6779 **************************************************************************************************************
6780 ******                                                                                                  ******
6781 ******                                                                                                  ******
6782 ******                                         FONT MANAGEMENT                                          ******
6783 ******                                                                                                  ******
6784 ******                                                                                                  ******
6785 **************************************************************************************************************
6786 **************************************************************************************************************
6787 **************************************************************************************************************/
6788
6789/*==============================================================================================================
6790
6791    PdfTexterFont class -
6792        The PdfTexterFont class is not supposed to be used outside the context of the PdfToText class.
	It holds an optional character mapping table associated with this font.
	No provision has been made to design this class as a general purpose class ; its utility exists only in
6795	the scope of the PdfToText class.
6796
6797  ==============================================================================================================*/
class  PdfTexterFont		extends PdfObjectBase
   {
	// Font encoding types, for fonts that are neither associated with a Unicode character map nor a PDF character map
	const	FONT_ENCODING_STANDARD			=  0 ;			// No character map, use the standard character set
	const	FONT_ENCODING_WINANSI			=  1 ;			// No character map, use the Windows Ansi character set
	const	FONT_ENCODING_MAC_ROMAN			=  2 ;			// No character map, use the MAC OS Roman character set
	const	FONT_ENCODING_UNICODE_MAP		=  3 ;			// Font has an associated unicode character map
	const	FONT_ENCODING_PDF_MAP			=  4 ;			// Font has an associated PDF character map
	const	FONT_ENCODING_CID_IDENTITY_H		=  5 ;			// CID font : IDENTITY-H

	// Font variants ; they are or'ed with the font encoding type
	const   FONT_VARIANT_STANDARD		=  0x0000 ;
	const	FONT_VARIANT_ISO8859_5		=  0x1000 ;		// Cyrillic

	const	FONT_VARIANT_MASK		=  0xF000 ;
	const	FONT_VARIANT_SHIFT		=  12 ;

	// Font resource id (may be an object id, overridden by <</Rx...>> constructs
	public		$Id ;
	// Font type and variant
	public		$FontType ;
	public		$FontVariant ;
	// Character map id, specified by the /ToUnicode flag
	public		$CharacterMapId ;
	// Secondary character map id, specified by the /Encoding flag and that can contain a /Differences flag
	public		$SecondaryCharacterMapId ;
	// Optional character map, that may be set by the PdfToText::Load method just before processing text drawing blocks
	public		$CharacterMap		=  null ;
	public		$SecondaryCharacterMap	=  null ;
	// Character widths ; keys are single characters
	public		$CharacterWidths	=  array ( ) ;
	// Default character width, if not present in the $CharacterWidths array
	public		$DefaultWidth		=  0 ;
	// True once character widths have been found, either in a font metrics file or in the font definition
	private		$GotWidthInformation	=  false ;
	// A buffer for remembering character widths
	protected	$CharacterWidthsBuffer	=  array ( ) ;


	// Constructor -
	//	Builds a PdfTexterFont object, using its resource id and optional character map id.
	//	$font_type is one of the FONT_ENCODING_* constants, optionally or'ed with a FONT_VARIANT_* constant.
	//	$pdf_objects, when supplied, is searched for the font definition (entry $resource_id) in order to
	//	collect character widths. $extra_mappings is only handed over to the PDF encoding map, and
	//	$font_variant to the Identity-H CID map.
	public function  __construct ( $resource_id, $cmap_id, $font_type, $secondary_cmap_id = null, $pdf_objects = null, $extra_mappings = null, $font_variant = false )
	   {

		parent::__construct ( ) ;

		$this -> Id				=  $resource_id ;
		$this -> CharacterMapId			=  $cmap_id ;
		$this -> SecondaryCharacterMapId	=  $secondary_cmap_id ;
		$this -> FontType			=  $font_type  &  ~self::FONT_VARIANT_MASK ;
		$this -> FontVariant			=  ( $font_type  >>  self::FONT_VARIANT_SHIFT )  &  0x0F ;

		// Instantiate the appropriate character map for this font
		switch  ( $this -> FontType )
		   {
			case	self::FONT_ENCODING_WINANSI :
				$this -> CharacterMap	=  new  PdfTexterAdobeWinAnsiMap ( $resource_id, $this -> FontVariant ) ;
				break ;

			case	self::FONT_ENCODING_MAC_ROMAN :
				$this -> CharacterMap	=  new  PdfTexterAdobeMacRomanMap ( $resource_id, $this -> FontVariant ) ;
				break ;

			case	self::FONT_ENCODING_CID_IDENTITY_H :
				$this -> CharacterMap	=  new PdfTexterIdentityHCIDMap (  $resource_id, $font_variant ) ;
				break ;

			case	self::FONT_ENCODING_PDF_MAP :
				$this -> CharacterMap	=  new  PdfTexterEncodingMap ( $cmap_id, $pdf_objects [ $cmap_id ], $extra_mappings ) ;
				break ;

			// For these two types, the character map (if any) is set later from the outside
			case	self::FONT_ENCODING_UNICODE_MAP :
				break ;

			case	self::FONT_ENCODING_STANDARD :
				break ;

			default :
				if  ( PdfToText::$DEBUG )
					warning ( "Unknown font type #$font_type found for object #$resource_id, character map #$cmap_id." ) ;
		    }

		// Get font data ; include font descriptor information if present.
		// The object list is optional and the resource id is not always an object id (<</Rx...>> constructs),
		// so the font definition may simply not be available : in this case, no width information is collected.
		$font_data	=  isset ( $pdf_objects [ $resource_id ] ) ?  $pdf_objects [ $resource_id ] : '' ;

		if  ( preg_match ( '/FontDescriptor \s+ (?P<id> \d+) \s+ \d+ \s+ R/imsx', $font_data, $match ) )
		   {
			$descriptor_id		=  $match [ 'id' ] ;

			// Don't care about searching this in that object, or that in this object - simply catenate the font descriptor
			// with the font definition
			if  ( isset ( $pdf_objects [ $descriptor_id ] ) )
				$font_data	.=  $pdf_objects [ $descriptor_id ] ;
		    }

		// Type1 fonts belong to the Adobe 14 standard fonts available. Information about the character widths is never embedded in the PDF
		// file, but must be taken from external data (in the FontMetrics directory).
		// Nothing can be done if the font definition has no usable /BaseFont entry.
		if  ( preg_match ( '#/SubType \s* /Type1#ix', $font_data )  &&
		      preg_match ( '#/BaseFont \s* / ([\w]+ \+)? (?P<font> [^\s\[</]+)#ix', $font_data, $match ) )
		   {
			$font_name	=  $match [ 'font' ] ;
			$lc_font_name	=  strtolower ( $font_name ) ;

			// Do that only if a font metrics file exists...
			if  ( isset ( PdfToText::$AdobeStandardFontMetrics [ $lc_font_name ] ) )
			   {
				$metrics_file	=  PdfToText::$FontMetricsDirectory . '/' . PdfToText::$AdobeStandardFontMetrics [ $lc_font_name ] ;

				if  ( file_exists ( $metrics_file ) )
				   {
					// The metrics file is expected to define a $charwidths array in the current scope
					include ( $metrics_file ) ;

					if  ( isset ( $charwidths ) )
					   {
						// Build the CharacterWidths table
						foreach  ( $charwidths  as  $char => $width )
							$this -> CharacterWidths [ chr ( $char ) ]	=  ( double ) $width ;

						$this -> GotWidthInformation	=  true ;
					    }
				    }
			    }
		    }

		// Retrieve the character widths for this font. This means :
		// - Retrieving the /FirstChar, /LastChar and /Widths entries from the font definition. /Widths is an array of individual character
		//   widths, between the /FirstChar and /LastChar entries. A value of zero in this array means "Use the default width"...
		// - ... which is given by the /MissingWidth parameter, normally put in the font descriptor whose object id is given by the
		//   /FontDescriptor entry of the font definition
		// Well, to be considered, given the number of buggy PDFs around the world, we won't care about the /LastChar entry and we won't
		// check whether the /Widths array contains (LastChar - FirstChar + 1) integer values...
		// Get the entries
		$first_char	=  false ;
		$widths		=  false ;
		$missing_width  =  false ;

		if  ( preg_match ( '#/FirstChar \s+ (?P<char> \d+)#imsx', $font_data, $match ) )
			$first_char	=  $match [ 'char' ] ;

		if  ( preg_match ( '#/Widths \s* \[ (?P<widths> [^\]]+) \]#imsx', $font_data, $match ) )
			$widths		=  $match [ 'widths' ] ;

		if  ( preg_match ( '#/MissingWidth \s+ (?P<missing> \d+)#imsx', $font_data, $match ) )
			$missing_width	=  $match [ 'missing' ] ;

		// It would not make sense if one of the two entries /FirstChar and /Widths was missing
		// So ensure they are all there (note that /MissingWidths can be absent)
		if  ( $first_char !==  false  &&  $widths )
		   {
			if  ( $missing_width  !==  false )
				$this -> DefaultWidth		=  ( double ) $missing_width ;

			// Here comes a really tricky part :
			// - The PDF file can contain CharProcs (example names : /a0, /a1, etc.) for which we have no
			//   Unicode equivalent
			// - The caller may have called the AddAdobeExtraMappings method, to providing a mapping between
			//   those char codes (/a0, /a1, etc.) and a Unicode equivalent
			// - Each "charproc" listed in the /Differences array as a specific code, such as :
			//	[0/a1/a2/a3...]
			//   which maps /a1 to code 0, /a2 to code 1, and so on
			// - However, the GetStringWidth() method provides real Unicode characters
			// Consequently, we have to map each CharProc character (/a1, /a2, etc.) to the Unicode value
			// that may have been specified using the AddAdobeExtraMappings() method.
			// The first step below collects the name list of CharProcs.
			$charprocs	=  false ;

			if  ( isset ( $this -> CharacterMap -> Encodings )  &&
				preg_match ( '# /CharProcs \s* << (?P<list> .*?) >>#imsx', $font_data, $match ) )
			   {
				preg_match_all ( '#/ (?P<char> \w+) \s+ \d+ \s+ \d+ \s+ R#msx', $match [ 'list' ], $char_matches ) ;

				$charprocs	=  array_flip ( $char_matches [ 'char' ] ) ;
			    }

			// The /FontMatrix entry defines the scaling to be used for the character widths (among other things).
			// Its first element is a real number, very often a fractional one such as 0.001 : the whole number
			// must be captured, not only its integer part (which would give a multiplier of zero, hence
			// character widths that would all be equal to zero).
			if  ( preg_match ( '#/FontMatrix \s* \[ \s* (?P<multiplier> (?: \d+ (?: \. \d* )? | \. \d+ ) )#imsx', $font_data, $match ) )
				$multiplier	=  1000 * ( double ) $match [ 'multiplier' ] ;
			else
				$multiplier	=  1 ;

			$widths				=  trim ( preg_replace ( '/\s+/', ' ', $widths ) ) ;
			$widths				=  explode ( ' ', $widths ) ;

			for  ( $i = 0, $count = count ( $widths ) ; $i  <  $count ; $i ++ )
			   {
				$value		=  ( double ) trim ( $widths [$i] ) ;
				$chr_index	=  $first_char + $i ;

				// Tricky thing part 2 :
				if  ( $charprocs )
				   {
					// If one of the CharProc characters is listed in the /Differences array then...
					if  ( isset ( $this -> CharacterMap -> DifferencesByPosition [ $chr_index ] ) )
					   {
						$chname		=  $this -> CharacterMap -> DifferencesByPosition [ $chr_index ] ;

						// ... if this CharProcs character is defined in the encoding table (possibly because
						// it was complemented through a call to the AddAdobeExtraMappings() method), then we
						// will use its Unicode counterpart instead of the character ID coming from the
						// /Differences array)
						if  ( isset ( $charprocs [ $chname ] )  &&  isset ( $this -> CharacterMap -> Encodings [ $chname ] ) )
							$chr_index	=  $this -> CharacterMap -> Encodings [ $chname ] [2] ;
					    }
				    }

				// A zero width means "use the default width"
				$this -> CharacterWidths [ chr ( $chr_index ) ]		=  ( $value ) ?  ( $value * $multiplier ) : $this -> DefaultWidth ;
			    }

			$this -> GotWidthInformation	=  true ;
		    }
	    }


	// MapCharacter -
	//	Returns the substitution string value for the specified character, if the current font has an
	//	associated character map, or the original character encoded in utf8, if not.
	//	When no substitution is found and $return_false_on_failure is true, false is returned instead
	//	of the utf8-encoded character.
	public function  MapCharacter ( $ch, $return_false_on_failure = false )
	   {
		if  ( $this -> CharacterMap )
		   {
			// Character is defined in the character map ; check if it has been overridden by a /Differences array in
			// a secondary character map
			if  ( isset ( $this -> CharacterMap [ $ch ] ) )
			   {
				// Since a /ToUnicode map can have an associated /Encoding map with a /Differences list, this is the right place
				// to perform the translation (ie, the final Unicode codepoint is impacted by the /Differences list)
				if  ( ! $this -> SecondaryCharacterMap )		// Most common case first !
				   {
					$code	=  $this -> CharacterMap [ $ch ] ;
				    }
				else
				   {
					if  ( isset  ( $this -> SecondaryCharacterMap [ $ch ] ) )
						$code	=  $this -> SecondaryCharacterMap [ $ch ] ;
					else
						$code	=  $this -> CharacterMap [ $ch ] ;
				    }

				return ( $code ) ;
			    }
			// On the contrary, the character may not be defined in the main character map but may exist in the secondary cmap
			else if  ( $this -> SecondaryCharacterMap  &&  isset ( $this -> SecondaryCharacterMap [ $ch ] ) )
			   {
				$code	=  $this -> SecondaryCharacterMap [ $ch ] ;

				return ( $code ) ;
			    }
		    }

		if  ( $return_false_on_failure )
			return ( false ) ;

		return ( $this -> CodePointToUtf8 ( $ch ) ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        GetStringWidth - Returns the width of a string

	    PROTOTYPE
	        $width		=  $font -> GetStringWidth ( $text, $extra_percent ) ;

	    DESCRIPTION
	        Sums the widths of the individual characters of a string (the string is processed byte by byte),
		then divides the result by ( 100 - $extra_percent ).

	    PARAMETERS
	        $text (string) -
	                String whose length is to be measured.

		$extra_percent (double) -
			Extra percentage to be added to the computed width. Values above 50 are treated as 50.

	    RETURN VALUE
	        Returns the adjusted width of the specified string, or false if the font does not contain any
		character width information.

	    NOTES
	        The unit of the returned value depends on the unit of the collected character widths (presumably
		1/1000 of text units - to be confirmed against the callers).

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  GetStringWidth ( $text, $extra_percent )
	   {
		// No width information
		if  ( ! $this -> GotWidthInformation )
			return ( false ) ;

		$width		=  0 ;

		// Compute the width of each individual character - use a character width buffer to avoid
		// repeating the same tests again and again for characters whose width has already been processed
		for  ( $i = 0, $length = strlen ( $text ) ; $i  <  $length ; $i ++ )
		   {
			$ch		=  $text [$i] ;

			// New character - The width comes either from the CharacterWidths array if an entry is defined
			// for this character, or from the default width property.
			if  ( ! isset ( $this -> CharacterWidthsBuffer [ $ch ] ) )
			   {
				$this -> CharacterWidthsBuffer [ $ch ]	=  isset ( $this -> CharacterWidths [ $ch ] ) ?
										$this -> CharacterWidths [ $ch ] : $this -> DefaultWidth ;
			    }

			$width	+=  $this -> CharacterWidthsBuffer [ $ch ] ;
		    }

		// The computed width is actually longer/smaller than its actual width. Adjust by the percentage specified
		// by the ExtraTextWidth property
		$divisor	=  100 - $extra_percent ;

		if  ( $divisor  <  50 )			// Arbitrarily fix a limit
			$divisor	=  50 ;

		// All done, return
		return ( $width / $divisor ) ;
	    }
    }
7123
7124
7125/*==============================================================================================================
7126
7127    PdfTexterCharacterMap -
        The PdfTexterCharacterMap class is not supposed to be used outside the context of the PdfToText class.
	Describes a character map.
	No provision has been made to design this class as a general purpose class ; its utility exists only in
7131	the scope of the PdfToText class.
7132
7133  ==============================================================================================================*/
abstract class	PdfTexterCharacterMap	extends		PdfObjectBase
					implements	ArrayAccess, Countable
   {
	// Object id of the character map
	public		$ObjectId ;
	// Number of hex digits in a character represented in hexadecimal notation
	public 		$HexCharWidth ;
	// Set to true if the values returned by the array access operator can safely be cached
	public		$Cache		=  false ;



	// Constructor -
	//	Simply remembers the id of the object that holds the character map.
	public function  __construct ( $object_id )
	   {
		parent::__construct ( ) ;
		$this -> ObjectId	=  $object_id ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    CreateInstance -
	        Creates a PdfTexterCharacterMap instance of the correct type :
		- a PdfTexterUnicodeMap object, if the definitions contain one of the "begincmap", "beginbfchar" or
		  "beginbfrange" keywords
		- a PdfTexterEncodingMap object, if they contain the "/Differences" keyword
		Returns false if the definitions do not describe any of those character maps.

	 *-------------------------------------------------------------------------------------------------------------*/
	public static function  CreateInstance ( $object_id, $definitions, $extra_mappings )
	   {
		if  ( preg_match ( '# (begincmap) | (beginbfchar) | (beginbfrange) #ix', $definitions ) )
			return ( new PdfTexterUnicodeMap ( $object_id, $definitions ) ) ;

		if  ( stripos ( $definitions, '/Differences' )  !==  false )
			return ( new PdfTexterEncodingMap ( $object_id, $definitions, $extra_mappings ) ) ;

		return ( false ) ;
	    }



	/*--------------------------------------------------------------------------------------------------------------

	        Interface implementations.
		Character maps are read-only : setting or unsetting an entry is reported as an error.
		The ReturnTypeWillChange attribute prevents PHP 8.1+ from issuing a deprecation notice about the
		missing return type ; it is seen as an ordinary comment by the PHP versions preceding 8.0.

	 *-------------------------------------------------------------------------------------------------------------*/
	#[\ReturnTypeWillChange]
	public function  offsetSet ( $offset, $value )
	   { error ( new PdfToTextDecodingException ( "Unsupported operation." ) ) ; }

	#[\ReturnTypeWillChange]
	public function  offsetUnset ( $offset )
	   { error ( new PdfToTextDecodingException ( "Unsupported operation." ) ) ; }
    }
7182
7183
7184
7185/*==============================================================================================================
7186
7187    PdfTexterUnicodeMap -
7188        A class for fonts having a character map specified with the /ToUnicode parameter.
7189
7190  ==============================================================================================================*/
class  PdfTexterUnicodeMap 	extends 	PdfTexterCharacterMap
    {
	// Id of the character map (specified by the /Rx flag)
	public		$Id	;
	// Character substitution table, using the beginbfrange/endbfrange notation
	// Only constructs of the form :
	//	<low> <high> <start>
	// are stored in this table. Constructs of the form :
	//	<x> <y> [ <subst_x> <subst_x+1> ... <subst_y> ]
	// are stored in the $DirectMap array, because it is conceptually the same thing in the end as a character substitution being
	// defined with the beginbfchar/endbfchar construct.
	// Note that a dichotomic search in $RangeMap will be performed for each character reference not yet seen in the pdf flow.
	// Once the substitution character has been found, it will be added to the $DirectMap array for later faster access.
	// The reason for this optimization is that some pdf files can contain beginbfrange/endbfrange constructs that may seem useless,
	// except for validation purposes (ie, validating the fact that a character reference really belongs to the character map).
	// However, such constructs can lead to thousands of character substitutions ; consider the following example, that comes
	// from a sample I received :
	//	beginbfrange
	//	<1000> <1FFF> <1000>
	//	<2000> <2FFF> <2000>
	//	...
	//	<A000> <AFFF> <A000>
	//	...
	//	endbfrange
	// By naively storing a one-to-one character relationship in an associative array, such as :
	//	$array [ 0x1000 ] = 0x1000 ;
	//	$array [ 0x1001 ] = 0x1001 ;
	//	..
	//	$array [ 0x1FFF ] = 0x1FFF ;
	//	etc.
	// you may arrive to a situation where the array becomes so big that it exhausts all of the available memory.
	// This is why the ranges are stored as is and a dichotomic search is performed to go faster.
	// Since it is useless to use this method to search the same character twice, when it has been found once, the
	// substitution pair will be put in the $DirectMap array for subsequent accesses (there is little probability that a PDF
	// file contains so many different characters, unless you are processing the whole Unicode table itself ! - but in this
	// case, you will simply have to adjust the value of the memory_limit setting in your php.ini file. Consider that I am
	// not a magician...).
	// Each entry is an array ( from, to, first substitution code ).
	protected	$RangeMap		=  array ( ) ;
	private		$RangeCount		=  0 ;				// Avoid unnecessary calls to the count() function
	private		$RangeMin		=  PHP_INT_MAX,			// Min and max values of the character ranges
			$RangeMax		=  -1 ;
	// Character substitution table for tables using the beginbfchar notation
	protected	$DirectMap		=  array ( ) ;


	// Constructor -
	//	Analyzes the text contents of a CMAP and extracts mappings from the beginbfchar/endbfchar and
	//	beginbfrange/endbfrange constructs.
	//	$object_id is the id of the object holding the CMAP, $definitions its textual contents.
	public function  __construct ( $object_id, $definitions )
	   {
		parent::__construct ( $object_id ) ;

		if  ( PdfToText::$DEBUG )
		   {
	   		echo "\n----------------------------------- UNICODE CMAP #$object_id\n" ;
			echo $definitions;
		    }

		// Retrieve the cmap id, if any (-1 when the /CMapName entry is missing)
		preg_match ( '# /CMapName \s* /R (?P<num> \d+) #ix', $definitions, $match ) ;
		$this -> Id 		=  isset ( $match [ 'num' ] ) ?  $match [ 'num' ] : -1 ;

		// Get the codespace range, which will give us the width of a character specified in hexadecimal notation
		preg_match ( '# begincodespacerange \s+ <\s* (?P<low> [0-9a-f]+) \s*> \s* <\s* (?P<high> [0-9a-f]+) \s*> \s*endcodespacerange #ix', $definitions, $match ) ;

		if  ( isset ( $match [ 'low' ] ) )
			$this -> HexCharWidth 	=  max ( strlen ( $match [ 'low' ] ), strlen ( $match [ 'high' ] ) ) ;
		else
			$this -> HexCharWidth	=  0 ;

		// Widest source character code (in hex digits) actually met in the bfchar/bfrange definitions
		$max_found_char_width	=  0 ;

		// Process beginbfchar/endbfchar constructs
		if  ( preg_match_all ( '/ beginbfchar \s* (?P<chars> .*?) endbfchar /imsx', $definitions, $char_matches ) )
		    {
		    	foreach  ( $char_matches [ 'chars' ]  as  $char_list )
		    	   {
				// beginbfchar / endbfchar constructs can behave as a kind of beginbfrange/endbfrange ; example :
				//	<21> <0009 0020 000d>
				// means :
				//	. Map character #21 to #0009
				//	. Map character #22 to #0020
				//	. Map character #23 to #000D
				// There is no clue in the Adobe PDF specification that a single character could be mapped to a range.
				// The normal constructs would be :
				//	<21> <0009>
				//	<22> <0020>
				//	<23> <000D>
				preg_match_all ( '/< \s* (?P<item> .*?) \s* >/msx', $char_list, $item_matches ) ;

				// Items go by pairs (source code, destination) ; the "$i + 1" test ignores a trailing source code
				// that has no destination instead of reading past the end of the matches
				for  ( $i = 0, $item_count = count ( $item_matches [ 'item' ] ) ; $i + 1  <  $item_count ; $i += 2 )
				   {
					$char		=  hexdec ( $item_matches [ 'item' ] [$i] ) ;
					$char_width	=  strlen ( $item_matches [ 'item' ] [$i] ) ;
					$map		=  explode ( ' ', preg_replace ( '/\s+/', ' ', $item_matches [ 'item' ] [ $i + 1 ] ) ) ;

					if  ( $char_width  >  $max_found_char_width )
						$max_found_char_width	=  $char_width ;

					for  ( $j = 0, $map_count = count ( $map ) ; $j  <  $map_count ; $j ++ )
					   {
						$subst				=  hexdec ( $map [$j] ) ;

						// Check for this very special, not really documented feature which maps CIDs to a non-existing Unicode character
						// (but it still corresponds to something...)
						if  ( isset ( PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ) )
							$subst	=  PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ;

						$this -> DirectMap [ $char + $j ]	=  $subst ;
					    }
				    }

		    	    }
		     }

		// Process beginbfrange/endbfrange constructs
		if  ( preg_match_all ( '/ beginbfrange \s* (?P<ranges> .*?) endbfrange /imsx', $definitions, $range_matches ) )
		   {
			foreach  ( $range_matches [ 'ranges' ]  as  $range_list )
			   {
				$start_index	=  0 ;

				// There are two forms of syntax in a beginbfrange..endbfrange construct
				// 1) "<x> <y> <z>", which maps character ids x through y to z through (z+y-x)
				// 2) "<x> <y> [<a1> <a2> ... <an>]", which maps character x to a1, x+1 to a2, up to y, which is mapped to an
				// All the values are hex digits.
				// We will loop through the range definitions by first identifying the <x> and <y>, and the character that follows
				// them, which is either a "<" for notation 1), or a "[" for notation 2).
				while  ( preg_match ( '#  < \s* (?P<from> [0-9a-f]+) \s* > \s* < \s* (?P<to> [0-9a-f]+) \s* > \s* (?P<nextchar> .) #imsx',
						$range_list, $range_match, PREG_OFFSET_CAPTURE, $start_index ) )
				   {
					$from			=  hexdec ( $range_match [ 'from' ] [0] ) ;
					$to			=  hexdec ( $range_match [ 'to'   ] [0] ) ;
					$next_char		=  $range_match [ 'nextchar' ] [0] ;
					$next_char_index	=  $range_match [ 'nextchar' ] [1] ;
					$char_width		=  strlen ( $range_match [ 'from' ] [0] ) ;

					if  ( $char_width  >  $max_found_char_width )
						$max_found_char_width	=  $char_width ;

					// Form 1) : catch the third hex value after <x> and <y>
					if  ( $next_char  ==  '<' )
					   {
						if  ( preg_match ( '/ \s* (?P<start> [0-9a-f]+) (?P<tail> \s* > \s*) /imsx', $range_list, $start_match, PREG_OFFSET_CAPTURE, $next_char_index + 1 ) )
						   {
							$subst		=  hexdec ( $start_match [ 'start' ] [0] ) ;

							// Check for this very special, not really documented feature which maps CIDs to a non-existing Unicode character
							// (but it still corresponds to something...)
							if  ( isset ( PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ) )
								$subst	=  PdfTexterAdobeUndocumentedUnicodeMap::$UnicodeMap [ $subst ] ;

							// Don't create a range if <x> and <y> are the same
							if  ( $from  !=  $to )
							   {
								$this -> RangeMap []	=  array ( $from, $to, $subst ) ;

								// Adjust min and max values for the ranges stored in this character map - to avoid unnecessary testing
								if  ( $from  <  $this -> RangeMin )
									$this -> RangeMin	=  $from ;

								if  ( $to  >  $this -> RangeMax )
									$this -> RangeMax	=  $to ;
							    }
							else
								$this -> DirectMap [ $from ]	=  $subst ;

							$start_index	=  $start_match [ 'tail' ] [1] + 1 ;
						    }
						else
						   {
							error ( "Character range $from..$to not followed by an hexadecimal value in Unicode map #$object_id." ) ;

							// Only reached when error() returns : move forward so that the same definition is not
							// matched again and again
							$start_index	=  $next_char_index + 1 ;
						    }
					    }
					// Form 2) : catch all the hex values between square brackets after <x> and <y>
					else if  ( $next_char  ==  '[' )
					   {
						if  ( preg_match ( '/ (?P<values> [\s<>0-9a-f]+ ) (?P<tail> \] \s*)/imsx', $range_list, $array_match, PREG_OFFSET_CAPTURE, $next_char_index + 1 ) )
						   {
							preg_match_all ( '/ < \s* (?P<num> [0-9a-f]+) \s* > /imsx', $array_match [ 'values' ] [0], $array_values ) ;
							$value_count	=  count ( $array_values [ 'num' ] ) ;

							// Stop at the end of the range or of the supplied values, whichever comes first : an array
							// shorter than the range must not be read past its last element
							for  ( $i = $from, $count = 0 ; $i  <=  $to  &&  $count  <  $value_count ; $i ++, $count ++ )
								$this -> DirectMap [$i] 	=  hexdec ( $array_values [ 'num' ] [ $count ] ) ;

							$start_index	=  $array_match [ 'tail' ] [1] + 1 ;
						    }
						else
						   {
							error ( "Character range $from..$to not followed by an array of hexadecimal values in Unicode map #$object_id." ) ;

							// Only reached when error() returns : move forward so that the same definition is not
							// matched again and again
							$start_index	=  $next_char_index + 1 ;
						    }
					    }
					else
					   {
						error ( "Unexpected character '$next_char' in Unicode map #$object_id." ) ;
						$start_index	=  $range_match [ 'nextchar' ] [1] + 1 ;
					    }
				    }
			    }

			// Sort the ranges by their starting offsets (required by the dichotomic search of offsetGetSafe)
			$this -> RangeCount	=  count ( $this -> RangeMap ) ;

			if  ( $this -> RangeCount  >  1 )
			   {
				usort ( $this -> RangeMap, array ( $this, '__rangemap_cmpfunc' ) ) ;
			    }
		    }

		// The codes really found in the definitions win over the declared codespace range
		if ( $max_found_char_width  &&  $max_found_char_width  !=  $this -> HexCharWidth )
		   {
			if  ( PdfToText::$DEBUG )
				warning ( "Character map #$object_id : specified code width ({$this -> HexCharWidth}) differs from actual width ($max_found_char_width)." ) ;

			$this -> HexCharWidth	=  $max_found_char_width ;
		    }
	     }


	// __rangemap_cmpfunc -
	//	usort() callback ordering range entries by their first character code.
	//	Explicit comparisons are used rather than a subtraction, because hexdec() may return a float for large
	//	values while usort() expects an integer result.
	public function  __rangemap_cmpfunc ( $a, $b )
	   {
		if  ( $a [0]  ==  $b [0] )
			return ( 0 ) ;

		return ( ( $a [0]  <  $b [0] ) ?  -1 : 1 ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	        Interface implementations.

		The ReturnTypeWillChange attribute silences the return type deprecation notice issued by PHP 8.1 and
		later for ArrayAccess/Countable methods declared without a return type ; PHP versions below 8.0 see
		this line as a plain "#" comment.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Number of entries currently held by the direct map (this includes range lookups that have been cached)
	#[\ReturnTypeWillChange]
	public function  count ( )
	   { return ( count ( $this -> DirectMap ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetExists ( $offset )
	   { return  ( $this -> offsetGetSafe ( $offset )  !==  false ) ; }


	// offsetGetSafe -
	//	Returns the substitution for character code $offset, or false if this map defines none.
	//	When $translate is true, the substitution is returned through CodePointToUtf8() ; otherwise the
	//	numeric value stored in the map is returned as is.
	public function  offsetGetSafe ( $offset, $translate = true )
	   {
		// Return value
		$code	=  false ;

		// Character already has an entry (character reference => substituted character)
		if  ( isset ( $this -> DirectMap [ $offset ] ) )
		   {
			$code	=  ( $translate ) ? $this -> CodePointToUtf8 ( $this -> DirectMap [ $offset ] ) : $this -> DirectMap [ $offset ] ;
		    }
		// Character does not have a direct entry ; have a look in the character ranges defined for this map
		else if  ( $this -> RangeCount  &&  $offset  >=  $this -> RangeMin  &&  $offset  <=  $this -> RangeMax )
		   {
			$low		=  0 ;
			$high		=  count ( $this -> RangeMap ) - 1 ;
			$result		=  false ;

			// Use a dichotomic search through character ranges
			while  ( $low  <=  $high )
			   {
				$middle		=  ( $low + $high )  >>  1 ;

				if  ( $offset  <  $this -> RangeMap [ $middle ] [0] )
					$high	=  $middle - 1 ;
				else if  ( $offset  >  $this -> RangeMap [ $middle ] [1] )
					$low	=  $middle + 1 ;
				else
				   {
					// Found : substitution start + distance from the beginning of the range
					$result	=  $this -> RangeMap [ $middle ] [2] + $offset - $this -> RangeMap [ $middle ] [0] ;
					break ;
				    }
			    }

			// Once a character has been found in the ranges defined by this character map, store it in the DirectMap property
			// so that it will be directly retrieved during subsequent accesses
			if  ( $result  !==  false )
			   {
				$code				=  ( $translate ) ? $this -> CodePointToUtf8 ( $result ) : $result ;
				$this -> DirectMap [ $offset ]	=  $result ;
			    }
		    }

		// All done, return
		return ( $code ) ;
	    }


	// Array access : a character code without any substitution is converted as if it were itself the code point
	#[\ReturnTypeWillChange]
	public function  offsetGet ( $offset )
	   {
		$code	=  $this -> offsetGetSafe ( $offset ) ;

		if  ( $code  === false )
			$code	=  $this -> CodePointToUtf8 ( $offset ) ;

		return ( $code ) ;
	    }
    }
7479
7480
7481/*==============================================================================================================
7482
7483    PdfTexterEncodingMap -
7484        A class for fonts having a character map specified with the /Encoding parameter.
7485
7486  ==============================================================================================================*/
class  PdfTexterEncodingMap 	extends  PdfTexterCharacterMap
   {
	// Possible encodings (there is a 5th one, MacExpertEncoding, but used for "expert fonts" ; no need to deal
	// with it here since we only want to extract text)
	// Note that the values of these constants are direct indices to the second dimension of the $Encodings table
	const 	PDF_STANDARD_ENCODING 		=  0 ;
	const 	PDF_MAC_ROMAN_ENCODING 		=  1 ;
	const 	PDF_WIN_ANSI_ENCODING 		=  2 ;
	const 	PDF_DOC_ENCODING 		=  3 ;

	// Correspondence between an encoding name and its corresponding character in the
	// following format : Standard, Mac, Windows, Pdf
	private static 		$GlobalEncodings 	=  false ;
	public			$Encodings ;
	// Encoding type (one of the PDF_*_ENCODING constants)
	public 			$Encoding ;
	// Indicates whether this character map is a secondary one used for Unicode maps ; this must be set at
	// a higher level by the PdfTexterFont because at the time a character map is instantiated, we do not know
	// yet whether it will be a primary (normal) map, or a map secondary to an existing Unicode map
	public			$Secondary ;
	// Differences array (a character substitution table to the standard encodings)
	public 			$Map 			=  array ( ) ;
	// A secondary map for the Differences array, which only contains the differences ; this is used
	// for Unicode fonts that also have an associated /Differences parameter, which should not include the
	// whole standard Adobe character map but only the differences of encodings
	public			$SecondaryMap		=  array ( ) ;
	// Differences by position number (position => character name, after removal of the ignored variant suffixes)
	public			$DifferencesByPosition	=  array ( ) ;


   	// Constructor -
	//	Detects the base encoding named in the definitions (WinAnsiEncoding when none is named), builds the
	//	initial character map from the encoding tables, then applies the substitutions listed in the first
	//	bracketed array found in the definitions (the /Differences array).
	//	$extra_mappings is merged over the default Adobe character set table loaded from Maps/adobe-charsets.map.
	public function  __construct ( $object_id, $definitions, $extra_mappings )
	   {
		// Ignore character variants whose names end with these suffixes
		static	$IgnoredVariants	=  array
		   (
			'/\.scalt$/',
			'/\.sc$/',
			'/\.fitted$/',
			'/\.oldstyle$/',
			'/\.taboldstyle$/',
			'/\.alt$/',
			'/alt$/',
		    ) ;

		parent::__construct ( $object_id ) ;

		// Load the default Adobe character sets, if not already done
		if  ( self::$GlobalEncodings  ===  false )
		   {
			$charset_file		=  dirname ( __FILE__ ) .  '/Maps/adobe-charsets.map' ;
			include ( $charset_file ) ;
			self::$GlobalEncodings	=  ( isset ( $adobe_charsets ) ) ?  $adobe_charsets : array ( ) ;
		    }

		$this -> Encodings		=  array_merge ( self::$GlobalEncodings, $extra_mappings ) ;

		// Fonts using default Adobe character sets and hexadecimal representations are one-byte long
		$this -> HexCharWidth	=  2 ;

		if  ( PdfToText::$DEBUG )
		   {
	   		echo "\n----------------------------------- ENCODING CMAP #$object_id\n" ;
			echo $definitions;
		    }

		// Retrieve text encoding
		preg_match ( '# / (?P<encoding> (WinAnsiEncoding) | (PDFDocEncoding) | (MacRomanEncoding) | (StandardEncoding) ) #ix',
				$definitions, $encoding_match ) ;

		if ( ! isset ( $encoding_match [ 'encoding' ] ) )
			$encoding_match [ 'encoding' ]	=  'WinAnsiEncoding' ;

		switch ( strtolower ( $encoding_match [ 'encoding' ] ) )
		   {
		   	case 	'pdfdocencoding' 	:  $this -> Encoding	=  self::PDF_DOC_ENCODING 	; break ;
		   	case 	'macromanencoding' 	:  $this -> Encoding 	=  self::PDF_MAC_ROMAN_ENCODING ; break ;
		   	case 	'standardencoding' 	:  $this -> Encoding 	=  self::PDF_STANDARD_ENCODING 	; break ;
		   	case 	'winansiencoding' 	:
		   	default 		 	:  $this -> Encoding 	=  self::PDF_WIN_ANSI_ENCODING	;
		    }

		// Build a virgin character map using the detected encoding
		foreach  ( $this -> Encodings  as  $code_array )
		   {
			$char 			=  $code_array [ $this -> Encoding ] ;
			$this -> Map [ $char ] 	=  $char ;
		    }

		// Extract the Differences array
	   	preg_match ( '/ \[ \s* (?P<contents> [^\]]*?)  \s* \] /x', $definitions, $match ) ;

		if (  ! isset ( $match [ 'contents' ] ) )
			return ;

		// Put a slash before every number preceded by spaces, so that numbers and names can all be split on "/"
		$data 		=  trim ( preg_replace ( '/\s+(\d+)/', '/$1', $match [ 'contents' ] ) ) ;
		$items 		=  explode ( '/', $data ) ;
		$index		=  0 ;

		for  ( $i = 0, $item_count = count ( $items ) ; $i  <  $item_count ; $i ++ )
		   {
			$raw_item	=  trim ( $items [$i] ) ;

			// A leading, trailing or doubled slash yields an empty fragment : this is neither a position nor a
			// character name, so it must not use a slot in the map
			if  ( $raw_item  ===  '' )
				continue ;

		   	$item 		=  PdfToText::DecodeRawName ( $raw_item ) ;

		   	// Integer value  : index of next character in map
			if  ( is_numeric ( $item ) )
				$index 	=  ( int ) $item ;
			// String value : a character name, as defined by Adobe
			else
			   {
				// Remove variant part of the character name
				$item	=  preg_replace  ( $IgnoredVariants, '', trim ( $item ) ) ;

			   	// Keyword (character name) exists in the encoding table
				if  ( isset ( $this -> Encodings [ $item ] ) )
				   {
					$this -> Map [ $index ] 		=
					$this -> SecondaryMap [ $index ]	=  $this -> Encodings [ $item ] [ $this -> Encoding ] ;
				    }
				// Not defined ; check if this is the "/gxx" notation, where "xx" is a number
				else if  ( preg_match ( '/g (?P<value> \d+)/x', $item, $match ) )
				   {
					$value		=  ( int ) $match [ 'value' ] ;

					// In my current state of investigations, the /g notation has the following characteristics :
					// - The value 29 must be added to the number after the "/g" string (why ???)
					// - The value after the "/g" string can be greater than 255, meaning that it could be Unicode codepoint
					// This has to be carefully watched before revision
					$value	+=  29 ;

					$this -> Map [ $index ]			=
					$this -> SecondaryMap [ $index ]	=  $value ;
				    }
				// Some characters can be specified by the "/uni" prefix followed by a sequence of hex digits,
				// which is not described by the PDF specifications. This sequence gives a Unicode code point.
				else if  ( preg_match ( '/uni (?P<value>  [0-9a-f]+)/ix', $item, $match ) )
				   {
					$value		=  hexdec ( $match [ 'value' ] ) ;

					$this -> Map [ $index ]			=
					$this -> SecondaryMap [ $index ]	=  ( int ) $value ;
				    }
				// Otherwise, put a question mark instead
				else
				   {
					if  ( PdfToText::$DEBUG )
						 warning ( "Unknown character name found in a /Differences[] array : [$item]" ) ;

					$this -> Map [ $index ] 		=
					$this -> SecondaryMap [ $index ]	=  ord ( '?' ) ;
				    }

				$this -> DifferencesByPosition [ $index ]	=  $item ;

				// Consecutive names are assigned consecutive positions
				$index ++ ;
			    }
		    }
	    }


	/*--------------------------------------------------------------------------------------------------------------

	        Interface implementations.

		The ReturnTypeWillChange attribute silences the return type deprecation notice issued by PHP 8.1 and
		later for ArrayAccess/Countable methods declared without a return type ; PHP versions below 8.0 see
		this line as a plain "#" comment.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Number of entries in the full map (base encoding + differences)
	#[\ReturnTypeWillChange]
	public function  count ( )
	   { return ( count ( $this -> Map ) ) ; }


	// A primary map looks into the full map ; a secondary one only knows about the differences
	#[\ReturnTypeWillChange]
	public function  offsetExists ( $offset )
	   {
		return ( ( ! $this -> Secondary ) ?
				isset ( $this -> Map [ $offset ] ) :
				isset ( $this -> SecondaryMap [ $offset ] ) ) ;
	    }


	// Returns the translated character for $offset.
	// A primary map always returns a character (the offset itself is used when it has no entry) ; a secondary map
	// returns false for offsets that are not listed in the differences.
	#[\ReturnTypeWillChange]
	public function  offsetGet ( $offset )
	   {
		if  ( ! $this -> Secondary )
		   {
			if  ( isset ( $this -> Map [ $offset ] ) )
				$ord		=  $this -> Map [ $offset ] ;
			else
				$ord		=  $offset ;

			// Check for final character translations (concerns only a few number of characters)
			if  ( $this -> Encoding  ==  self::PDF_WIN_ANSI_ENCODING  &&  isset ( PdfTexterAdobeWinAnsiMap::$WinAnsiCharacterMap [0] [ $ord ] ) )
				$ord	=  PdfTexterAdobeWinAnsiMap::$WinAnsiCharacterMap [0] [ $ord ] ;
			else if  ( $this -> Encoding  ==  self::PDF_MAC_ROMAN_ENCODING  &&  isset ( PdfTexterAdobeMacRomanMap::$MacRomanCharacterMap [0] [ $ord ] ) )
				$ord	=  PdfTexterAdobeMacRomanMap::$MacRomanCharacterMap [0] [ $ord ] ;
			// As far as I have been able to see, the values expressed by the /Differences tag were the only ones used within the
			// Pdf document ; however, handle the case where some characters do not belong to the characters listed by /Differences,
			// and use the official Adobe encoding maps when necessary
			else if  ( isset ( $this -> Encodings [ $ord ] [ $this -> Encoding ] ) )
				$ord	=  $this -> Encodings [ $ord ] [ $this -> Encoding ] ;

			$result		=  $this -> CodePointToUtf8 ( $ord ) ;
		    }
		else if  ( isset ( $this -> SecondaryMap [ $offset ] ) )
		   {
			$ord		=   $this -> SecondaryMap [ $offset ] ;
			$result		=   $this -> CodePointToUtf8 ( $ord ) ;
		    }
		else
			$result		=  false ;

		return ( $result ) ;
	    }
    }
7698
7699
7700/**************************************************************************************************************
7701 **************************************************************************************************************
7702 **************************************************************************************************************
7703 ******                                                                                                  ******
7704 ******                                                                                                  ******
7705 ******                                     CHARACTER MAP MANAGEMENT                                     ******
7706 ******                                                                                                  ******
7707 ******                                                                                                  ******
7708 **************************************************************************************************************
7709 **************************************************************************************************************
7710 **************************************************************************************************************/
7711
7712/*==============================================================================================================
7713
7714    class PdfTexterAdobeMap -
7715        Abstract class to handle Adobe-specific fonts.
7716
7717  ==============================================================================================================*/
abstract class  PdfTexterAdobeMap	extends  PdfTexterCharacterMap
   {
	// Font variant ; one of the PdfTexterFont::FONT_VARIANT_* constants
	public		$Variant ;
	// Translation tables supplied by the derived class : an array indexed by font variant, each entry being
	// an array that associates a character code with its substitution
	public		$Map ;


	// Constructor -
	//	Stores the font variant and the translation tables, then reports an error if the tables have no
	//	entry for the requested variant.
	public function  __construct ( $object_id, $font_variant, $map )
	   {
		parent::__construct ( $object_id ) ;

		$this -> HexCharWidth	=  2 ;
		$this -> Variant	=  $font_variant ;
		$this -> Map		=  $map ;

		if  ( ! isset ( $map [ $font_variant ] ) )
			error ( new  PdfToTextDecodingException ( "Undefined font variant #$font_variant." ) ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	        Interface implementations.

		The ReturnTypeWillChange attribute silences the return type deprecation notice issued by PHP 8.1 and
		later for ArrayAccess/Countable methods declared without a return type ; PHP versions below 8.0 see
		this line as a plain "#" comment.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Number of substitutions defined for the current font variant.
	// Note : the property must be referenced as "Map" ; "$this -> $Map" would be a variable property access
	// through an undefined local variable named $Map.
	#[\ReturnTypeWillChange]
	public function  count ( )
	   { return ( count ( $this -> Map [ $this -> Variant ] ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetExists ( $offset )
	   { return ( isset ( $this -> Map [ $this -> Variant ] [ $offset ] ) ) ; }


	// Returns the translated character for $offset ; codes without a substitution are converted as is
	#[\ReturnTypeWillChange]
	public function  offsetGet ( $offset )
	   {
		if  ( isset ( $this -> Map [ $this -> Variant ] [ $offset ] ) )
			$ord		=  $this -> Map [ $this -> Variant ] [ $offset ] ;
		else
			$ord		=  $offset ;

		return ( $this -> CodePointToUtf8 ( $ord ) ) ;
	    }
    }
7762
7763
7764/*==============================================================================================================
7765
7766    class PdfTexterAdobeWinAnsiMap -
        Class to handle Adobe-specific Win Ansi fonts.
7768
7769  ==============================================================================================================*/
class	PdfTexterAdobeWinAnsiMap		extends		PdfTexterAdobeMap
   {
	// Translation tables from Windows Ansi character codes to Unicode code points, indexed by font variant.
	// Only the codes that differ from their Unicode code point are listed ; any other code is kept as is by
	// the parent class.
	// Source : https://msdn.microsoft.com/en-us/goglobal/cc305145.aspx
	public static	$WinAnsiCharacterMap	=  array
	   (
		// Variant 0 : normal WinAnsi mapping. Only characters from 0x80 to 0x9F have no direct translation.
		0	=>  array
		   (
			0x80 => 0x20AC,		0x82 => 0x201A,		0x83 => 0x0192,		0x84 => 0x201E,
			0x85 => 0x2026,		0x86 => 0x2020,		0x87 => 0x2021,		0x88 => 0x02C6,
			0x89 => 0x2030,		0x8A => 0x0160,		0x8B => 0x2039,		0x8C => 0x0152,
			0x8E => 0x017D,		0x91 => 0x2018,		0x92 => 0x2019,		0x93 => 0x201C,
			0x94 => 0x201D,		0x95 => 0x2022,		0x96 => 0x2013,		0x97 => 0x2014,
			0x98 => 0x02DC,		0x99 => 0x2122,		0x9A => 0x0161,		0x9B => 0x203A,
			0x9C => 0x0153,		0x9E => 0x017E,		0x9F => 0x0178
		     ),
		// Variant 1 : Cyrillic. Codes 0xC0..0xFF are mapped to U+0410..U+044F.
		// NOTE(review) : this table was originally labelled ISO8859-5, but the offsets used here match the
		// Windows-1251 layout - to be confirmed.
		1	=>  array
		   (
			// Both double quotes become a plain quotation mark
			0x93 => 0x0022,		0x94 => 0x0022,
			// Uppercase letters
			0xC0 => 0x0410,		0xC1 => 0x0411,		0xC2 => 0x0412,		0xC3 => 0x0413,
			0xC4 => 0x0414,		0xC5 => 0x0415,		0xC6 => 0x0416,		0xC7 => 0x0417,
			0xC8 => 0x0418,		0xC9 => 0x0419,		0xCA => 0x041A,		0xCB => 0x041B,
			0xCC => 0x041C,		0xCD => 0x041D,		0xCE => 0x041E,		0xCF => 0x041F,
			0xD0 => 0x0420,		0xD1 => 0x0421,		0xD2 => 0x0422,		0xD3 => 0x0423,
			0xD4 => 0x0424,		0xD5 => 0x0425,		0xD6 => 0x0426,		0xD7 => 0x0427,
			0xD8 => 0x0428,		0xD9 => 0x0429,		0xDA => 0x042A,		0xDB => 0x042B,
			0xDC => 0x042C,		0xDD => 0x042D,		0xDE => 0x042E,		0xDF => 0x042F,
			// Lowercase letters
			0xE0 => 0x0430,		0xE1 => 0x0431,		0xE2 => 0x0432,		0xE3 => 0x0433,
			0xE4 => 0x0434,		0xE5 => 0x0435,		0xE6 => 0x0436,		0xE7 => 0x0437,
			0xE8 => 0x0438,		0xE9 => 0x0439,		0xEA => 0x043A,		0xEB => 0x043B,
			0xEC => 0x043C,		0xED => 0x043D,		0xEE => 0x043E,		0xEF => 0x043F,
			0xF0 => 0x0440,		0xF1 => 0x0441,		0xF2 => 0x0442,		0xF3 => 0x0443,
			0xF4 => 0x0444,		0xF5 => 0x0445,		0xF6 => 0x0446,		0xF7 => 0x0447,
			0xF8 => 0x0448,		0xF9 => 0x0449,		0xFA => 0x044A,		0xFB => 0x044B,
			0xFC => 0x044C,		0xFD => 0x044D,		0xFE => 0x044E,		0xFF => 0x044F
		    )
	    ) ;


	// Constructor -
	//	Hands the WinAnsi translation tables over to the parent class, together with the object id and the
	//	font variant selecting the table to be used.
	public function  __construct ( $object_id, $font_variant )
	   { parent::__construct ( $object_id, $font_variant, self::$WinAnsiCharacterMap ) ; }
    }
7885
7886
7887/*==============================================================================================================
7888
7889    class PdfTexterAdobeMacRomanMap -
        Class to handle Adobe-specific Mac Roman fonts.
7891
7892  ==============================================================================================================*/
class	PdfTexterAdobeMacRomanMap		extends		PdfTexterAdobeMap
   {
	// Translation table from Mac Roman character codes (0x80..0xFF) to Unicode code points.
	// Values follow ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT
	// The table itself is stored under key 0 of the outer array.
	// NOTE(review): key 0 is presumably a table/variant index expected by PdfTexterAdobeMap - confirm
	// against the parent class.
	public static	$MacRomanCharacterMap	=  array
	   (
		0	=>  array
		   (
			0x80	=>  0x00C4,	// LATIN CAPITAL LETTER A WITH DIAERESIS
			0x81	=>  0x00C5,	// LATIN CAPITAL LETTER A WITH RING ABOVE
			0x82	=>  0x00C7,	// LATIN CAPITAL LETTER C WITH CEDILLA
			0x83	=>  0x00C9,	// LATIN CAPITAL LETTER E WITH ACUTE
			0x84	=>  0x00D1,	// LATIN CAPITAL LETTER N WITH TILDE
			0x85	=>  0x00D6,	// LATIN CAPITAL LETTER O WITH DIAERESIS
			0x86	=>  0x00DC,	// LATIN CAPITAL LETTER U WITH DIAERESIS
			0x87	=>  0x00E1,	// LATIN SMALL LETTER A WITH ACUTE
			0x88	=>  0x00E0,	// LATIN SMALL LETTER A WITH GRAVE
			0x89	=>  0x00E2,	// LATIN SMALL LETTER A WITH CIRCUMFLEX
			0x8A	=>  0x00E4,	// LATIN SMALL LETTER A WITH DIAERESIS
			0x8B	=>  0x00E3,	// LATIN SMALL LETTER A WITH TILDE
			0x8C	=>  0x00E5,	// LATIN SMALL LETTER A WITH RING ABOVE
			0x8D	=>  0x00E7,	// LATIN SMALL LETTER C WITH CEDILLA
			0x8E	=>  0x00E9,	// LATIN SMALL LETTER E WITH ACUTE
			0x8F	=>  0x00E8,	// LATIN SMALL LETTER E WITH GRAVE
			0x90	=>  0x00EA,	// LATIN SMALL LETTER E WITH CIRCUMFLEX
			0x91	=>  0x00EB,	// LATIN SMALL LETTER E WITH DIAERESIS
			0x92	=>  0x00ED,	// LATIN SMALL LETTER I WITH ACUTE
			0x93	=>  0x00EC,	// LATIN SMALL LETTER I WITH GRAVE
			0x94	=>  0x00EE,	// LATIN SMALL LETTER I WITH CIRCUMFLEX
			0x95	=>  0x00EF,	// LATIN SMALL LETTER I WITH DIAERESIS
			0x96	=>  0x00F1,	// LATIN SMALL LETTER N WITH TILDE
			0x97	=>  0x00F3,	// LATIN SMALL LETTER O WITH ACUTE
			0x98	=>  0x00F2,	// LATIN SMALL LETTER O WITH GRAVE
			0x99	=>  0x00F4,	// LATIN SMALL LETTER O WITH CIRCUMFLEX
			0x9A	=>  0x00F6,	// LATIN SMALL LETTER O WITH DIAERESIS
			0x9B	=>  0x00F5,	// LATIN SMALL LETTER O WITH TILDE
			0x9C	=>  0x00FA,	// LATIN SMALL LETTER U WITH ACUTE
			0x9D	=>  0x00F9,	// LATIN SMALL LETTER U WITH GRAVE
			0x9E	=>  0x00FB,	// LATIN SMALL LETTER U WITH CIRCUMFLEX
			0x9F	=>  0x00FC,	// LATIN SMALL LETTER U WITH DIAERESIS
			0xA0	=>  0x2020,	// DAGGER
			0xA1	=>  0x00B0,	// DEGREE SIGN
			0xA2	=>  0x00A2,	// CENT SIGN
			0xA3	=>  0x00A3,	// POUND SIGN
			0xA4	=>  0x00A7,	// SECTION SIGN
			0xA5	=>  0x2022,	// BULLET
			0xA6	=>  0x00B6,	// PILCROW SIGN
			0xA7	=>  0x00DF,	// LATIN SMALL LETTER SHARP S
			0xA8	=>  0x00AE,	// REGISTERED SIGN
			0xA9	=>  0x00A9,	// COPYRIGHT SIGN
			0xAA	=>  0x2122,	// TRADE MARK SIGN
			0xAB	=>  0x00B4,	// ACUTE ACCENT
			0xAC	=>  0x00A8,	// DIAERESIS
			0xAD	=>  0x2260,	// NOT EQUAL TO
			0xAE	=>  0x00C6,	// LATIN CAPITAL LETTER AE
			0xAF	=>  0x00D8,	// LATIN CAPITAL LETTER O WITH STROKE
			0xB0	=>  0x221E,	// INFINITY
			0xB1	=>  0x00B1,	// PLUS-MINUS SIGN
			0xB2	=>  0x2264,	// LESS-THAN OR EQUAL TO
			0xB3	=>  0x2265,	// GREATER-THAN OR EQUAL TO
			0xB4	=>  0x00A5,	// YEN SIGN
			0xB5	=>  0x00B5,	// MICRO SIGN
			0xB6	=>  0x2202,	// PARTIAL DIFFERENTIAL
			0xB7	=>  0x2211,	// N-ARY SUMMATION
			0xB8	=>  0x220F,	// N-ARY PRODUCT
			0xB9	=>  0x03C0,	// GREEK SMALL LETTER PI
			0xBA	=>  0x222B,	// INTEGRAL
			0xBB	=>  0x00AA,	// FEMININE ORDINAL INDICATOR
			0xBC	=>  0x00BA,	// MASCULINE ORDINAL INDICATOR
			0xBD	=>  0x03A9,	// GREEK CAPITAL LETTER OMEGA
			0xBE	=>  0x00E6,	// LATIN SMALL LETTER AE
			0xBF	=>  0x00F8,	// LATIN SMALL LETTER O WITH STROKE
			0xC0	=>  0x00BF,	// INVERTED QUESTION MARK
			0xC1	=>  0x00A1,	// INVERTED EXCLAMATION MARK
			0xC2	=>  0x00AC,	// NOT SIGN
			0xC3	=>  0x221A,	// SQUARE ROOT
			0xC4	=>  0x0192,	// LATIN SMALL LETTER F WITH HOOK
			0xC5	=>  0x2248,	// ALMOST EQUAL TO
			0xC6	=>  0x2206,	// INCREMENT
			0xC7	=>  0x00AB,	// LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
			0xC8	=>  0x00BB,	// RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
			0xC9	=>  0x2026,	// HORIZONTAL ELLIPSIS
			0xCA	=>  0x00A0,	// NO-BREAK SPACE
			0xCB	=>  0x00C0,	// LATIN CAPITAL LETTER A WITH GRAVE
			0xCC	=>  0x00C3,	// LATIN CAPITAL LETTER A WITH TILDE
			0xCD	=>  0x00D5,	// LATIN CAPITAL LETTER O WITH TILDE
			0xCE	=>  0x0152,	// LATIN CAPITAL LIGATURE OE
			0xCF	=>  0x0153,	// LATIN SMALL LIGATURE OE
			0xD0	=>  0x2013,	// EN DASH
			0xD1	=>  0x2014,	// EM DASH
			0xD2	=>  0x201C,	// LEFT DOUBLE QUOTATION MARK
			0xD3	=>  0x201D,	// RIGHT DOUBLE QUOTATION MARK
			0xD4	=>  0x2018,	// LEFT SINGLE QUOTATION MARK
			0xD5	=>  0x2019,	// RIGHT SINGLE QUOTATION MARK
			0xD6	=>  0x00F7,	// DIVISION SIGN
			0xD7	=>  0x25CA,	// LOZENGE
			0xD8	=>  0x00FF,	// LATIN SMALL LETTER Y WITH DIAERESIS
			0xD9	=>  0x0178,	// LATIN CAPITAL LETTER Y WITH DIAERESIS
			0xDA	=>  0x2044,	// FRACTION SLASH
			0xDB	=>  0x20AC,	// EURO SIGN
			0xDC	=>  0x2039,	// SINGLE LEFT-POINTING ANGLE QUOTATION MARK
			0xDD	=>  0x203A,	// SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
			0xDE	=>  0xFB01,	// LATIN SMALL LIGATURE FI
			0xDF	=>  0xFB02,	// LATIN SMALL LIGATURE FL
			0xE0	=>  0x2021,	// DOUBLE DAGGER
			0xE1	=>  0x00B7,	// MIDDLE DOT
			0xE2	=>  0x201A,	// SINGLE LOW-9 QUOTATION MARK
			0xE3	=>  0x201E,	// DOUBLE LOW-9 QUOTATION MARK
			0xE4	=>  0x2030,	// PER MILLE SIGN
			0xE5	=>  0x00C2,	// LATIN CAPITAL LETTER A WITH CIRCUMFLEX
			0xE6	=>  0x00CA,	// LATIN CAPITAL LETTER E WITH CIRCUMFLEX
			0xE7	=>  0x00C1,	// LATIN CAPITAL LETTER A WITH ACUTE
			0xE8	=>  0x00CB,	// LATIN CAPITAL LETTER E WITH DIAERESIS
			0xE9	=>  0x00C8,	// LATIN CAPITAL LETTER E WITH GRAVE
			0xEA	=>  0x00CD,	// LATIN CAPITAL LETTER I WITH ACUTE
			0xEB	=>  0x00CE,	// LATIN CAPITAL LETTER I WITH CIRCUMFLEX
			0xEC	=>  0x00CF,	// LATIN CAPITAL LETTER I WITH DIAERESIS
			0xED	=>  0x00CC,	// LATIN CAPITAL LETTER I WITH GRAVE
			0xEE	=>  0x00D3,	// LATIN CAPITAL LETTER O WITH ACUTE
			0xEF	=>  0x00D4,	// LATIN CAPITAL LETTER O WITH CIRCUMFLEX
			0xF0	=>  0xF8FF,	// Apple logo
			0xF1	=>  0x00D2,	// LATIN CAPITAL LETTER O WITH GRAVE
			0xF2	=>  0x00DA,	// LATIN CAPITAL LETTER U WITH ACUTE
			0xF3	=>  0x00DB,	// LATIN CAPITAL LETTER U WITH CIRCUMFLEX
			0xF4	=>  0x00D9,	// LATIN CAPITAL LETTER U WITH GRAVE
			0xF5	=>  0x0131,	// LATIN SMALL LETTER DOTLESS I
			0xF6	=>  0x02C6,	// MODIFIER LETTER CIRCUMFLEX ACCENT
			0xF7	=>  0x02DC,	// SMALL TILDE
			0xF8	=>  0x00AF,	// MACRON
			0xF9	=>  0x02D8,	// BREVE
			0xFA	=>  0x02D9,	// DOT ABOVE
			0xFB	=>  0x02DA,	// RING ABOVE
			0xFC	=>  0x00B8,	// CEDILLA
			0xFD	=>  0x02DD,	// DOUBLE ACUTE ACCENT
			0xFE	=>  0x02DB,	// OGONEK
			0xFF	=>  0x02C7	// CARON
		    )
	    ) ;


	// Hands the Mac Roman translation table over to the parent constructor, along with the object id
	// and font variant received from the caller.
	public function  __construct ( $object_id, $font_variant )
	   { parent::__construct ( $object_id, $font_variant, self::$MacRomanCharacterMap ) ; }
    }
8038
8039
8040/*==============================================================================================================
8041
8042    class PdfTexterAdobeUndocumentedUnicodeMap -
8043        Sometimes, Unicode maps translate character ids to something in the range 0xF000..0xF0FF (or maybe more).
8044	These mapped characters do not correspond to anything else in Unicode, but rather to a special character
8045	set.
	This class is not meant to be instantiated by anything here, but rather used for its $UnicodeMap property.
	Note that the $UnicodeMap array is not complete.
8048
8049  ==============================================================================================================*/
class	PdfTexterAdobeUndocumentedUnicodeMap		extends		PdfTexterAdobeMap
   {
	// Translation table from code points of the 0xF0xx range to the character code they stand for.
	// Keys are the 0xF0xx values, values are the replacement character codes. The table is known to be
	// incomplete.
	public static	$UnicodeMap		=  array
	   (
		// Digits
		0xF0F0	=>  0x30,	// '0'
		0xF0EF	=>  0x31,	// '1'
		0xF0EE	=>  0x32,	// '2'
		0xF0ED	=>  0x33,	// '3'
		0xF0EC	=>  0x34,	// '4'
		0xF0EB	=>  0x35,	// '5'
		0xF0EA	=>  0x36,	// '6'
		0xF0E9	=>  0x37,	// '7'
		0xF0E8	=>  0x38,	// '8'
		0xF0E7	=>  0x39,	// '9'

		// Uppercase letters
		0xF0DF	=>  0x41,	// 'A'
		0xF0DE	=>  0x42,	// 'B'
		0xF0DD	=>  0x43,	// 'C'
		0xF0DC	=>  0x44,	// 'D'
		0xF0DB	=>  0x45,	// 'E'
		0xF0DA	=>  0x46,	// 'F'
		0xF0D9	=>  0x47,	// 'G'
		0xF0D8	=>  0x48,	// 'H'
		0xF0D7	=>  0x49,	// 'I'
		0xF0D6	=>  0x4A,	// 'J'
		0xF0D5	=>  0x4B,	// 'K'
		0xF0D4	=>  0x4C,	// 'L'
		0xF0D3	=>  0x4D,	// 'M'
		0xF0D2	=>  0x4E,	// 'N'
		0xF0D1	=>  0x4F,	// 'O'
		0xF0D0	=>  0x50,	// 'P'
		0xF0CF	=>  0x51,	// 'Q'
		0xF0CE	=>  0x52,	// 'R'
		0xF0CD	=>  0x53,	// 'S'
		0xF0CC	=>  0x54,	// 'T'
		0xF0CB	=>  0x55,	// 'U'
		0xF0CA	=>  0x56,	// 'V'
		0xF0C9	=>  0x57,	// 'W'
		0xF0C8	=>  0x58,	// 'X'
		0xF0C7	=>  0x59,	// 'Y'
		0xF0C6	=>  0x5A,	// 'Z'

		// Lowercase letters
		0xF0BF	=>  0x61,	// 'a'
		0xF0BE	=>  0x62,	// 'b'
		0xF0BD	=>  0x63,	// 'c'
		0xF0BC	=>  0x64,	// 'd'
		0xF0BB	=>  0x65,	// 'e'
		0xF0BA	=>  0x66,	// 'f'
		0xF0B9	=>  0x67,	// 'g'
		0xF0B8	=>  0x68,	// 'h'
		0xF0B7	=>  0x69,	// 'i'
		0xF0B6	=>  0x6A,	// 'j'
		0xF0B5	=>  0x6B,	// 'k'
		0xF0B4	=>  0x6C,	// 'l'
		0xF0B3	=>  0x6D,	// 'm'
		0xF0B2	=>  0x6E,	// 'n'
		0xF0B1	=>  0x6F,	// 'o'
		0xF0B0	=>  0x70,	// 'p'
		0xF0AF	=>  0x71,	// 'q'
		0xF0AE	=>  0x72,	// 'r'
		0xF0AD	=>  0x73,	// 's'
		0xF0AC	=>  0x74,	// 't'
		0xF0AB	=>  0x75,	// 'u'
		0xF0AA	=>  0x76,	// 'v'
		0xF0A9	=>  0x77,	// 'w'
		0xF0A8	=>  0x78,	// 'x'
		0xF0A7	=>  0x79,	// 'y'
		0xF0A6	=>  0x7A,	// 'z'

		// Punctuation, space and accented letters
		0xF0F1	=>  0x2F,	// '/'
		0xF0E6	=>  0x3A,	// ':'
		0xF0F3	=>  0x2D,	// '-'
		0xF0F8	=>  0x28,	// '('
		0xF0F7	=>  0x29,	// ')'
		0xF0F2	=>  0x2E,	// '.'
		0xF020	=>  0x20,	// Space
		0xF0F9	=>  0x27,	// "'"
		0xF037	=>  0xE9,	// &eacute;
		0xF038	=>  0xE8,	// &egrave;
	    ) ;


	// Hands the $UnicodeMap table over to the parent constructor, along with the object id and font
	// variant received from the caller.
	public function  __construct ( $object_id, $font_variant )
	   { parent::__construct ( $object_id, $font_variant, self::$UnicodeMap ) ; }
    }
8135
8136
8137/*==============================================================================================================
8138
8139    PdfTexterCIDMap -
8140        A class for mapping (or trying to...) CID fonts.
8141
8142  ==============================================================================================================*/
abstract class	PdfTexterCIDMap		extends  PdfTexterCharacterMap
   {
	// CID maps are associative arrays whose keys are the font CID (currently expressed as a numeric value) and
	// whose values are the corresponding UTF8 representation. The following special values can also be used to
	// initialize certain entries :
	// UNKNOWN_CID :
	//	Indicates that the corresponding CID has no known UTF8 counterpart. When the PdfToText::$DEBUG variable
	//	is true, every character in this case will be replaced with the string : "[UID:abcd]", where "abcd" is
	//	the hex representation of the CID. This way, new CID tables can be built using this information.
	//	When PdfToText::$DEBUG is false, such characters are replaced with an empty string.
	const		UNKNOWN_CID		=  -1 ;
	// ALT_CID :
	//	Deliberately left undocumented and highly subject to change, since it dates from the author's first
	//	interpretation of CID fonts, which is probably wrong.
	//	As implemented by offsetGet(), an ALT_CID entry produces no output by itself ; its CID is remembered
	//	and the next CID is looked up in $map [ 'alt' ] [ remembered cid ] [ next cid ].
	const		ALT_CID			=  -2 ;


	// CID font map file ; the file is a PHP script that must contain an array of the form :
	//	$map	=  array
	//	   (
	//		'plain'		=>  array
	//		   (
	//			$cid1	=>  $utf1,
	//			...
	//		    )
	//	    ) ;
	// An optional 'alt' entry, indexed by [ alt cid ] [ cid ], is consulted after an ALT_CID entry.
	protected	$MapFile ;
	// Map, loaded into memory (an empty array when no usable CID table could be loaded)
	protected	$Map ;
	// Map cache - the interest is to avoid unnecessary includes. Keys have the form "map name:font variant"
	// and values are arrays with the 'file' and 'map' entries.
	private static	$CachedMaps		=  array ( ) ;

	// Related to the first experimental implementation of CID fonts : CID of the last ALT_CID entry seen
	// by offsetGet(), or false if the previous character was not an ALT_CID one
	private		$LastAltOffset		=  false ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Loads the specified map.
		If the map file contains a definition such as :

			$map	=  'IDENTITY-H-GQJGLM.cid' ;

		then the specified map will be loaded instead (only one indirection is supported).

		When no map file can be found, or when the file does not end up defining an array in $map, the
		object is initialized with an empty map and, in debug mode, a warning is issued.

	    PARAMETERS
		$object_id -
			Object id, passed as is to the parent constructor.

		$map_name (string) -
			Name of the CID map, for example 'IDENTITY-H'.

		$font_variant (string) -
			Font variant, used to search for a more specific CID table (see __get_cid_file()).

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $object_id, $map_name, $font_variant )
	   {
		// Initialize parent objects
		parent::__construct ( $object_id ) ;
		$this -> HexCharWidth	=  4 ;			// So far, CIDs are 2-bytes long

		// Since alternate characters can be apparently prefixed by 0x0000 or 0x0001, two calls to the array access operator
		// will be needed to retrieve the exact character in such cases
		// This is why we have to tell the upper layers not to cache the results
		$this -> Cache		=  false ;

		$map_index	=  "$map_name:$font_variant" ;

		// If this font has already been loaded somewhere, then reuse its information
		if  ( isset ( self::$CachedMaps [ $map_index ] ) )
		   {
			$map	=  self::$CachedMaps [ $map_index ] [ 'map' ] ;
			$file	=  self::$CachedMaps [ $map_index ] [ 'file' ] ;
		    }
		// Otherwise, search the CID tables directory
		else
		   {
			$file		=  $this -> __get_cid_file ( $map_name, $font_variant ) ;

			// No CID map found : CID numbers will be mapped as is
			if  ( $file  ===  false  ||  ! file_exists ( $file ) )
			   {
				if  ( PdfToText::$DEBUG )
					warning ( new PdfToTextDecodingException ( "Could not find CID table \"$map_name\" in directory \"" . PdfToText::$CIDTablesDirectory . "\"." ) ) ;
			    }
			// Otherwise, load the CID map ; the included script is expected to define the $map variable
			else
			   {
				include ( $file ) ;

				// We authorize one CID map to contain the name of another CID map file, instead of the map itself.
				// The redirecting value is discarded before including the target, so that a missing target or a
				// target which does not define $map cannot leave a string behind as the "map".
				if  ( isset ( $map )  &&  is_string ( $map ) )
				   {
					$file	=  PdfToText::$CIDTablesDirectory . "/$map" ;
					unset ( $map ) ;

					if  ( file_exists ( $file ) )
						include ( $file ) ;
				    }

				// Only real arrays are accepted (and cached) as maps ; this also rejects a second level of indirection
				if  ( isset ( $map )  &&  is_array ( $map ) )
					self::$CachedMaps [ $map_index ]	=  array ( 'file' => $file, 'map' => $map ) ;
				else
				   {
					unset ( $map ) ;

					if  ( PdfToText::$DEBUG )
						warning ( new PdfToTextDecodingException ( "CID \"$file\" does not contain any definition." ) ) ;
				    }
			    }
		    }

		// Save map info for this CID font
		$this -> MapFile	=  $file ;
		$this -> Map		=  ( isset ( $map ) ) ?  $map :  array ( ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    __get_cid_file -
		Searches in the CIDTables directory for the CID map that best matches the specified map name (usually,
		IDENTITY-H) and the optional font variant.

		If a font variant has been specified, like "ABCD+Italic-Arial", then the CID tables directory will be
		searched for the following files, in the following order :
		- IDENTITY-H-ABCD+Italic-Arial.cid
		- IDENTITY-H-ABCD+Italic.cid
		- IDENTITY-H-ABCD.cid
		- IDENTITY-H-empty.cid (a warning will be issued in debug mode if this one is selected)

		In all cases, the last candidate is "$map_name.cid" (a warning is also issued in debug mode when it
		is selected).

		Font variants come from the PDF file ; those containing a path separator or a NUL character are
		never used to build a file name, since the resulting path is later given to include().

	    RETURN VALUE
		The path of the first existing candidate, or false if none exists.

	 *-------------------------------------------------------------------------------------------------------------*/
	private function  __get_cid_file ( $map_name, $font_variant )
	   {
		$files		=  array ( ) ;

		// Search for font variants, if any
		if  ( $font_variant )
		   {
			$is_safe_name	=  ! preg_match ( '#[/\\\\\x00]#', $font_variant ) ;

			if  ( $is_safe_name  &&
					preg_match ( '/^ (?P<name> [a-z_][a-z_0-9]*) (?P<rest> [\-+] .*) $/imsx' , $font_variant, $match ) )
			   {
				$basename	=  '-' . $match [ 'name' ] ;

				if  ( preg_match_all ( '/ (?P<sep> [\-+]) (?P<name> [^\-+]+) /ix', $match [ 'rest' ], $other_matches ) )
				   {
					// Start with all the name parts (most specific file name), then drop the trailing part
					// at each iteration, until only the base name remains
					for  ( $i = count ( $other_matches [ 'name' ] ) ; $i  >=  0 ; $i -- )
					   {
						$new_file	=  $basename ;

						for  ( $j = 0 ; $j  <  $i ; $j ++ )
							$new_file	.=  $other_matches [ 'sep' ] [$j] . $other_matches [ 'name' ] [$j] ;

						$files []		=  array ( PdfToText::$CIDTablesDirectory . "/$map_name$new_file.cid", 'standard' ) ;
					    }
				    }
			    }

			// Last one will be the empty CID font
			$files []	=  array ( PdfToText::$CIDTablesDirectory . "/IDENTITY-H-empty.cid", 'empty' ) ;
		    }

		// Add the specified map file
		$files []	=  array ( PdfToText::$CIDTablesDirectory . "/$map_name.cid", 'default' ) ;

		// The first existing file in the list should be the appropriate one
		foreach  ( $files  as  $file )
		   {
			if  ( file_exists ( $file [0] ) )
			   {
				if  ( PdfToText::$DEBUG )
				   {
					if  ( $file [1]  ===  'empty' )
						warning ( new PdfToTextDecodingException ( "Using empty IDENTITY-H definition for map \"$map_name\", variant \"$font_variant\"." ) ) ;
					else if  ( $file [1]  ===  'default' )
						warning ( new PdfToTextDecodingException ( "Using default IDENTITY-H definition for map \"$map_name\"." ) ) ;
				    }

				return ( $file [0] ) ;
			    }
		    }

		// No CID font found
		return ( false ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	        Interface implementations.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Returns the number of top-level entries of the loaded map.
	// NOTE(review): this counts the sections of the map ('plain', 'alt'...), not the CID entries that
	// offsetExists() and offsetGet() look up in the 'plain' section - confirm what callers expect before
	// changing it.
	public function  count ( )
	   { return ( count ( $this -> Map ) ) ; }


	// Tells whether the 'plain' section of the map has an entry for the specified CID.
	public function  offsetExists ( $offset )
	   { return ( isset ( $this -> Map [ 'plain' ] [ $offset ] ) ) ; }


	// Returns the text that corresponds to the specified CID :
	// - An empty string if the CID is not in the 'plain' section of the map
	// - For UNKNOWN_CID entries, an empty string or, in debug mode, a "[UID:xxxx]" placeholder (which is
	//   also echoed)
	// - For ALT_CID entries, an empty string ; the CID is remembered for the next call
	// - Otherwise the mapped value, unless the previous call was for an ALT_CID entry : in that case the
	//   value comes from the 'alt' section, and is an empty string if no such entry exists or if it is
	//   unknown and debug mode is off.
	public function  offsetGet ( $offset )
	   {
		// Unmapped CID : nothing to output, and any pending alternate CID is forgotten
		if  ( ! isset ( $this -> Map [ 'plain' ] [ $offset ] ) )
		   {
			$this -> LastAltOffset	=  false ;

			return ( '' ) ;
		    }

		$ch	=  $this -> Map [ 'plain' ] [ $offset ] ;

		switch  ( $ch )
		   {
			case	self::UNKNOWN_CID :
				$this -> LastAltOffset	=  false ;

				if  ( ! PdfToText::$DEBUG )
					return ( '' ) ;

				$placeholder	=  '[UID:' . sprintf ( '%04x', $offset ) . "]" ;

				echo ( $placeholder ) ;

				return ( $placeholder ) ;

			case	self::ALT_CID :
				$this -> LastAltOffset		=  ( int ) $offset ;

				return ( '' ) ;

			default :
				if  ( $this -> LastAltOffset  ===  false )
					return ( $ch ) ;

				$ch2	=  '' ;

				if  ( isset ( $this -> Map [ 'alt' ] [ $this -> LastAltOffset ] [ $offset ] ) )
				   {
					$ch2	=  $this -> Map [ 'alt' ] [ $this -> LastAltOffset ] [ $offset ] ;

					if  ( $ch2  ==  self::UNKNOWN_CID )
					   {
						if  ( PdfToText::$DEBUG )
						   {
							echo ( "[CID{$this -> LastAltOffset}:" . sprintf ( '%04x', $offset ) . "]" ) ;

							$ch2  =  "[CID{$this -> LastAltOffset}: $offset]" ;
						    }
						// Outside debug mode, an unknown alternate character yields no text, as for
						// UNKNOWN_CID entries of the 'plain' section (the UNKNOWN_CID marker itself
						// must never reach the caller)
						else
							$ch2  =  '' ;
					    }
				    }

				$this -> LastAltOffset	=  false ;

				return ( $ch2 ) ;
		    }
	    }
    }
8386
8387
8388
8389/*==============================================================================================================
8390
8391    PdfTexterIdentityHCIDMap -
8392        A class for mapping IDENTITY-H CID fonts (or trying to...).
8393
8394  ==============================================================================================================*/
class  PdfTexterIdentityHCIDMap		extends  PdfTexterCIDMap
   {
	// Builds a CID map bound to the 'IDENTITY-H' map name ; the parent constructor is in charge of
	// locating and loading the CID table that matches the specified font variant.
	public function  __construct ( $object_id, $font_variant )
	   {
		$cid_map_name	=  'IDENTITY-H' ;

		parent::__construct ( $object_id, $cid_map_name, $font_variant ) ;
	    }
    }
8402
8403
8404
8405/*==============================================================================================================
8406
8407    PdfTexterPageMap -
8408        A class for detecting page objects mappings and retrieving page number for a specified object.
8409	There is a quadruple level of indirection here :
8410
8411	- The first level contains a /Type /Catalog parameter, with a /Pages one that references an object which
8412	  contains a /Count and /Kids. I don't know yet if the /Pages parameter can reference more than one
8413	  object using the array notation. However, the class is designed to handle such situations.
8414	- The object containing the /Kids parameter references objects who, in turn, lists the objects contained
8415	  into one single page.
8416	- Each object referenced in /Kids has a /Type/Page parameter, together with /Contents, which lists the
8417	  objects of the current page.
8418
8419	Object references are of the form : "x y R", where "x" is the object number.
8420
8421	Of course, anything can be in any order, otherwise it would not be funny ! Consider the following
8422	example :
8423
8424		(1) 5 0 obj
8425			<< ... /Pages 1 0 R ... >>
8426		    endobj
8427
8428		(2) 1 0 obj
8429			<< ... /Count 1 /Kids[6 0 R] ... /Type/Pages ... >>
8430		    endobj
8431
8432		(3)  6 0 obj
8433			<< ... /Type/Page ... /Parent 1 0 R ... /Contents [10 0 R 11 0 R ... x 0 R]
8434		     endobj
8435
8436	Object #5 says that object #1 contains the list of page contents (in this example, there is only one page,
8437	referenced by object #6).
8438	Object #6 says that the objects #10, #11 through #x are contained into the same page.
8439	The quadruple indirection comes when you are handling one of the objects referenced in object #6 and you
8440	need to retrieve their page number...
8441
8442	Of course, you cannot rely on the fact that all objects appear in logical order.
8443
8444	And, of course #2, there may be no page catalog at all ! in such cases, objects containing drawing
8445	instructions will have to be considered as a single page, whose number will be sequential.
8446
8447	And, of course #3, as this is the case with the official PDF 1.7 Reference from Adobe, there can be a
8448	reference to a non-existing object which was meant to contain the /Kids parameter (!). In this case,
8449	taking the ordinal number of objects of type (3) gives the page number minus one.
8450
8451	One mystery is that the PDF 1.7 Reference file contains 1310 pages but only 1309 are recognized here...
8452
8453  ==============================================================================================================*/
8454class  PdfTexterPageMap		extends  PdfObjectBase
8455   {
8456	// Page contents are (normally) first described by a catalog
8457	// Although there should be only one entry for that, this property is defined as an array, as you need to really
8458	// become paranoid when handling pdf contents...
8459	protected	$PageCatalogs		=  array ( ) ;
8460	// Entries that describe which page contains which text objects. Of course, these can be nested otherwise it would not be funny !
8461	protected	$PageKids		=  array ( ) ;
8462	// Terminal entries : they directly give the ids of the objects belonging to a page
8463	public		$PageContents		=  array ( ) ;
8464	// Note that all the above arrays are indexed by object id and filled with the data collected by calling the Peek() Method...
8465
8466	// Objects that could be referenced from other text objects as XObjects, using the /TPLx notation
8467	protected	$TemplateObjects	=  array ( ) ;
8468
8469	// Once the Peek() method has collected page contents & object information, the MapCatalog() method is called to create this array
8470	// which contains page numbers as keys, and the list of objects contained in this page as values
8471	public		$Pages			=  array ( ) ;
8472	// Holds page attributes
8473	public		$PageAttributes		=  array ( ) ;
8474
8475	// Resource mappings can either refer to an object (/Resources 2 0 R) or to inline mappings (/Resources << ... >>)
8476	// The same object can be referenced by many /Resources parameters throughout the pdf file, so its important to keep
8477	// the analyzed mappings in a cache, so that later references will reuse the results of the first one
8478	private		$ResourceMappingCache	=  array ( ) ;
8479	// List of XObject names - Used by the IsValidTemplate() function
8480	private		$XObjectNames		=  array ( ) ;
8481
8482
8483	/*--------------------------------------------------------------------------------------------------------------
8484
8485	    CONSTRUCTOR
		Creates a PdfTexterPageMap object. Actually, nothing significant is performed here, as this class' goal
		is to be used internally by PdfTexter.
8488
8489	 *-------------------------------------------------------------------------------------------------------------*/
8490	public function  __construct ( )
8491	   {
8492		parent::__construct ( ) ;
8493	    }
8494
8495
8496	/*--------------------------------------------------------------------------------------------------------------
8497
8498	    NAME
	        AddTemplateObject - Adds an object that could be referenced as a template.
8500
8501	    PROTOTYPE
8502	        $pagemap -> AddTemplateObject ( $object_id, $object_text_data ) ;
8503
8504	    DESCRIPTION
8505	        Adds an object that may be referenced as a template from another text object, using the /TPLx notation.
8506
8507	    PARAMETERS
	        $object_id (integer) -
	                Id of the object that may be referenced as a template.

		$object_text_data (string) -
			Object contents.
8513
8514	 *-------------------------------------------------------------------------------------------------------------*/
8515	public function  AddTemplateObject ( $object_id, $object_text_data )
8516	   {
8517		$this -> TemplateObjects [ $object_id ]		=  $object_text_data ;
8518	    }
8519
8520
8521	/*--------------------------------------------------------------------------------------------------------------
8522
8523	    NAME
8524	        GetResourceMappings - Gets resource mappings specified after a /Resources parameter.
8525
8526	    PROTOTYPE
8527	        $result		=  $this -> GetResourceMappings ( $object_id, $object_data, $parameter, $pdf_object_list ) ;
8528
8529	    DESCRIPTION
8530	        Most of the time, objects containing a page description (/Type /Page) also contain a /Resources parameter,
8531		which may be followed by one of the following constructs :
8532		- A reference to an object, such as :
8533			/Resources 2 0 R
8534		- Or an inline set of parameters, such as font or xobject mappings :
8535			/Resources << /Font<</F1 10 0 R ...>> /XObject <</Im0 27 0 R ...>>
		This method extracts alias/object mappings for the parameter specified by $parameter (it can be for
		example '/Font' or '/XObject') and returns these mappings as an associative array.
8538
8539	    PARAMETERS
8540	        $object_id (integer) -
8541	                Id of the object that may contain a resource mapping entry.
8542
8543		$object_data (string) -
8544			Object contents.
8545
8546		$parameter (string) -
8547			Parameter defining resource mapping, for example /Font or /XObject.
8548
8549		$pdf_object_list (associative array) -
8550			Array of object id/object data associations, for all objects defined in the pdf file.
8551
8552	    RETURN VALUE
8553	        The list of resource mappings for the specified parameter, as an associative array, whose keys are the
8554		resource aliases and values are the corresponding object ids.
8555		The method returns an empty array if the specified object does not contain resource mappings or does
8556		not contain the specified parameter.
8557
8558	 *-------------------------------------------------------------------------------------------------------------*/
8559	protected function  GetResourceMappings ( $object_id, $object_data, $parameter, $pdf_object_list )
8560	   {
8561		// The /Resources parameter refers to an existing PDF object
8562		if  ( preg_match ( '#/Resources \s* (?P<object_id> \d+) \s+ \d+ \s+ R#ix', $object_data, $match ) )
8563		   {
8564			// Return the cached result if the same object has previously been referenced by a /Resources parameter
8565			if  ( isset ( $this -> ResourceMappingCache [ $object_id ] [ $parameter ] ) )
8566				return ( $this -> ResourceMappingCache [ $object_id ] [ $parameter ] ) ;
8567
8568			// Check that the object that is referred to exists
8569			if  ( isset ( $pdf_object_list [ $match [ 'object_id' ] ] ) )
8570				$data	=  $pdf_object_list [ $match [ 'object_id' ] ] ;
8571			else
8572				return ( array ( ) ) ;
8573
8574			$is_object	=  true ;	// to tell that we need to put the results in cache for later use
8575		    }
8576		// The /Resources parameter is followed by inline mappings
8577		else if  ( preg_match ( '#/Resources \s* <#ix', $object_data, $match, PREG_OFFSET_CAPTURE ) )
8578		   {
8579			$data		=  substr ( $object_data, $match [0] [1] + strlen ( $match [0] [0] ) - 1 ) ;
8580			$is_object	=  false ;
8581		    }
8582		else
8583			return ( array ( ) ) ;
8584
8585		// Whatever we will be analyzing (an object contents or inline contents following the /Resources parameter),
8586		// the text will be enclosed within double angle brackets (<< ... >>)
8587
8588		// A small kludge for /XObject which specify an object reference ("15 0 R") instead of XObjects mappings
8589		// ("<< ...>>" )
8590		if  ( $parameter   ==  '/XObject'  &&  preg_match ( '#/XObject \s+ (?P<obj> \d+) \s+ \d+ \s+ R#ix', $data, $match ) )
8591		   {
8592			$data = '/XObject ' . $pdf_object_list [ $match [ 'obj' ] ] ;
8593		    }
8594
8595		if  ( preg_match ( "#$parameter \s* << \s* (?P<mappings> .*?) \s* >>#imsx", $data, $match ) )
8596		   {
8597			preg_match_all ( '# (?P<mapping> / [^\s]+) \s+ (?P<object_id> \d+) \s+ \d+ \s+ R#ix', $match [ 'mappings' ], $matches ) ;
8598
8599			$mappings	=  array ( ) ;
8600
8601			// Mapping extraction loop
8602			for  ( $i = 0, $count = count ( $matches [ 'object_id' ] ) ; $i  <  $count ; $i ++ )
8603				$mappings [ $matches [ 'mapping' ] [$i] ]	=  $matches [ 'object_id' ] [$i] ;
8604
8605			// Put results for referenced objects in cache
8606			if  ( $is_object )
8607				$this -> ResourceMappingCache [ $object_id ] [ $parameter ]	=  $mappings ;
8608
8609			return ( $mappings ) ;
8610		    }
8611		else
8612			return ( array ( ) ) ;
8613	    }
8614
8615
8616	/*--------------------------------------------------------------------------------------------------------------
8617
8618	    NAME
8619	        Peek - Peeks page information from a pdf object.
8620
8621	    PROTOTYPE
8622	        $pagemap -> Peek ( ) ;
8623
8624	    DESCRIPTION
8625	        Retrieves page information which can be of type (1), (2) or (3), as described in the class comments.
8626
8627	    PARAMETERS
8628	        $object_id (integer) -
8629	                Id of the current pdf object.
8630
8631		$object_data (string) -
8632			Pdf object contents.
8633
8634		$pdf_objects (associative array) -
8635			Objects defined in the pdf file, as an associative array whose keys are object numbers and
8636			values object data.
8637			This parameter is used for /Type/Page objects which have a /Resource parameter that references
8638			an existing object instead of providing font mappings and other XObject mappings inline,
8639			enclosed within double angle brackets (<< /Font ... >>).
8640
8641	 *-------------------------------------------------------------------------------------------------------------*/
8642	public function  Peek ( $object_id, $object_data, $pdf_objects )
8643	   {
8644		// Page catalog (/Type/Catalog and /Pages x 0 R)
8645		if  ( preg_match ( '#/Type \s* /Catalog#ix', $object_data )  &&  $this -> GetObjectReferences ( $object_id, $object_data, '/Pages', $references ) )
8646			$this -> PageCatalogs	=  array_merge ( $this -> PageCatalogs, $references ) ;
8647		// Object listing the object numbers that give the list of objects contained in a single page (/Types/Pages and /Count x /Kids[x1 0 R ... xn 0 R]
8648		else if  ( preg_match ( '#/Type \s* /Pages#ix', $object_data ) )
8649		   {
8650			if  ( $this -> GetObjectReferences ( $object_id, $object_data, '/Kids', $references ) )
8651			   {
8652				// Sometimes, a reference can be the one of an object that contains the real reference ; in the following example,
8653				// the actual page contents are not in object 4, but in object 5
8654				//	/Kids 4 0 R
8655				//	...
8656				//	4 0 obj
8657				//	[5 0 R]
8658				//	endobj
8659				$new_references		=  array ( ) ;
8660
8661				foreach  ( $references  as  $reference )
8662				   {
8663					if  ( ! isset ( $pdf_objects [ $reference ] )  ||
8664						! preg_match ( '/^ \s* (?P<ref> \[ [^]]+ \]) \s*$/imsx', $pdf_objects [ $reference ], $match ) )
8665					   {
8666						$new_references []	=  $reference ;
8667					    }
8668					else
8669					   {
8670						$this -> GetObjectReferences ( $reference, $pdf_objects [ $reference ], '', $sub_references ) ;
8671						$new_references		=  array_merge ( $new_references, $sub_references ) ;
8672					    }
8673
8674				    }
8675
8676				// Get kid count (knowing that sometimes, it is missing...)
8677				preg_match ( '#/Count \s+ (?P<count> \d+)#ix', $object_data, $match ) ;
8678				$page_count				=  ( isset ( $match [ 'count' ] ) ) ?  ( integer ) $match [ 'count' ] : false ;
8679
8680				// Get parent object id
8681				preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ;
8682				$parent					=  ( isset ( $match [ 'parent' ] ) ) ?  ( integer ) $match [ 'parent' ] : false ;
8683
8684				$this -> PageKids [ $object_id ]	=  array
8685				   (
8686					'object'	=>  $object_id,
8687					'parent'	=>  $parent,
8688					'count'		=>  $page_count,
8689					'kids'		=>  $new_references
8690				    ) ;
8691			    }
8692		    }
8693		// Object listing the other objects that are contained in this page (/Type/Page and /Contents[x1 0 R ... xn 0 R]
8694		else if  ( preg_match ( '#/Type \s* /Page\b#ix', $object_data ) )
8695		   {
8696			if  ( $this -> GetObjectReferences ( $object_id, $object_data, '/Contents', $references ) )
8697			   {
8698				preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ;
8699				$parent					=  ( isset ( $match [ 'parent' ] ) ) ?  (integer) $match [ 'parent' ] : false ;
8700				$fonts					=  $this -> GetResourceMappings ( $object_id, $object_data, '/Font', $pdf_objects ) ;
8701				$xobjects				=  $this -> GetResourceMappings ( $object_id, $object_data, '/XObject', $pdf_objects ) ;
8702
8703				// Find the width and height of the page (/Mediabox parameter)
8704				if  ( preg_match ( '#/MediaBox \s* \[ \s* (?P<x1> \d+) \s+ (?P<y1> \d+) \s+ (?P<x2> \d+) \s+ (?P<y2> \d+) \s* \]#imsx', $object_data, $match ) )
8705				   {
8706					$width		=  ( double ) ( $match [ 'x2' ] - $match [ 'x1' ] + 1 ) ;
8707					$height		=  ( double ) ( $match [ 'y2' ] - $match [ 'y1' ] + 1 ) ;
8708				    }
8709				// Otherwise, fix an arbitrary width and length (but this should never happen, because all pdf files are correct, isn't it?)
8710				else
8711				   {
8712					$width		=  595 ;
8713					$height		=  850 ;
8714				    }
8715
8716				// Yes ! some /Contents parameters may designate another object which contains references to the real text contents
8717				// in the form : [x 0 R y 0 R etc.], so we have to dig into it...
8718				$new_references				=  array ( ) ;
8719
8720				foreach  ( $references  as  $reference )
8721				   {
8722					// We just need to check that the object contains something like :
8723					//	[x 0 R y 0 R ...]
8724					// and nothing more
8725					if  ( isset ( $pdf_objects [ $reference ] )  &&  preg_match ( '#^\s* \[ [^]]+ \]#x', $pdf_objects [ $reference ] )  &&
8726							$this -> GetObjectReferences ( $reference, $pdf_objects [ $reference ], '', $nested_references ) )
8727						$new_references		=  array_merge ( $new_references, $nested_references ) ;
8728					else
8729						$new_references []	=  $reference ;
8730				    }
8731
8732				$this -> PageContents [ $object_id ]	=  array
8733				   (
8734					'object'	=>  $object_id,
8735					'parent'	=>  $parent,
8736					'contents'	=>  $new_references,
8737					'fonts'		=>  $fonts,
8738					'xobjects'	=>  $xobjects,
8739					'width'		=>  $width,
8740					'height'	=>  $height
8741				    ) ;
8742			    }
8743		    }
8744		// None of the above, but object contains /Xobject's and maybe more...
8745		else if  ( preg_match ( '#/Type \s* /XObject\b#ix', $object_data ) )
8746		   {
8747			preg_match ( '#/Parent \s+ (?P<parent> \d+)#ix', $object_data, $match ) ;
8748			$parent					=  ( isset ( $match [ 'parent' ] ) ) ?  (integer) $match [ 'parent' ] : false ;
8749			$fonts					=  $this -> GetResourceMappings ( $object_id, $object_data, '/Font', $pdf_objects ) ;
8750			$xobjects				=  $this -> GetResourceMappings ( $object_id, $object_data, '/XObject', $pdf_objects ) ;
8751
8752			$this -> GetObjectReferences ( $object_id, $object_data, '/Contents', $references ) ;
8753
8754			$this -> PageContents [ $object_id ]	=  array
8755			   (
8756				'object'	=>  $object_id,
8757				'parent'	=>  $parent,
8758				'contents'	=>  $references,
8759				'fonts'		=>  $fonts,
8760				'xobjects'	=>  $xobjects
8761			    ) ;
8762		    }
8763	    }
8764
8765
8766	/*--------------------------------------------------------------------------------------------------------------
8767
8768	    NAME
8769	        ProcessTemplateReferences - Replace template references with actual text contents.
8770
8771	    PROTOTYPE
	        $text		=  $pagemap -> ProcessTemplateReferences ( $page_number, $text_data ) ;
8773
8774	    DESCRIPTION
8775	        Replaces template references of the form "/TPLx Do" with the actual text contents.
8776
8777	    PARAMETERS
8778	        $page_number (integer) -
8779	                Page number of the object that contains the supplied object data.
8780
8781		$text_data (string)
8782			Text drawing instructions that are to be processed.
8783
8784	    RETURN VALUE
8785	        Returns the original text, where all template references have been replaced with the contents of the
8786		object they refer to.
8787
8788	 *-------------------------------------------------------------------------------------------------------------*/
8789	public function  ProcessTemplateReferences ( $page_number, $text_data )
8790	    {
8791		// Many paranoid checks in this piece of code...
8792		if  ( isset ( $this -> Pages [ $page_number ] ) )
8793		   {
8794			// Loop through the PageContents array to find which one(s) may be subject to template reference replacements
8795			foreach  ( $this -> PageContents  as  $page_contents )
8796			   {
8797				// If the current object relates to the specified page number, AND it has xobjects, then the supplied text data
8798				// may contain template reference of the form : /TPLx.
8799				// In this case, we replace such a reference with the actual contents of the object they refer to
8800				if  ( isset ( $page_contents [ 'page' ] )  &&  $page_contents [ 'page' ]  ==  $page_number  &&  count ( $page_contents [ 'xobjects' ] ) )
8801				   {
8802					$template_searches	=  array ( ) ;
8803					$template_replacements	=  array ( ) ;
8804
8805					$this ->  __get_replacements ( $page_contents, $template_searches, $template_replacements ) ;
8806					$text_data	=  self::PregStrReplace ( $template_searches, $template_replacements, $text_data ) ;
8807				    }
8808			    }
8809		    }
8810
8811		return ( $text_data ) ;
8812	     }
8813
8814
8815	// __get_replacements -
8816	//	Recursively gets the search/replacement strings for template references.
8817	private function  __get_replacements ( $page_contents, &$searches, &$replacements, $objects_seen = array ( ) )
8818	   {
8819		foreach  ( $page_contents [ 'xobjects' ]  as  $template_name => $template_object )
8820		   {
8821			if  ( isset ( $this -> TemplateObjects [ $template_object ] )  &&  ! isset ( $objects_seen [ $template_object ] ) )
8822			   {
8823				$template				=  $this -> TemplateObjects [ $template_object ] ;
8824				$searches []				=  '#(' . $template_name . ' \s+ Do\b )#msx' ;
8825				$replacements []			=  '!PDFTOTEXT_TEMPLATE_' . substr ( $template_name, 1 ) . ' ' . $template ;
8826				$objects_seen [ $template_object ]	=  $template_object ;
8827
8828				if  ( isset ( $this -> PageContents [ $template_object ] ) )
8829					$this -> __get_replacements ( $this -> PageContents [ $template_object ], $searches, $replacements, $objects_seen ) ;
8830			    }
8831		    }
8832	    }
8833
8834
8835
8836	/*--------------------------------------------------------------------------------------------------------------
8837
8838	    NAME
8839	        MapObjects - Builds a correspondance between object and page numbers.
8840
8841	    PROTOTYPE
	        $pagemap -> MapObjects ( $objects ) ;
8843
8844	    DESCRIPTION
8845	        Builds a correspondance between object and page numbers. The page number corresponding to an object id
8846		will after that be available using the array notation.
8847
8848	    NOTES
8849		This method behaves as if there could be more than one page catalog in the same file, but I've not yet
8850		encountered this case.
8851
8852	 *-------------------------------------------------------------------------------------------------------------*/
8853	public function  MapObjects ( $objects )
8854	   {
8855		$kid_count	=  count ( $this -> PageKids ) ;
8856
8857		// PDF files created short after the birth of Earth may have neither a page catalog nor page contents descriptions
8858		if  ( ! count ( $this -> PageCatalogs  ) )
8859		   {
8860			// Later, during Pleistocen, references to page kids started to appear...
8861			if  ( $kid_count )
8862			   {
8863				foreach  ( array_keys ( $this -> PageKids )  as  $catalog )
8864					$this -> MapKids ( $catalog, $current_page ) ;
8865			    }
8866			else
8867				$this -> Pages [1]	=  array_keys ( $objects ) ;
8868		    }
8869		// This is the ideal situation : there is a catalog that allows us to gather indirectly all page data
8870		else
8871		   {
8872			$current_page		=  1 ;
8873
8874			foreach  ( $this -> PageCatalogs  as  $catalog )
8875			   {
8876				if  ( isset ( $this -> PageKids [ $catalog ] ) )
8877					$this -> MapKids ( $catalog, $current_page ) ;
8878				// Well, almost ideal : it may happen that the page catalog refers to a non-existing object :
8879				// in this case, we behave the same as if there were no page catalog at all : group everything
8880				// onto one page
8881				else
8882					$this -> Pages [1]	=  array_keys ( $objects ) ;
8883			    }
8884		    }
8885	    }
8886
8887
8888	/*--------------------------------------------------------------------------------------------------------------
8889
8890	    NAME
8891	        MapKids - Establishes a correspondance between page kids and a current page number.
8892
8893	    PROTOTYPE
	        $pagemap -> MapKids ( $catalog, $page ) ;
8895
8896	    DESCRIPTION
8897	  	Tries to assign a page number to all page description objects that have been collected by the Peek()
8898		method.
8899	  	Also creates the Pages associative array, whose keys are page numbers and whose values are the ids of
8900		the objects that the page contains.
8901
8902	    EXAMPLE
8903	  	The following example gives an overview of a possible layout for page catalogs ; it describes which
8904		objects contain	what.
8905	  	Lines starting with "#x", where "x" is a number, stands for a PDF object definition, which will start
8906		with "x 0 obj" in the PDF file.
8907	  	Whenever numbers are referenced (other than those prefixed with a "#"), it means "reference to the
8908		specified object.
8909	  	For example, "54" will refer to object #54, and will be given as "54 0 R" in the PDF file.
8910	  	The numbers at the beginning of each line are just "step numbers", which will be referenced in the
8911		explanations after the example :
8912
8913			(01) #1 : /Type/Catalog /Pages 54
8914			(02)    -> #54 : /Type/Pages /Kids[3 28 32 58] /Count 5
8915			(03)           -> #3 : /Type/Page /Parent 54 /Contents[26]
8916			(04)		     -> #26 : page contents
8917			(05)           -> #28 : /Type/Page /Parent 54 /Contents[30 100 101 102 103 104]
8918			(06)		     -> #30 : page contents
8919			(07)	       -> #32 : /Type/Page /Parent 54 /Contents[34]
8920			(08)		     -> #34 : page contents
8921			(09)	       -> #58 : /Type/Pages /Parent 54 /Count 2 /Kids[36 40]
8922			(10)		     -> #36 : /Type/Page /Parent 58 /Contents[38]
8923			(11)			    -> #38 : page contents
8924			(12)		     -> #40 : /Type/Page /Parent 58 /Contents[42]
8925			(13)			    -> #42 : page contents
8926
8927		 Explanations :
8928			(01) Object #1 contains the page catalog ; it states that a further description of the page
8929			     contents is given by object #54.
8930			     Note that it could reference multiple page descriptions, such as : /Pages [54 68 99...]
			     (although I have not met the case so far)
			(02) Object #54 in turn says that it has "kids", described by objects #3, #28, #32 and #58. It
8933			     also says that it has 5 pages (/Count parameter) ; but wait... the /Kids parameter references
8934			     4 objects while the /Count parameter states that we have 5 pages : what happens ? we will
8935			     discover it in the explanations below.
8936			(03) Object #3 states that it is aimed for page description (/Type/Page) ; the page contents
8937			     will be found in object #26, specified after the /Contents parameter. Note that here again,
8938			     multiple objects could be referenced by the /Contents parameter but, in our case, there is
8939			     only one, 26. Object #3 also says that its parent object (in the page catalog) is object
8940			     #54, defined in (01).
8941			     Since this is the first page we met, it will have page number 1.
8942			(04) ... object #26 contains the Postscript instructions to draw page #1
8943			(05) Object #28 has the same type as #3 ; its page contents can be located in object #30 (06)
8944			     The same applies for object #32 (07), whose page contents are given by object #34 (08).
8945			     So, (05) and (07) will be pages 2 and 3, respectively.
8946			(09) Now, it starts to become interesting : object #58 does not directly lead to an object
8947			     containing Postscript instructions as did objects #3, #28 and #32 whose parent is #54, but
8948			     to yet another page catalog which contains 2 pages (/Count 2), described by objects #36 and
8949			     #40. It's not located at the same position as object #54 in the hierarchy, so it shows that
8950			     page content descriptions can be recursively nested.
8951			(10) Object #36 says that we will find the page contents in object #38 (which will be page 4)
8952			(12) ... and object #40 says that we will find the page contents in object #42 (and our final
8953			     page, 5)
8954
8955	 *-------------------------------------------------------------------------------------------------------------*/
8956	protected function  MapKids ( $catalog, &$page )
8957	   {
8958		if  ( ! isset ( $this -> PageKids [ $catalog ] ) )
8959			return ;
8960
8961		$entry		=  $this -> PageKids [ $catalog ] ;
8962
8963		// The PDF file contains an object containing a /Type/Pages/Kids[] construct, specified by another object containing a
8964		// /Type/Catalog/Pages construct : we will rely on its contents to find which page contains what
8965		if  ( isset ( $this -> PageContents [ $entry [ 'kids' ] [0] ] ) )
8966		   {
8967			foreach  ( $entry [ 'kids' ]  as  $item )
8968			   {
8969				// Some objects given by a /Page /Contents[] construct do not directly lead to an object describing PDF contents,
8970				// but rather to an object containing in turn a /Pages /Kids[] construct ; this adds a level of indirection, and
8971				// we have to recursively process it
8972				if  ( isset ( $this -> PageKids [ $item ] ) )
8973				   {
8974					$this -> MapKids ( $item, $page ) ;
8975				    }
8976				// The referenced object actually defines page contents (no indirection)
8977				else
8978				   {
8979					$this -> PageContents [ $item ]	[ 'page' ]	=  $page ;
8980					$this -> Pages [ $page ]			=  ( isset ( $this -> PageContents [ $item ] [ 'contents' ] ) ) ?
8981												$this -> PageContents [ $item ] [ 'contents' ] : array ( ) ;
8982					if ( isset ( $this -> PageContents [ $item ] [ 'width' ] ) )
8983					   {
8984						$this -> PageAttributes [ $page ]		=  array
8985						   (
8986							'width'			=>  $this -> PageContents [ $item ] [ 'width' ],
8987							'height'		=>  $this -> PageContents [ $item ] [ 'height' ]
8988						    ) ;
8989					    }
8990
8991					$page ++ ;
8992				    }
8993			    }
8994		    }
8995		// No page catalog at all : consider everything is on the same page (this class does not use the WheresMyCrystalBall trait)
8996		else
8997		   {
8998			foreach  ( $entry [ 'kids' ]  as  $kid )
8999				$this -> MapKids ( $kid, $page ) ;
9000		    }
9001	    }
9002
9003
9004	/*--------------------------------------------------------------------------------------------------------------
9005
9006	    NAME
9007	        GetMappedFonts - Retrieves the mapped fonts per page
9008
9009	    PROTOTYPE
9010	        $array	=  $pagemap -> GetMappedFonts ( ) ;
9011
9012	    DESCRIPTION
9013	        Gets the mapped fonts, per page. XObjects are traversed, to retrieved additional font aliases defined
9014		by them.
9015		This function is used by the PdfTexter class to add additional entries to the FontMap object,
9016		ensuring that each reference to a font remains local to a page.
9017
9018	    RETURN VALUE
9019	        Returns an array of associative arrays which have the following entries :
9020		- 'page' :
9021			Page number.
9022		- 'xobject-name' :
9023			XObject name, that can define further font aliases. This entry is set to the empty string for
9024			global font aliases.
9025		- 'font-name' :
9026			Font name (eg, "/F1", "/C1_0", etc.).
9027		- 'object' :
9028			Object defining the font attributes, such as character map, etc.
9029
9030	 *-------------------------------------------------------------------------------------------------------------*/
9031	public function  GetMappedFonts ( )
9032	   {
9033		$mapped_fonts	=  array ( ) ;
9034		$current_page	=  0 ;
9035
9036		foreach  ( $this -> PageCatalogs  as  $catalog )
9037		   {
9038			if  ( ! isset ( $this -> PageKids [ $catalog ] ) )
9039				continue ;
9040
9041			foreach  ( $this -> PageKids [ $catalog ] [ 'kids' ]  as  $page_object )
9042			   {
9043				$current_page ++ ;
9044
9045				if  ( isset ( $this -> PageContents [ $page_object ] ) )
9046				   {
9047					$page_contents	=  $this -> PageContents [ $page_object ] ;
9048					$associations	=  array ( ) ;
9049
9050					if  ( isset ( $page_contents [ 'fonts' ] ) )
9051					   {
9052						foreach  ( $page_contents [ 'fonts' ]  as  $font_name => $font_object )
9053						   {
9054							$mapped_fonts []		=  array
9055							   (
9056								'page'		=>  $current_page,
9057								'xobject-name'	=>  '',
9058								'font-name'	=>  $font_name,
9059								'object'	=>  $font_object
9060							    ) ;
9061
9062							$associations [ ":$font_name" ]	=  $font_object ;
9063
9064							$this -> __map_recursive ( $current_page, $page_contents [ 'xobjects' ], $mapped_fonts, $associations ) ;
9065						    }
9066					    }
9067				    }
9068			    }
9069		    }
9070
9071		return ( $mapped_fonts ) ;
9072	    }
9073
9074
9075	// __map_recursive -
9076	//	Recursively collects font aliases for XObjects.
9077	private function  __map_recursive ( $page_number, $xobjects, &$mapped_fonts, &$associations )
9078	   {
9079		foreach  ( $xobjects  as  $xobject_name => $xobject_value )
9080		   {
9081			if  ( isset ( $this -> PageContents [ $xobject_value ] ) )
9082			   {
9083				foreach  ( $this -> PageContents [ $xobject_value ] [ 'fonts' ]  as  $font_name => $font_object )
9084				   {
9085					if  ( ! isset ( $associations [ "$xobject_name:$font_name" ] ) )
9086					   {
9087						$mapped_fonts []		=  array
9088						   (
9089							'page'		=>  $page_number,
9090							'xobject-name'	=>  $xobject_name,
9091							'font-name'	=>  $font_name,
9092							'object'	=>  $font_object
9093						    ) ;
9094
9095						$associations [ "$xobject_name:$font_name" ]	=  $font_object ;
9096					    }
9097				    }
9098
9099				$this -> XObjectNames [ $xobject_name ]		=  1 ;
9100				$this -> __map_recursive ( $page_number, $this -> PageContents [ $xobject_value ] [ 'xobjects' ], $mapped_fonts, $associations ) ;
9101			    }
9102		    }
9103	    }
9104
9105
9106
9107	/*--------------------------------------------------------------------------------------------------------------
9108
9109	    NAME
	        IsValidXObjectName - Checks if the specified name is a valid XObject.
9111
9112	    PROTOTYPE
9113	        $status		=  $pagemap -> IsValidXObjectName ( $name ) ;
9114
9115	    DESCRIPTION
9116	        Checks if the specified name is a valid XObject defining its own set of font aliases.
9117
9118	    PARAMETERS
9119	        $name (string) -
9120	                Name of the XObject to be checked.
9121
9122	    RETURN VALUE
9123	        Returns true if the specified XObject exists and defines its own set of font aliases, false otherwise.
9124
9125	 *-------------------------------------------------------------------------------------------------------------*/
9126	public function  IsValidXObjectName ( $name )
9127	   { return ( isset ( $this -> XObjectNames [ $name ] ) ) ; }
9128    }
9129
9130
9131/**************************************************************************************************************
9132 **************************************************************************************************************
9133 **************************************************************************************************************
9134 ******                                                                                                  ******
9135 ******                                                                                                  ******
9136 ******                                         IMAGE MANAGEMENT                                         ******
9137 ******                                                                                                  ******
9138 ******                                                                                                  ******
9139 **************************************************************************************************************
9140 **************************************************************************************************************
9141 **************************************************************************************************************/
9142
9143/*==============================================================================================================
9144
9145    class PdfImage -
9146        Holds image data coming from pdf.
9147
9148  ==============================================================================================================*/
abstract class  PdfImage			extends  PdfObjectBase
   {
	// Image resource that can be used to process image data, using the php imagexxx() functions
	public		$ImageResource		=  false ;
	// Original image data
	protected	$ImageData ;
	// True when the constructor has been asked not to create an image resource ; the image is then only available
	// through its original data (PdfJpegImage passes its $autosave flag here)
	protected	$NoResourceCreated ;


	/*--------------------------------------------------------------------------------------------------------------

	    CONSTRUCTOR
	        Creates a PdfImage object with a resource that can be used with imagexxx() php functions.

	    PARAMETERS
	        $image_data (string) -
	                Image data.

		$no_resource_created (boolean) -
			When true, the image data is stored but no image resource is created.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $image_data, $no_resource_created = false )
	   {
		$this -> ImageData		=  $image_data ;
		$this -> NoResourceCreated	=  $no_resource_created ;

		if  ( ! $no_resource_created )
			$this -> ImageResource		=  $this -> CreateImageResource ( $image_data ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    DESTRUCTOR
	        Destroys the associated image resource.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __destruct ( )
	   {
		$this -> DestroyImageResource ( ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        CreateImageResource - creates an image resource from the supplied image data.

	    PROTOTYPE
	        $resource	=  $this -> CreateImageResource ( $data ) ;

	    DESCRIPTION
	        Creates an image resource from the supplied image data.
		Whatever the input format, the internal format will be the one used by the gd library.

	    PARAMETERS
	        $data (string) -
	                Image data.

	 *-------------------------------------------------------------------------------------------------------------*/
	abstract protected function  CreateImageResource ( $image_data ) ;


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        DestroyImageResource - Destroys the allocated image resource.

	    PROTOTYPE
	        $this -> DestroyImageResource ( ) ;

	    DESCRIPTION
	        Destroys the allocated image resource, using the libgd imagedestroy() function. This method can be
		overridden by derived class if the underlying image resource does not come from the gd lib.
		The ImageResource property is reset to false afterwards, so that the image cannot be destroyed twice.

	 *-------------------------------------------------------------------------------------------------------------*/
	protected function  DestroyImageResource ( )
	   {
		if  ( ! $this -> ImageResource )
			return ;

		// imagedestroy() is only needed when the image is a real resource ; when the gd library returns an object
		// instead, dropping the reference below is enough to release it
		if  ( is_resource ( $this -> ImageResource ) )
			imagedestroy ( $this -> ImageResource ) ;

		$this -> ImageResource	=  false ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        SaveAs - Saves the current image to a file.

	    PROTOTYPE
	        $pdfimage -> SaveAs ( $output_file, $image_type = IMG_JPEG ) ;

	    DESCRIPTION
	        Saves the current image resource to the specified output file, in the specified format.
		When no image resource has been created, the original image data is written as is, provided that the
		requested format is IMG_JPEG.

	    PARAMETERS
	        $output_file (string) -
	                Output filename. When null, the image is sent to the standard output (see the Output() method).

		$image_type (integer) -
			Output format. Can be any of the predefined php constants IMG_*.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  SaveAs ( $output_file, $image_type = IMG_JPEG )
	   {
		if  ( ! $this -> ImageResource )
		   {
			if  ( $this -> NoResourceCreated  &&  $image_type  ==  IMG_JPEG )
			   {
				// A null filename means "standard output", as for the imagexxx() functions used below ; it cannot
				// be given to file_put_contents(), which needs a real path
				if  ( $output_file  ===  null )
					echo $this -> ImageData ;
				else
					file_put_contents ( $output_file, $this -> ImageData ) ;
			    }
			else if  ( PdfToText::$DEBUG )
				warning ( new PdfToTextDecodingException ( "No image resource allocated." ) ) ;

			return ;
		    }

		// Bitmask of the image formats supported by the gd library of the running PHP version
		$image_types		=  imagetypes ( ) ;

		switch  ( $image_type )
		   {
			case	IMG_JPEG :
			case	IMG_JPG :
				if  ( ! ( $image_types & IMG_JPEG )  &&  ! ( $image_types & IMG_JPG ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support JPG images." ) ) ;

				imagejpeg ( $this -> ImageResource, $output_file, 100 ) ;
				break ;

			case	IMG_GIF :
				if  ( ! ( $image_types & IMG_GIF ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support GIF images." ) ) ;

				imagegif ( $this -> ImageResource, $output_file ) ;
				break ;

			case	IMG_PNG :
				if  ( ! ( $image_types & IMG_PNG ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support PNG images." ) ) ;

				imagepng ( $this -> ImageResource, $output_file, 0 ) ;
				break ;

			case	IMG_WBMP :
				if  ( ! ( $image_types & IMG_WBMP ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support WBMP images." ) ) ;

				imagewbmp ( $this -> ImageResource, $output_file ) ;
				break ;

			// NOTE(review): the IMG_XPM constant is paired with imagexbm(), which writes the XBM format - confirm
			// that this is the intended output format
			case	IMG_XPM :
				if  ( ! ( $image_types & IMG_XPM ) )
					error ( new PdfToTextDecodingException ( "Your current PHP version does not support XPM images." ) ) ;

				imagexbm ( $this -> ImageResource, $output_file ) ;
				break ;

			default :
				error ( new PdfToTextDecodingException ( "Unknown image type #$image_type." ) ) ;
		    }
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        Output - Sends the current image to the standard output.

	    PROTOTYPE
	        $pdfimage -> Output ( ) ;

	    DESCRIPTION
	        Calls SaveAs() with a null filename and the default image type (IMG_JPEG).

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  Output ( )
	   {
		$this -> SaveAs ( null ) ;
	    }
    }
9309
9310
9311
9312/*==============================================================================================================
9313
9314    class PdfJpegImage -
9315        Handles encoded JPG images.
9316
9317  ==============================================================================================================*/
class  PdfJpegImage		extends  PdfImage
   {
	// Constructor -
	//	Hands the image data over to the parent class ; the $autosave flag is given as the parent's
	//	$no_resource_created parameter, so no image resource is created when it is true.
	public function  __construct ( $image_data, $autosave )
	   {
		parent::__construct ( $image_data, $autosave ) ;
	    }


	// CreateImageResource -
	//	Builds the image resource from the image data, using imagecreatefromstring().
	protected function  CreateImageResource ( $image_data )
	   {
		$resource	=  imagecreatefromstring ( $image_data ) ;

		return ( $resource ) ;
	    }
    }
9331
9332
9333/*==============================================================================================================
9334
9335    class PdfInlinedImage -
9336        Decodes raw image data in objects having the /FlateDecode flag.
9337
9338  ==============================================================================================================*/
9339class  PdfInlinedImage		extends  PdfImage
9340   {
9341	// Supported color schemes
9342	const		COLOR_SCHEME_RGB		=  1 ;
9343	const		COLOR_SCHEME_CMYK		=  2 ;
9344	const		COLOR_SCHEME_GRAY		=  3 ;
9345
9346	// Color scheme names, for debugging only
9347	private static	$DecoderNames		=  array
9348	   (
9349		self::COLOR_SCHEME_RGB		=>  'RGB',
9350		self::COLOR_SCHEME_CMYK		=>  'CMYK',
9351		self::COLOR_SCHEME_GRAY		=>  'Gray'
9352	    ) ;
9353
9354	// Currently implemented image decoders
9355	private static	$Decoders		=  array
9356	   (
9357		self::COLOR_SCHEME_RGB		=>  array
9358		   (
9359			8	=>  '__decode_rgb8'
9360		    ),
9361		self::COLOR_SCHEME_GRAY		=>  array
9362		   (
9363			8	=>  '__decode_gray8'
9364		    ),
9365		self::COLOR_SCHEME_CMYK		=>  array
9366		   (
9367			8	=>  '__decode_cmyk8'
9368		    ),
9369	    ) ;
9370
9371	// Image width and height
9372	public		$Width,
9373			$Height ;
9374	// Color scheme
9375	public		$ColorScheme ;
9376	// Number of bits per color component
9377	public		$BitsPerComponent ;
9378	// Decoding function, varying upon the supplied image type
9379	public		$DecodingFunction	=  false ;
9380
9381
9382	/*--------------------------------------------------------------------------------------------------------------
9383
9384	    NAME
9385	        Constructor - Builds an image from the supplied data.
9386
9387	    PROTOTYPE
9388	        $image	=  new  PdfInlinedImage ( $image_data, $width, $height, $bits_per_component, $color_scheme ) ;
9389
9390	    DESCRIPTION
9391	        Builds an image from the supplied data. Checks that the image flags are supported.
9392
9393	    PARAMETERS
9394	        $image_data (string) -
9395	                Uncompressed image data.
9396
9397		$width (integer) -
9398			Image width, in pixels.
9399
9400		$height (integer) -
9401			Image height, in pixels.
9402
9403		$bits_per_components (integer) -
9404			Number of bits per color component.
9405
9406		$color_scheme (integer) -
9407			One of the COLOR_SCHEME_* constants, specifying the initial data format.
9408
9409	    NOTES
9410	        Processed images are always converted to JPEG format.
9411
9412	 *-------------------------------------------------------------------------------------------------------------*/
9413	public function  __construct ( $image_data, $width, $height, $bits_per_component, $color_scheme )
9414	   {
9415		$this -> Width			=  $width ;
9416		$this -> Height			=  $height ;
9417		$this -> BitsPerComponent	=  $bits_per_component ;
9418		$this -> ColorScheme		=  $color_scheme ;
9419
9420		// Check that we have a decoding function for the supplied parameters
9421		if  ( isset ( self::$Decoders [ $color_scheme ] ) )
9422		   {
9423			if  ( isset ( self::$Decoders [ $color_scheme ] [ $bits_per_component ] ) )
9424				$this -> DecodingFunction	=  self::$Decoders [ $color_scheme ] [ $bits_per_component ] ;
9425			else
9426				error ( new PdfToTextDecodingException ( "No decoding function has been implemented for image objects having the " .
9427						self::$DecoderNames [ $color_scheme ] . " color scheme with $bits_per_component bits per color component." ) ) ;
9428		    }
9429		else
9430			error ( new PdfToTextDecodingException ( "Unknown color scheme $color_scheme." ) ) ;
9431
9432		parent::__construct ( $image_data ) ;
9433	    }
9434
9435
9436	/*--------------------------------------------------------------------------------------------------------------
9437
9438	    NAME
9439	        CreateInstance - Creates an appropriate instance of a PdfImage class.
9440
9441	    PROTOTYPE
	        $image	=  PdfInlinedImage::CreateInstance ( $stream_data, $object_data, $autosave ) ;
9443
9444	    DESCRIPTION
9445	        Creates an instance of either :
9446		- A PdfJpegImage class, if the image specifications in $object_data indicate that the compressed stream
9447		  contents are only JPEG data
9448		- A PdfInlinedImage class, if the image specifications state that the compressed stream data contain
9449		  only color values.
9450
9451		The class currently supports (in $stream_data) :
9452		- Pure JPEG contents
9453		- RGB values
9454		- CMYK values
9455		- Gray scale values (in the current version, the resulting image does not correctly reproduce the
9456		  initial colors, if interpolation is to be used).
9457
9458	    PARAMETERS
9459	        $stream_data (string) -
9460	                Compressed image data.
9461
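		$object_data (string) -
			Object containing the stream data.

		$autosave -
			Passed as is to the PdfJpegImage constructor when the stream holds JPEG data ; not used for
			the other image types.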
9464
9465	    RETURN VALUE
9466	        Returns :
9467		- A PdfJpegImage object, if the stream data contains only pure JPEG contents
9468		- A PdfInlinedImage object, in other cases.
9469		- False if the supplied image data is not currently supported.
9470
9471	 *-------------------------------------------------------------------------------------------------------------*/
9472	public static function  CreateInstance ( $stream_data, $object_data, $autosave )
9473	   {
9474		// Remove stream data from the supplied object data, to speed up the searches below
9475		$index		=  strpos ( $object_data, 'stream' ) ;
9476
9477		if  ( $index  !==   false )
9478			$object_data	=  substr ( $object_data, 0, $index ) ;
9479
9480		// Uncompress stream data
9481		$image_data	=  gzuncompress ( $stream_data ) ;
9482
9483		// The /DCTDecode flag indicates JPEG contents - returns a PdfJpegImage object
9484		if  ( stripos ( $object_data, '/DCTDecode' ) )
9485			return ( new PdfJpegImage ( $image_data, $autosave ) ) ;
9486
9487		// Get the image width & height
9488		$match		=  null ;
9489		preg_match ( '#/Width \s+ (?P<value> \d+)#ix', $object_data, $match ) ;
9490		$width		=  ( integer ) $match [ 'value' ] ;
9491
9492		$match		=  null ;
9493		preg_match ( '#/Height \s+ (?P<value> \d+)#ix', $object_data, $match ) ;
9494		$height		=  ( integer ) $match [ 'value' ] ;
9495
9496		// Get the number of bits per color component
9497		$match		=  null ;
9498		preg_match ( '#/BitsPerComponent \s+ (?P<value> \d+)#ix', $object_data, $match ) ;
9499		$bits_per_component	=  ( integer ) $match [ 'value' ] ;
9500
9501		// Get the target color space
9502		// Sometimes, this refers to an object in the PDF file, which can also be embedded in a compound object
9503		// We don't handle such cases for now
9504		$match		=  null ;
9505		preg_match ( '#/ColorSpace \s* / (?P<value> \w+)#ix', $object_data, $match ) ;
9506
9507		if  ( ! isset ( $match [ 'value' ] ) )
9508			return ( false ) ;
9509
9510		$color_space_name	=  $match [ 'value' ] ;
9511
9512		// Check that we are able to handle the specified color space
9513		switch ( strtolower ( $color_space_name ) )
9514		   {
9515			case	'devicergb' :
9516				$color_space	=  self::COLOR_SCHEME_RGB ;
9517				break ;
9518
9519			case	'devicegray' :
9520				$color_space	=  self::COLOR_SCHEME_GRAY ;
9521				break ;
9522
9523			case	'devicecmyk' :
9524				$color_space	=  self::COLOR_SCHEME_CMYK ;
9525				break ;
9526
9527			default :
9528				if  ( PdfToText::$DEBUG )
9529					warning ( new PdfToTextDecodingException ( "Unsupported color space \"$color_space_name\"." ) ) ;
9530
9531				return ( false ) ;
9532		    }
9533
9534		// Also check that we can handle the specified number of bits per component
9535		switch ( $bits_per_component )
9536		   {
9537			case	8 :
9538				break ;
9539
9540			default :
9541				if  ( PdfToText::$DEBUG )
9542					warning ( new PdfToTextDecodingException ( "Unsupported bits per component : $bits_per_component." ) ) ;
9543
9544				return ( false ) ;
9545		    }
9546
9547		// All done, return a PdfInlinedImage object
9548		return ( new PdfInlinedImage ( $image_data, $width, $height, $bits_per_component, $color_space ) ) ;
9549	    }
9550
9551
9552	/*--------------------------------------------------------------------------------------------------------------
9553
9554	    NAME
9555	        CreateImageResource - Creates the image resource.
9556
9557	    PROTOTYPE
9558	        $resource	=  $image -> CreateImageResource ( $image_data ) ;
9559
9560	    DESCRIPTION
9561	        Creates a GD image according to the supplied image data, and the parameters supplied to the class
9562		constructor.
9563
9564	    PARAMETERS
9565	        $image_data (string) -
9566	                Image to be decoded.
9567
9568	    RETURN VALUE
9569	        Returns a GD graphics resource in true color, or false if there is currently no implemented decoding
9570		function for this kind of images.
9571
9572	 *-------------------------------------------------------------------------------------------------------------*/
9573	protected function  CreateImageResource ( $image_data )
9574	   {
9575		$decoder	=  $this -> DecodingFunction ;
9576
9577		if  ( $decoder )
9578			return ( $this -> $decoder ( $image_data ) ) ;
9579		else
9580			return ( false ) ;
9581	    }
9582
9583
9584	/*--------------------------------------------------------------------------------------------------------------
9585
9586		Decoding functions.
9587
9588	 *-------------------------------------------------------------------------------------------------------------*/
9589
9590	// __decode_rgb8 -
9591	//	Decodes image data consisting of 8-bits RGB values (one byte for each color component).
9592	private function  __decode_rgb8 ( $data )
9593	   {
9594		$data_length	=  strlen ( $data ) ;
9595		$colors		=  array ( ) ;
9596		$width		=  $this -> Width ;
9597		$height		=  $this -> Height ;
9598		$image		=  imagecreatetruecolor ( $width, $height ) ;
9599
9600		for  ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i + 3  <=  $data_length ; $i += 3, $pixel_x ++ )
9601		   {
9602			$red	=  ord ( $data [$i] ) ;
9603			$green	=  ord ( $data [$i+1] ) ;
9604			$blue	=  ord ( $data [$i+2] ) ;
9605
9606			$color	=  ( $red  <<  16 ) | ( $green  <<  8 )  | ( $blue ) ;
9607
9608			if  ( isset ( $colors [ $color ] ) )
9609				$pixel_color	=  $colors [ $color ] ;
9610			else
9611			   {
9612				$pixel_color		=  imagecolorallocate ( $image, $red, $green, $blue ) ;
9613				$colors [ $color ]	=  $pixel_color ;
9614			    }
9615
9616			if  ( $pixel_x  >=  $width )
9617			   {
9618				$pixel_x	=  0 ;
9619				$pixel_y ++ ;
9620			    }
9621
9622			imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ;
9623		    }
9624
9625		return ( $image ) ;
9626	    }
9627
9628
9629	// __decode_cmyk8 -
9630	//	Decodes image data consisting of 8-bits CMYK values (one byte for each color component).
9631	private function  __decode_cmyk8 ( $data )
9632	   {
9633		$data_length	=  strlen ( $data ) ;
9634		$colors		=  array ( ) ;
9635		$width		=  $this -> Width ;
9636		$height		=  $this -> Height ;
9637		$image		=  imagecreatetruecolor ( $width, $height ) ;
9638
9639		for  ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i + 4  <=  $data_length ; $i += 4, $pixel_x ++ )
9640		   {
9641			$cyan		=  ord ( $data [$i] ) ;
9642			$magenta	=  ord ( $data [$i+1] ) ;
9643			$yellow		=  ord ( $data [$i+2] ) ;
9644			$black		=  ord ( $data [$i+3] ) ;
9645
9646			$color	=  ( $cyan  <<  24 ) | ( $magenta  <<  16 ) | ( $yellow  << 8 ) | ( $black ) ;
9647
9648			if  ( isset ( $colors [ $color ] ) )
9649				$pixel_color	=  $colors [ $color ] ;
9650			else
9651			   {
9652				$rgb			=  $this -> __convert_cmyk_to_rgb ( $cyan, $magenta, $yellow, $black ) ;
9653				$pixel_color		=  imagecolorallocate ( $image, $rgb [0], $rgb [1], $rgb [2] ) ;
9654				$colors [ $color ]	=  $pixel_color ;
9655			    }
9656
9657			if  ( $pixel_x  >=  $width )
9658			   {
9659				$pixel_x	=  0 ;
9660				$pixel_y ++ ;
9661			    }
9662
9663			imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ;
9664		    }
9665
9666		return ( $image ) ;
9667	    }
9668
9669
9670	// __decode_gray8 -
9671	//	Decodes image data consisting of 8-bits gray values.
9672	private function  __decode_gray8 ( $data )
9673	   {
9674		$data_length	=  strlen ( $data ) ;
9675		$colors		=  array ( ) ;
9676		$width		=  $this -> Width ;
9677		$height		=  $this -> Height ;
9678		$image		=  imagecreatetruecolor ( $width, $height ) ;
9679
9680		for  ( $i = 0, $pixel_x = 0, $pixel_y = 0 ; $i  <  $data_length ; $i ++, $pixel_x ++ )
9681		   {
9682			$color	=  ord ( $data [$i] ) ;
9683
9684			if  ( isset ( $colors [ $color ] ) )
9685				$pixel_color	=  $colors [ $color ] ;
9686			else
9687			   {
9688				$pixel_color		=  imagecolorallocate ( $image, $color, $color, $color ) ;
9689				$colors [ $color ]	=  $pixel_color ;
9690			    }
9691
9692			if  ( $pixel_x  >=  $width )
9693			   {
9694				$pixel_x	=  0 ;
9695				$pixel_y ++ ;
9696			    }
9697
9698			imagesetpixel ( $image, $pixel_x, $pixel_y, $pixel_color ) ;
9699		    }
9700
9701		return ( $image ) ;
9702	    }
9703
9704
9705	/*--------------------------------------------------------------------------------------------------------------
9706
9707		Support functions.
9708
9709	 *-------------------------------------------------------------------------------------------------------------*/
9710
9711	// __convert_cmyk_to_rgb -
9712	//	Converts CMYK color value to RGB.
9713	private function  __convert_cmyk_to_rgb ( $C, $M, $Y, $K )
9714	   {
9715		if  ( $C  >  1  ||  $M  >  1  ||  $Y  >  1  ||  $K  >  1 )
9716		   {
9717			$C /= 100.0 ;
9718			$M /= 100.0 ;
9719			$Y /= 100.0 ;
9720			$K /= 100.0 ;
9721		    }
9722
9723   		$R 	=  ( 1 - $C * ( 1 - $K ) - $K ) * 256 ;
9724   		$G 	=  ( 1 - $M * ( 1 - $K ) - $K ) * 256 ;
9725   		$B 	=  ( 1 - $Y * ( 1 - $K ) - $K ) * 256 ;
9726
9727		$result =  array ( round ( $R ), round ( $G ), round ( $B ) ) ;
9728
9729		return ( $result ) ;
9730  	    }
9731    }
9732
9733
9734/*==============================================================================================================
9735
9736    class PdfFaxImage -
9737        Handles encoded CCITT Fax images.
9738
9739  ==============================================================================================================*/
class  PdfFaxImage		extends  PdfImage
   {
	// Constructor -
	//	Hands the image data over to the parent class.
	public function  __construct ( $image_data )
	   {
		parent::__construct ( $image_data ) ;
	    }


	// CreateImageResource -
	//	CCITT Fax decoding is not implemented : issues a warning and explicitly returns false (instead of an
	//	implicit null), which is also what PdfInlinedImage::CreateImageResource() returns when it cannot
	//	decode an image.
	protected function  CreateImageResource ( $image_data )
	   {
		warning ( new PdfToTextDecodingException ( "Decoding of CCITT Fax image format is not yet implemented." ) ) ;

		return ( false ) ;
	    }
    }
9754
9755
9756/**************************************************************************************************************
9757 **************************************************************************************************************
9758 **************************************************************************************************************
9759 ******                                                                                                  ******
9760 ******                                                                                                  ******
9761 ******                                      ENCRYPTION MANAGEMENT                                       ******
9762 ******                                                                                                  ******
9763 ******                                                                                                  ******
9764 **************************************************************************************************************
9765 **************************************************************************************************************
9766 **************************************************************************************************************/
9767
9768/*==============================================================================================================
9769
    class PdfEncryptionData -
9771        Holds encryption data and allows for decryption.
9772
9773  ==============================================================================================================*/
class  PdfEncryptionData		extends  PdfObjectBase
   {
	// Encryption modes
	const		PDFMODE_UNKNOWN				=  0 ;
	const		PDFMODE_STANDARD			=  1 ;

	// Encryption algorithms
	const		PDFCRYPT_ALGORITHM_RC4			=  0 ;
	const		PDFCRYPT_ALGORITHM_AES			=  1 ;
	const		PDFCRYPT_ALGORITHM_AES256		=  2 ;

	// A 32-bytes hardcoded padding used when computing encryption keys
	const		PDF_ENCRYPTION_PADDING			=  "\x28\xBF\x4E\x5E\x4E\x75\x8A\x41\x64\x00\x4E\x56\xFF\xFA\x01\x08\x2E\x2E\x00\xB6\xD0\x68\x3E\x80\x2F\x0C\xA9\xFE\x64\x53\x69\x7A" ;

	// Permission bits for encrypted files. Comments come from the PDF specification
	const		PDFPERM_PRINT				=  0x0004 ;		// bit 3 :
											//	(Revision 2) Print the document.
											//	(Revision 3 or greater) Print the document (possibly not at the highest quality level,
											//	depending on whether bit 12 is also set).
	const		PDFPERM_MODIFY				=  0x0008 ;		// bit 4 :
											//	Modify the contents of the document by operations other than those controlled by bits 6, 9, and 11.
	const		PDFPERM_COPY				=  0x0010 ;		// bit 5 :
											//	(Revision 2) Copy or otherwise extract text and graphics from the document, including extracting text
											//	and graphics (in support of accessibility to users with disabilities or for other purposes).
											//	(Revision 3 or greater) Copy or otherwise extract text and graphics from the document by operations
											//	other than that controlled by bit 10.
	const		PDFPERM_MODIFY_EXTRA			=  0x0020 ;		// bit 6 :
											//	Add or modify text annotations, fill in interactive form fields, and, if bit 4 is also set,
											//	create or modify interactive form fields (including signature fields).
	const		PDFPERM_FILL_FORM			=  0x0100 ;		// bit 9 :
											//	(Revision 3 or greater) Fill in existing interactive form fields (including signature fields),
											//	even if bit 6 is clear.
	const		PDFPERM_EXTRACT				=  0x0200 ;		// bit 10 :
											//	(Revision 3 or greater) Extract text and graphics (in support of accessibility to users with
											//	disabilities or for other purposes).
	const		PDFPERM_ASSEMBLE			=  0x0400 ;		// bit 11 :
											//	(Revision 3 or greater) Assemble the document (insert, rotate, or delete pages and create bookmarks
											//	or thumbnail images), even if bit 4 is clear.
	const		PDFPERM_HIGH_QUALITY_PRINT		=  0x0800 ;		// bit 12 :
											//	(Revision 3 or greater) Print the document to a representation from which a faithful digital copy of
											//	the PDF content could be generated. When this bit is clear (and bit 3 is set), printing is limited to
											//	a low-level representation of the appearance, possibly of degraded quality.

	public		$FileId ;							// File ID, as specified by the /ID flag
	public		$ObjectId ;							// Object id and text contents
	private		$ObjectData ;
	public		$Mode ;								// Encryption mode - currently, only the "Standard" keyword is accepted
	public		$EncryptionAlgorithm ;						// Encryption algorithm - one of the PDFCRYPT_* constants
	public		$AlgorithmVersion,						// Encryption algorithm version & revision (0 when not specified)
			$AlgorithmRevision ;
	public		$Flags ;							// Protection flags, when an owner password has been specified - a combination of PDFPERM_* constants
	public		$KeyLength ;							// Encryption key length, in bits (integer)
	public		$UserKey,							// User and owner password keys
			$OwnerKey ;
	public		$UserEncryptionString,						// Not sure yet of the real usage of these ones
			$OwnerEncryptionString ;
	public		$EncryptMetadata ;						// True if metadata is also encrypted
	public		$FileKeyLength ;						// Number of file id bytes used when building object keys (5 for RC4)

	protected	$Decrypter ;							// Decrypter object

	private		$UnsupportedEncryptionAlgorithm		=  false ;		// True if the encryption algorithm used in the PDF file is not yet supported


	/**************************************************************************************************************

	    NAME
	        Constructor

	    PROTOTYPE
	        obj	=  new  PdfEncryptionData ( $file_id, $mode, $object_id, $object_data ) ;

	    DESCRIPTION
		Creates an instance of a PdfEncryptionData class, using the information parsed from the supplied object
		data.

	    PARAMETERS
		$file_id (string) -
			File id, as specified by the /ID flag.

		$mode (integer) -
			One of the PDFMODE_* constants.

		$object_id (integer) -
			Id of the object containing encryption parameters.

		$object_data (string) -
			Encryption parameters.

	    AUTHOR
	        Christian Vigh, 03/2017.

	    HISTORY
	    [Version : 1.0]	[Date : 2017-03-14]     [Author : CV]
	        Initial version.

	 **************************************************************************************************************/
	public function  __construct ( $file_id, $mode, $object_id, $object_data )
	   {
		$this -> FileId			=  $file_id ;
		$this -> ObjectId		=  $object_id ;
		$this -> ObjectData		=  $object_data ;
		$this -> Mode			=  $mode ;

		// Encryption algorithm version & revision ; a missing entry yields 0 instead of reading an undefined index
		$this -> AlgorithmVersion	=  ( preg_match ( '#/V \s+ (?P<value> \d+)#ix', $object_data, $algorithm_match ) ) ?
							( int ) $algorithm_match [ 'value' ] : 0 ;

		$this -> AlgorithmRevision	=  ( preg_match ( '#/R \s+ (?P<value> \d+)#ix', $object_data, $algorithm_revision_match ) ) ?
							( int ) $algorithm_revision_match [ 'value' ] : 0 ;

		// Encryption flags (may be negative)
		$this -> Flags			=  ( preg_match ( '#/P \s+ (?P<value> \-? \d+)#ix', $object_data, $flags_match ) ) ?
							( int ) $flags_match [ 'value' ] : 0 ;

		// Key length (40 bits, if not specified) - always stored as an integer
		if  ( preg_match ( '#/Length \s+ (?P<value> \d+)#ix', $object_data, $key_length_match ) )
			$this -> KeyLength	=  ( int ) $key_length_match [ 'value' ] ;
		else
			$this -> KeyLength	=  40 ;

		// Owner and user passwords
		$this -> UserKey		=  $this -> GetStringParameter ( '/U', $object_data ) ;
		$this -> OwnerKey		=  $this -> GetStringParameter ( '/O', $object_data ) ;

		// Owner and user encryption strings
		$this -> UserEncryptionString	=  $this -> GetStringParameter ( '/UE', $object_data ) ;
		$this -> OwnerEncryptionString	=  $this -> GetStringParameter ( '/OE', $object_data ) ;

		// EncryptMetadata flag : "true" or "1" mean true, "false" or "0" mean false.
		// The \s* is needed because the "x" modifier ignores the spaces written in the pattern, so that
		// whitespace between the keyword and its value would otherwise never be matched.
		// NOTE(review): a missing flag is kept as false here ; the PDF specification gives true as the default
		// value - to be confirmed before changing it.
		if  ( preg_match ( '# /EncryptMetadata \s* (?P<value> true | false | 1 | 0 ) #imsx', $object_data, $encryption_match ) )
		   {
			$flag_value			=  strtolower ( $encryption_match [ 'value' ] ) ;
			$this -> EncryptMetadata	=  ( $flag_value  ===  'true'  ||  $flag_value  ===  '1' ) ;
		    }
		else
			$this -> EncryptMetadata	=  false ;

		// Now, try to determine the encryption algorithm to be used
		$user_key_length		=  strlen ( $this -> UserKey ) ;
		$owner_key_length		=  strlen ( $this -> OwnerKey ) ;

		$error_unhandled_version	=  false ;
		$error_unhandled_revision	=  false ;

		switch  ( $this -> AlgorithmVersion )
		   {
			case	1 :
				switch  ( $this -> AlgorithmRevision )
				   {
					case	2 :
						// NOTE(review): the message mentions "and/or" but the test only fails when BOTH
						// lengths differ from 32 - confirm whether "||" was intended
						if  ( $user_key_length  !=  32  &&  $owner_key_length  !=  32 )
						   {
							if  ( PdfToText::$DEBUG )
								error ( new PdfToTextDecryptionException ( "Invalid user and/or owner key length ($user_key_length/$owner_key_length)", $object_id ) ) ;
						    }

						$this -> EncryptionAlgorithm	=  self::PDFCRYPT_ALGORITHM_RC4 ;
						$this -> FileKeyLength		=  5 ;
						break ;

					default :
						$error_unhandled_revision	=  true ;
				    }
				break ;

			default :
				$error_unhandled_version	=  true ;
		    }

		// Report unsupported versions/revisions
		if  ( $error_unhandled_version  ||  $error_unhandled_revision )
		   {
			if  ( PdfToText::$DEBUG )
				error ( new PdfToTextDecryptionException ( "Unsupported encryption algorithm version {$this -> AlgorithmVersion} revision {$this -> AlgorithmRevision}.",
						$object_id ) ) ;

			// Property names are case-sensitive : this must be the declared $UnsupportedEncryptionAlgorithm
			// property, which is the one tested by Decrypt()
			$this -> UnsupportedEncryptionAlgorithm		=  true ;

			return ;
		    }

		// Build the object key
		$this -> Decrypter		=  PdfDecryptionAlgorithm::GetInstance ( $this ) ;

		if  ( $this -> Decrypter  ===  false )
		   {
			if  ( PdfToText::$DEBUG )
				warning ( new PdfToTextDecryptionException ( "Unsupported encryption algorithm #{$this -> EncryptionAlgorithm}, " .
						"version {$this -> AlgorithmVersion} revision {$this -> AlgorithmRevision}.",
						$object_id ) ) ;

			$this -> UnsupportedEncryptionAlgorithm		=  true ;

			return ;
		    }
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        GetInstance - Creates an instance of a PdfEncryptionData object.

	    PROTOTYPE
	        $obj		=  PdfEncryptionData::GetInstance ( $file_id, $object_id, $object_data ) ;

	    DESCRIPTION
	        Returns an instance of encryption data, or false if the object data has no /Filter entry or if the
		encryption mode it specifies is not handled.

	 *-------------------------------------------------------------------------------------------------------------*/
	public static function  GetInstance ( $file_id, $object_id, $object_data )
	   {
		// Encryption mode
		if  ( ! preg_match ( '#/Filter \s* / (?P<mode> \w+)#ix', $object_data, $object_data_match ) )
			return  (false ) ;

		switch ( strtolower ( $object_data_match [ 'mode' ] ) )
		   {
			case	'standard' :
				$mode		=  self::PDFMODE_STANDARD ;
				break ;

			default :
				// The debug flag is read from PdfToText, as everywhere else in this class ; the mode name
				// comes from the match array ($object_data is a string and has no 'mode' entry)
				if  ( PdfToText::$DEBUG  >  1 )
					error ( new PdfToTextDecodingException ( "Unhandled encryption mode '{$object_data_match [ 'mode' ]}'", $object_id ) ) ;

				return ( false ) ;

		    }

		// Basic checks have been performed, return an instance of encryption data
		return ( new PdfEncryptionData ( $file_id, $mode, $object_id, $object_data ) ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        Decrypt - Decrypts object data.

	    PROTOTYPE
	        $data		=  $this -> Decrypt ( $object_id, $object_data ) ;

	    DESCRIPTION
	        Decrypts object data, when the PDF file is password-protected.

	    PARAMETERS
	        $object_id (integer) -
	                Pdf object number.

		$object_data (string) -
			Object data.

	    RETURN VALUE
	        Returns the decrypted object data, or false if the encrypted object could not be decrypted.
		In the current version, decryption is not enabled and the method always returns false.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  Decrypt ( $object_id, $object_data )
	   {
		if  ( $this -> UnsupportedEncryptionAlgorithm )
			return ( false ) ;

		// Decryption is not enabled yet : the call to $this -> Decrypter -> Decrypt ( $object_data ) remains
		// to be plugged in here
		return ( false ) ;
	    }
    }
10044
10045
10046/*==============================================================================================================
10047
10048    class PdfDecryptionAlgorithm -
10049        Base class for algorithm decrypters.
10050
10051  ==============================================================================================================*/
abstract class	PdfDecryptionAlgorithm		//extends  Object
   {
	// Encryption data object supplied to the constructor
	protected		$EncryptionData ;
	// Object key : raw MD5 digest computed by the constructor
	protected		$ObjectKey ;
	// Same key, as an array of byte values
	protected		$ObjectKeyBytes ;
	// Number of bytes in the object key
	protected		$ObjectKeyLength ;


	// Constructor -
	//	Builds the object key from the FileKeyLength first bytes of the file id, followed by the three low-order
	//	bytes of the object id (least significant first) and two null bytes standing for the generation number.
	public function  __construct ( $encryption_data )
	   {
		$this -> EncryptionData		=  $encryption_data ;

		// Leading bytes of the file id
		$seed		=  '' ;
		$prefix_length	=  $encryption_data -> FileKeyLength ;

		for  ( $offset = 0 ; $offset  <  $prefix_length ; $offset ++ )
			$seed	.=  $encryption_data -> FileId [ $offset ] ;

		// Object id (3 bytes) then generation number (2 bytes, always zero here)
		$object_id	=  $encryption_data -> ObjectId ;
		$seed		.=  chr ( $object_id  &  0xFF ) .
				    chr ( ( $object_id  >>   8 )  &  0xFF ) .
				    chr ( ( $object_id  >>  16 )  &  0xFF ) .
				    "\0\0" ;

		// The key is the raw (binary) MD5 digest of the seed, which is 16 bytes long
		$this -> ObjectKey		=  md5 ( $seed, true ) ;
		$this -> ObjectKeyLength	=  16 ;
		$this -> ObjectKeyBytes		=  array_map ( 'ord', str_split ( $this -> ObjectKey ) ) ;
	    }


	// GetInstance -
	//	Returns a decrypter object for the algorithm specified by the supplied encryption data, or false if
	//	no decrypter exists for it.
	public static function  GetInstance  ( $encryption_data )
	   {
		if  ( $encryption_data -> EncryptionAlgorithm  ==  PdfEncryptionData::PDFCRYPT_ALGORITHM_RC4 )
			return ( new PdfRC4DecryptionAlgorithm ( $encryption_data ) ) ;

		return ( false ) ;
	    }


	// Methods to be implemented by derived classes
	abstract public function  Reset		( ) ;
	abstract public function  Decrypt	( $data ) ;

    }
10103
10104
10105/*==============================================================================================================
10106
10107    class PdfRC4DecryptionAlgorithm -
10108        A decrypter class for RC4 encoding.
10109
10110  ==============================================================================================================*/
class	PdfRC4DecryptionAlgorithm		extends  PdfDecryptionAlgorithm
   {
	// Values 0..255 in ascending order, built once and shared by all instances
	private	static		$InitialState		=  false ;
	// Current permutation of the 256 byte values
	protected		$State ;


	// Constructor -
	//	Lets the parent class build the object key, then builds the shared initial state if needed.
	public function  __construct ( $encryption_data )
	   {
		parent::__construct ( $encryption_data ) ;

		if  ( self::$InitialState  ===  false )
			self::$InitialState	=  range ( 0, 255 ) ;
	    }


	// Reset -
	//	Restarts from the initial state and shuffles it using the object key bytes.
	public function  Reset ( )
	   {
		$state		=  self::$InitialState ;
		$key_bytes	=  $this -> ObjectKeyBytes ;
		$key_length	=  $this -> ObjectKeyLength ;
		$j		=  0 ;

		for  ( $i = 0 ; $i  <  256 ; $i ++ )
		   {
			// The key bytes are used cyclically
			$j		=  ( $key_bytes [ $i % $key_length ] + $state [$i] + $j )  &  0xFF ;

			// Exchange entries $i and $j
			$saved		=  $state [$i] ;
			$state [$i]	=  $state [$j] ;
			$state [$j]	=  $saved ;
		    }

		$this -> State	=  $state ;
	    }


	// Decrypt -
	//	Resets the state, then xors each input byte with the next byte derived from the state.
	//	Returns the resulting string, which has the same length as the input.
	public function  Decrypt ( $data )
	   {
		$this -> Reset ( ) ;

		$state		=  $this -> State ;
		$count		=  strlen ( $data ) ;
		$output		=  '' ;
		$a		=  0 ;
		$b		=  0 ;

		for  ( $position = 0 ; $position  <  $count ; $position ++ )
		   {
			$a		=  ( $a + 1 )  &  0xFF ;
			$b		=  ( $state [$a] + $b )  &  0xFF ;

			// Exchange entries $a and $b
			$value_a	=  $state [$a] ;
			$value_b	=  $state [$b] ;
			$state [$a]	=  $value_b ;
			$state [$b]	=  $value_a ;

			$output		.=  chr ( ord ( $data [ $position ] )  ^  $state [ ( $value_a + $value_b )  &  0xFF ] ) ;
		    }

		// Keep the state as it stands after processing the data
		$this -> State	=  $state ;

		return ( $output ) ;
	    }
    }
10173
10174    /*
10175static Guchar rc4DecryptByte(Guchar *state, Guchar *x, Guchar *y, Guchar c) {
10176  Guchar x1, y1, tx, ty;
10177
10178  x1 = *x = (*x + 1) % 256;
10179  y1 = *y = (state[*x] + *y) % 256;
10180  tx = state[x1];
10181  ty = state[y1];
10182  state[x1] = ty;
10183  state[y1] = tx;
10184  return c ^ state[(tx + ty) % 256];
10185}
10186*/
10187
10188
10189/**************************************************************************************************************
10190 **************************************************************************************************************
10191 **************************************************************************************************************
10192 ******                                                                                                  ******
10193 ******                                                                                                  ******
10194 ******                                       FORM DATA MANAGEMENT                                       ******
10195 ******                                                                                                  ******
10196 ******                                                                                                  ******
10197 **************************************************************************************************************
10198 **************************************************************************************************************
10199 **************************************************************************************************************/
10200
10201
10202/*==============================================================================================================
10203
10204    class PdfToTextFormDefinitions -
10205        Analyzes a template XML file that describes PDF form data and maps PDF field names to human-readable
10206	names.
	The GetFormData() method returns an object containing the mapped properties with their respective values.
10208
10209  ==============================================================================================================*/
class  PdftoTextFormDefinitions		// extends		Object
					implements	ArrayAccess, Countable, IteratorAggregate
   {
	// Counter used to build unique class names when the requested one is already declared
	static private	$ClassDefinitionCount	=  0 ;

	// Class name, as specified in the XML template
	protected	$ClassName ;
	// Form definitions (a template may contain several versions of the same form definition)
	protected	$Definitions		=  array ( ) ;
	// Form definitions coming from the PDF file ; an associative array whose keys are the PDF field names
	// and whose values are arrays holding the 'name' and 'occurrences' entries
	protected	$PdfDefinitions		=  array ( ) ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Parses the supplied XML template.

	    PARAMETERS
		$xml_data (string) -
			Contents of the XML template. When empty, a default template is built from the field
			definitions found in $pdf_xml_data.

		$pdf_xml_data (string) -
			XML form data coming from the PDF file ; it is scanned for <field> entries.

	    NOTES
		Every inconsistency is reported through error(), a function defined elsewhere in this file.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $xml_data, $pdf_xml_data )
	   {
		// Get PDF XML form data definitions
		$this -> __get_pdf_form_definitions ( $pdf_xml_data ) ;

		// Create XML data from scratch, if none specified
		if  ( ! $xml_data  )
			$xml_data	=  $this -> __create_default_xml_data ( $this -> PdfDefinitions ) ;

		// Decode XML the hard way, without XSD
		$xml		=  simplexml_load_string ( $xml_data ) ;

		// simplexml_load_string() returns false on malformed XML ; report it rather than failing on getName()
		if  ( $xml  ===  false )
			error ( new PdfToTextFormException ( "The XML form template could not be parsed." ) ) ;

		$root_entry	=  $xml -> getName ( ) ;
		$definitions	=  array ( ) ;
		$class_name	=  "PdfFormData" ;

		if  ( strcasecmp ( $root_entry, "forms" ) )
			error ( new PdfToTextFormException ( "Root entry must be <forms>, <$root_entry> was found." ) ) ;

		// Get the attribute values of the <forms> tag
		foreach  ( $xml -> attributes ( )  as  $attribute_name => $attribute_value )
		   {
			switch  ( strtolower ( $attribute_name ) )
			   {
				case	'class' :
					$class_name	=  ( string ) $attribute_value ;

					if  ( class_exists ( $class_name, false ) )
						error ( new PdfToTextFormException ( "Class \"$class_name\" specified in XML template already exists." ) ) ;

					break ;

				default :
					error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <forms> tag." ) ) ;
			    }
		    }

		// Don't know if it will be useful, but try to avoid class name collisions by appending a sequential number if necessary
		if  ( class_exists ( $class_name, false ) )
		   {
			self::$ClassDefinitionCount ++ ;
			$class_name	.=  '_' . self::$ClassDefinitionCount ;
		    }

		// Loop through each child <form> entry
		foreach  ( $xml -> children ( )  as  $child )
		   {
			$child_name	=  $child -> getName ( ) ;

			switch ( strtolower ( $child_name ) )
			   {
				case	'form' :
					$definitions []		=  new PdfToTextFormDefinition ( $class_name, $child, $this -> PdfDefinitions ) ;
					break ;

				default :
					error ( new PdfToTextFormException ( "Invalid tag <$child_name>." ) ) ;
			    }
		    }

		// Ensure that there is at least one form definition
		if  ( ! count ( $definitions ) )
			error ( new PdfToTextFormException ( "No <form> definition found." ) ) ;

		// Save to properties
		$this -> ClassName		=  $class_name ;
		$this -> Definitions		=  $definitions ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

		Internal methods.

	 *-------------------------------------------------------------------------------------------------------------*/

	// __get_pdf_form_definitions -
	//	Retrieves the form field definitions coming from the PDF file.
	//	Each <field>...</field> fragment found in $pdf_data is parsed on its own ; its "name" attribute becomes
	//	a key of the PdfDefinitions array, and the 'occurrences' entry counts how many times that name was seen.
	private function  __get_pdf_form_definitions ( $pdf_data )
	   {
		preg_match_all ( '#(?P<field> <field .*? </field \s* >)#imsx', $pdf_data, $matches ) ;

		foreach  ( $matches [ 'field' ]  as  $field )
		   {
			$xml_field	=  simplexml_load_string ( $field ) ;

			// A fragment that is not well-formed XML cannot provide a field name : ignore it
			if  ( $xml_field  ===  false )
				continue ;

			foreach  ( $xml_field -> attributes ( )  as  $attribute_name => $attribute_value )
			   {
				// Only the "name" attribute is of interest here
				if  ( strtolower ( $attribute_name )  !==  'name' )
					continue ;

				$field_name		=  ( string ) $attribute_value ;

				if  ( isset ( $this -> PdfDefinitions [ $field_name ] ) )
					$this -> PdfDefinitions [ $field_name ] [ 'occurrences' ] ++ ;
				else
				   {
					$this -> PdfDefinitions [ $field_name ]		=  array
					   (
						'name'		=>  $field_name,
						'occurrences'	=>  1
					    ) ;
				    }
			    }
		    }
	    }


	// __create_default_xml_data -
	//	When no XML template has been specified, creates a default one based on the form definitions located in
	//	the PDF file.
	//	The "name" attribute receives the PDF field name with dashes replaced by underscores, while "form-field"
	//	keeps the PDF field name untouched so that it still matches the keys of the PDF definitions.
	private function  __create_default_xml_data ( $pdf_definitions )
	   {
		$result		=  "<forms>" . PHP_EOL .
				   "\t<form version=\"1.0\">" . PHP_EOL ;

		foreach  ( $pdf_definitions  as  $name => $field )
		   {
			// Attribute values must be escaped to keep the generated template well-formed
			$pdf_name	 =  htmlspecialchars ( ( string ) $name, ENT_QUOTES ) ;
			$php_name	 =  str_replace ( '-', '_', $pdf_name ) ;
			$result		.=  "\t\t<field name=\"$php_name\" form-field=\"$pdf_name\" type=\"string\"/>" . PHP_EOL ;
		    }

		$result		.=  "\t</form>" . PHP_EOL .
				    "</forms>" . PHP_EOL ;

		return ( $result ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

		Interfaces implementations to retrieve form definitions.
		The #[\ReturnTypeWillChange] lines are attributes on PHP 8 and plain comments on earlier versions ;
		they avoid the deprecation notices issued by PHP 8.1+ for untyped interface methods.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable : number of form definitions
	#[\ReturnTypeWillChange]
	public function  count ( )
	   { return ( count ( $this -> Definitions ) ) ; }


	// IteratorAggregate : iterates over the form definitions
	#[\ReturnTypeWillChange]
	public function  getIterator ( )
	   { return ( new ArrayIterator ( $this -> Definitions ) ) ; }


	// ArrayAccess : an offset exists when it lies between 0 and the number of definitions minus one
	#[\ReturnTypeWillChange]
	public function  offsetExists ( $offset )
	   { return ( $offset  >=  0  &&  $offset  <  count ( $this -> Definitions ) ) ; }


	// ArrayAccess : returns the form definition stored at the specified offset
	#[\ReturnTypeWillChange]
	public function  offsetGet ( $offset )
	   { return ( $this -> Definitions [ $offset ] ) ; }


	// ArrayAccess : form definitions are read-only
	#[\ReturnTypeWillChange]
	public function  offsetSet ( $offset, $value )
	   { error ( new PdfToTextException ( "Unsupported operation." ) ) ; }


	// ArrayAccess : form definitions are read-only
	#[\ReturnTypeWillChange]
	public function  offsetunset ( $offset )
	   { error ( new PdfToTextException ( "Unsupported operation." ) ) ; }
    }
10387
10388
10389/*==============================================================================================================
10390
10391    class PdfToTextFormDefinition -
10392        Holds the description of a form inside a form XML template.
10393
10394  ==============================================================================================================*/
class  PdfToTextFormDefinition		// extends  Object
   {
	// Class of the object returned by GetFormData( )
	public		$ClassName ;

	// Form version
	public		$Version ;

	// Field definitions
	public		$FieldDefinitions		=  array ( ) ;

	// Field groups (ie, fields that are the results of the concatenation of several form fields)
	public		$Groups				=  array ( ) ;

	// Pdf field definitions
	public		$PdfDefinitions ;

	// Class definition in PHP, whose instance will be returned by GetFormData()
	private		$ClassDefinition		=  false ;

	// Direct access to field definitions either through their template name or PDF name
	// (values are indexes in the $FieldDefinitions array)
	private		$FieldDefinitionsByName		=  array ( ) ;
	private		$FieldDefinitionsByPdfName	=  array ( ) ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Analyze the contents of an XML template form definition.

	    PARAMETERS
		$class_name (string) -
			Name of the class that will be generated for this form.

		$form_definition (SimpleXMLElement) -
			The <form> node of the XML template.

		$pdf_definitions (array) -
			Field definitions coming from the PDF file, indexed by PDF field name.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $class_name, $form_definition, $pdf_definitions )
	   {
		$this -> ClassName		=  $class_name ;
		$this -> PdfDefinitions		=  $pdf_definitions ;
		$field_count			=  0 ;

		// Get <form> tag attributes
		foreach  ( $form_definition -> attributes ( )  as  $attribute_name => $attribute_value )
		   {
			switch ( strtolower ( $attribute_name ) )
			   {
				case	'version' :
					$this -> Version	=  ( string ) $attribute_value ;
					break ;

				default :
					error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <form> tag." ) ) ;
			    }
		    }

		// Loop through subtags
		foreach  ( $form_definition -> children ( )  as  $child )
		   {
			$tag_name	=  $child -> getName ( ) ;

			// Check subtags
			switch  ( strtolower ( $tag_name ) )
			   {
				// <group> :
				//	A group is used to create a property that is the concatenation of several existing properties.
				case	'group' :
					$this -> Groups []	=  $this -> __parse_group ( $child ) ;
					break ;

				// <field> :
				//	Field definition.
				case	'field' :
					$field_def							=  new PdfToTextFormFieldDefinition ( $child ) ;
					$this -> FieldDefinitions []					=  $field_def ;
					$this -> FieldDefinitionsByName [ $field_def -> Name ]		=
					$this -> FieldDefinitionsByPdfName [ $field_def -> PdfName ]	=  $field_count ;
					$field_count ++ ;
					break ;

				// Don't allow other tag names
				default :
					error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <form> definition." ) ) ;
			    }
		    }

		// Check that everything is ok (ie, that there is no duplicate fields)
		$this -> __paranoid_checks ( ) ;
	    }


	// __parse_group -
	//	Analyzes a <group> node and returns an array with the following entries :
	//	- 'name' :
	//		The name of the property, as it will appear in the output object.
	//	- 'separator' :
	//		Separator to be used when concatenating the underlying properties.
	//	- 'fields' :
	//		The names of the fields whose values will be concatenated together.
	//	Attribute names are matched case-insensitively, like for every other tag of the template.
	private function  __parse_group ( $group_node )
	   {
		$fields		=  array ( ) ;
		$separator	=  '' ;
		$name		=  false ;

		// Loop through attribute names
		foreach  ( $group_node -> attributes ( )  as  $attribute_name => $attribute_value )
		   {
			switch  ( strtolower ( $attribute_name ) )
			   {
				case	'name' :
					$name		=  PdfToTextObjectBase::ValidatePhpName ( ( string ) $attribute_value ) ;
					break ;

				case	'separator' :
					$separator	=  ( string ) $attribute_value ;
					break ;

				// "fields" is a list of comma-separated field names
				case	'fields' :
					foreach  ( explode ( ',', ( string ) $attribute_value )  as  $item )
						$fields []	=  PdfToTextObjectBase::ValidatePhpName ( $item ) ;

					break ;

				// Other attribute names : not allowed
				default :
					error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <group> tag." ) ) ;
			    }
		    }

		// Check that at least one field has been specified
		if  ( ! count ( $fields ) )
			error ( new PdfToTextFormException ( "Empty \"fields\" attribute in <group> tag." ) ) ;

		// Check that the mandatory property name has been specified
		if  ( ! $name )
			error ( new PdfToTextFormException ( "The \"name\" attribute is mandatory in <group> tag." ) ) ;

		return
		   (
			array
			   (
				'name'		=>  $name,
				'separator'	=>  $separator,
				'fields'	=>  $fields
			    )
		    ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        GetClassDefinition - Returns the class definition for the current form.

	    PROTOTYPE
	        $def	=   $form_def -> GetClassDefinition ( ) ;

	    DESCRIPTION
	        Returns a string containing the PHP class definition that will contain the properties defined in the XML
		form template. The definition is built on the first call then cached.

	    RETURN VALUE
	        Returns a string containing the PHP class definition for the current form.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  GetClassDefinition ( )
	   {
		// Return the existing definition, if this method has been called more than once
		if  ( $this -> ClassDefinition )
			return ( $this -> ClassDefinition ) ;

		$class_def	=  "// Class " . $this -> ClassName . " : " . $this -> Version . PHP_EOL .
				   "class " . $this -> ClassName . "\t\textends PdfToTextFormData" . PHP_EOL .
				   "   {" . PHP_EOL ;

		// Get the maximum width of constant and field names
		$max_width	=  0 ;

		foreach  ( $this -> FieldDefinitions  as  $def )
		   {
			$max_width	=  max ( $max_width, strlen ( $def -> Name ), strlen ( $def -> PdfName ) ) ;

			foreach  ( $def -> Constants  as  $constant )
				$max_width	=  max ( $max_width, strlen ( $constant [ 'name' ] ) ) ;
		    }

		// First, write out the constant definitions
		$all_constants	=  array ( ) ;

		foreach  ( $this -> FieldDefinitions  as  $def )
		   {
			foreach  ( $def -> Constants  as  $constant )
			   {
				$name	=  $constant [ 'name' ] ;
				$value	=  $constant [ 'value' ] ;

				if  ( isset ( $all_constants [ $name ] ) )
				   {
					// Strict comparison : values such as "1" and "01" are different constants
					if  ( $all_constants [ $name ]  !==  $value )
						error ( new PdfToTextFormException ( "Constant \"$name\" is defined more than once with different values." ) ) ;
				    }
				else
				   {
					$all_constants [ $name ]	 =  $value ;

					// Non-numeric values are written as single-quoted literals, so that characters such
					// as "$" are never interpreted when the generated class is evaluated
					if  ( ! is_numeric ( $value ) )
						$value		=  var_export ( ( string ) $value, true ) ;

					$class_def			.=  "\tconst\t" . str_pad ( $name, $max_width, " ", STR_PAD_RIGHT ) . "\t = $value ; " . PHP_EOL ;
				    }
			    }
		    }

		$class_def	.=  PHP_EOL . PHP_EOL ;

		// Then write property definitions
		foreach  ( $this -> FieldDefinitions  as  $def )
		   {
			$class_def	.=  "\t/** @formdata */" . PHP_EOL .
					     "\tprotected\t\t\$" . $def -> Name . " ;" . PHP_EOL ;
		    }

		$class_def	.=  PHP_EOL . PHP_EOL ;

		// And finally, grouped properties ; closing parentheses of the separator are escaped because the
		// @separator(...) tag is delimited by parentheses
		foreach  ( $this -> Groups  as  $group )
		   {
			$class_def	.=  "\t/**" . PHP_EOL .
					    "\t\t@formdata" . PHP_EOL .
					    "\t\t@group(" . implode ( ',', $group [ 'fields' ] ) . ')' . PHP_EOL .
					    "\t\t@separator(" . str_replace ( ')', '\)', $group [ 'separator' ] ) . ')' . PHP_EOL .
					    "\t */" . PHP_EOL .
					    "\tprotected\t\t\$" . $group [ 'name' ] . " ;" . PHP_EOL .PHP_EOL ;
		    }

		// Constructor
		$class_def	.=  PHP_EOL . PHP_EOL .
				    "\t// Class constructor" . PHP_EOL .
				    "\tpublic function  __construct ( )" . PHP_EOL .
				    "\t   {" . PHP_EOL .
				    "\t\tparent::__construct ( ) ;" . PHP_EOL .
				    "\t    }" . PHP_EOL ;

		$class_def	.=  "    }" . PHP_EOL ;

		// Save the definition, if a second call occurs
		$this -> ClassDefinition	=  $class_def ;

		// All done, return
		return ( $class_def ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        GetFormData - Returns a form data object containing properties mapped to the form data.

	    PROTOTYPE
	        $object		=  $form_def -> GetFormData ( $fields ) ;

	    DESCRIPTION
	        Returns an object containing properties mapped to actual form data.

	    PARAMETERS
	        $fields (array) -
	                An associative array whose keys are the PDF form field names, and values their values as stored
			in the PDF file. Entries whose key is not referenced by the template are ignored.

	    RETURN VALUE
	        Returns an object of the class, as defined by the template specified to PdfToTextFormDefinitions
		class constructor.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  GetFormData ( $fields = array ( ) )
	   {
		// The class is declared on first use.
		// NOTE(review): eval() runs code generated from the XML template ; names go through ValidatePhpName()
		// and constant values are exported as literals, but the template should still come from a trusted source.
		if  ( ! class_exists ( $this -> ClassName, false ) )
		   {
			$class_def	=  $this -> GetClassDefinition ( ) ;
			eval ( $class_def ) ;
		    }

		$class_name	=  $this -> ClassName ;
		$object		=  new  $class_name ( ) ;

		foreach  ( $fields  as  $name => $value )
		   {
			if  ( isset ( $this -> FieldDefinitionsByPdfName [ $name ] ) )
			   {
				$property		=  $this -> FieldDefinitions [ $this -> FieldDefinitionsByPdfName [ $name ] ] -> Name ;
				$object -> $property	=  $this -> __process_field_value ( $value ) ;
			    }
		    }

		return ( $object ) ;
	    }


	// __process_field_value -
	//	Translates html entities and replaces carriage returns (which are apparently used for multiline fields)
	//	with newlines : both "\r\n" sequences and lone "\r" characters become a single "\n".
	private function  __process_field_value ( $value )
	   {
		$value		=  html_entity_decode ( $value ) ;

		// Order matters : "\r\n" pairs must be collapsed before the remaining lone "\r" are converted
		return ( str_replace ( array ( "\r\n", "\r" ), "\n", $value ) ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        GetFormDataFromPdfObject - Same as GetFormData(), except that it operates on XML data.

	    PROTOTYPE
	        $object		=  $pdf -> GetFormDataFromPdfObject ( $pdf_data ) ;

	    DESCRIPTION
	        Behaves the same as GetFormData(), except that it takes as input the XML contents of a PDF object.

	    PARAMETERS
	        $pdf_data (string) -
	                XML data coming from the PDF file.

	    RETURN VALUE
	        Returns an object of the class, as defined by the template specified to PdfToTextFormDefinitions
		class constructor.

	    NOTES
		NOTE(review): this method is declared protected although the prototype above suggests a call from
		outside the class - confirm the intended visibility against the callers.

	 *-------------------------------------------------------------------------------------------------------------*/
	protected function  GetFormDataFromPdfObject ( $pdf_data )
	   {
		// simplexml_ functions do not like tags that contain a colon - replace them with a dash
		$pdf_data	=  $this -> __normalize_xml_tags ( $pdf_data ) ;

		// Load the xml data
		$xml		=  simplexml_load_string ( $pdf_data ) ;

		// simplexml_load_string() returns false on malformed XML ; report it rather than failing on getName()
		if  ( $xml  ===  false )
			error ( new PdfToTextFormException ( "The PDF form data could not be parsed as XML." ) ) ;

		// Get the form field values
		$fields		=  array ( ) ;

		$this -> __get_pdfform_data ( $fields, $xml ) ;

		// Return the object
		return ( $this -> GetFormData ( $fields ) ) ;
	    }


	// __normalize_xml_tags -
	//	Replaces with a dash the first colon found inside each XML tag.
	//	The search never goes past the closing ">" of a tag, so that colons located in the text between tags
	//	(a field value such as "12:30", for example) are left untouched.
	private function  __normalize_xml_tags ( $pdf_data )
	   {
		return ( preg_replace ( '/(<[^:>]+?)(:)/', '$1-', $pdf_data ) ) ;
	    }


	// __get_pdfform_data -
	//	Retrieves the form field values from the specified PDF object, specified as XML.
	//	A node whose tag name is a known PDF field name provides a value ; any other node is searched
	//	recursively through its children.
	private function  __get_pdfform_data ( &$fields, $xml )
	   {
		$tag_name		=  $xml -> getName ( ) ;

		if  ( isset ( $this -> PdfDefinitions [ $tag_name ] ) )
			$fields [ $tag_name ]	=  ( string )  $xml ;
		else
		   {
			foreach  ( $xml -> children ( )  as  $child )
			   {
				$this -> __get_pdfform_data ( $fields, $child ) ;
			    }
		    }
	    }


	// __paranoid_checks -
	//	Checks for several kinds of inconsistencies in the supplied XML template.
	private function  __paranoid_checks ( )
	   {
		// Check that field names, PDF field names and constant names are unique
		$names			=  array ( ) ;
		$pdf_names		=  array ( ) ;
		$constant_names		=  array ( ) ;

		foreach  ( $this -> FieldDefinitions  as  $def )
		   {
			if  ( ! isset ( $this -> PdfDefinitions [ $def -> PdfName ] ) )
				error ( new PdfToTextFormException ( "Field \"{$def -> PdfName}\" is not defined in the PDF file." ) ) ;

			if  ( isset ( $names [ $def -> Name ] ) )
				error ( new PdfToTextFormException ( "Field \"{$def -> Name}\" is defined more than once." ) ) ;

			$names [ $def -> Name ]		=  true ;

			if  ( isset ( $pdf_names [ $def -> PdfName ] ) )
				error ( new PdfToTextFormException ( "PDF Field \"{$def -> PdfName}\" is referenced more than once." ) ) ;

			$pdf_names [ $def -> PdfName ]	=  true ;

			// The same constant may be shared by several fields, provided that its value is the same everywhere
			foreach  ( $def -> Constants  as  $constant )
			   {
				$constant_name		=  $constant [ 'name' ] ;

				if  ( isset ( $constant_names [ $constant_name ] )  &&  $constant_names [ $constant_name ]  !==  $constant [ 'value' ] )
					error ( new PdfToTextFormException ( "Constant \"$constant_name\" is defined more than once with different values." ) ) ;

				$constant_names [ $constant_name ]	=  $constant [ 'value' ] ;
			    }
		    }

		// Check that group names are unique and that the fields they are referencing exist
		$group_names	=  array ( ) ;

		foreach ( $this -> Groups  as  $group )
		   {
			if  ( isset ( $group_names [ $group [ 'name' ] ] ) )
				error ( new PdfToTextFormException ( "Group \"{$group [ 'name' ]}\" is defined more than once." ) ) ;

			// Remember this group name, so that a later group with the same name can be detected
			$group_names [ $group [ 'name' ] ]	=  true ;

			if  ( isset ( $names [ $group [ 'name' ] ] ) )
				error ( new PdfToTextFormException ( "Group \"{$group [ 'name' ]}\" has the same name as an existing field." ) ) ;

			foreach ( $group [ 'fields' ]  as  $field_name )
			   {
				if  ( ! isset ( $names [ $field_name ] ) )
					error ( new PdfToTextFormException ( "Field \"$field_name\" of group \"{$group [ 'name' ]}\" does not exist." ) ) ;
			    }
		    }
	    }
    }
10833
10834
10835/*==============================================================================================================
10836
10837    class PdfToTextFormFieldDefinition -
10838        Contains an XML template form field definition.
10839
10840  ==============================================================================================================*/
class  PdfToTextFormFieldDefinition	// extends  Object
   {
	// Supported field types
	const		TYPE_STRING		=  1 ;			// String
	const		TYPE_CHOICE		=  2 ;			// Choice (must have <constant> subtags)

	// Official name (as it will appear in the class based on the XML template)
	public		$Name			=  false ;
	// Field name, as specified in the input PDF file
	public		$PdfName		=  false ;
	// Field type
	public		$Type			=  self::TYPE_STRING ;
	// Available constant values for this field when the "type" attribute has the value "choice".
	// Each entry is an array with the 'name', 'value' and 'default' keys.
	public		$Constants		=  array ( ) ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Builds the field definition object.

	    PARAMETERS
		$field_node (SimpleXMLElement) -
			The <field> node of the XML template. Its "name" and "form-field" attributes are
			mandatory ; attributes other than "name", "form-field" and "type" are ignored.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $field_node )
	   {
		// Loop through attributes
		foreach  ( $field_node -> attributes ( )  as  $attribute_name => $attribute_value )
		   {
			switch  ( strtolower ( $attribute_name ) )
			   {
				// "name" attribute :
				//	Specifies the field name as it will appear in the output class. Must be a valid PHP name.
				case	'name' :
					$this -> Name		=  PdfToTextObjectBase::ValidatePhpName ( ( string ) $attribute_value ) ;
					break ;

				// "form-field" attribute :
				//	Corresponding field name in the input PDF form.
				case	'form-field' :
					$this -> PdfName	=  ( string ) $attribute_value ;
					break ;

				// "type" :
				//	Field type. Can be either :
				//	- "string" :
				//		The field value can be any type of string.
				//	- "choice" :
				//		The field value has one of the values defined by the <case> or <default> subtags.
				case	'type' :
					switch ( strtolower ( ( string ) $attribute_value ) )
					   {
						case	'string' :
							$this -> Type	=  self::TYPE_STRING ;
							break ;

						case	'choice' :
							$this -> Type	=  self::TYPE_CHOICE ;
							break ;

						default :
							error ( new PdfToTextFormException ( "Invalid value \"$attribute_value\" for the \"$attribute_name\" attribute of the <field> tag." ) ) ;
					    }

					break ;
			    }
		    }

		// The "name" and "form-field" attributes are mandatory
		if  ( ! $this -> Name )
			error ( new PdfToTextFormException ( "The \"name\" attribute is mandatory for the <field> tag." ) ) ;

		if  ( ! $this -> PdfName )
			error ( new PdfToTextFormException ( "The \"form-field\" attribute is mandatory for the <field> tag." ) ) ;

		// For "type=choice" entries, we have to look for <case> or <default> subtags
		if  ( $this -> Type  ===  self::TYPE_CHOICE )
		   {
			foreach  ( $field_node -> children ( )  as  $child )
				$this -> Constants []	=  $this -> __parse_constant ( $child ) ;
		    }
	    }


	// __parse_constant -
	//	Analyzes a <case> or <default> subtag of a "choice" field, and returns an array with the following
	//	entries :
	//	- 'name' :
	//		Name of the constant that will be defined in the generated class ("constant" attribute).
	//	- 'value' :
	//		PDF form field value ("value" attribute). It is mandatory for <case> tags ; a <default> tag
	//		without a "value" attribute gets an empty string.
	//	- 'default' :
	//		True for a <default> tag, false for a <case> tag.
	private function  __parse_constant ( $node )
	   {
		$tag_name	=  $node -> getName ( ) ;
		$lcname		=  strtolower ( $tag_name ) ;

		// Check for unrecognized tags
		if  ( $lcname  !==  'case'  &&  $lcname  !==  'default' )
			error ( new PdfToTextFormException ( "Invalid tag <$tag_name> in <field> definition." ) ) ;

		// <default> gives the value to be used when no PDF field value matches the defined constants
		$is_default		=  ( $lcname  ===  'default' ) ;
		$constant_value		=  false ;
		$constant_name		=  false ;

		// Retrieve attributes
		foreach  ( $node -> attributes ( )  as  $attribute_name => $attribute_value )
		   {
			switch  ( strtolower ( $attribute_name ) )
			   {
				// "value" attribute :
				//	PDF form field value.
				case	'value'	:
					$constant_value		=  ( string ) $attribute_value ;
					break ;

				// "constant" attribute :
				//	Associated constant.
				case	'constant' :
					$constant_name		=  PdfToTextObjectBase::ValidatePhpName ( ( string ) $attribute_value ) ;
					break ;

				// Bail out if any unrecognized attribute has been specified
				default :
					error ( new PdfToTextFormException ( "Invalid attribute \"$attribute_name\" in <$tag_name> tag." ) ) ;
			    }
		    }

		// Each <case> entry must have a "value" attribute ; it is optional for <default>
		if  ( $constant_value  ===  false )
		   {
			if  ( ! $is_default )
				error ( new PdfToTextFormException ( "Missing constant value in <case> tag." ) ) ;

			$constant_value		=  "" ;
		    }

		// The "constant" attribute is required in both cases
		if  ( $constant_name  ===  false )
			error ( new PdfToTextFormException ( "Attribute \"constant\" is required for <$tag_name> tag." ) ) ;

		return
		   (
			array
			   (
				'name'		=>  $constant_name,
				'value'		=>  $constant_value,
				'default'	=>  $is_default
			    )
		    ) ;
	    }
    }
10981
10982
10983/*==============================================================================================================
10984
10985    class PdfToTextFormData -
10986        Base class for all Pdf form templates data.
10987
10988  ==============================================================================================================*/
10989class  PdfToTextFormData		// extends  Object
10990   {
10991	// Doc comments provide information about form data fields (mainly to handle grouped field values)
10992	// The $__Properties array gives information about the form data fields themselves
10993	private		$__Properties	=  array ( ) ;
10994
10995
10996	/*--------------------------------------------------------------------------------------------------------------
10997
10998	    Constructor -
10999		Retrieve information about the derived class properties, which are specified by the derived class
11000		generated on the fly.
11001
11002	 *-------------------------------------------------------------------------------------------------------------*/
11003	public function  __construct ( )
11004	   {
11005		// Get class properties
11006		$reflection	=  new ReflectionClass ( $this ) ;
11007		$properties	=  $reflection -> getProperties ( ) ;
11008
11009		// Loop through class properties
11010		foreach  ( $properties  as  $property )
11011		   {
11012			$propname	=  $property -> getName ( ) ;
11013			$doc_comment	=  $property -> getDocComment ( ) ;
11014
11015			$fields		=  false ;
11016			$separator	=  false ;
11017
11018			// A doc comment may indicate either :
11019			// - A form data field (@formdata)
11020			// - A grouped field ; in this case, we will have the following tags :
11021			//	. @formdata
11022			//	. @group(field_list) : list of fields grouped for this property
11023			//	. @separator(string) : a separator used when catenating grouped fields
11024			if  ( $doc_comment )
11025			   {
11026				// The @formdata tag must be present
11027				if  ( strpos ( $doc_comment, '@formdata' )  ===  false )
11028					continue ;
11029
11030				// @group(fields) pattern
11031				if  ( preg_match ( '/group \s* \( \s* (?P<fields> [^)]+) \)/imsx', $doc_comment, $match ) )
11032				   {
11033					$items	=  explode ( ',', $match [ 'fields' ] ) ;
11034					$fields =  array ( ) ;
11035
11036					foreach  ( $items  as  $item )
11037						$fields	[]	=  $item ;
11038				    }
11039
11040				// @separator(string) pattern
11041				if  ( preg_match ( '/separator \s* \( \s* (?P<separator> ( (\\\)) | (.) )+  \) /imsx', $doc_comment, $match ) )
11042				   {
11043					$separator	=  stripslashes ( $match [ 'separator' ]) ;
11044				    }
11045			     }
11046			// Ignore non-formdata properties
11047			else
11048				continue ;
11049
11050			// Property belongs to the form - add it to the list of available properties
11051			$this -> __Properties [ $propname ]	=  array
11052			   (
11053				'name'		=>  $propname,
11054				'fields'	=>  $fields,
11055				'separator'	=>  $separator
11056			    ) ;
11057		    }
11058	    }
11059
11060
11061	/*--------------------------------------------------------------------------------------------------------------
11062
11063	    __get -
11064		Returns the underlying property value for this PDF data field.
11065	 *-------------------------------------------------------------------------------------------------------------*/
11066	public function  __get ( $member )
11067	   {
11068		if  ( ! isset ( $this -> __Properties [ $member ] ) )
11069			warning ( new PdfToTextFormException ( "Undefined property \"$member\"." ) ) ;
11070
11071		return ( $this -> $member ) ;
11072	    }
11073
11074
11075	/*--------------------------------------------------------------------------------------------------------------
11076
11077	    __set -
11078		Sets the underlying property value for this PDF data field.
11079		When the property is a compound one, sets individual members as well.
11080
11081	 *-------------------------------------------------------------------------------------------------------------*/
11082	public function  __set  ( $member, $value )
11083	   {
11084		// Property exists : some special processing will be needed
11085		if  ( isset ( $this -> __Properties [ $member ] ) )
11086		   {
11087			$prop_entry	=  $this -> __Properties [ $member ] ;
11088
11089			// Non-compound property
11090			if  ( ! $prop_entry [ 'fields' ] )
11091			   {
11092				$this -> $member	=  $value ;
11093
11094				// However, we have to check that this property belongs to a compound property and change
11095				// the compound property valu accordingly
11096				foreach  ( $this -> __Properties  as  $name => $property )
11097				   {
11098					if  ( $property [ 'fields' ] )
11099					   {
11100						if  ( in_array ( $member, $property [ 'fields' ] ) )
11101						   {
11102							$values		=  array ( ) ;
11103
11104							foreach  ( $property [ 'fields' ]  as  $value )
11105								$values	[]	=  $this -> $value ;
11106
11107							// Change compound property value accordingly, using the specified separator
11108							$this -> $name	=  implode ( $property [ 'separator' ], $values ) ;
11109						    }
11110					    }
11111				    }
11112			    }
11113			// Compound property : we will have to explode it in separate parts, using the compound property separator,
11114			// then set individual property values
11115			else
11116			   {
11117				$values		=  explode ( $prop_entry [ 'separator' ], $value ) ;
11118				$value_count	=  count ( $values ) ;
11119				$field_count	=  count ( $prop_entry [ 'fields' ] ) ;
11120
11121				if  ( $value_count  <  $field_count )
11122					error ( new PdfToTextFormException ( "Not enough value parts specified for the \"$member\" property ($value)." ) ) ;
11123				else if  ( $value_count  >  $field_count )
11124					error ( new PdfToTextFormException ( "Too much value parts specified for the \"$member\" property ($value)." ) ) ;
11125
11126				$this -> $member	=  $value ;
11127
11128				for  ( $i = 0 ; $i  <  $value_count ; $i ++ )
11129				   {
11130					$sub_member		=  $prop_entry [ 'fields' ] [$i] ;
11131					$this -> $sub_member	=  $values [$i] ;
11132				    }
11133			    }
11134		    }
11135		// Property does not exist : let PHP act as the default way
11136		else
11137			$this -> $member	=  $value ;
11138	    }
11139    }
11140
11141
11142/**************************************************************************************************************
11143 **************************************************************************************************************
11144 **************************************************************************************************************
11145 ******                                                                                                  ******
11146 ******                                                                                                  ******
11147 ******                                  CAPTURE DEFINITION MANAGEMENT                                   ******
11148 ******         (none of the classes listed here are meant to be instantiated outside this file)         ******
11149 ******                                                                                                  ******
11150 ******                                                                                                  ******
11151 **************************************************************************************************************
11152 **************************************************************************************************************
11153 **************************************************************************************************************/
11154
11155/*==============================================================================================================
11156
11157    class PdfToTextCaptureDefinitions -
11158        Holds text capture definitions, whose XML data has been supplied to the PdfToText::SetCapture() method.
11159
11160  ==============================================================================================================*/
class  PdfToTextCaptureDefinitions	// extends		Object
					implements	ArrayAccess, Countable, Iterator
   {
	// Shape definitions, indexed by shape name - The actual objects populating this array depend on the
	// definitions supplied (rectangle, lines)
	protected	$ShapeDefinitions		=  array ( ) ;

	// Shape names (the keys of $ShapeDefinitions) - used for iteration
	private		$ShapeNames ;

	// Page count - false until SetPageCount() has been called
	private		$PageCount			=  false ;


	/*--------------------------------------------------------------------------------------------------------------

	    CONSTRUCTOR -
		Analyzes the XML data defining the areas to be captured.
		The root tag must be <captures> ; its children can be <rectangle> or <lines> tags, each one giving
		a uniquely named shape definition.
		Malformed XML, an unexpected root or child tag, or a duplicate shape name are reported by handing
		a PdfToTextCaptureException to error().

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $xml_data )
	   {
		$xml		=  simplexml_load_string ( $xml_data ) ;

		// simplexml_load_string() returns false on malformed XML ; report it explicitly rather than failing
		// later on a method call on a boolean
		if  ( $xml  ===  false )
			error ( new PdfToTextCaptureException ( "Invalid XML data supplied for capture definitions." ) ) ;

		$root_entry	=  $xml -> getName ( ) ;

		// Root tag must be <captures>
		if  ( strcasecmp ( $root_entry, "captures" ) )
			error ( new PdfToTextCaptureException ( "Root entry must be <captures>, <$root_entry> was found." ) ) ;

		// Process the child nodes
		foreach  ( $xml -> children ( )   as  $child )
		   {
			$tag_name		=  $child -> getName ( ) ;

			switch  ( strtolower ( $tag_name ) )
			   {
				// <rectangle> :
				//	A rectangle whose dimensions are given in the <page> subtags.
				case	'rectangle' :
					$shape_object	=  new PdfToTextCaptureRectangleDefinition ( $child ) ;
					break ;

				// <lines> :
				//	A definition of lines of columns and their applicable pages.
				case	'lines' :
					$shape_object	=  new PdfToTextCaptureLinesDefinition ( $child ) ;
					break ;

				// Complain if an unknown tag is found
				default :
					error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <captures>." ) ) ;
			    }

			// Shape names must be unique within the definitions
			if  ( isset ( $this -> ShapeDefinitions [ $shape_object -> Name ] ) )
				error ( new PdfToTextCaptureException ( "The shape named \"{$shape_object -> Name}\" has been defined more than once." ) ) ;
			else
				$this -> ShapeDefinitions [ $shape_object -> Name ]	=  $shape_object ;
		    }

		// Build an array of shape names for the iterator interface
		$this -> ShapeNames	=  array_keys ( $this -> ShapeDefinitions ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        GetCapturedObject - Creates an object reflecting the captured data.

	    PROTOTYPE
	        $captures	=  $capture_definitions -> GetCapturedObject ( $document_fragments ) ;

	    DESCRIPTION
	        Returns an object of type PdfToTextCaptures, containing the data that has been captured, based on
		the capture definitions.
		Each shape definition is asked to extract its areas ; the per-page results of all the shapes are
		then grouped by page.

	    PARAMETERS
	        $document_fragments (iterable, keyed by page number) -
	                Document text fragments collected during the text layout rendering process.

	    RETURN VALUE
	        An object of type PdfToTextCaptures, containing the captured data.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  GetCapturedObject ( $document_fragments )
	   {
		$captures	=  array ( ) ;

		foreach  ( $this -> ShapeDefinitions  as  $shape )
		   {
			$capture	=  $shape -> ExtractAreas ( $document_fragments ) ;

			// Group what each shape captured by page
			foreach  ( $capture  as  $page => $items )
			   {
				$captures [ $page ] []	=  $items ;
			    }
		    }

		$captured_object	=  new PdfToTextCaptures ( $captures ) ;

		return ( $captured_object ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        SetPageCount - Defines the total number of pages in the document.

	    PROTOTYPE
	        $capture_definitions -> SetPageCount ( $count ) ;

	    DESCRIPTION
	        At the time when XML definitions are processed, the total number of pages in the document is not yet
		known. Moreover, page ranges or page numbers can be expressed relative to the last page of the
		document (for example : 1..$-1, which means "from the first page to the last page - 1").
		Setting the page count once it is known allows to process the expressions specified in the "number"
		attribute of the <page> tags so that the expressions are transformed into actual page numbers.
		The count is stored, then forwarded to every shape definition.

	    PARAMETERS
	        $count (integer) -
	                Number of pages in the document.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  SetPageCount ( $count )
	   {
		$this -> PageCount	=  $count ;

		foreach  ( $this -> ShapeDefinitions  as  $def )
		   {
			$def ->  SetPageCount ( $count ) ;
		    }
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        GetNodeAttributes - Retrieves an XML node's attributes.

	    PROTOTYPE
	        $result		=  PdfToTextCaptureDefinitions::GetNodeAttributes ( $node, $attributes ) ;

	    DESCRIPTION
	        Retrieves the attributes defined for the specified XML node.
		Attribute names found in the node are converted to lowercase before being checked.

	    PARAMETERS
	        $node (SimpleXMLElement) -
	                Node whose attributes are to be extracted.

		$attributes (associative array) -
			Associative array whose keys are the (lowercase) attribute names and whose values define a
			boolean indicating whether the attribute is mandatory or not.

	    RETURN VALUE
	        Returns an associative array whose keys are the attribute names and whose values are the attribute
		values, specified as a string.
		For optional unspecified attributes, the value will be boolean false.

	    NOTES
	        A PdfToTextCaptureException is handed to error() if the node contains an unknown attribute, or if
		a mandatory attribute is missing.

	 *-------------------------------------------------------------------------------------------------------------*/
	public static function  GetNodeAttributes  ( $node, $attributes )
	   {
		$tag_name	=  $node -> getName ( ) ;

		// Build the initial value for the resulting array : every known attribute starts as "unspecified"
		$result		=  array ( ) ;

		foreach  ( array_keys ( $attributes )  as  $name )
			$result	[ $name ]	=  false ;

		// Loop through node attributes
		foreach  ( $node -> attributes ( )  as  $attribute_name => $attribute_value )
		   {
			$attribute_name		=  strtolower ( $attribute_name ) ;

			// Check that the attribute exists ; if yes, add it to the resulting array
			if  ( isset ( $attributes [ $attribute_name ] ) )
				$result [ $attribute_name ]	=  ( string ) $attribute_value ;
			// Otherwise, report the error
			else
				error ( new PdfToTextCaptureException ( "Undefined attribute \"$attribute_name\" for node <$tag_name>." ) ) ;
		    }

		// Check that all mandatory attributes have been specified
		foreach  ( $attributes  as  $attribute_name => $mandatory )
		   {
			if  ( $mandatory  &&  $result [ $attribute_name ]  ===  false )
				error ( new PdfToTextCaptureException ( "Mandatory attribute \"$attribute_name\" is missing for node <$tag_name>." ) ) ;
		    }

		// All done, return
		return ( $result ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        GetBooleanAttribute - Returns a boolean value associated to a string.

	    PROTOTYPE
	        $bool	=  PdfToTextCaptureDefinitions::GetBooleanAttribute ( $value ) ;

	    DESCRIPTION
	        Returns a boolean value corresponding to a boolean specified as a string.
		The comparison is not case-sensitive.

	    PARAMETERS
	        $value (string) -
	                A boolean value represented as a string.
			The strings 'true', 'yes', 'on' and '1' will be interpreted as boolean true.
			The strings 'false', 'no', 'off' and '0' will be interpreted as boolean false.
			Actual boolean values are returned as is.

	    RETURN VALUE
	        The boolean value corresponding to the specified string.

	    NOTES
	        A PdfToTextCaptureException is handed to error() if the supplied string is incorrect.

	 *-------------------------------------------------------------------------------------------------------------*/
	public static function  GetBooleanAttribute ( $value )
	   {
		$lcvalue	=  strtolower ( $value ) ;

		if  ( $lcvalue  ===  'true'  ||  $lcvalue  ===  'on'  ||  $lcvalue  === 'yes'  ||  $lcvalue  ===  '1'  ||  $value  ===  true )
			return ( true ) ;
		else if  ( $lcvalue  ===  'false'  ||  $lcvalue  ===  'off'  ||  $lcvalue  === 'no'  ||  $lcvalue  ===  '0'  ||  $value  ===  false )
			return(  false ) ;
		else
			error ( new PdfToTextCaptureException ( "Invalid boolean value \"$value\"." ) ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

		Interfaces implementations.
		The #[\ReturnTypeWillChange] lines are attributes for PHP 8 and plain comments for earlier versions ;
		they avoid the PHP 8.1+ deprecation notices about missing return types on built-in interface methods.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable interface - number of shape definitions
	#[\ReturnTypeWillChange]
	public function  count ( )
	   { return ( count ( $this -> ShapeDefinitions ) ) ; }


	// ArrayAccess interface - read-only access to shape definitions by name
	#[\ReturnTypeWillChange]
	public function  offsetExists ( $offset )
	   { return ( isset ( $this -> ShapeDefinitions [ $offset ] ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetGet ( $offset )
	   { return ( $this -> ShapeDefinitions [ $offset ] ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetSet ( $offset, $value )
	   { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetUnset ( $offset )
	   { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }


	// Iterator interface -
	//	Iteration is made through shape names, which are supplied by the $ShapeNames property
	private		$__iterator_index	=  0 ;

	#[\ReturnTypeWillChange]
	public function  rewind ( )
	   { $this -> __iterator_index = 0 ; }

	#[\ReturnTypeWillChange]
	public function  valid ( )
	   { return ( $this -> __iterator_index  >=  0  &&  $this -> __iterator_index  <  count ( $this -> ShapeNames ) ) ; }

	#[\ReturnTypeWillChange]
	public function  key ( )
	   { return ( $this -> ShapeNames [ $this -> __iterator_index ] ) ; }

	#[\ReturnTypeWillChange]
	public function  next ( )
	   { $this -> __iterator_index ++ ; }

	#[\ReturnTypeWillChange]
	public function  current ( )
	   { return ( $this -> ShapeDefinitions [ $this -> ShapeNames [ $this -> __iterator_index ] ] ) ; }
    }
11445
11446
11447/*==============================================================================================================
11448
11449    class PdfToTextCaptureShapeDefinition -
11450        Base class for capturing shapes.
11451
11452  ==============================================================================================================*/
abstract class  PdfToTextCaptureShapeDefinition		//extends  Object
   {
	// Shape types
	const	SHAPE_RECTANGLE		=  1 ;
	const	SHAPE_COLUMN		=  2 ;
	const	SHAPE_LINE		=  3 ;

	// Name given to the capture
	public		$Name ;
	// One of the SHAPE_* constants ; supplied by the derived class to the constructor
	public		$Type ;
	// Pages to which this capture applies
	public		$ApplicablePages ;
	// Per-page areas of this shape
	public		$Areas		=  array ( ) ;
	// String inserted between elements when several of them are covered by the same shape
	public		$Separator	=  " " ;


	/*--------------------------------------------------------------------------------------------------------------

	     Constructor -
		Records the shape type and creates an empty list of applicable pages.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $type )
	   {
		$this -> ApplicablePages	=  new PdfToTextCaptureApplicablePages ( ) ;
		$this -> Type			=  $type ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	     SetPageCount -
		Forwards the document page count to the list of applicable pages.
		Derived classes can override this method when they have more work to do once the count is known.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  SetPageCount ( $count )
	   {
		$pages	=  $this -> ApplicablePages ;

		$pages -> SetPageCount ( $count ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	     GetFragmentData -
		Unpacks a text fragment into its text and its bounding coordinates, returned through the reference
		parameters :
		- $left and $top come from the 'x' and 'y' entries
		- $right is $left plus the 'width' entry, minus one
		- $bottom is $top minus the 'font-height' entry
		- $text is the 'text' entry

	 *-------------------------------------------------------------------------------------------------------------*/
	protected function  GetFragmentData ( $fragment, &$text, &$left, &$top, &$right, &$bottom )
	   {
		// Origin of the fragment
		$left		=  ( float ) $fragment [ 'x' ] ;
		$top		=  ( float ) $fragment [ 'y' ] ;

		// Opposite corner, derived from the fragment dimensions
		$fragment_width	=  ( float ) $fragment [ 'width' ] ;
		$right		=  $left + $fragment_width - 1 ;

		$fragment_height =  ( float ) $fragment [ 'font-height' ] ;
		$bottom		=  $top - $fragment_height ;

		$text		=  $fragment [ 'text' ] ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	     GetAttributes -
		Reads the attributes of an XML node, accepting the caller-supplied ones plus the two attributes shared
		by every shape :
		- name (mandatory), stored in the Name property
		- separator (optional), stored unescaped in the Separator property when specified
		Returns the attribute array built by PdfToTextCaptureDefinitions::GetNodeAttributes().

	 *-------------------------------------------------------------------------------------------------------------*/
	protected function  GetAttributes ( $node, $attributes  =  array ( ) )
	   {
		$common_attributes	=  array ( 'name' => true, 'separator' => false ) ;
		$accepted_attributes	=  array_merge ( $attributes, $common_attributes ) ;
		$values			=  PdfToTextCaptureDefinitions::GetNodeAttributes ( $node, $accepted_attributes ) ;

		$this -> Name		=  $values [ 'name' ] ;
		$separator		=  $values [ 'separator' ] ;

		// Keep the default separator unless one was specified in the node
		if  ( $separator  !==  false )
			$this -> Separator	=  PdfToText::Unescape ( $separator ) ;

		return ( $values ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	     ExtractAreas -
		Extracts text contents from the document fragments ; to be implemented by each shape.

	 *-------------------------------------------------------------------------------------------------------------*/
	abstract public function  ExtractAreas ( $document_fragments ) ;
    }
11543
11544
11545/*==============================================================================================================
11546
11547    class PdfToTextCaptureRectangleDefinition -
11548        A shape for capturing text in rectangle areas.
11549
11550  ==============================================================================================================*/
class	PdfToTextCaptureRectangleDefinition		extends  PdfToTextCaptureShapeDefinition
   {
	/*--------------------------------------------------------------------------------------------------------------

	    CONSTRUCTOR -
		Reads a <rectangle> XML node. Its common attributes are handled by GetAttributes() ; each <page>
		child gives the applicable page(s) through its "number" attribute, together with the rectangle
		coordinates. Any other child tag is reported through error().

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct  ( $node )
	   {
		parent::__construct ( self::SHAPE_RECTANGLE ) ;

		$this -> GetAttributes ( $node ) ;

		// Attributes accepted by a <page> tag ; the value tells whether the attribute is mandatory
		$accepted_page_attributes	=  array
		   (
			'number'	=>  true,
			'left'		=>  true,
			'right'		=>  false,
			'top'		=>  true,
			'bottom'	=>  false,
			'width'		=>  false,
			'height'	=>  false
		    ) ;

		foreach  ( $node -> children ( )  as  $child )
		   {
			$tag_name	=  $child -> getName ( ) ;

			// Only <page> tags are allowed here
			if  ( strtolower ( $tag_name )  !==  'page' )
			   {
				error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <rectangle>." ) ) ;
				continue ;
			    }

			// Register the page(s) designated by the "number" attribute, keeping the whole attribute set
			// as extra data
			$page_attributes	=  PdfToTextCaptureDefinitions::GetNodeAttributes ( $child, $accepted_page_attributes ) ;

			$this -> ApplicablePages -> Add ( $page_attributes [ 'number' ], $page_attributes ) ;
		    }
	    }


	/*--------------------------------------------------------------------------------------------------------------

	     ExtractAreas -
		Builds, for each applicable page, a PdfToTextCapturedRectangle object holding the text of the
		fragments contained in the page area. Applicable pages where nothing was found get an empty
		captured rectangle.
		Returns an array of captured rectangles indexed by page number.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  ExtractAreas ( $document_fragments )
	   {
		$captured	=  array ( ) ;

		foreach  ( $document_fragments  as  $page_number  =>  $page_data )
		   {
			$page_fragments		=  $page_data [ 'fragments' ] ;

			// Pages outside the page map do not concern this shape
			if  ( ! isset ( $this -> ApplicablePages -> PageMap [ $page_number ] ) )
				continue ;

			foreach  ( $page_fragments  as  $fragment )
			   {
				$this -> GetFragmentData ( $fragment, $text, $left, $top, $right, $bottom ) ;

				// Skip fragments located outside the area defined for this page
				if  ( ! $this -> Areas [ $page_number ] -> Contains ( $left, $top, $right, $bottom ) )
					continue ;

				// A capture already exists for this page : extend its bounding box and append the new text,
				// using the separator of the <rectangle> tag
				if  ( isset ( $captured [ $page_number ] ) )
				   {
					$rectangle		=  $captured [ $page_number ] ;

					$rectangle -> Top	=  max ( $rectangle -> Top   , $top    ) ;
					$rectangle -> Bottom	=  min ( $rectangle -> Bottom, $bottom ) ;
					$rectangle -> Left	=  min ( $rectangle -> Left  , $left   ) ;
					$rectangle -> Right	=  max ( $rectangle -> Right , $right  ) ;
					$rectangle -> Text     .=  $this -> Separator . $text ;
				    }
				// First fragment captured on this page (the usual case, since a rectangle normally covers
				// a single line)
				else
					$captured [ $page_number ]	=  new PdfToTextCapturedRectangle ( $page_number, $this -> Name, $text, $left, $top, $right, $bottom, $this ) ;
			    }
		    }

		// Applicable pages without any captured text get an empty rectangle
		$needs_sorting	=  false ;

		foreach  ( $this -> ApplicablePages  as  $page_number => $applicable )
		   {
			if  ( isset ( $captured [ $page_number ] ) )
				continue ;

			$captured [ $page_number ]	=  new PdfToTextCapturedRectangle ( $page_number, $this -> Name, '', 0, 0, 0, 0, $this ) ;
			$needs_sorting			=  true ;
		    }

		// Empty entries were appended at the end : restore the page number order
		if  ( $needs_sorting )
			ksort ( $captured ) ;

		return ( $captured ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	     SetPageCount -
		Once the page count has been forwarded to the parent class, creates a PdfToTextCaptureArea object
		for each entry of the applicable pages' extra data.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  SetPageCount ( $count )
	   {
		parent::SetPageCount ( $count ) ;

		// Areas can only be built at this point, after the ApplicablePages object has received the page count
		$extra_data	=  $this -> ApplicablePages -> ExtraPageMapData ;

		foreach  ( $extra_data  as  $page_number => $area_data )
		   {
			$this -> Areas [ $page_number ]	=  new PdfToTextCaptureArea ( $area_data ) ;
		    }
	    }
    }
11689
11690
11691/*==============================================================================================================
11692
11693    class PdfToTextCaptureLinesDefinition -
        A shape for capturing text arranged in successive lines of columns.
11695
11696  ==============================================================================================================*/
11697class	PdfToTextCaptureLinesDefinition		extends  PdfToTextCaptureShapeDefinition
11698   {
11699	// Column areas
11700	public		$Columns		=  array ( ) ;
11701	// Top and bottom lines
11702	public		$Tops			=  array ( ) ;
11703	public		$Bottoms		=  array ( ) ;
11704	// Column names
11705	private		$ColumnNames		=  array ( ) ;
11706
11707
11708	/*--------------------------------------------------------------------------------------------------------------
11709
11710	    CONSTRUCTOR -
		Analyzes the contents of a <lines> XML node, which contains <page> nodes giving a part of the column
		dimensions, and <column> nodes which specify the name of the column and the remaining coordinates,
		such as "left" or "width".
11714
11715	 *-------------------------------------------------------------------------------------------------------------*/
11716	public function  __construct  ( $node )
11717	   {
11718		parent::__construct ( self::SHAPE_COLUMN ) ;
11719
11720		$shape_attributes	=  $this -> GetAttributes ( $node, array ( 'default' => false ) ) ;
11721		$column_default		=  ( $shape_attributes [ 'default' ] )  ?  $shape_attributes [ 'default' ] : '' ;
11722
11723		// Loop through node's children
11724		foreach  ( $node -> children ( )  as  $child )
11725		   {
11726			$tag_name	=  $child -> getName ( ) ;
11727
11728			switch  ( strtolower ( $tag_name ) )
11729			   {
11730				// <page> tag
11731				case	'page' :
11732					// Retrieve the specified attributes
11733					$page_attributes	=  PdfToTextCaptureDefinitions::GetNodeAttributes
11734					   (
11735						$child,
11736						array
11737						   (
11738							'number'	=>  true,
11739							'top'		=>  true,
11740							'height'	=>  true,
11741							'bottom'	=>  false
11742						    )
11743					    ) ;
11744
11745					// We have to store the y-coordinate of the first and last lines, to determine until which
11746					// position we have to check for column contents.
11747					// The "top" and "bottom" attributes of the <page> tag actually determine the top and bottom
11748					// y-coordinates where to search for columns. However, we will have to rename the "bottom"
11749					// attribute to "column-bottom", in order for it not to be mistaken with actual column rectangle
11750					// (only the "height" attribute of the <page> tag gives the height of a line)
11751					$page_attributes [ 'column-top' ]	=  $page_attributes [ 'top' ] ;
11752					$page_attributes [ 'column-bottom' ]	=  ( double ) $page_attributes [ 'bottom' ] ;
11753					unset ( $page_attributes [ 'bottom' ] ) ;
11754
11755					// Add this page to the list of applicable pages for this shape
11756					$this -> ApplicablePages -> Add ( $page_attributes [ 'number' ], $page_attributes ) ;
11757
11758					break ;
11759
11760				// <column> tag :
11761				case	'column' :
11762					$column_attributes	=  PdfToTextCaptureDefinitions::GetNodeAttributes
11763					   (
11764						$child,
11765						array
11766						   (
11767							'name'		=>  true,
11768							'left'		=>  false,
11769							'right'		=>  false,
11770							'width'		=>  false,
11771							'default'	=>  false
11772						    )
11773					    ) ;
11774
11775					$column_name		=  $column_attributes [ 'name' ] ;
11776
11777					// Build the final default value, if any one is specified ; the following special constructs are processed :
11778					// - "%c" :
11779					//	Replaced by the column name.
11780					// - "%n" :
11781					//	Replaced by the column index (starting from zero).
11782					if  ( ! $column_attributes [ 'default' ] )
11783						$column_attributes [ 'default' ]	=  $column_default ;
11784
11785					$substitutes		=  array
11786					   (
11787						'%c'		=>  $column_name,
11788						'%n'		=>  count ( $this -> Columns )
11789					    ) ;
11790
11791					$column_attributes [ 'default' ]	=  str_replace
11792					   (
11793						array_keys ( $substitutes ),
11794						array_values ( $substitutes ),
11795						$column_attributes [ 'default' ]
11796					    ) ;
11797
11798					// Add the column definition to this object
11799					if  ( ! isset ( $this -> Columns [ $column_name ] ) )
11800					   {
11801						$this -> Columns [ $column_attributes [ 'name' ] ]	=  $column_attributes ;
11802						$this -> ColumnNames []					=  $column_attributes [ 'name' ] ;
11803					    }
11804					else
11805						error ( new PdfToTextCaptureException ( "Column \"$column_name\" is defined more than once." ) ) ;
11806
11807					break ;
11808
11809				// Other tag : throw an exception
11810				default :
11811					error ( new PdfToTextCaptureException ( "Invalid tag <$tag_name> found in root tag <rectangle>." ) ) ;
11812			    }
11813		    }
11814	    }
11815
11816
11817	/*--------------------------------------------------------------------------------------------------------------
11818
11819	     ExtractAreas -
11820		Extracts text contents from the document fragments.
11821
11822	 *-------------------------------------------------------------------------------------------------------------*/
11823	public function  ExtractAreas ( $document_fragments )
11824	   {
11825		$result		=  array ( ) ;
11826
11827		// Loop through each page of document fragments
11828		foreach  ( $document_fragments  as  $page  =>  $page_contents )
11829		   {
11830			$fragments		=  $page_contents [ 'fragments' ] ;
11831
11832			// Ignore this page if not included in the <columns> definition
11833			if  ( ! isset ( $this -> ApplicablePages -> PageMap [ $page ] ) )
11834				continue ;
11835
11836			// <columns> definition only gives the location of the first line of each column, together
11837			// with its height.
11838			// We will build as many new column areas as can fit on one page
11839			$this_page_areas	=  $this -> Areas [ $page ] ;
11840			$column_areas		=  array ( ) ;
11841
11842			for  ( $i = 0, $count = count ( $this_page_areas ) ; $i  <  $count ; $i ++ )
11843			   {
11844				// For now, duplicate the existing column areas - they will represent the 1st line of columns
11845				$this_page_area		=  $this_page_areas [$i] ;
11846				$new_area		=  clone ( $this_page_area ) ;
11847				$column_areas [0] []	=  $new_area ;
11848				$line_height		=  $new_area -> Height ;
11849				$current_top		=  $new_area -> Top - $line_height ;
11850				$current_line		=  0 ;
11851
11852				// Then build new column areas for each successive lines
11853				while  ( $current_top - $line_height  >=  0 )
11854				   {
11855					$current_line ++ ;
11856					$new_area				 =  clone ( $new_area ) ;
11857					$new_area -> Top			-=  $line_height ;
11858					$new_area -> Bottom			-=  $line_height ;
11859
11860					$column_areas [ $current_line ]	[]	 =  $new_area ;
11861					$current_top				-=  $line_height ;
11862				    }
11863			    }
11864
11865			// Now extract the columns, line per line, from the current page's text fragments
11866			$found_lines		=  array ( ) ;
11867
11868			foreach  ( $fragments  as  $fragment )
11869			   {
11870				$this -> GetFragmentData ( $fragment, $text, $left, $top, $right, $bottom ) ;
11871
11872				// Loop through each line of column areas, built from the above step
11873				foreach ( $column_areas  as  $line => $column_areas_per_name )
11874				    {
11875					$index	=  0 ;			// Column index
11876
11877					// Process each column area
11878					foreach  ( $column_areas_per_name  as  $column_area )
11879					   {
11880						// ... but only do something if the current column area is contained in the current fragment
11881						if  ( $column_area -> Contains ( $left, $top, $right, $bottom ) )
11882						   {
11883							// The normal usage will be to capture one-line columns...
11884							if  ( ! isset ( $found_lines [ $line ] [ $column_area -> Name ] ) )
11885							   {
11886								$found_lines [ $line ] [ $column_area -> Name ]	=
11887									new PdfToTextCapturedColumn ( $page, $column_area -> Name, $text,
11888										$left, $top, $right, $bottom, $this ) ;
11889							    }
11890							// ... but you can also use them to capture multiple lines ; in this case, the "separator" attribute of the <lines> or
11891							// <column> tag will be used to separate items
11892							else
11893							   {
11894								$existing_area	=  $found_lines [ $line ] [ $column_area -> Name ] ;
11895
11896								$existing_area -> Top			=  max ( $existing_area -> Top   , $column_area -> Top    ) ;
11897								$existing_area -> Bottom		=  min ( $existing_area -> Bottom, $column_area -> Bottom ) ;
11898								$existing_area -> Left			=  min ( $existing_area -> Left  , $column_area -> Left   ) ;
11899								$existing_area -> Right			=  max ( $existing_area -> Right , $column_area -> Right  ) ;
11900								$existing_area -> Text		       .=  $this -> Separator . $text ;
11901							    }
11902						    }
11903
11904						$index ++ ;
11905					    }
11906				     }
11907			    }
11908
11909			// A final pass to provide default values for empty columns (usually, column values that are not represented in the PDF file)
11910			// Also get the surrounding box for the whole line
11911			$final_lines		=  array ( ) ;
11912
11913			foreach  ( $found_lines  as  $line => $columns_line )
11914			   {
11915				foreach  ( $this -> ColumnNames  as  $column_name )
11916				   {
11917					if  ( ! isset ( $columns_line [ $column_name ] ) )
11918					   {
11919						$columns_line [ $column_name ]	=
11920							new PdfToTextCapturedColumn ( $page, $column_name, $this -> Columns [ $column_name ] [ 'default' ], 0, 0, 0, 0, $this ) ;
11921					    }
11922				    }
11923
11924				// Get the (left,top) coordinates of the line
11925				$line_left	=  $found_lines [ $line ] [ $this -> ColumnNames [0] ] -> Left ;
11926				$line_top	=  $found_lines [ $line ] [ $this -> ColumnNames [0] ] -> Top ;
11927
11928				// Get the (right,bottom) coordinates - we have to find the last column whose value is not a default value
11929				// (and therefore, has a non-zero Right coordinate)
11930				$last		=  count ( $this -> ColumnNames ) - 1 ;
11931				$line_right	=  0 ;
11932				$line_bottom	=  0 ;
11933
11934				while  ( $last  >=  0  &&  ! $columns_line [ $this -> ColumnNames [ $last ] ] -> Right )
11935					$last -- ;
11936
11937				if  ( $last  >  0 )
11938				   {
11939					$line_right	=  $columns_line [ $this -> ColumnNames [ $last ] ] -> Right ;
11940					$line_bottom	=  $columns_line [ $this -> ColumnNames [ $last ] ] -> Bottom ;
11941				    }
11942
11943				// Create a CaptureLine entry
11944				$final_lines []	=  new PdfToTextCapturedLine ( $page, $this -> Name, $columns_line, $line_left, $line_top, $line_right, $line_bottom, $this ) ;
11945			    }
11946
11947			// The result for this page will be a CapturedLines object
11948			$result [ $page ]	=  new  PdfToTextCapturedLines ( $this -> Name, $page, $final_lines ) ;
11949		    }
11950
11951		// All done, return
11952		return ( $result ) ;
11953	    }
11954
11955
11956	/*--------------------------------------------------------------------------------------------------------------
11957
11958	     SetPageCount -
		Forwards the page count to the parent class, then creates one capture area per column on each applicable page.
11960
11961	 *-------------------------------------------------------------------------------------------------------------*/
11962	public function  SetPageCount ( $count )
11963	   {
11964		parent::SetPageCount ( $count ) ;
11965
11966		foreach  ( $this -> ApplicablePages  as $page => $applicable )
11967		   {
11968			if  ( ! $applicable )
11969				continue ;
11970
11971			foreach  ( $this -> Columns  as  $column )
11972			   {
11973				if  ( ! isset ( $this -> Tops [ $page ] ) )
11974				   {
11975					$this -> Tops    [ $page ]		=  ( double ) $this -> ApplicablePages -> ExtraPageMapData [ $page ] [ 'column-top' ] ;
11976					$this -> Bottoms [ $page ]		=  ( double ) $this -> ApplicablePages -> ExtraPageMapData [ $page ] [ 'column-bottom' ] ;
11977				    }
11978
11979				$area	=  new PdfToTextCaptureArea ( $column, $this -> ApplicablePages -> ExtraPageMapData [ $page ], $column [ 'name' ] ) ;
11980
11981				$this -> Areas [ $page ] []	=  $area ;
11982			    }
11983		    }
11984	    }
11985
11986
11987	/*--------------------------------------------------------------------------------------------------------------
11988
11989		Support functions.
11990
11991	 *-------------------------------------------------------------------------------------------------------------*/
11992    }
11993
11994
11995/*==============================================================================================================
11996
11997    class PdfToTextCaptureApplicablePages -
11998        Holds a list of applicable pages given by the "number" attribute of <page> tags.
11999
12000  ==============================================================================================================*/
class  PdfToTextCaptureApplicablePages		//extends		Object
						implements	ArrayAccess, Countable, Iterator
   {
	// Ranges of pages, as given by the "number" attribute of the <page> tag. Since a page number expression
	// can refer to the last page ("$"), and the total number of pages in the document is not yet known at the
	// time of object instantiation, we have to store all the page ranges as is.
	// Each entry is an array ( low value, high value, extra data ).
	protected	$PageRanges		=  array ( ) ;

	// Once the SetPageCount() method has been called (ie, once the total number of pages in the document is
	// known), then a PageMap is built ; each key is the number of an applicable page (the value is always true).
	public		$PageMap		=  array ( ) ;

	// Extra data associated, this time, with each page in PageMap
	public		$ExtraPageMapData	=  array ( ) ;

	// Page count - set by the SetPageCount() method
	public		$PageCount		=  false ;


	/*--------------------------------------------------------------------------------------------------------------

	    CONSTRUCTOR
	        Initializes the object.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( )
	   {
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        Add - Add a page number(s) definition.

	    PROTOTYPE
	        $applicable_pages -> Add ( $page_number, $extra_data = false ) ;

	    DESCRIPTION
	        Add the page number(s) specified by the "number" attribute of the <pages> tag to the list of applicable
		pages.

	    PARAMETERS
	        $page_number (string) -
	                A string defining which pages are applicable. This can be a single page number :

				<page number="1" .../>

			or a comma-separated list of pages :

				<page number="1, 2, 10" .../>

			or range(s) of pages :

				<page number="1..10, 12..20" .../>

			The special "$" character means "last page" ; thus the following example :

				<page number="1, $-9..$" .../>

			means : "applicable pages are 1, plus the last ten pages of the document".

		$extra_data (mixed) -
			Data stored along with every range found in $page_number ; SetPageCount() copies it to the
			ExtraPageMapData entry of each page in the range.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  Add ( $page_number, $extra_data = false )
	   {
		$this -> __parse_page_numbers ( $page_number, $extra_data ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        SetPageCount - Sets the total number of pages in the document.

	    PROTOTYPE
	        $applicable_pages -> SetPageCount ( $count ) ;

	    DESCRIPTION
	        Sets the total number of pages in the document and builds a map of which pages are applicable or not.

	    PARAMETERS
	        $count (integer) -
	                Total number of pages in the document.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  SetPageCount ( $count )
	   {
		$this -> PageCount		=  $count ;
		$this -> PageMap		=  array ( ) ;

		// Loop through the page ranges - every single value in the ranges has been converted to an integer ;
		// the other ones, built as expressions (using "$" for example) are processed here to give the actual
		// page number
		foreach  ( $this -> PageRanges  as  $range )
		   {
			$low		=  $range [0] ;
			$high		=  $range [1] ;

			// Translate expression to an actual value for the low and high parts of the range, if not already integers
			if  ( ! is_integer ( $low ) )
				$low	=  $this -> __check_expression ( $low, $count ) ;

			if  ( ! is_integer ( $high ) )
				$high	=  $this -> __check_expression ( $high, $count ) ;

			// Add() only checked the expressions against a page count of 1 ; they can still fail to evaluate
			// with the real page count
			if  ( $low  ===  false  ||  $high  ===  false )
			   {
				error ( new PdfToTextCaptureException ( "Invalid expression in page range specification \"{$range [0]}..{$range [1]}\"." ) ) ;
				continue ;
			    }

			// Expressions using "$" may lead to negative values - adjust them
			if  ( $low  <  1 )
			   {
				if  ( $high  <  1 )
					$high	=  1 ;

				$low	=  1 ;
			    }

			// Check that the range is consistent
			if  ( $low  >  $high )
				error ( new PdfToTextCaptureException ( "Low value ($low) must be less or equal to high value ($high) " .
						"in page range specification \"{$range [0]}..{$range [1]}\"." ) ) ;

			// Ignore ranges where the 'low' value is higher than the number of pages in the document
			if  ( $low  >  $count )
			    {
				warning ( new PdfToTextCaptureException ( "Low value ($low) is greater than page count ($count) " .
						"in page range specification \"{$range [0]}..{$range [1]}\"." ) ) ;
				continue ;
			     }

			// Normalize the 'high' value, so that it's not bigger than the number of pages in the document
			if  ( $high  >  $count )
				$high	=  $count ;

			// Complement the page map using this range
			for  ( $i = $low ; $i  <=  $high ; $i ++ )
			   {
				$this -> PageMap [$i]		=  true ;
				$this -> ExtraPageMapData [$i]	=  $range [2] ;
			    }
		    }
	    }


	/*--------------------------------------------------------------------------------------------------------------

		Interfaces implementations.
		The "#[\ReturnTypeWillChange]" lines avoid the return type deprecation notices issued since PHP 8.1 ;
		older PHP versions read them as plain "#" comments.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable interface : number of applicable pages
	#[\ReturnTypeWillChange]
	public function  count ( )
	   { return ( count ( $this -> PageMap ) ) ; }


	// Array access interface : $object [ $page ] tells whether page $page is applicable ; the map is read-only
	#[\ReturnTypeWillChange]
	public function  offsetExists ( $offset )
	   { return ( isset ( $this -> PageMap [ $offset ] ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetGet ( $offset )
	   { return ( ( isset ( $this -> PageMap [ $offset ] ) ) ?  true : false ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetSet ( $offset, $value )
	   { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetUnset ( $offset )
	   { error ( new PdfToTextException ( "Unsupported operation" ) ) ; }


	// Iterator interface : walks through every page number from 1 to PageCount (not only the applicable ones) ;
	// the key is the page number, the value is true when the page is applicable, false otherwise
	private		$__iterator_value	=  1 ;

	#[\ReturnTypeWillChange]
	public function   rewind ( )
	   { $this -> __iterator_value = 1 ; }


	#[\ReturnTypeWillChange]
	public function  valid ( )
	   { return ( $this -> __iterator_value  >=  1  &&  $this -> __iterator_value  <=  $this -> PageCount ) ; }


	#[\ReturnTypeWillChange]
	public function  key ( )
	   { return ( $this -> __iterator_value ) ; }


	#[\ReturnTypeWillChange]
	public function  next ( )
	   { $this -> __iterator_value ++ ; }


	#[\ReturnTypeWillChange]
	public function  current ( )
	   { return ( ( isset ( $this -> PageMap [ $this -> __iterator_value ] ) ) ?  true : false ) ; }


	/*--------------------------------------------------------------------------------------------------------------

		Helper functions.

	 *-------------------------------------------------------------------------------------------------------------*/

	// __parse_page_numbers -
	//	Performs a first pass on the value of the "number" attribute of the <page> tag. Transforms range expressions
	//	when possible to integers ; keep the expression string intact when either the low or high value of a range
	//	is itself an expression, probably using the "$" (page count) character.
	private function  __parse_page_numbers ( $text, $extra_data )
	   {
		$ranges		=  explode ( ',', $text ) ;

		// Loop through comma-separated ranges
		foreach  ( $ranges  as  $range )
		   {
			$items		=  explode ( '..', $range ) ;

			// Check if current item is a range
			switch  ( count ( $items ) )
			   {
				// If not a range (ie, a single value) then make a range using that value
				// (low and high range values will be the same)
				case	1 :
					if  ( is_numeric ( $items [0] ) )
						$low	=  $high	=  ( integer ) $items [0] ;
					else
						$low	=  $high	=  trim ( $items [0] ) ;

					break ;

				// If range, store the low and high values
				case	2 :
					$low	=  ( is_numeric ( $items [0] ) ) ?  ( integer ) $items [0] : trim ( $items [0] ) ;
					$high	=  ( is_numeric ( $items [1] ) ) ?  ( integer ) $items [1] : trim ( $items [1] ) ;
					break ;

				// Other cases : throw an exception
				default :
					error ( new PdfToTextCaptureException ( "Invalid page range specification \"$range\"." ) ) ;
			    }

			// If the low or high range value is an expression, check at this stage that it is correct
			// (the check is performed as if the document had only one page)
			if  ( is_string ( $low )  &&  $this -> __check_expression ( $low )  ===  false )
				error ( new PdfToTextCaptureException ( "Invalid expression \"$low\" in page range specification \"$range\"." ) ) ;

			if  ( is_string ( $high )  &&  $this -> __check_expression ( $high )  ===  false )
				error ( new PdfToTextCaptureException ( "Invalid expression \"$high\" in page range specification \"$range\"." ) ) ;

			// Add the page range and the extra data
			$this -> PageRanges []	=  array ( $low, $high, $extra_data ) ;
		    }
	    }


	// __check_expression -
	//	Evaluates a page number expression after replacing every "$" character with $count.
	//	Returns the value of the expression, or false if it could not be evaluated.
	private function  __check_expression ( $str, $count = 1 )
	    {
		$new_str	=  str_replace ( '$', $count, $str ) ;

		// NOTE(review): eval() executes whatever the "number" attribute contains ; this is only acceptable as long
		// as capture definitions come from a trusted source - confirm, or replace with a real expression parser.
		// Since PHP 7, eval() throws an error (ParseError for a syntax error) instead of returning false ; turn
		// it back into the false value that callers test for.
		try
		   {
			$value		=  @eval ( "return ( $new_str ) ;" ) ;
		    }
		catch  ( Throwable  $e )
		   {
			$value		=  false ;
		    }

		return ( $value ) ;
	     }
    }
12259
12260
12261/*==============================================================================================================
12262
12263    class PdfToTextCaptureArea -
12264        A capture area describes a rectangle, either by its top, left, right and bottom coordinates, or by
12265	its top/left coordinates, and its width and height.
12266
12267  ==============================================================================================================*/
class  PdfToTextCaptureArea	//extends  Object
   {
	// Names of the entries that can describe the rectangle in the arrays given to the constructor
	static private		$Keys		=  array ( 'left', 'top', 'right', 'bottom', 'width', 'height' ) ;

	// Edges of the rectangle ; they are private, and reached from the outside through __get and __set,
	// which also provide the computed Width and Height properties
	private			$Left		=  false,
				$Top		=  false,
				$Right		=  false,
				$Bottom		=  false ;

	// Name given to the area
	public			$Name ;


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        Constructor

	    PROTOTYPE
	        $area	=  new PdfToTextCaptureArea ( $area, $default_area = null, $name = '' ) ;

	    DESCRIPTION
	        Builds the rectangle from the entries found in $area, falling back to $default_area for the ones
		that $area does not give.

	    PARAMETERS
	        $area (array) -
	                Associative array ; the recognized entries are :

			- 'left'   : left x-coordinate (mandatory)
			- 'top'    : top y-coordinate (mandatory)
			- 'right'  : right x-coordinate
			- 'bottom' : bottom y-coordinate
			- 'width'  : width of the rectangle, counted from 'left'
			- 'height' : height of the rectangle, counted from 'top'

			One of 'right' and 'width' is needed, as well as one of 'bottom' and 'height' ; 'right' and
			'bottom' take precedence when both forms are given.
			An entry which is absent, or set to false, is looked up in $default_area.

		$default_area (array) -
			Values to be used for the entries that $area does not supply.

		$name (string) -
			Stored as is in the Name property.

	    NOTES
	        The bottom of the rectangle is computed as top - height + 1 : y-coordinates decrease when going down.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $area, $default_area = null, $name = '' )
	   {
		// Every component starts as "not supplied"
		$given		=  array
		   (
			'left'		=>  false,
			'top'		=>  false,
			'right'		=>  false,
			'bottom'	=>  false,
			'width'		=>  false,
			'height'	=>  false
		    ) ;

		// A usable value in $area wins ; otherwise $default_area is tried
		foreach  ( self::$Keys  as  $component )
		   {
			if  ( isset ( $area [ $component ] )  &&  $area [ $component ]  !==  false )
				$given [ $component ]	=  $area [ $component ] ;
			else if  ( isset ( $default_area [ $component ] ) )
				$given [ $component ]	=  $default_area [ $component ] ;
		    }

		$x_left		=  $given [ 'left'   ] ;
		$y_top		=  $given [ 'top'    ] ;
		$x_right	=  $given [ 'right'  ] ;
		$y_bottom	=  $given [ 'bottom' ] ;

		// 'left' and 'top' cannot be deduced from anything else
		if  ( $x_left  !==  false )
			$x_left		=  ( double ) $x_left ;
		else
			error ( new PdfToTextCaptureException ( "Attribute \"left\" is mandatory." ) );

		if  ( $y_top  !==  false )
			$y_top		=  ( double ) $y_top ;
		else
			error ( new PdfToTextCaptureException ( "Attribute \"top\" is mandatory." ) ) ;

		// Right edge : given directly, or deduced from the width
		if  ( $x_right  !==  false )
			$x_right	=  ( double ) $x_right ;
		else if  ( $given [ 'width' ]  !==  false )
			$x_right	=  $x_left + ( double ) $given [ 'width' ] - 1 ;
		else
			error ( new PdfToTextCaptureException ( "Either the \"right\" or the \"width\" attribute must be specified." ) ) ;

		// Bottom edge : given directly, or deduced from the height
		if  ( $y_bottom  !==  false )
			$y_bottom	=  ( double ) $y_bottom ;
		else if  ( $given [ 'height' ]  !==  false )
			$y_bottom	=  $y_top - ( double ) $given [ 'height' ] + 1 ;
		else
			error ( new PdfToTextCaptureException ( "Either the \"bottom\" or the \"height\" attribute must be specified." ) ) ;

		// Store the result
		$this -> Name		=  $name ;
		$this -> Left		=  $x_left ;
		$this -> Top		=  $y_top ;
		$this -> Right		=  $x_right ;
		$this -> Bottom		=  $y_bottom ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        __get, __set - Give access to the Left, Top, Right and Bottom edges, and implement the computed
			       Width and Height properties.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __get ( $member )
	   {
		if  ( $member  ===  'Width' )
			return ( $this -> Right - $this -> Left + 1 ) ;

		if  ( $member  ===  'Height' )
			return ( $this -> Top - $this -> Bottom + 1 ) ;

		if  ( $member  ===  'Left'  ||  $member  ===  'Top'  ||  $member  ===  'Right'  ||  $member  ===  'Bottom' )
			return ( $this -> $member ) ;

		trigger_error ( "Undefined property \"$member\"." ) ;
	    }


	public function  __set ( $member, $value )
	   {
		// Whatever the property, the value is handled as a double
		$number		=  ( double ) $value ;

		// Setting the width or the height moves the right or bottom edge ; left and top stay where they are
		if  ( $member  ===  'Width' )
			$this -> Right		=  $this -> Left + $number - 1 ;
		else if  ( $member  ===  'Height' )
			$this -> Bottom		=  $this -> Top - $number + 1 ;
		else if  ( $member  ===  'Top'  ||  $member  ===  'Left'  ||  $member  ===  'Right'  ||  $member  ===  'Bottom' )
			$this -> $member	=  $number ;
		else
			trigger_error ( "Undefined property \"$member\"." ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    NAME
	        Contains - Tells whether the specified rectangle lies entirely inside this area (edges included).

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  Contains ( $left, $top, $right, $bottom )
	   {
		$fits_horizontally	=  ( $left  >=  $this -> Left  &&  $right   <=  $this -> Right  ) ;
		$fits_vertically	=  ( $top   <=  $this -> Top   &&  $bottom  >=  $this -> Bottom ) ;

		return ( $fits_horizontally  &&  $fits_vertically ) ;
	    }
    }
12470
12471
12472
12473/**************************************************************************************************************
12474 **************************************************************************************************************
12475 **************************************************************************************************************
12476 ******                                                                                                  ******
12477 ******                                                                                                  ******
12478 ******                                     CAPTURED TEXT MANAGEMENT                                     ******
12479 ******         (none of the classes listed here are meant to be instantiated outside this file)         ******
12480 ******                                                                                                  ******
12481 ******                                                                                                  ******
12482 **************************************************************************************************************
12483 **************************************************************************************************************
12484 **************************************************************************************************************/
12485
12486 /*==============================================================================================================
12487
12488     class PdfToTextCapturedText -
12489         Base class for captured text enclosed by shapes.
12490
12491   ==============================================================================================================*/
12492 abstract class  PdfToTextCapturedText		//extends  Object
12493    {
12494	// Shape name (as specified by the "name" attribute of the <rectangle> or <lines> tags, for example)
12495	public		$Name ;
12496	// Number of the page where the text was found (starts from 1)
12497	public		$Page ;
12498	// Shape type (one of the PfToTextCaptureShape::SHAPE_* constants)
12499	public		$Type ;
12500	// Shape definition object (not really used, but in case of...)
12501	private		$ShapeDefinition ;
12502	// Captured text
12503	public		$Text ;
12504	// Surrounding rectangle in the PDF file
12505	public		$Left,
12506			$Top,
12507			$Right,
12508			$Bottom ;
12509
12510
12511
12512	/*--------------------------------------------------------------------------------------------------------------
12513
12514	    Constructor -
12515		Initializes a captured text object, whatever the original shape.
12516
12517	 *-------------------------------------------------------------------------------------------------------------*/
12518	public function  __construct  ( $page, $name, $text, $left, $top, $right, $bottom, $definition )
12519	   {
12520		$this -> Name			=  $name ;
12521		$this -> Page			=  $page ;
12522		$this -> ShapeDefinition	=  $definition ;
12523		$this -> Text			=  $text ;
12524		$this -> Left			=  $left ;
12525		$this -> Top			=  $top ;
12526		$this -> Right			=  $right ;
12527		$this -> Bottom			=  $bottom ;
12528		$this -> Type			=  $definition -> Type ;
12529	    }
12530     }
12531
12532
12533 /*==============================================================================================================
12534
12535     class PdfToTextCapturedRectangle -
12536         Implements a text captured by a rectangle shape.
12537
12538   ==============================================================================================================*/
class  PdfToTextCapturedRectangle		extends  PdfToTextCapturedText
   {
	// Constructor -
	//	Hands every argument over to the parent constructor, unchanged.
	public function  __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition )
	   {
		parent::__construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) ;
	    }


	// __toString -
	//	The string representation of the object is its captured text.
	//	Text is a public property that can hold any value, while __toString must return a string :
	//	hence the cast.
	public function  __toString ( )
	   { return ( ( string ) $this -> Text ) ; }
    }
12550
12551
12552 /*==============================================================================================================
12553
12554     class PdfToTextCapturedColumn -
12555         Implements a text captured by a lines/column shape.
12556	 Actually behaves like the PdfToTextCapturedRectangle class
12557
12558   ==============================================================================================================*/
class  PdfToTextCapturedColumn			extends  PdfToTextCapturedText
   {
	// Constructor -
	//	Hands every argument over to the parent constructor, unchanged.
	public function  __construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition )
	   {
		parent::__construct ( $page, $name, $text, $left, $top, $right, $bottom, $definition ) ;
	    }


	// __toString -
	//	The string representation of the object is its captured text.
	//	Text is a public property that can hold any value, while __toString must return a string :
	//	hence the cast.
	public function  __toString ( )
	   { return ( ( string ) $this -> Text ) ; }
    }
12570
12571
12572 /*==============================================================================================================
12573
12574     class PdfToTextCapturedLine -
12575         Implements a text captured by a lines shape.
12576
12577   ==============================================================================================================*/
class  PdfToTextCapturedLine			extends		PdfToTextCapturedText
 						implements	ArrayAccess, Countable, IteratorAggregate
  {
	// Column objects, exactly as supplied to the constructor (the array may be indexed by position or by column name)
	public		$Columns ;
	// Associates each column name with the position of the column in the line (starting from zero), to allow
	// access by either index or column name
	private		$ColumnsByNames		=  array ( ) ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Builds a Line object based on the supplied columns.
		Also builds the Text property, which contains the columns text separated by the Separator property
		of the supplied definition object.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $page, $name, $columns, $left, $top, $right, $bottom, $definition )
	   {
		// Although the Columns property is most likely to be used, build a text representation of the whole line
		$text			=  array ( ) ;
		$count			=  0 ;

		foreach  ( $columns  as  $column )
		   {
			$text []					=  $column -> Text ;
			$this -> ColumnsByNames [ $column -> Name ]	=  $count ++ ;
		    }

		// Provide this information to the parent constructor
		parent::__construct ( $page, $name, implode ( $definition -> Separator, $text ), $left, $top, $right, $bottom, $definition ) ;

		// Store the column definitions
		$this -> Columns	=  $columns ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    __get -
		Returns access to a column by its name.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __get ( $member )
	   {
		if  ( isset ( $this -> ColumnsByNames [ $member ] ) )
			return ( $this -> __column_at ( $this -> ColumnsByNames [ $member ] ) ) ;
		else
			trigger_error ( "Undefined property \"$member\"." ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

		Interfaces implementations.
		The "#[\ReturnTypeWillChange]" lines avoid the return type deprecation notices issued since PHP 8.1 ;
		older PHP versions read them as plain "#" comments.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable interface : number of columns in the line
	#[\ReturnTypeWillChange]
	public function  count ( )
	   { return ( count ( $this -> Columns ) ) ; }


	// IteratorAggregate interface : iterates over the Columns array
	#[\ReturnTypeWillChange]
	public function  getIterator ( )
	   { return ( new ArrayIterator ( $this -> Columns ) ) ; }


	// ArrayAccess interface : a numeric offset is a column position, anything else is a column name ;
	// the line is read-only
	#[\ReturnTypeWillChange]
	public function  offsetExists ( $offset )
	   {
		if  ( is_numeric ( $offset ) )
			return ( $offset  >=  0  &&  $offset  <  count ( $this -> Columns ) ) ;
		else
			return ( isset ( $this -> ColumnsByNames [ $offset ] ) ) ;
	    }


	#[\ReturnTypeWillChange]
	public function  offsetGet ( $offset )
	   {
		if  ( is_numeric ( $offset ) )
			return ( $this -> __column_at ( ( integer ) $offset ) ) ;

		if  ( isset ( $this -> ColumnsByNames [ $offset ] ) )
			return ( $this -> __column_at ( $this -> ColumnsByNames [ $offset ] ) ) ;

		trigger_error ( "Undefined column \"$offset\"." ) ;

		return ( null ) ;
	    }


	#[\ReturnTypeWillChange]
	public function  offsetSet ( $offset, $value )
	   { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }


	#[\ReturnTypeWillChange]
	public function  offsetUnset ( $offset )
	   { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }


	/*--------------------------------------------------------------------------------------------------------------

		Helper functions.

	 *-------------------------------------------------------------------------------------------------------------*/

	// __column_at -
	//	Returns the column located at the specified position (starting from zero), or null if there is none.
	//	The positions stored in ColumnsByNames follow the iteration order of the Columns array ; going through
	//	array_values() makes them usable whether Columns is indexed by position or by column name.
	private function  __column_at ( $index )
	   {
		$columns	=  array_values ( $this -> Columns ) ;

		return ( ( isset ( $columns [ $index ] ) ) ?  $columns [ $index ] : null ) ;
	    }
    }
12668
12669
12670 /*==============================================================================================================
12671
12672     class PdfToTextCapturedLines -
12673         Implements a set of lines.
12674
12675   ==============================================================================================================*/
class  PdfToTextCapturedLines			//extends		Object
						implements	ArrayAccess, Countable, IteratorAggregate
   {
	// Capture name, as specified by the "name" attribute of the <lines> tag
	public		$Name ;
	// Page number of the capture
	public		$Page ;
	// Captured lines
	public		$Lines ;
	// Content type (mimics a little bit the PdfToTextCapturedText class)
	public		$Type			=  PdfToTextCaptureShapeDefinition::SHAPE_LINE ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Instantiates a PdfToTextCapturedLines object.

	    PARAMETERS
		$name  - Capture name.
		$page  - Page number of the capture.
		$lines - Array of captured lines.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $name, $page, $lines )
	   {
		$this -> Name		=  $name ;
		$this -> Page		=  $page ;
		$this -> Lines		=  $lines ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

		Interfaces implementations.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable : returns the number of captured lines
	public function  count ( )
	   { return ( count ( $this -> Lines ) ) ; }


	// IteratorAggregate : iterates over the captured lines
	public function  getIterator ( )
	   { return ( new ArrayIterator ( $this -> Lines ) ) ; }


	// ArrayAccess : an offset exists if it falls within the range of captured lines
	public function  offsetExists ( $offset )
	   { return ( $offset  >=  0  &&  $offset  <  count ( $this -> Lines ) ) ; }


	// ArrayAccess : returns the line stored at the specified offset of the Lines property
	public function  offsetGet ( $offset )
	   { return ( $this -> Lines [ $offset ] ) ; }


	// ArrayAccess : captured lines are read-only. error() is a helper defined outside this class
	public function  offsetSet ( $offset, $value )
	   { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }


	// ArrayAccess : captured lines are read-only
	public function  offsetUnset ( $offset )
	   { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }
    }
12731
12732
12733/**************************************************************************************************************
12734 **************************************************************************************************************
12735 **************************************************************************************************************
12736 ******                                                                                                  ******
12737 ******                                                                                                  ******
12738 ******                               CAPTURE INTERFACE FOR THE DEVELOPER                                ******
12739 ******         (none of the classes listed here are meant to be instantiated outside this file)         ******
12740 ******                                                                                                  ******
12741 ******                                                                                                  ******
12742 **************************************************************************************************************
12743 **************************************************************************************************************
12744 **************************************************************************************************************/
12745
12746/*==============================================================================================================
12747
12748    class PdfToTextCaptures -
12749        Represents all the areas in a PDF file captured by the supplied XML definitions.
12750
12751  ==============================================================================================================*/
class  PdfToTextCaptures			//extends  Object
   {
	// Captured objects - May not exactly reflect the PdfToTextCapture*Shape classes
	private		$CapturedObjects ;
	// Allows faster access by capture name
	private		$ObjectsByName			=  array ( ) ;
	// Capture wrappers already built by __get(), indexed by capture name. Kept in a declared property so that
	// no dynamic property is created on the object (dynamic properties are deprecated since PHP 8.2)
	private		$CaptureInstances		=  array ( ) ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Instantiates a PdfToTextCaptures object.

	    PARAMETERS
		$captures -
			Array indexed by page ; each entry is a list of captured shapes, each one exposing at
			least a Name property.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $captures )
	   {
		$this -> CapturedObjects	=  $captures ;

		// Build an array of objects indexed by their names ; shapes sharing the same name are grouped
		// in the same list
		foreach  ( $captures  as  $page => $shapes )
		   {
			foreach  ( $shapes  as  $shape )
				$this -> ObjectsByName [ $shape -> Name ] []	=  $shape ;
		    }
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    ToCaptures -
		Returns a simplified view of captured objects, with only name/value pairs.

	    RETURN VALUE
		An stdClass object having one property per capture name :
		- For rectangle shapes, the property is an array of captured texts indexed by page
		- For line shapes, the property is a list of stdClass objects (one per line), whose properties
		  are the column names and whose values are the column texts
		Shapes of any other type are ignored.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  ToCaptures ( )
	   {
		$result		=  new stdClass ( ) ;

		foreach  ( $this -> CapturedObjects  as  $page => $captures )
		   {
			foreach  ( $captures  as  $capture )
			    {
				switch  ( $capture -> Type )
				   {
					case	PdfToTextCaptureShapeDefinition::SHAPE_RECTANGLE :
						$name				=  $capture -> Name ;
						$value				=  $capture -> Text ;
						$result -> {$name} [ $page ]	=  $value ;
						break ;

					case	PdfToTextCaptureShapeDefinition::SHAPE_LINE :
						$name				=  $capture -> Name ;

						if  ( ! isset ( $result -> {$name} ) )
							$result -> {$name}		=  array ( ) ;

						// The capture is iterated line by line, and each line column by column
						foreach  ( $capture  as  $line )
						   {
							$columns	=  new  stdClass ;

							foreach  ( $line  as  $column )
							   {
								$column_name			=  $column -> Name ;
								$column_value			=  $column -> Text ;
								$columns -> {$column_name}	=  $column_value ;
							    }

							$result -> {$name} []	=  $columns ;
						    }

						break ;
				    }
			    }
		    }

		return ( $result ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    __get -
		Retrieves the captured objects by their name, as specified in the XML definition.
		The wrapper object is built by GetCaptureInstance() upon first access, then reused for subsequent
		accesses to the same name.
		An undefined name is reported through error() with a PdfToTextException.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __get ( $member )
	   {
		if  ( ! isset ( $this -> CaptureInstances [ $member ] ) )
		   {
			if  ( ! isset ( $this -> ObjectsByName [ $member ] ) )
				error ( new PdfToTextException ( "Undefined property \"$member\"." ) ) ;

			$this -> CaptureInstances [ $member ]	=  $this -> GetCaptureInstance ( $member ) ;
		    }

		return ( $this -> CaptureInstances [ $member ] ) ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

	    GetCapturedObjectsByName -
		Returns an associative array of the captured shapes, indexed by their name.
		Each entry is the list of shapes having that name.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  GetCapturedObjectsByName ( )
	   { return ( $this -> ObjectsByName ) ; }


	/*--------------------------------------------------------------------------------------------------------------

	    GetCaptureInstance -
		Returns an object inheriting from the PdfToTextCapture class, that wraps the capture results.
		The wrapper class is chosen from the Type property of the first shape having the specified name.
		An unhandled shape type is reported through error() with a PdfToTextCaptureException.

	 *-------------------------------------------------------------------------------------------------------------*/
	protected function  GetCaptureInstance ( $fieldname )
	   {
		switch ( $this -> ObjectsByName [ $fieldname ] [0] -> Type )
		   {
			case	PdfToTextCaptureShapeDefinition::SHAPE_RECTANGLE :
				return ( new PdfToTextRectangleCapture ( $this -> ObjectsByName [ $fieldname ] ) ) ;

			case	PdfToTextCaptureShapeDefinition::SHAPE_LINE :
				return ( new PdfToTextLinesCapture ( $this -> ObjectsByName [ $fieldname ] ) ) ;

			default :
				error ( new PdfToTextCaptureException ( "Unhandled shape type " . $this -> ObjectsByName [ $fieldname ] [0] -> Type . "." ) ) ;
		    }
	    }


    }
12883
12884
12885/*==============================================================================================================
12886
12887    class PdfToTextCapture -
12888        Base class for all capture classes accessible to the caller.
12889
12890  ==============================================================================================================*/
class  PdfToTextCapture				//extends		Object
						implements	ArrayAccess, Countable, IteratorAggregate
   {
	// Captured objects wrapped by this instance ; the way the array is indexed is decided by the derived class
	protected	$Captures ;


	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Instantiates a PdfToTextCapture object.

	    PARAMETERS
		$objects -
			Array of captured objects, stored as is in the Captures property.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $objects )
	   {
		//parent::__construct ( ) ;

		$this -> Captures		=  $objects ;
	    }


	/*--------------------------------------------------------------------------------------------------------------

		Interfaces implementations.

	 *-------------------------------------------------------------------------------------------------------------*/

	// Countable : returns the number of captured objects
	public function  count ( )
	   { return ( count ( $this -> Captures ) ) ; }


	// IteratorAggregate : iterates over the captured objects
	public function  getIterator ( )
	   { return ( new ArrayIterator ( $this -> Captures ) ) ; }


	// ArrayAccess : checks the key itself rather than a 0..count-1 range, because derived classes may index
	// the captures by something else than a zero-based position (for example by page number)
	public function  offsetExists ( $offset )
	   { return ( isset ( $this -> Captures [ $offset ] ) ) ; }


	// ArrayAccess : returns the captured object stored at the specified offset
	public function  offsetGet ( $offset )
	   { return ( $this -> Captures [ $offset ] ) ; }


	// ArrayAccess : captures are read-only. error() is a helper defined outside this class
	public function  offsetSet ( $offset, $value )
	   { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }


	// ArrayAccess : captures are read-only
	public function  offsetUnset ( $offset )
	   { error ( new PdfToTextCaptureException ( "Unsupported operation." ) ) ; }

    }
12940
12941
12942/*==============================================================================================================
12943
12944    class PdfToTextLinesCapture -
        Represents a lines capture as a single flat list of lines, not indexed by page number.
12946
12947  ==============================================================================================================*/
class  PdfToTextLinesCapture			extends  PdfToTextCapture
   {
	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Merges the lines of every supplied object into one single list, in the order they are encountered,
		then hands this list over to the parent constructor.
		Each supplied object only needs to be iterable, and yields its lines when iterated.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $objects )
	   {
		$flattened	=  array ( ) ;

		foreach  ( $objects  as  $line_group )
		   {
			// Append every line of the current group at the end of the flat list
			foreach  ( $line_group  as  $captured_line )
				array_push ( $flattened, $captured_line ) ;
		    }

		parent::__construct ( $flattened ) ;
	    }
    }
12970
12971
12972/*==============================================================================================================
12973
12974    class PdfToTextRectangleCapture -
12975        Implements a rectangle capture, from the caller point of view.
12976
12977  ==============================================================================================================*/
class  PdfToTextRectangleCapture		extends  PdfToTextCapture
   {
	/*--------------------------------------------------------------------------------------------------------------

	    Constructor -
		Re-indexes the supplied objects by the value of their Page property, then hands the resulting array
		over to the parent constructor.
		When several objects share the same Page value, the last one encountered is kept.

	 *-------------------------------------------------------------------------------------------------------------*/
	public function  __construct ( $objects )
	   {
		$by_page	=  array ( ) ;

		foreach  ( $objects  as  $capture )
		   {
			$page_number			=  $capture -> Page ;
			$by_page [ $page_number ]	=  $capture ;
		    }

		parent::__construct ( $by_page ) ;
	    }
    }
12996
12997