1package CAM::PDF; 2 3use 5.006; 4use warnings; 5use strict; 6use Carp qw(croak cluck); 7use English qw(-no_match_vars); 8use CAM::PDF::Node; 9use CAM::PDF::Decrypt; 10 11our $VERSION = '1.60'; 12 13## no critic(Bangs::ProhibitCommentedOutCode) 14## no critic(ControlStructures::ProhibitDeepNests) 15 16=for stopwords eval'ed CR-NL PDFLib defiltered prefill indices inline de-embedding 4th linearized viewable decrypted 17 18=head1 NAME 19 20CAM::PDF - PDF manipulation library 21 22=head1 LICENSE 23 24Copyright 2002-2006 Clotho Advanced Media, Inc., L<http://www.clotho.com/> 25 26Copyright 2007-2008 Chris Dolan 27 28This library is free software; you can redistribute it and/or modify it 29under the same terms as Perl itself. 30 31=head1 SYNOPSIS 32 33 use CAM::PDF; 34 35 my $pdf = CAM::PDF->new('test1.pdf'); 36 37 my $page1 = $pdf->getPageContent(1); 38 [ ... mess with page ... ] 39 $pdf->setPageContent(1, $page1); 40 [ ... create some new content ... ] 41 $pdf->appendPageContent(1, $newcontent); 42 43 my $anotherpdf = CAM::PDF->new('test2.pdf'); 44 $pdf->appendPDF($anotherpdf); 45 46 my @prefs = $pdf->getPrefs(); 47 $prefs[$CAM::PDF::PREF_OPASS] = 'mypassword'; 48 $prefs[$CAM::PDF::PREF_UPASS] = 'mypassword'; 49 $pdf->setPrefs(@prefs); 50 51 $pdf->cleanoutput('out1.pdf'); 52 print $pdf->toPDF(); 53 54Many example programs are included in this distribution to do useful 55tasks. See the C<bin> subdirectory. 56 57=head1 DESCRIPTION 58 59This package reads and writes any document that conforms to the PDF 60specification generously provided by Adobe at 61L<http://partners.adobe.com/public/developer/pdf/index_reference.html> 62(link last checked Oct 2005). 63 64The file format through PDF 1.5 is well-supported, with the exception 65of the "linearized" or "optimized" output format, which this module 66can read but not write. Many specific aspects of the document model 67are not manipulable with this package (like fonts), but if the input 68document is correctly written, then this module will preserve the 69model integrity. 70 71The PDF writing feature saves as PDF 1.4-compatible. That means that 72we cannot write compressed object streams. The consequence is that 73reading and then writing a PDF 1.5+ document may enlarge the resulting 74file by a fair margin. 75 76This library grants you some power over the PDF security model. Note 77that applications editing PDF documents via this library MUST respect 78the security preferences of the document. Any violation of this 79respect is contrary to Adobe's intellectual property position, as 80stated in the reference manual at the above URL. 81 82Technical detail regarding corrupt PDFs: This library adheres strictly 83to the PDF specification. Adobe's Acrobat Reader is more lenient, 84allowing some corrupted PDFs to be viewable. Therefore, it is 85possible that some PDFs may be readable by Acrobat that are illegible 86to this library. In particular, files which have had line endings 87converted to or from DOS/Windows style (i.e. CR-NL) may be rendered 88unusable even though Acrobat does not complain. Future library 89versions may relax the parser, but not yet. 90 91=cut 92 93our $PREF_OPASS = 0; 94our $PREF_UPASS = 1; 95our $PREF_PRINT = 2; 96our $PREF_MODIFY = 3; 97our $PREF_COPY = 4; 98our $PREF_ADD = 5; 99 100our $MAX_STRING = 65; # length of output string 101 102my %filterabbrevs = ( 103 AHx => 'ASCIIHexDecode', 104 A85 => 'ASCII85Decode', 105 CCF => 'CCITTFaxDecode', 106 DCT => 'DCTDecode', 107 Fl => 'FlateDecode', 108 LZW => 'LZWDecode', 109 RL => 'RunLengthDecode', 110 ); 111 112my %inlineabbrevs = ( 113 %filterabbrevs, 114 BPC => 'BitsPerComponent', 115 CS => 'ColorSpace', 116 D => 'Decode', 117 DP => 'DecodeParms', 118 F => 'Filter', 119 H => 'Height', 120 IM => 'ImageMask', 121 I => 'Interpolate', 122 W => 'Width', 123 CMYK => 'DeviceCMYK', 124 G => 'DeviceGray', 125 RGB => 'DeviceRGB', 126 I => 'Indexed', 127 ); 128 129=head1 API 130 131=head2 Functions intended to be used externally 132 133 $self = CAM::PDF->new(content | filename | '-') 134 $self->toPDF() 135 $self->needsSave() 136 $self->save() 137 $self->cleansave() 138 $self->output(filename | '-') 139 $self->cleanoutput(filename | '-') 140 $self->previousRevision() 141 $self->allRevisions() 142 $self->preserveOrder() 143 $self->appendObject(olddoc, oldnum, [follow=(1|0)]) 144 $self->replaceObject(newnum, olddoc, oldnum, [follow=(1|0)]) 145 (olddoc can be undef in the above for adding new objects) 146 $self->numPages() 147 $self->getPageText(pagenum) 148 $self->getPageDimensions(pagenum) 149 $self->getPageContent(pagenum) 150 $self->setPageContent(pagenum, content) 151 $self->appendPageContent(pagenum, content) 152 $self->deletePage(pagenum) 153 $self->deletePages(pagenum, pagenum, ...) 154 $self->extractPages(pagenum, pagenum, ...) 155 $self->appendPDF(CAM::PDF object) 156 $self->prependPDF(CAM::PDF object) 157 $self->wrapString(string, width, fontsize, page, fontlabel) 158 $self->getFontNames(pagenum) 159 $self->addFont(page, fontname, fontlabel, [fontmetrics]) 160 $self->deEmbedFont(page, fontname, [newfontname]) 161 $self->deEmbedFontByBaseName(page, basename, [newfont]) 162 $self->getPrefs() 163 $self->setPrefs() 164 $self->canPrint() 165 $self->canModify() 166 $self->canCopy() 167 $self->canAdd() 168 $self->getFormFieldList() 169 $self->fillFormFields(fieldname, value, [fieldname, value, ...]) 170 or $self->fillFormFields(%values) 171 $self->clearFormFieldTriggers(fieldname, fieldname, ...) 172 173Note: 'clean' as in cleansave() and cleanobject() means write a fresh 174PDF document. The alternative (e.g. save()) reuses the existing doc 175and just appends to it. Also note that 'clean' functions sort the 176objects numerically. If you prefer that the new PDF docs more closely 177resemble the old ones, call preserveOrder() before cleansave() or 178cleanobject(). 179 180=head2 Slightly less external, but useful, functions 181 182 $self->toString() 183 $self->getPage(pagenum) 184 $self->getFont(pagenum, fontname) 185 $self->getFonts(pagenum) 186 $self->getStringWidth(fontdict, string) 187 $self->getFormField(fieldname) 188 $self->getFormFieldDict(object) 189 $self->isLinearized() 190 $self->decodeObject(objectnum) 191 $self->decodeAll(any-node) 192 $self->decodeOne(dict-node) 193 $self->encodeObject(objectnum, filter) 194 $self->encodeOne(any-node, filter) 195 $self->changeString(obj-node, hashref) 196 197=head2 Deeper utilities 198 199 $self->pageAddName(pagenum, name, objectnum) 200 $self->getPageObjnum(pagenum) 201 $self->getPropertyNames(pagenum) 202 $self->getProperty(pagenum, propname) 203 $self->getValue(any-node) 204 $self->dereference(objectnum) or $self->dereference(name,pagenum) 205 $self->deleteObject(objectnum) 206 $self->copyObject(obj-node) 207 $self->cacheObjects() 208 $self->setObjNum(obj-node, num) 209 $self->getRefList(obj-node) 210 $self->changeRefKeys(obj-node, hashref) 211 212=head2 More rarely needed utilities 213 214 $self->getObjValue(objectnum) 215 216=head2 Routines that should not be called 217 218 $self->_startdoc() 219 $self->delinearlize() 220 $self->build*() 221 $self->parse*() 222 $self->write*() 223 $self->*CB() 224 $self->traverse() 225 $self->fixDecode() 226 $self->abbrevInlineImage() 227 $self->unabbrevInlineImage() 228 $self->cleanse() 229 $self->clean() 230 $self->createID() 231 232 233=head1 FUNCTIONS 234 235=head2 Object creation/manipulation 236 237=over 238 239=item $doc->new($package, $content) 240 241=item $doc->new($package, $content, $ownerpass, $userpass) 242 243=item $doc->new($package, $content, $ownerpass, $userpass, $prompt) 244 245=item $doc->new($package, $content, $ownerpass, $userpass, $options) 246 247Instantiate a new CAM::PDF object. C<$content> can be a document in a 248string, a filename, or '-'. The latter indicates that the document 249should be read from standard input. If the document is password 250protected, the passwords should be passed as additional arguments. If 251they are not known, a boolean C<$prompt> argument allows the programmer to 252suggest that the constructor prompt the user for a password. This is 253rudimentary prompting: passwords are in the clear on the console. 254 255This constructor takes an optional final argument which is a hash 256reference. This hash can contain any of the following optional 257parameters: 258 259=over 260 261=item prompt_for_password => $boolean 262 263This is the same as the C<$prompt> argument described above. 264 265=item fault_tolerant => $boolean 266 267This flag causes the instance to be more lenient when reading the 268input PDF. Currently, this only affects PDFs which cannot be 269successfully decrypted. 270 271=back 272 273=cut 274 275sub new ## no critic(Subroutines::ProhibitExcessComplexity, Unpack) 276{ 277 my $pkg = shift; 278 my $content = shift; # or a filename 279 # Optional args: 280 my $opassword = shift; 281 my $upassword = shift; 282 my $options; 283 # Backward compatible support for prompt flag as final argument 284 if (ref $_[0]) 285 { 286 $options = shift; 287 if ((ref $options) ne 'HASH') 288 { 289 croak 'Options must be a hash reference'; 290 } 291 } 292 else 293 { 294 $options = { 295 prompt_for_password => shift, 296 }; 297 } 298 299 300 my $pdfversion = '1.2'; 301 if ($content =~ m/ \A%PDF-([\d.]+) /xms) 302 { 303 my $ver = $1; 304 if ($ver && $ver > $pdfversion) 305 { 306 $pdfversion = $ver; 307 } 308 } 309 else 310 { 311 if (1024 > length $content) 312 { 313 my $file = $content; 314 if ($file eq q{-}) 315 { 316 $content = q{}; 317 my $offset = 0; 318 my $step = 4096; 319 binmode STDIN; ##no critic (Syscalls) 320 while ($step == read STDIN, $content, $step, $offset) 321 { 322 $offset += $step; 323 } 324 } 325 else 326 { 327 if (open my $fh, '<', $file) 328 { 329 binmode $fh; ##no critic (Syscalls) 330 my $size = -s $file; 331 if ($size != read $fh, $content, $size) { 332 $CAM::PDF::errstr = "Failed to read $file bytes\n"; 333 return; 334 } 335 if (!close $fh) { 336 $CAM::PDF::errstr = "Failed to close reading $file\n"; 337 return; 338 } 339 } 340 else 341 { 342 $CAM::PDF::errstr = "Failed to open $file: $ERRNO\n"; 343 return; 344 } 345 } 346 } 347 if ($content =~ m/ \A%PDF-([\d.]+) /xms) 348 { 349 my $ver = $1; 350 if ($ver && $ver > $pdfversion) 351 { 352 $pdfversion = $ver; 353 } 354 } 355 else 356 { 357 $CAM::PDF::errstr = "Content does not begin with \"%PDF-\"\n"; 358 return; 359 } 360 } 361 #warn "got pdfversion $pdfversion\n"; 362 363 my $self = { 364 options => $options, 365 366 pdfversion => $pdfversion, 367 maxstr => $CAM::PDF::MAX_STRING, # length of output string 368 content => $content, 369 contentlength => length $content, 370 xref => {}, 371 maxobj => 0, 372 changes => {}, 373 versions => {}, 374 375 # Caches: 376 objcache => {}, 377 pagecache => {}, 378 formcache => {}, 379 Names => {}, 380 NameObjects => {}, 381 fontmetrics => {}, 382 }; 383 bless $self, $pkg; 384 if (!$self->_startdoc()) 385 { 386 return; 387 } 388 if ($self->{trailer}->{ID}) 389 { 390 my $id = $self->getValue($self->{trailer}->{ID}); 391 if (ref $id) 392 { 393 my $accum = q{}; 394 for my $objnode (@{$id}) 395 { 396 $accum .= $self->getValue($objnode); 397 } 398 $id = $accum; 399 } 400 $self->{ID} = $id; 401 } 402 403 $self->{crypt} = CAM::PDF::Decrypt->new($self, $opassword, $upassword, 404 $self->{options}->{prompt_for_password}); 405 if (!$self->{crypt} && !$self->{options}->{fault_tolerant}) 406 { 407 return; 408 } 409 410 return $self; 411} 412 413=item $doc->toPDF() 414 415Serializes the data structure as a PDF document stream and returns as 416in a scalar. 417 418=cut 419 420sub toPDF 421{ 422 my $self = shift; 423 424 if ($self->needsSave()) 425 { 426 $self->cleansave(); 427 } 428 return $self->{content}; 429} 430 431=item $doc->toString() 432 433Returns a serialized representation of the data structure. 434Implemented via Data::Dumper. 435 436=cut 437 438sub toString ## no critic (Unpack) 439{ 440 my $self = shift; 441 my @skip = @_ == 0 ? qw(content) : @_; 442 443 my %hold; 444 for my $key (@skip) 445 { 446 $hold{$key} = delete $self->{$key}; 447 } 448 449 require Data::Dumper; 450 my $result = Data::Dumper->Dump([$self], [qw(doc)]); 451 452 for my $key (keys %hold) 453 { 454 $self->{$key} = $hold{$key}; 455 } 456 return $result; 457} 458 459################################################################################ 460 461=back 462 463=head2 Document reading 464 465(all of these functions are intended for internal only) 466 467=over 468 469=cut 470 471 472# PRIVATE METHOD 473# read the document index and some metadata. 474 475sub _startdoc 476{ 477 my $self = shift; 478 479 ### Parse the document metadata 480 481 # Start by parsing out the location of the last xref block 482 483 # Implementation note: The PDF spec says "The last line of the file 484 # contains only the end-of-file marker, %%EOF." but it also says 485 # "Acrobat viewers require only that the %%EOF marker appear 486 # somewhere within the last 1024 bytes of the file." 487 # So, we follow the latter more lenient rule. 488 489 my $doc_length = length $self->{content}; 490 my $startxref; 491 if ($doc_length > 1024) 492 { 493 # The initial ".*" is for the unlikely case that there are two "startxref" statements in the last 1024 bytes 494 ($startxref) = (substr $self->{content}, $doc_length - 1024, 1024) =~ m/ .* startxref\s*(\d+)\s*%%EOF.*?\z /xms; 495 } 496 else 497 { 498 ($startxref) = $self->{content} =~ m/ .* startxref\s*(\d+)\s*%%EOF.*?\z /xms; 499 } 500 501 if (!$startxref) 502 { 503 $CAM::PDF::errstr = "Cannot find the index in the PDF content\n"; 504 return; 505 } 506 507 # Parse the hierarchy of xref blocks 508 $self->{startxref} = $startxref; 509 my @objstreamrefs; 510 $self->{trailer} = $self->_buildxref($self->{startxref}, $self->{xref}, $self->{versions}, \@objstreamrefs); 511 if (!defined $self->{trailer}) 512 { 513 return; 514 } 515 $self->_buildendxref(); 516 for my $objstreamref (@objstreamrefs) 517 { 518 if (!$self->_index_objstream($objstreamref)) 519 { 520 return; 521 } 522 } 523 524 ### Cache some page content descriptors 525 526 # Get the document root catalog 527 if (!exists $self->{trailer}->{Root}) 528 { 529 $CAM::PDF::errstr = "No root node present in PDF trailer.\n"; 530 return; 531 } 532 my $root = $self->getRootDict(); 533 if (!$root || (ref $root) ne 'HASH') 534 { 535 $CAM::PDF::errstr = "The PDF root node is not a dictionary.\n"; 536 return; 537 } 538 539 # Get the root of the page tree 540 if (!exists $root->{Pages}) 541 { 542 $CAM::PDF::errstr = "The PDF root node doesn't have a reference to the page tree.\n"; 543 return; 544 } 545 my $pages = $self->getPagesDict(); 546 if (!$pages || (ref $pages) ne 'HASH') 547 { 548 $CAM::PDF::errstr = "The PDF page tree root is not a dictionary.\n"; 549 return; 550 } 551 552 # Get the number of pages in the document 553 $self->{PageCount} = $self->getValue($pages->{Count}); 554 if (!$self->{PageCount} || $self->{PageCount} < 1) 555 { 556 $CAM::PDF::errstr = "Bad number of pages in PDF document\n"; 557 return; 558 } 559 560 return 1; 561} 562 563# PRIVATE FUNCTION 564# read document index 565 566sub _buildxref 567{ 568 my $self = shift; 569 my $startxref = shift; 570 my $index = shift; 571 my $versions = shift; 572 my $objstreamrefs = shift; 573 574 my $trailer; 575 if ('xref' eq substr $self->{content}, $startxref, 4) 576 { 577 $trailer = $self->_buildxref_pdf14($startxref, $index, $versions); 578 if ($trailer && exists $trailer->{XRefStm}) 579 { 580 if (!$self->_buildxref_pdf15($trailer->{XRefStm}->{value}, $index, $versions, $objstreamrefs)) 581 { 582 return; 583 } 584 } 585 } 586 else 587 { 588 $trailer = $self->_buildxref_pdf15($startxref, $index, $versions, $objstreamrefs); 589 } 590 591 if ($trailer && exists $trailer->{Prev}) 592 { 593 if (!$self->_buildxref($trailer->{Prev}->{value}, $index, $versions, $objstreamrefs)) 594 { 595 return; 596 } 597 } 598 599 return $trailer; 600} 601 602# Just for debugging 603sub __dump_binary_stream { 604 my $stream = shift; 605 606 my @b = unpack 'C*', $stream; 607 print ' 0 1 2 3 4 5 6 7 8 9 a b c d e f'; 608 for my $i (0 .. $#b) { 609 if (0 == $i % 15) { 610 printf "\n%04x: ", $i; 611 } elsif (0 == $i % 3) { 612 print q{ }; 613 } 614 printf '%02x', $b[$i]; 615 } 616 print "\n"; 617 618 return; 619} 620 621sub _buildxref_pdf15 622{ 623 my $self = shift; 624 my $startxref = shift; 625 my $index = shift; 626 my $versions = shift; 627 my $objstreamrefs = shift; 628 629 my ($trailer, $stream) = $self->_buildxref_pdf15_getstream($startxref); 630 if (!$trailer) { 631 return; 632 } 633 634 #__dump_binary_stream($stream); 635 636 my @byte_pattern = map {$_->{value}} @{$trailer->{W}->{value}}; 637 my $entry_size = $byte_pattern[0] + $byte_pattern[1] + $byte_pattern[2]; 638 #print STDOUT "pack: [@byte_pattern] => total size $entry_size\n"; 639 640 my @objstreamrefs; 641 { 642 my @pairs = (0, $trailer->{Size}->{value}); 643 if (exists $trailer->{Index}) 644 { 645 @pairs = map {$_->{value}} @{$trailer->{Index}->{value}}; 646 } 647 #print STDOUT "Pairs: (Index,Size)=@pairs; size of stream=",length($stream)," ?= $entry_size x ",(length($stream)/$entry_size),"\n"; 648 649 my $i = 0; 650 while (@pairs) 651 { 652 my $start = shift @pairs; 653 my $len = shift @pairs; 654 my $end = $start + $len; 655 for my $objnum ($start .. $end - 1) 656 { 657 my @byte = unpack 'C*', substr $stream, $i++ * $entry_size, $entry_size; 658 my %values = (type => 1, major => 0, minor => 0); 659 my @w = @byte_pattern; 660 my $pos = 0; 661 my $w = 0; 662 for my $key (qw(type major minor)) { 663 $w += shift @w; 664 if ($w > $pos) { 665 my $val = 0; 666 for (; $pos < $w; ++$pos) { ## no critic (ProhibitCStyleForLoops) 667 $val = ($val << 8) + $byte[$pos]; 668 } 669 $values{$key} = $val; 670 } 671 } 672 673 next if (exists $index->{$objnum}); # keep only latest revision 674 675 #my %strs = ( 676 # 0 => {str=>'free', major=>'objnext', minor=>'gennum'}, 677 # 1 => {str=>'raw', major=>'byte', minor=>'gennum'}, 678 # 2 => {str=>'zip', major=>'stream', minor=>'index'}, 679 #); 680 #my $type_def = $strs{$values{type}}; 681 #my $type_str = $type_def ? $type_def->{str} : 'unk'; 682 #my $major_str = $type_def ? $type_def->{major} : 'unk'; 683 #my $minor_str = $type_def ? $type_def->{minor} : 'unk'; 684 #print STDOUT "xref $objnum = $values{type}=$type_str $major_str=$values{major}(", 685 # sprintf('%04x',$values{major}),") $minor_str=$values{minor}(", 686 # sprintf('%02x',$values{minor}),")\n"; 687 688 # Ignore type 0 689 if ($values{type} == 1) 690 { 691 $index->{$objnum} = $values{major}; 692 $versions->{$objnum} = $values{minor}; 693 } 694 elsif ($values{type} == 2) 695 { 696 push @{$objstreamrefs}, {objnum => $objnum, streamnum => $values{major}, indx => $values{minor}}; 697 $index->{$objnum} = {}; # will be overwritten later 698 $versions->{$objnum} = 0; 699 } 700 } 701 if ($end - 1 > $self->{maxobj}) 702 { 703 $self->{maxobj} = $end - 1; 704 } 705 } 706 } 707 return $trailer; 708} 709 710sub _buildxref_pdf15_getstream 711{ 712 my $self = shift; 713 my $startxref = shift; 714 715 # Don't slurp in the whole file 716 my $chunk_size = 1024; 717 my @content = (substr $self->{content}, $startxref, $chunk_size); 718 # warning: this doesn't account for the case where "endobj" crosses a 1024-byte boundary 719 # instead, we hit the end of file and find it after concatenation -- a hack but it works 720 while ($content[-1] && $content[-1] !~ m/endobj/xms) { 721 my $offset = $startxref + $chunk_size * @content; 722 if ($offset >= length $self->{content}) { 723 # end of file 724 last; 725 } 726 push @content, substr $self->{content}, $offset, $chunk_size; 727 } 728 my $content = join q{}, @content; 729 730 my $xrefstream = $self->parseObj(\$content); 731 if (!$xrefstream) 732 { 733 $CAM::PDF::errstr = 'Failed to locate the xref stream'; 734 return; 735 } 736 my $trailer = $xrefstream->{value}->{value}; # dict hash 737 if (!$trailer) 738 { 739 $CAM::PDF::errstr = 'Invalid xref stream: no trailer'; 740 return; 741 } 742 if ('HASH' ne ref $trailer) 743 { 744 $CAM::PDF::errstr = 'Invalid xref stream: trailer is not a dictionary'; 745 return; 746 } 747 #print STDOUT "Trailer: @{[sort keys %{$trailer}]}, $trailer->{Type}->{value}\n"; 748 if (!exists $trailer->{Type} || 'XRef' ne $trailer->{Type}->{value}) 749 { 750 $CAM::PDF::errstr = 'Invalid xref stream: type is not XRef'; 751 return; 752 } 753 754 my $stream = $self->decodeOne($xrefstream->{value}); 755 if (!$stream) 756 { 757 $CAM::PDF::errstr = 'Invalid xref stream: could not decode the stream'; 758 return; 759 } 760 761 return ($trailer, $stream); 762} 763 764sub _index_objstream 765{ 766 my $self = shift; 767 my $objstreamref = shift; 768 769 my $objstream = $self->dereference($objstreamref->{streamnum}); 770 if (!$objstream) 771 { 772 $CAM::PDF::errstr = 'Failed to read object stream ' . $objstreamref->{streamnum}; 773 return; 774 } 775 if ($objstream->{_indexed}++) 776 { 777 return 1; 778 } 779 my $stream = $self->decodeOne($objstream->{value}); 780 if (!$stream) 781 { 782 $CAM::PDF::errstr = 'Invalid xref stream: could not decode objstream ' . $objstreamref->{streamnum}; 783 return; 784 } 785 my $dict = $objstream->{value}->{value}; 786 my $n = $dict->{N}->{value}; 787 my $first = $dict->{First}->{value}; 788 my $lookup = substr $stream, 0, $first; 789 my @objs; 790 my $streamholder = {stream => $stream}; 791 for my $i (0 .. $n-1) 792 { 793 if ($lookup =~ m/\G\s*(\d+)\s+(\d+)/cgxms) 794 { 795 my ($objnum, $offset) = ($1, $2); 796 my $pos = {objstream => $streamholder, start => $first + $offset}; 797 push @objs, $pos; 798 if (exists $self->{xref}->{$objnum}) { 799 # keep only latest revision 800 next if !ref $self->{xref}->{$objnum}; 801 next if $self->{xref}->{$objnum}->{objstream}; 802 } 803 #print "objnum $objnum at pos $offset of objstream $objstreamref->{streamnum}\n"; 804 $self->{xref}->{$objnum} = $pos; 805 $self->{versions}->{$objnum} = 0; 806 } 807 else 808 { 809 $CAM::PDF::errstr = 'Failed to read the objstream index for ' . $objstreamref->{streamnum}; 810 return; 811 } 812 } 813 for my $i (0 .. $#objs-1) 814 { 815 $objs[$i]->{end} = $objs[$i+1]->{start}; 816 } 817 $objs[-1]->{end} = length $stream; 818 819 return 1; 820} 821 822sub _buildxref_pdf14 823{ 824 my $self = shift; 825 my $startxref = shift; 826 my $index = shift; 827 my $versions = shift; 828 829 my $trailerpos = index $self->{content}, 'trailer', $startxref; 830 831 # Workaround for Perl 5.6.1 bug 832 if ($trailerpos > 0 && $trailerpos < $startxref) 833 { 834 my $xrefstr = substr $self->{content}, $startxref; 835 $trailerpos = $startxref + index $xrefstr, 'trailer'; 836 } 837 838 my $end = substr $self->{content}, $startxref, $trailerpos-$startxref; 839 840 if ($end !~ s/ \A xref\s+ //xms) 841 { 842 my $len = length $end; 843 $CAM::PDF::errstr = "Could not find PDF cross-ref table at location $startxref/$trailerpos/$len\n" . $self->trimstr($end); 844 return; 845 } 846 my $part = 0; 847 while ($end =~ s/ \A (\d+)\s+(\d+)\s+ //xms) 848 { 849 my $s = $1; 850 my $n = $2; 851 852 $part++; 853 for my $i (0 .. $n-1) 854 { 855 my $objnum = $s+$i; 856 next if (exists $index->{$objnum}); 857 858 my $row = substr $end, $i*20, 20; 859 my ($indexnum, $version, $type) = $row =~ m/ \A (\d{10}) [ ] (\d{5}) [ ] (\w) /xms; 860 if (!$indexnum) 861 { 862 $CAM::PDF::errstr = "Could not decipher xref row:\n" . $self->trimstr($row); 863 return; 864 } 865 if ($type eq 'n') 866 { 867 if ($indexnum != 0) # if the index says it's at byte zero, pretend it's an 'f' instead of an 'n' 868 { 869 $index->{$objnum} = $indexnum; 870 $versions->{$objnum} = $version; 871 } 872 } 873 if ($objnum > $self->{maxobj}) 874 { 875 $self->{maxobj} = $objnum; 876 } 877 } 878 879 $end = substr $end, 20*$n; 880 } 881 882 my $sxrefpos = index $self->{content}, 'startxref', $trailerpos; 883 if ($sxrefpos > 0 && $sxrefpos < $trailerpos) # workaround for 5.6.1 bug 884 { 885 my $tail = substr $self->{content}, $trailerpos; 886 $sxrefpos = $trailerpos + index $tail, 'startxref'; 887 } 888 $end = substr $self->{content}, $trailerpos, $sxrefpos-$trailerpos; 889 890 if ($end !~ s/ \A trailer\s* //xms) 891 { 892 $CAM::PDF::errstr = "Did not find expected trailer block after xref\n" . $self->trimstr($end); 893 return; 894 } 895 my $trailer = $self->parseDict(\$end)->{value}; 896 return $trailer; 897} 898 899# PRIVATE FUNCTION 900# _buildendxref -- compute the end of each object 901# note that this is not always the *actual* end of the object, but 902# we guarantee that the object will end at or before this point. 903 904sub _buildendxref 905{ 906 my $self = shift; 907 908 my $x = $self->{xref}; # shorthand 909 # make a list of objnums sorted by file position. Ignore objects inside objstreams. 910 my @keys = sort {$x->{$a} <=> $x->{$b}} grep {!ref $x->{$_}} keys %{$x}; 911 912 my $r = {}; 913 for my $i (0 .. $#keys-1) 914 { 915 # set the end of each object to be the beginning of the next object 916 $r->{$keys[$i]} = $x->{$keys[$i+1]}; 917 } 918 # The end of the last object is the end of the file 919 $r->{$keys[-1]} = $self->{contentlength}; 920 921 $self->{endxref} = $r; 922 return; 923} 924 925# PRIVATE FUNTION 926# _buildNameTable -- descend into the page tree and extract all XObject 927# and Font name references. 928 929sub _buildNameTable 930{ 931 my $self = shift; 932 my $pagenum = shift; 933 934 if (!$pagenum || $pagenum eq 'All') # Build the ENTIRE name table 935 { 936 $self->cacheObjects(); 937 for my $p (1 .. $self->{PageCount}) 938 { 939 $self->_buildNameTable($p); 940 } 941 my %n; 942 for my $objnode (values %{$self->{objcache}}) 943 { 944 if ($objnode->{value}->{type} eq 'dictionary') 945 { 946 my $dict = $objnode->{value}->{value}; 947 if ($dict->{Name}) 948 { 949 $n{$dict->{Name}->{value}} = CAM::PDF::Node->new('reference', $objnode->{objnum}); 950 } 951 } 952 } 953 $self->{Names}->{All} = {%n}; 954 return; 955 } 956 957 return if (exists $self->{Names}->{$pagenum}); 958 959 my %n; 960 my $page = $self->getPage($pagenum); 961 while ($page) 962 { 963 my $objnum = $self->getPageObjnum($pagenum); 964 if (exists $page->{Resources}) 965 { 966 my $r = $self->getValue($page->{Resources}); 967 for my $key ('XObject', 'Font') 968 { 969 if (exists $r->{$key}) 970 { 971 my $x = $self->getValue($r->{$key}); 972 if ((ref $x) eq 'HASH') 973 { 974 %n = (%{$x}, %n); 975 } 976 } 977 } 978 } 979 980 # Inherit from parent 981 $page = $page->{Parent}; 982 if ($page) 983 { 984 $page = $self->getValue($page); 985 } 986 } 987 988 $self->{Names}->{$pagenum} = {%n}; 989 return; 990} 991 992=item $doc->getRootDict() 993 994Returns the Root dictionary for the PDF. 995 996=cut 997 998sub getRootDict 999{ 1000 my $self = shift; 1001 1002 return $self->getValue($self->{trailer}->{Root}); 1003} 1004 1005=item $doc->getPagesDict() 1006 1007Returns the root Pages dictionary for the PDF. 1008 1009=cut 1010 1011sub getPagesDict 1012{ 1013 my $self = shift; 1014 1015 return $self->getValue($self->getRootDict()->{Pages}); 1016} 1017 1018=item $doc->parseObj($string) 1019 1020Use parseAny() instead of this, if possible. 1021 1022Given a fragment of PDF page content, parse it and return an object 1023Node. This can be called as a class method in most circumstances, but 1024is intended as an instance method. 1025 1026=cut 1027 1028sub parseObj 1029{ 1030 my $self = shift; 1031 my $c = shift; 1032 1033 if (${$c} !~ m/ \G\s*(\d+)\s+(\d+)\s+obj\s* /cgxms) ##no critic(ProhibitUnusedCapture) 1034 { 1035 die "Expected object open tag\n" . $self->trimstr(${$c}); 1036 } 1037 # need to implement like this with explicit capture vars for 5.6.1 1038 # compatibility 1039 my ($objnum, $gennum) = ($1, $2); ##no critic(ProhibitCaptureWithoutTest) 1040 $objnum = int $objnum; 1041 $gennum = int $gennum; 1042 1043 my $objnode; 1044 if (${$c} =~ m/ \G(.*?)endobj\s* /cgxms) 1045 { 1046 my $string = $1; 1047 $objnode = $self->parseAny(\$string, $objnum, $gennum); 1048 if ($string =~ m/ \Gstream /xms) 1049 { 1050 if ($objnode->{type} ne 'dictionary') 1051 { 1052 die "Found an object stream without a preceding dictionary\n" . $self->trimstr(${$c}); 1053 } 1054 $objnode->{value}->{StreamData} = $self->parseStream(\$string, $objnum, $gennum, $objnode->{value}); 1055 } 1056 } 1057 else 1058 { 1059 die "Expected endobj\n" . $self->trimstr(${$c}); 1060 } 1061 return CAM::PDF::Node->new('object', $objnode, $objnum, $gennum); 1062} 1063 1064 1065=item $doc->parseInlineImage($string) 1066 1067=item $doc->parseInlineImage($string, $objnum) 1068 1069=item $doc->parseInlineImage($string, $objnum, $gennum) 1070 1071Given a fragment of PDF page content, parse it and return an object 1072Node. This can be called as a class method in some cases, but 1073is intended as an instance method. 1074 1075=cut 1076 1077sub parseInlineImage 1078{ 1079 my $self = shift; 1080 my $c = shift; 1081 my $objnum = shift; 1082 my $gennum = shift; 1083 1084 if (${$c} !~ m/ \GBI\b /xms) 1085 { 1086 die "Expected inline image open tag\n" . $self->trimstr(${$c}); 1087 } 1088 my $dict = $self->parseDict($c, $objnum, $gennum, 'BI\\b\\s*', 'ID\\b'); 1089 $self->unabbrevInlineImage($dict); 1090 $dict->{value}->{Type} = CAM::PDF::Node->new('label', 'XObject', $objnum, $gennum); 1091 $dict->{value}->{Subtype} = CAM::PDF::Node->new('label', 'Image', $objnum, $gennum); 1092 $dict->{value}->{StreamData} = $self->parseStream($c, $objnum, $gennum, $dict->{value}, 1093 qr/ \s* /xms, qr/ \s*EI(?!\S) /xms); 1094 ${$c} =~ m/ \G\s+ /cgxms; 1095 1096 return CAM::PDF::Node->new('object', $dict, $objnum, $gennum); 1097} 1098 1099 1100=item $doc->writeInlineImage($objectnode) 1101 1102This is the inverse of parseInlineImage(), intended for use only in 1103the CAM::PDF::Content class. 1104 1105=cut 1106 1107sub writeInlineImage 1108{ 1109 my $self = shift; 1110 my $objnode = shift; 1111 1112 # Make a copy since we are going to trash the image 1113 my $dictobj = $self->copyObject($objnode)->{value}; 1114 1115 my $dict = $dictobj->{value}; 1116 delete $dict->{Type}; 1117 delete $dict->{Subtype}; 1118 my $stream = $dict->{StreamData}->{value}; 1119 delete $dict->{StreamData}; 1120 $self->abbrevInlineImage($dictobj); 1121 1122 my $str = $self->writeAny($dictobj); 1123 $str =~ s/ \A << /BI /xms; 1124 $str =~ s/ >> \z / ID/xms; 1125 $str .= "\n" . $stream . "\nEI"; 1126 return $str; 1127} 1128 1129=item $doc->parseStream($string, $objnum, $gennum, $dictnode) 1130 1131This should only be used by parseObj(), or other specialized cases. 1132 1133Given a fragment of PDF page content, parse it and return a stream 1134Node. This can be called as a class method in most circumstances, but 1135is intended as an instance method. 1136 1137The dictionary Node argument is typically the body of the object Node 1138that precedes this stream. 1139 1140=cut 1141 1142sub parseStream 1143{ 1144 my $self = shift; 1145 my $c = shift; 1146 my $objnum = shift; 1147 my $gennum = shift; 1148 my $dict = shift; 1149 1150 my $begin = shift || qr/ stream[ \t]*\r?\n /xms; 1151 my $end = shift || qr/ \s*endstream\s* /xms; 1152 1153 if (${$c} !~ m/ \G$begin /cgxms) 1154 { 1155 die "Expected stream open tag\n" . $self->trimstr(${$c}); 1156 } 1157 1158 my $stream; 1159 1160 my $l = $dict->{Length} || $dict->{L}; 1161 if (!defined $l) 1162 { 1163 if ($begin =~ m/ \Gstream /xms) 1164 { 1165 die "Missing stream length\n" . $self->trimstr(${$c}); 1166 } 1167 if (${$c} =~ m/ \G$begin(.*?)$end /cgxms) 1168 { 1169 $stream = $1; 1170 my $len = length $stream; 1171 $dict->{Length} = CAM::PDF::Node->new('number', $len, $objnum, $gennum); 1172 } 1173 else 1174 { 1175 die "Missing stream begin/end\n" . $self->trimstr(${$c}); 1176 } 1177 } 1178 else 1179 { 1180 my $length = $self->getValue($l); 1181 my $pos = pos ${$c}; 1182 $stream = substr ${$c}, $pos, $length; 1183 pos(${$c}) += $length; ## no critic(CodeLayout::ProhibitParensWithBuiltins) 1184 if (${$c} !~ m/ \G$end /cgxms) 1185 { 1186 die "Expected endstream\n" . $self->trimstr(${$c}); 1187 } 1188 } 1189 1190 if (ref $self) 1191 { 1192 # in the rare case of CAM::PDF::Content::_parseInlineImage, this 1193 # may be called as a class method, thus making the above test 1194 # necessary 1195 1196 if ($self->{crypt}) 1197 { 1198 $stream = $self->{crypt}->decrypt($self, $stream, $objnum, $gennum); 1199 } 1200 } 1201 1202 return CAM::PDF::Node->new('stream', $stream, $objnum, $gennum); 1203} 1204 1205=item $doc->parseDict($string) 1206 1207=item $doc->parseDict($string, $objnum) 1208 1209=item $doc->parseDict($string, $objnum, $gennum) 1210 1211Use parseAny() instead of this, if possible. 1212 1213Given a fragment of PDF page content, parse it and return an dictionary 1214Node. This can be called as a class method in most circumstances, but 1215is intended as an instance method. 1216 1217=cut 1218 1219sub parseDict 1220{ 1221 my $pkg_or_doc = shift; 1222 my $c = shift; 1223 my $objnum = shift; 1224 my $gennum = shift; 1225 1226 my $begin = shift || '<<\\s*'; 1227 my $end = shift || '>>\\s*'; 1228 1229 my $dict = {}; 1230 if (${$c} =~ m/ \G$begin /cgxms) 1231 { 1232 while (${$c} !~ m/ \G$end /cgxms) 1233 { 1234 #warn "looking for label:\n" . $pkg_or_doc->trimstr(${$c}); 1235 my $keyref = $pkg_or_doc->parseLabel($c, $objnum, $gennum); 1236 my $key = $keyref->{value}; 1237 #warn "looking for value:\n" . $pkg_or_doc->trimstr(${$c}); 1238 my $value = $pkg_or_doc->parseAny($c, $objnum, $gennum); 1239 $dict->{$key} = $value; 1240 } 1241 } 1242 1243 return CAM::PDF::Node->new('dictionary', $dict, $objnum, $gennum); 1244} 1245 1246=item $doc->parseArray($string) 1247 1248=item $doc->parseArray($string, $objnum) 1249 1250=item $doc->parseArray($string, $objnum, $gennum) 1251 1252Use parseAny() instead of this, if possible. 1253 1254Given a fragment of PDF page content, parse it and return an array 1255Node. This can be called as a class or instance method. 1256 1257=cut 1258 1259sub parseArray 1260{ 1261 my $pkg_or_doc = shift; 1262 my $c = shift; 1263 my $objnum = shift; 1264 my $gennum = shift; 1265 1266 my $array = []; 1267 if (${$c} =~ m/ \G\[\s* /cgxms) 1268 { 1269 while (${$c} !~ m/ \G\]\s* /cgxms) 1270 { 1271 #warn "looking for array value:\n" . $pkg_or_doc->trimstr(${$c}); 1272 push @{$array}, $pkg_or_doc->parseAny($c, $objnum, $gennum); 1273 } 1274 } 1275 1276 return CAM::PDF::Node->new('array', $array, $objnum, $gennum); 1277} 1278 1279=item $doc->parseLabel($string) 1280 1281=item $doc->parseLabel($string, $objnum) 1282 1283=item $doc->parseLabel($string, $objnum, $gennum) 1284 1285Use parseAny() instead of this, if possible. 1286 1287Given a fragment of PDF page content, parse it and return a label 1288Node. This can be called as a class or instance method. 1289 1290=cut 1291 1292sub parseLabel 1293{ 1294 my $pkg_or_doc = shift; 1295 my $c = shift; 1296 my $objnum = shift; 1297 my $gennum = shift; 1298 1299 my $label; 1300 if (${$c} =~ m{ \G/([^\s<>/\[\]()]+)\s* }cgxms) 1301 { 1302 $label = $1; 1303 } 1304 else 1305 { 1306 die "Expected identifier label:\n" . $pkg_or_doc->trimstr(${$c}); 1307 } 1308 return CAM::PDF::Node->new('label', $label, $objnum, $gennum); 1309} 1310 1311=item $doc->parseRef($string) 1312 1313=item $doc->parseRef($string, $objnum) 1314 1315=item $doc->parseRef($string, $objnum, $gennum) 1316 1317Use parseAny() instead of this, if possible. 1318 1319Given a fragment of PDF page content, parse it and return a reference 1320Node. This can be called as a class or instance method. 1321 1322=cut 1323 1324sub parseRef 1325{ 1326 my $pkg_or_doc = shift; 1327 my $c = shift; 1328 my $objnum = shift; 1329 my $gennum = shift; 1330 1331 my $newobjnum; 1332 if (${$c} =~ m/ \G(\d+)\s+\d+\s+R\s* /cgxms) 1333 { 1334 $newobjnum = int $1; 1335 } 1336 else 1337 { 1338 die "Expected object reference\n" . $pkg_or_doc->trimstr(${$c}); 1339 } 1340 return CAM::PDF::Node->new('reference', $newobjnum, $objnum, $gennum); 1341} 1342 1343=item $doc->parseNum($string) 1344 1345=item $doc->parseNum($string, $objnum) 1346 1347=item $doc->parseNum($string, $objnum, $gennum) 1348 1349Use parseAny() instead of this, if possible. 1350 1351Given a fragment of PDF page content, parse it and return a number 1352Node. This can be called as a class or instance method. 1353 1354=cut 1355 1356sub parseNum 1357{ 1358 my $pkg_or_doc = shift; 1359 my $c = shift; 1360 my $objnum = shift; 1361 my $gennum = shift; 1362 1363 my $value; 1364 if (${$c} =~ m/ \G([\d.+-]+)\s* /cgxms) 1365 { 1366 $value = $1; 1367 } 1368 else 1369 { 1370 die "Expected numerical constant\n" . $pkg_or_doc->trimstr(${$c}); 1371 } 1372 return CAM::PDF::Node->new('number', $value, $objnum, $gennum); 1373} 1374 1375=item $doc->parseString($string) 1376 1377=item $doc->parseString($string, $objnum) 1378 1379=item $doc->parseString($string, $objnum, $gennum) 1380 1381Use parseAny() instead of this, if possible. 1382 1383Given a fragment of PDF page content, parse it and return a string 1384Node. This can be called as a class or instance method. 1385 1386=cut 1387 1388sub parseString 1389{ 1390 my $pkg_or_doc = shift; 1391 my $c = shift; 1392 my $objnum = shift; 1393 my $gennum = shift; 1394 1395 my $value = q{}; 1396 if (${$c} =~ m/ \G [(] /cgxms) 1397 { 1398 # TODO: use Text::Balanced or Regexp::Common from CPAN?? 1399 1400 my $depth = 1; 1401 while ($depth > 0) 1402 { 1403 if (${$c} =~ m/ \G ([^()]*) ([()]) /cgxms) 1404 { 1405 my $string = $1; 1406 my $delim = $2; 1407 $value .= $string; 1408 1409 # Make sure this is not an escaped paren, OR an real paren 1410 # preceded by an escaped backslash! 1411 if ($string =~ m/ (\\+) \z/xms && 1 == (length $1) % 2) 1412 { 1413 $value .= $delim; 1414 } 1415 elsif ($delim eq '(') 1416 { 1417 $value .= $delim; 1418 $depth++; 1419 } 1420 elsif(--$depth > 0) 1421 { 1422 $value .= $delim; 1423 } 1424 } 1425 else 1426 { 1427 die "Expected string closing\n" . $pkg_or_doc->trimstr(${$c}); 1428 } 1429 } 1430 ${$c} =~ m/ \G\s* /cgxms; 1431 } 1432 else 1433 { 1434 die "Expected string opener\n" . $pkg_or_doc->trimstr(${$c}); 1435 } 1436 1437 # Unescape slash-escaped characters. Treat \\ specially. 1438 my @parts = split /\\\\|\\134/xms, $value, -1; 1439 for (@parts) 1440 { 1441 # concatenate continued lines 1442 s/ \\\r?\n //gxms; 1443 s/ \\\r //gxms; 1444 1445 # special characters 1446 s/ \\n /\n/gxms; 1447 s/ \\r /\r/gxms; 1448 s/ \\t /\t/gxms; 1449 s/ \\f /\f/gxms; 1450 s/ \\b /\x{8}/gxms; 1451 1452 # octal numbers 1453 s/ \\(\d{1,3}) /chr oct $1/gexms; 1454 1455 # Ignore all other slashes (i.e. following characters are treated literally) 1456 s/ \\ //gxms; 1457 } 1458 $value = join q{\\}, @parts; 1459 1460 if (ref $pkg_or_doc) 1461 { 1462 my $self = $pkg_or_doc; 1463 if ($self->{crypt}) 1464 { 1465 $value = $self->{crypt}->decrypt($self, $value, $objnum, $gennum); 1466 } 1467 } 1468 return CAM::PDF::Node->new('string', $value, $objnum, $gennum); 1469} 1470 1471=item $doc->parseHexString($string) 1472 1473=item $doc->parseHexString($string, $objnum) 1474 1475=item $doc->parseHexString($string, $objnum, $gennum) 1476 1477Use parseAny() instead of this, if possible. 1478 1479Given a fragment of PDF page content, parse it and return a hex string 1480Node. This can be called as a class or instance method. 1481 1482=cut 1483 1484sub parseHexString 1485{ 1486 my $pkg_or_doc = shift; 1487 my $c = shift; 1488 my $objnum = shift; 1489 my $gennum = shift; 1490 1491 my $str = q{}; 1492 if (${$c} =~ m/ \G<([\da-fA-F\s]*)>\s* /cgxms) 1493 { 1494 $str = $1; 1495 $str =~ s/\s+//gxms; 1496 my $len = length $str; 1497 if ($len % 2 == 1) 1498 { 1499 $str .= '0'; 1500 } 1501 $str = pack 'H*', $str; 1502 } 1503 else 1504 { 1505 die "Expected hex string\n" . $pkg_or_doc->trimstr(${$c}); 1506 } 1507 1508 if (ref $pkg_or_doc) 1509 { 1510 my $self = $pkg_or_doc; 1511 if ($self->{crypt}) 1512 { 1513 $str = $self->{crypt}->decrypt($self, $str, $objnum, $gennum); 1514 } 1515 } 1516 return CAM::PDF::Node->new('hexstring', $str, $objnum, $gennum); 1517} 1518 1519=item $doc->parseBoolean($string) 1520 1521=item $doc->parseBoolean($string, $objnum) 1522 1523=item $doc->parseBoolean($string, $objnum, $gennum) 1524 1525Use parseAny() instead of this, if possible. 1526 1527Given a fragment of PDF page content, parse it and return a boolean 1528Node. This can be called as a class or instance method. 1529 1530=cut 1531 1532sub parseBoolean 1533{ 1534 my $pkg_or_doc = shift; 1535 my $c = shift; 1536 my $objnum = shift; 1537 my $gennum = shift; 1538 1539 my $val = q{}; 1540 if (${$c} =~ m/ \G(true|false)\s* /cgxmsi) 1541 { 1542 $val = lc $1; 1543 } 1544 else 1545 { 1546 die "Expected boolean true or false keyword\n" . $pkg_or_doc->trimstr(${$c}); 1547 } 1548 1549 return CAM::PDF::Node->new('boolean', $val, $objnum, $gennum); 1550} 1551 1552=item $doc->parseNull($string) 1553 1554=item $doc->parseNull($string, $objnum) 1555 1556=item $doc->parseNull($string, $objnum, $gennum) 1557 1558Use parseAny() instead of this, if possible. 1559 1560Given a fragment of PDF page content, parse it and return a null 1561Node. This can be called as a class or instance method. 1562 1563=cut 1564 1565sub parseNull 1566{ 1567 my $pkg_or_doc = shift; 1568 my $c = shift; 1569 my $objnum = shift; 1570 my $gennum = shift; 1571 1572 my $val = q{}; 1573 if (${$c} =~ m/ \Gnull\s* /cgxmsi) 1574 { 1575 $val = undef; 1576 } 1577 else 1578 { 1579 die "Expected null keyword\n" . $pkg_or_doc->trimstr(${$c}); 1580 } 1581 1582 return CAM::PDF::Node->new('null', $val, $objnum, $gennum); 1583} 1584 1585=item $doc->parseAny($string) 1586 1587=item $doc->parseAny($string, $objnum) 1588 1589=item $doc->parseAny($string, $objnum, $gennum) 1590 1591Given a fragment of PDF page content, parse it and return a Node of 1592the appropriate type. This can be called as a class or instance 1593method. 1594 1595=cut 1596 1597sub parseAny 1598{ 1599 my $p = shift; # pkg or doc 1600 my $c = shift; 1601 my $objnum = shift; 1602 my $gennum = shift; 1603 1604 return ${$c} =~ m/ \G \d+\s+\d+\s+R\b /xms ? $p->parseRef( $c, $objnum, $gennum) 1605 : ${$c} =~ m{ \G / }xms ? $p->parseLabel( $c, $objnum, $gennum) 1606 : ${$c} =~ m/ \G << /xms ? $p->parseDict( $c, $objnum, $gennum) 1607 : ${$c} =~ m/ \G \[ /xms ? $p->parseArray( $c, $objnum, $gennum) 1608 : ${$c} =~ m/ \G [(] /xms ? $p->parseString( $c, $objnum, $gennum) 1609 : ${$c} =~ m/ \G < /xms ? $p->parseHexString($c, $objnum, $gennum) 1610 : ${$c} =~ m/ \G [\d.+-]+ /xms ? $p->parseNum( $c, $objnum, $gennum) 1611 : ${$c} =~ m/ \G (true|false) /ixms ? $p->parseBoolean( $c, $objnum, $gennum) 1612 : ${$c} =~ m/ \G null /ixms ? $p->parseNull( $c, $objnum, $gennum) 1613 : die "Unrecognized type in parseAny:\n" . $p->trimstr(${$c}); 1614} 1615 1616################################################################################ 1617 1618=back 1619 1620=head2 Data Accessors 1621 1622=over 1623 1624=item $doc->getValue($object) 1625 1626I<For INTERNAL use> 1627 1628Dereference a data object, return a value. Given an node object 1629of any kind, returns raw scalar object: hashref, arrayref, string, 1630number. This function follows all references, and descends into all 1631objects. 1632 1633=cut 1634 1635sub getValue 1636{ 1637 my $self = shift; 1638 my $objnode = shift; 1639 1640 return if (! ref $objnode); 1641 1642 while ($objnode->{type} eq 'reference' || $objnode->{type} eq 'object') 1643 { 1644 if ($objnode->{type} eq 'reference') 1645 { 1646 my $objnum = $objnode->{value}; 1647 $objnode = $self->dereference($objnum); 1648 } 1649 elsif ($objnode->{type} eq 'object') 1650 { 1651 $objnode = $objnode->{value}; 1652 } 1653 return if (! ref $objnode); 1654 } 1655 1656 return $objnode->{value}; 1657} 1658 1659=item $doc->getObjValue($objectnum) 1660 1661I<For INTERNAL use> 1662 1663Dereference a data object, and return a value. Behaves just like the 1664getValue() function, but used when all you know is the object number. 1665 1666=cut 1667 1668sub getObjValue 1669{ 1670 my $self = shift; 1671 my $objnum = shift; 1672 1673 return $self->getValue(CAM::PDF::Node->new('reference', $objnum)); 1674} 1675 1676 1677=item $doc->dereference($objectnum) 1678 1679=item $doc->dereference($name, $pagenum) 1680 1681I<For INTERNAL use> 1682 1683Dereference a data object, return a PDF object as a node. This 1684function makes heavy use of the internal object cache. Most (if not 1685all) object requests should go through this function. 1686 1687C<$name> should look something like '/R12'. 1688 1689=cut 1690 1691sub dereference 1692{ 1693 my $self = shift; 1694 my $key = shift; 1695 my $pagenum = shift; # only used if $key is a named resource 1696 1697 if ($key =~ s/ \A\/ //xms) # strip off the leading slash while testing 1698 { 1699 # This is a request for a named object 1700 $self->_buildNameTable($pagenum); 1701 $key = $self->{Names}->{$pagenum}->{$key}; 1702 return if (!defined $key); 1703 # $key should now point to a 'reference' object 1704 if ((ref $key) ne 'CAM::PDF::Node') 1705 { 1706 die "Assertion failed: key is a reference object in dereference\n"; 1707 } 1708 $key = $key->{value}; 1709 } 1710 1711 $key = int $key; 1712 if (!exists $self->{objcache}->{$key}) 1713 { 1714 #print "Filling cache for obj \#$key...\n"; 1715 1716 my $pos = $self->{xref}->{$key}; 1717 1718 if (!$pos) 1719 { 1720 warn "Bad request for object $key at position 0 in the file\n"; 1721 return; 1722 } 1723 1724 my $content_fragment; 1725 if (ref $pos) 1726 { 1727 $content_fragment = substr $pos->{objstream}->{stream}, $pos->{start}, $pos->{end}; 1728 $content_fragment = "$key 0 obj\n$content_fragment\nendobj\n"; 1729 } 1730 else 1731 { 1732 # This is fastest and safest 1733 if (!exists $self->{endxref}) 1734 { 1735 $self->_buildendxref(); 1736 } 1737 my $endpos = $self->{endxref}->{$key}; 1738 if (!defined $endpos || $endpos < $pos) 1739 { 1740 # really slow, but a totally safe fallback 1741 $endpos = $self->{contentlength}; 1742 } 1743 1744 $content_fragment = substr $self->{content}, $pos, $endpos - $pos; 1745 } 1746 $self->{objcache}->{$key} = $self->parseObj(\$content_fragment, $key); 1747 } 1748 1749 return $self->{objcache}->{$key}; 1750} 1751 1752 1753=item $doc->getPropertyNames($pagenum) 1754 1755=item $doc->getProperty($pagenum, $propertyname) 1756 1757Each PDF page contains a list of resources that it uses (images, 1758fonts, etc). getPropertyNames() returns an array of the names of 1759those resources. getProperty() returns a node representing a 1760named property (most likely a reference node). 1761 1762=cut 1763 1764sub getPropertyNames 1765{ 1766 my $self = shift; 1767 my $pagenum = shift; 1768 1769 $self->_buildNameTable($pagenum); 1770 my $props = $self->{Names}->{$pagenum}; 1771 return if (!defined $props); 1772 return keys %{$props}; 1773} 1774sub getProperty 1775{ 1776 my $self = shift; 1777 my $pagenum = shift; 1778 my $name = shift; 1779 1780 $self->_buildNameTable($pagenum); 1781 my $props = $self->{Names}->{$pagenum}; 1782 return if (!defined $props); 1783 return if (!defined $name); 1784 return $props->{$name}; 1785} 1786 1787=item $doc->getFont($pagenum, $fontname) 1788 1789I<For INTERNAL use> 1790 1791Returns a dictionary for a given font identified by its label, 1792referenced by page. 1793 1794=cut 1795 1796sub getFont 1797{ 1798 my $self = shift; 1799 my $pagenum = shift; 1800 my $fontname = shift; 1801 1802 $fontname =~ s/ \A\/? /\//xms; # add leading slash if needed 1803 my $objnode = $self->dereference($fontname, $pagenum); 1804 return if (!$objnode); 1805 1806 my $dict = $self->getValue($objnode); 1807 if ($dict && $dict->{Type} && $dict->{Type}->{value} eq 'Font') 1808 { 1809 return $dict; 1810 } 1811 else 1812 { 1813 return; 1814 } 1815} 1816 1817=item $doc->getFontNames($pagenum) 1818 1819I<For INTERNAL use> 1820 1821Returns a list of fonts for a given page. 1822 1823=cut 1824 1825sub getFontNames 1826{ 1827 my $self = shift; 1828 my $pagenum = shift; 1829 1830 $self->_buildNameTable($pagenum); 1831 my $list = $self->{Names}->{$pagenum}; 1832 my @names; 1833 if ($list) 1834 { 1835 for my $key (keys %{$list}) 1836 { 1837 my $dict = $self->getValue($list->{$key}); 1838 if ($dict && $dict->{Type} && $dict->{Type}->{value} eq 'Font') 1839 { 1840 push @names, $key; 1841 } 1842 } 1843 } 1844 return @names; 1845} 1846 1847 1848=item $doc->getFonts($pagenum) 1849 1850I<For INTERNAL use> 1851 1852Returns an array of font objects for a given page. 1853 1854=cut 1855 1856sub getFonts 1857{ 1858 my $self = shift; 1859 my $pagenum = shift; 1860 1861 $self->_buildNameTable($pagenum); 1862 my $list = $self->{Names}->{$pagenum}; 1863 my @fonts; 1864 if ($list) 1865 { 1866 for my $key (keys %{$list}) 1867 { 1868 my $dict = $self->getValue($list->{$key}); 1869 if ($dict && $dict->{Type} && $dict->{Type}->{value} eq 'Font') 1870 { 1871 push @fonts, $dict; 1872 } 1873 } 1874 } 1875 return @fonts; 1876} 1877 1878=item $doc->getFontByBaseName($pagenum, $fontname) 1879 1880I<For INTERNAL use> 1881 1882Returns a dictionary for a given font, referenced by page and the name 1883of the base font. 1884 1885=cut 1886 1887sub getFontByBaseName 1888{ 1889 my $self = shift; 1890 my $pagenum = shift; 1891 my $fontname = shift; 1892 1893 $self->_buildNameTable($pagenum); 1894 my $list = $self->{Names}->{$pagenum}; 1895 for my $key (keys %{$list}) 1896 { 1897 my $num = $list->{$key}->{value}; 1898 my $objnode = $self->dereference($num); 1899 my $dict = $self->getValue($objnode); 1900 if ($dict && 1901 $dict->{Type} && $dict->{Type}->{value} eq 'Font' && 1902 $dict->{BaseFont} && $dict->{BaseFont}->{value} eq $fontname) 1903 { 1904 return $dict; 1905 } 1906 } 1907 return; 1908} 1909 1910=item $doc->getFontMetrics($properties $fontname) 1911 1912I<For INTERNAL use> 1913 1914Returns a data structure representing the font metrics for the named 1915font. The property list is the results of something like the 1916following: 1917 1918 $self->_buildNameTable($pagenum); 1919 my $properties = $self->{Names}->{$pagenum}; 1920 1921Alternatively, if you know the page number, it might be easier to do: 1922 1923 my $font = $self->dereference($fontlabel, $pagenum); 1924 my $fontmetrics = $font->{value}->{value}; 1925 1926where the C<$fontlabel> is something like '/Helv'. The getFontMetrics() 1927method is useful in the cases where you've forgotten which page number 1928you are working on (e.g. in CAM::PDF::GS), or if your property list 1929isn't part of any page (e.g. working with form field annotation 1930objects). 1931 1932=cut 1933 1934sub getFontMetrics 1935{ 1936 my $self = shift; 1937 my $props = shift; 1938 my $fontname = shift; 1939 1940 my $fontmetrics; 1941 1942 #print STDERR "looking for font $fontname\n"; 1943 1944 # Sometimes we are passed just the object list, sometimes the whole 1945 # properties data structure 1946 if ($props->{Font}) 1947 { 1948 $props = $self->getValue($props->{Font}); 1949 } 1950 1951 if ($props->{$fontname}) 1952 { 1953 my $fontdict = $self->getValue($props->{$fontname}); 1954 if ($fontdict && $fontdict->{Type} && $fontdict->{Type}->{value} eq 'Font') 1955 { 1956 $fontmetrics = $fontdict; 1957 #print STDERR "Got font\n"; 1958 } 1959 else 1960 { 1961 #print STDERR "Almost got font\n"; 1962 } 1963 } 1964 else 1965 { 1966 #print STDERR "No font with that name in the dict\n"; 1967 } 1968 #print STDERR "Failed to get font\n" if (!$fontmetrics); 1969 return $fontmetrics; 1970} 1971 1972=item $doc->addFont($pagenum, $fontname, $fontlabel) 1973 1974=item $doc->addFont($pagenum, $fontname, $fontlabel, $fontmetrics) 1975 1976Adds a reference to the specified font to the page. 1977 1978If a font metrics hash is supplied (it is required for a font other 1979than the 14 core fonts), then it is cloned and inserted into the new 1980font structure. Note that if those font metrics contain references 1981(e.g. to the C<FontDescriptor>), the referred objects are not copied -- 1982you must do that part yourself. 1983 1984For Type1 fonts, the font metrics must minimally contain the following 1985fields: C<Subtype>, C<FirstChar>, C<LastChar>, C<Widths>, 1986C<FontDescriptor>. 1987 1988=cut 1989 1990sub addFont 1991{ 1992 my $self = shift; 1993 my $pagenum = shift; 1994 my $name = shift; 1995 my $label = shift; 1996 my $fontmetrics = shift; # optional 1997 1998 # Check if this font already exists 1999 my $page = $self->getPage($pagenum); 2000 if (exists $page->{Resources}) 2001 { 2002 my $r = $self->getValue($page->{Resources}); 2003 if (exists $r->{Font}) 2004 { 2005 my $f = $self->getValue($r->{Font}); 2006 if (exists $f->{$label}) 2007 { 2008 # Font already exists. Skip. 2009 return $self; 2010 } 2011 } 2012 } 2013 2014 # Build the font 2015 my $dict = CAM::PDF::Node->new('dictionary', 2016 { 2017 Type => CAM::PDF::Node->new('label', 'Font'), 2018 Name => CAM::PDF::Node->new('label', $label), 2019 BaseFont => CAM::PDF::Node->new('label', $name), 2020 }, 2021 ); 2022 if ($fontmetrics) 2023 { 2024 my $copy = $self->copyObject($fontmetrics); 2025 for my $key (keys %{$copy}) 2026 { 2027 if (!$dict->{value}->{$key}) 2028 { 2029 $dict->{value}->{$key} = $copy->{$key}; 2030 } 2031 } 2032 } 2033 else 2034 { 2035 $dict->{value}->{Subtype} = CAM::PDF::Node->new('label', 'Type1'); 2036 } 2037 2038 # Add the font to the document 2039 my $fontobjnum = $self->appendObject(undef, CAM::PDF::Node->new('object', $dict), 0); 2040 2041 # Add the font to the page 2042 my ($objnum,$gennum) = $self->getPageObjnum($pagenum); 2043 if (!exists $page->{Resources}) 2044 { 2045 $page->{Resources} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum); 2046 } 2047 my $r = $self->getValue($page->{Resources}); 2048 if (!exists $r->{Font}) 2049 { 2050 $r->{Font} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum); 2051 } 2052 my $f = $self->getValue($r->{Font}); 2053 $f->{$label} = CAM::PDF::Node->new('reference', $fontobjnum, $objnum, $gennum); 2054 2055 delete $self->{Names}->{$pagenum}; # decache 2056 $self->{changes}->{$objnum} = 1; 2057 return $self; 2058} 2059 2060=item $doc->deEmbedFont($pagenum, $fontname) 2061 2062=item $doc->deEmbedFont($pagenum, $fontname, $basefont) 2063 2064Removes embedded font data, leaving font reference intact. Returns 2065true if the font exists and 1) font is not embedded or 2) embedded 2066data was successfully discarded. Returns false if the font does not 2067exist, or the embedded data could not be discarded. 2068 2069The optional C<$basefont> parameter allows you to change the font. This 2070is useful when some applications embed a standard font (see below) and 2071give it a funny name, like C<SYLXNP+Helvetica>. In this example, it's 2072important to change the basename back to the standard C<Helvetica> when 2073de-embedding. 2074 2075De-embedding the font does NOT remove it from the PDF document, it 2076just removes references to it. To get a size reduction by throwing 2077away unused font data, you should use the following code sometime 2078after this method. 2079 2080 $self->cleanse(); 2081 2082For reference, the standard fonts are C<Times-Roman>, C<Helvetica>, and 2083C<Courier> (and their bold, italic and bold-italic forms) plus C<Symbol> and 2084C<Zapfdingbats>. (Adobe PDF Reference v1.4, p.319) 2085 2086=cut 2087 2088sub deEmbedFont 2089{ 2090 my $self = shift; 2091 my $pagenum = shift; 2092 my $fontname = shift; 2093 my $basefont = shift; 2094 2095 my $success; 2096 my $font = $self->getFont($pagenum, $fontname); 2097 if ($font) 2098 { 2099 $self->_deEmbedFontObj($font, $basefont); 2100 $success = $self; 2101 } 2102 else 2103 { 2104 $success = undef; 2105 } 2106 return $success; 2107} 2108 2109=item $doc->deEmbedFontByBaseName($pagenum, $fontname) 2110 2111=item $doc->deEmbedFontByBaseName($pagenum, $fontname, $basefont) 2112 2113Just like deEmbedFont(), except that the font name parameter refers to 2114the name of the current base font instead of the PDF label for the 2115font. 2116 2117=cut 2118 2119sub deEmbedFontByBaseName 2120{ 2121 my $self = shift; 2122 my $pagenum = shift; 2123 my $fontname = shift; 2124 my $basefont = shift; 2125 2126 my $success; 2127 my $font = $self->getFontByBaseName($pagenum, $fontname); 2128 if ($font) 2129 { 2130 $self->_deEmbedFontObj($font, $basefont); 2131 $success = $self; 2132 } 2133 else 2134 { 2135 $success = undef; 2136 } 2137 return $success; 2138} 2139 2140sub _deEmbedFontObj 2141{ 2142 my $self = shift; 2143 my $font = shift; 2144 my $basefont = shift; 2145 2146 if ($basefont) 2147 { 2148 $font->{BaseFont} = CAM::PDF::Node->new('label', $basefont); 2149 } 2150 delete $font->{FontDescriptor}; 2151 delete $font->{Widths}; 2152 delete $font->{FirstChar}; 2153 delete $font->{LastChar}; 2154 $self->{changes}->{$font->{Type}->{objnum}} = 1; 2155 return; 2156} 2157 2158=item $doc->wrapString($string, $width, $fontsize, $fontmetrics) 2159 2160=item $doc->wrapString($string, $width, $fontsize, $pagenum, $fontlabel) 2161 2162Returns an array of strings wrapped to the specified width. 2163 2164=cut 2165 2166sub wrapString ## no critic (Unpack) 2167{ 2168 my $self = shift; 2169 my $string = shift; 2170 my $width = shift; 2171 my $size = shift; 2172 2173 my $fm; 2174 if (defined $_[0] && ref $_[0]) 2175 { 2176 $fm = shift; 2177 } 2178 else 2179 { 2180 my $pagenum = shift; 2181 my $fontlabel = shift; 2182 $fm = $self->getFont($pagenum, $fontlabel); 2183 } 2184 2185 $string =~ s/ \r\n /\n/gxms; 2186 # no split limit, so trailing null strings are omitted 2187 my @strings = split /[\r\n]/xms, $string; 2188 my @out; 2189 $width /= $size; 2190 #print STDERR 'wrapping '.join('|',@strings)."\n"; 2191 for my $s (@strings) 2192 { 2193 $s =~ s/ \s+\z //xms; 2194 my $w = $self->getStringWidth($fm, $s); 2195 if ($w <= $width) 2196 { 2197 push @out, $s; 2198 } 2199 else 2200 { 2201 my $cur; 2202 if ($s =~ s/ \A(\s*) //xms) 2203 { 2204 $cur = $1; 2205 } 2206 my $curw = $cur eq q{} ? 0 : $self->getStringWidth($fm, $cur); 2207 while ($s) 2208 { 2209 my ($sp,$wd); 2210 if ($s =~ s/ \A(\s*)(\S*) //xms) 2211 { 2212 ($sp,$wd) = ($1,$2); 2213 } 2214 my $wwd = $wd eq q{} ? 0 : $self->getStringWidth($fm, $wd); 2215 if ($curw == 0) 2216 { 2217 $cur = $wd; 2218 $curw = $wwd; 2219 } 2220 else 2221 { 2222 my $wsp = $sp eq q{} ? 0 : $self->getStringWidth($fm, $sp); 2223 if ($curw + $wsp + $wwd <= $width) 2224 { 2225 $cur .= $sp . $wd; 2226 $curw += $wsp + $wwd; 2227 } 2228 else 2229 { 2230 push @out, $cur; 2231 $cur = $wd; 2232 $curw = $wwd; 2233 } 2234 } 2235 } 2236 if (0 < length $cur) 2237 { 2238 push @out, $cur; 2239 } 2240 } 2241 } 2242 #print STDERR 'wrapped to '.join('|',@out)."\n"; 2243 return @out; 2244} 2245 2246=item $doc->getStringWidth($fontmetrics, $string) 2247 2248I<For INTERNAL use> 2249 2250Returns the width of the string, using the font metrics if possible. 2251 2252=cut 2253 2254sub getStringWidth 2255{ 2256 my $self = shift; 2257 my $fontmetrics = shift; 2258 my $string = shift; 2259 2260 if (! defined $string || $string eq q{}) 2261 { 2262 return 0; 2263 } 2264 2265 my $width = 0; 2266 if ($fontmetrics) 2267 { 2268 if ($fontmetrics->{Widths}) 2269 { 2270 my $firstc = $self->getValue($fontmetrics->{FirstChar}); 2271 my $lastc = $self->getValue($fontmetrics->{LastChar}); 2272 my $widths = $self->getValue($fontmetrics->{Widths}); 2273 my $missing_width; # populate this on demand 2274 CHAR: 2275 for my $char (unpack 'C*', $string) 2276 { 2277 if ($char >= $firstc && $char <= $lastc) 2278 { 2279 $width += $widths->[$char - $firstc]->{value}; 2280 next CHAR; 2281 } 2282 2283 if (!defined $missing_width) 2284 { 2285 my $fd = exists $fontmetrics->{FontDescriptor} ? 2286 $self->getValue($fontmetrics->{FontDescriptor}) : undef; 2287 $missing_width = $fd && exists $fd->{MissingWidth} ? 2288 $self->getValue($fd->{MissingWidth}) : 0; 2289 } 2290 2291 $width += $missing_width; 2292 } 2293 $width /= 1000.0; # units conversion 2294 } 2295 elsif ($fontmetrics->{BaseFont}) 2296 { 2297 my $fontname = $self->getValue($fontmetrics->{BaseFont}); 2298 if (!exists $self->{fontmetrics}->{$fontname}) 2299 { 2300 require Text::PDF::SFont; 2301 require Text::PDF::File; 2302 my $pdf = Text::PDF::File->new(); 2303 $self->{fontmetrics}->{$fontname} = 2304 Text::PDF::SFont->new($pdf, $fontname, 'NULL'); 2305 } 2306 if ($self->{fontmetrics}->{$fontname}) 2307 { 2308 $width = $self->{fontmetrics}->{$fontname}->width($string); 2309 } 2310 } 2311 else 2312 { 2313 warn 'Failed to understand this font'; 2314 } 2315 } 2316 2317 if ($width == 0) 2318 { 2319 # HACK!!! 2320 #warn "Using klugy width!\n"; 2321 $width = 0.2 * length $string; 2322 } 2323 2324 return $width; 2325} 2326 2327=item $doc->numPages() 2328 2329Returns the number of pages in the PDF document. 2330 2331=cut 2332 2333sub numPages 2334{ 2335 my $self = shift; 2336 return $self->{PageCount}; 2337} 2338 2339=item $doc->getPage($pagenum) 2340 2341I<For INTERNAL use> 2342 2343Returns a dictionary for a given numbered page. 2344 2345=cut 2346 2347sub getPage 2348{ 2349 my $self = shift; 2350 my $pagenum = shift; 2351 2352 if ($pagenum < 1 || $pagenum > $self->{PageCount}) 2353 { 2354 warn "Invalid page number requested: $pagenum\n"; 2355 return; 2356 } 2357 2358 if (!exists $self->{pagecache}->{$pagenum}) 2359 { 2360 my $node = $self->getPagesDict(); 2361 my $nodestart = 1; 2362 while ($self->getValue($node->{Type}) eq 'Pages') 2363 { 2364 my $kids = $self->getValue($node->{Kids}); 2365 if ((ref $kids) ne 'ARRAY') 2366 { 2367 die "Error: \@kids is not an array\n"; 2368 } 2369 my $child = 0; 2370 if (@{$kids} == 1) 2371 { 2372 # Do the simple case first: 2373 $child = 0; 2374 # nodestart is unchanged 2375 } 2376 else 2377 { 2378 # search through all kids EXCEPT don't bother looking at 2379 # the last one because that is surely the right one if all 2380 # the others are wrong. 2381 2382 while ($child < $#{$kids}) 2383 { 2384 # the first leaf of the kid is the page we want. It 2385 # doesn't matter if the kid is a leaf or a node. 2386 last if ($pagenum == $nodestart); 2387 2388 # Retrieve the dictionary of this child 2389 my $sub = $self->getValue($kids->[$child]); 2390 if ($sub->{Type}->{value} ne 'Pages') 2391 { 2392 # Its a leaf, and not the right one. Move on. 2393 $nodestart++; 2394 } 2395 else 2396 { 2397 my $count = $self->getValue($sub->{Count}); 2398 2399 # The page we want is in this kid. Descend. 2400 last if ($nodestart + $count - 1 >= $pagenum); 2401 2402 # Not in this kid. Move on. 2403 $nodestart += $count; 2404 } 2405 $child++; 2406 } 2407 } 2408 2409 $node = $self->getValue($kids->[$child]); 2410 if (! ref $node) 2411 { 2412 require Data::Dumper; 2413 cluck Data::Dumper::Dumper($node); 2414 } 2415 } 2416 2417 # Ok, now we've got the right page. Store it. 2418 $self->{pagecache}->{$pagenum} = $node; 2419 } 2420 2421 return $self->{pagecache}->{$pagenum}; 2422} 2423 2424=item $doc->getPageObjnum($pagenum) 2425 2426I<For INTERNAL use> 2427 2428Return the number of the PDF object in which the specified page occurs. 2429 2430=cut 2431 2432sub getPageObjnum 2433{ 2434 my $self = shift; 2435 my $pagenum = shift; 2436 2437 my $page = $self->getPage($pagenum); 2438 return if (!$page); 2439 my ($anyobj) = values %{$page}; 2440 if (!$anyobj) 2441 { 2442 die "Internal error: page has no attributes!!!\n"; 2443 } 2444 if (wantarray) 2445 { 2446 return ($anyobj->{objnum}, $anyobj->{gennum}); 2447 } 2448 else 2449 { 2450 return $anyobj->{objnum}; 2451 } 2452} 2453 2454=item $doc->getPageText($pagenum) 2455 2456Extracts the text from a PDF page as a string. 2457 2458=cut 2459 2460sub getPageText 2461{ 2462 my $self = shift; 2463 my $pagenum = shift; 2464 my $verbose = shift; 2465 2466 my $pagetree = $self->getPageContentTree($pagenum, $verbose); 2467 if (!$pagetree) 2468 { 2469 return; 2470 } 2471 2472 require CAM::PDF::PageText; 2473 return CAM::PDF::PageText->render($pagetree, $verbose); 2474} 2475 2476=item $doc->getPageContentTree($pagenum) 2477 2478Retrieves a parsed page content data structure, or undef if there is a 2479syntax error or if the page does not exist. 2480 2481=cut 2482 2483sub getPageContentTree 2484{ 2485 my $self = shift; 2486 my $pagenum = shift; 2487 my $verbose = shift; 2488 2489 my $content = $self->getPageContent($pagenum); 2490 return if (!defined $content); 2491 2492 $self->_buildNameTable($pagenum); 2493 2494 my $page = $self->getPage($pagenum); 2495 my $box = [0, 0, 612, 792]; 2496 if ($page->{MediaBox}) 2497 { 2498 my $mediabox = $self->getValue($page->{MediaBox}); 2499 $box->[0] = $self->getValue($mediabox->[0]); 2500 $box->[1] = $self->getValue($mediabox->[1]); 2501 $box->[2] = $self->getValue($mediabox->[2]); 2502 $box->[3] = $self->getValue($mediabox->[3]); 2503 } 2504 2505 require CAM::PDF::Content; 2506 my $tree = CAM::PDF::Content->new($content, { 2507 doc => $self, 2508 properties => $self->{Names}->{$pagenum}, 2509 mediabox => $box, 2510 }, $verbose); 2511 2512 return $tree; 2513} 2514 2515=item $doc->getPageContent($pagenum) 2516 2517Return a string with the layout contents of one page. 2518 2519=cut 2520 2521sub getPageContent 2522{ 2523 my $self = shift; 2524 my $pagenum = shift; 2525 2526 my $page = $self->getPage($pagenum); 2527 if (!$page || !exists $page->{Contents}) 2528 { 2529 return q{}; 2530 } 2531 2532 my $contents = $self->getValue($page->{Contents}); 2533 2534 if (! ref $contents) 2535 { 2536 return $contents; 2537 } 2538 elsif ((ref $contents) eq 'HASH') 2539 { 2540 # doesn't matter if it's not encoded... 2541 return $self->decodeOne(CAM::PDF::Node->new('dictionary', $contents)); 2542 } 2543 elsif ((ref $contents) eq 'ARRAY') 2544 { 2545 my $stream = q{}; 2546 for my $arrobj (@{$contents}) 2547 { 2548 my $streamdata = $self->getValue($arrobj); 2549 if (! ref $streamdata) 2550 { 2551 $stream .= $streamdata; 2552 } 2553 elsif ((ref $streamdata) eq 'HASH') 2554 { 2555 $stream .= $self->decodeOne(CAM::PDF::Node->new('dictionary',$streamdata)); # doesn't matter if it's not encoded... 2556 } 2557 else 2558 { 2559 die "Unexpected content type for page contents\n"; 2560 } 2561 } 2562 return $stream; 2563 } 2564 else 2565 { 2566 die "Unexpected content type for page contents\n"; 2567 } 2568 return; # should never get here 2569} 2570 2571=item $doc->getPageDimensions($pagenum) 2572 2573Returns an array of C<x>, C<y>, C<width> and C<height> numbers that 2574define the dimensions of the specified page in points (1/72 inches). 2575Technically, this is the C<MediaBox> dimensions, which explains why 2576it's possible for C<x> and C<y> to be non-zero, but that's a rare 2577case. 2578 2579For example, given a simple 8.5 by 11 inch page, this method will return 2580C<(0,0,612,792)>. 2581 2582This method will die() if the specified page number does not exist. 2583 2584=cut 2585 2586sub getPageDimensions 2587{ 2588 my $self = shift; 2589 my $pagenum = shift; 2590 my $pagedict = shift; # only used during recursion 2591 2592 if (!$pagedict) 2593 { 2594 $pagedict = $self->getPage($pagenum); 2595 if (!$pagedict) 2596 { 2597 die 'No such page '.$pagenum; 2598 } 2599 } 2600 2601 if (exists $pagedict->{MediaBox}) 2602 { 2603 my $box = $self->getValue($pagedict->{MediaBox}); 2604 return ($self->getValue($box->[0]), 2605 $self->getValue($box->[1]), 2606 $self->getValue($box->[2]), 2607 $self->getValue($box->[3])); 2608 } 2609 elsif (exists $pagedict->{Parent}) 2610 { 2611 return $self->getPageDimensions($pagenum, $self->getValue($pagedict->{Parent})); 2612 } 2613 else 2614 { 2615 die 'Failed to find the page dimensions'; 2616 } 2617 return; # never gets here 2618} 2619 2620=item $doc->getName($object) 2621 2622I<For INTERNAL use> 2623 2624Given a PDF object reference, return it's name, if it has one. This 2625is useful for indirect references to images in particular. 2626 2627=cut 2628 2629sub getName 2630{ 2631 my $self = shift; 2632 my $objnode = shift; 2633 2634 if ($objnode->{value}->{type} eq 'dictionary') 2635 { 2636 my $dict = $objnode->{value}->{value}; 2637 if (exists $dict->{Name}) 2638 { 2639 return $self->getValue($dict->{Name}); 2640 } 2641 } 2642 return q{}; 2643} 2644 2645=item $doc->getPrefs() 2646 2647Return an array of security information for the document: 2648 2649 owner password 2650 user password 2651 print boolean 2652 modify boolean 2653 copy boolean 2654 add boolean 2655 2656See the PDF reference for the intended use of the latter four booleans. 2657 2658This module publishes the array indices of these values for your 2659convenience: 2660 2661 $CAM::PDF::PREF_OPASS 2662 $CAM::PDF::PREF_UPASS 2663 $CAM::PDF::PREF_PRINT 2664 $CAM::PDF::PREF_MODIFY 2665 $CAM::PDF::PREF_COPY 2666 $CAM::PDF::PREF_ADD 2667 2668So, you can retrieve the value of the Copy boolean via: 2669 2670 my ($canCopy) = ($self->getPrefs())[$CAM::PDF::PREF_COPY]; 2671 2672=cut 2673 2674sub getPrefs 2675{ 2676 my $self = shift; 2677 2678 my @p = (1,1,1,1); 2679 if (exists $self->{crypt}->{P}) 2680 { 2681 @p = $self->{crypt}->decode_permissions($self->{crypt}->{P}); 2682 } 2683 return($self->{crypt}->{opass}, $self->{crypt}->{upass}, @p); 2684} 2685 2686=item $doc->canPrint() 2687 2688Return a boolean indicating whether the Print permission is enabled 2689on the PDF. 2690 2691=cut 2692 2693sub canPrint 2694{ 2695 my $self = shift; 2696 return ($self->getPrefs())[$PREF_PRINT]; 2697} 2698 2699=item $doc->canModify() 2700 2701Return a boolean indicating whether the Modify permission is enabled 2702on the PDF. 2703 2704=cut 2705 2706sub canModify 2707{ 2708 my $self = shift; 2709 return ($self->getPrefs())[$PREF_MODIFY]; 2710} 2711 2712=item $doc->canCopy() 2713 2714Return a boolean indicating whether the Copy permission is enabled 2715on the PDF. 2716 2717=cut 2718 2719sub canCopy 2720{ 2721 my $self = shift; 2722 return ($self->getPrefs())[$PREF_COPY]; 2723} 2724 2725=item $doc->canAdd() 2726 2727Return a boolean indicating whether the Add permission is enabled 2728on the PDF. 2729 2730=cut 2731 2732sub canAdd 2733{ 2734 my $self = shift; 2735 return ($self->getPrefs())[$PREF_ADD]; 2736} 2737 2738=item $doc->getFormFieldList() 2739 2740Return an array of the names of all of the PDF form fields. The names 2741are the full hierarchical names constructed as explained in the PDF 2742reference manual. These names are useful for the fillFormFields() 2743function. 2744 2745=cut 2746 2747sub getFormFieldList 2748{ 2749 my $self = shift; 2750 my $parentname = shift; # very optional 2751 2752 my $prefix = (defined $parentname ? $parentname . q{.} : q{}); 2753 2754 my $kidlist; 2755 if (defined $parentname && $parentname ne q{}) 2756 { 2757 my $parent = $self->getFormField($parentname); 2758 return if (!$parent); 2759 my $dict = $self->getValue($parent); 2760 return if (!exists $dict->{Kids}); 2761 $kidlist = $self->getValue($dict->{Kids}); 2762 } 2763 else 2764 { 2765 my $root = $self->getRootDict()->{AcroForm}; 2766 return if (!$root); 2767 my $parent = $self->getValue($root); 2768 return if (!exists $parent->{Fields}); 2769 $kidlist = $self->getValue($parent->{Fields}); 2770 } 2771 2772 my @list; 2773 for my $kid (@{$kidlist}) 2774 { 2775 if ((! ref $kid) || (ref $kid) ne 'CAM::PDF::Node' || $kid->{type} ne 'reference') 2776 { 2777 die "Expected a reference as the form child of '$parentname'\n"; 2778 } 2779 my $objnode = $self->dereference($kid->{value}); 2780 my $dict = $self->getValue($objnode); 2781 my $name = '(no name)'; # assume the worst 2782 if (exists $dict->{T}) 2783 { 2784 $name = $self->getValue($dict->{T}); 2785 } 2786 $name = $prefix . $name; 2787 push @list, $name; 2788 if (exists $dict->{TU}) 2789 { 2790 push @list, $prefix . $self->getValue($dict->{TU}) . ' (alternate name)'; 2791 } 2792 $self->{formcache}->{$name} = $objnode; 2793 my @kidnames = $self->getFormFieldList($name); 2794 if (@kidnames > 0) 2795 { 2796 #push @list, 'descend...'; 2797 push @list, @kidnames; 2798 #push @list, 'ascend...'; 2799 } 2800 } 2801 return @list; 2802} 2803 2804=item $doc->getFormField($name) 2805 2806I<For INTERNAL use> 2807 2808Return the object containing the form field definition for the 2809specified field name. C<$name> can be either the full name or the 2810"short/alternate" name. 2811 2812=cut 2813 2814sub getFormField 2815{ 2816 my $self = shift; 2817 my $fieldname = shift; 2818 2819 return if (!defined $fieldname); 2820 2821 if (! exists $self->{formcache}->{$fieldname}) 2822 { 2823 my $kidlist; 2824 my $parent; 2825 if ($fieldname =~ m/ [.] /xms) 2826 { 2827 my $parentname; 2828 if ($fieldname =~ s/ \A(.*)[.]([.]+)\z /$2/xms) 2829 { 2830 $parentname = $1; 2831 } 2832 return if (!$parentname); 2833 $parent = $self->getFormField($parentname); 2834 return if (!$parent); 2835 my $dict = $self->getValue($parent); 2836 return if (!exists $dict->{Kids}); 2837 $kidlist = $self->getValue($dict->{Kids}); 2838 } 2839 else 2840 { 2841 my $root = $self->getRootDict()->{AcroForm}; 2842 return if (!$root); 2843 $parent = $self->dereference($root->{value}); 2844 return if (!$parent); 2845 my $dict = $self->getValue($parent); 2846 return if (!exists $dict->{Fields}); 2847 $kidlist = $self->getValue($dict->{Fields}); 2848 } 2849 2850 $self->{formcache}->{$fieldname} = undef; # assume the worst... 2851 for my $kid (@{$kidlist}) 2852 { 2853 my $objnode = $self->dereference($kid->{value}); 2854 $objnode->{formparent} = $parent; 2855 my $dict = $self->getValue($objnode); 2856 if (exists $dict->{T}) 2857 { 2858 $self->{formcache}->{$self->getValue($dict->{T})} = $objnode; 2859 } 2860 if (exists $dict->{TU}) 2861 { 2862 $self->{formcache}->{$self->getValue($dict->{TU})} = $objnode; 2863 } 2864 } 2865 } 2866 2867 return $self->{formcache}->{$fieldname}; 2868} 2869 2870=item $doc->getFormFieldDict($formfieldobject) 2871 2872I<For INTERNAL use> 2873 2874Return a hash reference representing the accumulated property list for 2875a form field, including all of it's inherited properties. This should 2876be treated as a read-only hash! It ONLY retrieves the properties it 2877knows about. 2878 2879=cut 2880 2881sub getFormFieldDict 2882{ 2883 my $self = shift; 2884 my $field = shift; 2885 2886 return if (!defined $field); 2887 2888 my $dict = {}; 2889 if ($field->{formparent}) 2890 { 2891 $dict = $self->getFormFieldDict($field->{formparent}); 2892 } 2893 my $olddict = $self->getValue($field); 2894 2895 if ($olddict->{DR}) 2896 { 2897 $dict->{DR} ||= CAM::PDF::Node->new('dictionary', {}); 2898 my $dr = $self->getValue($dict->{DR}); 2899 my $olddr = $self->getValue($olddict->{DR}); 2900 for my $key (keys %{$olddr}) 2901 { 2902 if ($dr->{$key}) 2903 { 2904 if ($key eq 'Font') 2905 { 2906 my $fonts = $self->getValue($olddr->{$key}); 2907 for my $font (keys %{$fonts}) 2908 { 2909 $dr->{$key}->{$font} = $self->copyObject($fonts->{$font}); 2910 } 2911 } 2912 else 2913 { 2914 warn "Unknown resource key '$key' in form field dictionary"; 2915 } 2916 } 2917 else 2918 { 2919 $dr->{$key} = $self->copyObject($olddr->{$key}); 2920 } 2921 } 2922 } 2923 2924 # Some properties are simple: inherit means override 2925 for my $prop (qw(Q DA Ff V FT)) 2926 { 2927 if ($olddict->{$prop}) 2928 { 2929 $dict->{$prop} = $self->copyObject($olddict->{$prop}); 2930 } 2931 } 2932 2933 return $dict; 2934} 2935 2936################################################################################ 2937 2938=back 2939 2940=head2 Data/Object Manipulation 2941 2942=over 2943 2944=item $doc->setPrefs($ownerpass, $userpass, $print?, $modify?, $copy?, $add?) 2945 2946Alter the document's security information. Note that modifying these 2947parameters must be done respecting the intellectual property of the 2948original document. See Adobe's statement in the introduction of the 2949reference manual. 2950 2951B<Important Note:> Most PDF readers (Acrobat, Preview.app) only offer 2952one password field for opening documents. So, if the C<$ownerpass> 2953and C<$userpass> are different, those applications cannot read the 2954documents. (Perhaps this is a bug in CAM::PDF?) 2955 2956Note: any omitted booleans default to false. So, these two are 2957equivalent: 2958 2959 $doc->setPrefs('password', 'password'); 2960 $doc->setPrefs('password', 'password', 0, 0, 0, 0); 2961 2962=cut 2963 2964sub setPrefs 2965{ 2966 my ($self, @prefs) = @_; 2967 2968 my $p = $self->{crypt}->encode_permissions(@prefs[2..5]); 2969 $self->{crypt}->set_passwords($self, @prefs[0..1], $p); 2970 return; 2971} 2972 2973=item $doc->setName($object, $name) 2974 2975I<For INTERNAL use> 2976 2977Change the name of a PDF object structure. 2978 2979=cut 2980 2981sub setName 2982{ 2983 my $self = shift; 2984 my $objnode = shift; 2985 my $name = shift; 2986 2987 if ($name && $objnode->{value}->{type} eq 'dictionary') 2988 { 2989 $objnode->{value}->{value}->{Name} = CAM::PDF::Node->new('label', $name, $objnode->{objnum}, $objnode->{gennum}); 2990 if ($objnode->{objnum}) 2991 { 2992 $self->{changes}->{$objnode->{objnum}} = 1; 2993 } 2994 return $self; 2995 } 2996 return; 2997} 2998 2999=item $doc->removeName($object) 3000 3001I<For INTERNAL use> 3002 3003Delete the name of a PDF object structure. 3004 3005=cut 3006 3007sub removeName 3008{ 3009 my $self = shift; 3010 my $objnode = shift; 3011 3012 if ($objnode->{value}->{type} eq 'dictionary' && exists $objnode->{value}->{value}->{Name}) 3013 { 3014 delete $objnode->{value}->{value}->{Name}; 3015 if ($objnode->{objnum}) 3016 { 3017 $self->{changes}->{$objnode->{objnum}} = 1; 3018 } 3019 return $self; 3020 } 3021 return; 3022} 3023 3024 3025=item $doc->pageAddName($pagenum, $name, $objectnum) 3026 3027I<For INTERNAL use> 3028 3029Append a named object to the metadata for a given page. 3030 3031=cut 3032 3033sub pageAddName 3034{ 3035 my $self = shift; 3036 my $pagenum = shift; 3037 my $name = shift; 3038 my $key = shift; 3039 3040 $self->_buildNameTable($pagenum); 3041 my $page = $self->getPage($pagenum); 3042 my ($objnum, $gennum) = $self->getPageObjnum($pagenum); 3043 3044 if (!exists $self->{NameObjects}->{$pagenum}) 3045 { 3046 if ($objnum) 3047 { 3048 $self->{changes}->{$objnum} = 1; 3049 } 3050 if (!exists $page->{Resources}) 3051 { 3052 $page->{Resources} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum); 3053 } 3054 my $r = $self->getValue($page->{Resources}); 3055 if (!exists $r->{XObject}) 3056 { 3057 $r->{XObject} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum); 3058 } 3059 $self->{NameObjects}->{$pagenum} = $self->getValue($r->{XObject}); 3060 } 3061 3062 $self->{NameObjects}->{$pagenum}->{$name} = CAM::PDF::Node->new('reference', $key, $objnum, $gennum); 3063 if ($objnum) 3064 { 3065 $self->{changes}->{$objnum} = 1; 3066 } 3067 return; 3068} 3069 3070=item $doc->setPageContent($pagenum, $content) 3071 3072=item $doc->setPageContent($pagenum, $tree->toString) 3073 3074Replace the content of the specified page with a new version. This 3075function is often used after the getPageContent() function and some 3076manipulation of the returned string from that function. 3077 3078If your content is a parsed tree (i.e. the result of 3079getPageContentTree) then you should serialize it via toString first. 3080 3081=cut 3082 3083sub setPageContent 3084{ 3085 my $self = shift; 3086 my $pagenum = shift; 3087 my $content = shift; 3088 3089 # Note that this *could* be implemented as 3090 # delete current content 3091 # appendPageContent 3092 # but that would lose the optimization below of reusing the content 3093 # object, where possible 3094 3095 my $page = $self->getPage($pagenum); 3096 3097 my $stream = $self->createStreamObject($content, 'FlateDecode'); 3098 if ($page->{Contents} && $page->{Contents}->{type} eq 'reference') 3099 { 3100 my $key = $page->{Contents}->{value}; 3101 $self->replaceObject($key, undef, $stream, 0); 3102 } 3103 else 3104 { 3105 my ($objnum, $gennum) = $self->getPageObjnum($pagenum); 3106 my $key = $self->appendObject(undef, $stream, 0); 3107 $page->{Contents} = CAM::PDF::Node->new('reference', $key, $objnum, $gennum); 3108 $self->{changes}->{$objnum} = 1; 3109 } 3110 return; 3111} 3112 3113=item $doc->appendPageContent($pagenum, $content) 3114 3115Add more content to the specified page. Note that this function does 3116NOT do any page metadata work for you (like creating font objects for 3117any newly defined fonts). 3118 3119=cut 3120 3121sub appendPageContent 3122{ 3123 my $self = shift; 3124 my $pagenum = shift; 3125 my $content = shift; 3126 3127 my $page = $self->getPage($pagenum); 3128 3129 my ($objnum, $gennum) = $self->getPageObjnum($pagenum); 3130 my $stream = $self->createStreamObject($content, 'FlateDecode'); 3131 my $key = $self->appendObject(undef, $stream, 0); 3132 my $streamref = CAM::PDF::Node->new('reference', $key, $objnum, $gennum); 3133 3134 if (!$page->{Contents}) 3135 { 3136 $page->{Contents} = $streamref; 3137 } 3138 elsif ($page->{Contents}->{type} eq 'array') 3139 { 3140 push @{$page->{Contents}->{value}}, $streamref; 3141 } 3142 elsif ($page->{Contents}->{type} eq 'reference') 3143 { 3144 $page->{Contents} = CAM::PDF::Node->new('array', [ $page->{Contents}, $streamref ], $objnum, $gennum); 3145 } 3146 else 3147 { 3148 die "Unsupported Content type \"$page->{Contents}->{type}\" on page $pagenum\n"; 3149 } 3150 $self->{changes}->{$objnum} = 1; 3151 return; 3152} 3153 3154=item $doc->extractPages($pages...) 3155 3156Remove all pages from the PDF except the specified ones. Like 3157deletePages(), the pages can be multiple arguments, comma separated 3158lists, ranges (open or closed). 3159 3160=cut 3161 3162sub extractPages ## no critic (Unpack) 3163{ 3164 my $self = shift; 3165 return $self if (@_ == 0); # no-work shortcut 3166 my @pages = $self->rangeToArray(1,$self->numPages(),@_); 3167 3168 if (@pages == 0) 3169 { 3170 croak 'Tried to delete all the pages'; 3171 } 3172 3173 my %pages = map {$_ => 1} @pages; # eliminate duplicates 3174 3175 # make a list that is the complement of the @pages list 3176 my @delete = grep {!$pages{$_}} 1..$self->numPages(); 3177 3178 return $self if (@delete == 0); # no-work shortcut 3179 return $self->_deletePages(@delete); 3180} 3181 3182=item $doc->deletePages($pages...) 3183 3184Remove the specified pages from the PDF. The pages can be multiple 3185arguments, comma separated lists, ranges (open or closed). 3186 3187=cut 3188 3189sub deletePages ## no critic (Unpack) 3190{ 3191 my $self = shift; 3192 return $self if (@_ == 0); # no-work shortcut 3193 my @pages = $self->rangeToArray(1,$self->numPages(),@_); 3194 3195 return $self if (@pages == 0); # no-work shortcut 3196 3197 my %pages = map {$_ => 1} @pages; # eliminate duplicates 3198 3199 if ($self->numPages() == scalar keys %pages) 3200 { 3201 croak 'Tried to delete all the pages'; 3202 } 3203 3204 return $self->_deletePages(keys %pages); 3205} 3206 3207sub _deletePages 3208{ 3209 my ($self, @pages) = @_; 3210 3211 # Pages should be reverse sorted since we need to delete from the 3212 # end to make the page numbers come out right. 3213 my @objnums; 3214 for my $page (reverse sort {$a <=> $b} @pages) 3215 { 3216 my $objnum = $self->_deletePage($page); 3217 if (!$objnum) 3218 { 3219 $self->_deleteRefsToPages(@objnums); # emergency cleanup to prevent corruption 3220 return; 3221 } 3222 push @objnums, $objnum; 3223 } 3224 $self->_deleteRefsToPages(@objnums); 3225 $self->cleanse(); 3226 return $self; 3227} 3228 3229=item $doc->deletePage($pagenum) 3230 3231Remove the specified page from the PDF. If the PDF has only one page, 3232this method will fail. 3233 3234=cut 3235 3236sub deletePage 3237{ 3238 my $self = shift; 3239 my $pagenum = shift; 3240 3241 my $objnum = $self->_deletePage($pagenum); 3242 if ($objnum) 3243 { 3244 $self->_deleteRefsToPages($objnum); 3245 $self->cleanse(); 3246 } 3247 return $objnum ? $self : (); 3248} 3249 3250# Internal method, called by deletePage() or deletePages() 3251# Returns the objnum of the deleted page 3252 3253sub _deletePage 3254{ 3255 my $self = shift; 3256 my $pagenum = shift; 3257 3258 if ($self->numPages() <= 1) # don't delete the last page 3259 { 3260 croak 'Tried to delete the only page'; 3261 } 3262 my ($objnum, $gennum) = $self->getPageObjnum($pagenum); 3263 if (!defined $objnum) 3264 { 3265 croak 'Tried to delete a non-existent page'; 3266 } 3267 3268 $self->_deletePage_backPointers($objnum); 3269 $self->_deletePage_removeFromPageTree($pagenum); 3270 3271 # Removing the page is easy: 3272 $self->deleteObject($objnum); 3273 3274 # Caches are now bad for all pages from this one 3275 $self->decachePages($pagenum .. $self->numPages()); 3276 3277 $self->{PageCount}--; 3278 3279 return $objnum; 3280} 3281 3282 3283sub _deletePage_backPointers 3284{ 3285 my $self = shift; 3286 my $objnum = shift; 3287 3288 # Delete pointer from annotation back to the page 3289 my $page = $self->dereference($objnum); 3290 my $pagedict = $page->{value}->{value}; 3291 if ($pagedict->{Annots}) 3292 { 3293 my $annots = $self->getValue($pagedict->{Annots}); 3294 if ($annots) 3295 { 3296 for my $annotref (@{$annots}) 3297 { 3298 my $annot = $self->getValue($annotref); 3299 if ($annot) 3300 { 3301 delete $annot->{P}; 3302 } 3303 } 3304 } 3305 } 3306 return; 3307} 3308 3309sub _deletePage_removeFromPageTree 3310{ 3311 my $self = shift; 3312 my $pagenum = shift; 3313 3314 # Removing references to the page is hard: 3315 # (much of this code is lifted from getPage) 3316 my $parentdict; 3317 my $node = $self->dereference($self->getRootDict()->{Pages}->{value}); 3318 my $nodedict = $node->{value}->{value}; 3319 my $nodestart = 1; 3320 while ($node && $nodedict->{Type}->{value} eq 'Pages') 3321 { 3322 my $count; 3323 if ($nodedict->{Count}->{type} eq 'reference') 3324 { 3325 my $countobj = $self->dereference($nodedict->{Count}->{value}); 3326 $count = $countobj->{value}->{value}--; 3327 $self->{changes}->{$countobj->{objnum}} = 1; 3328 } 3329 else 3330 { 3331 $count = $nodedict->{Count}->{value}--; 3332 } 3333 $self->{changes}->{$node->{objnum}} = 1; 3334 3335 if ($count == 1) 3336 { 3337 # only one left, so this is it 3338 if (!$parentdict) 3339 { 3340 croak 'Tried to delete the only page'; 3341 } 3342 my $parentkids = $self->getValue($parentdict->{Kids}); 3343 @{$parentkids} = grep {$_->{value} != $node->{objnum}} @{$parentkids}; 3344 $self->{changes}->{$parentdict->{Kids}->{objnum}} = 1; 3345 $self->deleteObject($node->{objnum}); 3346 last; 3347 } 3348 3349 my $kids = $self->getValue($nodedict->{Kids}); 3350 if (@{$kids} == 1) 3351 { 3352 # Count was not 1, so this must not be a leaf node 3353 # hop down into node's child 3354 3355 my $sub = $self->dereference($kids->[0]->{value}); 3356 my $subdict = $sub->{value}->{value}; 3357 $parentdict = $nodedict; 3358 $node = $sub; 3359 $nodedict = $subdict; 3360 } 3361 else 3362 { 3363 # search through all kids 3364 for my $child (0 .. $#{$kids}) 3365 { 3366 my $sub = $self->dereference($kids->[$child]->{value}); 3367 my $subdict = $sub->{value}->{value}; 3368 3369 if ($subdict->{Type}->{value} ne 'Pages') 3370 { 3371 if ($pagenum == $nodestart) 3372 { 3373 # Got it! 3374 splice @{$kids}, $child, 1; 3375 $node = undef; # flag that we are done 3376 last; 3377 } 3378 else 3379 { 3380 # Its a leaf, and not the right one. Move on. 3381 $nodestart++; 3382 } 3383 } 3384 else 3385 { 3386 # Type=='Pages' node 3387 my $child_count = $self->getValue($subdict->{Count}); 3388 if ($nodestart + $child_count - 1 >= $pagenum) 3389 { 3390 # The page we want is in this kid. Descend. 3391 $parentdict = $nodedict; 3392 $node = $sub; 3393 $nodedict = $subdict; 3394 last; 3395 } 3396 else 3397 { 3398 # Not in this kid. Move on. 3399 $nodestart += $child_count; 3400 } 3401 } 3402 if ($child == $#{$kids}) 3403 { 3404 die "Internal error: did not find the page to delete -- corrupted page index\n"; 3405 } 3406 } 3407 } 3408 } 3409 return; 3410} 3411 3412sub _deleteRefsToPages 3413{ 3414 my ($self, @objnums) = @_; 3415 my %objnums = map {$_ => 1} @objnums; 3416 3417 my $root = $self->getRootDict(); 3418 if ($root->{Names}) 3419 { 3420 my $names = $self->getValue($root->{Names}); 3421 if ($names->{Dests}) 3422 { 3423 my $dests = $self->getValue($names->{Dests}); 3424 if ($self->_deleteDests($dests, \%objnums)) 3425 { 3426 delete $names->{Dests}; 3427 } 3428 } 3429 3430 if (0 == scalar keys %{$names}) 3431 { 3432 my $names_objnum = $root->{Names}->{value}; 3433 $self->deleteObject($names_objnum); 3434 delete $root->{Names}; 3435 } 3436 } 3437 3438 if ($root->{Outlines}) 3439 { 3440 my $outlines = $self->getValue($root->{Outlines}); 3441 $self->_deleteOutlines($outlines, \%objnums); 3442 } 3443 return; 3444} 3445 3446sub _deleteOutlines 3447{ 3448 my $self = shift; 3449 my $outlines = shift; 3450 my $objnums = shift; 3451 3452 my @deletes; 3453 my @stack = ($outlines); 3454 3455 while (@stack > 0) 3456 { 3457 my $node = shift @stack; 3458 3459 # Check for a Destination (aka internal hyperlink) 3460 # A is indirect ref, Dest is direct ref; only one can be present 3461 my $dest; 3462 if ($node->{A}) 3463 { 3464 $dest = $self->getValue($node->{A}); 3465 $dest = $self->getValue($dest->{D}); 3466 } 3467 elsif ($node->{Dest}) 3468 { 3469 $dest = $self->getValue($node->{Dest}); 3470 } 3471 if ($dest && (ref $dest) && (ref $dest) eq 'ARRAY') 3472 { 3473 my $ref = $dest->[0]; 3474 if ($ref && $ref->{type} eq 'reference' && $objnums->{$ref->{value}}) 3475 { 3476 $self->deleteObject($ref->{objnum}); 3477 # Easier to just delete both, even though only one may exist 3478 delete $node->{A}; 3479 delete $node->{Dest}; 3480 } 3481 } 3482 3483 if ($node->{Next}) 3484 { 3485 push @stack, $self->getValue($node->{Next}); 3486 } 3487 if ($node->{First}) 3488 { 3489 push @stack, $self->getValue($node->{First}); 3490 } 3491 } 3492 return; 3493} 3494 3495sub _deleteDests ## no critic(Subroutines::ProhibitExcessComplexity) 3496{ 3497 my $self = shift; 3498 my $dests = shift; 3499 my $objnums = shift; 3500 3501 ## Accumulate the nodes to delete 3502 my @deletes; 3503 my @stack = ([$dests]); 3504 3505 while (@stack > 0) 3506 { 3507 my $chain = pop @stack; 3508 my $node = $chain->[0]; 3509 if ($node->{Names}) 3510 { 3511 my $pairs = $self->getValue($node->{Names}); 3512 for (my $i=1; $i<@{$pairs}; $i+=2) ## no critic(ControlStructures::ProhibitCStyleForLoops) 3513 { 3514 push @stack, [$self->getValue($pairs->[$i]), @{$chain}]; 3515 } 3516 } 3517 elsif ($node->{Kids}) 3518 { 3519 my $list = $self->getValue($node->{Kids}); 3520 push @stack, map {[$self->getValue($_), @{$chain}]} @{$list}; 3521 } 3522 elsif ($node->{D}) 3523 { 3524 my $props = $self->getValue($node->{D}); 3525 my $ref = $props->[0]; 3526 if ($ref && $ref->{type} eq 'reference' && $objnums->{$ref->{value}}) 3527 { 3528 push @deletes, $chain; 3529 } 3530 } 3531 } 3532 3533 ## Delete the nodes, and their parents if applicable 3534 for my $chain (@deletes) 3535 { 3536 my $objnode = shift @{$chain}; 3537 my $objnum = [values %{$objnode}]->[0]->{objnum}; 3538 if (!$objnum) 3539 { 3540 die 'Destination object lacks an object number (number '.@{$chain}.' in the chain)'; 3541 } 3542 $self->deleteObject($objnum); 3543 3544 # Ascend chain... $objnum gets overwritten 3545 3546 CHAIN: 3547 for my $node (@{$chain}) 3548 { 3549 last if (exists $node->{deleted}); # internal flag 3550 3551 my $node_objnum = [values %{$node}]->[0]->{objnum} || die; 3552 3553 if ($node->{Names}) 3554 { 3555 my $pairs = $self->getValue($node->{Names}); 3556 my $limits = $self->getValue($node->{Limits}); 3557 my $redo_limits = 0; 3558 3559 # Find and remove child reference 3560 # iterate over keys of key-value array 3561 for (my $i=@{$pairs}-2; $i>=0; $i-=2) ## no critic(ControlStructures::ProhibitCStyleForLoops) 3562 { 3563 if ($pairs->[$i+1]->{value} == $objnum) 3564 { 3565 my $name = $pairs->[$i]->{value} || die 'No name in Name tree'; 3566 splice @{$pairs}, $i, 2; 3567 if ($limits->[0]->{value} eq $name || $limits->[1]->{value} eq $name) 3568 { 3569 $redo_limits = 1; 3570 } 3571 } 3572 } 3573 3574 if (@{$pairs} > 0) 3575 { 3576 if ($redo_limits) 3577 { 3578 my @names; 3579 for (my $i=0; $i<@{$pairs}; $i+=2) ## no critic(ControlStructures::ProhibitCStyleForLoops) 3580 { 3581 push @names, $pairs->[$i]->{value}; 3582 } 3583 @names = sort @names; 3584 $limits->[0]->{value} = $names[0]; 3585 $limits->[1]->{value} = $names[-1]; 3586 } 3587 last CHAIN; 3588 } 3589 } 3590 3591 elsif ($node->{Kids}) 3592 { 3593 my $list = $self->getValue($node->{Kids}); 3594 3595 # Find and remove child reference 3596 for my $i (reverse 0 .. $#{$list}) 3597 { 3598 if ($list->[$i]->{value} == $objnum) 3599 { 3600 splice @{$list}, $i, 1; 3601 } 3602 } 3603 3604 if (@{$list} > 0) 3605 { 3606 if ($node->{Limits}) 3607 { 3608 my $limits = $self->getValue($node->{Limits}); 3609 if (!$limits || @{$limits} != 2) 3610 { 3611 die 'Internal error: trouble parsing the Limits array in a name tree'; 3612 } 3613 my @names; 3614 for my $i (0..$#{$list}) 3615 { 3616 my $child = $self->getValue($list->[$i]); 3617 my $child_limits = $self->getValue($child->{Limits}); 3618 push @names, map {$_->{value}} @{$child_limits}; 3619 } 3620 @names = sort @names; 3621 $limits->[0]->{value} = $names[0]; 3622 $limits->[1]->{value} = $names[-1]; 3623 } 3624 last CHAIN; 3625 } 3626 } 3627 3628 else 3629 { 3630 die 'Internal error: found a parent node with neither Names nor Kids. This should be impossible.'; 3631 } 3632 3633 # If we got here, the node is empty, so delete it and move onward 3634 $self->deleteObject($node_objnum); 3635 $node->{deleted} = undef; # internal flag 3636 3637 # Prepare for next iteration 3638 $objnum = $node_objnum; 3639 } 3640 } 3641 3642 return exists $dests->{deleted}; 3643} 3644 3645=item $doc->decachePages($pagenum, $pagenum, ...) 3646 3647Clears cached copies of the specified page data structures. This is 3648useful if an operation has been performed that changes a page. 3649 3650=cut 3651 3652sub decachePages 3653{ 3654 my ($self, @pages) = @_; 3655 3656 for (@pages) 3657 { 3658 delete $self->{pagecache}->{$_}; 3659 delete $self->{Names}->{$_}; 3660 delete $self->{NameObjects}->{$_}; 3661 } 3662 delete $self->{Names}->{All}; 3663 return $self; 3664} 3665 3666 3667=item $doc->addPageResources($pagenum, $resourcehash) 3668 3669Add the resources from the given object to the page resource 3670dictionary. If the page does not have a resource dictionary, create 3671one. This function avoids duplicating resources where feasible. 3672 3673=cut 3674 3675sub addPageResources 3676{ 3677 my $self = shift; 3678 my $pagenum = shift; 3679 my $newrsrcs = shift; 3680 3681 return if (!$newrsrcs); 3682 my $page = $self->getPage($pagenum); 3683 return if (!$page); 3684 3685 my ($anyobj) = values %{$page}; 3686 my $objnum = $anyobj->{objnum}; 3687 my $gennum = $anyobj->{gennum}; 3688 3689 my $pagersrcs; 3690 if ($page->{Resources}) 3691 { 3692 $pagersrcs = $self->getValue($page->{Resources}); 3693 } 3694 else 3695 { 3696 $pagersrcs = {}; 3697 $page->{Resources} = CAM::PDF::Node->new('dictionary', $pagersrcs, $objnum, $gennum); 3698 $self->{changes}->{$objnum} = 1; 3699 } 3700 for my $type (keys %{$newrsrcs}) 3701 { 3702 my $new_r = $self->getValue($newrsrcs->{$type}); 3703 my $page_r; 3704 if ($pagersrcs->{$type}) 3705 { 3706 $page_r = $self->getValue($pagersrcs->{$type}); 3707 } 3708 if ($type eq 'Font') 3709 { 3710 if (!$page_r) 3711 { 3712 $page_r = {}; 3713 $pagersrcs->{$type} = CAM::PDF::Node->new('dictionary', $page_r, $objnum, $gennum); 3714 $self->{changes}->{$objnum} = 1; 3715 } 3716 for my $font (keys %{$new_r}) 3717 { 3718 next if (exists $page_r->{$font}); 3719 my $val = $new_r->{$font}; 3720 if ($val->{type} ne 'reference') 3721 { 3722 die 'Internal error: font entry is not a reference'; 3723 } 3724 $page_r->{$font} = CAM::PDF::Node->new('reference', $val->{value}, $objnum, $gennum); 3725 $self->{changes}->{$objnum} = 1; 3726 } 3727 } 3728 elsif ($type eq 'ProcSet') 3729 { 3730 if (!$page_r) 3731 { 3732 $page_r = []; 3733 $pagersrcs->{$type} = CAM::PDF::Node->new('array', $page_r, $objnum, $gennum); 3734 $self->{changes}->{$objnum} = 1; 3735 } 3736 for my $proc (@{$new_r}) 3737 { 3738 if ($proc->{type} ne 'label') 3739 { 3740 die 'Internal error: procset entry is not a label'; 3741 } 3742 { 3743 ## no critic(BuiltinFunctions::ProhibitBooleanGrep) -- TODO: use any() instead 3744 next if (grep {$_->{value} eq $proc->{value}} @{$page_r}); 3745 } 3746 push @{$page_r}, CAM::PDF::Node->new('label', $proc->{value}, $objnum, $gennum); 3747 $self->{changes}->{$objnum} = 1; 3748 } 3749 } 3750 elsif ($type eq 'Encoding') 3751 { 3752 # TODO: is this a hack or is it right? 3753 # EXPLICITLY skip /Encoding from form DR entry 3754 } 3755 else 3756 { 3757 warn "Internal error: unsupported resource type '$type'"; 3758 } 3759 } 3760 return; 3761} 3762 3763=item $doc->appendPDF($pdf) 3764 3765Append pages from another PDF document to this one. No optimization 3766is done -- the pieces are just appended and the internal table of 3767contents is updated. 3768 3769Note that this can break documents with annotations. See the 3770F<appendpdf.pl> script for a workaround. 3771 3772=cut 3773 3774sub appendPDF 3775{ 3776 my $self = shift; 3777 my $otherdoc = shift; 3778 my $prepend = shift; # boolean, default false 3779 3780 my $pageroot = $self->getPagesDict(); 3781 my ($anyobj) = values %{$pageroot}; 3782 my $objnum = $anyobj->{objnum}; 3783 my $gennum = $anyobj->{gennum}; 3784 3785 my $root = $self->getRootDict(); 3786 my $otherroot = $otherdoc->getRootDict(); 3787 my $otherpageobj = $otherdoc->dereference($otherroot->{Pages}->{value}); 3788 my ($key, %refkeys) = $self->appendObject($otherdoc, $otherpageobj->{objnum}, 1); 3789 my $subpage = $self->getObjValue($key); 3790 3791 my $newdict = {}; 3792 my $newpage = CAM::PDF::Node->new('object', 3793 CAM::PDF::Node->new('dictionary', $newdict)); 3794 $newdict->{Type} = CAM::PDF::Node->new('label', 'Pages'); 3795 $newdict->{Kids} = CAM::PDF::Node->new('array', 3796 [ 3797 CAM::PDF::Node->new('reference', $prepend ? $key : $objnum), 3798 CAM::PDF::Node->new('reference', $prepend ? $objnum : $key), 3799 ]); 3800 $self->{PageCount} += $otherdoc->{PageCount}; 3801 $newdict->{Count} = CAM::PDF::Node->new('number', $self->{PageCount}); 3802 my $newpagekey = $self->appendObject(undef, $newpage, 0); 3803 $root->{Pages}->{value} = $newpagekey; 3804 3805 $pageroot->{Parent} = CAM::PDF::Node->new('reference', $newpagekey, $key, $subpage->{gennum}); 3806 $subpage->{Parent} = CAM::PDF::Node->new('reference', $newpagekey, $key, $subpage->{gennum}); 3807 3808 if ($otherroot->{AcroForm}) 3809 { 3810 my $forms = $otherdoc->getValue($otherdoc->getValue($otherroot->{AcroForm})->{Fields}); 3811 my @newforms; 3812 for my $reference (@{$forms}) 3813 { 3814 if ($reference->{type} ne 'reference') 3815 { 3816 die 'Internal error: expected a reference'; 3817 } 3818 my $newkey = $refkeys{$reference->{value}}; 3819 if ($newkey) 3820 { 3821 push @newforms, CAM::PDF::Node->new('reference', $newkey); 3822 } 3823 } 3824 if ($root->{AcroForm}) 3825 { 3826 my $mainforms = $self->getValue($self->getValue($root->{AcroForm})->{Fields}); 3827 for my $reference (@newforms) 3828 { 3829 $reference->{objnum} = $mainforms->[0]->{objnum}; 3830 $reference->{gennum} = $mainforms->[0]->{gennum}; 3831 } 3832 push @{$mainforms}, @newforms; 3833 } 3834 else 3835 { 3836 die 'adding new forms is not implemented'; 3837 } 3838 } 3839 3840 if ($prepend) 3841 { 3842 # clear caches 3843 $self->{pagecache} = {}; 3844 $self->{Names} = {}; 3845 $self->{NameObjects} = {}; 3846 } 3847 3848 return $key; 3849} 3850 3851=item $doc->prependPDF($pdf) 3852 3853Just like appendPDF() except the new document is inserted on page 1 3854instead of at the end. 3855 3856=cut 3857 3858sub prependPDF 3859{ 3860 my ($self, @args) = @_; 3861 return $self->appendPDF(@args, 1); 3862} 3863 3864=item $doc->duplicatePage($pagenum) 3865 3866=item $doc->duplicatePage($pagenum, $leaveblank) 3867 3868Inserts an identical copy of the specified page into the document. 3869The new page's number will be C<$pagenum + 1>. 3870 3871If C<$leaveblank> is true, the new page does not get any content. 3872Thus, the document is broken until you subsequently call 3873setPageContent(). 3874 3875=cut 3876 3877sub duplicatePage 3878{ 3879 my $self = shift; 3880 my $pagenum = shift; 3881 my $leave_blank = shift || 0; 3882 3883 my $page = $self->getPage($pagenum); 3884 my $objnum = $self->getPageObjnum($pagenum); 3885 my $newobjnum = $self->appendObject($self, $objnum, 0); 3886 my $newdict = $self->getObjValue($newobjnum); 3887 delete $newdict->{Contents}; 3888 my $parent = $self->getValue($page->{Parent}); 3889 push @{$self->getValue($parent->{Kids})}, CAM::PDF::Node->new('reference', $newobjnum); 3890 3891 while ($parent) 3892 { 3893 $self->{changes}->{$parent->{Count}->{objnum}} = 1; 3894 if ($parent->{Count}->{type} eq 'reference') 3895 { 3896 my $countobj = $self->dereference($parent->{Count}->{value}); 3897 $countobj->{value}->{value}++; 3898 $self->{changes}->{$countobj->{objnum}} = 1; 3899 } 3900 else 3901 { 3902 $parent->{Count}->{value}++; 3903 } 3904 $parent = $self->getValue($parent->{Parent}); 3905 } 3906 $self->{PageCount}++; 3907 3908 if (!$leave_blank) 3909 { 3910 $self->setPageContent($pagenum+1, $self->getPageContent($pagenum)); 3911 } 3912 3913 # Caches are now bad for all pages from this one 3914 $self->decachePages($pagenum + 1 .. $self->numPages()); 3915 3916 return $self; 3917} 3918 3919=item $doc->createStreamObject($content) 3920 3921=item $doc->createStreamObject($content, $filter ...) 3922 3923I<For INTERNAL use> 3924 3925Create a new Stream object. This object is NOT added to the document. 3926Use the appendObject() function to do that after calling this 3927function. 3928 3929=cut 3930 3931sub createStreamObject 3932{ 3933 my $self = shift; 3934 my $content = shift; 3935 3936 my $dict = CAM::PDF::Node->new('dictionary', 3937 { 3938 Length => CAM::PDF::Node->new('number', length $content), 3939 StreamData => CAM::PDF::Node->new('stream', $content), 3940 }, 3941 ); 3942 3943 my $objnode = CAM::PDF::Node->new('object', $dict); 3944 3945 while (my $filter = shift) 3946 { 3947 #warn "$filter encoding\n"; 3948 $self->encodeOne($objnode->{value}, $filter); 3949 } 3950 3951 return $objnode; 3952} 3953 3954=item $doc->uninlineImages() 3955 3956=item $doc->uninlineImages($pagenum) 3957 3958Search the content of the specified page (or all pages if the 3959page number is omitted) for embedded images. If there are any, replace 3960them with indirect objects. This procedure uses heuristics to detect 3961in-line images, and is subject to confusion in extremely rare cases of text 3962that uses C<BI> and C<ID> a lot. 3963 3964=cut 3965 3966sub uninlineImages 3967{ 3968 my $self = shift; 3969 my $pagenum = shift; 3970 3971 my $changes = 0; 3972 if (!$pagenum) 3973 { 3974 my $pages = $self->numPages(); 3975 for my $p (1 .. $pages) 3976 { 3977 $changes += $self->uninlineImages($p); 3978 } 3979 } 3980 else 3981 { 3982 my $c = $self->getPageContent($pagenum); 3983 my $pos = 0; 3984 while (($pos = index $c, 'BI', $pos) != 1) 3985 { 3986 # manual \bBI check 3987 # if beginning of string or token 3988 if ($pos == 0 || (substr $c, $pos-1, 1) =~ m/ \W /xms) 3989 { 3990 my $part = substr $c, $pos; 3991 if ($part =~ m/ \A BI\b(.*?)\bID\b /xms) 3992 { 3993 my $im = $1; 3994 3995 ## Long series of tests to make sure this is really an 3996 ## image and not just coincidental text 3997 3998 # Fix easy cases of "BI text) BI ... ID" 3999 $im =~ s/ \A .*\bBI\b //xms; 4000 # There should never be an EI inside of a BI ... ID 4001 next if ($im =~ m/ \bEI\b /xms); 4002 4003 # Easy tests: is this the beginning or end of a string? 4004 # (these aren't really good tests...) 4005 next if ($im =~ m/ \A [)] /xms); 4006 next if ($im =~ m/ [(] \z /xms); 4007 4008 # this is the most complex heuristic: 4009 # make sure that there is an open paren before every close 4010 # if not, then the "BI" or the "ID" was part of a string 4011 my $test = $im; # make a copy we can scribble on 4012 my $failed = 0; 4013 # get rid of escaped parens for the test 4014 $test =~ s/ \\[()] //gxms; 4015 # Look for closing parens 4016 while ($test =~ s/ \A(.*?)[)] //xms) 4017 { 4018 # If there is NOT an opening paren before the 4019 # closing paren we detected above, then the start of 4020 # our string is INSIDE a paren pair, thus a failure. 4021 my $bit = $1; 4022 if ($bit !~ m/ [(] /xms) 4023 { 4024 $failed = 1; 4025 last; 4026 } 4027 } 4028 next if ($failed); 4029 4030 # End of heuristics. This is likely a real embedded image. 4031 # Now do the replacement. 4032 4033 my $oldlen = length $part; 4034 my $image = $self->parseInlineImage(\$part, undef); 4035 my $newlen = length $part; 4036 my $imagelen = $oldlen - $newlen; 4037 4038 # Construct a new image name like "I3". Start with 4039 # "I1" and continue until we get an unused "I<n>" 4040 # (first, get the list of already-used labels) 4041 $self->_buildNameTable($pagenum); 4042 my $i = 1; 4043 my $name = 'Im1'; 4044 while (exists $self->{Names}->{$pagenum}->{$name}) 4045 { 4046 $name = 'Im' . ++$i; 4047 } 4048 4049 $self->setName($image, $name); 4050 my $key = $self->appendObject(undef, $image, 0); 4051 $self->pageAddName($pagenum, $name, $key); 4052 4053 $c = (substr $c, 0, $pos) . "/$name Do" . (substr $c, $pos+$imagelen); 4054 $changes++; 4055 } 4056 } 4057 } 4058 if ($changes > 0) 4059 { 4060 $self->setPageContent($pagenum, $c); 4061 } 4062 } 4063 return $changes; 4064} 4065 4066=item $doc->appendObject($doc, $objectnum, $recurse?) 4067 4068=item $doc->appendObject($undef, $object, $recurse?) 4069 4070Duplicate an object from another PDF document and add it to this 4071document, optionally descending into the object and copying any other 4072objects it references. 4073 4074Like replaceObject(), the second form allows you to append a 4075newly-created block to the PDF. 4076 4077=cut 4078 4079sub appendObject 4080{ 4081 my $self = shift; 4082 my $otherdoc = shift; 4083 my $otherkey = shift; 4084 my $follow = shift; 4085 4086 my $objnum = ++$self->{maxobj}; 4087 4088 # Make sure our new object has a number higher than anything in 4089 # either document, otherwise the changeRefKeys might change 4090 # something twice! We had a problem of 15 -> 134 -> 333 in 1.52 4091 # (private email with Charlie Katz) 4092 if ($otherdoc && $otherdoc->{maxobj} >= $objnum) { 4093 $objnum = $self->{maxobj} = $otherdoc->{maxobj} + 1; 4094 } 4095 4096 if (exists $self->{versions}->{$objnum}) { 4097 $self->{versions}->{$objnum}++; 4098 } else { 4099 $self->{versions}->{$objnum} = 0; 4100 } 4101 4102 my %refkeys = $self->replaceObject($objnum, $otherdoc, $otherkey, $follow); 4103 if (wantarray) 4104 { 4105 return ($objnum, %refkeys); 4106 } 4107 else 4108 { 4109 return $objnum; 4110 } 4111} 4112 4113=item $doc->replaceObject($objectnum, $doc, $objectnum, $recurse?) 4114 4115=item $doc->replaceObject($objectnum, $undef, $object) 4116 4117Duplicate an object from another PDF document and insert it into this 4118document, replacing an existing object. Optionally descend into the 4119original object and copy any other objects it references. 4120 4121If the other document is undefined, then the object to copy is taken 4122to be an anonymous object that is not part of any other document. 4123This is useful when you've just created that anonymous object. 4124 4125=cut 4126 4127sub replaceObject 4128{ 4129 my $self = shift; 4130 my $key = shift; 4131 my $otherdoc = shift; 4132 my $otherkey = shift; 4133 my $follow = shift; 4134 4135 # careful! 'undef' means something different from '0' here! 4136 if (!defined $follow) 4137 { 4138 $follow = 1; 4139 } 4140 4141 my $objnode; 4142 my $otherobj; 4143 if ($otherdoc) 4144 { 4145 $otherobj = $otherdoc->dereference($otherkey); 4146 $objnode = $self->copyObject($otherobj); 4147 } 4148 else 4149 { 4150 $objnode = $otherkey; 4151 if ($follow) 4152 { 4153 warn "Error: you cannot \"follow\" an object if it has no document.\n" . 4154 "Resetting follow = false and continuing....\n"; 4155 $follow = 0; 4156 } 4157 } 4158 4159 $self->setObjNum($objnode, $key, 0); 4160 4161 # Preserve the name of the object 4162 if ($self->{xref}->{$key}) # make sure it isn't a brand new object 4163 { 4164 my $oldname = $self->getName($self->dereference($key)); 4165 if ($oldname) 4166 { 4167 $self->setName($objnode, $oldname); 4168 } 4169 else 4170 { 4171 $self->removeName($objnode); 4172 } 4173 } 4174 4175 $self->{objcache}->{$key} = $objnode; 4176 $self->{changes}->{$key} = 1; 4177 4178 my %newrefkeys = ($otherkey, $key); 4179 if ($follow) 4180 { 4181 for my $oldrefkey ($otherdoc->getRefList($otherobj)) 4182 { 4183 next if ($oldrefkey == $otherkey); 4184 my $newkey = $self->appendObject($otherdoc, $oldrefkey, 0); 4185 $newrefkeys{$oldrefkey} = $newkey; 4186 } 4187 my $already_changed = {}; # hash used by traverse() to avoid repeats 4188 $self->changeRefKeys($objnode, \%newrefkeys, $already_changed); 4189 for my $newkey (values %newrefkeys) 4190 { 4191 $self->changeRefKeys($self->dereference($newkey), \%newrefkeys, $already_changed); 4192 } 4193 } 4194 return (%newrefkeys); 4195} 4196 4197=item $doc->deleteObject($objectnum) 4198 4199Remove an object from the document. This function does NOT take care 4200of dependencies on this object. 4201 4202=cut 4203 4204sub deleteObject 4205{ 4206 my $self = shift; 4207 my $objnum = shift; 4208 4209 # DON'T clear versuion number! We need to keep this to increment later 4210 #delete $self->{versions}->{$objnum}; 4211 delete $self->{objcache}->{$objnum}; 4212 delete $self->{xref}->{$objnum}; 4213 delete $self->{endxref}->{$objnum}; 4214 delete $self->{changes}->{$objnum}; 4215 return; 4216} 4217 4218=item $doc->cleanse() 4219 4220Remove unused objects. I<WARNING:> this function breaks some PDF 4221documents because it removes objects that are strictly part of the 4222page model hierarchy, but which are required anyway (like some font 4223definition objects). 4224 4225=cut 4226 4227sub cleanse 4228{ 4229 my $self = shift; 4230 4231 delete $self->{trailer}->{XRefStm}; # can't write this 4232 delete $self->getRootDict()->{PieceInfo}; # can't handle this one, too complicated 4233 4234 my $base = CAM::PDF::Node->new('dictionary', $self->{trailer}); 4235 my @list = sort {$a<=>$b} $self->getRefList($base); 4236 #print join(',', @list), "\n"; 4237 4238 for my $i (1 .. $self->{maxobj}) 4239 { 4240 if (@list && $list[0] == $i) 4241 { 4242 shift @list; 4243 } 4244 else 4245 { 4246 #warn "delete object $i\n"; 4247 $self->deleteObject($i); 4248 } 4249 } 4250 return; 4251} 4252 4253=item $doc->createID() 4254 4255I<For INTERNAL use> 4256 4257Generate a new document ID. Contrary the Adobe recommendation, this 4258is a random number. 4259 4260=cut 4261 4262sub createID 4263{ 4264 my $self = shift; 4265 4266 # Warning: this is non-repeatable, and depends on Linux! 4267 4268 my $addbytes; 4269 if ($self->{ID}) 4270 { 4271 # do not change the first half of an existing ID 4272 $self->{ID} = substr $self->{ID}, 0, 16; 4273 $addbytes = 16; 4274 } 4275 else 4276 { 4277 $self->{ID} = q{}; 4278 $addbytes = 32; 4279 } 4280 4281 # Append $addbytes random bytes 4282 # First try the system random number generator 4283 if (-f '/dev/urandom') 4284 { 4285 if (open my $fh, '<', '/dev/urandom') 4286 { 4287 my $bytes_read = read $fh, $self->{ID}, $addbytes, 32-$addbytes; 4288 close $fh; ##no critic(Syscalls) 4289 $addbytes -= $bytes_read; 4290 } 4291 } 4292 # If that failed, use Perl's random number generator 4293 for (1..$addbytes) 4294 { 4295 $self->{ID} .= pack 'C', int rand 256; 4296 } 4297 4298 if ($self->{trailer}) 4299 { 4300 $self->{trailer}->{ID} = CAM::PDF::Node->new('array', 4301 [ 4302 CAM::PDF::Node->new('hexstring', substr $self->{ID}, 0, 16), 4303 CAM::PDF::Node->new('hexstring', substr $self->{ID}, 16, 16), 4304 ], 4305 ); 4306 } 4307 4308 return 1; 4309} 4310 4311=item $doc->fillFormFields($name => $value, ...) 4312 4313=item $doc->fillFormFields($opts_hash, $name => $value, ...) 4314 4315Set the default values of PDF form fields. The name should be the 4316full hierarchical name of the field as output by the 4317getFormFieldList() function. The argument list can be a hash if you 4318like. A simple way to use this function is something like this: 4319 4320 my %fields = (fname => 'John', lname => 'Smith', state => 'WI'); 4321 $field{zip} = 53703; 4322 $self->fillFormFields(%fields); 4323 4324If the first argument is a hash reference, it is interpreted as 4325options for how to render the filled data: 4326 4327=over 4328 4329=item background_color =E<lt> 'none' | $gray | [$r, $g, $b] 4330 4331Specify the background color for the text field. 4332 4333=item max_autoscale_fontsize =E<lt> $size 4334 4335=item min_autoscale_fontsize =E<lt> $size 4336 4337If the form field is set to auto-size the text to fit, then you may 4338use these options to constrain the limits of that 4339autoscaling. Otherwise, for example, a very long string will become 4340arbitrarily small to fit in the box. 4341 4342=back 4343 4344=cut 4345 4346sub fillFormFields ## no critic(Subroutines::ProhibitExcessComplexity, Unpack) 4347{ 4348 my $self = shift; 4349 my $opts = ref $_[0] ? shift : {}; 4350 my @list = (@_); 4351 4352 my %opts = ( 4353 background_color => 1, 4354 %{$opts}, 4355 ); 4356 4357 my $filled = 0; 4358 while (@list > 0) 4359 { 4360 my $key = shift @list; 4361 my $value = shift @list; 4362 if (!defined $value) 4363 { 4364 $value = q{}; 4365 } 4366 4367 next if (!$key); 4368 next if (ref $key); 4369 my $objnode = $self->getFormField($key); 4370 next if (!$objnode); 4371 4372 my $objnum = $objnode->{objnum}; 4373 my $gennum = $objnode->{gennum}; 4374 4375 # This read-only dict includes inherited properties 4376 my $propdict = $self->getFormFieldDict($objnode); 4377 4378 # This read-write dict does not include inherited properties 4379 my $dict = $self->getValue($objnode); 4380 $dict->{V} = CAM::PDF::Node->new('string', $value, $objnum, $gennum); 4381 #$dict->{DV} = CAM::PDF::Node->new('string', $value, $objnum, $gennum); 4382 4383 if ($propdict->{FT} && $self->getValue($propdict->{FT}) eq 'Tx') # Is it a text field? 4384 { 4385 # Set up display of form value 4386 if (!$dict->{AP}) 4387 { 4388 $dict->{AP} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum); 4389 } 4390 if (!$dict->{AP}->{value}->{N}) 4391 { 4392 my $newobj = CAM::PDF::Node->new('object', 4393 CAM::PDF::Node->new('dictionary',{}), 4394 ); 4395 my $num = $self->appendObject(undef, $newobj, 0); 4396 $dict->{AP}->{value}->{N} = CAM::PDF::Node->new('reference', $num, $objnum, $gennum); 4397 } 4398 my $formobj = $self->dereference($dict->{AP}->{value}->{N}->{value}); 4399 my $formonum = $formobj->{objnum}; 4400 my $formgnum = $formobj->{gennum}; 4401 my $formdict = $self->getValue($formobj); 4402 if (!$formdict->{Subtype}) 4403 { 4404 $formdict->{Subtype} = CAM::PDF::Node->new('label', 'Form', $formonum, $formgnum); 4405 } 4406 my @rect = (0,0,0,0); 4407 if ($dict->{Rect}) 4408 { 4409 ## no critic(Bangs::ProhibitNumberedNames) 4410 my $r = $self->getValue($dict->{Rect}); 4411 my ($x1, $y1, $x2, $y2) = @{$r}; 4412 @rect = ( 4413 $self->getValue($x1), 4414 $self->getValue($y1), 4415 $self->getValue($x2), 4416 $self->getValue($y2), 4417 ); 4418 } 4419 my $dx = $rect[2]-$rect[0]; 4420 my $dy = $rect[3]-$rect[1]; 4421 if (!$formdict->{BBox}) 4422 { 4423 $formdict->{BBox} = CAM::PDF::Node->new('array', 4424 [ 4425 CAM::PDF::Node->new('number', 0, $formonum, $formgnum), 4426 CAM::PDF::Node->new('number', 0, $formonum, $formgnum), 4427 CAM::PDF::Node->new('number', $dx, $formonum, $formgnum), 4428 CAM::PDF::Node->new('number', $dy, $formonum, $formgnum), 4429 ], 4430 $formonum, 4431 $formgnum); 4432 } 4433 my $text = $value; 4434 $text =~ s/ \r\n? /\n/gxms; 4435 $text =~ s/ \n+\z //xms; 4436 4437 my @rsrcs; 4438 my $fontmetrics = 0; 4439 my $fontname = q{}; 4440 my $fontsize = 0; 4441 my $da = q{}; 4442 my $tl = q{}; 4443 my $border = 2; 4444 my $tx = $border; 4445 my $ty = $border + 2; 4446 my $stringwidth; 4447 if ($propdict->{DA}) { 4448 $da = $self->getValue($propdict->{DA}); 4449 4450 # Try to pull out all of the resources used in the text object 4451 @rsrcs = ($da =~ m{ /([^\s<>/\[\]()]+) }gxms); 4452 4453 # Try to pull out the font size, if any. If more than 4454 # one, pick the last one. Font commands look like: 4455 # "/<fontname> <size> Tf" 4456 if ($da =~ m{ \s*/(\w+)\s+(\d+)\s+Tf.*? \z }xms) 4457 { 4458 $fontname = $1; 4459 $fontsize = $2; 4460 if ($fontname) 4461 { 4462 if ($propdict->{DR}) 4463 { 4464 my $dr = $self->getValue($propdict->{DR}); 4465 $fontmetrics = $self->getFontMetrics($dr, $fontname); 4466 } 4467 #print STDERR "Didn't get font\n" if (!$fontmetrics); 4468 } 4469 } 4470 } 4471 4472 my %flags = ( 4473 Justify => 'left', 4474 ); 4475 if ($propdict->{Ff}) 4476 { 4477 # Just decode the ones we actually care about 4478 # PDF ref, 3rd ed pp 532,543 4479 my $ff = $self->getValue($propdict->{Ff}); 4480 my @flags = split m//xms, unpack 'b*', pack 'V', $ff; 4481 $flags{ReadOnly} = $flags[0]; 4482 $flags{Required} = $flags[1]; 4483 $flags{NoExport} = $flags[2]; 4484 $flags{Multiline} = $flags[12]; 4485 $flags{Password} = $flags[13]; 4486 $flags{FileSelect} = $flags[20]; 4487 $flags{DoNotSpellCheck} = $flags[22]; 4488 $flags{DoNotScroll} = $flags[23]; 4489 } 4490 if ($propdict->{Q}) 4491 { 4492 my $q = $self->getValue($propdict->{Q}) || 0; 4493 $flags{Justify} = $q==2 ? 'right' : ($q==1 ? 'center' : 'left'); 4494 } 4495 4496 # The order of the following sections is important! 4497 if ($flags{Password}) 4498 { 4499 $text =~ s/ [^\n] /*/gxms; # Asterisks for password characters 4500 } 4501 4502 if ($fontmetrics && ! $fontsize) 4503 { 4504 # Fix autoscale fonts 4505 $stringwidth = 0; 4506 my $lines = 0; 4507 for my $line (split /\n/xms, $text) # trailing null strings omitted 4508 { 4509 $lines++; 4510 my $w = $self->getStringWidth($fontmetrics, $line); 4511 if ($w && $w > $stringwidth) 4512 { 4513 $stringwidth = $w; 4514 } 4515 } 4516 $lines ||= 1; 4517 # Initial guess 4518 $fontsize = ($dy - 2 * $border) / ($lines * 1.5); 4519 my $fontwidth = $fontsize * $stringwidth; 4520 my $maxwidth = $dx - 2 * $border; 4521 if ($fontwidth > $maxwidth) 4522 { 4523 $fontsize *= $maxwidth / $fontwidth; 4524 } 4525 4526 # allow for user override 4527 if (exists $opts->{max_autoscale_fontsize} && $fontsize > $opts->{max_autoscale_fontsize}) { 4528 $fontsize = $opts->{max_autoscale_fontsize}; 4529 } 4530 if (exists $opts->{min_autoscale_fontsize} && $fontsize < $opts->{min_autoscale_fontsize}) { 4531 $fontsize = $opts->{min_autoscale_fontsize}; 4532 } 4533 4534 $da =~ s/ \/$fontname\s+0\s+Tf\b /\/$fontname $fontsize Tf/gxms; 4535 } 4536 if ($fontsize) 4537 { 4538 # This formula is TOTALLY empirical. It's probably wrong. 4539 $ty = $border + 2 + (9 - $fontsize) * 0.4; 4540 } 4541 4542 4543 # escape characters 4544 $text = $self->writeString($text); 4545 4546 if ($flags{Multiline}) 4547 { 4548 # TODO: wrap the field with wrapString()?? 4549 # Shawn Dawson of Silent Solutions pointed out that this does not auto-wrap the input text 4550 4551 my $linebreaks = $text =~ s/ \\n /\) Tj T* \(/gxms; 4552 4553 # Total guess work: 4554 # line height is either 150% of fontsize or thrice 4555 # the corner offset 4556 $tl = $fontsize ? $fontsize * 1.5 : $ty * 3; 4557 4558 # Bottom aligned 4559 #$ty += $linebreaks * $tl; 4560 # Top aligned 4561 $ty = $dy - $border - $tl; 4562 4563 if ($flags{Justify} ne 'left') 4564 { 4565 warn 'Justified text not supported for multiline fields'; 4566 } 4567 4568 $tl .= ' TL'; 4569 } 4570 else 4571 { 4572 if ($flags{Justify} ne 'left' && $fontmetrics) 4573 { 4574 my $width = $stringwidth || $self->getStringWidth($fontmetrics, $text); 4575 my $diff = $dx - $width*$fontsize; 4576 4577 if ($flags{Justify} eq 'center') 4578 { 4579 $text = ($diff/2)." 0 Td $text"; 4580 } 4581 elsif ($flags{Justify} eq 'right') 4582 { 4583 $text = "$diff 0 Td $text"; 4584 } 4585 } 4586 } 4587 4588 # Move text from lower left corner of form field 4589 my $tm = "1 0 0 1 $tx $ty Tm "; 4590 4591 # if not 'none', draw a background as a filled rectangle of solid color 4592 my $background_color 4593 = $opts{background_color} eq 'none' ? q{} 4594 : ref $opts{background_color} ? "@{$opts{background_color}} rgb" 4595 : "$opts{background_color} g"; 4596 my $background = $background_color ? "$background_color 0 0 $dx $dy re f" : q{}; 4597 4598 $text = "$tl $da $tm $text Tj"; 4599 $text = "$background /Tx BMC q 1 1 ".($dx-$border).q{ }.($dy-$border)." re W n BT $text ET Q EMC"; 4600 my $len = length $text; 4601 $formdict->{Length} = CAM::PDF::Node->new('number', $len, $formonum, $formgnum); 4602 $formdict->{StreamData} = CAM::PDF::Node->new('stream', $text, $formonum, $formgnum); 4603 4604 if (@rsrcs > 0) { 4605 if (!$formdict->{Resources}) 4606 { 4607 $formdict->{Resources} = CAM::PDF::Node->new('dictionary', {}, $formonum, $formgnum); 4608 } 4609 my $rdict = $self->getValue($formdict->{Resources}); 4610 if (!$rdict->{ProcSet}) 4611 { 4612 $rdict->{ProcSet} = CAM::PDF::Node->new('array', 4613 [ 4614 CAM::PDF::Node->new('label', 'PDF', $formonum, $formgnum), 4615 CAM::PDF::Node->new('label', 'Text', $formonum, $formgnum), 4616 ], 4617 $formonum, 4618 $formgnum); 4619 } 4620 if (!$rdict->{Font}) 4621 { 4622 $rdict->{Font} = CAM::PDF::Node->new('dictionary', {}, $formonum, $formgnum); 4623 } 4624 my $fdict = $self->getValue($rdict->{Font}); 4625 4626 # Search out font resources. This is a total kluge. 4627 # TODO: the right way to do this is to look for the DR 4628 # attribute in the form element or it's ancestors. 4629 for my $font (@rsrcs) 4630 { 4631 my $fobj = $self->dereference("/$font", 'All'); 4632 if (!$fobj) 4633 { 4634 die "Could not find resource /$font while preparing form field $key\n"; 4635 } 4636 $fdict->{$font} = CAM::PDF::Node->new('reference', $fobj->{objnum}, $formonum, $formgnum); 4637 } 4638 } 4639 } 4640 $filled++; 4641 } 4642 return $filled; 4643} 4644 4645 4646=item $doc->clearFormFieldTriggers($name, $name, ...) 4647 4648Disable any triggers set on data entry for the specified form field 4649names. This is useful in the case where, for example, the data entry 4650Javascript forbids punctuation and you want to prefill with a 4651hyphenated word. If you don't clear the trigger, the prefill may not 4652happen. 4653 4654=cut 4655 4656sub clearFormFieldTriggers 4657{ 4658 my ($self, @fieldnames) = @_; 4659 4660 for my $fieldname (@fieldnames) 4661 { 4662 my $objnode = $self->getFormField($fieldname); 4663 if ($objnode) 4664 { 4665 if (exists $objnode->{value}->{value}->{AA}) 4666 { 4667 delete $objnode->{value}->{value}->{AA}; 4668 my $objnum = $objnode->{objnum}; 4669 if ($objnum) 4670 { 4671 $self->{changes}->{$objnum} = 1; 4672 } 4673 } 4674 } 4675 } 4676 return; 4677} 4678 4679=item $doc->clearAnnotations() 4680 4681Remove all annotations from the document. If form fields are 4682encountered, their text is added to the appropriate page. 4683 4684=cut 4685 4686sub clearAnnotations 4687{ 4688 my $self = shift; 4689 4690 my $formrsrcs; 4691 my $root = $self->getRootDict(); 4692 if ($root->{AcroForm}) 4693 { 4694 my $acroform = $self->getValue($root->{AcroForm}); 4695 # Get the form resources 4696 if ($acroform->{DR}) 4697 { 4698 $formrsrcs = $self->getValue($acroform->{DR}); 4699 } 4700 4701 # Kill off the forms 4702 $self->deleteObject($root->{AcroForm}->{value}); 4703 delete $root->{AcroForm}; 4704 } 4705 4706 # Iterate through the pages, deleting annotations 4707 4708 my $pages = $self->numPages(); 4709 for my $p (1..$pages) 4710 { 4711 my $page = $self->getPage($p); 4712 if ($page->{Annots}) { 4713 $self->addPageResources($p, $formrsrcs); 4714 my $annotsarray = $self->getValue($page->{Annots}); 4715 delete $page->{Annots}; 4716 for my $annotref (@{$annotsarray}) 4717 { 4718 my $annot = $self->getValue($annotref); 4719 if ((ref $annot) ne 'HASH') 4720 { 4721 die 'Internal error: annotation is not a dictionary'; 4722 } 4723 # Copy all text field values into the page, if present 4724 if ($annot->{Subtype} && 4725 $annot->{Subtype}->{value} eq 'Widget' && 4726 $annot->{FT} && 4727 $annot->{FT}->{value} eq 'Tx' && 4728 $annot->{AP}) 4729 { 4730 my $ap = $self->getValue($annot->{AP}); 4731 my $rect = $self->getValue($annot->{Rect}); 4732 my $x = $self->getValue($rect->[0]); 4733 my $y = $self->getValue($rect->[1]); 4734 if ($ap->{N}) 4735 { 4736 my $n = $self->dereference($ap->{N}->{value})->{value}; 4737 my $content = $self->decodeOne($n, 0); 4738 if (!$content) 4739 { 4740 die 'Internal error: expected a content stream from the form copy'; 4741 } 4742 $content =~ s/ \bre(\s+)f\b /re$1n/gxms; 4743 $content = "q 1 0 0 1 $x $y cm\n$content Q\n"; 4744 $self->appendPageContent($p, $content); 4745 $self->addPageResources($p, $self->getValue($n->{value}->{Resources})); 4746 } 4747 } 4748 $self->deleteObject($annotref->{value}); 4749 } 4750 } 4751 } 4752 4753 # kill off the annotation dependencies 4754 $self->cleanse(); 4755 return; 4756} 4757 4758=item $doc->previousRevision() 4759 4760If this PDF was previously saved in append mode (that is, if 4761C<clean()> was not invoked on it), return a new instance representing 4762that previous version. Otherwise return void. If this is an 4763encrypted PDF, this method assumes that previous revisions were 4764encrypted with the same password, which may be an incorrect 4765assumption. 4766 4767=cut 4768 4769sub previousRevision { 4770 my $self = shift; 4771 4772 my $content = \$self->{content}; 4773 return if !${$content}; # already wiped... 4774 4775 # Figure out line end character 4776 my ($lineend) = ${$content} =~ m/ (.)%%EOF.*?\z /xms; 4777 return if !$lineend; # Corrupt PDF: Cannot find the end-of-file marker 4778 4779 my $eof = $lineend.'%%EOF'; 4780 my $i = rindex ${$content}, $eof; 4781 my $j = rindex ${$content}, $eof, $i-1; 4782 return if $j < 0; # just one revision 4783 4784 my $prev_content = (substr ${$content}, 0, $j) . $eof . $lineend; 4785 # assume the passwords were the same in the previous rev 4786 my ($opass, $upass, @perms) = $self->getPrefs; 4787 4788 return __PACKAGE__->new($prev_content, $opass, $upass); 4789} 4790 4791=item $doc->allRevisions() 4792 4793Accumulate CAM::PDF instances returned by C<previousRevision> until 4794there are no more previous revisions. Returns a list of instances 4795from newest to oldest including this instance as the newest. 4796 4797=cut 4798 4799sub allRevisions { 4800 my ($self) = @_; 4801 my @revs; 4802 for (my $pdf = $self; $pdf; $pdf = $pdf->previous_revision) { ## no critic(ProhibitCStyleForLoops) 4803 push @revs, $pdf; 4804 } 4805 return @revs; 4806} 4807 4808################################################################################ 4809 4810=back 4811 4812=head2 Document Writing 4813 4814=over 4815 4816=item $doc->preserveOrder() 4817 4818Try to recreate the original document as much as possible. This may 4819help in recreating documents which use undocumented tricks of saving 4820font information in adjacent objects. 4821 4822=cut 4823 4824sub preserveOrder 4825{ 4826 # Call this to record the order of the objects in the original file 4827 # If called, then any new file will try to preserve the original order 4828 my $self = shift; 4829 4830 my $x = $self->{xref}; # shorthand 4831 $self->{order} = [sort {$x->{$a} <=> $x->{$b}} grep {!ref $x->{$_}} keys %{$x}]; 4832 return; 4833} 4834 4835=item $doc->isLinearized() 4836 4837Returns a boolean indicating whether this PDF is linearized (aka 4838"optimized"). 4839 4840=cut 4841 4842sub isLinearized 4843{ 4844 my $self = shift; 4845 4846 my $first; 4847 if (exists $self->{order}) 4848 { 4849 $first = $self->{order}->[0]; 4850 } 4851 else 4852 { 4853 my $x = $self->{xref}; # shorthand 4854 ($first) = sort {$x->{$a} <=> $x->{$b}} grep {!ref $x->{$_}} keys %{$x}; 4855 } 4856 4857 my $linearized; # false 4858 my $objnode = $self->dereference($first); 4859 if ($objnode && $objnode->{value}->{type} eq 'dictionary') 4860 { 4861 if (exists $objnode->{value}->{value}->{Linearized}) 4862 { 4863 $linearized = $self; # true 4864 } 4865 } 4866 return $linearized; 4867} 4868 4869=item $doc->delinearize() 4870 4871I<For INTERNAL use> 4872 4873Undo the tweaks used to make the document 'optimized'. This function 4874is automatically called on every save or output since this library 4875does not yet support linearized documents. 4876 4877=cut 4878 4879sub delinearize 4880{ 4881 my $self = shift; 4882 4883 return if ($self->{delinearized}); 4884 4885 # Turn off Linearization, if set 4886 my $first; 4887 if (exists $self->{order}) 4888 { 4889 $first = $self->{order}->[0]; 4890 } 4891 else 4892 { 4893 my $x = $self->{xref}; # shorthand 4894 ($first) = sort {$x->{$a} <=> $x->{$b}} grep {!ref $x->{$_}} keys %{$x}; 4895 } 4896 4897 my $objnode = $self->dereference($first); 4898 if ($objnode->{value}->{type} eq 'dictionary') 4899 { 4900 if (exists $objnode->{value}->{value}->{Linearized}) 4901 { 4902 $self->deleteObject($first); 4903 } 4904 } 4905 4906 $self->{delinearized} = 1; 4907 return; 4908} 4909 4910=item $doc->clean() 4911 4912Cache all parts of the document and throw away it's old structure. 4913This is useful for writing PDFs anew, instead of simply appending 4914changes to the existing documents. This is called by cleansave() and 4915cleanoutput(). 4916 4917=cut 4918 4919sub clean 4920{ 4921 my $self = shift; 4922 4923 # Make sure to extract everything before we wipe the old version 4924 $self->cacheObjects(); 4925 4926 $self->delinearize(); 4927 4928 # Update the ID number to make this document distinct from the original. 4929 # If there is already an ID, only the second half is changed 4930 $self->createID(); 4931 4932 # Mark everything changed 4933 %{$self->{changes}} = ( 4934 %{$self->{changes}}, 4935 map { $_ => 1 } keys %{$self->{xref}}, 4936 ); 4937 4938 # Mark everything new 4939 %{$self->{versions}} = ( 4940 %{$self->{versions}}, 4941 map { $_ => 0 } keys %{$self->{xref}}, 4942 ); 4943 4944 $self->{xref} = {}; 4945 delete $self->{endxref}; 4946 $self->{startxref} = 0; 4947 $self->{content} = q{}; 4948 $self->{contentlength} = 0; 4949 4950 my $trailer = $self->{trailer}; 4951 delete $trailer->{Prev}; 4952 delete $trailer->{XRefStm}; 4953 if (exists $trailer->{Type} && $trailer->{Type}->{value} eq 'XRef') { 4954 delete $trailer->{Type}; 4955 delete $trailer->{Size}; 4956 delete $trailer->{Index}; 4957 delete $trailer->{W}; 4958 delete $trailer->{Length}; 4959 delete $trailer->{L}; 4960 delete $trailer->{StreamData}; 4961 delete $trailer->{Filter}; 4962 delete $trailer->{F}; 4963 delete $trailer->{DecodeParms}; 4964 delete $trailer->{DP}; 4965 } 4966 return; 4967} 4968 4969=item $doc->needsSave() 4970 4971Returns a boolean indicating whether the save() method needs to be 4972called. Like save(), this has nothing to do with whether the document 4973has been saved to disk, but whether the in-memory representation of 4974the document has been serialized. 4975 4976=cut 4977 4978sub needsSave 4979{ 4980 my $self = shift; 4981 4982 return 0 != keys %{$self->{changes}}; 4983} 4984 4985=item $doc->save() 4986 4987Serialize the document into a single string. All changed document 4988elements are normalized, and a new index and an updated trailer are 4989created. 4990 4991This function operates solely in memory. It DOES NOT write the 4992document to a file. See the output() function for that. 4993 4994=cut 4995 4996sub save 4997{ 4998 my $self = shift; 4999 5000 if (!$self->needsSave()) 5001 { 5002 return $self; 5003 } 5004 5005 $self->delinearize(); 5006 5007 delete $self->{endxref}; 5008 5009 if (!$self->{content}) 5010 { 5011 $self->{content} = '%PDF-' . $self->{pdfversion} . "\n%\217\n"; 5012 } 5013 5014 my %allobjs = (%{$self->{changes}}, %{$self->{xref}}); 5015 my @objects = sort {$a<=>$b} keys %allobjs; 5016 if ($self->{order}) 5017 { 5018 5019 # Sort in the order in $self->{order} array, with the rest later 5020 # in objnum order 5021 my %o; 5022 my $n = @{$self->{order}}; 5023 for my $i (0 .. $n-1) 5024 { 5025 $o{$self->{order}->[$i]} = $i; 5026 } 5027 @objects = map {$_->[1]} sort {$a->[0] <=> $b->[0]} map {[$o{$_} || $_+$n, $_]} @objects; 5028 } 5029 delete $self->{order}; 5030 5031 my %newxref; 5032 my $offset = length $self->{content}; 5033 for my $key (@objects) 5034 { 5035 next if (!$self->{changes}->{$key}); 5036 $newxref{$key} = $offset; 5037 5038 #print "Writing object $key\n"; 5039 my $obj = $self->writeObject($key); 5040 $self->{content} .= $obj; 5041 $offset += length $obj; 5042 5043 $self->{xref}->{$key} = $newxref{$key}; 5044 delete $self->{changes}->{$key}; 5045 } 5046 5047 if ($self->{content} !~ m/ [\r\n] \z /xms) 5048 { 5049 $self->{content} .= "\n"; 5050 } 5051 5052 my $startxref = length $self->{content}; 5053 5054 # Append the new xref 5055 $self->{content} .= "xref\n"; 5056 my %blocks = ( 5057 0 => "0000000000 65535 f \n", 5058 ); 5059 for my $key (keys %newxref) 5060 { 5061 $blocks{$key} = sprintf "%010d %05d n \n", $newxref{$key}, $self->{versions}->{$key}; 5062 } 5063 5064 # If there is only one version of the document, there must be no 5065 # holes in the xref. Test for versions by checking if there's already an xref. 5066 # If clean() has been called, it will be absent 5067 if (!$self->{startxref}) 5068 { 5069 # Fill in holes 5070 my $prevfreeblock = 0; 5071 for my $key (reverse 0 .. $self->{maxobj}-1) 5072 { 5073 if (!exists $blocks{$key}) 5074 { 5075 # Add an entry to the free list 5076 # On $key == 0, this blows away the above definition of 5077 # the head of the free block list, but that's no big deal. 5078 $blocks{$key} = sprintf "%010d %05d f \n", 5079 $prevfreeblock, ($key == 0 ? 65_535 : 1); 5080 $prevfreeblock = $key; 5081 } 5082 } 5083 } 5084 5085 my $currblock = q{}; 5086 my $currnum = 0; 5087 my $currstart = 0; 5088 my @blockkeys = sort {$a<=>$b} keys %blocks; 5089 for my $i (0 .. $#blockkeys) 5090 { 5091 my $key = $blockkeys[$i]; 5092 $currblock .= $blocks{$key}; 5093 $currnum++; 5094 if ($i == $#blockkeys || $key+1 < $blockkeys[$i+1]) 5095 { 5096 $self->{content} .= "$currstart $currnum\n$currblock"; 5097 if ($i < $#blockkeys) 5098 { 5099 $currblock = q{}; 5100 $currnum = 0; 5101 $currstart = $blockkeys[$i+1]; 5102 } 5103 } 5104 } 5105 5106 # Append the new trailer 5107 $self->{trailer}->{Size} = CAM::PDF::Node->new('number', $self->{maxobj} + 1); 5108 if ($self->{startxref}) 5109 { 5110 $self->{trailer}->{Prev} = CAM::PDF::Node->new('number', $self->{startxref}); 5111 } 5112 $self->{content} .= "trailer\n" . $self->writeAny(CAM::PDF::Node->new('dictionary', $self->{trailer})) . "\n"; 5113 5114 # Append the new startxref 5115 $self->{content} .= "startxref\n$startxref\n"; 5116 $self->{startxref} = $startxref; 5117 5118 # Append EOF 5119 $self->{content} .= "%%EOF\n"; 5120 5121 $self->{contentlength} = length $self->{content}; 5122 5123 return $self; 5124} 5125 5126=item $doc->cleansave() 5127 5128Call the clean() function, then call the save() function. 5129 5130=cut 5131 5132sub cleansave 5133{ 5134 my $self = shift; 5135 5136 $self->clean(); 5137 return $self->save(); 5138} 5139 5140=item $doc->output($filename) 5141 5142=item $doc->output() 5143 5144Save the document to a file. The save() function is called first to 5145serialize the data structure. If no filename is specified, or if the 5146filename is '-', the document is written to standard output. 5147 5148Note: it is the responsibility of the application to ensure that the 5149PDF document has either the Modify or Add permission. You can do this 5150like the following: 5151 5152 if ($self->canModify()) { 5153 $self->output($outfile); 5154 } else { 5155 die "The PDF file denies permission to make modifications\n"; 5156 } 5157 5158=cut 5159 5160sub output 5161{ 5162 my $self = shift; 5163 my $file = shift; 5164 if (!defined $file) 5165 { 5166 $file = q{-}; 5167 } 5168 5169 $self->save(); 5170 5171 if ($file eq q{-}) 5172 { 5173 binmode STDOUT; ##no critic(RequireCheckedSysCalls) 5174 print $self->{content}; 5175 } 5176 else 5177 { 5178 open my $fh, '>', $file or die "Failed to write file $file\n"; 5179 binmode $fh or die "Failed to set binmode for file $file\n"; 5180 print {$fh} $self->{content}; 5181 close $fh or die "Failed to write file $file\n"; 5182 } 5183 return $self; 5184} 5185 5186=item $doc->cleanoutput($file) 5187 5188=item $doc->cleanoutput() 5189 5190Call the clean() function, then call the output() function to write a 5191fresh copy of the document to a file. 5192 5193=cut 5194 5195sub cleanoutput 5196{ 5197 my $self = shift; 5198 my $file = shift; 5199 5200 $self->clean(); 5201 return $self->output($file); 5202} 5203 5204=item $doc->writeObject($objnum) 5205 5206Return the serialization of the specified object. 5207 5208=cut 5209 5210sub writeObject 5211{ 5212 my $self = shift; 5213 my $objnum = shift; 5214 5215 return "$objnum 0 " . $self->writeAny($self->dereference($objnum)); 5216} 5217 5218=item $doc->writeString($string) 5219 5220Return the serialization of the specified string. Works on normal or 5221hex strings. If encryption is desired, the string should be encrypted 5222before being passed here. 5223 5224=cut 5225 5226sub writeString 5227{ 5228 my $pkg_or_doc = shift; 5229 my $string = shift; 5230 5231 # Divide the string into manageable pieces, which will be 5232 # re-concatenated with "\" continuation characters at the end of 5233 # their lines 5234 5235 # -- This code used to do concatenation by juxtaposing multiple 5236 # -- "(<fragment>)" compenents, but this breaks many PDF 5237 # -- implementations (incl Acrobat5 and XPDF) 5238 5239 # Break the string into pieces of length $maxstr. Note that an 5240 # artifact of this usage of split returns empty strings between 5241 # the fragments, so grep them out 5242 5243 my $maxstr = (ref $pkg_or_doc) ? $pkg_or_doc->{maxstr} : $CAM::PDF::MAX_STRING; 5244 my @strs = grep {$_ ne q{}} split /(.{$maxstr}})/xms, $string; 5245 for (@strs) 5246 { 5247 s/ \\ /\\\\/gxms; # escape escapes -- this line must come first! 5248 s/ ([()]) /\\$1/gxms; # escape parens 5249 s/ \n /\\n/gxms; 5250 s/ \r /\\r/gxms; 5251 s/ \t /\\t/gxms; 5252 s/ \f /\\f/gxms; 5253 # TODO: handle backspace char 5254 #s/ ??? /\\b/gxms; 5255 } 5256 return '(' . (join "\\\n", @strs) . ')'; 5257} 5258 5259=item $doc->writeAny($node) 5260 5261Returns the serialization of the specified node. This handles all 5262Node types, including object Nodes. 5263 5264=cut 5265 5266sub writeAny 5267{ 5268 my $self = shift; 5269 my $objnode = shift; 5270 5271 if (! ref $objnode) 5272 { 5273 die 'Not a ref'; 5274 } 5275 5276 my $key = $objnode->{type}; 5277 my $val = $objnode->{value}; 5278 my $objnum = $objnode->{objnum}; 5279 my $gennum = $objnode->{gennum}; 5280 5281 return $key eq 'string' ? $self->writeString($self->{crypt}->encrypt($self, $val, $objnum, $gennum)) 5282 : $key eq 'hexstring' ? '<' . (unpack 'H*', $self->{crypt}->encrypt($self, $val, $objnum, $gennum)) . '>' 5283 : $key eq 'number' ? "$val" 5284 : $key eq 'reference' ? "$val 0 R" # TODO: lookup the gennum and use it instead of 0 (?) 5285 : $key eq 'boolean' ? $val 5286 : $key eq 'null' ? 'null' 5287 : $key eq 'label' ? "/$val" 5288 : $key eq 'array' ? $self->_writeArray($objnode) 5289 : $key eq 'dictionary' ? $self->_writeDictionary($objnode) 5290 : $key eq 'object' ? $self->_writeObject($objnode) 5291 5292 : die "Unknown key '$key' in writeAny (objnum ".($objnum||'<none>').")\n"; 5293} 5294 5295sub _writeArray 5296{ 5297 my $self = shift; 5298 my $objnode = shift; 5299 5300 my $val = $objnode->{value}; 5301 if (@{$val} == 0) 5302 { 5303 return '[ ]'; 5304 } 5305 my $str = q{}; 5306 my @strs; 5307 for (@{$val}) 5308 { 5309 my $newstr = $self->writeAny($_); 5310 if ($str ne q{}) 5311 { 5312 if ($self->{maxstr} < length $str . $newstr) 5313 { 5314 push @strs, $str; 5315 $str = q{}; 5316 } 5317 else 5318 { 5319 $str .= q{ }; 5320 } 5321 } 5322 $str .= $newstr; 5323 } 5324 if (@strs > 0) 5325 { 5326 $str = join "\n", @strs, $str; 5327 } 5328 return '[ ' . $str . ' ]'; 5329} 5330 5331sub _writeDictionary 5332{ 5333 my $self = shift; 5334 my $objnode = shift; 5335 5336 my $val = $objnode->{value}; 5337 my $str = q{}; 5338 my @strs; 5339 if (exists $val->{Type}) 5340 { 5341 $str .= ($str ? q{ } : q{}) . '/Type ' . $self->writeAny($val->{Type}); 5342 } 5343 if (exists $val->{Subtype}) 5344 { 5345 $str .= ($str ? q{ } : q{}) . '/Subtype ' . $self->writeAny($val->{Subtype}); 5346 } 5347 for my $dictkey (sort keys %{$val}) 5348 { 5349 next if ($dictkey eq 'Type'); 5350 next if ($dictkey eq 'Subtype'); 5351 next if ($dictkey eq 'StreamDataDone'); 5352 if ($dictkey eq 'StreamData') 5353 { 5354 if (exists $val->{StreamDataDone}) 5355 { 5356 delete $val->{StreamDataDone}; 5357 next; 5358 } 5359 # This is a stream way down deep in the data... Probably due to a solidifyObject 5360 5361 # First, try to handle the easy case: 5362 if (2 == scalar keys %{$val} && (exists $val->{Length} || exists $val->{L})) 5363 { 5364 my $binary = $val->{$dictkey}->{value}; 5365 my $len = length $binary; 5366 my $unpacked = unpack 'H' . $len*2, $binary; 5367 return $self->writeAny(CAM::PDF::Node->new('hexstring', $unpacked, $objnode->{objnum}, $objnode->{gennum})); 5368 } 5369 5370 # TODO: Handle more complex streams ... 5371 die "This stream is too complex for me to write... Giving up\n"; 5372 5373 next; ## no critic(ControlStructures::ProhibitUnreachableCode) 5374 } 5375 5376 my $newstr = "/$dictkey " . $self->writeAny($val->{$dictkey}); 5377 if ($str ne q{}) 5378 { 5379 if ($self->{maxstr} < length $str . $newstr) 5380 { 5381 push @strs, $str; 5382 $str = q{}; 5383 } 5384 else 5385 { 5386 $str .= q{ }; 5387 } 5388 } 5389 $str .= $newstr; 5390 } 5391 if (@strs > 0) 5392 { 5393 $str = join "\n", @strs, $str; 5394 } 5395 return '<< ' . $str . ' >>'; 5396} 5397 5398sub _writeObject 5399{ 5400 my $self = shift; 5401 my $objnode = shift; 5402 5403 my $val = $objnode->{value}; 5404 if (! ref $val) 5405 { 5406 die "Obj data is not a ref! ($val)"; 5407 } 5408 my $stream; 5409 if ($val->{type} eq 'dictionary' && exists $val->{value}->{StreamData}) 5410 { 5411 $stream = $val->{value}->{StreamData}->{value}; 5412 my $length = length $stream; 5413 5414 my $l = $val->{value}->{Length} || $val->{value}->{L}; 5415 my $oldlength = $self->getValue($l); 5416 if ($length != $oldlength) 5417 { 5418 $val->{value}->{Length} = CAM::PDF::Node->new('number', $length, $objnode->{objnum}, $objnode->{gennum}); 5419 delete $val->{value}->{L}; 5420 } 5421 $val->{value}->{StreamDataDone} = 1; 5422 } 5423 my $str = $self->writeAny($val); 5424 if (defined $stream) 5425 { 5426 $stream = $self->{crypt}->encrypt($self, $stream, $objnode->{objnum}, $objnode->{gennum}); 5427 $str .= "\nstream\n" . $stream . 'endstream'; 5428 } 5429 return "obj\n$str\nendobj\n"; 5430} 5431 5432###################################################################### 5433 5434=back 5435 5436=head2 Document Traversing 5437 5438=over 5439 5440=item $doc->traverse($dereference, $node, $callbackfunc, $callbackdata) 5441 5442Recursive traversal of a PDF data structure. 5443 5444In many cases, it's useful to apply one action to every node in an 5445object tree. The routines below all use this traverse() function. 5446One of the most important parameters is the first: the C<$dereference> 5447boolean. If true, the traversal follows reference Nodes. If false, 5448it does not descend into reference Nodes. 5449 5450Optionally, you can pass in a hashref as a final argument to reduce 5451redundant traversing across multiple calls. Just pass in an empty 5452hashref the first time and pass in the same hashref each time. See 5453C<changeRefKeys()> for an example. 5454 5455=cut 5456 5457sub traverse 5458{ 5459 my $self = shift; 5460 my $deref = shift; 5461 my $startnode = shift; 5462 my $func = shift; 5463 my $funcdata = shift; 5464 my $traversed = shift || {}; 5465 5466 my @stack = ($startnode); 5467 5468 my $i = 0; 5469 while ($i < @stack) 5470 { 5471 my $objnode = $stack[$i++]; 5472 $self->$func($objnode, $funcdata); 5473 5474 my $type = $objnode->{type}; 5475 my $val = $objnode->{value}; 5476 5477 if ($type eq 'object') 5478 { 5479 # Shrink stack periodically 5480 splice @stack, 0, $i; 5481 $i = 0; 5482 # Mark object done 5483 if ($objnode->{objnum}) 5484 { 5485 $traversed->{$objnode->{objnum}} = 1; 5486 } 5487 } 5488 5489 push @stack, $type eq 'dictionary' ? values %{$val} 5490 : $type eq 'array' ? @{$val} 5491 : $type eq 'object' ? $val 5492 : $type eq 'reference' 5493 && $deref 5494 && !exists $traversed->{$val} ? $self->dereference($val) 5495 : (); 5496 } 5497 return; 5498} 5499 5500# decodeObject and decodeAll differ from each other like this: 5501# 5502# decodeObject JUST decodes a single stream directly below the object 5503# specified by the objnum 5504# 5505# decodeAll descends through a whole object tree (following 5506# references) decoding everything it can find 5507 5508=item $doc->decodeObject($objectnum) 5509 5510I<For INTERNAL use> 5511 5512Remove any filters (like compression, etc) from a data stream 5513indicated by the object number. 5514 5515=cut 5516 5517sub decodeObject 5518{ 5519 my $self = shift; 5520 my $objnum = shift; 5521 5522 my $objnode = $self->dereference($objnum); 5523 5524 $self->decodeOne($objnode->{value}, 1); 5525 return; 5526} 5527 5528=item $doc->decodeAll($object) 5529 5530I<For INTERNAL use> 5531 5532Remove any filters from any data stream in this object or any object 5533referenced by it. 5534 5535=cut 5536 5537sub decodeAll 5538{ 5539 my $self = shift; 5540 my $objnode = shift; 5541 5542 $self->traverse(1, $objnode, \&decodeOne, 1); 5543 return; 5544} 5545 5546=item $doc->decodeOne($object) 5547 5548=item $doc->decodeOne($object, $save?) 5549 5550I<For INTERNAL use> 5551 5552Remove any filters from an object. The boolean flag C<$save> (defaults to 5553false) indicates whether this removal should be permanent or just 5554this once. If true, the function returns success or failure. If 5555false, the function returns the defiltered content. 5556 5557=cut 5558 5559sub decodeOne 5560{ 5561 my $self = shift; 5562 my $objnode = shift; 5563 my $save = shift || 0; 5564 5565 my $changed = 0; 5566 my $streamdata = q{}; 5567 5568 if ($objnode->{type} ne 'dictionary') 5569 { 5570 return $save ? $changed : $streamdata; 5571 } 5572 5573 my $dict = $objnode->{value}; 5574 5575 $streamdata = $dict->{StreamData}->{value}; 5576 #warn 'decoding thing ' . ($dict->{StreamData}->{objnum} || '(unknown)') . "\n"; 5577 5578 # Don't work on {F} since that's too common a word 5579 #my $filtobj = $dict->{Filter} || $dict->{F}; 5580 my $filtobj = $dict->{Filter}; 5581 5582 if (defined $filtobj) 5583 { 5584 my @filters = $filtobj->{type} eq 'array' ? @{$filtobj->{value}} : ($filtobj); 5585 my $parmobj = $dict->{DecodeParms} || $dict->{DP}; 5586 my @parms; 5587 if ($parmobj) 5588 { 5589 @parms = $parmobj->{type} eq 'array' ? @{$parmobj->{value}} : ($parmobj); 5590 } 5591 5592 for my $filter (@filters) 5593 { 5594 if ($filter->{type} ne 'label') 5595 { 5596 warn "All filter names must be labels\n"; 5597 require Data::Dumper; 5598 warn Data::Dumper->Dump([$filter], ['Filter']); 5599 next; 5600 } 5601 my $filtername = $filter->{value}; 5602 5603 # Make sure this is not an encrypt dict 5604 next if ($filtername eq 'Standard'); 5605 5606 my $filt; 5607 eval { 5608 require Text::PDF::Filter; 5609 my $pkg = 'Text::PDF::' . ($filterabbrevs{$filtername} || $filtername); 5610 $filt = $pkg->new; 5611 1; 5612 }; 5613 if (!$filt) 5614 { 5615 warn "Failed to open filter $filtername (Text::PDF::$filtername)\n"; 5616 last; 5617 } 5618 5619 my $oldlength = length $streamdata; 5620 5621 { 5622 # Hack to turn off warnings in Filter library 5623 no warnings; ## no critic(TestingAndDebugging::ProhibitNoWarnings) 5624 $streamdata = $filt->infilt($streamdata, 1); 5625 } 5626 5627 $self->fixDecode(\$streamdata, $filtername, shift @parms); 5628 my $length = length $streamdata; 5629 5630 #warn "decoded length: $oldlength -> $length\n"; 5631 5632 if ($save) 5633 { 5634 my $objnum = $dict->{StreamData}->{objnum}; 5635 my $gennum = $dict->{StreamData}->{gennum}; 5636 if ($objnum) 5637 { 5638 $self->{changes}->{$objnum} = 1; 5639 } 5640 $changed = 1; 5641 $dict->{StreamData}->{value} = $streamdata; 5642 if ($length != $oldlength) 5643 { 5644 $dict->{Length} = CAM::PDF::Node->new('number', $length, $objnum, $gennum); 5645 delete $dict->{L}; 5646 } 5647 5648 # These changes should happen later, but I prefer to do it 5649 # redundantly near the changes hash 5650 delete $dict->{Filter}; 5651 delete $dict->{F}; 5652 delete $dict->{DecodeParms}; 5653 delete $dict->{DP}; 5654 } 5655 } 5656 } 5657 #else { use Data::Dumper; print Dumper($dict); } 5658 5659 return $save ? $changed : $streamdata; 5660} 5661 5662=item $doc->fixDecode($streamdata, $filter, $params) 5663 5664This is a utility method to do any tweaking after removing the filter 5665from a data stream. 5666 5667=cut 5668 5669sub fixDecode 5670{ 5671 my $self = shift; 5672 my $streamdata = shift; 5673 my $filter = shift; 5674 my $parms = shift; 5675 5676 if (!$parms) 5677 { 5678 return; 5679 } 5680 my $d = $self->getValue($parms); 5681 if (!$d || (ref $d) ne 'HASH') 5682 { 5683 die "DecodeParms must be a dictionary.\n"; 5684 } 5685 if ($filter eq 'FlateDecode' || $filter eq 'Fl' || 5686 $filter eq 'LZWDecode' || $filter eq 'LZW') 5687 { 5688 my $p = exists $d->{Predictor} ? $self->getValue($d->{Predictor}) : 1; 5689 if ($p == 2) 5690 { 5691 $self->_fixDecodeTIFF($streamdata, $d); 5692 } 5693 elsif ($p >= 10) 5694 { 5695 $self->_fixDecodePNG($streamdata, $d); 5696 } 5697 # else no fix needed 5698 } 5699 return; 5700} 5701 5702sub _fixDecodeTIFF 5703{ 5704 my $self = shift; 5705 my $streamdata = shift; 5706 my $d = shift; 5707 5708 die 'The TIFF image predictor is not supported'; 5709} 5710 5711sub _fixDecodePNG 5712{ 5713 my $self = shift; 5714 my $streamdata = shift; 5715 my $d = shift; 5716 5717 # PNG differencing algorithms http://www.w3.org/TR/PNG-Filters.html 5718 my $colors = exists $d->{Colors} ? $self->getValue($d->{Colors}) : 1; 5719 my $columns = exists $d->{Columns} ? $self->getValue($d->{Columns}) : 1; 5720 my $bpc = exists $d->{BitsPerComponent} ? $self->getValue($d->{BitsPerComponent}) : 8; 5721 if (0 != $bpc % 8) { 5722 die 'Color samples that are not multiples of 8 bits are not supported'; 5723 } 5724 my $width = 1 + $colors * $columns * ($bpc >> 3); # size of a row in bytes, including the 1-byte predictor 5725 my $len = length ${$streamdata}; 5726 if (0 != $len % $width) { 5727 die 'The stream data is not evenly divisible into rows'; 5728 } 5729 my $rows = $len / $width; 5730 my $newdata = q{}; 5731 my $prev_row = [(0) x ($width - 1)]; 5732 for my $irow (0 .. $rows - 1) 5733 { 5734 my ($row_pred, @row) = unpack 'C' . $width, substr ${$streamdata}, $irow * $width, $width; 5735 if ($row_pred == 1) { ##no critic (IfElse) 5736 for my $i (1 .. $width-2) { 5737 $row[$i] = ($row[$i-1] + $row[$i]) & 0xff; 5738 } 5739 } elsif ($row_pred == 2) { 5740 for my $i (0 .. $width-2) { 5741 $row[$i] = ($prev_row->[$i] + $row[$i]) & 0xff; 5742 } 5743 } elsif ($row_pred == 3) { 5744 $row[0] = (($prev_row->[0] >> 1) + $row[0]) & 0xff; 5745 for my $i (1 .. $width-2) { 5746 $row[$i] = ((($row[$i-1] + $prev_row->[$i]) >> 1) + $row[$i]) & 0xff; 5747 } 5748 } elsif ($row_pred == 4) { 5749 # Paeth reduces to up for first column 5750 $row[0] = ($prev_row->[0] + $row[0]) & 0xff; 5751 for my $i (1 .. $width-2) { 5752 my $a = $row[$i-1]; 5753 my $b = $prev_row->[$i]; 5754 my $c = $prev_row->[$i-1]; 5755 my $p = $a + $b - $c; 5756 my $pa = abs $p - $a; 5757 my $pb = abs $p - $b; 5758 my $pc = abs $p - $c; 5759 my $paeth = $pa <= $pb && $pa <= $pc ? $a : $pb <= $pc ? $b : $c; 5760 $row[$i] = ($paeth + $row[$i]) & 0xff; 5761 } 5762 } 5763 $newdata .= pack 'C*', @row; 5764 $prev_row = \@row; 5765 } 5766 ${$streamdata} = $newdata; 5767 return; 5768} 5769 5770=item $doc->encodeObject($objectnum, $filter) 5771 5772Apply the specified filter to the object. 5773 5774=cut 5775 5776sub encodeObject 5777{ 5778 my $self = shift; 5779 my $objnum = shift; 5780 my $filtername = shift; 5781 5782 my $objnode = $self->dereference($objnum); 5783 5784 $self->encodeOne($objnode->{value}, $filtername); 5785 return; 5786} 5787 5788=item $doc->encodeOne($object, $filter) 5789 5790Apply the specified filter to the object. 5791 5792=cut 5793 5794sub encodeOne ## no critic(Subroutines::ProhibitExcessComplexity) 5795{ 5796 my $self = shift; 5797 my $objnode = shift; 5798 my $filtername = shift; 5799 5800 my $changed = 0; 5801 5802 if ($objnode->{type} eq 'dictionary') 5803 { 5804 my $dict = $objnode->{value}; 5805 my $objnum = $objnode->{objnum}; 5806 my $gennum = $objnode->{gennum}; 5807 5808 if (! exists $dict->{StreamData}) 5809 { 5810 #warn "Object does not contain a Stream to encode\n"; 5811 return 0; 5812 } 5813 5814 if ($filtername eq 'LZWDecode' || $filtername eq 'LZW') 5815 { 5816 $filtername = 'FlateDecode'; 5817 warn "LZWDecode filter not supported for encoding. Using $filtername instead\n"; 5818 } 5819 my $filt = eval { 5820 require Text::PDF::Filter; 5821 my $pkg = "Text::PDF::$filtername"; 5822 $pkg->new; 5823 }; 5824 if (!$filt) 5825 { 5826 warn "Failed to open filter $filtername (Text::PDF::$filtername)\n"; 5827 return 0; 5828 } 5829 5830 my $l = $dict->{Length} || $dict->{L}; 5831 my $oldlength = $self->getValue($l); 5832 $dict->{StreamData}->{value} = $filt->outfilt($dict->{StreamData}->{value}, 1); 5833 my $length = length $dict->{StreamData}->{value}; 5834 5835 if (! defined $oldlength || $length != $oldlength) 5836 { 5837 if (defined $l && $l->{type} eq 'reference') 5838 { 5839 my $lenobj = $self->dereference($l->{value})->{value}; 5840 if ($lenobj->{type} ne 'number') 5841 { 5842 die "Expected length to be a reference to an object containing a number while encoding\n"; 5843 } 5844 $lenobj->{value} = $length; 5845 } 5846 elsif (!defined $l || $l->{type} eq 'number') 5847 { 5848 $dict->{Length} = CAM::PDF::Node->new('number', $length, $objnum, $gennum); 5849 delete $dict->{L}; 5850 } 5851 else 5852 { 5853 die "Unexpected type \"$l->{type}\" for Length while encoding.\n" . 5854 "(expected \"number\" or \"reference\")\n"; 5855 } 5856 } 5857 5858 # Record the filter 5859 my $newfilt = CAM::PDF::Node->new('label', $filtername, $objnum, $gennum); 5860 my $f = $dict->{Filter} || $dict->{F}; 5861 if (!defined $f) 5862 { 5863 $dict->{Filter} = $newfilt; 5864 delete $dict->{F}; 5865 } 5866 elsif ($f->{type} eq 'label') 5867 { 5868 $dict->{Filter} = CAM::PDF::Node->new('array', [ 5869 $newfilt, 5870 $f, 5871 ], 5872 $objnum, $gennum); 5873 delete $dict->{F}; 5874 } 5875 elsif ($f->{type} eq 'array') 5876 { 5877 unshift @{$f->{value}}, $newfilt; 5878 } 5879 else 5880 { 5881 die "Confused: Filter type is \"$f->{type}\", not the\n" . 5882 "expected \"array\" or \"label\"\n"; 5883 } 5884 5885 if ($dict->{DecodeParms} || $dict->{DP}) 5886 { 5887 die "Insertion of DecodeParms not yet supported...\n"; 5888 } 5889 5890 if ($objnum) 5891 { 5892 $self->{changes}->{$objnum} = 1; 5893 } 5894 $changed = 1; 5895 } 5896 return $changed; 5897} 5898 5899 5900=item $doc->setObjNum($object, $objectnum, $gennum) 5901 5902Descend into an object and change all of the INTERNAL object number 5903flags to a new number. This is just for consistency of internal 5904accounting. 5905 5906=cut 5907 5908sub setObjNum 5909{ 5910 my $self = shift; 5911 my $objnode = shift; 5912 my $objnum = shift; 5913 my $gennum = shift; 5914 5915 $self->traverse(0, $objnode, \&_setObjNumCB, [$objnum, $gennum]); 5916 return; 5917} 5918 5919# PRIVATE FUNCTION 5920 5921sub _setObjNumCB 5922{ 5923 my $self = shift; 5924 my $objnode = shift; 5925 my $nums = shift; 5926 5927 $objnode->{objnum} = $nums->[0]; 5928 $objnode->{gennum} = $nums->[1]; 5929 return; 5930} 5931 5932=item $doc->getRefList($object) 5933 5934I<For INTERNAL use> 5935 5936Return an array all of objects referred to in this object. 5937 5938=cut 5939 5940sub getRefList 5941{ 5942 my $self = shift; 5943 my $objnode = shift; 5944 5945 my $list = {}; 5946 $self->traverse(1, $objnode, \&_getRefListCB, $list); 5947 5948 return (sort keys %{$list}); 5949} 5950 5951# PRIVATE FUNCTION 5952 5953sub _getRefListCB 5954{ 5955 my $self = shift; 5956 my $objnode = shift; 5957 my $list = shift; 5958 5959 if ($objnode->{type} eq 'reference') 5960 { 5961 $list->{$objnode->{value}} = 1; 5962 } 5963 return; 5964} 5965 5966=item $doc->changeRefKeys($object, $hashref) 5967 5968I<For INTERNAL use> 5969 5970Renumber all references in an object. 5971 5972=cut 5973 5974sub changeRefKeys 5975{ 5976 my $self = shift; 5977 my $objnode = shift; 5978 my $newrefkeys = shift; 5979 my $traversed = shift; # optional 5980 5981 my $follow = shift || 0; # almost always false 5982 5983 $self->traverse($follow, $objnode, \&_changeRefKeysCB, $newrefkeys, $traversed); 5984 return; 5985} 5986 5987# PRIVATE FUNCTION 5988 5989sub _changeRefKeysCB 5990{ 5991 my $self = shift; 5992 my $objnode = shift; 5993 my $newrefkeys = shift; 5994 5995 if ($objnode->{type} eq 'reference') 5996 { 5997 if (exists $newrefkeys->{$objnode->{value}}) 5998 { 5999 $objnode->{value} = $newrefkeys->{$objnode->{value}}; 6000 } 6001 } 6002 return; 6003} 6004 6005=item $doc->abbrevInlineImage($object) 6006 6007Contract all image keywords to inline abbreviations. 6008 6009=cut 6010 6011sub abbrevInlineImage 6012{ 6013 my $self = shift; 6014 my $objnode = shift; 6015 6016 $self->traverse(0, $objnode, \&_abbrevInlineImageCB, {reverse %inlineabbrevs}); 6017 return; 6018} 6019 6020=item $doc->unabbrevInlineImage($object) 6021 6022Expand all inline image abbreviations. 6023 6024=cut 6025 6026sub unabbrevInlineImage 6027{ 6028 my $self = shift; 6029 my $objnode = shift; 6030 6031 $self->traverse(0, $objnode, \&_abbrevInlineImageCB, \%inlineabbrevs); 6032 return; 6033} 6034 6035# PRIVATE FUNCTION 6036 6037sub _abbrevInlineImageCB 6038{ 6039 my $self = shift; 6040 my $objnode = shift; 6041 my $convert = shift; 6042 6043 if ($objnode->{type} eq 'label') 6044 { 6045 my $new = $convert->{$objnode->{value}}; 6046 if (defined $new) 6047 { 6048 $objnode->{value} = $new; 6049 } 6050 } 6051 elsif ($objnode->{type} eq 'dictionary') 6052 { 6053 my $dict = $objnode->{value}; 6054 for my $key (keys %{$dict}) 6055 { 6056 my $new = $convert->{$key}; 6057 if (defined $new && $new ne $key) 6058 { 6059 $dict->{$new} = $dict->{$key}; 6060 delete $dict->{$key}; 6061 } 6062 } 6063 } 6064 return; 6065} 6066 6067=item $doc->changeString($object, $hashref) 6068 6069Alter all instances of a given string. The hashref is a dictionary of 6070from-string and to-string. If the from-string looks like C<regex(...)> 6071then it is interpreted as a Perl regular expression and is eval'ed. 6072Otherwise the search-and-replace is literal. 6073 6074=cut 6075 6076sub changeString 6077{ 6078 my $self = shift; 6079 my $objnode = shift; 6080 my $changelist = shift; 6081 6082 $self->traverse(0, $objnode, \&_changeStringCB, $changelist); 6083 return; 6084} 6085 6086# PRIVATE FUNCTION 6087 6088sub _changeStringCB 6089{ 6090 my $self = shift; 6091 my $objnode = shift; 6092 my $changelist = shift; 6093 6094 if ($objnode->{type} eq 'string') 6095 { 6096 for my $key (keys %{$changelist}) 6097 { 6098 if ($key =~ m/ \A regex[(](.*)[)] \z /xms) 6099 { 6100 my $regex = $1; 6101 my $res; 6102 my $eval_result = eval { 6103 $res = ($objnode->{value} =~ s/ $regex /$changelist->{$key}/gxms); 6104 1; 6105 }; 6106 if (!$eval_result) 6107 { 6108 die "Failed regex search/replace: $EVAL_ERROR\n"; 6109 } 6110 if ($res && $objnode->{objnum}) 6111 { 6112 $self->{changes}->{$objnode->{objnum}} = 1; 6113 } 6114 } 6115 else 6116 { 6117 if ($objnode->{value} =~ s/ $key /$changelist->{$key}/gxms && $objnode->{objnum}) 6118 { 6119 $self->{changes}->{$objnode->{objnum}} = 1; 6120 } 6121 } 6122 } 6123 } 6124 return; 6125} 6126 6127###################################################################### 6128 6129=back 6130 6131=head2 Utility functions 6132 6133=over 6134 6135=item $doc->rangeToArray($min, $max, $list...) 6136 6137Converts string lists of numbers to an array. For example, 6138 6139 CAM::PDF->rangeToArray(1, 15, '1,3-5,12,9', '14-', '8 - 6, -2'); 6140 6141becomes 6142 6143 (1,3,4,5,12,9,14,15,8,7,6,1,2) 6144 6145=cut 6146 6147sub rangeToArray 6148{ 6149 my ($pkg_or_doc, $min, $max, @range_parts) = @_; 6150 my @in_array = grep {defined $_} @range_parts; 6151 6152 for (@in_array) # modify in place 6153 { 6154 s/ [^\d\-,] //gxms; # clean 6155 } 6156 # split on numbers and ranges 6157 @in_array = map {m/ ([\d\-]+) /gxms} @in_array; 6158 6159 my @out_array; 6160 if (@in_array == 0) 6161 { 6162 @out_array = $min .. $max; 6163 } 6164 else 6165 { 6166 for (@in_array) 6167 { 6168 if (m/ (\d*)-(\d*) /xms) 6169 { 6170 my $aa = $1; 6171 my $bb = $2; 6172 if ($aa eq q{}) 6173 { 6174 $aa = $min-1; 6175 } 6176 if ($bb eq q{}) 6177 { 6178 $bb = $max+1; 6179 } 6180 6181 # Check if these are possible 6182 next if ($aa < $min && $bb < $min); 6183 next if ($aa > $max && $bb > $max); 6184 6185 if ($aa < $min) 6186 { 6187 $aa = $min; 6188 } 6189 if ($bb < $min) 6190 { 6191 $bb = $min; 6192 } 6193 if ($aa > $max) 6194 { 6195 $aa = $max; 6196 } 6197 if ($bb > $max) 6198 { 6199 $bb = $max; 6200 } 6201 6202 if ($aa > $bb) 6203 { 6204 push @out_array, reverse $bb .. $aa; 6205 } 6206 else 6207 { 6208 push @out_array, $aa .. $bb; 6209 } 6210 } 6211 elsif ($_ >= $min && $_ <= $max) 6212 { 6213 push @out_array, $_; 6214 } 6215 } 6216 } 6217 return @out_array; 6218} 6219 6220=item $doc->trimstr($string) 6221 6222Used solely for debugging. Trims a string to a max of 40 characters, 6223handling nulls and non-Unix line endings. 6224 6225=cut 6226 6227sub trimstr ## no critic (Unpack) 6228{ 6229 my $pkg_or_doc = shift; 6230 my $s = $_[0]; 6231 6232 my $pos = pos $_[0]; 6233 $pos ||= 0; 6234 6235 if (!defined $s || $s eq q{}) 6236 { 6237 $s = '(empty)'; 6238 } 6239 elsif (length $s > 40) 6240 { 6241 $s = (substr $s, $pos, 40) . '...'; 6242 } 6243 $s =~ s/ \r /^M/gxms; 6244 return $pos . q{ } . $s . "\n"; 6245} 6246 6247=item $doc->copyObject($node) 6248 6249Clones a node via Data::Dumper and eval(). 6250 6251=cut 6252 6253sub copyObject 6254{ 6255 my $self = shift; 6256 my $objnode = shift; 6257 6258 # replace $objnode with a copy of itself 6259 require Data::Dumper; 6260 my $d = Data::Dumper->new([$objnode],['objnode']); 6261 $d->Purity(1)->Indent(0); 6262 $objnode = eval $d->Dump(); ## no critic(ProhibitStringyEval) 6263 return $objnode; 6264} 6265 6266=item $doc->cacheObjects() 6267 6268Parses all object Nodes and stores them in the cache. This is useful 6269for cases where you intend to do some global manipulation and want all 6270of the data conveniently in RAM. 6271 6272=cut 6273 6274sub cacheObjects 6275{ 6276 my $self = shift; 6277 6278 for my $key (keys %{$self->{xref}}) 6279 { 6280 if (!exists $self->{objcache}->{$key}) 6281 { 6282 $self->{objcache}->{$key} = $self->dereference($key); 6283 } 6284 } 6285 return; 6286} 6287 6288=item $doc->asciify($string) 6289 6290Helper class/instance method to massage a string, cleaning up some 6291non-ASCII problems. This is a very incomplete list. Specifically: 6292 6293=over 6294 6295=item f-i ligatures 6296 6297=item (R) symbol 6298 6299=back 6300 6301=cut 6302 6303sub asciify 6304{ 6305 my $pkg_or_doc = shift; 6306 my $R_string = shift; # scalar reference 6307 6308 ## Heuristics: fix up some odd text characters: 6309 # f-i ligature 6310 ${$R_string} =~ s/ \223 /fi/gxms; 6311 # Registered symbol 6312 ${$R_string} =~ s/ \xae /(R)/gxms; 6313 return $pkg_or_doc; 6314} 6315 63161; 6317__END__ 6318 6319=back 6320 6321=head1 COMPATIBILITY 6322 6323This library was primarily developed against the 3rd edition of the 6324reference (PDF v1.4) with several important updates from 4th edition 6325(PDF v1.5). This library focuses most deeply on PDF v1.2 features. 6326Nonetheless, it should be forward and backward compatible in the 6327majority of cases. 6328 6329=head1 PERFORMANCE 6330 6331This module is written with good speed and flexibility in mind, often 6332at the expense of memory consumption. Entire PDF documents are 6333typically slurped into RAM. As an example, simply calling 6334C<new('PDFReference15_v15.pdf')> (the 13.5 MB Adobe PDF Reference V1.5 6335document) pushes Perl to consume 89 MB of RAM on my development 6336machine. 6337 6338=head1 SEE ALSO 6339 6340There are several other PDF modules on CPAN. Below is a brief 6341description of a few of them. If these comments are out of date, 6342please inform me. 6343 6344=over 6345 6346=item PDF::API2 6347 6348As of v0.46.003, LGPL license. 6349 6350This is the leading PDF library, in my opinion. 6351 6352Excellent text and font support. This is the highest level library of 6353the bunch, and is the most complete implementation of the Adobe PDF 6354spec. The author is amazingly responsive and patient. 6355 6356=item Text::PDF 6357 6358As of v0.25, Artistic license. 6359 6360Excellent compression support (CAM::PDF cribs off this Text::PDF 6361feature). This has not been developed since 2003. 6362 6363=item PDF::Reuse 6364 6365As of v0.32, Artistic/GPL license, like Perl itself. 6366 6367This library is not object oriented, so it can only process one PDF at 6368a time, while storing all data in global variables. I'm not fond of 6369it, but it's quite popular, so don't take my word for it! 6370 6371=back 6372 6373CAM::PDF is the only one of these that has regression tests. 6374Currently, CAM::PDF has test coverage of about 50%, as reported by 6375C<Build testcover>. 6376 6377Additionally, PDFLib is a commercial package not on CPAN 6378(L<www.pdflib.com>). It is a C-based library with a Perl interface. 6379It is designed for PDF creation, not for reuse. 6380 6381=head1 INTERNALS 6382 6383The data structure used to represent the PDF document is composed 6384primarily of a hierarchy of Node objects. Every node in the document 6385tree has this structure: 6386 6387 type => <type> 6388 value => <value> 6389 objnum => <object number> 6390 gennum => <generation number> 6391 6392where the <value> depends on the <type>, and <type> is one of 6393 6394 Type Value 6395 ---- ----- 6396 object Node 6397 stream byte string 6398 string byte string 6399 hexstring byte string 6400 number number 6401 reference integer (object number) 6402 boolean "true" | "false" 6403 label string 6404 array arrayref of Nodes 6405 dictionary hashref of (string => Node) 6406 null undef 6407 6408All of these except "stream" are directly related to the PDF data 6409types of the same name. Streams are treated as special cases in this 6410library since the have a non-general syntax and placement in the 6411document body. Internally, streams are very much like strings, except 6412that they have filters applied to them. 6413 6414All objects are referenced indirectly by their numbers, as defined in 6415the PDF document. In all cases, the dereference() function should be 6416used to deserialize objects into their internal representation. This 6417function is also useful for looking up named objects in the page model 6418metadata. Every node in the hierarchy contains its object and 6419generation number. You can think of this as a sort of a pointer back 6420to the root of each node tree. This serves in place of a "parent" 6421link for every node, which would be harder to maintain. 6422 6423The PDF document itself is represented internally as a hash reference 6424with many components, including the document content, the document 6425metadata (index, trailer and root node), the object cache, and several 6426other caches, in addition to a few assorted bookkeeping structures. 6427 6428The core of the document is represented in the object cache, which is 6429only populated as needed, thus avoiding the overhead of parsing the 6430whole document at read time. 6431 6432=head1 AUTHOR 6433 6434Chris Dolan 6435 6436This module was originally developed by me at Clotho Advanced Media 6437Inc. Now I maintain it in my spare time. 6438 6439=head1 ACKNOWLEDGMENTS 6440 6441Thanks to all the people who have submitted bug reports over the 6442years! I've belatedly started crediting people in the F<CHANGES> 6443file. Apologies to contributors I've overlooked... 6444 6445=cut 6446