1package CAM::PDF;
2
3use 5.006;
4use warnings;
5use strict;
6use Carp qw(croak cluck);
7use English qw(-no_match_vars);
8use CAM::PDF::Node;
9use CAM::PDF::Decrypt;
10
11our $VERSION = '1.60';
12
13## no critic(Bangs::ProhibitCommentedOutCode)
14## no critic(ControlStructures::ProhibitDeepNests)
15
16=for stopwords eval'ed CR-NL PDFLib defiltered prefill indices inline de-embedding 4th linearized viewable decrypted
17
18=head1 NAME
19
20CAM::PDF - PDF manipulation library
21
22=head1 LICENSE
23
24Copyright 2002-2006 Clotho Advanced Media, Inc., L<http://www.clotho.com/>
25
26Copyright 2007-2008 Chris Dolan
27
28This library is free software; you can redistribute it and/or modify it
29under the same terms as Perl itself.
30
31=head1 SYNOPSIS
32
33    use CAM::PDF;
34
35    my $pdf = CAM::PDF->new('test1.pdf');
36
37    my $page1 = $pdf->getPageContent(1);
38    [ ... mess with page ... ]
39    $pdf->setPageContent(1, $page1);
40    [ ... create some new content ... ]
41    $pdf->appendPageContent(1, $newcontent);
42
43    my $anotherpdf = CAM::PDF->new('test2.pdf');
44    $pdf->appendPDF($anotherpdf);
45
46    my @prefs = $pdf->getPrefs();
47    $prefs[$CAM::PDF::PREF_OPASS] = 'mypassword';
48    $prefs[$CAM::PDF::PREF_UPASS] = 'mypassword';
49    $pdf->setPrefs(@prefs);
50
51    $pdf->cleanoutput('out1.pdf');
52    print $pdf->toPDF();
53
54Many example programs are included in this distribution to do useful
55tasks.  See the C<bin> subdirectory.
56
57=head1 DESCRIPTION
58
59This package reads and writes any document that conforms to the PDF
60specification generously provided by Adobe at
61L<http://partners.adobe.com/public/developer/pdf/index_reference.html>
62(link last checked Oct 2005).
63
64The file format through PDF 1.5 is well-supported, with the exception
65of the "linearized" or "optimized" output format, which this module
66can read but not write.  Many specific aspects of the document model
67are not manipulable with this package (like fonts), but if the input
68document is correctly written, then this module will preserve the
69model integrity.
70
71The PDF writing feature saves as PDF 1.4-compatible.  That means that
72we cannot write compressed object streams.  The consequence is that
73reading and then writing a PDF 1.5+ document may enlarge the resulting
74file by a fair margin.
75
76This library grants you some power over the PDF security model.  Note
77that applications editing PDF documents via this library MUST respect
78the security preferences of the document.  Any violation of this
79respect is contrary to Adobe's intellectual property position, as
80stated in the reference manual at the above URL.
81
82Technical detail regarding corrupt PDFs: This library adheres strictly
83to the PDF specification.  Adobe's Acrobat Reader is more lenient,
84allowing some corrupted PDFs to be viewable.  Therefore, it is
85possible that some PDFs may be readable by Acrobat that are illegible
86to this library.  In particular, files which have had line endings
87converted to or from DOS/Windows style (i.e. CR-NL) may be rendered
88unusable even though Acrobat does not complain.  Future library
89versions may relax the parser, but not yet.
90
91=cut
92
93our $PREF_OPASS  = 0;
94our $PREF_UPASS  = 1;
95our $PREF_PRINT  = 2;
96our $PREF_MODIFY = 3;
97our $PREF_COPY   = 4;
98our $PREF_ADD    = 5;
99
100our $MAX_STRING  = 65;  # length of output string
101
102my %filterabbrevs = (
103                     AHx => 'ASCIIHexDecode',
104                     A85 => 'ASCII85Decode',
105                     CCF => 'CCITTFaxDecode',
106                     DCT => 'DCTDecode',
107                     Fl  => 'FlateDecode',
108                     LZW => 'LZWDecode',
109                     RL  => 'RunLengthDecode',
110                     );
111
112my %inlineabbrevs = (
113                     %filterabbrevs,
114                     BPC => 'BitsPerComponent',
115                     CS  => 'ColorSpace',
116                     D   => 'Decode',
117                     DP  => 'DecodeParms',
118                     F   => 'Filter',
119                     H   => 'Height',
120                     IM  => 'ImageMask',
121                     I   => 'Interpolate',
122                     W   => 'Width',
123                     CMYK => 'DeviceCMYK',
124                     G   => 'DeviceGray',
125                     RGB => 'DeviceRGB',
126                     I   => 'Indexed',
127                     );
128
129=head1 API
130
131=head2 Functions intended to be used externally
132
133 $self = CAM::PDF->new(content | filename | '-')
134 $self->toPDF()
135 $self->needsSave()
136 $self->save()
137 $self->cleansave()
138 $self->output(filename | '-')
139 $self->cleanoutput(filename | '-')
140 $self->previousRevision()
141 $self->allRevisions()
142 $self->preserveOrder()
143 $self->appendObject(olddoc, oldnum, [follow=(1|0)])
144 $self->replaceObject(newnum, olddoc, oldnum, [follow=(1|0)])
145    (olddoc can be undef in the above for adding new objects)
146 $self->numPages()
147 $self->getPageText(pagenum)
148 $self->getPageDimensions(pagenum)
149 $self->getPageContent(pagenum)
150 $self->setPageContent(pagenum, content)
151 $self->appendPageContent(pagenum, content)
152 $self->deletePage(pagenum)
153 $self->deletePages(pagenum, pagenum, ...)
154 $self->extractPages(pagenum, pagenum, ...)
155 $self->appendPDF(CAM::PDF object)
156 $self->prependPDF(CAM::PDF object)
157 $self->wrapString(string, width, fontsize, page, fontlabel)
158 $self->getFontNames(pagenum)
159 $self->addFont(page, fontname, fontlabel, [fontmetrics])
160 $self->deEmbedFont(page, fontname, [newfontname])
161 $self->deEmbedFontByBaseName(page, basename, [newfont])
162 $self->getPrefs()
163 $self->setPrefs()
164 $self->canPrint()
165 $self->canModify()
166 $self->canCopy()
167 $self->canAdd()
168 $self->getFormFieldList()
169 $self->fillFormFields(fieldname, value, [fieldname, value, ...])
170   or $self->fillFormFields(%values)
171 $self->clearFormFieldTriggers(fieldname, fieldname, ...)
172
173Note: 'clean' as in cleansave() and cleanobject() means write a fresh
174PDF document.  The alternative (e.g. save()) reuses the existing doc
175and just appends to it.  Also note that 'clean' functions sort the
176objects numerically.  If you prefer that the new PDF docs more closely
177resemble the old ones, call preserveOrder() before cleansave() or
178cleanobject().
179
180=head2 Slightly less external, but useful, functions
181
182 $self->toString()
183 $self->getPage(pagenum)
184 $self->getFont(pagenum, fontname)
185 $self->getFonts(pagenum)
186 $self->getStringWidth(fontdict, string)
187 $self->getFormField(fieldname)
188 $self->getFormFieldDict(object)
189 $self->isLinearized()
190 $self->decodeObject(objectnum)
191 $self->decodeAll(any-node)
192 $self->decodeOne(dict-node)
193 $self->encodeObject(objectnum, filter)
194 $self->encodeOne(any-node, filter)
195 $self->changeString(obj-node, hashref)
196
197=head2 Deeper utilities
198
199 $self->pageAddName(pagenum, name, objectnum)
200 $self->getPageObjnum(pagenum)
201 $self->getPropertyNames(pagenum)
202 $self->getProperty(pagenum, propname)
203 $self->getValue(any-node)
204 $self->dereference(objectnum)  or $self->dereference(name,pagenum)
205 $self->deleteObject(objectnum)
206 $self->copyObject(obj-node)
207 $self->cacheObjects()
208 $self->setObjNum(obj-node, num)
209 $self->getRefList(obj-node)
210 $self->changeRefKeys(obj-node, hashref)
211
212=head2 More rarely needed utilities
213
214 $self->getObjValue(objectnum)
215
216=head2 Routines that should not be called
217
218 $self->_startdoc()
219 $self->delinearlize()
220 $self->build*()
221 $self->parse*()
222 $self->write*()
223 $self->*CB()
224 $self->traverse()
225 $self->fixDecode()
226 $self->abbrevInlineImage()
227 $self->unabbrevInlineImage()
228 $self->cleanse()
229 $self->clean()
230 $self->createID()
231
232
233=head1 FUNCTIONS
234
235=head2 Object creation/manipulation
236
237=over
238
239=item $doc->new($package, $content)
240
241=item $doc->new($package, $content, $ownerpass, $userpass)
242
243=item $doc->new($package, $content, $ownerpass, $userpass, $prompt)
244
245=item $doc->new($package, $content, $ownerpass, $userpass, $options)
246
247Instantiate a new CAM::PDF object.  C<$content> can be a document in a
248string, a filename, or '-'.  The latter indicates that the document
249should be read from standard input.  If the document is password
250protected, the passwords should be passed as additional arguments.  If
251they are not known, a boolean C<$prompt> argument allows the programmer to
252suggest that the constructor prompt the user for a password.  This is
253rudimentary prompting: passwords are in the clear on the console.
254
255This constructor takes an optional final argument which is a hash
256reference.  This hash can contain any of the following optional
257parameters:
258
259=over
260
261=item prompt_for_password => $boolean
262
263This is the same as the C<$prompt> argument described above.
264
265=item fault_tolerant => $boolean
266
267This flag causes the instance to be more lenient when reading the
268input PDF.  Currently, this only affects PDFs which cannot be
269successfully decrypted.
270
271=back
272
273=cut
274
275sub new  ## no critic(Subroutines::ProhibitExcessComplexity, Unpack)
276{
277   my $pkg = shift;
278   my $content = shift;  # or a filename
279   # Optional args:
280   my $opassword = shift;
281   my $upassword = shift;
282   my $options;
283   # Backward compatible support for prompt flag as final argument
284   if (ref $_[0])
285   {
286      $options = shift;
287      if ((ref $options) ne 'HASH')
288      {
289         croak 'Options must be a hash reference';
290      }
291   }
292   else
293   {
294      $options = {
295         prompt_for_password => shift,
296      };
297   }
298
299
300   my $pdfversion = '1.2';
301   if ($content =~ m/ \A%PDF-([\d.]+) /xms)
302   {
303      my $ver = $1;
304      if ($ver && $ver > $pdfversion)
305      {
306         $pdfversion = $ver;
307      }
308   }
309   else
310   {
311      if (1024 > length $content)
312      {
313         my $file = $content;
314         if ($file eq q{-})
315         {
316            $content = q{};
317            my $offset = 0;
318            my $step = 4096;
319            binmode STDIN; ##no critic (Syscalls)
320            while ($step == read STDIN, $content, $step, $offset)
321            {
322               $offset += $step;
323            }
324         }
325         else
326         {
327            if (open my $fh, '<', $file)
328            {
329               binmode $fh; ##no critic (Syscalls)
330               my $size = -s $file;
331               if ($size != read $fh, $content, $size) {
332                  $CAM::PDF::errstr = "Failed to read $file bytes\n";
333                  return;
334               }
335               if (!close $fh) {
336                  $CAM::PDF::errstr = "Failed to close reading $file\n";
337                  return;
338               }
339            }
340            else
341            {
342               $CAM::PDF::errstr = "Failed to open $file: $ERRNO\n";
343               return;
344            }
345         }
346      }
347      if ($content =~ m/ \A%PDF-([\d.]+) /xms)
348      {
349         my $ver = $1;
350         if ($ver && $ver > $pdfversion)
351         {
352            $pdfversion = $ver;
353         }
354      }
355      else
356      {
357         $CAM::PDF::errstr = "Content does not begin with \"%PDF-\"\n";
358         return;
359      }
360   }
361   #warn "got pdfversion $pdfversion\n";
362
363   my $self = {
364      options => $options,
365
366      pdfversion => $pdfversion,
367      maxstr => $CAM::PDF::MAX_STRING,  # length of output string
368      content => $content,
369      contentlength => length $content,
370      xref => {},
371      maxobj => 0,
372      changes => {},
373      versions => {},
374
375      # Caches:
376      objcache => {},
377      pagecache => {},
378      formcache => {},
379      Names => {},
380      NameObjects => {},
381      fontmetrics => {},
382   };
383   bless $self, $pkg;
384   if (!$self->_startdoc())
385   {
386      return;
387   }
388   if ($self->{trailer}->{ID})
389   {
390      my $id = $self->getValue($self->{trailer}->{ID});
391      if (ref $id)
392      {
393         my $accum = q{};
394         for my $objnode (@{$id})
395         {
396            $accum .= $self->getValue($objnode);
397         }
398         $id = $accum;
399      }
400      $self->{ID} = $id;
401   }
402
403   $self->{crypt} = CAM::PDF::Decrypt->new($self, $opassword, $upassword,
404                                          $self->{options}->{prompt_for_password});
405   if (!$self->{crypt} && !$self->{options}->{fault_tolerant})
406   {
407      return;
408   }
409
410   return $self;
411}
412
413=item $doc->toPDF()
414
415Serializes the data structure as a PDF document stream and returns as
416in a scalar.
417
418=cut
419
420sub toPDF
421{
422   my $self = shift;
423
424   if ($self->needsSave())
425   {
426      $self->cleansave();
427   }
428   return $self->{content};
429}
430
431=item $doc->toString()
432
433Returns a serialized representation of the data structure.
434Implemented via Data::Dumper.
435
436=cut
437
438sub toString  ## no critic (Unpack)
439{
440   my $self = shift;
441   my @skip = @_ == 0 ? qw(content) : @_;
442
443   my %hold;
444   for my $key (@skip)
445   {
446      $hold{$key} = delete $self->{$key};
447   }
448
449   require Data::Dumper;
450   my $result = Data::Dumper->Dump([$self], [qw(doc)]);
451
452   for my $key (keys %hold)
453   {
454      $self->{$key} = $hold{$key};
455   }
456   return $result;
457}
458
459################################################################################
460
461=back
462
463=head2 Document reading
464
465(all of these functions are intended for internal only)
466
467=over
468
469=cut
470
471
472# PRIVATE METHOD
473# read the document index and some metadata.
474
475sub _startdoc
476{
477   my $self = shift;
478
479   ### Parse the document metadata
480
481   # Start by parsing out the location of the last xref block
482
483   # Implementation note: The PDF spec says "The last line of the file
484   # contains only the end-of-file marker, %%EOF." but it also says
485   # "Acrobat viewers require only that the %%EOF marker appear
486   # somewhere within the last 1024 bytes of the file."
487   # So, we follow the latter more lenient rule.
488
489   my $doc_length = length $self->{content};
490   my $startxref;
491   if ($doc_length > 1024)
492   {
493      # The initial ".*" is for the unlikely case that there are two "startxref" statements in the last 1024 bytes
494      ($startxref) = (substr $self->{content}, $doc_length - 1024, 1024) =~ m/ .* startxref\s*(\d+)\s*%%EOF.*?\z /xms;
495   }
496   else
497   {
498      ($startxref) = $self->{content} =~ m/ .* startxref\s*(\d+)\s*%%EOF.*?\z /xms;
499   }
500
501   if (!$startxref)
502   {
503      $CAM::PDF::errstr = "Cannot find the index in the PDF content\n";
504      return;
505   }
506
507   # Parse the hierarchy of xref blocks
508   $self->{startxref} = $startxref;
509   my @objstreamrefs;
510   $self->{trailer} = $self->_buildxref($self->{startxref}, $self->{xref}, $self->{versions}, \@objstreamrefs);
511   if (!defined $self->{trailer})
512   {
513      return;
514   }
515   $self->_buildendxref();
516   for my $objstreamref (@objstreamrefs)
517   {
518      if (!$self->_index_objstream($objstreamref))
519      {
520         return;
521      }
522   }
523
524   ### Cache some page content descriptors
525
526   # Get the document root catalog
527   if (!exists $self->{trailer}->{Root})
528   {
529      $CAM::PDF::errstr = "No root node present in PDF trailer.\n";
530      return;
531   }
532   my $root = $self->getRootDict();
533   if (!$root || (ref $root) ne 'HASH')
534   {
535      $CAM::PDF::errstr = "The PDF root node is not a dictionary.\n";
536      return;
537   }
538
539   # Get the root of the page tree
540   if (!exists $root->{Pages})
541   {
542      $CAM::PDF::errstr = "The PDF root node doesn't have a reference to the page tree.\n";
543      return;
544   }
545   my $pages = $self->getPagesDict();
546   if (!$pages || (ref $pages) ne 'HASH')
547   {
548      $CAM::PDF::errstr = "The PDF page tree root is not a dictionary.\n";
549      return;
550   }
551
552   # Get the number of pages in the document
553   $self->{PageCount} = $self->getValue($pages->{Count});
554   if (!$self->{PageCount} || $self->{PageCount} < 1)
555   {
556      $CAM::PDF::errstr = "Bad number of pages in PDF document\n";
557      return;
558   }
559
560   return 1;
561}
562
563# PRIVATE FUNCTION
564#  read document index
565
566sub _buildxref
567{
568   my $self = shift;
569   my $startxref = shift;
570   my $index = shift;
571   my $versions = shift;
572   my $objstreamrefs = shift;
573
574   my $trailer;
575   if ('xref' eq substr $self->{content}, $startxref, 4)
576   {
577      $trailer = $self->_buildxref_pdf14($startxref, $index, $versions);
578      if ($trailer && exists $trailer->{XRefStm})
579      {
580         if (!$self->_buildxref_pdf15($trailer->{XRefStm}->{value}, $index, $versions, $objstreamrefs))
581         {
582            return;
583         }
584      }
585   }
586   else
587   {
588      $trailer = $self->_buildxref_pdf15($startxref, $index, $versions, $objstreamrefs);
589   }
590
591   if ($trailer && exists $trailer->{Prev})
592   {
593      if (!$self->_buildxref($trailer->{Prev}->{value}, $index, $versions, $objstreamrefs))
594      {
595         return;
596      }
597   }
598
599   return $trailer;
600}
601
602# Just for debugging
603sub __dump_binary_stream {
604   my $stream = shift;
605
606   my @b = unpack 'C*', $stream;
607   print '       0 1 2 3  4 5 6 7  8 9 a b  c d e f';
608   for my $i (0 .. $#b) {
609      if (0 == $i % 15) {
610         printf "\n%04x: ", $i;
611      } elsif (0 == $i % 3) {
612         print q{ };
613      }
614      printf '%02x', $b[$i];
615   }
616   print "\n";
617
618   return;
619}
620
621sub _buildxref_pdf15
622{
623   my $self = shift;
624   my $startxref = shift;
625   my $index = shift;
626   my $versions = shift;
627   my $objstreamrefs = shift;
628
629   my ($trailer, $stream) = $self->_buildxref_pdf15_getstream($startxref);
630   if (!$trailer) {
631      return;
632   }
633
634   #__dump_binary_stream($stream);
635
636   my @byte_pattern = map {$_->{value}} @{$trailer->{W}->{value}};
637   my $entry_size = $byte_pattern[0] + $byte_pattern[1] + $byte_pattern[2];
638   #print STDOUT "pack: [@byte_pattern] => total size $entry_size\n";
639
640   my @objstreamrefs;
641   {
642      my @pairs = (0, $trailer->{Size}->{value});
643      if (exists $trailer->{Index})
644      {
645         @pairs = map {$_->{value}} @{$trailer->{Index}->{value}};
646      }
647      #print STDOUT "Pairs: (Index,Size)=@pairs; size of stream=",length($stream)," ?= $entry_size x ",(length($stream)/$entry_size),"\n";
648
649      my $i = 0;
650      while (@pairs)
651      {
652         my $start = shift @pairs;
653         my $len = shift @pairs;
654         my $end = $start + $len;
655         for my $objnum ($start .. $end - 1)
656         {
657            my @byte = unpack 'C*', substr $stream, $i++ * $entry_size, $entry_size;
658            my %values = (type => 1, major => 0, minor => 0);
659            my @w = @byte_pattern;
660            my $pos = 0;
661            my $w = 0;
662            for my $key (qw(type major minor)) {
663               $w += shift @w;
664               if ($w > $pos) {
665                  my $val = 0;
666                  for (; $pos < $w; ++$pos) {  ## no critic (ProhibitCStyleForLoops)
667                      $val = ($val << 8) + $byte[$pos];
668                  }
669                  $values{$key} = $val;
670               }
671            }
672
673            next if (exists $index->{$objnum}); # keep only latest revision
674
675            #my %strs = (
676            #   0 => {str=>'free', major=>'objnext', minor=>'gennum'},
677            #   1 => {str=>'raw', major=>'byte', minor=>'gennum'},
678            #   2 => {str=>'zip', major=>'stream', minor=>'index'},
679            #);
680            #my $type_def = $strs{$values{type}};
681            #my $type_str = $type_def ? $type_def->{str} : 'unk';
682            #my $major_str = $type_def ? $type_def->{major} : 'unk';
683            #my $minor_str = $type_def ? $type_def->{minor} : 'unk';
684            #print STDOUT "xref $objnum = $values{type}=$type_str $major_str=$values{major}(",
685            #     sprintf('%04x',$values{major}),") $minor_str=$values{minor}(",
686            #     sprintf('%02x',$values{minor}),")\n";
687
688            # Ignore type 0
689            if ($values{type} == 1)
690            {
691               $index->{$objnum} = $values{major};
692               $versions->{$objnum} = $values{minor};
693            }
694            elsif ($values{type} == 2)
695            {
696               push @{$objstreamrefs}, {objnum => $objnum, streamnum => $values{major}, indx => $values{minor}};
697               $index->{$objnum} = {}; # will be overwritten later
698               $versions->{$objnum} = 0;
699            }
700         }
701         if ($end - 1 > $self->{maxobj})
702         {
703            $self->{maxobj} = $end - 1;
704         }
705      }
706   }
707   return $trailer;
708}
709
710sub _buildxref_pdf15_getstream
711{
712   my $self = shift;
713   my $startxref = shift;
714
715   # Don't slurp in the whole file
716   my $chunk_size = 1024;
717   my @content = (substr $self->{content}, $startxref, $chunk_size);
718   # warning: this doesn't account for the case where "endobj" crosses a 1024-byte boundary
719   # instead, we hit the end of file and find it after concatenation -- a hack but it works
720   while ($content[-1] && $content[-1] !~ m/endobj/xms) {
721      my $offset = $startxref + $chunk_size * @content;
722      if ($offset >= length $self->{content}) {
723         # end of file
724         last;
725      }
726      push @content, substr $self->{content}, $offset, $chunk_size;
727   }
728   my $content = join q{}, @content;
729
730   my $xrefstream = $self->parseObj(\$content);
731   if (!$xrefstream)
732   {
733      $CAM::PDF::errstr = 'Failed to locate the xref stream';
734      return;
735   }
736   my $trailer = $xrefstream->{value}->{value}; # dict hash
737   if (!$trailer)
738   {
739      $CAM::PDF::errstr = 'Invalid xref stream: no trailer';
740      return;
741   }
742   if ('HASH' ne ref $trailer)
743   {
744      $CAM::PDF::errstr = 'Invalid xref stream: trailer is not a dictionary';
745      return;
746   }
747   #print STDOUT "Trailer: @{[sort keys %{$trailer}]}, $trailer->{Type}->{value}\n";
748   if (!exists $trailer->{Type} || 'XRef' ne $trailer->{Type}->{value})
749   {
750      $CAM::PDF::errstr = 'Invalid xref stream: type is not XRef';
751      return;
752   }
753
754   my $stream = $self->decodeOne($xrefstream->{value});
755   if (!$stream)
756   {
757      $CAM::PDF::errstr = 'Invalid xref stream: could not decode the stream';
758      return;
759   }
760
761   return ($trailer, $stream);
762}
763
764sub _index_objstream
765{
766   my $self = shift;
767   my $objstreamref = shift;
768
769   my $objstream = $self->dereference($objstreamref->{streamnum});
770   if (!$objstream)
771   {
772      $CAM::PDF::errstr = 'Failed to read object stream ' . $objstreamref->{streamnum};
773      return;
774   }
775   if ($objstream->{_indexed}++)
776   {
777      return 1;
778   }
779   my $stream = $self->decodeOne($objstream->{value});
780   if (!$stream)
781   {
782      $CAM::PDF::errstr = 'Invalid xref stream: could not decode objstream ' . $objstreamref->{streamnum};
783      return;
784   }
785   my $dict = $objstream->{value}->{value};
786   my $n = $dict->{N}->{value};
787   my $first = $dict->{First}->{value};
788   my $lookup = substr $stream, 0, $first;
789   my @objs;
790   my $streamholder = {stream => $stream};
791   for my $i (0 .. $n-1)
792   {
793      if ($lookup =~ m/\G\s*(\d+)\s+(\d+)/cgxms)
794      {
795         my ($objnum, $offset) = ($1, $2);
796         my $pos = {objstream => $streamholder, start => $first + $offset};
797         push @objs, $pos;
798         if (exists $self->{xref}->{$objnum}) {
799            # keep only latest revision
800            next if !ref $self->{xref}->{$objnum};
801            next if $self->{xref}->{$objnum}->{objstream};
802         }
803         #print "objnum $objnum at pos $offset of objstream $objstreamref->{streamnum}\n";
804         $self->{xref}->{$objnum} = $pos;
805         $self->{versions}->{$objnum} = 0;
806      }
807      else
808      {
809         $CAM::PDF::errstr = 'Failed to read the objstream index for ' . $objstreamref->{streamnum};
810         return;
811      }
812   }
813   for my $i (0 .. $#objs-1)
814   {
815      $objs[$i]->{end} = $objs[$i+1]->{start};
816   }
817   $objs[-1]->{end} = length $stream;
818
819   return 1;
820}
821
822sub _buildxref_pdf14
823{
824   my $self = shift;
825   my $startxref = shift;
826   my $index = shift;
827   my $versions = shift;
828
829   my $trailerpos = index $self->{content}, 'trailer', $startxref;
830
831   # Workaround for Perl 5.6.1 bug
832   if ($trailerpos > 0 && $trailerpos < $startxref)
833   {
834      my $xrefstr = substr $self->{content}, $startxref;
835      $trailerpos = $startxref + index $xrefstr, 'trailer';
836   }
837
838   my $end = substr $self->{content}, $startxref, $trailerpos-$startxref;
839
840   if ($end !~ s/ \A xref\s+ //xms)
841   {
842      my $len = length $end;
843      $CAM::PDF::errstr = "Could not find PDF cross-ref table at location $startxref/$trailerpos/$len\n" . $self->trimstr($end);
844      return;
845   }
846   my $part = 0;
847   while ($end =~ s/ \A (\d+)\s+(\d+)\s+ //xms)
848   {
849      my $s = $1;
850      my $n = $2;
851
852      $part++;
853      for my $i (0 .. $n-1)
854      {
855         my $objnum = $s+$i;
856         next if (exists $index->{$objnum});
857
858         my $row = substr $end, $i*20, 20;
859         my ($indexnum, $version, $type) = $row =~ m/ \A (\d{10}) [ ] (\d{5}) [ ] (\w) /xms;
860         if (!$indexnum)
861         {
862            $CAM::PDF::errstr = "Could not decipher xref row:\n" . $self->trimstr($row);
863            return;
864         }
865         if ($type eq 'n')
866         {
867            if ($indexnum != 0) # if the index says it's at byte zero, pretend it's an 'f' instead of an 'n'
868            {
869               $index->{$objnum} = $indexnum;
870               $versions->{$objnum} = $version;
871            }
872         }
873         if ($objnum > $self->{maxobj})
874         {
875            $self->{maxobj} = $objnum;
876         }
877      }
878
879      $end = substr $end, 20*$n;
880   }
881
882   my $sxrefpos = index $self->{content}, 'startxref', $trailerpos;
883   if ($sxrefpos > 0 && $sxrefpos < $trailerpos)  # workaround for 5.6.1 bug
884   {
885      my $tail = substr $self->{content}, $trailerpos;
886      $sxrefpos = $trailerpos + index $tail, 'startxref';
887   }
888   $end = substr $self->{content}, $trailerpos, $sxrefpos-$trailerpos;
889
890   if ($end !~ s/ \A trailer\s* //xms)
891   {
892      $CAM::PDF::errstr = "Did not find expected trailer block after xref\n" . $self->trimstr($end);
893      return;
894   }
895   my $trailer = $self->parseDict(\$end)->{value};
896   return $trailer;
897}
898
899# PRIVATE FUNCTION
900# _buildendxref -- compute the end of each object
901#    note that this is not always the *actual* end of the object, but
902#    we guarantee that the object will end at or before this point.
903
904sub _buildendxref
905{
906   my $self = shift;
907
908   my $x = $self->{xref}; # shorthand
909   # make a list of objnums sorted by file position.  Ignore objects inside objstreams.
910   my @keys = sort {$x->{$a} <=> $x->{$b}} grep {!ref $x->{$_}} keys %{$x};
911
912   my $r = {};
913   for my $i (0 .. $#keys-1)
914   {
915      # set the end of each object to be the beginning of the next object
916      $r->{$keys[$i]} = $x->{$keys[$i+1]};
917   }
918   # The end of the last object is the end of the file
919   $r->{$keys[-1]} = $self->{contentlength};
920
921   $self->{endxref} = $r;
922   return;
923}
924
925# PRIVATE FUNTION
926# _buildNameTable -- descend into the page tree and extract all XObject
927# and Font name references.
928
929sub _buildNameTable
930{
931   my $self = shift;
932   my $pagenum = shift;
933
934   if (!$pagenum || $pagenum eq 'All')   # Build the ENTIRE name table
935   {
936      $self->cacheObjects();
937      for my $p (1 .. $self->{PageCount})
938      {
939         $self->_buildNameTable($p);
940      }
941      my %n;
942      for my $objnode (values %{$self->{objcache}})
943      {
944         if ($objnode->{value}->{type} eq 'dictionary')
945         {
946            my $dict = $objnode->{value}->{value};
947            if ($dict->{Name})
948            {
949               $n{$dict->{Name}->{value}} = CAM::PDF::Node->new('reference', $objnode->{objnum});
950            }
951         }
952      }
953      $self->{Names}->{All} = {%n};
954      return;
955   }
956
957   return if (exists $self->{Names}->{$pagenum});
958
959   my %n;
960   my $page = $self->getPage($pagenum);
961   while ($page)
962   {
963      my $objnum = $self->getPageObjnum($pagenum);
964      if (exists $page->{Resources})
965      {
966         my $r = $self->getValue($page->{Resources});
967         for my $key ('XObject', 'Font')
968         {
969            if (exists $r->{$key})
970            {
971               my $x = $self->getValue($r->{$key});
972               if ((ref $x) eq 'HASH')
973               {
974                  %n = (%{$x}, %n);
975               }
976            }
977         }
978      }
979
980      # Inherit from parent
981      $page = $page->{Parent};
982      if ($page)
983      {
984         $page = $self->getValue($page);
985      }
986   }
987
988   $self->{Names}->{$pagenum} = {%n};
989   return;
990}
991
992=item $doc->getRootDict()
993
994Returns the Root dictionary for the PDF.
995
996=cut
997
998sub getRootDict
999{
1000   my $self = shift;
1001
1002   return $self->getValue($self->{trailer}->{Root});
1003}
1004
1005=item $doc->getPagesDict()
1006
1007Returns the root Pages dictionary for the PDF.
1008
1009=cut
1010
1011sub getPagesDict
1012{
1013   my $self = shift;
1014
1015   return $self->getValue($self->getRootDict()->{Pages});
1016}
1017
1018=item $doc->parseObj($string)
1019
1020Use parseAny() instead of this, if possible.
1021
1022Given a fragment of PDF page content, parse it and return an object
1023Node.  This can be called as a class method in most circumstances, but
1024is intended as an instance method.
1025
1026=cut
1027
1028sub parseObj
1029{
1030   my $self = shift;
1031   my $c = shift;
1032
1033   if (${$c} !~ m/ \G\s*(\d+)\s+(\d+)\s+obj\s* /cgxms) ##no critic(ProhibitUnusedCapture)
1034   {
1035      die "Expected object open tag\n" . $self->trimstr(${$c});
1036   }
1037   # need to implement like this with explicit capture vars for 5.6.1
1038   # compatibility
1039   my ($objnum, $gennum) = ($1, $2); ##no critic(ProhibitCaptureWithoutTest)
1040   $objnum = int $objnum;
1041   $gennum = int $gennum;
1042
1043   my $objnode;
1044   if (${$c} =~ m/ \G(.*?)endobj\s* /cgxms)
1045   {
1046      my $string = $1;
1047      $objnode = $self->parseAny(\$string, $objnum, $gennum);
1048      if ($string =~ m/ \Gstream /xms)
1049      {
1050         if ($objnode->{type} ne 'dictionary')
1051         {
1052            die "Found an object stream without a preceding dictionary\n" . $self->trimstr(${$c});
1053         }
1054         $objnode->{value}->{StreamData} = $self->parseStream(\$string, $objnum, $gennum, $objnode->{value});
1055      }
1056   }
1057   else
1058   {
1059      die "Expected endobj\n" . $self->trimstr(${$c});
1060   }
1061   return CAM::PDF::Node->new('object', $objnode, $objnum, $gennum);
1062}
1063
1064
1065=item $doc->parseInlineImage($string)
1066
1067=item $doc->parseInlineImage($string, $objnum)
1068
1069=item $doc->parseInlineImage($string, $objnum, $gennum)
1070
1071Given a fragment of PDF page content, parse it and return an object
1072Node.  This can be called as a class method in some cases, but
1073is intended as an instance method.
1074
1075=cut
1076
1077sub parseInlineImage
1078{
1079   my $self = shift;
1080   my $c = shift;
1081   my $objnum = shift;
1082   my $gennum = shift;
1083
1084   if (${$c} !~ m/ \GBI\b /xms)
1085   {
1086      die "Expected inline image open tag\n" . $self->trimstr(${$c});
1087   }
1088   my $dict = $self->parseDict($c, $objnum, $gennum, 'BI\\b\\s*', 'ID\\b');
1089   $self->unabbrevInlineImage($dict);
1090   $dict->{value}->{Type} = CAM::PDF::Node->new('label', 'XObject', $objnum, $gennum);
1091   $dict->{value}->{Subtype} = CAM::PDF::Node->new('label', 'Image', $objnum, $gennum);
1092   $dict->{value}->{StreamData} = $self->parseStream($c, $objnum, $gennum, $dict->{value},
1093                                                     qr/ \s* /xms, qr/ \s*EI(?!\S) /xms);
1094   ${$c} =~ m/ \G\s+ /cgxms;
1095
1096   return CAM::PDF::Node->new('object', $dict, $objnum, $gennum);
1097}
1098
1099
1100=item $doc->writeInlineImage($objectnode)
1101
1102This is the inverse of parseInlineImage(), intended for use only in
1103the CAM::PDF::Content class.
1104
1105=cut
1106
1107sub writeInlineImage
1108{
1109   my $self = shift;
1110   my $objnode = shift;
1111
1112   # Make a copy since we are going to trash the image
1113   my $dictobj = $self->copyObject($objnode)->{value};
1114
1115   my $dict = $dictobj->{value};
1116   delete $dict->{Type};
1117   delete $dict->{Subtype};
1118   my $stream = $dict->{StreamData}->{value};
1119   delete $dict->{StreamData};
1120   $self->abbrevInlineImage($dictobj);
1121
1122   my $str = $self->writeAny($dictobj);
1123   $str =~ s/ \A <<    /BI /xms;
1124   $str =~ s/    >> \z / ID/xms;
1125   $str .= "\n" . $stream . "\nEI";
1126   return $str;
1127}
1128
1129=item $doc->parseStream($string, $objnum, $gennum, $dictnode)
1130
1131This should only be used by parseObj(), or other specialized cases.
1132
1133Given a fragment of PDF page content, parse it and return a stream
1134Node.  This can be called as a class method in most circumstances, but
1135is intended as an instance method.
1136
1137The dictionary Node argument is typically the body of the object Node
1138that precedes this stream.
1139
1140=cut
1141
1142sub parseStream
1143{
1144   my $self   = shift;
1145   my $c      = shift;
1146   my $objnum = shift;
1147   my $gennum = shift;
1148   my $dict   = shift;
1149
1150   my $begin = shift || qr/ stream[ \t]*\r?\n /xms;
1151   my $end   = shift || qr/ \s*endstream\s* /xms;
1152
1153   if (${$c} !~ m/ \G$begin /cgxms)
1154   {
1155      die "Expected stream open tag\n" . $self->trimstr(${$c});
1156   }
1157
1158   my $stream;
1159
1160   my $l = $dict->{Length} || $dict->{L};
1161   if (!defined $l)
1162   {
1163      if ($begin =~ m/ \Gstream /xms)
1164      {
1165         die "Missing stream length\n" . $self->trimstr(${$c});
1166      }
1167      if (${$c} =~ m/ \G$begin(.*?)$end /cgxms)
1168      {
1169         $stream = $1;
1170         my $len = length $stream;
1171         $dict->{Length} = CAM::PDF::Node->new('number', $len, $objnum, $gennum);
1172      }
1173      else
1174      {
1175         die "Missing stream begin/end\n" . $self->trimstr(${$c});
1176      }
1177   }
1178   else
1179   {
1180      my $length = $self->getValue($l);
1181      my $pos = pos ${$c};
1182      $stream = substr ${$c}, $pos, $length;
1183      pos(${$c}) += $length;    ## no critic(CodeLayout::ProhibitParensWithBuiltins)
1184      if (${$c} !~ m/ \G$end /cgxms)
1185      {
1186         die "Expected endstream\n" . $self->trimstr(${$c});
1187      }
1188   }
1189
1190   if (ref $self)
1191   {
1192      # in the rare case of CAM::PDF::Content::_parseInlineImage, this
1193      # may be called as a class method, thus making the above test
1194      # necessary
1195
1196      if ($self->{crypt})
1197      {
1198         $stream = $self->{crypt}->decrypt($self, $stream, $objnum, $gennum);
1199      }
1200   }
1201
1202   return CAM::PDF::Node->new('stream', $stream, $objnum, $gennum);
1203}
1204
1205=item $doc->parseDict($string)
1206
1207=item $doc->parseDict($string, $objnum)
1208
1209=item $doc->parseDict($string, $objnum, $gennum)
1210
1211Use parseAny() instead of this, if possible.
1212
1213Given a fragment of PDF page content, parse it and return an dictionary
1214Node.  This can be called as a class method in most circumstances, but
1215is intended as an instance method.
1216
1217=cut
1218
1219sub parseDict
1220{
1221   my $pkg_or_doc = shift;
1222   my $c = shift;
1223   my $objnum = shift;
1224   my $gennum = shift;
1225
1226   my $begin = shift || '<<\\s*';
1227   my $end = shift || '>>\\s*';
1228
1229   my $dict = {};
1230   if (${$c} =~ m/ \G$begin /cgxms)
1231   {
1232      while (${$c} !~ m/ \G$end /cgxms)
1233      {
1234         #warn "looking for label:\n" . $pkg_or_doc->trimstr(${$c});
1235         my $keyref = $pkg_or_doc->parseLabel($c, $objnum, $gennum);
1236         my $key = $keyref->{value};
1237         #warn "looking for value:\n" . $pkg_or_doc->trimstr(${$c});
1238         my $value = $pkg_or_doc->parseAny($c, $objnum, $gennum);
1239         $dict->{$key} = $value;
1240      }
1241   }
1242
1243   return CAM::PDF::Node->new('dictionary', $dict, $objnum, $gennum);
1244}
1245
1246=item $doc->parseArray($string)
1247
1248=item $doc->parseArray($string, $objnum)
1249
1250=item $doc->parseArray($string, $objnum, $gennum)
1251
1252Use parseAny() instead of this, if possible.
1253
1254Given a fragment of PDF page content, parse it and return an array
1255Node.  This can be called as a class or instance method.
1256
1257=cut
1258
1259sub parseArray
1260{
1261   my $pkg_or_doc = shift;
1262   my $c = shift;
1263   my $objnum = shift;
1264   my $gennum = shift;
1265
1266   my $array = [];
1267   if (${$c} =~ m/ \G\[\s* /cgxms)
1268   {
1269      while (${$c} !~ m/ \G\]\s* /cgxms)
1270      {
1271         #warn "looking for array value:\n" . $pkg_or_doc->trimstr(${$c});
1272         push @{$array}, $pkg_or_doc->parseAny($c, $objnum, $gennum);
1273      }
1274   }
1275
1276   return CAM::PDF::Node->new('array', $array, $objnum, $gennum);
1277}
1278
1279=item $doc->parseLabel($string)
1280
1281=item $doc->parseLabel($string, $objnum)
1282
1283=item $doc->parseLabel($string, $objnum, $gennum)
1284
1285Use parseAny() instead of this, if possible.
1286
1287Given a fragment of PDF page content, parse it and return a label
1288Node.  This can be called as a class or instance method.
1289
1290=cut
1291
1292sub parseLabel
1293{
1294   my $pkg_or_doc = shift;
1295   my $c = shift;
1296   my $objnum = shift;
1297   my $gennum = shift;
1298
1299   my $label;
1300   if (${$c} =~ m{ \G/([^\s<>/\[\]()]+)\s* }cgxms)
1301   {
1302      $label = $1;
1303   }
1304   else
1305   {
1306      die "Expected identifier label:\n" . $pkg_or_doc->trimstr(${$c});
1307   }
1308   return CAM::PDF::Node->new('label', $label, $objnum, $gennum);
1309}
1310
1311=item $doc->parseRef($string)
1312
1313=item $doc->parseRef($string, $objnum)
1314
1315=item $doc->parseRef($string, $objnum, $gennum)
1316
1317Use parseAny() instead of this, if possible.
1318
1319Given a fragment of PDF page content, parse it and return a reference
1320Node.  This can be called as a class or instance method.
1321
1322=cut
1323
1324sub parseRef
1325{
1326   my $pkg_or_doc = shift;
1327   my $c = shift;
1328   my $objnum = shift;
1329   my $gennum = shift;
1330
1331   my $newobjnum;
1332   if (${$c} =~ m/ \G(\d+)\s+\d+\s+R\s* /cgxms)
1333   {
1334      $newobjnum = int $1;
1335   }
1336   else
1337   {
1338      die "Expected object reference\n" . $pkg_or_doc->trimstr(${$c});
1339   }
1340   return CAM::PDF::Node->new('reference', $newobjnum, $objnum, $gennum);
1341}
1342
1343=item $doc->parseNum($string)
1344
1345=item $doc->parseNum($string, $objnum)
1346
1347=item $doc->parseNum($string, $objnum, $gennum)
1348
1349Use parseAny() instead of this, if possible.
1350
1351Given a fragment of PDF page content, parse it and return a number
1352Node.  This can be called as a class or instance method.
1353
1354=cut
1355
1356sub parseNum
1357{
1358   my $pkg_or_doc = shift;
1359   my $c = shift;
1360   my $objnum = shift;
1361   my $gennum = shift;
1362
1363   my $value;
1364   if (${$c} =~ m/ \G([\d.+-]+)\s* /cgxms)
1365   {
1366      $value = $1;
1367   }
1368   else
1369   {
1370      die "Expected numerical constant\n" . $pkg_or_doc->trimstr(${$c});
1371   }
1372   return CAM::PDF::Node->new('number', $value, $objnum, $gennum);
1373}
1374
1375=item $doc->parseString($string)
1376
1377=item $doc->parseString($string, $objnum)
1378
1379=item $doc->parseString($string, $objnum, $gennum)
1380
1381Use parseAny() instead of this, if possible.
1382
1383Given a fragment of PDF page content, parse it and return a string
1384Node.  This can be called as a class or instance method.
1385
1386=cut
1387
1388sub parseString
1389{
1390   my $pkg_or_doc = shift;
1391   my $c          = shift;
1392   my $objnum     = shift;
1393   my $gennum     = shift;
1394
1395   my $value = q{};
1396   if (${$c} =~ m/ \G [(] /cgxms)
1397   {
1398      # TODO: use Text::Balanced or Regexp::Common from CPAN??
1399
1400      my $depth = 1;
1401      while ($depth > 0)
1402      {
1403         if (${$c} =~ m/ \G ([^()]*) ([()]) /cgxms)
1404         {
1405            my $string = $1;
1406            my $delim  = $2;
1407            $value .= $string;
1408
1409            # Make sure this is not an escaped paren, OR an real paren
1410            # preceded by an escaped backslash!
1411            if ($string =~ m/ (\\+) \z/xms && 1 == (length $1) % 2)
1412            {
1413               $value .= $delim;
1414            }
1415            elsif ($delim eq '(')
1416            {
1417               $value .= $delim;
1418               $depth++;
1419            }
1420            elsif(--$depth > 0)
1421            {
1422               $value .= $delim;
1423            }
1424         }
1425         else
1426         {
1427            die "Expected string closing\n" . $pkg_or_doc->trimstr(${$c});
1428         }
1429      }
1430      ${$c} =~ m/ \G\s* /cgxms;
1431   }
1432   else
1433   {
1434      die "Expected string opener\n" . $pkg_or_doc->trimstr(${$c});
1435   }
1436
1437   # Unescape slash-escaped characters.  Treat \\ specially.
1438   my @parts = split /\\\\|\\134/xms, $value, -1;
1439   for (@parts)
1440   {
1441      # concatenate continued lines
1442      s/ \\\r?\n //gxms;
1443      s/ \\\r    //gxms;
1444
1445      # special characters
1446      s/ \\n /\n/gxms;
1447      s/ \\r /\r/gxms;
1448      s/ \\t /\t/gxms;
1449      s/ \\f /\f/gxms;
1450      s/ \\b /\x{8}/gxms;
1451
1452      # octal numbers
1453      s/ \\(\d{1,3}) /chr oct $1/gexms;
1454
1455      # Ignore all other slashes (i.e. following characters are treated literally)
1456      s/ \\ //gxms;
1457   }
1458   $value = join q{\\}, @parts;
1459
1460   if (ref $pkg_or_doc)
1461   {
1462      my $self = $pkg_or_doc;
1463      if ($self->{crypt})
1464      {
1465         $value = $self->{crypt}->decrypt($self, $value, $objnum, $gennum);
1466      }
1467   }
1468   return CAM::PDF::Node->new('string', $value, $objnum, $gennum);
1469}
1470
1471=item $doc->parseHexString($string)
1472
1473=item $doc->parseHexString($string, $objnum)
1474
1475=item $doc->parseHexString($string, $objnum, $gennum)
1476
1477Use parseAny() instead of this, if possible.
1478
1479Given a fragment of PDF page content, parse it and return a hex string
1480Node.  This can be called as a class or instance method.
1481
1482=cut
1483
1484sub parseHexString
1485{
1486   my $pkg_or_doc = shift;
1487   my $c = shift;
1488   my $objnum = shift;
1489   my $gennum = shift;
1490
1491   my $str = q{};
1492   if (${$c} =~ m/ \G<([\da-fA-F\s]*)>\s* /cgxms)
1493   {
1494      $str = $1;
1495      $str =~ s/\s+//gxms;
1496      my $len = length $str;
1497      if ($len % 2 == 1)
1498      {
1499         $str .= '0';
1500      }
1501      $str = pack 'H*', $str;
1502   }
1503   else
1504   {
1505     die "Expected hex string\n" . $pkg_or_doc->trimstr(${$c});
1506   }
1507
1508   if (ref $pkg_or_doc)
1509   {
1510      my $self = $pkg_or_doc;
1511      if ($self->{crypt})
1512      {
1513         $str = $self->{crypt}->decrypt($self, $str, $objnum, $gennum);
1514      }
1515   }
1516   return CAM::PDF::Node->new('hexstring', $str, $objnum, $gennum);
1517}
1518
1519=item $doc->parseBoolean($string)
1520
1521=item $doc->parseBoolean($string, $objnum)
1522
1523=item $doc->parseBoolean($string, $objnum, $gennum)
1524
1525Use parseAny() instead of this, if possible.
1526
1527Given a fragment of PDF page content, parse it and return a boolean
1528Node.  This can be called as a class or instance method.
1529
1530=cut
1531
1532sub parseBoolean
1533{
1534   my $pkg_or_doc = shift;
1535   my $c = shift;
1536   my $objnum = shift;
1537   my $gennum = shift;
1538
1539   my $val = q{};
1540   if (${$c} =~ m/ \G(true|false)\s* /cgxmsi)
1541   {
1542      $val = lc $1;
1543   }
1544   else
1545   {
1546     die "Expected boolean true or false keyword\n" . $pkg_or_doc->trimstr(${$c});
1547   }
1548
1549   return CAM::PDF::Node->new('boolean', $val, $objnum, $gennum);
1550}
1551
1552=item $doc->parseNull($string)
1553
1554=item $doc->parseNull($string, $objnum)
1555
1556=item $doc->parseNull($string, $objnum, $gennum)
1557
1558Use parseAny() instead of this, if possible.
1559
1560Given a fragment of PDF page content, parse it and return a null
1561Node.  This can be called as a class or instance method.
1562
1563=cut
1564
1565sub parseNull
1566{
1567   my $pkg_or_doc = shift;
1568   my $c = shift;
1569   my $objnum = shift;
1570   my $gennum = shift;
1571
1572   my $val = q{};
1573   if (${$c} =~ m/ \Gnull\s* /cgxmsi)
1574   {
1575      $val = undef;
1576   }
1577   else
1578   {
1579     die "Expected null keyword\n" . $pkg_or_doc->trimstr(${$c});
1580   }
1581
1582   return CAM::PDF::Node->new('null', $val, $objnum, $gennum);
1583}
1584
1585=item $doc->parseAny($string)
1586
1587=item $doc->parseAny($string, $objnum)
1588
1589=item $doc->parseAny($string, $objnum, $gennum)
1590
1591Given a fragment of PDF page content, parse it and return a Node of
1592the appropriate type.  This can be called as a class or instance
1593method.
1594
1595=cut
1596
1597sub parseAny
1598{
1599   my $p      = shift;  # pkg or doc
1600   my $c      = shift;
1601   my $objnum = shift;
1602   my $gennum = shift;
1603
1604   return ${$c} =~ m/ \G \d+\s+\d+\s+R\b /xms  ? $p->parseRef(      $c, $objnum, $gennum)
1605        : ${$c} =~ m{ \G /               }xms  ? $p->parseLabel(    $c, $objnum, $gennum)
1606        : ${$c} =~ m/ \G <<              /xms  ? $p->parseDict(     $c, $objnum, $gennum)
1607        : ${$c} =~ m/ \G \[              /xms  ? $p->parseArray(    $c, $objnum, $gennum)
1608        : ${$c} =~ m/ \G [(]             /xms  ? $p->parseString(   $c, $objnum, $gennum)
1609        : ${$c} =~ m/ \G <               /xms  ? $p->parseHexString($c, $objnum, $gennum)
1610        : ${$c} =~ m/ \G [\d.+-]+        /xms  ? $p->parseNum(      $c, $objnum, $gennum)
1611        : ${$c} =~ m/ \G (true|false)    /ixms ? $p->parseBoolean(  $c, $objnum, $gennum)
1612        : ${$c} =~ m/ \G null            /ixms ? $p->parseNull(     $c, $objnum, $gennum)
1613        : die "Unrecognized type in parseAny:\n" . $p->trimstr(${$c});
1614}
1615
1616################################################################################
1617
1618=back
1619
1620=head2 Data Accessors
1621
1622=over
1623
1624=item $doc->getValue($object)
1625
1626I<For INTERNAL use>
1627
1628Dereference a data object, return a value.  Given an node object
1629of any kind, returns raw scalar object: hashref, arrayref, string,
1630number.  This function follows all references, and descends into all
1631objects.
1632
1633=cut
1634
1635sub getValue
1636{
1637   my $self = shift;
1638   my $objnode = shift;
1639
1640   return if (! ref $objnode);
1641
1642   while ($objnode->{type} eq 'reference' || $objnode->{type} eq 'object')
1643   {
1644      if ($objnode->{type} eq 'reference')
1645      {
1646         my $objnum = $objnode->{value};
1647         $objnode = $self->dereference($objnum);
1648      }
1649      elsif ($objnode->{type} eq 'object')
1650      {
1651         $objnode = $objnode->{value};
1652      }
1653      return if (! ref $objnode);
1654   }
1655
1656   return $objnode->{value};
1657}
1658
1659=item $doc->getObjValue($objectnum)
1660
1661I<For INTERNAL use>
1662
1663Dereference a data object, and return a value.  Behaves just like the
1664getValue() function, but used when all you know is the object number.
1665
1666=cut
1667
1668sub getObjValue
1669{
1670   my $self = shift;
1671   my $objnum = shift;
1672
1673   return $self->getValue(CAM::PDF::Node->new('reference', $objnum));
1674}
1675
1676
1677=item $doc->dereference($objectnum)
1678
1679=item $doc->dereference($name, $pagenum)
1680
1681I<For INTERNAL use>
1682
1683Dereference a data object, return a PDF object as a node.  This
1684function makes heavy use of the internal object cache.  Most (if not
1685all) object requests should go through this function.
1686
1687C<$name> should look something like '/R12'.
1688
1689=cut
1690
1691sub dereference
1692{
1693   my $self = shift;
1694   my $key = shift;
1695   my $pagenum = shift; # only used if $key is a named resource
1696
1697   if ($key =~ s/ \A\/ //xms)  # strip off the leading slash while testing
1698   {
1699      # This is a request for a named object
1700      $self->_buildNameTable($pagenum);
1701      $key = $self->{Names}->{$pagenum}->{$key};
1702      return if (!defined $key);
1703      # $key should now point to a 'reference' object
1704      if ((ref $key) ne 'CAM::PDF::Node')
1705      {
1706         die "Assertion failed: key is a reference object in dereference\n";
1707      }
1708      $key = $key->{value};
1709   }
1710
1711   $key = int $key;
1712   if (!exists $self->{objcache}->{$key})
1713   {
1714      #print "Filling cache for obj \#$key...\n";
1715
1716      my $pos = $self->{xref}->{$key};
1717
1718      if (!$pos)
1719      {
1720         warn "Bad request for object $key at position 0 in the file\n";
1721         return;
1722      }
1723
1724      my $content_fragment;
1725      if (ref $pos)
1726      {
1727         $content_fragment = substr $pos->{objstream}->{stream}, $pos->{start}, $pos->{end};
1728         $content_fragment = "$key 0 obj\n$content_fragment\nendobj\n";
1729      }
1730      else
1731      {
1732         # This is fastest and safest
1733         if (!exists $self->{endxref})
1734         {
1735            $self->_buildendxref();
1736         }
1737         my $endpos = $self->{endxref}->{$key};
1738         if (!defined $endpos || $endpos < $pos)
1739         {
1740            # really slow, but a totally safe fallback
1741            $endpos = $self->{contentlength};
1742         }
1743
1744         $content_fragment = substr $self->{content}, $pos, $endpos - $pos;
1745      }
1746      $self->{objcache}->{$key} = $self->parseObj(\$content_fragment, $key);
1747   }
1748
1749   return $self->{objcache}->{$key};
1750}
1751
1752
1753=item $doc->getPropertyNames($pagenum)
1754
1755=item $doc->getProperty($pagenum, $propertyname)
1756
1757Each PDF page contains a list of resources that it uses (images,
1758fonts, etc).  getPropertyNames() returns an array of the names of
1759those resources.  getProperty() returns a node representing a
1760named property (most likely a reference node).
1761
1762=cut
1763
1764sub getPropertyNames
1765{
1766   my $self = shift;
1767   my $pagenum = shift;
1768
1769   $self->_buildNameTable($pagenum);
1770   my $props = $self->{Names}->{$pagenum};
1771   return if (!defined $props);
1772   return keys %{$props};
1773}
1774sub getProperty
1775{
1776   my $self = shift;
1777   my $pagenum = shift;
1778   my $name = shift;
1779
1780   $self->_buildNameTable($pagenum);
1781   my $props = $self->{Names}->{$pagenum};
1782   return if (!defined $props);
1783   return if (!defined $name);
1784   return $props->{$name};
1785}
1786
1787=item $doc->getFont($pagenum, $fontname)
1788
1789I<For INTERNAL use>
1790
1791Returns a dictionary for a given font identified by its label,
1792referenced by page.
1793
1794=cut
1795
1796sub getFont
1797{
1798   my $self = shift;
1799   my $pagenum = shift;
1800   my $fontname = shift;
1801
1802   $fontname =~ s/ \A\/? /\//xms; # add leading slash if needed
1803   my $objnode = $self->dereference($fontname, $pagenum);
1804   return if (!$objnode);
1805
1806   my $dict = $self->getValue($objnode);
1807   if ($dict && $dict->{Type} && $dict->{Type}->{value} eq 'Font')
1808   {
1809      return $dict;
1810   }
1811   else
1812   {
1813      return;
1814   }
1815}
1816
1817=item $doc->getFontNames($pagenum)
1818
1819I<For INTERNAL use>
1820
1821Returns a list of fonts for a given page.
1822
1823=cut
1824
1825sub getFontNames
1826{
1827   my $self = shift;
1828   my $pagenum = shift;
1829
1830   $self->_buildNameTable($pagenum);
1831   my $list = $self->{Names}->{$pagenum};
1832   my @names;
1833   if ($list)
1834   {
1835      for my $key (keys %{$list})
1836      {
1837         my $dict = $self->getValue($list->{$key});
1838         if ($dict && $dict->{Type} && $dict->{Type}->{value} eq 'Font')
1839         {
1840            push @names, $key;
1841         }
1842      }
1843   }
1844   return @names;
1845}
1846
1847
1848=item $doc->getFonts($pagenum)
1849
1850I<For INTERNAL use>
1851
1852Returns an array of font objects for a given page.
1853
1854=cut
1855
1856sub getFonts
1857{
1858   my $self = shift;
1859   my $pagenum = shift;
1860
1861   $self->_buildNameTable($pagenum);
1862   my $list = $self->{Names}->{$pagenum};
1863   my @fonts;
1864   if ($list)
1865   {
1866      for my $key (keys %{$list})
1867      {
1868         my $dict = $self->getValue($list->{$key});
1869         if ($dict && $dict->{Type} && $dict->{Type}->{value} eq 'Font')
1870         {
1871            push @fonts, $dict;
1872         }
1873      }
1874   }
1875   return @fonts;
1876}
1877
1878=item $doc->getFontByBaseName($pagenum, $fontname)
1879
1880I<For INTERNAL use>
1881
1882Returns a dictionary for a given font, referenced by page and the name
1883of the base font.
1884
1885=cut
1886
1887sub getFontByBaseName
1888{
1889   my $self = shift;
1890   my $pagenum = shift;
1891   my $fontname = shift;
1892
1893   $self->_buildNameTable($pagenum);
1894   my $list = $self->{Names}->{$pagenum};
1895   for my $key (keys %{$list})
1896   {
1897      my $num = $list->{$key}->{value};
1898      my $objnode = $self->dereference($num);
1899      my $dict = $self->getValue($objnode);
1900      if ($dict &&
1901          $dict->{Type} && $dict->{Type}->{value} eq 'Font' &&
1902          $dict->{BaseFont} && $dict->{BaseFont}->{value} eq $fontname)
1903      {
1904         return $dict;
1905      }
1906   }
1907   return;
1908}
1909
1910=item $doc->getFontMetrics($properties $fontname)
1911
1912I<For INTERNAL use>
1913
1914Returns a data structure representing the font metrics for the named
1915font.  The property list is the results of something like the
1916following:
1917
1918  $self->_buildNameTable($pagenum);
1919  my $properties = $self->{Names}->{$pagenum};
1920
1921Alternatively, if you know the page number, it might be easier to do:
1922
1923  my $font = $self->dereference($fontlabel, $pagenum);
1924  my $fontmetrics = $font->{value}->{value};
1925
1926where the C<$fontlabel> is something like '/Helv'.  The getFontMetrics()
1927method is useful in the cases where you've forgotten which page number
1928you are working on (e.g. in CAM::PDF::GS), or if your property list
1929isn't part of any page (e.g. working with form field annotation
1930objects).
1931
1932=cut
1933
1934sub getFontMetrics
1935{
1936   my $self = shift;
1937   my $props = shift;
1938   my $fontname = shift;
1939
1940   my $fontmetrics;
1941
1942   #print STDERR "looking for font $fontname\n";
1943
1944   # Sometimes we are passed just the object list, sometimes the whole
1945   # properties data structure
1946   if ($props->{Font})
1947   {
1948      $props = $self->getValue($props->{Font});
1949   }
1950
1951   if ($props->{$fontname})
1952   {
1953      my $fontdict = $self->getValue($props->{$fontname});
1954      if ($fontdict && $fontdict->{Type} && $fontdict->{Type}->{value} eq 'Font')
1955      {
1956         $fontmetrics = $fontdict;
1957         #print STDERR "Got font\n";
1958      }
1959      else
1960      {
1961         #print STDERR "Almost got font\n";
1962      }
1963   }
1964   else
1965   {
1966      #print STDERR "No font with that name in the dict\n";
1967   }
1968   #print STDERR "Failed to get font\n" if (!$fontmetrics);
1969   return $fontmetrics;
1970}
1971
1972=item $doc->addFont($pagenum, $fontname, $fontlabel)
1973
1974=item $doc->addFont($pagenum, $fontname, $fontlabel, $fontmetrics)
1975
1976Adds a reference to the specified font to the page.
1977
1978If a font metrics hash is supplied (it is required for a font other
1979than the 14 core fonts), then it is cloned and inserted into the new
1980font structure.  Note that if those font metrics contain references
1981(e.g. to the C<FontDescriptor>), the referred objects are not copied --
1982you must do that part yourself.
1983
1984For Type1 fonts, the font metrics must minimally contain the following
1985fields: C<Subtype>, C<FirstChar>, C<LastChar>, C<Widths>,
1986C<FontDescriptor>.
1987
1988=cut
1989
1990sub addFont
1991{
1992   my $self = shift;
1993   my $pagenum = shift;
1994   my $name = shift;
1995   my $label = shift;
1996   my $fontmetrics = shift; # optional
1997
1998   # Check if this font already exists
1999   my $page = $self->getPage($pagenum);
2000   if (exists $page->{Resources})
2001   {
2002      my $r = $self->getValue($page->{Resources});
2003      if (exists $r->{Font})
2004      {
2005         my $f = $self->getValue($r->{Font});
2006         if (exists $f->{$label})
2007         {
2008            # Font already exists.  Skip.
2009            return $self;
2010         }
2011      }
2012   }
2013
2014   # Build the font
2015   my $dict = CAM::PDF::Node->new('dictionary',
2016                                 {
2017                                    Type => CAM::PDF::Node->new('label', 'Font'),
2018                                    Name => CAM::PDF::Node->new('label', $label),
2019                                    BaseFont => CAM::PDF::Node->new('label', $name),
2020                                 },
2021                                 );
2022   if ($fontmetrics)
2023   {
2024      my $copy = $self->copyObject($fontmetrics);
2025      for my $key (keys %{$copy})
2026      {
2027         if (!$dict->{value}->{$key})
2028         {
2029            $dict->{value}->{$key} = $copy->{$key};
2030         }
2031      }
2032   }
2033   else
2034   {
2035      $dict->{value}->{Subtype} = CAM::PDF::Node->new('label', 'Type1');
2036   }
2037
2038   # Add the font to the document
2039   my $fontobjnum = $self->appendObject(undef, CAM::PDF::Node->new('object', $dict), 0);
2040
2041   # Add the font to the page
2042   my ($objnum,$gennum) = $self->getPageObjnum($pagenum);
2043   if (!exists $page->{Resources})
2044   {
2045      $page->{Resources} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum);
2046   }
2047   my $r = $self->getValue($page->{Resources});
2048   if (!exists $r->{Font})
2049   {
2050      $r->{Font} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum);
2051   }
2052   my $f = $self->getValue($r->{Font});
2053   $f->{$label} = CAM::PDF::Node->new('reference', $fontobjnum, $objnum, $gennum);
2054
2055   delete $self->{Names}->{$pagenum}; # decache
2056   $self->{changes}->{$objnum} = 1;
2057   return $self;
2058}
2059
2060=item $doc->deEmbedFont($pagenum, $fontname)
2061
2062=item $doc->deEmbedFont($pagenum, $fontname, $basefont)
2063
2064Removes embedded font data, leaving font reference intact.  Returns
2065true if the font exists and 1) font is not embedded or 2) embedded
2066data was successfully discarded.  Returns false if the font does not
2067exist, or the embedded data could not be discarded.
2068
2069The optional C<$basefont> parameter allows you to change the font.  This
2070is useful when some applications embed a standard font (see below) and
2071give it a funny name, like C<SYLXNP+Helvetica>.  In this example, it's
2072important to change the basename back to the standard C<Helvetica> when
2073de-embedding.
2074
2075De-embedding the font does NOT remove it from the PDF document, it
2076just removes references to it.  To get a size reduction by throwing
2077away unused font data, you should use the following code sometime
2078after this method.
2079
2080  $self->cleanse();
2081
2082For reference, the standard fonts are C<Times-Roman>, C<Helvetica>, and
2083C<Courier> (and their bold, italic and bold-italic forms) plus C<Symbol> and
2084C<Zapfdingbats>. (Adobe PDF Reference v1.4, p.319)
2085
2086=cut
2087
2088sub deEmbedFont
2089{
2090   my $self = shift;
2091   my $pagenum = shift;
2092   my $fontname = shift;
2093   my $basefont = shift;
2094
2095   my $success;
2096   my $font = $self->getFont($pagenum, $fontname);
2097   if ($font)
2098   {
2099      $self->_deEmbedFontObj($font, $basefont);
2100      $success = $self;
2101   }
2102   else
2103   {
2104      $success = undef;
2105   }
2106   return $success;
2107}
2108
2109=item $doc->deEmbedFontByBaseName($pagenum, $fontname)
2110
2111=item $doc->deEmbedFontByBaseName($pagenum, $fontname, $basefont)
2112
2113Just like deEmbedFont(), except that the font name parameter refers to
2114the name of the current base font instead of the PDF label for the
2115font.
2116
2117=cut
2118
2119sub deEmbedFontByBaseName
2120{
2121   my $self = shift;
2122   my $pagenum = shift;
2123   my $fontname = shift;
2124   my $basefont = shift;
2125
2126   my $success;
2127   my $font = $self->getFontByBaseName($pagenum, $fontname);
2128   if ($font)
2129   {
2130      $self->_deEmbedFontObj($font, $basefont);
2131      $success = $self;
2132   }
2133   else
2134   {
2135      $success = undef;
2136   }
2137   return $success;
2138}
2139
2140sub _deEmbedFontObj
2141{
2142   my $self = shift;
2143   my $font = shift;
2144   my $basefont = shift;
2145
2146   if ($basefont)
2147   {
2148      $font->{BaseFont} = CAM::PDF::Node->new('label', $basefont);
2149   }
2150   delete $font->{FontDescriptor};
2151   delete $font->{Widths};
2152   delete $font->{FirstChar};
2153   delete $font->{LastChar};
2154   $self->{changes}->{$font->{Type}->{objnum}} = 1;
2155   return;
2156}
2157
2158=item $doc->wrapString($string, $width, $fontsize, $fontmetrics)
2159
2160=item $doc->wrapString($string, $width, $fontsize, $pagenum, $fontlabel)
2161
2162Returns an array of strings wrapped to the specified width.
2163
2164=cut
2165
2166sub wrapString  ## no critic (Unpack)
2167{
2168   my $self = shift;
2169   my $string = shift;
2170   my $width = shift;
2171   my $size = shift;
2172
2173   my $fm;
2174   if (defined $_[0] && ref $_[0])
2175   {
2176      $fm = shift;
2177   }
2178   else
2179   {
2180      my $pagenum = shift;
2181      my $fontlabel = shift;
2182      $fm = $self->getFont($pagenum, $fontlabel);
2183   }
2184
2185   $string =~ s/ \r\n /\n/gxms;
2186   # no split limit, so trailing null strings are omitted
2187   my @strings = split /[\r\n]/xms, $string;
2188   my @out;
2189   $width /= $size;
2190   #print STDERR 'wrapping '.join('|',@strings)."\n";
2191   for my $s (@strings)
2192   {
2193      $s =~ s/ \s+\z //xms;
2194      my $w = $self->getStringWidth($fm, $s);
2195      if ($w <= $width)
2196      {
2197         push @out, $s;
2198      }
2199      else
2200      {
2201         my $cur;
2202         if ($s =~ s/ \A(\s*) //xms)
2203         {
2204            $cur = $1;
2205         }
2206         my $curw = $cur eq q{} ? 0 : $self->getStringWidth($fm, $cur);
2207         while ($s)
2208         {
2209            my ($sp,$wd);
2210            if ($s =~ s/ \A(\s*)(\S*) //xms)
2211            {
2212               ($sp,$wd) = ($1,$2);
2213            }
2214            my $wwd = $wd eq q{} ? 0 : $self->getStringWidth($fm, $wd);
2215            if ($curw == 0)
2216            {
2217               $cur = $wd;
2218               $curw = $wwd;
2219            }
2220            else
2221            {
2222               my $wsp = $sp eq q{} ? 0 : $self->getStringWidth($fm, $sp);
2223               if ($curw + $wsp + $wwd <= $width)
2224               {
2225                  $cur .= $sp . $wd;
2226                  $curw += $wsp + $wwd;
2227               }
2228               else
2229               {
2230                  push @out, $cur;
2231                  $cur = $wd;
2232                  $curw = $wwd;
2233               }
2234            }
2235         }
2236         if (0 < length $cur)
2237         {
2238            push @out, $cur;
2239         }
2240      }
2241   }
2242   #print STDERR 'wrapped to '.join('|',@out)."\n";
2243   return @out;
2244}
2245
2246=item $doc->getStringWidth($fontmetrics, $string)
2247
2248I<For INTERNAL use>
2249
2250Returns the width of the string, using the font metrics if possible.
2251
2252=cut
2253
2254sub getStringWidth
2255{
2256   my $self = shift;
2257   my $fontmetrics = shift;
2258   my $string = shift;
2259
2260   if (! defined $string || $string eq q{})
2261   {
2262      return 0;
2263   }
2264
2265   my $width = 0;
2266   if ($fontmetrics)
2267   {
2268      if ($fontmetrics->{Widths})
2269      {
2270         my $firstc  = $self->getValue($fontmetrics->{FirstChar});
2271         my $lastc   = $self->getValue($fontmetrics->{LastChar});
2272         my $widths  = $self->getValue($fontmetrics->{Widths});
2273         my $missing_width;  # populate this on demand
2274       CHAR:
2275         for my $char (unpack 'C*', $string)
2276         {
2277            if ($char >= $firstc && $char <= $lastc)
2278            {
2279               $width += $widths->[$char - $firstc]->{value};
2280               next CHAR;
2281            }
2282
2283            if (!defined $missing_width)
2284            {
2285               my $fd = exists $fontmetrics->{FontDescriptor} ?
2286                   $self->getValue($fontmetrics->{FontDescriptor}) : undef;
2287               $missing_width = $fd && exists $fd->{MissingWidth} ?
2288                   $self->getValue($fd->{MissingWidth}) : 0;
2289            }
2290
2291            $width += $missing_width;
2292         }
2293         $width /= 1000.0;  # units conversion
2294      }
2295      elsif ($fontmetrics->{BaseFont})
2296      {
2297         my $fontname = $self->getValue($fontmetrics->{BaseFont});
2298         if (!exists $self->{fontmetrics}->{$fontname})
2299         {
2300            require Text::PDF::SFont;
2301            require Text::PDF::File;
2302            my $pdf = Text::PDF::File->new();
2303            $self->{fontmetrics}->{$fontname} =
2304                Text::PDF::SFont->new($pdf, $fontname, 'NULL');
2305         }
2306         if ($self->{fontmetrics}->{$fontname})
2307         {
2308            $width = $self->{fontmetrics}->{$fontname}->width($string);
2309         }
2310      }
2311      else
2312      {
2313         warn 'Failed to understand this font';
2314      }
2315   }
2316
2317   if ($width == 0)
2318   {
2319      # HACK!!!
2320      #warn "Using klugy width!\n";
2321      $width = 0.2 * length $string;
2322   }
2323
2324   return $width;
2325}
2326
2327=item $doc->numPages()
2328
2329Returns the number of pages in the PDF document.
2330
2331=cut
2332
2333sub numPages
2334{
2335   my $self = shift;
2336   return $self->{PageCount};
2337}
2338
2339=item $doc->getPage($pagenum)
2340
2341I<For INTERNAL use>
2342
2343Returns a dictionary for a given numbered page.
2344
2345=cut
2346
2347sub getPage
2348{
2349   my $self = shift;
2350   my $pagenum = shift;
2351
2352   if ($pagenum < 1 || $pagenum > $self->{PageCount})
2353   {
2354      warn "Invalid page number requested: $pagenum\n";
2355      return;
2356   }
2357
2358   if (!exists $self->{pagecache}->{$pagenum})
2359   {
2360      my $node = $self->getPagesDict();
2361      my $nodestart = 1;
2362      while ($self->getValue($node->{Type}) eq 'Pages')
2363      {
2364         my $kids = $self->getValue($node->{Kids});
2365         if ((ref $kids) ne 'ARRAY')
2366         {
2367            die "Error: \@kids is not an array\n";
2368         }
2369         my $child = 0;
2370         if (@{$kids} == 1)
2371         {
2372            # Do the simple case first:
2373            $child = 0;
2374            # nodestart is unchanged
2375         }
2376         else
2377         {
2378            # search through all kids EXCEPT don't bother looking at
2379            # the last one because that is surely the right one if all
2380            # the others are wrong.
2381
2382            while ($child < $#{$kids})
2383            {
2384               # the first leaf of the kid is the page we want.  It
2385               # doesn't matter if the kid is a leaf or a node.
2386               last if ($pagenum == $nodestart);
2387
2388               # Retrieve the dictionary of this child
2389               my $sub = $self->getValue($kids->[$child]);
2390               if ($sub->{Type}->{value} ne 'Pages')
2391               {
2392                  # Its a leaf, and not the right one.  Move on.
2393                  $nodestart++;
2394               }
2395               else
2396               {
2397                  my $count = $self->getValue($sub->{Count});
2398
2399                  # The page we want is in this kid.  Descend.
2400                  last if ($nodestart + $count - 1 >= $pagenum);
2401
2402                  # Not in this kid.  Move on.
2403                  $nodestart += $count;
2404               }
2405               $child++;
2406            }
2407         }
2408
2409         $node = $self->getValue($kids->[$child]);
2410         if (! ref $node)
2411         {
2412            require Data::Dumper;
2413            cluck Data::Dumper::Dumper($node);
2414         }
2415      }
2416
2417      # Ok, now we've got the right page.  Store it.
2418      $self->{pagecache}->{$pagenum} = $node;
2419   }
2420
2421   return $self->{pagecache}->{$pagenum};
2422}
2423
2424=item $doc->getPageObjnum($pagenum)
2425
2426I<For INTERNAL use>
2427
2428Return the number of the PDF object in which the specified page occurs.
2429
2430=cut
2431
2432sub getPageObjnum
2433{
2434   my $self = shift;
2435   my $pagenum = shift;
2436
2437   my $page = $self->getPage($pagenum);
2438   return if (!$page);
2439   my ($anyobj) = values %{$page};
2440   if (!$anyobj)
2441   {
2442      die "Internal error: page has no attributes!!!\n";
2443   }
2444   if (wantarray)
2445   {
2446      return ($anyobj->{objnum}, $anyobj->{gennum});
2447   }
2448   else
2449   {
2450      return $anyobj->{objnum};
2451   }
2452}
2453
2454=item $doc->getPageText($pagenum)
2455
2456Extracts the text from a PDF page as a string.
2457
2458=cut
2459
2460sub getPageText
2461{
2462   my $self = shift;
2463   my $pagenum = shift;
2464   my $verbose = shift;
2465
2466   my $pagetree = $self->getPageContentTree($pagenum, $verbose);
2467   if (!$pagetree)
2468   {
2469      return;
2470   }
2471
2472   require CAM::PDF::PageText;
2473   return CAM::PDF::PageText->render($pagetree, $verbose);
2474}
2475
2476=item $doc->getPageContentTree($pagenum)
2477
2478Retrieves a parsed page content data structure, or undef if there is a
2479syntax error or if the page does not exist.
2480
2481=cut
2482
2483sub getPageContentTree
2484{
2485   my $self = shift;
2486   my $pagenum = shift;
2487   my $verbose = shift;
2488
2489   my $content = $self->getPageContent($pagenum);
2490   return if (!defined $content);
2491
2492   $self->_buildNameTable($pagenum);
2493
2494   my $page = $self->getPage($pagenum);
2495   my $box = [0, 0, 612, 792];
2496   if ($page->{MediaBox})
2497   {
2498      my $mediabox = $self->getValue($page->{MediaBox});
2499      $box->[0] = $self->getValue($mediabox->[0]);
2500      $box->[1] = $self->getValue($mediabox->[1]);
2501      $box->[2] = $self->getValue($mediabox->[2]);
2502      $box->[3] = $self->getValue($mediabox->[3]);
2503   }
2504
2505   require CAM::PDF::Content;
2506   my $tree = CAM::PDF::Content->new($content, {
2507      doc => $self,
2508      properties => $self->{Names}->{$pagenum},
2509      mediabox => $box,
2510   }, $verbose);
2511
2512   return $tree;
2513}
2514
2515=item $doc->getPageContent($pagenum)
2516
2517Return a string with the layout contents of one page.
2518
2519=cut
2520
2521sub getPageContent
2522{
2523   my $self = shift;
2524   my $pagenum = shift;
2525
2526   my $page = $self->getPage($pagenum);
2527   if (!$page || !exists $page->{Contents})
2528   {
2529      return q{};
2530   }
2531
2532   my $contents = $self->getValue($page->{Contents});
2533
2534   if (! ref $contents)
2535   {
2536      return $contents;
2537   }
2538   elsif ((ref $contents) eq 'HASH')
2539   {
2540      # doesn't matter if it's not encoded...
2541      return $self->decodeOne(CAM::PDF::Node->new('dictionary', $contents));
2542   }
2543   elsif ((ref $contents) eq 'ARRAY')
2544   {
2545      my $stream = q{};
2546      for my $arrobj (@{$contents})
2547      {
2548         my $streamdata = $self->getValue($arrobj);
2549         if (! ref $streamdata)
2550         {
2551            $stream .= $streamdata;
2552         }
2553         elsif ((ref $streamdata) eq 'HASH')
2554         {
2555            $stream .= $self->decodeOne(CAM::PDF::Node->new('dictionary',$streamdata));  # doesn't matter if it's not encoded...
2556         }
2557         else
2558         {
2559            die "Unexpected content type for page contents\n";
2560         }
2561      }
2562      return $stream;
2563   }
2564   else
2565   {
2566      die "Unexpected content type for page contents\n";
2567   }
2568   return; # should never get here
2569}
2570
2571=item $doc->getPageDimensions($pagenum)
2572
2573Returns an array of C<x>, C<y>, C<width> and C<height> numbers that
2574define the dimensions of the specified page in points (1/72 inches).
2575Technically, this is the C<MediaBox> dimensions, which explains why
2576it's possible for C<x> and C<y> to be non-zero, but that's a rare
2577case.
2578
2579For example, given a simple 8.5 by 11 inch page, this method will return
2580C<(0,0,612,792)>.
2581
2582This method will die() if the specified page number does not exist.
2583
2584=cut
2585
2586sub getPageDimensions
2587{
2588   my $self      = shift;
2589   my $pagenum  = shift;
2590   my $pagedict = shift; # only used during recursion
2591
2592   if (!$pagedict)
2593   {
2594      $pagedict = $self->getPage($pagenum);
2595      if (!$pagedict)
2596      {
2597         die 'No such page '.$pagenum;
2598      }
2599   }
2600
2601   if (exists $pagedict->{MediaBox})
2602   {
2603      my $box = $self->getValue($pagedict->{MediaBox});
2604      return ($self->getValue($box->[0]),
2605              $self->getValue($box->[1]),
2606              $self->getValue($box->[2]),
2607              $self->getValue($box->[3]));
2608   }
2609   elsif (exists $pagedict->{Parent})
2610   {
2611      return $self->getPageDimensions($pagenum, $self->getValue($pagedict->{Parent}));
2612   }
2613   else
2614   {
2615      die 'Failed to find the page dimensions';
2616   }
2617   return; # never gets here
2618}
2619
2620=item $doc->getName($object)
2621
2622I<For INTERNAL use>
2623
2624Given a PDF object reference, return it's name, if it has one.  This
2625is useful for indirect references to images in particular.
2626
2627=cut
2628
2629sub getName
2630{
2631   my $self = shift;
2632   my $objnode = shift;
2633
2634   if ($objnode->{value}->{type} eq 'dictionary')
2635   {
2636      my $dict = $objnode->{value}->{value};
2637      if (exists $dict->{Name})
2638      {
2639         return $self->getValue($dict->{Name});
2640      }
2641   }
2642   return q{};
2643}
2644
2645=item $doc->getPrefs()
2646
2647Return an array of security information for the document:
2648
2649  owner password
2650  user password
2651  print boolean
2652  modify boolean
2653  copy boolean
2654  add boolean
2655
2656See the PDF reference for the intended use of the latter four booleans.
2657
2658This module publishes the array indices of these values for your
2659convenience:
2660
2661  $CAM::PDF::PREF_OPASS
2662  $CAM::PDF::PREF_UPASS
2663  $CAM::PDF::PREF_PRINT
2664  $CAM::PDF::PREF_MODIFY
2665  $CAM::PDF::PREF_COPY
2666  $CAM::PDF::PREF_ADD
2667
2668So, you can retrieve the value of the Copy boolean via:
2669
2670  my ($canCopy) = ($self->getPrefs())[$CAM::PDF::PREF_COPY];
2671
2672=cut
2673
2674sub getPrefs
2675{
2676   my $self = shift;
2677
2678   my @p = (1,1,1,1);
2679   if (exists $self->{crypt}->{P})
2680   {
2681      @p = $self->{crypt}->decode_permissions($self->{crypt}->{P});
2682   }
2683   return($self->{crypt}->{opass}, $self->{crypt}->{upass}, @p);
2684}
2685
2686=item $doc->canPrint()
2687
2688Return a boolean indicating whether the Print permission is enabled
2689on the PDF.
2690
2691=cut
2692
2693sub canPrint
2694{
2695   my $self = shift;
2696   return ($self->getPrefs())[$PREF_PRINT];
2697}
2698
2699=item $doc->canModify()
2700
2701Return a boolean indicating whether the Modify permission is enabled
2702on the PDF.
2703
2704=cut
2705
2706sub canModify
2707{
2708   my $self = shift;
2709   return ($self->getPrefs())[$PREF_MODIFY];
2710}
2711
2712=item $doc->canCopy()
2713
2714Return a boolean indicating whether the Copy permission is enabled
2715on the PDF.
2716
2717=cut
2718
2719sub canCopy
2720{
2721   my $self = shift;
2722   return ($self->getPrefs())[$PREF_COPY];
2723}
2724
2725=item $doc->canAdd()
2726
2727Return a boolean indicating whether the Add permission is enabled
2728on the PDF.
2729
2730=cut
2731
2732sub canAdd
2733{
2734   my $self = shift;
2735   return ($self->getPrefs())[$PREF_ADD];
2736}
2737
2738=item $doc->getFormFieldList()
2739
2740Return an array of the names of all of the PDF form fields.  The names
2741are the full hierarchical names constructed as explained in the PDF
2742reference manual.  These names are useful for the fillFormFields()
2743function.
2744
2745=cut
2746
2747sub getFormFieldList
2748{
2749   my $self = shift;
2750   my $parentname = shift;  # very optional
2751
2752   my $prefix = (defined $parentname ? $parentname . q{.} : q{});
2753
2754   my $kidlist;
2755   if (defined $parentname && $parentname ne q{})
2756   {
2757      my $parent = $self->getFormField($parentname);
2758      return if (!$parent);
2759      my $dict = $self->getValue($parent);
2760      return if (!exists $dict->{Kids});
2761      $kidlist = $self->getValue($dict->{Kids});
2762   }
2763   else
2764   {
2765      my $root = $self->getRootDict()->{AcroForm};
2766      return if (!$root);
2767      my $parent = $self->getValue($root);
2768      return if (!exists $parent->{Fields});
2769      $kidlist = $self->getValue($parent->{Fields});
2770   }
2771
2772   my @list;
2773   for my $kid (@{$kidlist})
2774   {
2775      if ((! ref $kid) || (ref $kid) ne 'CAM::PDF::Node' || $kid->{type} ne 'reference')
2776      {
2777         die "Expected a reference as the form child of '$parentname'\n";
2778      }
2779      my $objnode = $self->dereference($kid->{value});
2780      my $dict = $self->getValue($objnode);
2781      my $name = '(no name)';  # assume the worst
2782      if (exists $dict->{T})
2783      {
2784         $name = $self->getValue($dict->{T});
2785      }
2786      $name = $prefix . $name;
2787      push @list, $name;
2788      if (exists $dict->{TU})
2789      {
2790         push @list, $prefix . $self->getValue($dict->{TU}) . ' (alternate name)';
2791      }
2792      $self->{formcache}->{$name} = $objnode;
2793      my @kidnames = $self->getFormFieldList($name);
2794      if (@kidnames > 0)
2795      {
2796         #push @list, 'descend...';
2797         push @list, @kidnames;
2798         #push @list, 'ascend...';
2799      }
2800   }
2801   return @list;
2802}
2803
2804=item $doc->getFormField($name)
2805
2806I<For INTERNAL use>
2807
2808Return the object containing the form field definition for the
2809specified field name.  C<$name> can be either the full name or the
2810"short/alternate" name.
2811
2812=cut
2813
2814sub getFormField
2815{
2816   my $self = shift;
2817   my $fieldname = shift;
2818
2819   return if (!defined $fieldname);
2820
2821   if (! exists $self->{formcache}->{$fieldname})
2822   {
2823      my $kidlist;
2824      my $parent;
2825      if ($fieldname =~ m/ [.] /xms)
2826      {
2827         my $parentname;
2828         if ($fieldname =~ s/ \A(.*)[.]([.]+)\z /$2/xms)
2829         {
2830            $parentname = $1;
2831         }
2832         return if (!$parentname);
2833         $parent = $self->getFormField($parentname);
2834         return if (!$parent);
2835         my $dict = $self->getValue($parent);
2836         return if (!exists $dict->{Kids});
2837         $kidlist = $self->getValue($dict->{Kids});
2838      }
2839      else
2840      {
2841         my $root = $self->getRootDict()->{AcroForm};
2842         return if (!$root);
2843         $parent = $self->dereference($root->{value});
2844         return if (!$parent);
2845         my $dict = $self->getValue($parent);
2846         return if (!exists $dict->{Fields});
2847         $kidlist = $self->getValue($dict->{Fields});
2848      }
2849
2850      $self->{formcache}->{$fieldname} = undef;  # assume the worst...
2851      for my $kid (@{$kidlist})
2852      {
2853         my $objnode = $self->dereference($kid->{value});
2854         $objnode->{formparent} = $parent;
2855         my $dict = $self->getValue($objnode);
2856         if (exists $dict->{T})
2857         {
2858            $self->{formcache}->{$self->getValue($dict->{T})} = $objnode;
2859         }
2860         if (exists $dict->{TU})
2861         {
2862            $self->{formcache}->{$self->getValue($dict->{TU})} = $objnode;
2863         }
2864      }
2865   }
2866
2867   return $self->{formcache}->{$fieldname};
2868}
2869
2870=item $doc->getFormFieldDict($formfieldobject)
2871
2872I<For INTERNAL use>
2873
2874Return a hash reference representing the accumulated property list for
2875a form field, including all of it's inherited properties.  This should
2876be treated as a read-only hash!  It ONLY retrieves the properties it
2877knows about.
2878
2879=cut
2880
2881sub getFormFieldDict
2882{
2883   my $self = shift;
2884   my $field = shift;
2885
2886   return if (!defined $field);
2887
2888   my $dict = {};
2889   if ($field->{formparent})
2890   {
2891      $dict = $self->getFormFieldDict($field->{formparent});
2892   }
2893   my $olddict = $self->getValue($field);
2894
2895   if ($olddict->{DR})
2896   {
2897      $dict->{DR} ||= CAM::PDF::Node->new('dictionary', {});
2898      my $dr = $self->getValue($dict->{DR});
2899      my $olddr = $self->getValue($olddict->{DR});
2900      for my $key (keys %{$olddr})
2901      {
2902         if ($dr->{$key})
2903         {
2904            if ($key eq 'Font')
2905            {
2906               my $fonts = $self->getValue($olddr->{$key});
2907               for my $font (keys %{$fonts})
2908               {
2909                  $dr->{$key}->{$font} = $self->copyObject($fonts->{$font});
2910               }
2911            }
2912            else
2913            {
2914               warn "Unknown resource key '$key' in form field dictionary";
2915            }
2916         }
2917         else
2918         {
2919            $dr->{$key} = $self->copyObject($olddr->{$key});
2920         }
2921      }
2922   }
2923
2924   # Some properties are simple: inherit means override
2925   for my $prop (qw(Q DA Ff V FT))
2926   {
2927      if ($olddict->{$prop})
2928      {
2929         $dict->{$prop} = $self->copyObject($olddict->{$prop});
2930      }
2931   }
2932
2933   return $dict;
2934}
2935
2936################################################################################
2937
2938=back
2939
2940=head2 Data/Object Manipulation
2941
2942=over
2943
2944=item $doc->setPrefs($ownerpass, $userpass, $print?, $modify?, $copy?, $add?)
2945
2946Alter the document's security information.  Note that modifying these
2947parameters must be done respecting the intellectual property of the
2948original document.  See Adobe's statement in the introduction of the
2949reference manual.
2950
2951B<Important Note:> Most PDF readers (Acrobat, Preview.app) only offer
2952one password field for opening documents.  So, if the C<$ownerpass>
2953and C<$userpass> are different, those applications cannot read the
2954documents.  (Perhaps this is a bug in CAM::PDF?)
2955
2956Note: any omitted booleans default to false.  So, these two are
2957equivalent:
2958
2959    $doc->setPrefs('password', 'password');
2960    $doc->setPrefs('password', 'password', 0, 0, 0, 0);
2961
2962=cut
2963
2964sub setPrefs
2965{
2966   my ($self, @prefs) = @_;
2967
2968   my $p = $self->{crypt}->encode_permissions(@prefs[2..5]);
2969   $self->{crypt}->set_passwords($self, @prefs[0..1], $p);
2970   return;
2971}
2972
2973=item $doc->setName($object, $name)
2974
2975I<For INTERNAL use>
2976
2977Change the name of a PDF object structure.
2978
2979=cut
2980
2981sub setName
2982{
2983   my $self = shift;
2984   my $objnode = shift;
2985   my $name = shift;
2986
2987   if ($name && $objnode->{value}->{type} eq 'dictionary')
2988   {
2989      $objnode->{value}->{value}->{Name} = CAM::PDF::Node->new('label', $name, $objnode->{objnum}, $objnode->{gennum});
2990      if ($objnode->{objnum})
2991      {
2992         $self->{changes}->{$objnode->{objnum}} = 1;
2993      }
2994      return $self;
2995   }
2996   return;
2997}
2998
2999=item $doc->removeName($object)
3000
3001I<For INTERNAL use>
3002
3003Delete the name of a PDF object structure.
3004
3005=cut
3006
3007sub removeName
3008{
3009   my $self = shift;
3010   my $objnode = shift;
3011
3012   if ($objnode->{value}->{type} eq 'dictionary' && exists $objnode->{value}->{value}->{Name})
3013   {
3014      delete $objnode->{value}->{value}->{Name};
3015      if ($objnode->{objnum})
3016      {
3017         $self->{changes}->{$objnode->{objnum}} = 1;
3018      }
3019      return $self;
3020   }
3021   return;
3022}
3023
3024
3025=item $doc->pageAddName($pagenum, $name, $objectnum)
3026
3027I<For INTERNAL use>
3028
3029Append a named object to the metadata for a given page.
3030
3031=cut
3032
3033sub pageAddName
3034{
3035   my $self = shift;
3036   my $pagenum = shift;
3037   my $name = shift;
3038   my $key = shift;
3039
3040   $self->_buildNameTable($pagenum);
3041   my $page = $self->getPage($pagenum);
3042   my ($objnum, $gennum) = $self->getPageObjnum($pagenum);
3043
3044   if (!exists $self->{NameObjects}->{$pagenum})
3045   {
3046      if ($objnum)
3047      {
3048         $self->{changes}->{$objnum} = 1;
3049      }
3050      if (!exists $page->{Resources})
3051      {
3052         $page->{Resources} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum);
3053      }
3054      my $r = $self->getValue($page->{Resources});
3055      if (!exists $r->{XObject})
3056      {
3057         $r->{XObject} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum);
3058      }
3059      $self->{NameObjects}->{$pagenum} = $self->getValue($r->{XObject});
3060   }
3061
3062   $self->{NameObjects}->{$pagenum}->{$name} = CAM::PDF::Node->new('reference', $key, $objnum, $gennum);
3063   if ($objnum)
3064   {
3065      $self->{changes}->{$objnum} = 1;
3066   }
3067   return;
3068}
3069
3070=item $doc->setPageContent($pagenum, $content)
3071
3072=item $doc->setPageContent($pagenum, $tree->toString)
3073
3074Replace the content of the specified page with a new version.  This
3075function is often used after the getPageContent() function and some
3076manipulation of the returned string from that function.
3077
3078If your content is a parsed tree (i.e. the result of
3079getPageContentTree) then you should serialize it via toString first.
3080
3081=cut
3082
3083sub setPageContent
3084{
3085   my $self = shift;
3086   my $pagenum = shift;
3087   my $content = shift;
3088
3089   # Note that this *could* be implemented as
3090   #   delete current content
3091   #   appendPageContent
3092   # but that would lose the optimization below of reusing the content
3093   # object, where possible
3094
3095   my $page = $self->getPage($pagenum);
3096
3097   my $stream = $self->createStreamObject($content, 'FlateDecode');
3098   if ($page->{Contents} && $page->{Contents}->{type} eq 'reference')
3099   {
3100      my $key = $page->{Contents}->{value};
3101      $self->replaceObject($key, undef, $stream, 0);
3102   }
3103   else
3104   {
3105      my ($objnum, $gennum) = $self->getPageObjnum($pagenum);
3106      my $key = $self->appendObject(undef, $stream, 0);
3107      $page->{Contents} = CAM::PDF::Node->new('reference', $key, $objnum, $gennum);
3108      $self->{changes}->{$objnum} = 1;
3109   }
3110   return;
3111}
3112
3113=item $doc->appendPageContent($pagenum, $content)
3114
3115Add more content to the specified page.  Note that this function does
3116NOT do any page metadata work for you (like creating font objects for
3117any newly defined fonts).
3118
3119=cut
3120
3121sub appendPageContent
3122{
3123   my $self = shift;
3124   my $pagenum = shift;
3125   my $content = shift;
3126
3127   my $page = $self->getPage($pagenum);
3128
3129   my ($objnum, $gennum) = $self->getPageObjnum($pagenum);
3130   my $stream = $self->createStreamObject($content, 'FlateDecode');
3131   my $key = $self->appendObject(undef, $stream, 0);
3132   my $streamref = CAM::PDF::Node->new('reference', $key, $objnum, $gennum);
3133
3134   if (!$page->{Contents})
3135   {
3136      $page->{Contents} = $streamref;
3137   }
3138   elsif ($page->{Contents}->{type} eq 'array')
3139   {
3140      push @{$page->{Contents}->{value}}, $streamref;
3141   }
3142   elsif ($page->{Contents}->{type} eq 'reference')
3143   {
3144      $page->{Contents} = CAM::PDF::Node->new('array', [ $page->{Contents}, $streamref ], $objnum, $gennum);
3145   }
3146   else
3147   {
3148      die "Unsupported Content type \"$page->{Contents}->{type}\" on page $pagenum\n";
3149   }
3150   $self->{changes}->{$objnum} = 1;
3151   return;
3152}
3153
3154=item $doc->extractPages($pages...)
3155
3156Remove all pages from the PDF except the specified ones.  Like
3157deletePages(), the pages can be multiple arguments, comma separated
3158lists, ranges (open or closed).
3159
3160=cut
3161
3162sub extractPages  ## no critic (Unpack)
3163{
3164   my $self = shift;
3165   return $self if (@_ == 0); # no-work shortcut
3166   my @pages = $self->rangeToArray(1,$self->numPages(),@_);
3167
3168   if (@pages == 0)
3169   {
3170      croak 'Tried to delete all the pages';
3171   }
3172
3173   my %pages = map {$_ => 1} @pages; # eliminate duplicates
3174
3175   # make a list that is the complement of the @pages list
3176   my @delete = grep {!$pages{$_}} 1..$self->numPages();
3177
3178   return $self if (@delete == 0); # no-work shortcut
3179   return $self->_deletePages(@delete);
3180}
3181
3182=item $doc->deletePages($pages...)
3183
3184Remove the specified pages from the PDF.  The pages can be multiple
3185arguments, comma separated lists, ranges (open or closed).
3186
3187=cut
3188
3189sub deletePages  ## no critic (Unpack)
3190{
3191   my $self = shift;
3192   return $self if (@_ == 0); # no-work shortcut
3193   my @pages = $self->rangeToArray(1,$self->numPages(),@_);
3194
3195   return $self if (@pages == 0); # no-work shortcut
3196
3197   my %pages = map {$_ => 1} @pages; # eliminate duplicates
3198
3199   if ($self->numPages() == scalar keys %pages)
3200   {
3201      croak 'Tried to delete all the pages';
3202   }
3203
3204   return $self->_deletePages(keys %pages);
3205}
3206
3207sub _deletePages
3208{
3209   my ($self, @pages) = @_;
3210
3211   # Pages should be reverse sorted since we need to delete from the
3212   # end to make the page numbers come out right.
3213   my @objnums;
3214   for my $page (reverse sort {$a <=> $b} @pages)
3215   {
3216      my $objnum = $self->_deletePage($page);
3217      if (!$objnum)
3218      {
3219         $self->_deleteRefsToPages(@objnums);  # emergency cleanup to prevent corruption
3220         return;
3221      }
3222      push @objnums, $objnum;
3223   }
3224   $self->_deleteRefsToPages(@objnums);
3225   $self->cleanse();
3226   return $self;
3227}
3228
3229=item $doc->deletePage($pagenum)
3230
3231Remove the specified page from the PDF.  If the PDF has only one page,
3232this method will fail.
3233
3234=cut
3235
3236sub deletePage
3237{
3238   my $self = shift;
3239   my $pagenum = shift;
3240
3241   my $objnum = $self->_deletePage($pagenum);
3242   if ($objnum)
3243   {
3244      $self->_deleteRefsToPages($objnum);
3245      $self->cleanse();
3246   }
3247   return $objnum ? $self : ();
3248}
3249
3250# Internal method, called by deletePage() or deletePages()
3251# Returns the objnum of the deleted page
3252
3253sub _deletePage
3254{
3255   my $self = shift;
3256   my $pagenum = shift;
3257
3258   if ($self->numPages() <= 1) # don't delete the last page
3259   {
3260      croak 'Tried to delete the only page';
3261   }
3262   my ($objnum, $gennum) = $self->getPageObjnum($pagenum);
3263   if (!defined $objnum)
3264   {
3265      croak 'Tried to delete a non-existent page';
3266   }
3267
3268   $self->_deletePage_backPointers($objnum);
3269   $self->_deletePage_removeFromPageTree($pagenum);
3270
3271   # Removing the page is easy:
3272   $self->deleteObject($objnum);
3273
3274   # Caches are now bad for all pages from this one
3275   $self->decachePages($pagenum .. $self->numPages());
3276
3277   $self->{PageCount}--;
3278
3279   return $objnum;
3280}
3281
3282
3283sub _deletePage_backPointers
3284{
3285   my $self = shift;
3286   my $objnum = shift;
3287
3288   # Delete pointer from annotation back to the page
3289   my $page = $self->dereference($objnum);
3290   my $pagedict = $page->{value}->{value};
3291   if ($pagedict->{Annots})
3292   {
3293      my $annots = $self->getValue($pagedict->{Annots});
3294      if ($annots)
3295      {
3296         for my $annotref (@{$annots})
3297         {
3298            my $annot = $self->getValue($annotref);
3299            if ($annot)
3300            {
3301               delete $annot->{P};
3302            }
3303         }
3304      }
3305   }
3306   return;
3307}
3308
3309sub _deletePage_removeFromPageTree
3310{
3311   my $self = shift;
3312   my $pagenum = shift;
3313
3314   # Removing references to the page is hard:
3315   # (much of this code is lifted from getPage)
3316   my $parentdict;
3317   my $node = $self->dereference($self->getRootDict()->{Pages}->{value});
3318   my $nodedict = $node->{value}->{value};
3319   my $nodestart = 1;
3320   while ($node && $nodedict->{Type}->{value} eq 'Pages')
3321   {
3322      my $count;
3323      if ($nodedict->{Count}->{type} eq 'reference')
3324      {
3325         my $countobj = $self->dereference($nodedict->{Count}->{value});
3326         $count = $countobj->{value}->{value}--;
3327         $self->{changes}->{$countobj->{objnum}} = 1;
3328      }
3329      else
3330      {
3331         $count = $nodedict->{Count}->{value}--;
3332      }
3333      $self->{changes}->{$node->{objnum}} = 1;
3334
3335      if ($count == 1)
3336      {
3337         # only one left, so this is it
3338         if (!$parentdict)
3339         {
3340            croak 'Tried to delete the only page';
3341         }
3342         my $parentkids = $self->getValue($parentdict->{Kids});
3343         @{$parentkids} = grep {$_->{value} != $node->{objnum}} @{$parentkids};
3344         $self->{changes}->{$parentdict->{Kids}->{objnum}} = 1;
3345         $self->deleteObject($node->{objnum});
3346         last;
3347      }
3348
3349      my $kids = $self->getValue($nodedict->{Kids});
3350      if (@{$kids} == 1)
3351      {
3352         # Count was not 1, so this must not be a leaf node
3353         # hop down into node's child
3354
3355         my $sub = $self->dereference($kids->[0]->{value});
3356         my $subdict = $sub->{value}->{value};
3357         $parentdict = $nodedict;
3358         $node = $sub;
3359         $nodedict = $subdict;
3360      }
3361      else
3362      {
3363         # search through all kids
3364         for my $child (0 .. $#{$kids})
3365         {
3366            my $sub = $self->dereference($kids->[$child]->{value});
3367            my $subdict = $sub->{value}->{value};
3368
3369            if ($subdict->{Type}->{value} ne 'Pages')
3370            {
3371               if ($pagenum == $nodestart)
3372               {
3373                  # Got it!
3374                  splice @{$kids}, $child, 1;
3375                  $node = undef;  # flag that we are done
3376                  last;
3377               }
3378               else
3379               {
3380                  # Its a leaf, and not the right one.  Move on.
3381                  $nodestart++;
3382               }
3383            }
3384            else
3385            {
3386               # Type=='Pages' node
3387               my $child_count = $self->getValue($subdict->{Count});
3388               if ($nodestart + $child_count - 1 >= $pagenum)
3389               {
3390                  # The page we want is in this kid.  Descend.
3391                  $parentdict = $nodedict;
3392                  $node = $sub;
3393                  $nodedict = $subdict;
3394                  last;
3395               }
3396               else
3397               {
3398                  # Not in this kid.  Move on.
3399                  $nodestart += $child_count;
3400               }
3401            }
3402            if ($child == $#{$kids})
3403            {
3404               die "Internal error: did not find the page to delete -- corrupted page index\n";
3405            }
3406         }
3407      }
3408   }
3409   return;
3410}
3411
3412sub _deleteRefsToPages
3413{
3414   my ($self, @objnums) = @_;
3415   my %objnums = map {$_ => 1} @objnums;
3416
3417   my $root = $self->getRootDict();
3418   if ($root->{Names})
3419   {
3420      my $names = $self->getValue($root->{Names});
3421      if ($names->{Dests})
3422      {
3423         my $dests = $self->getValue($names->{Dests});
3424         if ($self->_deleteDests($dests, \%objnums))
3425         {
3426            delete $names->{Dests};
3427         }
3428      }
3429
3430      if (0 == scalar keys %{$names})
3431      {
3432         my $names_objnum = $root->{Names}->{value};
3433         $self->deleteObject($names_objnum);
3434         delete $root->{Names};
3435      }
3436   }
3437
3438   if ($root->{Outlines})
3439   {
3440      my $outlines = $self->getValue($root->{Outlines});
3441      $self->_deleteOutlines($outlines, \%objnums);
3442   }
3443   return;
3444}
3445
3446sub _deleteOutlines
3447{
3448   my $self = shift;
3449   my $outlines = shift;
3450   my $objnums = shift;
3451
3452   my @deletes;
3453   my @stack = ($outlines);
3454
3455   while (@stack > 0)
3456   {
3457      my $node = shift @stack;
3458
3459      # Check for a Destination (aka internal hyperlink)
3460      # A is indirect ref, Dest is direct ref; only one can be present
3461      my $dest;
3462      if ($node->{A})
3463      {
3464         $dest = $self->getValue($node->{A});
3465         $dest = $self->getValue($dest->{D});
3466      }
3467      elsif ($node->{Dest})
3468      {
3469         $dest = $self->getValue($node->{Dest});
3470      }
3471      if ($dest && (ref $dest) && (ref $dest) eq 'ARRAY')
3472      {
3473         my $ref = $dest->[0];
3474         if ($ref && $ref->{type} eq 'reference' && $objnums->{$ref->{value}})
3475         {
3476            $self->deleteObject($ref->{objnum});
3477            # Easier to just delete both, even though only one may exist
3478            delete $node->{A};
3479            delete $node->{Dest};
3480         }
3481      }
3482
3483      if ($node->{Next})
3484      {
3485         push @stack, $self->getValue($node->{Next});
3486      }
3487      if ($node->{First})
3488      {
3489         push @stack, $self->getValue($node->{First});
3490      }
3491   }
3492   return;
3493}
3494
3495sub _deleteDests  ## no critic(Subroutines::ProhibitExcessComplexity)
3496{
3497   my $self = shift;
3498   my $dests = shift;
3499   my $objnums = shift;
3500
3501   ## Accumulate the nodes to delete
3502   my @deletes;
3503   my @stack = ([$dests]);
3504
3505   while (@stack > 0)
3506   {
3507      my $chain = pop @stack;
3508      my $node = $chain->[0];
3509      if ($node->{Names})
3510      {
3511         my $pairs = $self->getValue($node->{Names});
3512         for (my $i=1; $i<@{$pairs}; $i+=2) ## no critic(ControlStructures::ProhibitCStyleForLoops)
3513         {
3514            push @stack, [$self->getValue($pairs->[$i]), @{$chain}];
3515         }
3516      }
3517      elsif ($node->{Kids})
3518      {
3519         my $list = $self->getValue($node->{Kids});
3520         push @stack, map {[$self->getValue($_), @{$chain}]} @{$list};
3521      }
3522      elsif ($node->{D})
3523      {
3524         my $props = $self->getValue($node->{D});
3525         my $ref = $props->[0];
3526         if ($ref && $ref->{type} eq 'reference' && $objnums->{$ref->{value}})
3527         {
3528            push @deletes, $chain;
3529         }
3530      }
3531   }
3532
3533   ## Delete the nodes, and their parents if applicable
3534   for my $chain (@deletes)
3535   {
3536      my $objnode = shift @{$chain};
3537      my $objnum = [values %{$objnode}]->[0]->{objnum};
3538      if (!$objnum)
3539      {
3540         die 'Destination object lacks an object number (number '.@{$chain}.' in the chain)';
3541      }
3542      $self->deleteObject($objnum);
3543
3544      # Ascend chain...  $objnum gets overwritten
3545
3546    CHAIN:
3547      for my $node (@{$chain})
3548      {
3549         last if (exists $node->{deleted});  # internal flag
3550
3551         my $node_objnum = [values %{$node}]->[0]->{objnum} || die;
3552
3553         if ($node->{Names})
3554         {
3555            my $pairs = $self->getValue($node->{Names});
3556            my $limits = $self->getValue($node->{Limits});
3557            my $redo_limits = 0;
3558
3559            # Find and remove child reference
3560            # iterate over keys of key-value array
3561            for (my $i=@{$pairs}-2; $i>=0; $i-=2) ## no critic(ControlStructures::ProhibitCStyleForLoops)
3562            {
3563               if ($pairs->[$i+1]->{value} == $objnum)
3564               {
3565                  my $name = $pairs->[$i]->{value} || die 'No name in Name tree';
3566                  splice @{$pairs}, $i, 2;
3567                  if ($limits->[0]->{value} eq $name || $limits->[1]->{value} eq $name)
3568                  {
3569                     $redo_limits = 1;
3570                  }
3571               }
3572            }
3573
3574            if (@{$pairs} > 0)
3575            {
3576               if ($redo_limits)
3577               {
3578                  my @names;
3579                  for (my $i=0; $i<@{$pairs}; $i+=2) ## no critic(ControlStructures::ProhibitCStyleForLoops)
3580                  {
3581                     push @names, $pairs->[$i]->{value};
3582                  }
3583                  @names = sort @names;
3584                  $limits->[0]->{value} = $names[0];
3585                  $limits->[1]->{value} = $names[-1];
3586               }
3587               last CHAIN;
3588            }
3589         }
3590
3591         elsif ($node->{Kids})
3592         {
3593            my $list = $self->getValue($node->{Kids});
3594
3595            # Find and remove child reference
3596            for my $i (reverse 0 .. $#{$list})
3597            {
3598               if ($list->[$i]->{value} == $objnum)
3599               {
3600                  splice @{$list}, $i, 1;
3601               }
3602            }
3603
3604            if (@{$list} > 0)
3605            {
3606               if ($node->{Limits})
3607               {
3608                  my $limits = $self->getValue($node->{Limits});
3609                  if (!$limits || @{$limits} != 2)
3610                  {
3611                     die 'Internal error: trouble parsing the Limits array in a name tree';
3612                  }
3613                  my @names;
3614                  for my $i (0..$#{$list})
3615                  {
3616                     my $child = $self->getValue($list->[$i]);
3617                     my $child_limits = $self->getValue($child->{Limits});
3618                     push @names, map {$_->{value}} @{$child_limits};
3619                  }
3620                  @names = sort @names;
3621                  $limits->[0]->{value} = $names[0];
3622                  $limits->[1]->{value} = $names[-1];
3623               }
3624               last CHAIN;
3625            }
3626         }
3627
3628         else
3629         {
3630            die 'Internal error: found a parent node with neither Names nor Kids.  This should be impossible.';
3631         }
3632
3633         # If we got here, the node is empty, so delete it and move onward
3634         $self->deleteObject($node_objnum);
3635         $node->{deleted} = undef;  # internal flag
3636
3637         # Prepare for next iteration
3638         $objnum = $node_objnum;
3639      }
3640   }
3641
3642   return exists $dests->{deleted};
3643}
3644
3645=item $doc->decachePages($pagenum, $pagenum, ...)
3646
3647Clears cached copies of the specified page data structures.  This is
3648useful if an operation has been performed that changes a page.
3649
3650=cut
3651
3652sub decachePages
3653{
3654   my ($self, @pages) = @_;
3655
3656   for (@pages)
3657   {
3658      delete $self->{pagecache}->{$_};
3659      delete $self->{Names}->{$_};
3660      delete $self->{NameObjects}->{$_};
3661   }
3662   delete $self->{Names}->{All};
3663   return $self;
3664}
3665
3666
3667=item $doc->addPageResources($pagenum, $resourcehash)
3668
3669Add the resources from the given object to the page resource
3670dictionary.  If the page does not have a resource dictionary, create
3671one.  This function avoids duplicating resources where feasible.
3672
3673=cut
3674
3675sub addPageResources
3676{
3677   my $self = shift;
3678   my $pagenum = shift;
3679   my $newrsrcs = shift;
3680
3681   return if (!$newrsrcs);
3682   my $page = $self->getPage($pagenum);
3683   return if (!$page);
3684
3685   my ($anyobj) = values %{$page};
3686   my $objnum = $anyobj->{objnum};
3687   my $gennum = $anyobj->{gennum};
3688
3689   my $pagersrcs;
3690   if ($page->{Resources})
3691   {
3692      $pagersrcs = $self->getValue($page->{Resources});
3693   }
3694   else
3695   {
3696      $pagersrcs = {};
3697      $page->{Resources} = CAM::PDF::Node->new('dictionary', $pagersrcs, $objnum, $gennum);
3698      $self->{changes}->{$objnum} = 1;
3699   }
3700   for my $type (keys %{$newrsrcs})
3701   {
3702      my $new_r = $self->getValue($newrsrcs->{$type});
3703      my $page_r;
3704      if ($pagersrcs->{$type})
3705      {
3706         $page_r = $self->getValue($pagersrcs->{$type});
3707      }
3708      if ($type eq 'Font')
3709      {
3710         if (!$page_r)
3711         {
3712            $page_r = {};
3713            $pagersrcs->{$type} = CAM::PDF::Node->new('dictionary', $page_r, $objnum, $gennum);
3714            $self->{changes}->{$objnum} = 1;
3715         }
3716         for my $font (keys %{$new_r})
3717         {
3718            next if (exists $page_r->{$font});
3719            my $val = $new_r->{$font};
3720            if ($val->{type} ne 'reference')
3721            {
3722               die 'Internal error: font entry is not a reference';
3723            }
3724            $page_r->{$font} = CAM::PDF::Node->new('reference', $val->{value}, $objnum, $gennum);
3725            $self->{changes}->{$objnum} = 1;
3726         }
3727      }
3728      elsif ($type eq 'ProcSet')
3729      {
3730         if (!$page_r)
3731         {
3732            $page_r = [];
3733            $pagersrcs->{$type} = CAM::PDF::Node->new('array', $page_r, $objnum, $gennum);
3734            $self->{changes}->{$objnum} = 1;
3735         }
3736         for my $proc (@{$new_r})
3737         {
3738            if ($proc->{type} ne 'label')
3739            {
3740               die 'Internal error: procset entry is not a label';
3741            }
3742            {
3743               ## no critic(BuiltinFunctions::ProhibitBooleanGrep) -- TODO: use any() instead
3744               next if (grep {$_->{value} eq $proc->{value}} @{$page_r});
3745            }
3746            push @{$page_r}, CAM::PDF::Node->new('label', $proc->{value}, $objnum, $gennum);
3747            $self->{changes}->{$objnum} = 1;
3748         }
3749      }
3750      elsif ($type eq 'Encoding')
3751      {
3752         # TODO: is this a hack or is it right?
3753         # EXPLICITLY skip /Encoding from form DR entry
3754      }
3755      else
3756      {
3757         warn "Internal error: unsupported resource type '$type'";
3758      }
3759   }
3760   return;
3761}
3762
3763=item $doc->appendPDF($pdf)
3764
3765Append pages from another PDF document to this one.  No optimization
3766is done -- the pieces are just appended and the internal table of
3767contents is updated.
3768
3769Note that this can break documents with annotations.  See the
3770F<appendpdf.pl> script for a workaround.
3771
3772=cut
3773
3774sub appendPDF
3775{
3776   my $self = shift;
3777   my $otherdoc = shift;
3778   my $prepend = shift; # boolean, default false
3779
3780   my $pageroot = $self->getPagesDict();
3781   my ($anyobj) = values %{$pageroot};
3782   my $objnum = $anyobj->{objnum};
3783   my $gennum = $anyobj->{gennum};
3784
3785   my $root = $self->getRootDict();
3786   my $otherroot = $otherdoc->getRootDict();
3787   my $otherpageobj = $otherdoc->dereference($otherroot->{Pages}->{value});
3788   my ($key, %refkeys) = $self->appendObject($otherdoc, $otherpageobj->{objnum}, 1);
3789   my $subpage = $self->getObjValue($key);
3790
3791   my $newdict = {};
3792   my $newpage = CAM::PDF::Node->new('object',
3793                                     CAM::PDF::Node->new('dictionary', $newdict));
3794   $newdict->{Type} = CAM::PDF::Node->new('label', 'Pages');
3795   $newdict->{Kids} = CAM::PDF::Node->new('array',
3796                                          [
3797                                           CAM::PDF::Node->new('reference', $prepend ? $key : $objnum),
3798                                           CAM::PDF::Node->new('reference', $prepend ? $objnum : $key),
3799                                           ]);
3800   $self->{PageCount} += $otherdoc->{PageCount};
3801   $newdict->{Count} = CAM::PDF::Node->new('number', $self->{PageCount});
3802   my $newpagekey = $self->appendObject(undef, $newpage, 0);
3803   $root->{Pages}->{value} = $newpagekey;
3804
3805   $pageroot->{Parent} = CAM::PDF::Node->new('reference', $newpagekey, $key, $subpage->{gennum});
3806   $subpage->{Parent} = CAM::PDF::Node->new('reference', $newpagekey, $key, $subpage->{gennum});
3807
3808   if ($otherroot->{AcroForm})
3809   {
3810      my $forms = $otherdoc->getValue($otherdoc->getValue($otherroot->{AcroForm})->{Fields});
3811      my @newforms;
3812      for my $reference (@{$forms})
3813      {
3814         if ($reference->{type} ne 'reference')
3815         {
3816            die 'Internal error: expected a reference';
3817         }
3818         my $newkey = $refkeys{$reference->{value}};
3819         if ($newkey)
3820         {
3821            push @newforms, CAM::PDF::Node->new('reference', $newkey);
3822         }
3823      }
3824      if ($root->{AcroForm})
3825      {
3826         my $mainforms = $self->getValue($self->getValue($root->{AcroForm})->{Fields});
3827         for my $reference (@newforms)
3828         {
3829            $reference->{objnum} = $mainforms->[0]->{objnum};
3830            $reference->{gennum} = $mainforms->[0]->{gennum};
3831         }
3832         push @{$mainforms}, @newforms;
3833      }
3834      else
3835      {
3836         die 'adding new forms is not implemented';
3837      }
3838   }
3839
3840   if ($prepend)
3841   {
3842      # clear caches
3843      $self->{pagecache} = {};
3844      $self->{Names} = {};
3845      $self->{NameObjects} = {};
3846   }
3847
3848   return $key;
3849}
3850
3851=item $doc->prependPDF($pdf)
3852
3853Just like appendPDF() except the new document is inserted on page 1
3854instead of at the end.
3855
3856=cut
3857
3858sub prependPDF
3859{
3860   my ($self, @args) = @_;
3861   return $self->appendPDF(@args, 1);
3862}
3863
3864=item $doc->duplicatePage($pagenum)
3865
3866=item $doc->duplicatePage($pagenum, $leaveblank)
3867
3868Inserts an identical copy of the specified page into the document.
3869The new page's number will be C<$pagenum + 1>.
3870
3871If C<$leaveblank> is true, the new page does not get any content.
3872Thus, the document is broken until you subsequently call
3873setPageContent().
3874
3875=cut
3876
3877sub duplicatePage
3878{
3879   my $self = shift;
3880   my $pagenum = shift;
3881   my $leave_blank = shift || 0;
3882
3883   my $page = $self->getPage($pagenum);
3884   my $objnum = $self->getPageObjnum($pagenum);
3885   my $newobjnum = $self->appendObject($self, $objnum, 0);
3886   my $newdict = $self->getObjValue($newobjnum);
3887   delete $newdict->{Contents};
3888   my $parent = $self->getValue($page->{Parent});
3889   push @{$self->getValue($parent->{Kids})}, CAM::PDF::Node->new('reference', $newobjnum);
3890
3891   while ($parent)
3892   {
3893      $self->{changes}->{$parent->{Count}->{objnum}} = 1;
3894      if ($parent->{Count}->{type} eq 'reference')
3895      {
3896         my $countobj = $self->dereference($parent->{Count}->{value});
3897         $countobj->{value}->{value}++;
3898         $self->{changes}->{$countobj->{objnum}} = 1;
3899      }
3900      else
3901      {
3902         $parent->{Count}->{value}++;
3903      }
3904      $parent = $self->getValue($parent->{Parent});
3905   }
3906   $self->{PageCount}++;
3907
3908   if (!$leave_blank)
3909   {
3910      $self->setPageContent($pagenum+1, $self->getPageContent($pagenum));
3911   }
3912
3913   # Caches are now bad for all pages from this one
3914   $self->decachePages($pagenum + 1 .. $self->numPages());
3915
3916   return $self;
3917}
3918
3919=item $doc->createStreamObject($content)
3920
3921=item $doc->createStreamObject($content, $filter ...)
3922
3923I<For INTERNAL use>
3924
3925Create a new Stream object.  This object is NOT added to the document.
3926Use the appendObject() function to do that after calling this
3927function.
3928
3929=cut
3930
3931sub createStreamObject
3932{
3933   my $self = shift;
3934   my $content = shift;
3935
3936   my $dict = CAM::PDF::Node->new('dictionary',
3937                                 {
3938                                    Length => CAM::PDF::Node->new('number', length $content),
3939                                    StreamData => CAM::PDF::Node->new('stream', $content),
3940                                 },
3941                                 );
3942
3943   my $objnode = CAM::PDF::Node->new('object', $dict);
3944
3945   while (my $filter = shift)
3946   {
3947      #warn "$filter encoding\n";
3948      $self->encodeOne($objnode->{value}, $filter);
3949   }
3950
3951   return $objnode;
3952}
3953
3954=item $doc->uninlineImages()
3955
3956=item $doc->uninlineImages($pagenum)
3957
3958Search the content of the specified page (or all pages if the
3959page number is omitted) for embedded images.  If there are any, replace
3960them with indirect objects.  This procedure uses heuristics to detect
3961in-line images, and is subject to confusion in extremely rare cases of text
3962that uses C<BI> and C<ID> a lot.
3963
3964=cut
3965
3966sub uninlineImages
3967{
3968   my $self = shift;
3969   my $pagenum = shift;
3970
3971   my $changes = 0;
3972   if (!$pagenum)
3973   {
3974      my $pages = $self->numPages();
3975      for my $p (1 .. $pages)
3976      {
3977         $changes += $self->uninlineImages($p);
3978      }
3979   }
3980   else
3981   {
3982      my $c = $self->getPageContent($pagenum);
3983      my $pos = 0;
3984      while (($pos = index $c, 'BI', $pos) != 1)
3985      {
3986         # manual \bBI check
3987         # if beginning of string or token
3988         if ($pos == 0 || (substr $c, $pos-1, 1) =~ m/ \W /xms)
3989         {
3990            my $part = substr $c, $pos;
3991            if ($part =~ m/ \A BI\b(.*?)\bID\b /xms)
3992            {
3993               my $im = $1;
3994
3995               ## Long series of tests to make sure this is really an
3996               ## image and not just coincidental text
3997
3998               # Fix easy cases of "BI text) BI ... ID"
3999               $im =~ s/ \A .*\bBI\b //xms;
4000               # There should never be an EI inside of a BI ... ID
4001               next if ($im =~ m/ \bEI\b /xms);
4002
4003               # Easy tests: is this the beginning or end of a string?
4004               # (these aren't really good tests...)
4005               next if ($im =~ m/ \A [)]    /xms);
4006               next if ($im =~ m/    [(] \z /xms);
4007
4008               # this is the most complex heuristic:
4009               # make sure that there is an open paren before every close
4010               # if not, then the "BI" or the "ID" was part of a string
4011               my $test = $im;  # make a copy we can scribble on
4012               my $failed = 0;
4013               # get rid of escaped parens for the test
4014               $test =~ s/ \\[()] //gxms;
4015               # Look for closing parens
4016               while ($test =~ s/ \A(.*?)[)] //xms)
4017               {
4018                  # If there is NOT an opening paren before the
4019                  # closing paren we detected above, then the start of
4020                  # our string is INSIDE a paren pair, thus a failure.
4021                  my $bit = $1;
4022                  if ($bit !~ m/ [(] /xms)
4023                  {
4024                     $failed = 1;
4025                     last;
4026                  }
4027               }
4028               next if ($failed);
4029
4030               # End of heuristics.  This is likely a real embedded image.
4031               # Now do the replacement.
4032
4033               my $oldlen = length $part;
4034               my $image = $self->parseInlineImage(\$part, undef);
4035               my $newlen = length $part;
4036               my $imagelen = $oldlen - $newlen;
4037
4038               # Construct a new image name like "I3".  Start with
4039               # "I1" and continue until we get an unused "I<n>"
4040               # (first, get the list of already-used labels)
4041               $self->_buildNameTable($pagenum);
4042               my $i = 1;
4043               my $name = 'Im1';
4044               while (exists $self->{Names}->{$pagenum}->{$name})
4045               {
4046                  $name = 'Im' . ++$i;
4047               }
4048
4049               $self->setName($image, $name);
4050               my $key = $self->appendObject(undef, $image, 0);
4051               $self->pageAddName($pagenum, $name, $key);
4052
4053               $c = (substr $c, 0, $pos) . "/$name Do" . (substr $c, $pos+$imagelen);
4054               $changes++;
4055            }
4056         }
4057      }
4058      if ($changes > 0)
4059      {
4060         $self->setPageContent($pagenum, $c);
4061      }
4062   }
4063   return $changes;
4064}
4065
4066=item $doc->appendObject($doc, $objectnum, $recurse?)
4067
4068=item $doc->appendObject($undef, $object, $recurse?)
4069
4070Duplicate an object from another PDF document and add it to this
4071document, optionally descending into the object and copying any other
4072objects it references.
4073
4074Like replaceObject(), the second form allows you to append a
4075newly-created block to the PDF.
4076
4077=cut
4078
4079sub appendObject
4080{
4081   my $self = shift;
4082   my $otherdoc = shift;
4083   my $otherkey = shift;
4084   my $follow = shift;
4085
4086   my $objnum = ++$self->{maxobj};
4087
4088   # Make sure our new object has a number higher than anything in
4089   # either document, otherwise the changeRefKeys might change
4090   # something twice!  We had a problem of 15 -> 134 -> 333 in 1.52
4091   # (private email with Charlie Katz)
4092   if ($otherdoc && $otherdoc->{maxobj} >= $objnum) {
4093      $objnum = $self->{maxobj} = $otherdoc->{maxobj} + 1;
4094   }
4095
4096   if (exists $self->{versions}->{$objnum}) {
4097      $self->{versions}->{$objnum}++;
4098   } else {
4099      $self->{versions}->{$objnum} = 0;
4100   }
4101
4102   my %refkeys = $self->replaceObject($objnum, $otherdoc, $otherkey, $follow);
4103   if (wantarray)
4104   {
4105      return ($objnum, %refkeys);
4106   }
4107   else
4108   {
4109      return $objnum;
4110   }
4111}
4112
4113=item $doc->replaceObject($objectnum, $doc, $objectnum, $recurse?)
4114
4115=item $doc->replaceObject($objectnum, $undef, $object)
4116
4117Duplicate an object from another PDF document and insert it into this
4118document, replacing an existing object.  Optionally descend into the
4119original object and copy any other objects it references.
4120
4121If the other document is undefined, then the object to copy is taken
4122to be an anonymous object that is not part of any other document.
4123This is useful when you've just created that anonymous object.
4124
4125=cut
4126
4127sub replaceObject
4128{
4129   my $self = shift;
4130   my $key = shift;
4131   my $otherdoc = shift;
4132   my $otherkey = shift;
4133   my $follow = shift;
4134
4135   # careful! 'undef' means something different from '0' here!
4136   if (!defined $follow)
4137   {
4138      $follow = 1;
4139   }
4140
4141   my $objnode;
4142   my $otherobj;
4143   if ($otherdoc)
4144   {
4145      $otherobj = $otherdoc->dereference($otherkey);
4146      $objnode = $self->copyObject($otherobj);
4147   }
4148   else
4149   {
4150      $objnode = $otherkey;
4151      if ($follow)
4152      {
4153         warn "Error: you cannot \"follow\" an object if it has no document.\n" .
4154             "Resetting follow = false and continuing....\n";
4155         $follow = 0;
4156      }
4157   }
4158
4159   $self->setObjNum($objnode, $key, 0);
4160
4161   # Preserve the name of the object
4162   if ($self->{xref}->{$key})  # make sure it isn't a brand new object
4163   {
4164      my $oldname = $self->getName($self->dereference($key));
4165      if ($oldname)
4166      {
4167         $self->setName($objnode, $oldname);
4168      }
4169      else
4170      {
4171         $self->removeName($objnode);
4172      }
4173   }
4174
4175   $self->{objcache}->{$key} = $objnode;
4176   $self->{changes}->{$key} = 1;
4177
4178   my %newrefkeys = ($otherkey, $key);
4179   if ($follow)
4180   {
4181      for my $oldrefkey ($otherdoc->getRefList($otherobj))
4182      {
4183         next if ($oldrefkey == $otherkey);
4184         my $newkey = $self->appendObject($otherdoc, $oldrefkey, 0);
4185         $newrefkeys{$oldrefkey} = $newkey;
4186      }
4187      my $already_changed = {}; # hash used by traverse() to avoid repeats
4188      $self->changeRefKeys($objnode, \%newrefkeys, $already_changed);
4189      for my $newkey (values %newrefkeys)
4190      {
4191         $self->changeRefKeys($self->dereference($newkey), \%newrefkeys, $already_changed);
4192      }
4193   }
4194   return (%newrefkeys);
4195}
4196
4197=item $doc->deleteObject($objectnum)
4198
4199Remove an object from the document.  This function does NOT take care
4200of dependencies on this object.
4201
4202=cut
4203
4204sub deleteObject
4205{
4206   my $self = shift;
4207   my $objnum = shift;
4208
4209   # DON'T clear versuion number! We need to keep this to increment later
4210   #delete $self->{versions}->{$objnum};
4211   delete $self->{objcache}->{$objnum};
4212   delete $self->{xref}->{$objnum};
4213   delete $self->{endxref}->{$objnum};
4214   delete $self->{changes}->{$objnum};
4215   return;
4216}
4217
4218=item $doc->cleanse()
4219
4220Remove unused objects.  I<WARNING:> this function breaks some PDF
4221documents because it removes objects that are strictly part of the
4222page model hierarchy, but which are required anyway (like some font
4223definition objects).
4224
4225=cut
4226
4227sub cleanse
4228{
4229   my $self = shift;
4230
4231   delete $self->{trailer}->{XRefStm}; # can't write this
4232   delete $self->getRootDict()->{PieceInfo}; # can't handle this one, too complicated
4233
4234   my $base = CAM::PDF::Node->new('dictionary', $self->{trailer});
4235   my @list = sort {$a<=>$b} $self->getRefList($base);
4236   #print join(',', @list), "\n";
4237
4238   for my $i (1 .. $self->{maxobj})
4239   {
4240      if (@list && $list[0] == $i)
4241      {
4242         shift @list;
4243      }
4244      else
4245      {
4246         #warn "delete object $i\n";
4247         $self->deleteObject($i);
4248      }
4249   }
4250   return;
4251}
4252
4253=item $doc->createID()
4254
4255I<For INTERNAL use>
4256
4257Generate a new document ID.  Contrary the Adobe recommendation, this
4258is a random number.
4259
4260=cut
4261
4262sub createID
4263{
4264   my $self = shift;
4265
4266   # Warning: this is non-repeatable, and depends on Linux!
4267
4268   my $addbytes;
4269   if ($self->{ID})
4270   {
4271      # do not change the first half of an existing ID
4272      $self->{ID} = substr $self->{ID}, 0, 16;
4273      $addbytes = 16;
4274   }
4275   else
4276   {
4277      $self->{ID} = q{};
4278      $addbytes = 32;
4279   }
4280
4281   # Append $addbytes random bytes
4282   # First try the system random number generator
4283   if (-f '/dev/urandom')
4284   {
4285      if (open my $fh, '<', '/dev/urandom')
4286      {
4287         my $bytes_read = read $fh, $self->{ID}, $addbytes, 32-$addbytes;
4288         close $fh;  ##no critic(Syscalls)
4289         $addbytes -= $bytes_read;
4290      }
4291   }
4292   # If that failed, use Perl's random number generator
4293   for (1..$addbytes)
4294   {
4295      $self->{ID} .= pack 'C', int rand 256;
4296   }
4297
4298   if ($self->{trailer})
4299   {
4300      $self->{trailer}->{ID} = CAM::PDF::Node->new('array',
4301                               [
4302                                CAM::PDF::Node->new('hexstring', substr $self->{ID}, 0, 16),
4303                                CAM::PDF::Node->new('hexstring', substr $self->{ID}, 16, 16),
4304                                ],
4305                               );
4306   }
4307
4308   return 1;
4309}
4310
4311=item $doc->fillFormFields($name => $value, ...)
4312
4313=item $doc->fillFormFields($opts_hash, $name => $value, ...)
4314
4315Set the default values of PDF form fields.  The name should be the
4316full hierarchical name of the field as output by the
4317getFormFieldList() function.  The argument list can be a hash if you
4318like.  A simple way to use this function is something like this:
4319
4320    my %fields = (fname => 'John', lname => 'Smith', state => 'WI');
4321    $field{zip} = 53703;
4322    $self->fillFormFields(%fields);
4323
4324If the first argument is a hash reference, it is interpreted as
4325options for how to render the filled data:
4326
4327=over
4328
4329=item background_color =E<lt> 'none' | $gray | [$r, $g, $b]
4330
4331Specify the background color for the text field.
4332
4333=item max_autoscale_fontsize =E<lt> $size
4334
4335=item min_autoscale_fontsize =E<lt> $size
4336
4337If the form field is set to auto-size the text to fit, then you may
4338use these options to constrain the limits of that
4339autoscaling. Otherwise, for example, a very long string will become
4340arbitrarily small to fit in the box.
4341
4342=back
4343
4344=cut
4345
4346sub fillFormFields  ## no critic(Subroutines::ProhibitExcessComplexity, Unpack)
4347{
4348   my $self = shift;
4349   my $opts = ref $_[0] ? shift : {};
4350   my @list = (@_);
4351
4352   my %opts = (
4353      background_color => 1,
4354      %{$opts},
4355   );
4356
4357   my $filled = 0;
4358   while (@list > 0)
4359   {
4360      my $key = shift @list;
4361      my $value = shift @list;
4362      if (!defined $value)
4363      {
4364         $value = q{};
4365      }
4366
4367      next if (!$key);
4368      next if (ref $key);
4369      my $objnode = $self->getFormField($key);
4370      next if (!$objnode);
4371
4372      my $objnum = $objnode->{objnum};
4373      my $gennum = $objnode->{gennum};
4374
4375      # This read-only dict includes inherited properties
4376      my $propdict = $self->getFormFieldDict($objnode);
4377
4378      # This read-write dict does not include inherited properties
4379      my $dict = $self->getValue($objnode);
4380      $dict->{V}  = CAM::PDF::Node->new('string', $value, $objnum, $gennum);
4381      #$dict->{DV} = CAM::PDF::Node->new('string', $value, $objnum, $gennum);
4382
4383      if ($propdict->{FT} && $self->getValue($propdict->{FT}) eq 'Tx')  # Is it a text field?
4384      {
4385         # Set up display of form value
4386         if (!$dict->{AP})
4387         {
4388            $dict->{AP} = CAM::PDF::Node->new('dictionary', {}, $objnum, $gennum);
4389         }
4390         if (!$dict->{AP}->{value}->{N})
4391         {
4392            my $newobj = CAM::PDF::Node->new('object',
4393                                            CAM::PDF::Node->new('dictionary',{}),
4394                                            );
4395            my $num = $self->appendObject(undef, $newobj, 0);
4396            $dict->{AP}->{value}->{N} = CAM::PDF::Node->new('reference', $num, $objnum, $gennum);
4397         }
4398         my $formobj = $self->dereference($dict->{AP}->{value}->{N}->{value});
4399         my $formonum = $formobj->{objnum};
4400         my $formgnum = $formobj->{gennum};
4401         my $formdict = $self->getValue($formobj);
4402         if (!$formdict->{Subtype})
4403         {
4404            $formdict->{Subtype} = CAM::PDF::Node->new('label', 'Form', $formonum, $formgnum);
4405         }
4406         my @rect = (0,0,0,0);
4407         if ($dict->{Rect})
4408         {
4409            ## no critic(Bangs::ProhibitNumberedNames)
4410            my $r = $self->getValue($dict->{Rect});
4411            my ($x1, $y1, $x2, $y2) = @{$r};
4412            @rect = (
4413               $self->getValue($x1),
4414               $self->getValue($y1),
4415               $self->getValue($x2),
4416               $self->getValue($y2),
4417            );
4418         }
4419         my $dx = $rect[2]-$rect[0];
4420         my $dy = $rect[3]-$rect[1];
4421         if (!$formdict->{BBox})
4422         {
4423            $formdict->{BBox} = CAM::PDF::Node->new('array',
4424                                                   [
4425                                                    CAM::PDF::Node->new('number', 0, $formonum, $formgnum),
4426                                                    CAM::PDF::Node->new('number', 0, $formonum, $formgnum),
4427                                                    CAM::PDF::Node->new('number', $dx, $formonum, $formgnum),
4428                                                    CAM::PDF::Node->new('number', $dy, $formonum, $formgnum),
4429                                                    ],
4430                                                   $formonum,
4431                                                   $formgnum);
4432         }
4433         my $text = $value;
4434         $text =~ s/ \r\n? /\n/gxms;
4435         $text =~ s/ \n+\z //xms;
4436
4437         my @rsrcs;
4438         my $fontmetrics = 0;
4439         my $fontname    = q{};
4440         my $fontsize    = 0;
4441         my $da          = q{};
4442         my $tl          = q{};
4443         my $border      = 2;
4444         my $tx          = $border;
4445         my $ty          = $border + 2;
4446         my $stringwidth;
4447         if ($propdict->{DA}) {
4448            $da = $self->getValue($propdict->{DA});
4449
4450            # Try to pull out all of the resources used in the text object
4451            @rsrcs = ($da =~ m{ /([^\s<>/\[\]()]+) }gxms);
4452
4453            # Try to pull out the font size, if any.  If more than
4454            # one, pick the last one.  Font commands look like:
4455            # "/<fontname> <size> Tf"
4456            if ($da =~ m{ \s*/(\w+)\s+(\d+)\s+Tf.*? \z }xms)
4457            {
4458               $fontname = $1;
4459               $fontsize = $2;
4460               if ($fontname)
4461               {
4462                  if ($propdict->{DR})
4463                  {
4464                     my $dr = $self->getValue($propdict->{DR});
4465                     $fontmetrics = $self->getFontMetrics($dr, $fontname);
4466                  }
4467                  #print STDERR "Didn't get font\n" if (!$fontmetrics);
4468               }
4469            }
4470         }
4471
4472         my %flags = (
4473                      Justify => 'left',
4474                      );
4475         if ($propdict->{Ff})
4476         {
4477            # Just decode the ones we actually care about
4478            # PDF ref, 3rd ed pp 532,543
4479            my $ff = $self->getValue($propdict->{Ff});
4480            my @flags = split m//xms, unpack 'b*', pack 'V', $ff;
4481            $flags{ReadOnly}        = $flags[0];
4482            $flags{Required}        = $flags[1];
4483            $flags{NoExport}        = $flags[2];
4484            $flags{Multiline}       = $flags[12];
4485            $flags{Password}        = $flags[13];
4486            $flags{FileSelect}      = $flags[20];
4487            $flags{DoNotSpellCheck} = $flags[22];
4488            $flags{DoNotScroll}     = $flags[23];
4489         }
4490         if ($propdict->{Q})
4491         {
4492            my $q = $self->getValue($propdict->{Q}) || 0;
4493            $flags{Justify} = $q==2 ? 'right' : ($q==1 ? 'center' : 'left');
4494         }
4495
4496         # The order of the following sections is important!
4497         if ($flags{Password})
4498         {
4499            $text =~ s/ [^\n] /*/gxms;  # Asterisks for password characters
4500         }
4501
4502         if ($fontmetrics && ! $fontsize)
4503         {
4504            # Fix autoscale fonts
4505            $stringwidth = 0;
4506            my $lines = 0;
4507            for my $line (split /\n/xms, $text)  # trailing null strings omitted
4508            {
4509               $lines++;
4510               my $w = $self->getStringWidth($fontmetrics, $line);
4511               if ($w && $w > $stringwidth)
4512               {
4513                  $stringwidth = $w;
4514               }
4515            }
4516            $lines ||= 1;
4517            # Initial guess
4518            $fontsize = ($dy - 2 * $border) / ($lines * 1.5);
4519            my $fontwidth = $fontsize * $stringwidth;
4520            my $maxwidth = $dx - 2 * $border;
4521            if ($fontwidth > $maxwidth)
4522            {
4523               $fontsize *= $maxwidth / $fontwidth;
4524            }
4525
4526            # allow for user override
4527            if (exists $opts->{max_autoscale_fontsize} && $fontsize > $opts->{max_autoscale_fontsize}) {
4528               $fontsize = $opts->{max_autoscale_fontsize};
4529            }
4530            if (exists $opts->{min_autoscale_fontsize} && $fontsize < $opts->{min_autoscale_fontsize}) {
4531               $fontsize = $opts->{min_autoscale_fontsize};
4532            }
4533
4534            $da =~ s/ \/$fontname\s+0\s+Tf\b /\/$fontname $fontsize Tf/gxms;
4535         }
4536         if ($fontsize)
4537         {
4538            # This formula is TOTALLY empirical.  It's probably wrong.
4539            $ty = $border + 2 + (9 - $fontsize) * 0.4;
4540         }
4541
4542
4543         # escape characters
4544         $text = $self->writeString($text);
4545
4546         if ($flags{Multiline})
4547         {
4548            # TODO: wrap the field with wrapString()??
4549            # Shawn Dawson of Silent Solutions pointed out that this does not auto-wrap the input text
4550
4551            my $linebreaks = $text =~ s/ \\n /\) Tj T* \(/gxms;
4552
4553            # Total guess work:
4554            # line height is either 150% of fontsize or thrice
4555            # the corner offset
4556            $tl = $fontsize ? $fontsize * 1.5 : $ty * 3;
4557
4558            # Bottom aligned
4559            #$ty += $linebreaks * $tl;
4560            # Top aligned
4561            $ty = $dy - $border - $tl;
4562
4563            if ($flags{Justify} ne 'left')
4564            {
4565               warn 'Justified text not supported for multiline fields';
4566            }
4567
4568            $tl .= ' TL';
4569         }
4570         else
4571         {
4572            if ($flags{Justify} ne 'left' && $fontmetrics)
4573            {
4574               my $width = $stringwidth || $self->getStringWidth($fontmetrics, $text);
4575               my $diff = $dx - $width*$fontsize;
4576
4577               if ($flags{Justify} eq 'center')
4578               {
4579                  $text = ($diff/2)." 0 Td $text";
4580               }
4581               elsif ($flags{Justify} eq 'right')
4582               {
4583                  $text = "$diff 0 Td $text";
4584               }
4585            }
4586         }
4587
4588         # Move text from lower left corner of form field
4589         my $tm = "1 0 0 1 $tx $ty Tm ";
4590
4591         # if not 'none', draw a background as a filled rectangle of solid color
4592         my $background_color
4593             = $opts{background_color} eq 'none' ? q{}
4594             : ref $opts{background_color}       ? "@{$opts{background_color}} rgb"
4595             :                                     "$opts{background_color} g";
4596         my $background = $background_color ? "$background_color 0 0 $dx $dy re f" : q{};
4597
4598         $text =  "$tl $da $tm $text Tj";
4599         $text = "$background /Tx BMC q 1 1 ".($dx-$border).q{ }.($dy-$border)." re W n BT $text ET Q EMC";
4600         my $len = length $text;
4601         $formdict->{Length} = CAM::PDF::Node->new('number', $len, $formonum, $formgnum);
4602         $formdict->{StreamData} = CAM::PDF::Node->new('stream', $text, $formonum, $formgnum);
4603
4604         if (@rsrcs > 0) {
4605            if (!$formdict->{Resources})
4606            {
4607               $formdict->{Resources} = CAM::PDF::Node->new('dictionary', {}, $formonum, $formgnum);
4608            }
4609            my $rdict = $self->getValue($formdict->{Resources});
4610            if (!$rdict->{ProcSet})
4611            {
4612               $rdict->{ProcSet} = CAM::PDF::Node->new('array',
4613                                                      [
4614                                                       CAM::PDF::Node->new('label', 'PDF', $formonum, $formgnum),
4615                                                       CAM::PDF::Node->new('label', 'Text', $formonum, $formgnum),
4616                                                       ],
4617                                                      $formonum,
4618                                                      $formgnum);
4619            }
4620            if (!$rdict->{Font})
4621            {
4622               $rdict->{Font} = CAM::PDF::Node->new('dictionary', {}, $formonum, $formgnum);
4623            }
4624            my $fdict = $self->getValue($rdict->{Font});
4625
4626            # Search out font resources.  This is a total kluge.
4627            # TODO: the right way to do this is to look for the DR
4628            # attribute in the form element or it's ancestors.
4629            for my $font (@rsrcs)
4630            {
4631               my $fobj = $self->dereference("/$font", 'All');
4632               if (!$fobj)
4633               {
4634                  die "Could not find resource /$font while preparing form field $key\n";
4635               }
4636               $fdict->{$font} = CAM::PDF::Node->new('reference', $fobj->{objnum}, $formonum, $formgnum);
4637            }
4638         }
4639      }
4640      $filled++;
4641   }
4642   return $filled;
4643}
4644
4645
4646=item $doc->clearFormFieldTriggers($name, $name, ...)
4647
4648Disable any triggers set on data entry for the specified form field
4649names.  This is useful in the case where, for example, the data entry
4650Javascript forbids punctuation and you want to prefill with a
4651hyphenated word.  If you don't clear the trigger, the prefill may not
4652happen.
4653
4654=cut
4655
4656sub clearFormFieldTriggers
4657{
4658   my ($self, @fieldnames) = @_;
4659
4660   for my $fieldname (@fieldnames)
4661   {
4662      my $objnode = $self->getFormField($fieldname);
4663      if ($objnode)
4664      {
4665         if (exists $objnode->{value}->{value}->{AA})
4666         {
4667            delete $objnode->{value}->{value}->{AA};
4668            my $objnum = $objnode->{objnum};
4669            if ($objnum)
4670            {
4671               $self->{changes}->{$objnum} = 1;
4672            }
4673         }
4674      }
4675   }
4676   return;
4677}
4678
4679=item $doc->clearAnnotations()
4680
4681Remove all annotations from the document.  If form fields are
4682encountered, their text is added to the appropriate page.
4683
4684=cut
4685
4686sub clearAnnotations
4687{
4688   my $self = shift;
4689
4690   my $formrsrcs;
4691   my $root = $self->getRootDict();
4692   if ($root->{AcroForm})
4693   {
4694      my $acroform = $self->getValue($root->{AcroForm});
4695      # Get the form resources
4696      if ($acroform->{DR})
4697      {
4698         $formrsrcs = $self->getValue($acroform->{DR});
4699      }
4700
4701      # Kill off the forms
4702      $self->deleteObject($root->{AcroForm}->{value});
4703      delete $root->{AcroForm};
4704   }
4705
4706   # Iterate through the pages, deleting annotations
4707
4708   my $pages = $self->numPages();
4709   for my $p (1..$pages)
4710   {
4711      my $page = $self->getPage($p);
4712      if ($page->{Annots}) {
4713         $self->addPageResources($p, $formrsrcs);
4714         my $annotsarray = $self->getValue($page->{Annots});
4715         delete $page->{Annots};
4716         for my $annotref (@{$annotsarray})
4717         {
4718            my $annot = $self->getValue($annotref);
4719            if ((ref $annot) ne 'HASH')
4720            {
4721               die 'Internal error: annotation is not a dictionary';
4722            }
4723            # Copy all text field values into the page, if present
4724            if ($annot->{Subtype} &&
4725                $annot->{Subtype}->{value} eq 'Widget' &&
4726                $annot->{FT} &&
4727                $annot->{FT}->{value} eq 'Tx' &&
4728                $annot->{AP})
4729            {
4730               my $ap = $self->getValue($annot->{AP});
4731               my $rect = $self->getValue($annot->{Rect});
4732               my $x = $self->getValue($rect->[0]);
4733               my $y = $self->getValue($rect->[1]);
4734               if ($ap->{N})
4735               {
4736                  my $n = $self->dereference($ap->{N}->{value})->{value};
4737                  my $content = $self->decodeOne($n, 0);
4738                  if (!$content)
4739                  {
4740                     die 'Internal error: expected a content stream from the form copy';
4741                  }
4742                  $content =~ s/ \bre(\s+)f\b /re$1n/gxms;
4743                  $content = "q 1 0 0 1 $x $y cm\n$content Q\n";
4744                  $self->appendPageContent($p, $content);
4745                  $self->addPageResources($p, $self->getValue($n->{value}->{Resources}));
4746               }
4747            }
4748            $self->deleteObject($annotref->{value});
4749         }
4750      }
4751   }
4752
4753   # kill off the annotation dependencies
4754   $self->cleanse();
4755   return;
4756}
4757
4758=item $doc->previousRevision()
4759
4760If this PDF was previously saved in append mode (that is, if
4761C<clean()> was not invoked on it), return a new instance representing
4762that previous version.  Otherwise return void.  If this is an
4763encrypted PDF, this method assumes that previous revisions were
4764encrypted with the same password, which may be an incorrect
4765assumption.
4766
4767=cut
4768
4769sub previousRevision {
4770   my $self = shift;
4771
4772   my $content = \$self->{content};
4773   return if !${$content};  # already wiped...
4774
4775   # Figure out line end character
4776   my ($lineend) = ${$content} =~ m/ (.)%%EOF.*?\z /xms;
4777   return if !$lineend; # Corrupt PDF: Cannot find the end-of-file marker
4778
4779   my $eof = $lineend.'%%EOF';
4780   my $i = rindex ${$content}, $eof;
4781   my $j = rindex ${$content}, $eof, $i-1;
4782   return if $j < 0; # just one revision
4783
4784   my $prev_content = (substr ${$content}, 0, $j) . $eof . $lineend;
4785   # assume the passwords were the same in the previous rev
4786   my ($opass, $upass, @perms) = $self->getPrefs;
4787
4788   return __PACKAGE__->new($prev_content, $opass, $upass);
4789}
4790
4791=item $doc->allRevisions()
4792
4793Accumulate CAM::PDF instances returned by C<previousRevision> until
4794there are no more previous revisions.  Returns a list of instances
4795from newest to oldest including this instance as the newest.
4796
4797=cut
4798
4799sub allRevisions {
4800   my ($self) = @_;
4801   my @revs;
4802   for (my $pdf = $self; $pdf; $pdf = $pdf->previous_revision) {  ## no critic(ProhibitCStyleForLoops)
4803      push @revs, $pdf;
4804   }
4805   return @revs;
4806}
4807
4808################################################################################
4809
4810=back
4811
4812=head2 Document Writing
4813
4814=over
4815
4816=item $doc->preserveOrder()
4817
4818Try to recreate the original document as much as possible.  This may
4819help in recreating documents which use undocumented tricks of saving
4820font information in adjacent objects.
4821
4822=cut
4823
4824sub preserveOrder
4825{
4826   # Call this to record the order of the objects in the original file
4827   # If called, then any new file will try to preserve the original order
4828   my $self = shift;
4829
4830   my $x = $self->{xref}; # shorthand
4831   $self->{order} = [sort {$x->{$a} <=> $x->{$b}} grep {!ref $x->{$_}} keys %{$x}];
4832   return;
4833}
4834
4835=item $doc->isLinearized()
4836
4837Returns a boolean indicating whether this PDF is linearized (aka
4838"optimized").
4839
4840=cut
4841
4842sub isLinearized
4843{
4844   my $self = shift;
4845
4846   my $first;
4847   if (exists $self->{order})
4848   {
4849      $first = $self->{order}->[0];
4850   }
4851   else
4852   {
4853      my $x = $self->{xref}; # shorthand
4854      ($first) = sort {$x->{$a} <=> $x->{$b}} grep {!ref $x->{$_}} keys %{$x};
4855   }
4856
4857   my $linearized; # false
4858   my $objnode = $self->dereference($first);
4859   if ($objnode && $objnode->{value}->{type} eq 'dictionary')
4860   {
4861      if (exists $objnode->{value}->{value}->{Linearized})
4862      {
4863         $linearized = $self; # true
4864      }
4865   }
4866   return $linearized;
4867}
4868
4869=item $doc->delinearize()
4870
4871I<For INTERNAL use>
4872
4873Undo the tweaks used to make the document 'optimized'.  This function
4874is automatically called on every save or output since this library
4875does not yet support linearized documents.
4876
4877=cut
4878
4879sub delinearize
4880{
4881   my $self = shift;
4882
4883   return if ($self->{delinearized});
4884
4885   # Turn off Linearization, if set
4886   my $first;
4887   if (exists $self->{order})
4888   {
4889      $first = $self->{order}->[0];
4890   }
4891   else
4892   {
4893      my $x = $self->{xref}; # shorthand
4894      ($first) = sort {$x->{$a} <=> $x->{$b}} grep {!ref $x->{$_}} keys %{$x};
4895   }
4896
4897   my $objnode = $self->dereference($first);
4898   if ($objnode->{value}->{type} eq 'dictionary')
4899   {
4900      if (exists $objnode->{value}->{value}->{Linearized})
4901      {
4902         $self->deleteObject($first);
4903      }
4904   }
4905
4906   $self->{delinearized} = 1;
4907   return;
4908}
4909
4910=item $doc->clean()
4911
4912Cache all parts of the document and throw away it's old structure.
4913This is useful for writing PDFs anew, instead of simply appending
4914changes to the existing documents.  This is called by cleansave() and
4915cleanoutput().
4916
4917=cut
4918
4919sub clean
4920{
4921   my $self = shift;
4922
4923   # Make sure to extract everything before we wipe the old version
4924   $self->cacheObjects();
4925
4926   $self->delinearize();
4927
4928   # Update the ID number to make this document distinct from the original.
4929   # If there is already an ID, only the second half is changed
4930   $self->createID();
4931
4932   # Mark everything changed
4933   %{$self->{changes}} = (
4934                         %{$self->{changes}},
4935                         map { $_ => 1 } keys %{$self->{xref}},
4936                         );
4937
4938   # Mark everything new
4939   %{$self->{versions}} = (
4940                          %{$self->{versions}},
4941                          map { $_ => 0 } keys %{$self->{xref}},
4942                          );
4943
4944   $self->{xref} = {};
4945   delete $self->{endxref};
4946   $self->{startxref} = 0;
4947   $self->{content} = q{};
4948   $self->{contentlength} = 0;
4949
4950   my $trailer = $self->{trailer};
4951   delete $trailer->{Prev};
4952   delete $trailer->{XRefStm};
4953   if (exists $trailer->{Type} && $trailer->{Type}->{value} eq 'XRef') {
4954      delete $trailer->{Type};
4955      delete $trailer->{Size};
4956      delete $trailer->{Index};
4957      delete $trailer->{W};
4958      delete $trailer->{Length};
4959      delete $trailer->{L};
4960      delete $trailer->{StreamData};
4961      delete $trailer->{Filter};
4962      delete $trailer->{F};
4963      delete $trailer->{DecodeParms};
4964      delete $trailer->{DP};
4965   }
4966   return;
4967}
4968
4969=item $doc->needsSave()
4970
4971Returns a boolean indicating whether the save() method needs to be
4972called.  Like save(), this has nothing to do with whether the document
4973has been saved to disk, but whether the in-memory representation of
4974the document has been serialized.
4975
4976=cut
4977
4978sub needsSave
4979{
4980   my $self = shift;
4981
4982   return 0 != keys %{$self->{changes}};
4983}
4984
4985=item $doc->save()
4986
4987Serialize the document into a single string.  All changed document
4988elements are normalized, and a new index and an updated trailer are
4989created.
4990
4991This function operates solely in memory.  It DOES NOT write the
4992document to a file.  See the output() function for that.
4993
4994=cut
4995
4996sub save
4997{
4998   my $self = shift;
4999
5000   if (!$self->needsSave())
5001   {
5002      return $self;
5003   }
5004
5005   $self->delinearize();
5006
5007   delete $self->{endxref};
5008
5009   if (!$self->{content})
5010   {
5011      $self->{content} = '%PDF-' . $self->{pdfversion} . "\n%\217\n";
5012   }
5013
5014   my %allobjs = (%{$self->{changes}}, %{$self->{xref}});
5015   my @objects = sort {$a<=>$b} keys %allobjs;
5016   if ($self->{order})
5017   {
5018
5019      # Sort in the order in $self->{order} array, with the rest later
5020      # in objnum order
5021      my %o;
5022      my $n = @{$self->{order}};
5023      for my $i (0 .. $n-1)
5024      {
5025         $o{$self->{order}->[$i]} = $i;
5026      }
5027      @objects = map {$_->[1]} sort {$a->[0] <=> $b->[0]} map {[$o{$_} || $_+$n, $_]} @objects;
5028   }
5029   delete $self->{order};
5030
5031   my %newxref;
5032   my $offset = length $self->{content};
5033   for my $key (@objects)
5034   {
5035      next if (!$self->{changes}->{$key});
5036      $newxref{$key} = $offset;
5037
5038      #print "Writing object $key\n";
5039      my $obj = $self->writeObject($key);
5040      $self->{content} .= $obj;
5041      $offset += length $obj;
5042
5043      $self->{xref}->{$key} = $newxref{$key};
5044      delete $self->{changes}->{$key};
5045   }
5046
5047   if ($self->{content} !~ m/ [\r\n] \z /xms)
5048   {
5049      $self->{content} .= "\n";
5050   }
5051
5052   my $startxref = length $self->{content};
5053
5054   # Append the new xref
5055   $self->{content} .= "xref\n";
5056   my %blocks = (
5057                 0 => "0000000000 65535 f \n",
5058                 );
5059   for my $key (keys %newxref)
5060   {
5061      $blocks{$key} = sprintf "%010d %05d n \n", $newxref{$key}, $self->{versions}->{$key};
5062   }
5063
5064   # If there is only one version of the document, there must be no
5065   # holes in the xref.  Test for versions by checking if there's already an xref.
5066   # If clean() has been called, it will be absent
5067   if (!$self->{startxref})
5068   {
5069      # Fill in holes
5070      my $prevfreeblock = 0;
5071      for my $key (reverse 0 .. $self->{maxobj}-1)
5072      {
5073         if (!exists $blocks{$key})
5074         {
5075            # Add an entry to the free list
5076            # On $key == 0, this blows away the above definition of
5077            # the head of the free block list, but that's no big deal.
5078            $blocks{$key} = sprintf "%010d %05d f \n",
5079                                    $prevfreeblock, ($key == 0 ? 65_535 : 1);
5080            $prevfreeblock = $key;
5081         }
5082      }
5083   }
5084
5085   my $currblock = q{};
5086   my $currnum   = 0;
5087   my $currstart = 0;
5088   my @blockkeys = sort {$a<=>$b} keys %blocks;
5089   for my $i (0 .. $#blockkeys)
5090   {
5091      my $key = $blockkeys[$i];
5092      $currblock .= $blocks{$key};
5093      $currnum++;
5094      if ($i == $#blockkeys || $key+1 < $blockkeys[$i+1])
5095      {
5096         $self->{content} .= "$currstart $currnum\n$currblock";
5097         if ($i < $#blockkeys)
5098         {
5099            $currblock = q{};
5100            $currnum   = 0;
5101            $currstart = $blockkeys[$i+1];
5102         }
5103      }
5104   }
5105
5106   #   Append the new trailer
5107   $self->{trailer}->{Size} = CAM::PDF::Node->new('number', $self->{maxobj} + 1);
5108   if ($self->{startxref})
5109   {
5110      $self->{trailer}->{Prev} = CAM::PDF::Node->new('number', $self->{startxref});
5111   }
5112   $self->{content} .= "trailer\n" . $self->writeAny(CAM::PDF::Node->new('dictionary', $self->{trailer})) . "\n";
5113
5114   # Append the new startxref
5115   $self->{content} .= "startxref\n$startxref\n";
5116   $self->{startxref} = $startxref;
5117
5118   # Append EOF
5119   $self->{content} .= "%%EOF\n";
5120
5121   $self->{contentlength} = length $self->{content};
5122
5123   return $self;
5124}
5125
5126=item $doc->cleansave()
5127
5128Call the clean() function, then call the save() function.
5129
5130=cut
5131
5132sub cleansave
5133{
5134   my $self = shift;
5135
5136   $self->clean();
5137   return $self->save();
5138}
5139
5140=item $doc->output($filename)
5141
5142=item $doc->output()
5143
5144Save the document to a file.  The save() function is called first to
5145serialize the data structure.  If no filename is specified, or if the
5146filename is '-', the document is written to standard output.
5147
5148Note: it is the responsibility of the application to ensure that the
5149PDF document has either the Modify or Add permission.  You can do this
5150like the following:
5151
5152   if ($self->canModify()) {
5153      $self->output($outfile);
5154   } else {
5155      die "The PDF file denies permission to make modifications\n";
5156   }
5157
5158=cut
5159
5160sub output
5161{
5162   my $self = shift;
5163   my $file = shift;
5164   if (!defined $file)
5165   {
5166      $file = q{-};
5167   }
5168
5169   $self->save();
5170
5171   if ($file eq q{-})
5172   {
5173      binmode STDOUT; ##no critic(RequireCheckedSysCalls)
5174      print $self->{content};
5175   }
5176   else
5177   {
5178      open my $fh, '>', $file or die "Failed to write file $file\n";
5179      binmode $fh or die "Failed to set binmode for file $file\n";
5180      print {$fh} $self->{content};
5181      close $fh or die "Failed to write file $file\n";
5182   }
5183   return $self;
5184}
5185
5186=item $doc->cleanoutput($file)
5187
5188=item $doc->cleanoutput()
5189
5190Call the clean() function, then call the output() function to write a
5191fresh copy of the document to a file.
5192
5193=cut
5194
5195sub cleanoutput
5196{
5197   my $self = shift;
5198   my $file = shift;
5199
5200   $self->clean();
5201   return $self->output($file);
5202}
5203
5204=item $doc->writeObject($objnum)
5205
5206Return the serialization of the specified object.
5207
5208=cut
5209
5210sub writeObject
5211{
5212   my $self = shift;
5213   my $objnum = shift;
5214
5215   return "$objnum 0 " . $self->writeAny($self->dereference($objnum));
5216}
5217
5218=item $doc->writeString($string)
5219
5220Return the serialization of the specified string.  Works on normal or
5221hex strings.  If encryption is desired, the string should be encrypted
5222before being passed here.
5223
5224=cut
5225
5226sub writeString
5227{
5228   my $pkg_or_doc = shift;
5229   my $string = shift;
5230
5231   # Divide the string into manageable pieces, which will be
5232   # re-concatenated with "\" continuation characters at the end of
5233   # their lines
5234
5235   # -- This code used to do concatenation by juxtaposing multiple
5236   # -- "(<fragment>)" compenents, but this breaks many PDF
5237   # -- implementations (incl Acrobat5 and XPDF)
5238
5239   # Break the string into pieces of length $maxstr.  Note that an
5240   # artifact of this usage of split returns empty strings between
5241   # the fragments, so grep them out
5242
5243   my $maxstr = (ref $pkg_or_doc) ? $pkg_or_doc->{maxstr} : $CAM::PDF::MAX_STRING;
5244   my @strs = grep {$_ ne q{}} split /(.{$maxstr}})/xms, $string;
5245   for (@strs)
5246   {
5247      s/ \\     /\\\\/gxms;  # escape escapes -- this line must come first!
5248      s/ ([()]) /\\$1/gxms;  # escape parens
5249      s/ \n     /\\n/gxms;
5250      s/ \r     /\\r/gxms;
5251      s/ \t     /\\t/gxms;
5252      s/ \f     /\\f/gxms;
5253      # TODO: handle backspace char
5254      #s/ ???    /\\b/gxms;
5255   }
5256   return '(' . (join "\\\n", @strs) . ')';
5257}
5258
5259=item $doc->writeAny($node)
5260
5261Returns the serialization of the specified node.  This handles all
5262Node types, including object Nodes.
5263
5264=cut
5265
5266sub writeAny
5267{
5268   my $self = shift;
5269   my $objnode = shift;
5270
5271   if (! ref $objnode)
5272   {
5273      die 'Not a ref';
5274   }
5275
5276   my $key = $objnode->{type};
5277   my $val = $objnode->{value};
5278   my $objnum = $objnode->{objnum};
5279   my $gennum = $objnode->{gennum};
5280
5281   return $key eq 'string'     ? $self->writeString($self->{crypt}->encrypt($self, $val, $objnum, $gennum))
5282        : $key eq 'hexstring'  ? '<' . (unpack 'H*', $self->{crypt}->encrypt($self, $val, $objnum, $gennum)) . '>'
5283        : $key eq 'number'     ? "$val"
5284        : $key eq 'reference'  ? "$val 0 R" # TODO: lookup the gennum and use it instead of 0 (?)
5285        : $key eq 'boolean'    ? $val
5286        : $key eq 'null'       ? 'null'
5287        : $key eq 'label'      ? "/$val"
5288        : $key eq 'array'      ? $self->_writeArray($objnode)
5289        : $key eq 'dictionary' ? $self->_writeDictionary($objnode)
5290        : $key eq 'object'     ? $self->_writeObject($objnode)
5291
5292        : die "Unknown key '$key' in writeAny (objnum ".($objnum||'<none>').")\n";
5293}
5294
5295sub _writeArray
5296{
5297   my $self = shift;
5298   my $objnode = shift;
5299
5300   my $val = $objnode->{value};
5301   if (@{$val} == 0)
5302   {
5303      return '[ ]';
5304   }
5305   my $str = q{};
5306   my @strs;
5307   for (@{$val})
5308   {
5309      my $newstr = $self->writeAny($_);
5310      if ($str ne q{})
5311      {
5312         if ($self->{maxstr} < length $str . $newstr)
5313         {
5314            push @strs, $str;
5315            $str = q{};
5316         }
5317         else
5318         {
5319            $str .= q{ };
5320         }
5321      }
5322      $str .= $newstr;
5323   }
5324   if (@strs > 0)
5325   {
5326      $str = join "\n", @strs, $str;
5327   }
5328   return '[ ' . $str . ' ]';
5329}
5330
5331sub _writeDictionary
5332{
5333   my $self = shift;
5334   my $objnode = shift;
5335
5336   my $val = $objnode->{value};
5337   my $str = q{};
5338   my @strs;
5339   if (exists $val->{Type})
5340   {
5341      $str .= ($str ? q{ } : q{}) . '/Type ' . $self->writeAny($val->{Type});
5342   }
5343   if (exists $val->{Subtype})
5344   {
5345      $str .= ($str ? q{ } : q{}) . '/Subtype ' . $self->writeAny($val->{Subtype});
5346   }
5347   for my $dictkey (sort keys %{$val})
5348   {
5349      next if ($dictkey eq 'Type');
5350      next if ($dictkey eq 'Subtype');
5351      next if ($dictkey eq 'StreamDataDone');
5352      if ($dictkey eq 'StreamData')
5353      {
5354         if (exists $val->{StreamDataDone})
5355         {
5356            delete $val->{StreamDataDone};
5357            next;
5358         }
5359         # This is a stream way down deep in the data...  Probably due to a solidifyObject
5360
5361         # First, try to handle the easy case:
5362         if (2 == scalar keys %{$val} && (exists $val->{Length} || exists $val->{L}))
5363         {
5364            my $binary = $val->{$dictkey}->{value};
5365            my $len = length $binary;
5366            my $unpacked = unpack 'H' . $len*2, $binary;
5367            return $self->writeAny(CAM::PDF::Node->new('hexstring', $unpacked, $objnode->{objnum}, $objnode->{gennum}));
5368         }
5369
5370         # TODO: Handle more complex streams ...
5371         die "This stream is too complex for me to write... Giving up\n";
5372
5373         next; ## no critic(ControlStructures::ProhibitUnreachableCode)
5374      }
5375
5376      my $newstr = "/$dictkey " . $self->writeAny($val->{$dictkey});
5377      if ($str ne q{})
5378      {
5379         if ($self->{maxstr} < length $str . $newstr)
5380         {
5381            push @strs, $str;
5382            $str = q{};
5383         }
5384         else
5385         {
5386            $str .= q{ };
5387         }
5388      }
5389      $str .= $newstr;
5390   }
5391   if (@strs > 0)
5392   {
5393      $str = join "\n", @strs, $str;
5394   }
5395   return '<< ' . $str . ' >>';
5396}
5397
5398sub _writeObject
5399{
5400   my $self = shift;
5401   my $objnode = shift;
5402
5403   my $val = $objnode->{value};
5404   if (! ref $val)
5405   {
5406      die "Obj data is not a ref! ($val)";
5407   }
5408   my $stream;
5409   if ($val->{type} eq 'dictionary' && exists $val->{value}->{StreamData})
5410   {
5411      $stream = $val->{value}->{StreamData}->{value};
5412      my $length = length $stream;
5413
5414      my $l = $val->{value}->{Length} || $val->{value}->{L};
5415      my $oldlength = $self->getValue($l);
5416      if ($length != $oldlength)
5417      {
5418         $val->{value}->{Length} = CAM::PDF::Node->new('number', $length, $objnode->{objnum}, $objnode->{gennum});
5419         delete $val->{value}->{L};
5420      }
5421      $val->{value}->{StreamDataDone} = 1;
5422   }
5423   my $str = $self->writeAny($val);
5424   if (defined $stream)
5425   {
5426      $stream = $self->{crypt}->encrypt($self, $stream, $objnode->{objnum}, $objnode->{gennum});
5427      $str .= "\nstream\n" . $stream . 'endstream';
5428   }
5429   return "obj\n$str\nendobj\n";
5430}
5431
5432######################################################################
5433
5434=back
5435
5436=head2 Document Traversing
5437
5438=over
5439
5440=item $doc->traverse($dereference, $node, $callbackfunc, $callbackdata)
5441
5442Recursive traversal of a PDF data structure.
5443
5444In many cases, it's useful to apply one action to every node in an
5445object tree.  The routines below all use this traverse() function.
5446One of the most important parameters is the first: the C<$dereference>
5447boolean.  If true, the traversal follows reference Nodes.  If false,
5448it does not descend into reference Nodes.
5449
5450Optionally, you can pass in a hashref as a final argument to reduce
5451redundant traversing across multiple calls.  Just pass in an empty
5452hashref the first time and pass in the same hashref each time.  See
5453C<changeRefKeys()> for an example.
5454
5455=cut
5456
5457sub traverse
5458{
5459   my $self = shift;
5460   my $deref = shift;
5461   my $startnode = shift;
5462   my $func = shift;
5463   my $funcdata = shift;
5464   my $traversed = shift || {};
5465
5466   my @stack = ($startnode);
5467
5468   my $i = 0;
5469   while ($i < @stack)
5470   {
5471      my $objnode = $stack[$i++];
5472      $self->$func($objnode, $funcdata);
5473
5474      my $type = $objnode->{type};
5475      my $val = $objnode->{value};
5476
5477      if ($type eq 'object')
5478      {
5479         # Shrink stack periodically
5480         splice @stack, 0, $i;
5481         $i = 0;
5482         # Mark object done
5483         if ($objnode->{objnum})
5484         {
5485            $traversed->{$objnode->{objnum}} = 1;
5486         }
5487      }
5488
5489      push @stack, $type eq 'dictionary'          ? values %{$val}
5490                 : $type eq 'array'               ? @{$val}
5491                 : $type eq 'object'              ? $val
5492                 : $type eq 'reference'
5493                   && $deref
5494                   && !exists $traversed->{$val}  ? $self->dereference($val)
5495                 : ();
5496   }
5497   return;
5498}
5499
5500# decodeObject and decodeAll differ from each other like this:
5501#
5502#  decodeObject JUST decodes a single stream directly below the object
5503#  specified by the objnum
5504#
5505#  decodeAll descends through a whole object tree (following
5506#  references) decoding everything it can find
5507
5508=item $doc->decodeObject($objectnum)
5509
5510I<For INTERNAL use>
5511
5512Remove any filters (like compression, etc) from a data stream
5513indicated by the object number.
5514
5515=cut
5516
5517sub decodeObject
5518{
5519   my $self = shift;
5520   my $objnum = shift;
5521
5522   my $objnode = $self->dereference($objnum);
5523
5524   $self->decodeOne($objnode->{value}, 1);
5525   return;
5526}
5527
5528=item $doc->decodeAll($object)
5529
5530I<For INTERNAL use>
5531
5532Remove any filters from any data stream in this object or any object
5533referenced by it.
5534
5535=cut
5536
5537sub decodeAll
5538{
5539   my $self = shift;
5540   my $objnode = shift;
5541
5542   $self->traverse(1, $objnode, \&decodeOne, 1);
5543   return;
5544}
5545
5546=item $doc->decodeOne($object)
5547
5548=item $doc->decodeOne($object, $save?)
5549
5550I<For INTERNAL use>
5551
5552Remove any filters from an object.  The boolean flag C<$save> (defaults to
5553false) indicates whether this removal should be permanent or just
5554this once.  If true, the function returns success or failure.  If
5555false, the function returns the defiltered content.
5556
5557=cut
5558
5559sub decodeOne
5560{
5561   my $self = shift;
5562   my $objnode = shift;
5563   my $save = shift || 0;
5564
5565   my $changed = 0;
5566   my $streamdata = q{};
5567
5568   if ($objnode->{type} ne 'dictionary')
5569   {
5570      return $save ? $changed : $streamdata;
5571   }
5572
5573   my $dict = $objnode->{value};
5574
5575   $streamdata = $dict->{StreamData}->{value};
5576   #warn 'decoding thing ' . ($dict->{StreamData}->{objnum} || '(unknown)') . "\n";
5577
5578   # Don't work on {F} since that's too common a word
5579   #my $filtobj = $dict->{Filter} || $dict->{F};
5580   my $filtobj = $dict->{Filter};
5581
5582   if (defined $filtobj)
5583   {
5584      my @filters = $filtobj->{type} eq 'array' ? @{$filtobj->{value}} : ($filtobj);
5585      my $parmobj = $dict->{DecodeParms} || $dict->{DP};
5586      my @parms;
5587      if ($parmobj)
5588      {
5589         @parms = $parmobj->{type} eq 'array' ? @{$parmobj->{value}} : ($parmobj);
5590      }
5591
5592      for my $filter (@filters)
5593      {
5594         if ($filter->{type} ne 'label')
5595         {
5596            warn "All filter names must be labels\n";
5597            require Data::Dumper;
5598            warn Data::Dumper->Dump([$filter], ['Filter']);
5599            next;
5600         }
5601         my $filtername = $filter->{value};
5602
5603         # Make sure this is not an encrypt dict
5604         next if ($filtername eq 'Standard');
5605
5606         my $filt;
5607         eval {
5608            require Text::PDF::Filter;
5609            my $pkg = 'Text::PDF::' . ($filterabbrevs{$filtername} || $filtername);
5610            $filt = $pkg->new;
5611            1;
5612         };
5613         if (!$filt)
5614         {
5615            warn "Failed to open filter $filtername (Text::PDF::$filtername)\n";
5616            last;
5617         }
5618
5619         my $oldlength = length $streamdata;
5620
5621         {
5622            # Hack to turn off warnings in Filter library
5623            no warnings; ## no critic(TestingAndDebugging::ProhibitNoWarnings)
5624            $streamdata = $filt->infilt($streamdata, 1);
5625         }
5626
5627         $self->fixDecode(\$streamdata, $filtername, shift @parms);
5628         my $length = length $streamdata;
5629
5630         #warn "decoded length: $oldlength -> $length\n";
5631
5632         if ($save)
5633         {
5634            my $objnum = $dict->{StreamData}->{objnum};
5635            my $gennum = $dict->{StreamData}->{gennum};
5636            if ($objnum)
5637            {
5638               $self->{changes}->{$objnum} = 1;
5639            }
5640            $changed = 1;
5641            $dict->{StreamData}->{value} = $streamdata;
5642            if ($length != $oldlength)
5643            {
5644               $dict->{Length} = CAM::PDF::Node->new('number', $length, $objnum, $gennum);
5645               delete $dict->{L};
5646            }
5647
5648            # These changes should happen later, but I prefer to do it
5649            # redundantly near the changes hash
5650            delete $dict->{Filter};
5651            delete $dict->{F};
5652            delete $dict->{DecodeParms};
5653            delete $dict->{DP};
5654         }
5655      }
5656   }
5657   #else { use Data::Dumper; print Dumper($dict); }
5658
5659   return $save ? $changed : $streamdata;
5660}
5661
5662=item $doc->fixDecode($streamdata, $filter, $params)
5663
5664This is a utility method to do any tweaking after removing the filter
5665from a data stream.
5666
5667=cut
5668
5669sub fixDecode
5670{
5671   my $self = shift;
5672   my $streamdata = shift;
5673   my $filter = shift;
5674   my $parms = shift;
5675
5676   if (!$parms)
5677   {
5678      return;
5679   }
5680   my $d = $self->getValue($parms);
5681   if (!$d || (ref $d) ne 'HASH')
5682   {
5683      die "DecodeParms must be a dictionary.\n";
5684   }
5685   if ($filter eq 'FlateDecode' || $filter eq 'Fl' ||
5686       $filter eq 'LZWDecode' || $filter eq 'LZW')
5687   {
5688      my $p = exists $d->{Predictor} ? $self->getValue($d->{Predictor}) : 1;
5689      if ($p == 2)
5690      {
5691         $self->_fixDecodeTIFF($streamdata, $d);
5692      }
5693      elsif ($p >= 10)
5694      {
5695         $self->_fixDecodePNG($streamdata, $d);
5696      }
5697      # else no fix needed
5698   }
5699   return;
5700}
5701
5702sub _fixDecodeTIFF
5703{
5704   my $self = shift;
5705   my $streamdata = shift;
5706   my $d = shift;
5707
5708   die 'The TIFF image predictor is not supported';
5709}
5710
5711sub _fixDecodePNG
5712{
5713   my $self = shift;
5714   my $streamdata = shift;
5715   my $d = shift;
5716
5717   # PNG differencing algorithms  http://www.w3.org/TR/PNG-Filters.html
5718   my $colors = exists $d->{Colors} ? $self->getValue($d->{Colors}) : 1;
5719   my $columns = exists $d->{Columns} ? $self->getValue($d->{Columns}) : 1;
5720   my $bpc = exists $d->{BitsPerComponent} ? $self->getValue($d->{BitsPerComponent}) : 8;
5721   if (0 != $bpc % 8) {
5722      die 'Color samples that are not multiples of 8 bits are not supported';
5723   }
5724   my $width = 1 + $colors * $columns * ($bpc >> 3); # size of a row in bytes, including the 1-byte predictor
5725   my $len = length ${$streamdata};
5726   if (0 != $len % $width) {
5727      die 'The stream data is not evenly divisible into rows';
5728   }
5729   my $rows = $len / $width;
5730   my $newdata = q{};
5731   my $prev_row = [(0) x ($width - 1)];
5732   for my $irow (0 .. $rows - 1)
5733   {
5734      my ($row_pred, @row) = unpack 'C' . $width, substr ${$streamdata}, $irow * $width, $width;
5735      if ($row_pred == 1) {   ##no critic (IfElse)
5736         for my $i (1 .. $width-2) {
5737            $row[$i] = ($row[$i-1] + $row[$i]) & 0xff;
5738         }
5739      } elsif ($row_pred == 2) {
5740         for my $i (0 .. $width-2) {
5741            $row[$i] = ($prev_row->[$i] + $row[$i]) & 0xff;
5742         }
5743      } elsif ($row_pred == 3) {
5744         $row[0] = (($prev_row->[0] >> 1) + $row[0]) & 0xff;
5745         for my $i (1 .. $width-2) {
5746            $row[$i] = ((($row[$i-1] + $prev_row->[$i]) >> 1) + $row[$i]) & 0xff;
5747         }
5748      } elsif ($row_pred == 4) {
5749         # Paeth reduces to up for first column
5750         $row[0] = ($prev_row->[0] + $row[0]) & 0xff;
5751         for my $i (1 .. $width-2) {
5752            my $a = $row[$i-1];
5753            my $b = $prev_row->[$i];
5754            my $c = $prev_row->[$i-1];
5755            my $p = $a + $b - $c;
5756            my $pa = abs $p - $a;
5757            my $pb = abs $p - $b;
5758            my $pc = abs $p - $c;
5759            my $paeth = $pa <= $pb && $pa <= $pc ? $a : $pb <= $pc ? $b : $c;
5760            $row[$i] = ($paeth + $row[$i]) & 0xff;
5761         }
5762      }
5763      $newdata .= pack 'C*', @row;
5764      $prev_row = \@row;
5765   }
5766   ${$streamdata} = $newdata;
5767   return;
5768}
5769
5770=item $doc->encodeObject($objectnum, $filter)
5771
5772Apply the specified filter to the object.
5773
5774=cut
5775
5776sub encodeObject
5777{
5778   my $self = shift;
5779   my $objnum = shift;
5780   my $filtername = shift;
5781
5782   my $objnode = $self->dereference($objnum);
5783
5784   $self->encodeOne($objnode->{value}, $filtername);
5785   return;
5786}
5787
5788=item $doc->encodeOne($object, $filter)
5789
5790Apply the specified filter to the object.
5791
5792=cut
5793
5794sub encodeOne  ## no critic(Subroutines::ProhibitExcessComplexity)
5795{
5796   my $self = shift;
5797   my $objnode = shift;
5798   my $filtername = shift;
5799
5800   my $changed = 0;
5801
5802   if ($objnode->{type} eq 'dictionary')
5803   {
5804      my $dict = $objnode->{value};
5805      my $objnum = $objnode->{objnum};
5806      my $gennum = $objnode->{gennum};
5807
5808      if (! exists $dict->{StreamData})
5809      {
5810         #warn "Object does not contain a Stream to encode\n";
5811         return 0;
5812      }
5813
5814      if ($filtername eq 'LZWDecode' || $filtername eq 'LZW')
5815      {
5816         $filtername = 'FlateDecode';
5817         warn "LZWDecode filter not supported for encoding.  Using $filtername instead\n";
5818      }
5819      my $filt = eval {
5820         require Text::PDF::Filter;
5821         my $pkg = "Text::PDF::$filtername";
5822         $pkg->new;
5823      };
5824      if (!$filt)
5825      {
5826         warn "Failed to open filter $filtername (Text::PDF::$filtername)\n";
5827         return 0;
5828      }
5829
5830      my $l = $dict->{Length} || $dict->{L};
5831      my $oldlength = $self->getValue($l);
5832      $dict->{StreamData}->{value} = $filt->outfilt($dict->{StreamData}->{value}, 1);
5833      my $length = length $dict->{StreamData}->{value};
5834
5835      if (! defined $oldlength || $length != $oldlength)
5836      {
5837         if (defined $l && $l->{type} eq 'reference')
5838         {
5839            my $lenobj = $self->dereference($l->{value})->{value};
5840            if ($lenobj->{type} ne 'number')
5841            {
5842               die "Expected length to be a reference to an object containing a number while encoding\n";
5843            }
5844            $lenobj->{value} = $length;
5845         }
5846         elsif (!defined $l || $l->{type} eq 'number')
5847         {
5848            $dict->{Length} = CAM::PDF::Node->new('number', $length, $objnum, $gennum);
5849            delete $dict->{L};
5850         }
5851         else
5852         {
5853            die "Unexpected type \"$l->{type}\" for Length while encoding.\n" .
5854                "(expected \"number\" or \"reference\")\n";
5855         }
5856      }
5857
5858      # Record the filter
5859      my $newfilt = CAM::PDF::Node->new('label', $filtername, $objnum, $gennum);
5860      my $f = $dict->{Filter} || $dict->{F};
5861      if (!defined $f)
5862      {
5863         $dict->{Filter} = $newfilt;
5864         delete $dict->{F};
5865      }
5866      elsif ($f->{type} eq 'label')
5867      {
5868         $dict->{Filter} = CAM::PDF::Node->new('array', [
5869                                                         $newfilt,
5870                                                         $f,
5871                                                         ],
5872                                               $objnum, $gennum);
5873         delete $dict->{F};
5874      }
5875      elsif ($f->{type} eq 'array')
5876      {
5877         unshift @{$f->{value}}, $newfilt;
5878      }
5879      else
5880      {
5881         die "Confused: Filter type is \"$f->{type}\", not the\n" .
5882             "expected \"array\" or \"label\"\n";
5883      }
5884
5885      if ($dict->{DecodeParms} || $dict->{DP})
5886      {
5887         die "Insertion of DecodeParms not yet supported...\n";
5888      }
5889
5890      if ($objnum)
5891      {
5892         $self->{changes}->{$objnum} = 1;
5893      }
5894      $changed = 1;
5895   }
5896   return $changed;
5897}
5898
5899
5900=item $doc->setObjNum($object, $objectnum, $gennum)
5901
5902Descend into an object and change all of the INTERNAL object number
5903flags to a new number.  This is just for consistency of internal
5904accounting.
5905
5906=cut
5907
5908sub setObjNum
5909{
5910   my $self = shift;
5911   my $objnode = shift;
5912   my $objnum = shift;
5913   my $gennum = shift;
5914
5915   $self->traverse(0, $objnode, \&_setObjNumCB, [$objnum, $gennum]);
5916   return;
5917}
5918
5919# PRIVATE FUNCTION
5920
5921sub _setObjNumCB
5922{
5923   my $self = shift;
5924   my $objnode = shift;
5925   my $nums = shift;
5926
5927   $objnode->{objnum} = $nums->[0];
5928   $objnode->{gennum} = $nums->[1];
5929   return;
5930}
5931
5932=item $doc->getRefList($object)
5933
5934I<For INTERNAL use>
5935
5936Return an array all of objects referred to in this object.
5937
5938=cut
5939
5940sub getRefList
5941{
5942   my $self = shift;
5943   my $objnode = shift;
5944
5945   my $list = {};
5946   $self->traverse(1, $objnode, \&_getRefListCB, $list);
5947
5948   return (sort keys %{$list});
5949}
5950
5951# PRIVATE FUNCTION
5952
5953sub _getRefListCB
5954{
5955   my $self = shift;
5956   my $objnode = shift;
5957   my $list = shift;
5958
5959   if ($objnode->{type} eq 'reference')
5960   {
5961      $list->{$objnode->{value}} = 1;
5962   }
5963   return;
5964}
5965
5966=item $doc->changeRefKeys($object, $hashref)
5967
5968I<For INTERNAL use>
5969
5970Renumber all references in an object.
5971
5972=cut
5973
5974sub changeRefKeys
5975{
5976   my $self = shift;
5977   my $objnode = shift;
5978   my $newrefkeys = shift;
5979   my $traversed = shift; # optional
5980
5981   my $follow = shift || 0;   # almost always false
5982
5983   $self->traverse($follow, $objnode, \&_changeRefKeysCB, $newrefkeys, $traversed);
5984   return;
5985}
5986
5987# PRIVATE FUNCTION
5988
5989sub _changeRefKeysCB
5990{
5991   my $self = shift;
5992   my $objnode = shift;
5993   my $newrefkeys = shift;
5994
5995   if ($objnode->{type} eq 'reference')
5996   {
5997      if (exists $newrefkeys->{$objnode->{value}})
5998      {
5999         $objnode->{value} = $newrefkeys->{$objnode->{value}};
6000      }
6001   }
6002   return;
6003}
6004
6005=item $doc->abbrevInlineImage($object)
6006
6007Contract all image keywords to inline abbreviations.
6008
6009=cut
6010
6011sub abbrevInlineImage
6012{
6013   my $self = shift;
6014   my $objnode = shift;
6015
6016   $self->traverse(0, $objnode, \&_abbrevInlineImageCB, {reverse %inlineabbrevs});
6017   return;
6018}
6019
6020=item $doc->unabbrevInlineImage($object)
6021
6022Expand all inline image abbreviations.
6023
6024=cut
6025
6026sub unabbrevInlineImage
6027{
6028   my $self = shift;
6029   my $objnode = shift;
6030
6031   $self->traverse(0, $objnode, \&_abbrevInlineImageCB, \%inlineabbrevs);
6032   return;
6033}
6034
6035# PRIVATE FUNCTION
6036
6037sub _abbrevInlineImageCB
6038{
6039   my $self = shift;
6040   my $objnode = shift;
6041   my $convert = shift;
6042
6043   if ($objnode->{type} eq 'label')
6044   {
6045      my $new = $convert->{$objnode->{value}};
6046      if (defined $new)
6047      {
6048         $objnode->{value} = $new;
6049      }
6050   }
6051   elsif ($objnode->{type} eq 'dictionary')
6052   {
6053      my $dict = $objnode->{value};
6054      for my $key (keys %{$dict})
6055      {
6056         my $new = $convert->{$key};
6057         if (defined $new && $new ne $key)
6058         {
6059            $dict->{$new} = $dict->{$key};
6060            delete $dict->{$key};
6061         }
6062      }
6063   }
6064   return;
6065}
6066
6067=item $doc->changeString($object, $hashref)
6068
6069Alter all instances of a given string.  The hashref is a dictionary of
6070from-string and to-string.  If the from-string looks like C<regex(...)>
6071then it is interpreted as a Perl regular expression and is eval'ed.
6072Otherwise the search-and-replace is literal.
6073
6074=cut
6075
6076sub changeString
6077{
6078   my $self = shift;
6079   my $objnode = shift;
6080   my $changelist = shift;
6081
6082   $self->traverse(0, $objnode, \&_changeStringCB, $changelist);
6083   return;
6084}
6085
6086# PRIVATE FUNCTION
6087
6088sub _changeStringCB
6089{
6090   my $self = shift;
6091   my $objnode = shift;
6092   my $changelist = shift;
6093
6094   if ($objnode->{type} eq 'string')
6095   {
6096      for my $key (keys %{$changelist})
6097      {
6098         if ($key =~ m/ \A regex[(](.*)[)] \z /xms)
6099         {
6100            my $regex = $1;
6101            my $res;
6102            my $eval_result = eval {
6103               $res = ($objnode->{value} =~ s/ $regex /$changelist->{$key}/gxms);
6104               1;
6105            };
6106            if (!$eval_result)
6107            {
6108               die "Failed regex search/replace: $EVAL_ERROR\n";
6109            }
6110            if ($res && $objnode->{objnum})
6111            {
6112               $self->{changes}->{$objnode->{objnum}} = 1;
6113            }
6114         }
6115         else
6116         {
6117            if ($objnode->{value} =~ s/ $key /$changelist->{$key}/gxms && $objnode->{objnum})
6118            {
6119               $self->{changes}->{$objnode->{objnum}} = 1;
6120            }
6121         }
6122      }
6123   }
6124   return;
6125}
6126
6127######################################################################
6128
6129=back
6130
6131=head2 Utility functions
6132
6133=over
6134
6135=item $doc->rangeToArray($min, $max, $list...)
6136
6137Converts string lists of numbers to an array.  For example,
6138
6139    CAM::PDF->rangeToArray(1, 15, '1,3-5,12,9', '14-', '8 - 6, -2');
6140
6141becomes
6142
6143    (1,3,4,5,12,9,14,15,8,7,6,1,2)
6144
6145=cut
6146
6147sub rangeToArray
6148{
6149   my ($pkg_or_doc, $min, $max, @range_parts) = @_;
6150   my @in_array = grep {defined $_} @range_parts;
6151
6152   for (@in_array) # modify in place
6153   {
6154      s/ [^\d\-,] //gxms;   # clean
6155   }
6156   # split on numbers and ranges
6157   @in_array = map {m/ ([\d\-]+) /gxms} @in_array;
6158
6159   my @out_array;
6160   if (@in_array == 0)
6161   {
6162      @out_array = $min .. $max;
6163   }
6164   else
6165   {
6166      for (@in_array)
6167      {
6168         if (m/ (\d*)-(\d*) /xms)
6169         {
6170            my $aa = $1;
6171            my $bb = $2;
6172            if ($aa eq q{})
6173            {
6174               $aa = $min-1;
6175            }
6176            if ($bb eq q{})
6177            {
6178               $bb = $max+1;
6179            }
6180
6181            # Check if these are possible
6182            next if ($aa < $min && $bb < $min);
6183            next if ($aa > $max && $bb > $max);
6184
6185            if ($aa < $min)
6186            {
6187               $aa = $min;
6188            }
6189            if ($bb < $min)
6190            {
6191               $bb = $min;
6192            }
6193            if ($aa > $max)
6194            {
6195               $aa = $max;
6196            }
6197            if ($bb > $max)
6198            {
6199               $bb = $max;
6200            }
6201
6202            if ($aa > $bb)
6203            {
6204               push @out_array, reverse $bb .. $aa;
6205            }
6206            else
6207            {
6208               push @out_array, $aa .. $bb;
6209            }
6210         }
6211         elsif ($_ >= $min && $_ <= $max)
6212         {
6213            push @out_array, $_;
6214         }
6215      }
6216   }
6217   return @out_array;
6218}
6219
6220=item $doc->trimstr($string)
6221
6222Used solely for debugging.  Trims a string to a max of 40 characters,
6223handling nulls and non-Unix line endings.
6224
6225=cut
6226
6227sub trimstr  ## no critic (Unpack)
6228{
6229   my $pkg_or_doc = shift;
6230   my $s = $_[0];
6231
6232   my $pos = pos $_[0];
6233   $pos ||= 0;
6234
6235   if (!defined $s || $s eq q{})
6236   {
6237      $s = '(empty)';
6238   }
6239   elsif (length $s > 40)
6240   {
6241      $s = (substr $s, $pos, 40) . '...';
6242   }
6243   $s =~ s/ \r /^M/gxms;
6244   return $pos . q{ } . $s . "\n";
6245}
6246
6247=item $doc->copyObject($node)
6248
6249Clones a node via Data::Dumper and eval().
6250
6251=cut
6252
6253sub copyObject
6254{
6255   my $self = shift;
6256   my $objnode = shift;
6257
6258   # replace $objnode with a copy of itself
6259   require Data::Dumper;
6260   my $d = Data::Dumper->new([$objnode],['objnode']);
6261   $d->Purity(1)->Indent(0);
6262   $objnode = eval $d->Dump(); ## no critic(ProhibitStringyEval)
6263   return $objnode;
6264}
6265
6266=item $doc->cacheObjects()
6267
6268Parses all object Nodes and stores them in the cache.  This is useful
6269for cases where you intend to do some global manipulation and want all
6270of the data conveniently in RAM.
6271
6272=cut
6273
6274sub cacheObjects
6275{
6276   my $self = shift;
6277
6278   for my $key (keys %{$self->{xref}})
6279   {
6280      if (!exists $self->{objcache}->{$key})
6281      {
6282         $self->{objcache}->{$key} = $self->dereference($key);
6283      }
6284   }
6285   return;
6286}
6287
6288=item $doc->asciify($string)
6289
6290Helper class/instance method to massage a string, cleaning up some
6291non-ASCII problems.  This is a very incomplete list.  Specifically:
6292
6293=over
6294
6295=item f-i ligatures
6296
6297=item (R) symbol
6298
6299=back
6300
6301=cut
6302
6303sub asciify
6304{
6305   my $pkg_or_doc = shift;
6306   my $R_string = shift;   # scalar reference
6307
6308   ## Heuristics: fix up some odd text characters:
6309   # f-i ligature
6310   ${$R_string} =~ s/ \223 /fi/gxms;
6311   # Registered symbol
6312   ${$R_string} =~ s/ \xae /(R)/gxms;
6313   return $pkg_or_doc;
6314}
6315
63161;
6317__END__
6318
6319=back
6320
6321=head1 COMPATIBILITY
6322
6323This library was primarily developed against the 3rd edition of the
6324reference (PDF v1.4) with several important updates from 4th edition
6325(PDF v1.5).  This library focuses most deeply on PDF v1.2 features.
6326Nonetheless, it should be forward and backward compatible in the
6327majority of cases.
6328
6329=head1 PERFORMANCE
6330
6331This module is written with good speed and flexibility in mind, often
6332at the expense of memory consumption.  Entire PDF documents are
6333typically slurped into RAM.  As an example, simply calling
6334C<new('PDFReference15_v15.pdf')> (the 13.5 MB Adobe PDF Reference V1.5
6335document) pushes Perl to consume 89 MB of RAM on my development
6336machine.
6337
6338=head1 SEE ALSO
6339
6340There are several other PDF modules on CPAN.  Below is a brief
6341description of a few of them.  If these comments are out of date,
6342please inform me.
6343
6344=over
6345
6346=item PDF::API2
6347
6348As of v0.46.003, LGPL license.
6349
6350This is the leading PDF library, in my opinion.
6351
6352Excellent text and font support.  This is the highest level library of
6353the bunch, and is the most complete implementation of the Adobe PDF
6354spec.  The author is amazingly responsive and patient.
6355
6356=item Text::PDF
6357
6358As of v0.25, Artistic license.
6359
6360Excellent compression support (CAM::PDF cribs off this Text::PDF
6361feature).  This has not been developed since 2003.
6362
6363=item PDF::Reuse
6364
6365As of v0.32, Artistic/GPL license, like Perl itself.
6366
6367This library is not object oriented, so it can only process one PDF at
6368a time, while storing all data in global variables.  I'm not fond of
6369it, but it's quite popular, so don't take my word for it!
6370
6371=back
6372
6373CAM::PDF is the only one of these that has regression tests.
6374Currently, CAM::PDF has test coverage of about 50%, as reported by
6375C<Build testcover>.
6376
6377Additionally, PDFLib is a commercial package not on CPAN
6378(L<www.pdflib.com>).  It is a C-based library with a Perl interface.
6379It is designed for PDF creation, not for reuse.
6380
6381=head1 INTERNALS
6382
6383The data structure used to represent the PDF document is composed
6384primarily of a hierarchy of Node objects.  Every node in the document
6385tree has this structure:
6386
6387    type => <type>
6388    value => <value>
6389    objnum => <object number>
6390    gennum => <generation number>
6391
6392where the <value> depends on the <type>, and <type> is one of
6393
6394     Type        Value
6395     ----        -----
6396     object      Node
6397     stream      byte string
6398     string      byte string
6399     hexstring   byte string
6400     number      number
6401     reference   integer (object number)
6402     boolean     "true" | "false"
6403     label       string
6404     array       arrayref of Nodes
6405     dictionary  hashref of (string => Node)
6406     null        undef
6407
6408All of these except "stream" are directly related to the PDF data
6409types of the same name.  Streams are treated as special cases in this
6410library since the have a non-general syntax and placement in the
6411document body.  Internally, streams are very much like strings, except
6412that they have filters applied to them.
6413
6414All objects are referenced indirectly by their numbers, as defined in
6415the PDF document.  In all cases, the dereference() function should be
6416used to deserialize objects into their internal representation.  This
6417function is also useful for looking up named objects in the page model
6418metadata.  Every node in the hierarchy contains its object and
6419generation number.  You can think of this as a sort of a pointer back
6420to the root of each node tree.  This serves in place of a "parent"
6421link for every node, which would be harder to maintain.
6422
6423The PDF document itself is represented internally as a hash reference
6424with many components, including the document content, the document
6425metadata (index, trailer and root node), the object cache, and several
6426other caches, in addition to a few assorted bookkeeping structures.
6427
6428The core of the document is represented in the object cache, which is
6429only populated as needed, thus avoiding the overhead of parsing the
6430whole document at read time.
6431
6432=head1 AUTHOR
6433
6434Chris Dolan
6435
6436This module was originally developed by me at Clotho Advanced Media
6437Inc.  Now I maintain it in my spare time.
6438
6439=head1 ACKNOWLEDGMENTS
6440
6441Thanks to all the people who have submitted bug reports over the
6442years!  I've belatedly started crediting people in the F<CHANGES>
6443file.  Apologies to contributors I've overlooked...
6444
6445=cut
6446