1#!/usr/local/bin/perl 2# This file is in the public domain 3# Author: Ievgenii Meshcheriakov <eugen@debian.org> 4# 5# This program extracts outlines from PDF files. The outlines 6# are stored into a text file that could be used with pdfoutline 7# program. 8# 9# Usage: pdf-extract-outline input.pdf outline.txt 10 11use strict; 12use warnings; 13use feature qw(say); 14use PDF::API2; 15use Locale::TextDomain('@CMAKE_PROJECT_NAME@', '@CMAKE_INSTALL_FULL_LOCALEDIR@'); 16use POSIX qw(:locale_h); 17use Encode qw(decode find_encoding); 18use Encode::Guess; 19use List::MoreUtils qw(first_index); 20 21my $fallback_encoding = 'PDFDocumentEncoding'; 22 23eval { 24 require Encode::PDFDocumentEncoding; 25 Encode::Guess->set_suspects(qw/PDFDocumentEncoding/); 26 1; 27} or do { 28 warn 'Encode::PDFDocumentEncoding is missing, falling back to ASCII'; 29 $fallback_encoding = 'ascii'; 30}; 31 32sub decode_pdf { 33 my ($s) = @_; 34 35 eval { 36 decode('Guess', $s); 37 } or do { 38 decode $fallback_encoding, $s, sub { 39 my $code = shift; 40 my $repr = sprintf "\\x%02X", $code; 41 warn "$fallback_encoding \"$repr\" does not map to Unicode"; 42 return q{?}; 43 }; 44 } 45} 46 47sub usage { 48 printf __"Usage: %s input.pdf outline.txt\n", $0; 49} 50 51sub search_tree { 52 my ($tree, $key) = @_; 53 54 if ($tree->{'Limits'}) { 55 my ($first, $last) = @{$tree->{'Limits'}->val}; 56 return if (($key lt $first->val) or ($key gt $last->val)); 57 } 58 59 if ($tree->{'Names'}) { 60 my @arr = @{$tree->{'Names'}->val}; 61 for (my $i = 0; $i < $#arr; $i += 2) { 62 return $arr[$i + 1] if ($arr[$i]->val eq $key); 63 } 64 } 65 66 if ($tree->{'Kids'}) { 67 foreach my $kid (@{$tree->{'Kids'}->val}) { 68 my $result = search_tree($kid->val, $key); 69 return $result if $result; 70 } 71 } 72 73 return; 74} 75 76sub extract_outlines { 77 my ($pdf, $level, $outline, $F) = @_; 78 79 OUTLINE: for (; $outline; $outline = $outline->{'Next'}) { 80 $outline = $outline->val; 81 82 my $raw_title = $outline->{'Title'}->val; 83 my $title = decode_pdf($raw_title); 84 my $dest; 85 86 if ($outline->{'Dest'}) { 87 $dest = $outline->{'Dest'}; 88 } elsif ($outline->{'A'}) { 89 my $a = $outline->{'A'}->val; 90 # TODO Search for GoTo entry 91 if ($a->{'S'}->val eq 'GoTo') { 92 $dest = $a->{'D'}; 93 } else { 94 warn 'Action is not GoTo'; 95 next OUTLINE; 96 } 97 } else { 98 warn "No Dest or A entry for '$title'"; 99 next OUTLINE; 100 } 101 102 if (ref($dest) eq 'PDF::API2::Basic::PDF::Name') { 103 # Find the destination in Dest dictionary in Root object. 104 my $named_ref = $dest->val; 105 my $dests = $pdf->{'pdf'}->{'Root'}->{'Dests'}->val; 106 $dest = $dests->{$named_ref}; 107 } elsif (ref($dest) eq 'PDF::API2::Basic::PDF::String') { 108 # Find the destination in Dest tree in Names dictionary of Root object 109 my $names = $pdf->{'pdf'}->{'Root'}->{'Names'}->val; 110 my $tree = $names->{'Dests'}->val; 111 my $name = $dest->val; 112 $dest = search_tree($tree, $name); 113 unless ($dest) { 114 warn "No Dest found with name '$name'"; 115 next OUTLINE; 116 } 117 } 118 119 if (ref($dest) eq 'PDF::API2::Basic::PDF::Objind') { 120 $dest = $dest->val; 121 } 122 123 if (ref($dest) eq 'PDF::API2::Basic::PDF::Dict') { 124 $dest = $dest->{'D'}; 125 } 126 127 if (ref($dest) eq 'PDF::API2::Basic::PDF::Array') { 128 $dest = $dest->val; 129 } 130 131 unless ($dest) { 132 warn "Destination not found for '$title'"; 133 next OUTLINE; 134 } 135 136 my $page = $dest->[0]; 137 my $page_no; 138 139 if (ref($page) eq 'PDF::API2::Basic::PDF::Number') { 140 # Some documents use numbers even for pages in the current document 141 $page_no = $page->val + 1; 142 } else { 143 my $page_idx = first_index { $_ == $page } @{$pdf->{'pagestack'}}; 144 145 if ($page_idx == -1) { 146 warn "Page not found in the page stack for '$title'"; 147 next OUTLINE; 148 } 149 150 $page_no = $page_idx + 1; 151 } 152 153 print {$F} "$level $page_no $title\n"; 154 155 my $sub_outlines = $outline->{'First'}; 156 if ($sub_outlines) { 157 extract_outlines($pdf, $level + 1, $sub_outlines, $F); 158 } 159 } 160} 161 162setlocale(LC_ALL, q{}); 163 164if ($#ARGV != 1) { 165 usage; 166 exit 1; 167} 168 169my ($pdffile, $outlinefile) = @ARGV; 170 171my $pdf = PDF::API2->open($pdffile); 172 173open my $outline_fh, '>:encoding(UTF-8)', $outlinefile 174 or die __x("Cannot open outline file '{outlinefile}'", 175 outlinefile => $outlinefile); 176 177my $outlines = $pdf->{'pdf'}->{'Root'}->{'Outlines'}; 178 179if ($outlines) { 180 if ($pdf->{'pdf'}->{'Encrypt'}) { 181 die __('Extracting outlines from encrypted files is not supported'); 182 } 183 184 my $first = $outlines->val->{'First'}; 185 extract_outlines($pdf, 0, $first, $outline_fh); 186} 187 188close $outline_fh; 189