1#!/usr/local/bin/perl
2# This file is in the public domain
3# Author: Ievgenii Meshcheriakov <eugen@debian.org>
4#
5# This program extracts outlines from PDF files. The outlines
6# are stored into a text file that could be used with pdfoutline
7# program.
8#
9# Usage: pdf-extract-outline input.pdf outline.txt
10
11use strict;
12use warnings;
13use feature qw(say);
14use PDF::API2;
15use Locale::TextDomain('@CMAKE_PROJECT_NAME@', '@CMAKE_INSTALL_FULL_LOCALEDIR@');
16use POSIX qw(:locale_h);
17use Encode qw(decode find_encoding);
18use Encode::Guess;
19use List::MoreUtils qw(first_index);
20
21my $fallback_encoding = 'PDFDocumentEncoding';
22
23eval {
24    require Encode::PDFDocumentEncoding;
25    Encode::Guess->set_suspects(qw/PDFDocumentEncoding/);
26    1;
27} or do {
28    warn 'Encode::PDFDocumentEncoding is missing, falling back to ASCII';
29    $fallback_encoding = 'ascii';
30};
31
32sub decode_pdf {
33    my ($s) = @_;
34
35    eval {
36        decode('Guess', $s);
37    } or do {
38        decode $fallback_encoding, $s, sub {
39            my $code = shift;
40            my $repr = sprintf "\\x%02X", $code;
41            warn "$fallback_encoding \"$repr\" does not map to Unicode";
42            return q{?};
43        };
44    }
45}
46
47sub usage {
48    printf __"Usage: %s input.pdf outline.txt\n", $0;
49}
50
51sub search_tree {
52    my ($tree, $key) = @_;
53
54    if ($tree->{'Limits'}) {
55        my ($first, $last) = @{$tree->{'Limits'}->val};
56        return if (($key lt $first->val) or ($key gt $last->val));
57    }
58
59    if ($tree->{'Names'}) {
60        my @arr = @{$tree->{'Names'}->val};
61        for (my $i = 0; $i < $#arr; $i += 2) {
62            return $arr[$i + 1] if ($arr[$i]->val eq $key);
63        }
64    }
65
66    if ($tree->{'Kids'}) {
67        foreach my $kid (@{$tree->{'Kids'}->val}) {
68            my $result = search_tree($kid->val, $key);
69            return $result if $result;
70        }
71    }
72
73    return;
74}
75
76sub extract_outlines {
77    my ($pdf, $level, $outline, $F) = @_;
78
79    OUTLINE: for (; $outline; $outline = $outline->{'Next'}) {
80        $outline = $outline->val;
81
82        my $raw_title = $outline->{'Title'}->val;
83        my $title = decode_pdf($raw_title);
84        my $dest;
85
86        if ($outline->{'Dest'}) {
87            $dest = $outline->{'Dest'};
88        } elsif ($outline->{'A'}) {
89            my $a = $outline->{'A'}->val;
90            # TODO Search for GoTo entry
91            if ($a->{'S'}->val eq 'GoTo') {
92                $dest = $a->{'D'};
93            } else {
94                warn 'Action is not GoTo';
95                next OUTLINE;
96            }
97        } else {
98            warn "No Dest or A entry for '$title'";
99            next OUTLINE;
100        }
101
102        if (ref($dest) eq 'PDF::API2::Basic::PDF::Name') {
103            # Find the destination in Dest dictionary in Root object.
104            my $named_ref = $dest->val;
105            my $dests = $pdf->{'pdf'}->{'Root'}->{'Dests'}->val;
106            $dest = $dests->{$named_ref};
107        } elsif (ref($dest) eq 'PDF::API2::Basic::PDF::String') {
108            # Find the destination in Dest tree in Names dictionary of Root object
109            my $names = $pdf->{'pdf'}->{'Root'}->{'Names'}->val;
110            my $tree = $names->{'Dests'}->val;
111            my $name = $dest->val;
112            $dest = search_tree($tree, $name);
113            unless ($dest) {
114                warn "No Dest found with name '$name'";
115                next OUTLINE;
116            }
117        }
118
119        if (ref($dest) eq 'PDF::API2::Basic::PDF::Objind') {
120            $dest = $dest->val;
121        }
122
123        if (ref($dest) eq 'PDF::API2::Basic::PDF::Dict') {
124            $dest = $dest->{'D'};
125        }
126
127        if (ref($dest) eq 'PDF::API2::Basic::PDF::Array') {
128            $dest = $dest->val;
129        }
130
131        unless ($dest) {
132            warn "Destination not found for '$title'";
133            next OUTLINE;
134        }
135
136        my $page = $dest->[0];
137        my $page_no;
138
139        if (ref($page) eq 'PDF::API2::Basic::PDF::Number') {
140            # Some documents use numbers even for pages in the current document
141            $page_no = $page->val + 1;
142        } else {
143            my $page_idx = first_index { $_ == $page } @{$pdf->{'pagestack'}};
144
145            if ($page_idx == -1) {
146                warn "Page not found in the page stack for '$title'";
147                next OUTLINE;
148            }
149
150            $page_no = $page_idx + 1;
151        }
152
153        print {$F} "$level $page_no $title\n";
154
155        my $sub_outlines = $outline->{'First'};
156        if ($sub_outlines) {
157            extract_outlines($pdf, $level + 1, $sub_outlines, $F);
158        }
159    }
160}
161
162setlocale(LC_ALL, q{});
163
164if ($#ARGV != 1) {
165    usage;
166    exit 1;
167}
168
169my ($pdffile, $outlinefile) = @ARGV;
170
171my $pdf = PDF::API2->open($pdffile);
172
173open my $outline_fh, '>:encoding(UTF-8)', $outlinefile
174    or die __x("Cannot open outline file '{outlinefile}'",
175    outlinefile => $outlinefile);
176
177my $outlines = $pdf->{'pdf'}->{'Root'}->{'Outlines'};
178
179if ($outlines) {
180    if ($pdf->{'pdf'}->{'Encrypt'}) {
181        die __('Extracting outlines from encrypted files is not supported');
182    }
183
184    my $first = $outlines->val->{'First'};
185    extract_outlines($pdf, 0, $first, $outline_fh);
186}
187
188close $outline_fh;
189