1 /*
2  * Copyright (C) 2009-2010, Pino Toscano <pino@kde.org>
3  * Copyright (C) 2017-2019, Albert Astals Cid <aacid@kde.org>
4  * Copyright (C) 2017, Jason Alan Palmer <jalanpalmer@gmail.com>
5  * Copyright (C) 2018, 2020, Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
6  * Copyright (C) 2019, Masamichi Hosoda <trueroad@trueroad.jp>
7  * Copyright (C) 2020, Jiri Jakes <freedesktop@jirijakes.eu>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2, or (at your option)
12  * any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
22  */
23 
24 #include <goo/glibc.h>
25 #include <poppler-destination.h>
26 #include <poppler-document.h>
27 #include <poppler-embedded-file.h>
28 #include <poppler-font.h>
29 #include <poppler-page.h>
30 #include <poppler-toc.h>
31 #include <poppler-version.h>
32 
33 #include <cstdlib>
34 #include <cstring>
35 #include <ctime>
36 #include <algorithm>
37 #include <iomanip>
38 #include <ios>
39 #include <iostream>
40 #include <map>
41 #include <memory>
42 #include <sstream>
43 
44 #include "parseargs.h"
45 
46 #include "config.h"
47 
48 static const int out_width = 30;
49 
50 bool show_all = false;
51 bool show_info = false;
52 bool show_perm = false;
53 bool show_metadata = false;
54 bool show_toc = false;
55 bool show_fonts = false;
56 bool show_embedded_files = false;
57 bool show_pages = false;
58 bool show_destinations = false;
59 bool show_help = false;
60 bool show_version = false;
61 char show_text[32];
62 bool show_text_list = false;
63 bool show_text_list_with_font = false;
64 poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
65 
66 static const ArgDesc the_args[] = { { "--show-all", argFlag, &show_all, 0, "show all the available information" },
67                                     { "--show-info", argFlag, &show_info, 0, "show general document information" },
68                                     { "--show-perm", argFlag, &show_perm, 0, "show document permissions" },
69                                     { "--show-metadata", argFlag, &show_metadata, 0, "show document metadata" },
70                                     { "--show-toc", argFlag, &show_toc, 0, "show the TOC" },
71                                     { "--show-fonts", argFlag, &show_fonts, 0, "show the document fonts" },
72                                     { "--show-embedded-files", argFlag, &show_embedded_files, 0, "show the document-level embedded files" },
73                                     { "--show-pages", argFlag, &show_pages, 0, "show pages information" },
74                                     { "--show-destinations", argFlag, &show_destinations, 0, "show named destinations" },
75                                     { "--show-text", argString, &show_text, sizeof(show_text), "show text (physical|raw|none) extracted from all pages" },
76                                     { "--show-text-list", argFlag, &show_text_list, 0, "show text list (experimental)" },
77                                     { "--show-text-list-with-font", argFlag, &show_text_list_with_font, 0, "show text list with font info (experimental)" },
78                                     { "-h", argFlag, &show_help, 0, "print usage information" },
79                                     { "--help", argFlag, &show_help, 0, "print usage information" },
80                                     { "--version", argFlag, &show_version, 0, "print poppler version" },
81                                     { nullptr, argFlag, nullptr, 0, nullptr } };
82 
// Reports a fatal problem on stderr and terminates the process with exit
// status 1.
//
// msg: human readable description, printed after the "Error: " prefix.
//
// The function never returns. It is marked [[noreturn]] so that compilers and
// static analyzers know that code following a call (for instance a pointer
// dereference that comes right after a failed null check) is not reached on
// the failure path.
[[noreturn]] static void error(const std::string &msg)
{
    std::cerr << "Error: " << msg << std::endl;
    std::cerr << "Exiting..." << std::endl;
    std::exit(1);
}
89 
operator <<(std::ostream & stream,const poppler::ustring & str)90 static std::ostream &operator<<(std::ostream &stream, const poppler::ustring &str)
91 {
92     const poppler::byte_array ba = str.to_utf8();
93     for (const char c : ba) {
94         stream << c;
95     }
96     return stream;
97 }
98 
// Formats a timestamp as "DD/MM/YYYY HH:MM:SS", expressed in UTC.
//
// date: seconds since the epoch; std::time_t(-1) means "no date".
//
// Returns the formatted text, or "n/a" when date is std::time_t(-1) or when
// the value cannot be converted or formatted.
static std::string out_date(std::time_t date)
{
    if (date == std::time_t(-1)) {
        return std::string("n/a");
    }

    struct tm utc;
    // gmtime_r() reports failure (e.g. a year that does not fit in tm_year) by
    // returning a null pointer; in that case utc is not filled in and must not
    // be handed to strftime().
    if (!gmtime_r(&date, &utc)) {
        return std::string("n/a");
    }

    char buf[32];
    // strftime() returns 0 when the result does not fit, leaving the buffer
    // contents unspecified, so the text is only used after a successful call.
    const size_t len = strftime(buf, sizeof(buf), "%d/%m/%Y %H:%M:%S", &utc);
    if (len == 0) {
        return std::string("n/a");
    }
    return std::string(buf, len);
}
111 
// Renders a size as decimal text; negative values are shown as "n/a".
static std::string out_size(int size)
{
    if (size < 0) {
        return std::string("n/a");
    }
    return std::to_string(size);
}
121 
// Maps a value to a lowercase hexadecimal digit: values below 10 are offset
// from '0', all other values are offset from 'a' (so 10 becomes 'a').
static char charToHex(int x)
{
    if (x < 10) {
        return static_cast<char>(x + '0');
    }
    return static_cast<char>(x - 10 + 'a');
}
126 
out_hex_string(const poppler::byte_array & str)127 static std::string out_hex_string(const poppler::byte_array &str)
128 {
129     std::string ret(str.size() * 2, '\0');
130     const char *str_p = &str[0];
131     for (unsigned int i = 0; i < str.size(); ++i, ++str_p) {
132         ret[i * 2] = charToHex((*str_p & 0xf0) >> 4);
133         ret[i * 2 + 1] = charToHex(*str_p & 0xf);
134     }
135     return ret;
136 }
137 
out_page_orientation(poppler::page::orientation_enum o)138 static std::string out_page_orientation(poppler::page::orientation_enum o)
139 {
140     switch (o) {
141     case poppler::page::landscape:
142         return "landscape (90)";
143     case poppler::page::portrait:
144         return "portrait (0)";
145     case poppler::page::seascape:
146         return "seascape (270)";
147     case poppler::page::upside_down:
148         return "upside_downs (180)";
149     };
150     return "<unknown orientation>";
151 }
152 
out_font_info_type(poppler::font_info::type_enum t)153 static std::string out_font_info_type(poppler::font_info::type_enum t)
154 {
155 #define OUT_FONT_TYPE(thetype)                                                                                                                                                                                                                 \
156     case poppler::font_info::thetype:                                                                                                                                                                                                          \
157         return #thetype
158     switch (t) {
159         OUT_FONT_TYPE(unknown);
160         OUT_FONT_TYPE(type1);
161         OUT_FONT_TYPE(type1c);
162         OUT_FONT_TYPE(type1c_ot);
163         OUT_FONT_TYPE(type3);
164         OUT_FONT_TYPE(truetype);
165         OUT_FONT_TYPE(truetype_ot);
166         OUT_FONT_TYPE(cid_type0);
167         OUT_FONT_TYPE(cid_type0c);
168         OUT_FONT_TYPE(cid_type0c_ot);
169         OUT_FONT_TYPE(cid_truetype);
170         OUT_FONT_TYPE(cid_truetype_ot);
171     }
172     return "<unknown font type>";
173 #undef OUT_FONT_TYPE
174 }
175 
print_info(poppler::document * doc)176 static void print_info(poppler::document *doc)
177 {
178     std::cout << "Document information:" << std::endl;
179     int major = 0, minor = 0;
180     doc->get_pdf_version(&major, &minor);
181     std::cout << std::setw(out_width) << "PDF version"
182               << ": " << major << "." << minor << std::endl;
183     std::string permanent_id, update_id;
184     if (doc->get_pdf_id(&permanent_id, &update_id)) {
185         std::cout << std::setw(out_width) << "PDF IDs"
186                   << ": P: " << permanent_id << " - U: " << update_id << std::endl;
187     } else {
188         std::cout << std::setw(out_width) << "PDF IDs"
189                   << ": <none>" << std::endl;
190     }
191     const std::vector<std::string> keys = doc->info_keys();
192     std::vector<std::string>::const_iterator key_it = keys.begin(), key_end = keys.end();
193     for (; key_it != key_end; ++key_it) {
194         std::cout << std::setw(out_width) << *key_it << ": " << doc->info_key(*key_it) << std::endl;
195     }
196     std::cout << std::setw(out_width) << "Date (creation)"
197               << ": " << out_date(doc->info_date("CreationDate")) << std::endl;
198     std::cout << std::setw(out_width) << "Date (modification)"
199               << ": " << out_date(doc->info_date("ModDate")) << std::endl;
200     std::cout << std::setw(out_width) << "Number of pages"
201               << ": " << doc->pages() << std::endl;
202     std::cout << std::setw(out_width) << "Linearized"
203               << ": " << doc->is_linearized() << std::endl;
204     std::cout << std::setw(out_width) << "Encrypted"
205               << ": " << doc->is_encrypted() << std::endl;
206     std::cout << std::endl;
207 }
208 
print_perm(poppler::document * doc)209 static void print_perm(poppler::document *doc)
210 {
211     std::cout << "Document permissions:" << std::endl;
212 #define OUT_PERM(theperm) std::cout << std::setw(out_width) << #theperm << ": " << doc->has_permission(poppler::perm_##theperm) << std::endl
213     OUT_PERM(print);
214     OUT_PERM(change);
215     OUT_PERM(copy);
216     OUT_PERM(add_notes);
217     OUT_PERM(fill_forms);
218     OUT_PERM(accessibility);
219     OUT_PERM(assemble);
220     OUT_PERM(print_high_resolution);
221     std::cout << std::endl;
222 #undef OUT_PERM
223 }
224 
print_metadata(poppler::document * doc)225 static void print_metadata(poppler::document *doc)
226 {
227     std::cout << std::setw(out_width) << "Metadata"
228               << ":" << std::endl
229               << doc->metadata() << std::endl;
230     std::cout << std::endl;
231 }
232 
print_toc_item(poppler::toc_item * item,int indent)233 static void print_toc_item(poppler::toc_item *item, int indent)
234 {
235     std::cout << std::setw(indent * 2) << " "
236               << "+ " << item->title() << " (" << item->is_open() << ")" << std::endl;
237     poppler::toc_item::iterator it = item->children_begin(), it_end = item->children_end();
238     for (; it != it_end; ++it) {
239         print_toc_item(*it, indent + 1);
240     }
241 }
242 
print_toc(poppler::toc * doctoc)243 static void print_toc(poppler::toc *doctoc)
244 {
245     std::cout << "Document TOC:" << std::endl;
246     if (doctoc) {
247         print_toc_item(doctoc->root(), 0);
248     } else {
249         std::cout << "<no TOC>" << std::endl;
250     }
251     std::cout << std::endl;
252 }
253 
print_fonts(poppler::document * doc)254 static void print_fonts(poppler::document *doc)
255 {
256     std::cout << "Document fonts:" << std::endl;
257     std::vector<poppler::font_info> fl = doc->fonts();
258     if (!fl.empty()) {
259         std::vector<poppler::font_info>::const_iterator it = fl.begin(), it_end = fl.end();
260         const std::ios_base::fmtflags f = std::cout.flags();
261         std::left(std::cout);
262         for (; it != it_end; ++it) {
263             std::cout << " " << std::setw(out_width + 10) << it->name() << " " << std::setw(15) << out_font_info_type(it->type()) << " " << std::setw(5) << it->is_embedded() << " " << std::setw(5) << it->is_subset() << " " << it->file()
264                       << std::endl;
265         }
266         std::cout.flags(f);
267     } else {
268         std::cout << "<no fonts>" << std::endl;
269     }
270     std::cout << std::endl;
271 }
272 
print_embedded_files(poppler::document * doc)273 static void print_embedded_files(poppler::document *doc)
274 {
275     std::cout << "Document embedded files:" << std::endl;
276     std::vector<poppler::embedded_file *> ef = doc->embedded_files();
277     if (!ef.empty()) {
278         std::vector<poppler::embedded_file *>::const_iterator it = ef.begin(), it_end = ef.end();
279         const std::ios_base::fmtflags flags = std::cout.flags();
280         std::left(std::cout);
281         for (; it != it_end; ++it) {
282             poppler::embedded_file *f = *it;
283             std::cout << " " << std::setw(out_width + 10) << f->name() << " " << std::setw(10) << out_size(f->size()) << " " << std::setw(20) << out_date(f->creation_date()) << " " << std::setw(20) << out_date(f->modification_date())
284                       << std::endl
285                       << "     ";
286             if (f->description().empty()) {
287                 std::cout << "<no description>";
288             } else {
289                 std::cout << f->description();
290             }
291             std::cout << std::endl
292                       << "     " << std::setw(35) << (f->checksum().empty() ? std::string("<no checksum>") : out_hex_string(f->checksum())) << " " << (f->mime_type().empty() ? std::string("<no mime type>") : f->mime_type()) << std::endl;
293         }
294         std::cout.flags(flags);
295     } else {
296         std::cout << "<no embedded files>" << std::endl;
297     }
298     std::cout << std::endl;
299 }
300 
print_page(poppler::page * p)301 static void print_page(poppler::page *p)
302 {
303     if (p) {
304         std::cout << std::setw(out_width) << "Rect"
305                   << ": " << p->page_rect() << std::endl;
306         std::cout << std::setw(out_width) << "Label"
307                   << ": " << p->label() << std::endl;
308         std::cout << std::setw(out_width) << "Duration"
309                   << ": " << p->duration() << std::endl;
310         std::cout << std::setw(out_width) << "Orientation"
311                   << ": " << out_page_orientation(p->orientation()) << std::endl;
312     } else {
313         std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
314     }
315     std::cout << std::endl;
316 }
317 
print_destination(const poppler::destination * d)318 static void print_destination(const poppler::destination *d)
319 {
320     if (d) {
321         std::cout << std::setw(out_width) << "Type"
322                   << ": ";
323         switch (d->type()) {
324         case poppler::destination::unknown:
325             std::cout << "unknown" << std::endl;
326             break;
327         case poppler::destination::xyz:
328             std::cout << "xyz" << std::endl
329                       << std::setw(out_width) << "Page"
330                       << ": " << d->page_number() << std::endl
331                       << std::setw(out_width) << "Left"
332                       << ": " << d->left() << std::endl
333                       << std::setw(out_width) << "Top"
334                       << ": " << d->top() << std::endl
335                       << std::setw(out_width) << "Zoom"
336                       << ": " << d->zoom() << std::endl;
337             break;
338         case poppler::destination::fit:
339             std::cout << "fit" << std::endl
340                       << std::setw(out_width) << "Page"
341                       << ": " << d->page_number() << std::endl;
342             break;
343         case poppler::destination::fit_h:
344             std::cout << "fit_h" << std::endl
345                       << std::setw(out_width) << "Page"
346                       << ": " << d->page_number() << std::endl
347                       << std::setw(out_width) << "Top"
348                       << ": " << d->top() << std::endl;
349             break;
350         case poppler::destination::fit_v:
351             std::cout << "fit_v" << std::endl
352                       << std::setw(out_width) << "Page"
353                       << ": " << d->page_number() << std::endl
354                       << std::setw(out_width) << "Left"
355                       << ": " << d->left() << std::endl;
356             break;
357         case poppler::destination::fit_r:
358             std::cout << "fit_r" << std::endl
359                       << std::setw(out_width) << "Page"
360                       << ": " << d->page_number() << std::endl
361                       << std::setw(out_width) << "Left"
362                       << ": " << d->left() << std::endl
363                       << std::setw(out_width) << "Bottom"
364                       << ": " << d->bottom() << std::endl
365                       << std::setw(out_width) << "Right"
366                       << ": " << d->right() << std::endl
367                       << std::setw(out_width) << "Top"
368                       << ": " << d->top() << std::endl;
369             break;
370         case poppler::destination::fit_b:
371             std::cout << "fit_b" << std::endl
372                       << std::setw(out_width) << "Page"
373                       << ": " << d->page_number() << std::endl;
374             break;
375         case poppler::destination::fit_b_h:
376             std::cout << "fit_b_h" << std::endl
377                       << std::setw(out_width) << "Page"
378                       << ": " << d->page_number() << std::endl
379                       << std::setw(out_width) << "Top"
380                       << ": " << d->top() << std::endl;
381             break;
382         case poppler::destination::fit_b_v:
383             std::cout << "fit_b_v" << std::endl
384                       << std::setw(out_width) << "Page"
385                       << ": " << d->page_number() << std::endl
386                       << std::setw(out_width) << "Left"
387                       << ": " << d->left() << std::endl;
388             break;
389         default:
390             std::cout << "error" << std::endl;
391             break;
392         }
393     }
394     std::cout << std::endl;
395 }
396 
print_page_text(poppler::page * p)397 static void print_page_text(poppler::page *p)
398 {
399     if (p) {
400         std::cout << p->text(poppler::rectf(), show_text_layout) << std::endl;
401     } else {
402         std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
403     }
404     std::cout << std::endl;
405 }
406 
print_page_text_list(poppler::page * p,int opt_flag=0)407 static void print_page_text_list(poppler::page *p, int opt_flag = 0)
408 {
409     if (!p) {
410         std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
411         std::cout << std::endl;
412         return;
413     }
414     auto text_list = p->text_list(opt_flag);
415 
416     std::cout << "---" << std::endl;
417     for (const poppler::text_box &text : text_list) {
418         poppler::rectf bbox = text.bbox();
419         poppler::ustring ustr = text.text();
420         int wmode = text.get_wmode();
421         double font_size = text.get_font_size();
422         std::string font_name = text.get_font_name();
423         std::cout << "[" << ustr << "] @ ";
424         std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
425         if (text.has_font_info())
426             std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
427         std::cout << std::endl;
428     }
429     std::cout << "---" << std::endl;
430 }
431 
main(int argc,char * argv[])432 int main(int argc, char *argv[])
433 {
434     if (!parseArgs(the_args, &argc, argv) || argc < 2 || show_help) {
435         printUsage(argv[0], "DOCUMENT", the_args);
436         exit(1);
437     }
438 
439     if (show_text[0]) {
440         if (!memcmp(show_text, "physical", 9)) {
441             show_text_layout = poppler::page::physical_layout;
442         } else if (!memcmp(show_text, "raw", 4)) {
443             show_text_layout = poppler::page::raw_order_layout;
444         } else if (!memcmp(show_text, "none", 5)) {
445             show_text_layout = poppler::page::non_raw_non_physical_layout;
446         } else {
447             error(std::string("unrecognized text mode: '") + show_text + "'");
448         }
449     }
450 
451     std::string file_name(argv[1]);
452 
453     std::unique_ptr<poppler::document> doc(poppler::document::load_from_file(file_name));
454     if (!doc.get()) {
455         error("loading error");
456     }
457     if (doc->is_locked()) {
458         error("encrypted document");
459     }
460 
461     std::cout.setf(std::ios_base::boolalpha);
462 
463     if (show_all) {
464         show_info = true;
465         show_perm = true;
466         show_metadata = true;
467         show_toc = true;
468         show_fonts = true;
469         show_embedded_files = true;
470         show_pages = true;
471     }
472 
473     if (show_version) {
474         std::cout << std::setw(out_width) << "Compiled"
475                   << ": poppler-cpp " << POPPLER_VERSION << std::endl
476                   << std::setw(out_width) << "Running"
477                   << ": poppler-cpp " << poppler::version_string() << std::endl;
478     }
479     if (show_info) {
480         print_info(doc.get());
481     }
482     if (show_perm) {
483         print_perm(doc.get());
484     }
485     if (show_metadata) {
486         print_metadata(doc.get());
487     }
488     if (show_toc) {
489         std::unique_ptr<poppler::toc> doctoc(doc->create_toc());
490         print_toc(doctoc.get());
491     }
492     if (show_fonts) {
493         print_fonts(doc.get());
494     }
495     if (show_embedded_files) {
496         print_embedded_files(doc.get());
497     }
498     if (show_pages) {
499         const int pages = doc->pages();
500         for (int i = 0; i < pages; ++i) {
501             std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
502             std::unique_ptr<poppler::page> p(doc->create_page(i));
503             print_page(p.get());
504         }
505     }
506     if (show_destinations) {
507         auto map = doc->create_destination_map();
508         for (const auto &pair : map) {
509             std::string s = pair.first;
510             for (auto &c : s) {
511                 if (c < 0x20 || c > 0x7e)
512                     c = '.';
513             }
514             std::cout << "Named destination \"" << s << "\":" << std::endl;
515             print_destination(&pair.second);
516         }
517     }
518     if (show_text[0]) {
519         const int pages = doc->pages();
520         for (int i = 0; i < pages; ++i) {
521             std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
522             std::unique_ptr<poppler::page> p(doc->create_page(i));
523             print_page_text(p.get());
524         }
525     }
526     if (show_text_list || show_text_list_with_font) {
527         const int pages = doc->pages();
528         for (int i = 0; i < pages; ++i) {
529             std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
530             std::unique_ptr<poppler::page> p(doc->create_page(i));
531             if (show_text_list_with_font)
532                 print_page_text_list(p.get(), poppler::page::text_list_include_font);
533             else
534                 print_page_text_list(p.get(), 0);
535         }
536     }
537 
538     return 0;
539 }
540