1 /*
2 * Copyright (C) 2009-2010, Pino Toscano <pino@kde.org>
3 * Copyright (C) 2017-2019, Albert Astals Cid <aacid@kde.org>
4 * Copyright (C) 2017, Jason Alan Palmer <jalanpalmer@gmail.com>
5 * Copyright (C) 2018, 2020, Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
6 * Copyright (C) 2019, Masamichi Hosoda <trueroad@trueroad.jp>
7 * Copyright (C) 2020, Jiri Jakes <freedesktop@jirijakes.eu>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.
22 */
23
24 #include <goo/glibc.h>
25 #include <poppler-destination.h>
26 #include <poppler-document.h>
27 #include <poppler-embedded-file.h>
28 #include <poppler-font.h>
29 #include <poppler-page.h>
30 #include <poppler-toc.h>
31 #include <poppler-version.h>
32
33 #include <cstdlib>
34 #include <cstring>
35 #include <ctime>
36 #include <algorithm>
37 #include <iomanip>
38 #include <ios>
39 #include <iostream>
40 #include <map>
41 #include <memory>
42 #include <sstream>
43
44 #include "parseargs.h"
45
46 #include "config.h"
47
// Field width handed to std::setw() for the labels printed by this tool.
static const int out_width = 30;

// Command-line switches. Each one is bound to an option in the_args and is
// consulted by main() to decide which sections to print.
bool show_all = false;
bool show_info = false;
bool show_perm = false;
bool show_metadata = false;
bool show_toc = false;
bool show_fonts = false;
bool show_embedded_files = false;
bool show_pages = false;
bool show_destinations = false;
bool show_help = false;
bool show_version = false;
// Argument of --show-text; main() treats an empty string as "option not given".
char show_text[32];
bool show_text_list = false;
bool show_text_list_with_font = false;
// Layout passed to poppler::page::text(); main() overrides it from show_text.
poppler::page::text_layout_enum show_text_layout = poppler::page::physical_layout;
65
// Option table handed to parseArgs() and printUsage() in main().
// Each entry gives: option name, argument kind, destination variable, size of
// the destination (only set for the argString entry) and the usage text.
// The all-null entry terminates the table.
static const ArgDesc the_args[] = { { "--show-all", argFlag, &show_all, 0, "show all the available information" },
                                    { "--show-info", argFlag, &show_info, 0, "show general document information" },
                                    { "--show-perm", argFlag, &show_perm, 0, "show document permissions" },
                                    { "--show-metadata", argFlag, &show_metadata, 0, "show document metadata" },
                                    { "--show-toc", argFlag, &show_toc, 0, "show the TOC" },
                                    { "--show-fonts", argFlag, &show_fonts, 0, "show the document fonts" },
                                    { "--show-embedded-files", argFlag, &show_embedded_files, 0, "show the document-level embedded files" },
                                    { "--show-pages", argFlag, &show_pages, 0, "show pages information" },
                                    { "--show-destinations", argFlag, &show_destinations, 0, "show named destinations" },
                                    { "--show-text", argString, &show_text, sizeof(show_text), "show text (physical|raw|none) extracted from all pages" },
                                    { "--show-text-list", argFlag, &show_text_list, 0, "show text list (experimental)" },
                                    { "--show-text-list-with-font", argFlag, &show_text_list_with_font, 0, "show text list with font info (experimental)" },
                                    { "-h", argFlag, &show_help, 0, "print usage information" },
                                    { "--help", argFlag, &show_help, 0, "print usage information" },
                                    { "--version", argFlag, &show_version, 0, "print poppler version" },
                                    { nullptr, argFlag, nullptr, 0, nullptr } };
82
// Reports a fatal problem on stderr and terminates the process with exit
// status 1. Callers rely on this function never returning.
static void error(const std::string &msg)
{
    std::cerr << "Error: " << msg << std::endl << "Exiting..." << std::endl;
    exit(1);
}
89
operator <<(std::ostream & stream,const poppler::ustring & str)90 static std::ostream &operator<<(std::ostream &stream, const poppler::ustring &str)
91 {
92 const poppler::byte_array ba = str.to_utf8();
93 for (const char c : ba) {
94 stream << c;
95 }
96 return stream;
97 }
98
// Formats a timestamp as "dd/mm/YYYY HH:MM:SS" in UTC.
//
// Returns "n/a" when `date` is the time_t(-1) sentinel, when the value cannot
// be broken down into calendar fields (gmtime_r fails), or when formatting
// fails. The last two checks avoid reading an uninitialised struct tm or an
// indeterminate buffer.
static std::string out_date(std::time_t date)
{
    static const char not_available[] = "n/a";

    if (date == std::time_t(-1)) {
        return std::string(not_available);
    }

    struct tm broken_down;
    if (!gmtime_r(&date, &broken_down)) {
        // Value not representable as a calendar date (e.g. year overflow).
        return std::string(not_available);
    }

    char buf[32];
    // strftime's size argument already accounts for the terminating NUL;
    // a return of 0 means the buffer contents must not be used.
    if (strftime(buf, sizeof(buf), "%d/%m/%Y %H:%M:%S", &broken_down) == 0) {
        return std::string(not_available);
    }
    return std::string(buf);
}
111
// Renders a size as decimal text; negative values are shown as "n/a".
static std::string out_size(int size)
{
    if (size < 0) {
        return std::string("n/a");
    }
    std::ostringstream formatted;
    formatted << size;
    return formatted.str();
}
121
// Maps a nibble value to its lowercase hexadecimal digit: values below 10
// become '0'..'9', values from 10 upwards continue from 'a'.
static char charToHex(int x)
{
    if (x < 10) {
        return x + '0';
    }
    return x - 10 + 'a';
}
126
out_hex_string(const poppler::byte_array & str)127 static std::string out_hex_string(const poppler::byte_array &str)
128 {
129 std::string ret(str.size() * 2, '\0');
130 const char *str_p = &str[0];
131 for (unsigned int i = 0; i < str.size(); ++i, ++str_p) {
132 ret[i * 2] = charToHex((*str_p & 0xf0) >> 4);
133 ret[i * 2 + 1] = charToHex(*str_p & 0xf);
134 }
135 return ret;
136 }
137
out_page_orientation(poppler::page::orientation_enum o)138 static std::string out_page_orientation(poppler::page::orientation_enum o)
139 {
140 switch (o) {
141 case poppler::page::landscape:
142 return "landscape (90)";
143 case poppler::page::portrait:
144 return "portrait (0)";
145 case poppler::page::seascape:
146 return "seascape (270)";
147 case poppler::page::upside_down:
148 return "upside_downs (180)";
149 };
150 return "<unknown orientation>";
151 }
152
out_font_info_type(poppler::font_info::type_enum t)153 static std::string out_font_info_type(poppler::font_info::type_enum t)
154 {
155 #define OUT_FONT_TYPE(thetype) \
156 case poppler::font_info::thetype: \
157 return #thetype
158 switch (t) {
159 OUT_FONT_TYPE(unknown);
160 OUT_FONT_TYPE(type1);
161 OUT_FONT_TYPE(type1c);
162 OUT_FONT_TYPE(type1c_ot);
163 OUT_FONT_TYPE(type3);
164 OUT_FONT_TYPE(truetype);
165 OUT_FONT_TYPE(truetype_ot);
166 OUT_FONT_TYPE(cid_type0);
167 OUT_FONT_TYPE(cid_type0c);
168 OUT_FONT_TYPE(cid_type0c_ot);
169 OUT_FONT_TYPE(cid_truetype);
170 OUT_FONT_TYPE(cid_truetype_ot);
171 }
172 return "<unknown font type>";
173 #undef OUT_FONT_TYPE
174 }
175
print_info(poppler::document * doc)176 static void print_info(poppler::document *doc)
177 {
178 std::cout << "Document information:" << std::endl;
179 int major = 0, minor = 0;
180 doc->get_pdf_version(&major, &minor);
181 std::cout << std::setw(out_width) << "PDF version"
182 << ": " << major << "." << minor << std::endl;
183 std::string permanent_id, update_id;
184 if (doc->get_pdf_id(&permanent_id, &update_id)) {
185 std::cout << std::setw(out_width) << "PDF IDs"
186 << ": P: " << permanent_id << " - U: " << update_id << std::endl;
187 } else {
188 std::cout << std::setw(out_width) << "PDF IDs"
189 << ": <none>" << std::endl;
190 }
191 const std::vector<std::string> keys = doc->info_keys();
192 std::vector<std::string>::const_iterator key_it = keys.begin(), key_end = keys.end();
193 for (; key_it != key_end; ++key_it) {
194 std::cout << std::setw(out_width) << *key_it << ": " << doc->info_key(*key_it) << std::endl;
195 }
196 std::cout << std::setw(out_width) << "Date (creation)"
197 << ": " << out_date(doc->info_date("CreationDate")) << std::endl;
198 std::cout << std::setw(out_width) << "Date (modification)"
199 << ": " << out_date(doc->info_date("ModDate")) << std::endl;
200 std::cout << std::setw(out_width) << "Number of pages"
201 << ": " << doc->pages() << std::endl;
202 std::cout << std::setw(out_width) << "Linearized"
203 << ": " << doc->is_linearized() << std::endl;
204 std::cout << std::setw(out_width) << "Encrypted"
205 << ": " << doc->is_encrypted() << std::endl;
206 std::cout << std::endl;
207 }
208
print_perm(poppler::document * doc)209 static void print_perm(poppler::document *doc)
210 {
211 std::cout << "Document permissions:" << std::endl;
212 #define OUT_PERM(theperm) std::cout << std::setw(out_width) << #theperm << ": " << doc->has_permission(poppler::perm_##theperm) << std::endl
213 OUT_PERM(print);
214 OUT_PERM(change);
215 OUT_PERM(copy);
216 OUT_PERM(add_notes);
217 OUT_PERM(fill_forms);
218 OUT_PERM(accessibility);
219 OUT_PERM(assemble);
220 OUT_PERM(print_high_resolution);
221 std::cout << std::endl;
222 #undef OUT_PERM
223 }
224
print_metadata(poppler::document * doc)225 static void print_metadata(poppler::document *doc)
226 {
227 std::cout << std::setw(out_width) << "Metadata"
228 << ":" << std::endl
229 << doc->metadata() << std::endl;
230 std::cout << std::endl;
231 }
232
print_toc_item(poppler::toc_item * item,int indent)233 static void print_toc_item(poppler::toc_item *item, int indent)
234 {
235 std::cout << std::setw(indent * 2) << " "
236 << "+ " << item->title() << " (" << item->is_open() << ")" << std::endl;
237 poppler::toc_item::iterator it = item->children_begin(), it_end = item->children_end();
238 for (; it != it_end; ++it) {
239 print_toc_item(*it, indent + 1);
240 }
241 }
242
print_toc(poppler::toc * doctoc)243 static void print_toc(poppler::toc *doctoc)
244 {
245 std::cout << "Document TOC:" << std::endl;
246 if (doctoc) {
247 print_toc_item(doctoc->root(), 0);
248 } else {
249 std::cout << "<no TOC>" << std::endl;
250 }
251 std::cout << std::endl;
252 }
253
print_fonts(poppler::document * doc)254 static void print_fonts(poppler::document *doc)
255 {
256 std::cout << "Document fonts:" << std::endl;
257 std::vector<poppler::font_info> fl = doc->fonts();
258 if (!fl.empty()) {
259 std::vector<poppler::font_info>::const_iterator it = fl.begin(), it_end = fl.end();
260 const std::ios_base::fmtflags f = std::cout.flags();
261 std::left(std::cout);
262 for (; it != it_end; ++it) {
263 std::cout << " " << std::setw(out_width + 10) << it->name() << " " << std::setw(15) << out_font_info_type(it->type()) << " " << std::setw(5) << it->is_embedded() << " " << std::setw(5) << it->is_subset() << " " << it->file()
264 << std::endl;
265 }
266 std::cout.flags(f);
267 } else {
268 std::cout << "<no fonts>" << std::endl;
269 }
270 std::cout << std::endl;
271 }
272
print_embedded_files(poppler::document * doc)273 static void print_embedded_files(poppler::document *doc)
274 {
275 std::cout << "Document embedded files:" << std::endl;
276 std::vector<poppler::embedded_file *> ef = doc->embedded_files();
277 if (!ef.empty()) {
278 std::vector<poppler::embedded_file *>::const_iterator it = ef.begin(), it_end = ef.end();
279 const std::ios_base::fmtflags flags = std::cout.flags();
280 std::left(std::cout);
281 for (; it != it_end; ++it) {
282 poppler::embedded_file *f = *it;
283 std::cout << " " << std::setw(out_width + 10) << f->name() << " " << std::setw(10) << out_size(f->size()) << " " << std::setw(20) << out_date(f->creation_date()) << " " << std::setw(20) << out_date(f->modification_date())
284 << std::endl
285 << " ";
286 if (f->description().empty()) {
287 std::cout << "<no description>";
288 } else {
289 std::cout << f->description();
290 }
291 std::cout << std::endl
292 << " " << std::setw(35) << (f->checksum().empty() ? std::string("<no checksum>") : out_hex_string(f->checksum())) << " " << (f->mime_type().empty() ? std::string("<no mime type>") : f->mime_type()) << std::endl;
293 }
294 std::cout.flags(flags);
295 } else {
296 std::cout << "<no embedded files>" << std::endl;
297 }
298 std::cout << std::endl;
299 }
300
print_page(poppler::page * p)301 static void print_page(poppler::page *p)
302 {
303 if (p) {
304 std::cout << std::setw(out_width) << "Rect"
305 << ": " << p->page_rect() << std::endl;
306 std::cout << std::setw(out_width) << "Label"
307 << ": " << p->label() << std::endl;
308 std::cout << std::setw(out_width) << "Duration"
309 << ": " << p->duration() << std::endl;
310 std::cout << std::setw(out_width) << "Orientation"
311 << ": " << out_page_orientation(p->orientation()) << std::endl;
312 } else {
313 std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
314 }
315 std::cout << std::endl;
316 }
317
print_destination(const poppler::destination * d)318 static void print_destination(const poppler::destination *d)
319 {
320 if (d) {
321 std::cout << std::setw(out_width) << "Type"
322 << ": ";
323 switch (d->type()) {
324 case poppler::destination::unknown:
325 std::cout << "unknown" << std::endl;
326 break;
327 case poppler::destination::xyz:
328 std::cout << "xyz" << std::endl
329 << std::setw(out_width) << "Page"
330 << ": " << d->page_number() << std::endl
331 << std::setw(out_width) << "Left"
332 << ": " << d->left() << std::endl
333 << std::setw(out_width) << "Top"
334 << ": " << d->top() << std::endl
335 << std::setw(out_width) << "Zoom"
336 << ": " << d->zoom() << std::endl;
337 break;
338 case poppler::destination::fit:
339 std::cout << "fit" << std::endl
340 << std::setw(out_width) << "Page"
341 << ": " << d->page_number() << std::endl;
342 break;
343 case poppler::destination::fit_h:
344 std::cout << "fit_h" << std::endl
345 << std::setw(out_width) << "Page"
346 << ": " << d->page_number() << std::endl
347 << std::setw(out_width) << "Top"
348 << ": " << d->top() << std::endl;
349 break;
350 case poppler::destination::fit_v:
351 std::cout << "fit_v" << std::endl
352 << std::setw(out_width) << "Page"
353 << ": " << d->page_number() << std::endl
354 << std::setw(out_width) << "Left"
355 << ": " << d->left() << std::endl;
356 break;
357 case poppler::destination::fit_r:
358 std::cout << "fit_r" << std::endl
359 << std::setw(out_width) << "Page"
360 << ": " << d->page_number() << std::endl
361 << std::setw(out_width) << "Left"
362 << ": " << d->left() << std::endl
363 << std::setw(out_width) << "Bottom"
364 << ": " << d->bottom() << std::endl
365 << std::setw(out_width) << "Right"
366 << ": " << d->right() << std::endl
367 << std::setw(out_width) << "Top"
368 << ": " << d->top() << std::endl;
369 break;
370 case poppler::destination::fit_b:
371 std::cout << "fit_b" << std::endl
372 << std::setw(out_width) << "Page"
373 << ": " << d->page_number() << std::endl;
374 break;
375 case poppler::destination::fit_b_h:
376 std::cout << "fit_b_h" << std::endl
377 << std::setw(out_width) << "Page"
378 << ": " << d->page_number() << std::endl
379 << std::setw(out_width) << "Top"
380 << ": " << d->top() << std::endl;
381 break;
382 case poppler::destination::fit_b_v:
383 std::cout << "fit_b_v" << std::endl
384 << std::setw(out_width) << "Page"
385 << ": " << d->page_number() << std::endl
386 << std::setw(out_width) << "Left"
387 << ": " << d->left() << std::endl;
388 break;
389 default:
390 std::cout << "error" << std::endl;
391 break;
392 }
393 }
394 std::cout << std::endl;
395 }
396
print_page_text(poppler::page * p)397 static void print_page_text(poppler::page *p)
398 {
399 if (p) {
400 std::cout << p->text(poppler::rectf(), show_text_layout) << std::endl;
401 } else {
402 std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
403 }
404 std::cout << std::endl;
405 }
406
print_page_text_list(poppler::page * p,int opt_flag=0)407 static void print_page_text_list(poppler::page *p, int opt_flag = 0)
408 {
409 if (!p) {
410 std::cout << std::setw(out_width) << "Broken Page. Could not be parsed" << std::endl;
411 std::cout << std::endl;
412 return;
413 }
414 auto text_list = p->text_list(opt_flag);
415
416 std::cout << "---" << std::endl;
417 for (const poppler::text_box &text : text_list) {
418 poppler::rectf bbox = text.bbox();
419 poppler::ustring ustr = text.text();
420 int wmode = text.get_wmode();
421 double font_size = text.get_font_size();
422 std::string font_name = text.get_font_name();
423 std::cout << "[" << ustr << "] @ ";
424 std::cout << "( x=" << bbox.x() << " y=" << bbox.y() << " w=" << bbox.width() << " h=" << bbox.height() << " )";
425 if (text.has_font_info())
426 std::cout << "( fontname=" << font_name << " fontsize=" << font_size << " wmode=" << wmode << " )";
427 std::cout << std::endl;
428 }
429 std::cout << "---" << std::endl;
430 }
431
main(int argc,char * argv[])432 int main(int argc, char *argv[])
433 {
434 if (!parseArgs(the_args, &argc, argv) || argc < 2 || show_help) {
435 printUsage(argv[0], "DOCUMENT", the_args);
436 exit(1);
437 }
438
439 if (show_text[0]) {
440 if (!memcmp(show_text, "physical", 9)) {
441 show_text_layout = poppler::page::physical_layout;
442 } else if (!memcmp(show_text, "raw", 4)) {
443 show_text_layout = poppler::page::raw_order_layout;
444 } else if (!memcmp(show_text, "none", 5)) {
445 show_text_layout = poppler::page::non_raw_non_physical_layout;
446 } else {
447 error(std::string("unrecognized text mode: '") + show_text + "'");
448 }
449 }
450
451 std::string file_name(argv[1]);
452
453 std::unique_ptr<poppler::document> doc(poppler::document::load_from_file(file_name));
454 if (!doc.get()) {
455 error("loading error");
456 }
457 if (doc->is_locked()) {
458 error("encrypted document");
459 }
460
461 std::cout.setf(std::ios_base::boolalpha);
462
463 if (show_all) {
464 show_info = true;
465 show_perm = true;
466 show_metadata = true;
467 show_toc = true;
468 show_fonts = true;
469 show_embedded_files = true;
470 show_pages = true;
471 }
472
473 if (show_version) {
474 std::cout << std::setw(out_width) << "Compiled"
475 << ": poppler-cpp " << POPPLER_VERSION << std::endl
476 << std::setw(out_width) << "Running"
477 << ": poppler-cpp " << poppler::version_string() << std::endl;
478 }
479 if (show_info) {
480 print_info(doc.get());
481 }
482 if (show_perm) {
483 print_perm(doc.get());
484 }
485 if (show_metadata) {
486 print_metadata(doc.get());
487 }
488 if (show_toc) {
489 std::unique_ptr<poppler::toc> doctoc(doc->create_toc());
490 print_toc(doctoc.get());
491 }
492 if (show_fonts) {
493 print_fonts(doc.get());
494 }
495 if (show_embedded_files) {
496 print_embedded_files(doc.get());
497 }
498 if (show_pages) {
499 const int pages = doc->pages();
500 for (int i = 0; i < pages; ++i) {
501 std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
502 std::unique_ptr<poppler::page> p(doc->create_page(i));
503 print_page(p.get());
504 }
505 }
506 if (show_destinations) {
507 auto map = doc->create_destination_map();
508 for (const auto &pair : map) {
509 std::string s = pair.first;
510 for (auto &c : s) {
511 if (c < 0x20 || c > 0x7e)
512 c = '.';
513 }
514 std::cout << "Named destination \"" << s << "\":" << std::endl;
515 print_destination(&pair.second);
516 }
517 }
518 if (show_text[0]) {
519 const int pages = doc->pages();
520 for (int i = 0; i < pages; ++i) {
521 std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
522 std::unique_ptr<poppler::page> p(doc->create_page(i));
523 print_page_text(p.get());
524 }
525 }
526 if (show_text_list || show_text_list_with_font) {
527 const int pages = doc->pages();
528 for (int i = 0; i < pages; ++i) {
529 std::cout << "Page " << (i + 1) << "/" << pages << ":" << std::endl;
530 std::unique_ptr<poppler::page> p(doc->create_page(i));
531 if (show_text_list_with_font)
532 print_page_text_list(p.get(), poppler::page::text_list_include_font);
533 else
534 print_page_text_list(p.get(), 0);
535 }
536 }
537
538 return 0;
539 }
540