1 //========================================================================
2 //
3 // pdftohtml.cc
4 //
5 //
6 // Copyright 1999-2000 G. Ovtcharov
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2007-2008, 2010, 2012, 2015-2020 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
18 // Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
19 // Copyright (C) 2010, 2013 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
20 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
21 // Copyright (C) 2011 Steven Murdoch <Steven.Murdoch@cl.cam.ac.uk>
22 // Copyright (C) 2012 Igor Slepchin <igor.redhat@gmail.com>
23 // Copyright (C) 2012 Ihar Filipau <thephilips@gmail.com>
24 // Copyright (C) 2012 Luis Parravicini <lparravi@gmail.com>
25 // Copyright (C) 2014 Pino Toscano <pino@kde.org>
26 // Copyright (C) 2015 William Bader <williambader@hotmail.com>
27 // Copyright (C) 2017, 2021 Adrian Johnson <ajohnson@redneon.com>
28 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
29 // Copyright (C) 2018 Thibaut Brard <thibaut.brard@gmail.com>
30 // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
31 // Copyright (C) 2019, 2021 Oliver Sander <oliver.sander@tu-dresden.de>
32 // Copyright (C) 2021 Hubert Figuiere <hub@figuiere.net>
33 //
34 // To see a description of the changes please see the Changelog file that
35 // came with your tarball or type make ChangeLog if you are building from git
36 //
37 //========================================================================
38 
39 #include "config.h"
40 #include <poppler-config.h>
41 #include <cstdio>
42 #include <cstdlib>
43 #include <cstddef>
44 #include <cstring>
45 #ifdef HAVE_DIRENT_H
46 #    include <dirent.h>
47 #endif
48 #include <ctime>
49 #include "parseargs.h"
50 #include "goo/GooString.h"
51 #include "goo/gbase64.h"
52 #include "goo/gbasename.h"
53 #include "goo/gmem.h"
54 #include "Object.h"
55 #include "Stream.h"
56 #include "Array.h"
57 #include "Dict.h"
58 #include "XRef.h"
59 #include "Catalog.h"
60 #include "Page.h"
61 #include "Outline.h"
62 #include "PDFDoc.h"
63 #include "PDFDocFactory.h"
64 #include "HtmlOutputDev.h"
65 #include "SplashOutputDev.h"
66 #include "splash/SplashBitmap.h"
67 #include "GlobalParams.h"
68 #include "PDFDocEncoding.h"
69 #include "Error.h"
70 #include "DateInfo.h"
71 #include "goo/gfile.h"
72 #include "Win32Console.h"
73 #include "InMemoryFile.h"
74 
// Settings filled in from the command line (see the argDesc table) and read by
// main().  The variables declared without `static` have external linkage --
// presumably so that other translation units of the tool can read them;
// confirm before renaming or changing their types.

static int firstPage = 1; // -f: first page to convert; main() raises values below 1 to 1
static int lastPage = 0; // -l: last page to convert; main() replaces out-of-range values with the page count
static bool rawOrder = true; // recomputed in main() from complexMode/singleHtml and passed to HtmlOutputDev
bool printCommands = true; // cleared in main() when -q is given
static bool printHelp = false; // -h, -?, -help, --help: print usage and exit
bool printHtml = false; // -p: exchange .pdf links by .html
bool complexMode = false; // -c: generate complex document; forced on by -xml, off by -stdout
bool singleHtml = false; // -s: generate a single document that includes all pages; forced off by -xml
bool dataUrls = false; // -dataurls: embed background images as base64 data URLs
bool ignore = false; // -i: ignore images
static char extension[5] = "png"; // -fmt: image file extension; "jpg" selects JPEG output, anything else PNG
static double scale = 1.5; // -zoom: zoom factor; main() clamps it to the range [0.5, 3.0]
bool noframes = false; // -noframes: generate no frames; forced on by -stdout and -xml
bool stout = false; // -stdout: use standard output; forced off by -c
bool xml = false; // -xml: output for XML post-processing
bool noRoundedCoordinates = false; // -noroundcoord: do not round coordinates (XML output only)
static bool errQuiet = false; // -q: don't print any messages or errors
static bool noDrm = false; // -nodrm: continue even if the document does not permit copying
double wordBreakThreshold = 10; // 10%, below converted into a coefficient - 0.1

bool showHidden = false; // -hidden: output hidden text
bool noMerge = false; // -nomerge: do not merge paragraphs; forced on by -xml
bool fontFullName = false; // -fontfullname: output the full font name
static char ownerPassword[33] = ""; // -opw: owner password (for encrypted files)
static char userPassword[33] = ""; // -upw: user password (for encrypted files)
static bool printVersion = false; // -v: print copyright and version info and exit

// Helpers defined after main() that read one entry of the document
// information dictionary.  Both return a newly allocated string, or nullptr
// when the entry is missing or is not a string.
static GooString *getInfoString(Dict *infoDict, const char *key);
static GooString *getInfoDate(Dict *infoDict, const char *key);

static char textEncName[128] = ""; // -enc: output text encoding name; empty means "not given"
106 
// Command-line option table handed to parseArgs() and printUsage() in main().
// Each entry lists, in order: the option string, the argument kind (argInt,
// argFlag, argFP or argString), the address of the variable that receives the
// value, a size (sizeof the destination buffer for argString entries, 0 for
// every other kind) and the usage text.
// The -dataurls option only exists when HAVE_IN_MEMORY_FILE is defined.
// The empty `{}` entry closes the table -- presumably the end-of-table marker
// that parseArgs()/printUsage() look for; confirm against parseargs.h.
static const ArgDesc argDesc[] = { { "-f", argInt, &firstPage, 0, "first page to convert" },
                                   { "-l", argInt, &lastPage, 0, "last page to convert" },
                                   /*{"-raw",    argFlag,     &rawOrder,      0,
                                     "keep strings in content stream order"},*/
                                   { "-q", argFlag, &errQuiet, 0, "don't print any messages or errors" },
                                   { "-h", argFlag, &printHelp, 0, "print usage information" },
                                   { "-?", argFlag, &printHelp, 0, "print usage information" },
                                   { "-help", argFlag, &printHelp, 0, "print usage information" },
                                   { "--help", argFlag, &printHelp, 0, "print usage information" },
                                   { "-p", argFlag, &printHtml, 0, "exchange .pdf links by .html" },
                                   { "-c", argFlag, &complexMode, 0, "generate complex document" },
                                   { "-s", argFlag, &singleHtml, 0, "generate single document that includes all pages" },
#ifdef HAVE_IN_MEMORY_FILE
                                   { "-dataurls", argFlag, &dataUrls, 0, "use data URLs instead of external images in HTML" },
#endif
                                   { "-i", argFlag, &ignore, 0, "ignore images" },
                                   { "-noframes", argFlag, &noframes, 0, "generate no frames" },
                                   { "-stdout", argFlag, &stout, 0, "use standard output" },
                                   { "-zoom", argFP, &scale, 0, "zoom the pdf document (default 1.5)" },
                                   { "-xml", argFlag, &xml, 0, "output for XML post-processing" },
                                   { "-noroundcoord", argFlag, &noRoundedCoordinates, 0, "do not round coordinates (with XML output only)" },
                                   { "-hidden", argFlag, &showHidden, 0, "output hidden text" },
                                   { "-nomerge", argFlag, &noMerge, 0, "do not merge paragraphs" },
                                   { "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" },
                                   { "-fmt", argString, extension, sizeof(extension), "image file format for Splash output (png or jpg)" },
                                   { "-v", argFlag, &printVersion, 0, "print copyright and version info" },
                                   { "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
                                   { "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
                                   { "-nodrm", argFlag, &noDrm, 0, "override document DRM settings" },
                                   { "-wbt", argFP, &wordBreakThreshold, 0, "word break threshold (default 10 percent)" },
                                   { "-fontfullname", argFlag, &fontFullName, 0, "outputs font full name" },
                                   {} };
139 
140 class SplashOutputDevNoText : public SplashOutputDev
141 {
142 public:
SplashOutputDevNoText(SplashColorMode colorModeA,int bitmapRowPadA,bool reverseVideoA,SplashColorPtr paperColorA,bool bitmapTopDownA=true)143     SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA, bool reverseVideoA, SplashColorPtr paperColorA, bool bitmapTopDownA = true)
144         : SplashOutputDev(colorModeA, bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA) { }
145     ~SplashOutputDevNoText() override;
146 
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode code,int nBytes,const Unicode * u,int uLen)147     void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int nBytes, const Unicode *u, int uLen) override { }
beginType3Char(GfxState * state,double x,double y,double dx,double dy,CharCode code,const Unicode * u,int uLen)148     bool beginType3Char(GfxState *state, double x, double y, double dx, double dy, CharCode code, const Unicode *u, int uLen) override { return false; }
endType3Char(GfxState * state)149     void endType3Char(GfxState *state) override { }
beginTextObject(GfxState * state)150     void beginTextObject(GfxState *state) override { }
endTextObject(GfxState * state)151     void endTextObject(GfxState *state) override { }
interpretType3Chars()152     bool interpretType3Chars() override { return false; }
153 };
154 
155 SplashOutputDevNoText::~SplashOutputDevNoText() = default;
156 
main(int argc,char * argv[])157 int main(int argc, char *argv[])
158 {
159     std::unique_ptr<PDFDoc> doc;
160     GooString *fileName = nullptr;
161     GooString *docTitle = nullptr;
162     GooString *author = nullptr, *keywords = nullptr, *subject = nullptr, *date = nullptr;
163     GooString *htmlFileName = nullptr;
164     HtmlOutputDev *htmlOut = nullptr;
165     SplashOutputDev *splashOut = nullptr;
166     bool doOutline;
167     bool ok;
168     GooString *ownerPW, *userPW;
169     Object info;
170     int exit_status = EXIT_FAILURE;
171 
172     Win32Console win32Console(&argc, &argv);
173     // parse args
174     ok = parseArgs(argDesc, &argc, argv);
175     if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
176         fprintf(stderr, "pdftohtml version %s\n", PACKAGE_VERSION);
177         fprintf(stderr, "%s\n", popplerCopyright);
178         fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
179         fprintf(stderr, "%s\n\n", xpdfCopyright);
180         if (!printVersion) {
181             printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
182         }
183         exit(printHelp || printVersion ? 0 : 1);
184     }
185 
186     // init error file
187     // errorInit();
188 
189     // read config file
190     globalParams = std::make_unique<GlobalParams>();
191 
192     if (errQuiet) {
193         globalParams->setErrQuiet(errQuiet);
194         printCommands = false; // I'm not 100% what is the difference between them
195     }
196 
197     if (textEncName[0]) {
198         globalParams->setTextEncoding(textEncName);
199         if (!globalParams->getTextEncoding()) {
200             goto error;
201         }
202     }
203 
204     // convert from user-friendly percents into a coefficient
205     wordBreakThreshold /= 100.0;
206 
207     // open PDF file
208     if (ownerPassword[0]) {
209         ownerPW = new GooString(ownerPassword);
210     } else {
211         ownerPW = nullptr;
212     }
213     if (userPassword[0]) {
214         userPW = new GooString(userPassword);
215     } else {
216         userPW = nullptr;
217     }
218 
219     fileName = new GooString(argv[1]);
220 
221     if (fileName->cmp("-") == 0) {
222         delete fileName;
223         fileName = new GooString("fd://0");
224     }
225 
226     doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);
227 
228     if (userPW) {
229         delete userPW;
230     }
231     if (ownerPW) {
232         delete ownerPW;
233     }
234     if (!doc->isOk()) {
235         goto error;
236     }
237 
238     // check for copy permission
239     if (!doc->okToCopy()) {
240         if (!noDrm) {
241             error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
242             goto error;
243         }
244         fprintf(stderr, "Document has copy-protection bit set.\n");
245     }
246 
247     // construct text file name
248     if (argc == 3) {
249         GooString *tmp = new GooString(argv[2]);
250         if (!xml) {
251             if (tmp->getLength() >= 5) {
252                 const char *p = tmp->c_str() + tmp->getLength() - 5;
253                 if (!strcmp(p, ".html") || !strcmp(p, ".HTML")) {
254                     htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 5);
255                 }
256             }
257         } else {
258             if (tmp->getLength() >= 4) {
259                 const char *p = tmp->c_str() + tmp->getLength() - 4;
260                 if (!strcmp(p, ".xml") || !strcmp(p, ".XML")) {
261                     htmlFileName = new GooString(tmp->c_str(), tmp->getLength() - 4);
262                 }
263             }
264         }
265         if (!htmlFileName) {
266             htmlFileName = new GooString(tmp);
267         }
268         delete tmp;
269     } else if (fileName->cmp("fd://0") == 0) {
270         error(errCommandLine, -1, "You have to provide an output filename when reading from stdin.");
271         goto error;
272     } else {
273         const char *p = fileName->c_str() + fileName->getLength() - 4;
274         if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))
275             htmlFileName = new GooString(fileName->c_str(), fileName->getLength() - 4);
276         else
277             htmlFileName = fileName->copy();
278         //   htmlFileName->append(".html");
279     }
280 
281     if (scale > 3.0)
282         scale = 3.0;
283     if (scale < 0.5)
284         scale = 0.5;
285 
286     if (complexMode) {
287         // noframes=false;
288         stout = false;
289     }
290 
291     if (stout) {
292         noframes = true;
293         complexMode = false;
294     }
295 
296     if (xml) {
297         complexMode = true;
298         singleHtml = false;
299         noframes = true;
300         noMerge = true;
301     }
302 
303     // get page range
304     if (firstPage < 1)
305         firstPage = 1;
306     if (lastPage < 1 || lastPage > doc->getNumPages())
307         lastPage = doc->getNumPages();
308     if (lastPage < firstPage) {
309         error(errCommandLine, -1, "Wrong page range given: the first page ({0:d}) can not be after the last page ({1:d}).", firstPage, lastPage);
310         goto error;
311     }
312 
313     info = doc->getDocInfo();
314     if (info.isDict()) {
315         docTitle = getInfoString(info.getDict(), "Title");
316         author = getInfoString(info.getDict(), "Author");
317         keywords = getInfoString(info.getDict(), "Keywords");
318         subject = getInfoString(info.getDict(), "Subject");
319         date = getInfoDate(info.getDict(), "ModDate");
320         if (!date)
321             date = getInfoDate(info.getDict(), "CreationDate");
322     }
323     if (!docTitle)
324         docTitle = new GooString(htmlFileName);
325 
326     if (!singleHtml)
327         rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)
328     else
329         rawOrder = singleHtml;
330 
331     doOutline = doc->getOutline()->getItems() != nullptr;
332     // write text file
333     htmlOut = new HtmlOutputDev(doc->getCatalog(), htmlFileName->c_str(), docTitle->c_str(), author ? author->c_str() : nullptr, keywords ? keywords->c_str() : nullptr, subject ? subject->c_str() : nullptr, date ? date->c_str() : nullptr,
334                                 rawOrder, firstPage, doOutline);
335     delete docTitle;
336     if (author) {
337         delete author;
338     }
339     if (keywords) {
340         delete keywords;
341     }
342     if (subject) {
343         delete subject;
344     }
345     if (date) {
346         delete date;
347     }
348 
349     if ((complexMode || singleHtml) && !xml && !ignore) {
350         GooString *imgFileName = nullptr;
351         // White paper color
352         SplashColor color;
353         color[0] = color[1] = color[2] = 255;
354         // If the user specified "jpg" use JPEG, otherwise PNG
355         SplashImageFileFormat format = strcmp(extension, "jpg") ? splashFormatPng : splashFormatJpeg;
356 
357         splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, false, color);
358         splashOut->startDoc(doc.get());
359 
360         for (int pg = firstPage; pg <= lastPage; ++pg) {
361             InMemoryFile imf;
362             doc->displayPage(splashOut, pg, 72 * scale, 72 * scale, 0, true, false, false);
363             SplashBitmap *bitmap = splashOut->getBitmap();
364 
365             imgFileName = GooString::format("{0:s}{1:03d}.{2:s}", htmlFileName->c_str(), pg, extension);
366             auto f1 = dataUrls ? imf.open("wb") : fopen(imgFileName->c_str(), "wb");
367             if (!f1) {
368                 fprintf(stderr, "Could not open %s\n", imgFileName->c_str());
369                 delete imgFileName;
370                 continue;
371             }
372             bitmap->writeImgFile(format, f1, 72 * scale, 72 * scale);
373             fclose(f1);
374             if (dataUrls) {
375                 htmlOut->addBackgroundImage(std::string((format == splashFormatJpeg) ? "data:image/jpeg;base64," : "data:image/png;base64,") + gbase64Encode(imf.getBuffer()));
376             } else {
377                 htmlOut->addBackgroundImage(gbasename(imgFileName->c_str()));
378             }
379             delete imgFileName;
380         }
381 
382         delete splashOut;
383     }
384 
385     if (htmlOut->isOk()) {
386         doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0, true, false, false);
387         htmlOut->dumpDocOutline(doc.get());
388     }
389 
390     delete htmlOut;
391 
392     exit_status = EXIT_SUCCESS;
393 
394     // clean up
395 error:
396     delete fileName;
397 
398     if (htmlFileName)
399         delete htmlFileName;
400 
401     return exit_status;
402 }
403 
getInfoString(Dict * infoDict,const char * key)404 static GooString *getInfoString(Dict *infoDict, const char *key)
405 {
406     Object obj;
407     // Raw value as read from PDF (may be in pdfDocEncoding or UCS2)
408     const GooString *rawString;
409     // Value converted to unicode
410     Unicode *unicodeString;
411     int unicodeLength;
412     // Value HTML escaped and converted to desired encoding
413     GooString *encodedString = nullptr;
414     // Is rawString UCS2 (as opposed to pdfDocEncoding)
415     bool isUnicode;
416 
417     obj = infoDict->lookup(key);
418     if (obj.isString()) {
419         rawString = obj.getString();
420 
421         // Convert rawString to unicode
422         if (rawString->hasUnicodeMarker()) {
423             isUnicode = true;
424             unicodeLength = (obj.getString()->getLength() - 2) / 2;
425         } else {
426             isUnicode = false;
427             unicodeLength = obj.getString()->getLength();
428         }
429         unicodeString = new Unicode[unicodeLength];
430 
431         for (int i = 0; i < unicodeLength; i++) {
432             if (isUnicode) {
433                 unicodeString[i] = ((rawString->getChar((i + 1) * 2) & 0xff) << 8) | (rawString->getChar(((i + 1) * 2) + 1) & 0xff);
434             } else {
435                 unicodeString[i] = pdfDocEncoding[rawString->getChar(i) & 0xff];
436             }
437         }
438 
439         // HTML escape and encode unicode
440         encodedString = HtmlFont::HtmlFilter(unicodeString, unicodeLength);
441         delete[] unicodeString;
442     }
443 
444     return encodedString;
445 }
446 
getInfoDate(Dict * infoDict,const char * key)447 static GooString *getInfoDate(Dict *infoDict, const char *key)
448 {
449     Object obj;
450     int year, mon, day, hour, min, sec, tz_hour, tz_minute;
451     char tz;
452     struct tm tmStruct;
453     GooString *result = nullptr;
454     char buf[256];
455 
456     obj = infoDict->lookup(key);
457     if (obj.isString()) {
458         const GooString *s = obj.getString();
459         // TODO do something with the timezone info
460         if (parseDateString(s, &year, &mon, &day, &hour, &min, &sec, &tz, &tz_hour, &tz_minute)) {
461             tmStruct.tm_year = year - 1900;
462             tmStruct.tm_mon = mon - 1;
463             tmStruct.tm_mday = day;
464             tmStruct.tm_hour = hour;
465             tmStruct.tm_min = min;
466             tmStruct.tm_sec = sec;
467             tmStruct.tm_wday = -1;
468             tmStruct.tm_yday = -1;
469             tmStruct.tm_isdst = -1;
470             mktime(&tmStruct); // compute the tm_wday and tm_yday fields
471             if (strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S+00:00", &tmStruct)) {
472                 result = new GooString(buf);
473             } else {
474                 result = new GooString(s);
475             }
476         } else {
477             result = new GooString(s);
478         }
479     }
480     return result;
481 }
482