1 //========================================================================
2 //
3 // pdftohtml.cc
4 //
5 //
6 // Copyright 1999-2000 G. Ovtcharov
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2007-2008, 2010 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
18 // Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
19 // Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
20 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
21 //
22 // To see a description of the changes please see the Changelog file that
23 // came with your tarball or type make ChangeLog if you are building from git
24 //
25 //========================================================================
26 
27 #include "config.h"
28 #include <poppler-config.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <stddef.h>
32 #include <string.h>
33 #ifdef HAVE_DIRENT_H
34 #include <dirent.h>
35 #endif
36 #include <time.h>
37 #include "parseargs.h"
38 #include "goo/GooString.h"
39 #include "goo/gmem.h"
40 #include "Object.h"
41 #include "Stream.h"
42 #include "Array.h"
43 #include "Dict.h"
44 #include "XRef.h"
45 #include "Catalog.h"
46 #include "Page.h"
47 #include "PDFDoc.h"
48 #include "PDFDocFactory.h"
49 #include "HtmlOutputDev.h"
50 #ifdef HAVE_SPLASH
51 #include "SplashOutputDev.h"
52 #include "splash/SplashBitmap.h"
53 #endif
54 #include "PSOutputDev.h"
55 #include "GlobalParams.h"
56 #include "Error.h"
57 #include "DateInfo.h"
58 #include "goo/gfile.h"
59 
60 #ifndef GHOSTSCRIPT
61 # define GHOSTSCRIPT "gs"
62 #endif
63 
64 static int firstPage = 1;
65 static int lastPage = 0;
66 static GBool rawOrder = gTrue;
67 GBool printCommands = gTrue;
68 static GBool printHelp = gFalse;
69 GBool printHtml = gFalse;
70 GBool complexMode=gFalse;
71 GBool singleHtml=gFalse; // singleHtml
72 GBool ignore=gFalse;
73 static GBool useSplash=gTrue;
74 static char extension[5]="png";
75 static double scale=1.5;
76 GBool noframes=gFalse;
77 GBool stout=gFalse;
78 GBool xml=gFalse;
79 static GBool errQuiet=gFalse;
80 static GBool noDrm=gFalse;
81 
82 GBool showHidden = gFalse;
83 GBool noMerge = gFalse;
84 static char ownerPassword[33] = "";
85 static char userPassword[33] = "";
86 static char gsDevice[33] = "none";
87 static GBool printVersion = gFalse;
88 
89 static GooString* getInfoString(Dict *infoDict, char *key);
90 static GooString* getInfoDate(Dict *infoDict, char *key);
91 
92 static char textEncName[128] = "";
93 
94 static const ArgDesc argDesc[] = {
95   {"-f",      argInt,      &firstPage,     0,
96    "first page to convert"},
97   {"-l",      argInt,      &lastPage,      0,
98    "last page to convert"},
99   /*{"-raw",    argFlag,     &rawOrder,      0,
100     "keep strings in content stream order"},*/
101   {"-q",      argFlag,     &errQuiet,      0,
102    "don't print any messages or errors"},
103   {"-h",      argFlag,     &printHelp,     0,
104    "print usage information"},
105   {"-help",   argFlag,     &printHelp,     0,
106    "print usage information"},
107   {"-p",      argFlag,     &printHtml,     0,
108    "exchange .pdf links by .html"},
109   {"-c",      argFlag,     &complexMode,          0,
110    "generate complex document"},
111   {"-s",      argFlag,     &singleHtml,          0,
112    "generate single document that includes all pages"},
113   {"-i",      argFlag,     &ignore,        0,
114    "ignore images"},
115   {"-noframes", argFlag,   &noframes,      0,
116    "generate no frames"},
117   {"-stdout"  ,argFlag,    &stout,         0,
118    "use standard output"},
119   {"-zoom",   argFP,    &scale,         0,
120    "zoom the pdf document (default 1.5)"},
121   {"-xml",    argFlag,    &xml,         0,
122    "output for XML post-processing"},
123   {"-hidden", argFlag,   &showHidden,   0,
124    "output hidden text"},
125   {"-nomerge", argFlag, &noMerge, 0,
126    "do not merge paragraphs"},
127   {"-enc",    argString,   textEncName,    sizeof(textEncName),
128    "output text encoding name"},
129   {"-dev",    argString,   gsDevice,       sizeof(gsDevice),
130    "output device name for Ghostscript (png16m, jpeg etc)"},
131   {"-fmt",    argString,   extension,      sizeof(extension),
132    "image file format for Splash output (png or jpg)"},
133   {"-v",      argFlag,     &printVersion,  0,
134    "print copyright and version info"},
135   {"-opw",    argString,   ownerPassword,  sizeof(ownerPassword),
136    "owner password (for encrypted files)"},
137   {"-upw",    argString,   userPassword,   sizeof(userPassword),
138    "user password (for encrypted files)"},
139   {"-nodrm", argFlag, &noDrm, 0,
140    "override document DRM settings"},
141   {NULL}
142 };
143 
#ifdef HAVE_SPLASH
// A SplashOutputDev whose text-related hooks are all empty, so pages
// rendered through it come out as background images without any glyphs.
// main() uses it to produce the per-page background bitmaps.
class SplashOutputDevNoText : public SplashOutputDev {
public:
  // Forwards every argument unchanged to the SplashOutputDev constructor.
  SplashOutputDevNoText(SplashColorMode colorModeA,
                        int bitmapRowPadA,
                        GBool reverseVideoA,
                        SplashColorPtr paperColorA,
                        GBool bitmapTopDownA = gTrue,
                        GBool allowAntialiasA = gTrue)
    : SplashOutputDev(colorModeA, bitmapRowPadA, reverseVideoA,
                      paperColorA, bitmapTopDownA, allowAntialiasA)
  {
  }

  virtual ~SplashOutputDevNoText()
  {
  }

  // ----- text hooks: deliberately do nothing -----

  void drawChar(GfxState *state,
                double x, double y,
                double dx, double dy,
                double originX, double originY,
                CharCode code, int nBytes, Unicode *u, int uLen)
  {
  }

  GBool beginType3Char(GfxState *state,
                       double x, double y,
                       double dx, double dy,
                       CharCode code, Unicode *u, int uLen)
  {
    return gFalse;
  }

  void endType3Char(GfxState *state)
  {
  }

  void beginTextObject(GfxState *state)
  {
  }

  GBool deviceHasTextClip(GfxState *state)
  {
    return gFalse;
  }

  void endTextObject(GfxState *state)
  {
  }

  // Always answers gFalse.
  GBool interpretType3Chars()
  {
    return gFalse;
  }
};
#endif
169 
main(int argc,char * argv[])170 int main(int argc, char *argv[]) {
171   PDFDoc *doc = NULL;
172   GooString *fileName = NULL;
173   GooString *docTitle = NULL;
174   GooString *author = NULL, *keywords = NULL, *subject = NULL, *date = NULL;
175   GooString *htmlFileName = NULL;
176   GooString *psFileName = NULL;
177   HtmlOutputDev *htmlOut = NULL;
178 #ifdef HAVE_SPLASH
179   SplashOutputDev *splashOut = NULL;
180 #endif
181   PSOutputDev *psOut = NULL;
182   GBool ok;
183   char *p;
184   GooString *ownerPW, *userPW;
185   Object info;
186   char * extsList[] = {"png", "jpeg", "bmp", "pcx", "tiff", "pbm", NULL};
187 
188   // parse args
189   ok = parseArgs(argDesc, &argc, argv);
190   if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
191     fprintf(stderr, "pdftohtml version %s\n", PACKAGE_VERSION);
192     fprintf(stderr, "%s\n", popplerCopyright);
193     fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
194     fprintf(stderr, "%s\n\n", xpdfCopyright);
195     if (!printVersion) {
196       printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
197     }
198     exit(1);
199   }
200 
201   // init error file
202   //errorInit();
203 
204   // read config file
205   globalParams = new GlobalParams();
206 
207   if (errQuiet) {
208     globalParams->setErrQuiet(errQuiet);
209     printCommands = gFalse; // I'm not 100% what is the differecne between them
210   }
211 
212   if (textEncName[0]) {
213     globalParams->setTextEncoding(textEncName);
214     if( !globalParams->getTextEncoding() )  {
215 	goto error;
216     }
217   }
218 
219   // open PDF file
220   if (ownerPassword[0]) {
221     ownerPW = new GooString(ownerPassword);
222   } else {
223     ownerPW = NULL;
224   }
225   if (userPassword[0]) {
226     userPW = new GooString(userPassword);
227   } else {
228     userPW = NULL;
229   }
230 
231   fileName = new GooString(argv[1]);
232 
233   if (fileName->cmp("-") == 0) {
234       delete fileName;
235       fileName = new GooString("fd://0");
236   }
237 
238   doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);
239 
240   if (userPW) {
241     delete userPW;
242   }
243   if (ownerPW) {
244     delete ownerPW;
245   }
246   if (!doc->isOk()) {
247     goto error;
248   }
249 
250   // check for copy permission
251   if (!doc->okToCopy()) {
252     if (!noDrm) {
253       error(-1, "Copying of text from this document is not allowed.");
254       goto error;
255     }
256     fprintf(stderr, "Document has copy-protection bit set.\n");
257   }
258 
259   // construct text file name
260   if (argc == 3) {
261     GooString* tmp = new GooString(argv[2]);
262     if (!xml) {
263       if (tmp->getLength() >= 5) {
264         p = tmp->getCString() + tmp->getLength() - 5;
265         if (!strcmp(p, ".html") || !strcmp(p, ".HTML")) {
266           htmlFileName = new GooString(tmp->getCString(), tmp->getLength() - 5);
267         }
268       }
269     } else {
270       if (tmp->getLength() >= 4) {
271         p = tmp->getCString() + tmp->getLength() - 4;
272         if (!strcmp(p, ".xml") || !strcmp(p, ".XML")) {
273           htmlFileName = new GooString(tmp->getCString(), tmp->getLength() - 4);
274         }
275       }
276     }
277     if (!htmlFileName) {
278       htmlFileName =new GooString(tmp);
279     }
280     delete tmp;
281   } else if (fileName->cmp("fd://0") == 0) {
282       error(-1, "You have to provide an output filename when reading form stdin.");
283       goto error;
284   } else {
285     p = fileName->getCString() + fileName->getLength() - 4;
286     if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))
287       htmlFileName = new GooString(fileName->getCString(),
288 				 fileName->getLength() - 4);
289     else
290       htmlFileName = fileName->copy();
291     //   htmlFileName->append(".html");
292   }
293 
294    if (scale>3.0) scale=3.0;
295    if (scale<0.5) scale=0.5;
296 
297    if (complexMode || singleHtml) {
298      //noframes=gFalse;
299      stout=gFalse;
300    }
301 
302    if (stout) {
303      noframes=gTrue;
304      complexMode=gFalse;
305      singleHtml=gFalse;
306    }
307 
308    if (xml)
309    {
310        complexMode = gTrue;
311        singleHtml = gFalse;
312        noframes = gTrue;
313        noMerge = gTrue;
314    }
315 
316   // get page range
317   if (firstPage < 1)
318     firstPage = 1;
319   if (lastPage < 1 || lastPage > doc->getNumPages())
320     lastPage = doc->getNumPages();
321 
322   doc->getDocInfo(&info);
323   if (info.isDict()) {
324     docTitle = getInfoString(info.getDict(), "Title");
325     author = getInfoString(info.getDict(), "Author");
326     keywords = getInfoString(info.getDict(), "Keywords");
327     subject = getInfoString(info.getDict(), "Subject");
328     date = getInfoDate(info.getDict(), "ModDate");
329     if( !date )
330 	date = getInfoDate(info.getDict(), "CreationDate");
331   }
332   info.free();
333   if( !docTitle ) docTitle = new GooString(htmlFileName);
334 
335   if( strcmp("none", gsDevice) ) {
336     useSplash = gFalse;
337     /* determine extensions of output background images */
338     int i;
339     for(i = 0; extsList[i]; i++)
340     {
341       if( strstr(gsDevice, extsList[i]) != (char *) NULL )
342       {
343         strncpy(extension, extsList[i], sizeof(extension));
344         break;
345       }
346     }
347   }
348 
349 #ifndef HAVE_SPLASH
350   if( useSplash ) {
351     fprintf(stderr, "You are trying to use the -fmt option but your pdftohtml was built without support for it. Please use the -dev option\n");
352     delete docTitle;
353     delete author;
354     delete keywords;
355     delete subject;
356     delete date;
357     delete htmlFileName;
358     delete globalParams;
359     delete fileName;
360     delete doc;
361     return -1;
362   }
363 #endif
364 
365   if (!singleHtml)
366       rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)
367   else
368       rawOrder = singleHtml;
369 
370   // write text file
371   htmlOut = new HtmlOutputDev(htmlFileName->getCString(),
372 	  docTitle->getCString(),
373 	  author ? author->getCString() : NULL,
374 	  keywords ? keywords->getCString() : NULL,
375           subject ? subject->getCString() : NULL,
376 	  date ? date->getCString() : NULL,
377 	  extension,
378 	  rawOrder,
379 	  firstPage,
380 	  doc->getCatalog()->getOutline()->isDict());
381   delete docTitle;
382   if( author )
383   {
384       delete author;
385   }
386   if( keywords )
387   {
388       delete keywords;
389   }
390   if( subject )
391   {
392       delete subject;
393   }
394   if( date )
395   {
396       delete date;
397   }
398 
399   if (htmlOut->isOk())
400   {
401     doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0,
402 		      gTrue, gFalse, gFalse);
403   	if (!xml)
404 	{
405 		htmlOut->dumpDocOutline(doc->getCatalog());
406 	}
407   }
408 
409   if ((complexMode || singleHtml) && !xml && !ignore) {
410     if(useSplash) {
411 #ifdef HAVE_SPLASH
412       GooString *imgFileName = NULL;
413       // White paper color
414       SplashColor color;
415       color[0] = color[1] = color[2] = 255;
416       // If the user specified "jpg" use JPEG, otherwise PNG
417       SplashImageFileFormat format = strcmp(extension, "jpg") ?
418           splashFormatPng : splashFormatJpeg;
419 
420       splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, gFalse, color);
421       splashOut->startDoc(doc->getXRef());
422 
423       for (int pg = firstPage; pg <= lastPage; ++pg) {
424         doc->displayPage(splashOut, pg,
425                          72 * scale, 72 * scale,
426                          0, gTrue, gFalse, gFalse);
427         SplashBitmap *bitmap = splashOut->getBitmap();
428 
429         imgFileName = GooString::format("{0:s}{1:03d}.{2:s}",
430             htmlFileName->getCString(), pg, extension);
431 
432         bitmap->writeImgFile(format, imgFileName->getCString(),
433                              72 * scale, 72 * scale);
434 
435         delete imgFileName;
436       }
437 
438       delete splashOut;
439 #endif
440     } else {
441       int h=xoutRound(htmlOut->getPageHeight()/scale);
442       int w=xoutRound(htmlOut->getPageWidth()/scale);
443       //int h=xoutRound(doc->getPageHeight(1)/scale);
444       //int w=xoutRound(doc->getPageWidth(1)/scale);
445 
446       psFileName = new GooString(htmlFileName->getCString());
447       psFileName->append(".ps");
448 
449       psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
450           doc->getCatalog(), NULL, firstPage, lastPage, psModePS, w, h);
451       psOut->setDisplayText(gFalse);
452       doc->displayPages(psOut, firstPage, lastPage, 72, 72, 0,
453           gTrue, gFalse, gFalse);
454       delete psOut;
455 
456       /*sprintf(buf, "%s -sDEVICE=png16m -dBATCH -dNOPROMPT -dNOPAUSE -r%d -sOutputFile=%s%%03d.png -g%dx%d -q %s", GHOSTSCRIPT, resolution, htmlFileName->getCString(), w, h,
457       psFileName->getCString());*/
458 
459       GooString *gsCmd = new GooString(GHOSTSCRIPT);
460       GooString *tw, *th, *sc;
461       gsCmd->append(" -sDEVICE=");
462       gsCmd->append(gsDevice);
463       gsCmd->append(" -dBATCH -dNOPROMPT -dNOPAUSE -r");
464       sc = GooString::fromInt(static_cast<int>(72*scale));
465       gsCmd->append(sc);
466       gsCmd->append(" -sOutputFile=");
467       gsCmd->append("\"");
468       gsCmd->append(htmlFileName);
469       gsCmd->append("%03d.");
470       gsCmd->append(extension);
471       gsCmd->append("\" -g");
472       tw = GooString::fromInt(static_cast<int>(scale*w));
473       gsCmd->append(tw);
474       gsCmd->append("x");
475       th = GooString::fromInt(static_cast<int>(scale*h));
476       th = GooString::fromInt(static_cast<int>(scale*h));
477       gsCmd->append(th);
478       gsCmd->append(" -q \"");
479       gsCmd->append(psFileName);
480       gsCmd->append("\"");
481       //    printf("running: %s\n", gsCmd->getCString());
482       if( !executeCommand(gsCmd->getCString()) && !errQuiet) {
483         error(-1, "Failed to launch Ghostscript!\n");
484       }
485       unlink(psFileName->getCString());
486       delete tw;
487       delete th;
488       delete sc;
489       delete gsCmd;
490       delete psFileName;
491     }
492   }
493 
494   delete htmlOut;
495 
496   // clean up
497  error:
498   if(doc) delete doc;
499   delete fileName;
500   if(globalParams) delete globalParams;
501 
502   if(htmlFileName) delete htmlFileName;
503   HtmlFont::clear();
504 
505   // check for memory leaks
506   Object::memCheck(stderr);
507   gMemReport(stderr);
508 
509   return 0;
510 }
511 
getInfoString(Dict * infoDict,char * key)512 static GooString* getInfoString(Dict *infoDict, char *key) {
513   Object obj;
514   GooString *s1 = NULL;
515 
516   if (infoDict->lookup(key, &obj)->isString()) {
517     s1 = new GooString(obj.getString());
518   }
519   obj.free();
520   return s1;
521 }
522 
getInfoDate(Dict * infoDict,char * key)523 static GooString* getInfoDate(Dict *infoDict, char *key) {
524   Object obj;
525   char *s;
526   int year, mon, day, hour, min, sec, tz_hour, tz_minute;
527   char tz;
528   struct tm tmStruct;
529   GooString *result = NULL;
530   char buf[256];
531 
532   if (infoDict->lookup(key, &obj)->isString()) {
533     s = obj.getString()->getCString();
534     // TODO do something with the timezone info
535     if ( parseDateString( s, &year, &mon, &day, &hour, &min, &sec, &tz, &tz_hour, &tz_minute ) ) {
536       tmStruct.tm_year = year - 1900;
537       tmStruct.tm_mon = mon - 1;
538       tmStruct.tm_mday = day;
539       tmStruct.tm_hour = hour;
540       tmStruct.tm_min = min;
541       tmStruct.tm_sec = sec;
542       tmStruct.tm_wday = -1;
543       tmStruct.tm_yday = -1;
544       tmStruct.tm_isdst = -1;
545       mktime(&tmStruct); // compute the tm_wday and tm_yday fields
546       if (strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S+00:00", &tmStruct)) {
547         result = new GooString(buf);
548       } else {
549         result = new GooString(s);
550       }
551     } else {
552       result = new GooString(s);
553     }
554   }
555   obj.free();
556   return result;
557 }
558 
559