1 //========================================================================
2 //
3 // pdftohtml.cc
4 //
5 //
6 // Copyright 1999-2000 G. Ovtcharov
7 //========================================================================
8
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2007-2008, 2010 Albert Astals Cid <aacid@kde.org>
17 // Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
18 // Copyright (C) 2010 Mike Slegeir <tehpola@yahoo.com>
19 // Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
20 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
21 //
22 // To see a description of the changes please see the Changelog file that
23 // came with your tarball or type make ChangeLog if you are building from git
24 //
25 //========================================================================
26
27 #include "config.h"
28 #include <poppler-config.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <stddef.h>
32 #include <string.h>
33 #ifdef HAVE_DIRENT_H
34 #include <dirent.h>
35 #endif
36 #include <time.h>
37 #include "parseargs.h"
38 #include "goo/GooString.h"
39 #include "goo/gmem.h"
40 #include "Object.h"
41 #include "Stream.h"
42 #include "Array.h"
43 #include "Dict.h"
44 #include "XRef.h"
45 #include "Catalog.h"
46 #include "Page.h"
47 #include "PDFDoc.h"
48 #include "PDFDocFactory.h"
49 #include "HtmlOutputDev.h"
50 #ifdef HAVE_SPLASH
51 #include "SplashOutputDev.h"
52 #include "splash/SplashBitmap.h"
53 #endif
54 #include "PSOutputDev.h"
55 #include "GlobalParams.h"
56 #include "Error.h"
57 #include "DateInfo.h"
58 #include "goo/gfile.h"
59
60 #ifndef GHOSTSCRIPT
61 # define GHOSTSCRIPT "gs"
62 #endif
63
64 static int firstPage = 1;
65 static int lastPage = 0;
66 static GBool rawOrder = gTrue;
67 GBool printCommands = gTrue;
68 static GBool printHelp = gFalse;
69 GBool printHtml = gFalse;
70 GBool complexMode=gFalse;
71 GBool singleHtml=gFalse; // singleHtml
72 GBool ignore=gFalse;
73 static GBool useSplash=gTrue;
74 static char extension[5]="png";
75 static double scale=1.5;
76 GBool noframes=gFalse;
77 GBool stout=gFalse;
78 GBool xml=gFalse;
79 static GBool errQuiet=gFalse;
80 static GBool noDrm=gFalse;
81
82 GBool showHidden = gFalse;
83 GBool noMerge = gFalse;
84 static char ownerPassword[33] = "";
85 static char userPassword[33] = "";
86 static char gsDevice[33] = "none";
87 static GBool printVersion = gFalse;
88
89 static GooString* getInfoString(Dict *infoDict, char *key);
90 static GooString* getInfoDate(Dict *infoDict, char *key);
91
92 static char textEncName[128] = "";
93
94 static const ArgDesc argDesc[] = {
95 {"-f", argInt, &firstPage, 0,
96 "first page to convert"},
97 {"-l", argInt, &lastPage, 0,
98 "last page to convert"},
99 /*{"-raw", argFlag, &rawOrder, 0,
100 "keep strings in content stream order"},*/
101 {"-q", argFlag, &errQuiet, 0,
102 "don't print any messages or errors"},
103 {"-h", argFlag, &printHelp, 0,
104 "print usage information"},
105 {"-help", argFlag, &printHelp, 0,
106 "print usage information"},
107 {"-p", argFlag, &printHtml, 0,
108 "exchange .pdf links by .html"},
109 {"-c", argFlag, &complexMode, 0,
110 "generate complex document"},
111 {"-s", argFlag, &singleHtml, 0,
112 "generate single document that includes all pages"},
113 {"-i", argFlag, &ignore, 0,
114 "ignore images"},
115 {"-noframes", argFlag, &noframes, 0,
116 "generate no frames"},
117 {"-stdout" ,argFlag, &stout, 0,
118 "use standard output"},
119 {"-zoom", argFP, &scale, 0,
120 "zoom the pdf document (default 1.5)"},
121 {"-xml", argFlag, &xml, 0,
122 "output for XML post-processing"},
123 {"-hidden", argFlag, &showHidden, 0,
124 "output hidden text"},
125 {"-nomerge", argFlag, &noMerge, 0,
126 "do not merge paragraphs"},
127 {"-enc", argString, textEncName, sizeof(textEncName),
128 "output text encoding name"},
129 {"-dev", argString, gsDevice, sizeof(gsDevice),
130 "output device name for Ghostscript (png16m, jpeg etc)"},
131 {"-fmt", argString, extension, sizeof(extension),
132 "image file format for Splash output (png or jpg)"},
133 {"-v", argFlag, &printVersion, 0,
134 "print copyright and version info"},
135 {"-opw", argString, ownerPassword, sizeof(ownerPassword),
136 "owner password (for encrypted files)"},
137 {"-upw", argString, userPassword, sizeof(userPassword),
138 "user password (for encrypted files)"},
139 {"-nodrm", argFlag, &noDrm, 0,
140 "override document DRM settings"},
141 {NULL}
142 };
143
144 #ifdef HAVE_SPLASH
145 class SplashOutputDevNoText : public SplashOutputDev {
146 public:
SplashOutputDevNoText(SplashColorMode colorModeA,int bitmapRowPadA,GBool reverseVideoA,SplashColorPtr paperColorA,GBool bitmapTopDownA=gTrue,GBool allowAntialiasA=gTrue)147 SplashOutputDevNoText(SplashColorMode colorModeA, int bitmapRowPadA,
148 GBool reverseVideoA, SplashColorPtr paperColorA,
149 GBool bitmapTopDownA = gTrue,
150 GBool allowAntialiasA = gTrue) : SplashOutputDev(colorModeA,
151 bitmapRowPadA, reverseVideoA, paperColorA, bitmapTopDownA,
152 allowAntialiasA) { }
~SplashOutputDevNoText()153 virtual ~SplashOutputDevNoText() { }
154
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode code,int nBytes,Unicode * u,int uLen)155 void drawChar(GfxState *state, double x, double y,
156 double dx, double dy,
157 double originX, double originY,
158 CharCode code, int nBytes, Unicode *u, int uLen) { }
beginType3Char(GfxState * state,double x,double y,double dx,double dy,CharCode code,Unicode * u,int uLen)159 GBool beginType3Char(GfxState *state, double x, double y,
160 double dx, double dy,
161 CharCode code, Unicode *u, int uLen) { return false; }
endType3Char(GfxState * state)162 void endType3Char(GfxState *state) { }
beginTextObject(GfxState * state)163 void beginTextObject(GfxState *state) { }
deviceHasTextClip(GfxState * state)164 GBool deviceHasTextClip(GfxState *state) { return false; }
endTextObject(GfxState * state)165 void endTextObject(GfxState *state) { }
interpretType3Chars()166 GBool interpretType3Chars() { return gFalse; }
167 };
168 #endif
169
main(int argc,char * argv[])170 int main(int argc, char *argv[]) {
171 PDFDoc *doc = NULL;
172 GooString *fileName = NULL;
173 GooString *docTitle = NULL;
174 GooString *author = NULL, *keywords = NULL, *subject = NULL, *date = NULL;
175 GooString *htmlFileName = NULL;
176 GooString *psFileName = NULL;
177 HtmlOutputDev *htmlOut = NULL;
178 #ifdef HAVE_SPLASH
179 SplashOutputDev *splashOut = NULL;
180 #endif
181 PSOutputDev *psOut = NULL;
182 GBool ok;
183 char *p;
184 GooString *ownerPW, *userPW;
185 Object info;
186 char * extsList[] = {"png", "jpeg", "bmp", "pcx", "tiff", "pbm", NULL};
187
188 // parse args
189 ok = parseArgs(argDesc, &argc, argv);
190 if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
191 fprintf(stderr, "pdftohtml version %s\n", PACKAGE_VERSION);
192 fprintf(stderr, "%s\n", popplerCopyright);
193 fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
194 fprintf(stderr, "%s\n\n", xpdfCopyright);
195 if (!printVersion) {
196 printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
197 }
198 exit(1);
199 }
200
201 // init error file
202 //errorInit();
203
204 // read config file
205 globalParams = new GlobalParams();
206
207 if (errQuiet) {
208 globalParams->setErrQuiet(errQuiet);
209 printCommands = gFalse; // I'm not 100% what is the differecne between them
210 }
211
212 if (textEncName[0]) {
213 globalParams->setTextEncoding(textEncName);
214 if( !globalParams->getTextEncoding() ) {
215 goto error;
216 }
217 }
218
219 // open PDF file
220 if (ownerPassword[0]) {
221 ownerPW = new GooString(ownerPassword);
222 } else {
223 ownerPW = NULL;
224 }
225 if (userPassword[0]) {
226 userPW = new GooString(userPassword);
227 } else {
228 userPW = NULL;
229 }
230
231 fileName = new GooString(argv[1]);
232
233 if (fileName->cmp("-") == 0) {
234 delete fileName;
235 fileName = new GooString("fd://0");
236 }
237
238 doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);
239
240 if (userPW) {
241 delete userPW;
242 }
243 if (ownerPW) {
244 delete ownerPW;
245 }
246 if (!doc->isOk()) {
247 goto error;
248 }
249
250 // check for copy permission
251 if (!doc->okToCopy()) {
252 if (!noDrm) {
253 error(-1, "Copying of text from this document is not allowed.");
254 goto error;
255 }
256 fprintf(stderr, "Document has copy-protection bit set.\n");
257 }
258
259 // construct text file name
260 if (argc == 3) {
261 GooString* tmp = new GooString(argv[2]);
262 if (!xml) {
263 if (tmp->getLength() >= 5) {
264 p = tmp->getCString() + tmp->getLength() - 5;
265 if (!strcmp(p, ".html") || !strcmp(p, ".HTML")) {
266 htmlFileName = new GooString(tmp->getCString(), tmp->getLength() - 5);
267 }
268 }
269 } else {
270 if (tmp->getLength() >= 4) {
271 p = tmp->getCString() + tmp->getLength() - 4;
272 if (!strcmp(p, ".xml") || !strcmp(p, ".XML")) {
273 htmlFileName = new GooString(tmp->getCString(), tmp->getLength() - 4);
274 }
275 }
276 }
277 if (!htmlFileName) {
278 htmlFileName =new GooString(tmp);
279 }
280 delete tmp;
281 } else if (fileName->cmp("fd://0") == 0) {
282 error(-1, "You have to provide an output filename when reading form stdin.");
283 goto error;
284 } else {
285 p = fileName->getCString() + fileName->getLength() - 4;
286 if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))
287 htmlFileName = new GooString(fileName->getCString(),
288 fileName->getLength() - 4);
289 else
290 htmlFileName = fileName->copy();
291 // htmlFileName->append(".html");
292 }
293
294 if (scale>3.0) scale=3.0;
295 if (scale<0.5) scale=0.5;
296
297 if (complexMode || singleHtml) {
298 //noframes=gFalse;
299 stout=gFalse;
300 }
301
302 if (stout) {
303 noframes=gTrue;
304 complexMode=gFalse;
305 singleHtml=gFalse;
306 }
307
308 if (xml)
309 {
310 complexMode = gTrue;
311 singleHtml = gFalse;
312 noframes = gTrue;
313 noMerge = gTrue;
314 }
315
316 // get page range
317 if (firstPage < 1)
318 firstPage = 1;
319 if (lastPage < 1 || lastPage > doc->getNumPages())
320 lastPage = doc->getNumPages();
321
322 doc->getDocInfo(&info);
323 if (info.isDict()) {
324 docTitle = getInfoString(info.getDict(), "Title");
325 author = getInfoString(info.getDict(), "Author");
326 keywords = getInfoString(info.getDict(), "Keywords");
327 subject = getInfoString(info.getDict(), "Subject");
328 date = getInfoDate(info.getDict(), "ModDate");
329 if( !date )
330 date = getInfoDate(info.getDict(), "CreationDate");
331 }
332 info.free();
333 if( !docTitle ) docTitle = new GooString(htmlFileName);
334
335 if( strcmp("none", gsDevice) ) {
336 useSplash = gFalse;
337 /* determine extensions of output background images */
338 int i;
339 for(i = 0; extsList[i]; i++)
340 {
341 if( strstr(gsDevice, extsList[i]) != (char *) NULL )
342 {
343 strncpy(extension, extsList[i], sizeof(extension));
344 break;
345 }
346 }
347 }
348
349 #ifndef HAVE_SPLASH
350 if( useSplash ) {
351 fprintf(stderr, "You are trying to use the -fmt option but your pdftohtml was built without support for it. Please use the -dev option\n");
352 delete docTitle;
353 delete author;
354 delete keywords;
355 delete subject;
356 delete date;
357 delete htmlFileName;
358 delete globalParams;
359 delete fileName;
360 delete doc;
361 return -1;
362 }
363 #endif
364
365 if (!singleHtml)
366 rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)
367 else
368 rawOrder = singleHtml;
369
370 // write text file
371 htmlOut = new HtmlOutputDev(htmlFileName->getCString(),
372 docTitle->getCString(),
373 author ? author->getCString() : NULL,
374 keywords ? keywords->getCString() : NULL,
375 subject ? subject->getCString() : NULL,
376 date ? date->getCString() : NULL,
377 extension,
378 rawOrder,
379 firstPage,
380 doc->getCatalog()->getOutline()->isDict());
381 delete docTitle;
382 if( author )
383 {
384 delete author;
385 }
386 if( keywords )
387 {
388 delete keywords;
389 }
390 if( subject )
391 {
392 delete subject;
393 }
394 if( date )
395 {
396 delete date;
397 }
398
399 if (htmlOut->isOk())
400 {
401 doc->displayPages(htmlOut, firstPage, lastPage, 72 * scale, 72 * scale, 0,
402 gTrue, gFalse, gFalse);
403 if (!xml)
404 {
405 htmlOut->dumpDocOutline(doc->getCatalog());
406 }
407 }
408
409 if ((complexMode || singleHtml) && !xml && !ignore) {
410 if(useSplash) {
411 #ifdef HAVE_SPLASH
412 GooString *imgFileName = NULL;
413 // White paper color
414 SplashColor color;
415 color[0] = color[1] = color[2] = 255;
416 // If the user specified "jpg" use JPEG, otherwise PNG
417 SplashImageFileFormat format = strcmp(extension, "jpg") ?
418 splashFormatPng : splashFormatJpeg;
419
420 splashOut = new SplashOutputDevNoText(splashModeRGB8, 4, gFalse, color);
421 splashOut->startDoc(doc->getXRef());
422
423 for (int pg = firstPage; pg <= lastPage; ++pg) {
424 doc->displayPage(splashOut, pg,
425 72 * scale, 72 * scale,
426 0, gTrue, gFalse, gFalse);
427 SplashBitmap *bitmap = splashOut->getBitmap();
428
429 imgFileName = GooString::format("{0:s}{1:03d}.{2:s}",
430 htmlFileName->getCString(), pg, extension);
431
432 bitmap->writeImgFile(format, imgFileName->getCString(),
433 72 * scale, 72 * scale);
434
435 delete imgFileName;
436 }
437
438 delete splashOut;
439 #endif
440 } else {
441 int h=xoutRound(htmlOut->getPageHeight()/scale);
442 int w=xoutRound(htmlOut->getPageWidth()/scale);
443 //int h=xoutRound(doc->getPageHeight(1)/scale);
444 //int w=xoutRound(doc->getPageWidth(1)/scale);
445
446 psFileName = new GooString(htmlFileName->getCString());
447 psFileName->append(".ps");
448
449 psOut = new PSOutputDev(psFileName->getCString(), doc, doc->getXRef(),
450 doc->getCatalog(), NULL, firstPage, lastPage, psModePS, w, h);
451 psOut->setDisplayText(gFalse);
452 doc->displayPages(psOut, firstPage, lastPage, 72, 72, 0,
453 gTrue, gFalse, gFalse);
454 delete psOut;
455
456 /*sprintf(buf, "%s -sDEVICE=png16m -dBATCH -dNOPROMPT -dNOPAUSE -r%d -sOutputFile=%s%%03d.png -g%dx%d -q %s", GHOSTSCRIPT, resolution, htmlFileName->getCString(), w, h,
457 psFileName->getCString());*/
458
459 GooString *gsCmd = new GooString(GHOSTSCRIPT);
460 GooString *tw, *th, *sc;
461 gsCmd->append(" -sDEVICE=");
462 gsCmd->append(gsDevice);
463 gsCmd->append(" -dBATCH -dNOPROMPT -dNOPAUSE -r");
464 sc = GooString::fromInt(static_cast<int>(72*scale));
465 gsCmd->append(sc);
466 gsCmd->append(" -sOutputFile=");
467 gsCmd->append("\"");
468 gsCmd->append(htmlFileName);
469 gsCmd->append("%03d.");
470 gsCmd->append(extension);
471 gsCmd->append("\" -g");
472 tw = GooString::fromInt(static_cast<int>(scale*w));
473 gsCmd->append(tw);
474 gsCmd->append("x");
475 th = GooString::fromInt(static_cast<int>(scale*h));
476 th = GooString::fromInt(static_cast<int>(scale*h));
477 gsCmd->append(th);
478 gsCmd->append(" -q \"");
479 gsCmd->append(psFileName);
480 gsCmd->append("\"");
481 // printf("running: %s\n", gsCmd->getCString());
482 if( !executeCommand(gsCmd->getCString()) && !errQuiet) {
483 error(-1, "Failed to launch Ghostscript!\n");
484 }
485 unlink(psFileName->getCString());
486 delete tw;
487 delete th;
488 delete sc;
489 delete gsCmd;
490 delete psFileName;
491 }
492 }
493
494 delete htmlOut;
495
496 // clean up
497 error:
498 if(doc) delete doc;
499 delete fileName;
500 if(globalParams) delete globalParams;
501
502 if(htmlFileName) delete htmlFileName;
503 HtmlFont::clear();
504
505 // check for memory leaks
506 Object::memCheck(stderr);
507 gMemReport(stderr);
508
509 return 0;
510 }
511
getInfoString(Dict * infoDict,char * key)512 static GooString* getInfoString(Dict *infoDict, char *key) {
513 Object obj;
514 GooString *s1 = NULL;
515
516 if (infoDict->lookup(key, &obj)->isString()) {
517 s1 = new GooString(obj.getString());
518 }
519 obj.free();
520 return s1;
521 }
522
getInfoDate(Dict * infoDict,char * key)523 static GooString* getInfoDate(Dict *infoDict, char *key) {
524 Object obj;
525 char *s;
526 int year, mon, day, hour, min, sec, tz_hour, tz_minute;
527 char tz;
528 struct tm tmStruct;
529 GooString *result = NULL;
530 char buf[256];
531
532 if (infoDict->lookup(key, &obj)->isString()) {
533 s = obj.getString()->getCString();
534 // TODO do something with the timezone info
535 if ( parseDateString( s, &year, &mon, &day, &hour, &min, &sec, &tz, &tz_hour, &tz_minute ) ) {
536 tmStruct.tm_year = year - 1900;
537 tmStruct.tm_mon = mon - 1;
538 tmStruct.tm_mday = day;
539 tmStruct.tm_hour = hour;
540 tmStruct.tm_min = min;
541 tmStruct.tm_sec = sec;
542 tmStruct.tm_wday = -1;
543 tmStruct.tm_yday = -1;
544 tmStruct.tm_isdst = -1;
545 mktime(&tmStruct); // compute the tm_wday and tm_yday fields
546 if (strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S+00:00", &tmStruct)) {
547 result = new GooString(buf);
548 } else {
549 result = new GooString(s);
550 }
551 } else {
552 result = new GooString(s);
553 }
554 }
555 obj.free();
556 return result;
557 }
558
559