1 //========================================================================
2 //
3 // pdftohtml.cc
4 //
5 // Copyright 2005 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 #include <aconf.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include "gmem.h"
13 #include "gmempp.h"
14 #include "parseargs.h"
15 #include "gfile.h"
16 #include "GString.h"
17 #include "GlobalParams.h"
18 #include "PDFDoc.h"
19 #include "HTMLGen.h"
20 #include "Error.h"
21 #include "ErrorCodes.h"
22 #include "config.h"
23
24 //------------------------------------------------------------------------
25
26 static GBool createIndex(char *htmlDir);
27
28 //------------------------------------------------------------------------
29
30 static int firstPage = 1;
31 static int lastPage = 0;
32 static double zoom = 1;
33 static int resolution = 150;
34 static GBool noFonts = gFalse;
35 static GBool skipInvisible = gFalse;
36 static GBool allInvisible = gFalse;
37 static char ownerPassword[33] = "\001";
38 static char userPassword[33] = "\001";
39 static GBool quiet = gFalse;
40 static char cfgFileName[256] = "";
41 static GBool printVersion = gFalse;
42 static GBool printHelp = gFalse;
43
44 static ArgDesc argDesc[] = {
45 {"-f", argInt, &firstPage, 0,
46 "first page to convert"},
47 {"-l", argInt, &lastPage, 0,
48 "last page to convert"},
49 {"-z", argFP, &zoom, 0,
50 "initial zoom level (1.0 means 72dpi)"},
51 {"-r", argInt, &resolution, 0,
52 "resolution, in DPI (default is 150)"},
53 {"-nofonts", argFlag, &noFonts, 0,
54 "do not extract embedded fonts"},
55 {"-skipinvisible", argFlag, &skipInvisible, 0,
56 "do not draw invisible text"},
57 {"-allinvisible", argFlag, &allInvisible, 0,
58 "treat all text as invisible"},
59 {"-opw", argString, ownerPassword, sizeof(ownerPassword),
60 "owner password (for encrypted files)"},
61 {"-upw", argString, userPassword, sizeof(userPassword),
62 "user password (for encrypted files)"},
63 {"-q", argFlag, &quiet, 0,
64 "don't print any messages or errors"},
65 {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
66 "configuration file to use in place of .xpdfrc"},
67 {"-v", argFlag, &printVersion, 0,
68 "print copyright and version info"},
69 {"-h", argFlag, &printHelp, 0,
70 "print usage information"},
71 {"-help", argFlag, &printHelp, 0,
72 "print usage information"},
73 {"--help", argFlag, &printHelp, 0,
74 "print usage information"},
75 {"-?", argFlag, &printHelp, 0,
76 "print usage information"},
77 {NULL}
78 };
79
80 //------------------------------------------------------------------------
81
// Output callback handed to HTMLGen::convertPage(): treats the opaque
// `file` handle as a stdio FILE* and appends `size` bytes from `data` to
// it.  Returns the number of bytes fwrite() reports as written.
static int writeToFile(void *file, const char *data, int size) {
  FILE *out = (FILE *)file;
  size_t nWritten = fwrite(data, 1, size, out);
  return (int)nWritten;
}
85
main(int argc,char * argv[])86 int main(int argc, char *argv[]) {
87 PDFDoc *doc;
88 char *fileName;
89 char *htmlDir;
90 GString *ownerPW, *userPW;
91 HTMLGen *htmlGen;
92 GString *htmlFileName, *pngFileName, *pngURL;
93 FILE *htmlFile, *pngFile;
94 int pg, err, exitCode;
95 GBool ok;
96
97 exitCode = 99;
98
99 // parse args
100 fixCommandLine(&argc, &argv);
101 ok = parseArgs(argDesc, &argc, argv);
102 if (!ok || argc != 3 || printVersion || printHelp) {
103 fprintf(stderr, "pdftohtml version %s [www.xpdfreader.com]\n", xpdfVersion);
104 fprintf(stderr, "%s\n", xpdfCopyright);
105 if (!printVersion) {
106 printUsage("pdftohtml", "<PDF-file> <html-dir>", argDesc);
107 }
108 goto err0;
109 }
110 fileName = argv[1];
111 htmlDir = argv[2];
112
113 // read config file
114 globalParams = new GlobalParams(cfgFileName);
115 if (quiet) {
116 globalParams->setErrQuiet(quiet);
117 }
118 globalParams->setupBaseFonts(NULL);
119 globalParams->setTextEncoding("UTF-8");
120
121 // open PDF file
122 if (ownerPassword[0] != '\001') {
123 ownerPW = new GString(ownerPassword);
124 } else {
125 ownerPW = NULL;
126 }
127 if (userPassword[0] != '\001') {
128 userPW = new GString(userPassword);
129 } else {
130 userPW = NULL;
131 }
132 doc = new PDFDoc(fileName, ownerPW, userPW);
133 if (userPW) {
134 delete userPW;
135 }
136 if (ownerPW) {
137 delete ownerPW;
138 }
139 if (!doc->isOk()) {
140 exitCode = 1;
141 goto err1;
142 }
143
144 // check for copy permission
145 if (!doc->okToCopy()) {
146 error(errNotAllowed, -1,
147 "Copying of text from this document is not allowed.");
148 exitCode = 3;
149 goto err1;
150 }
151
152 // get page range
153 if (firstPage < 1) {
154 firstPage = 1;
155 }
156 if (lastPage < 1 || lastPage > doc->getNumPages()) {
157 lastPage = doc->getNumPages();
158 }
159
160 // create HTML directory
161 if (makeDir(htmlDir, 0755)) {
162 error(errIO, -1, "Couldn't create HTML output directory '{0:s}'",
163 htmlDir);
164 exitCode = 2;
165 goto err1;
166 }
167
168 // set up the HTMLGen object
169 htmlGen = new HTMLGen(resolution);
170 if (!htmlGen->isOk()) {
171 exitCode = 99;
172 goto err1;
173 }
174 htmlGen->setZoom(zoom);
175 htmlGen->setDrawInvisibleText(!skipInvisible);
176 htmlGen->setAllTextInvisible(allInvisible);
177 htmlGen->setExtractFontFiles(!noFonts);
178 htmlGen->startDoc(doc);
179
180 // convert the pages
181 for (pg = firstPage; pg <= lastPage; ++pg) {
182 htmlFileName = GString::format("{0:s}/page{1:d}.html", htmlDir, pg);
183 pngFileName = GString::format("{0:s}/page{1:d}.png", htmlDir, pg);
184 if (!(htmlFile = openFile(htmlFileName->getCString(), "wb"))) {
185 error(errIO, -1, "Couldn't open HTML file '{0:t}'", htmlFileName);
186 delete htmlFileName;
187 delete pngFileName;
188 goto err2;
189 }
190 if (!(pngFile = openFile(pngFileName->getCString(), "wb"))) {
191 error(errIO, -1, "Couldn't open PNG file '{0:t}'", pngFileName);
192 fclose(htmlFile);
193 delete htmlFileName;
194 delete pngFileName;
195 goto err2;
196 }
197 pngURL = GString::format("page{0:d}.png", pg);
198 err = htmlGen->convertPage(pg, pngURL->getCString(), htmlDir,
199 &writeToFile, htmlFile,
200 &writeToFile, pngFile);
201 delete pngURL;
202 fclose(htmlFile);
203 fclose(pngFile);
204 delete htmlFileName;
205 delete pngFileName;
206 if (err != errNone) {
207 error(errIO, -1, "Error converting page {0:d}", pg);
208 exitCode = 2;
209 goto err2;
210 }
211 }
212
213 // create the master index
214 if (!createIndex(htmlDir)) {
215 exitCode = 2;
216 goto err2;
217 }
218
219 exitCode = 0;
220
221 // clean up
222 err2:
223 delete htmlGen;
224 err1:
225 delete doc;
226 delete globalParams;
227 err0:
228
229 // check for memory leaks
230 Object::memCheck(stderr);
231 gMemReport(stderr);
232
233 return exitCode;
234 }
235
createIndex(char * htmlDir)236 static GBool createIndex(char *htmlDir) {
237 GString *htmlFileName;
238 FILE *html;
239 int pg;
240
241 htmlFileName = GString::format("{0:s}/index.html", htmlDir);
242 html = openFile(htmlFileName->getCString(), "w");
243 delete htmlFileName;
244 if (!html) {
245 error(errIO, -1, "Couldn't open HTML file '{0:t}'", htmlFileName);
246 return gFalse;
247 }
248
249 fprintf(html, "<html>\n");
250 fprintf(html, "<body>\n");
251 for (pg = firstPage; pg <= lastPage; ++pg) {
252 fprintf(html, "<a href=\"page%d.html\">page %d</a><br>\n", pg, pg);
253 }
254 fprintf(html, "</body>\n");
255 fprintf(html, "</html>\n");
256
257 fclose(html);
258
259 return gTrue;
260 }
261