1 //========================================================================
2 //
3 // pdftotext.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 // Modified for Debian by Hamish Moffatt, 22 May 2002.
8 //
9 //========================================================================
10
11 //========================================================================
12 //
13 // Modified under the Poppler project - http://poppler.freedesktop.org
14 //
15 // All changes made under the Poppler project to this file are licensed
16 // under GPL version 2 or later
17 //
18 // Copyright (C) 2006 Dominic Lachowicz <cinamod@hotmail.com>
19 // Copyright (C) 2007-2008, 2010, 2011 Albert Astals Cid <aacid@kde.org>
20 // Copyright (C) 2009 Jan Jockusch <jan@jockusch.de>
21 // Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
22 // Copyright (C) 2010 Kenneth Berland <ken@hero.com>
23 //
24 // To see a description of the changes please see the Changelog file that
25 // came with your tarball or type make ChangeLog if you are building from git
26 //
27 //========================================================================
28
29 #include "config.h"
30 #include <poppler-config.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <stddef.h>
34 #include <string.h>
35 #include "parseargs.h"
36 #include "printencodings.h"
37 #include "goo/GooString.h"
38 #include "goo/gmem.h"
39 #include "GlobalParams.h"
40 #include "Object.h"
41 #include "Stream.h"
42 #include "Array.h"
43 #include "Dict.h"
44 #include "XRef.h"
45 #include "Catalog.h"
46 #include "Page.h"
47 #include "PDFDoc.h"
48 #include "PDFDocFactory.h"
49 #include "TextOutputDev.h"
50 #include "CharTypes.h"
51 #include "UnicodeMap.h"
52 #include "Error.h"
53 #include <string>
54
55 static void printInfoString(FILE *f, Dict *infoDict, char *key,
56 char *text1, char *text2, UnicodeMap *uMap);
57 static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt);
58
59 static int firstPage = 1;
60 static int lastPage = 0;
61 static double resolution = 72.0;
62 static int x = 0;
63 static int y = 0;
64 static int w = 0;
65 static int h = 0;
66 static GBool bbox = gFalse;
67 static GBool physLayout = gFalse;
68 static GBool rawOrder = gFalse;
69 static GBool htmlMeta = gFalse;
70 static char textEncName[128] = "";
71 static char textEOL[16] = "";
72 static GBool noPageBreaks = gFalse;
73 static char ownerPassword[33] = "\001";
74 static char userPassword[33] = "\001";
75 static GBool quiet = gFalse;
76 static GBool printVersion = gFalse;
77 static GBool printHelp = gFalse;
78 static GBool printEnc = gFalse;
79
80 static const ArgDesc argDesc[] = {
81 {"-f", argInt, &firstPage, 0,
82 "first page to convert"},
83 {"-l", argInt, &lastPage, 0,
84 "last page to convert"},
85 {"-r", argFP, &resolution, 0,
86 "resolution, in DPI (default is 72)"},
87 {"-x", argInt, &x, 0,
88 "x-coordinate of the crop area top left corner"},
89 {"-y", argInt, &y, 0,
90 "y-coordinate of the crop area top left corner"},
91 {"-W", argInt, &w, 0,
92 "width of crop area in pixels (default is 0)"},
93 {"-H", argInt, &h, 0,
94 "height of crop area in pixels (default is 0)"},
95 {"-layout", argFlag, &physLayout, 0,
96 "maintain original physical layout"},
97 {"-raw", argFlag, &rawOrder, 0,
98 "keep strings in content stream order"},
99 {"-htmlmeta", argFlag, &htmlMeta, 0,
100 "generate a simple HTML file, including the meta information"},
101 {"-enc", argString, textEncName, sizeof(textEncName),
102 "output text encoding name"},
103 {"-listenc",argFlag, &printEnc, 0,
104 "list available encodings"},
105 {"-eol", argString, textEOL, sizeof(textEOL),
106 "output end-of-line convention (unix, dos, or mac)"},
107 {"-nopgbrk", argFlag, &noPageBreaks, 0,
108 "don't insert page breaks between pages"},
109 {"-bbox", argFlag, &bbox, 0,
110 "output bounding box for each word and page size to html. Sets -htmlmeta"},
111 {"-opw", argString, ownerPassword, sizeof(ownerPassword),
112 "owner password (for encrypted files)"},
113 {"-upw", argString, userPassword, sizeof(userPassword),
114 "user password (for encrypted files)"},
115 {"-q", argFlag, &quiet, 0,
116 "don't print any messages or errors"},
117 {"-v", argFlag, &printVersion, 0,
118 "print copyright and version info"},
119 {"-h", argFlag, &printHelp, 0,
120 "print usage information"},
121 {"-help", argFlag, &printHelp, 0,
122 "print usage information"},
123 {"--help", argFlag, &printHelp, 0,
124 "print usage information"},
125 {"-?", argFlag, &printHelp, 0,
126 "print usage information"},
127 {NULL}
128 };
129
// Returns a copy of inString in which every occurrence of oldToken has
// been replaced by newToken.
//
// The scan proceeds left to right and resumes right after each inserted
// newToken, so text introduced by a replacement is never matched again
// (replacing "a" with "aa" terminates and doubles each 'a').
//
// An empty oldToken matches at every position and would never terminate;
// in that case the input is returned unchanged.
static std::string myStringReplace(const std::string &inString, const std::string &oldToken, const std::string &newToken) {
  std::string result = inString;
  if (oldToken.empty()) {
    return result;
  }

  // Keep the offset in the string's own size type: storing it in an int
  // would truncate for very large inputs.
  std::string::size_type pos = 0;
  while ((pos = result.find(oldToken, pos)) != std::string::npos) {
    result.replace(pos, oldToken.length(), newToken);
    pos += newToken.length();
  }
  return result;
}
143
myXmlTokenReplace(const char * inString)144 static std::string myXmlTokenReplace(const char *inString){
145 std::string myString(inString);
146 myString = myStringReplace(myString, "&", "&" );
147 myString = myStringReplace(myString, "'", "'" );
148 myString = myStringReplace(myString, "\"", """ );
149 myString = myStringReplace(myString, "<", "<" );
150 myString = myStringReplace(myString, ">", ">" );
151 return myString;
152 }
153
main(int argc,char * argv[])154 int main(int argc, char *argv[]) {
155 PDFDoc *doc;
156 GooString *fileName;
157 GooString *textFileName;
158 GooString *ownerPW, *userPW;
159 TextOutputDev *textOut;
160 FILE *f;
161 UnicodeMap *uMap;
162 Object info;
163 GBool ok;
164 char *p;
165 int exitCode;
166
167 exitCode = 99;
168
169 // parse args
170 ok = parseArgs(argDesc, &argc, argv);
171 if (bbox) {
172 htmlMeta = gTrue;
173 }
174 if (!ok || (argc < 2 && !printEnc) || argc > 3 || printVersion || printHelp) {
175 fprintf(stderr, "pdftotext version %s\n", PACKAGE_VERSION);
176 fprintf(stderr, "%s\n", popplerCopyright);
177 fprintf(stderr, "%s\n", xpdfCopyright);
178 if (!printVersion) {
179 printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
180 }
181 if (printVersion || printHelp)
182 exitCode = 0;
183 goto err0;
184 }
185
186 // read config file
187 globalParams = new GlobalParams();
188
189 if (printEnc) {
190 printEncodings();
191 delete globalParams;
192 exitCode = 0;
193 goto err0;
194 }
195
196 fileName = new GooString(argv[1]);
197
198 if (textEncName[0]) {
199 globalParams->setTextEncoding(textEncName);
200 }
201 if (textEOL[0]) {
202 if (!globalParams->setTextEOL(textEOL)) {
203 fprintf(stderr, "Bad '-eol' value on command line\n");
204 }
205 }
206 if (noPageBreaks) {
207 globalParams->setTextPageBreaks(gFalse);
208 }
209 if (quiet) {
210 globalParams->setErrQuiet(quiet);
211 }
212
213 // get mapping to output encoding
214 if (!(uMap = globalParams->getTextEncoding())) {
215 error(-1, "Couldn't get text encoding");
216 delete fileName;
217 goto err1;
218 }
219
220 // open PDF file
221 if (ownerPassword[0] != '\001') {
222 ownerPW = new GooString(ownerPassword);
223 } else {
224 ownerPW = NULL;
225 }
226 if (userPassword[0] != '\001') {
227 userPW = new GooString(userPassword);
228 } else {
229 userPW = NULL;
230 }
231
232 if (fileName->cmp("-") == 0) {
233 delete fileName;
234 fileName = new GooString("fd://0");
235 }
236
237 doc = PDFDocFactory().createPDFDoc(*fileName, ownerPW, userPW);
238
239 if (userPW) {
240 delete userPW;
241 }
242 if (ownerPW) {
243 delete ownerPW;
244 }
245 if (!doc->isOk()) {
246 exitCode = 1;
247 goto err2;
248 }
249
250 #ifdef ENFORCE_PERMISSIONS
251 // check for copy permission
252 if (!doc->okToCopy()) {
253 error(-1, "Copying of text from this document is not allowed.");
254 exitCode = 3;
255 goto err2;
256 }
257 #endif
258
259 // construct text file name
260 if (argc == 3) {
261 textFileName = new GooString(argv[2]);
262 } else if (fileName->cmp("fd://0") == 0) {
263 error(-1, "You have to provide an output filename when reading form stdin.");
264 goto err2;
265 } else {
266 p = fileName->getCString() + fileName->getLength() - 4;
267 if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
268 textFileName = new GooString(fileName->getCString(),
269 fileName->getLength() - 4);
270 } else {
271 textFileName = fileName->copy();
272 }
273 textFileName->append(htmlMeta ? ".html" : ".txt");
274 }
275
276 // get page range
277 if (firstPage < 1) {
278 firstPage = 1;
279 }
280 if (lastPage < 1 || lastPage > doc->getNumPages()) {
281 lastPage = doc->getNumPages();
282 }
283
284 // write HTML header
285 if (htmlMeta) {
286 if (!textFileName->cmp("-")) {
287 f = stdout;
288 } else {
289 if (!(f = fopen(textFileName->getCString(), "wb"))) {
290 error(-1, "Couldn't open text file '%s'", textFileName->getCString());
291 exitCode = 2;
292 goto err3;
293 }
294 }
295 fputs("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">", f);
296 fputs("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n", f);
297 fputs("<head>\n", f);
298 doc->getDocInfo(&info);
299 if (info.isDict()) {
300 Object obj;
301 if (info.getDict()->lookup("Title", &obj)->isString()) {
302 printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n", uMap);
303 } else {
304 fputs("<title></title>\n", f);
305 }
306 obj.free();
307 printInfoString(f, info.getDict(), "Subject",
308 "<meta name=\"Subject\" content=\"", "\"/>\n", uMap);
309 printInfoString(f, info.getDict(), "Keywords",
310 "<meta name=\"Keywords\" content=\"", "\"/>\n", uMap);
311 printInfoString(f, info.getDict(), "Author",
312 "<meta name=\"Author\" content=\"", "\"/>\n", uMap);
313 printInfoString(f, info.getDict(), "Creator",
314 "<meta name=\"Creator\" content=\"", "\"/>\n", uMap);
315 printInfoString(f, info.getDict(), "Producer",
316 "<meta name=\"Producer\" content=\"", "\"/>\n", uMap);
317 printInfoDate(f, info.getDict(), "CreationDate",
318 "<meta name=\"CreationDate\" content=\"\"/>\n");
319 printInfoDate(f, info.getDict(), "LastModifiedDate",
320 "<meta name=\"ModDate\" content=\"\"/>\n");
321 }
322 info.free();
323 fputs("</head>\n", f);
324 fputs("<body>\n", f);
325 if (!bbox) fputs("<pre>\n", f);
326 if (f != stdout) {
327 fclose(f);
328 }
329 }
330
331 // write text file
332 if (bbox) {
333 textOut = new TextOutputDev(NULL, physLayout, rawOrder, htmlMeta);
334 if (!(f = fopen(textFileName->getCString(), "ab"))) {
335 error(-1, "Couldn't open text file '%s' for append", textFileName->getCString());
336 exitCode = 2;
337 delete textOut;
338 goto err3;
339 }
340
341 if (textOut->isOk()) {
342 fprintf(f, "<doc>\n");
343 for (int page = firstPage; page <= lastPage; ++page) {
344 fprintf(f, " <page width=\"%f\" height=\"%f\">\n",doc->getPageCropWidth(page), doc->getPageCropHeight(page));
345 doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse);
346 TextWordList *wordlist = textOut->makeWordList();
347 const int word_length = wordlist != NULL ? wordlist->getLength() : 0;
348 TextWord *word;
349 double xMinA, yMinA, xMaxA, yMaxA;
350 if (word_length == 0)
351 fprintf(stderr, "no word list\n");
352
353 for (int i = 0; i < word_length; ++i) {
354 word = wordlist->get(i);
355 word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
356 const std::string myString = myXmlTokenReplace(word->getText()->getCString());
357 fprintf(f," <word xMin=\"%f\" yMin=\"%f\" xMax=\"%f\" yMax=\"%f\">%s</word>\n", xMinA, yMinA, xMaxA, yMaxA, myString.c_str());
358 }
359 fprintf(f, " </page>\n");
360 delete wordlist;
361 }
362 fprintf(f, "</doc>\n");
363 }
364 fclose(f);
365 } else {
366 textOut = new TextOutputDev(textFileName->getCString(),
367 physLayout, rawOrder, htmlMeta);
368 if (textOut->isOk()) {
369 if ((w==0) && (h==0) && (x==0) && (y==0)) {
370 doc->displayPages(textOut, firstPage, lastPage, resolution, resolution, 0,
371 gTrue, gFalse, gFalse);
372 } else {
373
374 for (int page = firstPage; page <= lastPage; ++page) {
375 doc->displayPageSlice(textOut, page, resolution, resolution, 0,
376 gTrue, gFalse, gFalse,
377 x, y, w, h);
378 }
379 }
380
381 } else {
382 delete textOut;
383 exitCode = 2;
384 goto err3;
385 }
386 }
387 delete textOut;
388
389 // write end of HTML file
390 if (htmlMeta) {
391 if (!textFileName->cmp("-")) {
392 f = stdout;
393 } else {
394 if (!(f = fopen(textFileName->getCString(), "ab"))) {
395 error(-1, "Couldn't open text file '%s'", textFileName->getCString());
396 exitCode = 2;
397 goto err3;
398 }
399 }
400 if (!bbox) fputs("</pre>\n", f);
401 fputs("</body>\n", f);
402 fputs("</html>\n", f);
403 if (f != stdout) {
404 fclose(f);
405 }
406 }
407
408 exitCode = 0;
409
410 // clean up
411 err3:
412 delete textFileName;
413 err2:
414 delete doc;
415 delete fileName;
416 uMap->decRefCnt();
417 err1:
418 delete globalParams;
419 err0:
420
421 // check for memory leaks
422 Object::memCheck(stderr);
423 gMemReport(stderr);
424
425 return exitCode;
426 }
427
printInfoString(FILE * f,Dict * infoDict,char * key,char * text1,char * text2,UnicodeMap * uMap)428 static void printInfoString(FILE *f, Dict *infoDict, char *key,
429 char *text1, char *text2, UnicodeMap *uMap) {
430 Object obj;
431 GooString *s1;
432 GBool isUnicode;
433 Unicode u;
434 char buf[8];
435 int i, n;
436
437 if (infoDict->lookup(key, &obj)->isString()) {
438 fputs(text1, f);
439 s1 = obj.getString();
440 if ((s1->getChar(0) & 0xff) == 0xfe &&
441 (s1->getChar(1) & 0xff) == 0xff) {
442 isUnicode = gTrue;
443 i = 2;
444 } else {
445 isUnicode = gFalse;
446 i = 0;
447 }
448 while (i < obj.getString()->getLength()) {
449 if (isUnicode) {
450 u = ((s1->getChar(i) & 0xff) << 8) |
451 (s1->getChar(i+1) & 0xff);
452 i += 2;
453 } else {
454 u = s1->getChar(i) & 0xff;
455 ++i;
456 }
457 n = uMap->mapUnicode(u, buf, sizeof(buf));
458 fwrite(buf, 1, n, f);
459 }
460 fputs(text2, f);
461 }
462 obj.free();
463 }
464
printInfoDate(FILE * f,Dict * infoDict,char * key,char * fmt)465 static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt) {
466 Object obj;
467 char *s;
468
469 if (infoDict->lookup(key, &obj)->isString()) {
470 s = obj.getString()->getCString();
471 if (s[0] == 'D' && s[1] == ':') {
472 s += 2;
473 }
474 fprintf(f, fmt, s);
475 }
476 obj.free();
477 }
478