1 //========================================================================
2 //
3 // pdftotext.cc
4 //
5 // Copyright 1997-2013 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 #include <aconf.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stddef.h>
13 #include <string.h>
14 #ifdef DEBUG_FP_LINUX
15 #  include <fenv.h>
16 #  include <fpu_control.h>
17 #endif
18 #include "gmem.h"
19 #include "gmempp.h"
20 #include "parseargs.h"
21 #include "GString.h"
22 #include "GList.h"
23 #include "GlobalParams.h"
24 #include "Object.h"
25 #include "Stream.h"
26 #include "Array.h"
27 #include "Dict.h"
28 #include "XRef.h"
29 #include "Catalog.h"
30 #include "Page.h"
31 #include "PDFDoc.h"
32 #include "TextOutputDev.h"
33 #include "CharTypes.h"
34 #include "UnicodeMap.h"
35 #include "TextString.h"
36 #include "Error.h"
37 #include "config.h"
38 
// Command-line option state.  Each variable below is referenced by one
// or more entries in argDesc[], which main() passes to parseArgs(); the
// initializers here are the defaults used when an option is not given.

// Page range to convert.  main() raises firstPage to 1 if it is lower,
// and treats a lastPage below 1 (or beyond the page count) as "the last
// page of the document".
static int firstPage = 1;
static int lastPage = 0;

// Layout mode flags.  main() tests them in this priority order:
// tableLayout, physLayout, simpleLayout, simple2Layout, linePrinter,
// rawOrder; if none is set, reading-order mode is used.
static GBool physLayout = gFalse;
static GBool simpleLayout = gFalse;
static GBool simple2Layout = gFalse;
static GBool tableLayout = gFalse;
static GBool linePrinter = gFalse;
static GBool rawOrder = gFalse;

// fixedPitch is forwarded to the text output control only in the table,
// physical-layout and line-printer modes; fixedLineSpacing only in
// line-printer mode.
static double fixedPitch = 0;
static double fixedLineSpacing = 0;

// Forwarded unconditionally to the text output control (clipText and
// discardDiagonalText respectively).
static GBool clipText = gFalse;
static GBool discardDiag = gFalse;

// Output encoding name and end-of-line convention.  An empty string
// means "not specified": main() only calls setTextEncoding() /
// setTextEOL() when the first character is non-zero.
static char textEncName[128] = "";
static char textEOL[16] = "";

// When set, main() calls setTextPageBreaks(gFalse) on the global params.
static GBool noPageBreaks = gFalse;
// Forwarded to the text output control's insertBOM field.
static GBool insertBOM = gFalse;

// Page margins, forwarded unchanged to the text output control.
static double marginLeft = 0;
static double marginRight = 0;
static double marginTop = 0;
static double marginBottom = 0;

// Passwords for encrypted files.  The initial "\001" is a sentinel for
// "no password supplied": main() passes NULL to PDFDoc unless the first
// byte differs from '\001' (so an explicitly empty password is still
// passed through as an empty string).
static char ownerPassword[33] = "\001";
static char userPassword[33] = "\001";

// When set, main() calls setErrQuiet() on the global params.
static GBool quiet = gFalse;
// Configuration file name handed to the GlobalParams constructor.
static char cfgFileName[256] = "";
// When set, main() prints the available text encodings and exits
// without converting anything.
static GBool listEncodings = gFalse;
// Either flag makes main() print the version/copyright banner and exit;
// the usage text is printed as well unless printVersion is set.
static GBool printVersion = gFalse;
static GBool printHelp = gFalse;
66 
// Command-line option table handed to parseArgs() and printUsage() in
// main().  As used here, each entry lists: the switch text, the argument
// kind (argInt / argFlag / argFP / argString), the variable that
// receives the value, a size field (sizeof the destination buffer for
// argString entries, 0 for everything else), and the usage string.
// The table ends with a {NULL} entry.
// NOTE(review): field meanings are inferred from how the entries are
// written; parseargs.h holds the authoritative ArgDesc definition.
static ArgDesc argDesc[] = {
  {"-f",       argInt,      &firstPage,     0,
   "first page to convert"},
  {"-l",       argInt,      &lastPage,      0,
   "last page to convert"},
  {"-layout",  argFlag,     &physLayout,    0,
   "maintain original physical layout"},
  {"-simple",  argFlag,     &simpleLayout,  0,
   "simple one-column page layout"},
  {"-simple2", argFlag,     &simple2Layout, 0,
   "simple one-column page layout, version 2"},
  {"-table",   argFlag,     &tableLayout,   0,
   "similar to -layout, but optimized for tables"},
  {"-lineprinter", argFlag, &linePrinter,   0,
   "use strict fixed-pitch/height layout"},
  {"-raw",     argFlag,     &rawOrder,      0,
   "keep strings in content stream order"},
  {"-fixed",   argFP,       &fixedPitch,    0,
   "assume fixed-pitch (or tabular) text"},
  {"-linespacing", argFP,   &fixedLineSpacing, 0,
   "fixed line spacing for LinePrinter mode"},
  {"-clip",    argFlag,     &clipText,      0,
   "separate clipped text"},
  {"-nodiag",  argFlag,     &discardDiag,   0,
   "discard diagonal text"},
  {"-enc",     argString,   textEncName,    sizeof(textEncName),
   "output text encoding name"},
  {"-eol",     argString,   textEOL,        sizeof(textEOL),
   "output end-of-line convention (unix, dos, or mac)"},
  {"-nopgbrk", argFlag,     &noPageBreaks,  0,
   "don't insert page breaks between pages"},
  {"-bom",     argFlag,     &insertBOM,     0,
   "insert a Unicode BOM at the start of the text file"},
  {"-marginl", argFP,       &marginLeft,    0,
   "left page margin"},
  {"-marginr", argFP,       &marginRight,   0,
   "right page margin"},
  {"-margint", argFP,       &marginTop,     0,
   "top page margin"},
  {"-marginb", argFP,       &marginBottom,  0,
   "bottom page margin"},
  {"-opw",     argString,   ownerPassword,  sizeof(ownerPassword),
   "owner password (for encrypted files)"},
  {"-upw",     argString,   userPassword,   sizeof(userPassword),
   "user password (for encrypted files)"},
  {"-q",       argFlag,     &quiet,         0,
   "don't print any messages or errors"},
  {"-cfg",     argString,   cfgFileName,    sizeof(cfgFileName),
   "configuration file to use in place of .xpdfrc"},
  {"-listencodings", argFlag, &listEncodings, 0,
   "list all available output text encodings"},
  {"-v",       argFlag,     &printVersion,  0,
   "print copyright and version info"},
  // four spellings of the help switch, all setting printHelp
  {"-h",       argFlag,     &printHelp,     0,
   "print usage information"},
  {"-help",    argFlag,     &printHelp,     0,
   "print usage information"},
  {"--help",   argFlag,     &printHelp,     0,
   "print usage information"},
  {"-?",       argFlag,     &printHelp,     0,
   "print usage information"},
  {NULL}
};
130 
main(int argc,char * argv[])131 int main(int argc, char *argv[]) {
132   PDFDoc *doc;
133   char *fileName;
134   GString *textFileName;
135   GString *ownerPW, *userPW;
136   TextOutputControl textOutControl;
137   TextOutputDev *textOut;
138   UnicodeMap *uMap;
139   GBool ok;
140   char *p;
141   int exitCode;
142 
143 #ifdef DEBUG_FP_LINUX
144   // enable exceptions on floating point div-by-zero
145   feenableexcept(FE_DIVBYZERO);
146   // force 64-bit rounding: this avoids changes in output when minor
147   // code changes result in spills of x87 registers; it also avoids
148   // differences in output with valgrind's 64-bit floating point
149   // emulation (yes, this is a kludge; but it's pretty much
150   // unavoidable given the x87 instruction set; see gcc bug 323 for
151   // more info)
152   fpu_control_t cw;
153   _FPU_GETCW(cw);
154   cw = (fpu_control_t)((cw & ~_FPU_EXTENDED) | _FPU_DOUBLE);
155   _FPU_SETCW(cw);
156 #endif
157 
158   exitCode = 99;
159 
160   // parse args
161   fixCommandLine(&argc, &argv);
162   ok = parseArgs(argDesc, &argc, argv);
163   if (ok && listEncodings) {
164     // list available encodings
165     globalParams = new GlobalParams(cfgFileName);
166     GList *encs = globalParams->getAvailableTextEncodings();
167     for (int i = 0; i < encs->getLength(); ++i) {
168       printf("%s\n", ((GString *)encs->get(i))->getCString());
169     }
170     deleteGList(encs, GString);
171     delete globalParams;
172     goto err0;
173   }
174   if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
175     fprintf(stderr, "pdftotext version %s [www.xpdfreader.com]\n", xpdfVersion);
176     fprintf(stderr, "%s\n", xpdfCopyright);
177     if (!printVersion) {
178       printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
179     }
180     goto err0;
181   }
182   fileName = argv[1];
183 
184   // read config file
185   globalParams = new GlobalParams(cfgFileName);
186   if (textEncName[0]) {
187     globalParams->setTextEncoding(textEncName);
188   }
189   if (textEOL[0]) {
190     if (!globalParams->setTextEOL(textEOL)) {
191       fprintf(stderr, "Bad '-eol' value on command line\n");
192     }
193   }
194   if (noPageBreaks) {
195     globalParams->setTextPageBreaks(gFalse);
196   }
197   if (quiet) {
198     globalParams->setErrQuiet(quiet);
199   }
200 
201   // get mapping to output encoding
202   if (!(uMap = globalParams->getTextEncoding())) {
203     error(errConfig, -1, "Couldn't get text encoding");
204     goto err1;
205   }
206 
207   // open PDF file
208   if (ownerPassword[0] != '\001') {
209     ownerPW = new GString(ownerPassword);
210   } else {
211     ownerPW = NULL;
212   }
213   if (userPassword[0] != '\001') {
214     userPW = new GString(userPassword);
215   } else {
216     userPW = NULL;
217   }
218   doc = new PDFDoc(fileName, ownerPW, userPW);
219   if (userPW) {
220     delete userPW;
221   }
222   if (ownerPW) {
223     delete ownerPW;
224   }
225   if (!doc->isOk()) {
226     exitCode = 1;
227     goto err2;
228   }
229 
230   // check for copy permission
231   if (!doc->okToCopy()) {
232     error(errNotAllowed, -1,
233 	  "Copying of text from this document is not allowed.");
234     exitCode = 3;
235     goto err2;
236   }
237 
238   // construct text file name
239   if (argc == 3) {
240     textFileName = new GString(argv[2]);
241   } else {
242     p = fileName + strlen(fileName) - 4;
243     if (strlen(fileName) > 4 && (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))) {
244       textFileName = new GString(fileName, (int)strlen(fileName) - 4);
245     } else {
246       textFileName = new GString(fileName);
247     }
248     textFileName->append(".txt");
249   }
250 
251   // get page range
252   if (firstPage < 1) {
253     firstPage = 1;
254   }
255   if (lastPage < 1 || lastPage > doc->getNumPages()) {
256     lastPage = doc->getNumPages();
257   }
258 
259   // write text file
260   if (tableLayout) {
261     textOutControl.mode = textOutTableLayout;
262     textOutControl.fixedPitch = fixedPitch;
263   } else if (physLayout) {
264     textOutControl.mode = textOutPhysLayout;
265     textOutControl.fixedPitch = fixedPitch;
266   } else if (simpleLayout) {
267     textOutControl.mode = textOutSimpleLayout;
268   } else if (simple2Layout) {
269     textOutControl.mode = textOutSimple2Layout;
270   } else if (linePrinter) {
271     textOutControl.mode = textOutLinePrinter;
272     textOutControl.fixedPitch = fixedPitch;
273     textOutControl.fixedLineSpacing = fixedLineSpacing;
274   } else if (rawOrder) {
275     textOutControl.mode = textOutRawOrder;
276   } else {
277     textOutControl.mode = textOutReadingOrder;
278   }
279   textOutControl.clipText = clipText;
280   textOutControl.discardDiagonalText = discardDiag;
281   textOutControl.insertBOM = insertBOM;
282   textOutControl.marginLeft = marginLeft;
283   textOutControl.marginRight = marginRight;
284   textOutControl.marginTop = marginTop;
285   textOutControl.marginBottom = marginBottom;
286   textOut = new TextOutputDev(textFileName->getCString(), &textOutControl,
287 			      gFalse, gTrue);
288   if (textOut->isOk()) {
289     doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0,
290 		      gFalse, gTrue, gFalse);
291   } else {
292     delete textOut;
293     exitCode = 2;
294     goto err3;
295   }
296   delete textOut;
297 
298   exitCode = 0;
299 
300   // clean up
301  err3:
302   delete textFileName;
303  err2:
304   delete doc;
305   uMap->decRefCnt();
306  err1:
307   delete globalParams;
308  err0:
309 
310   // check for memory leaks
311   Object::memCheck(stderr);
312   gMemReport(stderr);
313 
314   return exitCode;
315 }
316