1 //========================================================================
2 //
3 // pdftotext.cc
4 //
5 // Copyright 1997-2013 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 #include <aconf.h>
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stddef.h>
13 #include <string.h>
14 #ifdef DEBUG_FP_LINUX
15 # include <fenv.h>
16 # include <fpu_control.h>
17 #endif
18 #include "gmem.h"
19 #include "gmempp.h"
20 #include "parseargs.h"
21 #include "GString.h"
22 #include "GList.h"
23 #include "GlobalParams.h"
24 #include "Object.h"
25 #include "Stream.h"
26 #include "Array.h"
27 #include "Dict.h"
28 #include "XRef.h"
29 #include "Catalog.h"
30 #include "Page.h"
31 #include "PDFDoc.h"
32 #include "TextOutputDev.h"
33 #include "CharTypes.h"
34 #include "UnicodeMap.h"
35 #include "TextString.h"
36 #include "Error.h"
37 #include "config.h"
38
39 static int firstPage = 1;
40 static int lastPage = 0;
41 static GBool physLayout = gFalse;
42 static GBool simpleLayout = gFalse;
43 static GBool simple2Layout = gFalse;
44 static GBool tableLayout = gFalse;
45 static GBool linePrinter = gFalse;
46 static GBool rawOrder = gFalse;
47 static double fixedPitch = 0;
48 static double fixedLineSpacing = 0;
49 static GBool clipText = gFalse;
50 static GBool discardDiag = gFalse;
51 static char textEncName[128] = "";
52 static char textEOL[16] = "";
53 static GBool noPageBreaks = gFalse;
54 static GBool insertBOM = gFalse;
55 static double marginLeft = 0;
56 static double marginRight = 0;
57 static double marginTop = 0;
58 static double marginBottom = 0;
59 static char ownerPassword[33] = "\001";
60 static char userPassword[33] = "\001";
61 static GBool quiet = gFalse;
62 static char cfgFileName[256] = "";
63 static GBool listEncodings = gFalse;
64 static GBool printVersion = gFalse;
65 static GBool printHelp = gFalse;
66
67 static ArgDesc argDesc[] = {
68 {"-f", argInt, &firstPage, 0,
69 "first page to convert"},
70 {"-l", argInt, &lastPage, 0,
71 "last page to convert"},
72 {"-layout", argFlag, &physLayout, 0,
73 "maintain original physical layout"},
74 {"-simple", argFlag, &simpleLayout, 0,
75 "simple one-column page layout"},
76 {"-simple2", argFlag, &simple2Layout, 0,
77 "simple one-column page layout, version 2"},
78 {"-table", argFlag, &tableLayout, 0,
79 "similar to -layout, but optimized for tables"},
80 {"-lineprinter", argFlag, &linePrinter, 0,
81 "use strict fixed-pitch/height layout"},
82 {"-raw", argFlag, &rawOrder, 0,
83 "keep strings in content stream order"},
84 {"-fixed", argFP, &fixedPitch, 0,
85 "assume fixed-pitch (or tabular) text"},
86 {"-linespacing", argFP, &fixedLineSpacing, 0,
87 "fixed line spacing for LinePrinter mode"},
88 {"-clip", argFlag, &clipText, 0,
89 "separate clipped text"},
90 {"-nodiag", argFlag, &discardDiag, 0,
91 "discard diagonal text"},
92 {"-enc", argString, textEncName, sizeof(textEncName),
93 "output text encoding name"},
94 {"-eol", argString, textEOL, sizeof(textEOL),
95 "output end-of-line convention (unix, dos, or mac)"},
96 {"-nopgbrk", argFlag, &noPageBreaks, 0,
97 "don't insert page breaks between pages"},
98 {"-bom", argFlag, &insertBOM, 0,
99 "insert a Unicode BOM at the start of the text file"},
100 {"-marginl", argFP, &marginLeft, 0,
101 "left page margin"},
102 {"-marginr", argFP, &marginRight, 0,
103 "right page margin"},
104 {"-margint", argFP, &marginTop, 0,
105 "top page margin"},
106 {"-marginb", argFP, &marginBottom, 0,
107 "bottom page margin"},
108 {"-opw", argString, ownerPassword, sizeof(ownerPassword),
109 "owner password (for encrypted files)"},
110 {"-upw", argString, userPassword, sizeof(userPassword),
111 "user password (for encrypted files)"},
112 {"-q", argFlag, &quiet, 0,
113 "don't print any messages or errors"},
114 {"-cfg", argString, cfgFileName, sizeof(cfgFileName),
115 "configuration file to use in place of .xpdfrc"},
116 {"-listencodings", argFlag, &listEncodings, 0,
117 "list all available output text encodings"},
118 {"-v", argFlag, &printVersion, 0,
119 "print copyright and version info"},
120 {"-h", argFlag, &printHelp, 0,
121 "print usage information"},
122 {"-help", argFlag, &printHelp, 0,
123 "print usage information"},
124 {"--help", argFlag, &printHelp, 0,
125 "print usage information"},
126 {"-?", argFlag, &printHelp, 0,
127 "print usage information"},
128 {NULL}
129 };
130
main(int argc,char * argv[])131 int main(int argc, char *argv[]) {
132 PDFDoc *doc;
133 char *fileName;
134 GString *textFileName;
135 GString *ownerPW, *userPW;
136 TextOutputControl textOutControl;
137 TextOutputDev *textOut;
138 UnicodeMap *uMap;
139 GBool ok;
140 char *p;
141 int exitCode;
142
143 #ifdef DEBUG_FP_LINUX
144 // enable exceptions on floating point div-by-zero
145 feenableexcept(FE_DIVBYZERO);
146 // force 64-bit rounding: this avoids changes in output when minor
147 // code changes result in spills of x87 registers; it also avoids
148 // differences in output with valgrind's 64-bit floating point
149 // emulation (yes, this is a kludge; but it's pretty much
150 // unavoidable given the x87 instruction set; see gcc bug 323 for
151 // more info)
152 fpu_control_t cw;
153 _FPU_GETCW(cw);
154 cw = (fpu_control_t)((cw & ~_FPU_EXTENDED) | _FPU_DOUBLE);
155 _FPU_SETCW(cw);
156 #endif
157
158 exitCode = 99;
159
160 // parse args
161 fixCommandLine(&argc, &argv);
162 ok = parseArgs(argDesc, &argc, argv);
163 if (ok && listEncodings) {
164 // list available encodings
165 globalParams = new GlobalParams(cfgFileName);
166 GList *encs = globalParams->getAvailableTextEncodings();
167 for (int i = 0; i < encs->getLength(); ++i) {
168 printf("%s\n", ((GString *)encs->get(i))->getCString());
169 }
170 deleteGList(encs, GString);
171 delete globalParams;
172 goto err0;
173 }
174 if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
175 fprintf(stderr, "pdftotext version %s [www.xpdfreader.com]\n", xpdfVersion);
176 fprintf(stderr, "%s\n", xpdfCopyright);
177 if (!printVersion) {
178 printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
179 }
180 goto err0;
181 }
182 fileName = argv[1];
183
184 // read config file
185 globalParams = new GlobalParams(cfgFileName);
186 if (textEncName[0]) {
187 globalParams->setTextEncoding(textEncName);
188 }
189 if (textEOL[0]) {
190 if (!globalParams->setTextEOL(textEOL)) {
191 fprintf(stderr, "Bad '-eol' value on command line\n");
192 }
193 }
194 if (noPageBreaks) {
195 globalParams->setTextPageBreaks(gFalse);
196 }
197 if (quiet) {
198 globalParams->setErrQuiet(quiet);
199 }
200
201 // get mapping to output encoding
202 if (!(uMap = globalParams->getTextEncoding())) {
203 error(errConfig, -1, "Couldn't get text encoding");
204 goto err1;
205 }
206
207 // open PDF file
208 if (ownerPassword[0] != '\001') {
209 ownerPW = new GString(ownerPassword);
210 } else {
211 ownerPW = NULL;
212 }
213 if (userPassword[0] != '\001') {
214 userPW = new GString(userPassword);
215 } else {
216 userPW = NULL;
217 }
218 doc = new PDFDoc(fileName, ownerPW, userPW);
219 if (userPW) {
220 delete userPW;
221 }
222 if (ownerPW) {
223 delete ownerPW;
224 }
225 if (!doc->isOk()) {
226 exitCode = 1;
227 goto err2;
228 }
229
230 // check for copy permission
231 if (!doc->okToCopy()) {
232 error(errNotAllowed, -1,
233 "Copying of text from this document is not allowed.");
234 exitCode = 3;
235 goto err2;
236 }
237
238 // construct text file name
239 if (argc == 3) {
240 textFileName = new GString(argv[2]);
241 } else {
242 p = fileName + strlen(fileName) - 4;
243 if (strlen(fileName) > 4 && (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))) {
244 textFileName = new GString(fileName, (int)strlen(fileName) - 4);
245 } else {
246 textFileName = new GString(fileName);
247 }
248 textFileName->append(".txt");
249 }
250
251 // get page range
252 if (firstPage < 1) {
253 firstPage = 1;
254 }
255 if (lastPage < 1 || lastPage > doc->getNumPages()) {
256 lastPage = doc->getNumPages();
257 }
258
259 // write text file
260 if (tableLayout) {
261 textOutControl.mode = textOutTableLayout;
262 textOutControl.fixedPitch = fixedPitch;
263 } else if (physLayout) {
264 textOutControl.mode = textOutPhysLayout;
265 textOutControl.fixedPitch = fixedPitch;
266 } else if (simpleLayout) {
267 textOutControl.mode = textOutSimpleLayout;
268 } else if (simple2Layout) {
269 textOutControl.mode = textOutSimple2Layout;
270 } else if (linePrinter) {
271 textOutControl.mode = textOutLinePrinter;
272 textOutControl.fixedPitch = fixedPitch;
273 textOutControl.fixedLineSpacing = fixedLineSpacing;
274 } else if (rawOrder) {
275 textOutControl.mode = textOutRawOrder;
276 } else {
277 textOutControl.mode = textOutReadingOrder;
278 }
279 textOutControl.clipText = clipText;
280 textOutControl.discardDiagonalText = discardDiag;
281 textOutControl.insertBOM = insertBOM;
282 textOutControl.marginLeft = marginLeft;
283 textOutControl.marginRight = marginRight;
284 textOutControl.marginTop = marginTop;
285 textOutControl.marginBottom = marginBottom;
286 textOut = new TextOutputDev(textFileName->getCString(), &textOutControl,
287 gFalse, gTrue);
288 if (textOut->isOk()) {
289 doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0,
290 gFalse, gTrue, gFalse);
291 } else {
292 delete textOut;
293 exitCode = 2;
294 goto err3;
295 }
296 delete textOut;
297
298 exitCode = 0;
299
300 // clean up
301 err3:
302 delete textFileName;
303 err2:
304 delete doc;
305 uMap->decRefCnt();
306 err1:
307 delete globalParams;
308 err0:
309
310 // check for memory leaks
311 Object::memCheck(stderr);
312 gMemReport(stderr);
313
314 return exitCode;
315 }
316