1 /* -*- Mode: Java; tab-width: 4; c-basic-offset: 4 -*- */
2 /*
3  * $Id: PRTokeniser.java,v 1.15 2002/06/20 13:30:25 blowagie Exp $
4  * $Name:  $
5  *
6  * Copyright 2001, 2002 by Paulo Soares.
7  *
8  *
9  * The Original Code is 'iText, a free JAVA-PDF library'.
10  *
11  * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
12  * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
13  * All Rights Reserved.
14  * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
15  * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
16  *
17  * Contributor(s): all the names of the contributors are added in the source code
18  * where applicable.
19  *
20  *
21  * This library is free software; you can redistribute it and/or
22  * modify it under the terms of the GNU Library General Public
23  * License as published by the Free Software Foundation; either
24  * version 2 of the License, or (at your option) any later version.
25  *
26  * This library is distributed in the hope that it will be useful,
27  * but WITHOUT ANY WARRANTY; without even the implied warranty of
28  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
29  * Library General Public License for more details.
30  *
31  * You should have received a copy of the GNU Library General Public
32  * License along with this library; if not, write to the
33  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
34  * Boston, MA  02110-1301, USA.
35  *
36  *
37  * This library is free software; you can redistribute it and/or
38  * modify it under the terms of the GNU Library General Public
39  * License as published by the Free Software Foundation; either
40  * version 2 of the License, or (at your option) any later version.
41  *
42  * This library is distributed in the hope that it will be useful,
43  * but WITHOUT ANY WARRANTY; without even the implied warranty of
44  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
45  * Library General Public License for more details.
46  *
47  * You should have received a copy of the GNU Library General Public
48  * License along with this library; if not, write to the
49  * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
50  * Boston, MA  02110-1301, USA.
51  *
52  *
53  * If you didn't download this code from the following link, you should check if
54  * you aren't using an obsolete version:
55  * http://www.lowagie.com/iText/
56  */
57 
58 package com.gitlab.pdftk_java.com.lowagie.text.pdf;
59 
60 import java.io.IOException;
61 import com.gitlab.pdftk_java.com.lowagie.text.exceptions.InvalidPdfException;
62 /**
63  *
64  * @author  Paulo Soares (psoares@consiste.pt)
65  */
66 public class PRTokeniser {
67 
68     public static final int TK_NUMBER = 1;
69     public static final int TK_STRING = 2;
70     public static final int TK_NAME = 3;
71     public static final int TK_COMMENT = 4;
72     public static final int TK_START_ARRAY = 5;
73     public static final int TK_END_ARRAY = 6;
74     public static final int TK_START_DIC = 7;
75     public static final int TK_END_DIC = 8;
76     public static final int TK_REF = 9;
77     public static final int TK_OTHER = 10;
78     public static final int TK_ENDOFFILE = 11;
79     public static final boolean delims[] = {
80         true,  true,  false, false, false, false, false, false, false, false,
81         true,  true,  false, true,  true,  false, false, false, false, false,
82         false, false, false, false, false, false, false, false, false, false,
83         false, false, false, true,  false, false, false, false, true,  false,
84         false, true,  true,  false, false, false, false, false, true,  false,
85         false, false, false, false, false, false, false, false, false, false,
86         false, true,  false, true,  false, false, false, false, false, false,
87         false, false, false, false, false, false, false, false, false, false,
88         false, false, false, false, false, false, false, false, false, false,
89         false, false, true,  false, true,  false, false, false, false, false,
90         false, false, false, false, false, false, false, false, false, false,
91         false, false, false, false, false, false, false, false, false, false,
92         false, false, false, false, false, false, false, false, false, false,
93         false, false, false, false, false, false, false, false, false, false,
94         false, false, false, false, false, false, false, false, false, false,
95         false, false, false, false, false, false, false, false, false, false,
96         false, false, false, false, false, false, false, false, false, false,
97         false, false, false, false, false, false, false, false, false, false,
98         false, false, false, false, false, false, false, false, false, false,
99         false, false, false, false, false, false, false, false, false, false,
100         false, false, false, false, false, false, false, false, false, false,
101         false, false, false, false, false, false, false, false, false, false,
102         false, false, false, false, false, false, false, false, false, false,
103         false, false, false, false, false, false, false, false, false, false,
104         false, false, false, false, false, false, false, false, false, false,
105         false, false, false, false, false, false, false};
106 
107     static final String EMPTY = "";
108 
109 
110     protected RandomAccessFileOrArray file = null;
111     protected int type = 0;
112     protected String stringValue = "";
113     protected int reference = 0;
114     protected int generation = 0;
115     protected boolean hexString = false;
116 
PRTokeniser(String filename)117     public PRTokeniser(String filename) throws IOException {
118         file = new RandomAccessFileOrArray(filename);
119     }
120 
PRTokeniser(byte pdfIn[])121     public PRTokeniser(byte pdfIn[]) {
122         file = new RandomAccessFileOrArray(pdfIn);
123     }
124 
PRTokeniser(RandomAccessFileOrArray file)125     public PRTokeniser(RandomAccessFileOrArray file) {
126         this.file = file;
127     }
128 
seek(int pos)129     public void seek(int pos) throws IOException {
130         file.seek(pos);
131     }
132 
getFilePointer()133     public int getFilePointer() throws IOException {
134         return file.getFilePointer();
135     }
136 
close()137     public void close() throws IOException {
138         file.close();
139     }
140 
length()141     public int length() throws IOException {
142         return file.length();
143     }
144 
read()145     public int read() throws IOException {
146         return file.read();
147     }
148 
getSafeFile()149     public RandomAccessFileOrArray getSafeFile() throws IOException {
150         return new RandomAccessFileOrArray(file);
151     }
152 
getFile()153     public RandomAccessFileOrArray getFile() {
154         return file;
155     }
156 
readString(int size)157     public String readString(int size) throws IOException {
158         StringBuffer buf = new StringBuffer();
159         int ch;
160         while ((size--) > 0) {
161             ch = file.read();
162             if (ch == -1)
163                 break;
164             buf.append((char)ch);
165         }
166         return buf.toString();
167     }
168 
isWhitespace(int ch)169     public static final boolean isWhitespace(int ch) {
170         return (ch == 0 || ch == 9 || ch == 10 || ch == 12 || ch == 13 || ch == 32);
171     }
172 
isDelimiter(int ch)173     public static final boolean isDelimiter(int ch) {
174         return (ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '/' || ch == '%');
175     }
176 
isDelimiterWhitespace(int ch)177     public static final boolean isDelimiterWhitespace(int ch) {
178         return delims[ch + 1];
179     }
180 
getTokenType()181     public int getTokenType() {
182         return type;
183     }
184 
getStringValue()185     public String getStringValue() {
186         return stringValue;
187     }
188 
getReference()189     public int getReference() {
190         return reference;
191     }
192 
getGeneration()193     public int getGeneration() {
194         return generation;
195     }
196 
backOnePosition(int ch)197     public void backOnePosition(int ch) throws IOException {
198         if (ch != -1)
199             file.pushBack((byte)ch);
200     }
201 
throwError(String error)202     public void throwError(String error) throws IOException {
203         throw new InvalidPdfException(error + " at file pointer " + file.getFilePointer());
204     }
205 
checkPdfHeader()206     public char checkPdfHeader() throws IOException {
207         file.setStartOffset(0);
208         String str = readString(1024);
209         int idx = str.indexOf("%PDF-");
210         if (idx < 0)
211             throw new InvalidPdfException("PDF header signature not found.");
212         file.setStartOffset(idx);
213         return str.charAt(idx + 7);
214     }
215 
checkFdfHeader()216     public void checkFdfHeader() throws IOException {
217         file.setStartOffset(0);
218         String str = readString(1024);
219         int idx = str.indexOf("%FDF-1.2");
220         if (idx < 0)
221             throw new InvalidPdfException("FDF header signature not found.");
222         file.setStartOffset(idx);
223     }
224 
225     // "startxref" should always be at the end of a file
226     // Some non-compliant files have additional, unrelated data at the end
227     // (see https://gitlab.com/pdftk-java/pdftk/-/issues/90)
228     // So we have to keep searching if we do not find startxref at the end
getStartxref()229     public int getStartxref() throws IOException {
230         int size = Math.min(1024, file.length());
231         for (int pos = file.length() - size; pos>=0; pos-=1024) {
232             file.seek(pos);
233             // read a bit past a block, in case "startxref" is split between blocks
234             String str = readString(1024+10);
235             int idx = str.lastIndexOf("startxref");
236             if (idx >= 0) return pos + idx;
237         }
238         throw new InvalidPdfException("PDF startxref not found.");
239     }
240 
getHex(int v)241     public static int getHex(int v) {
242         if (v >= '0' && v <= '9')
243             return v - '0';
244         if (v >= 'A' && v <= 'F')
245             return v - 'A' + 10;
246         if (v >= 'a' && v <= 'f')
247             return v - 'a' + 10;
248         return -1;
249     }
250 
nextValidToken()251     public void nextValidToken() throws IOException {
252         int level = 0;
253         String n1 = null;
254         String n2 = null;
255         int ptr = 0;
256         while (nextToken()) {
257             if (type == TK_COMMENT)
258                 continue;
259             switch (level) {
260                 case 0:
261                 {
262                     if (type != TK_NUMBER)
263                         return;
264                     ptr = file.getFilePointer();
265                     n1 = stringValue;
266                     ++level;
267                     break;
268                 }
269                 case 1:
270                 {
271                     if (type != TK_NUMBER) {
272                         file.seek(ptr);
273                         type = TK_NUMBER;
274                         stringValue = n1;
275                         return;
276                     }
277                     n2 = stringValue;
278                     ++level;
279                     break;
280                 }
281                 default:
282                 {
283                     if (type != TK_OTHER || !stringValue.equals("R")) {
284                         file.seek(ptr);
285                         type = TK_NUMBER;
286                         stringValue = n1;
287                         return;
288                     }
289                     type = TK_REF;
290                     reference = Integer.parseInt(n1);
291                     generation = Integer.parseInt(n2);
292                     return;
293                 }
294             }
295         }
296 		// http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=687669#20
297         if (level > 0) {
298             type = TK_NUMBER;
299             file.seek(ptr);
300             stringValue = n1;
301             return;
302         }
303         throwError("Unexpected end of file");
304     }
305 
nextToken()306     public boolean nextToken() throws IOException {
307         int ch = 0;
308         do {
309             ch = file.read();
310         } while (ch != -1 && isWhitespace(ch));
311         if (ch == -1){
312             type = TK_ENDOFFILE;
313             return false;
314         }
315 
316         // Note:  We have to initialize stringValue here, after we've looked for the end of the stream,
317         // to ensure that we don't lose the value of a token that might end exactly at the end
318         // of the stream
319         StringBuffer outBuf = null;
320         stringValue = EMPTY;
321 
322         switch (ch) {
323             case '[':
324                 type = TK_START_ARRAY;
325                 break;
326             case ']':
327                 type = TK_END_ARRAY;
328                 break;
329             case '/':
330             {
331                 outBuf = new StringBuffer();
332                 type = TK_NAME;
333                 while (true) {
334                     ch = file.read();
335                     if (delims[ch + 1])
336                         break;
337                     if (ch == '#') {
338                         ch = (getHex(file.read()) << 4) + getHex(file.read());
339                     }
340                     outBuf.append((char)ch);
341                 }
342                 backOnePosition(ch);
343                 break;
344             }
345             case '>':
346                 ch = file.read();
347                 if (ch != '>')
348                     throwError("'>' not expected");
349                 type = TK_END_DIC;
350                 break;
351             case '<':
352             {
353                 int v1 = file.read();
354                 if (v1 == '<') {
355                     type = TK_START_DIC;
356                     break;
357                 }
358                 outBuf = new StringBuffer();
359                 type = TK_STRING;
360                 hexString = true;
361                 int v2 = 0;
362                 while (true) {
363                     while (isWhitespace(v1))
364                         v1 = file.read();
365                     if (v1 == '>')
366                         break;
367                     v1 = getHex(v1);
368                     if (v1 < 0)
369                         break;
370                     v2 = file.read();
371                     while (isWhitespace(v2))
372                         v2 = file.read();
373                     if (v2 == '>') {
374                         ch = v1 << 4;
375                         outBuf.append((char)ch);
376                         break;
377                     }
378                     v2 = getHex(v2);
379                     if (v2 < 0)
380                         break;
381                     ch = (v1 << 4) + v2;
382                     outBuf.append((char)ch);
383                     v1 = file.read();
384                 }
385                 if (v1 < 0 || v2 < 0)
386                     throwError("Error reading string");
387                 break;
388             }
389             case '%':
390                 type = TK_COMMENT;
391                 do {
392                     ch = file.read();
393                 } while (ch != -1 && ch != '\r' && ch != '\n');
394                 break;
395             case '(':
396             {
397                 outBuf = new StringBuffer();
398                 type = TK_STRING;
399                 hexString = false;
400                 int nesting = 0;
401                 while (true) {
402                     ch = file.read();
403                     if (ch == -1)
404                         break;
405                     if (ch == '(') {
406                         ++nesting;
407                     }
408                     else if (ch == ')') {
409                         --nesting;
410                     }
411                     else if (ch == '\\') {
412                         boolean lineBreak = false;
413                         ch = file.read();
414                         switch (ch) {
415                             case 'n':
416                                 ch = '\n';
417                                 break;
418                             case 'r':
419                                 ch = '\r';
420                                 break;
421                             case 't':
422                                 ch = '\t';
423                                 break;
424                             case 'b':
425                                 ch = '\b';
426                                 break;
427                             case 'f':
428                                 ch = '\f';
429                                 break;
430                             case '(':
431                             case ')':
432                             case '\\':
433                                 break;
434                             case '\r':
435                                 lineBreak = true;
436                                 ch = file.read();
437                                 if (ch != '\n')
438                                     backOnePosition(ch);
439                                 break;
440                             case '\n':
441                                 lineBreak = true;
442                                 break;
443                             default:
444                             {
445                                 if (ch < '0' || ch > '7') {
446                                     break;
447                                 }
448                                 int octal = ch - '0';
449                                 ch = file.read();
450                                 if (ch < '0' || ch > '7') {
451                                     backOnePosition(ch);
452                                     ch = octal;
453                                     break;
454                                 }
455                                 octal = (octal << 3) + ch - '0';
456                                 ch = file.read();
457                                 if (ch < '0' || ch > '7') {
458                                     backOnePosition(ch);
459                                     ch = octal;
460                                     break;
461                                 }
462                                 octal = (octal << 3) + ch - '0';
463                                 ch = octal & 0xff;
464                                 break;
465                             }
466                         }
467                         if (lineBreak)
468                             continue;
469                         if (ch < 0)
470                             break;
471                     }
472                     else if (ch == '\r') {
473                         ch = file.read();
474                         if (ch < 0)
475                             break;
476                         if (ch != '\n') {
477                             backOnePosition(ch);
478                             ch = '\n';
479                         }
480                     }
481                     if (nesting == -1)
482                         break;
483                     outBuf.append((char)ch);
484                 }
485                 if (ch == -1)
486                     throwError("Error reading string");
487                 break;
488             }
489             default:
490             {
491                 outBuf = new StringBuffer();
492                 if (ch == '-' || ch == '+' || ch == '.' || (ch >= '0' && ch <= '9')) {
493                     type = TK_NUMBER;
494                     do {
495                         outBuf.append((char)ch);
496                         ch = file.read();
497                     } while (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.'));
498                 }
499                 else {
500                     type = TK_OTHER;
501                     do {
502                         outBuf.append((char)ch);
503                         ch = file.read();
504                     } while (!delims[ch + 1]);
505                 }
506                 backOnePosition(ch);
507                 break;
508             }
509         }
510         if (outBuf != null)
511             stringValue = outBuf.toString();
512         return true;
513     }
514 
intValue()515     public int intValue() {
516         return Integer.parseInt(stringValue);
517     }
518 
readLineSegment(byte input[])519     public boolean readLineSegment(byte input[]) throws IOException {
520         int c = -1;
521         boolean eol = false;
522         int ptr = 0;
523         int len = input.length;
524 
525 		// ssteward, pdftk-1.10, 040922:
526 		// skip initial whitespace; added this because PdfReader.rebuildXref()
527 		// assumes that line provided by readLineSegment does not have init. whitespace;
528 		if ( ptr < len ) {
529 			while ( isWhitespace( (c = read()) ) );
530 		}
531 		while ( !eol && ptr < len ) {
532 			switch (c) {
533 			case -1:
534 			case '\n':
535 				eol = true;
536 			break;
537 			case '\r':
538 				eol = true;
539 				int cur = getFilePointer();
540 				if ((read()) != '\n') {
541 					seek(cur);
542 				}
543 				break;
544 			default:
545 				input[ptr++] = (byte)c;
546 				break;
547 			}
548 
549 			// break loop? do it before we read() again
550 			if( eol || len <= ptr ) {
551 				break;
552 			}
553 			else {
554 				c = read();
555 			}
556 		}
557 
558 		if( len <= ptr  ) {
559 			eol = false;
560 			while (!eol) {
561 				switch (c = read()) {
562 				case -1:
563 				case '\n':
564 					eol = true;
565 				break;
566 				case '\r':
567 					eol = true;
568 					int cur = getFilePointer();
569 					if ((read()) != '\n') {
570 						seek(cur);
571 					}
572 					break;
573 				}
574 			}
575 		}
576 
577         if ((c == -1) && (ptr == 0)) {
578             return false;
579         }
580         if (ptr + 2 <= len) {
581             input[ptr++] = (byte)' ';
582             input[ptr] = (byte)'X';
583         }
584         return true;
585     }
586 
checkObjectStart(byte line[])587     public static int[] checkObjectStart(byte line[]) {
588         try {
589             PRTokeniser tk = new PRTokeniser(line);
590             int num = 0;
591             int gen = 0;
592             if (!tk.nextToken() || tk.getTokenType() != TK_NUMBER)
593                 return null;
594             num = tk.intValue();
595             if (!tk.nextToken() || tk.getTokenType() != TK_NUMBER)
596                 return null;
597             gen = tk.intValue();
598             if (!tk.nextToken())
599                 return null;
600             if (!tk.getStringValue().equals("obj"))
601                 return null;
602             return new int[]{num, gen};
603         }
604         catch (Exception ioe) {
605             // empty on purpose
606         }
607         return null;
608     }
609 
isHexString()610     public boolean isHexString() {
611         return this.hexString;
612     }
613 
614 }
615