1 /* -*- Mode: Java; tab-width: 4; c-basic-offset: 4 -*- */ 2 /* 3 * $Id: PRTokeniser.java,v 1.15 2002/06/20 13:30:25 blowagie Exp $ 4 * $Name: $ 5 * 6 * Copyright 2001, 2002 by Paulo Soares. 7 * 8 * 9 * The Original Code is 'iText, a free JAVA-PDF library'. 10 * 11 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by 12 * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie. 13 * All Rights Reserved. 14 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer 15 * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved. 16 * 17 * Contributor(s): all the names of the contributors are added in the source code 18 * where applicable. 19 * 20 * 21 * This library is free software; you can redistribute it and/or 22 * modify it under the terms of the GNU Library General Public 23 * License as published by the Free Software Foundation; either 24 * version 2 of the License, or (at your option) any later version. 25 * 26 * This library is distributed in the hope that it will be useful, 27 * but WITHOUT ANY WARRANTY; without even the implied warranty of 28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 29 * Library General Public License for more details. 30 * 31 * You should have received a copy of the GNU Library General Public 32 * License along with this library; if not, write to the 33 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, 34 * Boston, MA 02110-1301, USA. 35 * 36 * 37 * This library is free software; you can redistribute it and/or 38 * modify it under the terms of the GNU Library General Public 39 * License as published by the Free Software Foundation; either 40 * version 2 of the License, or (at your option) any later version. 41 * 42 * This library is distributed in the hope that it will be useful, 43 * but WITHOUT ANY WARRANTY; without even the implied warranty of 44 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 45 * Library General Public License for more details. 46 * 47 * You should have received a copy of the GNU Library General Public 48 * License along with this library; if not, write to the 49 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, 50 * Boston, MA 02110-1301, USA. 51 * 52 * 53 * If you didn't download this code from the following link, you should check if 54 * you aren't using an obsolete version: 55 * http://www.lowagie.com/iText/ 56 */ 57 58 package com.gitlab.pdftk_java.com.lowagie.text.pdf; 59 60 import java.io.IOException; 61 import com.gitlab.pdftk_java.com.lowagie.text.exceptions.InvalidPdfException; 62 /** 63 * 64 * @author Paulo Soares (psoares@consiste.pt) 65 */ 66 public class PRTokeniser { 67 68 public static final int TK_NUMBER = 1; 69 public static final int TK_STRING = 2; 70 public static final int TK_NAME = 3; 71 public static final int TK_COMMENT = 4; 72 public static final int TK_START_ARRAY = 5; 73 public static final int TK_END_ARRAY = 6; 74 public static final int TK_START_DIC = 7; 75 public static final int TK_END_DIC = 8; 76 public static final int TK_REF = 9; 77 public static final int TK_OTHER = 10; 78 public static final int TK_ENDOFFILE = 11; 79 public static final boolean delims[] = { 80 true, true, false, false, false, false, false, false, false, false, 81 true, true, false, true, true, false, false, false, false, false, 82 false, false, false, false, false, false, false, false, false, false, 83 false, false, false, true, false, false, false, false, true, false, 84 false, true, true, false, false, false, false, false, true, false, 85 false, false, false, false, false, false, false, false, false, false, 86 false, true, false, true, false, false, false, false, false, false, 87 false, false, false, false, false, false, false, false, false, false, 88 false, false, false, false, false, false, false, false, false, false, 89 false, false, true, false, true, false, false, false, false, false, 90 false, false, false, false, false, false, false, false, false, false, 91 false, false, false, false, false, false, false, false, false, false, 92 false, false, false, false, false, false, false, false, false, false, 93 false, false, false, false, false, false, false, false, false, false, 94 false, false, false, false, false, false, false, false, false, false, 95 false, false, false, false, false, false, false, false, false, false, 96 false, false, false, false, false, false, false, false, false, false, 97 false, false, false, false, false, false, false, false, false, false, 98 false, false, false, false, false, false, false, false, false, false, 99 false, false, false, false, false, false, false, false, false, false, 100 false, false, false, false, false, false, false, false, false, false, 101 false, false, false, false, false, false, false, false, false, false, 102 false, false, false, false, false, false, false, false, false, false, 103 false, false, false, false, false, false, false, false, false, false, 104 false, false, false, false, false, false, false, false, false, false, 105 false, false, false, false, false, false, false}; 106 107 static final String EMPTY = ""; 108 109 110 protected RandomAccessFileOrArray file = null; 111 protected int type = 0; 112 protected String stringValue = ""; 113 protected int reference = 0; 114 protected int generation = 0; 115 protected boolean hexString = false; 116 PRTokeniser(String filename)117 public PRTokeniser(String filename) throws IOException { 118 file = new RandomAccessFileOrArray(filename); 119 } 120 PRTokeniser(byte pdfIn[])121 public PRTokeniser(byte pdfIn[]) { 122 file = new RandomAccessFileOrArray(pdfIn); 123 } 124 PRTokeniser(RandomAccessFileOrArray file)125 public PRTokeniser(RandomAccessFileOrArray file) { 126 this.file = file; 127 } 128 seek(int pos)129 public void seek(int pos) throws IOException { 130 file.seek(pos); 131 } 132 getFilePointer()133 public int getFilePointer() throws IOException { 134 return file.getFilePointer(); 135 } 136 close()137 public void close() throws IOException { 138 file.close(); 139 } 140 length()141 public int length() throws IOException { 142 return file.length(); 143 } 144 read()145 public int read() throws IOException { 146 return file.read(); 147 } 148 getSafeFile()149 public RandomAccessFileOrArray getSafeFile() throws IOException { 150 return new RandomAccessFileOrArray(file); 151 } 152 getFile()153 public RandomAccessFileOrArray getFile() { 154 return file; 155 } 156 readString(int size)157 public String readString(int size) throws IOException { 158 StringBuffer buf = new StringBuffer(); 159 int ch; 160 while ((size--) > 0) { 161 ch = file.read(); 162 if (ch == -1) 163 break; 164 buf.append((char)ch); 165 } 166 return buf.toString(); 167 } 168 isWhitespace(int ch)169 public static final boolean isWhitespace(int ch) { 170 return (ch == 0 || ch == 9 || ch == 10 || ch == 12 || ch == 13 || ch == 32); 171 } 172 isDelimiter(int ch)173 public static final boolean isDelimiter(int ch) { 174 return (ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '/' || ch == '%'); 175 } 176 isDelimiterWhitespace(int ch)177 public static final boolean isDelimiterWhitespace(int ch) { 178 return delims[ch + 1]; 179 } 180 getTokenType()181 public int getTokenType() { 182 return type; 183 } 184 getStringValue()185 public String getStringValue() { 186 return stringValue; 187 } 188 getReference()189 public int getReference() { 190 return reference; 191 } 192 getGeneration()193 public int getGeneration() { 194 return generation; 195 } 196 backOnePosition(int ch)197 public void backOnePosition(int ch) throws IOException { 198 if (ch != -1) 199 file.pushBack((byte)ch); 200 } 201 throwError(String error)202 public void throwError(String error) throws IOException { 203 throw new InvalidPdfException(error + " at file pointer " + file.getFilePointer()); 204 } 205 checkPdfHeader()206 public char checkPdfHeader() throws IOException { 207 file.setStartOffset(0); 208 String str = readString(1024); 209 int idx = str.indexOf("%PDF-"); 210 if (idx < 0) 211 throw new InvalidPdfException("PDF header signature not found."); 212 file.setStartOffset(idx); 213 return str.charAt(idx + 7); 214 } 215 checkFdfHeader()216 public void checkFdfHeader() throws IOException { 217 file.setStartOffset(0); 218 String str = readString(1024); 219 int idx = str.indexOf("%FDF-1.2"); 220 if (idx < 0) 221 throw new InvalidPdfException("FDF header signature not found."); 222 file.setStartOffset(idx); 223 } 224 225 // "startxref" should always be at the end of a file 226 // Some non-compliant files have additional, unrelated data at the end 227 // (see https://gitlab.com/pdftk-java/pdftk/-/issues/90) 228 // So we have to keep searching if we do not find startxref at the end getStartxref()229 public int getStartxref() throws IOException { 230 int size = Math.min(1024, file.length()); 231 for (int pos = file.length() - size; pos>=0; pos-=1024) { 232 file.seek(pos); 233 // read a bit past a block, in case "startxref" is split between blocks 234 String str = readString(1024+10); 235 int idx = str.lastIndexOf("startxref"); 236 if (idx >= 0) return pos + idx; 237 } 238 throw new InvalidPdfException("PDF startxref not found."); 239 } 240 getHex(int v)241 public static int getHex(int v) { 242 if (v >= '0' && v <= '9') 243 return v - '0'; 244 if (v >= 'A' && v <= 'F') 245 return v - 'A' + 10; 246 if (v >= 'a' && v <= 'f') 247 return v - 'a' + 10; 248 return -1; 249 } 250 nextValidToken()251 public void nextValidToken() throws IOException { 252 int level = 0; 253 String n1 = null; 254 String n2 = null; 255 int ptr = 0; 256 while (nextToken()) { 257 if (type == TK_COMMENT) 258 continue; 259 switch (level) { 260 case 0: 261 { 262 if (type != TK_NUMBER) 263 return; 264 ptr = file.getFilePointer(); 265 n1 = stringValue; 266 ++level; 267 break; 268 } 269 case 1: 270 { 271 if (type != TK_NUMBER) { 272 file.seek(ptr); 273 type = TK_NUMBER; 274 stringValue = n1; 275 return; 276 } 277 n2 = stringValue; 278 ++level; 279 break; 280 } 281 default: 282 { 283 if (type != TK_OTHER || !stringValue.equals("R")) { 284 file.seek(ptr); 285 type = TK_NUMBER; 286 stringValue = n1; 287 return; 288 } 289 type = TK_REF; 290 reference = Integer.parseInt(n1); 291 generation = Integer.parseInt(n2); 292 return; 293 } 294 } 295 } 296 // http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=687669#20 297 if (level > 0) { 298 type = TK_NUMBER; 299 file.seek(ptr); 300 stringValue = n1; 301 return; 302 } 303 throwError("Unexpected end of file"); 304 } 305 nextToken()306 public boolean nextToken() throws IOException { 307 int ch = 0; 308 do { 309 ch = file.read(); 310 } while (ch != -1 && isWhitespace(ch)); 311 if (ch == -1){ 312 type = TK_ENDOFFILE; 313 return false; 314 } 315 316 // Note: We have to initialize stringValue here, after we've looked for the end of the stream, 317 // to ensure that we don't lose the value of a token that might end exactly at the end 318 // of the stream 319 StringBuffer outBuf = null; 320 stringValue = EMPTY; 321 322 switch (ch) { 323 case '[': 324 type = TK_START_ARRAY; 325 break; 326 case ']': 327 type = TK_END_ARRAY; 328 break; 329 case '/': 330 { 331 outBuf = new StringBuffer(); 332 type = TK_NAME; 333 while (true) { 334 ch = file.read(); 335 if (delims[ch + 1]) 336 break; 337 if (ch == '#') { 338 ch = (getHex(file.read()) << 4) + getHex(file.read()); 339 } 340 outBuf.append((char)ch); 341 } 342 backOnePosition(ch); 343 break; 344 } 345 case '>': 346 ch = file.read(); 347 if (ch != '>') 348 throwError("'>' not expected"); 349 type = TK_END_DIC; 350 break; 351 case '<': 352 { 353 int v1 = file.read(); 354 if (v1 == '<') { 355 type = TK_START_DIC; 356 break; 357 } 358 outBuf = new StringBuffer(); 359 type = TK_STRING; 360 hexString = true; 361 int v2 = 0; 362 while (true) { 363 while (isWhitespace(v1)) 364 v1 = file.read(); 365 if (v1 == '>') 366 break; 367 v1 = getHex(v1); 368 if (v1 < 0) 369 break; 370 v2 = file.read(); 371 while (isWhitespace(v2)) 372 v2 = file.read(); 373 if (v2 == '>') { 374 ch = v1 << 4; 375 outBuf.append((char)ch); 376 break; 377 } 378 v2 = getHex(v2); 379 if (v2 < 0) 380 break; 381 ch = (v1 << 4) + v2; 382 outBuf.append((char)ch); 383 v1 = file.read(); 384 } 385 if (v1 < 0 || v2 < 0) 386 throwError("Error reading string"); 387 break; 388 } 389 case '%': 390 type = TK_COMMENT; 391 do { 392 ch = file.read(); 393 } while (ch != -1 && ch != '\r' && ch != '\n'); 394 break; 395 case '(': 396 { 397 outBuf = new StringBuffer(); 398 type = TK_STRING; 399 hexString = false; 400 int nesting = 0; 401 while (true) { 402 ch = file.read(); 403 if (ch == -1) 404 break; 405 if (ch == '(') { 406 ++nesting; 407 } 408 else if (ch == ')') { 409 --nesting; 410 } 411 else if (ch == '\\') { 412 boolean lineBreak = false; 413 ch = file.read(); 414 switch (ch) { 415 case 'n': 416 ch = '\n'; 417 break; 418 case 'r': 419 ch = '\r'; 420 break; 421 case 't': 422 ch = '\t'; 423 break; 424 case 'b': 425 ch = '\b'; 426 break; 427 case 'f': 428 ch = '\f'; 429 break; 430 case '(': 431 case ')': 432 case '\\': 433 break; 434 case '\r': 435 lineBreak = true; 436 ch = file.read(); 437 if (ch != '\n') 438 backOnePosition(ch); 439 break; 440 case '\n': 441 lineBreak = true; 442 break; 443 default: 444 { 445 if (ch < '0' || ch > '7') { 446 break; 447 } 448 int octal = ch - '0'; 449 ch = file.read(); 450 if (ch < '0' || ch > '7') { 451 backOnePosition(ch); 452 ch = octal; 453 break; 454 } 455 octal = (octal << 3) + ch - '0'; 456 ch = file.read(); 457 if (ch < '0' || ch > '7') { 458 backOnePosition(ch); 459 ch = octal; 460 break; 461 } 462 octal = (octal << 3) + ch - '0'; 463 ch = octal & 0xff; 464 break; 465 } 466 } 467 if (lineBreak) 468 continue; 469 if (ch < 0) 470 break; 471 } 472 else if (ch == '\r') { 473 ch = file.read(); 474 if (ch < 0) 475 break; 476 if (ch != '\n') { 477 backOnePosition(ch); 478 ch = '\n'; 479 } 480 } 481 if (nesting == -1) 482 break; 483 outBuf.append((char)ch); 484 } 485 if (ch == -1) 486 throwError("Error reading string"); 487 break; 488 } 489 default: 490 { 491 outBuf = new StringBuffer(); 492 if (ch == '-' || ch == '+' || ch == '.' || (ch >= '0' && ch <= '9')) { 493 type = TK_NUMBER; 494 do { 495 outBuf.append((char)ch); 496 ch = file.read(); 497 } while (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.')); 498 } 499 else { 500 type = TK_OTHER; 501 do { 502 outBuf.append((char)ch); 503 ch = file.read(); 504 } while (!delims[ch + 1]); 505 } 506 backOnePosition(ch); 507 break; 508 } 509 } 510 if (outBuf != null) 511 stringValue = outBuf.toString(); 512 return true; 513 } 514 intValue()515 public int intValue() { 516 return Integer.parseInt(stringValue); 517 } 518 readLineSegment(byte input[])519 public boolean readLineSegment(byte input[]) throws IOException { 520 int c = -1; 521 boolean eol = false; 522 int ptr = 0; 523 int len = input.length; 524 525 // ssteward, pdftk-1.10, 040922: 526 // skip initial whitespace; added this because PdfReader.rebuildXref() 527 // assumes that line provided by readLineSegment does not have init. whitespace; 528 if ( ptr < len ) { 529 while ( isWhitespace( (c = read()) ) ); 530 } 531 while ( !eol && ptr < len ) { 532 switch (c) { 533 case -1: 534 case '\n': 535 eol = true; 536 break; 537 case '\r': 538 eol = true; 539 int cur = getFilePointer(); 540 if ((read()) != '\n') { 541 seek(cur); 542 } 543 break; 544 default: 545 input[ptr++] = (byte)c; 546 break; 547 } 548 549 // break loop? do it before we read() again 550 if( eol || len <= ptr ) { 551 break; 552 } 553 else { 554 c = read(); 555 } 556 } 557 558 if( len <= ptr ) { 559 eol = false; 560 while (!eol) { 561 switch (c = read()) { 562 case -1: 563 case '\n': 564 eol = true; 565 break; 566 case '\r': 567 eol = true; 568 int cur = getFilePointer(); 569 if ((read()) != '\n') { 570 seek(cur); 571 } 572 break; 573 } 574 } 575 } 576 577 if ((c == -1) && (ptr == 0)) { 578 return false; 579 } 580 if (ptr + 2 <= len) { 581 input[ptr++] = (byte)' '; 582 input[ptr] = (byte)'X'; 583 } 584 return true; 585 } 586 checkObjectStart(byte line[])587 public static int[] checkObjectStart(byte line[]) { 588 try { 589 PRTokeniser tk = new PRTokeniser(line); 590 int num = 0; 591 int gen = 0; 592 if (!tk.nextToken() || tk.getTokenType() != TK_NUMBER) 593 return null; 594 num = tk.intValue(); 595 if (!tk.nextToken() || tk.getTokenType() != TK_NUMBER) 596 return null; 597 gen = tk.intValue(); 598 if (!tk.nextToken()) 599 return null; 600 if (!tk.getStringValue().equals("obj")) 601 return null; 602 return new int[]{num, gen}; 603 } 604 catch (Exception ioe) { 605 // empty on purpose 606 } 607 return null; 608 } 609 isHexString()610 public boolean isHexString() { 611 return this.hexString; 612 } 613 614 } 615