1 /* $Id$ */
2 /*
3 ** file_decomp_PDF.c
4 **
5 ** Copyright (C) 2014-2021 Cisco and/or its affiliates. All rights reserved.
6 **
7 ** This program is free software; you can redistribute it and/or modify
8 ** it under the terms of the GNU General Public License Version 2 as
9 ** published by the Free Software Foundation. You may not use, modify or
10 ** distribute this program under any other version of the GNU General
11 ** Public License.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 */
22
23 #ifdef HAVE_CONFIG_H
24 #include "config.h"
25 #endif
26
27 #include <zlib.h>
28 #include <string.h>
29 #include <stdlib.h>
30
31 #include "file_decomp.h"
32 #ifdef FILE_DECOMP_PDF
33 #include "file_decomp_PDF.h"
34 #include "hi_eo_events.h"
35 #include "mstring.h"
36
/* Characters and tokens of the PDF grammar recognized by this parser. */

/* Stream and indirect-object keywords */
#define TOK_STRM_OPEN       "stream"
#define TOK_STRM_CLOSE      "endstream"

#define TOK_OBJ_OPEN        "obj"
#define TOK_OBJ_CLOSE       "endobj"

/* Dictionary delimiters, plus the dictionary keys and values of interest */
#define TOK_DICT_OPEN       "<<"
#define TOK_DICT_CLOSE      ">>"
#define TOK_DICT_FILT       "Filter"
#define TOK_DICT_FLATE      "FlateDecode"
#define TOK_DICT_FLATE_ALT  "Fl"
#define TOK_DICT_PARMS      "DecodeParms"
#define TOK_DICT_PARMS_ALT  "DP"
#define TOK_DICT_LENGTH     "Length"
#define TOK_DICT_NULL       "null"
#define TOK_DICT_NULL_FILT  " null "    // Enclose the null object in spaces

/* Keywords of the cross-reference section at the end of the file */
#define TOK_XRF_XREF        "xref"
#define TOK_XRF_TRAILER     "trailer"
#define TOK_XRF_STARTXREF   "startxref"
#define TOK_XRF_END         "%%EOF"

/* HT, LF, FF, CR and SP.  NUL is white-space as well, but it cannot be
   listed inside a C string; IS_WHITESPACE() tests for it explicitly. */
#define WHITESPACE_STRING   "\011\012\014\015\040"

#define TOK_EOL_CR          "\r"
#define TOK_EOL_LF          "\n"
#define TOK_EOL_CRLF        "\r\n"

#define CHR_CR              '\r'
#define CHR_LF              '\n'

#define CHR_COMMENT         '%'

#define CHR_ARRAY_OPEN      '['
#define CHR_ARRAY_CLOSE     ']'

#define CHR_ANGLE_OPEN      '<'
#define CHR_ANGLE_CLOSE     '>'

#define CHR_SPACE           ' '
#define CHR_NAME_SEP        '/'

/* Character classification helpers.  The argument is fully parenthesized so
   that a compound expression classifies correctly.  It is still evaluated
   more than once, so pass an expression without side effects. */
#define IS_WHITESPACE(c)    (((c) == 0) || (strchr( (char *)WHITESPACE_STRING, (int)(c) ) != NULL))
#define IS_EOL(c)           (((c) == CHR_CR) || ((c) == CHR_LF))
81
/* Top-level parser states.  Values start at 1 and increase by one. */
typedef enum p_states
{
    P_START       = 1,  // Ground state, nothing 'open'
    P_COMMENT     = 2,  // Inside a comment (the parser's initial state)
    P_IND_OBJ     = 3,  // Indirect object; refined by the indirect-object sub-states
    P_XREF        = 4,  // The combined xref, trailer, startxref top level items
    P_DICT_OBJECT = 5,  // A dictionary object
    P_STREAM      = 6   // Pseudo state used to process a stream object
} p_state_t;
92
/* Sub-states used while the parser is in P_XREF. */
typedef enum p_xref_substates
{
    P_XREF_TOKEN     = 1,   // Matching the opening keyword
    P_XREF_END_TOKEN = 2    // Searching for the closing token
} p_xref_t;
98
/* Sub-states used while the parser is in P_DICT_OBJECT. */
typedef enum p_dict_substates
{
    P_DICT_OPEN      = 1,   // Waiting for the first '<'
    P_DICT_OPEN_TOK  = 2,   // Saw one '<', waiting for the second
    P_DICT_CLOSE_TOK = 3,   // Saw one '>', waiting for the second
    P_DICT_FILTER    = 4,   // Collecting the value of the Filter entry
    P_DICT_SKIP      = 5,   // Scanning a dictionary without looking for Filter
    P_DICT_ACTIVE    = 6    // Scanning the base-level dictionary for Filter
} p_dict_t;
108
/* Sub-states used while the parser is in P_IND_OBJ. */
typedef enum p_indirect_object_substates
{
    P_OBJ_NUMBER      = 1,  // Collecting the object number digits
    P_GEN_NUMBER      = 2,  // Collecting the generation number digits
    P_OBJ_TOKEN       = 3,  // Matching the "obj" keyword
    P_OBJ_EOL         = 4,  // Waiting for the end of the "obj" line
    P_STREAM_TOKEN    = 5,  // Matching the "stream" keyword
    P_STREAM_EOL      = 6,  // Expecting CR or LF after "stream"
    P_STREAM_LF       = 7,  // Expecting the LF that follows the CR
    P_ENDSTREAM_TOKEN = 8,  // Searching for the "endstream" keyword
    P_ENDOBJ_TOKEN    = 9   // Searching for the "endobj" keyword
} p_indirect_object_substate_t;
121
122 static struct filters_s
123 {
124 char *Token;
125 uint8_t Length;
126 uint8_t Type;
127 } Filter_Map[] =
128 {
129 { TOK_DICT_FLATE, (sizeof(TOK_DICT_FLATE)-1), FILE_COMPRESSION_TYPE_DEFLATE },
130 { TOK_DICT_FLATE_ALT, (sizeof(TOK_DICT_FLATE_ALT)-1), FILE_COMPRESSION_TYPE_DEFLATE },
131 { TOK_DICT_NULL, (sizeof(TOK_DICT_NULL)-1), FILE_COMPRESSION_TYPE_NONE },
132 { NULL, 0, FILE_COMPRESSION_TYPE_NONE }
133 };
134
135 /* Given a pointer to a /Filter value token, return the
136 associated compression type from the Filter_Map. */
Get_Decomp_Type(uint8_t * Token,uint8_t Length)137 static inline uint8_t Get_Decomp_Type( uint8_t *Token, uint8_t Length )
138 {
139 int Index;
140
141 Index=0;
142
143 while( Filter_Map[Index].Token != NULL )
144 {
145 if( (Filter_Map[Index].Length == Length) &&
146 (strncmp( (const char *)Token, Filter_Map[Index].Token, Length ) == 0 ) )
147 return( Filter_Map[Index].Type );
148 else
149 Index += 1;
150 }
151 return( FILE_COMPRESSION_TYPE_NONE );
152 }
153
/* Apply one filter name to the session.
     - Name not mapped to a compression type: raise the 'unsupported
       compression type' alert and clear the session compression type.
     - Mapped name while a type is already selected: raise the 'cascaded
       compression' alert and clear the session compression type.
     - Mapped name and nothing selected yet: record the type in both the
       session and the PDF decompression state. */
static inline void Process_One_Filter( fd_session_p_t SessionPtr, uint8_t *Token, uint8_t Length )
{
    const uint8_t Found_Type = Get_Decomp_Type( Token, Length );

    if( Found_Type == FILE_COMPRESSION_TYPE_NONE )
    {
        File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_UNSUP_COMP_TYPE );
        SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
        return;
    }

    if( SessionPtr->Decomp_Type != FILE_COMPRESSION_TYPE_NONE )
    {
        /* A second supported filter means the stream is cascaded */
        File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_CASC_COMP );
        SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
        return;
    }

    /* First supported filter seen for this stream */
    SessionPtr->Decomp_Type = Found_Type;
    SessionPtr->Decomp_State.PDF.Decomp_Type = Found_Type;
}
182
183 /* Parse the buffered Filter_Spec and create a stream decompression
184 mode and/or event alerts. Return File_Decomp_OK if successfui.
185 Return File_Decomp_Error for a parsing error. */
Process_Filter_Spec(fd_session_p_t SessionPtr)186 static fd_status_t Process_Filter_Spec( fd_session_p_t SessionPtr )
187 {
188 /* The following string contains CHR_ARRAY_OPEN, CHR_ARRAY_CLOSE,
189 and CHR_NAME_SEP. */
190 const uint8_t Delim_Str[] = { "\011\012\014\015\040/[]" };
191 bool Found_Array = false;
192 bool Found_Token = false;
193 uint8_t *Filter;
194 uint8_t Length;
195 uint8_t c;
196 int Index;
197
198 fd_status_t Ret_Code = File_Decomp_OK;
199 fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
200
201 /* Assume the 'no compression' result */
202 SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
203 Filter = NULL;
204 Length = 0;
205
206 for( Index=0; Index<p->Filter_Spec_Index; Index++ )
207 {
208 c = p->Filter_Spec_Buf[Index];
209
210 if( (c == 0) || (strchr( (char *)Delim_Str, (int)c ) != 0) )
211 {
212 if( c == CHR_ARRAY_OPEN )
213 {
214 /* Looks like an array starting, but we are already
215 in an array, or have seen a filter spec already. */
216 if( Found_Array || Found_Token || (Filter != NULL) )
217 {
218 Ret_Code = File_Decomp_Error;
219 break;
220 }
221 else
222 {
223 Found_Array = true;
224 Filter = NULL;
225 Length = 0;
226 continue; // Nothing else to do, goto next char
227 }
228 }
229 else if( c == CHR_ARRAY_CLOSE )
230 {
231 /* We MUST have an array open at this point. */
232 if( !Found_Array )
233 {
234 Ret_Code = File_Decomp_Error;
235 break;
236 }
237 Found_Array = false;
238 }
239
240 /* The white-space or other separator terminates the
241 current filter name we are parsing. */
242 if( (Filter != NULL) && (Length > 0) )
243 {
244 Process_One_Filter( SessionPtr, Filter, Length );
245 Filter = NULL;
246 Length = 0;
247 }
248 }
249 else // non-separator character
250 {
251 /* Start a token if we haven't already. */
252 if( Filter == NULL )
253 {
254 Found_Token = true; // Used in the array syntax checking
255 Filter = &(p->Filter_Spec_Buf[Index]);
256 Length = 1; // We've found one character so far
257 }
258 else
259 {
260 Length += 1;
261 }
262 }
263 }
264
265 /* Indicate an error is we exit the parsing with the array open */
266 if( Found_Array )
267 Ret_Code = File_Decomp_Error;
268
269 /* Any error code implies no compression type */
270 if( Ret_Code == File_Decomp_Error )
271 SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
272 /* Look for case where the filter name ends at the
273 last character of the filter_spec. */
274 else if( (Filter != NULL) && (Length > 0) )
275 Process_One_Filter( SessionPtr, Filter, Length );
276
277 return( Ret_Code );
278 }
279
280
/* Reset the parser for a new file: empty the state stack and start in
   P_COMMENT.  Parsing begins just after the file signature has been
   located, and the signature is syntactically a comment. */
static inline void Init_Parser( fd_session_p_t SessionPtr )
{
    fd_PDF_Parse_p_t Parser = &(SessionPtr->Decomp_State.PDF.Parse);

    Parser->Parse_Stack_Index = 0;  // Stack is empty
    Parser->State = P_COMMENT;
}
290
Push_State(fd_PDF_Parse_p_t p)291 static inline fd_status_t Push_State( fd_PDF_Parse_p_t p )
292 {
293 fd_PDF_Parse_Stack_p_t StckPtr;
294
295 if( p->Parse_Stack_Index >= (PARSE_STACK_LEN-1) )
296 return( File_Decomp_Error );
297
298 StckPtr = &(p->Parse_Stack[(p->Parse_Stack_Index)++]);
299
300 StckPtr->State = p->State;
301 StckPtr->Sub_State = p->Sub_State;
302
303 return( File_Decomp_OK );
304 }
305
Pop_State(fd_PDF_Parse_p_t p)306 static inline fd_status_t Pop_State( fd_PDF_Parse_p_t p )
307 {
308 fd_PDF_Parse_Stack_p_t StckPtr;
309
310 if( p->Parse_Stack_Index == 0 )
311 return( File_Decomp_Error );
312
313 StckPtr = &(p->Parse_Stack[--(p->Parse_Stack_Index)]);
314
315 p->Elem_Index = 0; // Reset to beginning of token as can't push/pop in mid-token
316 p->State = StckPtr->State;
317 p->Sub_State = StckPtr->Sub_State;
318
319 return( File_Decomp_OK );
320 }
321
322 /* If there's a previous state on the stack, return a pointer to it, else return NULL */
Get_Previous_State(fd_PDF_Parse_p_t p)323 static inline fd_PDF_Parse_Stack_p_t Get_Previous_State( fd_PDF_Parse_p_t p )
324 {
325 if( p->Parse_Stack_Index == 0 )
326 return( (fd_PDF_Parse_Stack_p_t)NULL );
327
328 return( &(p->Parse_Stack[(p->Parse_Stack_Index)-1]) );
329 }
330
331 /* Objects are the heart and soul of the PDF. In particular, we need to concentrate on Dictionary
332 objects and objects that map to the Filter element in Dictionaries. 'null' is a valid object'.
333 Objects can be recursively composed of arrays of objects. In our limited parsing paradigm, we
334 will only process the contents of top level Dictionaries and ignore deeper levels. We will
335 only explore Dictionary objects within Indirect Objects. */
Handle_State_DICT_OBJECT(fd_session_p_t SessionPtr,uint8_t c)336 static inline fd_status_t Handle_State_DICT_OBJECT( fd_session_p_t SessionPtr, uint8_t c )
337 {
338 char Filter_Tok[] = TOK_DICT_FILT;
339 fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
340
341 /* enter with c being an EOL from the ind obj state */
342 if( p->State != P_DICT_OBJECT )
343 {
344 p->Sub_State = P_DICT_OPEN; // Looking to open a Dict`
345 p->Dict_Nesting_Cnt = 0; // No Dicts are 'active'
346 p->State = P_DICT_OBJECT;
347 p->Filter_Spec_Index = 0;
348 SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
349 return( File_Decomp_OK );
350 }
351
352 switch( p->Sub_State )
353 {
354 /* look for the first angle bracket */
355 case( P_DICT_OPEN ):
356 {
357 if( c == CHR_ANGLE_OPEN )
358 {
359 p->Sub_State = P_DICT_OPEN_TOK;
360 }
361 else if( !IS_WHITESPACE(c) )
362 {
363 /* for other objects, just skip and wait for the close of the
364 indirect object as we don't parse objects other than Dict's. */
365 if( Pop_State( p ) == File_Decomp_Error )
366 return( File_Decomp_Error );
367 }
368 break;
369 }
370 /* now look for the second angle bracket */
371 case( P_DICT_OPEN_TOK ):
372 {
373 if( c == CHR_ANGLE_OPEN )
374 {
375 /* Only ACTIVE if this is the opening of the
376 'base level' Dict, NOT a nested one. */
377 if( p->Dict_Nesting_Cnt++ == 0 )
378 {
379 p->Sub_State = P_DICT_ACTIVE;
380 }
381 else
382 {
383 p->Sub_State = P_DICT_SKIP;
384 }
385 }
386 else
387 {
388 /* for other objects, just skip and wait for the close of the
389 indirect object as we don't parse objects other than Dict's. */
390 if( Pop_State( p ) == File_Decomp_Error )
391 return( File_Decomp_Error );
392 }
393 break;
394 }
395
396 case( P_DICT_SKIP ):
397 case( P_DICT_ACTIVE ):
398 {
399 /* Main purpose is to search for the value portion of the
400 /Filter entry. Main loop looks for the /Filter token
401 and handles other diversion such as nested Dict objects.
402 If the /Filter token doesn't exist then we don't fill the
403 Filter_Spec_Buf[]. If in skip mode, no need to look for token. */
404 if( (p->Sub_State == P_DICT_ACTIVE) && c == Filter_Tok[p->Elem_Index++] )
405 {
406 if( Filter_Tok[p->Elem_Index] == '\0' )
407 {
408 p->Sub_State = P_DICT_FILTER;
409 }
410 }
411 else
412 {
413 /* On a mis-match, reset back to the start of the token */
414 p->Elem_Index = 0;
415
416 /* we might find a Sub-Dict while we're looking */
417 if( c == CHR_ANGLE_OPEN )
418 {
419 /* Save where we are, and process the Dict */
420 if( Push_State( p ) != File_Decomp_OK )
421 return( File_Decomp_Error );
422 p->Sub_State = P_DICT_OPEN_TOK;
423 }
424 else if( c == CHR_ANGLE_CLOSE )
425 {
426 if( Push_State( p ) != File_Decomp_OK )
427 return( File_Decomp_Error );
428 p->Sub_State = P_DICT_CLOSE_TOK;
429 }
430 }
431 break;
432 }
433
434 case( P_DICT_FILTER ):
435 {
436 if( (c == CHR_ANGLE_CLOSE) ||
437 ((c == CHR_NAME_SEP) && (p->Dict_Nesting_Cnt==3) )) // See the large comment below
438 {
439 if( c == CHR_ANGLE_CLOSE )
440 {
441 if( Push_State( p ) != File_Decomp_OK )
442 return( File_Decomp_Error );
443 p->Sub_State = P_DICT_CLOSE_TOK;
444 }
445 else
446 {
447 p->Sub_State = P_DICT_SKIP;
448 }
449 if( (Process_Filter_Spec( SessionPtr ) == File_Decomp_Error) )
450 return( File_Decomp_Error );
451 }
452 else
453 {
454 /* Since we don't have a full object parse, we need to assure
455 that we capture the entire filter spec string. The '>' is always
456 a terminator, but we also want to terminate on the next /Name entry
457 after a possible array of /Names. The Dict_Nesting_Cnt is used to
458 step through the transition options. The '/' character is only a valid
459 filter spec terminator if we've seen a valid array or one /Name entry. */
460 if( (c == CHR_NAME_SEP) && (p->Dict_Nesting_Cnt==1) )
461 p->Dict_Nesting_Cnt = 3;
462 else if( (c == CHR_ARRAY_OPEN) && (p->Dict_Nesting_Cnt==1) )
463 p->Dict_Nesting_Cnt = 2;
464 else if( (c == CHR_ARRAY_CLOSE) && (p->Dict_Nesting_Cnt==2) )
465 p->Dict_Nesting_Cnt = 3;
466
467 if( p->Filter_Spec_Index < (FILTER_SPEC_BUF_LEN-1) )
468 {
469 p->Filter_Spec_Buf[p->Filter_Spec_Index++] = c;
470 }
471 else
472 return( File_Decomp_Error );
473 }
474 break;
475 }
476
477 case( P_DICT_CLOSE_TOK ):
478 {
479 if( c == CHR_ANGLE_CLOSE )
480 {
481 /* Pop the temp state just prior to the first > */
482 if( Pop_State( p ) == File_Decomp_Error )
483 return( File_Decomp_Error );
484
485 /* Pop back to the state before the <<. */
486 /* But not so fast... Look at what state/sub-state we are popping
487 back to. If it's IND_OBJ, AND we have an active filter type,
488 we don't want to scan to the end of the stream but rather the beginning
489 of the stream. */
490 if( SessionPtr->Decomp_Type != FILE_COMPRESSION_TYPE_NONE )
491 {
492 fd_PDF_Parse_Stack_p_t StckPtr;
493
494 if( (StckPtr = Get_Previous_State( p )) == NULL )
495 {
496 /* There MUST be a previous state that got us here. */
497 return( File_Decomp_Error );
498 }
499 else
500 {
501 if( (StckPtr->State == P_IND_OBJ) &&
502 (StckPtr->Sub_State == P_ENDOBJ_TOKEN) )
503 {
504 StckPtr->Sub_State = P_STREAM_TOKEN;
505 }
506 }
507 }
508 if( Pop_State( p ) == File_Decomp_Error )
509 return( File_Decomp_Error );
510 }
511 else
512 /* Return to where we looking (didn't get >>) */
513 if( Pop_State( p ) == File_Decomp_Error )
514 return( File_Decomp_Error );
515 break;
516 }
517
518 default:
519 return( File_Decomp_Error );
520 }
521
522 return( File_Decomp_OK );
523 }
524
Process_Stream(fd_PDF_Parse_p_t p)525 static inline fd_status_t Process_Stream( fd_PDF_Parse_p_t p )
526 {
527 p->Sub_State = P_ENDSTREAM_TOKEN;
528 p->State = P_IND_OBJ;
529
530 if( Push_State( p ) == File_Decomp_Error )
531 return( File_Decomp_Error );
532 else
533 {
534 p->State = P_STREAM;
535 p->Sub_State = 0;
536 }
537 return( File_Decomp_OK );
538 }
539
540 /* Indirect Objects occur only at the top level of the file and comprise the
541 bulk of the file content. */
Handle_State_IND_OBJ(fd_session_p_t SessionPtr,uint8_t c)542 static inline fd_status_t Handle_State_IND_OBJ( fd_session_p_t SessionPtr, uint8_t c )
543 {
544 static uint8_t Ind_Obj_Token[] = { TOK_OBJ_OPEN };
545 static uint8_t Ind_Obj_End_Token[] = { TOK_OBJ_CLOSE };
546 static uint8_t Stream_Token[] = { TOK_STRM_OPEN };
547 static uint8_t Stream_End_Token[] = { TOK_STRM_CLOSE };
548 fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
549
550 /* Upon initial entry, setup state context */
551 if( p->State != P_IND_OBJ )
552 {
553 p->State = P_IND_OBJ;
554 p->Sub_State = P_OBJ_NUMBER;
555 p->Elem_Index = 1;
556 p->Elem_Buf[0] = c;
557 return( File_Decomp_OK );
558 }
559
560 switch( p->Sub_State )
561 {
562 case( P_OBJ_NUMBER ):
563 case( P_GEN_NUMBER ):
564 {
565 if( isdigit( c ) )
566 {
567 if( p->Elem_Index < (sizeof(p->Elem_Buf)-1))
568 {
569 p->Elem_Buf[p->Elem_Index++] = c;
570 }
571 else
572 return( File_Decomp_Error );
573 }
574 else if( c == CHR_SPACE )
575 {
576 uint32_t Value;
577 p->Elem_Buf[p->Elem_Index] = '\0';
578 Value = (uint32_t)strtoul( (const char *)p->Elem_Buf, NULL, 10 );
579 if( p->Sub_State == P_OBJ_NUMBER )
580 {
581 p->Obj_Number = Value;
582 p->Sub_State = P_GEN_NUMBER;
583 p->Elem_Index = 0;
584 }
585 else
586 {
587 p->Gen_Number = Value;
588 p->Sub_State = P_OBJ_TOKEN;
589 p->Elem_Index = 0;
590 }
591 }
592 break;
593 }
594
595 case( P_OBJ_TOKEN ):
596 {
597 if( c == Ind_Obj_Token[p->Elem_Index++] )
598 {
599 if( Ind_Obj_Token[p->Elem_Index] == '\0' )
600 {
601 p->Sub_State = P_OBJ_EOL;
602 break;
603 }
604 }
605 else
606 {
607 return( File_Decomp_Error );
608 }
609 }
610
611 case( P_OBJ_EOL ):
612 {
613 if( IS_EOL(c) )
614 {
615 DEBUG_WRAP(DebugMessage(DEBUG_HTTPINSPECT, "Indirect Object: objnum: %u\n", p->Obj_Number););
616 p->Sub_State = P_ENDOBJ_TOKEN;
617 /* Save our place in the IND_OBJ and go process an OBJECT */
618 if( Push_State( p ) != File_Decomp_OK )
619 return( File_Decomp_Error );
620 return( Handle_State_DICT_OBJECT( SessionPtr, c ) );
621 }
622
623 break;
624 }
625
626
627 case( P_STREAM_TOKEN ):
628 {
629 if( c == Stream_Token[p->Elem_Index++] )
630 {
631 if( Stream_Token[p->Elem_Index] == '\0' )
632 {
633 /* Look for the limited EOL sequence */
634 p->Sub_State = P_STREAM_EOL;
635 }
636 break;
637 }
638 else if( IS_WHITESPACE(c) )
639 {
640 p->Elem_Index = 0; // reset and keep looking
641 }
642 else
643 return( File_Decomp_Error );
644
645 break;
646 }
647
648 case( P_STREAM_EOL ):
649 {
650 if( c == CHR_CR )
651 {
652 /* The next char MUST be a LF or error */
653 p->Sub_State = P_STREAM_LF;
654 }
655 else if( c == CHR_LF )
656 {
657 if( Process_Stream( p ) != File_Decomp_OK )
658 return( File_Decomp_Error );
659 }
660 else
661 return( File_Decomp_Error );
662
663 break;
664 }
665
666 case( P_STREAM_LF ):
667 {
668 if( c == CHR_LF )
669 {
670 if( Process_Stream( p ) != File_Decomp_OK )
671 return( File_Decomp_Error );
672 }
673 else
674 return( File_Decomp_Error );
675 break;
676 }
677
678 case( P_ENDSTREAM_TOKEN ):
679 {
680 if( c == Stream_End_Token[p->Elem_Index++] )
681 {
682 if( Stream_End_Token[p->Elem_Index] == '\0' )
683 {
684 p->Sub_State = P_ENDOBJ_TOKEN;
685 }
686 }
687 else
688 {
689 p->Elem_Index = 0; // reset and keep looking
690 }
691
692 break;
693 }
694
695 case( P_ENDOBJ_TOKEN ):
696 {
697 if( c == Ind_Obj_End_Token[p->Elem_Index++] )
698 {
699 if( Ind_Obj_End_Token[p->Elem_Index] == '\0' )
700 {
701 /* we found the end of the indirect object, return
702 back to the parent state (always START in this case) */
703 return( Pop_State( p ) );
704 }
705 }
706 else
707 {
708 /* Since we don't necessarily handle all object types correctly,
709 we will spin here searching for the end token. Not the best,
710 but should work if we don't have a full object parser. */
711 p->Elem_Index = 0; // reset and keep looking
712
713 }
714
715 break;
716 }
717
718 default:
719 return( File_Decomp_Error );
720 }
721
722 return( File_Decomp_OK );
723 }
724
725 /* A simple state machine to process the xref/trailer/startxref file segments. No
726 semantic processing and only rough syntactical processing to allow us to skip through
727 this segment. */
Handle_State_XREF(fd_session_p_t SessionPtr,uint8_t c)728 static inline fd_status_t Handle_State_XREF( fd_session_p_t SessionPtr, uint8_t c )
729 {
730 static uint8_t *Xref_Tok;
731 uint8_t Xref_End_Tok[] = { TOK_XRF_END };
732 fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
733
734 if( p->State != P_XREF )
735 {
736 p->Sub_State = P_XREF_TOKEN;
737 p->Elem_Index = 1; // Aready matched the first char in START state
738 p->State = P_XREF;
739 Xref_Tok = (uint8_t *)((c == TOK_XRF_XREF[0]) ? TOK_XRF_XREF : TOK_XRF_STARTXREF);
740 return( File_Decomp_OK );
741 }
742
743 switch( p->Sub_State )
744 {
745 case( P_XREF_TOKEN ):
746 {
747 if( c == Xref_Tok[p->Elem_Index++] )
748 {
749 if( Xref_Tok[p->Elem_Index] == '\0' )
750 {
751 p->Elem_Index = 0;
752 p->Sub_State = P_XREF_END_TOKEN;
753 }
754 }
755 else
756 {
757 return( File_Decomp_Error );
758 }
759 break;
760 }
761
762 case( P_XREF_END_TOKEN ):
763 {
764 if( c == Xref_End_Tok[p->Elem_Index++] )
765 {
766 if( Xref_End_Tok[p->Elem_Index] == '\0' )
767 {
768 p->State = P_START;
769 }
770 }
771 else
772 {
773 /* Since we don't necessarily handle all xref content correctly,
774 we will spin here searching for the end token. Not the best,
775 but should work if we don't have a full object parser. */
776 p->Elem_Index = 0; // reset and keep looking
777
778 }
779
780 break;
781 }
782
783 default:
784 return( File_Decomp_Error );
785 }
786
787 return( File_Decomp_OK );
788 }
789
Handle_State_START(fd_session_p_t SessionPtr,uint8_t c)790 static inline fd_status_t Handle_State_START( fd_session_p_t SessionPtr, uint8_t c )
791 {
792 fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
793 /* Skip any whitespace. This will include
794 the LF as part of a <CRLF> EOL token. */
795 if( IS_WHITESPACE(c) )
796 {
797 return( File_Decomp_OK );
798 }
799 if( c == CHR_COMMENT )
800 {
801 p->State = P_COMMENT;
802 }
803 else if( isdigit( c ) )
804 {
805 /* Save state and process an indirect object */
806 if( Push_State( p ) != File_Decomp_OK )
807 return( File_Decomp_Error );
808 return( Handle_State_IND_OBJ( SessionPtr, c ) );
809 }
810 else if( (c == TOK_XRF_XREF[0]) || (c == TOK_XRF_STARTXREF[0]) )
811 {
812 /* Save state and process the xref block */
813 if( Push_State( p ) != File_Decomp_OK )
814 return( File_Decomp_Error );
815 return( Handle_State_XREF( SessionPtr, c ) );
816 }
817 else if( !(IS_WHITESPACE(c)) )
818 {
819 /* If is not an ind_obj started, or a comment starting, then
820 we don't know what it is, so return an error. */
821 return( File_Decomp_Error );
822 }
823
824 return( File_Decomp_OK );
825 }
826
827 /* Incrementally search the incoming data for a PDF compressed stream
828 (of the type that we can decompress). Move bytes to outgoing data
829 up to the beginning of the compressed segment. If the FILE_REVERT_BIT
830 is set in the Session, remove the /Filter spec that was located by
831 replacing the name with null. */
832
833 /* Parse file until input blocked or stream located. */
Locate_Stream_Beginning(fd_session_p_t SessionPtr)834 static fd_status_t Locate_Stream_Beginning( fd_session_p_t SessionPtr )
835 {
836 fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
837 fd_status_t Ret_Code = File_Decomp_OK;
838 uint8_t c;
839
840 while( 1 )
841 {
842 /* No reason to parse if there's no input or
843 room for output. */
844 if( SessionPtr->Avail_In == 0 )
845 return( File_Decomp_BlockIn );
846 if( SessionPtr->Avail_Out == 0 )
847 return( File_Decomp_BlockOut );
848
849 /* Get next byte in input queue */
850 c = *SessionPtr->Next_In;
851
852 switch( p->State )
853 {
854 /* The 'ground' state of the parser. All indirect objects
855 should be located at this level. */
856 case( P_START ):
857 {
858 if( (Ret_Code = Handle_State_START( SessionPtr, c )) != File_Decomp_OK )
859 return( Ret_Code );
860 break;
861 }
862
863 case( P_COMMENT ):
864 {
865 /* CR or LF closes the comment. The optional LF
866 after a CR will be considered whitespace and
867 removed in the P_START state. */
868 if( IS_EOL(c) )
869 p->State = P_START;
870 break;
871 }
872
873 case( P_IND_OBJ ):
874 {
875 if( (Ret_Code = Handle_State_IND_OBJ( SessionPtr, c )) != File_Decomp_OK )
876 return( Ret_Code );
877 break;
878 }
879
880 case( P_DICT_OBJECT ):
881 {
882 if( (Ret_Code = Handle_State_DICT_OBJECT( SessionPtr, c )) != File_Decomp_OK )
883 return( Ret_Code );
884 break;
885 }
886
887 case( P_XREF ):
888 {
889 if( (Ret_Code = Handle_State_XREF( SessionPtr, c )) != File_Decomp_OK )
890 return( Ret_Code );
891 break;
892 }
893
894 case( P_STREAM ):
895 {
896 return( File_Decomp_Complete );
897 }
898
899 default:
900 return( File_Decomp_Error );
901 }
902 /* After parsing, move the byte from the input to the
903 output stream. We can only be here if there's input
904 available and output space. */
905 (void)Move_1(SessionPtr);
906 }
907 }
908
Init_Stream(fd_session_p_t SessionPtr)909 static fd_status_t Init_Stream( fd_session_p_t SessionPtr )
910 {
911 fd_PDF_p_t StPtr = &(SessionPtr->Decomp_State.PDF);
912
913 switch( StPtr->Decomp_Type )
914 {
915 case FILE_COMPRESSION_TYPE_DEFLATE:
916 {
917 int z_ret;
918
919 z_stream *z_s = &(StPtr->PDF_Decomp_State.Deflate.StreamDeflate);
920
921 memset( (char *)z_s, 0, sizeof(z_stream));
922
923 z_s->zalloc = (alloc_func)NULL;
924 z_s->zfree = (free_func)NULL;
925 SYNC_IN(z_s)
926
927 z_ret = inflateInit2(z_s, 47);
928
929 if( z_ret != Z_OK )
930 {
931 File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
932 return( File_Decomp_Error );
933 }
934
935 break;
936 }
937 default:
938 return( File_Decomp_Error );
939 }
940
941 return( File_Decomp_OK );
942 }
943
Decomp_Stream(fd_session_p_t SessionPtr)944 static fd_status_t Decomp_Stream( fd_session_p_t SessionPtr )
945 {
946 fd_PDF_p_t StPtr = &(SessionPtr->Decomp_State.PDF);
947
948 /* No reason to decompress if there's no input or
949 room for output. */
950 if( SessionPtr->Avail_In == 0 )
951 return( File_Decomp_BlockIn );
952 if( SessionPtr->Avail_Out == 0 )
953 return( File_Decomp_BlockOut );
954
955 switch( StPtr->Decomp_Type )
956 {
957 case FILE_COMPRESSION_TYPE_DEFLATE:
958 {
959 int z_ret;
960 z_stream *z_s = &(StPtr->PDF_Decomp_State.Deflate.StreamDeflate);
961
962 SYNC_IN(z_s)
963
964 z_ret = inflate(z_s, Z_SYNC_FLUSH);
965
966 SYNC_OUT(z_s)
967
968 if( z_ret == Z_STREAM_END )
969 {
970 return( File_Decomp_Complete );
971 }
972
973 if( z_ret != Z_OK )
974 {
975 DEBUG_WRAP(DebugMessage(DEBUG_HTTPINSPECT, "Decompression Error: objnum: %u\n", StPtr->Parse.Obj_Number););
976 File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
977 return( File_Decomp_Error );
978 }
979
980 break;
981 }
982 default:
983 return( File_Decomp_Error );
984 }
985
986 return( File_Decomp_OK );
987 }
988
989 /* After processing a stream, close the decompession engine
990 and return the state of the parser. */
Close_Stream(fd_session_p_t SessionPtr)991 static fd_status_t Close_Stream( fd_session_p_t SessionPtr )
992 {
993 /* Put the parser state back where it was interrupted */
994 if( Pop_State( &(SessionPtr->Decomp_State.PDF.Parse) ) == File_Decomp_Error )
995 return( File_Decomp_Error );
996
997 SessionPtr->Decomp_State.PDF.State = PDF_STATE_LOCATE_STREAM;
998
999 return( File_Decomp_OK );
1000 }
1001
1002 /* Abort the decompression sesson upon command from caller. */
File_Decomp_End_PDF(fd_session_p_t SessionPtr)1003 fd_status_t File_Decomp_End_PDF( fd_session_p_t SessionPtr )
1004 {
1005 fd_PDF_p_t StPtr;
1006
1007 if( SessionPtr == NULL )
1008 return( File_Decomp_Error );
1009
1010 StPtr = &(SessionPtr->Decomp_State.PDF);
1011
1012 if( (StPtr->State != PDF_STATE_INIT_STREAM) &&
1013 (StPtr->State != PDF_STATE_PROCESS_STREAM) )
1014 return( File_Decomp_OK );
1015
1016 switch( StPtr->Decomp_Type )
1017 {
1018 case FILE_COMPRESSION_TYPE_DEFLATE:
1019 {
1020 int z_ret;
1021 z_stream *z_s = &(StPtr->PDF_Decomp_State.Deflate.StreamDeflate);
1022
1023 z_ret = inflateEnd(z_s);
1024
1025 if( z_ret != Z_OK )
1026 {
1027 File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
1028 return( File_Decomp_Error );
1029 }
1030
1031 break;
1032 }
1033 default:
1034 return( File_Decomp_Error );
1035 }
1036
1037 return( File_Decomp_OK );
1038 }
1039
1040
1041 /* From caller, initialize PDF state machine. */
File_Decomp_Init_PDF(fd_session_p_t SessionPtr)1042 fd_status_t File_Decomp_Init_PDF( fd_session_p_t SessionPtr )
1043 {
1044 fd_PDF_p_t StPtr;
1045
1046 if( SessionPtr == NULL )
1047 return( File_Decomp_Error );
1048
1049 StPtr = &(SessionPtr->Decomp_State.PDF);
1050
1051 Init_Parser( SessionPtr );
1052
1053 StPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
1054
1055 /* Search for Dictionary/Stream object. */
1056 StPtr->State = PDF_STATE_LOCATE_STREAM;
1057
1058 return( File_Decomp_OK );
1059 }
1060
1061 /* Run the PDF state machine */
File_Decomp_PDF(fd_session_p_t SessionPtr)1062 fd_status_t File_Decomp_PDF( fd_session_p_t SessionPtr )
1063 {
1064 fd_status_t Ret_Code;
1065
1066 if( (SessionPtr == NULL) || (SessionPtr->File_Type != FILE_TYPE_PDF) )
1067 return( File_Decomp_Error );
1068
1069 /* Process all data until blocked */
1070 while( 1 )
1071 {
1072 switch( SessionPtr->Decomp_State.PDF.State )
1073 {
1074 case( PDF_STATE_LOCATE_STREAM ):
1075 {
1076 /* Will return File_Decomp_Complete if/when the start of a valid compressed
1077 stream is located. Decomp_Type will be set. The parsing will be suspended. */
1078 if( (Ret_Code = Locate_Stream_Beginning( SessionPtr ) ) == File_Decomp_Error)
1079 {
1080 SessionPtr->Error_Event = HI_EO_SERVER_PDF_PARSE_FAILURE;
1081 return( File_Decomp_DecompError );
1082 }
1083
1084 /* If we didn't succeed then get more input */
1085 if( Ret_Code != File_Decomp_Complete )
1086 return( Ret_Code );
1087
1088 /* The Parsing state remains, we break out to perform the stream
1089 decompression. */
1090 if( SessionPtr->Decomp_Type == FILE_COMPRESSION_TYPE_NONE )
1091 {
1092 break;
1093 }
1094 else
1095 {
1096 DEBUG_WRAP(DebugMessage(DEBUG_HTTPINSPECT, "Compressed stream of type: %u\n", SessionPtr->Decomp_Type););
1097 SessionPtr->Decomp_State.PDF.State = PDF_STATE_INIT_STREAM;
1098 /* If we've located the beginning of stream, set new state
1099 and fall into next state */
1100 }
1101 }
1102
1103 case( PDF_STATE_INIT_STREAM ):
1104 {
1105 /* Initialize the selected decompression engine. */
1106 Ret_Code = Init_Stream( SessionPtr );
1107 if( Ret_Code != File_Decomp_OK )
1108 {
1109 Ret_Code = File_Decomp_End_PDF( SessionPtr );
1110 if( Close_Stream( SessionPtr ) != File_Decomp_OK )
1111 return( File_Decomp_Error );
1112 File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
1113 break;
1114 }
1115
1116 SessionPtr->Decomp_State.PDF.State = PDF_STATE_PROCESS_STREAM;
1117 /* INTENTIONAL FALL-THROUGH INTO PDF_STATE_PROCESS_STREAM CASE. */
1118 }
1119
1120 case( PDF_STATE_PROCESS_STREAM ):
1121 {
1122 Ret_Code = Decomp_Stream( SessionPtr );
1123 /* Has the decompressor indicated the end of the data */
1124 if( Ret_Code == File_Decomp_Error )
1125 {
1126 Ret_Code = File_Decomp_End_PDF( SessionPtr );
1127 if( Close_Stream( SessionPtr ) != File_Decomp_OK )
1128 return( File_Decomp_Error );
1129 File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
1130 break;
1131 }
1132 /* OK -> circle back for more input */
1133 else if( Ret_Code == File_Decomp_OK )
1134 break;
1135 else if( Ret_Code != File_Decomp_Complete )
1136 return( Ret_Code );
1137
1138 /* Close the decompression engine */
1139 if( (Ret_Code = File_Decomp_End_PDF( SessionPtr ) ) == File_Decomp_Error)
1140 return( File_Decomp_Error);
1141
1142 /* Put the parser state back where it was interrupted */
1143 if( (Close_Stream( SessionPtr) ) == File_Decomp_Error )
1144 return( File_Decomp_Error );
1145
1146 break;
1147 }
1148
1149 default:
1150 return( File_Decomp_Error );
1151 } // switch()
1152 } // while()
1153
1154 return( File_Decomp_OK );
1155 }
1156
1157 #endif
1158