1 /* $Id$ */
2 /*
3 ** file_decomp_PDF.c
4 **
5 ** Copyright (C) 2014-2021 Cisco and/or its affiliates. All rights reserved.
6 **
7 ** This program is free software; you can redistribute it and/or modify
8 ** it under the terms of the GNU General Public License Version 2 as
9 ** published by the Free Software Foundation.  You may not use, modify or
10 ** distribute this program under any other version of the GNU General
11 ** Public License.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
21 */
22 
23 #ifdef HAVE_CONFIG_H
24 #include "config.h"
25 #endif
26 
27 #include <zlib.h>
28 #include <string.h>
29 #include <stdlib.h>
30 
31 #include "file_decomp.h"
32 #ifdef FILE_DECOMP_PDF
33 #include "file_decomp_PDF.h"
34 #include "hi_eo_events.h"
35 #include "mstring.h"
36 
/* Define characters and tokens in PDF grammar */
#define TOK_STRM_OPEN      "stream"
#define TOK_STRM_CLOSE     "endstream"

#define TOK_OBJ_OPEN       "obj"
#define TOK_OBJ_CLOSE      "endobj"

#define TOK_DICT_OPEN      "<<"
#define TOK_DICT_CLOSE     ">>"
#define TOK_DICT_FILT      "Filter"
#define TOK_DICT_FLATE     "FlateDecode"
#define TOK_DICT_FLATE_ALT "Fl"
#define TOK_DICT_PARMS     "DecodeParms"
#define TOK_DICT_PARMS_ALT "DP"
#define TOK_DICT_LENGTH    "Length"
#define TOK_DICT_NULL      "null"
#define TOK_DICT_NULL_FILT " null "  // Enclose the null object in spaces
#define TOK_XRF_XREF       "xref"
#define TOK_XRF_TRAILER    "trailer"
#define TOK_XRF_STARTXREF  "startxref"
#define TOK_XRF_END        "%%EOF"

/* HT, LF, FF, CR and SP.  NUL is also treated as white-space, see
   IS_WHITESPACE() below. */
#define WHITESPACE_STRING  "\011\012\014\015\040" // plus \000

#define TOK_EOL_CR         "\r"
#define TOK_EOL_LF         "\n"
#define TOK_EOL_CRLF       "\r\n"

#define CHR_CR             '\r'
#define CHR_LF             '\n'

#define CHR_COMMENT        '%'

#define CHR_ARRAY_OPEN     '['
#define CHR_ARRAY_CLOSE    ']'

#define CHR_ANGLE_OPEN     '<'
#define CHR_ANGLE_CLOSE    '>'

#define CHR_SPACE          ' '
#define CHR_NAME_SEP       '/'

/* Character class tests.  The argument is fully parenthesized so that any
   expression (e.g. a conditional expression) can be passed safely.  Note
   that the argument is still evaluated more than once, so it must not have
   side effects. */
#define IS_WHITESPACE(c) ((strchr(WHITESPACE_STRING, (int)(c)) != NULL) || ((c) == 0))
#define IS_EOL(c) (((c) == CHR_CR) || ((c) == CHR_LF))
81 
/* Top level parser states.  Values start at 1 and are consecutive. */
typedef enum p_states
{
    P_START       = 1,  // Ground state, nothing 'open'
    P_COMMENT     = 2,  // Inside a comment (the state Init_Parser() selects)
    P_IND_OBJ     = 3,  // Indirect Object - uses the Sub_State
    P_XREF        = 4,  // The combined xref, trailer, startxref top level items
    P_DICT_OBJECT = 5,  // A dictionary object
    P_STREAM      = 6   // A pseudo state used to process a stream object
} p_state_t;
92 
/* Sub-states used while the parser is in the P_XREF state. */
typedef enum p_xref_substates
{
    P_XREF_TOKEN     = 1,  // Matching the remainder of the opening keyword
    P_XREF_END_TOKEN = 2   // Scanning for the terminating end token
} p_xref_t;
98 
/* Sub-states used while the parser is in the P_DICT_OBJECT state. */
typedef enum p_dict_substates
{
    P_DICT_OPEN      = 1,  // Looking for the first '<'
    P_DICT_OPEN_TOK  = 2,  // Saw one '<', looking for the second
    P_DICT_CLOSE_TOK = 3,  // Saw one '>', looking for the second
    P_DICT_FILTER    = 4,  // Collecting the value of the /Filter entry
    P_DICT_SKIP      = 5,  // Inside a Dict whose content is not examined
    P_DICT_ACTIVE    = 6   // Inside the base level Dict, searching for /Filter
} p_dict_t;
108 
/* Sub-states used while the parser is in the P_IND_OBJ state. */
typedef enum p_indirect_object_substates
{
    P_OBJ_NUMBER      = 1,  // Collecting the object number digits
    P_GEN_NUMBER      = 2,  // Collecting the generation number digits
    P_OBJ_TOKEN       = 3,  // Matching the "obj" keyword
    P_OBJ_EOL         = 4,  // Waiting for the EOL that follows "obj"
    P_STREAM_TOKEN    = 5,  // Matching the "stream" keyword
    P_STREAM_EOL      = 6,  // Expecting CR or LF after "stream"
    P_STREAM_LF       = 7,  // Saw CR after "stream", LF must follow
    P_ENDSTREAM_TOKEN = 8,  // Scanning for the "endstream" keyword
    P_ENDOBJ_TOKEN    = 9   // Scanning for the "endobj" keyword
} p_indirect_object_substate_t;
121 
122 static struct filters_s
123 {
124     char *Token;
125     uint8_t Length;
126     uint8_t Type;
127 } Filter_Map[] =
128 {
129     { TOK_DICT_FLATE, (sizeof(TOK_DICT_FLATE)-1), FILE_COMPRESSION_TYPE_DEFLATE },
130     { TOK_DICT_FLATE_ALT, (sizeof(TOK_DICT_FLATE_ALT)-1), FILE_COMPRESSION_TYPE_DEFLATE },
131     { TOK_DICT_NULL, (sizeof(TOK_DICT_NULL)-1), FILE_COMPRESSION_TYPE_NONE },
132     { NULL, 0, FILE_COMPRESSION_TYPE_NONE }
133 };
134 
135 /* Given a pointer to a /Filter value token, return the
136    associated compression type from the Filter_Map. */
Get_Decomp_Type(uint8_t * Token,uint8_t Length)137 static inline uint8_t Get_Decomp_Type( uint8_t *Token, uint8_t Length )
138 {
139     int Index;
140 
141     Index=0;
142 
143     while( Filter_Map[Index].Token != NULL )
144     {
145         if( (Filter_Map[Index].Length == Length) &&
146             (strncmp( (const char *)Token, Filter_Map[Index].Token, Length ) == 0 ) )
147             return( Filter_Map[Index].Type );
148         else
149             Index += 1;
150     }
151     return( FILE_COMPRESSION_TYPE_NONE );
152 }
153 
/* Apply a single filter name to the session:
     - name has no supported type: raise the 'unsupported compression type'
       alert and clear the session Decomp_Type.
     - supported type and none selected yet: record it in the session and
       in the PDF decompression state.
     - supported type but one is already selected: raise the 'cascaded
       compression' alert and clear the session Decomp_Type. */
static inline void Process_One_Filter( fd_session_p_t SessionPtr, uint8_t *Token, uint8_t Length )
{
    const uint8_t Found_Type = Get_Decomp_Type( Token, Length );

    if( Found_Type == FILE_COMPRESSION_TYPE_NONE )
    {
        File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_UNSUP_COMP_TYPE );
        SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
        return;
    }

    if( SessionPtr->Decomp_Type == FILE_COMPRESSION_TYPE_NONE )
    {
        /* First matching, supported filter type */
        SessionPtr->Decomp_Type = Found_Type;
        SessionPtr->Decomp_State.PDF.Decomp_Type = Found_Type;
        return;
    }

    /* A supported filter was already recorded: cascaded filters */
    File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_CASC_COMP );
    SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
}
182 
183 /* Parse the buffered Filter_Spec and create a stream decompression
184    mode and/or event alerts.  Return File_Decomp_OK if successfui.
185    Return File_Decomp_Error for a parsing error. */
Process_Filter_Spec(fd_session_p_t SessionPtr)186 static fd_status_t Process_Filter_Spec( fd_session_p_t SessionPtr )
187 {
188     /* The following string contains CHR_ARRAY_OPEN, CHR_ARRAY_CLOSE,
189        and CHR_NAME_SEP. */
190     const uint8_t Delim_Str[] = { "\011\012\014\015\040/[]" };
191     bool Found_Array = false;
192     bool Found_Token = false;
193     uint8_t *Filter;
194     uint8_t Length;
195     uint8_t c;
196     int Index;
197 
198     fd_status_t Ret_Code = File_Decomp_OK;
199     fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
200 
201     /* Assume the 'no compression' result */
202     SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
203     Filter = NULL;
204     Length = 0;
205 
206     for( Index=0; Index<p->Filter_Spec_Index; Index++ )
207     {
208         c = p->Filter_Spec_Buf[Index];
209 
210         if( (c == 0) || (strchr( (char *)Delim_Str, (int)c ) != 0) )
211         {
212             if( c == CHR_ARRAY_OPEN )
213             {
214                 /* Looks like an array starting, but we are already
215                    in an array, or have seen a filter spec already.  */
216                 if( Found_Array || Found_Token || (Filter != NULL) )
217                 {
218                     Ret_Code = File_Decomp_Error;
219                     break;
220                 }
221                 else
222                 {
223                     Found_Array = true;
224                     Filter = NULL;
225                     Length = 0;
226                     continue;  // Nothing else to do, goto next char
227                 }
228             }
229             else if( c == CHR_ARRAY_CLOSE )
230             {
231                 /* We MUST have an array open at this point. */
232                 if( !Found_Array )
233                 {
234                     Ret_Code = File_Decomp_Error;
235                     break;
236                 }
237                 Found_Array = false;
238             }
239 
240             /* The white-space or other separator terminates the
241                current filter name we are parsing. */
242             if( (Filter != NULL) && (Length > 0) )
243             {
244                 Process_One_Filter( SessionPtr, Filter, Length );
245                 Filter = NULL;
246                 Length = 0;
247             }
248         }
249         else  // non-separator character
250         {
251             /* Start a token if we haven't already. */
252             if( Filter == NULL )
253             {
254                 Found_Token = true;  // Used in the array syntax checking
255                 Filter = &(p->Filter_Spec_Buf[Index]);
256                 Length = 1;  // We've found one character so far
257             }
258             else
259             {
260                 Length += 1;
261             }
262         }
263     }
264 
265     /* Indicate an error is we exit the parsing with the array open */
266     if( Found_Array )
267         Ret_Code = File_Decomp_Error;
268 
269     /* Any error code implies no compression type */
270     if( Ret_Code == File_Decomp_Error )
271         SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
272     /* Look for case where the filter name ends at the
273        last character of the filter_spec. */
274     else if( (Filter != NULL) && (Length > 0) )
275         Process_One_Filter( SessionPtr, Filter, Length );
276 
277     return( Ret_Code );
278 }
279 
280 
/* Reset the PDF parser of a session: empty state stack, P_COMMENT state.
   The parser starts in the P_COMMENT state because parsing begins just
   after the file signature is located, and the signature is syntactically
   a comment. */
static inline void Init_Parser( fd_session_p_t SessionPtr )
{
    fd_PDF_Parse_p_t Parser = &(SessionPtr->Decomp_State.PDF.Parse);

    Parser->Parse_Stack_Index = 0;  // Stack is empty
    Parser->State = P_COMMENT;
}
290 
Push_State(fd_PDF_Parse_p_t p)291 static inline fd_status_t Push_State( fd_PDF_Parse_p_t p )
292 {
293     fd_PDF_Parse_Stack_p_t StckPtr;
294 
295     if( p->Parse_Stack_Index >= (PARSE_STACK_LEN-1) )
296         return( File_Decomp_Error );
297 
298     StckPtr = &(p->Parse_Stack[(p->Parse_Stack_Index)++]);
299 
300     StckPtr->State = p->State;
301     StckPtr->Sub_State = p->Sub_State;
302 
303     return( File_Decomp_OK );
304 }
305 
Pop_State(fd_PDF_Parse_p_t p)306 static inline fd_status_t Pop_State( fd_PDF_Parse_p_t p )
307 {
308     fd_PDF_Parse_Stack_p_t StckPtr;
309 
310     if( p->Parse_Stack_Index == 0 )
311         return( File_Decomp_Error );
312 
313     StckPtr = &(p->Parse_Stack[--(p->Parse_Stack_Index)]);
314 
315     p->Elem_Index = 0;  // Reset to beginning of token as can't push/pop in mid-token
316     p->State = StckPtr->State;
317     p->Sub_State = StckPtr->Sub_State;
318 
319     return( File_Decomp_OK );
320 }
321 
322 /* If there's a previous state on the stack, return a pointer to it, else return NULL */
Get_Previous_State(fd_PDF_Parse_p_t p)323 static inline fd_PDF_Parse_Stack_p_t Get_Previous_State( fd_PDF_Parse_p_t p )
324 {
325     if( p->Parse_Stack_Index == 0 )
326         return( (fd_PDF_Parse_Stack_p_t)NULL );
327 
328     return( &(p->Parse_Stack[(p->Parse_Stack_Index)-1]) );
329 }
330 
331 /* Objects are the heart and soul of the PDF.  In particular, we need to concentrate on Dictionary
332    objects and objects that map to the Filter element in Dictionaries.  'null' is a valid object'.
333    Objects can be recursively composed of arrays of objects. In our limited parsing paradigm, we
334    will only process the contents of top level Dictionaries and ignore deeper levels.  We will
335    only explore Dictionary objects within Indirect Objects.  */
Handle_State_DICT_OBJECT(fd_session_p_t SessionPtr,uint8_t c)336 static inline fd_status_t Handle_State_DICT_OBJECT( fd_session_p_t SessionPtr, uint8_t c )
337 {
338     char Filter_Tok[] = TOK_DICT_FILT;
339     fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
340 
341     /* enter with c being an EOL from the ind obj state */
342     if( p->State != P_DICT_OBJECT )
343     {
344         p->Sub_State = P_DICT_OPEN;  // Looking to open a Dict`
345         p->Dict_Nesting_Cnt = 0;  // No Dicts are 'active'
346         p->State = P_DICT_OBJECT;
347         p->Filter_Spec_Index = 0;
348         SessionPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
349         return( File_Decomp_OK );
350     }
351 
352     switch( p->Sub_State )
353     {
354         /* look for the first angle bracket */
355         case( P_DICT_OPEN ):
356         {
357             if( c == CHR_ANGLE_OPEN )
358             {
359                 p->Sub_State = P_DICT_OPEN_TOK;
360             }
361             else if( !IS_WHITESPACE(c) )
362             {
363                 /* for other objects, just skip and wait for the close of the
364                    indirect object as we don't parse objects other than Dict's. */
365                 if( Pop_State( p ) == File_Decomp_Error )
366                     return( File_Decomp_Error );
367             }
368             break;
369         }
370         /* now look for the second angle bracket */
371         case( P_DICT_OPEN_TOK ):
372         {
373             if( c == CHR_ANGLE_OPEN )
374             {
375                 /* Only ACTIVE if this is the opening of the
376                    'base level' Dict, NOT a nested one. */
377                 if( p->Dict_Nesting_Cnt++ == 0 )
378                 {
379                     p->Sub_State = P_DICT_ACTIVE;
380                 }
381                 else
382                 {
383                     p->Sub_State = P_DICT_SKIP;
384                 }
385             }
386             else
387             {
388                 /* for other objects, just skip and wait for the close of the
389                    indirect object as we don't parse objects other than Dict's. */
390                 if( Pop_State( p ) == File_Decomp_Error )
391                     return( File_Decomp_Error );
392             }
393             break;
394         }
395 
396         case( P_DICT_SKIP ):
397         case( P_DICT_ACTIVE ):
398         {
399             /* Main purpose is to search for the value portion of the
400                /Filter entry.  Main loop looks for the /Filter token
401                and handles other diversion such as nested Dict objects.
402                If the /Filter token doesn't exist then we don't fill the
403                Filter_Spec_Buf[].  If in skip mode, no need to look for token. */
404             if( (p->Sub_State == P_DICT_ACTIVE) && c == Filter_Tok[p->Elem_Index++] )
405             {
406                 if( Filter_Tok[p->Elem_Index] == '\0' )
407                 {
408                     p->Sub_State = P_DICT_FILTER;
409                 }
410             }
411             else
412             {
413                 /* On a mis-match, reset back to the start of the token */
414                 p->Elem_Index = 0;
415 
416                 /* we might find a Sub-Dict while we're looking */
417                 if( c == CHR_ANGLE_OPEN )
418                 {
419                     /* Save where we are, and process the Dict */
420                     if( Push_State( p ) != File_Decomp_OK )
421                         return( File_Decomp_Error );
422                     p->Sub_State = P_DICT_OPEN_TOK;
423                 }
424                 else if( c == CHR_ANGLE_CLOSE )
425                 {
426                     if( Push_State( p ) != File_Decomp_OK )
427                         return( File_Decomp_Error );
428                     p->Sub_State = P_DICT_CLOSE_TOK;
429                 }
430             }
431             break;
432         }
433 
434         case( P_DICT_FILTER ):
435         {
436             if( (c == CHR_ANGLE_CLOSE) ||
437                 ((c == CHR_NAME_SEP) && (p->Dict_Nesting_Cnt==3) ))  //  See the large comment below
438             {
439                 if( c == CHR_ANGLE_CLOSE )
440                 {
441                     if( Push_State( p ) != File_Decomp_OK )
442                         return( File_Decomp_Error );
443                     p->Sub_State = P_DICT_CLOSE_TOK;
444                 }
445                 else
446                 {
447                     p->Sub_State = P_DICT_SKIP;
448                 }
449                 if( (Process_Filter_Spec( SessionPtr )  == File_Decomp_Error) )
450                     return( File_Decomp_Error );
451             }
452             else
453             {
454                 /* Since we don't have a full object parse, we need to assure
455                    that we capture the entire filter spec string.  The '>' is always
456                    a terminator, but we also want to terminate on the next /Name entry
457                    after a possible array of /Names.  The Dict_Nesting_Cnt is used to
458                    step through the transition options.  The '/' character is only a valid
459                    filter spec terminator if we've seen a valid array or one /Name entry. */
460                 if( (c == CHR_NAME_SEP) && (p->Dict_Nesting_Cnt==1) )
461                     p->Dict_Nesting_Cnt = 3;
462                 else if( (c == CHR_ARRAY_OPEN) && (p->Dict_Nesting_Cnt==1) )
463                     p->Dict_Nesting_Cnt = 2;
464                 else if( (c == CHR_ARRAY_CLOSE) && (p->Dict_Nesting_Cnt==2) )
465                     p->Dict_Nesting_Cnt = 3;
466 
467                 if( p->Filter_Spec_Index < (FILTER_SPEC_BUF_LEN-1) )
468                 {
469                     p->Filter_Spec_Buf[p->Filter_Spec_Index++] = c;
470                 }
471                 else
472                     return( File_Decomp_Error );
473             }
474             break;
475         }
476 
477         case( P_DICT_CLOSE_TOK ):
478         {
479             if( c == CHR_ANGLE_CLOSE )
480             {
481                 /*  Pop the temp state just prior to the first > */
482                 if( Pop_State( p ) == File_Decomp_Error )
483                     return( File_Decomp_Error );
484 
485                 /* Pop back to the state before the <<.  */
486                 /* But not so fast...  Look at what state/sub-state we are popping
487                    back to.  If it's IND_OBJ, AND we have an active filter type,
488                    we don't want to scan to the end of the stream but rather the beginning
489                    of the stream.  */
490                 if( SessionPtr->Decomp_Type != FILE_COMPRESSION_TYPE_NONE )
491                 {
492                     fd_PDF_Parse_Stack_p_t StckPtr;
493 
494                     if( (StckPtr = Get_Previous_State( p )) == NULL )
495                     {
496                         /* There MUST be a previous state that got us here. */
497                         return( File_Decomp_Error );
498                     }
499                     else
500                     {
501                         if( (StckPtr->State == P_IND_OBJ) &&
502                             (StckPtr->Sub_State == P_ENDOBJ_TOKEN) )
503                         {
504                             StckPtr->Sub_State = P_STREAM_TOKEN;
505                         }
506                     }
507                 }
508                 if( Pop_State( p ) == File_Decomp_Error )
509                     return( File_Decomp_Error );
510             }
511             else
512                 /* Return to where we looking (didn't get >>) */
513                 if( Pop_State( p ) == File_Decomp_Error )
514                     return( File_Decomp_Error );
515             break;
516         }
517 
518         default:
519             return( File_Decomp_Error );
520     }
521 
522     return( File_Decomp_OK );
523 }
524 
Process_Stream(fd_PDF_Parse_p_t p)525 static inline fd_status_t Process_Stream( fd_PDF_Parse_p_t p )
526 {
527     p->Sub_State = P_ENDSTREAM_TOKEN;
528     p->State = P_IND_OBJ;
529 
530     if( Push_State( p ) == File_Decomp_Error )
531         return( File_Decomp_Error );
532     else
533         {
534             p->State = P_STREAM;
535             p->Sub_State = 0;
536         }
537     return( File_Decomp_OK );
538 }
539 
540 /* Indirect Objects occur only at the top level of the file and comprise the
541    bulk of the file content. */
Handle_State_IND_OBJ(fd_session_p_t SessionPtr,uint8_t c)542 static inline fd_status_t Handle_State_IND_OBJ( fd_session_p_t SessionPtr, uint8_t c )
543 {
544     static uint8_t Ind_Obj_Token[] = { TOK_OBJ_OPEN };
545     static uint8_t Ind_Obj_End_Token[] = { TOK_OBJ_CLOSE };
546     static uint8_t Stream_Token[] = { TOK_STRM_OPEN };
547     static uint8_t Stream_End_Token[] = { TOK_STRM_CLOSE };
548     fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
549 
550     /* Upon initial entry, setup state context */
551     if( p->State != P_IND_OBJ )
552     {
553         p->State = P_IND_OBJ;
554         p->Sub_State = P_OBJ_NUMBER;
555         p->Elem_Index = 1;
556         p->Elem_Buf[0] = c;
557         return( File_Decomp_OK );
558     }
559 
560     switch( p->Sub_State )
561     {
562         case( P_OBJ_NUMBER ):
563         case( P_GEN_NUMBER ):
564         {
565             if( isdigit( c ) )
566             {
567                 if( p->Elem_Index < (sizeof(p->Elem_Buf)-1))
568                 {
569                     p->Elem_Buf[p->Elem_Index++] = c;
570                 }
571                 else
572                     return( File_Decomp_Error );
573             }
574             else if( c == CHR_SPACE )
575             {
576                 uint32_t Value;
577                 p->Elem_Buf[p->Elem_Index] = '\0';
578                 Value = (uint32_t)strtoul( (const char *)p->Elem_Buf, NULL, 10 );
579                 if( p->Sub_State == P_OBJ_NUMBER )
580                 {
581                     p->Obj_Number = Value;
582                     p->Sub_State = P_GEN_NUMBER;
583                     p->Elem_Index = 0;
584                 }
585                 else
586                 {
587                     p->Gen_Number = Value;
588                     p->Sub_State = P_OBJ_TOKEN;
589                     p->Elem_Index = 0;
590                 }
591             }
592             break;
593         }
594 
595         case( P_OBJ_TOKEN ):
596         {
597             if( c == Ind_Obj_Token[p->Elem_Index++] )
598             {
599                 if( Ind_Obj_Token[p->Elem_Index] == '\0' )
600                 {
601                     p->Sub_State = P_OBJ_EOL;
602                     break;
603                 }
604             }
605             else
606             {
607                 return( File_Decomp_Error );
608             }
609         }
610 
611         case( P_OBJ_EOL ):
612         {
613             if( IS_EOL(c) )
614             {
615                 DEBUG_WRAP(DebugMessage(DEBUG_HTTPINSPECT, "Indirect Object: objnum: %u\n", p->Obj_Number););
616                 p->Sub_State = P_ENDOBJ_TOKEN;
617                 /* Save our place in the IND_OBJ and go process an OBJECT */
618                 if( Push_State( p ) != File_Decomp_OK )
619                     return( File_Decomp_Error );
620                 return( Handle_State_DICT_OBJECT( SessionPtr, c ) );
621             }
622 
623             break;
624         }
625 
626 
627         case( P_STREAM_TOKEN ):
628         {
629             if( c == Stream_Token[p->Elem_Index++] )
630             {
631                 if( Stream_Token[p->Elem_Index] == '\0' )
632                 {
633                     /* Look for the limited EOL sequence */
634                     p->Sub_State = P_STREAM_EOL;
635                 }
636                 break;
637             }
638             else if( IS_WHITESPACE(c) )
639             {
640                 p->Elem_Index = 0;  // reset and keep looking
641             }
642             else
643                 return( File_Decomp_Error );
644 
645             break;
646         }
647 
648         case( P_STREAM_EOL ):
649         {
650             if( c == CHR_CR )
651             {
652                 /* The next char MUST be a LF or error */
653                 p->Sub_State = P_STREAM_LF;
654             }
655             else if( c == CHR_LF )
656             {
657                 if( Process_Stream( p ) != File_Decomp_OK )
658                     return( File_Decomp_Error );
659             }
660             else
661                 return( File_Decomp_Error );
662 
663             break;
664         }
665 
666         case( P_STREAM_LF ):
667         {
668             if( c == CHR_LF )
669             {
670                 if( Process_Stream( p ) != File_Decomp_OK )
671                     return( File_Decomp_Error );
672             }
673             else
674                 return( File_Decomp_Error );
675             break;
676         }
677 
678         case( P_ENDSTREAM_TOKEN ):
679         {
680             if( c == Stream_End_Token[p->Elem_Index++] )
681             {
682                 if( Stream_End_Token[p->Elem_Index] == '\0' )
683                 {
684                     p->Sub_State = P_ENDOBJ_TOKEN;
685                 }
686             }
687             else
688             {
689                 p->Elem_Index = 0;  // reset and keep looking
690             }
691 
692             break;
693         }
694 
695         case( P_ENDOBJ_TOKEN ):
696         {
697             if( c == Ind_Obj_End_Token[p->Elem_Index++] )
698             {
699                 if( Ind_Obj_End_Token[p->Elem_Index] == '\0' )
700                 {
701                     /* we found the end of the indirect object, return
702                        back to the parent state (always START in this case) */
703                     return( Pop_State( p ) );
704                 }
705             }
706             else
707             {
708                 /* Since we don't necessarily handle all object types correctly,
709                    we will spin here searching for the end token.  Not the best,
710                    but should work if we don't have a full object parser. */
711                 p->Elem_Index = 0;  // reset and keep looking
712 
713             }
714 
715             break;
716         }
717 
718         default:
719             return( File_Decomp_Error );
720     }
721 
722     return( File_Decomp_OK );
723 }
724 
725 /* A simple state machine to process the xref/trailer/startxref file segments.  No
726    semantic processing and only rough syntactical processing to allow us to skip through
727    this segment. */
Handle_State_XREF(fd_session_p_t SessionPtr,uint8_t c)728 static inline fd_status_t Handle_State_XREF( fd_session_p_t SessionPtr, uint8_t c )
729 {
730     static uint8_t *Xref_Tok;
731     uint8_t Xref_End_Tok[] = { TOK_XRF_END };
732     fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
733 
734     if( p->State != P_XREF )
735     {
736         p->Sub_State = P_XREF_TOKEN;
737         p->Elem_Index = 1;  // Aready matched the first char in START state
738         p->State = P_XREF;
739         Xref_Tok = (uint8_t *)((c == TOK_XRF_XREF[0]) ? TOK_XRF_XREF : TOK_XRF_STARTXREF);
740         return( File_Decomp_OK );
741     }
742 
743     switch( p->Sub_State )
744     {
745         case( P_XREF_TOKEN ):
746         {
747             if( c == Xref_Tok[p->Elem_Index++] )
748             {
749                 if( Xref_Tok[p->Elem_Index] == '\0' )
750                 {
751                     p->Elem_Index = 0;
752                     p->Sub_State = P_XREF_END_TOKEN;
753                 }
754             }
755             else
756             {
757                 return( File_Decomp_Error );
758             }
759             break;
760         }
761 
762         case( P_XREF_END_TOKEN ):
763         {
764             if( c == Xref_End_Tok[p->Elem_Index++] )
765             {
766                 if( Xref_End_Tok[p->Elem_Index] == '\0' )
767                 {
768                     p->State = P_START;
769                 }
770             }
771             else
772             {
773                 /* Since we don't necessarily handle all xref content correctly,
774                    we will spin here searching for the end token.  Not the best,
775                    but should work if we don't have a full object parser. */
776                 p->Elem_Index = 0;  // reset and keep looking
777 
778             }
779 
780             break;
781         }
782 
783         default:
784             return( File_Decomp_Error );
785     }
786 
787     return( File_Decomp_OK );
788 }
789 
Handle_State_START(fd_session_p_t SessionPtr,uint8_t c)790 static inline fd_status_t Handle_State_START( fd_session_p_t SessionPtr, uint8_t c )
791 {
792     fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
793     /* Skip any whitespace.  This will include
794        the LF as part of a <CRLF> EOL token. */
795     if( IS_WHITESPACE(c) )
796     {
797         return( File_Decomp_OK );
798     }
799     if( c == CHR_COMMENT )
800     {
801         p->State = P_COMMENT;
802     }
803     else if( isdigit( c ) )
804     {
805         /* Save state and process an indirect object */
806         if( Push_State( p ) != File_Decomp_OK )
807             return( File_Decomp_Error );
808         return( Handle_State_IND_OBJ( SessionPtr, c ) );
809     }
810     else if( (c == TOK_XRF_XREF[0]) || (c == TOK_XRF_STARTXREF[0]) )
811     {
812         /* Save state and process the xref block */
813         if( Push_State( p ) != File_Decomp_OK )
814             return( File_Decomp_Error );
815          return( Handle_State_XREF( SessionPtr, c ) );
816     }
817     else if( !(IS_WHITESPACE(c)) )
818     {
819         /* If is not an ind_obj started, or a comment starting, then
820            we don't know what it is, so return an error. */
821         return( File_Decomp_Error );
822     }
823 
824     return( File_Decomp_OK );
825 }
826 
827 /* Incrementally search the incoming data for a PDF compressed stream
828    (of the type that we can decompress).  Move bytes to outgoing data
829    up to the beginning of the compressed segment.  If the FILE_REVERT_BIT
830    is set in the Session, remove the /Filter spec that was located by
831    replacing the name with null.  */
832 
833 /* Parse file until input blocked or stream located. */
Locate_Stream_Beginning(fd_session_p_t SessionPtr)834 static fd_status_t Locate_Stream_Beginning( fd_session_p_t SessionPtr )
835 {
836     fd_PDF_Parse_p_t p = &(SessionPtr->Decomp_State.PDF.Parse);
837     fd_status_t Ret_Code = File_Decomp_OK;
838     uint8_t c;
839 
840     while( 1 )
841     {
842         /* No reason to parse if there's no input or
843            room for output. */
844         if( SessionPtr->Avail_In == 0 )
845             return( File_Decomp_BlockIn );
846         if( SessionPtr->Avail_Out == 0 )
847             return( File_Decomp_BlockOut );
848 
849         /* Get next byte in input queue */
850         c = *SessionPtr->Next_In;
851 
852         switch( p->State )
853         {
854             /* The 'ground' state of the parser. All indirect objects
855                should be located at this level. */
856             case( P_START ):
857             {
858                 if( (Ret_Code = Handle_State_START( SessionPtr, c )) != File_Decomp_OK )
859                     return( Ret_Code );
860                 break;
861             }
862 
863             case( P_COMMENT ):
864             {
865                 /* CR or LF closes the comment.  The optional LF
866                    after a CR will be considered whitespace and
867                    removed in the P_START state. */
868                 if( IS_EOL(c) )
869                     p->State = P_START;
870                 break;
871             }
872 
873             case( P_IND_OBJ ):
874             {
875                 if( (Ret_Code = Handle_State_IND_OBJ( SessionPtr, c )) != File_Decomp_OK )
876                     return( Ret_Code );
877                 break;
878             }
879 
880             case( P_DICT_OBJECT ):
881             {
882                 if( (Ret_Code = Handle_State_DICT_OBJECT( SessionPtr, c )) != File_Decomp_OK )
883                     return( Ret_Code );
884                 break;
885             }
886 
887             case( P_XREF ):
888             {
889                 if( (Ret_Code = Handle_State_XREF( SessionPtr, c )) != File_Decomp_OK )
890                     return( Ret_Code );
891                 break;
892             }
893 
894             case( P_STREAM ):
895             {
896                 return( File_Decomp_Complete );
897             }
898 
899             default:
900                 return( File_Decomp_Error );
901         }
902         /* After parsing, move the byte from the input to the
903            output stream.  We can only be here if there's input
904            available and output space. */
905         (void)Move_1(SessionPtr);
906     }
907 }
908 
Init_Stream(fd_session_p_t SessionPtr)909 static fd_status_t Init_Stream( fd_session_p_t SessionPtr )
910 {
911     fd_PDF_p_t StPtr = &(SessionPtr->Decomp_State.PDF);
912 
913     switch( StPtr->Decomp_Type )
914     {
915         case FILE_COMPRESSION_TYPE_DEFLATE:
916         {
917             int z_ret;
918 
919             z_stream *z_s = &(StPtr->PDF_Decomp_State.Deflate.StreamDeflate);
920 
921             memset( (char *)z_s, 0, sizeof(z_stream));
922 
923             z_s->zalloc = (alloc_func)NULL;
924             z_s->zfree = (free_func)NULL;
925             SYNC_IN(z_s)
926 
927             z_ret = inflateInit2(z_s, 47);
928 
929             if( z_ret != Z_OK )
930             {
931                 File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
932                 return( File_Decomp_Error );
933             }
934 
935             break;
936         }
937         default:
938             return( File_Decomp_Error );
939     }
940 
941     return( File_Decomp_OK );
942 }
943 
Decomp_Stream(fd_session_p_t SessionPtr)944 static fd_status_t Decomp_Stream( fd_session_p_t SessionPtr )
945 {
946     fd_PDF_p_t StPtr = &(SessionPtr->Decomp_State.PDF);
947 
948     /* No reason to decompress if there's no input or
949        room for output. */
950     if( SessionPtr->Avail_In == 0 )
951         return( File_Decomp_BlockIn );
952     if( SessionPtr->Avail_Out == 0 )
953         return( File_Decomp_BlockOut );
954 
955     switch( StPtr->Decomp_Type )
956     {
957         case FILE_COMPRESSION_TYPE_DEFLATE:
958         {
959             int z_ret;
960             z_stream *z_s = &(StPtr->PDF_Decomp_State.Deflate.StreamDeflate);
961 
962             SYNC_IN(z_s)
963 
964             z_ret = inflate(z_s, Z_SYNC_FLUSH);
965 
966             SYNC_OUT(z_s)
967 
968             if( z_ret == Z_STREAM_END )
969             {
970                 return( File_Decomp_Complete );
971             }
972 
973             if( z_ret != Z_OK )
974             {
975                 DEBUG_WRAP(DebugMessage(DEBUG_HTTPINSPECT, "Decompression Error: objnum: %u\n", StPtr->Parse.Obj_Number););
976                 File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
977                 return( File_Decomp_Error );
978             }
979 
980             break;
981         }
982         default:
983             return( File_Decomp_Error );
984     }
985 
986     return( File_Decomp_OK );
987 }
988 
989 /* After processing a stream, close the decompession engine
990    and return the state of the parser. */
Close_Stream(fd_session_p_t SessionPtr)991 static fd_status_t Close_Stream( fd_session_p_t SessionPtr )
992 {
993     /* Put the parser state back where it was interrupted */
994     if( Pop_State( &(SessionPtr->Decomp_State.PDF.Parse) ) == File_Decomp_Error )
995         return( File_Decomp_Error );
996 
997     SessionPtr->Decomp_State.PDF.State = PDF_STATE_LOCATE_STREAM;
998 
999     return( File_Decomp_OK );
1000 }
1001 
1002 /* Abort the decompression sesson upon command from caller. */
File_Decomp_End_PDF(fd_session_p_t SessionPtr)1003 fd_status_t File_Decomp_End_PDF( fd_session_p_t SessionPtr )
1004 {
1005     fd_PDF_p_t StPtr;
1006 
1007     if( SessionPtr == NULL )
1008         return( File_Decomp_Error );
1009 
1010     StPtr = &(SessionPtr->Decomp_State.PDF);
1011 
1012     if( (StPtr->State != PDF_STATE_INIT_STREAM) &&
1013         (StPtr->State != PDF_STATE_PROCESS_STREAM) )
1014         return( File_Decomp_OK );
1015 
1016     switch( StPtr->Decomp_Type )
1017     {
1018         case FILE_COMPRESSION_TYPE_DEFLATE:
1019         {
1020             int z_ret;
1021             z_stream *z_s = &(StPtr->PDF_Decomp_State.Deflate.StreamDeflate);
1022 
1023             z_ret = inflateEnd(z_s);
1024 
1025             if( z_ret != Z_OK )
1026             {
1027                 File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
1028                 return( File_Decomp_Error );
1029             }
1030 
1031             break;
1032         }
1033         default:
1034             return( File_Decomp_Error );
1035     }
1036 
1037     return( File_Decomp_OK );
1038 }
1039 
1040 
1041 /* From caller, initialize PDF state machine. */
File_Decomp_Init_PDF(fd_session_p_t SessionPtr)1042 fd_status_t File_Decomp_Init_PDF( fd_session_p_t SessionPtr )
1043 {
1044     fd_PDF_p_t StPtr;
1045 
1046     if( SessionPtr == NULL )
1047         return( File_Decomp_Error );
1048 
1049     StPtr = &(SessionPtr->Decomp_State.PDF);
1050 
1051     Init_Parser( SessionPtr );
1052 
1053     StPtr->Decomp_Type = FILE_COMPRESSION_TYPE_NONE;
1054 
1055     /* Search for Dictionary/Stream object. */
1056     StPtr->State = PDF_STATE_LOCATE_STREAM;
1057 
1058     return( File_Decomp_OK );
1059 }
1060 
1061 /* Run the PDF state machine */
File_Decomp_PDF(fd_session_p_t SessionPtr)1062 fd_status_t File_Decomp_PDF( fd_session_p_t SessionPtr )
1063 {
1064     fd_status_t Ret_Code;
1065 
1066     if( (SessionPtr == NULL) || (SessionPtr->File_Type != FILE_TYPE_PDF) )
1067         return( File_Decomp_Error );
1068 
1069     /* Process all data until blocked */
1070     while( 1 )
1071     {
1072         switch( SessionPtr->Decomp_State.PDF.State )
1073         {
1074             case( PDF_STATE_LOCATE_STREAM ):
1075             {
1076                 /* Will return File_Decomp_Complete if/when the start of a valid compressed
1077                    stream is located.  Decomp_Type will be set. The parsing will be suspended.  */
1078                 if( (Ret_Code = Locate_Stream_Beginning( SessionPtr ) ) == File_Decomp_Error)
1079                 {
1080                     SessionPtr->Error_Event = HI_EO_SERVER_PDF_PARSE_FAILURE;
1081                     return( File_Decomp_DecompError );
1082                 }
1083 
1084                 /* If we didn't succeed then get more input */
1085                 if( Ret_Code != File_Decomp_Complete )
1086                     return( Ret_Code );
1087 
1088                 /* The Parsing state remains, we break out to perform the stream
1089                    decompression. */
1090                 if( SessionPtr->Decomp_Type == FILE_COMPRESSION_TYPE_NONE )
1091                 {
1092                     break;
1093                 }
1094                 else
1095                 {
1096                     DEBUG_WRAP(DebugMessage(DEBUG_HTTPINSPECT, "Compressed stream of type: %u\n", SessionPtr->Decomp_Type););
1097                     SessionPtr->Decomp_State.PDF.State = PDF_STATE_INIT_STREAM;
1098                     /* If we've located the beginning of stream, set new state
1099                        and fall into next state */
1100                 }
1101             }
1102 
1103             case( PDF_STATE_INIT_STREAM ):
1104             {
1105                 /* Initialize the selected decompression engine. */
1106                 Ret_Code = Init_Stream( SessionPtr );
1107                 if( Ret_Code != File_Decomp_OK )
1108                 {
1109                     Ret_Code = File_Decomp_End_PDF( SessionPtr );
1110                     if( Close_Stream( SessionPtr ) != File_Decomp_OK )
1111                         return( File_Decomp_Error );
1112                     File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
1113                     break;
1114                 }
1115 
1116                 SessionPtr->Decomp_State.PDF.State = PDF_STATE_PROCESS_STREAM;
1117                 /* INTENTIONAL FALL-THROUGH INTO PDF_STATE_PROCESS_STREAM CASE. */
1118             }
1119 
1120             case( PDF_STATE_PROCESS_STREAM ):
1121             {
1122                 Ret_Code = Decomp_Stream( SessionPtr );
1123                 /* Has the decompressor indicated the end of the data */
1124                 if( Ret_Code == File_Decomp_Error )
1125                 {
1126                     Ret_Code = File_Decomp_End_PDF( SessionPtr );
1127                     if( Close_Stream( SessionPtr ) != File_Decomp_OK )
1128                         return( File_Decomp_Error );
1129                     File_Decomp_Alert( SessionPtr, HI_EO_SERVER_PDF_DEFL_FAILURE );
1130                     break;
1131                 }
1132                 /* OK -> circle back for more input */
1133                 else if( Ret_Code == File_Decomp_OK )
1134                     break;
1135                 else if( Ret_Code != File_Decomp_Complete )
1136                     return( Ret_Code );
1137 
1138                 /* Close the decompression engine */
1139                 if( (Ret_Code = File_Decomp_End_PDF( SessionPtr ) ) == File_Decomp_Error)
1140                     return( File_Decomp_Error);
1141 
1142                 /* Put the parser state back where it was interrupted */
1143                 if( (Close_Stream( SessionPtr) ) == File_Decomp_Error )
1144                     return( File_Decomp_Error );
1145 
1146                 break;
1147             }
1148 
1149             default:
1150                 return( File_Decomp_Error );
1151         } // switch()
1152     } // while()
1153 
1154     return( File_Decomp_OK );
1155 }
1156 
1157 #endif
1158