1 /***************************************************************************
2  * Copyright (c) 2009-2010 Open Information Security Foundation
3  * Copyright (c) 2010-2013 Qualys, Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are
8  * met:
9  *
10  * - Redistributions of source code must retain the above copyright
11  *   notice, this list of conditions and the following disclaimer.
12 
13  * - Redistributions in binary form must reproduce the above copyright
14  *   notice, this list of conditions and the following disclaimer in the
15  *   documentation and/or other materials provided with the distribution.
16 
17  * - Neither the name of the Qualys, Inc. nor the names of its
18  *   contributors may be used to endorse or promote products derived from
19  *   this software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  ***************************************************************************/
33 
34 /**
35  * @file
36  * @author Ivan Ristic <ivanr@webkreator.com>
37  */
38 
39 #ifndef _HTP_MULTIPART_PRIVATE_H
40 #define	_HTP_MULTIPART_PRIVATE_H
41 
42 #ifdef __cplusplus
43 extern "C" {
44 #endif
45 
46 #include "htp_multipart.h"
47 
/*
 * Numeric codes used to classify a parameter as "name", "filename" or
 * anything else.
 * NOTE(review): the CD_ prefix and the htp_mpart_part_parse_c_d()
 * declaration suggest these label Content-Disposition parameters --
 * confirm against the parser implementation.
 */
#define CD_PARAM_OTHER                  0
#define CD_PARAM_NAME                   1
#define CD_PARAM_FILENAME               2

/*
 * NOTE(review): presumably the default value for htp_mpartp_t.extract_limit;
 * what exactly is being limited is not visible in this header -- confirm.
 */
#define DEFAULT_FILE_EXTRACT_LIMIT      16
53 
/**
 * Processing mode of the part that is currently being parsed; stored in
 * htp_mpartp_t.current_part_mode.
 */
enum htp_part_mode_t {
    /** When in line mode, the parser is handling part headers. */
    MODE_LINE = 0,

    /** When in data mode, the parser is consuming part data. */
    MODE_DATA = 1
};
61 
/**
 * States of the multipart parser; stored in htp_mpartp_t.parser_state.
 */
enum htp_multipart_state_t {
    /** Initial state, after the parser has been created but before the boundary initialized. */
    STATE_INIT = 0,

    /** Processing data, waiting for a new line (which might indicate a new boundary). */
    STATE_DATA = 1,

    /** Testing a potential boundary. */
    STATE_BOUNDARY = 2,

    /** Checking the first byte after a boundary. */
    STATE_BOUNDARY_IS_LAST1 = 3,

    /** Checking the second byte after a boundary. */
    STATE_BOUNDARY_IS_LAST2 = 4,

    /** Consuming linear whitespace after a boundary. */
    STATE_BOUNDARY_EAT_LWS = 5,

    /** Used after a CR byte is detected in STATE_BOUNDARY_EAT_LWS. */
    STATE_BOUNDARY_EAT_LWS_CR = 6
};
84 
85 struct htp_mpartp_t {
86     htp_multipart_t multipart;
87 
88     htp_cfg_t *cfg;
89 
90     int extract_files;
91 
92     int extract_limit;
93 
94     char *extract_dir;
95 
96     int file_count;
97 
98     // Parsing callbacks
99 
100     int (*handle_data)(htp_mpartp_t *mpartp, const unsigned char *data,
101             size_t len, int line_end);
102     int (*handle_boundary)(htp_mpartp_t *mpartp);
103 
104     // Internal parsing fields; move into a private structure
105 
106     /**
107      * Parser state; one of MULTIPART_STATE_* constants.
108      */
109     enum htp_multipart_state_t parser_state;
110 
111     /**
112      * Keeps track of the current position in the boundary matching progress.
113      * When this field reaches boundary_len, we have a boundary match.
114      */
115     size_t boundary_match_pos;
116 
117     /**
118      * Pointer to the part that is currently being processed.
119      */
120     htp_multipart_part_t *current_part;
121 
122     /**
123      * This parser consists of two layers: the outer layer is charged with
124      * finding parts, and the internal layer handles part data. There is an
125      * interesting interaction between the two parsers. Because the
126      * outer layer is seeing every line (it has to, in order to test for
127      * boundaries), it also effectively also splits input into lines. The
128      * inner parser deals with two areas: first is the headers, which are
129      * line based, followed by binary data. When parsing headers, the inner
130      * parser can reuse the lines identified by the outer parser. In this
131      * variable we keep the current parsing mode of the part, which helps
132      * us process input data more efficiently. The possible values are
133      * MULTIPART_MODE_LINE and MULTIPART_MODE_DATA.
134      */
135     enum htp_part_mode_t current_part_mode;
136 
137     /**
138      * Used for buffering when a potential boundary is fragmented
139      * across many input data buffers. On a match, the data stored here is
140      * discarded. When there is no match, the buffer is processed as data
141      * (belonging to the currently active part).
142      */
143     bstr_builder_t *boundary_pieces;
144 
145     bstr_builder_t *part_header_pieces;
146 
147     bstr *pending_header_line;
148 
149     /**
150      * Stores text part pieces until the entire part is seen, at which
151      * point the pieces are assembled into a single buffer, and the
152      * builder cleared.
153      */
154     bstr_builder_t *part_data_pieces;
155 
156     /**
157      * The offset of the current boundary candidate, relative to the most
158      * recent data chunk (first unprocessed chunk of data).
159      */
160     size_t boundary_candidate_pos;
161 
162     /**
163      * When we encounter a CR as the last byte in a buffer, we don't know
164      * if the byte is part of a CRLF combination. If it is, then the CR
165      * might be a part of a boundary. But if it is not, it's current
166      * part's data. Because we know how to handle everything before the
167      * CR, we do, and we use this flag to indicate that a CR byte is
168      * effectively being buffered. This is probably a case of premature
169      * optimization, but I am going to leave it in for now.
170      */
171     int cr_aside;
172 
173     /**
174      * When set, indicates that this parser no longer owns names and
175      * values of MULTIPART_PART_TEXT parts. It is used to avoid data
176      * duplication when the parser is used by LibHTP internally.
177      */
178     int gave_up_data;
179 };
180 
181 htp_status_t htp_mpartp_run_request_file_data_hook(htp_multipart_part_t *part, const unsigned char *data, size_t len);
182 
183 htp_status_t htp_mpart_part_process_headers(htp_multipart_part_t *part);
184 
185 htp_status_t htp_mpartp_parse_header(htp_multipart_part_t *part, const unsigned char *data, size_t len);
186 
187 htp_status_t htp_mpart_part_handle_data(htp_multipart_part_t *part, const unsigned char *data, size_t len, int is_line);
188 
189 int htp_mpartp_is_boundary_character(int c);
190 
191 htp_multipart_part_t *htp_mpart_part_create(htp_mpartp_t *parser);
192 
193 htp_status_t htp_mpart_part_finalize_data(htp_multipart_part_t *part);
194 
195 void htp_mpart_part_destroy(htp_multipart_part_t *part, int gave_up_data);
196 
197 htp_status_t htp_mpart_part_parse_c_d(htp_multipart_part_t *part);
198 
199 #ifdef __cplusplus
200 }
201 #endif
202 
203 #endif	/* _HTP_MULTIPART_PRIVATE_H */
204