1 /***************************************************************************
2 * Copyright (c) 2009-2010 Open Information Security Foundation
3 * Copyright (c) 2010-2013 Qualys, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are
8 * met:
9 *
10 * - Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12
13 * - Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16
17 * - Neither the name of the Qualys, Inc. nor the names of its
18 * contributors may be used to endorse or promote products derived from
19 * this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 ***************************************************************************/
33
34 /**
35 * @file
36 * @author Ivan Ristic <ivanr@webkreator.com>
37 */
38
39 #include "htp_config_auto.h"
40
41 #include "htp_private.h"
42
/**
 * Tests whether a character is linear white space (LWS), that is,
 * a space or a horizontal tab.
 *
 * @param[in] c Character to test.
 * @return 1 if c is SP or HT, 0 otherwise.
 */
int htp_is_lws(int c) {
    return ((c == ' ') || (c == '\t')) ? 1 : 0;
}
53
/**
 * Tests whether a character is one of the HTTP "separators".
 *
 * @param[in] c Character to test.
 * @return 1 if c is a separator, 0 otherwise.
 */
int htp_is_separator(int c) {
    /* separators = "(" | ")" | "<" | ">" | "@"
                  | "," | ";" | ":" | "\" | <">
                  | "/" | "[" | "]" | "?" | "="
                  | "{" | "}" | SP | HT         */
    static const char separators[] = {
        '(', ')', '<', '>', '@',
        ',', ';', ':', '\\', '"',
        '/', '[', ']', '?', '=',
        '{', '}', ' ', '\t'
    };
    size_t i;

    for (i = 0; i < sizeof (separators); i++) {
        if (c == separators[i]) return 1;
    }

    return 0;
}
91
/**
 * Tests whether a character counts as a text character. A horizontal tab
 * is accepted; every other value below 32 is rejected; everything from
 * 32 upwards (including 127 and values above it) is accepted.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a text character, 0 otherwise.
 */
int htp_is_text(int c) {
    return ((c == '\t') || (c >= 32)) ? 1 : 0;
}
103
/**
 * Tests whether a character may appear in an HTTP token.
 *
 * @param[in] c Character to test.
 * @return 1 if c is in the range 32-126 and is not a separator, 0 otherwise.
 */
int htp_is_token(int c) {
    /* token = 1*<any CHAR except CTLs or separators> */
    /* CHAR  = <any US-ASCII character (octets 0 - 127)> */

    // Values below 32 and above 126 are never token characters.
    int in_range = (c >= 32) && (c <= 126);
    if (!in_range) return 0;

    return htp_is_separator(c) ? 0 : 1;
}
117
118 /**
119 * Remove all line terminators (LF, CR or CRLF) from
120 * the end of the line provided as input.
121 *
122 * @return 0 if nothing was removed, 1 if one or more LF characters were removed, or
123 * 2 if one or more CR and/or LF characters were removed.
124 */
htp_chomp(unsigned char * data,size_t * len)125 int htp_chomp(unsigned char *data, size_t *len) {
126 int r = 0;
127
128 // Loop until there's no more stuff in the buffer
129 while (*len > 0) {
130 // Try one LF first
131 if (data[*len - 1] == LF) {
132 (*len)--;
133 r = 1;
134
135 if (*len == 0) return r;
136
137 // A CR is allowed before LF
138 if (data[*len - 1] == CR) {
139 (*len)--;
140 r = 2;
141 }
142 } else if (data[*len - 1] == CR) {
143 (*len)--;
144 r = 1;
145 } else return r;
146 }
147
148 return r;
149 }
150
/**
 * Tests whether a character is white space: space, form feed, vertical tab,
 * horizontal tab, carriage return or line feed.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a white space character, 0 otherwise.
 */
int htp_is_space(int c) {
    if ((c == ' ') || (c == '\t')) return 1;
    if ((c == '\r') || (c == '\n')) return 1;
    if ((c == '\f') || (c == '\v')) return 1;

    return 0;
}
170
171 /**
172 * Converts request method, given as a string, into a number.
173 *
174 * @param[in] method
175 * @return Method number of M_UNKNOWN
176 */
htp_convert_method_to_number(bstr * method)177 int htp_convert_method_to_number(bstr *method) {
178 if (method == NULL) return HTP_M_UNKNOWN;
179
180 // TODO Optimize using parallel matching, or something similar.
181
182 if (bstr_cmp_c(method, "GET") == 0) return HTP_M_GET;
183 if (bstr_cmp_c(method, "PUT") == 0) return HTP_M_PUT;
184 if (bstr_cmp_c(method, "POST") == 0) return HTP_M_POST;
185 if (bstr_cmp_c(method, "DELETE") == 0) return HTP_M_DELETE;
186 if (bstr_cmp_c(method, "CONNECT") == 0) return HTP_M_CONNECT;
187 if (bstr_cmp_c(method, "OPTIONS") == 0) return HTP_M_OPTIONS;
188 if (bstr_cmp_c(method, "TRACE") == 0) return HTP_M_TRACE;
189 if (bstr_cmp_c(method, "PATCH") == 0) return HTP_M_PATCH;
190 if (bstr_cmp_c(method, "PROPFIND") == 0) return HTP_M_PROPFIND;
191 if (bstr_cmp_c(method, "PROPPATCH") == 0) return HTP_M_PROPPATCH;
192 if (bstr_cmp_c(method, "MKCOL") == 0) return HTP_M_MKCOL;
193 if (bstr_cmp_c(method, "COPY") == 0) return HTP_M_COPY;
194 if (bstr_cmp_c(method, "MOVE") == 0) return HTP_M_MOVE;
195 if (bstr_cmp_c(method, "LOCK") == 0) return HTP_M_LOCK;
196 if (bstr_cmp_c(method, "UNLOCK") == 0) return HTP_M_UNLOCK;
197 if (bstr_cmp_c(method, "VERSION-CONTROL") == 0) return HTP_M_VERSION_CONTROL;
198 if (bstr_cmp_c(method, "CHECKOUT") == 0) return HTP_M_CHECKOUT;
199 if (bstr_cmp_c(method, "UNCHECKOUT") == 0) return HTP_M_UNCHECKOUT;
200 if (bstr_cmp_c(method, "CHECKIN") == 0) return HTP_M_CHECKIN;
201 if (bstr_cmp_c(method, "UPDATE") == 0) return HTP_M_UPDATE;
202 if (bstr_cmp_c(method, "LABEL") == 0) return HTP_M_LABEL;
203 if (bstr_cmp_c(method, "REPORT") == 0) return HTP_M_REPORT;
204 if (bstr_cmp_c(method, "MKWORKSPACE") == 0) return HTP_M_MKWORKSPACE;
205 if (bstr_cmp_c(method, "MKACTIVITY") == 0) return HTP_M_MKACTIVITY;
206 if (bstr_cmp_c(method, "BASELINE-CONTROL") == 0) return HTP_M_BASELINE_CONTROL;
207 if (bstr_cmp_c(method, "MERGE") == 0) return HTP_M_MERGE;
208 if (bstr_cmp_c(method, "INVALID") == 0) return HTP_M_INVALID;
209 if (bstr_cmp_c(method, "HEAD") == 0) return HTP_M_HEAD;
210
211 return HTP_M_UNKNOWN;
212 }
213
214 /**
215 * Is the given line empty?
216 *
217 * @param[in] data
218 * @param[in] len
219 * @return 0 or 1
220 */
htp_is_line_empty(unsigned char * data,size_t len)221 int htp_is_line_empty(unsigned char *data, size_t len) {
222 if ((len == 1) ||
223 ((len == 2) && (data[0] == CR) && (data[1] == LF))) {
224 return 1;
225 }
226
227 return 0;
228 }
229
/**
 * Checks whether every byte of a line is a white space character, as
 * determined by isspace(). A zero-length line satisfies the check.
 *
 * @param[in] data Line data.
 * @param[in] len Line length in bytes.
 * @return 1 if no byte fails isspace(), 0 otherwise.
 */
int htp_is_line_whitespace(unsigned char *data, size_t len) {
    while (len > 0) {
        if (!isspace(*data)) return 0;

        data++;
        len--;
    }

    return 1;
}
248
249 /**
250 * Parses Content-Length string (positive decimal number).
251 * White space is allowed before and after the number.
252 *
253 * @param[in] b
254 * @return Content-Length as a number, or -1 on error.
255 */
htp_parse_content_length(bstr * b,htp_connp_t * connp)256 int64_t htp_parse_content_length(bstr *b, htp_connp_t *connp) {
257 size_t len = bstr_len(b);
258 unsigned char * data = (unsigned char *) bstr_ptr(b);
259 size_t pos = 0;
260 int64_t r = 0;
261
262 if (len == 0) return -1003;
263
264 // Ignore junk before
265 while ((pos < len) && (data[pos] < '0' || data[pos] > '9')) {
266 if (!htp_is_lws(data[pos]) && connp != NULL && r == 0) {
267 htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0,
268 "C-L value with extra data in the beginnning");
269 r = -1;
270 }
271 pos++;
272 }
273 if (pos == len) return -1001;
274
275 r = bstr_util_mem_to_pint(data + pos, len - pos, 10, &pos);
276 // Ok to have junk afterwards
277 if (pos < len && connp != NULL) {
278 htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0,
279 "C-L value with extra data in the end");
280 }
281 return r;
282 }
283
/**
 * Parses a chunk length (positive hexadecimal number). Leading CR, LF, space,
 * HT, VT and FF bytes are skipped, and anything following the run of
 * hexadecimal digits is ignored.
 *
 * @param[in] data Chunk length line.
 * @param[in] len Length of the line in bytes.
 * @return Chunk length on success, or a negative number on error: -1004 if
 *         nothing is left after skipping the leading bytes, -1 if the length
 *         is greater than INT32_MAX, or the negative value reported by
 *         htp_parse_positive_integer_whitespace().
 */
int64_t htp_parse_chunked_length(unsigned char *data, size_t len) {
    // Skip leading line feeds and other white space/control characters.
    for (; len > 0; data++, len--) {
        unsigned char c = *data;
        int skippable = (c == 0x0d) || (c == 0x0a) || (c == 0x20) ||
                (c == 0x09) || (c == 0x0b) || (c == 0x0c);

        if (!skippable) break;
    }

    if (len == 0) return -1004;

    // Measure the leading run of hexadecimal digits; whatever follows it
    // is trailing junk and is not handed to the number parser.
    size_t hex_len = 0;
    while (hex_len < len) {
        unsigned char c = data[hex_len];
        int is_hex = isdigit(c) ||
                ((c >= 'a') && (c <= 'f')) || ((c >= 'A') && (c <= 'F'));

        if (!is_hex) break;

        hex_len++;
    }

    int64_t chunk_len = htp_parse_positive_integer_whitespace(data, hex_len, 16);
    if (chunk_len < 0) return chunk_len;

    return (chunk_len > INT32_MAX) ? -1 : chunk_len;
}
324
/**
 * A somewhat forgiving parser for a positive integer in a given base.
 * Only LWS is allowed before and after the number.
 *
 * @param[in] data Input data.
 * @param[in] len Input length in bytes.
 * @param[in] base Numeric base, passed on to bstr_util_mem_to_pint().
 * @return The parsed number on success; a negative number on error: -1003 if
 *         the input is empty, -1001 if it consists solely of LWS, -1002 if
 *         something other than LWS follows the number, or the negative value
 *         reported by bstr_util_mem_to_pint().
 */
int64_t htp_parse_positive_integer_whitespace(unsigned char *data, size_t len, int base) {
    if (len == 0) return -1003;

    // Skip the LWS in front of the number.
    size_t start = 0;
    while ((start < len) && (htp_is_lws(data[start]))) start++;
    if (start == len) return -1001;

    // Parse the number; consumed is reported relative to data + start.
    size_t consumed;
    int64_t value = bstr_util_mem_to_pint(data + start, len - start, base, &consumed);
    if (value < 0) return value;

    // Whatever follows the number has to be LWS.
    size_t i;
    for (i = start + consumed; i < len; i++) {
        if (!htp_is_lws(data[i])) return -1002;
    }

    return value;
}
361
#ifdef HTP_DEBUG

/**
 * Prints one log message to the given stream. The message code is
 * included in the output only when it is not zero.
 *
 * @param[in] stream Stream to print to.
 * @param[in] log Log message to print.
 */
void htp_print_log(FILE *stream, htp_log_t *log) {
    if (log->code == 0) {
        // No code to report.
        fprintf(stream, "[%d][file %s][line %d] %s\n", log->level,
                log->file, log->line, log->msg);
        return;
    }

    fprintf(stream, "[%d][code %d][file %s][line %d] %s\n", log->level,
            log->code, log->file, log->line, log->msg);
}
#endif
380
381 /**
382 * Records one log message.
383 *
384 * @param[in] connp
385 * @param[in] file
386 * @param[in] line
387 * @param[in] level
388 * @param[in] code
389 * @param[in] fmt
390 */
htp_log(htp_connp_t * connp,const char * file,int line,enum htp_log_level_t level,int code,const char * fmt,...)391 void htp_log(htp_connp_t *connp, const char *file, int line, enum htp_log_level_t level, int code, const char *fmt, ...) {
392 if (connp == NULL) return;
393
394 char buf[1024];
395 va_list args;
396
397 // Ignore messages below our log level.
398 if (connp->cfg->log_level < level) {
399 return;
400 }
401
402 va_start(args, fmt);
403
404 int r = vsnprintf(buf, 1024, fmt, args);
405
406 va_end(args);
407
408 if (r < 0) {
409 snprintf(buf, 1024, "[vnsprintf returned error %d]", r);
410 } else if (r >= 1024) {
411 // Indicate overflow with a '+' at the end.
412 buf[1022] = '+';
413 buf[1023] = '\0';
414 }
415
416 // Create a new log entry.
417
418 htp_log_t *log = calloc(1, sizeof (htp_log_t));
419 if (log == NULL) return;
420
421 log->connp = connp;
422 log->file = file;
423 log->line = line;
424 log->level = level;
425 log->code = code;
426 log->msg = strdup(buf);
427
428 htp_list_add(connp->conn->messages, log);
429
430 if (level == HTP_LOG_ERROR) {
431 connp->last_error = log;
432 }
433
434 #ifdef HTP_DEBUG
435 fprintf(stderr, "[LOG] %s\n", log->msg);
436 #endif
437
438 /* coverity[check_return] */
439 htp_hook_run_all(connp->cfg->hook_log, log);
440 }
441
/**
 * Determines if the given line is a continuation (of some previous line),
 * judging by its first byte only.
 *
 * @param[in] data Line data.
 * @param[in] len Line length in bytes.
 * @return 1 if the first byte is a folding character, 0 if it is not, and
 *         -1 on error (NULL pointer or length zero).
 */
int htp_connp_is_line_folded(unsigned char *data, size_t len) {
    if (data == NULL) return -1;
    if (len == 0) return -1;

    return htp_is_folding_char(data[0]);
}
453
/**
 * Tests whether a character marks a folded (continuation) line: either
 * LWS or a NUL byte.
 *
 * @param[in] c Character to test.
 * @return 1 if c is LWS or 0, 0 otherwise.
 */
int htp_is_folding_char(int c) {
    return (htp_is_lws(c) || (c == 0)) ? 1 : 0;
}
458
459 /**
460 * Determines if the given line is a request terminator.
461 *
462 * @param[in] connp
463 * @param[in] data
464 * @param[in] len
465 * @return 0 or 1
466 */
htp_connp_is_line_terminator(htp_connp_t * connp,unsigned char * data,size_t len,int next_no_lf)467 int htp_connp_is_line_terminator(htp_connp_t *connp, unsigned char *data, size_t len, int next_no_lf) {
468 // Is this the end of request headers?
469 switch (connp->cfg->server_personality) {
470 case HTP_SERVER_IIS_5_1:
471 // IIS 5 will accept a whitespace line as a terminator
472 if (htp_is_line_whitespace(data, len)) {
473 return 1;
474 }
475
476 // Fall through
477 default:
478 // Treat an empty line as terminator
479 if (htp_is_line_empty(data, len)) {
480 return 1;
481 }
482 // Only space is terminator if terminator does not follow right away
483 if (len == 2 && htp_is_lws(data[0]) && data[1] == LF) {
484 return next_no_lf;
485 }
486 break;
487 }
488
489 return 0;
490 }
491
492 /**
493 * Determines if the given line can be ignored when it appears before a request.
494 *
495 * @param[in] connp
496 * @param[in] data
497 * @param[in] len
498 * @return 0 or 1
499 */
htp_connp_is_line_ignorable(htp_connp_t * connp,unsigned char * data,size_t len)500 int htp_connp_is_line_ignorable(htp_connp_t *connp, unsigned char *data, size_t len) {
501 return htp_connp_is_line_terminator(connp, data, len, 0);
502 }
503
htp_parse_port(unsigned char * data,size_t len,int * port,int * invalid)504 static htp_status_t htp_parse_port(unsigned char *data, size_t len, int *port, int *invalid) {
505 if (len == 0) {
506 *port = -1;
507 *invalid = 1;
508 return HTP_OK;
509 }
510
511 int64_t port_parsed = htp_parse_positive_integer_whitespace(data, len, 10);
512
513 if (port_parsed < 0) {
514 // Failed to parse the port number.
515 *port = -1;
516 *invalid = 1;
517 } else if ((port_parsed > 0) && (port_parsed < 65536)) {
518 // Valid port number.
519 *port = port_parsed;
520 } else {
521 // Port number out of range.
522 *port = -1;
523 *invalid = 1;
524 }
525
526 return HTP_OK;
527 }
528
529 /**
530 * Parses an authority string, which consists of a hostname with an optional port number; username
531 * and password are not allowed and will not be handled.
532 *
533 * @param[in] hostport
534 * @param[out] hostname A bstring containing the hostname, or NULL if the hostname is invalid. If this value
535 * is not NULL, the caller assumes responsibility for memory management.
536 * @param[out] port Port as text, or NULL if not provided.
537 * @param[out] port_number Port number, or -1 if the port is not present or invalid.
538 * @param[out] invalid Set to 1 if any part of the authority is invalid.
539 * @return HTP_OK on success, HTP_ERROR on memory allocation failure.
540 */
htp_parse_hostport(bstr * hostport,bstr ** hostname,bstr ** port,int * port_number,int * invalid)541 htp_status_t htp_parse_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, int *invalid) {
542 if ((hostport == NULL) || (hostname == NULL) || (port_number == NULL) || (invalid == NULL)) return HTP_ERROR;
543
544 *hostname = NULL;
545 if (port != NULL) {
546 *port = NULL;
547 }
548 *port_number = -1;
549 *invalid = 0;
550
551 unsigned char *data = bstr_ptr(hostport);
552 size_t len = bstr_len(hostport);
553
554 bstr_util_mem_trim(&data, &len);
555
556 if (len == 0) {
557 *invalid = 1;
558 return HTP_OK;
559 }
560
561 // Check for an IPv6 address.
562 if (data[0] == '[') {
563 // IPv6 host.
564
565 // Find the end of the IPv6 address.
566 size_t pos = 0;
567 while ((pos < len) && (data[pos] != ']')) pos++;
568 if (pos == len) {
569 *invalid = 1;
570 return HTP_OK;
571 }
572
573 *hostname = bstr_dup_mem(data, pos + 1);
574 if (*hostname == NULL) return HTP_ERROR;
575
576 // Over the ']'.
577 pos++;
578 if (pos == len) return HTP_OK;
579
580 // Handle port.
581 if (data[pos] == ':') {
582 if (port != NULL) {
583 *port = bstr_dup_mem(data + pos + 1, len - pos - 1);
584 if (*port == NULL) {
585 bstr_free(*hostname);
586 return HTP_ERROR;
587 }
588 }
589
590 return htp_parse_port(data + pos + 1, len - pos - 1, port_number, invalid);
591 } else {
592 *invalid = 1;
593 return HTP_OK;
594 }
595 } else {
596 // Not IPv6 host.
597
598 // Is there a colon?
599 unsigned char *colon = memchr(data, ':', len);
600 if (colon == NULL) {
601 // Hostname alone, no port.
602
603 *hostname = bstr_dup_mem(data, len);
604 if (*hostname == NULL) return HTP_ERROR;
605
606 bstr_to_lowercase(*hostname);
607 } else {
608 // Hostname and port.
609
610 // Ignore whitespace at the end of hostname.
611 unsigned char *hostend = colon;
612 while ((hostend > data) && (isspace(*(hostend - 1)))) hostend--;
613
614 *hostname = bstr_dup_mem(data, hostend - data);
615 if (*hostname == NULL) return HTP_ERROR;
616
617 if (port != NULL) {
618 *port = bstr_dup_mem(colon + 1, len - (colon + 1 - data));
619 if (*port == NULL) {
620 bstr_free(*hostname);
621 return HTP_ERROR;
622 }
623 }
624
625 return htp_parse_port(colon + 1, len - (colon + 1 - data), port_number, invalid);
626 }
627 }
628
629 return HTP_OK;
630 }
631
632 /**
633 * Parses hostport provided in the URI.
634 *
635 * @param[in] connp
636 * @param[in] hostport
637 * @param[in] uri
638 * @return HTP_OK on success or HTP_ERROR error.
639 */
htp_parse_uri_hostport(htp_connp_t * connp,bstr * hostport,htp_uri_t * uri)640 int htp_parse_uri_hostport(htp_connp_t *connp, bstr *hostport, htp_uri_t *uri) {
641 int invalid;
642
643 htp_status_t rc = htp_parse_hostport(hostport, &(uri->hostname), &(uri->port), &(uri->port_number), &invalid);
644 if (rc != HTP_OK) return rc;
645
646 if (invalid) {
647 connp->in_tx->flags |= HTP_HOSTU_INVALID;
648 }
649
650 if (uri->hostname != NULL) {
651 if (htp_validate_hostname(uri->hostname) == 0) {
652 connp->in_tx->flags |= HTP_HOSTU_INVALID;
653 }
654 }
655
656 return HTP_OK;
657 }
658
659 /**
660 * Parses hostport provided in the Host header.
661 *
662 * @param[in] hostport
663 * @param[out] hostname
664 * @param[out] port
665 * @param[out] port_number
666 * @param[out] flags
667 * @return HTP_OK on success or HTP_ERROR error.
668 */
htp_parse_header_hostport(bstr * hostport,bstr ** hostname,bstr ** port,int * port_number,uint64_t * flags)669 htp_status_t htp_parse_header_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, uint64_t *flags) {
670 int invalid;
671
672 htp_status_t rc = htp_parse_hostport(hostport, hostname, port, port_number, &invalid);
673 if (rc != HTP_OK) return rc;
674
675 if (invalid) {
676 *flags |= HTP_HOSTH_INVALID;
677 }
678
679 if (*hostname != NULL) {
680 if (htp_validate_hostname(*hostname) == 0) {
681 *flags |= HTP_HOSTH_INVALID;
682 }
683 }
684
685 return HTP_OK;
686 }
687
/**
 * Parses request URI, making no attempt to validate the contents. The input is
 * split into scheme, credentials (username and password), hostname, port (as
 * text), path, query and fragment; each part found is stored in the URI
 * structure as a newly allocated copy.
 *
 * @param[in] input URI to parse. May be NULL or empty, in which case no parts
 *                  are extracted.
 * @param[in,out] uri Address of the URI structure pointer. If *uri is NULL, a new
 *                    zeroed structure is allocated and stored there.
 * @return HTP_ERROR on memory allocation failure, HTP_OK otherwise. On HTP_ERROR
 *         the parts extracted up to that point remain in the structure; nothing
 *         is released here.
 */
int htp_parse_uri(bstr *input, htp_uri_t **uri) {
    // Allow a htp_uri_t structure to be provided on input,
    // but allocate a new one if the structure is NULL.
    if (*uri == NULL) {
        *uri = calloc(1, sizeof (htp_uri_t));
        if (*uri == NULL) return HTP_ERROR;
    }

    if (input == NULL) {
        // The input might be NULL on requests that don't actually
        // contain the URI. We allow that.
        return HTP_OK;
    }

    unsigned char *data = bstr_ptr(input);
    size_t len = bstr_len(input);
    size_t start, pos;

    if (len == 0) {
        // Empty string.
        return HTP_OK;
    }

    pos = 0;

    // Scheme test: if it doesn't start with a forward slash character (which it must
    // for the contents to be a path or an authority), then it must be the scheme part
    if (data[0] != '/') {
        // Parse scheme

        // Find the colon, which marks the end of the scheme part
        start = pos;
        while ((pos < len) && (data[pos] != ':')) pos++;

        if (pos >= len) {
            // We haven't found a colon, which means that the URI
            // is invalid. Apache will ignore this problem and assume
            // the URI contains an invalid path so, for the time being,
            // we are going to do the same.
            pos = 0;
        } else {
            // Make a copy of the scheme
            (*uri)->scheme = bstr_dup_mem(data + start, pos - start);
            if ((*uri)->scheme == NULL) return HTP_ERROR;

            // Go over the colon
            pos++;
        }
    }

    // Authority test: two forward slash characters and it's an authority.
    // One, three or more slash characters, and it's a path. We, however,
    // only attempt to parse authority if we've seen a scheme.
    if ((*uri)->scheme != NULL)
        if ((pos + 2 < len) && (data[pos] == '/') && (data[pos + 1] == '/') && (data[pos + 2] != '/')) {
            // Parse authority

            // Go over the two slash characters
            start = pos = pos + 2;

            // Authority ends with a question mark, forward slash or hash
            while ((pos < len) && (data[pos] != '?') && (data[pos] != '/') && (data[pos] != '#')) pos++;

            // From here on, [start, pos) delimits the authority.
            unsigned char *hostname_start;
            size_t hostname_len;

            // Are the credentials included in the authority?
            unsigned char *m = memchr(data + start, '@', pos - start);
            if (m != NULL) {
                // Credentials present
                unsigned char *credentials_start = data + start;
                size_t credentials_len = m - data - start;

                // Figure out just the hostname part (everything after the '@')
                hostname_start = data + start + credentials_len + 1;
                hostname_len = pos - start - credentials_len - 1;

                // Extract the username and the password
                m = memchr(credentials_start, ':', credentials_len);
                if (m != NULL) {
                    // Username and password
                    (*uri)->username = bstr_dup_mem(credentials_start, m - credentials_start);
                    if ((*uri)->username == NULL) return HTP_ERROR;
                    (*uri)->password = bstr_dup_mem(m + 1, credentials_len - (m - credentials_start) - 1);
                    if ((*uri)->password == NULL) return HTP_ERROR;
                } else {
                    // Username alone
                    (*uri)->username = bstr_dup_mem(credentials_start, credentials_len);
                    if ((*uri)->username == NULL) return HTP_ERROR;
                }
            } else {
                // No credentials
                hostname_start = data + start;
                hostname_len = pos - start;
            }

            // Parsing authority without credentials.
            if ((hostname_len > 0) && (hostname_start[0] == '[')) {
                // IPv6 address.

                m = memchr(hostname_start, ']', hostname_len);
                if (m == NULL) {
                    // Invalid IPv6 address; use the entire string as hostname.
                    (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
                    if ((*uri)->hostname == NULL) return HTP_ERROR;
                } else {
                    // The hostname keeps its square brackets.
                    (*uri)->hostname = bstr_dup_mem(hostname_start, m - hostname_start + 1);
                    if ((*uri)->hostname == NULL) return HTP_ERROR;

                    // Is there a port? Narrow the window to what follows the ']'.
                    hostname_len = hostname_len - (m - hostname_start + 1);
                    hostname_start = m + 1;

                    // Port string
                    m = memchr(hostname_start, ':', hostname_len);
                    if (m != NULL) {
                        size_t port_len = hostname_len - (m - hostname_start) - 1;
                        (*uri)->port = bstr_dup_mem(m + 1, port_len);
                        if ((*uri)->port == NULL) return HTP_ERROR;
                    }
                }
            } else {
                // Not IPv6 address.

                m = memchr(hostname_start, ':', hostname_len);
                if (m != NULL) {
                    // Split at the first colon: the hostname is shortened to
                    // what precedes it, the port is everything after it.
                    size_t port_len = hostname_len - (m - hostname_start) - 1;
                    hostname_len = hostname_len - port_len - 1;

                    // Port string
                    (*uri)->port = bstr_dup_mem(m + 1, port_len);
                    if ((*uri)->port == NULL) return HTP_ERROR;
                }

                // Hostname
                (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
                if ((*uri)->hostname == NULL) return HTP_ERROR;
            }
        }

    // Path
    start = pos;

    // The path part will end with a question mark or a hash character, which
    // mark the beginning of the query part or the fragment part, respectively.
    while ((pos < len) && (data[pos] != '?') && (data[pos] != '#')) pos++;

    // Path
    (*uri)->path = bstr_dup_mem(data + start, pos - start);
    if ((*uri)->path == NULL) return HTP_ERROR;

    if (pos == len) return HTP_OK;

    // Query
    if (data[pos] == '?') {
        // Step over the question mark
        start = pos + 1;

        // The query part will end with the end of the input
        // or the beginning of the fragment part
        while ((pos < len) && (data[pos] != '#')) pos++;

        // Query string
        (*uri)->query = bstr_dup_mem(data + start, pos - start);
        if ((*uri)->query == NULL) return HTP_ERROR;

        if (pos == len) return HTP_OK;
    }

    // Fragment (pos is still inside the input here; both earlier
    // end-of-input cases have already returned)
    if (data[pos] == '#') {
        // Step over the hash character
        start = pos + 1;

        // Fragment; ends with the end of the input
        (*uri)->fragment = bstr_dup_mem(data + start, len - start);
        if ((*uri)->fragment == NULL) return HTP_ERROR;
    }

    return HTP_OK;
}
876
/**
 * Decodes two hexadecimal characters into the byte they represent. The
 * input is not validated: characters that are not hexadecimal digits are
 * run through the same arithmetic and produce some byte value.
 *
 * @param[in] what Pointer to (at least) two input bytes.
 * @return hex-decoded byte
 */
static unsigned char x2c(unsigned char *what) {
    int nibbles[2];
    int i;

    for (i = 0; i < 2; i++) {
        int ch = what[i];

        if (ch >= 'A') {
            // Letters: clearing bit 0x20 folds lower case onto upper case.
            nibbles[i] = ((ch & 0xdf) - 'A') + 10;
        } else {
            nibbles[i] = ch - '0';
        }
    }

    // The first character is the high nibble; the result is reduced to one byte.
    return (unsigned char) ((nibbles[0] * 16) + nibbles[1]);
}
894
895 /**
896 * Convert a Unicode codepoint into a single-byte, using best-fit
897 * mapping (as specified in the provided configuration structure).
898 *
899 * @param[in] cfg
900 * @param[in] codepoint
901 * @return converted single byte
902 */
bestfit_codepoint(htp_cfg_t * cfg,enum htp_decoder_ctx_t ctx,uint32_t codepoint)903 static uint8_t bestfit_codepoint(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, uint32_t codepoint) {
904 // Is it a single-byte codepoint?
905 if (codepoint < 0x100) {
906 return (uint8_t) codepoint;
907 }
908
909 // Our current implementation converts only the 2-byte codepoints.
910 if (codepoint > 0xffff) {
911 return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
912 }
913
914 uint8_t *p = cfg->decoder_cfgs[ctx].bestfit_map;
915
916 // TODO Optimize lookup.
917
918 for (;;) {
919 uint32_t x = (p[0] << 8) + p[1];
920
921 if (x == 0) {
922 return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
923 }
924
925 if (x == codepoint) {
926 return p[2];
927 }
928
929 // Move to the next triplet
930 p += 3;
931 }
932 }
933
/**
 * Decode a UTF-8 encoded path in place. Overlong characters will be decoded,
 * and each invalid sequence is replaced with the best-fit replacement byte
 * configured for the URL path decoder. Best-fit mapping will be used to convert
 * multi-byte characters into a single-byte stream. The length of the path is
 * adjusted to the number of bytes written.
 *
 * Transaction flags set along the way: HTP_PATH_UTF8_OVERLONG,
 * HTP_PATH_HALF_FULL_RANGE, HTP_PATH_UTF8_INVALID and, if at least one
 * multi-byte character was decoded and HTP_PATH_UTF8_INVALID is not set,
 * HTP_PATH_UTF8_VALID.
 *
 * @param[in] cfg Configuration (best-fit map, replacement byte, invalid UTF-8 policy).
 * @param[in] tx Transaction whose flags (and expected response status) are updated.
 * @param[in,out] path Path to decode; nothing is done if it is NULL or has no data.
 */
void htp_utf8_decode_path_inplace(htp_cfg_t *cfg, htp_tx_t *tx, bstr *path) {
    if (path == NULL) return;

    uint8_t *data = bstr_ptr(path);
    if (data == NULL) return;

    size_t len = bstr_len(path);
    size_t rpos = 0; // Read position.
    size_t wpos = 0; // Write position (output is written over the input).
    uint32_t codepoint = 0;
    uint32_t state = HTP_UTF8_ACCEPT;
    uint32_t counter = 0; // How many bytes have been fed into the current character.
    uint8_t seen_valid = 0; // Set once a valid multi-byte character is decoded.

    while ((rpos < len)&&(wpos < len)) {
        counter++;

        switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
            case HTP_UTF8_ACCEPT:
                if (counter == 1) {
                    // ASCII character, which we just copy.
                    data[wpos++] = (uint8_t) codepoint;
                } else {
                    // A valid UTF-8 character, which we need to convert.

                    seen_valid = 1;

                    // Check for overlong characters and set the flag accordingly.
                    // A character is overlong when its codepoint would have fit
                    // into fewer bytes than were used.
                    switch (counter) {
                        case 2:
                            if (codepoint < 0x80) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                        case 3:
                            if (codepoint < 0x800) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                        case 4:
                            if (codepoint < 0x10000) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                    }

                    // Special flag for half-width/full-width evasion.
                    if ((codepoint >= 0xff00) && (codepoint <= 0xffef)) {
                        tx->flags |= HTP_PATH_HALF_FULL_RANGE;
                    }

                    // Use best-fit mapping to convert to a single byte.
                    data[wpos++] = bestfit_codepoint(cfg, HTP_DECODER_URL_PATH, codepoint);
                }

                // Advance over the consumed byte and reset the byte counter.
                rpos++;
                counter = 0;

                break;

            case HTP_UTF8_REJECT:
                // Invalid UTF-8 character.

                tx->flags |= HTP_PATH_UTF8_INVALID;

                // Is the server expected to respond with 400?
                // (The configured value is recorded as the expected response
                // status unless it is HTP_UNWANTED_IGNORE.)
                if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted != HTP_UNWANTED_IGNORE) {
                    tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted;
                }

                // Output the replacement byte, replacing one or more invalid bytes.
                data[wpos++] = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;

                // If the invalid byte was first in a sequence, consume it. Otherwise,
                // assume it's the starting byte of the next character.
                if (counter == 1) {
                    rpos++;
                }

                // Reset the decoder state and continue decoding.
                state = HTP_UTF8_ACCEPT;
                codepoint = 0;
                counter = 0;

                break;

            default:
                // Keep going; the character is not yet formed.
                // (Nothing is written for this byte. If the input ends before
                // the character completes, these bytes produce no output.)
                rpos++;
                break;
        }
    }

    // Did the input stream seem like a valid UTF-8 string?
    if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
        tx->flags |= HTP_PATH_UTF8_VALID;
    }

    // Adjust the length of the string, because
    // we're doing in-place decoding.
    bstr_adjust_len(path, wpos);
}
1046
/**
 * Validate a path that is quite possibly UTF-8 encoded. The path itself is
 * only read; the outcome is reported through the transaction flags:
 * HTP_PATH_UTF8_OVERLONG, HTP_PATH_HALF_FULL_RANGE, HTP_PATH_UTF8_INVALID and,
 * if at least one multi-byte character was seen and HTP_PATH_UTF8_INVALID is
 * not set, HTP_PATH_UTF8_VALID.
 *
 * @param[in] tx Transaction whose flags are updated.
 * @param[in] path Path to validate.
 */
void htp_utf8_validate_path(htp_tx_t *tx, bstr *path) {
    unsigned char *data = bstr_ptr(path);
    size_t len = bstr_len(path);
    size_t rpos = 0; // Read position.
    uint32_t codepoint = 0;
    uint32_t state = HTP_UTF8_ACCEPT;
    uint32_t counter = 0; // How many bytes used by a UTF-8 character.
    uint8_t seen_valid = 0; // Set once a valid multi-byte character is seen.

    while (rpos < len) {
        counter++;

        switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
            case HTP_UTF8_ACCEPT:
                // We have a valid character.

                if (counter > 1) {
                    // A valid UTF-8 character, consisting of 2 or more bytes.

                    seen_valid = 1;

                    // Check for overlong characters and set the flag accordingly.
                    // A character is overlong when its codepoint would have fit
                    // into fewer bytes than were used.
                    switch (counter) {
                        case 2:
                            if (codepoint < 0x80) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                        case 3:
                            if (codepoint < 0x800) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                        case 4:
                            if (codepoint < 0x10000) {
                                tx->flags |= HTP_PATH_UTF8_OVERLONG;
                            }
                            break;
                    }
                }

                // Special flag for half-width/full-width evasion.
                // NOTE(review): this flags 0xff00-0xffff, whereas
                // htp_utf8_decode_path_inplace() flags 0xff00-0xffef only;
                // confirm whether the difference is intended.
                if ((codepoint > 0xfeff) && (codepoint < 0x010000)) {
                    tx->flags |= HTP_PATH_HALF_FULL_RANGE;
                }

                // Advance over the consumed byte and reset the byte counter.
                rpos++;
                counter = 0;

                break;

            case HTP_UTF8_REJECT:
                // Invalid UTF-8 character.

                tx->flags |= HTP_PATH_UTF8_INVALID;

                // Override the decoder state because we want to continue decoding.
                state = HTP_UTF8_ACCEPT;

                // Advance over the consumed byte and reset the byte counter.
                // (The rejected byte is always skipped here.)
                rpos++;
                counter = 0;

                break;

            default:
                // Keep going; the character is not yet formed.
                rpos++;
                break;
        }
    }

    // Did the input stream seem like a valid UTF-8 string?
    if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
        tx->flags |= HTP_PATH_UTF8_VALID;
    }
}
1131
1132 /**
1133 * Decode a %u-encoded character, using best-fit mapping as necessary. Path version.
1134 *
1135 * @param[in] cfg
1136 * @param[in] tx
1137 * @param[in] data
1138 * @return decoded byte
1139 */
decode_u_encoding_path(htp_cfg_t * cfg,htp_tx_t * tx,unsigned char * data)1140 static int decode_u_encoding_path(htp_cfg_t *cfg, htp_tx_t *tx, unsigned char *data) {
1141 unsigned int c1 = x2c(data);
1142 unsigned int c2 = x2c(data + 2);
1143 int r = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;
1144
1145 if (c1 == 0x00) {
1146 r = c2;
1147 tx->flags |= HTP_PATH_OVERLONG_U;
1148 } else {
1149 // Check for fullwidth form evasion
1150 if (c1 == 0xff) {
1151 tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1152 }
1153
1154 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1155 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted;
1156 }
1157
1158 // Use best-fit mapping
1159 unsigned char *p = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_map;
1160
1161 // TODO Optimize lookup.
1162
1163 for (;;) {
1164 // Have we reached the end of the map?
1165 if ((p[0] == 0) && (p[1] == 0)) {
1166 break;
1167 }
1168
1169 // Have we found the mapping we're looking for?
1170 if ((p[0] == c1) && (p[1] == c2)) {
1171 r = p[2];
1172 break;
1173 }
1174
1175 // Move to the next triplet
1176 p += 3;
1177 }
1178 }
1179
1180 // Check for encoded path separators
1181 if ((r == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (r == '\\'))) {
1182 tx->flags |= HTP_PATH_ENCODED_SEPARATOR;
1183 }
1184
1185 return r;
1186 }
1187
1188 /**
1189 * Decode a %u-encoded character, using best-fit mapping as necessary. Params version.
1190 *
1191 * @param[in] cfg
1192 * @param[in] tx
1193 * @param[in] data
1194 * @return decoded byte
1195 */
decode_u_encoding_params(htp_cfg_t * cfg,enum htp_decoder_ctx_t ctx,unsigned char * data,uint64_t * flags)1196 static int decode_u_encoding_params(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, unsigned char *data, uint64_t *flags) {
1197 unsigned int c1 = x2c(data);
1198 unsigned int c2 = x2c(data + 2);
1199
1200 // Check for overlong usage first.
1201 if (c1 == 0) {
1202 (*flags) |= HTP_URLEN_OVERLONG_U;
1203 return c2;
1204 }
1205
1206 // Both bytes were used.
1207
1208 // Detect half-width and full-width range.
1209 if ((c1 == 0xff) && (c2 <= 0xef)) {
1210 (*flags) |= HTP_URLEN_HALF_FULL_RANGE;
1211 }
1212
1213 // Use best-fit mapping.
1214 unsigned char *p = cfg->decoder_cfgs[ctx].bestfit_map;
1215 int r = cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
1216
1217 // TODO Optimize lookup.
1218
1219 for (;;) {
1220 // Have we reached the end of the map?
1221 if ((p[0] == 0) && (p[1] == 0)) {
1222 break;
1223 }
1224
1225 // Have we found the mapping we're looking for?
1226 if ((p[0] == c1) && (p[1] == c2)) {
1227 r = p[2];
1228 break;
1229 }
1230
1231 // Move to the next triplet
1232 p += 3;
1233 }
1234
1235 return r;
1236 }
1237
1238 /**
1239 * Decode a request path according to the settings in the
1240 * provided configuration structure.
1241 *
1242 * @param[in] cfg
1243 * @param[in] tx
1244 * @param[in] path
1245 */
htp_decode_path_inplace(htp_tx_t * tx,bstr * path)1246 htp_status_t htp_decode_path_inplace(htp_tx_t *tx, bstr *path) {
1247 if (path == NULL) return HTP_ERROR;
1248 unsigned char *data = bstr_ptr(path);
1249 if (data == NULL) return HTP_ERROR;
1250
1251 size_t len = bstr_len(path);
1252
1253 htp_cfg_t *cfg = tx->cfg;
1254
1255 size_t rpos = 0;
1256 size_t wpos = 0;
1257 int previous_was_separator = 0;
1258
1259 while ((rpos < len) && (wpos < len)) {
1260 int c = data[rpos];
1261
1262 // Decode encoded characters
1263 if (c == '%') {
1264 if (rpos + 2 < len) {
1265 int handled = 0;
1266
1267 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_decode) {
1268 // Check for the %u encoding
1269 if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) {
1270 handled = 1;
1271
1272 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1273 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted;
1274 }
1275
1276 if (rpos + 5 < len) {
1277 if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3]))
1278 && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) {
1279 // Decode a valid %u encoding
1280 c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]);
1281 rpos += 6;
1282
1283 if (c == 0) {
1284 tx->flags |= HTP_PATH_ENCODED_NUL;
1285
1286 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1287 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted;
1288 }
1289 }
1290 } else {
1291 // Invalid %u encoding
1292 tx->flags |= HTP_PATH_INVALID_ENCODING;
1293
1294 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1295 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1296 }
1297
1298 switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1299 case HTP_URL_DECODE_REMOVE_PERCENT:
1300 // Do not place anything in output; eat
1301 // the percent character
1302 rpos++;
1303 continue;
1304 break;
1305 case HTP_URL_DECODE_PRESERVE_PERCENT:
1306 // Leave the percent character in output
1307 rpos++;
1308 break;
1309 case HTP_URL_DECODE_PROCESS_INVALID:
1310 // Decode invalid %u encoding
1311 c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]);
1312 rpos += 6;
1313 break;
1314 }
1315 }
1316 } else {
1317 // Invalid %u encoding (not enough data)
1318 tx->flags |= HTP_PATH_INVALID_ENCODING;
1319
1320 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1321 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1322 }
1323
1324 switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1325 case HTP_URL_DECODE_REMOVE_PERCENT:
1326 // Do not place anything in output; eat
1327 // the percent character
1328 rpos++;
1329 continue;
1330 break;
1331 case HTP_URL_DECODE_PRESERVE_PERCENT:
1332 // Leave the percent character in output
1333 rpos++;
1334 break;
1335 case HTP_URL_DECODE_PROCESS_INVALID:
1336 // Cannot decode, because there's not enough data.
1337 // Leave the percent character in output
1338 rpos++;
1339 // TODO Configurable handling.
1340 break;
1341 }
1342 }
1343 }
1344 }
1345
1346 // Handle standard URL encoding
1347 if (!handled) {
1348 if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) {
1349 c = x2c(&data[rpos + 1]);
1350
1351 if (c == 0) {
1352 tx->flags |= HTP_PATH_ENCODED_NUL;
1353
1354 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1355 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted;
1356 }
1357
1358 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_terminates) {
1359 bstr_adjust_len(path, wpos);
1360 return HTP_OK;
1361 }
1362 }
1363
1364 if ((c == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (c == '\\'))) {
1365 tx->flags |= HTP_PATH_ENCODED_SEPARATOR;
1366
1367 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1368 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted;
1369 }
1370
1371 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_decode) {
1372 // Decode
1373 rpos += 3;
1374 } else {
1375 // Leave encoded
1376 c = '%';
1377 rpos++;
1378 }
1379 } else {
1380 // Decode
1381 rpos += 3;
1382 }
1383 } else {
1384 // Invalid encoding
1385 tx->flags |= HTP_PATH_INVALID_ENCODING;
1386
1387 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1388 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1389 }
1390
1391 switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1392 case HTP_URL_DECODE_REMOVE_PERCENT:
1393 // Do not place anything in output; eat
1394 // the percent character
1395 rpos++;
1396 continue;
1397 break;
1398 case HTP_URL_DECODE_PRESERVE_PERCENT:
1399 // Leave the percent character in output
1400 rpos++;
1401 break;
1402 case HTP_URL_DECODE_PROCESS_INVALID:
1403 // Decode
1404 c = x2c(&data[rpos + 1]);
1405 rpos += 3;
1406 // Note: What if an invalid encoding decodes into a path
1407 // separator? This is theoretical at the moment, because
1408 // the only platform we know doesn't convert separators is
1409 // Apache, who will also respond with 400 if invalid encoding
1410 // is encountered. Thus no check for a separator here.
1411 break;
1412 default:
1413 // Unknown setting
1414 return HTP_ERROR;
1415 break;
1416 }
1417 }
1418 }
1419 } else {
1420 // Invalid URL encoding (not enough data)
1421 tx->flags |= HTP_PATH_INVALID_ENCODING;
1422
1423 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1424 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1425 }
1426
1427 switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1428 case HTP_URL_DECODE_REMOVE_PERCENT:
1429 // Do not place anything in output; eat
1430 // the percent character
1431 rpos++;
1432 continue;
1433 break;
1434 case HTP_URL_DECODE_PRESERVE_PERCENT:
1435 // Leave the percent character in output
1436 rpos++;
1437 break;
1438 case HTP_URL_DECODE_PROCESS_INVALID:
1439 // Cannot decode, because there's not enough data.
1440 // Leave the percent character in output.
1441 // TODO Configurable handling.
1442 rpos++;
1443 break;
1444 }
1445 }
1446 } else {
1447 // One non-encoded character
1448
1449 // Is it a NUL byte?
1450 if (c == 0) {
1451 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted != HTP_UNWANTED_IGNORE) {
1452 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted;
1453 }
1454
1455 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_terminates) {
1456 // Terminate path with a raw NUL byte
1457 bstr_adjust_len(path, wpos);
1458 return HTP_OK;
1459 break;
1460 }
1461 }
1462
1463 rpos++;
1464 }
1465
1466 // Place the character into output
1467
1468 // Check for control characters
1469 if (c < 0x20) {
1470 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted != HTP_UNWANTED_IGNORE) {
1471 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted;
1472 }
1473 }
1474
1475 // Convert backslashes to forward slashes, if necessary
1476 if ((c == '\\') && (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes)) {
1477 c = '/';
1478 }
1479
1480 // Lowercase characters, if necessary
1481 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].convert_lowercase) {
1482 c = tolower(c);
1483 }
1484
1485 // If we're compressing separators then we need
1486 // to track if the previous character was a separator
1487 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_compress) {
1488 if (c == '/') {
1489 if (!previous_was_separator) {
1490 data[wpos++] = c;
1491 previous_was_separator = 1;
1492 } else {
1493 // Do nothing; we don't want
1494 // another separator in output
1495 }
1496 } else {
1497 data[wpos++] = c;
1498 previous_was_separator = 0;
1499 }
1500 } else {
1501 data[wpos++] = c;
1502 }
1503 }
1504
1505 bstr_adjust_len(path, wpos);
1506
1507 return HTP_OK;
1508 }
1509
htp_tx_urldecode_uri_inplace(htp_tx_t * tx,bstr * input)1510 htp_status_t htp_tx_urldecode_uri_inplace(htp_tx_t *tx, bstr *input) {
1511 uint64_t flags = 0;
1512
1513 htp_status_t rc = htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URL_PATH, input, &flags, &(tx->response_status_expected_number));
1514
1515 if (flags & HTP_URLEN_INVALID_ENCODING) {
1516 tx->flags |= HTP_PATH_INVALID_ENCODING;
1517 }
1518
1519 if (flags & HTP_URLEN_ENCODED_NUL) {
1520 tx->flags |= HTP_PATH_ENCODED_NUL;
1521 }
1522
1523 if (flags & HTP_URLEN_RAW_NUL) {
1524 tx->flags |= HTP_PATH_RAW_NUL;
1525 }
1526
1527 return rc;
1528 }
1529
htp_tx_urldecode_params_inplace(htp_tx_t * tx,bstr * input)1530 htp_status_t htp_tx_urldecode_params_inplace(htp_tx_t *tx, bstr *input) {
1531 return htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URLENCODED, input, &(tx->flags), &(tx->response_status_expected_number));
1532 }
1533
htp_urldecode_inplace(htp_cfg_t * cfg,enum htp_decoder_ctx_t ctx,bstr * input,uint64_t * flags)1534 htp_status_t htp_urldecode_inplace(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags) {
1535 int expected_status_code = 0;
1536 return htp_urldecode_inplace_ex(cfg, ctx, input, flags, &expected_status_code);
1537 }
1538
htp_urldecode_inplace_ex(htp_cfg_t * cfg,enum htp_decoder_ctx_t ctx,bstr * input,uint64_t * flags,int * expected_status_code)1539 htp_status_t htp_urldecode_inplace_ex(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags, int *expected_status_code) {
1540 if (input == NULL) return HTP_ERROR;
1541
1542 unsigned char *data = bstr_ptr(input);
1543 if (data == NULL) return HTP_ERROR;
1544 size_t len = bstr_len(input);
1545
1546 size_t rpos = 0;
1547 size_t wpos = 0;
1548
1549 while ((rpos < len) && (wpos < len)) {
1550 int c = data[rpos];
1551
1552 // Decode encoded characters.
1553 if (c == '%') {
1554 // Need at least 2 additional bytes for %HH.
1555 if (rpos + 2 < len) {
1556 int handled = 0;
1557
1558 // Decode %uHHHH encoding, but only if allowed in configuration.
1559 if (cfg->decoder_cfgs[ctx].u_encoding_decode) {
1560 // The next character must be a case-insensitive u.
1561 if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) {
1562 handled = 1;
1563
1564 if (cfg->decoder_cfgs[ctx].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1565 (*expected_status_code) = cfg->decoder_cfgs[ctx].u_encoding_unwanted;
1566 }
1567
1568 // Need at least 5 additional bytes for %uHHHH.
1569 if (rpos + 5 < len) {
1570 if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3]))
1571 && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) {
1572 // Decode a valid %u encoding.
1573 c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags);
1574 rpos += 6;
1575 } else {
1576 // Invalid %u encoding (could not find 4 xdigits).
1577 (*flags) |= HTP_URLEN_INVALID_ENCODING;
1578
1579 if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1580 (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1581 }
1582
1583 switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1584 case HTP_URL_DECODE_REMOVE_PERCENT:
1585 // Do not place anything in output; consume the %.
1586 rpos++;
1587 continue;
1588 break;
1589 case HTP_URL_DECODE_PRESERVE_PERCENT:
1590 // Leave the % in output.
1591 rpos++;
1592 break;
1593 case HTP_URL_DECODE_PROCESS_INVALID:
1594 // Decode invalid %u encoding.
1595 c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags);
1596 rpos += 6;
1597 break;
1598 }
1599 }
1600 } else {
1601 // Invalid %u encoding; not enough data.
1602 (*flags) |= HTP_URLEN_INVALID_ENCODING;
1603
1604 if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1605 (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1606 }
1607
1608 switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1609 case HTP_URL_DECODE_REMOVE_PERCENT:
1610 // Do not place anything in output; consume the %.
1611 rpos++;
1612 continue;
1613 break;
1614 case HTP_URL_DECODE_PRESERVE_PERCENT:
1615 // Leave the % in output.
1616 rpos++;
1617 break;
1618 case HTP_URL_DECODE_PROCESS_INVALID:
1619 // Cannot decode because there's not enough data.
1620 // Leave the % in output.
1621 // TODO Configurable handling of %, u, etc.
1622 rpos++;
1623 break;
1624 }
1625 }
1626 }
1627 }
1628
1629 // Handle standard URL encoding.
1630 if (!handled) {
1631 // Need 2 hexadecimal digits.
1632 if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) {
1633 // Decode %HH encoding.
1634 c = x2c(&(data[rpos + 1]));
1635 rpos += 3;
1636 } else {
1637 // Invalid encoding (enough bytes, but not hexadecimal digits).
1638 (*flags) |= HTP_URLEN_INVALID_ENCODING;
1639
1640 if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1641 (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1642 }
1643
1644 switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1645 case HTP_URL_DECODE_REMOVE_PERCENT:
1646 // Do not place anything in output; consume the %.
1647 rpos++;
1648 continue;
1649 break;
1650 case HTP_URL_DECODE_PRESERVE_PERCENT:
1651 // Leave the % in output.
1652 rpos++;
1653 break;
1654 case HTP_URL_DECODE_PROCESS_INVALID:
1655 // Decode.
1656 c = x2c(&(data[rpos + 1]));
1657 rpos += 3;
1658 break;
1659 }
1660 }
1661 }
1662 } else {
1663 // Invalid encoding; not enough data (at least 2 bytes required).
1664 (*flags) |= HTP_URLEN_INVALID_ENCODING;
1665
1666 if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1667 (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1668 }
1669
1670 switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1671 case HTP_URL_DECODE_REMOVE_PERCENT:
1672 // Do not place anything in output; consume the %.
1673 rpos++;
1674 continue;
1675 break;
1676 case HTP_URL_DECODE_PRESERVE_PERCENT:
1677 // Leave the % in output.
1678 rpos++;
1679 break;
1680 case HTP_URL_DECODE_PROCESS_INVALID:
1681 // Cannot decode because there's not enough data.
1682 // Leave the % in output.
1683 // TODO Configurable handling of %, etc.
1684 rpos++;
1685 break;
1686 }
1687 }
1688
1689 // Did we get an encoded NUL byte?
1690 if (c == 0) {
1691 if (cfg->decoder_cfgs[ctx].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1692 (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_encoded_unwanted;
1693 }
1694
1695 (*flags) |= HTP_URLEN_ENCODED_NUL;
1696
1697 if (cfg->decoder_cfgs[ctx].nul_encoded_terminates) {
1698 // Terminate the path at the raw NUL byte.
1699 bstr_adjust_len(input, wpos);
1700 return 1;
1701 }
1702 }
1703
1704 data[wpos++] = c;
1705 } else if (c == '+') {
1706 // Decoding of the plus character is conditional on the configuration.
1707
1708 if (cfg->decoder_cfgs[ctx].plusspace_decode) {
1709 c = 0x20;
1710 }
1711
1712 rpos++;
1713 data[wpos++] = c;
1714 } else {
1715 // One non-encoded byte.
1716
1717 // Did we get a raw NUL byte?
1718 if (c == 0) {
1719 if (cfg->decoder_cfgs[ctx].nul_raw_unwanted != HTP_UNWANTED_IGNORE) {
1720 (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_raw_unwanted;
1721 }
1722
1723 (*flags) |= HTP_URLEN_RAW_NUL;
1724
1725 if (cfg->decoder_cfgs[ctx].nul_raw_terminates) {
1726 // Terminate the path at the encoded NUL byte.
1727 bstr_adjust_len(input, wpos);
1728 return HTP_OK;
1729 }
1730 }
1731
1732 rpos++;
1733 data[wpos++] = c;
1734 }
1735 }
1736
1737 bstr_adjust_len(input, wpos);
1738
1739 return HTP_OK;
1740 }
1741
1742 /**
1743 * Normalize a previously-parsed request URI.
1744 *
1745 * @param[in] connp
1746 * @param[in] incomplete
1747 * @param[in] normalized
1748 * @return HTP_OK or HTP_ERROR
1749 */
htp_normalize_parsed_uri(htp_tx_t * tx,htp_uri_t * incomplete,htp_uri_t * normalized)1750 int htp_normalize_parsed_uri(htp_tx_t *tx, htp_uri_t *incomplete, htp_uri_t *normalized) {
1751 // Scheme.
1752 if (incomplete->scheme != NULL) {
1753 // Duplicate and convert to lowercase.
1754 normalized->scheme = bstr_dup_lower(incomplete->scheme);
1755 if (normalized->scheme == NULL) return HTP_ERROR;
1756 }
1757
1758 // Username.
1759 if (incomplete->username != NULL) {
1760 normalized->username = bstr_dup(incomplete->username);
1761 if (normalized->username == NULL) return HTP_ERROR;
1762 htp_tx_urldecode_uri_inplace(tx, normalized->username);
1763 }
1764
1765 // Password.
1766 if (incomplete->password != NULL) {
1767 normalized->password = bstr_dup(incomplete->password);
1768 if (normalized->password == NULL) return HTP_ERROR;
1769 htp_tx_urldecode_uri_inplace(tx, normalized->password);
1770 }
1771
1772 // Hostname.
1773 if (incomplete->hostname != NULL) {
1774 // We know that incomplete->hostname does not contain
1775 // port information, so no need to check for it here.
1776 normalized->hostname = bstr_dup(incomplete->hostname);
1777 if (normalized->hostname == NULL) return HTP_ERROR;
1778 htp_tx_urldecode_uri_inplace(tx, normalized->hostname);
1779 htp_normalize_hostname_inplace(normalized->hostname);
1780 }
1781
1782 // Port.
1783 if (incomplete->port != NULL) {
1784 int64_t port_parsed = htp_parse_positive_integer_whitespace(
1785 bstr_ptr(incomplete->port), bstr_len(incomplete->port), 10);
1786
1787 if (port_parsed < 0) {
1788 // Failed to parse the port number.
1789 normalized->port_number = -1;
1790 tx->flags |= HTP_HOSTU_INVALID;
1791 } else if ((port_parsed > 0) && (port_parsed < 65536)) {
1792 // Valid port number.
1793 normalized->port_number = (int) port_parsed;
1794 } else {
1795 // Port number out of range.
1796 normalized->port_number = -1;
1797 tx->flags |= HTP_HOSTU_INVALID;
1798 }
1799 } else {
1800 normalized->port_number = -1;
1801 }
1802
1803 // Path.
1804 if (incomplete->path != NULL) {
1805 // Make a copy of the path, so that we can work on it.
1806 normalized->path = bstr_dup(incomplete->path);
1807 if (normalized->path == NULL) return HTP_ERROR;
1808
1809 // Decode URL-encoded (and %u-encoded) characters, as well as lowercase,
1810 // compress separators and convert backslashes.
1811 htp_decode_path_inplace(tx, normalized->path);
1812
1813 // Handle UTF-8 in the path.
1814 if (tx->cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_convert_bestfit) {
1815 // Decode Unicode characters into a single-byte stream, using best-fit mapping.
1816 htp_utf8_decode_path_inplace(tx->cfg, tx, normalized->path);
1817 } else {
1818 // No decoding, but try to validate the path as a UTF-8 stream.
1819 htp_utf8_validate_path(tx, normalized->path);
1820 }
1821
1822 // RFC normalization.
1823 htp_normalize_uri_path_inplace(normalized->path);
1824 }
1825
1826 // Query string.
1827 if (incomplete->query != NULL) {
1828 normalized->query = bstr_dup(incomplete->query);
1829 if (normalized->query == NULL) return HTP_ERROR;
1830 }
1831
1832 // Fragment.
1833 if (incomplete->fragment != NULL) {
1834 normalized->fragment = bstr_dup(incomplete->fragment);
1835 if (normalized->fragment == NULL) return HTP_ERROR;
1836 htp_tx_urldecode_uri_inplace(tx, normalized->fragment);
1837 }
1838
1839 return HTP_OK;
1840 }
1841
1842 /**
1843 * Normalize request hostname. Convert all characters to lowercase and
1844 * remove trailing dots from the end, if present.
1845 *
1846 * @param[in] hostname
1847 * @return Normalized hostname.
1848 */
htp_normalize_hostname_inplace(bstr * hostname)1849 bstr *htp_normalize_hostname_inplace(bstr *hostname) {
1850 if (hostname == NULL) return NULL;
1851
1852 bstr_to_lowercase(hostname);
1853
1854 // Remove dots from the end of the string.
1855 while (bstr_char_at_end(hostname, 0) == '.') bstr_chop(hostname);
1856
1857 return hostname;
1858 }
1859
1860 /**
1861 * Normalize URL path. This function implements the remove dot segments algorithm
1862 * specified in RFC 3986, section 5.2.4.
1863 *
1864 * @param[in] s
1865 */
htp_normalize_uri_path_inplace(bstr * s)1866 void htp_normalize_uri_path_inplace(bstr *s) {
1867 if (s == NULL) return;
1868
1869 unsigned char *data = bstr_ptr(s);
1870 if (data == NULL) return;
1871 size_t len = bstr_len(s);
1872
1873 size_t rpos = 0;
1874 size_t wpos = 0;
1875
1876 int c = -1;
1877 while ((rpos < len)&&(wpos < len)) {
1878 if (c == -1) {
1879 c = data[rpos++];
1880 }
1881
1882 // A. If the input buffer begins with a prefix of "../" or "./",
1883 // then remove that prefix from the input buffer; otherwise,
1884 if (c == '.') {
1885 if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) {
1886 c = -1;
1887 rpos += 2;
1888 continue;
1889 } else if ((rpos < len) && (data[rpos] == '/')) {
1890 c = -1;
1891 rpos += 1;
1892 continue;
1893 }
1894 }
1895
1896 if (c == '/') {
1897 // B. if the input buffer begins with a prefix of "/./" or "/.",
1898 // where "." is a complete path segment, then replace that
1899 // prefix with "/" in the input buffer; otherwise,
1900 if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) {
1901 c = '/';
1902 rpos += 2;
1903 continue;
1904 } else if ((rpos + 1 == len) && (data[rpos] == '.')) {
1905 c = '/';
1906 rpos += 1;
1907 continue;
1908 }
1909
1910 // C. if the input buffer begins with a prefix of "/../" or "/..",
1911 // where ".." is a complete path segment, then replace that
1912 // prefix with "/" in the input buffer and remove the last
1913 // segment and its preceding "/" (if any) from the output
1914 // buffer; otherwise,
1915 if ((rpos + 2 < len) && (data[rpos] == '.') && (data[rpos + 1] == '.') && (data[rpos + 2] == '/')) {
1916 c = '/';
1917 rpos += 3;
1918
1919 // Remove the last segment
1920 while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--;
1921 if (wpos > 0) wpos--;
1922 continue;
1923 } else if ((rpos + 2 == len) && (data[rpos] == '.') && (data[rpos + 1] == '.')) {
1924 c = '/';
1925 rpos += 2;
1926
1927 // Remove the last segment
1928 while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--;
1929 if (wpos > 0) wpos--;
1930 continue;
1931 }
1932 }
1933
1934 // D. if the input buffer consists only of "." or "..", then remove
1935 // that from the input buffer; otherwise,
1936 if ((c == '.') && (rpos == len)) {
1937 rpos++;
1938 continue;
1939 }
1940
1941 if ((c == '.') && (rpos + 1 == len) && (data[rpos] == '.')) {
1942 rpos += 2;
1943 continue;
1944 }
1945
1946 // E. move the first path segment in the input buffer to the end of
1947 // the output buffer, including the initial "/" character (if
1948 // any) and any subsequent characters up to, but not including,
1949 // the next "/" character or the end of the input buffer.
1950 data[wpos++] = c;
1951
1952 while ((rpos < len) && (data[rpos] != '/') && (wpos < len)) {
1953 data[wpos++] = data[rpos++];
1954 }
1955
1956 c = -1;
1957 }
1958
1959 bstr_adjust_len(s, wpos);
1960 }
1961
1962 /**
1963 *
1964 */
fprint_bstr(FILE * stream,const char * name,bstr * b)1965 void fprint_bstr(FILE *stream, const char *name, bstr *b) {
1966 if (b == NULL) {
1967 fprint_raw_data_ex(stream, name, "(null)", 0, 6);
1968 return;
1969 }
1970
1971 fprint_raw_data_ex(stream, name, bstr_ptr(b), 0, bstr_len(b));
1972 }
1973
/**
 * Write a hexadecimal dump of a memory area to a stream, starting at the
 * beginning of the area.
 *
 * @param[in] stream Stream to write to.
 * @param[in] name   Label printed in the dump header.
 * @param[in] data   Start of the memory area.
 * @param[in] len    Number of bytes to dump.
 */
void fprint_raw_data(FILE *stream, const char *name, const void *data, size_t len) {
    const size_t start_offset = 0;

    fprint_raw_data_ex(stream, name, data, start_offset, len);
}
1980
/**
 * Write a hexadecimal dump of part of a memory area to a stream.
 *
 * A header line with the label, the pointer, the starting offset and the
 * end offset (offset + printlen) is printed first. It is followed by one
 * row per 16 bytes, laid out as:
 *
 *   <offset, 8+ hex digits> <8 x "hh "> <8 x "hh "> |<ASCII>|
 *
 * Bytes that are not printable are shown as '.' in the ASCII column. In
 * the last row, positions past the end of the data are padded with three
 * spaces each, so that the ASCII column lines up with the rows above.
 *
 * @param[in] stream   Stream to write to.
 * @param[in] name     Label printed in the header line.
 * @param[in] _data    Start of the memory area.
 * @param[in] offset   Offset of the first byte to dump.
 * @param[in] printlen Number of bytes to dump.
 */
void fprint_raw_data_ex(FILE *stream, const char *name, const void *_data, size_t offset, size_t printlen) {
    static const char hex_digits[] = "0123456789abcdef";
    const unsigned char *data = (const unsigned char *) _data;
    // A full row needs well under 100 bytes even with a 16-digit offset.
    char buf[160];
    size_t len = offset + printlen;

    fprintf(stream, "\n%s: ptr %p offset %u len %u\n", name, (void*) data, (unsigned int)offset, (unsigned int)len);

    while (offset < len) {
        size_t i;
        char *p;

        // Offset column. The argument is widened to match PRIx64.
        int used = snprintf(buf, sizeof(buf), "%08" PRIx64 " ", (uint64_t) offset);
        if (used < 0) used = 0;
        p = buf + used;

        // Hexadecimal columns: two groups of eight, separated by a space.
        for (i = 0; i < 16; i++) {
            if (i == 8) {
                *p++ = ' ';
            }

            if (offset + i < len) {
                unsigned char byte = data[offset + i];
                *p++ = hex_digits[byte >> 4];
                *p++ = hex_digits[byte & 0x0f];
                *p++ = ' ';
            } else {
                // Same width as a printed byte, to keep the row aligned.
                *p++ = ' ';
                *p++ = ' ';
                *p++ = ' ';
            }
        }

        *p++ = ' ';
        *p++ = '|';

        // ASCII column.
        for (i = 0; (i < 16) && (offset + i < len); i++) {
            int c = data[offset + i];

            if (isprint(c)) {
                *p++ = c;
            } else {
                *p++ = '.';
            }
        }

        *p++ = '|';
        *p++ = '\n';
        *p = '\0';

        fprintf(stream, "%s", buf);
        offset += 16;
    }

    fprintf(stream, "\n");
}
2051
2052 /**
2053 *
2054 */
htp_connp_in_state_as_string(htp_connp_t * connp)2055 char *htp_connp_in_state_as_string(htp_connp_t *connp) {
2056 if (connp == NULL) return "NULL";
2057
2058 if (connp->in_state == htp_connp_REQ_IDLE) return "REQ_IDLE";
2059 if (connp->in_state == htp_connp_REQ_LINE) return "REQ_LINE";
2060 if (connp->in_state == htp_connp_REQ_PROTOCOL) return "REQ_PROTOCOL";
2061 if (connp->in_state == htp_connp_REQ_HEADERS) return "REQ_HEADERS";
2062 if (connp->in_state == htp_connp_REQ_CONNECT_CHECK) return "REQ_CONNECT_CHECK";
2063 if (connp->in_state == htp_connp_REQ_CONNECT_WAIT_RESPONSE) return "REQ_CONNECT_WAIT_RESPONSE";
2064 if (connp->in_state == htp_connp_REQ_BODY_DETERMINE) return "REQ_BODY_DETERMINE";
2065 if (connp->in_state == htp_connp_REQ_BODY_IDENTITY) return "REQ_BODY_IDENTITY";
2066 if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_LENGTH) return "REQ_BODY_CHUNKED_LENGTH";
2067 if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA) return "REQ_BODY_CHUNKED_DATA";
2068 if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA_END) return "REQ_BODY_CHUNKED_DATA_END";
2069 if (connp->in_state == htp_connp_REQ_FINALIZE) return "REQ_FINALIZE";
2070 if (connp->in_state == htp_connp_REQ_IGNORE_DATA_AFTER_HTTP_0_9) return "REQ_IGNORE_DATA_AFTER_HTTP_0_9";
2071
2072 return "UNKNOWN";
2073 }
2074
2075 /**
2076 *
2077 */
htp_connp_out_state_as_string(htp_connp_t * connp)2078 char *htp_connp_out_state_as_string(htp_connp_t *connp) {
2079 if (connp == NULL) return "NULL";
2080
2081 if (connp->out_state == htp_connp_RES_IDLE) return "RES_IDLE";
2082 if (connp->out_state == htp_connp_RES_LINE) return "RES_LINE";
2083 if (connp->out_state == htp_connp_RES_HEADERS) return "RES_HEADERS";
2084 if (connp->out_state == htp_connp_RES_BODY_DETERMINE) return "RES_BODY_DETERMINE";
2085 if (connp->out_state == htp_connp_RES_BODY_IDENTITY_CL_KNOWN) return "RES_BODY_IDENTITY_CL_KNOWN";
2086 if (connp->out_state == htp_connp_RES_BODY_IDENTITY_STREAM_CLOSE) return "RES_BODY_IDENTITY_STREAM_CLOSE";
2087 if (connp->out_state == htp_connp_RES_BODY_CHUNKED_LENGTH) return "RES_BODY_CHUNKED_LENGTH";
2088 if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA) return "RES_BODY_CHUNKED_DATA";
2089 if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA_END) return "RES_BODY_CHUNKED_DATA_END";
2090 if (connp->out_state == htp_connp_RES_FINALIZE) return "RES_BODY_FINALIZE";
2091
2092 return "UNKNOWN";
2093 }
2094
2095 /**
2096 *
2097 */
htp_tx_request_progress_as_string(htp_tx_t * tx)2098 char *htp_tx_request_progress_as_string(htp_tx_t *tx) {
2099 if (tx == NULL) return "NULL";
2100
2101 switch (tx->request_progress) {
2102 case HTP_REQUEST_NOT_STARTED:
2103 return "NOT_STARTED";
2104 case HTP_REQUEST_LINE:
2105 return "REQ_LINE";
2106 case HTP_REQUEST_HEADERS:
2107 return "REQ_HEADERS";
2108 case HTP_REQUEST_BODY:
2109 return "REQ_BODY";
2110 case HTP_REQUEST_TRAILER:
2111 return "REQ_TRAILER";
2112 case HTP_REQUEST_COMPLETE:
2113 return "COMPLETE";
2114 }
2115
2116 return "INVALID";
2117 }
2118
2119 /**
2120 *
2121 */
htp_tx_response_progress_as_string(htp_tx_t * tx)2122 char *htp_tx_response_progress_as_string(htp_tx_t *tx) {
2123 if (tx == NULL) return "NULL";
2124
2125 switch (tx->response_progress) {
2126 case HTP_RESPONSE_NOT_STARTED:
2127 return "NOT_STARTED";
2128 case HTP_RESPONSE_LINE:
2129 return "RES_LINE";
2130 case HTP_RESPONSE_HEADERS:
2131 return "RES_HEADERS";
2132 case HTP_RESPONSE_BODY:
2133 return "RES_BODY";
2134 case HTP_RESPONSE_TRAILER:
2135 return "RES_TRAILER";
2136 case HTP_RESPONSE_COMPLETE:
2137 return "COMPLETE";
2138 }
2139
2140 return "INVALID";
2141 }
2142
htp_unparse_uri_noencode(htp_uri_t * uri)2143 bstr *htp_unparse_uri_noencode(htp_uri_t *uri) {
2144 if (uri == NULL) return NULL;
2145
2146 // On the first pass determine the length of the final string
2147 size_t len = 0;
2148
2149 if (uri->scheme != NULL) {
2150 len += bstr_len(uri->scheme);
2151 len += 3; // "://"
2152 }
2153
2154 if ((uri->username != NULL) || (uri->password != NULL)) {
2155 if (uri->username != NULL) {
2156 len += bstr_len(uri->username);
2157 }
2158
2159 len += 1; // ":"
2160
2161 if (uri->password != NULL) {
2162 len += bstr_len(uri->password);
2163 }
2164
2165 len += 1; // "@"
2166 }
2167
2168 if (uri->hostname != NULL) {
2169 len += bstr_len(uri->hostname);
2170 }
2171
2172 if (uri->port != NULL) {
2173 len += 1; // ":"
2174 len += bstr_len(uri->port);
2175 }
2176
2177 if (uri->path != NULL) {
2178 len += bstr_len(uri->path);
2179 }
2180
2181 if (uri->query != NULL) {
2182 len += 1; // "?"
2183 len += bstr_len(uri->query);
2184 }
2185
2186 if (uri->fragment != NULL) {
2187 len += 1; // "#"
2188 len += bstr_len(uri->fragment);
2189 }
2190
2191 // On the second pass construct the string
2192 bstr *r = bstr_alloc(len);
2193 if (r == NULL) return NULL;
2194
2195 if (uri->scheme != NULL) {
2196 bstr_add_noex(r, uri->scheme);
2197 bstr_add_c_noex(r, "://");
2198 }
2199
2200 if ((uri->username != NULL) || (uri->password != NULL)) {
2201 if (uri->username != NULL) {
2202 bstr_add_noex(r, uri->username);
2203 }
2204
2205 bstr_add_c_noex(r, ":");
2206
2207 if (uri->password != NULL) {
2208 bstr_add_noex(r, uri->password);
2209 }
2210
2211 bstr_add_c_noex(r, "@");
2212 }
2213
2214 if (uri->hostname != NULL) {
2215 bstr_add_noex(r, uri->hostname);
2216 }
2217
2218 if (uri->port != NULL) {
2219 bstr_add_c_noex(r, ":");
2220 bstr_add_noex(r, uri->port);
2221 }
2222
2223 if (uri->path != NULL) {
2224 bstr_add_noex(r, uri->path);
2225 }
2226
2227 if (uri->query != NULL) {
2228 bstr_add_c_noex(r, "?");
2229 bstr_add_noex(r, uri->query);
2230 }
2231
2232 if (uri->fragment != NULL) {
2233 bstr_add_c_noex(r, "#");
2234 bstr_add_noex(r, uri->fragment);
2235 }
2236
2237 return r;
2238 }
2239
/**
 * Decides whether data that is expected to be a response line should
 * instead be treated as response body. Browsers are lax when parsing the
 * response line; in most cases they only look for the word "http" at the
 * beginning, so the same relaxed test is applied here.
 *
 * Observed browser behavior:
 *   Firefox 3.5.x: (?i)^\s*http
 *   IE:            (?i)^\s*http\s*\/
 *   Safari:        ^HTTP/\d+\.\d+\s+\d{3}
 *
 * @param[in] data pointer to bytearray; may be NULL
 * @param[in] len length in bytes of data
 * @return 0 if, after skipping leading whitespace and NUL bytes, the data
 *         begins with "http" (compared case-insensitively), meaning it
 *         looks like a response line; 1 otherwise (including when data is
 *         NULL or too short), meaning it should be treated as body.
 */
int htp_treat_response_line_as_body(const uint8_t *data, size_t len) {
    static const char upper[] = "HTTP";
    static const char lower[] = "http";
    size_t start = 0;
    size_t i;

    if (data == NULL) {
        return 1;
    }

    // Skip leading whitespace and NUL bytes.
    while ((start < len) && (htp_is_space(data[start]) || (data[start] == 0))) {
        start++;
    }

    // Not enough bytes left to hold the four-letter prefix.
    if (len < start + 4) {
        return 1;
    }

    // Each byte must match the prefix in either upper or lower case.
    for (i = 0; i < 4; i++) {
        const uint8_t c = data[start + i];

        if ((c != (uint8_t) upper[i]) && (c != (uint8_t) lower[i])) {
            return 1;
        }
    }

    return 0;
}
2269
2270 /**
2271 * Run the REQUEST_BODY_DATA hook.
2272 *
2273 * @param[in] connp
2274 * @param[in] d
2275 */
htp_req_run_hook_body_data(htp_connp_t * connp,htp_tx_data_t * d)2276 htp_status_t htp_req_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) {
2277 // Do not invoke callbacks with an empty data chunk
2278 if ((d->data != NULL) && (d->len == 0)) return HTP_OK;
2279
2280 // Do not invoke callbacks without a transaction.
2281 if (connp->in_tx == NULL) return HTP_OK;
2282
2283 // Run transaction hooks first
2284 htp_status_t rc = htp_hook_run_all(connp->in_tx->hook_request_body_data, d);
2285 if (rc != HTP_OK) return rc;
2286
2287 // Run configuration hooks second
2288 rc = htp_hook_run_all(connp->cfg->hook_request_body_data, d);
2289 if (rc != HTP_OK) return rc;
2290
2291 // On PUT requests, treat request body as file
2292 if (connp->put_file != NULL) {
2293 htp_file_data_t file_data;
2294
2295 file_data.data = d->data;
2296 file_data.len = d->len;
2297 file_data.file = connp->put_file;
2298 file_data.file->len += d->len;
2299
2300 rc = htp_hook_run_all(connp->cfg->hook_request_file_data, &file_data);
2301 if (rc != HTP_OK) return rc;
2302 }
2303
2304 return HTP_OK;
2305 }
2306
2307 /**
2308 * Run the RESPONSE_BODY_DATA hook.
2309 *
2310 * @param[in] connp
2311 * @param[in] d
2312 */
htp_res_run_hook_body_data(htp_connp_t * connp,htp_tx_data_t * d)2313 htp_status_t htp_res_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) {
2314 // Do not invoke callbacks with an empty data chunk.
2315 if ((d->data != NULL) && (d->len == 0)) return HTP_OK;
2316
2317 // Run transaction hooks first
2318 htp_status_t rc = htp_hook_run_all(connp->out_tx->hook_response_body_data, d);
2319 if (rc != HTP_OK) return rc;
2320
2321 // Run configuration hooks second
2322 rc = htp_hook_run_all(connp->cfg->hook_response_body_data, d);
2323 if (rc != HTP_OK) return rc;
2324
2325 return HTP_OK;
2326 }
2327
2328 /**
2329 * Parses the provided memory region, extracting the double-quoted string.
2330 *
2331 * @param[in] data
2332 * @param[in] len
2333 * @param[out] out
2334 * @param[out] endoffset
2335 * @return HTP_OK on success, HTP_DECLINED if the input is not well formed, and HTP_ERROR on fatal errors.
2336 */
htp_extract_quoted_string_as_bstr(unsigned char * data,size_t len,bstr ** out,size_t * endoffset)2337 htp_status_t htp_extract_quoted_string_as_bstr(unsigned char *data, size_t len, bstr **out, size_t *endoffset) {
2338 if ((data == NULL) || (out == NULL)) return HTP_ERROR;
2339
2340 if (len == 0) return HTP_DECLINED;
2341
2342 size_t pos = 0;
2343
2344 // Check that the first character is a double quote.
2345 if (data[pos] != '"') return HTP_DECLINED;
2346
2347 // Step over the double quote.
2348 pos++;
2349 if (pos == len) return HTP_DECLINED;
2350
2351 // Calculate the length of the resulting string.
2352 size_t escaped_chars = 0;
2353 while (pos < len) {
2354 if (data[pos] == '\\') {
2355 if (pos + 1 < len) {
2356 escaped_chars++;
2357 pos += 2;
2358 continue;
2359 }
2360 } else if (data[pos] == '"') {
2361 break;
2362 }
2363
2364 pos++;
2365 }
2366
2367 // Have we reached the end of input without seeing the terminating double quote?
2368 if (pos == len) return HTP_DECLINED;
2369
2370 // Copy the data and unescape it as necessary.
2371 size_t outlen = pos - 1 - escaped_chars;
2372 *out = bstr_alloc(outlen);
2373 if (*out == NULL) return HTP_ERROR;
2374 unsigned char *outptr = bstr_ptr(*out);
2375 size_t outpos = 0;
2376
2377 pos = 1;
2378 while ((pos < len) && (outpos < outlen)) {
2379 // TODO We are not properly unescaping test here, we're only
2380 // handling escaped double quotes.
2381 if (data[pos] == '\\') {
2382 if (pos + 1 < len) {
2383 outptr[outpos++] = data[pos + 1];
2384 pos += 2;
2385 continue;
2386 }
2387 } else if (data[pos] == '"') {
2388 break;
2389 }
2390
2391 outptr[outpos++] = data[pos++];
2392 }
2393
2394 bstr_adjust_len(*out, outlen);
2395
2396 if (endoffset != NULL) {
2397 *endoffset = pos;
2398 }
2399
2400 return HTP_OK;
2401 }
2402
htp_parse_ct_header(bstr * header,bstr ** ct)2403 htp_status_t htp_parse_ct_header(bstr *header, bstr **ct) {
2404 if ((header == NULL) || (ct == NULL)) return HTP_ERROR;
2405
2406 unsigned char *data = bstr_ptr(header);
2407 size_t len = bstr_len(header);
2408
2409 // The assumption here is that the header value we receive
2410 // here has been left-trimmed, which means the starting position
2411 // is on the media type. On some platforms that may not be the
2412 // case, and we may need to do the left-trim ourselves.
2413
2414 // Find the end of the MIME type, using the same approach PHP 5.4.3 uses.
2415 size_t pos = 0;
2416 while ((pos < len) && (data[pos] != ';') && (data[pos] != ',') && (data[pos] != ' ')) pos++;
2417
2418 *ct = bstr_dup_ex(header, 0, pos);
2419 if (*ct == NULL) return HTP_ERROR;
2420
2421 bstr_to_lowercase(*ct);
2422
2423 return HTP_OK;
2424 }
2425
2426 /**
2427 * Implements relaxed (not strictly RFC) hostname validation.
2428 *
2429 * @param[in] hostname
2430 * @return 1 if the supplied hostname is valid; 0 if it is not.
2431 */
htp_validate_hostname(bstr * hostname)2432 int htp_validate_hostname(bstr *hostname) {
2433 unsigned char *data = bstr_ptr(hostname);
2434 size_t len = bstr_len(hostname);
2435 size_t startpos = 0;
2436 size_t pos = 0;
2437
2438 if ((len == 0) || (len > 255)) return 0;
2439
2440 while (pos < len) {
2441 // Validate label characters.
2442 startpos = pos;
2443 while ((pos < len) && (data[pos] != '.')) {
2444 unsigned char c = data[pos];
2445 // According to the RFC, the underscore is not allowed in a label, but
2446 // we allow it here because we think it's often seen in practice.
2447 if (!(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
2448 ((c >= '0') && (c <= '9')) ||
2449 (c == '-') || (c == '_')))
2450 {
2451 return 0;
2452 }
2453
2454 pos++;
2455 }
2456
2457 // Validate label length.
2458 if ((pos - startpos == 0) || (pos - startpos > 63)) return 0;
2459
2460 if (pos >= len) return 1; // No more data after label.
2461
2462 // How many dots are there?
2463 startpos = pos;
2464 while ((pos < len) && (data[pos] == '.')) pos++;
2465
2466 if (pos - startpos != 1) return 0; // Exactly one dot expected.
2467 }
2468
2469 return 1;
2470 }
2471
/**
 * Releases a URI structure: each of its bstr components is passed to
 * bstr_free() and the structure itself is then freed.
 *
 * @param[in] uri URI structure to release; NULL is ignored.
 */
void htp_uri_free(htp_uri_t *uri) {
    if (uri == NULL) {
        return;
    }

    // Components are released in declaration order of this table.
    bstr *components[] = {
        uri->scheme,
        uri->username,
        uri->password,
        uri->hostname,
        uri->port,
        uri->path,
        uri->query,
        uri->fragment
    };

    for (size_t i = 0; i < sizeof (components) / sizeof (components[0]); i++) {
        bstr_free(components[i]);
    }

    free(uri);
}
2486
htp_uri_alloc()2487 htp_uri_t *htp_uri_alloc() {
2488 htp_uri_t *u = calloc(1, sizeof (htp_uri_t));
2489 if (u == NULL) return NULL;
2490
2491 u->port_number = -1;
2492
2493 return u;
2494 }
2495
htp_get_version(void)2496 char *htp_get_version(void) {
2497 return HTP_VERSION_STRING_FULL;
2498 }
2499