1 /***************************************************************************
2  * Copyright (c) 2009-2010 Open Information Security Foundation
3  * Copyright (c) 2010-2013 Qualys, Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are
8  * met:
9  *
10  * - Redistributions of source code must retain the above copyright
11  *   notice, this list of conditions and the following disclaimer.
12 
13  * - Redistributions in binary form must reproduce the above copyright
14  *   notice, this list of conditions and the following disclaimer in the
15  *   documentation and/or other materials provided with the distribution.
16 
17  * - Neither the name of the Qualys, Inc. nor the names of its
18  *   contributors may be used to endorse or promote products derived from
19  *   this software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  ***************************************************************************/
33 
34 /**
35  * @file
36  * @author Ivan Ristic <ivanr@webkreator.com>
37  */
38 
39 #include "htp_config_auto.h"
40 
41 #include "htp_private.h"
42 
/**
 * Tests whether a character is linear white space (LWS), that is,
 * a space or a horizontal tab.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a space or a tab, 0 otherwise.
 */
int htp_is_lws(int c) {
    return ((c == ' ') || (c == '\t')) ? 1 : 0;
}
53 
/**
 * Tests whether a character is one of the HTTP separator characters:
 *
 *   separators = "(" | ")" | "<" | ">" | "@"
 *              | "," | ";" | ":" | "\" | <">
 *              | "/" | "[" | "]" | "?" | "="
 *              | "{" | "}" | SP | HT
 *
 * @param[in] c Character to test.
 * @return 1 if c is a separator, 0 otherwise.
 */
int htp_is_separator(int c) {
    // Brace-initialized on purpose: no terminating NUL, so a zero
    // input can never match an entry.
    static const char separators[] = {
        '(', ')', '<', '>', '@',
        ',', ';', ':', '\\', '"',
        '/', '[', ']', '?', '=',
        '{', '}', ' ', '\t'
    };

    for (size_t i = 0; i < sizeof (separators); i++) {
        if (c == separators[i]) return 1;
    }

    return 0;
}
91 
/**
 * Tests whether a character is a text character. A horizontal tab and
 * every value of 32 or above are accepted; all other values below 32
 * (including negative ones) are rejected.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a text character, 0 otherwise.
 */
int htp_is_text(int c) {
    return ((c == '\t') || (c >= 32)) ? 1 : 0;
}
103 
/**
 * Tests whether a character may appear in a token:
 *
 *   token = 1*<any CHAR except CTLs or separators>
 *   CHAR  = <any US-ASCII character (octets 0 - 127)>
 *
 * Only values in the range 32..126 that are not separators qualify.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a token character, 0 otherwise.
 */
int htp_is_token(int c) {
    int in_range = (c >= 32) && (c <= 126);

    // The separator test is only consulted for in-range characters.
    return (in_range && !htp_is_separator(c)) ? 1 : 0;
}
117 
118 /**
119  * Remove all line terminators (LF, CR or CRLF) from
120  * the end of the line provided as input.
121  *
122  * @return 0 if nothing was removed, 1 if one or more LF characters were removed, or
123  *         2 if one or more CR and/or LF characters were removed.
124  */
htp_chomp(unsigned char * data,size_t * len)125 int htp_chomp(unsigned char *data, size_t *len) {
126     int r = 0;
127 
128     // Loop until there's no more stuff in the buffer
129     while (*len > 0) {
130         // Try one LF first
131         if (data[*len - 1] == LF) {
132             (*len)--;
133             r = 1;
134 
135             if (*len == 0) return r;
136 
137             // A CR is allowed before LF
138             if (data[*len - 1] == CR) {
139                 (*len)--;
140                 r = 2;
141             }
142         } else if (data[*len - 1] == CR) {
143             (*len)--;
144             r = 1;
145         } else return r;
146     }
147 
148     return r;
149 }
150 
/**
 * Tests whether a character is white space: space, form feed,
 * vertical tab, horizontal tab, carriage return or line feed.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a white space character, 0 otherwise.
 */
int htp_is_space(int c) {
    return ((c == ' ') || (c == '\f') || (c == '\v') ||
            (c == '\t') || (c == '\r') || (c == '\n')) ? 1 : 0;
}
170 
171 /**
172  * Converts request method, given as a string, into a number.
173  *
174  * @param[in] method
175  * @return Method number of M_UNKNOWN
176  */
htp_convert_method_to_number(bstr * method)177 int htp_convert_method_to_number(bstr *method) {
178     if (method == NULL) return HTP_M_UNKNOWN;
179 
180     // TODO Optimize using parallel matching, or something similar.
181 
182     if (bstr_cmp_c(method, "GET") == 0) return HTP_M_GET;
183     if (bstr_cmp_c(method, "PUT") == 0) return HTP_M_PUT;
184     if (bstr_cmp_c(method, "POST") == 0) return HTP_M_POST;
185     if (bstr_cmp_c(method, "DELETE") == 0) return HTP_M_DELETE;
186     if (bstr_cmp_c(method, "CONNECT") == 0) return HTP_M_CONNECT;
187     if (bstr_cmp_c(method, "OPTIONS") == 0) return HTP_M_OPTIONS;
188     if (bstr_cmp_c(method, "TRACE") == 0) return HTP_M_TRACE;
189     if (bstr_cmp_c(method, "PATCH") == 0) return HTP_M_PATCH;
190     if (bstr_cmp_c(method, "PROPFIND") == 0) return HTP_M_PROPFIND;
191     if (bstr_cmp_c(method, "PROPPATCH") == 0) return HTP_M_PROPPATCH;
192     if (bstr_cmp_c(method, "MKCOL") == 0) return HTP_M_MKCOL;
193     if (bstr_cmp_c(method, "COPY") == 0) return HTP_M_COPY;
194     if (bstr_cmp_c(method, "MOVE") == 0) return HTP_M_MOVE;
195     if (bstr_cmp_c(method, "LOCK") == 0) return HTP_M_LOCK;
196     if (bstr_cmp_c(method, "UNLOCK") == 0) return HTP_M_UNLOCK;
197     if (bstr_cmp_c(method, "VERSION-CONTROL") == 0) return HTP_M_VERSION_CONTROL;
198     if (bstr_cmp_c(method, "CHECKOUT") == 0) return HTP_M_CHECKOUT;
199     if (bstr_cmp_c(method, "UNCHECKOUT") == 0) return HTP_M_UNCHECKOUT;
200     if (bstr_cmp_c(method, "CHECKIN") == 0) return HTP_M_CHECKIN;
201     if (bstr_cmp_c(method, "UPDATE") == 0) return HTP_M_UPDATE;
202     if (bstr_cmp_c(method, "LABEL") == 0) return HTP_M_LABEL;
203     if (bstr_cmp_c(method, "REPORT") == 0) return HTP_M_REPORT;
204     if (bstr_cmp_c(method, "MKWORKSPACE") == 0) return HTP_M_MKWORKSPACE;
205     if (bstr_cmp_c(method, "MKACTIVITY") == 0) return HTP_M_MKACTIVITY;
206     if (bstr_cmp_c(method, "BASELINE-CONTROL") == 0) return HTP_M_BASELINE_CONTROL;
207     if (bstr_cmp_c(method, "MERGE") == 0) return HTP_M_MERGE;
208     if (bstr_cmp_c(method, "INVALID") == 0) return HTP_M_INVALID;
209     if (bstr_cmp_c(method, "HEAD") == 0) return HTP_M_HEAD;
210 
211     return HTP_M_UNKNOWN;
212 }
213 
214 /**
215  * Is the given line empty?
216  *
217  * @param[in] data
218  * @param[in] len
219  * @return 0 or 1
220  */
htp_is_line_empty(unsigned char * data,size_t len)221 int htp_is_line_empty(unsigned char *data, size_t len) {
222     if ((len == 1) ||
223         ((len == 2) && (data[0] == CR) && (data[1] == LF))) {
224         return 1;
225     }
226 
227     return 0;
228 }
229 
/**
 * Does the line consist entirely of whitespace characters, as
 * classified by isspace()? A zero-length line is reported as
 * whitespace.
 *
 * @param[in] data Line buffer.
 * @param[in] len Length of the line.
 * @return 1 if every byte is whitespace (or len is 0), 0 otherwise.
 */
int htp_is_line_whitespace(unsigned char *data, size_t len) {
    while (len > 0) {
        if (isspace(*data) == 0) return 0;

        data++;
        len--;
    }

    return 1;
}
248 
249 /**
250  * Parses Content-Length string (positive decimal number).
251  * White space is allowed before and after the number.
252  *
253  * @param[in] b
254  * @return Content-Length as a number, or -1 on error.
255  */
htp_parse_content_length(bstr * b,htp_connp_t * connp)256 int64_t htp_parse_content_length(bstr *b, htp_connp_t *connp) {
257     size_t len = bstr_len(b);
258     unsigned char * data = (unsigned char *) bstr_ptr(b);
259     size_t pos = 0;
260     int64_t r = 0;
261 
262     if (len == 0) return -1003;
263 
264     // Ignore junk before
265     while ((pos < len) && (data[pos] < '0' || data[pos] > '9')) {
266         if (!htp_is_lws(data[pos]) && connp != NULL && r == 0) {
267             htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0,
268                     "C-L value with extra data in the beginnning");
269             r = -1;
270         }
271         pos++;
272     }
273     if (pos == len) return -1001;
274 
275     r = bstr_util_mem_to_pint(data + pos, len - pos, 10, &pos);
276     // Ok to have junk afterwards
277     if (pos < len && connp != NULL) {
278         htp_log(connp, HTP_LOG_MARK, HTP_LOG_WARNING, 0,
279                 "C-L value with extra data in the end");
280     }
281     return r;
282 }
283 
/**
 * Parses a chunk length (positive hexadecimal number). Leading CR, LF,
 * space, tab, vertical tab and form feed bytes are skipped, and
 * everything from the first non-hexadecimal byte onwards is ignored.
 * An error is returned if the chunk length is greater than INT32_MAX.
 *
 * @param[in] data Buffer holding the chunk length line.
 * @param[in] len Length of the buffer.
 * @return Chunk length; -1004 if the buffer is empty or holds only
 *         skippable bytes; -1 if the length exceeds INT32_MAX; or the
 *         negative value reported by
 *         htp_parse_positive_integer_whitespace().
 */
int64_t htp_parse_chunked_length(unsigned char *data, size_t len) {
    // Skip leading line feeds and other whitespace-like control bytes.
    for (; len > 0; data++, len--) {
        const unsigned char c = *data;
        int skippable = (c == 0x0d) || (c == 0x0a) || (c == 0x20) ||
                (c == 0x09) || (c == 0x0b) || (c == 0x0c);

        if (!skippable) break;
    }

    if (len == 0) return -1004;

    // Measure the run of hexadecimal digits; trailing junk is cut off.
    size_t hex_len = 0;
    while (hex_len < len) {
        const unsigned char c = data[hex_len];
        int is_hex = isdigit(c) ||
                ((c >= 'a') && (c <= 'f')) || ((c >= 'A') && (c <= 'F'));

        if (!is_hex) break;

        hex_len++;
    }

    int64_t chunk_len = htp_parse_positive_integer_whitespace(data, hex_len, 16);
    if (chunk_len < 0) return chunk_len;

    return (chunk_len > INT32_MAX) ? -1 : chunk_len;
}
324 
/**
 * A somewhat forgiving parser for a positive integer in a given base.
 * Only LWS (space and tab) is allowed before and after the number.
 *
 * @param[in] data Buffer to parse.
 * @param[in] len Length of the buffer.
 * @param[in] base Numeric base, passed on to bstr_util_mem_to_pint().
 * @return The parsed number on success. On error: -1003 if the buffer is
 *         empty, -1001 if it contains only LWS, -1002 if something other
 *         than LWS follows the number, or the negative value reported by
 *         bstr_util_mem_to_pint().
 */
int64_t htp_parse_positive_integer_whitespace(unsigned char *data, size_t len, int base) {
    if (len == 0) return -1003;

    // Step over the leading LWS.
    size_t cursor = 0;
    while ((cursor < len) && htp_is_lws(data[cursor])) cursor++;
    if (cursor == len) return -1001;

    size_t digits_end;
    int64_t value = bstr_util_mem_to_pint(data + cursor, len - cursor, base, &digits_end);
    if (value < 0) return value;

    // Continue right after the last digit; only LWS may remain.
    for (cursor += digits_end; cursor < len; cursor++) {
        if (!htp_is_lws(data[cursor])) return -1002;
    }

    return value;
}
361 
362 #ifdef HTP_DEBUG
363 
364 /**
365  * Prints one log message to stderr.
366  *
367  * @param[in] stream
368  * @param[in] log
369  */
htp_print_log(FILE * stream,htp_log_t * log)370 void htp_print_log(FILE *stream, htp_log_t *log) {
371     if (log->code != 0) {
372         fprintf(stream, "[%d][code %d][file %s][line %d] %s\n", log->level,
373                 log->code, log->file, log->line, log->msg);
374     } else {
375         fprintf(stream, "[%d][file %s][line %d] %s\n", log->level,
376                 log->file, log->line, log->msg);
377     }
378 }
379 #endif
380 
381 /**
382  * Records one log message.
383  *
384  * @param[in] connp
385  * @param[in] file
386  * @param[in] line
387  * @param[in] level
388  * @param[in] code
389  * @param[in] fmt
390  */
htp_log(htp_connp_t * connp,const char * file,int line,enum htp_log_level_t level,int code,const char * fmt,...)391 void htp_log(htp_connp_t *connp, const char *file, int line, enum htp_log_level_t level, int code, const char *fmt, ...) {
392     if (connp == NULL) return;
393 
394     char buf[1024];
395     va_list args;
396 
397     // Ignore messages below our log level.
398     if (connp->cfg->log_level < level) {
399         return;
400     }
401 
402     va_start(args, fmt);
403 
404     int r = vsnprintf(buf, 1024, fmt, args);
405 
406     va_end(args);
407 
408     if (r < 0) {
409         snprintf(buf, 1024, "[vnsprintf returned error %d]", r);
410     } else if (r >= 1024) {
411         // Indicate overflow with a '+' at the end.
412         buf[1022] = '+';
413         buf[1023] = '\0';
414     }
415 
416     // Create a new log entry.
417 
418     htp_log_t *log = calloc(1, sizeof (htp_log_t));
419     if (log == NULL) return;
420 
421     log->connp = connp;
422     log->file = file;
423     log->line = line;
424     log->level = level;
425     log->code = code;
426     log->msg = strdup(buf);
427 
428     htp_list_add(connp->conn->messages, log);
429 
430     if (level == HTP_LOG_ERROR) {
431         connp->last_error = log;
432     }
433 
434     #ifdef HTP_DEBUG
435     fprintf(stderr, "[LOG] %s\n", log->msg);
436     #endif
437 
438     /* coverity[check_return] */
439     htp_hook_run_all(connp->cfg->hook_log, log);
440 }
441 
/**
 * Determines if the given line is a continuation (of some previous
 * line) by checking whether its first byte is a folding character.
 *
 * @param[in] data Line buffer.
 * @param[in] len Length of the line.
 * @return 0 or 1 for false and true, respectively. Returns -1 on error
 *         (NULL pointer or length zero).
 */
int htp_connp_is_line_folded(unsigned char *data, size_t len) {
    if ((data != NULL) && (len != 0)) {
        return htp_is_folding_char(data[0]);
    }

    return -1;
}
453 
/**
 * Tests whether a character marks a folded (continuation) line: LWS
 * (space or tab) or a NUL byte.
 *
 * @param[in] c Character to test.
 * @return 1 if c is a folding character, 0 otherwise.
 */
int htp_is_folding_char(int c) {
    return ((c == 0) || htp_is_lws(c)) ? 1 : 0;
}
458 
459 /**
460  * Determines if the given line is a request terminator.
461  *
462  * @param[in] connp
463  * @param[in] data
464  * @param[in] len
465  * @return 0 or 1
466  */
htp_connp_is_line_terminator(htp_connp_t * connp,unsigned char * data,size_t len,int next_no_lf)467 int htp_connp_is_line_terminator(htp_connp_t *connp, unsigned char *data, size_t len, int next_no_lf) {
468     // Is this the end of request headers?
469     switch (connp->cfg->server_personality) {
470         case HTP_SERVER_IIS_5_1:
471             // IIS 5 will accept a whitespace line as a terminator
472             if (htp_is_line_whitespace(data, len)) {
473                 return 1;
474             }
475 
476             // Fall through
477         default:
478             // Treat an empty line as terminator
479             if (htp_is_line_empty(data, len)) {
480                 return 1;
481             }
482             // Only space is terminator if terminator does not follow right away
483             if (len == 2 && htp_is_lws(data[0]) && data[1] == LF) {
484                 return next_no_lf;
485             }
486             break;
487     }
488 
489     return 0;
490 }
491 
492 /**
493  * Determines if the given line can be ignored when it appears before a request.
494  *
495  * @param[in] connp
496  * @param[in] data
497  * @param[in] len
498  * @return 0 or 1
499  */
htp_connp_is_line_ignorable(htp_connp_t * connp,unsigned char * data,size_t len)500 int htp_connp_is_line_ignorable(htp_connp_t *connp, unsigned char *data, size_t len) {
501     return htp_connp_is_line_terminator(connp, data, len, 0);
502 }
503 
htp_parse_port(unsigned char * data,size_t len,int * port,int * invalid)504 static htp_status_t htp_parse_port(unsigned char *data, size_t len, int *port, int *invalid) {
505     if (len == 0) {
506         *port = -1;
507         *invalid = 1;
508         return HTP_OK;
509     }
510 
511     int64_t port_parsed = htp_parse_positive_integer_whitespace(data, len, 10);
512 
513     if (port_parsed < 0) {
514         // Failed to parse the port number.
515         *port = -1;
516         *invalid = 1;
517     } else if ((port_parsed > 0) && (port_parsed < 65536)) {
518         // Valid port number.
519         *port = port_parsed;
520     } else {
521         // Port number out of range.
522         *port = -1;
523         *invalid = 1;
524     }
525 
526     return HTP_OK;
527 }
528 
529 /**
530  * Parses an authority string, which consists of a hostname with an optional port number; username
531  * and password are not allowed and will not be handled.
532  *
533  * @param[in] hostport
534  * @param[out] hostname A bstring containing the hostname, or NULL if the hostname is invalid. If this value
535  *                      is not NULL, the caller assumes responsibility for memory management.
536  * @param[out] port Port as text, or NULL if not provided.
537  * @param[out] port_number Port number, or -1 if the port is not present or invalid.
538  * @param[out] invalid Set to 1 if any part of the authority is invalid.
539  * @return HTP_OK on success, HTP_ERROR on memory allocation failure.
540  */
htp_parse_hostport(bstr * hostport,bstr ** hostname,bstr ** port,int * port_number,int * invalid)541 htp_status_t htp_parse_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, int *invalid) {
542     if ((hostport == NULL) || (hostname == NULL) || (port_number == NULL) || (invalid == NULL)) return HTP_ERROR;
543 
544     *hostname = NULL;
545     if (port != NULL) {
546         *port = NULL;
547     }
548     *port_number = -1;
549     *invalid = 0;
550 
551     unsigned char *data = bstr_ptr(hostport);
552     size_t len = bstr_len(hostport);
553 
554     bstr_util_mem_trim(&data, &len);
555 
556     if (len == 0) {
557         *invalid = 1;
558         return HTP_OK;
559     }
560 
561     // Check for an IPv6 address.
562     if (data[0] == '[') {
563         // IPv6 host.
564 
565         // Find the end of the IPv6 address.
566         size_t pos = 0;
567         while ((pos < len) && (data[pos] != ']')) pos++;
568         if (pos == len) {
569             *invalid = 1;
570             return HTP_OK;
571         }
572 
573         *hostname = bstr_dup_mem(data, pos + 1);
574         if (*hostname == NULL) return HTP_ERROR;
575 
576         // Over the ']'.
577         pos++;
578         if (pos == len) return HTP_OK;
579 
580         // Handle port.
581         if (data[pos] == ':') {
582             if (port != NULL) {
583                 *port = bstr_dup_mem(data + pos + 1, len - pos - 1);
584                 if (*port == NULL) {
585                     bstr_free(*hostname);
586                     return HTP_ERROR;
587                 }
588             }
589 
590             return htp_parse_port(data + pos + 1, len - pos - 1, port_number, invalid);
591         } else {
592             *invalid = 1;
593             return HTP_OK;
594         }
595     } else {
596         // Not IPv6 host.
597 
598         // Is there a colon?
599         unsigned char *colon = memchr(data, ':', len);
600         if (colon == NULL) {
601             // Hostname alone, no port.
602 
603             *hostname = bstr_dup_mem(data, len);
604             if (*hostname == NULL) return HTP_ERROR;
605 
606             bstr_to_lowercase(*hostname);
607         } else {
608             // Hostname and port.
609 
610             // Ignore whitespace at the end of hostname.
611             unsigned char *hostend = colon;
612             while ((hostend > data) && (isspace(*(hostend - 1)))) hostend--;
613 
614             *hostname = bstr_dup_mem(data, hostend - data);
615             if (*hostname == NULL) return HTP_ERROR;
616 
617             if (port != NULL) {
618                 *port = bstr_dup_mem(colon + 1, len - (colon + 1 - data));
619                 if (*port == NULL) {
620                     bstr_free(*hostname);
621                     return HTP_ERROR;
622                 }
623             }
624 
625             return htp_parse_port(colon + 1, len - (colon + 1 - data), port_number, invalid);
626         }
627     }
628 
629     return HTP_OK;
630 }
631 
632 /**
633  * Parses hostport provided in the URI.
634  *
635  * @param[in] connp
636  * @param[in] hostport
637  * @param[in] uri
638  * @return HTP_OK on success or HTP_ERROR error.
639  */
htp_parse_uri_hostport(htp_connp_t * connp,bstr * hostport,htp_uri_t * uri)640 int htp_parse_uri_hostport(htp_connp_t *connp, bstr *hostport, htp_uri_t *uri) {
641     int invalid;
642 
643     htp_status_t rc = htp_parse_hostport(hostport, &(uri->hostname), &(uri->port), &(uri->port_number), &invalid);
644     if (rc != HTP_OK) return rc;
645 
646     if (invalid) {
647         connp->in_tx->flags |= HTP_HOSTU_INVALID;
648     }
649 
650     if (uri->hostname != NULL) {
651         if (htp_validate_hostname(uri->hostname) == 0) {
652             connp->in_tx->flags |= HTP_HOSTU_INVALID;
653         }
654     }
655 
656     return HTP_OK;
657 }
658 
659 /**
660  * Parses hostport provided in the Host header.
661  *
662  * @param[in] hostport
663  * @param[out] hostname
664  * @param[out] port
665  * @param[out] port_number
666  * @param[out] flags
667  * @return HTP_OK on success or HTP_ERROR error.
668  */
htp_parse_header_hostport(bstr * hostport,bstr ** hostname,bstr ** port,int * port_number,uint64_t * flags)669 htp_status_t htp_parse_header_hostport(bstr *hostport, bstr **hostname, bstr **port, int *port_number, uint64_t *flags) {
670     int invalid;
671 
672     htp_status_t rc = htp_parse_hostport(hostport, hostname, port, port_number, &invalid);
673     if (rc != HTP_OK) return rc;
674 
675     if (invalid) {
676         *flags |= HTP_HOSTH_INVALID;
677     }
678 
679     if (*hostname != NULL) {
680         if (htp_validate_hostname(*hostname) == 0) {
681             *flags |= HTP_HOSTH_INVALID;
682         }
683     }
684 
685     return HTP_OK;
686 }
687 
/**
 * Parses request URI, making no attempt to validate the contents. The input
 * is split into scheme, credentials (username and password), hostname, port
 * (kept as text), path, query and fragment; each part found is stored in the
 * URI structure as a newly created bstring. The path is always set when the
 * input is non-empty, even if it ends up being an empty string.
 *
 * @param[in] input URI to parse. May be NULL, in which case the structure is
 *                  allocated (if needed) but left otherwise untouched.
 * @param[in,out] uri Address of the URI structure pointer. If *uri is NULL a
 *                    zeroed structure is allocated and returned here. If an
 *                    error occurs later, the structure stays in *uri with
 *                    whatever parts were stored up to that point.
 * @return HTP_ERROR on memory allocation failure (calloc() or bstr_dup_mem()
 *         returning NULL), HTP_OK otherwise
 */
int htp_parse_uri(bstr *input, htp_uri_t **uri) {
    // Allow a htp_uri_t structure to be provided on input,
    // but allocate a new one if the structure is NULL.
    if (*uri == NULL) {
        *uri = calloc(1, sizeof (htp_uri_t));
        if (*uri == NULL) return HTP_ERROR;
    }

    if (input == NULL) {
        // The input might be NULL on requests that don't actually
        // contain the URI. We allow that.
        return HTP_OK;
    }

    unsigned char *data = bstr_ptr(input);
    size_t len = bstr_len(input);
    // start marks the beginning of the part being extracted,
    // pos is the current read position.
    size_t start, pos;

    if (len == 0) {
        // Empty string.
        return HTP_OK;
    }

    pos = 0;

    // Scheme test: if it doesn't start with a forward slash character (which it must
    // for the contents to be a path or an authority), then it must be the scheme part
    if (data[0] != '/') {
        // Parse scheme

        // Find the colon, which marks the end of the scheme part
        start = pos;
        while ((pos < len) && (data[pos] != ':')) pos++;

        if (pos >= len) {
            // We haven't found a colon, which means that the URI
            // is invalid. Apache will ignore this problem and assume
            // the URI contains an invalid path so, for the time being,
            // we are going to do the same.
            pos = 0;
        } else {
            // Make a copy of the scheme
            (*uri)->scheme = bstr_dup_mem(data + start, pos - start);
            if ((*uri)->scheme == NULL) return HTP_ERROR;

            // Go over the colon
            pos++;
        }
    }

    // Authority test: two forward slash characters and it's an authority.
    // One, three or more slash characters, and it's a path. We, however,
    // only attempt to parse authority if we've seen a scheme.
    if ((*uri)->scheme != NULL)
        if ((pos + 2 < len) && (data[pos] == '/') && (data[pos + 1] == '/') && (data[pos + 2] != '/')) {
            // Parse authority

            // Go over the two slash characters
            start = pos = pos + 2;

            // Authority ends with a question mark, forward slash or hash
            while ((pos < len) && (data[pos] != '?') && (data[pos] != '/') && (data[pos] != '#')) pos++;

            // The part of the authority that follows the credentials (if any);
            // it holds the hostname and, optionally, the port.
            unsigned char *hostname_start;
            size_t hostname_len;

            // Are the credentials included in the authority?
            // (The first '@' in the authority ends the credentials.)
            unsigned char *m = memchr(data + start, '@', pos - start);
            if (m != NULL) {
                // Credentials present
                unsigned char *credentials_start = data + start;
                size_t credentials_len = m - data - start;

                // Figure out just the hostname part (everything after the '@')
                hostname_start = data + start + credentials_len + 1;
                hostname_len = pos - start - credentials_len - 1;

                // Extract the username and the password; the first colon
                // in the credentials separates the two
                m = memchr(credentials_start, ':', credentials_len);
                if (m != NULL) {
                    // Username and password
                    (*uri)->username = bstr_dup_mem(credentials_start, m - credentials_start);
                    if ((*uri)->username == NULL) return HTP_ERROR;
                    (*uri)->password = bstr_dup_mem(m + 1, credentials_len - (m - credentials_start) - 1);
                    if ((*uri)->password == NULL) return HTP_ERROR;
                } else {
                    // Username alone
                    (*uri)->username = bstr_dup_mem(credentials_start, credentials_len);
                    if ((*uri)->username == NULL) return HTP_ERROR;
                }
            } else {
                // No credentials
                hostname_start = data + start;
                hostname_len = pos - start;
            }

            // Parsing authority without credentials.
            if ((hostname_len > 0) && (hostname_start[0] == '[')) {
                // IPv6 address.

                m = memchr(hostname_start, ']', hostname_len);
                if (m == NULL) {
                    // Invalid IPv6 address; use the entire string as hostname.
                    (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
                    if ((*uri)->hostname == NULL) return HTP_ERROR;
                } else {
                    // The hostname keeps both brackets.
                    (*uri)->hostname = bstr_dup_mem(hostname_start, m - hostname_start + 1);
                    if ((*uri)->hostname == NULL) return HTP_ERROR;

                    // Is there a port? From here on, hostname_start and
                    // hostname_len describe what follows the closing bracket.
                    hostname_len = hostname_len - (m - hostname_start + 1);
                    hostname_start = m + 1;

                    // Port string
                    m = memchr(hostname_start, ':', hostname_len);
                    if (m != NULL) {
                        size_t port_len = hostname_len - (m - hostname_start) - 1;
                        (*uri)->port = bstr_dup_mem(m + 1, port_len);
                        if ((*uri)->port == NULL) return HTP_ERROR;
                    }
                }
            } else {
                // Not IPv6 address.

                // The first colon separates the hostname from the port.
                m = memchr(hostname_start, ':', hostname_len);
                if (m != NULL) {
                    size_t port_len = hostname_len - (m - hostname_start) - 1;
                    // Shorten the hostname so that it stops before the colon.
                    hostname_len = hostname_len - port_len - 1;

                    // Port string
                    (*uri)->port = bstr_dup_mem(m + 1, port_len);
                    if ((*uri)->port == NULL) return HTP_ERROR;
                }

                // Hostname
                (*uri)->hostname = bstr_dup_mem(hostname_start, hostname_len);
                if ((*uri)->hostname == NULL) return HTP_ERROR;
            }
        }

    // Path; starts wherever the scheme/authority parsing stopped
    start = pos;

    // The path part will end with a question mark or a hash character, which
    // mark the beginning of the query part or the fragment part, respectively.
    while ((pos < len) && (data[pos] != '?') && (data[pos] != '#')) pos++;

    // Path
    (*uri)->path = bstr_dup_mem(data + start, pos - start);
    if ((*uri)->path == NULL) return HTP_ERROR;

    // Nothing follows the path.
    if (pos == len) return HTP_OK;

    // Query
    if (data[pos] == '?') {
        // Step over the question mark
        start = pos + 1;

        // The query part will end with the end of the input
        // or the beginning of the fragment part
        while ((pos < len) && (data[pos] != '#')) pos++;

        // Query string
        (*uri)->query = bstr_dup_mem(data + start, pos - start);
        if ((*uri)->query == NULL) return HTP_ERROR;

        // Nothing follows the query.
        if (pos == len) return HTP_OK;
    }

    // Fragment
    if (data[pos] == '#') {
        // Step over the hash character
        start = pos + 1;

        // Fragment; ends with the end of the input
        (*uri)->fragment = bstr_dup_mem(data + start, len - start);
        if ((*uri)->fragment == NULL) return HTP_ERROR;
    }

    return HTP_OK;
}
876 
/**
 * Converts two input bytes, pointed to by the pointer parameter, into a
 * single byte by treating them as a pair of hexadecimal digits (high
 * nibble first). No validation is performed: bytes that are not
 * hexadecimal digits are converted by the same arithmetic and the
 * result is reduced to eight bits.
 *
 * @param[in] what Pointer to at least two readable bytes.
 * @return hex-decoded byte
 */
static unsigned char x2c(unsigned char *what) {
    // Bytes from 'A' upwards are folded to upper case (bit 0x20 cleared)
    // and mapped to 10 and up; everything below is taken relative to '0'.
    int high = (what[0] >= 'A') ? (((what[0] & 0xdf) - 'A') + 10) : (what[0] - '0');
    int low = (what[1] >= 'A') ? (((what[1] & 0xdf) - 'A') + 10) : (what[1] - '0');

    return (unsigned char) ((high * 16) + low);
}
894 
895 /**
896  * Convert a Unicode codepoint into a single-byte, using best-fit
897  * mapping (as specified in the provided configuration structure).
898  *
899  * @param[in] cfg
900  * @param[in] codepoint
901  * @return converted single byte
902  */
bestfit_codepoint(htp_cfg_t * cfg,enum htp_decoder_ctx_t ctx,uint32_t codepoint)903 static uint8_t bestfit_codepoint(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, uint32_t codepoint) {
904     // Is it a single-byte codepoint?
905     if (codepoint < 0x100) {
906         return (uint8_t) codepoint;
907     }
908 
909     // Our current implementation converts only the 2-byte codepoints.
910     if (codepoint > 0xffff) {
911         return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
912     }
913 
914     uint8_t *p = cfg->decoder_cfgs[ctx].bestfit_map;
915 
916     // TODO Optimize lookup.
917 
918     for (;;) {
919         uint32_t x = (p[0] << 8) + p[1];
920 
921         if (x == 0) {
922             return cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
923         }
924 
925         if (x == codepoint) {
926             return p[2];
927         }
928 
929         // Move to the next triplet
930         p += 3;
931     }
932 }
933 
934 /**
935  * Decode a UTF-8 encoded path. Overlong characters will be decoded, invalid
936  * characters will be left as-is. Best-fit mapping will be used to convert
937  * UTF-8 into a single-byte stream.
938  *
939  * @param[in] cfg
940  * @param[in] tx
941  * @param[in] path
942  */
htp_utf8_decode_path_inplace(htp_cfg_t * cfg,htp_tx_t * tx,bstr * path)943 void htp_utf8_decode_path_inplace(htp_cfg_t *cfg, htp_tx_t *tx, bstr *path) {
944     if (path == NULL) return;
945 
946     uint8_t *data = bstr_ptr(path);
947     if (data == NULL) return;
948 
949     size_t len = bstr_len(path);
950     size_t rpos = 0;
951     size_t wpos = 0;
952     uint32_t codepoint = 0;
953     uint32_t state = HTP_UTF8_ACCEPT;
954     uint32_t counter = 0;
955     uint8_t seen_valid = 0;
956 
957     while ((rpos < len)&&(wpos < len)) {
958         counter++;
959 
960         switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
961             case HTP_UTF8_ACCEPT:
962                 if (counter == 1) {
963                     // ASCII character, which we just copy.
964                     data[wpos++] = (uint8_t) codepoint;
965                 } else {
966                     // A valid UTF-8 character, which we need to convert.
967 
968                     seen_valid = 1;
969 
970                     // Check for overlong characters and set the flag accordingly.
971                     switch (counter) {
972                         case 2:
973                             if (codepoint < 0x80) {
974                                 tx->flags |= HTP_PATH_UTF8_OVERLONG;
975                             }
976                             break;
977                         case 3:
978                             if (codepoint < 0x800) {
979                                 tx->flags |= HTP_PATH_UTF8_OVERLONG;
980                             }
981                             break;
982                         case 4:
983                             if (codepoint < 0x10000) {
984                                 tx->flags |= HTP_PATH_UTF8_OVERLONG;
985                             }
986                             break;
987                     }
988 
989                     // Special flag for half-width/full-width evasion.
990                     if ((codepoint >= 0xff00) && (codepoint <= 0xffef)) {
991                         tx->flags |= HTP_PATH_HALF_FULL_RANGE;
992                     }
993 
994                     // Use best-fit mapping to convert to a single byte.
995                     data[wpos++] = bestfit_codepoint(cfg, HTP_DECODER_URL_PATH, codepoint);
996                 }
997 
998                 // Advance over the consumed byte and reset the byte counter.
999                 rpos++;
1000                 counter = 0;
1001 
1002                 break;
1003 
1004             case HTP_UTF8_REJECT:
1005                 // Invalid UTF-8 character.
1006 
1007                 tx->flags |= HTP_PATH_UTF8_INVALID;
1008 
1009                 // Is the server expected to respond with 400?
1010                 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1011                     tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_invalid_unwanted;
1012                 }
1013 
1014                 // Output the replacement byte, replacing one or more invalid bytes.
1015                 data[wpos++] = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;
1016 
1017                 // If the invalid byte was first in a sequence, consume it. Otherwise,
1018                 // assume it's the starting byte of the next character.
1019                 if (counter == 1) {
1020                     rpos++;
1021                 }
1022 
1023                 // Reset the decoder state and continue decoding.
1024                 state = HTP_UTF8_ACCEPT;
1025                 codepoint = 0;
1026                 counter = 0;
1027 
1028                 break;
1029 
1030             default:
1031                 // Keep going; the character is not yet formed.
1032                 rpos++;
1033                 break;
1034         }
1035     }
1036 
1037     // Did the input stream seem like a valid UTF-8 string?
1038     if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
1039         tx->flags |= HTP_PATH_UTF8_VALID;
1040     }
1041 
1042     // Adjust the length of the string, because
1043     // we're doing in-place decoding.
1044     bstr_adjust_len(path, wpos);
1045 }
1046 
1047 /**
1048  * Validate a path that is quite possibly UTF-8 encoded.
1049  *
1050  * @param[in] tx
1051  * @param[in] path
1052  */
htp_utf8_validate_path(htp_tx_t * tx,bstr * path)1053 void htp_utf8_validate_path(htp_tx_t *tx, bstr *path) {
1054     unsigned char *data = bstr_ptr(path);
1055     size_t len = bstr_len(path);
1056     size_t rpos = 0;
1057     uint32_t codepoint = 0;
1058     uint32_t state = HTP_UTF8_ACCEPT;
1059     uint32_t counter = 0; // How many bytes used by a UTF-8 character.
1060     uint8_t seen_valid = 0;
1061 
1062     while (rpos < len) {
1063         counter++;
1064 
1065         switch (htp_utf8_decode_allow_overlong(&state, &codepoint, data[rpos])) {
1066             case HTP_UTF8_ACCEPT:
1067                 // We have a valid character.
1068 
1069                 if (counter > 1) {
1070                     // A valid UTF-8 character, consisting of 2 or more bytes.
1071 
1072                     seen_valid = 1;
1073 
1074                     // Check for overlong characters and set the flag accordingly.
1075                     switch (counter) {
1076                         case 2:
1077                             if (codepoint < 0x80) {
1078                                 tx->flags |= HTP_PATH_UTF8_OVERLONG;
1079                             }
1080                             break;
1081                         case 3:
1082                             if (codepoint < 0x800) {
1083                                 tx->flags |= HTP_PATH_UTF8_OVERLONG;
1084                             }
1085                             break;
1086                         case 4:
1087                             if (codepoint < 0x10000) {
1088                                 tx->flags |= HTP_PATH_UTF8_OVERLONG;
1089                             }
1090                             break;
1091                     }
1092                 }
1093 
1094                 // Special flag for half-width/full-width evasion.
1095                 if ((codepoint > 0xfeff) && (codepoint < 0x010000)) {
1096                     tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1097                 }
1098 
1099                 // Advance over the consumed byte and reset the byte counter.
1100                 rpos++;
1101                 counter = 0;
1102 
1103                 break;
1104 
1105             case HTP_UTF8_REJECT:
1106                 // Invalid UTF-8 character.
1107 
1108                 tx->flags |= HTP_PATH_UTF8_INVALID;
1109 
1110                 // Override the decoder state because we want to continue decoding.
1111                 state = HTP_UTF8_ACCEPT;
1112 
1113                 // Advance over the consumed byte and reset the byte counter.
1114                 rpos++;
1115                 counter = 0;
1116 
1117                 break;
1118 
1119             default:
1120                 // Keep going; the character is not yet formed.
1121                 rpos++;
1122                 break;
1123         }
1124     }
1125 
1126     // Did the input stream seem like a valid UTF-8 string?
1127     if ((seen_valid) && (!(tx->flags & HTP_PATH_UTF8_INVALID))) {
1128         tx->flags |= HTP_PATH_UTF8_VALID;
1129     }
1130 }
1131 
1132 /**
1133  * Decode a %u-encoded character, using best-fit mapping as necessary. Path version.
1134  *
1135  * @param[in] cfg
1136  * @param[in] tx
1137  * @param[in] data
1138  * @return decoded byte
1139  */
decode_u_encoding_path(htp_cfg_t * cfg,htp_tx_t * tx,unsigned char * data)1140 static int decode_u_encoding_path(htp_cfg_t *cfg, htp_tx_t *tx, unsigned char *data) {
1141     unsigned int c1 = x2c(data);
1142     unsigned int c2 = x2c(data + 2);
1143     int r = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_replacement_byte;
1144 
1145     if (c1 == 0x00) {
1146         r = c2;
1147         tx->flags |= HTP_PATH_OVERLONG_U;
1148     } else {
1149         // Check for fullwidth form evasion
1150         if (c1 == 0xff) {
1151             tx->flags |= HTP_PATH_HALF_FULL_RANGE;
1152         }
1153 
1154         if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1155             tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted;
1156         }
1157 
1158         // Use best-fit mapping
1159         unsigned char *p = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].bestfit_map;
1160 
1161         // TODO Optimize lookup.
1162 
1163         for (;;) {
1164             // Have we reached the end of the map?
1165             if ((p[0] == 0) && (p[1] == 0)) {
1166                 break;
1167             }
1168 
1169             // Have we found the mapping we're looking for?
1170             if ((p[0] == c1) && (p[1] == c2)) {
1171                 r = p[2];
1172                 break;
1173             }
1174 
1175             // Move to the next triplet
1176             p += 3;
1177         }
1178     }
1179 
1180     // Check for encoded path separators
1181     if ((r == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (r == '\\'))) {
1182         tx->flags |= HTP_PATH_ENCODED_SEPARATOR;
1183     }
1184 
1185     return r;
1186 }
1187 
1188 /**
1189  * Decode a %u-encoded character, using best-fit mapping as necessary. Params version.
1190  *
1191  * @param[in] cfg
1192  * @param[in] tx
1193  * @param[in] data
1194  * @return decoded byte
1195  */
decode_u_encoding_params(htp_cfg_t * cfg,enum htp_decoder_ctx_t ctx,unsigned char * data,uint64_t * flags)1196 static int decode_u_encoding_params(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, unsigned char *data, uint64_t *flags) {
1197     unsigned int c1 = x2c(data);
1198     unsigned int c2 = x2c(data + 2);
1199 
1200     // Check for overlong usage first.
1201     if (c1 == 0) {
1202         (*flags) |= HTP_URLEN_OVERLONG_U;
1203         return c2;
1204     }
1205 
1206     // Both bytes were used.
1207 
1208     // Detect half-width and full-width range.
1209     if ((c1 == 0xff) && (c2 <= 0xef)) {
1210         (*flags) |= HTP_URLEN_HALF_FULL_RANGE;
1211     }
1212 
1213     // Use best-fit mapping.
1214     unsigned char *p = cfg->decoder_cfgs[ctx].bestfit_map;
1215     int r = cfg->decoder_cfgs[ctx].bestfit_replacement_byte;
1216 
1217     // TODO Optimize lookup.
1218 
1219     for (;;) {
1220         // Have we reached the end of the map?
1221         if ((p[0] == 0) && (p[1] == 0)) {
1222             break;
1223         }
1224 
1225         // Have we found the mapping we're looking for?
1226         if ((p[0] == c1) && (p[1] == c2)) {
1227             r = p[2];
1228             break;
1229         }
1230 
1231         // Move to the next triplet
1232         p += 3;
1233     }
1234 
1235     return r;
1236 }
1237 
1238 /**
1239  * Decode a request path according to the settings in the
1240  * provided configuration structure.
1241  *
1242  * @param[in] cfg
1243  * @param[in] tx
1244  * @param[in] path
1245  */
htp_decode_path_inplace(htp_tx_t * tx,bstr * path)1246 htp_status_t htp_decode_path_inplace(htp_tx_t *tx, bstr *path) {
1247     if (path == NULL) return HTP_ERROR;
1248     unsigned char *data = bstr_ptr(path);
1249     if (data == NULL) return HTP_ERROR;
1250 
1251     size_t len = bstr_len(path);
1252 
1253     htp_cfg_t *cfg = tx->cfg;
1254 
1255     size_t rpos = 0;
1256     size_t wpos = 0;
1257     int previous_was_separator = 0;
1258 
1259     while ((rpos < len) && (wpos < len)) {
1260         int c = data[rpos];
1261 
1262         // Decode encoded characters
1263         if (c == '%') {
1264             if (rpos + 2 < len) {
1265                 int handled = 0;
1266 
1267                 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_decode) {
1268                     // Check for the %u encoding
1269                     if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) {
1270                         handled = 1;
1271 
1272                         if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1273                             tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].u_encoding_unwanted;
1274                         }
1275 
1276                         if (rpos + 5 < len) {
1277                             if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3]))
1278                                     && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) {
1279                                 // Decode a valid %u encoding
1280                                 c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]);
1281                                 rpos += 6;
1282 
1283                                 if (c == 0) {
1284                                     tx->flags |= HTP_PATH_ENCODED_NUL;
1285 
1286                                     if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1287                                         tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted;
1288                                     }
1289                                 }
1290                             } else {
1291                                 // Invalid %u encoding
1292                                 tx->flags |= HTP_PATH_INVALID_ENCODING;
1293 
1294                                 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1295                                     tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1296                                 }
1297 
1298                                 switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1299                                     case HTP_URL_DECODE_REMOVE_PERCENT:
1300                                         // Do not place anything in output; eat
1301                                         // the percent character
1302                                         rpos++;
1303                                         continue;
1304                                         break;
1305                                     case HTP_URL_DECODE_PRESERVE_PERCENT:
1306                                         // Leave the percent character in output
1307                                         rpos++;
1308                                         break;
1309                                     case HTP_URL_DECODE_PROCESS_INVALID:
1310                                         // Decode invalid %u encoding
1311                                         c = decode_u_encoding_path(cfg, tx, &data[rpos + 2]);
1312                                         rpos += 6;
1313                                         break;
1314                                 }
1315                             }
1316                         } else {
1317                             // Invalid %u encoding (not enough data)
1318                             tx->flags |= HTP_PATH_INVALID_ENCODING;
1319 
1320                             if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1321                                 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1322                             }
1323 
1324                             switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1325                                 case HTP_URL_DECODE_REMOVE_PERCENT:
1326                                     // Do not place anything in output; eat
1327                                     // the percent character
1328                                     rpos++;
1329                                     continue;
1330                                     break;
1331                                 case HTP_URL_DECODE_PRESERVE_PERCENT:
1332                                     // Leave the percent character in output
1333                                     rpos++;
1334                                     break;
1335                                 case HTP_URL_DECODE_PROCESS_INVALID:
1336                                     // Cannot decode, because there's not enough data.
1337                                     // Leave the percent character in output
1338                                     rpos++;
1339                                     // TODO Configurable handling.
1340                                     break;
1341                             }
1342                         }
1343                     }
1344                 }
1345 
1346                 // Handle standard URL encoding
1347                 if (!handled) {
1348                     if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) {
1349                         c = x2c(&data[rpos + 1]);
1350 
1351                         if (c == 0) {
1352                             tx->flags |= HTP_PATH_ENCODED_NUL;
1353 
1354                             if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1355                                 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_unwanted;
1356                             }
1357 
1358                             if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_encoded_terminates) {
1359                                 bstr_adjust_len(path, wpos);
1360                                 return HTP_OK;
1361                             }
1362                         }
1363 
1364                         if ((c == '/') || ((cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes) && (c == '\\'))) {
1365                             tx->flags |= HTP_PATH_ENCODED_SEPARATOR;
1366 
1367                             if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1368                                 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_encoded_unwanted;
1369                             }
1370 
1371                             if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_decode) {
1372                                 // Decode
1373                                 rpos += 3;
1374                             } else {
1375                                 // Leave encoded
1376                                 c = '%';
1377                                 rpos++;
1378                             }
1379                         } else {
1380                             // Decode
1381                             rpos += 3;
1382                         }
1383                     } else {
1384                         // Invalid encoding
1385                         tx->flags |= HTP_PATH_INVALID_ENCODING;
1386 
1387                         if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1388                             tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1389                         }
1390 
1391                         switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1392                             case HTP_URL_DECODE_REMOVE_PERCENT:
1393                                 // Do not place anything in output; eat
1394                                 // the percent character
1395                                 rpos++;
1396                                 continue;
1397                                 break;
1398                             case HTP_URL_DECODE_PRESERVE_PERCENT:
1399                                 // Leave the percent character in output
1400                                 rpos++;
1401                                 break;
1402                             case HTP_URL_DECODE_PROCESS_INVALID:
1403                                 // Decode
1404                                 c = x2c(&data[rpos + 1]);
1405                                 rpos += 3;
1406                                 // Note: What if an invalid encoding decodes into a path
1407                                 //       separator? This is theoretical at the moment, because
1408                                 //       the only platform we know doesn't convert separators is
1409                                 //       Apache, who will also respond with 400 if invalid encoding
1410                                 //       is encountered. Thus no check for a separator here.
1411                                 break;
1412                             default:
1413                                 // Unknown setting
1414                                 return HTP_ERROR;
1415                                 break;
1416                         }
1417                     }
1418                 }
1419             } else {
1420                 // Invalid URL encoding (not enough data)
1421                 tx->flags |= HTP_PATH_INVALID_ENCODING;
1422 
1423                 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1424                     tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_unwanted;
1425                 }
1426 
1427                 switch (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].url_encoding_invalid_handling) {
1428                     case HTP_URL_DECODE_REMOVE_PERCENT:
1429                         // Do not place anything in output; eat
1430                         // the percent character
1431                         rpos++;
1432                         continue;
1433                         break;
1434                     case HTP_URL_DECODE_PRESERVE_PERCENT:
1435                         // Leave the percent character in output
1436                         rpos++;
1437                         break;
1438                     case HTP_URL_DECODE_PROCESS_INVALID:
1439                         // Cannot decode, because there's not enough data.
1440                         // Leave the percent character in output.
1441                         // TODO Configurable handling.
1442                         rpos++;
1443                         break;
1444                 }
1445             }
1446         } else {
1447             // One non-encoded character
1448 
1449             // Is it a NUL byte?
1450             if (c == 0) {
1451                 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted != HTP_UNWANTED_IGNORE) {
1452                     tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_unwanted;
1453                 }
1454 
1455                 if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].nul_raw_terminates) {
1456                     // Terminate path with a raw NUL byte
1457                     bstr_adjust_len(path, wpos);
1458                     return HTP_OK;
1459                     break;
1460                 }
1461             }
1462 
1463             rpos++;
1464         }
1465 
1466         // Place the character into output
1467 
1468         // Check for control characters
1469         if (c < 0x20) {
1470             if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted != HTP_UNWANTED_IGNORE) {
1471                 tx->response_status_expected_number = cfg->decoder_cfgs[HTP_DECODER_URL_PATH].control_chars_unwanted;
1472             }
1473         }
1474 
1475         // Convert backslashes to forward slashes, if necessary
1476         if ((c == '\\') && (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].backslash_convert_slashes)) {
1477             c = '/';
1478         }
1479 
1480         // Lowercase characters, if necessary
1481         if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].convert_lowercase) {
1482             c = tolower(c);
1483         }
1484 
1485         // If we're compressing separators then we need
1486         // to track if the previous character was a separator
1487         if (cfg->decoder_cfgs[HTP_DECODER_URL_PATH].path_separators_compress) {
1488             if (c == '/') {
1489                 if (!previous_was_separator) {
1490                     data[wpos++] = c;
1491                     previous_was_separator = 1;
1492                 } else {
1493                     // Do nothing; we don't want
1494                     // another separator in output
1495                 }
1496             } else {
1497                 data[wpos++] = c;
1498                 previous_was_separator = 0;
1499             }
1500         } else {
1501             data[wpos++] = c;
1502         }
1503     }
1504 
1505     bstr_adjust_len(path, wpos);
1506 
1507     return HTP_OK;
1508 }
1509 
htp_tx_urldecode_uri_inplace(htp_tx_t * tx,bstr * input)1510 htp_status_t htp_tx_urldecode_uri_inplace(htp_tx_t *tx, bstr *input) {
1511     uint64_t flags = 0;
1512 
1513     htp_status_t rc = htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URL_PATH, input, &flags, &(tx->response_status_expected_number));
1514 
1515     if (flags & HTP_URLEN_INVALID_ENCODING) {
1516         tx->flags |= HTP_PATH_INVALID_ENCODING;
1517     }
1518 
1519     if (flags & HTP_URLEN_ENCODED_NUL) {
1520         tx->flags |= HTP_PATH_ENCODED_NUL;
1521     }
1522 
1523     if (flags & HTP_URLEN_RAW_NUL) {
1524         tx->flags |= HTP_PATH_RAW_NUL;
1525     }
1526 
1527     return rc;
1528 }
1529 
htp_tx_urldecode_params_inplace(htp_tx_t * tx,bstr * input)1530 htp_status_t htp_tx_urldecode_params_inplace(htp_tx_t *tx, bstr *input) {
1531     return htp_urldecode_inplace_ex(tx->cfg, HTP_DECODER_URLENCODED, input, &(tx->flags), &(tx->response_status_expected_number));
1532 }
1533 
htp_urldecode_inplace(htp_cfg_t * cfg,enum htp_decoder_ctx_t ctx,bstr * input,uint64_t * flags)1534 htp_status_t htp_urldecode_inplace(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags) {
1535     int expected_status_code = 0;
1536     return htp_urldecode_inplace_ex(cfg, ctx, input, flags, &expected_status_code);
1537 }
1538 
htp_urldecode_inplace_ex(htp_cfg_t * cfg,enum htp_decoder_ctx_t ctx,bstr * input,uint64_t * flags,int * expected_status_code)1539 htp_status_t htp_urldecode_inplace_ex(htp_cfg_t *cfg, enum htp_decoder_ctx_t ctx, bstr *input, uint64_t *flags, int *expected_status_code) {
1540     if (input == NULL) return HTP_ERROR;
1541 
1542     unsigned char *data = bstr_ptr(input);
1543     if (data == NULL) return HTP_ERROR;
1544     size_t len = bstr_len(input);
1545 
1546     size_t rpos = 0;
1547     size_t wpos = 0;
1548 
1549     while ((rpos < len) && (wpos < len)) {
1550         int c = data[rpos];
1551 
1552         // Decode encoded characters.
1553         if (c == '%') {
1554             // Need at least 2 additional bytes for %HH.
1555             if (rpos + 2 < len) {
1556                 int handled = 0;
1557 
1558                 // Decode %uHHHH encoding, but only if allowed in configuration.
1559                 if (cfg->decoder_cfgs[ctx].u_encoding_decode) {
1560                     // The next character must be a case-insensitive u.
1561                     if ((data[rpos + 1] == 'u') || (data[rpos + 1] == 'U')) {
1562                         handled = 1;
1563 
1564                         if (cfg->decoder_cfgs[ctx].u_encoding_unwanted != HTP_UNWANTED_IGNORE) {
1565                             (*expected_status_code) = cfg->decoder_cfgs[ctx].u_encoding_unwanted;
1566                         }
1567 
1568                         // Need at least 5 additional bytes for %uHHHH.
1569                         if (rpos + 5 < len) {
1570                             if (isxdigit(data[rpos + 2]) && (isxdigit(data[rpos + 3]))
1571                                     && isxdigit(data[rpos + 4]) && (isxdigit(data[rpos + 5]))) {
1572                                 // Decode a valid %u encoding.
1573                                 c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags);
1574                                 rpos += 6;
1575                             } else {
1576                                 // Invalid %u encoding (could not find 4 xdigits).
1577                                 (*flags) |= HTP_URLEN_INVALID_ENCODING;
1578 
1579                                 if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1580                                     (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1581                                 }
1582 
1583                                 switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1584                                     case HTP_URL_DECODE_REMOVE_PERCENT:
1585                                         // Do not place anything in output; consume the %.
1586                                         rpos++;
1587                                         continue;
1588                                         break;
1589                                     case HTP_URL_DECODE_PRESERVE_PERCENT:
1590                                         // Leave the % in output.
1591                                         rpos++;
1592                                         break;
1593                                     case HTP_URL_DECODE_PROCESS_INVALID:
1594                                         // Decode invalid %u encoding.
1595                                         c = decode_u_encoding_params(cfg, ctx, &(data[rpos + 2]), flags);
1596                                         rpos += 6;
1597                                         break;
1598                                 }
1599                             }
1600                         } else {
1601                             // Invalid %u encoding; not enough data.
1602                             (*flags) |= HTP_URLEN_INVALID_ENCODING;
1603 
1604                             if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1605                                 (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1606                             }
1607 
1608                             switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1609                                 case HTP_URL_DECODE_REMOVE_PERCENT:
1610                                     // Do not place anything in output; consume the %.
1611                                     rpos++;
1612                                     continue;
1613                                     break;
1614                                 case HTP_URL_DECODE_PRESERVE_PERCENT:
1615                                     // Leave the % in output.
1616                                     rpos++;
1617                                     break;
1618                                 case HTP_URL_DECODE_PROCESS_INVALID:
1619                                     // Cannot decode because there's not enough data.
1620                                     // Leave the % in output.
1621                                     // TODO Configurable handling of %, u, etc.
1622                                     rpos++;
1623                                     break;
1624                             }
1625                         }
1626                     }
1627                 }
1628 
1629                 // Handle standard URL encoding.
1630                 if (!handled) {
1631                     // Need 2 hexadecimal digits.
1632                     if ((isxdigit(data[rpos + 1])) && (isxdigit(data[rpos + 2]))) {
1633                         // Decode %HH encoding.
1634                         c = x2c(&(data[rpos + 1]));
1635                         rpos += 3;
1636                     } else {
1637                         // Invalid encoding (enough bytes, but not hexadecimal digits).
1638                         (*flags) |= HTP_URLEN_INVALID_ENCODING;
1639 
1640                         if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1641                             (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1642                         }
1643 
1644                         switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1645                             case HTP_URL_DECODE_REMOVE_PERCENT:
1646                                 // Do not place anything in output; consume the %.
1647                                 rpos++;
1648                                 continue;
1649                                 break;
1650                             case HTP_URL_DECODE_PRESERVE_PERCENT:
1651                                 // Leave the % in output.
1652                                 rpos++;
1653                                 break;
1654                             case HTP_URL_DECODE_PROCESS_INVALID:
1655                                 // Decode.
1656                                 c = x2c(&(data[rpos + 1]));
1657                                 rpos += 3;
1658                                 break;
1659                         }
1660                     }
1661                 }
1662             } else {
1663                 // Invalid encoding; not enough data (at least 2 bytes required).
1664                 (*flags) |= HTP_URLEN_INVALID_ENCODING;
1665 
1666                 if (cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted != HTP_UNWANTED_IGNORE) {
1667                     (*expected_status_code) = cfg->decoder_cfgs[ctx].url_encoding_invalid_unwanted;
1668                 }
1669 
1670                 switch (cfg->decoder_cfgs[ctx].url_encoding_invalid_handling) {
1671                     case HTP_URL_DECODE_REMOVE_PERCENT:
1672                         // Do not place anything in output; consume the %.
1673                         rpos++;
1674                         continue;
1675                         break;
1676                     case HTP_URL_DECODE_PRESERVE_PERCENT:
1677                         // Leave the % in output.
1678                         rpos++;
1679                         break;
1680                     case HTP_URL_DECODE_PROCESS_INVALID:
1681                         // Cannot decode because there's not enough data.
1682                         // Leave the % in output.
1683                         // TODO Configurable handling of %, etc.
1684                         rpos++;
1685                         break;
1686                 }
1687             }
1688 
1689             // Did we get an encoded NUL byte?
1690             if (c == 0) {
1691                 if (cfg->decoder_cfgs[ctx].nul_encoded_unwanted != HTP_UNWANTED_IGNORE) {
1692                     (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_encoded_unwanted;
1693                 }
1694 
1695                 (*flags) |= HTP_URLEN_ENCODED_NUL;
1696 
1697                 if (cfg->decoder_cfgs[ctx].nul_encoded_terminates) {
                    // Terminate the path at the encoded NUL byte.
1699                     bstr_adjust_len(input, wpos);
1700                     return 1;
1701                 }
1702             }
1703 
1704             data[wpos++] = c;
1705         } else if (c == '+') {
1706             // Decoding of the plus character is conditional on the configuration.
1707 
1708             if (cfg->decoder_cfgs[ctx].plusspace_decode) {
1709                 c = 0x20;
1710             }
1711 
1712             rpos++;
1713             data[wpos++] = c;
1714         } else {
1715             // One non-encoded byte.
1716 
1717             // Did we get a raw NUL byte?
1718             if (c == 0) {
1719                 if (cfg->decoder_cfgs[ctx].nul_raw_unwanted != HTP_UNWANTED_IGNORE) {
1720                     (*expected_status_code) = cfg->decoder_cfgs[ctx].nul_raw_unwanted;
1721                 }
1722 
1723                 (*flags) |= HTP_URLEN_RAW_NUL;
1724 
1725                 if (cfg->decoder_cfgs[ctx].nul_raw_terminates) {
                    // Terminate the path at the raw NUL byte.
1727                     bstr_adjust_len(input, wpos);
1728                     return HTP_OK;
1729                 }
1730             }
1731 
1732             rpos++;
1733             data[wpos++] = c;
1734         }
1735     }
1736 
1737     bstr_adjust_len(input, wpos);
1738 
1739     return HTP_OK;
1740 }
1741 
1742 /**
1743  * Normalize a previously-parsed request URI.
1744  *
1745  * @param[in] connp
1746  * @param[in] incomplete
1747  * @param[in] normalized
1748  * @return HTP_OK or HTP_ERROR
1749  */
htp_normalize_parsed_uri(htp_tx_t * tx,htp_uri_t * incomplete,htp_uri_t * normalized)1750 int htp_normalize_parsed_uri(htp_tx_t *tx, htp_uri_t *incomplete, htp_uri_t *normalized) {
1751     // Scheme.
1752     if (incomplete->scheme != NULL) {
1753         // Duplicate and convert to lowercase.
1754         normalized->scheme = bstr_dup_lower(incomplete->scheme);
1755         if (normalized->scheme == NULL) return HTP_ERROR;
1756     }
1757 
1758     // Username.
1759     if (incomplete->username != NULL) {
1760         normalized->username = bstr_dup(incomplete->username);
1761         if (normalized->username == NULL) return HTP_ERROR;
1762         htp_tx_urldecode_uri_inplace(tx, normalized->username);
1763     }
1764 
1765     // Password.
1766     if (incomplete->password != NULL) {
1767         normalized->password = bstr_dup(incomplete->password);
1768         if (normalized->password == NULL) return HTP_ERROR;
1769         htp_tx_urldecode_uri_inplace(tx, normalized->password);
1770     }
1771 
1772     // Hostname.
1773     if (incomplete->hostname != NULL) {
1774         // We know that incomplete->hostname does not contain
1775         // port information, so no need to check for it here.
1776         normalized->hostname = bstr_dup(incomplete->hostname);
1777         if (normalized->hostname == NULL) return HTP_ERROR;
1778         htp_tx_urldecode_uri_inplace(tx, normalized->hostname);
1779         htp_normalize_hostname_inplace(normalized->hostname);
1780     }
1781 
1782     // Port.
1783     if (incomplete->port != NULL) {
1784         int64_t port_parsed = htp_parse_positive_integer_whitespace(
1785                 bstr_ptr(incomplete->port), bstr_len(incomplete->port), 10);
1786 
1787         if (port_parsed < 0) {
1788             // Failed to parse the port number.
1789             normalized->port_number = -1;
1790             tx->flags |= HTP_HOSTU_INVALID;
1791         } else if ((port_parsed > 0) && (port_parsed < 65536)) {
1792             // Valid port number.
1793             normalized->port_number = (int) port_parsed;
1794         } else {
1795             // Port number out of range.
1796             normalized->port_number = -1;
1797             tx->flags |= HTP_HOSTU_INVALID;
1798         }
1799     } else {
1800         normalized->port_number = -1;
1801     }
1802 
1803     // Path.
1804     if (incomplete->path != NULL) {
1805         // Make a copy of the path, so that we can work on it.
1806         normalized->path = bstr_dup(incomplete->path);
1807         if (normalized->path == NULL) return HTP_ERROR;
1808 
1809         // Decode URL-encoded (and %u-encoded) characters, as well as lowercase,
1810         // compress separators and convert backslashes.
1811         htp_decode_path_inplace(tx, normalized->path);
1812 
1813         // Handle UTF-8 in the path.
1814         if (tx->cfg->decoder_cfgs[HTP_DECODER_URL_PATH].utf8_convert_bestfit) {
1815             // Decode Unicode characters into a single-byte stream, using best-fit mapping.
1816             htp_utf8_decode_path_inplace(tx->cfg, tx, normalized->path);
1817         } else {
1818             // No decoding, but try to validate the path as a UTF-8 stream.
1819             htp_utf8_validate_path(tx, normalized->path);
1820         }
1821 
1822         // RFC normalization.
1823         htp_normalize_uri_path_inplace(normalized->path);
1824     }
1825 
1826     // Query string.
1827     if (incomplete->query != NULL) {
1828         normalized->query = bstr_dup(incomplete->query);
1829         if (normalized->query == NULL) return HTP_ERROR;
1830     }
1831 
1832     // Fragment.
1833     if (incomplete->fragment != NULL) {
1834         normalized->fragment = bstr_dup(incomplete->fragment);
1835         if (normalized->fragment == NULL) return HTP_ERROR;
1836         htp_tx_urldecode_uri_inplace(tx, normalized->fragment);
1837     }
1838 
1839     return HTP_OK;
1840 }
1841 
1842 /**
1843  * Normalize request hostname. Convert all characters to lowercase and
1844  * remove trailing dots from the end, if present.
1845  *
1846  * @param[in] hostname
1847  * @return Normalized hostname.
1848  */
htp_normalize_hostname_inplace(bstr * hostname)1849 bstr *htp_normalize_hostname_inplace(bstr *hostname) {
1850     if (hostname == NULL) return NULL;
1851 
1852     bstr_to_lowercase(hostname);
1853 
1854     // Remove dots from the end of the string.
1855     while (bstr_char_at_end(hostname, 0) == '.') bstr_chop(hostname);
1856 
1857     return hostname;
1858 }
1859 
/**
 * Normalize URL path. This function implements the remove dot segments algorithm
 * specified in RFC 3986, section 5.2.4.
 *
 * The path is rewritten inside its own buffer: rpos is the read position,
 * wpos is the write position, and the final length is set to wpos. The
 * variable c holds the input byte currently being examined; -1 means that no
 * byte is pending and the next one has to be fetched from data[rpos].
 *
 * NOTE(review): when one of the B/C rewrites consumes the remaining input
 * (for example a path ending in "/.", "/./" or "/.."), the loop condition
 * fails while the replacement '/' is still pending in c, so that '/' is not
 * written to the output. RFC 3986 would emit it -- confirm whether dropping
 * the trailing slash is intentional.
 *
 * @param[in,out] s Path to normalize in place; NULL is ignored.
 */
void htp_normalize_uri_path_inplace(bstr *s) {
    if (s == NULL) return;

    unsigned char *data = bstr_ptr(s);
    if (data == NULL) return;
    size_t len = bstr_len(s);

    size_t rpos = 0;
    size_t wpos = 0;

    int c = -1;
    while ((rpos < len)&&(wpos < len)) {
        // Fetch the next input byte unless a previous step left one pending.
        if (c == -1) {
            c = data[rpos++];
        }

        // A. If the input buffer begins with a prefix of "../" or "./",
        //    then remove that prefix from the input buffer; otherwise,
        if (c == '.') {
            if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) {
                // "../": skip the second dot and the slash, nothing is written.
                c = -1;
                rpos += 2;
                continue;
            } else if ((rpos < len) && (data[rpos] == '/')) {
                // "./": skip the slash, nothing is written.
                c = -1;
                rpos += 1;
                continue;
            }
        }

        if (c == '/') {
            // B. if the input buffer begins with a prefix of "/./" or "/.",
            //    where "." is a complete path segment, then replace that
            //    prefix with "/" in the input buffer; otherwise,
            if ((rpos + 1 < len) && (data[rpos] == '.') && (data[rpos + 1] == '/')) {
                // "/./": consume "./" and keep a '/' pending in c.
                c = '/';
                rpos += 2;
                continue;
            } else if ((rpos + 1 == len) && (data[rpos] == '.')) {
                // "/." at the very end of the input.
                c = '/';
                rpos += 1;
                continue;
            }

            // C. if the input buffer begins with a prefix of "/../" or "/..",
            //    where ".." is a complete path segment, then replace that
            //    prefix with "/" in the input buffer and remove the last
            //    segment and its preceding "/" (if any) from the output
            //    buffer; otherwise,
            if ((rpos + 2 < len) && (data[rpos] == '.') && (data[rpos + 1] == '.') && (data[rpos + 2] == '/')) {
                // "/../": consume "../" and keep a '/' pending in c.
                c = '/';
                rpos += 3;

                // Remove the last segment: rewind wpos to just after the
                // previous '/' in the output, then step back over that '/'.
                while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--;
                if (wpos > 0) wpos--;
                continue;
            } else if ((rpos + 2 == len) && (data[rpos] == '.') && (data[rpos + 1] == '.')) {
                // "/.." at the very end of the input.
                c = '/';
                rpos += 2;

                // Remove the last segment (same rewind as above).
                while ((wpos > 0) && (data[wpos - 1] != '/')) wpos--;
                if (wpos > 0) wpos--;
                continue;
            }
        }

        // D.  if the input buffer consists only of "." or "..", then remove
        // that from the input buffer; otherwise,
        if ((c == '.') && (rpos == len)) {
            // Lone "." left: advancing rpos past len ends the loop.
            rpos++;
            continue;
        }

        if ((c == '.') && (rpos + 1 == len) && (data[rpos] == '.')) {
            // Lone ".." left: advancing rpos past len ends the loop.
            rpos += 2;
            continue;
        }

        // E.  move the first path segment in the input buffer to the end of
        // the output buffer, including the initial "/" character (if
        // any) and any subsequent characters up to, but not including,
        // the next "/" character or the end of the input buffer.
        data[wpos++] = c;

        while ((rpos < len) && (data[rpos] != '/') && (wpos < len)) {
            data[wpos++] = data[rpos++];
        }

        // The segment has been written; fetch a fresh byte next time around.
        c = -1;
    }

    // Shrink the string to the number of bytes written.
    bstr_adjust_len(s, wpos);
}
1961 
1962 /**
1963  *
1964  */
fprint_bstr(FILE * stream,const char * name,bstr * b)1965 void fprint_bstr(FILE *stream, const char *name, bstr *b) {
1966     if (b == NULL) {
1967         fprint_raw_data_ex(stream, name, "(null)", 0, 6);
1968         return;
1969     }
1970 
1971     fprint_raw_data_ex(stream, name, bstr_ptr(b), 0, bstr_len(b));
1972 }
1973 
/**
 * Dump a memory region to the given stream, starting at its first byte.
 * This is fprint_raw_data_ex() with the offset fixed at zero.
 *
 * @param[in] stream Output stream.
 * @param[in] name Label passed through to fprint_raw_data_ex().
 * @param[in] data Start of the memory region.
 * @param[in] len Number of bytes to dump.
 */
void fprint_raw_data(FILE *stream, const char *name, const void *data, size_t len) {
    const size_t start_offset = 0;

    fprint_raw_data_ex(stream, name, data, start_offset, len);
}
1980 
/**
 * Write a hexadecimal dump of a memory region to a stream.
 *
 * The output starts with a header line ("<name>: ptr ... offset ... len ...",
 * where the "len" value is offset + printlen), followed by one row per 16
 * bytes. Each row consists of the row offset in hexadecimal (zero-padded to
 * at least 8 digits), two groups of eight "xx " byte columns, and the same
 * bytes between '|' characters with non-printable bytes shown as '.'.
 * Columns beyond the end of the data are padded with spaces.
 *
 * @param[in] stream Output stream.
 * @param[in] name Label printed in the header line.
 * @param[in] _data Base address of the memory region.
 * @param[in] offset Offset, relative to _data, of the first byte to print.
 * @param[in] printlen Number of bytes to print.
 */
void fprint_raw_data_ex(FILE *stream, const char *name, const void *_data, size_t offset, size_t printlen) {
    const size_t row_bytes = 16;
    const size_t half_row = 8;

    const unsigned char *data = (const unsigned char *) _data;
    char buf[160]; // One row needs fewer than 90 bytes.
    size_t len = offset + printlen;

    fprintf(stream, "\n%s: ptr %p offset %zu len %zu\n", name, (void *) data, offset, len);

    while (offset < len) {
        // Row offset. The previous format string, "%x" PRIx64, expanded to
        // "%xlx" and so printed a stray "lx" after every offset.
        int n = snprintf(buf, sizeof (buf), "%08" PRIx64 "  ", (uint64_t) offset);
        size_t pos = (n > 0) ? (size_t) n : 0;

        // Hexadecimal columns, with one extra space between the two halves.
        for (size_t i = 0; i < row_bytes; i++) {
            if (i == half_row) buf[pos++] = ' ';

            if (offset + i < len) {
                snprintf(buf + pos, sizeof (buf) - pos, "%02x ", data[offset + i]);
            } else {
                snprintf(buf + pos, sizeof (buf) - pos, "   ");
            }

            // Every column is exactly three characters wide.
            pos += 3;
        }

        buf[pos++] = ' ';
        buf[pos++] = '|';

        // Character column; only bytes that exist are shown.
        for (size_t i = 0; (i < row_bytes) && (offset + i < len); i++) {
            int c = data[offset + i];
            buf[pos++] = isprint(c) ? (char) c : '.';
        }

        buf[pos++] = '|';
        buf[pos++] = '\n';
        buf[pos] = '\0';

        fprintf(stream, "%s", buf);
        offset += row_bytes;
    }

    fprintf(stream, "\n");
}
2051 
2052 /**
2053  *
2054  */
htp_connp_in_state_as_string(htp_connp_t * connp)2055 char *htp_connp_in_state_as_string(htp_connp_t *connp) {
2056     if (connp == NULL) return "NULL";
2057 
2058     if (connp->in_state == htp_connp_REQ_IDLE) return "REQ_IDLE";
2059     if (connp->in_state == htp_connp_REQ_LINE) return "REQ_LINE";
2060     if (connp->in_state == htp_connp_REQ_PROTOCOL) return "REQ_PROTOCOL";
2061     if (connp->in_state == htp_connp_REQ_HEADERS) return "REQ_HEADERS";
2062     if (connp->in_state == htp_connp_REQ_CONNECT_CHECK) return "REQ_CONNECT_CHECK";
2063     if (connp->in_state == htp_connp_REQ_CONNECT_WAIT_RESPONSE) return "REQ_CONNECT_WAIT_RESPONSE";
2064     if (connp->in_state == htp_connp_REQ_BODY_DETERMINE) return "REQ_BODY_DETERMINE";
2065     if (connp->in_state == htp_connp_REQ_BODY_IDENTITY) return "REQ_BODY_IDENTITY";
2066     if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_LENGTH) return "REQ_BODY_CHUNKED_LENGTH";
2067     if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA) return "REQ_BODY_CHUNKED_DATA";
2068     if (connp->in_state == htp_connp_REQ_BODY_CHUNKED_DATA_END) return "REQ_BODY_CHUNKED_DATA_END";
2069     if (connp->in_state == htp_connp_REQ_FINALIZE) return "REQ_FINALIZE";
2070     if (connp->in_state == htp_connp_REQ_IGNORE_DATA_AFTER_HTTP_0_9) return "REQ_IGNORE_DATA_AFTER_HTTP_0_9";
2071 
2072     return "UNKNOWN";
2073 }
2074 
2075 /**
2076  *
2077  */
htp_connp_out_state_as_string(htp_connp_t * connp)2078 char *htp_connp_out_state_as_string(htp_connp_t *connp) {
2079     if (connp == NULL) return "NULL";
2080 
2081     if (connp->out_state == htp_connp_RES_IDLE) return "RES_IDLE";
2082     if (connp->out_state == htp_connp_RES_LINE) return "RES_LINE";
2083     if (connp->out_state == htp_connp_RES_HEADERS) return "RES_HEADERS";
2084     if (connp->out_state == htp_connp_RES_BODY_DETERMINE) return "RES_BODY_DETERMINE";
2085     if (connp->out_state == htp_connp_RES_BODY_IDENTITY_CL_KNOWN) return "RES_BODY_IDENTITY_CL_KNOWN";
2086     if (connp->out_state == htp_connp_RES_BODY_IDENTITY_STREAM_CLOSE) return "RES_BODY_IDENTITY_STREAM_CLOSE";
2087     if (connp->out_state == htp_connp_RES_BODY_CHUNKED_LENGTH) return "RES_BODY_CHUNKED_LENGTH";
2088     if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA) return "RES_BODY_CHUNKED_DATA";
2089     if (connp->out_state == htp_connp_RES_BODY_CHUNKED_DATA_END) return "RES_BODY_CHUNKED_DATA_END";
2090     if (connp->out_state == htp_connp_RES_FINALIZE) return "RES_BODY_FINALIZE";
2091 
2092     return "UNKNOWN";
2093 }
2094 
2095 /**
2096  *
2097  */
htp_tx_request_progress_as_string(htp_tx_t * tx)2098 char *htp_tx_request_progress_as_string(htp_tx_t *tx) {
2099     if (tx == NULL) return "NULL";
2100 
2101     switch (tx->request_progress) {
2102         case HTP_REQUEST_NOT_STARTED:
2103             return "NOT_STARTED";
2104         case HTP_REQUEST_LINE:
2105             return "REQ_LINE";
2106         case HTP_REQUEST_HEADERS:
2107             return "REQ_HEADERS";
2108         case HTP_REQUEST_BODY:
2109             return "REQ_BODY";
2110         case HTP_REQUEST_TRAILER:
2111             return "REQ_TRAILER";
2112         case HTP_REQUEST_COMPLETE:
2113             return "COMPLETE";
2114     }
2115 
2116     return "INVALID";
2117 }
2118 
2119 /**
2120  *
2121  */
htp_tx_response_progress_as_string(htp_tx_t * tx)2122 char *htp_tx_response_progress_as_string(htp_tx_t *tx) {
2123     if (tx == NULL) return "NULL";
2124 
2125     switch (tx->response_progress) {
2126         case HTP_RESPONSE_NOT_STARTED:
2127             return "NOT_STARTED";
2128         case HTP_RESPONSE_LINE:
2129             return "RES_LINE";
2130         case HTP_RESPONSE_HEADERS:
2131             return "RES_HEADERS";
2132         case HTP_RESPONSE_BODY:
2133             return "RES_BODY";
2134         case HTP_RESPONSE_TRAILER:
2135             return "RES_TRAILER";
2136         case HTP_RESPONSE_COMPLETE:
2137             return "COMPLETE";
2138     }
2139 
2140     return "INVALID";
2141 }
2142 
htp_unparse_uri_noencode(htp_uri_t * uri)2143 bstr *htp_unparse_uri_noencode(htp_uri_t *uri) {
2144     if (uri == NULL) return NULL;
2145 
2146     // On the first pass determine the length of the final string
2147     size_t len = 0;
2148 
2149     if (uri->scheme != NULL) {
2150         len += bstr_len(uri->scheme);
2151         len += 3; // "://"
2152     }
2153 
2154     if ((uri->username != NULL) || (uri->password != NULL)) {
2155         if (uri->username != NULL) {
2156             len += bstr_len(uri->username);
2157         }
2158 
2159         len += 1; // ":"
2160 
2161         if (uri->password != NULL) {
2162             len += bstr_len(uri->password);
2163         }
2164 
2165         len += 1; // "@"
2166     }
2167 
2168     if (uri->hostname != NULL) {
2169         len += bstr_len(uri->hostname);
2170     }
2171 
2172     if (uri->port != NULL) {
2173         len += 1; // ":"
2174         len += bstr_len(uri->port);
2175     }
2176 
2177     if (uri->path != NULL) {
2178         len += bstr_len(uri->path);
2179     }
2180 
2181     if (uri->query != NULL) {
2182         len += 1; // "?"
2183         len += bstr_len(uri->query);
2184     }
2185 
2186     if (uri->fragment != NULL) {
2187         len += 1; // "#"
2188         len += bstr_len(uri->fragment);
2189     }
2190 
2191     // On the second pass construct the string
2192     bstr *r = bstr_alloc(len);
2193     if (r == NULL) return NULL;
2194 
2195     if (uri->scheme != NULL) {
2196         bstr_add_noex(r, uri->scheme);
2197         bstr_add_c_noex(r, "://");
2198     }
2199 
2200     if ((uri->username != NULL) || (uri->password != NULL)) {
2201         if (uri->username != NULL) {
2202             bstr_add_noex(r, uri->username);
2203         }
2204 
2205         bstr_add_c_noex(r, ":");
2206 
2207         if (uri->password != NULL) {
2208             bstr_add_noex(r, uri->password);
2209         }
2210 
2211         bstr_add_c_noex(r, "@");
2212     }
2213 
2214     if (uri->hostname != NULL) {
2215         bstr_add_noex(r, uri->hostname);
2216     }
2217 
2218     if (uri->port != NULL) {
2219         bstr_add_c_noex(r, ":");
2220         bstr_add_noex(r, uri->port);
2221     }
2222 
2223     if (uri->path != NULL) {
2224         bstr_add_noex(r, uri->path);
2225     }
2226 
2227     if (uri->query != NULL) {
2228         bstr_add_c_noex(r, "?");
2229         bstr_add_noex(r, uri->query);
2230     }
2231 
2232     if (uri->fragment != NULL) {
2233         bstr_add_c_noex(r, "#");
2234         bstr_add_noex(r, uri->fragment);
2235     }
2236 
2237     return r;
2238 }
2239 
/**
 * Decide whether the data found where a response line was expected should
 * be treated as response body instead. Browsers are lax when it comes to
 * response line parsing; in most cases they only look for the word "http"
 * at the beginning. This function does the same: it skips leading
 * whitespace and NUL bytes and then looks for "http" in any letter case.
 *
 * @param[in] data pointer to bytearray
 * @param[in] len length in bytes of data
 * @return 0 if the data starts with "http" (good enough to be a response
 *         line); 1 if it should be treated as body, which includes NULL
 *         data and fewer than four bytes after the skipped prefix.
 */
int htp_treat_response_line_as_body(const uint8_t *data, size_t len) {
    // Browser behavior:
    //      Firefox 3.5.x: (?i)^\s*http
    //      IE: (?i)^\s*http\s*/
    //      Safari: ^HTTP/\d+\.\d+\s+\d{3}
    static const uint8_t upper[4] = { 'H', 'T', 'T', 'P' };
    static const uint8_t lower[4] = { 'h', 't', 't', 'p' };

    if (data == NULL) return 1;

    // Skip leading whitespace and NUL bytes.
    size_t pos = 0;
    for (; pos < len; pos++) {
        if (!htp_is_space(data[pos]) && (data[pos] != 0)) break;
    }

    // Four bytes are needed for the comparison.
    if (len < pos + 4) return 1;

    for (size_t i = 0; i < 4; i++) {
        const uint8_t b = data[pos + i];
        if ((b != upper[i]) && (b != lower[i])) return 1;
    }

    return 0;
}
2269 
2270 /**
2271  * Run the REQUEST_BODY_DATA hook.
2272  *
2273  * @param[in] connp
2274  * @param[in] d
2275  */
htp_req_run_hook_body_data(htp_connp_t * connp,htp_tx_data_t * d)2276 htp_status_t htp_req_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) {
2277     // Do not invoke callbacks with an empty data chunk
2278     if ((d->data != NULL) && (d->len == 0)) return HTP_OK;
2279 
2280     // Do not invoke callbacks without a transaction.
2281     if (connp->in_tx == NULL) return HTP_OK;
2282 
2283     // Run transaction hooks first
2284     htp_status_t rc = htp_hook_run_all(connp->in_tx->hook_request_body_data, d);
2285     if (rc != HTP_OK) return rc;
2286 
2287     // Run configuration hooks second
2288     rc = htp_hook_run_all(connp->cfg->hook_request_body_data, d);
2289     if (rc != HTP_OK) return rc;
2290 
2291     // On PUT requests, treat request body as file
2292     if (connp->put_file != NULL) {
2293         htp_file_data_t file_data;
2294 
2295         file_data.data = d->data;
2296         file_data.len = d->len;
2297         file_data.file = connp->put_file;
2298         file_data.file->len += d->len;
2299 
2300         rc = htp_hook_run_all(connp->cfg->hook_request_file_data, &file_data);
2301         if (rc != HTP_OK) return rc;
2302     }
2303 
2304     return HTP_OK;
2305 }
2306 
2307 /**
2308  * Run the RESPONSE_BODY_DATA hook.
2309  *
2310  * @param[in] connp
2311  * @param[in] d
2312  */
htp_res_run_hook_body_data(htp_connp_t * connp,htp_tx_data_t * d)2313 htp_status_t htp_res_run_hook_body_data(htp_connp_t *connp, htp_tx_data_t *d) {
2314     // Do not invoke callbacks with an empty data chunk.
2315     if ((d->data != NULL) && (d->len == 0)) return HTP_OK;
2316 
2317     // Run transaction hooks first
2318     htp_status_t rc = htp_hook_run_all(connp->out_tx->hook_response_body_data, d);
2319     if (rc != HTP_OK) return rc;
2320 
2321     // Run configuration hooks second
2322     rc = htp_hook_run_all(connp->cfg->hook_response_body_data, d);
2323     if (rc != HTP_OK) return rc;
2324 
2325     return HTP_OK;
2326 }
2327 
2328 /**
2329  * Parses the provided memory region, extracting the double-quoted string.
2330  *
2331  * @param[in] data
2332  * @param[in] len
2333  * @param[out] out
2334  * @param[out] endoffset
2335  * @return HTP_OK on success, HTP_DECLINED if the input is not well formed, and HTP_ERROR on fatal errors.
2336  */
htp_extract_quoted_string_as_bstr(unsigned char * data,size_t len,bstr ** out,size_t * endoffset)2337 htp_status_t htp_extract_quoted_string_as_bstr(unsigned char *data, size_t len, bstr **out, size_t *endoffset) {
2338     if ((data == NULL) || (out == NULL)) return HTP_ERROR;
2339 
2340     if (len == 0) return HTP_DECLINED;
2341 
2342     size_t pos = 0;
2343 
2344     // Check that the first character is a double quote.
2345     if (data[pos] != '"') return HTP_DECLINED;
2346 
2347     // Step over the double quote.
2348     pos++;
2349     if (pos == len) return HTP_DECLINED;
2350 
2351     // Calculate the length of the resulting string.
2352     size_t escaped_chars = 0;
2353     while (pos < len) {
2354         if (data[pos] == '\\') {
2355             if (pos + 1 < len) {
2356                 escaped_chars++;
2357                 pos += 2;
2358                 continue;
2359             }
2360         } else if (data[pos] == '"') {
2361             break;
2362         }
2363 
2364         pos++;
2365     }
2366 
2367     // Have we reached the end of input without seeing the terminating double quote?
2368     if (pos == len) return HTP_DECLINED;
2369 
2370     // Copy the data and unescape it as necessary.
2371     size_t outlen = pos - 1 - escaped_chars;
2372     *out = bstr_alloc(outlen);
2373     if (*out == NULL) return HTP_ERROR;
2374     unsigned char *outptr = bstr_ptr(*out);
2375     size_t outpos = 0;
2376 
2377     pos = 1;
2378     while ((pos < len) && (outpos < outlen)) {
2379         // TODO We are not properly unescaping test here, we're only
2380         //      handling escaped double quotes.
2381         if (data[pos] == '\\') {
2382             if (pos + 1 < len) {
2383                 outptr[outpos++] = data[pos + 1];
2384                 pos += 2;
2385                 continue;
2386             }
2387         } else if (data[pos] == '"') {
2388             break;
2389         }
2390 
2391         outptr[outpos++] = data[pos++];
2392     }
2393 
2394     bstr_adjust_len(*out, outlen);
2395 
2396     if (endoffset != NULL) {
2397         *endoffset = pos;
2398     }
2399 
2400     return HTP_OK;
2401 }
2402 
htp_parse_ct_header(bstr * header,bstr ** ct)2403 htp_status_t htp_parse_ct_header(bstr *header, bstr **ct) {
2404     if ((header == NULL) || (ct == NULL)) return HTP_ERROR;
2405 
2406     unsigned char *data = bstr_ptr(header);
2407     size_t len = bstr_len(header);
2408 
2409     // The assumption here is that the header value we receive
2410     // here has been left-trimmed, which means the starting position
2411     // is on the media type. On some platforms that may not be the
2412     // case, and we may need to do the left-trim ourselves.
2413 
2414     // Find the end of the MIME type, using the same approach PHP 5.4.3 uses.
2415     size_t pos = 0;
2416     while ((pos < len) && (data[pos] != ';') && (data[pos] != ',') && (data[pos] != ' ')) pos++;
2417 
2418     *ct = bstr_dup_ex(header, 0, pos);
2419     if (*ct == NULL) return HTP_ERROR;
2420 
2421     bstr_to_lowercase(*ct);
2422 
2423     return HTP_OK;
2424 }
2425 
2426 /**
2427  * Implements relaxed (not strictly RFC) hostname validation.
2428  *
2429  * @param[in] hostname
2430  * @return 1 if the supplied hostname is valid; 0 if it is not.
2431  */
htp_validate_hostname(bstr * hostname)2432 int htp_validate_hostname(bstr *hostname) {
2433     unsigned char *data = bstr_ptr(hostname);
2434     size_t len = bstr_len(hostname);
2435     size_t startpos = 0;
2436     size_t pos = 0;
2437 
2438     if ((len == 0) || (len > 255)) return 0;
2439 
2440     while (pos < len) {
2441         // Validate label characters.
2442         startpos = pos;
2443         while ((pos < len) && (data[pos] != '.')) {
2444             unsigned char c = data[pos];
2445             // According to the RFC, the underscore is not allowed in a label, but
2446             // we allow it here because we think it's often seen in practice.
2447             if (!(((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
2448                         ((c >= '0') && (c <= '9')) ||
2449                          (c == '-') || (c == '_')))
2450             {
2451                 return 0;
2452             }
2453 
2454             pos++;
2455         }
2456 
2457         // Validate label length.
2458         if ((pos - startpos == 0) || (pos - startpos > 63)) return 0;
2459 
2460         if (pos >= len) return 1; // No more data after label.
2461 
2462         // How many dots are there?
2463         startpos = pos;
2464         while ((pos < len) && (data[pos] == '.')) pos++;
2465 
2466         if (pos - startpos != 1) return 0; // Exactly one dot expected.
2467     }
2468 
2469     return 1;
2470 }
2471 
/**
 * Release a URI structure together with all of its string components.
 *
 * @param[in] uri URI structure to free; NULL is ignored.
 */
void htp_uri_free(htp_uri_t *uri) {
    if (uri == NULL) return;

    // The components are released in their declaration order.
    bstr * const parts[] = {
        uri->scheme,
        uri->username,
        uri->password,
        uri->hostname,
        uri->port,
        uri->path,
        uri->query,
        uri->fragment
    };

    for (size_t i = 0; i < sizeof (parts) / sizeof (parts[0]); i++) {
        bstr_free(parts[i]);
    }

    free(uri);
}
2486 
htp_uri_alloc()2487 htp_uri_t *htp_uri_alloc() {
2488     htp_uri_t *u = calloc(1, sizeof (htp_uri_t));
2489     if (u == NULL) return NULL;
2490 
2491     u->port_number = -1;
2492 
2493     return u;
2494 }
2495 
htp_get_version(void)2496 char *htp_get_version(void) {
2497     return HTP_VERSION_STRING_FULL;
2498 }
2499