1 /*
2  * ModSecurity, http://www.modsecurity.org/
3  * Copyright (c) 2015 - 2021 Trustwave Holdings, Inc. (http://www.trustwave.com/)
4  *
5  * You may not use this file except in compliance with
6  * the License.  You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * If any of the files related to licensing are missing or if you have any
11  * other questions related to licensing please contact Trustwave Holdings, Inc.
12  * directly using the email address security@modsecurity.org.
13  *
14  */
15 
16 #include "src/actions/transformations/html_entity_decode.h"
17 
18 #include <string.h>
19 
20 #include <iostream>
21 #include <string>
22 #include <algorithm>
23 #include <functional>
24 #include <cctype>
25 #include <locale>
26 
27 #include "modsecurity/transaction.h"
28 #include "src/actions/transformations/transformation.h"
29 
30 
31 namespace modsecurity {
32 namespace actions {
33 namespace transformations {
34 
35 
evaluate(const std::string & value,Transaction * transaction)36 std::string HtmlEntityDecode::evaluate(const std::string &value,
37     Transaction *transaction) {
38     std::string ret;
39     unsigned char *input;
40 
41     input = reinterpret_cast<unsigned char *>
42         (malloc(sizeof(char) * value.length()+1));
43 
44     if (input == NULL) {
45         return "";
46     }
47 
48     memcpy(input, value.c_str(), value.length()+1);
49 
50     size_t i = inplace(input, value.length());
51 
52     ret.assign(reinterpret_cast<char *>(input), i);
53     free(input);
54 
55     return ret;
56 }
57 
58 
inplace(unsigned char * input,uint64_t input_len)59 int HtmlEntityDecode::inplace(unsigned char *input, uint64_t input_len) {
60     unsigned char *d = input;
61     int i, count;
62 
63     if ((input == NULL) || (input_len == 0)) {
64         return 0;
65     }
66 
67     i = count = 0;
68     while ((i < input_len) && (count < input_len)) {
69         int z, copy = 1;
70 
71         /* Require an ampersand and at least one character to
72          * start looking into the entity.
73          */
74         if ((input[i] == '&') && (i + 1 < input_len)) {
75             int k, j = i + 1;
76 
77             if (input[j] == '#') {
78                 /* Numerical entity. */
79                 copy++;
80 
81                 if (!(j + 1 < input_len)) {
82                     goto HTML_ENT_OUT; /* Not enough bytes. */
83                 }
84                 j++;
85 
86                 if ((input[j] == 'x') || (input[j] == 'X')) {
87                     /* Hexadecimal entity. */
88                     copy++;
89 
90                     if (!(j + 1 < input_len)) {
91                         goto HTML_ENT_OUT; /* Not enough bytes. */
92                     }
93                     j++; /* j is the position of the first digit now. */
94 
95                     k = j;
96                     while ((j < input_len) && (isxdigit(input[j]))) {
97                         j++;
98                     }
99                     if (j > k) { /* Do we have at least one digit? */
100                         /* Decode the entity. */
101                         char *x;
102                         x = reinterpret_cast<char *>(calloc(sizeof(char),
103                             ((j - k) + 1)));
104                         memcpy(x, (const char *)&input[k], j - k);
105                         *d++ = (unsigned char)strtol(x, NULL, 16);
106                         free(x);
107                         count++;
108 
109                         /* Skip over the semicolon if it's there. */
110                         if ((j < input_len) && (input[j] == ';')) {
111                             i = j + 1;
112                         } else {
113                             i = j;
114                         }
115                         continue;
116                     } else {
117                         goto HTML_ENT_OUT;
118                     }
119                 } else {
120                     /* Decimal entity. */
121                     k = j;
122                     while ((j < input_len) && (isdigit(input[j]))) {
123                         j++;
124                     }
125                     if (j > k) { /* Do we have at least one digit? */
126                         /* Decode the entity. */
127                         char *x;
128                         x = reinterpret_cast<char *>(calloc(sizeof(char),
129                             ((j - k) + 1)));
130                         memcpy(x, (const char *)&input[k], j - k);
131                         *d++ = (unsigned char)strtol(x, NULL, 10);
132                         free(x);
133                         count++;
134 
135                         /* Skip over the semicolon if it's there. */
136                         if ((j < input_len) && (input[j] == ';')) {
137                             i = j + 1;
138                         } else {
139                             i = j;
140                         }
141                         continue;
142                     } else {
143                         goto HTML_ENT_OUT;
144                     }
145                 }
146             } else {
147                 /* Text entity. */
148                 k = j;
149                 while ((j < input_len) && (isalnum(input[j]))) {
150                     j++;
151                 }
152                 if (j > k) { /* Do we have at least one digit? */
153                     char *x;
154                     x = reinterpret_cast<char *>(calloc(sizeof(char),
155                         ((j - k) + 1)));
156                     memcpy(x, (const char *)&input[k], j - k);
157 
158                     /* Decode the entity. */
159                     /* ENH What about others? */
160                     if (strcasecmp(x, "quot") == 0) {
161                         *d++ = '"';
162                     } else if (strcasecmp(x, "amp") == 0) {
163                         *d++ = '&';
164                     } else if (strcasecmp(x, "lt") == 0) {
165                         *d++ = '<';
166                     } else if (strcasecmp(x, "gt") == 0) {
167                         *d++ = '>';
168                     } else if (strcasecmp(x, "nbsp") == 0) {
169                         *d++ = NBSP;
170                     } else {
171                         /* We do no want to convert this entity,
172                          * copy the raw data over. */
173                         copy = j - k + 1;
174                         free(x);
175                         goto HTML_ENT_OUT;
176                     }
177                     free(x);
178 
179                     count++;
180 
181                     /* Skip over the semicolon if it's there. */
182                     if ((j < input_len) && (input[j] == ';')) {
183                         i = j + 1;
184                     } else {
185                         i = j;
186                     }
187 
188                     continue;
189                 }
190             }
191         }
192 
193 HTML_ENT_OUT:
194 
195         for (z = 0; ((z < copy) && (count < input_len)); z++) {
196             *d++ = input[i++];
197             count++;
198         }
199     }
200 
201     *d = '\0';
202 
203     return count;
204 }
205 
206 }  // namespace transformations
207 }  // namespace actions
208 }  // namespace modsecurity
209