1 // Copyright (c) 2007, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 // ---
31 // Author: csilvers@google.com (Craig Silverstein)
32 //
33 // template_modifiers.h has a description of what each escape-routine does.
34 //
35 // When creating a new modifier, you must subclass TemplateModifier
36 // and define your own Modify() method. This method takes the string
37 // to be modified as a char*/int pair. It then emits the modified
38 // version of the string to outbuf. Outbuf is an ExpandEmitter, as
39 // defined in template_modifiers.h. It's a very simple type that
40 // supports appending to a data stream.
41 //
42 // Be very careful editing an existing modifier. Subtle changes can
43 // introduce the possibility for cross-site scripting attacks. If you
44 // do change a modifier, be careful that it does not affect
45 // the list of Safe XSS Alternatives.
46 //
47
48 #include <config.h>
49 #include <stdlib.h>
50 #include <assert.h>
51 #include <string.h>
52 #include <string>
53 #include <vector>
54 #include "htmlparser/htmlparser_cpp.h"
55 #include <ctemplate/template_modifiers.h>
56 #include "template_modifiers_internal.h"
57 #include <ctemplate/per_expand_data.h>
58 using std::string;
59 using std::vector;
60
// Number of characters in the string literal `lit`, not counting the
// terminating NUL.  Sandwiching the argument between two empty literals
// makes anything other than a string literal fail to compile.
#define strliterallen(lit) (sizeof("" lit "") - 1)
62
// Ideally this would be a true 16-bit type (uint16_t or similar), but a
// plain unsigned is good enough for our purposes, and more portable.
typedef unsigned uint16;
66
namespace URL {

// Returns true when in[0, inlen) begins with the prefix_len bytes of prefix
// (compared case-insensitively via strncasecmp) AND at least one more byte
// follows the prefix.  The length test comes first, so `in` is never read
// when inlen is too small (including zero or negative).
static bool StartsWithScheme(const char* in, int inlen,
                             const char* prefix, int prefix_len) {
  return inlen > prefix_len && strncasecmp(in, prefix, prefix_len) == 0;
}

// Decides whether the URL held in in[0, inlen) uses a protocol we do not
// trust.
//
// Returns false only when the input starts with "http://", "https://" or
// "ftp://" (any letter case) and has at least one character after that
// prefix.  Everything else -- other schemes, a bare prefix with nothing
// after it, empty input, a negative length -- returns true.
bool HasInsecureProtocol(const char* in, int inlen) {
  static const char kHttp[] = "http://";
  static const char kHttps[] = "https://";
  static const char kFtp[] = "ftp://";

  // The prefix lengths are converted to int before being compared with
  // inlen: comparing a negative inlen against an unsigned sizeof() value
  // would silently turn it into a huge positive length.
  const bool trusted =
      StartsWithScheme(in, inlen, kHttp,
                       static_cast<int>(sizeof(kHttp) - 1)) ||
      StartsWithScheme(in, inlen, kHttps,
                       static_cast<int>(sizeof(kHttps) - 1)) ||
      StartsWithScheme(in, inlen, kFtp,
                       static_cast<int>(sizeof(kFtp) - 1));
  return !trusted;
}
}  // namespace URL
84
85 namespace ctemplate {
86
87 using ctemplate_htmlparser::HtmlParser;
88
// Appends a string literal to the emitter named 'out' in the enclosing
// scope; the length is computed at compile time.  The surrounding ""s
// make sure the argument really is a string literal.
#define APPEND(lit) out->Emit("" lit "", sizeof("" lit "") - 1)

// True when the buffer (buf, buflen) holds exactly the characters of the
// string literal 'lit'.  The ""s make sure 'lit' really is a string literal.
#define STR_IS(buf, buflen, lit)          \
  ((buflen) == sizeof("" lit "") - 1 &&   \
   memcmp(buf, lit, sizeof("" lit "") - 1) == 0)
98
~TemplateModifier()99 TemplateModifier::~TemplateModifier() {}
100
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const101 void NullModifier::Modify(const char* in, size_t inlen,
102 const PerExpandData*,
103 ExpandEmitter* out, const string& arg) const {
104 out->Emit(in, inlen);
105 }
106 NullModifier null_modifier;
107
EmitRun(const char * start,const char * limit,ExpandEmitter * out)108 static inline void EmitRun(const char* start, const char* limit,
109 ExpandEmitter* out) {
110 if (start < limit) {
111 out->Emit(start, (limit - start));
112 }
113 }
114
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const115 void HtmlEscape::Modify(const char* in, size_t inlen,
116 const PerExpandData*,
117 ExpandEmitter* out, const string& arg) const {
118 const char* pos = in;
119 const char* start = pos;
120 const char* const limit = in + inlen;
121 while (pos < limit) {
122 switch (*pos) {
123 default:
124 // Increment our counter and look at the next character.
125 ++pos;
126 continue;
127
128 case '&': EmitRun(start, pos, out); APPEND("&"); break;
129 case '"': EmitRun(start, pos, out); APPEND("""); break;
130 case '\'': EmitRun(start, pos, out); APPEND("'"); break;
131 case '<': EmitRun(start, pos, out); APPEND("<"); break;
132 case '>': EmitRun(start, pos, out); APPEND(">"); break;
133
134 case '\r': case '\n': case '\v': case '\f': case '\t':
135 EmitRun(start, pos, out); APPEND(" "); break;
136 }
137 start = ++pos;
138 }
139 EmitRun(start, pos, out);
140 }
141 HtmlEscape html_escape;
142
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const143 void PreEscape::Modify(const char* in, size_t inlen,
144 const PerExpandData*,
145 ExpandEmitter* out, const string& arg) const {
146 const char* pos = in;
147 const char* start = pos;
148 const char* const limit = in + inlen;
149 while (pos < limit) {
150 switch (*pos) {
151 default:
152 // Increment our counter and look at the next character.
153 ++pos;
154 continue;
155
156 // Unlike HtmlEscape, we leave whitespace as is.
157 case '&': EmitRun(start, pos, out); APPEND("&"); break;
158 case '"': EmitRun(start, pos, out); APPEND("""); break;
159 case '\'': EmitRun(start, pos, out); APPEND("'"); break;
160 case '<': EmitRun(start, pos, out); APPEND("<"); break;
161 case '>': EmitRun(start, pos, out); APPEND(">"); break;
162 }
163 start = ++pos;
164 }
165 EmitRun(start, pos, out);
166 }
167 PreEscape pre_escape;
168
169 // We encode the presence and ordering of unclosed tags in a string, using the
170 // letters b, i, s, and e to stand for <b>, <i>, <span>, and <em> respectively.
171 // The most recently opened tag is appended onto the end of the string, so in
172 // the common case of properly nested tags, we need only look at the last
173 // character. If we don't find it there, we need to continue looking at
174 // everything until we find it, because tags may not necessarily be in order.
175 // Similarly, when we add a tag, we need to check each existing tag for a match
176 // so that we don't nest.
177 class UnclosedSnippetTags {
178 public:
179 // We could use ordinary ints for the enum values, but using mnemonic
180 // characters potentially makes debugging easier.
181 typedef enum {
182 TAG_B = 'b',
183 TAG_I = 'i',
184 TAG_EM = 'e',
185 TAG_SPAN = 's',
186 } Tag;
187
UnclosedSnippetTags()188 UnclosedSnippetTags() : tag_length(0) {
189 memset(tags, 0, 5);
190 }
191
192 // Adds a tag to the set of open tags if it's not already open, or otherwise
193 // return false.
MaybeAdd(Tag tag)194 inline bool MaybeAdd(Tag tag) {
195 if (strchr(tags, tag)) {
196 return false;
197 } else {
198 tags[tag_length++] = tag;
199 return true;
200 }
201 }
202
203 // Removes a tag from the set of open tags if it's open, or otherwise return
204 // false.
MaybeRemove(Tag tag)205 inline bool MaybeRemove(Tag tag) {
206 char* tag_location = strchr(tags, tag);
207 if (tag_location) {
208 for (char* c = tag_location; *c; ++c) {
209 // Have to copy all later tags down by one so we don't leave a gap in the
210 // array.
211 *c = *(c + 1);
212 }
213 --tag_length;
214 return true;
215 } else {
216 return false;
217 }
218 }
219
PrintClosingTags(ExpandEmitter * out)220 inline void PrintClosingTags(ExpandEmitter* out) {
221 for (int i = tag_length; i >= 0; --i) {
222 switch (tags[i]) {
223 case TAG_B:
224 out->Emit("</b>"); break;
225 case TAG_I:
226 out->Emit("</i>"); break;
227 case TAG_EM:
228 out->Emit("</em>"); break;
229 case TAG_SPAN:
230 out->Emit("</span>"); break;
231 }
232 }
233 }
234
235 private:
236 char tags[5];
237 int tag_length;
238 };
239
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const240 void SnippetEscape::Modify(const char* in, size_t inlen,
241 const PerExpandData*,
242 ExpandEmitter* out, const string& arg) const {
243 UnclosedSnippetTags unclosed;
244 const char* pos = in;
245 const char* start = pos;
246 const char* const limit = in + inlen;
247 while (pos < limit) {
248 switch (*pos) {
249 default:
250 // Increment our counter and look at the next character.
251 ++pos;
252 continue;
253
254 case '<': {
255 // If there is a permissible tag, just advance pos past it to
256 // make it part of the current run. Notice the use of
257 // "continue" below.
258 const char* const next_pos = pos + 1;
259 const int chars_left = limit - next_pos;
260 if ((chars_left >= 2) && !memcmp(next_pos, "b>", 2)
261 && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_B)) {
262 pos += strliterallen("<b>");
263 continue;
264 } else if ((chars_left >= 2) && !memcmp(next_pos, "i>", 2)
265 && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_I)) {
266 pos += strliterallen("<i>");
267 continue;
268 } else if ((chars_left >= 3) && !memcmp(next_pos, "em>", 3)
269 && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_EM)) {
270 pos += strliterallen("<em>");
271 continue;
272 } else if ((chars_left >= 13) && !memcmp(next_pos, "span dir=", 9)
273 && (!memcmp(next_pos + 9, "ltr>", 4) ||
274 !memcmp(next_pos + 9, "rtl>", 4))
275 && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_SPAN)) {
276 pos += strliterallen("<span dir=ltr>");
277 continue;
278 } else if ((chars_left >= 3) && !memcmp(next_pos, "/b>", 3)
279 && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_B)) {
280 pos += strliterallen("</b>");
281 continue;
282 } else if ((chars_left >= 3) && !memcmp(next_pos, "/i>", 3)
283 && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_I)) {
284 pos += strliterallen("</i>");
285 continue;
286 } else if ((chars_left >= 4) && !memcmp(next_pos, "/em>", 4)
287 && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_EM)) {
288 pos += strliterallen("</em>");
289 continue;
290 } else if ((chars_left >= 6) && !memcmp(next_pos, "/span>", 6)
291 && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_SPAN)) {
292 pos += strliterallen("</span>");
293 continue;
294 } else if ((chars_left >= 3) && !memcmp(next_pos, "br>", 3)) {
295 pos += strliterallen("<br>");
296 continue;
297 } else if ((chars_left >= 4) && !memcmp(next_pos, "wbr>", 4)) {
298 pos += strliterallen("<wbr>");
299 continue;
300 }
301
302 // Emit the entity and break out of the switch.
303 EmitRun(start, pos, out);
304 APPEND("<");
305 break;
306 }
307
308 case '&':
309 EmitRun(start, pos, out);
310 if (pos + 1 < limit && pos[1] == '{') {
311 // Could be a javascript entity, so we need to escape.
312 // (Javascript entities are an xss risk in Netscape 4.)
313 APPEND("&");
314 } else {
315 APPEND("&");
316 }
317 break;
318
319 case '"': EmitRun(start, pos, out); APPEND("""); break;
320 case '\'': EmitRun(start, pos, out); APPEND("'"); break;
321 case '>': EmitRun(start, pos, out); APPEND(">"); break;
322
323 case '\r': case '\n': case '\v': case '\f': case '\t':
324 // non-space whitespace
325 EmitRun(start, pos, out); APPEND(" "); break;
326
327 }
328 start = ++pos;
329 }
330 EmitRun(start, pos, out);
331 unclosed.PrintClosingTags(out);
332 }
333 SnippetEscape snippet_escape;
334
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const335 void CleanseAttribute::Modify(const char* in, size_t inlen,
336 const PerExpandData*,
337 ExpandEmitter* out, const string& arg) const {
338 for (size_t i = 0; i < inlen; ++i) {
339 char c = in[i];
340 switch (c) {
341 case '=': {
342 if (i == 0 || i == (inlen - 1))
343 out->Emit('_');
344 else
345 out->Emit(c);
346 break;
347 }
348 case '-':
349 case '.':
350 case '_':
351 case ':': {
352 out->Emit(c);
353 break;
354 }
355 default: {
356 if ((c >= 'a' && c <= 'z') ||
357 (c >= 'A' && c <= 'Z') ||
358 (c >= '0' && c <= '9')) {
359 out->Emit(c);
360 } else {
361 APPEND("_");
362 }
363 break;
364 }
365 }
366 }
367 }
368 CleanseAttribute cleanse_attribute;
369
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const370 void CleanseCss::Modify(const char* in, size_t inlen,
371 const PerExpandData*,
372 ExpandEmitter* out, const string& arg) const {
373 for (size_t i = 0; i < inlen; ++i) {
374 char c = in[i];
375 switch (c) {
376 case ' ':
377 case '_':
378 case '.':
379 case ',':
380 case '!':
381 case '#':
382 case '%':
383 case '-': {
384 out->Emit(c);
385 break;
386 }
387 default: {
388 if ((c >= 'a' && c <= 'z') ||
389 (c >= 'A' && c <= 'Z') ||
390 (c >= '0' && c <= '9')) {
391 out->Emit(c);
392 }
393 break;
394 }
395 }
396 }
397 }
398 CleanseCss cleanse_css;
399
400 // CssUrlEscape is used as a chained modifier by ValidateUrl
401 // (validate_url_and_css_escape) and is not directly exposed.
402 class CssUrlEscape : public TemplateModifier {
403 public:
404 virtual void Modify(const char* in, size_t inlen,
405 const PerExpandData*, ExpandEmitter* outbuf,
406 const string& arg) const;
407 };
408
409 // URL-encodes the characters [\n\r\\'"()<>*] to ensure the URL can be safely
410 // inserted in a CSS context, e.g:
411 // . In an '@import url("URL");' statement
412 // . In a CSS property such as 'background: url("URL");'
413 // In both locations above, enclosing quotes are optional but parens are not.
414 // We want to make sure the URL cannot exit the parens enclosure, close a
415 // STYLE tag or reset the browser's CSS parser (via comments or newlines).
416 //
417 // References:
418 // . CSS 2.1 URLs: http://www.w3.org/TR/CSS21/syndata.html#url
419 // . CSS 1 URLs: http://www.w3.org/TR/REC-CSS1/#url
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const420 void CssUrlEscape::Modify(const char* in, size_t inlen,
421 const PerExpandData*,
422 ExpandEmitter* out, const string& arg) const {
423 for (size_t i = 0; i < inlen; ++i) {
424 char c = in[i];
425 switch (c) {
426 case '\n': APPEND("%0A"); break;
427 case '\r': APPEND("%0D"); break;
428 case '"': APPEND("%22"); break;
429 case '\'': APPEND("%27"); break;
430 case '(': APPEND("%28"); break;
431 case ')': APPEND("%29"); break;
432 case '*': APPEND("%2A"); break;
433 case '<': APPEND("%3C"); break;
434 case '>': APPEND("%3E"); break;
435 case '\\': APPEND("%5C"); break;
436 default: out->Emit(c); break;
437 }
438 }
439 }
440 CssUrlEscape css_url_escape;
441
442 // These URLs replace unsafe URLs for :U and :I url-escaping modes.
443 const char* const ValidateUrl::kUnsafeUrlReplacement = "#";
444 const char* const ValidateUrl::kUnsafeImgSrcUrlReplacement =
445 "/images/cleardot.gif";
446
Modify(const char * in,size_t inlen,const PerExpandData * per_expand_data,ExpandEmitter * out,const string & arg) const447 void ValidateUrl::Modify(const char* in, size_t inlen,
448 const PerExpandData* per_expand_data,
449 ExpandEmitter* out, const string& arg) const {
450 const char* slashpos = (char*)memchr(in, '/', inlen);
451 if (slashpos == NULL) {
452 slashpos = in + inlen;
453 }
454 const void* colonpos = memchr(in, ':', slashpos - in);
455 // colon before first slash, could be a protocol
456 if (colonpos != NULL && URL::HasInsecureProtocol(in, inlen)) {
457 // It's a bad protocol, so return something safe
458 chained_modifier_.Modify(unsafe_url_replacement_,
459 unsafe_url_replacement_length_,
460 per_expand_data,
461 out,
462 "");
463 return;
464 }
465 // If we get here, it's a valid url, so just escape it
466 chained_modifier_.Modify(in, inlen, per_expand_data, out, "");
467 }
468 ValidateUrl validate_url_and_html_escape(
469 html_escape,
470 ValidateUrl::kUnsafeUrlReplacement);
471 ValidateUrl validate_url_and_javascript_escape(
472 javascript_escape,
473 ValidateUrl::kUnsafeUrlReplacement);
474 ValidateUrl validate_url_and_css_escape(
475 css_url_escape,
476 ValidateUrl::kUnsafeUrlReplacement);
477 ValidateUrl validate_img_src_url_and_html_escape(
478 html_escape,
479 ValidateUrl::kUnsafeImgSrcUrlReplacement);
480 ValidateUrl validate_img_src_url_and_javascript_escape(
481 javascript_escape,
482 ValidateUrl::kUnsafeImgSrcUrlReplacement);
483 ValidateUrl validate_img_src_url_and_css_escape(
484 css_url_escape,
485 ValidateUrl::kUnsafeImgSrcUrlReplacement);
486
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const487 void XmlEscape::Modify(const char* in, size_t inlen,
488 const PerExpandData*,
489 ExpandEmitter* out, const string& arg) const {
490 const char* pos = in;
491 const char* start = pos;
492 const char* const limit = in + inlen;
493 while (pos < limit) {
494 char ch = *pos;
495
496 // According to section 2.2 of the spec
497 // http://www.w3.org/TR/REC-xml/#charsets control characters in range
498 // 0x00-0x1F (except \t, \r and \n) are not valid XML characters. In
499 // particular, conformant parsers are allowed to die when encountering a FF
500 // char in PCDATA sections. These chars are replaced by a space.
501 if (ch >= 0x00 && ch < 0x20 && ch != '\t' && ch != '\r' && ch != '\n') {
502 EmitRun(start, pos, out);
503 out->Emit(' ');
504 start = ++pos;
505 continue;
506 }
507
508 switch (ch) {
509 default:
510 // Increment our counter and look at the next character.
511 ++pos;
512 continue;
513
514 case '&': EmitRun(start, pos, out); APPEND("&"); break;
515 case '"': EmitRun(start, pos, out); APPEND("""); break;
516 case '\'': EmitRun(start, pos, out); APPEND("'"); break;
517 case '<': EmitRun(start, pos, out); APPEND("<"); break;
518 case '>': EmitRun(start, pos, out); APPEND(">"); break;
519 }
520 start = ++pos;
521 }
522 EmitRun(start, pos, out);
523 }
524 XmlEscape xml_escape;
525
// Maps the first byte of a sequence to the sequence's length in bytes.
// This could be done with a 16-byte table and a shift, but there's a
// substantial performance increase by eliminating the shift.
static const char kCodeLengths[256] = {
  // 0x00 - 0x3F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  // 0x40 - 0x7F
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  // 0x80 - 0xBF
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  // 0xC0 - 0xDF: two bytes
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  // 0xE0 - 0xEF: three bytes
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  // 0xF0 - 0xFF
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
550
551 // Returns the UTF-8 code-unit starting at start, or the special codepoint
552 // 0xFFFD if the input ends abruptly or is not well-formed UTF-8.
553 // start -- address of the start of the code unit which also receives the
554 // address past the end of the code unit returned.
555 // end -- exclusive end of the string
UTF8CodeUnit(const char ** start,const char * end)556 static inline uint16 UTF8CodeUnit(const char** start, const char *end) {
557 // Use kCodeLengths table to calculate the length of the code unit
558 // from the first character.
559 unsigned char first_char = static_cast<unsigned char>(**start);
560 size_t code_unit_len = kCodeLengths[first_char];
561 if (code_unit_len == 1) {
562 // Return the current byte as a codepoint.
563 // Either it is a valid single byte codepoint, or it's not part of a valid
564 // UTF-8 sequence, and so has to be handled individually.
565 ++*start;
566 return first_char;
567 }
568 const char *code_unit_end = *start + code_unit_len;
569 if (code_unit_end < *start || code_unit_end > end) { // Truncated code unit.
570 ++*start;
571 return 0xFFFDU;
572 }
573 const char* pos = *start;
574 uint16 code_unit = *pos & (0xFFU >> code_unit_len);
575 while (--code_unit_len) {
576 uint16 tail_byte = *(++pos);
577 if ((tail_byte & 0xC0U) != 0x80U) { // Malformed code unit.
578 ++*start;
579 return 0xFFFDU;
580 }
581 code_unit = (code_unit << 6) | (tail_byte & 0x3FU);
582 }
583 *start = code_unit_end;
584 return code_unit;
585 }
586
587 // A good reference is the ECMA standard (3rd ed), section 7.8.4:
588 // http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const589 void JavascriptEscape::Modify(const char* in, size_t inlen,
590 const PerExpandData*,
591 ExpandEmitter* out, const string& arg) const {
592 const char* pos = in;
593 const char* start = pos;
594 const char* const limit = in + inlen;
595
596 if (limit < in) { return; }
597
598 while (pos < limit) {
599 const char* next_pos = pos;
600 uint16 code_unit = UTF8CodeUnit(&next_pos, limit);
601
602 // Test for 16-bit values outside the switch below, because gcc
603 // will emit chained branches rather than a jump table for such a
604 // wide range of values.
605 if (code_unit & 0xFF00) {
606 // Linebreaks according to EcmaScript 262 which cannot appear in strings.
607 if (code_unit == 0x2028) {
608 // Line separator
609 EmitRun(start, pos, out); APPEND("\\u2028");
610 } else if (code_unit == 0x2029) {
611 // Paragraph separator
612 EmitRun(start, pos, out); APPEND("\\u2029");
613 } else {
614 pos = next_pos;
615 continue;
616 }
617 } else {
618 switch (code_unit) {
619 default:
620 // Increment our counter and look at the next character.
621 pos = next_pos;
622 continue;
623
624 case '\0': EmitRun(start, pos, out); APPEND("\\x00"); break;
625 case '"': EmitRun(start, pos, out); APPEND("\\x22"); break;
626 case '\'': EmitRun(start, pos, out); APPEND("\\x27"); break;
627 case '\\': EmitRun(start, pos, out); APPEND("\\\\"); break;
628 case '\t': EmitRun(start, pos, out); APPEND("\\t"); break;
629 case '\r': EmitRun(start, pos, out); APPEND("\\r"); break;
630 case '\n': EmitRun(start, pos, out); APPEND("\\n"); break;
631 case '\b': EmitRun(start, pos, out); APPEND("\\b"); break;
632 case '\f': EmitRun(start, pos, out); APPEND("\\f"); break;
633 case '&': EmitRun(start, pos, out); APPEND("\\x26"); break;
634 case '<': EmitRun(start, pos, out); APPEND("\\x3c"); break;
635 case '>': EmitRun(start, pos, out); APPEND("\\x3e"); break;
636 case '=': EmitRun(start, pos, out); APPEND("\\x3d"); break;
637
638 case '\v':
639 // Do not escape vertical tabs to "\\v" since it is interpreted as 'v'
640 // by JScript according to section 2.1 of
641 // http://wiki.ecmascript.org/lib/exe/fetch.php?
642 // id=resources%3Aresources&cache=cache&
643 // media=resources:jscriptdeviationsfromes3.pdf
644 EmitRun(start, pos, out); APPEND("\\x0b"); break;
645 }
646 }
647 start = pos = next_pos;
648 }
649 EmitRun(start, pos, out);
650 }
651 JavascriptEscape javascript_escape;
652
653
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const654 void JavascriptNumber::Modify(const char* in, size_t inlen,
655 const PerExpandData*,
656 ExpandEmitter* out, const string& arg) const {
657 if (inlen == 0)
658 return;
659
660 if (STR_IS(in, inlen, "true") || STR_IS(in, inlen, "false")) {
661 out->Emit(in, inlen);
662 return;
663 }
664
665 bool valid = true;
666 if (in[0] == '0' && inlen > 2 && (in[1] == 'x' || in[1] == 'X')) {
667 // There must be at least one hex digit after the 0x for it to be valid.
668 // Hex number. Check that it is of the form 0(x|X)[0-9A-Fa-f]+
669 for (size_t i = 2; i < inlen; i++) {
670 char c = in[i];
671 if (!((c >= 'a' && c <= 'f') ||
672 (c >= 'A' && c <= 'F') ||
673 (c >= '0' && c <= '9'))) {
674 valid = false;
675 break;
676 }
677 }
678 } else {
679 // Must be a base-10 (or octal) number.
680 // Check that it has the form [0-9+-.eE]+
681 for (size_t i = 0; i < inlen; i++) {
682 char c = in[i];
683 if (!((c >= '0' && c <= '9') ||
684 c == '+' || c == '-' || c == '.' ||
685 c == 'e' || c == 'E')) {
686 valid = false;
687 break;
688 }
689 }
690 }
691 if (valid) {
692 out->Emit(in, inlen); // Number was valid, output it.
693 } else {
694 APPEND("null"); // Number was not valid, output null instead.
695 }
696 }
697 JavascriptNumber javascript_number;
698
// Returns true if c may appear unescaped in a URL query, i.e. if it matches
// [0-9a-zA-Z.,_*/~!()-].  Everything else has to be escaped.
static inline bool IsUrlQueryEscapeSafeChar(unsigned char c) {
  // 256-entry bitmap, 32 characters per word: bit (c & 31) of word (c >> 5)
  // is set when c is safe.
  static const unsigned long kSafeCharacters[8] = {
    0x00000000L, 0x03fff702L, 0x87fffffeL, 0x47fffffeL,
    0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L
  };

  // The mask is built from an unsigned long: bit 31 is needed for '_', and
  // shifting a plain int 1 that far would run into the sign bit.
  return (kSafeCharacters[c >> 5] & (1UL << (c & 31))) != 0;
}
708
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const709 void UrlQueryEscape::Modify(const char* in, size_t inlen,
710 const PerExpandData*,
711 ExpandEmitter* out, const string& arg) const {
712 const char* pos = in;
713 const char* const limit = in + inlen;
714 while (true) {
715 // Peel off any initial runs of safe characters and emit them all
716 // at once.
717 const char* start = pos;
718 while (pos < limit && IsUrlQueryEscapeSafeChar(*pos)) {
719 pos++;
720 }
721 EmitRun(start, pos, out);
722
723 // Now deal with a single unsafe character.
724 if (pos < limit) {
725 unsigned char c = *pos;
726 if (c == ' ') {
727 out->Emit('+');
728 } else {
729 out->Emit('%');
730 out->Emit(((c>>4) < 10 ? ((c>>4) + '0') : (((c>>4) - 10) + 'A')));
731 out->Emit(((c&0xf) < 10 ? ((c&0xf) + '0') : (((c&0xf) - 10) + 'A')));
732 }
733 pos++;
734 } else {
735 // We're done!
736 break;
737 }
738 }
739 }
740 UrlQueryEscape url_query_escape;
741
742 // For more information on escaping JSON, see section 2.5 in
743 // http://www.ietf.org/rfc/rfc4627.txt.
744 // Escaping '&', '<', '>' is optional in the JSON proposed RFC
745 // but alleviates concerns with content sniffing if JSON is used
746 // in a context where the browser may attempt to interpret HTML.
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const747 void JsonEscape::Modify(const char* in, size_t inlen,
748 const PerExpandData*,
749 ExpandEmitter* out, const string& arg) const {
750 const char* pos = in;
751 const char* start = pos;
752 const char* const limit = in + inlen;
753 while (pos < limit) {
754 switch (*pos) {
755 default:
756 // Increment our counter and look at the next character.
757 ++pos;
758 continue;
759
760 case '"': EmitRun(start, pos, out); APPEND("\\\""); break;
761 case '\\': EmitRun(start, pos, out); APPEND("\\\\"); break;
762 case '/': EmitRun(start, pos, out); APPEND("\\/"); break;
763 case '\b': EmitRun(start, pos, out); APPEND("\\b"); break;
764 case '\f': EmitRun(start, pos, out); APPEND("\\f"); break;
765 case '\n': EmitRun(start, pos, out); APPEND("\\n"); break;
766 case '\r': EmitRun(start, pos, out); APPEND("\\r"); break;
767 case '\t': EmitRun(start, pos, out); APPEND("\\t"); break;
768 case '&': EmitRun(start, pos, out); APPEND("\\u0026"); break;
769 case '<': EmitRun(start, pos, out); APPEND("\\u003C"); break;
770 case '>': EmitRun(start, pos, out); APPEND("\\u003E"); break;
771 }
772 start = ++pos;
773 }
774 EmitRun(start, pos, out);
775 }
776 JsonEscape json_escape;
777
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const778 void PrefixLine::Modify(const char* in, size_t inlen,
779 const PerExpandData*,
780 ExpandEmitter* out, const string& arg) const {
781 while (inlen > 0) {
782 const char* nl = (const char*)memchr(in, '\n', inlen);
783 const char* cr = (const char*)memchr(in, '\r', nl ? nl - in : inlen);
784 size_t linelen;
785 if (nl == NULL && cr == NULL) {
786 // We're at the last line
787 out->Emit(in, inlen);
788 break;
789 } else {
790 // One or both of \r and \n is set; point to the first char past
791 // the newline. Note for \r\n, that's the char after the \n,
792 // otherwise, it's the char past the \r or the \n we see.
793 if ((nl == NULL) != (cr == NULL)) // one is set, the other is NULL
794 linelen = (nl ? nl : cr) + 1 - in;
795 else if (nl == cr + 1 || nl < cr) // \r\n, or \n comes first
796 linelen = nl + 1 - in;
797 else
798 linelen = cr + 1 - in;
799 }
800 out->Emit(in, linelen);
801 out->Emit(arg); // a new line, so emit the prefix
802 in += linelen;
803 inlen -= linelen;
804 assert(inlen >= 0);
805 }
806 }
807 PrefixLine prefix_line;
808
809
// Must be at least one more than the maximum number of alternative
// modifiers specified in any given element of g_modifiers.
#define MAX_SAFE_ALTERNATIVES 10  // If the compiler complains, increase it.
813
814 // Use the empty string if you want a modifier not to have a long-name.
815 // Use '\0' if you want a modifier not to have a short-name.
816 // Note: not all modifiers are in this array:
817 // 1) SnippetEscape: use html_escape_with_arg=snippet to get this
818 // 2) CleanseAttribute: use html_escape_with_arg=attribute to get this
819 // 3) ValidateUrl: use html_escape_with_arg=url to get this
820 //
821 // Some modifiers define other modifiers that are safe replacements
822 // from an XSS perspective. Replacements are not commutative so for
823 // example H=pre considers H=attribute a safe replacement to it
824 // but H=attribute has no safe replacements.
825 // This struct is not pretty but allows the definitions to be
826 // done without the need for a global initialization method.
827 // Be very careful making a change to g_modifiers as modifiers
828 // point to other ones within that same array so elements
829 // may not be re-ordered easily. Also you need to change
830 // the global g_am_dirs correspondingly.
831 //
832 static struct ModifierWithAlternatives {
833 ModifierInfo modifier_info;
834 ModifierInfo* safe_alt_mods[MAX_SAFE_ALTERNATIVES];
835 } g_modifiers[] = {
836 /* 0 */ { ModifierInfo("cleanse_css", 'c',
837 XSS_WEB_STANDARD, &cleanse_css),
838 {&g_modifiers[16].modifier_info, // url_escape_with_arg=css
839 // img_src_url_escape_with_arg=css
840 &g_modifiers[19].modifier_info} },
841 /* 1 */ { ModifierInfo("html_escape", 'h',
842 XSS_WEB_STANDARD, &html_escape),
843 {&g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
844 &g_modifiers[3].modifier_info, // html_escape_with_arg=pre
845 &g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
846 &g_modifiers[5].modifier_info, // html_escape_with_arg=url
847 &g_modifiers[8].modifier_info, // pre_escape
848 &g_modifiers[9].modifier_info, // url_query_escape
849 &g_modifiers[11].modifier_info, // url_escape_with_arg=html
850 &g_modifiers[12].modifier_info, // url_escape_with_arg=query
851 // img_src_url_escape_with_arg=html
852 &g_modifiers[18].modifier_info} },
853 /* 2 */ { ModifierInfo("html_escape_with_arg=snippet", 'H',
854 XSS_WEB_STANDARD, &snippet_escape),
855 {&g_modifiers[1].modifier_info, // html_escape
856 &g_modifiers[3].modifier_info, // html_escape_with_arg=pre
857 &g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
858 &g_modifiers[8].modifier_info, // pre_escape
859 &g_modifiers[9].modifier_info, // url_query_escape
860 &g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
861 /* 3 */ { ModifierInfo("html_escape_with_arg=pre", 'H',
862 XSS_WEB_STANDARD, &pre_escape),
863 {&g_modifiers[1].modifier_info, // html_escape
864 &g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
865 &g_modifiers[4].modifier_info, // html_escape_with_arg=attribute
866 &g_modifiers[8].modifier_info, // pre_escape
867 &g_modifiers[9].modifier_info, // url_query_escape
868 &g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
869 /* 4 */ { ModifierInfo("html_escape_with_arg=attribute", 'H',
870 XSS_WEB_STANDARD, &cleanse_attribute), {} },
871 /* 5 */ { ModifierInfo("html_escape_with_arg=url", 'H',
872 XSS_WEB_STANDARD, &validate_url_and_html_escape),
873 // img_src_url_escape_with_arg=html
874 {&g_modifiers[18].modifier_info} },
875 /* 6 */ { ModifierInfo("javascript_escape", 'j',
876 XSS_WEB_STANDARD, &javascript_escape),
877 {&g_modifiers[7].modifier_info, // json_escape
878 &g_modifiers[10].modifier_info, // url_escape_with_arg=javascript
879 // img_src_url_escape_with_arg=javascript
880 &g_modifiers[17].modifier_info} },
881 /* 7 */ { ModifierInfo("json_escape", 'o', XSS_WEB_STANDARD, &json_escape),
882 {&g_modifiers[6].modifier_info} }, // javascript_escape
883 /* 8 */ { ModifierInfo("pre_escape", 'p', XSS_WEB_STANDARD, &pre_escape),
884 {&g_modifiers[1].modifier_info, // html_escape
885 &g_modifiers[2].modifier_info, // html_escape_with_arg=snippet
886 &g_modifiers[3].modifier_info, // html_escape_with_arg=pre
887 &g_modifiers[4].modifier_info, // html_escape_with_arg=attr...
888 &g_modifiers[9].modifier_info, // url_query_escape
889 &g_modifiers[12].modifier_info} }, // url_escape_with_arg=query
890 /* 9 */ { ModifierInfo("url_query_escape", 'u',
891 XSS_WEB_STANDARD, &url_query_escape), {} },
892 /* 10 */ { ModifierInfo("url_escape_with_arg=javascript", 'U',
893 XSS_WEB_STANDARD,
894 &validate_url_and_javascript_escape),
895 // img_src_url_escape_with_arg=javascript
896 {&g_modifiers[17].modifier_info} },
897 /* 11 */ { ModifierInfo("url_escape_with_arg=html", 'U',
898 XSS_WEB_STANDARD, &validate_url_and_html_escape),
899 // img_src_url_escape_with_arg=html
900 {&g_modifiers[18].modifier_info} },
901 /* 12 */ { ModifierInfo("url_escape_with_arg=query", 'U',
902 XSS_WEB_STANDARD, &url_query_escape), {} },
903 /* 13 */ { ModifierInfo("none", '\0', XSS_SAFE, &null_modifier), {} },
904 /* 14 */ { ModifierInfo("xml_escape", '\0', XSS_WEB_STANDARD, &xml_escape),
905 {&g_modifiers[1].modifier_info, // html_escape
906 &g_modifiers[4].modifier_info,} }, // H=attribute
907 /* 15 */ { ModifierInfo("javascript_escape_with_arg=number", 'J',
908 XSS_WEB_STANDARD, &javascript_number), {} },
909 /* 16 */ { ModifierInfo("url_escape_with_arg=css", 'U',
910 XSS_WEB_STANDARD, &validate_url_and_css_escape), {} },
911 /* 17 */ { ModifierInfo("img_src_url_escape_with_arg=javascript", 'I',
912 XSS_WEB_STANDARD,
913 &validate_img_src_url_and_javascript_escape), {} },
914 /* 18 */ { ModifierInfo("img_src_url_escape_with_arg=html", 'I',
915 XSS_WEB_STANDARD,
916 &validate_img_src_url_and_html_escape), {} },
917 /* 19 */ { ModifierInfo("img_src_url_escape_with_arg=css", 'I',
918 XSS_WEB_STANDARD,
919 &validate_img_src_url_and_css_escape), {} },
920 };
921
922 static vector<const ModifierInfo*> g_extension_modifiers;
923 static vector<const ModifierInfo*> g_unknown_modifiers;
924
925 // Returns whether or not candidate can be safely (w.r.t XSS)
926 // used in lieu of our ModifierInfo. This is true iff:
927 // 1. Both have the same modifier function OR
928 // 2. Candidate's modifier function is in our ModifierInfo's
929 // list (vector) of safe alternative modifier functions.
930 //
931 // This is used with the auto-escaping code, which automatically
932 // figures out which modifier to apply to a variable based on the
933 // variable's context (in an html "<A HREF", for instance). Some
934 // built-in modifiers are considered safe alternatives from the perspective
935 // of preventing XSS (cross-site-scripting) attacks, in which case
936 // the auto-escaper should allow the choice of which to use in the
937 // template. This is intended only for internal use as it is dangerous
938 // and complicated to figure out which modifier is an XSS-safe
939 // replacement for a given one. Custom modifiers currently may not
940 // indicate safe replacements, only built-in ones may do so.
941 //
942 // Note that this function is not commutative therefore
943 // IsSafeXSSAlternative(a, b) may not be equal to IsSafeXSSAlternative(b, a).
IsSafeXSSAlternative(const ModifierInfo & our,const ModifierInfo & candidate)944 bool IsSafeXSSAlternative(const ModifierInfo& our,
945 const ModifierInfo& candidate) {
946 // Succeeds even for non built-in modifiers but no harm.
947 if (our.modifier == candidate.modifier)
948 return true;
949
950 for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
951 mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
952 ++mod_with_alts) {
953 if (mod_with_alts->modifier_info.long_name == our.long_name)
954 // We found our Modifier in the built-in array g_modifiers.
955 for (int i = 0; mod_with_alts->safe_alt_mods[i] != NULL &&
956 i < MAX_SAFE_ALTERNATIVES; ++i)
957 if (mod_with_alts->safe_alt_mods[i]->long_name == candidate.long_name)
958 // We found candidate in our Modifier's list of safe alternatives.
959 return true;
960 }
961 // our is not built-in or candidate is not a safe replacement to our.
962 return false;
963 }
964
// Returns true iff long_name starts with "x-", the prefix that marks an
// extension modifier.
//
// strncmp is used rather than memcmp so that a NUL-terminated name
// shorter than two characters (including the empty string) is never
// read past its terminator.  A caller passing a buffer that is not
// NUL-terminated must still guarantee at least two readable characters.
static inline bool IsExtensionModifier(const char* long_name) {
  return strncmp(long_name, "x-", 2) == 0;
}
968
AddModifierCommon(const char * long_name,const TemplateModifier * modifier,bool xss_safe)969 static bool AddModifierCommon(const char* long_name,
970 const TemplateModifier* modifier, bool xss_safe) {
971 if (!IsExtensionModifier(long_name))
972 return false;
973
974 // TODO(csilvers): store in a map or multimap, rather than a vector
975 for (vector<const ModifierInfo*>::const_iterator mod =
976 g_extension_modifiers.begin();
977 mod != g_extension_modifiers.end();
978 ++mod) {
979 // Check if mod has the same name as us. For modifiers that also take
980 // values, this is everything before the =. The only time it's ok to
981 // have the same name is when we have different modval specializations:
982 // "foo=bar" and "foo=baz" are both valid names. Note "foo" and
983 // "foo=bar" is not valid: foo has no modval, but "foo=bar" does.
984 const size_t new_modifier_namelen = strcspn(long_name, "=");
985 const size_t existing_modifier_namelen = strcspn((*mod)->long_name.c_str(),
986 "=");
987 if (new_modifier_namelen == existing_modifier_namelen &&
988 !memcmp(long_name, (*mod)->long_name.c_str(), new_modifier_namelen)) {
989 if (long_name[new_modifier_namelen] == '=' &&
990 (*mod)->long_name[existing_modifier_namelen] == '=' &&
991 (*mod)->long_name != long_name) {
992 // It's ok, we're different specializations!
993 } else {
994 // It's not ok: we have the same name and no good excuse.
995 return false;
996 }
997 }
998 }
999
1000 g_extension_modifiers.push_back(
1001 new ModifierInfo(long_name, '\0',
1002 xss_safe ? XSS_SAFE : XSS_UNIQUE,
1003 modifier));
1004 return true;
1005 }
1006
1007 // Modifier added with XSS_UNIQUE XssClass.
AddModifier(const char * long_name,const TemplateModifier * modifier)1008 bool AddModifier(const char* long_name,
1009 const TemplateModifier* modifier) {
1010 return AddModifierCommon(long_name, modifier, false);
1011 }
1012
1013 // Modifier added with XSS_SAFE XssClass.
AddXssSafeModifier(const char * long_name,const TemplateModifier * modifier)1014 bool AddXssSafeModifier(const char* long_name,
1015 const TemplateModifier* modifier) {
1016 return AddModifierCommon(long_name, modifier, true);
1017 }
1018
1019 // If candidate_match is a better match for modname/modval than bestmatch,
1020 // update bestmatch. To be a better match, two conditions must be met:
1021 // 1) The candidate's name must match modname
1022 // 2) If the candidate is a specialization (that is, name is of the form
1023 // "foo=bar", then modval matches the specialization value).
1024 // 3) If the candidate is not a specialization, bestmatch isn't a
1025 // specialization either.
1026 // Condition (3) makes sure that if we match the ModifierInfo with name
1027 // "foo=bar", we don't claim the ModifierInfo "foo=" is a better match.
1028 // Recall that by definition, modval will always start with a '=' if present.
UpdateBestMatch(const char * modname,size_t modname_len,const char * modval,size_t modval_len,const ModifierInfo * candidate_match,const ModifierInfo ** best_match)1029 static void UpdateBestMatch(const char* modname, size_t modname_len,
1030 const char* modval, size_t modval_len,
1031 const ModifierInfo* candidate_match,
1032 const ModifierInfo** best_match) {
1033 // It's easiest to handle the two case differently: (1) candidate_match
1034 // refers to a modifier that expects a modifier-value; (2) it doesn't.
1035 if (candidate_match->modval_required) {
1036 // To be a match, we have to fulfill three requirements: we have a
1037 // modval, our modname matches candidate_match's modname (either
1038 // shortname or longname), and our modval is consistent with the
1039 // value specified in the longname (whatever might follow the =).
1040 const char* const longname_start = candidate_match->long_name.c_str();
1041 const char* const equals = strchr(longname_start, '=');
1042 assert(equals != NULL);
1043 if (modval_len > 0 &&
1044 ((modname_len == 1 && *modname == candidate_match->short_name) ||
1045 (modname_len == equals - longname_start &&
1046 memcmp(modname, longname_start, modname_len) == 0)) &&
1047 ((equals[1] == '\0') || // name is "foo=" (not a specialization)
1048 (modval_len
1049 == longname_start + candidate_match->long_name.size() - equals &&
1050 memcmp(modval, equals, modval_len) == 0))) {
1051 // Condition (3) above is satisfied iff our longname is longer than
1052 // best-match's longname (so we prefer "foo=bar" to "foo=").
1053 if (*best_match == NULL ||
1054 candidate_match->long_name.size() > (*best_match)->long_name.size())
1055 *best_match = candidate_match;
1056 }
1057 } else {
1058 // In this case, to be a match: we must *not* have a modval. Our
1059 // modname still must match modinfo's modname (either short or long).
1060 if (modval_len == 0 &&
1061 ((modname_len == 1 && *modname == candidate_match->short_name) ||
1062 (modname_len == candidate_match->long_name.size() &&
1063 !memcmp(modname, candidate_match->long_name.data(), modname_len)))) {
1064 // In the no-modval case, only one match should exist.
1065 assert(*best_match == NULL);
1066 *best_match = candidate_match;
1067 }
1068 }
1069 }
1070
FindModifier(const char * modname,size_t modname_len,const char * modval,size_t modval_len)1071 const ModifierInfo* FindModifier(const char* modname, size_t modname_len,
1072 const char* modval, size_t modval_len) {
1073 // More than one modifier can match, in the case of modval specializations
1074 // (e.g., the modifier "foo=" and "foo=bar" will both match on input of
1075 // modname="foo", modval="bar"). In that case, we take the ModifierInfo
1076 // with the longest longname, since that's the most specialized match.
1077 const ModifierInfo* best_match = NULL;
1078 if (modname_len >= 2 && IsExtensionModifier(modname)) {
1079 for (vector<const ModifierInfo*>::const_iterator mod =
1080 g_extension_modifiers.begin();
1081 mod != g_extension_modifiers.end();
1082 ++mod) {
1083 UpdateBestMatch(modname, modname_len, modval, modval_len,
1084 *mod, &best_match);
1085 }
1086 if (best_match != NULL)
1087 return best_match;
1088
1089 for (vector<const ModifierInfo*>::const_iterator mod =
1090 g_unknown_modifiers.begin();
1091 mod != g_unknown_modifiers.end();
1092 ++mod) {
1093 UpdateBestMatch(modname, modname_len, modval, modval_len,
1094 *mod, &best_match);
1095 }
1096 if (best_match != NULL)
1097 return best_match;
1098 // This is the only situation where we can pass in a modifier of NULL.
1099 // It means "we don't know about this modifier-name."
1100 string fullname(modname, modname_len);
1101 if (modval_len) {
1102 fullname.append(modval, modval_len);
1103 }
1104 // TODO(csilvers): store in a map or multimap, rather than a vector
1105 g_unknown_modifiers.push_back(new ModifierInfo(fullname, '\0',
1106 XSS_UNIQUE, NULL));
1107 return g_unknown_modifiers.back();
1108 } else {
1109 for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
1110 mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
1111 ++mod_with_alts) {
1112 UpdateBestMatch(modname, modname_len, modval, modval_len,
1113 &mod_with_alts->modifier_info, &best_match);
1114 }
1115 return best_match;
1116 }
1117 }
1118
// For escaping variables under the auto-escape mode:
// Each directive below stands for a distinct sequence of escaping
// directives (an array of ModifierAndValue) applied to a variable
// during run-time substitution.  The enumerators are used as indices
// into the global array g_am_dirs, so their values must stay dense and
// NUM_ENTRIES_AM must remain the last enumerator.
enum AutoModifyDirective {
  AM_EMPTY = 0,       // Unused, kept as marker.
  AM_HTML,            // html_escape
  AM_HTML_UNQUOTED,   // html_escape_with_arg=attribute
  AM_JS,              // javascript_escape
  AM_JS_NUMBER,       // javascript_escape_with_arg=number
  AM_URL_HTML,        // url_escape_with_arg=html
  AM_URL_QUERY,       // url_query_escape
  AM_STYLE,           // cleanse_css
  AM_XML,             // xml_escape
  NUM_ENTRIES_AM      // Number of directives above; not a directive.
};
1137
1138 // Populates the global vector of hard-coded modifiers that
1139 // Auto-Escape may pick. We point to the appropriate modifier in
1140 // the global g_modifiers.
1141 // Reference these globals via the global array g_am_dirs[] for consistency.
1142 // Note: We allow for more than one ModifierAndValue in the array hence
1143 // the need to terminate with a Null marker. However currently all the
1144 // escaping directives have exactly one ModifierAndValue.
1145 static const ModifierAndValue g_am_empty[] = {
1146 ModifierAndValue(NULL, "", 0)
1147 };
1148 static const ModifierAndValue g_am_html[] = {
1149 ModifierAndValue(&g_modifiers[1].modifier_info, "", 0),
1150 ModifierAndValue(NULL, "", 0)
1151 };
1152 static const ModifierAndValue g_am_html_unquoted[] = {
1153 ModifierAndValue(&g_modifiers[4].modifier_info, "=attribute", 10),
1154 ModifierAndValue(NULL, "", 0)
1155 };
1156 static const ModifierAndValue g_am_js[] = {
1157 ModifierAndValue(&g_modifiers[6].modifier_info, "", 0),
1158 ModifierAndValue(NULL, "", 0)
1159 };
1160 static const ModifierAndValue g_am_js_number[] = {
1161 ModifierAndValue(&g_modifiers[15].modifier_info, "=number", 7),
1162 ModifierAndValue(NULL, "", 0)
1163 };
1164 static const ModifierAndValue g_am_url_html[] = {
1165 ModifierAndValue(&g_modifiers[11].modifier_info, "=html", 5),
1166 ModifierAndValue(NULL, "", 0)
1167 };
1168 static const ModifierAndValue g_am_url_query[] = {
1169 ModifierAndValue(&g_modifiers[9].modifier_info, "", 0),
1170 ModifierAndValue(NULL, "", 0)
1171 };
1172 static const ModifierAndValue g_am_style[] = {
1173 ModifierAndValue(&g_modifiers[0].modifier_info, "", 0),
1174 ModifierAndValue(NULL, "", 0)
1175 };
1176 static const ModifierAndValue g_am_xml[] = {
1177 ModifierAndValue(&g_modifiers[14].modifier_info, "", 0),
1178 ModifierAndValue(NULL, "", 0)
1179 };
1180
1181 static const ModifierAndValue* g_am_dirs[NUM_ENTRIES_AM] = {
1182 g_am_empty, /* AM_EMPTY */
1183 g_am_html, /* AM_HTML */
1184 g_am_html_unquoted, /* AM_HTML_UNQUOTED */
1185 g_am_js, /* AM_JS */
1186 g_am_js_number, /* AM_JS_NUMBER */
1187 g_am_url_html, /* AM_URL_HTML */
1188 g_am_url_query, /* AM_URL_QUERY */
1189 g_am_style, /* AM_STYLE */
1190 g_am_xml, /* AM_XML */
1191 };
1192
PrettyPrintOneModifier(const ModifierAndValue & modval)1193 string PrettyPrintOneModifier(const ModifierAndValue& modval) {
1194 string out;
1195 out.append(":");
1196 if (modval.modifier_info->short_name) // short_name is a char.
1197 out.append(1, modval.modifier_info->short_name);
1198 else
1199 out.append(modval.modifier_info->long_name);
1200 if (modval.value_len != 0)
1201 out.append(modval.value, modval.value_len);
1202 return out;
1203 }
1204
PrettyPrintModifiers(const vector<const ModifierAndValue * > & modvals,const string & separator)1205 string PrettyPrintModifiers(const vector<const ModifierAndValue*>& modvals,
1206 const string& separator) {
1207 string out;
1208 for (vector<const ModifierAndValue*>::const_iterator it =
1209 modvals.begin(); it != modvals.end(); ++it) {
1210 if (it != modvals.begin())
1211 out.append(separator);
1212 out.append(PrettyPrintOneModifier(**it));
1213 }
1214 return out;
1215 }
1216
1217 // Return the sequence of escaping directives to apply for the given context.
1218 // An empty vector indicates an error occurred. Currently we never need
1219 // to chain escaping directives hence on success, the vector is always of
1220 // size 1. This may change in the future.
GetModifierForHtmlJs(HtmlParser * htmlparser,string * error_msg)1221 vector<const ModifierAndValue*> GetModifierForHtmlJs(
1222 HtmlParser* htmlparser, string* error_msg) {
1223 assert(htmlparser);
1224 assert(error_msg);
1225 vector<const ModifierAndValue*> modvals;
1226
1227 // Two cases of being inside javascript:
1228 // 1. Inside raw javascript (within a <script> tag). If the value
1229 // is quoted we apply javascript_escape, if not we have to coerce
1230 // it to a safe value due to the risk of javascript code execution
1231 // hence apply :J=number. If arbitrary code needs to be inserted
1232 // at run-time, the developer must use :none.
1233 // 2. In the value of an attribute that takes javascript such
1234 // as onmouseevent in '<a href="someUrl" onmousevent="{{EVENT}}">'.
1235 // That will be covered in the STATE_VALUE state logic below.
1236 if (htmlparser->InJavascript() &&
1237 htmlparser->state() != HtmlParser::STATE_VALUE) {
1238 if (htmlparser->IsJavascriptQuoted()) {
1239 modvals.push_back(g_am_dirs[AM_JS]);
1240 assert(modvals.size() == 1);
1241 return modvals;
1242 } else {
1243 modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
1244 assert(modvals.size() == 1);
1245 return modvals;
1246 }
1247 }
1248 switch (htmlparser->state()) {
1249 case HtmlParser::STATE_VALUE:{
1250 string attribute_name = htmlparser->attribute();
1251 switch (htmlparser->AttributeType()) {
1252 case HtmlParser::ATTR_URI:
1253 // Case 1: The URL is quoted:
1254 // . Apply :U=html if it is a complete URL or :h if it is a fragment.
1255 // Case 2: The URL is not quoted:
1256 // . If it is a complete URL, we have no safe modifiers that
1257 // won't break it so we have to fail.
1258 // . If it is a URL fragment, then :u is safe and not likely to
1259 // break the URL.
1260 if (!htmlparser->IsAttributeQuoted()) {
1261 if (htmlparser->IsUrlStart()) { // Complete URL.
1262 error_msg->append("Value of URL attribute \"" + attribute_name +
1263 "\" must be enclosed in quotes.");
1264 assert(modvals.empty());
1265 return modvals; // Empty
1266 } else { // URL fragment.
1267 modvals.push_back(g_am_dirs[AM_URL_QUERY]);
1268 }
1269 } else {
1270 // Only validate the URL if we have a complete URL,
1271 // otherwise simply html_escape.
1272 if (htmlparser->IsUrlStart())
1273 modvals.push_back(g_am_dirs[AM_URL_HTML]);
1274 else
1275 modvals.push_back(g_am_dirs[AM_HTML]);
1276 }
1277 break;
1278 case HtmlParser::ATTR_REGULAR:
1279 // If the value is quoted, simply HTML escape, otherwise
1280 // apply stricter escaping using H=attribute.
1281 if (htmlparser->IsAttributeQuoted())
1282 modvals.push_back(g_am_dirs[AM_HTML]);
1283 else
1284 modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
1285 break;
1286 case HtmlParser::ATTR_STYLE:
1287 // If the value is quoted apply :c, otherwise fail.
1288 if (htmlparser->IsAttributeQuoted()) {
1289 modvals.push_back(g_am_dirs[AM_STYLE]);
1290 } else {
1291 error_msg->append("Value of style attribute \"" + attribute_name +
1292 "\" must be enclosed in quotes.");
1293 assert(modvals.empty());
1294 return modvals; // Empty
1295 }
1296 break;
1297 case HtmlParser::ATTR_JS:
1298 // We require javascript accepting attributes (such as onclick)
1299 // to be HTML quoted, otherwise they are vulnerable to
1300 // HTML attribute insertion via the use of whitespace.
1301 if (!htmlparser->IsAttributeQuoted()) {
1302 error_msg->append("Value of javascript attribute \"" +
1303 attribute_name +
1304 "\" must be enclosed in quotes.");
1305 assert(modvals.empty());
1306 return modvals; // Empty
1307 }
1308 // If the variable is quoted apply javascript_escape otherwise
1309 // apply javascript_number which will ensure it is safe against
1310 // code injection.
1311 // Note: We normally need to HTML escape after javascript escape
1312 // but the javascript escape implementation provided makes the
1313 // HTML escape redundant so simply javascript escape.
1314 if (htmlparser->IsJavascriptQuoted())
1315 modvals.push_back(g_am_dirs[AM_JS]);
1316 else
1317 modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
1318 break;
1319 case HtmlParser::ATTR_NONE:
1320 assert("We should be in attribute!" && 0);
1321 default:
1322 assert("Should not be able to get here." && 0);
1323 return modvals; // Empty
1324 }
1325 // In STATE_VALUE particularly, the parser may get out of sync with
1326 // the correct state - that the browser sees - due to the fact that
1327 // it does not get to parse run-time content (variables). So we tell
1328 // the parser there is content that will be expanded here.
1329 // A good example is:
1330 // <a href={{URL}} alt={{NAME}}>
1331 // The parser sees <a href= alt=> and interprets 'alt=' to be
1332 // the value of href.
1333 htmlparser->InsertText(); // Ignore return value.
1334 assert(modvals.size() == 1);
1335 return modvals;
1336 }
1337 case HtmlParser::STATE_TAG:{
1338 // Apply H=attribute to tag names since they are alphabetic.
1339 // Examples of tag names: TITLE, BODY, A and BR.
1340 modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
1341 assert(modvals.size() == 1);
1342 return modvals;
1343 }
1344 case HtmlParser::STATE_ATTR:{
1345 // Apply H=attribute to attribute names since they are alphabetic.
1346 // Examples of attribute names: HREF, SRC and WIDTH.
1347 modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
1348 assert(modvals.size() == 1);
1349 return modvals;
1350 }
1351 case HtmlParser::STATE_COMMENT:
1352 case HtmlParser::STATE_TEXT:{
1353 // Apply :h to regular HTML text and :c if within a style tag.
1354 if (htmlparser->InCss())
1355 modvals.push_back(g_am_dirs[AM_STYLE]);
1356 else
1357 modvals.push_back(g_am_dirs[AM_HTML]);
1358 assert(modvals.size() == 1);
1359 return modvals;
1360 }
1361 default:{
1362 assert("Should not be able to get here." && 0);
1363 return modvals; // Empty
1364 }
1365 }
1366 assert("Should not be able to get here." && 0);
1367 return modvals; // Empty
1368 }
1369
1370 // TODO(jad): Memoize all GetModifierForXXX functions below.
1371 // They don't depend on parser context (from csilvers).
GetModifierForCss(HtmlParser * htmlparser,string * error_msg)1372 vector<const ModifierAndValue*> GetModifierForCss(HtmlParser* htmlparser,
1373 string* error_msg) {
1374 vector<const ModifierAndValue*> modvals;
1375 modvals.push_back(g_am_dirs[AM_STYLE]);
1376 return modvals;
1377 }
1378
GetModifierForXml(HtmlParser * htmlparser,string * error_msg)1379 vector<const ModifierAndValue*> GetModifierForXml(HtmlParser* htmlparser,
1380 string* error_msg) {
1381 vector<const ModifierAndValue*> modvals;
1382 modvals.push_back(g_am_dirs[AM_XML]);
1383 return modvals;
1384 }
1385
GetModifierForJson(HtmlParser * htmlparser,string * error_msg)1386 vector<const ModifierAndValue*> GetModifierForJson(HtmlParser* htmlparser,
1387 string* error_msg) {
1388 vector<const ModifierAndValue*> modvals;
1389 modvals.push_back(g_am_dirs[AM_JS]);
1390 return modvals;
1391 }
1392
GetDefaultModifierForHtml()1393 vector<const ModifierAndValue*> GetDefaultModifierForHtml() {
1394 vector<const ModifierAndValue*> modvals;
1395 modvals.push_back(g_am_dirs[AM_HTML]);
1396 return modvals;
1397 }
1398
GetDefaultModifierForJs()1399 vector<const ModifierAndValue*> GetDefaultModifierForJs() {
1400 vector<const ModifierAndValue*> modvals;
1401 modvals.push_back(g_am_dirs[AM_JS]);
1402 return modvals;
1403 }
1404
GetDefaultModifierForCss()1405 vector<const ModifierAndValue*> GetDefaultModifierForCss() {
1406 return GetModifierForCss(NULL, NULL);
1407 }
1408
GetDefaultModifierForXml()1409 vector<const ModifierAndValue*> GetDefaultModifierForXml() {
1410 return GetModifierForXml(NULL, NULL);
1411 }
1412
GetDefaultModifierForJson()1413 vector<const ModifierAndValue*> GetDefaultModifierForJson() {
1414 return GetModifierForJson(NULL, NULL);
1415 }
1416
1417 }
1418