1 // Copyright (c) 2007, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 // ---
31 // Author: csilvers@google.com (Craig Silverstein)
32 //
33 // template_modifiers.h has a description of what each escape-routine does.
34 //
35 // When creating a new modifier, you must subclass TemplateModifier
36 // and define your own Modify() method.  This method takes the string
37 // to be modified as a char*/int pair.  It then emits the modified
38 // version of the string to outbuf.  Outbuf is an ExpandEmitter, as
39 // defined in template_modifiers.h.  It's a very simple type that
40 // supports appending to a data stream.
41 //
42 // Be very careful editing an existing modifier.  Subtle changes can
43 // introduce the possibility for cross-site scripting attacks.  If you
44 // do change a modifier, be careful that it does not affect
45 // the list of Safe XSS Alternatives.
46 //
47 
48 #include <config.h>
49 #include <stdlib.h>
50 #include <assert.h>
51 #include <string.h>
52 #include <string>
53 #include <vector>
54 #include "htmlparser/htmlparser_cpp.h"
55 #include <ctemplate/template_modifiers.h>
56 #include "template_modifiers_internal.h"
57 #include <ctemplate/per_expand_data.h>
58 using std::string;
59 using std::vector;
60 
// Length of the string literal 's' in bytes, not counting the terminating
// NUL.  The surrounding ""s make the expansion ill-formed unless 's' really
// is a string literal.
#define strliterallen(s)  (sizeof("" s "") - 1)
62 
// Really we should be using uint16_t or something, but this is good
// enough, and more portable...
// NOTE: despite its name, this is a plain unsigned int, so it is not
// guaranteed to be exactly 16 bits wide.
typedef unsigned int uint16;
66 
namespace URL {
// Returns false only when the buffer [in, in + inlen) starts with one of the
// whitelisted schemes ("http://", "https://", "ftp://", matched
// case-insensitively) AND contains at least one more byte after the scheme.
// Everything else -- including an empty buffer, a bare "http://", or a
// negative length -- is reported as insecure (true).
//
// in    -- start of the URL; need not be NUL-terminated.
// inlen -- number of valid bytes at 'in'.
bool HasInsecureProtocol(const char* in, int inlen) {
  static const char* const kSafeSchemes[] = { "http://", "https://", "ftp://" };
  const size_t num_schemes = sizeof(kSafeSchemes) / sizeof(kSafeSchemes[0]);
  for (size_t i = 0; i < num_schemes; ++i) {
    // Keep the length comparison in signed arithmetic: comparing inlen
    // against an unsigned size would turn a negative inlen into a huge
    // positive value and let it pass the check.
    const int scheme_len = static_cast<int>(strlen(kSafeSchemes[i]));
    if (inlen > scheme_len &&
        strncasecmp(in, kSafeSchemes[i], scheme_len) == 0) {
      return false;  // We're ok, it's a whitelisted protocol
    }
  }
  return true;
}
}  // namespace URL
84 
85 namespace ctemplate {
86 
87 using ctemplate_htmlparser::HtmlParser;
88 
// A most-efficient way to append a string literal to the var named 'out'.
// Expands to an Emit(pointer, length) call on 'out', so a variable with that
// name must be in scope wherever the macro is used; the length excludes the
// literal's terminating NUL.
// The ""s ensure literal is actually a string literal
#define APPEND(literal)  out->Emit("" literal "", sizeof(literal)-1)
92 
// Check whether the string of length len is identical to the literal.
// The length is compared first, so memcmp only runs when both are the same
// size; str does not need to be NUL-terminated.
// The ""s ensure literal is actually a string literal
#define STR_IS(str, len, literal) \
  ((len) == sizeof("" literal "") - 1 && \
   memcmp(str, literal, sizeof("" literal "") - 1) == 0)
98 
// Out-of-line definition of the destructor; it has nothing to release.
TemplateModifier::~TemplateModifier() {}
100 
// Identity modifier: emits the inlen bytes at 'in' to 'out' unchanged.  The
// per-expand data and the modifier argument are not used.
void NullModifier::Modify(const char* in, size_t inlen,
                          const PerExpandData*,
                          ExpandEmitter* out, const string& arg) const {
  out->Emit(in, inlen);
}
// Instance of the identity modifier.
NullModifier null_modifier;
107 
EmitRun(const char * start,const char * limit,ExpandEmitter * out)108 static inline void EmitRun(const char* start, const char* limit,
109                            ExpandEmitter* out) {
110   if (start < limit) {
111     out->Emit(start, (limit - start));
112   }
113 }
114 
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const115 void HtmlEscape::Modify(const char* in, size_t inlen,
116                         const PerExpandData*,
117                         ExpandEmitter* out, const string& arg) const {
118   const char* pos = in;
119   const char* start = pos;
120   const char* const limit = in + inlen;
121   while (pos < limit) {
122     switch (*pos) {
123       default:
124         // Increment our counter and look at the next character.
125         ++pos;
126         continue;
127 
128       case '&':  EmitRun(start, pos, out); APPEND("&amp;");  break;
129       case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
130       case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
131       case '<':  EmitRun(start, pos, out); APPEND("&lt;");   break;
132       case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;
133 
134       case '\r': case '\n': case '\v': case '\f': case '\t':
135         EmitRun(start, pos, out); APPEND(" ");      break;
136     }
137     start = ++pos;
138   }
139   EmitRun(start, pos, out);
140 }
141 HtmlEscape html_escape;
142 
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const143 void PreEscape::Modify(const char* in, size_t inlen,
144                        const PerExpandData*,
145                        ExpandEmitter* out, const string& arg) const {
146   const char* pos = in;
147   const char* start = pos;
148   const char* const limit = in + inlen;
149   while (pos < limit)  {
150     switch (*pos) {
151       default:
152         // Increment our counter and look at the next character.
153         ++pos;
154         continue;
155 
156       // Unlike HtmlEscape, we leave whitespace as is.
157       case '&':  EmitRun(start, pos, out); APPEND("&amp;");  break;
158       case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
159       case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
160       case '<':  EmitRun(start, pos, out); APPEND("&lt;");   break;
161       case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;
162     }
163     start = ++pos;
164   }
165   EmitRun(start, pos, out);
166 }
167 PreEscape pre_escape;
168 
169 // We encode the presence and ordering of unclosed tags in a string, using the
170 // letters b, i, s, and e to stand for <b>, <i>, <span>, and <em> respectively.
171 // The most recently opened tag is appended onto the end of the string, so in
172 // the common case of properly nested tags, we need only look at the last
173 // character.  If we don't find it there, we need to continue looking at
174 // everything until we find it, because tags may not necessarily be in order.
175 // Similarly, when we add a tag, we need to check each existing tag for a match
176 // so that we don't nest.
177 class UnclosedSnippetTags {
178  public:
179   // We could use ordinary ints for the enum values, but using mnemonic
180   // characters potentially makes debugging easier.
181   typedef enum {
182     TAG_B = 'b',
183     TAG_I = 'i',
184     TAG_EM = 'e',
185     TAG_SPAN = 's',
186   } Tag;
187 
UnclosedSnippetTags()188   UnclosedSnippetTags() : tag_length(0) {
189     memset(tags, 0, 5);
190   }
191 
192   // Adds a tag to the set of open tags if it's not already open, or otherwise
193   // return false.
MaybeAdd(Tag tag)194   inline bool MaybeAdd(Tag tag) {
195     if (strchr(tags, tag)) {
196       return false;
197     } else {
198       tags[tag_length++] = tag;
199       return true;
200     }
201   }
202 
203   // Removes a tag from the set of open tags if it's open, or otherwise return
204   // false.
MaybeRemove(Tag tag)205   inline bool MaybeRemove(Tag tag) {
206     char* tag_location = strchr(tags, tag);
207     if (tag_location) {
208       for (char* c = tag_location; *c; ++c) {
209         // Have to copy all later tags down by one so we don't leave a gap in the
210         // array.
211         *c = *(c + 1);
212       }
213       --tag_length;
214       return true;
215     } else {
216       return false;
217     }
218   }
219 
PrintClosingTags(ExpandEmitter * out)220   inline void PrintClosingTags(ExpandEmitter* out) {
221     for (int i = tag_length; i >= 0; --i) {
222       switch (tags[i]) {
223         case TAG_B:
224           out->Emit("</b>"); break;
225         case TAG_I:
226           out->Emit("</i>"); break;
227         case TAG_EM:
228           out->Emit("</em>"); break;
229         case TAG_SPAN:
230           out->Emit("</span>"); break;
231       }
232     }
233   }
234 
235  private:
236   char tags[5];
237   int tag_length;
238 };
239 
// Emits 'in' escaped for HTML while letting a fixed set of snippet tags
// through verbatim: <b>, <i>, <em>, <span dir=ltr>, <span dir=rtl>, their
// closing forms, <br> and <wbr>.  An opening tag is only let through if the
// same tag is not already open, and a closing tag only if that tag is open;
// otherwise the '<' is escaped like any other.  Tags still open at the end
// of the input are closed after the last byte has been emitted.
void SnippetEscape::Modify(const char* in, size_t inlen,
                           const PerExpandData*,
                           ExpandEmitter* out, const string& arg) const {
  UnclosedSnippetTags unclosed;
  const char* pos = in;
  // 'start' marks the first byte that has not been emitted yet.
  const char* start = pos;
  const char* const limit = in + inlen;
  while (pos < limit) {
    switch (*pos) {
      default:
        // Increment our counter and look at the next character.
        ++pos;
        continue;

      case '<': {
        // If there is a permissible tag, just advance pos past it to
        // make it part of the current run.  Notice the use of
        // "continue" below.
        const char* const next_pos = pos + 1;
        // Bytes available after the '<'; each test below checks this before
        // comparing so that memcmp never reads past 'limit'.
        const int chars_left = limit - next_pos;
        if ((chars_left >= 2) && !memcmp(next_pos, "b>", 2)
            && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_B)) {
          pos += strliterallen("<b>");
          continue;
        } else if ((chars_left >= 2) && !memcmp(next_pos, "i>", 2)
                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_I)) {
          pos += strliterallen("<i>");
          continue;
        } else if ((chars_left >= 3) && !memcmp(next_pos, "em>", 3)
                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_EM)) {
          pos += strliterallen("<em>");
          continue;
        } else if ((chars_left >= 13) && !memcmp(next_pos, "span dir=", 9)
                   && (!memcmp(next_pos + 9, "ltr>", 4) ||
                       !memcmp(next_pos + 9, "rtl>", 4))
                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_SPAN)) {
          // "<span dir=ltr>" and "<span dir=rtl>" have the same length, so
          // one literal serves for both.
          pos += strliterallen("<span dir=ltr>");
          continue;
        } else if ((chars_left >= 3) && !memcmp(next_pos, "/b>", 3)
                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_B)) {
          pos += strliterallen("</b>");
          continue;
        } else if ((chars_left >= 3) && !memcmp(next_pos, "/i>", 3)
                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_I)) {
          pos += strliterallen("</i>");
          continue;
        } else if ((chars_left >= 4) && !memcmp(next_pos, "/em>", 4)
                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_EM)) {
          pos += strliterallen("</em>");
          continue;
        } else if ((chars_left >= 6) && !memcmp(next_pos, "/span>", 6)
                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_SPAN)) {
          pos += strliterallen("</span>");
          continue;
        } else if ((chars_left >= 3) && !memcmp(next_pos, "br>", 3)) {
          // <br> and <wbr> have no closing form, so they are not tracked.
          pos += strliterallen("<br>");
          continue;
        } else if ((chars_left >= 4) && !memcmp(next_pos, "wbr>", 4)) {
          pos += strliterallen("<wbr>");
          continue;
        }

        // Emit the entity and break out of the switch.
        EmitRun(start, pos, out);
        APPEND("&lt;");
        break;
      }

      case '&':
        EmitRun(start, pos, out);
        if (pos + 1 < limit && pos[1] == '{') {
          // Could be a javascript entity, so we need to escape.
          // (Javascript entities are an xss risk in Netscape 4.)
          APPEND("&amp;");
        } else {
          // Any other '&' is emitted unescaped.
          APPEND("&");
        }
        break;

      case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
      case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
      case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;

      case '\r': case '\n': case '\v': case '\f': case '\t':
        // non-space whitespace
        EmitRun(start, pos, out); APPEND(" "); break;

    }
    // Only reached via 'break': the byte at pos was replaced, so skip it and
    // begin a new run after it.
    start = ++pos;
  }
  EmitRun(start, pos, out);
  unclosed.PrintClosingTags(out);
}
SnippetEscape snippet_escape;
334 
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const335 void CleanseAttribute::Modify(const char* in, size_t inlen,
336                               const PerExpandData*,
337                               ExpandEmitter* out, const string& arg) const {
338   for (size_t i = 0; i < inlen; ++i) {
339     char c = in[i];
340     switch (c) {
341       case '=': {
342         if (i == 0 || i == (inlen - 1))
343           out->Emit('_');
344         else
345           out->Emit(c);
346         break;
347       }
348       case '-':
349       case '.':
350       case '_':
351       case ':': {
352         out->Emit(c);
353         break;
354       }
355       default: {
356         if ((c >= 'a' && c <= 'z') ||
357             (c >= 'A' && c <= 'Z') ||
358             (c >= '0' && c <= '9')) {
359           out->Emit(c);
360         } else {
361           APPEND("_");
362         }
363         break;
364       }
365     }
366   }
367 }
368 CleanseAttribute cleanse_attribute;
369 
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const370 void CleanseCss::Modify(const char* in, size_t inlen,
371                         const PerExpandData*,
372                         ExpandEmitter* out, const string& arg) const {
373   for (size_t i = 0; i < inlen; ++i) {
374     char c = in[i];
375     switch (c) {
376       case ' ':
377       case '_':
378       case '.':
379       case ',':
380       case '!':
381       case '#':
382       case '%':
383       case '-': {
384         out->Emit(c);
385         break;
386       }
387       default: {
388         if ((c >= 'a' && c <= 'z') ||
389             (c >= 'A' && c <= 'Z') ||
390             (c >= '0' && c <= '9')) {
391           out->Emit(c);
392         }
393         break;
394       }
395     }
396   }
397 }
398 CleanseCss cleanse_css;
399 
// CssUrlEscape is used as a chained modifier by ValidateUrl
// (validate_url_and_css_escape) and is not directly exposed.
class CssUrlEscape : public TemplateModifier {
 public:
  // Emits the inlen bytes at 'in' to 'outbuf', percent-encoding the
  // characters that are unsafe inside a CSS url(); the definition follows
  // this class.
  virtual void Modify(const char* in, size_t inlen,
                      const PerExpandData*, ExpandEmitter* outbuf,
                      const string& arg) const;
};
408 
409 // URL-encodes the characters [\n\r\\'"()<>*] to ensure the URL can be safely
410 // inserted in a CSS context, e.g:
411 // . In an '@import url("URL");' statement
412 // . In a CSS property such as 'background: url("URL");'
413 // In both locations above, enclosing quotes are optional but parens are not.
414 // We want to make sure the URL cannot exit the parens enclosure, close a
415 // STYLE tag or reset the browser's CSS parser (via comments or newlines).
416 //
417 // References:
418 // . CSS 2.1 URLs: http://www.w3.org/TR/CSS21/syndata.html#url
419 // . CSS 1 URLs: http://www.w3.org/TR/REC-CSS1/#url
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const420 void CssUrlEscape::Modify(const char* in, size_t inlen,
421                           const PerExpandData*,
422                           ExpandEmitter* out, const string& arg) const {
423   for (size_t i = 0; i < inlen; ++i) {
424     char c = in[i];
425     switch (c) {
426       case '\n': APPEND("%0A"); break;
427       case '\r': APPEND("%0D"); break;
428       case '"':  APPEND("%22"); break;
429       case '\'': APPEND("%27"); break;
430       case '(':  APPEND("%28"); break;
431       case ')':  APPEND("%29"); break;
432       case '*':  APPEND("%2A"); break;
433       case '<':  APPEND("%3C"); break;
434       case '>':  APPEND("%3E"); break;
435       case '\\': APPEND("%5C"); break;
436       default: out->Emit(c); break;
437     }
438   }
439 }
440 CssUrlEscape css_url_escape;
441 
// These URLs replace unsafe URLs for :U and :I url-escaping modes.
// The generic replacement is a bare fragment ("#"); the image-source
// replacement is a path to an image file.
const char* const ValidateUrl::kUnsafeUrlReplacement = "#";
const char* const ValidateUrl::kUnsafeImgSrcUrlReplacement =
    "/images/cleardot.gif";
446 
Modify(const char * in,size_t inlen,const PerExpandData * per_expand_data,ExpandEmitter * out,const string & arg) const447 void ValidateUrl::Modify(const char* in, size_t inlen,
448                          const PerExpandData* per_expand_data,
449                          ExpandEmitter* out, const string& arg) const {
450   const char* slashpos = (char*)memchr(in, '/', inlen);
451   if (slashpos == NULL) {
452     slashpos = in + inlen;
453   }
454   const void* colonpos = memchr(in, ':', slashpos - in);
455   // colon before first slash, could be a protocol
456   if (colonpos != NULL && URL::HasInsecureProtocol(in, inlen)) {
457     // It's a bad protocol, so return something safe
458     chained_modifier_.Modify(unsafe_url_replacement_,
459                              unsafe_url_replacement_length_,
460                              per_expand_data,
461                              out,
462                              "");
463     return;
464   }
465   // If we get here, it's a valid url, so just escape it
466   chained_modifier_.Modify(in, inlen, per_expand_data, out, "");
467 }
// ValidateUrl instances, one per combination of escaping modifier (HTML,
// javascript, CSS url) and replacement URL (generic or image source).
// NOTE(review): presumably the first constructor argument becomes the chained
// modifier and the second the unsafe-URL replacement used by Modify() --
// confirm against the ValidateUrl constructor.
ValidateUrl validate_url_and_html_escape(
    html_escape,
    ValidateUrl::kUnsafeUrlReplacement);
ValidateUrl validate_url_and_javascript_escape(
    javascript_escape,
    ValidateUrl::kUnsafeUrlReplacement);
ValidateUrl validate_url_and_css_escape(
    css_url_escape,
    ValidateUrl::kUnsafeUrlReplacement);
ValidateUrl validate_img_src_url_and_html_escape(
    html_escape,
    ValidateUrl::kUnsafeImgSrcUrlReplacement);
ValidateUrl validate_img_src_url_and_javascript_escape(
    javascript_escape,
    ValidateUrl::kUnsafeImgSrcUrlReplacement);
ValidateUrl validate_img_src_url_and_css_escape(
    css_url_escape,
    ValidateUrl::kUnsafeImgSrcUrlReplacement);
486 
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const487 void XmlEscape::Modify(const char* in, size_t inlen,
488                        const PerExpandData*,
489                        ExpandEmitter* out, const string& arg) const {
490   const char* pos = in;
491   const char* start = pos;
492   const char* const limit = in + inlen;
493   while (pos < limit) {
494     char ch = *pos;
495 
496     // According to section 2.2 of the spec
497     // http://www.w3.org/TR/REC-xml/#charsets control characters in range
498     // 0x00-0x1F (except \t, \r and \n) are not valid XML characters. In
499     // particular, conformant parsers are allowed to die when encountering a FF
500     // char in PCDATA sections. These chars are replaced by a space.
501     if (ch >= 0x00 && ch < 0x20 && ch != '\t' && ch != '\r' && ch != '\n') {
502       EmitRun(start, pos, out);
503       out->Emit(' ');
504       start = ++pos;
505       continue;
506     }
507 
508     switch (ch) {
509       default:
510         // Increment our counter and look at the next character.
511         ++pos;
512         continue;
513 
514       case '&':  EmitRun(start, pos, out); APPEND("&amp;");  break;
515       case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
516       case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
517       case '<':  EmitRun(start, pos, out); APPEND("&lt;");   break;
518       case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;
519     }
520     start = ++pos;
521   }
522   EmitRun(start, pos, out);
523 }
524 XmlEscape xml_escape;
525 
// This table maps initial characters to code lengths.  This could be
// done with a 16-byte table and a shift, but there's a substantial
// performance increase by eliminating the shift.
//
// Each row covers 16 consecutive byte values.  Bytes 0x00-0xBF map to 1,
// 0xC0-0xDF to 2, 0xE0-0xEF to 3, and 0xF0-0xFF to 1 again, so a byte that
// would introduce a longer sequence is treated as a single unit.
static const char kCodeLengths[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
550 
551 // Returns the UTF-8 code-unit starting at start, or the special codepoint
552 // 0xFFFD if the input ends abruptly or is not well-formed UTF-8.
553 // start -- address of the start of the code unit which also receives the
554 //          address past the end of the code unit returned.
555 // end -- exclusive end of the string
UTF8CodeUnit(const char ** start,const char * end)556 static inline uint16 UTF8CodeUnit(const char** start, const char *end) {
557   // Use kCodeLengths table to calculate the length of the code unit
558   // from the first character.
559   unsigned char first_char = static_cast<unsigned char>(**start);
560   size_t code_unit_len = kCodeLengths[first_char];
561   if (code_unit_len == 1) {
562     // Return the current byte as a codepoint.
563     // Either it is a valid single byte codepoint, or it's not part of a valid
564     // UTF-8 sequence, and so has to be handled individually.
565     ++*start;
566     return first_char;
567   }
568   const char *code_unit_end = *start + code_unit_len;
569   if (code_unit_end < *start || code_unit_end > end) {  // Truncated code unit.
570     ++*start;
571     return 0xFFFDU;
572   }
573   const char* pos = *start;
574   uint16 code_unit = *pos & (0xFFU >> code_unit_len);
575   while (--code_unit_len) {
576     uint16 tail_byte = *(++pos);
577     if ((tail_byte & 0xC0U) != 0x80U) {  // Malformed code unit.
578       ++*start;
579       return 0xFFFDU;
580     }
581     code_unit = (code_unit << 6) | (tail_byte & 0x3FU);
582   }
583   *start = code_unit_end;
584   return code_unit;
585 }
586 
587 // A good reference is the ECMA standard (3rd ed), section 7.8.4:
588 // http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const589 void JavascriptEscape::Modify(const char* in, size_t inlen,
590                               const PerExpandData*,
591                               ExpandEmitter* out, const string& arg) const {
592   const char* pos = in;
593   const char* start = pos;
594   const char* const limit = in + inlen;
595 
596   if (limit < in) { return; }
597 
598   while (pos < limit) {
599     const char* next_pos = pos;
600     uint16 code_unit = UTF8CodeUnit(&next_pos, limit);
601 
602     // Test for 16-bit values outside the switch below, because gcc
603     // will emit chained branches rather than a jump table for such a
604     // wide range of values.
605     if (code_unit & 0xFF00) {
606       // Linebreaks according to EcmaScript 262 which cannot appear in strings.
607       if (code_unit == 0x2028) {
608         // Line separator
609         EmitRun(start, pos, out); APPEND("\\u2028");
610       } else if (code_unit == 0x2029) {
611         // Paragraph separator
612         EmitRun(start, pos, out); APPEND("\\u2029");
613       } else {
614         pos = next_pos;
615         continue;
616       }
617     } else {
618       switch (code_unit) {
619         default:
620           // Increment our counter and look at the next character.
621           pos = next_pos;
622           continue;
623 
624         case '\0': EmitRun(start, pos, out); APPEND("\\x00"); break;
625         case '"':  EmitRun(start, pos, out); APPEND("\\x22"); break;
626         case '\'': EmitRun(start, pos, out); APPEND("\\x27"); break;
627         case '\\': EmitRun(start, pos, out); APPEND("\\\\");  break;
628         case '\t': EmitRun(start, pos, out); APPEND("\\t");   break;
629         case '\r': EmitRun(start, pos, out); APPEND("\\r");   break;
630         case '\n': EmitRun(start, pos, out); APPEND("\\n");   break;
631         case '\b': EmitRun(start, pos, out); APPEND("\\b");   break;
632         case '\f': EmitRun(start, pos, out); APPEND("\\f");   break;
633         case '&':  EmitRun(start, pos, out); APPEND("\\x26"); break;
634         case '<':  EmitRun(start, pos, out); APPEND("\\x3c"); break;
635         case '>':  EmitRun(start, pos, out); APPEND("\\x3e"); break;
636         case '=':  EmitRun(start, pos, out); APPEND("\\x3d"); break;
637 
638         case '\v':
639           // Do not escape vertical tabs to "\\v" since it is interpreted as 'v'
640           // by JScript according to section 2.1 of
641           // http://wiki.ecmascript.org/lib/exe/fetch.php?
642           // id=resources%3Aresources&cache=cache&
643           // media=resources:jscriptdeviationsfromes3.pdf
644           EmitRun(start, pos, out); APPEND("\\x0b"); break;
645       }
646     }
647     start = pos = next_pos;
648   }
649   EmitRun(start, pos, out);
650 }
651 JavascriptEscape javascript_escape;
652 
653 
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const654 void JavascriptNumber::Modify(const char* in, size_t inlen,
655                               const PerExpandData*,
656                               ExpandEmitter* out, const string& arg) const {
657   if (inlen == 0)
658     return;
659 
660   if (STR_IS(in, inlen, "true") || STR_IS(in, inlen, "false")) {
661     out->Emit(in, inlen);
662     return;
663   }
664 
665   bool valid = true;
666   if (in[0] == '0' && inlen > 2 && (in[1] == 'x' || in[1] == 'X')) {
667     // There must be at least one hex digit after the 0x for it to be valid.
668     // Hex number. Check that it is of the form 0(x|X)[0-9A-Fa-f]+
669     for (size_t i = 2; i < inlen; i++) {
670       char c = in[i];
671       if (!((c >= 'a' && c <= 'f') ||
672             (c >= 'A' && c <= 'F') ||
673             (c >= '0' && c <= '9'))) {
674         valid = false;
675         break;
676       }
677     }
678   } else {
679     // Must be a base-10 (or octal) number.
680     // Check that it has the form [0-9+-.eE]+
681     for (size_t i = 0; i < inlen; i++) {
682       char c = in[i];
683       if (!((c >= '0' && c <= '9') ||
684             c == '+' || c == '-' || c == '.' ||
685             c == 'e' || c == 'E')) {
686         valid = false;
687         break;
688       }
689     }
690   }
691   if (valid) {
692     out->Emit(in, inlen);   // Number was valid, output it.
693   } else {
694     APPEND("null");         // Number was not valid, output null instead.
695   }
696 }
697 JavascriptNumber javascript_number;
698 
// Returns true if 'c' may appear unescaped in a URL query component.
// Everything not matching [0-9a-zA-Z.,_*/~!()-] is escaped.
static inline bool IsUrlQueryEscapeSafeChar(unsigned char c) {
  // 256-bit set of safe characters: word c >> 5 holds the bit for c at
  // position c & 31.  Only the words for 0x20-0x7F have any bits set.
  static const unsigned long kSafeCharacters[8] = {
    0x00000000UL, 0x03fff702UL, 0x87fffffeUL, 0x47fffffeUL,
    0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL
  };

  // The mask is built from an unsigned long so that selecting bit 31 (used
  // by '_') never shifts into the sign bit of an int.
  return (kSafeCharacters[c >> 5] & (1UL << (c & 31))) != 0;
}
708 
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const709 void UrlQueryEscape::Modify(const char* in, size_t inlen,
710                             const PerExpandData*,
711                             ExpandEmitter* out, const string& arg) const {
712   const char* pos = in;
713   const char* const limit = in + inlen;
714   while (true) {
715     // Peel off any initial runs of safe characters and emit them all
716     // at once.
717     const char* start = pos;
718     while (pos < limit && IsUrlQueryEscapeSafeChar(*pos)) {
719       pos++;
720     }
721     EmitRun(start, pos, out);
722 
723     // Now deal with a single unsafe character.
724     if (pos < limit) {
725       unsigned char c = *pos;
726       if (c == ' ') {
727         out->Emit('+');
728       } else {
729         out->Emit('%');
730         out->Emit(((c>>4) < 10 ? ((c>>4) + '0') : (((c>>4) - 10) + 'A')));
731         out->Emit(((c&0xf) < 10 ? ((c&0xf) + '0') : (((c&0xf) - 10) + 'A')));
732       }
733       pos++;
734     } else {
735       // We're done!
736       break;
737     }
738   }
739 }
740 UrlQueryEscape url_query_escape;
741 
742 // For more information on escaping JSON, see section 2.5 in
743 // http://www.ietf.org/rfc/rfc4627.txt.
744 // Escaping '&', '<', '>' is optional in the JSON proposed RFC
745 // but alleviates concerns with content sniffing if JSON is used
746 // in a context where the browser may attempt to interpret HTML.
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const747 void JsonEscape::Modify(const char* in, size_t inlen,
748                         const PerExpandData*,
749                         ExpandEmitter* out, const string& arg) const {
750   const char* pos = in;
751   const char* start = pos;
752   const char* const limit = in + inlen;
753   while (pos < limit) {
754     switch (*pos) {
755       default:
756         // Increment our counter and look at the next character.
757         ++pos;
758         continue;
759 
760       case '"':  EmitRun(start, pos, out); APPEND("\\\"");    break;
761       case '\\': EmitRun(start, pos, out); APPEND("\\\\");    break;
762       case '/':  EmitRun(start, pos, out); APPEND("\\/");     break;
763       case '\b': EmitRun(start, pos, out); APPEND("\\b");     break;
764       case '\f': EmitRun(start, pos, out); APPEND("\\f");     break;
765       case '\n': EmitRun(start, pos, out); APPEND("\\n");     break;
766       case '\r': EmitRun(start, pos, out); APPEND("\\r");     break;
767       case '\t': EmitRun(start, pos, out); APPEND("\\t");     break;
768       case '&':  EmitRun(start, pos, out); APPEND("\\u0026"); break;
769       case '<':  EmitRun(start, pos, out); APPEND("\\u003C"); break;
770       case '>':  EmitRun(start, pos, out); APPEND("\\u003E"); break;
771     }
772     start = ++pos;
773   }
774   EmitRun(start, pos, out);
775 }
// File-scope instance of the modifier.  Its address is stored in the
// g_modifiers entry for "json_escape".
JsonEscape json_escape;
777 
Modify(const char * in,size_t inlen,const PerExpandData *,ExpandEmitter * out,const string & arg) const778 void PrefixLine::Modify(const char* in, size_t inlen,
779                         const PerExpandData*,
780                         ExpandEmitter* out, const string& arg) const {
781   while (inlen > 0) {
782     const char* nl = (const char*)memchr(in, '\n', inlen);
783     const char* cr = (const char*)memchr(in, '\r', nl ? nl - in : inlen);
784     size_t linelen;
785     if (nl == NULL && cr == NULL) {
786       // We're at the last line
787       out->Emit(in, inlen);
788       break;
789     } else {
790       // One or both of \r and \n is set; point to the first char past
791       // the newline.  Note for \r\n, that's the char after the \n,
792       // otherwise, it's the char past the \r or the \n we see.
793       if ((nl == NULL) != (cr == NULL))     // one is set, the other is NULL
794         linelen = (nl ? nl : cr) + 1 - in;
795       else if (nl == cr + 1 || nl < cr)     // \r\n, or \n comes first
796         linelen = nl + 1 - in;
797       else
798         linelen = cr + 1 - in;
799     }
800     out->Emit(in, linelen);
801     out->Emit(arg);               // a new line, so emit the prefix
802     in += linelen;
803     inlen -= linelen;
804     assert(inlen >= 0);
805   }
806 }
// File-scope instance of the PrefixLine modifier.
PrefixLine prefix_line;
808 
809 
// Must be at least one more than the maximum number of alternative modifiers
// specified in any given element of g_modifiers.
// This is the size of each entry's safe_alt_mods array.  Slots without an
// initializer are zero-initialized, and IsSafeXSSAlternative() stops at the
// first NULL slot, so the extra slot guarantees a terminator.
# define MAX_SAFE_ALTERNATIVES 10  // If the compiler complains, increase it.
813 
814 // Use the empty string if you want a modifier not to have a long-name.
815 // Use '\0' if you want a modifier not to have a short-name.
816 // Note: not all modifiers are in this array:
817 // 1) SnippetEscape: use html_escape_with_arg=snippet to get this
818 // 2) CleanseAttribute: use html_escape_with_arg=attribute to get this
819 // 3) ValidateUrl: use html_escape_with_arg=url to get this
820 //
821 // Some modifiers define other modifiers that are safe replacements
822 // from an XSS perspective. Replacements are not commutative so for
823 // example H=pre considers H=attribute a safe replacement to it
824 // but H=attribute has no safe replacements.
825 // This struct is not pretty but allows the definitions to be
826 // done without the need for a global initialization method.
827 // Be very careful making a change to g_modifiers as modifiers
828 // point to other ones within that same array so elements
829 // may not be re-ordered easily. Also you need to change
830 // the global g_am_dirs correspondingly.
831 //
// Table of the built-in modifiers.  Each entry pairs a ModifierInfo with
// the list of modifiers (pointers back into this same array) that
// IsSafeXSSAlternative() accepts in its place.  Other code in this file
// refers to entries by their numeric index, so the /* N */ markers must
// stay in sync with the actual positions.
static struct ModifierWithAlternatives {
  // The modifier itself: long name, short name, XSS class, implementation.
  ModifierInfo modifier_info;
  // Safe replacements for modifier_info; unused trailing slots are NULL.
  ModifierInfo* safe_alt_mods[MAX_SAFE_ALTERNATIVES];
} g_modifiers[] = {
  /* 0 */ { ModifierInfo("cleanse_css", 'c',
                         XSS_WEB_STANDARD, &cleanse_css),
            {&g_modifiers[16].modifier_info,  // url_escape_with_arg=css
             // img_src_url_escape_with_arg=css
             &g_modifiers[19].modifier_info} },
  /* 1 */ { ModifierInfo("html_escape", 'h',
                         XSS_WEB_STANDARD, &html_escape),
            {&g_modifiers[2].modifier_info,   // html_escape_with_arg=snippet
             &g_modifiers[3].modifier_info,   // html_escape_with_arg=pre
             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
             &g_modifiers[5].modifier_info,   // html_escape_with_arg=url
             &g_modifiers[8].modifier_info,   // pre_escape
             &g_modifiers[9].modifier_info,   // url_query_escape
             &g_modifiers[11].modifier_info,  // url_escape_with_arg=html
             &g_modifiers[12].modifier_info,  // url_escape_with_arg=query
             // img_src_url_escape_with_arg=html
             &g_modifiers[18].modifier_info} },
  /* 2 */ { ModifierInfo("html_escape_with_arg=snippet", 'H',
                         XSS_WEB_STANDARD, &snippet_escape),
            {&g_modifiers[1].modifier_info,   // html_escape
             &g_modifiers[3].modifier_info,   // html_escape_with_arg=pre
             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
             &g_modifiers[8].modifier_info,   // pre_escape
             &g_modifiers[9].modifier_info,   // url_query_escape
             &g_modifiers[12].modifier_info} },  // url_escape_with_arg=query
  /* 3 */ { ModifierInfo("html_escape_with_arg=pre", 'H',
                         XSS_WEB_STANDARD, &pre_escape),
            {&g_modifiers[1].modifier_info,   // html_escape
             &g_modifiers[2].modifier_info,   // html_escape_with_arg=snippet
             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
             &g_modifiers[8].modifier_info,   // pre_escape
             &g_modifiers[9].modifier_info,   // url_query_escape
             &g_modifiers[12].modifier_info} },  // url_escape_with_arg=query
  /* 4 */ { ModifierInfo("html_escape_with_arg=attribute", 'H',
                         XSS_WEB_STANDARD, &cleanse_attribute), {} },
  /* 5 */ { ModifierInfo("html_escape_with_arg=url", 'H',
                         XSS_WEB_STANDARD, &validate_url_and_html_escape),
            // img_src_url_escape_with_arg=html
            {&g_modifiers[18].modifier_info} },
  /* 6 */ { ModifierInfo("javascript_escape", 'j',
                         XSS_WEB_STANDARD, &javascript_escape),
            {&g_modifiers[7].modifier_info,   // json_escape
             &g_modifiers[10].modifier_info,  // url_escape_with_arg=javascript
             // img_src_url_escape_with_arg=javascript
             &g_modifiers[17].modifier_info} },
  /* 7 */ { ModifierInfo("json_escape", 'o', XSS_WEB_STANDARD, &json_escape),
            {&g_modifiers[6].modifier_info} },  // javascript_escape
  /* 8 */ { ModifierInfo("pre_escape", 'p', XSS_WEB_STANDARD, &pre_escape),
            {&g_modifiers[1].modifier_info,     // html_escape
             &g_modifiers[2].modifier_info,     // html_escape_with_arg=snippet
             &g_modifiers[3].modifier_info,     // html_escape_with_arg=pre
             &g_modifiers[4].modifier_info,     // html_escape_with_arg=attr...
             &g_modifiers[9].modifier_info,     // url_query_escape
             &g_modifiers[12].modifier_info} },   // url_escape_with_arg=query
  /* 9 */ { ModifierInfo("url_query_escape", 'u',
                         XSS_WEB_STANDARD, &url_query_escape), {} },
  /* 10 */ { ModifierInfo("url_escape_with_arg=javascript", 'U',
                          XSS_WEB_STANDARD,
                          &validate_url_and_javascript_escape),
             // img_src_url_escape_with_arg=javascript
             {&g_modifiers[17].modifier_info} },
  /* 11 */ { ModifierInfo("url_escape_with_arg=html", 'U',
                          XSS_WEB_STANDARD, &validate_url_and_html_escape),
             // img_src_url_escape_with_arg=html
             {&g_modifiers[18].modifier_info} },
  /* 12 */ { ModifierInfo("url_escape_with_arg=query", 'U',
                          XSS_WEB_STANDARD, &url_query_escape), {} },
  /* 13 */ { ModifierInfo("none", '\0', XSS_SAFE, &null_modifier), {} },
  /* 14 */ { ModifierInfo("xml_escape", '\0', XSS_WEB_STANDARD, &xml_escape),
             {&g_modifiers[1].modifier_info,      // html_escape
              &g_modifiers[4].modifier_info,} },  // H=attribute
  /* 15 */ { ModifierInfo("javascript_escape_with_arg=number", 'J',
                          XSS_WEB_STANDARD, &javascript_number), {} },
  /* 16 */ { ModifierInfo("url_escape_with_arg=css", 'U',
                          XSS_WEB_STANDARD, &validate_url_and_css_escape), {} },
  /* 17 */ { ModifierInfo("img_src_url_escape_with_arg=javascript", 'I',
                          XSS_WEB_STANDARD,
                          &validate_img_src_url_and_javascript_escape), {} },
  /* 18 */ { ModifierInfo("img_src_url_escape_with_arg=html", 'I',
                          XSS_WEB_STANDARD,
                          &validate_img_src_url_and_html_escape), {} },
  /* 19 */ { ModifierInfo("img_src_url_escape_with_arg=css", 'I',
                          XSS_WEB_STANDARD,
                          &validate_img_src_url_and_css_escape), {} },
};
921 
922 static vector<const ModifierInfo*> g_extension_modifiers;
923 static vector<const ModifierInfo*> g_unknown_modifiers;
924 
925 // Returns whether or not candidate can be safely (w.r.t XSS)
926 // used in lieu of our ModifierInfo. This is true iff:
927 //   1. Both have the same modifier function OR
//   2. Candidate's long name matches one of the entries in the
//      safe_alt_mods array of our ModifierInfo's entry in g_modifiers.
930 //
931 // This is used with the auto-escaping code, which automatically
932 // figures out which modifier to apply to a variable based on the
933 // variable's context (in an html "<A HREF", for instance).  Some
934 // built-in modifiers are considered safe alternatives from the perspective
935 // of preventing XSS (cross-site-scripting) attacks, in which case
936 // the auto-escaper should allow the choice of which to use in the
937 // template. This is intended only for internal use as it is dangerous
938 // and complicated to figure out which modifier is an XSS-safe
939 // replacement for a given one. Custom modifiers currently may not
940 // indicate safe replacements, only built-in ones may do so.
941 //
942 // Note that this function is not commutative therefore
943 // IsSafeXSSAlternative(a, b) may not be equal to IsSafeXSSAlternative(b, a).
IsSafeXSSAlternative(const ModifierInfo & our,const ModifierInfo & candidate)944 bool IsSafeXSSAlternative(const ModifierInfo& our,
945                           const ModifierInfo& candidate) {
946   // Succeeds even for non built-in modifiers but no harm.
947   if (our.modifier == candidate.modifier)
948     return true;
949 
950   for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
951        mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
952        ++mod_with_alts) {
953     if (mod_with_alts->modifier_info.long_name == our.long_name)
954       // We found our Modifier in the built-in array g_modifiers.
955       for (int i = 0; mod_with_alts->safe_alt_mods[i] != NULL &&
956                i < MAX_SAFE_ALTERNATIVES; ++i)
957         if (mod_with_alts->safe_alt_mods[i]->long_name == candidate.long_name)
958           // We found candidate in our Modifier's list of safe alternatives.
959           return true;
960   }
961   // our is not built-in or candidate is not a safe replacement to our.
962   return false;
963 }
964 
// Returns true iff long_name starts with "x-", the prefix that marks an
// extension (user-registered) modifier name.
// long_name must either be NUL-terminated or have at least two readable
// bytes.  The comparison short-circuits, so a NUL-terminated string shorter
// than two characters (e.g. "") is never read past its terminator.
static inline bool IsExtensionModifier(const char* long_name) {
  return long_name[0] == 'x' && long_name[1] == '-';
}
968 
AddModifierCommon(const char * long_name,const TemplateModifier * modifier,bool xss_safe)969 static bool AddModifierCommon(const char* long_name,
970                  const TemplateModifier* modifier, bool xss_safe) {
971   if (!IsExtensionModifier(long_name))
972     return false;
973 
974   // TODO(csilvers): store in a map or multimap, rather than a vector
975   for (vector<const ModifierInfo*>::const_iterator mod =
976            g_extension_modifiers.begin();
977        mod != g_extension_modifiers.end();
978        ++mod) {
979     // Check if mod has the same name as us.  For modifiers that also take
980     // values, this is everything before the =.  The only time it's ok to
981     // have the same name is when we have different modval specializations:
982     // "foo=bar" and "foo=baz" are both valid names.  Note "foo" and
983     // "foo=bar" is not valid: foo has no modval, but "foo=bar" does.
984     const size_t new_modifier_namelen = strcspn(long_name, "=");
985     const size_t existing_modifier_namelen = strcspn((*mod)->long_name.c_str(),
986                                                      "=");
987     if (new_modifier_namelen == existing_modifier_namelen &&
988         !memcmp(long_name, (*mod)->long_name.c_str(), new_modifier_namelen)) {
989       if (long_name[new_modifier_namelen] == '=' &&
990           (*mod)->long_name[existing_modifier_namelen] == '=' &&
991           (*mod)->long_name != long_name) {
992         // It's ok, we're different specializations!
993       } else {
994         // It's not ok: we have the same name and no good excuse.
995         return false;
996       }
997     }
998   }
999 
1000   g_extension_modifiers.push_back(
1001       new ModifierInfo(long_name, '\0',
1002                        xss_safe ? XSS_SAFE : XSS_UNIQUE,
1003                        modifier));
1004   return true;
1005 }
1006 
1007 // Modifier added with XSS_UNIQUE XssClass.
AddModifier(const char * long_name,const TemplateModifier * modifier)1008 bool AddModifier(const char* long_name,
1009                  const TemplateModifier* modifier) {
1010   return AddModifierCommon(long_name, modifier, false);
1011 }
1012 
1013 // Modifier added with XSS_SAFE XssClass.
AddXssSafeModifier(const char * long_name,const TemplateModifier * modifier)1014 bool AddXssSafeModifier(const char* long_name,
1015                  const TemplateModifier* modifier) {
1016   return AddModifierCommon(long_name, modifier, true);
1017 }
1018 
1019 // If candidate_match is a better match for modname/modval than bestmatch,
// update bestmatch.  To be a better match, three conditions must be met:
//  1) The candidate's name must match modname
//  2) If the candidate is a specialization (that is, its name is of the
//     form "foo=bar"), then modval matches the specialization value.
1024 //  3) If the candidate is not a specialization, bestmatch isn't a
1025 //     specialization either.
1026 // Condition (3) makes sure that if we match the ModifierInfo with name
1027 // "foo=bar", we don't claim the ModifierInfo "foo=" is a better match.
1028 // Recall that by definition, modval will always start with a '=' if present.
UpdateBestMatch(const char * modname,size_t modname_len,const char * modval,size_t modval_len,const ModifierInfo * candidate_match,const ModifierInfo ** best_match)1029 static void UpdateBestMatch(const char* modname, size_t modname_len,
1030                             const char* modval, size_t modval_len,
1031                             const ModifierInfo* candidate_match,
1032                             const ModifierInfo** best_match) {
1033   // It's easiest to handle the two case differently: (1) candidate_match
1034   // refers to a modifier that expects a modifier-value; (2) it doesn't.
1035   if (candidate_match->modval_required) {
1036     // To be a match, we have to fulfill three requirements: we have a
1037     // modval, our modname matches candidate_match's modname (either
1038     // shortname or longname), and our modval is consistent with the
1039     // value specified in the longname (whatever might follow the =).
1040     const char* const longname_start = candidate_match->long_name.c_str();
1041     const char* const equals = strchr(longname_start, '=');
1042     assert(equals != NULL);
1043     if (modval_len > 0 &&
1044         ((modname_len == 1 && *modname == candidate_match->short_name) ||
1045          (modname_len == equals - longname_start &&
1046           memcmp(modname, longname_start, modname_len) == 0)) &&
1047         ((equals[1] == '\0') ||  // name is "foo=" (not a specialization)
1048          (modval_len
1049           == longname_start + candidate_match->long_name.size() - equals &&
1050           memcmp(modval, equals, modval_len) == 0))) {
1051       // Condition (3) above is satisfied iff our longname is longer than
1052       // best-match's longname (so we prefer "foo=bar" to "foo=").
1053       if (*best_match == NULL ||
1054           candidate_match->long_name.size() > (*best_match)->long_name.size())
1055         *best_match = candidate_match;
1056     }
1057   } else {
1058     // In this case, to be a match: we must *not* have a modval.  Our
1059     // modname still must match modinfo's modname (either short or long).
1060     if (modval_len == 0 &&
1061         ((modname_len == 1 && *modname == candidate_match->short_name) ||
1062          (modname_len == candidate_match->long_name.size() &&
1063           !memcmp(modname, candidate_match->long_name.data(), modname_len)))) {
1064       // In the no-modval case, only one match should exist.
1065       assert(*best_match == NULL);
1066       *best_match = candidate_match;
1067     }
1068   }
1069 }
1070 
FindModifier(const char * modname,size_t modname_len,const char * modval,size_t modval_len)1071 const ModifierInfo* FindModifier(const char* modname, size_t modname_len,
1072                                  const char* modval, size_t modval_len) {
1073   // More than one modifier can match, in the case of modval specializations
1074   // (e.g., the modifier "foo=" and "foo=bar" will both match on input of
1075   // modname="foo", modval="bar").  In that case, we take the ModifierInfo
1076   // with the longest longname, since that's the most specialized match.
1077   const ModifierInfo* best_match = NULL;
1078   if (modname_len >= 2 && IsExtensionModifier(modname)) {
1079     for (vector<const ModifierInfo*>::const_iterator mod =
1080              g_extension_modifiers.begin();
1081          mod != g_extension_modifiers.end();
1082          ++mod) {
1083       UpdateBestMatch(modname, modname_len, modval, modval_len,
1084                       *mod, &best_match);
1085     }
1086     if (best_match != NULL)
1087       return best_match;
1088 
1089     for (vector<const ModifierInfo*>::const_iterator mod =
1090              g_unknown_modifiers.begin();
1091          mod != g_unknown_modifiers.end();
1092          ++mod) {
1093       UpdateBestMatch(modname, modname_len, modval, modval_len,
1094                       *mod, &best_match);
1095     }
1096     if (best_match != NULL)
1097       return best_match;
1098     // This is the only situation where we can pass in a modifier of NULL.
1099     // It means "we don't know about this modifier-name."
1100     string fullname(modname, modname_len);
1101     if (modval_len) {
1102       fullname.append(modval, modval_len);
1103     }
1104     // TODO(csilvers): store in a map or multimap, rather than a vector
1105     g_unknown_modifiers.push_back(new ModifierInfo(fullname, '\0',
1106                                                    XSS_UNIQUE, NULL));
1107     return g_unknown_modifiers.back();
1108   } else {
1109     for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
1110          mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
1111          ++mod_with_alts) {
1112       UpdateBestMatch(modname, modname_len, modval, modval_len,
1113                       &mod_with_alts->modifier_info, &best_match);
1114     }
1115     return best_match;
1116   }
1117 }
1118 
1119 // For escaping variables under the auto-escape mode:
1120 // Each directive below maps to a distinct sequence of
1121 // escaping directives (i.e a vector<ModifierAndValue>) applied
1122 // to a variable during run-time substitution.
// The directives are stored in the global array g_am_dirs, which is
// statically initialized below.
// Index into g_am_dirs.  The enumerator order must match the order of the
// initializers of g_am_dirs.
enum AutoModifyDirective {
  AM_EMPTY,                         // Unused, kept as marker.
  AM_HTML,                          // html_escape
  AM_HTML_UNQUOTED,                 // html_escape_with_arg=attribute
  AM_JS,                            // javascript_escape
  AM_JS_NUMBER,                     // javascript_escape_with_arg=number
  AM_URL_HTML,                      // url_escape_with_arg=html
  AM_URL_QUERY,                     // url_query_escape
  AM_STYLE,                         // cleanse_css
  AM_XML,                           // xml_escape
  NUM_ENTRIES_AM,                   // Count of the above; size of g_am_dirs.
};
1137 
1138 // Populates the global vector of hard-coded modifiers that
1139 // Auto-Escape may pick. We point to the appropriate modifier in
1140 // the global g_modifiers.
1141 // Reference these globals via the global array g_am_dirs[] for consistency.
1142 // Note: We allow for more than one ModifierAndValue in the array hence
1143 // the need to terminate with a Null marker. However currently all the
1144 // escaping directives have exactly one ModifierAndValue.
// Each array below is one escaping directive: a sequence of
// ModifierAndValue entries ended by an entry whose modifier is NULL.
// The numeric g_modifiers indices must match the /* N */ positions in
// that table.  The second and third constructor arguments appear to be the
// modifier value (including its leading '=') and that value's length --
// NOTE(review): confirm against ModifierAndValue's declaration.

// AM_EMPTY: no modifier at all (terminator only).
static const ModifierAndValue g_am_empty[] = {
  ModifierAndValue(NULL, "", 0)
};
// AM_HTML: html_escape.
static const ModifierAndValue g_am_html[] = {
  ModifierAndValue(&g_modifiers[1].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
// AM_HTML_UNQUOTED: html_escape_with_arg=attribute.
static const ModifierAndValue g_am_html_unquoted[] = {
  ModifierAndValue(&g_modifiers[4].modifier_info, "=attribute", 10),
  ModifierAndValue(NULL, "", 0)
};
// AM_JS: javascript_escape.
static const ModifierAndValue g_am_js[] = {
  ModifierAndValue(&g_modifiers[6].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
// AM_JS_NUMBER: javascript_escape_with_arg=number.
static const ModifierAndValue g_am_js_number[] = {
  ModifierAndValue(&g_modifiers[15].modifier_info, "=number", 7),
  ModifierAndValue(NULL, "", 0)
};
// AM_URL_HTML: url_escape_with_arg=html.
static const ModifierAndValue g_am_url_html[] = {
  ModifierAndValue(&g_modifiers[11].modifier_info, "=html", 5),
  ModifierAndValue(NULL, "", 0)
};
// AM_URL_QUERY: url_query_escape.
static const ModifierAndValue g_am_url_query[] = {
  ModifierAndValue(&g_modifiers[9].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
// AM_STYLE: cleanse_css.
static const ModifierAndValue g_am_style[] = {
  ModifierAndValue(&g_modifiers[0].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
// AM_XML: xml_escape.
static const ModifierAndValue g_am_xml[] = {
  ModifierAndValue(&g_modifiers[14].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
1180 
// Maps each AutoModifyDirective to its NULL-terminated directive array.
// Entries must stay in the same order as the enumerators of
// AutoModifyDirective, since the enum value is used as the index.
static const ModifierAndValue* g_am_dirs[NUM_ENTRIES_AM] = {
  g_am_empty,                  /* AM_EMPTY */
  g_am_html,                   /* AM_HTML */
  g_am_html_unquoted,          /* AM_HTML_UNQUOTED */
  g_am_js,                     /* AM_JS */
  g_am_js_number,              /* AM_JS_NUMBER */
  g_am_url_html,               /* AM_URL_HTML */
  g_am_url_query,              /* AM_URL_QUERY */
  g_am_style,                  /* AM_STYLE */
  g_am_xml,                    /* AM_XML */
};
1192 
PrettyPrintOneModifier(const ModifierAndValue & modval)1193 string PrettyPrintOneModifier(const ModifierAndValue& modval) {
1194   string out;
1195   out.append(":");
1196   if (modval.modifier_info->short_name)      // short_name is a char.
1197     out.append(1, modval.modifier_info->short_name);
1198   else
1199     out.append(modval.modifier_info->long_name);
1200   if (modval.value_len != 0)
1201     out.append(modval.value, modval.value_len);
1202   return out;
1203 }
1204 
PrettyPrintModifiers(const vector<const ModifierAndValue * > & modvals,const string & separator)1205 string PrettyPrintModifiers(const vector<const ModifierAndValue*>& modvals,
1206                             const string& separator) {
1207   string out;
1208   for (vector<const ModifierAndValue*>::const_iterator it =
1209            modvals.begin(); it != modvals.end();  ++it) {
1210     if (it != modvals.begin())
1211       out.append(separator);
1212     out.append(PrettyPrintOneModifier(**it));
1213   }
1214   return out;
1215 }
1216 
1217 // Return the sequence of escaping directives to apply for the given context.
1218 // An empty vector indicates an error occurred. Currently we never need
1219 // to chain escaping directives hence on success, the vector is always of
1220 // size 1. This may change in the future.
GetModifierForHtmlJs(HtmlParser * htmlparser,string * error_msg)1221 vector<const ModifierAndValue*> GetModifierForHtmlJs(
1222     HtmlParser* htmlparser, string* error_msg) {
1223   assert(htmlparser);
1224   assert(error_msg);
1225   vector<const ModifierAndValue*> modvals;
1226 
1227   // Two cases of being inside javascript:
1228   // 1. Inside raw javascript (within a <script> tag). If the value
1229   //    is quoted we apply javascript_escape, if not we have to coerce
1230   //    it to a safe value due to the risk of javascript code execution
1231   //    hence apply :J=number. If arbitrary code needs to be inserted
1232   //    at run-time, the developer must use :none.
1233   // 2. In the value of an attribute that takes javascript such
1234   //    as onmouseevent in '<a href="someUrl" onmousevent="{{EVENT}}">'.
1235   //    That will be covered in the STATE_VALUE state logic below.
1236   if (htmlparser->InJavascript() &&
1237       htmlparser->state() != HtmlParser::STATE_VALUE) {
1238     if (htmlparser->IsJavascriptQuoted()) {
1239       modvals.push_back(g_am_dirs[AM_JS]);
1240       assert(modvals.size() == 1);
1241       return modvals;
1242     } else {
1243       modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
1244       assert(modvals.size() == 1);
1245       return modvals;
1246     }
1247   }
1248   switch (htmlparser->state()) {
1249     case HtmlParser::STATE_VALUE:{
1250       string attribute_name = htmlparser->attribute();
1251       switch (htmlparser->AttributeType()) {
1252         case HtmlParser::ATTR_URI:
1253           // Case 1: The URL is quoted:
1254           // . Apply :U=html if it is a complete URL or :h if it is a fragment.
1255           // Case 2: The URL is not quoted:
1256           // .  If it is a complete URL, we have no safe modifiers that
1257           //   won't break it so we have to fail.
1258           // .  If it is a URL fragment, then :u is safe and not likely to
1259           //   break the URL.
1260           if (!htmlparser->IsAttributeQuoted()) {
1261             if (htmlparser->IsUrlStart()) {   // Complete URL.
1262               error_msg->append("Value of URL attribute \"" + attribute_name +
1263                                 "\" must be enclosed in quotes.");
1264               assert(modvals.empty());
1265               return modvals;  // Empty
1266             } else {                                // URL fragment.
1267               modvals.push_back(g_am_dirs[AM_URL_QUERY]);
1268             }
1269           } else {
1270             // Only validate the URL if we have a complete URL,
1271             // otherwise simply html_escape.
1272             if (htmlparser->IsUrlStart())
1273               modvals.push_back(g_am_dirs[AM_URL_HTML]);
1274             else
1275               modvals.push_back(g_am_dirs[AM_HTML]);
1276           }
1277           break;
1278         case HtmlParser::ATTR_REGULAR:
1279           // If the value is quoted, simply HTML escape, otherwise
1280           // apply stricter escaping using H=attribute.
1281           if (htmlparser->IsAttributeQuoted())
1282             modvals.push_back(g_am_dirs[AM_HTML]);
1283           else
1284             modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
1285           break;
1286         case HtmlParser::ATTR_STYLE:
1287           // If the value is quoted apply :c, otherwise fail.
1288           if (htmlparser->IsAttributeQuoted()) {
1289             modvals.push_back(g_am_dirs[AM_STYLE]);
1290           } else {
1291             error_msg->append("Value of style attribute \"" + attribute_name +
1292                               "\" must be enclosed in quotes.");
1293             assert(modvals.empty());
1294             return modvals;   // Empty
1295           }
1296           break;
1297         case HtmlParser::ATTR_JS:
1298           // We require javascript accepting attributes (such as onclick)
1299           // to be HTML quoted, otherwise they are vulnerable to
1300           // HTML attribute insertion via the use of whitespace.
1301           if (!htmlparser->IsAttributeQuoted()) {
1302             error_msg->append("Value of javascript attribute \"" +
1303                               attribute_name +
1304                               "\" must be enclosed in quotes.");
1305             assert(modvals.empty());
1306             return modvals;   // Empty
1307           }
1308           // If the variable is quoted apply javascript_escape otherwise
1309           // apply javascript_number which will ensure it is safe against
1310           // code injection.
1311           // Note: We normally need to HTML escape after javascript escape
1312           // but the javascript escape implementation provided makes the
1313           // HTML escape redundant so simply javascript escape.
1314           if (htmlparser->IsJavascriptQuoted())
1315             modvals.push_back(g_am_dirs[AM_JS]);
1316           else
1317             modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
1318           break;
1319         case HtmlParser::ATTR_NONE:
1320           assert("We should be in attribute!" && 0);
1321         default:
1322           assert("Should not be able to get here." && 0);
1323           return modvals;  // Empty
1324       }
1325       // In STATE_VALUE particularly, the parser may get out of sync with
1326       // the correct state - that the browser sees - due to the fact that
1327       // it does not get to parse run-time content (variables). So we tell
1328       // the parser there is content that will be expanded here.
1329       // A good example is:
1330       //   <a href={{URL}} alt={{NAME}}>
1331       // The parser sees <a href= alt=> and interprets 'alt=' to be
1332       // the value of href.
1333       htmlparser->InsertText();  // Ignore return value.
1334       assert(modvals.size() == 1);
1335       return modvals;
1336     }
1337     case HtmlParser::STATE_TAG:{
1338       // Apply H=attribute to tag names since they are alphabetic.
1339       // Examples of tag names: TITLE, BODY, A and BR.
1340       modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
1341       assert(modvals.size() == 1);
1342       return modvals;
1343     }
1344     case HtmlParser::STATE_ATTR:{
1345       // Apply H=attribute to attribute names since they are alphabetic.
1346       // Examples of attribute names: HREF, SRC and WIDTH.
1347       modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
1348       assert(modvals.size() == 1);
1349       return modvals;
1350     }
1351     case HtmlParser::STATE_COMMENT:
1352     case HtmlParser::STATE_TEXT:{
1353       // Apply :h to regular HTML text and :c if within a style tag.
1354       if (htmlparser->InCss())
1355         modvals.push_back(g_am_dirs[AM_STYLE]);
1356       else
1357         modvals.push_back(g_am_dirs[AM_HTML]);
1358       assert(modvals.size() == 1);
1359       return modvals;
1360     }
1361     default:{
1362       assert("Should not be able to get here." && 0);
1363       return modvals;   // Empty
1364     }
1365   }
1366   assert("Should not be able to get here." && 0);
1367   return modvals;   // Empty
1368 }
1369 
1370 // TODO(jad): Memoize all GetModifierForXXX functions below.
1371 //            They don't depend on parser context (from csilvers).
GetModifierForCss(HtmlParser * htmlparser,string * error_msg)1372 vector<const ModifierAndValue*> GetModifierForCss(HtmlParser* htmlparser,
1373                                                   string* error_msg) {
1374   vector<const ModifierAndValue*> modvals;
1375   modvals.push_back(g_am_dirs[AM_STYLE]);
1376   return modvals;
1377 }
1378 
GetModifierForXml(HtmlParser * htmlparser,string * error_msg)1379 vector<const ModifierAndValue*> GetModifierForXml(HtmlParser* htmlparser,
1380                                                         string* error_msg) {
1381   vector<const ModifierAndValue*> modvals;
1382   modvals.push_back(g_am_dirs[AM_XML]);
1383   return modvals;
1384 }
1385 
GetModifierForJson(HtmlParser * htmlparser,string * error_msg)1386 vector<const ModifierAndValue*> GetModifierForJson(HtmlParser* htmlparser,
1387                                                          string* error_msg) {
1388   vector<const ModifierAndValue*> modvals;
1389   modvals.push_back(g_am_dirs[AM_JS]);
1390   return modvals;
1391 }
1392 
GetDefaultModifierForHtml()1393 vector<const ModifierAndValue*> GetDefaultModifierForHtml() {
1394   vector<const ModifierAndValue*> modvals;
1395   modvals.push_back(g_am_dirs[AM_HTML]);
1396   return modvals;
1397 }
1398 
GetDefaultModifierForJs()1399 vector<const ModifierAndValue*> GetDefaultModifierForJs() {
1400   vector<const ModifierAndValue*> modvals;
1401   modvals.push_back(g_am_dirs[AM_JS]);
1402   return modvals;
1403 }
1404 
GetDefaultModifierForCss()1405 vector<const ModifierAndValue*> GetDefaultModifierForCss() {
1406   return GetModifierForCss(NULL, NULL);
1407 }
1408 
GetDefaultModifierForXml()1409 vector<const ModifierAndValue*> GetDefaultModifierForXml() {
1410   return GetModifierForXml(NULL, NULL);
1411 }
1412 
GetDefaultModifierForJson()1413 vector<const ModifierAndValue*> GetDefaultModifierForJson() {
1414   return GetModifierForJson(NULL, NULL);
1415 }
1416 
1417 }
1418