template_modifiers.cc - OpenGrok cross reference for /dports/textproc/google-ctemplate/ctemplate-ctemplate-2.4/src/template_modifiers.cc

// Copyright (c) 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: csilvers@google.com (Craig Silverstein)
//
// template_modifiers.h has a description of what each escape-routine does.
//
// When creating a new modifier, you must subclass TemplateModifier
// and define your own Modify() method.  This method takes the string
// to be modified as a char*/int pair.  It then emits the modified
// version of the string to outbuf.  Outbuf is an ExpandEmitter, as
// defined in template_modifiers.h.  It's a very simple type that
// supports appending to a data stream.
//
// Be very careful editing an existing modifier.  Subtle changes can
// introduce the possibility for cross-site scripting attacks.  If you
// do change a modifier, be careful that it does not affect
// the list of Safe XSS Alternatives.
//

#include <config.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <string>
#include <vector>
#include "htmlparser/htmlparser_cpp.h"
#include <ctemplate/template_modifiers.h>
#include "template_modifiers_internal.h"
#include <ctemplate/per_expand_data.h>
using std::string;
using std::vector;

#define strliterallen(s)  (sizeof("" s "") - 1)

// Really we should be using uint_16_t or something, but this is good
// enough, and more portable...
typedef unsigned int uint16;

namespace URL {
bool HasInsecureProtocol(const char* in, int inlen) {
  if (inlen > strliterallen("http://") &&
      strncasecmp(in, "http://", strliterallen("http://")) == 0) {
    return false;  // We're ok, it's an http protocol
  }
  if (inlen > strliterallen("https://") &&
      strncasecmp(in, "https://", strliterallen("https://")) == 0) {
    return false;  // https is ok as well
  }
  if (inlen > strliterallen("ftp://") &&
      strncasecmp(in, "ftp://", strliterallen("ftp://")) == 0) {
    return false;  // and ftp
  }
  return true;
}
}  // namespace URL

namespace ctemplate {

using ctemplate_htmlparser::HtmlParser;

// A most-efficient way to append a string literal to the var named 'out'.
// The ""s ensure literal is actually a string literal
#define APPEND(literal)  out->Emit("" literal "", sizeof(literal)-1)

// Check whether the string of length len is identical to the literal.
// The ""s ensure literal is actually a string literal
#define STR_IS(str, len, literal) \
  ((len) == sizeof("" literal "") - 1 && \
   memcmp(str, literal, sizeof("" literal "") - 1) == 0)

TemplateModifier::~TemplateModifier() {}

void NullModifier::Modify(const char* in, size_t inlen,
                          const PerExpandData*,
                          ExpandEmitter* out, const string& arg) const {
  out->Emit(in, inlen);
}
NullModifier null_modifier;

static inline void EmitRun(const char* start, const char* limit,
                           ExpandEmitter* out) {
  if (start < limit) {
    out->Emit(start, (limit - start));
  }
}

void HtmlEscape::Modify(const char* in, size_t inlen,
                        const PerExpandData*,
                        ExpandEmitter* out, const string& arg) const {
  const char* pos = in;
  const char* start = pos;
  const char* const limit = in + inlen;
  while (pos < limit) {
    switch (*pos) {
      default:
        // Increment our counter and look at the next character.
        ++pos;
        continue;

      case '&':  EmitRun(start, pos, out); APPEND("&amp;");  break;
      case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
      case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
      case '<':  EmitRun(start, pos, out); APPEND("&lt;");   break;
      case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;

      case '\r': case '\n': case '\v': case '\f': case '\t':
        EmitRun(start, pos, out); APPEND(" ");      break;
    }
    start = ++pos;
  }
  EmitRun(start, pos, out);
}
HtmlEscape html_escape;

void PreEscape::Modify(const char* in, size_t inlen,
                       const PerExpandData*,
                       ExpandEmitter* out, const string& arg) const {
  const char* pos = in;
  const char* start = pos;
  const char* const limit = in + inlen;
  while (pos < limit)  {
    switch (*pos) {
      default:
        // Increment our counter and look at the next character.
        ++pos;
        continue;

      // Unlike HtmlEscape, we leave whitespace as is.
      case '&':  EmitRun(start, pos, out); APPEND("&amp;");  break;
      case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
      case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
      case '<':  EmitRun(start, pos, out); APPEND("&lt;");   break;
      case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;
    }
    start = ++pos;
  }
  EmitRun(start, pos, out);
}
PreEscape pre_escape;

// We encode the presence and ordering of unclosed tags in a string, using the
// letters b, i, s, and e to stand for <b>, <i>, <span>, and <em> respectively.
// The most recently opened tag is appended onto the end of the string, so in
// the common case of properly nested tags, we need only look at the last
// character.  If we don't find it there, we need to continue looking at
// everything until we find it, because tags may not necessarily be in order.
// Similarly, when we add a tag, we need to check each existing tag for a match
// so that we don't nest.
class UnclosedSnippetTags {
 public:
  // We could use ordinary ints for the enum values, but using mnemonic
  // characters potentially makes debugging easier.
  typedef enum {
    TAG_B = 'b',
    TAG_I = 'i',
    TAG_EM = 'e',
    TAG_SPAN = 's',
  } Tag;

  UnclosedSnippetTags() : tag_length(0) {
    memset(tags, 0, 5);
  }

  // Adds a tag to the set of open tags if it's not already open, or otherwise
  // return false.
  inline bool MaybeAdd(Tag tag) {
    if (strchr(tags, tag)) {
      return false;
    } else {
      tags[tag_length++] = tag;
      return true;
    }
  }

  // Removes a tag from the set of open tags if it's open, or otherwise return
  // false.
  inline bool MaybeRemove(Tag tag) {
    char* tag_location = strchr(tags, tag);
    if (tag_location) {
      for (char* c = tag_location; *c; ++c) {
        // Have to copy all later tags down by one so we don't leave a gap in the
        // array.
        *c = *(c + 1);
      }
      --tag_length;
      return true;
    } else {
      return false;
    }
  }

  inline void PrintClosingTags(ExpandEmitter* out) {
    for (int i = tag_length; i >= 0; --i) {
      switch (tags[i]) {
        case TAG_B:
          out->Emit("</b>"); break;
        case TAG_I:
          out->Emit("</i>"); break;
        case TAG_EM:
          out->Emit("</em>"); break;
        case TAG_SPAN:
          out->Emit("</span>"); break;
      }
    }
  }

 private:
  char tags[5];
  int tag_length;
};

void SnippetEscape::Modify(const char* in, size_t inlen,
                           const PerExpandData*,
                           ExpandEmitter* out, const string& arg) const {
  UnclosedSnippetTags unclosed;
  const char* pos = in;
  const char* start = pos;
  const char* const limit = in + inlen;
  while (pos < limit) {
    switch (*pos) {
      default:
        // Increment our counter and look at the next character.
        ++pos;
        continue;

      case '<': {
        // If there is a permissible tag, just advance pos past it to
        // make it part of the current run.  Notice the use of
        // "continue" below.
        const char* const next_pos = pos + 1;
        const int chars_left = limit - next_pos;
        if ((chars_left >= 2) && !memcmp(next_pos, "b>", 2)
            && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_B)) {
          pos += strliterallen("<b>");
          continue;
        } else if ((chars_left >= 2) && !memcmp(next_pos, "i>", 2)
                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_I)) {
          pos += strliterallen("<i>");
          continue;
        } else if ((chars_left >= 3) && !memcmp(next_pos, "em>", 3)
                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_EM)) {
          pos += strliterallen("<em>");
          continue;
        } else if ((chars_left >= 13) && !memcmp(next_pos, "span dir=", 9)
                   && (!memcmp(next_pos + 9, "ltr>", 4) ||
                       !memcmp(next_pos + 9, "rtl>", 4))
                   && unclosed.MaybeAdd(UnclosedSnippetTags::TAG_SPAN)) {
          pos += strliterallen("<span dir=ltr>");
          continue;
        } else if ((chars_left >= 3) && !memcmp(next_pos, "/b>", 3)
                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_B)) {
          pos += strliterallen("</b>");
          continue;
        } else if ((chars_left >= 3) && !memcmp(next_pos, "/i>", 3)
                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_I)) {
          pos += strliterallen("</i>");
          continue;
        } else if ((chars_left >= 4) && !memcmp(next_pos, "/em>", 4)
                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_EM)) {
          pos += strliterallen("</em>");
          continue;
        } else if ((chars_left >= 6) && !memcmp(next_pos, "/span>", 6)
                   && unclosed.MaybeRemove(UnclosedSnippetTags::TAG_SPAN)) {
          pos += strliterallen("</span>");
          continue;
        } else if ((chars_left >= 3) && !memcmp(next_pos, "br>", 3)) {
          pos += strliterallen("<br>");
          continue;
        } else if ((chars_left >= 4) && !memcmp(next_pos, "wbr>", 4)) {
          pos += strliterallen("<wbr>");
          continue;
        }

        // Emit the entity and break out of the switch.
        EmitRun(start, pos, out);
        APPEND("&lt;");
        break;
      }

      case '&':
        EmitRun(start, pos, out);
        if (pos + 1 < limit && pos[1] == '{') {
          // Could be a javascript entity, so we need to escape.
          // (Javascript entities are an xss risk in Netscape 4.)
          APPEND("&amp;");
        } else {
          APPEND("&");
        }
        break;

      case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
      case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
      case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;

      case '\r': case '\n': case '\v': case '\f': case '\t':
        // non-space whitespace
        EmitRun(start, pos, out); APPEND(" "); break;

    }
    start = ++pos;
  }
  EmitRun(start, pos, out);
  unclosed.PrintClosingTags(out);
}
SnippetEscape snippet_escape;

void CleanseAttribute::Modify(const char* in, size_t inlen,
                              const PerExpandData*,
                              ExpandEmitter* out, const string& arg) const {
  for (size_t i = 0; i < inlen; ++i) {
    char c = in[i];
    switch (c) {
      case '=': {
        if (i == 0 || i == (inlen - 1))
          out->Emit('_');
        else
          out->Emit(c);
        break;
      }
      case '-':
      case '.':
      case '_':
      case ':': {
        out->Emit(c);
        break;
      }
      default: {
        if ((c >= 'a' && c <= 'z') ||
            (c >= 'A' && c <= 'Z') ||
            (c >= '0' && c <= '9')) {
          out->Emit(c);
        } else {
          APPEND("_");
        }
        break;
      }
    }
  }
}
CleanseAttribute cleanse_attribute;

void CleanseCss::Modify(const char* in, size_t inlen,
                        const PerExpandData*,
                        ExpandEmitter* out, const string& arg) const {
  for (size_t i = 0; i < inlen; ++i) {
    char c = in[i];
    switch (c) {
      case ' ':
      case '_':
      case '.':
      case ',':
      case '!':
      case '#':
      case '%':
      case '-': {
        out->Emit(c);
        break;
      }
      default: {
        if ((c >= 'a' && c <= 'z') ||
            (c >= 'A' && c <= 'Z') ||
            (c >= '0' && c <= '9')) {
          out->Emit(c);
        }
        break;
      }
    }
  }
}
CleanseCss cleanse_css;

// CssUrlEscape is used as a chained modifier by ValidateUrl
// (validate_url_and_css_escape) and is not directly exposed.
class CssUrlEscape : public TemplateModifier {
 public:
  virtual void Modify(const char* in, size_t inlen,
                      const PerExpandData*, ExpandEmitter* outbuf,
                      const string& arg) const;
};

// URL-encodes the characters [\n\r\\'"()<>*] to ensure the URL can be safely
// inserted in a CSS context, e.g:
// . In an '@import url("URL");' statement
// . In a CSS property such as 'background: url("URL");'
// In both locations above, enclosing quotes are optional but parens are not.
// We want to make sure the URL cannot exit the parens enclosure, close a
// STYLE tag or reset the browser's CSS parser (via comments or newlines).
//
// References:
// . CSS 2.1 URLs: http://www.w3.org/TR/CSS21/syndata.html#url
// . CSS 1 URLs: http://www.w3.org/TR/REC-CSS1/#url
void CssUrlEscape::Modify(const char* in, size_t inlen,
                          const PerExpandData*,
                          ExpandEmitter* out, const string& arg) const {
  for (size_t i = 0; i < inlen; ++i) {
    char c = in[i];
    switch (c) {
      case '\n': APPEND("%0A"); break;
      case '\r': APPEND("%0D"); break;
      case '"':  APPEND("%22"); break;
      case '\'': APPEND("%27"); break;
      case '(':  APPEND("%28"); break;
      case ')':  APPEND("%29"); break;
      case '*':  APPEND("%2A"); break;
      case '<':  APPEND("%3C"); break;
      case '>':  APPEND("%3E"); break;
      case '\\': APPEND("%5C"); break;
      default: out->Emit(c); break;
    }
  }
}
CssUrlEscape css_url_escape;

// These URLs replace unsafe URLs for :U and :I url-escaping modes.
const char* const ValidateUrl::kUnsafeUrlReplacement = "#";
const char* const ValidateUrl::kUnsafeImgSrcUrlReplacement =
    "/images/cleardot.gif";

void ValidateUrl::Modify(const char* in, size_t inlen,
                         const PerExpandData* per_expand_data,
                         ExpandEmitter* out, const string& arg) const {
  const char* slashpos = (char*)memchr(in, '/', inlen);
  if (slashpos == NULL) {
    slashpos = in + inlen;
  }
  const void* colonpos = memchr(in, ':', slashpos - in);
  // colon before first slash, could be a protocol
  if (colonpos != NULL && URL::HasInsecureProtocol(in, inlen)) {
    // It's a bad protocol, so return something safe
    chained_modifier_.Modify(unsafe_url_replacement_,
                             unsafe_url_replacement_length_,
                             per_expand_data,
                             out,
                             "");
    return;
  }
  // If we get here, it's a valid url, so just escape it
  chained_modifier_.Modify(in, inlen, per_expand_data, out, "");
}
ValidateUrl validate_url_and_html_escape(
    html_escape,
    ValidateUrl::kUnsafeUrlReplacement);
ValidateUrl validate_url_and_javascript_escape(
    javascript_escape,
    ValidateUrl::kUnsafeUrlReplacement);
ValidateUrl validate_url_and_css_escape(
    css_url_escape,
    ValidateUrl::kUnsafeUrlReplacement);
ValidateUrl validate_img_src_url_and_html_escape(
    html_escape,
    ValidateUrl::kUnsafeImgSrcUrlReplacement);
ValidateUrl validate_img_src_url_and_javascript_escape(
    javascript_escape,
    ValidateUrl::kUnsafeImgSrcUrlReplacement);
ValidateUrl validate_img_src_url_and_css_escape(
    css_url_escape,
    ValidateUrl::kUnsafeImgSrcUrlReplacement);

void XmlEscape::Modify(const char* in, size_t inlen,
                       const PerExpandData*,
                       ExpandEmitter* out, const string& arg) const {
  const char* pos = in;
  const char* start = pos;
  const char* const limit = in + inlen;
  while (pos < limit) {
    char ch = *pos;

    // According to section 2.2 of the spec
    // http://www.w3.org/TR/REC-xml/#charsets control characters in range
    // 0x00-0x1F (except \t, \r and \n) are not valid XML characters. In
    // particular, conformant parsers are allowed to die when encountering a FF
    // char in PCDATA sections. These chars are replaced by a space.
    if (ch >= 0x00 && ch < 0x20 && ch != '\t' && ch != '\r' && ch != '\n') {
      EmitRun(start, pos, out);
      out->Emit(' ');
      start = ++pos;
      continue;
    }

    switch (ch) {
      default:
        // Increment our counter and look at the next character.
        ++pos;
        continue;

      case '&':  EmitRun(start, pos, out); APPEND("&amp;");  break;
      case '"':  EmitRun(start, pos, out); APPEND("&quot;"); break;
      case '\'': EmitRun(start, pos, out); APPEND("&#39;");  break;
      case '<':  EmitRun(start, pos, out); APPEND("&lt;");   break;
      case '>':  EmitRun(start, pos, out); APPEND("&gt;");   break;
    }
    start = ++pos;
  }
  EmitRun(start, pos, out);
}
XmlEscape xml_escape;

// This table maps initial characters to code lengths.  This could be
// done with a 16-byte table and a shift, but there's a substantial
// performance increase by eliminating the shift.
static const char kCodeLengths[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};

// Returns the UTF-8 code-unit starting at start, or the special codepoint
// 0xFFFD if the input ends abruptly or is not well-formed UTF-8.
// start -- address of the start of the code unit which also receives the
//          address past the end of the code unit returned.
// end -- exclusive end of the string
static inline uint16 UTF8CodeUnit(const char** start, const char *end) {
  // Use kCodeLengths table to calculate the length of the code unit
  // from the first character.
  unsigned char first_char = static_cast<unsigned char>(**start);
  size_t code_unit_len = kCodeLengths[first_char];
  if (code_unit_len == 1) {
    // Return the current byte as a codepoint.
    // Either it is a valid single byte codepoint, or it's not part of a valid
    // UTF-8 sequence, and so has to be handled individually.
    ++*start;
    return first_char;
  }
  const char *code_unit_end = *start + code_unit_len;
  if (code_unit_end < *start || code_unit_end > end) {  // Truncated code unit.
    ++*start;
    return 0xFFFDU;
  }
  const char* pos = *start;
  uint16 code_unit = *pos & (0xFFU >> code_unit_len);
  while (--code_unit_len) {
    uint16 tail_byte = *(++pos);
    if ((tail_byte & 0xC0U) != 0x80U) {  // Malformed code unit.
      ++*start;
      return 0xFFFDU;
    }
    code_unit = (code_unit << 6) | (tail_byte & 0x3FU);
  }
  *start = code_unit_end;
  return code_unit;
}

// A good reference is the ECMA standard (3rd ed), section 7.8.4:
// http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
void JavascriptEscape::Modify(const char* in, size_t inlen,
                              const PerExpandData*,
                              ExpandEmitter* out, const string& arg) const {
  const char* pos = in;
  const char* start = pos;
  const char* const limit = in + inlen;

  if (limit < in) { return; }

  while (pos < limit) {
    const char* next_pos = pos;
    uint16 code_unit = UTF8CodeUnit(&next_pos, limit);

    // Test for 16-bit values outside the switch below, because gcc
    // will emit chained branches rather than a jump table for such a
    // wide range of values.
    if (code_unit & 0xFF00) {
      // Linebreaks according to EcmaScript 262 which cannot appear in strings.
      if (code_unit == 0x2028) {
        // Line separator
        EmitRun(start, pos, out); APPEND("\\u2028");
      } else if (code_unit == 0x2029) {
        // Paragraph separator
        EmitRun(start, pos, out); APPEND("\\u2029");
      } else {
        pos = next_pos;
        continue;
      }
    } else {
      switch (code_unit) {
        default:
          // Increment our counter and look at the next character.
          pos = next_pos;
          continue;

        case '\0': EmitRun(start, pos, out); APPEND("\\x00"); break;
        case '"':  EmitRun(start, pos, out); APPEND("\\x22"); break;
        case '\'': EmitRun(start, pos, out); APPEND("\\x27"); break;
        case '\\': EmitRun(start, pos, out); APPEND("\\\\");  break;
        case '\t': EmitRun(start, pos, out); APPEND("\\t");   break;
        case '\r': EmitRun(start, pos, out); APPEND("\\r");   break;
        case '\n': EmitRun(start, pos, out); APPEND("\\n");   break;
        case '\b': EmitRun(start, pos, out); APPEND("\\b");   break;
        case '\f': EmitRun(start, pos, out); APPEND("\\f");   break;
        case '&':  EmitRun(start, pos, out); APPEND("\\x26"); break;
        case '<':  EmitRun(start, pos, out); APPEND("\\x3c"); break;
        case '>':  EmitRun(start, pos, out); APPEND("\\x3e"); break;
        case '=':  EmitRun(start, pos, out); APPEND("\\x3d"); break;

        case '\v':
          // Do not escape vertical tabs to "\\v" since it is interpreted as 'v'
          // by JScript according to section 2.1 of
          // http://wiki.ecmascript.org/lib/exe/fetch.php?
          // id=resources%3Aresources&cache=cache&
          // media=resources:jscriptdeviationsfromes3.pdf
          EmitRun(start, pos, out); APPEND("\\x0b"); break;
      }
    }
    start = pos = next_pos;
  }
  EmitRun(start, pos, out);
}
JavascriptEscape javascript_escape;


void JavascriptNumber::Modify(const char* in, size_t inlen,
                              const PerExpandData*,
                              ExpandEmitter* out, const string& arg) const {
  if (inlen == 0)
    return;

  if (STR_IS(in, inlen, "true") || STR_IS(in, inlen, "false")) {
    out->Emit(in, inlen);
    return;
  }

  bool valid = true;
  if (in[0] == '0' && inlen > 2 && (in[1] == 'x' || in[1] == 'X')) {
    // There must be at least one hex digit after the 0x for it to be valid.
    // Hex number. Check that it is of the form 0(x|X)[0-9A-Fa-f]+
    for (size_t i = 2; i < inlen; i++) {
      char c = in[i];
      if (!((c >= 'a' && c <= 'f') ||
            (c >= 'A' && c <= 'F') ||
            (c >= '0' && c <= '9'))) {
        valid = false;
        break;
      }
    }
  } else {
    // Must be a base-10 (or octal) number.
    // Check that it has the form [0-9+-.eE]+
    for (size_t i = 0; i < inlen; i++) {
      char c = in[i];
      if (!((c >= '0' && c <= '9') ||
            c == '+' || c == '-' || c == '.' ||
            c == 'e' || c == 'E')) {
        valid = false;
        break;
      }
    }
  }
  if (valid) {
    out->Emit(in, inlen);   // Number was valid, output it.
  } else {
    APPEND("null");         // Number was not valid, output null instead.
  }
}
JavascriptNumber javascript_number;

static inline bool IsUrlQueryEscapeSafeChar(unsigned char c) {
  // Everything not matching [0-9a-zA-Z.,_*/~!()-] is escaped.
  static unsigned long _safe_characters[8] = {
    0x00000000L, 0x03fff702L, 0x87fffffeL, 0x47fffffeL,
    0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L
  };

  return (_safe_characters[(c)>>5] & (1 << ((c) & 31)));
}

void UrlQueryEscape::Modify(const char* in, size_t inlen,
                            const PerExpandData*,
                            ExpandEmitter* out, const string& arg) const {
  const char* pos = in;
  const char* const limit = in + inlen;
  while (true) {
    // Peel off any initial runs of safe characters and emit them all
    // at once.
    const char* start = pos;
    while (pos < limit && IsUrlQueryEscapeSafeChar(*pos)) {
      pos++;
    }
    EmitRun(start, pos, out);

    // Now deal with a single unsafe character.
    if (pos < limit) {
      unsigned char c = *pos;
      if (c == ' ') {
        out->Emit('+');
      } else {
        out->Emit('%');
        out->Emit(((c>>4) < 10 ? ((c>>4) + '0') : (((c>>4) - 10) + 'A')));
        out->Emit(((c&0xf) < 10 ? ((c&0xf) + '0') : (((c&0xf) - 10) + 'A')));
      }
      pos++;
    } else {
      // We're done!
      break;
    }
  }
}
UrlQueryEscape url_query_escape;

// For more information on escaping JSON, see section 2.5 in
// http://www.ietf.org/rfc/rfc4627.txt.
// Escaping '&', '<', '>' is optional in the JSON proposed RFC
// but alleviates concerns with content sniffing if JSON is used
// in a context where the browser may attempt to interpret HTML.
void JsonEscape::Modify(const char* in, size_t inlen,
                        const PerExpandData*,
                        ExpandEmitter* out, const string& arg) const {
  const char* pos = in;
  const char* start = pos;
  const char* const limit = in + inlen;
  while (pos < limit) {
    switch (*pos) {
      default:
        // Increment our counter and look at the next character.
        ++pos;
        continue;

      case '"':  EmitRun(start, pos, out); APPEND("\\\"");    break;
      case '\\': EmitRun(start, pos, out); APPEND("\\\\");    break;
      case '/':  EmitRun(start, pos, out); APPEND("\\/");     break;
      case '\b': EmitRun(start, pos, out); APPEND("\\b");     break;
      case '\f': EmitRun(start, pos, out); APPEND("\\f");     break;
      case '\n': EmitRun(start, pos, out); APPEND("\\n");     break;
      case '\r': EmitRun(start, pos, out); APPEND("\\r");     break;
      case '\t': EmitRun(start, pos, out); APPEND("\\t");     break;
      case '&':  EmitRun(start, pos, out); APPEND("\\u0026"); break;
      case '<':  EmitRun(start, pos, out); APPEND("\\u003C"); break;
      case '>':  EmitRun(start, pos, out); APPEND("\\u003E"); break;
    }
    start = ++pos;
  }
  EmitRun(start, pos, out);
}
JsonEscape json_escape;

void PrefixLine::Modify(const char* in, size_t inlen,
                        const PerExpandData*,
                        ExpandEmitter* out, const string& arg) const {
  while (inlen > 0) {
    const char* nl = (const char*)memchr(in, '\n', inlen);
    const char* cr = (const char*)memchr(in, '\r', nl ? nl - in : inlen);
    size_t linelen;
    if (nl == NULL && cr == NULL) {
      // We're at the last line
      out->Emit(in, inlen);
      break;
    } else {
      // One or both of \r and \n is set; point to the first char past
      // the newline.  Note for \r\n, that's the char after the \n,
      // otherwise, it's the char past the \r or the \n we see.
      if ((nl == NULL) != (cr == NULL))     // one is set, the other is NULL
        linelen = (nl ? nl : cr) + 1 - in;
      else if (nl == cr + 1 || nl < cr)     // \r\n, or \n comes first
        linelen = nl + 1 - in;
      else
        linelen = cr + 1 - in;
    }
    out->Emit(in, linelen);
    out->Emit(arg);               // a new line, so emit the prefix
    in += linelen;
    inlen -= linelen;
    assert(inlen >= 0);
  }
}
PrefixLine prefix_line;


// Must be at least one more than the maximum number of alternative modifiers
// specified in any given element of g_modifiers.
# define MAX_SAFE_ALTERNATIVES 10  // If the compiler complains, increase it.

// Use the empty string if you want a modifier not to have a long-name.
// Use '\0' if you want a modifier not to have a short-name.
// Note: not all modifiers are in this array:
// 1) SnippetEscape: use html_escape_with_arg=snippet to get this
// 2) CleanseAttribute: use html_escape_with_arg=attribute to get this
// 3) ValidateUrl: use html_escape_with_arg=url to get this
//
// Some modifiers define other modifiers that are safe replacements
// from an XSS perspective. Replacements are not commutative so for
// example H=pre considers H=attribute a safe replacement to it
// but H=attribute has no safe replacements.
// This struct is not pretty but allows the definitions to be
// done without the need for a global initialization method.
// Be very careful making a change to g_modifiers as modifiers
// point to other ones within that same array so elements
// may not be re-ordered easily. Also you need to change
// the global g_am_dirs correspondingly.
//
static struct ModifierWithAlternatives {
  ModifierInfo modifier_info;
  ModifierInfo* safe_alt_mods[MAX_SAFE_ALTERNATIVES];
} g_modifiers[] = {
  /* 0 */ { ModifierInfo("cleanse_css", 'c',
                         XSS_WEB_STANDARD, &cleanse_css),
            {&g_modifiers[16].modifier_info,  // url_escape_with_arg=css
             // img_src_url_escape_with_arg=css
             &g_modifiers[19].modifier_info} },
  /* 1 */ { ModifierInfo("html_escape", 'h',
                         XSS_WEB_STANDARD, &html_escape),
            {&g_modifiers[2].modifier_info,   // html_escape_with_arg=snippet
             &g_modifiers[3].modifier_info,   // html_escape_with_arg=pre
             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
             &g_modifiers[5].modifier_info,   // html_escape_with_arg=url
             &g_modifiers[8].modifier_info,   // pre_escape
             &g_modifiers[9].modifier_info,   // url_query_escape
             &g_modifiers[11].modifier_info,  // url_escape_with_arg=html
             &g_modifiers[12].modifier_info,  // url_escape_with_arg=query
             // img_src_url_escape_with_arg=html
             &g_modifiers[18].modifier_info} },
  /* 2 */ { ModifierInfo("html_escape_with_arg=snippet", 'H',
                         XSS_WEB_STANDARD, &snippet_escape),
            {&g_modifiers[1].modifier_info,   // html_escape
             &g_modifiers[3].modifier_info,   // html_escape_with_arg=pre
             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
             &g_modifiers[8].modifier_info,   // pre_escape
             &g_modifiers[9].modifier_info,   // url_query_escape
             &g_modifiers[12].modifier_info} },  // url_escape_with_arg=query
  /* 3 */ { ModifierInfo("html_escape_with_arg=pre", 'H',
                         XSS_WEB_STANDARD, &pre_escape),
            {&g_modifiers[1].modifier_info,   // html_escape
             &g_modifiers[2].modifier_info,   // html_escape_with_arg=snippet
             &g_modifiers[4].modifier_info,   // html_escape_with_arg=attribute
             &g_modifiers[8].modifier_info,   // pre_escape
             &g_modifiers[9].modifier_info,   // url_query_escape
             &g_modifiers[12].modifier_info} },  // url_escape_with_arg=query
  /* 4 */ { ModifierInfo("html_escape_with_arg=attribute", 'H',
                         XSS_WEB_STANDARD, &cleanse_attribute), {} },
  /* 5 */ { ModifierInfo("html_escape_with_arg=url", 'H',
                         XSS_WEB_STANDARD, &validate_url_and_html_escape),
            // img_src_url_escape_with_arg=html
            {&g_modifiers[18].modifier_info} },
  /* 6 */ { ModifierInfo("javascript_escape", 'j',
                         XSS_WEB_STANDARD, &javascript_escape),
            {&g_modifiers[7].modifier_info,   // json_escape
             &g_modifiers[10].modifier_info,  // url_escape_with_arg=javascript
             // img_src_url_escape_with_arg=javascript
             &g_modifiers[17].modifier_info} },
  /* 7 */ { ModifierInfo("json_escape", 'o', XSS_WEB_STANDARD, &json_escape),
            {&g_modifiers[6].modifier_info} },  // javascript_escape
  /* 8 */ { ModifierInfo("pre_escape", 'p', XSS_WEB_STANDARD, &pre_escape),
            {&g_modifiers[1].modifier_info,     // html_escape
             &g_modifiers[2].modifier_info,     // html_escape_with_arg=snippet
             &g_modifiers[3].modifier_info,     // html_escape_with_arg=pre
             &g_modifiers[4].modifier_info,     // html_escape_with_arg=attr...
             &g_modifiers[9].modifier_info,     // url_query_escape
             &g_modifiers[12].modifier_info} },   // url_escape_with_arg=query
  /* 9 */ { ModifierInfo("url_query_escape", 'u',
                         XSS_WEB_STANDARD, &url_query_escape), {} },
  /* 10 */ { ModifierInfo("url_escape_with_arg=javascript", 'U',
                          XSS_WEB_STANDARD,
                          &validate_url_and_javascript_escape),
             // img_src_url_escape_with_arg=javascript
             {&g_modifiers[17].modifier_info} },
  /* 11 */ { ModifierInfo("url_escape_with_arg=html", 'U',
                          XSS_WEB_STANDARD, &validate_url_and_html_escape),
             // img_src_url_escape_with_arg=html
             {&g_modifiers[18].modifier_info} },
  /* 12 */ { ModifierInfo("url_escape_with_arg=query", 'U',
                          XSS_WEB_STANDARD, &url_query_escape), {} },
  /* 13 */ { ModifierInfo("none", '\0', XSS_SAFE, &null_modifier), {} },
  /* 14 */ { ModifierInfo("xml_escape", '\0', XSS_WEB_STANDARD, &xml_escape),
             {&g_modifiers[1].modifier_info,      // html_escape
              &g_modifiers[4].modifier_info,} },  // H=attribute
  /* 15 */ { ModifierInfo("javascript_escape_with_arg=number", 'J',
                          XSS_WEB_STANDARD, &javascript_number), {} },
  /* 16 */ { ModifierInfo("url_escape_with_arg=css", 'U',
                          XSS_WEB_STANDARD, &validate_url_and_css_escape), {} },
  /* 17 */ { ModifierInfo("img_src_url_escape_with_arg=javascript", 'I',
                          XSS_WEB_STANDARD,
                          &validate_img_src_url_and_javascript_escape), {} },
  /* 18 */ { ModifierInfo("img_src_url_escape_with_arg=html", 'I',
                          XSS_WEB_STANDARD,
                          &validate_img_src_url_and_html_escape), {} },
  /* 19 */ { ModifierInfo("img_src_url_escape_with_arg=css", 'I',
                          XSS_WEB_STANDARD,
                          &validate_img_src_url_and_css_escape), {} },
};

static vector<const ModifierInfo*> g_extension_modifiers;
static vector<const ModifierInfo*> g_unknown_modifiers;

// Returns whether or not candidate can be safely (w.r.t XSS)
// used in lieu of our ModifierInfo. This is true iff:
//   1. Both have the same modifier function OR
//   2. Candidate's modifier function is in our ModifierInfo's
//      list (vector) of safe alternative modifier functions.
//
// This is used with the auto-escaping code, which automatically
// figures out which modifier to apply to a variable based on the
// variable's context (in an html "<A HREF", for instance).  Some
// built-in modifiers are considered safe alternatives from the perspective
// of preventing XSS (cross-site-scripting) attacks, in which case
// the auto-escaper should allow the choice of which to use in the
// template. This is intended only for internal use as it is dangerous
// and complicated to figure out which modifier is an XSS-safe
// replacement for a given one. Custom modifiers currently may not
// indicate safe replacements, only built-in ones may do so.
//
// Note that this function is not commutative therefore
// IsSafeXSSAlternative(a, b) may not be equal to IsSafeXSSAlternative(b, a).
bool IsSafeXSSAlternative(const ModifierInfo& our,
                          const ModifierInfo& candidate) {
  // Succeeds even for non built-in modifiers but no harm.
  if (our.modifier == candidate.modifier)
    return true;

  for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
       mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
       ++mod_with_alts) {
    if (mod_with_alts->modifier_info.long_name == our.long_name)
      // We found our Modifier in the built-in array g_modifiers.
      for (int i = 0; mod_with_alts->safe_alt_mods[i] != NULL &&
               i < MAX_SAFE_ALTERNATIVES; ++i)
        if (mod_with_alts->safe_alt_mods[i]->long_name == candidate.long_name)
          // We found candidate in our Modifier's list of safe alternatives.
          return true;
  }
  // our is not built-in or candidate is not a safe replacement to our.
  return false;
}

static inline bool IsExtensionModifier(const char* long_name) {
  return memcmp(long_name, "x-", 2) == 0;
}

static bool AddModifierCommon(const char* long_name,
                 const TemplateModifier* modifier, bool xss_safe) {
  if (!IsExtensionModifier(long_name))
    return false;

  // TODO(csilvers): store in a map or multimap, rather than a vector
  for (vector<const ModifierInfo*>::const_iterator mod =
           g_extension_modifiers.begin();
       mod != g_extension_modifiers.end();
       ++mod) {
    // Check if mod has the same name as us.  For modifiers that also take
    // values, this is everything before the =.  The only time it's ok to
    // have the same name is when we have different modval specializations:
    // "foo=bar" and "foo=baz" are both valid names.  Note "foo" and
    // "foo=bar" is not valid: foo has no modval, but "foo=bar" does.
    const size_t new_modifier_namelen = strcspn(long_name, "=");
    const size_t existing_modifier_namelen = strcspn((*mod)->long_name.c_str(),
                                                     "=");
    if (new_modifier_namelen == existing_modifier_namelen &&
        !memcmp(long_name, (*mod)->long_name.c_str(), new_modifier_namelen)) {
      if (long_name[new_modifier_namelen] == '=' &&
          (*mod)->long_name[existing_modifier_namelen] == '=' &&
          (*mod)->long_name != long_name) {
        // It's ok, we're different specializations!
      } else {
        // It's not ok: we have the same name and no good excuse.
        return false;
      }
    }
  }

  g_extension_modifiers.push_back(
      new ModifierInfo(long_name, '\0',
                       xss_safe ? XSS_SAFE : XSS_UNIQUE,
                       modifier));
  return true;
}

// Modifier added with XSS_UNIQUE XssClass.
bool AddModifier(const char* long_name,
                 const TemplateModifier* modifier) {
  return AddModifierCommon(long_name, modifier, false);
}

// Modifier added with XSS_SAFE XssClass.
bool AddXssSafeModifier(const char* long_name,
                 const TemplateModifier* modifier) {
  return AddModifierCommon(long_name, modifier, true);
}

// If candidate_match is a better match for modname/modval than bestmatch,
// update bestmatch.  To be a better match, two conditions must be met:
//  1) The candidate's name must match modname
//  2) If the candidate is a specialization (that is, name is of the form
//     "foo=bar", then modval matches the specialization value).
//  3) If the candidate is not a specialization, bestmatch isn't a
//     specialization either.
// Condition (3) makes sure that if we match the ModifierInfo with name
// "foo=bar", we don't claim the ModifierInfo "foo=" is a better match.
// Recall that by definition, modval will always start with a '=' if present.
static void UpdateBestMatch(const char* modname, size_t modname_len,
                            const char* modval, size_t modval_len,
                            const ModifierInfo* candidate_match,
                            const ModifierInfo** best_match) {
  // It's easiest to handle the two case differently: (1) candidate_match
  // refers to a modifier that expects a modifier-value; (2) it doesn't.
  if (candidate_match->modval_required) {
    // To be a match, we have to fulfill three requirements: we have a
    // modval, our modname matches candidate_match's modname (either
    // shortname or longname), and our modval is consistent with the
    // value specified in the longname (whatever might follow the =).
    const char* const longname_start = candidate_match->long_name.c_str();
    const char* const equals = strchr(longname_start, '=');
    assert(equals != NULL);
    if (modval_len > 0 &&
        ((modname_len == 1 && *modname == candidate_match->short_name) ||
         (modname_len == equals - longname_start &&
          memcmp(modname, longname_start, modname_len) == 0)) &&
        ((equals[1] == '\0') ||  // name is "foo=" (not a specialization)
         (modval_len
          == longname_start + candidate_match->long_name.size() - equals &&
          memcmp(modval, equals, modval_len) == 0))) {
      // Condition (3) above is satisfied iff our longname is longer than
      // best-match's longname (so we prefer "foo=bar" to "foo=").
      if (*best_match == NULL ||
          candidate_match->long_name.size() > (*best_match)->long_name.size())
        *best_match = candidate_match;
    }
  } else {
    // In this case, to be a match: we must *not* have a modval.  Our
    // modname still must match modinfo's modname (either short or long).
    if (modval_len == 0 &&
        ((modname_len == 1 && *modname == candidate_match->short_name) ||
         (modname_len == candidate_match->long_name.size() &&
          !memcmp(modname, candidate_match->long_name.data(), modname_len)))) {
      // In the no-modval case, only one match should exist.
      assert(*best_match == NULL);
      *best_match = candidate_match;
    }
  }
}

const ModifierInfo* FindModifier(const char* modname, size_t modname_len,
                                 const char* modval, size_t modval_len) {
  // More than one modifier can match, in the case of modval specializations
  // (e.g., the modifier "foo=" and "foo=bar" will both match on input of
  // modname="foo", modval="bar").  In that case, we take the ModifierInfo
  // with the longest longname, since that's the most specialized match.
  const ModifierInfo* best_match = NULL;
  if (modname_len >= 2 && IsExtensionModifier(modname)) {
    for (vector<const ModifierInfo*>::const_iterator mod =
             g_extension_modifiers.begin();
         mod != g_extension_modifiers.end();
         ++mod) {
      UpdateBestMatch(modname, modname_len, modval, modval_len,
                      *mod, &best_match);
    }
    if (best_match != NULL)
      return best_match;

    for (vector<const ModifierInfo*>::const_iterator mod =
             g_unknown_modifiers.begin();
         mod != g_unknown_modifiers.end();
         ++mod) {
      UpdateBestMatch(modname, modname_len, modval, modval_len,
                      *mod, &best_match);
    }
    if (best_match != NULL)
      return best_match;
    // This is the only situation where we can pass in a modifier of NULL.
    // It means "we don't know about this modifier-name."
    string fullname(modname, modname_len);
    if (modval_len) {
      fullname.append(modval, modval_len);
    }
    // TODO(csilvers): store in a map or multimap, rather than a vector
    g_unknown_modifiers.push_back(new ModifierInfo(fullname, '\0',
                                                   XSS_UNIQUE, NULL));
    return g_unknown_modifiers.back();
  } else {
    for (const ModifierWithAlternatives* mod_with_alts = g_modifiers;
         mod_with_alts < g_modifiers + sizeof(g_modifiers)/sizeof(*g_modifiers);
         ++mod_with_alts) {
      UpdateBestMatch(modname, modname_len, modval, modval_len,
                      &mod_with_alts->modifier_info, &best_match);
    }
    return best_match;
  }
}

// For escaping variables under the auto-escape mode:
// Each directive below maps to a distinct sequence of
// escaping directives (i.e a vector<ModifierAndValue>) applied
// to a variable during run-time substitution.
// The directives are stored in a global array (g_mods_ae)
// initialized under lock in InitializeGlobalModifiers.
enum AutoModifyDirective {
  AM_EMPTY,                         // Unused, kept as marker.
  AM_HTML,
  AM_HTML_UNQUOTED,
  AM_JS,
  AM_JS_NUMBER,
  AM_URL_HTML,
  AM_URL_QUERY,
  AM_STYLE,
  AM_XML,
  NUM_ENTRIES_AM,
};

// Populates the global vector of hard-coded modifiers that
// Auto-Escape may pick. We point to the appropriate modifier in
// the global g_modifiers.
// Reference these globals via the global array g_am_dirs[] for consistency.
// Note: We allow for more than one ModifierAndValue in the array hence
// the need to terminate with a Null marker. However currently all the
// escaping directives have exactly one ModifierAndValue.
static const ModifierAndValue g_am_empty[] = {
  ModifierAndValue(NULL, "", 0)
};
static const ModifierAndValue g_am_html[] = {
  ModifierAndValue(&g_modifiers[1].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
static const ModifierAndValue g_am_html_unquoted[] = {
  ModifierAndValue(&g_modifiers[4].modifier_info, "=attribute", 10),
  ModifierAndValue(NULL, "", 0)
};
static const ModifierAndValue g_am_js[] = {
  ModifierAndValue(&g_modifiers[6].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
static const ModifierAndValue g_am_js_number[] = {
  ModifierAndValue(&g_modifiers[15].modifier_info, "=number", 7),
  ModifierAndValue(NULL, "", 0)
};
static const ModifierAndValue g_am_url_html[] = {
  ModifierAndValue(&g_modifiers[11].modifier_info, "=html", 5),
  ModifierAndValue(NULL, "", 0)
};
static const ModifierAndValue g_am_url_query[] = {
  ModifierAndValue(&g_modifiers[9].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
static const ModifierAndValue g_am_style[] = {
  ModifierAndValue(&g_modifiers[0].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};
static const ModifierAndValue g_am_xml[] = {
  ModifierAndValue(&g_modifiers[14].modifier_info, "", 0),
  ModifierAndValue(NULL, "", 0)
};

static const ModifierAndValue* g_am_dirs[NUM_ENTRIES_AM] = {
  g_am_empty,                  /* AM_EMPTY */
  g_am_html,                   /* AM_HTML */
  g_am_html_unquoted,          /* AM_HTML_UNQUOTED */
  g_am_js,                     /* AM_JS */
  g_am_js_number,              /* AM_JS_NUMBER */
  g_am_url_html,               /* AM_URL_HTML */
  g_am_url_query,              /* AM_URL_QUERY */
  g_am_style,                  /* AM_STYLE */
  g_am_xml,                    /* AM_XML */
};

string PrettyPrintOneModifier(const ModifierAndValue& modval) {
  string out;
  out.append(":");
  if (modval.modifier_info->short_name)      // short_name is a char.
    out.append(1, modval.modifier_info->short_name);
  else
    out.append(modval.modifier_info->long_name);
  if (modval.value_len != 0)
    out.append(modval.value, modval.value_len);
  return out;
}

string PrettyPrintModifiers(const vector<const ModifierAndValue*>& modvals,
                            const string& separator) {
  string out;
  for (vector<const ModifierAndValue*>::const_iterator it =
           modvals.begin(); it != modvals.end();  ++it) {
    if (it != modvals.begin())
      out.append(separator);
    out.append(PrettyPrintOneModifier(**it));
  }
  return out;
}

// Return the sequence of escaping directives to apply for the given context.
// An empty vector indicates an error occurred. Currently we never need
// to chain escaping directives hence on success, the vector is always of
// size 1. This may change in the future.
vector<const ModifierAndValue*> GetModifierForHtmlJs(
    HtmlParser* htmlparser, string* error_msg) {
  assert(htmlparser);
  assert(error_msg);
  vector<const ModifierAndValue*> modvals;

  // Two cases of being inside javascript:
  // 1. Inside raw javascript (within a <script> tag). If the value
  //    is quoted we apply javascript_escape, if not we have to coerce
  //    it to a safe value due to the risk of javascript code execution
  //    hence apply :J=number. If arbitrary code needs to be inserted
  //    at run-time, the developer must use :none.
  // 2. In the value of an attribute that takes javascript such
  //    as onmouseevent in '<a href="someUrl" onmousevent="{{EVENT}}">'.
  //    That will be covered in the STATE_VALUE state logic below.
  if (htmlparser->InJavascript() &&
      htmlparser->state() != HtmlParser::STATE_VALUE) {
    if (htmlparser->IsJavascriptQuoted()) {
      modvals.push_back(g_am_dirs[AM_JS]);
      assert(modvals.size() == 1);
      return modvals;
    } else {
      modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
      assert(modvals.size() == 1);
      return modvals;
    }
  }
  switch (htmlparser->state()) {
    case HtmlParser::STATE_VALUE:{
      string attribute_name = htmlparser->attribute();
      switch (htmlparser->AttributeType()) {
        case HtmlParser::ATTR_URI:
          // Case 1: The URL is quoted:
          // . Apply :U=html if it is a complete URL or :h if it is a fragment.
          // Case 2: The URL is not quoted:
          // .  If it is a complete URL, we have no safe modifiers that
          //   won't break it so we have to fail.
          // .  If it is a URL fragment, then :u is safe and not likely to
          //   break the URL.
          if (!htmlparser->IsAttributeQuoted()) {
            if (htmlparser->IsUrlStart()) {   // Complete URL.
              error_msg->append("Value of URL attribute \"" + attribute_name +
                                "\" must be enclosed in quotes.");
              assert(modvals.empty());
              return modvals;  // Empty
            } else {                                // URL fragment.
              modvals.push_back(g_am_dirs[AM_URL_QUERY]);
            }
          } else {
            // Only validate the URL if we have a complete URL,
            // otherwise simply html_escape.
            if (htmlparser->IsUrlStart())
              modvals.push_back(g_am_dirs[AM_URL_HTML]);
            else
              modvals.push_back(g_am_dirs[AM_HTML]);
          }
          break;
        case HtmlParser::ATTR_REGULAR:
          // If the value is quoted, simply HTML escape, otherwise
          // apply stricter escaping using H=attribute.
          if (htmlparser->IsAttributeQuoted())
            modvals.push_back(g_am_dirs[AM_HTML]);
          else
            modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
          break;
        case HtmlParser::ATTR_STYLE:
          // If the value is quoted apply :c, otherwise fail.
          if (htmlparser->IsAttributeQuoted()) {
            modvals.push_back(g_am_dirs[AM_STYLE]);
          } else {
            error_msg->append("Value of style attribute \"" + attribute_name +
                              "\" must be enclosed in quotes.");
            assert(modvals.empty());
            return modvals;   // Empty
          }
          break;
        case HtmlParser::ATTR_JS:
          // We require javascript accepting attributes (such as onclick)
          // to be HTML quoted, otherwise they are vulnerable to
          // HTML attribute insertion via the use of whitespace.
          if (!htmlparser->IsAttributeQuoted()) {
            error_msg->append("Value of javascript attribute \"" +
                              attribute_name +
                              "\" must be enclosed in quotes.");
            assert(modvals.empty());
            return modvals;   // Empty
          }
          // If the variable is quoted apply javascript_escape otherwise
          // apply javascript_number which will ensure it is safe against
          // code injection.
          // Note: We normally need to HTML escape after javascript escape
          // but the javascript escape implementation provided makes the
          // HTML escape redundant so simply javascript escape.
          if (htmlparser->IsJavascriptQuoted())
            modvals.push_back(g_am_dirs[AM_JS]);
          else
            modvals.push_back(g_am_dirs[AM_JS_NUMBER]);
          break;
        case HtmlParser::ATTR_NONE:
          assert("We should be in attribute!" && 0);
        default:
          assert("Should not be able to get here." && 0);
          return modvals;  // Empty
      }
      // In STATE_VALUE particularly, the parser may get out of sync with
      // the correct state - that the browser sees - due to the fact that
      // it does not get to parse run-time content (variables). So we tell
      // the parser there is content that will be expanded here.
      // A good example is:
      //   <a href={{URL}} alt={{NAME}}>
      // The parser sees <a href= alt=> and interprets 'alt=' to be
      // the value of href.
      htmlparser->InsertText();  // Ignore return value.
      assert(modvals.size() == 1);
      return modvals;
    }
    case HtmlParser::STATE_TAG:{
      // Apply H=attribute to tag names since they are alphabetic.
      // Examples of tag names: TITLE, BODY, A and BR.
      modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
      assert(modvals.size() == 1);
      return modvals;
    }
    case HtmlParser::STATE_ATTR:{
      // Apply H=attribute to attribute names since they are alphabetic.
      // Examples of attribute names: HREF, SRC and WIDTH.
      modvals.push_back(g_am_dirs[AM_HTML_UNQUOTED]);
      assert(modvals.size() == 1);
      return modvals;
    }
    case HtmlParser::STATE_COMMENT:
    case HtmlParser::STATE_TEXT:{
      // Apply :h to regular HTML text and :c if within a style tag.
      if (htmlparser->InCss())
        modvals.push_back(g_am_dirs[AM_STYLE]);
      else
        modvals.push_back(g_am_dirs[AM_HTML]);
      assert(modvals.size() == 1);
      return modvals;
    }
    default:{
      assert("Should not be able to get here." && 0);
      return modvals;   // Empty
    }
  }
  assert("Should not be able to get here." && 0);
  return modvals;   // Empty
}

// TODO(jad): Memoize all GetModifierForXXX functions below.
//            They don't depend on parser context (from csilvers).
vector<const ModifierAndValue*> GetModifierForCss(HtmlParser* htmlparser,
                                                  string* error_msg) {
  vector<const ModifierAndValue*> modvals;
  modvals.push_back(g_am_dirs[AM_STYLE]);
  return modvals;
}

vector<const ModifierAndValue*> GetModifierForXml(HtmlParser* htmlparser,
                                                        string* error_msg) {
  vector<const ModifierAndValue*> modvals;
  modvals.push_back(g_am_dirs[AM_XML]);
  return modvals;
}

vector<const ModifierAndValue*> GetModifierForJson(HtmlParser* htmlparser,
                                                         string* error_msg) {
  vector<const ModifierAndValue*> modvals;
  modvals.push_back(g_am_dirs[AM_JS]);
  return modvals;
}

vector<const ModifierAndValue*> GetDefaultModifierForHtml() {
  vector<const ModifierAndValue*> modvals;
  modvals.push_back(g_am_dirs[AM_HTML]);
  return modvals;
}

vector<const ModifierAndValue*> GetDefaultModifierForJs() {
  vector<const ModifierAndValue*> modvals;
  modvals.push_back(g_am_dirs[AM_JS]);
  return modvals;
}

vector<const ModifierAndValue*> GetDefaultModifierForCss() {
  return GetModifierForCss(NULL, NULL);
}

vector<const ModifierAndValue*> GetDefaultModifierForXml() {
  return GetModifierForXml(NULL, NULL);
}

vector<const ModifierAndValue*> GetDefaultModifierForJson() {
  return GetModifierForJson(NULL, NULL);
}

}