1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4 
5 using System.ComponentModel;
6 using System.Diagnostics;
7 using System.Runtime.CompilerServices;
8 using System.Text.Internal;
9 using System.Text.Unicode;
10 
11 namespace System.Text.Encodings.Web
12 {
/// <summary>
/// Base type for text encoders that produce HTML-encoded output.
/// </summary>
public abstract class HtmlEncoder : TextEncoder
{
    /// <summary>
    /// Gets the shared, built-in <see cref="HtmlEncoder"/> instance.
    /// </summary>
    public static HtmlEncoder Default => DefaultHtmlEncoder.Singleton;

    /// <summary>
    /// Creates a new <see cref="HtmlEncoder"/> configured by the supplied settings.
    /// </summary>
    /// <param name="settings">Settings that control how the created <see cref="HtmlEncoder"/> encodes, primarily which characters to encode.</param>
    /// <returns>A new instance of the <see cref="HtmlEncoder"/>.</returns>
    public static HtmlEncoder Create(TextEncoderSettings settings) => new DefaultHtmlEncoder(settings);

    /// <summary>
    /// Creates a new <see cref="HtmlEncoder"/> from the set of characters it may leave unencoded.
    /// </summary>
    /// <param name="allowedRanges">Set of characters that the encoder is allowed to not encode.</param>
    /// <returns>A new instance of the <see cref="HtmlEncoder"/>.</returns>
    /// <remarks>
    /// Some characters in <paramref name="allowedRanges"/> might still get encoded; the parameter only tells
    /// the encoder which ranges it is allowed to leave as-is, not which characters it must never encode.
    /// </remarks>
    public static HtmlEncoder Create(params UnicodeRange[] allowedRanges) => new DefaultHtmlEncoder(allowedRanges);
}
47 
/// <summary>
/// Built-in <see cref="HtmlEncoder"/> implementation. Characters that are not in the allowed set are
/// written as one of the named entities <c>&amp;quot;</c>, <c>&amp;amp;</c>, <c>&amp;lt;</c>, <c>&amp;gt;</c>,
/// or otherwise as a hexadecimal numeric entity of the form <c>&amp;#xHHHH;</c>.
/// </summary>
internal sealed class DefaultHtmlEncoder : HtmlEncoder
{
    // Characters that may be emitted without encoding.
    private AllowedCharactersBitmap _allowedCharacters;

    /// <summary>
    /// Shared instance whose allowed set is built from <see cref="UnicodeRanges.BasicLatin"/>.
    /// </summary>
    internal static readonly DefaultHtmlEncoder Singleton = new DefaultHtmlEncoder(new TextEncoderSettings(UnicodeRanges.BasicLatin));

    /// <summary>
    /// Initializes the encoder from <paramref name="settings"/>, then removes undefined characters and
    /// the HTML-sensitive characters from the allowed set.
    /// </summary>
    /// <param name="settings">Settings providing the initial set of allowed characters.</param>
    /// <exception cref="ArgumentNullException"><paramref name="settings"/> is <c>null</c>.</exception>
    public DefaultHtmlEncoder(TextEncoderSettings settings)
    {
        if (settings == null)
        {
            throw new ArgumentNullException(nameof(settings));
        }

        _allowedCharacters = settings.GetAllowedCharacters();

        // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
        // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp)
        _allowedCharacters.ForbidUndefinedCharacters();

        ForbidHtmlCharacters(_allowedCharacters);
    }

    /// <summary>
    /// Removes the characters that must always be HTML-encoded from <paramref name="allowedCharacters"/>.
    /// </summary>
    /// <param name="allowedCharacters">The allowed-character set to restrict.</param>
    internal static void ForbidHtmlCharacters(AllowedCharactersBitmap allowedCharacters)
    {
        allowedCharacters.ForbidCharacter('<');
        allowedCharacters.ForbidCharacter('>');
        allowedCharacters.ForbidCharacter('&');
        allowedCharacters.ForbidCharacter('\''); // can be used to escape attributes
        allowedCharacters.ForbidCharacter('\"'); // can be used to escape attributes
        allowedCharacters.ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks
    }

    /// <summary>
    /// Initializes the encoder from a set of allowed Unicode ranges.
    /// </summary>
    /// <param name="allowedRanges">Ranges whose characters the encoder is allowed to leave unencoded.</param>
    public DefaultHtmlEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges))
    { }

    /// <summary>
    /// Determines whether <paramref name="unicodeScalar"/> will be encoded by this encoder.
    /// </summary>
    /// <param name="unicodeScalar">The scalar value to test.</param>
    /// <returns>
    /// <c>true</c> for every supplementary code point and for every value that is not in the allowed set;
    /// otherwise <c>false</c>.
    /// </returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public override bool WillEncode(int unicodeScalar)
    {
        // Supplementary code points are always encoded, regardless of the allowed set.
        if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar))
        {
            return true;
        }

        return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar);
    }

    /// <summary>
    /// Finds the first character in <paramref name="text"/> that needs encoding, by delegating to the
    /// allowed-character set.
    /// </summary>
    /// <param name="text">Pointer to the text to scan.</param>
    /// <param name="textLength">Number of characters available at <paramref name="text"/>.</param>
    /// <returns>The value reported by the allowed-character set's search.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public unsafe override int FindFirstCharacterToEncode(char* text, int textLength)
    {
        return _allowedCharacters.FindFirstCharacterToEncode(text, textLength);
    }

    /// <summary>
    /// Gets the maximum number of output characters produced for a single input character.
    /// </summary>
    public override int MaxOutputCharactersPerInputCharacter
    {
        get { return 10; } // "&#x10FFFF;" is the longest encoded form
    }

    // Named entities used in preference to numeric entities.
    private static readonly char[] s_quote = "&quot;".ToCharArray();
    private static readonly char[] s_ampersand = "&amp;".ToCharArray();
    private static readonly char[] s_lessthan = "&lt;".ToCharArray();
    private static readonly char[] s_greaterthan = "&gt;".ToCharArray();

    /// <summary>
    /// Writes the HTML-encoded form of <paramref name="unicodeScalar"/> to <paramref name="buffer"/>.
    /// </summary>
    /// <param name="unicodeScalar">The scalar value to encode.</param>
    /// <param name="buffer">Destination buffer.</param>
    /// <param name="bufferLength">Number of characters available at <paramref name="buffer"/>.</param>
    /// <param name="numberOfCharactersWritten">Receives the number of characters written to <paramref name="buffer"/>.</param>
    /// <returns>The result of the write helper selected for <paramref name="unicodeScalar"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is <c>null</c>.</exception>
    public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException(nameof(buffer));
        }

        // Allowed characters are passed through unchanged.
        if (!WillEncode(unicodeScalar))
        {
            return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten);
        }

        // Characters with a named entity use it; everything else becomes a numeric entity.
        if (unicodeScalar == '\"')
        {
            return TryCopyCharacters(s_quote, buffer, bufferLength, out numberOfCharactersWritten);
        }

        if (unicodeScalar == '&')
        {
            return TryCopyCharacters(s_ampersand, buffer, bufferLength, out numberOfCharactersWritten);
        }

        if (unicodeScalar == '<')
        {
            return TryCopyCharacters(s_lessthan, buffer, bufferLength, out numberOfCharactersWritten);
        }

        if (unicodeScalar == '>')
        {
            return TryCopyCharacters(s_greaterthan, buffer, bufferLength, out numberOfCharactersWritten);
        }

        return TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten);
    }

    /// <summary>
    /// Writes <paramref name="unicodeScalar"/> as a hexadecimal numeric entity (<c>&amp;#x...;</c>).
    /// </summary>
    /// <param name="unicodeScalar">The value to write.</param>
    /// <param name="buffer">Destination buffer; must not be <c>null</c>.</param>
    /// <param name="bufferLength">Number of characters available at <paramref name="buffer"/>.</param>
    /// <param name="numberOfCharactersWritten">
    /// Receives the total number of characters written, or 0 when the buffer is too small.
    /// </param>
    /// <returns><c>true</c> if the entity was written; <c>false</c> if it does not fit in the buffer.</returns>
    private static unsafe bool TryWriteEncodedScalarAsNumericEntity(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
    {
        Debug.Assert(buffer != null && bufferLength >= 0);

        const int nibbleSize = 4;

        // Operate on the unsigned bit pattern. Right-shifting a negative int is an arithmetic shift
        // that settles at -1 and never reaches 0, which would make the loops below spin forever
        // (and walk the buffer pointer out of bounds). As a uint the value is exhausted after at
        // most eight nibbles. Non-negative inputs are unaffected.
        uint scalarBits = unchecked((uint)unicodeScalar);

        // We're writing the characters in reverse, first determine
        // how many there are
        int numberOfHexCharacters = 0;
        uint remainingBits = scalarBits;

        do
        {
            Debug.Assert(numberOfHexCharacters < 8, "Couldn't have written 8 characters out by this point.");
            numberOfHexCharacters++;
            remainingBits >>= nibbleSize;
        } while (remainingBits != 0);

        Debug.Assert(numberOfHexCharacters > 0, "At least one character should've been written.");

        int totalCharacters = numberOfHexCharacters + 4; // four chars are &, #, x, and ;
        if (totalCharacters > bufferLength)
        {
            numberOfCharactersWritten = 0;
            return false;
        }

        numberOfCharactersWritten = totalCharacters;

        // Finally, write out the HTML-encoded scalar value.
        *buffer = '&';
        buffer++;
        *buffer = '#';
        buffer++;
        *buffer = 'x';

        // Jump to the end of the hex position and write backwards
        buffer += numberOfHexCharacters;
        do
        {
            *buffer = HexUtil.Int32LsbToHexDigit((int)(scalarBits & 0xF));
            scalarBits >>= nibbleSize;
            buffer--;
        }
        while (scalarBits != 0);

        // The pointer is back on the 'x'; step over the hex digits to the terminator slot.
        buffer += numberOfHexCharacters + 1;
        *buffer = ';';
        return true;
    }
}
167 }
168