// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.ComponentModel;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Text.Internal;
using System.Text.Unicode;

namespace System.Text.Encodings.Web
{
    /// <summary>
    /// Represents a type used to do HTML encoding.
    /// </summary>
    public abstract class HtmlEncoder : TextEncoder
    {
        /// <summary>
        /// Returns a default built-in instance of <see cref="HtmlEncoder"/>.
        /// </summary>
        public static HtmlEncoder Default
        {
            get { return DefaultHtmlEncoder.Singleton; }
        }

        /// <summary>
        /// Creates a new instance of HtmlEncoder with provided settings.
        /// </summary>
        /// <param name="settings">Settings used to control how the created <see cref="HtmlEncoder"/> encodes, primarily which characters to encode.</param>
        /// <returns>A new instance of the <see cref="HtmlEncoder"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="settings"/> is <c>null</c>.</exception>
        public static HtmlEncoder Create(TextEncoderSettings settings)
        {
            return new DefaultHtmlEncoder(settings);
        }

        /// <summary>
        /// Creates a new instance of HtmlEncoder specifying character to be encoded.
        /// </summary>
        /// <param name="allowedRanges">Set of characters that the encoder is allowed to not encode.</param>
        /// <returns>A new instance of the <see cref="HtmlEncoder"/></returns>
        /// <remarks>Some characters in <paramref name="allowedRanges"/> might still get encoded, i.e. this parameter is just telling the encoder what ranges it is allowed to not encode, not what characters it must not encode.</remarks>
        public static HtmlEncoder Create(params UnicodeRange[] allowedRanges)
        {
            return new DefaultHtmlEncoder(allowedRanges);
        }
    }

    /// <summary>
    /// The built-in <see cref="HtmlEncoder"/> implementation. It keeps a set of characters that may
    /// pass through unchanged and encodes everything else, either as one of four named entities or
    /// as a hexadecimal numeric character reference.
    /// </summary>
    internal sealed class DefaultHtmlEncoder : HtmlEncoder
    {
        // Characters this instance may emit without encoding. Populated once in the constructor.
        private AllowedCharactersBitmap _allowedCharacters;

        // Shared instance returned by HtmlEncoder.Default; built from the Basic Latin range.
        internal static readonly DefaultHtmlEncoder Singleton = new DefaultHtmlEncoder(new TextEncoderSettings(UnicodeRanges.BasicLatin));

        /// <summary>
        /// Creates an encoder whose allow-list starts from <paramref name="settings"/> and is then
        /// narrowed by removing undefined code points and HTML-sensitive characters.
        /// </summary>
        /// <param name="settings">Source of the initial set of allowed characters.</param>
        /// <exception cref="ArgumentNullException"><paramref name="settings"/> is <c>null</c>.</exception>
        public DefaultHtmlEncoder(TextEncoderSettings settings)
        {
            if (settings == null)
            {
                throw new ArgumentNullException(nameof(settings));
            }

            _allowedCharacters = settings.GetAllowedCharacters();

            // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
            // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp)
            _allowedCharacters.ForbidUndefinedCharacters();

            ForbidHtmlCharacters(_allowedCharacters);
        }

        /// <summary>
        /// Removes the characters that must always be encoded in HTML output from
        /// <paramref name="allowedCharacters"/>.
        /// </summary>
        /// <param name="allowedCharacters">The allow-list to narrow.</param>
        // NOTE(review): the argument is passed by value, yet the constructor relies on the changes being
        // visible through its own field — presumably AllowedCharactersBitmap shares its backing storage
        // between copies; confirm against its definition.
        internal static void ForbidHtmlCharacters(AllowedCharactersBitmap allowedCharacters)
        {
            allowedCharacters.ForbidCharacter('<');
            allowedCharacters.ForbidCharacter('>');
            allowedCharacters.ForbidCharacter('&');
            allowedCharacters.ForbidCharacter('\''); // can be used to escape attributes
            allowedCharacters.ForbidCharacter('\"'); // can be used to escape attributes
            allowedCharacters.ForbidCharacter('+'); // technically not HTML-specific, but can be used to perform UTF7-based attacks
        }

        /// <summary>
        /// Creates an encoder from a list of Unicode ranges by wrapping them in a
        /// <see cref="TextEncoderSettings"/> and delegating to the settings-based constructor.
        /// </summary>
        /// <param name="allowedRanges">Ranges the encoder is allowed to leave unencoded.</param>
        public DefaultHtmlEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges))
        { }

        /// <summary>
        /// Determines whether <paramref name="unicodeScalar"/> will be encoded by this instance.
        /// </summary>
        /// <param name="unicodeScalar">The scalar value to test.</param>
        /// <returns>
        /// <c>true</c> for every value that <c>UnicodeHelpers.IsSupplementaryCodePoint</c> reports as
        /// supplementary, and for every other value that is not in the allow-list; otherwise <c>false</c>.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public override bool WillEncode(int unicodeScalar)
        {
            // Supplementary code points are always encoded, regardless of the allow-list.
            if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar))
            {
                return true;
            }

            return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar);
        }

        /// <summary>
        /// Forwards to the allow-list's own search over the first <paramref name="textLength"/>
        /// characters starting at <paramref name="text"/>.
        /// </summary>
        /// <param name="text">Pointer to the first character to examine.</param>
        /// <param name="textLength">Number of characters to examine.</param>
        /// <returns>Whatever the allow-list search returns for this input.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public unsafe override int FindFirstCharacterToEncode(char* text, int textLength)
        {
            return _allowedCharacters.FindFirstCharacterToEncode(text, textLength);
        }

        /// <summary>
        /// Upper bound on the number of output characters produced for a single input character.
        /// </summary>
        public override int MaxOutputCharactersPerInputCharacter
        {
            get { return 10; } // "&#x10FFFF;" is the longest encoded form
        }

        // Named character references emitted for the four characters that have a dedicated branch in
        // TryEncodeUnicodeScalar. Each must be the escaped entity text, never the raw character itself:
        // emitting the raw character would leave the output unencoded.
        private static readonly char[] s_quote = "&quot;".ToCharArray();
        private static readonly char[] s_ampersand = "&amp;".ToCharArray();
        private static readonly char[] s_lessthan = "&lt;".ToCharArray();
        private static readonly char[] s_greaterthan = "&gt;".ToCharArray();

        /// <summary>
        /// Writes the HTML-encoded form of <paramref name="unicodeScalar"/> into <paramref name="buffer"/>.
        /// </summary>
        /// <param name="unicodeScalar">The scalar value to encode.</param>
        /// <param name="buffer">Destination for the encoded characters.</param>
        /// <param name="bufferLength">Number of characters available at <paramref name="buffer"/>.</param>
        /// <param name="numberOfCharactersWritten">Receives the count reported by the helper that handled the value.</param>
        /// <returns>The success flag reported by the helper that handled the value.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is <c>null</c>.</exception>
        public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
        {
            if (buffer == null)
            {
                throw new ArgumentNullException(nameof(buffer));
            }

            // Allowed values are written through unchanged; the four characters with a named entity
            // use it; everything else falls back to a hexadecimal numeric character reference.
            if (!WillEncode(unicodeScalar))
            {
                return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten);
            }
            else if (unicodeScalar == '\"')
            {
                return TryCopyCharacters(s_quote, buffer, bufferLength, out numberOfCharactersWritten);
            }
            else if (unicodeScalar == '&')
            {
                return TryCopyCharacters(s_ampersand, buffer, bufferLength, out numberOfCharactersWritten);
            }
            else if (unicodeScalar == '<')
            {
                return TryCopyCharacters(s_lessthan, buffer, bufferLength, out numberOfCharactersWritten);
            }
            else if (unicodeScalar == '>')
            {
                return TryCopyCharacters(s_greaterthan, buffer, bufferLength, out numberOfCharactersWritten);
            }
            else
            {
                return TryWriteEncodedScalarAsNumericEntity(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten);
            }
        }

        /// <summary>
        /// Writes <paramref name="unicodeScalar"/> as a hexadecimal numeric character reference:
        /// <c>&amp;#x</c>, one hex digit per nibble down to the highest non-zero nibble (at least one
        /// digit), then <c>;</c>.
        /// </summary>
        /// <param name="unicodeScalar">The value to write.</param>
        /// <param name="buffer">Destination for the encoded characters; must not be <c>null</c>.</param>
        /// <param name="bufferLength">Number of characters available at <paramref name="buffer"/>.</param>
        /// <param name="numberOfCharactersWritten">
        /// Receives the digit count plus four on success, or zero when the buffer is too small.
        /// </param>
        /// <returns><c>true</c> if the reference was written; <c>false</c> if it did not fit (nothing is written).</returns>
        private static unsafe bool TryWriteEncodedScalarAsNumericEntity(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
        {
            Debug.Assert(buffer != null && bufferLength >= 0);

            // We're writing the characters in reverse, first determine
            // how many there are
            // NOTE(review): '>>' on int is an arithmetic shift, so a negative value would never reach
            // zero here; callers are presumed to pass only non-negative scalar values — confirm.
            const int nibbleSize = 4;
            int numberOfHexCharacters = 0;
            int compareUnicodeScalar = unicodeScalar;

            do
            {
                Debug.Assert(numberOfHexCharacters < 8, "Couldn't have written 8 characters out by this point.");
                numberOfHexCharacters++;
                compareUnicodeScalar >>= nibbleSize;
            } while (compareUnicodeScalar != 0);

            Debug.Assert(numberOfHexCharacters > 0, "At least one character should've been written.");

            int encodedLength = numberOfHexCharacters + 4; // four chars are &, #, x, and ;
            if (encodedLength > bufferLength)
            {
                numberOfCharactersWritten = 0;
                return false;
            }

            numberOfCharactersWritten = encodedLength;

            // Finally, write out the HTML-encoded scalar value.
            *buffer = '&';
            buffer++;
            *buffer = '#';
            buffer++;
            *buffer = 'x';

            // Jump to the end of the hex position and write backwards
            buffer += numberOfHexCharacters;
            do
            {
                *buffer = HexUtil.Int32LsbToHexDigit(unicodeScalar & 0xF);
                unicodeScalar >>= nibbleSize;
                buffer--;
            }
            while (unicodeScalar != 0);

            // The loop leaves the pointer on the 'x'; step past the digits to the terminator slot.
            buffer += numberOfHexCharacters + 1;
            *buffer = ';';
            return true;
        }
    }
}