// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.ComponentModel;
using System.Runtime.CompilerServices;
using System.Text.Internal;
using System.Text.Unicode;

namespace System.Text.Encodings.Web
{
    /// <summary>
    /// Represents a type used to do URL encoding.
    /// </summary>
    public abstract class UrlEncoder : TextEncoder
    {
        /// <summary>
        /// Returns a default built-in instance of <see cref="UrlEncoder"/>.
        /// </summary>
        public static UrlEncoder Default
        {
            get { return DefaultUrlEncoder.Singleton; }
        }

        /// <summary>
        /// Creates a new instance of UrlEncoder with provided settings.
        /// </summary>
        /// <param name="settings">Settings used to control how the created <see cref="UrlEncoder"/> encodes, primarily which characters to encode.</param>
        /// <returns>A new instance of the <see cref="UrlEncoder"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="settings"/> is <see langword="null"/>.</exception>
        public static UrlEncoder Create(TextEncoderSettings settings)
        {
            return new DefaultUrlEncoder(settings);
        }

        /// <summary>
        /// Creates a new instance of UrlEncoder, specifying the character ranges it is allowed to leave unencoded.
        /// </summary>
        /// <param name="allowedRanges">Set of characters that the encoder is allowed to not encode.</param>
        /// <returns>A new instance of the <see cref="UrlEncoder"/>.</returns>
        /// <remarks>Some characters in <paramref name="allowedRanges"/> might still get encoded, i.e. this parameter is just telling the encoder what ranges it is allowed to not encode, not what characters it must not encode.</remarks>
        public static UrlEncoder Create(params UnicodeRange[] allowedRanges)
        {
            return new DefaultUrlEncoder(allowedRanges);
        }
    }

    /// <summary>
    /// The built-in <see cref="UrlEncoder"/> implementation. It percent-encodes ("%XX") each byte of
    /// the UTF-8 form of every scalar value that is not in its allowed-character bitmap.
    /// </summary>
    internal sealed class DefaultUrlEncoder : UrlEncoder
    {
        // Bitmap of the characters this instance may emit unescaped. It is populated once in the
        // constructor and only read afterwards.
        private AllowedCharactersBitmap _allowedCharacters;

        /// <summary>
        /// Shared instance backing <see cref="UrlEncoder.Default"/>; it starts from the Basic Latin range.
        /// </summary>
        internal static readonly DefaultUrlEncoder Singleton = new DefaultUrlEncoder(new TextEncoderSettings(UnicodeRanges.BasicLatin));

        // We perform UTF8 conversion of input, which means that the worst case is
        // 12 output chars per input surrogate char: [input] U+FFFF U+FFFF -> [output] "%XX%YY%ZZ%WW".
        public override int MaxOutputCharactersPerInputCharacter
        {
            get { return 12; }
        }

        /// <summary>
        /// Builds an encoder whose allowed set is the one described by <paramref name="filter"/>,
        /// minus every character that is unsafe to leave unescaped in a URL component.
        /// </summary>
        /// <param name="filter">Settings that supply the initial set of allowed characters.</param>
        /// <exception cref="ArgumentNullException"><paramref name="filter"/> is <see langword="null"/>.</exception>
        public DefaultUrlEncoder(TextEncoderSettings filter)
        {
            if (filter == null)
            {
                throw new ArgumentNullException(nameof(filter));
            }

            _allowedCharacters = filter.GetAllowedCharacters();

            // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
            // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp)
            _allowedCharacters.ForbidUndefinedCharacters();

            // Forbid characters that are special in HTML.
            // Even though this is not an HTML encoder,
            // it's unfortunately common for developers to
            // forget to HTML-encode a string once it has been URL-encoded,
            // so this offers extra protection.
            DefaultHtmlEncoder.ForbidHtmlCharacters(_allowedCharacters);

            // Per RFC 3987, Sec. 2.2, we want encodings that are safe for
            // four particular components: 'isegment', 'ipath-noscheme',
            // 'iquery', and 'ifragment'. The relevant definitions are below.
            //
            //    ipath-noscheme = isegment-nz-nc *( "/" isegment )
            //
            //    isegment       = *ipchar
            //
            //    isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
            //                         / "@" )
            //                   ; non-zero-length segment without any colon ":"
            //
            //    ipchar         = iunreserved / pct-encoded / sub-delims / ":"
            //                   / "@"
            //
            //    iquery         = *( ipchar / iprivate / "/" / "?" )
            //
            //    ifragment      = *( ipchar / "/" / "?" )
            //
            //    iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
            //
            //    ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
            //                   / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
            //                   / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
            //                   / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
            //                   / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
            //                   / %xD0000-DFFFD / %xE1000-EFFFD
            //
            //    pct-encoded    = "%" HEXDIG HEXDIG
            //
            //    sub-delims     = "!" / "$" / "&" / "'" / "(" / ")"
            //                   / "*" / "+" / "," / ";" / "="
            //
            // The only common characters between these four components are the
            // intersection of 'isegment-nz-nc' and 'ipchar', which is really
            // just 'isegment-nz-nc' (colons forbidden).
            //
            // From this list, the base encoder already forbids "&", "'", "+",
            // and we'll additionally forbid "=" since it has special meaning
            // in x-www-form-urlencoded representations.
            //
            // This means that the full list of allowed characters from the
            // Basic Latin set is:
            // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / "@"

            const string forbiddenChars = @" #%/:=?[\]^`{|}"; // chars from Basic Latin which aren't already disallowed by the base encoder
            foreach (char character in forbiddenChars)
            {
                _allowedCharacters.ForbidCharacter(character);
            }

            // Specials (U+FFF0 .. U+FFFF) are forbidden by the definition of 'ucschar' above
            for (int i = 0; i < 16; i++)
            {
                _allowedCharacters.ForbidCharacter((char)(0xFFF0 | i));
            }
        }

        /// <summary>
        /// Builds an encoder from a list of Unicode ranges by wrapping them in a
        /// <see cref="TextEncoderSettings"/> and applying the same restrictions as the settings-based constructor.
        /// </summary>
        /// <param name="allowedRanges">Ranges the encoder is allowed to leave unencoded.</param>
        public DefaultUrlEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges))
        { }

        /// <summary>
        /// Determines whether <paramref name="unicodeScalar"/> will be percent-encoded by this encoder.
        /// </summary>
        /// <param name="unicodeScalar">The Unicode scalar value to test.</param>
        /// <returns>
        /// <see langword="true"/> for every supplementary code point and for every other scalar that is
        /// not in the allowed-character bitmap; otherwise <see langword="false"/>.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public override bool WillEncode(int unicodeScalar)
        {
            // Supplementary code points are always encoded, regardless of the allowed ranges.
            if (UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar))
            {
                return true;
            }

            return !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar);
        }

        /// <summary>
        /// Scans <paramref name="text"/> for the first character that needs encoding by delegating
        /// to the allowed-character bitmap.
        /// </summary>
        /// <param name="text">Pointer to the first character of the text to scan.</param>
        /// <param name="textLength">Number of characters available at <paramref name="text"/>.</param>
        /// <returns>The value reported by the bitmap's scan of the supplied text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <see langword="null"/>.</exception>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public unsafe override int FindFirstCharacterToEncode(char* text, int textLength)
        {
            if (text == null)
            {
                throw new ArgumentNullException(nameof(text));
            }

            return _allowedCharacters.FindFirstCharacterToEncode(text, textLength);
        }

        /// <summary>
        /// Writes the encoded form of <paramref name="unicodeScalar"/> into <paramref name="buffer"/>.
        /// Scalars that do not need encoding are written through <c>TryWriteScalarAsChar</c>; all others
        /// are written as one "%XX" triplet per UTF-8 byte.
        /// </summary>
        /// <param name="unicodeScalar">The Unicode scalar value to encode.</param>
        /// <param name="buffer">Destination buffer.</param>
        /// <param name="bufferLength">Number of characters available in <paramref name="buffer"/>.</param>
        /// <param name="numberOfCharactersWritten">
        /// Number of characters written on success. On the percent-encoding path it is reset to 0 when
        /// the buffer is too small.
        /// </param>
        /// <returns>
        /// <see langword="true"/> if the whole encoded form fit into the buffer; otherwise <see langword="false"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is <see langword="null"/>.</exception>
        /// <remarks>
        /// When the buffer runs out part-way through a multi-byte sequence, the triplets already emitted
        /// remain in <paramref name="buffer"/> even though the method reports zero characters written.
        /// </remarks>
        public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
        {
            if (buffer == null)
            {
                throw new ArgumentNullException(nameof(buffer));
            }

            if (!WillEncode(unicodeScalar))
            {
                return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten);
            }

            numberOfCharactersWritten = 0;

            // The helper's result is consumed one byte at a time, least-significant byte first:
            // each iteration encodes the low byte and then shifts the remaining bytes down.
            uint asUtf8 = unchecked((uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue((uint)unicodeScalar));
            do
            {
                char highNibble, lowNibble;
                HexUtil.ByteToHexDigits(unchecked((byte)asUtf8), out highNibble, out lowNibble);

                // Each byte needs three output characters: '%' plus two hex digits.
                if (numberOfCharactersWritten + 3 > bufferLength)
                {
                    numberOfCharactersWritten = 0;
                    return false;
                }

                *buffer = '%'; buffer++;
                *buffer = highNibble; buffer++;
                *buffer = lowNibble; buffer++;

                numberOfCharactersWritten += 3;
            }
            while ((asUtf8 >>= 8) != 0); // stop once no bytes remain
            return true;
        }
    }
}