1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4 
5 using System.ComponentModel;
6 using System.Runtime.CompilerServices;
7 using System.Text.Internal;
8 using System.Text.Unicode;
9 
10 namespace System.Text.Encodings.Web
11 {
/// <summary>
/// Represents a type used to do URL encoding.
/// </summary>
public abstract class UrlEncoder : TextEncoder
{
    /// <summary>
    /// Returns a default built-in instance of <see cref="UrlEncoder"/>.
    /// </summary>
    public static UrlEncoder Default
    {
        get { return DefaultUrlEncoder.Singleton; }
    }

    /// <summary>
    /// Creates a new instance of <see cref="UrlEncoder"/> with the provided settings.
    /// </summary>
    /// <param name="settings">Settings used to control how the created <see cref="UrlEncoder"/> encodes, primarily which characters to encode.</param>
    /// <returns>A new instance of the <see cref="UrlEncoder"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="settings"/> is <see langword="null"/>.</exception>
    public static UrlEncoder Create(TextEncoderSettings settings)
    {
        // Validate here so that the exception reports this method's own parameter
        // name ("settings") instead of the differently named constructor parameter
        // of the implementation class.
        if (settings == null)
        {
            throw new ArgumentNullException(nameof(settings));
        }

        return new DefaultUrlEncoder(settings);
    }

    /// <summary>
    /// Creates a new instance of <see cref="UrlEncoder"/> from the character ranges it is allowed to leave unencoded.
    /// </summary>
    /// <param name="allowedRanges">Set of characters that the encoder is allowed to not encode.</param>
    /// <returns>A new instance of the <see cref="UrlEncoder"/>.</returns>
    /// <remarks>Some characters in <paramref name="allowedRanges"/> might still get encoded, i.e. this parameter is just telling the encoder what ranges it is allowed to not encode, not what characters it must not encode.</remarks>
    public static UrlEncoder Create(params UnicodeRange[] allowedRanges)
    {
        return new DefaultUrlEncoder(allowedRanges);
    }
}
46 
/// <summary>
/// Built-in <see cref="UrlEncoder"/> implementation. It keeps an
/// <see cref="AllowedCharactersBitmap"/> of the characters that may pass through
/// unencoded and percent-encodes the UTF-8 bytes of everything else.
/// </summary>
internal sealed class DefaultUrlEncoder : UrlEncoder
{
    /// <summary>
    /// Shared instance whose allow-list starts from the Basic Latin range.
    /// </summary>
    internal static readonly DefaultUrlEncoder Singleton = new DefaultUrlEncoder(new TextEncoderSettings(UnicodeRanges.BasicLatin));

    // Characters this encoder may emit as-is; populated and narrowed in the constructor.
    private AllowedCharactersBitmap _allowedCharacters;

    // The input is converted to UTF-8 and every byte is written as three chars ("%XX").
    // The packed UTF-8 value holds at most four bytes, so a single scalar value never
    // produces more than 12 output chars: "%XX%YY%ZZ%WW".
    public override int MaxOutputCharactersPerInputCharacter => 12;

    /// <summary>
    /// Builds an encoder whose allow-list is the one described by <paramref name="filter"/>,
    /// minus every character that is never safe to leave unencoded in a URL component.
    /// </summary>
    /// <param name="filter">Settings supplying the initial set of allowed characters.</param>
    /// <exception cref="ArgumentNullException"><paramref name="filter"/> is null.</exception>
    public DefaultUrlEncoder(TextEncoderSettings filter)
    {
        if (filter == null)
        {
            throw new ArgumentNullException(nameof(filter));
        }

        _allowedCharacters = filter.GetAllowedCharacters();

        // Forbid codepoints which aren't mapped to characters or which are otherwise always disallowed
        // (includes categories Cc, Cs, Co, Cn, Zs [except U+0020 SPACE], Zl, Zp)
        _allowedCharacters.ForbidUndefinedCharacters();

        // Forbid characters that are special in HTML.
        // Even though this is not an HTML encoder, it is unfortunately common
        // for developers to forget to HTML-encode a string once it has been
        // URL-encoded, so this offers extra protection.
        DefaultHtmlEncoder.ForbidHtmlCharacters(_allowedCharacters);

        // Per RFC 3987, Sec. 2.2, we want encodings that are safe for
        // four particular components: 'isegment', 'ipath-noscheme',
        // 'iquery', and 'ifragment'. The relevant definitions are below.
        //
        //    ipath-noscheme = isegment-nz-nc *( "/" isegment )
        //
        //    isegment       = *ipchar
        //
        //    isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
        //                         / "@" )
        //                   ; non-zero-length segment without any colon ":"
        //
        //    ipchar         = iunreserved / pct-encoded / sub-delims / ":"
        //                   / "@"
        //
        //    iquery         = *( ipchar / iprivate / "/" / "?" )
        //
        //    ifragment      = *( ipchar / "/" / "?" )
        //
        //    iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
        //
        //    ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
        //                   / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
        //                   / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
        //                   / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
        //                   / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
        //                   / %xD0000-DFFFD / %xE1000-EFFFD
        //
        //    pct-encoded    = "%" HEXDIG HEXDIG
        //
        //    sub-delims     = "!" / "$" / "&" / "'" / "(" / ")"
        //                   / "*" / "+" / "," / ";" / "="
        //
        // The only common characters between these four components are the
        // intersection of 'isegment-nz-nc' and 'ipchar', which is really
        // just 'isegment-nz-nc' (colons forbidden).
        //
        // From this list, the base encoder already forbids "&", "'", "+",
        // and we'll additionally forbid "=" since it has special meaning
        // in x-www-form-urlencoded representations.
        //
        // This means that the full list of allowed characters from the
        // Basic Latin set is:
        // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / "@"

        // Basic Latin chars that the steps above have not already disallowed.
        const string additionalForbiddenChars = @" #%/:=?[\]^`{|}";
        for (int index = 0; index < additionalForbiddenChars.Length; index++)
        {
            _allowedCharacters.ForbidCharacter(additionalForbiddenChars[index]);
        }

        // Specials (U+FFF0 .. U+FFFF) are forbidden by the definition of 'ucschar' above
        for (int codeUnit = 0xFFF0; codeUnit <= 0xFFFF; codeUnit++)
        {
            _allowedCharacters.ForbidCharacter((char)codeUnit);
        }
    }

    /// <summary>
    /// Builds an encoder from a list of allowed ranges by wrapping them in a
    /// <see cref="TextEncoderSettings"/> and delegating to the settings-based constructor.
    /// </summary>
    /// <param name="allowedRanges">Ranges the encoder is allowed to leave unencoded.</param>
    public DefaultUrlEncoder(params UnicodeRange[] allowedRanges) : this(new TextEncoderSettings(allowedRanges))
    { }

    /// <summary>
    /// Reports whether <paramref name="unicodeScalar"/> will be percent-encoded.
    /// Supplementary code points are always encoded; any other value is encoded
    /// unless the allow-list permits it.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public override bool WillEncode(int unicodeScalar)
    {
        return UnicodeHelpers.IsSupplementaryCodePoint(unicodeScalar)
            || !_allowedCharacters.IsUnicodeScalarAllowed(unicodeScalar);
    }

    /// <summary>
    /// Delegates to the allow-list to locate the first character in <paramref name="text"/>
    /// that needs encoding.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public unsafe override int FindFirstCharacterToEncode(char* text, int textLength)
    {
        if (text == null)
        {
            throw new ArgumentNullException(nameof(text));
        }

        int firstIndexToEncode = _allowedCharacters.FindFirstCharacterToEncode(text, textLength);
        return firstIndexToEncode;
    }

    /// <summary>
    /// Writes <paramref name="unicodeScalar"/> to <paramref name="buffer"/>, either verbatim
    /// (when it does not need encoding) or as a "%XX" triple for each of its UTF-8 bytes.
    /// </summary>
    /// <param name="unicodeScalar">The scalar value to write.</param>
    /// <param name="buffer">Destination buffer.</param>
    /// <param name="bufferLength">Number of chars available in <paramref name="buffer"/>.</param>
    /// <param name="numberOfCharactersWritten">
    /// Number of chars written on success; 0 when the percent-encoded form does not fit.
    /// </param>
    /// <returns>true when the value was written; false when the buffer is too small.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
    public unsafe override bool TryEncodeUnicodeScalar(int unicodeScalar, char* buffer, int bufferLength, out int numberOfCharactersWritten)
    {
        if (buffer == null)
        {
            throw new ArgumentNullException(nameof(buffer));
        }

        if (!WillEncode(unicodeScalar))
        {
            return TryWriteScalarAsChar(unicodeScalar, buffer, bufferLength, out numberOfCharactersWritten);
        }

        // UTF-8 bytes packed into one uint; the loop consumes them from the low byte upward.
        uint pendingUtf8Bytes = unchecked((uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue((uint)unicodeScalar));
        int written = 0;

        do
        {
            char highNibble, lowNibble;
            HexUtil.ByteToHexDigits(unchecked((byte)pendingUtf8Bytes), out highNibble, out lowNibble);

            // Each byte needs room for '%' plus two hex digits.
            if (bufferLength < written + 3)
            {
                numberOfCharactersWritten = 0;
                return false;
            }

            buffer[written] = '%';
            buffer[written + 1] = highNibble;
            buffer[written + 2] = lowNibble;
            written += 3;

            pendingUtf8Bytes >>= 8;
        }
        while (pendingUtf8Bytes != 0);

        numberOfCharactersWritten = written;
        return true;
    }
}
190 }
191