1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4 
5 namespace System.IO
6 {
7     internal static class PatternMatcher
8     {
9         /// <devdoc>
10         ///     Private constants (directly from C header files)
11         /// </devdoc>
12         private const int MATCHES_ARRAY_SIZE = 16;
13         private const char ANSI_DOS_STAR = '>';
14         private const char ANSI_DOS_QM = '<';
15         private const char DOS_DOT = '"';
16 
17         /// <devdoc>
18         ///     Tells whether a given name matches the expression given with a strict (i.e. UNIX like)
19         ///     semantics.  This code is a port of unmanaged code.  Original code comment follows:
20         ///
21         ///    Routine Description:
22         ///
23         ///        This routine compares a Dbcs name and an expression and tells the caller
24         ///        if the name is in the language defined by the expression.  The input name
25         ///        cannot contain wildcards, while the expression may contain wildcards.
26         ///
27         ///        Expression wild cards are evaluated as shown in the nondeterministic
28         ///        finite automatons below.  Note that ~* and ~? are DOS_STAR and DOS_QM.
29         ///
30         ///
31         ///                 ~* is DOS_STAR, ~? is DOS_QM, and ~. is DOS_DOT
32         ///
33         ///
34         ///                                           S
35         ///                                        &lt;-----&lt;
36         ///                                     X  |     |  e       Y
37         ///                 X * Y ==       (0)-----&gt;-(1)-&gt;-----(2)-----(3)
38         ///
39         ///
40         ///                                          S-.
41         ///                                        &lt;-----&lt;
42         ///                                     X  |     |  e       Y
43         ///                 X ~* Y ==      (0)-----&gt;-(1)-&gt;-----(2)-----(3)
44         ///
45         ///
46         ///
47         ///                                    X     S     S     Y
48         ///                 X ?? Y ==      (0)---(1)---(2)---(3)---(4)
49         ///
50         ///
51         ///
52         ///                                    X     .        .      Y
53         ///                 X ~.~. Y ==    (0)---(1)----(2)------(3)---(4)
54         ///                                       |      |________|
55         ///                                       |           ^   |
56         ///                                       |_______________|
57         ///                                          ^EOF or .^
58         ///
59         ///
60         ///                                    X     S-.     S-.     Y
61         ///                 X ~?~? Y ==    (0)---(1)-----(2)-----(3)---(4)
62         ///                                       |      |________|
63         ///                                       |           ^   |
64         ///                                       |_______________|
65         ///                                          ^EOF or .^
66         ///
67         ///
68         ///
69         ///             where S is any single character
70         ///
71         ///                   S-. is any single character except the final .
72         ///
73         ///                   e is a null character transition
74         ///
75         ///                   EOF is the end of the name string
76         ///
77         ///        In words:
78         ///
79         ///            * matches 0 or more characters.
80         ///
81         ///            ? matches exactly 1 character.
82         ///
83         ///            DOS_STAR matches 0 or more characters until encountering and matching
84         ///                the final . in the name.
85         ///
86         ///            DOS_QM matches any single character, or upon encountering a period or
87         ///                end of name string, advances the expression to the end of the
88         ///                set of contiguous DOS_QMs.
89         ///
90         ///            DOS_DOT matches either a . or zero characters beyond name string.
91         ///
92         ///    Arguments:
93         ///
94         ///        Expression - Supplies the input expression to check against
95         ///
96         ///        Name - Supplies the input name to check for.
97         ///
98         ///    Return Value:
99         ///
100         ///        BOOLEAN - TRUE if Name is an element in the set of strings denoted
101         ///            by the input Expression and FALSE otherwise.
102         ///
103         /// </devdoc>
StrictMatchPattern(string expression, string name)104         public static bool StrictMatchPattern(string expression, string name)
105         {
106             //
107             //  The idea behind the algorithm is pretty simple.  We keep track of
108             //  all possible locations in the regular expression that are matching
109             //  the name.  If when the name has been exhausted one of the locations
110             //  in the expression is also just exhausted, the name is in the language
111             //  defined by the regular expression.
112             //
113 
114             if (name == null || name.Length == 0 || expression == null || expression.Length == 0)
115             {
116                 return false;
117             }
118 
119             //
120             //  Special case by far the most common wild card search of * or *.*
121             //
122 
123             if (expression.Equals("*") || expression.Equals("*.*"))
124             {
125                 return true;
126             }
127 
128             // If this class is ever exposed for generic use,
129             // we need to make sure that name doesn't contain wildcards. Currently
130             // the only component that calls this method is FileSystemWatcher and
131             // it will never pass a name that contains a wildcard.
132 
133 
134             //
135             //  Also special case expressions of the form *X.  With this and the prior
136             //  case we have covered virtually all normal queries.
137             //
138             if (expression[0] == '*' && expression.IndexOf('*', 1) == -1)
139             {
140                 int rightLength = expression.Length - 1;
141                 // if name is shorter that the stuff to the right of * in expression, we don't
142                 // need to do the string compare, otherwise we compare rightlength characters
143                 // and the end of both strings.
144                 if (name.Length >= rightLength &&
145                     string.Compare(expression, 1, name, name.Length - rightLength, rightLength, PathInternal.StringComparison) == 0)
146                 {
147                     return true;
148                 }
149             }
150 
151             //
152             //  Walk through the name string, picking off characters.  We go one
153             //  character beyond the end because some wild cards are able to match
154             //  zero characters beyond the end of the string.
155             //
156             //  With each new name character we determine a new set of states that
157             //  match the name so far.  We use two arrays that we swap back and forth
158             //  for this purpose.  One array lists the possible expression states for
159             //  all name characters up to but not including the current one, and other
160             //  array is used to build up the list of states considering the current
161             //  name character as well.  The arrays are then switched and the process
162             //  repeated.
163             //
164             //  There is not a one-to-one correspondence between state number and
165             //  offset into the expression.  This is evident from the NFAs in the
166             //  initial comment to this function.  State numbering is not continuous.
167             //  This allows a simple conversion between state number and expression
168             //  offset.  Each character in the expression can represent one or two
169             //  states.  * and DOS_STAR generate two states: ExprOffset*2 and
170             //  ExprOffset*2 + 1.  All other expression characters can produce only
171             //  a single state.  Thus ExprOffset = State/2.
172             //
173             //
174             //  Here is a short description of the variables involved:
175             //
176             //  NameOffset  - The offset of the current name char being processed.
177             //
178             //  ExprOffset  - The offset of the current expression char being processed.
179             //
180             //  SrcCount    - Prior match being investigated with current name char
181             //
182             //  DestCount   - Next location to put a matching assuming current name char
183             //
184             //  NameFinished - Allows one more iteration through the Matches array
185             //                 after the name is exhausted (to come *s for example)
186             //
187             //  PreviousDestCount - This is used to prevent entry duplication, see comment
188             //
189             //  PreviousMatches   - Holds the previous set of matches (the Src array)
190             //
191             //  CurrentMatches    - Holds the current set of matches (the Dest array)
192             //
193             //  AuxBuffer, LocalBuffer - the storage for the Matches arrays
194             //
195 
196             //
197             //  Set up the initial variables
198             //
199             int nameOffset;
200             int exprOffset;
201             int length;
202 
203             int srcCount;
204             int destCount;
205             int previousDestCount;
206             int matchesCount;
207 
208             char nameChar = '\0';
209             char exprChar = '\0';
210 
211             int[] previousMatches = new int[MATCHES_ARRAY_SIZE];
212             int[] currentMatches = new int[MATCHES_ARRAY_SIZE];
213 
214             int maxState;
215             int currentState;
216 
217             bool nameFinished = false;
218 
219             previousMatches[0] = 0;
220             matchesCount = 1;
221 
222             nameOffset = 0;
223             maxState = expression.Length * 2;
224 
225             while (!nameFinished)
226             {
227                 if (nameOffset < name.Length)
228                 {
229                     nameChar = name[nameOffset];
230                     nameOffset++;
231                 }
232                 else
233                 {
234                     nameFinished = true;
235 
236                     //
237                     //  if we have already exhausted the expression, C#.  Don't
238                     //  continue.
239                     //
240                     if (previousMatches[matchesCount - 1] == maxState)
241                     {
242                         break;
243                     }
244                 }
245 
246                 //
247                 //  Now, for each of the previous stored expression matches, see what
248                 //  we can do with this name character.
249                 //
250                 srcCount = 0;
251                 destCount = 0;
252                 previousDestCount = 0;
253 
254                 while (srcCount < matchesCount)
255                 {
256                     //
257                     //  We have to carry on our expression analysis as far as possible
258                     //  for each character of name, so we loop here until the
259                     //  expression stops matching.  A clue here is that expression
260                     //  cases that can match zero or more characters end with a
261                     //  continue, while those that can accept only a single character
262                     //  end with a break.
263                     //
264                     exprOffset = ((previousMatches[srcCount++] + 1) / 2);
265                     length = 0;
266 
267                     while (true)
268                     {
269                         if (exprOffset == expression.Length)
270                         {
271                             break;
272                         }
273 
274                         //
275                         //  The first time through the loop we don't want
276                         //  to increment ExprOffset.
277                         //
278 
279                         exprOffset += length;
280 
281                         currentState = exprOffset * 2;
282 
283                         if (exprOffset == expression.Length)
284                         {
285                             currentMatches[destCount++] = maxState;
286                             break;
287                         }
288 
289                         exprChar = expression[exprOffset];
290                         length = 1;
291 
292                         //
293                         //  We may be about to exhaust the local
294                         //  space for ExpressionMatches[][], so we have to allocate
295                         //  some pool if this is the case.
296                         //
297 
298                         if (destCount >= MATCHES_ARRAY_SIZE - 2)
299                         {
300                             int newSize = currentMatches.Length * 2;
301                             int[] tmp = new int[newSize];
302                             Array.Copy(currentMatches, 0, tmp, 0, currentMatches.Length);
303                             currentMatches = tmp;
304 
305                             tmp = new int[newSize];
306                             Array.Copy(previousMatches, 0, tmp, 0, previousMatches.Length);
307                             previousMatches = tmp;
308                         }
309 
310                         //
311                         //  * matches any character zero or more times.
312                         //
313 
314                         if (exprChar == '*')
315                         {
316                             currentMatches[destCount++] = currentState;
317                             currentMatches[destCount++] = (currentState + 1);
318                             continue;
319                         }
320 
321                         //
322                         //  DOS_STAR matches any character except . zero or more times.
323                         //
324 
325                         if (exprChar == ANSI_DOS_STAR)
326                         {
327                             bool iCanEatADot = false;
328 
329                             //
330                             //  If we are at a period, determine if we are allowed to
331                             //  consume it, i.e. make sure it is not the last one.
332                             //
333                             if (!nameFinished && (nameChar == '.'))
334                             {
335                                 char tmpChar;
336                                 int offset;
337 
338                                 int nameLength = name.Length;
339                                 for (offset = nameOffset; offset < nameLength; offset++)
340                                 {
341                                     tmpChar = name[offset];
342                                     length = 1;
343 
344                                     if (tmpChar == '.')
345                                     {
346                                         iCanEatADot = true;
347                                         break;
348                                     }
349                                 }
350                             }
351 
352                             if (nameFinished || (nameChar != '.') || iCanEatADot)
353                             {
354                                 currentMatches[destCount++] = currentState;
355                                 currentMatches[destCount++] = (currentState + 1);
356                                 continue;
357                             }
358                             else
359                             {
360                                 //
361                                 //  We are at a period.  We can only match zero
362                                 //  characters (i.e. the epsilon transition).
363                                 //
364                                 currentMatches[destCount++] = (currentState + 1);
365                                 continue;
366                             }
367                         }
368 
369                         //
370                         //  The following expression characters all match by consuming
371                         //  a character, thus force the expression, and thus state
372                         //  forward.
373                         //
374                         currentState += length * 2;
375 
376                         //
377                         //  DOS_QM is the most complicated.  If the name is finished,
378                         //  we can match zero characters.  If this name is a '.', we
379                         //  don't match, but look at the next expression.  Otherwise
380                         //  we match a single character.
381                         //
382                         if (exprChar == ANSI_DOS_QM)
383                         {
384                             if (nameFinished || (nameChar == '.'))
385                             {
386                                 continue;
387                             }
388 
389                             currentMatches[destCount++] = currentState;
390                             break;
391                         }
392 
393                         //
394                         //  A DOS_DOT can match either a period, or zero characters
395                         //  beyond the end of name.
396                         //
397                         if (exprChar == DOS_DOT)
398                         {
399                             if (nameFinished)
400                             {
401                                 continue;
402                             }
403 
404                             if (nameChar == '.')
405                             {
406                                 currentMatches[destCount++] = currentState;
407                                 break;
408                             }
409                         }
410 
411                         //
412                         //  From this point on a name character is required to even
413                         //  continue, let alone make a match.
414                         //
415                         if (nameFinished)
416                         {
417                             break;
418                         }
419 
420                         //
421                         //  If this expression was a '?' we can match it once.
422                         //
423                         if (exprChar == '?')
424                         {
425                             currentMatches[destCount++] = currentState;
426                             break;
427                         }
428 
429                         //
430                         //  Finally, check if the expression char matches the name char
431                         //
432 
433                         if (PathInternal.IsCaseSensitive ?
434                             (exprChar == nameChar) :
435                             (char.ToUpperInvariant(exprChar) == char.ToUpperInvariant(nameChar)))
436                         {
437                             currentMatches[destCount++] = currentState;
438                             break;
439                         }
440 
441                         //
442                         //  The expression didn't match so go look at the next
443                         //  previous match.
444                         //
445 
446                         break;
447                     }
448 
449 
450                     //
451                     //  Prevent duplication in the destination array.
452                     //
453                     //  Each of the arrays is monotonically increasing and non-
454                     //  duplicating, thus we skip over any source element in the src
455                     //  array if we just added the same element to the destination
456                     //  array.  This guarantees non-duplication in the dest. array.
457                     //
458 
459                     if ((srcCount < matchesCount) && (previousDestCount < destCount))
460                     {
461                         while (previousDestCount < destCount)
462                         {
463                             int previousLength = previousMatches.Length;
464                             while ((srcCount < previousLength) && (previousMatches[srcCount] < currentMatches[previousDestCount]))
465                             {
466                                 srcCount += 1;
467                             }
468                             previousDestCount += 1;
469                         }
470                     }
471                 }
472 
473                 //
474                 //  If we found no matches in the just finished iteration, it's time
475                 //  to bail.
476                 //
477 
478                 if (destCount == 0)
479                 {
480                     return false;
481                 }
482 
483                 //
484                 //  Swap the meaning the two arrays
485                 //
486 
487                 {
488                     int[] tmp;
489 
490                     tmp = previousMatches;
491 
492                     previousMatches = currentMatches;
493 
494                     currentMatches = tmp;
495                 }
496 
497                 matchesCount = destCount;
498             }
499 
500             currentState = previousMatches[matchesCount - 1];
501 
502             return currentState == maxState;
503         }
504     }
505 }
506