1-- File created: 2008-10-10 13:29:26
2
3{-# LANGUAGE CPP #-}
4{-# LANGUAGE PatternGuards #-}
5
6module System.FilePath.Glob.Base
7   ( Token(..), Pattern(..)
8
9   , CompOptions(..), MatchOptions(..)
10   , compDefault, compPosix, matchDefault, matchPosix
11
12   , decompile
13
14   , compile
15   , compileWith, tryCompileWith
16   , tokenize -- for tests
17
18   , optimize
19
20   , liftP, tokToLower
21
22   , isLiteral
23   ) where
24
25import Control.Arrow                     (first)
26import Control.Monad.Trans.Class         (lift)
27import Control.Monad.Trans.Except        (ExceptT, runExceptT, throwE)
28import Control.Monad.Trans.Writer.Strict (Writer, runWriter, tell)
29import Control.Exception                 (assert)
30import Data.Char                         (isDigit, isAlpha, toLower)
31import Data.List                         (find, sortBy)
32import Data.List.NonEmpty                (toList)
33import Data.Maybe                        (fromMaybe)
34-- Monoid is re-exported from Prelude as of 4.8.0.0
35#if !MIN_VERSION_base(4,8,0)
36import Data.Monoid                       (Monoid, mappend, mempty, mconcat)
37#endif
38#if MIN_VERSION_base(4,11,0)
39import Data.Semigroup                    (sconcat, stimes)
40#else
41import Data.Semigroup                    (Semigroup, (<>), sconcat, stimes)
42#endif
43import Data.String                       (IsString(fromString))
44import System.FilePath                   ( pathSeparator, extSeparator
45                                         , isExtSeparator, isPathSeparator
46                                         )
47
48import System.FilePath.Glob.Utils ( dropLeadingZeroes
49                                  , isLeft, fromLeft
50                                  , increasingSeq
51                                  , addToRange, overlap
52                                  )
53
54#if __GLASGOW_HASKELL__
55import Text.Read (readPrec, lexP, parens, prec, Lexeme(Ident))
56#endif
57
58data Token
59   -- primitives
60   = Literal !Char
61   | ExtSeparator                              --  .  optimized away to Literal
62   | PathSeparator                             --  /
63   | NonPathSeparator                          --  ?
64   | CharRange !Bool [Either Char (Char,Char)] --  []
65   | OpenRange (Maybe String) (Maybe String)   --  <>
66   | AnyNonPathSeparator                       --  *
67   | AnyDirectory                              --  **/
68
69   -- after optimization only
70   | LongLiteral !Int String
71   | Unmatchable  -- [/], or [.] at the beginning or after a path separator
72   deriving (Eq)
73
74-- Note: CharRanges aren't converted, because this is tricky in general.
75-- Consider for instance [@-[], which includes the range A-Z. This would need
76-- to become [@[a-z]: so essentially we'd need to either:
77--
78--    1) Have a list of ranges of uppercase Unicode. Check if our range
79--       overlaps with any of them and if it does, take the non-overlapping
80--       part and combine it with the toLower of the overlapping part.
81--
82--    2) Simply expand the entire range to a list and map toLower over it.
83--
84-- In either case we'd need to re-optimize the CharRange—we can't assume that
85-- if the uppercase characters are consecutive, so are the lowercase.
86--
87-- 1) might be feasible if someone bothered to get the latest data.
88--
89-- 2) obviously isn't since you might have 'Right (minBound, maxBound)' in
90-- there somewhere.
91--
92-- The current solution is to just check both the toUpper of the character and
93-- the toLower.
94tokToLower :: Token -> Token
95tokToLower (Literal     c)   = Literal       (toLower c)
96tokToLower (LongLiteral n s) = LongLiteral n (map toLower s)
97tokToLower tok               = tok
98
99-- |An abstract data type representing a compiled pattern.
100--
101-- Note that the 'Eq' instance cannot tell you whether two patterns behave in
102-- the same way; only whether they compile to the same 'Pattern'. For instance,
103-- @'compile' \"x\"@ and @'compile' \"[x]\"@ may or may not compare equal,
104-- though a @'match'@ will behave the exact same way no matter which 'Pattern'
105-- is used.
106newtype Pattern = Pattern { unPattern :: [Token] } deriving (Eq)
107
108liftP :: ([Token] -> [Token]) -> Pattern -> Pattern
109liftP f (Pattern pat) = Pattern (f pat)
110
111instance Show Token where
112   show (Literal c)
113      | c `elem` "*?[<"     = ['[',c,']']
114      | otherwise           = assert (not $ isPathSeparator c) [c]
115   show ExtSeparator        = [ extSeparator]
116   show PathSeparator       = [pathSeparator]
117   show NonPathSeparator    = "?"
118   show AnyNonPathSeparator = "*"
119   show AnyDirectory        = "**/"
120   show (LongLiteral _ s)   = concatMap (show . Literal) s
121   show (OpenRange a b)     =
122      '<' : fromMaybe "" a ++ "-" ++
123            fromMaybe "" b ++ ">"
124
125   -- We have to be careful here with ^ and ! lest [a!b] become [!ab]. So we
126   -- just put them at the end.
127   --
128   -- Also, [^x-] was sorted and should not become [^-x].
129   --
130   -- Also, for something like [/!^] or /[.^!] that got optimized to have just ^
131   -- and ! we need to add a dummy /.
132   show (CharRange b r)     =
133      let f = either (:[]) (\(x,y) -> [x,'-',y])
134          (caret,exclamation,fs) =
135             foldr (\c (ca,ex,ss) ->
136                case c of
137                     Left '^' -> ("^",ex,ss)
138                     Left '!' -> (ca,"!",ss)
139                     _        -> (ca,  ex,(f c ++) . ss)
140                   )
141                   ("", "", id)
142                   r
143          (beg,rest) = let s' = fs []
144                           (x,y) = splitAt 1 s'
145                           in if not b && x == "-"
146                                 then (y,x)
147                                 else (s',"")
148       in concat [ "["
149                 , if b then "" else "^"
150                 , if b && null beg && not (null caret && null exclamation) then "/" else ""
151                 , beg, caret, exclamation, rest
152                 , "]"
153                 ]
154
155   show Unmatchable = "[.]"
156
157instance Show Pattern where
158   showsPrec d p = showParen (d > 10) $
159      showString "compile " . showsPrec (d+1) (decompile p)
160
161instance Read Pattern where
162#if __GLASGOW_HASKELL__
163   readPrec = parens . prec 10 $ do
164      Ident "compile" <- lexP
165      fmap compile readPrec
166#else
167   readsPrec d = readParen (d > 10) $ \r -> do
168      ("compile",string) <- lex r
169      (xs,rest) <- readsPrec (d+1) string
170      [(compile xs, rest)]
171#endif
172
173instance Semigroup Pattern where
174   Pattern a <> Pattern b = optimize $ Pattern (a <> b)
175   sconcat = optimize . Pattern . concatMap unPattern . toList
176   stimes n (Pattern a) = optimize $ Pattern (stimes n a)
177
178instance Monoid Pattern where
179   mempty  = Pattern []
180   mappend = (<>)
181   mconcat = optimize . Pattern . concatMap unPattern
182
183instance IsString Pattern where
184    fromString = compile
185
186-- |Options which can be passed to the 'tryCompileWith' or 'compileWith'
187-- functions: with these you can selectively toggle certain features at compile
188-- time.
189--
190-- Note that some of these options depend on each other: classes can never
191-- occur if ranges aren't allowed, for instance.
192
193-- We could presumably put locale information in here, too.
194data CompOptions = CompOptions
195    { characterClasses   :: Bool -- ^Allow character classes, @[[:...:]]@.
196    , characterRanges    :: Bool -- ^Allow character ranges, @[...]@.
197    , numberRanges       :: Bool -- ^Allow open ranges, @\<...>@.
198    , wildcards          :: Bool -- ^Allow wildcards, @*@ and @?@.
199    , recursiveWildcards :: Bool -- ^Allow recursive wildcards, @**/@.
200
201    , pathSepInRanges    :: Bool
202      -- ^Allow path separators in character ranges.
203      --
204      -- If true, @a[/]b@ never matches anything (since character ranges can't
205      -- match path separators); if false and 'errorRecovery' is enabled,
206      -- @a[/]b@ matches itself, i.e. a file named @]b@ in the subdirectory
207      -- @a[@.
208
209    , errorRecovery      :: Bool
210      -- ^If the input is invalid, recover by turning any invalid part into
211      -- literals. For instance, with 'characterRanges' enabled, @[abc@ is an
212      -- error by default (unclosed character range); with 'errorRecovery', the
213      -- @[@ is turned into a literal match, as though 'characterRanges' were
214      -- disabled.
215    } deriving (Show,Read,Eq)
216
217-- |The default set of compilation options: closest to the behaviour of the
218-- @zsh@ shell, with 'errorRecovery' enabled.
219--
220-- All options are enabled.
221compDefault :: CompOptions
222compDefault = CompOptions
223   { characterClasses   = True
224   , characterRanges    = True
225   , numberRanges       = True
226   , wildcards          = True
227   , recursiveWildcards = True
228   , pathSepInRanges    = True
229   , errorRecovery      = True
230   }
231
232-- |Options for POSIX-compliance, as described in @man 7 glob@.
233--
234-- 'numberRanges', 'recursiveWildcards', and 'pathSepInRanges' are disabled.
235compPosix :: CompOptions
236compPosix = CompOptions
237   { characterClasses   = True
238   , characterRanges    = True
239   , numberRanges       = False
240   , wildcards          = True
241   , recursiveWildcards = False
242   , pathSepInRanges    = False
243   , errorRecovery      = True
244   }
245
246-- |Options which can be passed to the 'matchWith' or 'globDirWith' functions:
247-- with these you can selectively toggle certain features at matching time.
248data MatchOptions = MatchOptions
249    { matchDotsImplicitly :: Bool
250      -- ^Allow @*@, @?@, and @**/@ to match @.@ at the beginning of paths.
251
252    , ignoreCase          :: Bool
253      -- ^Case-independent matching.
254
255    , ignoreDotSlash      :: Bool
256      -- ^Treat @./@ as a no-op in both paths and patterns.
257      --
258      -- (Of course e.g. @../@ means something different and will not be
259      -- ignored.)
260    }
261
262-- |The default set of execution options: closest to the behaviour of the @zsh@
263-- shell.
264--
265-- Currently identical to 'matchPosix'.
266matchDefault :: MatchOptions
267matchDefault = matchPosix
268
269-- |Options for POSIX-compliance, as described in @man 7 glob@.
270--
271-- 'ignoreDotSlash' is enabled, the rest are disabled.
272matchPosix :: MatchOptions
273matchPosix = MatchOptions
274   { matchDotsImplicitly = False
275   , ignoreCase          = False
276   , ignoreDotSlash      = True
277   }
278
279-- |Decompiles a 'Pattern' object into its textual representation: essentially
280-- the inverse of 'compile'.
281--
282-- Note, however, that due to internal optimization, @decompile . compile@ is
283-- not the identity function. Instead, @compile . decompile@ is.
284--
285-- Be careful with 'CompOptions': 'decompile' always produces a 'String' which
286-- can be passed to 'compile' to get back the same 'Pattern'. @compileWith
287-- options . decompile@ is /not/ the identity function unless @options@ is
288-- 'compDefault'.
289decompile :: Pattern -> String
290decompile = concatMap show . unPattern
291
292------------------------------------------
293-- COMPILATION
294------------------------------------------
295
296
297-- |Compiles a glob pattern from its textual representation into a 'Pattern'
298-- object.
299--
300-- For the most part, a character matches itself. Recognized operators are as
301-- follows:
302--
303-- [@?@]      Matches any character except path separators.
304--
305-- [@*@]      Matches any number of characters except path separators,
306--            including the empty string.
307--
308-- [@[..\]@]  Matches any of the enclosed characters. Ranges of characters can
309--            be specified by separating the endpoints with a @\'-'@. @\'-'@ or
310--            @']'@ can be matched by including them as the first character(s)
311--            in the list. Never matches path separators: @[\/]@ matches
312--            nothing at all. Named character classes can also be matched:
313--            @[:x:]@ within @[]@ specifies the class named @x@, which matches
314--            certain predefined characters. See below for a full list.
315--
316-- [@[^..\]@ or @[!..\]@] Like @[..]@, but matches any character /not/ listed.
317--                        Note that @[^-x]@ is not the inverse of @[-x]@, but
318--                        the range @[^-x]@.
319--
320-- [@\<m-n>@] Matches any integer in the range m to n, inclusive. The range may
321--            be open-ended by leaving out either number: @\"\<->\"@, for
322--            instance, matches any integer.
323--
324-- [@**/@]    Matches any number of characters, including path separators,
325--            excluding the empty string.
326--
327-- Supported character classes:
328--
329-- [@[:alnum:\]@]  Equivalent to @\"0-9A-Za-z\"@.
330--
331-- [@[:alpha:\]@]  Equivalent to @\"A-Za-z\"@.
332--
333-- [@[:blank:\]@]  Equivalent to @\"\\t \"@.
334--
335-- [@[:cntrl:\]@]  Equivalent to @\"\\0-\\x1f\\x7f\"@.
336--
337-- [@[:digit:\]@]  Equivalent to @\"0-9\"@.
338--
339-- [@[:graph:\]@]  Equivalent to @\"!-~\"@.
340--
341-- [@[:lower:\]@]  Equivalent to @\"a-z\"@.
342--
343-- [@[:print:\]@]  Equivalent to @\" -~\"@.
344--
345-- [@[:punct:\]@]  Equivalent to @\"!-\/:-\@[-`{-~\"@.
346--
347-- [@[:space:\]@]  Equivalent to @\"\\t-\\r \"@.
348--
349-- [@[:upper:\]@]  Equivalent to @\"A-Z\"@.
350--
351-- [@[:xdigit:\]@] Equivalent to @\"0-9A-Fa-f\"@.
352--
353-- Note that path separators (typically @\'/\'@) have to be matched explicitly
354-- or using the @**/@ pattern. In addition, extension separators (typically
355-- @\'.\'@) have to be matched explicitly at the beginning of the pattern or
356-- after any path separator.
357--
358-- If a system supports multiple path separators, any one of them will match
359-- any of them. For instance, on Windows, @\'/\'@ will match itself as well as
360-- @\'\\\'@.
361--
362-- Error recovery will be performed: erroneous operators will not be considered
363-- operators, but matched as literal strings. Such operators include:
364--
365-- * An empty @[]@ or @[^]@ or @[!]@
366--
367-- * A @[@ or @\<@ without a matching @]@ or @>@
368--
369-- * A malformed @\<>@: e.g. nonnumeric characters or no hyphen
370--
371-- So, e.g. @[]@ will match the string @\"[]\"@.
372compile :: String -> Pattern
373compile = compileWith compDefault
374
375-- |Like 'compile', but recognizes operators according to the given
376-- 'CompOptions' instead of the defaults.
377--
378-- If an error occurs and 'errorRecovery' is disabled, 'error' will be called.
379compileWith :: CompOptions -> String -> Pattern
380compileWith opts = either error id . tryCompileWith opts
381
382-- |A safe version of 'compileWith'.
383--
384-- If an error occurs and 'errorRecovery' is disabled, the error message will
385-- be returned in a 'Left'.
386tryCompileWith :: CompOptions -> String -> Either String Pattern
387tryCompileWith opts = fmap optimize . tokenize opts
388
389tokenize :: CompOptions -> String -> Either String Pattern
390tokenize opts = fmap Pattern . sequence . go
391 where
392   err _ c cs | errorRecovery opts = Right (Literal c) : go cs
393   err s _ _                       = [Left s]
394
395   go :: String -> [Either String Token]
396   go [] = []
397   go ('?':cs) | wcs = Right NonPathSeparator : go cs
398   go ('*':cs) | wcs =
399      case cs of
400           '*':p:xs | rwcs && isPathSeparator p
401              -> Right AnyDirectory        : go xs
402           _  -> Right AnyNonPathSeparator : go cs
403
404   go ('[':cs) | crs = let (range,rest) = charRange opts cs
405                        in case range of
406                                Left s -> err s '[' cs
407                                r      -> r : go rest
408
409   go ('<':cs) | ors =
410      let (range, rest) = break (=='>') cs
411       in if null rest
412             then err "compile :: unclosed <> in pattern" '<' cs
413             else case openRange range of
414                       Left s -> err s '<' cs
415                       r      -> r : go (tail rest)
416   go (c:cs)
417      | isPathSeparator c = Right PathSeparator : go cs
418      | isExtSeparator  c = Right  ExtSeparator : go cs
419      | otherwise         = Right (Literal c)   : go cs
420
421   wcs  = wildcards          opts
422   rwcs = recursiveWildcards opts
423   crs  = characterRanges    opts
424   ors  = numberRanges       opts
425
426-- <a-b> where a > b can never match anything; this is not considered an error
427openRange :: String -> Either String Token
428openRange ['-']   = Right $ OpenRange Nothing Nothing
429openRange ('-':s) =
430   case span isDigit s of
431        (b,"") -> Right $ OpenRange Nothing (openRangeNum b)
432        _      -> Left $ "compile :: bad <>, expected number, got " ++ s
433openRange s =
434   case span isDigit s of
435        (a,"-")    -> Right $ OpenRange (openRangeNum a) Nothing
436        (a,'-':s') ->
437           case span isDigit s' of
438                (b,"") -> Right $ OpenRange (openRangeNum a) (openRangeNum b)
439                _ -> Left $ "compile :: bad <>, expected number, got " ++ s'
440        _ -> Left $ "compile :: bad <>, expected number followed by - in " ++ s
441
442openRangeNum :: String -> Maybe String
443openRangeNum = Just . dropLeadingZeroes
444
445type CharRange = [Either Char (Char,Char)]
446
447charRange :: CompOptions -> String -> (Either String Token, String)
448charRange opts zs =
449   case zs of
450        y:ys | y `elem` "^!" ->
451           case ys of
452                -- [!-#] is not the inverse of [-#], it is the range ! through
453                -- #
454                '-':']':xs -> (Right (CharRange False [Left '-']), xs)
455                '-'    :_  -> first (fmap (CharRange True )) (start zs)
456                xs         -> first (fmap (CharRange False)) (start xs)
457        _                  -> first (fmap (CharRange True )) (start zs)
458 where
459   start :: String -> (Either String CharRange, String)
460   start (']':xs) = run $ char ']' xs
461   start ('-':xs) = run $ char '-' xs
462   start xs       = run $ go xs
463
464   run :: ExceptT String (Writer CharRange) String
465       -> (Either String CharRange, String)
466   run m = case runWriter.runExceptT $ m of
467                (Left   err,  _) -> (Left err, [])
468                (Right rest, cs) -> (Right cs, rest)
469
470   go :: String -> ExceptT String (Writer CharRange) String
471   go ('[':':':xs) | characterClasses opts = readClass xs
472   go (    ']':xs) = return xs
473   go (      c:xs) =
474      if not (pathSepInRanges opts) && isPathSeparator c
475         then throwE "compile :: path separator within []"
476         else char c xs
477   go []           = throwE "compile :: unclosed [] in pattern"
478
479   char :: Char -> String -> ExceptT String (Writer CharRange) String
480   char c ('-':x:xs) =
481      if x == ']'
482         then ltell [Left c, Left '-'] >> return xs
483         else ltell [Right (c,x)]      >>     go xs
484
485   char c xs = ltell [Left c] >> go xs
486
487   readClass :: String -> ExceptT String (Writer CharRange) String
488   readClass xs = let (name,end) = span isAlpha xs
489                   in case end of
490                           ':':']':rest -> charClass name            >> go rest
491                           _            -> ltell [Left '[',Left ':'] >> go xs
492
493   charClass :: String -> ExceptT String (Writer CharRange) ()
494   charClass name =
495      -- The POSIX classes
496      --
497      -- TODO: this is ASCII-only, not sure how this should be extended
498      --       Unicode, or with a locale as input, or something else?
499      case name of
500           "alnum"  -> ltell [digit,upper,lower]
501           "alpha"  -> ltell [upper,lower]
502           "blank"  -> ltell blanks
503           "cntrl"  -> ltell [Right ('\0','\x1f'), Left '\x7f']
504           "digit"  -> ltell [digit]
505           "graph"  -> ltell [Right ('!','~')]
506           "lower"  -> ltell [lower]
507           "print"  -> ltell [Right (' ','~')]
508           "punct"  -> ltell punct
509           "space"  -> ltell spaces
510           "upper"  -> ltell [upper]
511           "xdigit" -> ltell [digit, Right ('A','F'), Right ('a','f')]
512           _        ->
513              throwE ("compile :: unknown character class '" ++name++ "'")
514
515   digit  = Right ('0','9')
516   upper  = Right ('A','Z')
517   lower  = Right ('a','z')
518   punct  = map Right [('!','/'), (':','@'), ('[','`'), ('{','~')]
519   blanks = [Left '\t',         Left ' ']
520   spaces = [Right ('\t','\r'), Left ' ']
521
522   ltell = lift . tell
523
524
525------------------------------------------
526-- OPTIMIZATION
527------------------------------------------
528
529
530optimize :: Pattern -> Pattern
531optimize (Pattern pat) =
532   Pattern . fin $
533      case pat of
534         e : ts | e == ExtSeparator || e == Literal '.' ->
535            checkUnmatchable (Literal '.' :) (go ts)
536         _ ->
537            -- Handle the case where the whole pattern starts with a
538            -- now-literalized [.]. LongLiterals haven't been created yet so
539            -- checking for Literal suffices.
540            case go pat of
541                 Literal '.' : _ -> [Unmatchable]
542                 opat -> checkUnmatchable id opat
543 where
544   fin [] = []
545
546   -- Literals to LongLiteral
547   -- Has to be done here: we can't backtrack in go, but some cases might
548   -- result in consecutive Literals being generated.
549   -- E.g. "a[b]".
550   fin (x:y:xs) | Just x' <- isCharLiteral x, Just y' <- isCharLiteral y =
551      let (ls,rest) = spanMaybe isCharLiteral xs
552       in fin $ LongLiteral (length ls + 2)
553                            (foldr (\a -> (a:)) [] (x':y':ls))
554                : rest
555
556   -- concatenate LongLiterals
557   -- Has to be done here because LongLiterals are generated above.
558   --
559   -- So one could say that we have one pass (go) which flattens everything as
560   -- much as it can and one pass (fin) which concatenates what it can.
561   fin (LongLiteral l1 s1 : LongLiteral l2 s2 : xs) =
562      fin $ LongLiteral (l1+l2) (s1++s2) : xs
563
564   fin (LongLiteral l s : Literal c : xs) =
565      fin $ LongLiteral (l+1) (s++[c]) : xs
566
567   fin (LongLiteral 1 s : xs) = Literal (head s) : fin xs
568
569   fin (Literal c : LongLiteral l s : xs) =
570      fin $ LongLiteral (l+1) (c:s) : xs
571
572   fin (x:xs) = x : fin xs
573
574   go [] = []
575
576   -- Get rid of ExtSeparators, so that they can hopefully be combined into
577   -- LongLiterals later.
578   --
579   -- /.                      -> fine
580   -- . elsewhere             -> fine
581   -- /[.]                    -> Unmatchable
582   -- [.] at start of pattern -> handled outside 'go'
583   go (p@PathSeparator : ExtSeparator : xs) = p : Literal '.' : go xs
584   go (ExtSeparator : xs) = Literal '.' : go xs
585   go (p@PathSeparator : x@(CharRange _ _) : xs) =
586      p : case optimizeCharRange True x of
587             x'@(CharRange _ _) -> x' : go xs
588             Literal '.'        -> [Unmatchable]
589             x'                 -> go (x':xs)
590
591   go (x@(CharRange _ _) : xs) =
592      case optimizeCharRange False x of
593           x'@(CharRange _ _) -> x' : go xs
594           x'                 -> go (x':xs)
595
596   -- Put [0-9] in front of <-> to allow compressing <->[0-9]<->. Handling the
597   -- [0-9] first in matching should also be faster in general.
598   go (o@(OpenRange Nothing Nothing) : d : xs) | d == anyDigit =
599      d : go (o : xs)
600
601   go (x:xs) =
602      case find ((== x) . fst) compressables of
603           Just (_, f) -> let (compressed,ys) = span (== x) xs
604                           in if null compressed
605                                 then x : go ys
606                                 else f (length compressed) ++ go (x : ys)
607           Nothing -> x : go xs
608
609   checkUnmatchable f ts = if Unmatchable `elem` ts then [Unmatchable] else f ts
610
611   compressables = [ (AnyNonPathSeparator, const [])
612                   , (AnyDirectory, const [])
613                   , (OpenRange Nothing Nothing, \n -> replicate n anyDigit)
614                   ]
615
616   isCharLiteral (Literal x) = Just x
617   isCharLiteral _           = Nothing
618
619   anyDigit = CharRange True [Right ('0', '9')]
620
621-- | Like 'span', but let's use a -> Maybe b predicate
622spanMaybe :: (a -> Maybe b) -> [a] -> ([b], [a])
623spanMaybe f = go
624 where
625   go xs@[]        = ([], xs)
626   go xs@(x : xs') = case f x of
627      Nothing -> ([], xs)
628      Just y  -> let (ys, zs) = go xs' in (y : ys, zs)
629
630optimizeCharRange :: Bool -> Token -> Token
631optimizeCharRange precededBySlash (CharRange b rs) =
632   fin . stripUnmatchable . go . sortCharRange $ rs
633 where
634   -- [/] is interesting, it actually matches nothing at all
635   -- [.] can be Literalized though, just don't make it into an ExtSeparator so
636   --     that it doesn't match a leading dot
637   fin [Left  c] | b = if isPathSeparator c then Unmatchable else Literal c
638   fin [Right r] | b && r == (minBound,maxBound) = NonPathSeparator
639   fin x = CharRange b x
640
641   stripUnmatchable xs@(_:_:_) | b =
642      filter (\x -> (not precededBySlash || x /= Left '.') && x /= Left '/') xs
643   stripUnmatchable xs = xs
644
645   go [] = []
646
647   go (x@(Left c) : xs) =
648      case xs of
649           [] -> [x]
650           y@(Left d) : ys
651              -- [aaaaa] -> [a]
652              | c == d      -> go$ Left c : ys
653              | d == succ c ->
654                 let (ls,rest)        = span isLeft xs -- start from y
655                     (catable,others) = increasingSeq (map fromLeft ls)
656                     range            = (c, head catable)
657
658                  in -- three (or more) Lefts make a Right
659                     if null catable || null (tail catable)
660                        then x : y : go ys
661                        -- [abcd] -> [a-d]
662                        else go$ Right range : map Left others ++ rest
663
664              | otherwise -> x : go xs
665
666           Right r : ys ->
667              case addToRange r c of
668                   -- [da-c] -> [a-d]
669                   Just r' -> go$ Right r' : ys
670                   Nothing -> x : go xs
671
672   go (x@(Right r) : xs) =
673      case xs of
674           [] -> [x]
675           Left c : ys ->
676              case addToRange r c of
677                   -- [a-cd] -> [a-d]
678                   Just r' -> go$ Right r' : ys
679                   Nothing -> x : go xs
680
681           Right r' : ys ->
682              case overlap r r' of
683                   -- [a-cb-d] -> [a-d]
684                   Just o  -> go$ Right o : ys
685                   Nothing -> x : go xs
686optimizeCharRange _ _ = error "Glob.optimizeCharRange :: internal error"
687
688sortCharRange :: [Either Char (Char,Char)] -> [Either Char (Char,Char)]
689sortCharRange = sortBy cmp
690 where
691   cmp (Left   a)    (Left   b)    = compare a b
692   cmp (Left   a)    (Right (b,_)) = compare a b
693   cmp (Right (a,_)) (Left   b)    = compare a b
694   cmp (Right (a,_)) (Right (b,_)) = compare a b
695
696-- |Returns `True` iff the given `Pattern` is a literal file path, i.e. it has
697-- no wildcards, character ranges, etc.
698isLiteral :: Pattern -> Bool
699isLiteral = all lit . unPattern
700 where
701   lit (Literal _) = True
702   lit (LongLiteral _ _) = True
703   lit PathSeparator = True
704   lit _ = False
705