1{-# LANGUAGE GeneralizedNewtypeDeriving #-}
2-- | See <http://pubs.opengroup.org/onlinepubs/9699919799/utilities/awk.html> for the
3-- full awk grammar.
4module Test.Tasty.Patterns.Parser
5  ( Parser
6  , runParser
7  , ParseResult(..)
8  , expr
9  , parseAwkExpr
10  )
11  where
12
13import Prelude hiding (Ordering(..))
14import Text.ParserCombinators.ReadP hiding (many, optional)
15import Text.ParserCombinators.ReadPrec (readPrec_to_P, minPrec)
16import Text.Read (readPrec)
17import Data.Functor
18import Data.Char
19import Control.Applicative
20import Control.Monad
21import Test.Tasty.Patterns.Types
22import Test.Tasty.Patterns.Expr
23
24type Token = ReadP
25
26-- | A separate 'Parser' data type ensures that we don't forget to skip
27-- spaces.
28newtype Parser a = Parser (ReadP a)
29  deriving (Functor, Applicative, Alternative, Monad, MonadPlus)
30
31data ParseResult a = Success a | Invalid | Ambiguous [a]
32  deriving (Eq, Show)
33
34token :: Token a -> Parser a
35token a = Parser (a <* skipSpaces)
36
37sym :: Char -> Parser ()
38sym = void . token . char
39
40str :: String -> Parser ()
41str = void . token . string
42
43-- | Run a parser
44runParser
45  :: Parser a
46  -> String -- ^ text to parse
47  -> ParseResult a
48runParser (Parser p) s =
49  case filter (null . snd) $ readP_to_S (skipSpaces *> p) s of
50    [(a, _)] -> Success a
51    [] -> Invalid
52    as -> Ambiguous (fst <$> as)
53
54intP :: Parser Int
55intP = token $
56  -- we cannot use the standard Int ReadP parser because it recognizes
57  -- negative numbers, making -1 ambiguous
58  read <$> munch1 isDigit
59
60strP :: Parser String
61strP = token $ readPrec_to_P readPrec minPrec
62  -- this deviates somewhat from the awk string literals, by design
63
64-- | An awk ERE token such as @/foo/@. No special characters are recognized
65-- at the moment, except @\@ as an escape character for @/@ and itself.
66patP :: Parser String
67patP = token $ char '/' *> many ch <* char '/'
68  where
69    ch =
70      satisfy (`notElem` "/\\") <|>
71      (char '\\' *> satisfy (`elem` "/\\"))
72
73nfP :: Parser ()
74nfP = token $ void $ string "NF"
75
76-- | Built-in functions
77builtin :: Parser Expr
78builtin = msum
79  [ fn "length" $ LengthFn <$> optional expr
80    -- we don't support length without parentheses at all,
81    -- because that makes length($1) ambiguous
82    -- (we don't require spaces for concatenation)
83  , fn "toupper" $ ToUpperFn <$> expr
84  , fn "tolower" $ ToLowerFn <$> expr
85  , fn "match" $ MatchFn <$> expr <* sym ',' <*> patP
86  , fn "substr" $ SubstrFn <$> expr <* sym ',' <*> expr <*>
87      optional (sym ',' *> expr)
88  ]
89  where
90    fn :: String -> Parser a -> Parser a
91    fn name args = token (string name) *> sym '(' *> args <* sym ')'
92
93-- | Atomic expressions
94expr0 :: Parser Expr
95expr0 =
96  (sym '(' *> expr <* sym ')') <|>
97  (IntLit <$> intP) <|>
98  (StringLit <$> strP) <|>
99  (ERE <$> patP) <|>
100  (NF <$ nfP) <|>
101  builtin
102
103-- | Arguments to unary operators: atomic expressions and field
104-- expressions
105expr1 :: Parser Expr
106expr1 = makeExprParser expr0
107  [ [ Prefix  (Field <$ sym '$') ] ]
108
109-- | Whether a parser is unary or non-unary.
110--
111-- This roughly corresponds to the @unary_expr@ and @non_unary_expr@
112-- non-terminals in the awk grammar.
113-- (Why roughly? See 'expr2'.)
114data Unary = Unary | NonUnary
115
116-- | Arithmetic expressions.
117--
118-- Unlike awk, non-unary expressions disallow unary operators everywhere,
119-- not just in the leading position, to avoid extra complexity in
120-- 'makeExprParser'.
121--
122-- For example, the expression
123--
124-- >1 3 + -4
125--
126-- is valid in awk because @3 + -4@ is non-unary, but we disallow it here
127-- because 'makeExprParser' does not allow us to distinguish it from
128--
129-- >1 -4 + 3
130--
131-- which is ambiguous.
132expr2 :: Unary -> Parser Expr
133expr2 unary = makeExprParser expr1
134  [ [ Prefix  (Not <$ sym '!') ] ++
135    (case unary of
136      Unary -> [ Prefix  (Neg <$ sym '-') ]
137      NonUnary -> []
138    )
139  , [ InfixL  (Add <$ sym '+')
140    , InfixL  (Sub <$ sym '-')
141    ]
142  ]
143
144-- | Expressions that may include string concatenation
145expr3 :: Parser Expr
146expr3 = concatExpr <|> expr2 Unary
147  where
148    -- The awk spec mandates that concatenation associates to the left.
149    -- But concatenation is associative, so why would we care.
150    concatExpr = Concat <$> nonUnary <*> (nonUnary <|> concatExpr)
151    nonUnary = expr2 NonUnary
152
153-- | Everything with lower precedence than concatenation
154expr4 :: Parser Expr
155expr4 = makeExprParser expr3
156  [ [ InfixN (LT <$ sym '<')
157    , InfixN (GT <$ sym '>')
158    , InfixN (LE <$ str "<=")
159    , InfixN (GE <$ str ">=")
160    , InfixN (EQ <$ str "==")
161    , InfixN (NE <$ str "!=")
162    ]
163  , [ Postfix (flip Match <$ sym '~' <*> patP)
164    , Postfix (flip NoMatch <$ str "!~" <*> patP)
165    ]
166  , [ InfixL (And <$ str "&&") ]
167  , [ InfixL (Or  <$ str "||") ]
168  , [ TernR  ((If <$ sym ':') <$ sym '?') ]
169  ]
170
171-- | The awk-like expression parser
172expr :: Parser Expr
173expr = expr4
174
175-- | Parse an awk expression
176parseAwkExpr :: String -> Maybe Expr
177parseAwkExpr s =
178  case runParser expr s of
179    Success e -> Just e
180    _ -> Nothing
181