1-- |
2-- Module      :  Text.Megaparsec.Char
3-- Copyright   :  © 2015–2019 Megaparsec contributors
4--                © 2007 Paolo Martini
5--                © 1999–2001 Daan Leijen
6-- License     :  FreeBSD
7--
8-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
9-- Stability   :  experimental
10-- Portability :  non-portable
11--
12-- Commonly used character parsers.
13
14{-# LANGUAGE FlexibleContexts    #-}
15{-# LANGUAGE LambdaCase          #-}
16{-# LANGUAGE ScopedTypeVariables #-}
17{-# LANGUAGE TypeFamilies        #-}
18
19module Text.Megaparsec.Char
20  ( -- * Simple parsers
21    newline
22  , crlf
23  , eol
24  , tab
25  , space
26  , space1
27    -- * Categories of characters
28  , controlChar
29  , spaceChar
30  , upperChar
31  , lowerChar
32  , letterChar
33  , alphaNumChar
34  , printChar
35  , digitChar
36  , binDigitChar
37  , octDigitChar
38  , hexDigitChar
39  , markChar
40  , numberChar
41  , punctuationChar
42  , symbolChar
43  , separatorChar
44  , asciiChar
45  , latin1Char
46  , charCategory
47  , categoryName
48    -- * Single character
49  , char
50  , char'
51    -- * Sequence of characters
52  , string
53  , string' )
54where
55
56import Control.Applicative
57import Data.Char
58import Data.Functor (void)
59import Data.Proxy
60import Text.Megaparsec
61import Text.Megaparsec.Common
62
63----------------------------------------------------------------------------
64-- Simple parsers
65
66-- | Parse a newline character.
67
68newline :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
69newline = char '\n'
70{-# INLINE newline #-}
71
72-- | Parse a carriage return character followed by a newline character.
73-- Return the sequence of characters parsed.
74
75crlf :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
76crlf = string (tokensToChunk (Proxy :: Proxy s) "\r\n")
77{-# INLINE crlf #-}
78
79-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the
80-- sequence of characters parsed.
81
82eol :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s)
83eol = (tokenToChunk (Proxy :: Proxy s) <$> newline)
84  <|> crlf
85  <?> "end of line"
86{-# INLINE eol #-}
87
88-- | Parse a tab character.
89
90tab :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
91tab = char '\t'
92{-# INLINE tab #-}
93
94-- | Skip /zero/ or more white space characters.
95--
96-- See also: 'skipMany' and 'spaceChar'.
97
98space :: (MonadParsec e s m, Token s ~ Char) => m ()
99space = void $ takeWhileP (Just "white space") isSpace
100{-# INLINE space #-}
101
102-- | Skip /one/ or more white space characters.
103--
104-- See also: 'skipSome' and 'spaceChar'.
105--
106-- @since 6.0.0
107
108space1 :: (MonadParsec e s m, Token s ~ Char) => m ()
109space1 = void $ takeWhile1P (Just "white space") isSpace
110{-# INLINE space1 #-}
111
112----------------------------------------------------------------------------
113-- Categories of characters
114
115-- | Parse a control character (a non-printing character of the Latin-1
116-- subset of Unicode).
117
118controlChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
119controlChar = satisfy isControl <?> "control character"
120{-# INLINE controlChar #-}
121
122-- | Parse a Unicode space character, and the control characters: tab,
123-- newline, carriage return, form feed, and vertical tab.
124
125spaceChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
126spaceChar = satisfy isSpace <?> "white space"
127{-# INLINE spaceChar #-}
128
129-- | Parse an upper-case or title-case alphabetic Unicode character. Title
130-- case is used by a small number of letter ligatures like the
131-- single-character form of Lj.
132
133upperChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
134upperChar = satisfy isUpper <?> "uppercase letter"
135{-# INLINE upperChar #-}
136
137-- | Parse a lower-case alphabetic Unicode character.
138
139lowerChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
140lowerChar = satisfy isLower <?> "lowercase letter"
141{-# INLINE lowerChar #-}
142
143-- | Parse an alphabetic Unicode character: lower-case, upper-case, or
144-- title-case letter, or a letter of case-less scripts\/modifier letter.
145
146letterChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
147letterChar = satisfy isLetter <?> "letter"
148{-# INLINE letterChar #-}
149
150-- | Parse an alphabetic or numeric digit Unicode characters.
151--
152-- Note that the numeric digits outside the ASCII range are parsed by this
153-- parser but not by 'digitChar'. Such digits may be part of identifiers but
154-- are not used by the printer and reader to represent numbers.
155
156alphaNumChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
157alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character"
158{-# INLINE alphaNumChar #-}
159
160-- | Parse a printable Unicode character: letter, number, mark, punctuation,
161-- symbol or space.
162
163printChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
164printChar = satisfy isPrint <?> "printable character"
165{-# INLINE printChar #-}
166
167-- | Parse an ASCII digit, i.e between “0” and “9”.
168
169digitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
170digitChar = satisfy isDigit <?> "digit"
171{-# INLINE digitChar #-}
172
173-- | Parse a binary digit, i.e. "0" or "1".
174--
175-- @since 7.0.0
176
177binDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
178binDigitChar = satisfy isBinDigit <?> "binary digit"
179  where
180    isBinDigit x = x == '0' || x == '1'
181{-# INLINE binDigitChar #-}
182
183-- | Parse an octal digit, i.e. between “0” and “7”.
184
185octDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
186octDigitChar = satisfy isOctDigit <?> "octal digit"
187{-# INLINE octDigitChar #-}
188
189-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or
190-- “A” and “F”.
191
192hexDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
193hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit"
194{-# INLINE hexDigitChar #-}
195
196-- | Parse a Unicode mark character (accents and the like), which combines
197-- with preceding characters.
198
199markChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
200markChar = satisfy isMark <?> "mark character"
201{-# INLINE markChar #-}
202
203-- | Parse a Unicode numeric character, including digits from various
204-- scripts, Roman numerals, etc.
205
206numberChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
207numberChar = satisfy isNumber <?> "numeric character"
208{-# INLINE numberChar #-}
209
210-- | Parse a Unicode punctuation character, including various kinds of
211-- connectors, brackets and quotes.
212
213punctuationChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
214punctuationChar = satisfy isPunctuation <?> "punctuation"
215{-# INLINE punctuationChar #-}
216
217-- | Parse a Unicode symbol characters, including mathematical and currency
218-- symbols.
219
220symbolChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
221symbolChar = satisfy isSymbol <?> "symbol"
222{-# INLINE symbolChar #-}
223
224-- | Parse a Unicode space and separator characters.
225
226separatorChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
227separatorChar = satisfy isSeparator <?> "separator"
228{-# INLINE separatorChar #-}
229
230-- | Parse a character from the first 128 characters of the Unicode
231-- character set, corresponding to the ASCII character set.
232
233asciiChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
234asciiChar = satisfy isAscii <?> "ASCII character"
235{-# INLINE asciiChar #-}
236
237-- | Parse a character from the first 256 characters of the Unicode
238-- character set, corresponding to the ISO 8859-1 (Latin-1) character set.
239
240latin1Char :: (MonadParsec e s m, Token s ~ Char) => m (Token s)
241latin1Char = satisfy isLatin1 <?> "Latin-1 character"
242{-# INLINE latin1Char #-}
243
244-- | @'charCategory' cat@ parses character in Unicode General Category
245-- @cat@, see 'Data.Char.GeneralCategory'.
246
247charCategory :: (MonadParsec e s m, Token s ~ Char)
248  => GeneralCategory
249  -> m (Token s)
250charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat
251{-# INLINE charCategory #-}
252
253-- | Return the human-readable name of Unicode General Category.
254
255categoryName :: GeneralCategory -> String
256categoryName = \case
257  UppercaseLetter      -> "uppercase letter"
258  LowercaseLetter      -> "lowercase letter"
259  TitlecaseLetter      -> "titlecase letter"
260  ModifierLetter       -> "modifier letter"
261  OtherLetter          -> "other letter"
262  NonSpacingMark       -> "non-spacing mark"
263  SpacingCombiningMark -> "spacing combining mark"
264  EnclosingMark        -> "enclosing mark"
265  DecimalNumber        -> "decimal number character"
266  LetterNumber         -> "letter number character"
267  OtherNumber          -> "other number character"
268  ConnectorPunctuation -> "connector punctuation"
269  DashPunctuation      -> "dash punctuation"
270  OpenPunctuation      -> "open punctuation"
271  ClosePunctuation     -> "close punctuation"
272  InitialQuote         -> "initial quote"
273  FinalQuote           -> "final quote"
274  OtherPunctuation     -> "other punctuation"
275  MathSymbol           -> "math symbol"
276  CurrencySymbol       -> "currency symbol"
277  ModifierSymbol       -> "modifier symbol"
278  OtherSymbol          -> "other symbol"
279  Space                -> "white space"
280  LineSeparator        -> "line separator"
281  ParagraphSeparator   -> "paragraph separator"
282  Control              -> "control character"
283  Format               -> "format character"
284  Surrogate            -> "surrogate character"
285  PrivateUse           -> "private-use Unicode character"
286  NotAssigned          -> "non-assigned Unicode character"
287
288----------------------------------------------------------------------------
289-- Single character
290
291-- | A type-constrained version of 'single'.
292--
293-- > semicolon = char ';'
294
295char :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
296char = single
297{-# INLINE char #-}
298
299-- | The same as 'char' but case-insensitive. This parser returns the
300-- actually parsed character preserving its case.
301--
302-- >>> parseTest (char' 'e') "E"
303-- 'E'
304-- >>> parseTest (char' 'e') "G"
305-- 1:1:
306-- unexpected 'G'
307-- expecting 'E' or 'e'
308
309char' :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s)
310char' c = choice
311  [ char (toLower c)
312  , char (toUpper c)
313  , char (toTitle c)
314  ]
315{-# INLINE char' #-}
316