1-- | 2-- Module : Text.Megaparsec.Char 3-- Copyright : © 2015–2019 Megaparsec contributors 4-- © 2007 Paolo Martini 5-- © 1999–2001 Daan Leijen 6-- License : FreeBSD 7-- 8-- Maintainer : Mark Karpov <markkarpov92@gmail.com> 9-- Stability : experimental 10-- Portability : non-portable 11-- 12-- Commonly used character parsers. 13 14{-# LANGUAGE FlexibleContexts #-} 15{-# LANGUAGE LambdaCase #-} 16{-# LANGUAGE ScopedTypeVariables #-} 17{-# LANGUAGE TypeFamilies #-} 18 19module Text.Megaparsec.Char 20 ( -- * Simple parsers 21 newline 22 , crlf 23 , eol 24 , tab 25 , space 26 , space1 27 -- * Categories of characters 28 , controlChar 29 , spaceChar 30 , upperChar 31 , lowerChar 32 , letterChar 33 , alphaNumChar 34 , printChar 35 , digitChar 36 , binDigitChar 37 , octDigitChar 38 , hexDigitChar 39 , markChar 40 , numberChar 41 , punctuationChar 42 , symbolChar 43 , separatorChar 44 , asciiChar 45 , latin1Char 46 , charCategory 47 , categoryName 48 -- * Single character 49 , char 50 , char' 51 -- * Sequence of characters 52 , string 53 , string' ) 54where 55 56import Control.Applicative 57import Data.Char 58import Data.Functor (void) 59import Data.Proxy 60import Text.Megaparsec 61import Text.Megaparsec.Common 62 63---------------------------------------------------------------------------- 64-- Simple parsers 65 66-- | Parse a newline character. 67 68newline :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 69newline = char '\n' 70{-# INLINE newline #-} 71 72-- | Parse a carriage return character followed by a newline character. 73-- Return the sequence of characters parsed. 74 75crlf :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s) 76crlf = string (tokensToChunk (Proxy :: Proxy s) "\r\n") 77{-# INLINE crlf #-} 78 79-- | Parse a CRLF (see 'crlf') or LF (see 'newline') end of line. Return the 80-- sequence of characters parsed. 81 82eol :: forall e s m. (MonadParsec e s m, Token s ~ Char) => m (Tokens s) 83eol = (tokenToChunk (Proxy :: Proxy s) <$> newline) 84 <|> crlf 85 <?> "end of line" 86{-# INLINE eol #-} 87 88-- | Parse a tab character. 89 90tab :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 91tab = char '\t' 92{-# INLINE tab #-} 93 94-- | Skip /zero/ or more white space characters. 95-- 96-- See also: 'skipMany' and 'spaceChar'. 97 98space :: (MonadParsec e s m, Token s ~ Char) => m () 99space = void $ takeWhileP (Just "white space") isSpace 100{-# INLINE space #-} 101 102-- | Skip /one/ or more white space characters. 103-- 104-- See also: 'skipSome' and 'spaceChar'. 105-- 106-- @since 6.0.0 107 108space1 :: (MonadParsec e s m, Token s ~ Char) => m () 109space1 = void $ takeWhile1P (Just "white space") isSpace 110{-# INLINE space1 #-} 111 112---------------------------------------------------------------------------- 113-- Categories of characters 114 115-- | Parse a control character (a non-printing character of the Latin-1 116-- subset of Unicode). 117 118controlChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 119controlChar = satisfy isControl <?> "control character" 120{-# INLINE controlChar #-} 121 122-- | Parse a Unicode space character, and the control characters: tab, 123-- newline, carriage return, form feed, and vertical tab. 124 125spaceChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 126spaceChar = satisfy isSpace <?> "white space" 127{-# INLINE spaceChar #-} 128 129-- | Parse an upper-case or title-case alphabetic Unicode character. Title 130-- case is used by a small number of letter ligatures like the 131-- single-character form of Lj. 132 133upperChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 134upperChar = satisfy isUpper <?> "uppercase letter" 135{-# INLINE upperChar #-} 136 137-- | Parse a lower-case alphabetic Unicode character. 138 139lowerChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 140lowerChar = satisfy isLower <?> "lowercase letter" 141{-# INLINE lowerChar #-} 142 143-- | Parse an alphabetic Unicode character: lower-case, upper-case, or 144-- title-case letter, or a letter of case-less scripts\/modifier letter. 145 146letterChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 147letterChar = satisfy isLetter <?> "letter" 148{-# INLINE letterChar #-} 149 150-- | Parse an alphabetic or numeric digit Unicode characters. 151-- 152-- Note that the numeric digits outside the ASCII range are parsed by this 153-- parser but not by 'digitChar'. Such digits may be part of identifiers but 154-- are not used by the printer and reader to represent numbers. 155 156alphaNumChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 157alphaNumChar = satisfy isAlphaNum <?> "alphanumeric character" 158{-# INLINE alphaNumChar #-} 159 160-- | Parse a printable Unicode character: letter, number, mark, punctuation, 161-- symbol or space. 162 163printChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 164printChar = satisfy isPrint <?> "printable character" 165{-# INLINE printChar #-} 166 167-- | Parse an ASCII digit, i.e between “0” and “9”. 168 169digitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 170digitChar = satisfy isDigit <?> "digit" 171{-# INLINE digitChar #-} 172 173-- | Parse a binary digit, i.e. "0" or "1". 174-- 175-- @since 7.0.0 176 177binDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 178binDigitChar = satisfy isBinDigit <?> "binary digit" 179 where 180 isBinDigit x = x == '0' || x == '1' 181{-# INLINE binDigitChar #-} 182 183-- | Parse an octal digit, i.e. between “0” and “7”. 184 185octDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 186octDigitChar = satisfy isOctDigit <?> "octal digit" 187{-# INLINE octDigitChar #-} 188 189-- | Parse a hexadecimal digit, i.e. between “0” and “9”, or “a” and “f”, or 190-- “A” and “F”. 191 192hexDigitChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 193hexDigitChar = satisfy isHexDigit <?> "hexadecimal digit" 194{-# INLINE hexDigitChar #-} 195 196-- | Parse a Unicode mark character (accents and the like), which combines 197-- with preceding characters. 198 199markChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 200markChar = satisfy isMark <?> "mark character" 201{-# INLINE markChar #-} 202 203-- | Parse a Unicode numeric character, including digits from various 204-- scripts, Roman numerals, etc. 205 206numberChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 207numberChar = satisfy isNumber <?> "numeric character" 208{-# INLINE numberChar #-} 209 210-- | Parse a Unicode punctuation character, including various kinds of 211-- connectors, brackets and quotes. 212 213punctuationChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 214punctuationChar = satisfy isPunctuation <?> "punctuation" 215{-# INLINE punctuationChar #-} 216 217-- | Parse a Unicode symbol characters, including mathematical and currency 218-- symbols. 219 220symbolChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 221symbolChar = satisfy isSymbol <?> "symbol" 222{-# INLINE symbolChar #-} 223 224-- | Parse a Unicode space and separator characters. 225 226separatorChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 227separatorChar = satisfy isSeparator <?> "separator" 228{-# INLINE separatorChar #-} 229 230-- | Parse a character from the first 128 characters of the Unicode 231-- character set, corresponding to the ASCII character set. 232 233asciiChar :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 234asciiChar = satisfy isAscii <?> "ASCII character" 235{-# INLINE asciiChar #-} 236 237-- | Parse a character from the first 256 characters of the Unicode 238-- character set, corresponding to the ISO 8859-1 (Latin-1) character set. 239 240latin1Char :: (MonadParsec e s m, Token s ~ Char) => m (Token s) 241latin1Char = satisfy isLatin1 <?> "Latin-1 character" 242{-# INLINE latin1Char #-} 243 244-- | @'charCategory' cat@ parses character in Unicode General Category 245-- @cat@, see 'Data.Char.GeneralCategory'. 246 247charCategory :: (MonadParsec e s m, Token s ~ Char) 248 => GeneralCategory 249 -> m (Token s) 250charCategory cat = satisfy ((== cat) . generalCategory) <?> categoryName cat 251{-# INLINE charCategory #-} 252 253-- | Return the human-readable name of Unicode General Category. 254 255categoryName :: GeneralCategory -> String 256categoryName = \case 257 UppercaseLetter -> "uppercase letter" 258 LowercaseLetter -> "lowercase letter" 259 TitlecaseLetter -> "titlecase letter" 260 ModifierLetter -> "modifier letter" 261 OtherLetter -> "other letter" 262 NonSpacingMark -> "non-spacing mark" 263 SpacingCombiningMark -> "spacing combining mark" 264 EnclosingMark -> "enclosing mark" 265 DecimalNumber -> "decimal number character" 266 LetterNumber -> "letter number character" 267 OtherNumber -> "other number character" 268 ConnectorPunctuation -> "connector punctuation" 269 DashPunctuation -> "dash punctuation" 270 OpenPunctuation -> "open punctuation" 271 ClosePunctuation -> "close punctuation" 272 InitialQuote -> "initial quote" 273 FinalQuote -> "final quote" 274 OtherPunctuation -> "other punctuation" 275 MathSymbol -> "math symbol" 276 CurrencySymbol -> "currency symbol" 277 ModifierSymbol -> "modifier symbol" 278 OtherSymbol -> "other symbol" 279 Space -> "white space" 280 LineSeparator -> "line separator" 281 ParagraphSeparator -> "paragraph separator" 282 Control -> "control character" 283 Format -> "format character" 284 Surrogate -> "surrogate character" 285 PrivateUse -> "private-use Unicode character" 286 NotAssigned -> "non-assigned Unicode character" 287 288---------------------------------------------------------------------------- 289-- Single character 290 291-- | A type-constrained version of 'single'. 292-- 293-- > semicolon = char ';' 294 295char :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s) 296char = single 297{-# INLINE char #-} 298 299-- | The same as 'char' but case-insensitive. This parser returns the 300-- actually parsed character preserving its case. 301-- 302-- >>> parseTest (char' 'e') "E" 303-- 'E' 304-- >>> parseTest (char' 'e') "G" 305-- 1:1: 306-- unexpected 'G' 307-- expecting 'E' or 'e' 308 309char' :: (MonadParsec e s m, Token s ~ Char) => Token s -> m (Token s) 310char' c = choice 311 [ char (toLower c) 312 , char (toUpper c) 313 , char (toTitle c) 314 ] 315{-# INLINE char' #-} 316