1-- | A parser for gtk-doc formatted documentation, see 2-- https://developer.gnome.org/gtk-doc-manual/ for the spec. 3module Data.GI.CodeGen.GtkDoc 4 ( parseGtkDoc 5 , GtkDoc(..) 6 , Token(..) 7 , Language(..) 8 , Link(..) 9 , ListItem(..) 10 , CRef(..) 11 ) where 12 13import Prelude hiding (takeWhile) 14 15#if !MIN_VERSION_base(4,8,0) 16import Control.Applicative ((<$>), (<*)) 17#endif 18#if !MIN_VERSION_base(4,13,0) 19import Data.Monoid ((<>)) 20#endif 21import Control.Applicative ((<|>)) 22 23import Data.Attoparsec.Text 24import Data.Char (isAsciiUpper, isAsciiLower, isDigit) 25import qualified Data.Text as T 26import Data.Text (Text) 27 28-- | A parsed gtk-doc token. 29data Token = Literal Text 30 | Comment Text 31 | Verbatim Text 32 | CodeBlock (Maybe Language) Text 33 | ExternalLink Link 34 | Image Link 35 | List [ListItem] 36 | SectionHeader Int GtkDoc -- ^ A section header of the given depth. 37 | SymbolRef CRef 38 deriving (Show, Eq) 39 40-- | A link to a resource, either offline or a section of the documentation. 41data Link = Link { linkName :: Text 42 , linkAddress :: Text } 43 deriving (Show, Eq) 44 45-- | An item in a list, given by a list of lines (not including ending 46-- newlines). The list is always non-empty, so we represent it by the 47-- first line and then a possibly empty list with the rest of the lines. 48data ListItem = ListItem GtkDoc [GtkDoc] 49 deriving (Show, Eq) 50 51-- | The language for an embedded code block. 52newtype Language = Language Text 53 deriving (Show, Eq) 54 55-- | A reference to some symbol in the API. 56data CRef = FunctionRef Text 57 | ParamRef Text 58 | ConstantRef Text 59 | SignalRef Text Text 60 | LocalSignalRef Text 61 | PropertyRef Text Text 62 | VMethodRef Text Text 63 | StructFieldRef Text Text 64 | TypeRef Text 65 deriving (Show, Eq, Ord) 66 67-- | A parsed representation of gtk-doc formatted documentation. 68newtype GtkDoc = GtkDoc [Token] 69 deriving (Show, Eq) 70 71-- | Parse the given gtk-doc formatted documentation. 72-- 73-- === __Examples__ 74-- >>> parseGtkDoc "" 75-- GtkDoc [] 76-- 77-- >>> parseGtkDoc "func()" 78-- GtkDoc [SymbolRef (FunctionRef "func")] 79-- 80-- >>> parseGtkDoc "literal" 81-- GtkDoc [Literal "literal"] 82-- 83-- >>> parseGtkDoc "This is a long literal" 84-- GtkDoc [Literal "This is a long literal"] 85-- 86-- >>> parseGtkDoc "Call foo() for free cookies" 87-- GtkDoc [Literal "Call ",SymbolRef (FunctionRef "foo"),Literal " for free cookies"] 88-- 89-- >>> parseGtkDoc "The signal ::activate is related to gtk_button_activate()." 90-- GtkDoc [Literal "The signal ",SymbolRef (LocalSignalRef "activate"),Literal " is related to ",SymbolRef (FunctionRef "gtk_button_activate"),Literal "."] 91-- 92-- >>> parseGtkDoc "The signal ##%#GtkButton::activate is related to gtk_button_activate()." 93-- GtkDoc [Literal "The signal ##%",SymbolRef (SignalRef "GtkButton" "activate"),Literal " is related to ",SymbolRef (FunctionRef "gtk_button_activate"),Literal "."] 94-- 95-- >>> parseGtkDoc "# A section\n\n## and a subsection ##\n" 96-- GtkDoc [SectionHeader 1 (GtkDoc [Literal "A section"]),Literal "\n",SectionHeader 2 (GtkDoc [Literal "and a subsection "])] 97-- 98-- >>> parseGtkDoc "Compact list:\n- First item\n- Second item" 99-- GtkDoc [Literal "Compact list:\n",List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []]] 100-- 101-- >>> parseGtkDoc "Spaced list:\n\n- First item\n\n- Second item" 102-- GtkDoc [Literal "Spaced list:\n",List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []]] 103-- 104-- >>> parseGtkDoc "List with urls:\n- [test](http://test)\n- ![](image.png)" 105-- GtkDoc [Literal "List with urls:\n",List [ListItem (GtkDoc [ExternalLink (Link {linkName = "test", linkAddress = "http://test"})]) [],ListItem (GtkDoc [Image (Link {linkName = "", linkAddress = "image.png"})]) []]] 106parseGtkDoc :: Text -> GtkDoc 107parseGtkDoc raw = 108 case parseOnly (parseTokens <* endOfInput) raw of 109 Left e -> 110 error $ "gtk-doc parsing failed with error \"" <> e 111 <> "\" on the input \"" <> T.unpack raw <> "\"" 112 Right tks -> GtkDoc . coalesceLiterals 113 . restoreSHPreNewlines . restoreListPreNewline $ tks 114 115-- | `parseSectionHeader` eats the newline before the section header, 116-- but `parseInitialSectionHeader` does not, since it only matches at 117-- the beginning of the text. This restores the newlines eaten by 118-- `parseSectionHeader`, so a `SectionHeader` returned by the parser 119-- can always be assumed /not/ to have an implicit starting newline. 120restoreSHPreNewlines :: [Token] -> [Token] 121restoreSHPreNewlines [] = [] 122restoreSHPreNewlines (i : rest) = i : restoreNewlines rest 123 where restoreNewlines :: [Token] -> [Token] 124 restoreNewlines [] = [] 125 restoreNewlines (s@(SectionHeader _ _) : rest) = 126 Literal "\n" : s : restoreNewlines rest 127 restoreNewlines (x : rest) = x : restoreNewlines rest 128 129-- | `parseList` eats the newline before the list, restore it. 130restoreListPreNewline :: [Token] -> [Token] 131restoreListPreNewline [] = [] 132restoreListPreNewline (l@(List _) : rest) = 133 Literal "\n" : l : restoreListPreNewline rest 134restoreListPreNewline (x : rest) = x : restoreListPreNewline rest 135 136-- | Accumulate consecutive literals into a single literal. 137coalesceLiterals :: [Token] -> [Token] 138coalesceLiterals tks = go Nothing tks 139 where 140 go :: Maybe Text -> [Token] -> [Token] 141 go Nothing [] = [] 142 go (Just l) [] = [Literal l] 143 go Nothing (Literal l : rest) = go (Just l) rest 144 go (Just l) (Literal l' : rest) = go (Just (l <> l')) rest 145 go Nothing (tk : rest) = tk : go Nothing rest 146 go (Just l) (tk : rest) = Literal l : tk : go Nothing rest 147 148-- | Parser for tokens. 149parseTokens :: Parser [Token] 150parseTokens = headerAndTokens <|> justTokens 151 where -- In case the input starts by a section header. 152 headerAndTokens :: Parser [Token] 153 headerAndTokens = do 154 header <- parseInitialSectionHeader 155 tokens <- justTokens 156 return (header : tokens) 157 158 justTokens :: Parser [Token] 159 justTokens = many' parseToken 160 161-- | Parse a single token. 162-- 163-- === __Examples__ 164-- >>> parseOnly (parseToken <* endOfInput) "func()" 165-- Right (SymbolRef (FunctionRef "func")) 166parseToken :: Parser Token 167parseToken = -- Note that the parsers overlap, so this is not as 168 -- efficient as it could be (if we had combined parsers 169 -- and then branched, so that there is no 170 -- backtracking). But speed is not an issue here, so for 171 -- clarity we keep the parsers distinct. The exception 172 -- is parseFunctionRef, since it does not complicate the 173 -- parser much, and it is the main source of 174 -- backtracking. 175 parseFunctionRef 176 <|> parseSignal 177 <|> parseLocalSignal 178 <|> parseProperty 179 <|> parseVMethod 180 <|> parseStructField 181 <|> parseType 182 <|> parseConstant 183 <|> parseParam 184 <|> parseEscaped 185 <|> parseVerbatim 186 <|> parseCodeBlock 187 <|> parseUrl 188 <|> parseImage 189 <|> parseSectionHeader 190 <|> parseList 191 <|> parseComment 192 <|> parseBoringLiteral 193 194-- | Parse a signal name, of the form 195-- > #Object::signal 196-- 197-- === __Examples__ 198-- >>> parseOnly (parseSignal <* endOfInput) "#GtkButton::activate" 199-- Right (SymbolRef (SignalRef "GtkButton" "activate")) 200parseSignal :: Parser Token 201parseSignal = do 202 _ <- char '#' 203 obj <- parseCIdent 204 _ <- string "::" 205 signal <- signalOrPropName 206 return (SymbolRef (SignalRef obj signal)) 207 208-- | Parse a reference to a signal defined in the current module, of the form 209-- > ::signal 210-- 211-- === __Examples__ 212-- >>> parseOnly (parseLocalSignal <* endOfInput) "::activate" 213-- Right (SymbolRef (LocalSignalRef "activate")) 214parseLocalSignal :: Parser Token 215parseLocalSignal = do 216 _ <- string "::" 217 signal <- signalOrPropName 218 return (SymbolRef (LocalSignalRef signal)) 219 220-- | Parse a property name, of the form 221-- > #Object:property 222-- 223-- === __Examples__ 224-- >>> parseOnly (parseProperty <* endOfInput) "#GtkButton:always-show-image" 225-- Right (SymbolRef (PropertyRef "GtkButton" "always-show-image")) 226parseProperty :: Parser Token 227parseProperty = do 228 _ <- char '#' 229 obj <- parseCIdent 230 _ <- char ':' 231 property <- signalOrPropName 232 return (SymbolRef (PropertyRef obj property)) 233 234-- | Parse an xml comment, of the form 235-- > <!-- comment --> 236-- Note that this function keeps spaces. 237-- 238-- === __Examples__ 239-- >>> parseOnly (parseComment <* endOfInput) "<!-- comment -->" 240-- Right (Comment " comment ") 241parseComment :: Parser Token 242parseComment = do 243 comment <- string "<!--" *> manyTill anyChar (string "-->") 244 return (Comment $ T.pack comment) 245 246-- | Parse a reference to a virtual method, of the form 247-- > #Struct.method() 248-- 249-- === __Examples__ 250-- >>> parseOnly (parseVMethod <* endOfInput) "#Foo.bar()" 251-- Right (SymbolRef (VMethodRef "Foo" "bar")) 252parseVMethod :: Parser Token 253parseVMethod = do 254 _ <- char '#' 255 obj <- parseCIdent 256 _ <- char '.' 257 method <- parseCIdent 258 _ <- string "()" 259 return (SymbolRef (VMethodRef obj method)) 260 261-- | Parse a reference to a struct field, of the form 262-- > #Struct.field 263-- 264-- === __Examples__ 265-- >>> parseOnly (parseStructField <* endOfInput) "#Foo.bar" 266-- Right (SymbolRef (StructFieldRef "Foo" "bar")) 267parseStructField :: Parser Token 268parseStructField = do 269 _ <- char '#' 270 obj <- parseCIdent 271 _ <- char '.' 272 field <- parseCIdent 273 return (SymbolRef (StructFieldRef obj field)) 274 275-- | Parse a reference to a C type, of the form 276-- > #Type 277-- 278-- === __Examples__ 279-- >>> parseOnly (parseType <* endOfInput) "#Foo" 280-- Right (SymbolRef (TypeRef "Foo")) 281parseType :: Parser Token 282parseType = do 283 _ <- char '#' 284 obj <- parseCIdent 285 return (SymbolRef (TypeRef obj)) 286 287-- | Parse a constant, of the form 288-- > %CONSTANT_NAME 289-- 290-- === __Examples__ 291-- >>> parseOnly (parseConstant <* endOfInput) "%TEST_CONSTANT" 292-- Right (SymbolRef (ConstantRef "TEST_CONSTANT")) 293parseConstant :: Parser Token 294parseConstant = do 295 _ <- char '%' 296 c <- parseCIdent 297 return (SymbolRef (ConstantRef c)) 298 299-- | Parse a reference to a parameter, of the form 300-- > @param_name 301-- 302-- === __Examples__ 303-- >>> parseOnly (parseParam <* endOfInput) "@test_param" 304-- Right (SymbolRef (ParamRef "test_param")) 305parseParam :: Parser Token 306parseParam = do 307 _ <- char '@' 308 param <- parseCIdent 309 return (SymbolRef (ParamRef param)) 310 311-- | Whether the given character is valid in a C identifier. 312isCIdent :: Char -> Bool 313isCIdent '_' = True 314isCIdent c = isDigit c || isAsciiUpper c || isAsciiLower c 315 316-- | Name of a signal or property name. Similar to a C identifier, but 317-- hyphens are allowed too. 318signalOrPropName :: Parser Text 319signalOrPropName = takeWhile1 isSignalOrPropIdent 320 where isSignalOrPropIdent :: Char -> Bool 321 isSignalOrPropIdent '-' = True 322 isSignalOrPropIdent c = isCIdent c 323 324-- | Something that could be a valid C identifier (loosely speaking, 325-- we do not need to be too strict here). 326parseCIdent :: Parser Text 327parseCIdent = takeWhile1 isCIdent 328 329-- | Parse a function ref, given by a valid C identifier followed by 330-- '()', for instance 'gtk_widget_show()'. If the identifier is not 331-- followed by "()", return it as a literal instead. 332-- 333-- === __Examples__ 334-- >>> parseOnly (parseFunctionRef <* endOfInput) "test_func()" 335-- Right (SymbolRef (FunctionRef "test_func")) 336-- 337-- >>> parseOnly (parseFunctionRef <* endOfInput) "not_a_func" 338-- Right (Literal "not_a_func") 339parseFunctionRef :: Parser Token 340parseFunctionRef = do 341 ident <- parseCIdent 342 option (Literal ident) (string "()" >> 343 return (SymbolRef (FunctionRef ident))) 344 345-- | Parse a escaped special character, i.e. one preceded by '\'. 346parseEscaped :: Parser Token 347parseEscaped = do 348 _ <- char '\\' 349 c <- satisfy (`elem` ("#@%\\`" :: [Char])) 350 return $ Literal (T.singleton c) 351 352-- | Parse a literal, i.e. anything without a known special 353-- meaning. Note that this parser always consumes the first character, 354-- regardless of what it is. 355parseBoringLiteral :: Parser Token 356parseBoringLiteral = do 357 c <- anyChar 358 boring <- takeWhile (not . special) 359 return $ Literal (T.cons c boring) 360 361-- | List of special characters from the point of view of the parser 362-- (in the sense that they may be the beginning of something with a 363-- special interpretation). 364special :: Char -> Bool 365special '#' = True 366special '@' = True 367special '%' = True 368special '\\' = True 369special '`' = True 370special '|' = True 371special '[' = True 372special '!' = True 373special '\n' = True 374special ':' = True 375special c = isCIdent c 376 377-- | Parse a verbatim string, of the form 378-- > `verbatim text` 379-- 380-- === __Examples__ 381-- >>> parseOnly (parseVerbatim <* endOfInput) "`Example quote!`" 382-- Right (Verbatim "Example quote!") 383parseVerbatim :: Parser Token 384parseVerbatim = do 385 _ <- char '`' 386 v <- takeWhile1 (/= '`') 387 _ <- char '`' 388 return $ Verbatim v 389 390-- | Parse a URL in Markdown syntax, of the form 391-- > [name](url) 392-- 393-- === __Examples__ 394-- >>> parseOnly (parseUrl <* endOfInput) "[haskell](http://haskell.org)" 395-- Right (ExternalLink (Link {linkName = "haskell", linkAddress = "http://haskell.org"})) 396parseUrl :: Parser Token 397parseUrl = do 398 _ <- char '[' 399 name <- takeWhile1 (/= ']') 400 _ <- string "](" 401 address <- takeWhile1 (/= ')') 402 _ <- char ')' 403 return $ ExternalLink $ Link {linkName = name, linkAddress = address} 404 405-- | Parse an image reference, of the form 406-- > ![label](url) 407-- 408-- === __Examples__ 409-- >>> parseOnly (parseImage <* endOfInput) "![](diagram.png)" 410-- Right (Image (Link {linkName = "", linkAddress = "diagram.png"})) 411parseImage :: Parser Token 412parseImage = do 413 _ <- string "![" 414 name <- takeWhile (/= ']') 415 _ <- string "](" 416 address <- takeWhile1 (/= ')') 417 _ <- char ')' 418 return $ Image $ Link {linkName = name, linkAddress = address} 419 420-- | Parse a code block embedded in the documentation. 421parseCodeBlock :: Parser Token 422parseCodeBlock = do 423 _ <- string "|[" 424 lang <- (Just <$> parseLanguage) <|> return Nothing 425 code <- T.pack <$> manyTill anyChar (string "]|") 426 return $ CodeBlock lang code 427 428-- | Parse the language of a code block, specified as a comment. 429parseLanguage :: Parser Language 430parseLanguage = do 431 _ <- string "<!--" 432 skipSpace 433 _ <- string "language=\"" 434 lang <- takeWhile1 (/= '"') 435 _ <- char '"' 436 skipSpace 437 _ <- string "-->" 438 return $ Language lang 439 440-- | Parse a section header, given by a number of hash symbols, and 441-- then ordinary text. Note that this parser "eats" the newline before 442-- and after the section header. 443parseSectionHeader :: Parser Token 444parseSectionHeader = char '\n' >> parseInitialSectionHeader 445 446-- | Parse a section header at the beginning of the text. I.e. this is 447-- the same as `parseSectionHeader`, but we do not expect a newline as 448-- a first character. 449-- 450-- === __Examples__ 451-- >>> parseOnly (parseInitialSectionHeader <* endOfInput) "### Hello! ###\n" 452-- Right (SectionHeader 3 (GtkDoc [Literal "Hello! "])) 453-- 454-- >>> parseOnly (parseInitialSectionHeader <* endOfInput) "# Hello!\n" 455-- Right (SectionHeader 1 (GtkDoc [Literal "Hello!"])) 456parseInitialSectionHeader :: Parser Token 457parseInitialSectionHeader = do 458 hashes <- takeWhile1 (== '#') 459 _ <- many1 space 460 heading <- takeWhile1 (notInClass "#\n") 461 _ <- (string hashes >> char '\n') <|> (char '\n') 462 return $ SectionHeader (T.length hashes) (parseGtkDoc heading) 463 464-- | Parse a list header. Note that the newline before the start of 465-- the list is "eaten" by this parser, but is restored later by 466-- `parseGtkDoc`. 467-- 468-- === __Examples__ 469-- >>> parseOnly (parseList <* endOfInput) "\n- First item\n- Second item" 470-- Right (List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []]) 471-- 472-- >>> parseOnly (parseList <* endOfInput) "\n\n- Two line\n item\n\n- Second item,\n also two lines" 473-- Right (List [ListItem (GtkDoc [Literal "Two line"]) [GtkDoc [Literal "item"]],ListItem (GtkDoc [Literal "Second item,"]) [GtkDoc [Literal "also two lines"]]]) 474parseList :: Parser Token 475parseList = do 476 items <- many1 parseListItem 477 return $ List items 478 where parseListItem :: Parser ListItem 479 parseListItem = do 480 _ <- char '\n' 481 _ <- string "\n- " <|> string "- " 482 first <- takeWhile1 (/= '\n') 483 rest <- many' parseLine 484 return $ ListItem (parseGtkDoc first) (map parseGtkDoc rest) 485 486 parseLine :: Parser Text 487 parseLine = string "\n " >> takeWhile1 (/= '\n') 488