1-- | A parser for gtk-doc formatted documentation, see
2-- https://developer.gnome.org/gtk-doc-manual/ for the spec.
3module Data.GI.CodeGen.GtkDoc
4  ( parseGtkDoc
5  , GtkDoc(..)
6  , Token(..)
7  , Language(..)
8  , Link(..)
9  , ListItem(..)
10  , CRef(..)
11  ) where
12
13import Prelude hiding (takeWhile)
14
15#if !MIN_VERSION_base(4,8,0)
16import Control.Applicative ((<$>), (<*))
17#endif
18#if !MIN_VERSION_base(4,13,0)
19import Data.Monoid ((<>))
20#endif
21import Control.Applicative ((<|>))
22
23import Data.Attoparsec.Text
24import Data.Char (isAsciiUpper, isAsciiLower, isDigit)
25import qualified Data.Text as T
26import Data.Text (Text)
27
28-- | A parsed gtk-doc token.
29data Token = Literal Text
30           | Comment Text
31           | Verbatim Text
32           | CodeBlock (Maybe Language) Text
33           | ExternalLink Link
34           | Image Link
35           | List [ListItem]
36           | SectionHeader Int GtkDoc -- ^ A section header of the given depth.
37           | SymbolRef CRef
38  deriving (Show, Eq)
39
40-- | A link to a resource, either offline or a section of the documentation.
41data Link = Link { linkName :: Text
42                 , linkAddress :: Text }
43  deriving (Show, Eq)
44
45-- | An item in a list, given by a list of lines (not including ending
46-- newlines). The list is always non-empty, so we represent it by the
47-- first line and then a possibly empty list with the rest of the lines.
48data ListItem = ListItem GtkDoc [GtkDoc]
49  deriving (Show, Eq)
50
51-- | The language for an embedded code block.
52newtype Language = Language Text
53  deriving (Show, Eq)
54
55-- | A reference to some symbol in the API.
56data CRef = FunctionRef Text
57          | ParamRef Text
58          | ConstantRef Text
59          | SignalRef Text Text
60          | LocalSignalRef Text
61          | PropertyRef Text Text
62          | VMethodRef Text Text
63          | StructFieldRef Text Text
64          | TypeRef Text
65  deriving (Show, Eq, Ord)
66
67-- | A parsed representation of gtk-doc formatted documentation.
68newtype GtkDoc = GtkDoc [Token]
69  deriving (Show, Eq)
70
71-- | Parse the given gtk-doc formatted documentation.
72--
73-- === __Examples__
74-- >>> parseGtkDoc ""
75-- GtkDoc []
76--
77-- >>> parseGtkDoc "func()"
78-- GtkDoc [SymbolRef (FunctionRef "func")]
79--
80-- >>> parseGtkDoc "literal"
81-- GtkDoc [Literal "literal"]
82--
83-- >>> parseGtkDoc "This is a long literal"
84-- GtkDoc [Literal "This is a long literal"]
85--
86-- >>> parseGtkDoc "Call foo() for free cookies"
87-- GtkDoc [Literal "Call ",SymbolRef (FunctionRef "foo"),Literal " for free cookies"]
88--
89-- >>> parseGtkDoc "The signal ::activate is related to gtk_button_activate()."
90-- GtkDoc [Literal "The signal ",SymbolRef (LocalSignalRef "activate"),Literal " is related to ",SymbolRef (FunctionRef "gtk_button_activate"),Literal "."]
91--
92-- >>> parseGtkDoc "The signal ##%#GtkButton::activate is related to gtk_button_activate()."
93-- GtkDoc [Literal "The signal ##%",SymbolRef (SignalRef "GtkButton" "activate"),Literal " is related to ",SymbolRef (FunctionRef "gtk_button_activate"),Literal "."]
94--
95-- >>> parseGtkDoc "# A section\n\n## and a subsection ##\n"
96-- GtkDoc [SectionHeader 1 (GtkDoc [Literal "A section"]),Literal "\n",SectionHeader 2 (GtkDoc [Literal "and a subsection "])]
97--
98-- >>> parseGtkDoc "Compact list:\n- First item\n- Second item"
99-- GtkDoc [Literal "Compact list:\n",List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []]]
100--
101-- >>> parseGtkDoc "Spaced list:\n\n- First item\n\n- Second item"
102-- GtkDoc [Literal "Spaced list:\n",List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []]]
103--
104-- >>> parseGtkDoc "List with urls:\n- [test](http://test)\n- ![](image.png)"
105-- GtkDoc [Literal "List with urls:\n",List [ListItem (GtkDoc [ExternalLink (Link {linkName = "test", linkAddress = "http://test"})]) [],ListItem (GtkDoc [Image (Link {linkName = "", linkAddress = "image.png"})]) []]]
106parseGtkDoc :: Text -> GtkDoc
107parseGtkDoc raw =
108  case parseOnly (parseTokens <* endOfInput) raw of
109    Left e ->
110      error $ "gtk-doc parsing failed with error \"" <> e
111      <> "\" on the input \"" <> T.unpack raw <> "\""
112    Right tks -> GtkDoc . coalesceLiterals
113                 . restoreSHPreNewlines . restoreListPreNewline $ tks
114
115-- | `parseSectionHeader` eats the newline before the section header,
116-- but `parseInitialSectionHeader` does not, since it only matches at
117-- the beginning of the text. This restores the newlines eaten by
118-- `parseSectionHeader`, so a `SectionHeader` returned by the parser
119-- can always be assumed /not/ to have an implicit starting newline.
120restoreSHPreNewlines :: [Token] -> [Token]
121restoreSHPreNewlines [] = []
122restoreSHPreNewlines (i : rest) = i : restoreNewlines rest
123  where restoreNewlines :: [Token] -> [Token]
124        restoreNewlines [] = []
125        restoreNewlines (s@(SectionHeader _ _) : rest) =
126          Literal "\n" : s : restoreNewlines rest
127        restoreNewlines (x : rest) = x : restoreNewlines rest
128
129-- | `parseList` eats the newline before the list, restore it.
130restoreListPreNewline :: [Token] -> [Token]
131restoreListPreNewline [] = []
132restoreListPreNewline (l@(List _) : rest) =
133  Literal "\n" : l : restoreListPreNewline rest
134restoreListPreNewline (x : rest) = x : restoreListPreNewline rest
135
136-- | Accumulate consecutive literals into a single literal.
137coalesceLiterals :: [Token] -> [Token]
138coalesceLiterals tks = go Nothing tks
139  where
140    go :: Maybe Text -> [Token] -> [Token]
141    go Nothing  [] = []
142    go (Just l) [] = [Literal l]
143    go Nothing (Literal l : rest) = go (Just l) rest
144    go (Just l) (Literal l' : rest) = go (Just (l <> l')) rest
145    go Nothing (tk : rest) = tk : go Nothing rest
146    go (Just l) (tk : rest) = Literal l : tk : go Nothing rest
147
148-- | Parser for tokens.
149parseTokens :: Parser [Token]
150parseTokens = headerAndTokens <|> justTokens
151  where -- In case the input starts by a section header.
152        headerAndTokens :: Parser [Token]
153        headerAndTokens = do
154          header <- parseInitialSectionHeader
155          tokens <- justTokens
156          return (header : tokens)
157
158        justTokens :: Parser [Token]
159        justTokens = many' parseToken
160
161-- | Parse a single token.
162--
163-- === __Examples__
164-- >>> parseOnly (parseToken <* endOfInput) "func()"
165-- Right (SymbolRef (FunctionRef "func"))
166parseToken :: Parser Token
167parseToken = -- Note that the parsers overlap, so this is not as
168             -- efficient as it could be (if we had combined parsers
169             -- and then branched, so that there is no
170             -- backtracking). But speed is not an issue here, so for
171             -- clarity we keep the parsers distinct. The exception
172             -- is parseFunctionRef, since it does not complicate the
173             -- parser much, and it is the main source of
174             -- backtracking.
175                 parseFunctionRef
176             <|> parseSignal
177             <|> parseLocalSignal
178             <|> parseProperty
179             <|> parseVMethod
180             <|> parseStructField
181             <|> parseType
182             <|> parseConstant
183             <|> parseParam
184             <|> parseEscaped
185             <|> parseVerbatim
186             <|> parseCodeBlock
187             <|> parseUrl
188             <|> parseImage
189             <|> parseSectionHeader
190             <|> parseList
191             <|> parseComment
192             <|> parseBoringLiteral
193
194-- | Parse a signal name, of the form
195-- > #Object::signal
196--
197-- === __Examples__
198-- >>> parseOnly (parseSignal <* endOfInput) "#GtkButton::activate"
199-- Right (SymbolRef (SignalRef "GtkButton" "activate"))
200parseSignal :: Parser Token
201parseSignal = do
202  _ <- char '#'
203  obj <- parseCIdent
204  _ <- string "::"
205  signal <- signalOrPropName
206  return (SymbolRef (SignalRef obj signal))
207
208-- | Parse a reference to a signal defined in the current module, of the form
209-- > ::signal
210--
211-- === __Examples__
212-- >>> parseOnly (parseLocalSignal <* endOfInput) "::activate"
213-- Right (SymbolRef (LocalSignalRef "activate"))
214parseLocalSignal :: Parser Token
215parseLocalSignal = do
216  _ <- string "::"
217  signal <- signalOrPropName
218  return (SymbolRef (LocalSignalRef signal))
219
220-- | Parse a property name, of the form
221-- > #Object:property
222--
223-- === __Examples__
224-- >>> parseOnly (parseProperty <* endOfInput) "#GtkButton:always-show-image"
225-- Right (SymbolRef (PropertyRef "GtkButton" "always-show-image"))
226parseProperty :: Parser Token
227parseProperty = do
228  _ <- char '#'
229  obj <- parseCIdent
230  _ <- char ':'
231  property <- signalOrPropName
232  return (SymbolRef (PropertyRef obj property))
233
234-- | Parse an xml comment, of the form
235-- > <!-- comment -->
236-- Note that this function keeps spaces.
237--
238-- === __Examples__
239-- >>> parseOnly (parseComment <* endOfInput) "<!-- comment -->"
240-- Right (Comment " comment ")
241parseComment :: Parser Token
242parseComment = do
243  comment <- string "<!--" *> manyTill anyChar (string "-->")
244  return (Comment $ T.pack comment)
245
246-- | Parse a reference to a virtual method, of the form
247-- > #Struct.method()
248--
249-- === __Examples__
250-- >>> parseOnly (parseVMethod <* endOfInput) "#Foo.bar()"
251-- Right (SymbolRef (VMethodRef "Foo" "bar"))
252parseVMethod :: Parser Token
253parseVMethod = do
254  _ <- char '#'
255  obj <- parseCIdent
256  _ <- char '.'
257  method <- parseCIdent
258  _ <- string "()"
259  return (SymbolRef (VMethodRef obj method))
260
261-- | Parse a reference to a struct field, of the form
262-- > #Struct.field
263--
264-- === __Examples__
265-- >>> parseOnly (parseStructField <* endOfInput) "#Foo.bar"
266-- Right (SymbolRef (StructFieldRef "Foo" "bar"))
267parseStructField :: Parser Token
268parseStructField = do
269  _ <- char '#'
270  obj <- parseCIdent
271  _ <- char '.'
272  field <- parseCIdent
273  return (SymbolRef (StructFieldRef obj field))
274
275-- | Parse a reference to a C type, of the form
276-- > #Type
277--
278-- === __Examples__
279-- >>> parseOnly (parseType <* endOfInput) "#Foo"
280-- Right (SymbolRef (TypeRef "Foo"))
281parseType :: Parser Token
282parseType = do
283  _ <- char '#'
284  obj <- parseCIdent
285  return (SymbolRef (TypeRef obj))
286
287-- | Parse a constant, of the form
288-- > %CONSTANT_NAME
289--
290-- === __Examples__
291-- >>> parseOnly (parseConstant <* endOfInput) "%TEST_CONSTANT"
292-- Right (SymbolRef (ConstantRef "TEST_CONSTANT"))
293parseConstant :: Parser Token
294parseConstant = do
295  _ <- char '%'
296  c <- parseCIdent
297  return (SymbolRef (ConstantRef c))
298
299-- | Parse a reference to a parameter, of the form
300-- > @param_name
301--
302-- === __Examples__
303-- >>> parseOnly (parseParam <* endOfInput) "@test_param"
304-- Right (SymbolRef (ParamRef "test_param"))
305parseParam :: Parser Token
306parseParam = do
307  _ <- char '@'
308  param <- parseCIdent
309  return (SymbolRef (ParamRef param))
310
311-- | Whether the given character is valid in a C identifier.
312isCIdent :: Char -> Bool
313isCIdent '_' = True
314isCIdent c   = isDigit c || isAsciiUpper c || isAsciiLower c
315
316-- | Name of a signal or property name. Similar to a C identifier, but
317-- hyphens are allowed too.
318signalOrPropName :: Parser Text
319signalOrPropName = takeWhile1 isSignalOrPropIdent
320  where isSignalOrPropIdent :: Char -> Bool
321        isSignalOrPropIdent '-' = True
322        isSignalOrPropIdent c = isCIdent c
323
324-- | Something that could be a valid C identifier (loosely speaking,
325-- we do not need to be too strict here).
326parseCIdent :: Parser Text
327parseCIdent = takeWhile1 isCIdent
328
329-- | Parse a function ref, given by a valid C identifier followed by
330-- '()', for instance 'gtk_widget_show()'. If the identifier is not
331-- followed by "()", return it as a literal instead.
332--
333-- === __Examples__
334-- >>> parseOnly (parseFunctionRef <* endOfInput) "test_func()"
335-- Right (SymbolRef (FunctionRef "test_func"))
336--
337-- >>> parseOnly (parseFunctionRef <* endOfInput) "not_a_func"
338-- Right (Literal "not_a_func")
339parseFunctionRef :: Parser Token
340parseFunctionRef = do
341  ident <- parseCIdent
342  option (Literal ident) (string "()" >>
343                          return (SymbolRef (FunctionRef ident)))
344
345-- | Parse a escaped special character, i.e. one preceded by '\'.
346parseEscaped :: Parser Token
347parseEscaped = do
348  _ <- char '\\'
349  c <- satisfy (`elem` ("#@%\\`" :: [Char]))
350  return $ Literal (T.singleton c)
351
352-- | Parse a literal, i.e. anything without a known special
353-- meaning. Note that this parser always consumes the first character,
354-- regardless of what it is.
355parseBoringLiteral :: Parser Token
356parseBoringLiteral = do
357  c <- anyChar
358  boring <- takeWhile (not . special)
359  return $ Literal (T.cons c boring)
360
361-- | List of special characters from the point of view of the parser
362-- (in the sense that they may be the beginning of something with a
363-- special interpretation).
364special :: Char -> Bool
365special '#' = True
366special '@' = True
367special '%' = True
368special '\\' = True
369special '`' = True
370special '|' = True
371special '[' = True
372special '!' = True
373special '\n' = True
374special ':' = True
375special c = isCIdent c
376
377-- | Parse a verbatim string, of the form
378-- > `verbatim text`
379--
380-- === __Examples__
381-- >>> parseOnly (parseVerbatim <* endOfInput) "`Example quote!`"
382-- Right (Verbatim "Example quote!")
383parseVerbatim :: Parser Token
384parseVerbatim = do
385  _ <- char '`'
386  v <- takeWhile1 (/= '`')
387  _ <- char '`'
388  return $ Verbatim v
389
390-- | Parse a URL in Markdown syntax, of the form
391-- > [name](url)
392--
393-- === __Examples__
394-- >>> parseOnly (parseUrl <* endOfInput) "[haskell](http://haskell.org)"
395-- Right (ExternalLink (Link {linkName = "haskell", linkAddress = "http://haskell.org"}))
396parseUrl :: Parser Token
397parseUrl = do
398  _ <- char '['
399  name <- takeWhile1 (/= ']')
400  _ <- string "]("
401  address <- takeWhile1 (/= ')')
402  _ <- char ')'
403  return $ ExternalLink $ Link {linkName = name, linkAddress = address}
404
405-- | Parse an image reference, of the form
406-- > ![label](url)
407--
408-- === __Examples__
409-- >>> parseOnly (parseImage <* endOfInput) "![](diagram.png)"
410-- Right (Image (Link {linkName = "", linkAddress = "diagram.png"}))
411parseImage :: Parser Token
412parseImage = do
413  _ <- string "!["
414  name <- takeWhile (/= ']')
415  _ <- string "]("
416  address <- takeWhile1 (/= ')')
417  _ <- char ')'
418  return $ Image $ Link {linkName = name, linkAddress = address}
419
420-- | Parse a code block embedded in the documentation.
421parseCodeBlock :: Parser Token
422parseCodeBlock = do
423  _ <- string "|["
424  lang <- (Just <$> parseLanguage) <|> return Nothing
425  code <- T.pack <$> manyTill anyChar (string "]|")
426  return $ CodeBlock lang code
427
428-- | Parse the language of a code block, specified as a comment.
429parseLanguage :: Parser Language
430parseLanguage = do
431  _ <- string "<!--"
432  skipSpace
433  _ <- string "language=\""
434  lang <- takeWhile1 (/= '"')
435  _ <- char '"'
436  skipSpace
437  _ <- string "-->"
438  return $ Language lang
439
440-- | Parse a section header, given by a number of hash symbols, and
441-- then ordinary text. Note that this parser "eats" the newline before
442-- and after the section header.
443parseSectionHeader :: Parser Token
444parseSectionHeader = char '\n' >> parseInitialSectionHeader
445
446-- | Parse a section header at the beginning of the text. I.e. this is
447-- the same as `parseSectionHeader`, but we do not expect a newline as
448-- a first character.
449--
450-- === __Examples__
451-- >>> parseOnly (parseInitialSectionHeader <* endOfInput) "### Hello! ###\n"
452-- Right (SectionHeader 3 (GtkDoc [Literal "Hello! "]))
453--
454-- >>> parseOnly (parseInitialSectionHeader <* endOfInput) "# Hello!\n"
455-- Right (SectionHeader 1 (GtkDoc [Literal "Hello!"]))
456parseInitialSectionHeader :: Parser Token
457parseInitialSectionHeader = do
458  hashes <- takeWhile1 (== '#')
459  _ <- many1 space
460  heading <- takeWhile1 (notInClass "#\n")
461  _ <- (string hashes >> char '\n') <|> (char '\n')
462  return $ SectionHeader (T.length hashes) (parseGtkDoc heading)
463
464-- | Parse a list header. Note that the newline before the start of
465-- the list is "eaten" by this parser, but is restored later by
466-- `parseGtkDoc`.
467--
468-- === __Examples__
469-- >>> parseOnly (parseList <* endOfInput) "\n- First item\n- Second item"
470-- Right (List [ListItem (GtkDoc [Literal "First item"]) [],ListItem (GtkDoc [Literal "Second item"]) []])
471--
472-- >>> parseOnly (parseList <* endOfInput) "\n\n- Two line\n  item\n\n- Second item,\n  also two lines"
473-- Right (List [ListItem (GtkDoc [Literal "Two line"]) [GtkDoc [Literal "item"]],ListItem (GtkDoc [Literal "Second item,"]) [GtkDoc [Literal "also two lines"]]])
474parseList :: Parser Token
475parseList = do
476  items <- many1 parseListItem
477  return $ List items
478  where parseListItem :: Parser ListItem
479        parseListItem = do
480          _ <- char '\n'
481          _ <- string "\n- " <|> string "- "
482          first <- takeWhile1 (/= '\n')
483          rest <- many' parseLine
484          return $ ListItem (parseGtkDoc first) (map parseGtkDoc rest)
485
486        parseLine :: Parser Text
487        parseLine = string "\n  " >> takeWhile1 (/= '\n')
488