1{-# LANGUAGE OverloadedStrings #-}
2{- |
3   Module      : Text.Pandoc.Readers.MediaWiki
4   Copyright   : Copyright (C) 2012-2021 John MacFarlane
5   License     : GNU GPL, version 2 or above
6
7   Maintainer  : John MacFarlane <jgm@berkeley.edu>
8   Stability   : alpha
9   Portability : portable
10
11Conversion of mediawiki text to 'Pandoc' document.
12-}
13{-
14TODO:
15_ correctly handle tables within tables
16_ parse templates?
17-}
18module Text.Pandoc.Readers.MediaWiki ( readMediaWiki ) where
19
20import Control.Monad
21import Control.Monad.Except (throwError)
22import Data.Char (isDigit, isSpace)
23import qualified Data.Foldable as F
24import Data.List (intersperse)
25import Data.Maybe (fromMaybe, maybeToList)
26import Data.Sequence (ViewL (..), viewl, (<|))
27import qualified Data.Set as Set
28import Data.Text (Text)
29import qualified Data.Text as T
30import Text.HTML.TagSoup
31import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
32import qualified Text.Pandoc.Builder as B
33import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
34import Text.Pandoc.Definition
35import Text.Pandoc.Logging
36import Text.Pandoc.Options
37import Text.Pandoc.Parsing hiding (nested)
38import Text.Pandoc.Readers.HTML (htmlTag, isBlockTag, isCommentTag)
39import Text.Pandoc.Shared (crFilter, safeRead, stringify, stripTrailingNewlines,
40                           trim, splitTextBy, tshow)
41import Text.Pandoc.Walk (walk)
42import Text.Pandoc.XML (fromEntities)
43
44-- | Read mediawiki from an input string and return a Pandoc document.
45readMediaWiki :: PandocMonad m
46              => ReaderOptions -- ^ Reader options
47              -> Text          -- ^ String to parse (assuming @'\n'@ line endings)
48              -> m Pandoc
49readMediaWiki opts s = do
50  parsed <- readWithM parseMediaWiki MWState{ mwOptions = opts
51                                            , mwMaxNestingLevel = 4
52                                            , mwNextLinkNumber  = 1
53                                            , mwCategoryLinks = []
54                                            , mwIdentifierList = Set.empty
55                                            , mwLogMessages = []
56                                            , mwInTT = False
57                                            }
58            (crFilter s <> "\n")
59  case parsed of
60    Right result -> return result
61    Left e       -> throwError e
62
63data MWState = MWState { mwOptions         :: ReaderOptions
64                       , mwMaxNestingLevel :: Int
65                       , mwNextLinkNumber  :: Int
66                       , mwCategoryLinks   :: [Inlines]
67                       , mwIdentifierList  :: Set.Set Text
68                       , mwLogMessages     :: [LogMessage]
69                       , mwInTT            :: Bool
70                       }
71
72type MWParser m = ParserT Text MWState m
73
74instance HasReaderOptions MWState where
75  extractReaderOptions = mwOptions
76
77instance HasIdentifierList MWState where
78  extractIdentifierList     = mwIdentifierList
79  updateIdentifierList f st = st{ mwIdentifierList = f $ mwIdentifierList st }
80
81instance HasLogMessages MWState where
82  addLogMessage m s = s{ mwLogMessages = m : mwLogMessages s }
83  getLogMessages = reverse . mwLogMessages
84
85--
86-- auxiliary functions
87--
88
89-- This is used to prevent exponential blowups for things like:
90-- ''a'''a''a'''a''a'''a''a'''a
91nested :: PandocMonad m => MWParser m a -> MWParser m a
92nested p = do
93  nestlevel <- mwMaxNestingLevel `fmap` getState
94  guard $ nestlevel > 0
95  updateState $ \st -> st{ mwMaxNestingLevel = mwMaxNestingLevel st - 1 }
96  res <- p
97  updateState $ \st -> st{ mwMaxNestingLevel = nestlevel }
98  return res
99
100specialChars :: [Char]
101specialChars = "'[]<=&*{}|\":\\"
102
103spaceChars :: [Char]
104spaceChars = " \n\t"
105
106sym :: PandocMonad m => Text -> MWParser m ()
107sym s = () <$ try (string $ T.unpack s)
108
109newBlockTags :: [Text]
110newBlockTags = ["haskell","syntaxhighlight","source","gallery","references"]
111
112isBlockTag' :: Tag Text -> Bool
113isBlockTag' tag@(TagOpen t _) = (isBlockTag tag || t `elem` newBlockTags) &&
114  t `notElem` eitherBlockOrInline
115isBlockTag' tag@(TagClose t) = (isBlockTag tag || t `elem` newBlockTags) &&
116  t `notElem` eitherBlockOrInline
117isBlockTag' tag = isBlockTag tag
118
119isInlineTag' :: Tag Text -> Bool
120isInlineTag' (TagComment _) = True
121isInlineTag' t              = not (isBlockTag' t)
122
123eitherBlockOrInline :: [Text]
124eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
125                               "map", "area", "object"]
126
127htmlComment :: PandocMonad m => MWParser m ()
128htmlComment = () <$ htmlTag isCommentTag
129
130inlinesInTags :: PandocMonad m => Text -> MWParser m Inlines
131inlinesInTags tag = try $ do
132  (_,raw) <- htmlTag (~== TagOpen tag [])
133  if T.any (== '/') raw   -- self-closing tag
134     then return mempty
135     else trimInlines . mconcat <$>
136            manyTill inline (htmlTag (~== TagClose tag))
137
138blocksInTags :: PandocMonad m => Text -> MWParser m Blocks
139blocksInTags tag = try $ do
140  (_,raw) <- htmlTag (~== TagOpen tag [])
141  let closer = if tag == "li"
142                  then htmlTag (~== TagClose ("li" :: Text))
143                     <|> lookAhead (
144                              htmlTag (~== TagOpen ("li" :: Text) [])
145                          <|> htmlTag (~== TagClose ("ol" :: Text))
146                          <|> htmlTag (~== TagClose ("ul" :: Text)))
147                  else htmlTag (~== TagClose tag)
148  if T.any (== '/') raw   -- self-closing tag
149     then return mempty
150     else mconcat <$> manyTill block closer
151
152textInTags :: PandocMonad m => Text -> MWParser m Text
153textInTags tag = try $ do
154  (_,raw) <- htmlTag (~== TagOpen tag [])
155  if T.any (== '/') raw   -- self-closing tag
156     then return ""
157     else T.pack <$> manyTill anyChar (htmlTag (~== TagClose tag))
158
159--
160-- main parser
161--
162
163parseMediaWiki :: PandocMonad m => MWParser m Pandoc
164parseMediaWiki = do
165  bs <- mconcat <$> many block
166  spaces
167  eof
168  categoryLinks <- reverse . mwCategoryLinks <$> getState
169  let categories = if null categoryLinks
170                      then mempty
171                      else B.para $ mconcat $ intersperse B.space categoryLinks
172  reportLogMessages
173  return $ B.doc $ bs <> categories
174
175--
176-- block parsers
177--
178
179block :: PandocMonad m => MWParser m Blocks
180block = do
181  res <- mempty <$ skipMany1 blankline
182     <|> table
183     <|> header
184     <|> hrule
185     <|> orderedList
186     <|> bulletList
187     <|> definitionList
188     <|> mempty <$ try (spaces *> htmlComment)
189     <|> preformatted
190     <|> blockTag
191     <|> (B.rawBlock "mediawiki" <$> template)
192     <|> para
193  trace (T.take 60 $ tshow $ B.toList res)
194  return res
195
196para :: PandocMonad m => MWParser m Blocks
197para = do
198  contents <- trimInlines . mconcat <$> many1 inline
199  if F.all (==Space) contents
200     then return mempty
201     else return $ B.para contents
202
203table :: PandocMonad m => MWParser m Blocks
204table = do
205  tableStart
206  styles <- option [] $
207               parseAttrs <* skipMany spaceChar <* optional (char '|')
208  skipMany spaceChar
209  optional $ template >> skipMany spaceChar
210  optional blanklines
211  let tableWidth = case lookup "width" styles of
212                         Just w  -> fromMaybe 1.0 $ parseWidth w
213                         Nothing -> 1.0
214  caption <- option mempty tableCaption
215  optional rowsep
216  hasheader <- option False $ True <$ lookAhead (skipSpaces *> char '!')
217  (cellspecs',hdr) <- unzip <$> tableRow
218  let widths = map ((tableWidth *) . snd) cellspecs'
219  let restwidth = tableWidth - sum widths
220  let zerocols = length $ filter (==0.0) widths
221  let defaultwidth = if zerocols == 0 || zerocols == length widths
222                        then ColWidthDefault
223                        else ColWidth $ restwidth / fromIntegral zerocols
224  let widths' = map (\w -> if w > 0 then ColWidth w else defaultwidth) widths
225  let cellspecs = zip (map fst cellspecs') widths'
226  rows' <- many $ try $ rowsep *> (map snd <$> tableRow)
227  optional blanklines
228  tableEnd
229  let cols = length hdr
230  let (headers,rows) = if hasheader
231                          then (hdr, rows')
232                          else (replicate cols mempty, hdr:rows')
233  let toRow = Row nullAttr . map B.simpleCell
234      toHeaderRow l = [toRow l | not (null l)]
235  return $ B.table (B.simpleCaption $ B.plain caption)
236                   cellspecs
237                   (TableHead nullAttr $ toHeaderRow headers)
238                   [TableBody nullAttr 0 [] $ map toRow rows]
239                   (TableFoot nullAttr [])
240
241parseAttrs :: PandocMonad m => MWParser m [(Text,Text)]
242parseAttrs = many1 parseAttr
243
244parseAttr :: PandocMonad m => MWParser m (Text, Text)
245parseAttr = try $ do
246  skipMany spaceChar
247  k <- many1Char letter
248  char '='
249  v <- (char '"' >> many1TillChar (satisfy (/='\n')) (char '"'))
250       <|> many1Char (satisfy $ \c -> not (isSpace c) && c /= '|')
251  return (k,v)
252
253tableStart :: PandocMonad m => MWParser m ()
254tableStart = try $ guardColumnOne *> skipSpaces *> sym "{|"
255
256tableEnd :: PandocMonad m => MWParser m ()
257tableEnd = try $ guardColumnOne *> skipSpaces *> sym "|}"
258
259rowsep :: PandocMonad m => MWParser m ()
260rowsep = try $ guardColumnOne *> skipSpaces *> sym "|-" <*
261               many (char '-') <* optional parseAttrs <* blanklines
262
263cellsep :: PandocMonad m => MWParser m ()
264cellsep = try $ do
265  col <- sourceColumn <$> getPosition
266  skipSpaces
267  let pipeSep = do
268        char '|'
269        notFollowedBy (oneOf "-}+")
270        if col == 1
271           then optional (char '|')
272           else void (char '|')
273  let exclSep = do
274        char '!'
275        if col == 1
276           then optional (char '!')
277           else void (char '!')
278  pipeSep <|> exclSep
279
280tableCaption :: PandocMonad m => MWParser m Inlines
281tableCaption = try $ do
282  guardColumnOne
283  skipSpaces
284  sym "|+"
285  optional (try $ parseAttrs *> skipSpaces *> char '|' *> blanklines)
286  trimInlines . mconcat <$>
287    many (notFollowedBy (cellsep <|> rowsep) *> inline)
288
289tableRow :: PandocMonad m => MWParser m [((Alignment, Double), Blocks)]
290tableRow = try $ skipMany htmlComment *> many tableCell
291
292tableCell :: PandocMonad m => MWParser m ((Alignment, Double), Blocks)
293tableCell = try $ do
294  cellsep
295  skipMany spaceChar
296  attrs <- option [] $ try $ parseAttrs <* skipSpaces <* char '|' <*
297                                 notFollowedBy (char '|')
298  skipMany spaceChar
299  pos' <- getPosition
300  ls <- T.concat <$> many (notFollowedBy (cellsep <|> rowsep <|> tableEnd) *>
301                            ((snd <$> withRaw table) <|> countChar 1 anyChar))
302  bs <- parseFromString (do setPosition pos'
303                            mconcat <$> many block) ls
304  let align = case lookup "align" attrs of
305                    Just "left"   -> AlignLeft
306                    Just "right"  -> AlignRight
307                    Just "center" -> AlignCenter
308                    _             -> AlignDefault
309  let width = case lookup "width" attrs of
310                    Just xs -> fromMaybe 0.0 $ parseWidth xs
311                    Nothing -> 0.0
312  return ((align, width), bs)
313
314parseWidth :: Text -> Maybe Double
315parseWidth s =
316  case T.unsnoc s of
317    Just (ds, '%') | T.all isDigit ds -> safeRead $ "0." <> ds
318    _ -> Nothing
319
320template :: PandocMonad m => MWParser m Text
321template = try $ do
322  string "{{"
323  notFollowedBy (char '{')
324  lookAhead $ letter <|> digit <|> char ':'
325  let chunk = template <|> variable <|> many1Char (noneOf "{}") <|> countChar 1 anyChar
326  contents <- manyTill chunk (try $ string "}}")
327  return $ "{{" <> T.concat contents <> "}}"
328
329blockTag :: PandocMonad m => MWParser m Blocks
330blockTag = do
331  (tag, _) <- lookAhead $ htmlTag isBlockTag'
332  case tag of
333      TagOpen "blockquote" _ -> B.blockQuote <$> blocksInTags "blockquote"
334      TagOpen "pre" _ -> B.codeBlock . trimCode <$> textInTags "pre"
335      TagOpen "syntaxhighlight" attrs -> syntaxhighlight "syntaxhighlight" attrs
336      TagOpen "source" attrs -> syntaxhighlight "source" attrs
337      TagOpen "haskell" _ -> B.codeBlockWith ("",["haskell"],[]) . trimCode <$>
338                                textInTags "haskell"
339      TagOpen "gallery" _ -> blocksInTags "gallery"
340      TagOpen "p" _ -> mempty <$ htmlTag (~== tag)
341      TagClose "p"  -> mempty <$ htmlTag (~== tag)
342      _ -> B.rawBlock "html" . snd <$> htmlTag (~== tag)
343
344trimCode :: Text -> Text
345trimCode t = case T.uncons t of
346  Just ('\n', xs) -> stripTrailingNewlines xs
347  _               -> stripTrailingNewlines t
348
349syntaxhighlight :: PandocMonad m => Text -> [Attribute Text] -> MWParser m Blocks
350syntaxhighlight tag attrs = try $ do
351  let mblang = lookup "lang" attrs
352  let mbstart = lookup "start" attrs
353  let mbline = lookup "line" attrs
354  let classes = maybeToList mblang ++ maybe [] (const ["numberLines"]) mbline
355  let kvs = maybe [] (\x -> [("startFrom",x)]) mbstart
356  contents <- textInTags tag
357  return $ B.codeBlockWith ("",classes,kvs) $ trimCode contents
358
359hrule :: PandocMonad m => MWParser m Blocks
360hrule = B.horizontalRule <$ try (string "----" *> many (char '-') *> newline)
361
362guardColumnOne :: PandocMonad m => MWParser m ()
363guardColumnOne = getPosition >>= \pos -> guard (sourceColumn pos == 1)
364
365preformatted :: PandocMonad m => MWParser m Blocks
366preformatted = try $ do
367  guardColumnOne
368  char ' '
369  let endline' = B.linebreak <$ try (newline <* char ' ')
370  let whitespace' = B.str <$> many1Char ('\160' <$ spaceChar)
371  let spToNbsp ' ' = '\160'
372      spToNbsp x   = x
373  let nowiki' = mconcat . intersperse B.linebreak . map B.str .
374                T.lines . fromEntities . T.map spToNbsp <$> try
375                  (htmlTag (~== TagOpen ("nowiki" :: Text) []) *>
376                   manyTillChar anyChar (htmlTag (~== TagClose ("nowiki" :: Text))))
377  let inline' = whitespace' <|> endline' <|> nowiki'
378                  <|> try (notFollowedBy newline *> inline)
379  contents <- mconcat <$> many1 inline'
380  let spacesStr (Str xs) = T.all isSpace xs
381      spacesStr _        = False
382  if F.all spacesStr contents
383     then return mempty
384     else return $ B.para $ encode contents
385
386encode :: Inlines -> Inlines
387encode = B.fromList . normalizeCode . B.toList . walk strToCode
388  where strToCode (Str s) = Code ("",[],[]) s
389        strToCode Space   = Code ("",[],[]) " "
390        strToCode  x      = x
391        normalizeCode []  = []
392        normalizeCode (Code a1 x : Code a2 y : zs) | a1 == a2 =
393          normalizeCode $ Code a1 (x <> y) : zs
394        normalizeCode (x:xs) = x : normalizeCode xs
395
396header :: PandocMonad m => MWParser m Blocks
397header = try $ do
398  guardColumnOne
399  lev <- length <$> many1 (char '=')
400  guard $ lev <= 6
401  contents <- trimInlines . mconcat <$> manyTill inline (count lev $ char '=')
402  opts <- mwOptions <$> getState
403  attr <- (if isEnabled Ext_gfm_auto_identifiers opts
404              then id
405              else modifyIdentifier) <$> registerHeader nullAttr contents
406  return $ B.headerWith attr lev contents
407
408-- See #4731:
409modifyIdentifier :: Attr -> Attr
410modifyIdentifier (ident,cl,kv) = (ident',cl,kv)
411  where ident' = T.map (\c -> if c == '-' then '_' else c) ident
412
413bulletList :: PandocMonad m => MWParser m Blocks
414bulletList = B.bulletList <$>
415   (   many1 (listItem '*')
416   <|> (htmlTag (~== TagOpen ("ul" :: Text) []) *> spaces *> many (listItem '*' <|> li) <*
417        optional (htmlTag (~== TagClose ("ul" :: Text)))) )
418
419orderedList :: PandocMonad m => MWParser m Blocks
420orderedList =
421       (B.orderedList <$> many1 (listItem '#'))
422   <|> try
423       (do (tag,_) <- htmlTag (~== TagOpen ("ol" :: Text) [])
424           spaces
425           items <- many (listItem '#' <|> li)
426           optional (htmlTag (~== TagClose ("ol" :: Text)))
427           let start = fromMaybe 1 $ safeRead $ fromAttrib "start" tag
428           return $ B.orderedListWith (start, DefaultStyle, DefaultDelim) items)
429
430definitionList :: PandocMonad m => MWParser m Blocks
431definitionList = B.definitionList <$> many1 defListItem
432
433defListItem :: PandocMonad m => MWParser m (Inlines, [Blocks])
434defListItem = try $ do
435  terms <- mconcat . intersperse B.linebreak <$> many defListTerm
436  -- we allow dd with no dt, or dt with no dd
437  defs  <- if null terms
438              then notFollowedBy
439                    (try $ skipMany1 (char ':') >> string "<math>") *>
440                       many1 (listItem ':')
441              else many (listItem ':')
442  return (terms, defs)
443
444defListTerm  :: PandocMonad m => MWParser m Inlines
445defListTerm = do
446  guardColumnOne
447  char ';'
448  skipMany spaceChar
449  pos' <- getPosition
450  anyLine >>= parseFromString (do setPosition pos'
451                                  trimInlines . mconcat <$> many inline)
452
453listStart :: PandocMonad m => Char -> MWParser m ()
454listStart c = char c *> notFollowedBy listStartChar
455
456listStartChar :: PandocMonad m => MWParser m Char
457listStartChar = oneOf "*#;:"
458
459anyListStart :: PandocMonad m => MWParser m Char
460anyListStart = guardColumnOne >> oneOf "*#:;"
461
462li :: PandocMonad m => MWParser m Blocks
463li = lookAhead (htmlTag (~== TagOpen ("li" :: Text) [])) *>
464     (firstParaToPlain <$> blocksInTags "li") <* spaces
465
466listItem :: PandocMonad m => Char -> MWParser m Blocks
467listItem c = try $ do
468  guardColumnOne
469  extras <- many (try $ char c <* lookAhead listStartChar)
470  if null extras
471     then listItem' c
472     else do
473       skipMany spaceChar
474       pos' <- getPosition
475       first <- T.concat <$> manyTill listChunk newline
476       rest <- many
477                (try $ string extras *> lookAhead listStartChar *>
478                       (T.concat <$> manyTill listChunk newline))
479       contents <- parseFromString (do setPosition pos'
480                                       many1 $ listItem' c)
481                          (T.unlines (first : rest))
482       case c of
483           '*' -> return $ B.bulletList contents
484           '#' -> return $ B.orderedList contents
485           ':' -> return $ B.definitionList [(mempty, contents)]
486           _   -> mzero
487
488-- The point of this is to handle stuff like
489-- * {{cite book
490-- | blah
491-- | blah
492-- }}
493-- * next list item
494-- which seems to be valid mediawiki.
495listChunk :: PandocMonad m => MWParser m Text
496listChunk = template <|> countChar 1 anyChar
497
498listItem' :: PandocMonad m => Char -> MWParser m Blocks
499listItem' c = try $ do
500  listStart c
501  skipMany spaceChar
502  pos' <- getPosition
503  first <- T.concat <$> manyTill listChunk newline
504  rest <- many (try $ char c *> lookAhead listStartChar *>
505                   (T.concat <$> manyTill listChunk newline))
506  parseFromString (do setPosition pos'
507                      firstParaToPlain . mconcat <$> many1 block)
508      $ T.unlines $ first : rest
509
510firstParaToPlain :: Blocks -> Blocks
511firstParaToPlain contents =
512  case viewl (B.unMany contents) of
513       Para xs :< ys -> B.Many $ Plain xs <| ys
514       _             -> contents
515
516--
517-- inline parsers
518--
519
520inline :: PandocMonad m => MWParser m Inlines
521inline =  whitespace
522      <|> url
523      <|> str
524      <|> doubleQuotes
525      <|> strong
526      <|> emph
527      <|> image
528      <|> internalLink
529      <|> externalLink
530      <|> math
531      <|> inlineTag
532      <|> B.singleton <$> charRef
533      <|> inlineHtml
534      <|> (B.rawInline "mediawiki" <$> variable)
535      <|> (B.rawInline "mediawiki" <$> template)
536      <|> special
537
538str :: PandocMonad m => MWParser m Inlines
539str = B.str <$> many1Char (noneOf $ specialChars ++ spaceChars)
540
541math :: PandocMonad m => MWParser m Inlines
542math = (B.displayMath . trim <$> try (many1 (char ':') >> textInTags "math"))
543   <|> (B.math . trim <$> textInTags "math")
544   <|> (B.displayMath . trim <$> try (dmStart *> manyTillChar anyChar dmEnd))
545   <|> (B.math . trim <$> try (mStart *> manyTillChar (satisfy (/='\n')) mEnd))
546 where dmStart = string "\\["
547       dmEnd   = try (string "\\]")
548       mStart  = string "\\("
549       mEnd    = try (string "\\)")
550
551variable :: PandocMonad m => MWParser m Text
552variable = try $ do
553  string "{{{"
554  contents <- manyTillChar anyChar (try $ string "}}}")
555  return $ "{{{" <> contents <> "}}}"
556
557inlineTag :: PandocMonad m => MWParser m Inlines
558inlineTag = do
559  (tag, _) <- lookAhead $ htmlTag isInlineTag'
560  case tag of
561       TagOpen "ref" _ -> B.note . B.plain <$> inlinesInTags "ref"
562       TagOpen "nowiki" _ -> try $ do
563          (_,raw) <- htmlTag (~== tag)
564          if T.any (== '/') raw
565             then return mempty
566             else B.text . fromEntities <$>
567                       manyTillChar anyChar (htmlTag (~== TagClose ("nowiki" :: Text)))
568       TagOpen "br" _ -> B.linebreak <$ (htmlTag (~== TagOpen ("br" :: Text) []) -- will get /> too
569                            *> optional blankline)
570       TagOpen "strike" _ -> B.strikeout <$> inlinesInTags "strike"
571       TagOpen "del" _ -> B.strikeout <$> inlinesInTags "del"
572       TagOpen "sub" _ -> B.subscript <$> inlinesInTags "sub"
573       TagOpen "sup" _ -> B.superscript <$> inlinesInTags "sup"
574       TagOpen "code" _ -> encode <$> inlinesInTags "code"
575       TagOpen "tt" _ -> do
576         inTT <- mwInTT <$> getState
577         updateState $ \st -> st{ mwInTT = True }
578         result <- encode <$> inlinesInTags "tt"
579         updateState $ \st -> st{ mwInTT = inTT }
580         return result
581       TagOpen "hask" _ -> B.codeWith ("",["haskell"],[]) <$> textInTags "hask"
582       _ -> B.rawInline "html" . snd <$> htmlTag (~== tag)
583
584special :: PandocMonad m => MWParser m Inlines
585special = B.str <$> countChar 1 (notFollowedBy' (htmlTag isBlockTag') *>
586                                  oneOf specialChars)
587
588inlineHtml :: PandocMonad m => MWParser m Inlines
589inlineHtml = B.rawInline "html" . snd <$> htmlTag isInlineTag'
590
591whitespace :: PandocMonad m => MWParser m Inlines
592whitespace = B.space <$ (skipMany1 spaceChar <|> htmlComment)
593         <|> B.softbreak <$ endline
594
595endline :: PandocMonad m => MWParser m ()
596endline = () <$ try (newline <*
597                     notFollowedBy spaceChar <*
598                     notFollowedBy newline <*
599                     notFollowedBy' hrule <*
600                     notFollowedBy tableStart <*
601                     notFollowedBy' header <*
602                     notFollowedBy anyListStart)
603
604imageIdentifiers :: PandocMonad m => [MWParser m ()]
605imageIdentifiers = [sym (identifier <> ":") | identifier <- identifiers]
606    where identifiers = ["File", "Image", "Archivo", "Datei", "Fichier",
607                         "Bild"]
608
609image :: PandocMonad m => MWParser m Inlines
610image = try $ do
611  sym "[["
612  choice imageIdentifiers
613  fname <- addUnderscores <$> many1Char (noneOf "|]")
614  _ <- many imageOption
615  dims <- try (char '|' *> sepBy (manyChar digit) (char 'x') <* string "px")
616          <|> return []
617  _ <- many imageOption
618  let kvs = case dims of
619              [w]    -> [("width", w)]
620              [w, h] -> [("width", w), ("height", h)]
621              _      -> []
622  let attr = ("", [], kvs)
623  caption <-   (B.str fname <$ sym "]]")
624           <|> try (char '|' *> (mconcat <$> manyTill inline (sym "]]")))
625  return $ B.imageWith attr fname ("fig:" <> stringify caption) caption
626
627imageOption :: PandocMonad m => MWParser m Text
628imageOption = try $ char '|' *> opt
629  where
630    opt = try (oneOfStrings [ "border", "thumbnail", "frameless"
631                            , "thumb", "upright", "left", "right"
632                            , "center", "none", "baseline", "sub"
633                            , "super", "top", "text-top", "middle"
634                            , "bottom", "text-bottom" ])
635      <|> try (textStr "frame")
636      <|> try (oneOfStrings ["link=","alt=","page=","class="] <* many (noneOf "|]"))
637
638addUnderscores :: Text -> Text
639addUnderscores = T.intercalate "_" . splitTextBy sep
640  where
641    sep c = isSpace c || c == '_'
642
643internalLink :: PandocMonad m => MWParser m Inlines
644internalLink = try $ do
645  sym "[["
646  pagename <- T.unwords . T.words <$> manyChar (noneOf "|]")
647  label <- option (B.text pagename) $ char '|' *>
648             (  (mconcat <$> many1 (notFollowedBy (char ']') *> inline))
649             -- the "pipe trick"
650             -- [[Help:Contents|] -> "Contents"
651             <|> return (B.text $ T.drop 1 $ T.dropWhile (/=':') pagename) )
652  sym "]]"
653  linktrail <- B.text <$> manyChar letter
654  let link = B.link (addUnderscores pagename) "wikilink" (label <> linktrail)
655  if "Category:" `T.isPrefixOf` pagename
656     then do
657       updateState $ \st -> st{ mwCategoryLinks = link : mwCategoryLinks st }
658       return mempty
659     else return link
660
661externalLink :: PandocMonad m => MWParser m Inlines
662externalLink = try $ do
663  char '['
664  (_, src) <- uri
665  lab <- try (trimInlines . mconcat <$>
666              (skipMany1 spaceChar *> manyTill inline (char ']')))
667       <|> do char ']'
668              num <- mwNextLinkNumber <$> getState
669              updateState $ \st -> st{ mwNextLinkNumber = num + 1 }
670              return $ B.str $ tshow num
671  return $ B.link src "" lab
672
673url :: PandocMonad m => MWParser m Inlines
674url = do
675  (orig, src) <- uri
676  return $ B.link src "" (B.str orig)
677
678-- | Parses a list of inlines between start and end delimiters.
679inlinesBetween :: (PandocMonad m, Show b) => MWParser m a -> MWParser m b -> MWParser m Inlines
680inlinesBetween start end =
681  trimInlines . mconcat <$> try (start >> many1Till inline end)
682
683emph :: PandocMonad m => MWParser m Inlines
684emph = B.emph <$> nested (inlinesBetween start end)
685    where start = sym "''"
686          end   = try $ notFollowedBy' (() <$ strong) >> sym "''"
687
688strong :: PandocMonad m => MWParser m Inlines
689strong = B.strong <$> nested (inlinesBetween start end)
690    where start = sym "'''"
691          end   = sym "'''"
692
693doubleQuotes :: PandocMonad m => MWParser m Inlines
694doubleQuotes = do
695  guardEnabled Ext_smart
696  inTT <- mwInTT <$> getState
697  guard (not inTT)
698  B.doubleQuoted <$> nested (inlinesBetween openDoubleQuote closeDoubleQuote)
699    where openDoubleQuote = sym "\"" >> lookAhead nonspaceChar
700          closeDoubleQuote = try $ sym "\""
701