1{-# LANGUAGE LambdaCase #-}
2{-# LANGUAGE OverloadedStrings #-}
3{- |
4   Module      : Text.Pandoc.Readers.Org.Inlines
5   Copyright   : Copyright (C) 2014-2021 Albert Krewinkel
6   License     : GNU GPL, version 2 or above
7
8   Maintainer  : Albert Krewinkel <tarleb+pandoc@moltkeplatz.de>
9
10Parsers for Org-mode inline elements.
11-}
12module Text.Pandoc.Readers.Org.Inlines
13  ( inline
14  , inlines
15  , addToNotesTable
16  , linkTarget
17  ) where
18
19import Text.Pandoc.Readers.Org.BlockStarts (endOfBlock, noteMarker)
20import Text.Pandoc.Readers.Org.ParserState
21import Text.Pandoc.Readers.Org.Parsing
22import Text.Pandoc.Readers.Org.Shared (cleanLinkText, isImageFilename,
23                                       originalLang, translateLang, exportsCode)
24
25import Text.Pandoc.Builder (Inlines)
26import qualified Text.Pandoc.Builder as B
27import Text.Pandoc.Class.PandocMonad (PandocMonad)
28import Text.Pandoc.Definition
29import Text.Pandoc.Options
30import Text.Pandoc.Readers.LaTeX (inlineCommand, rawLaTeXInline)
31import Text.TeXMath (DisplayType (..), readTeX, writePandoc)
32import qualified Text.TeXMath.Readers.MathML.EntityMap as MathMLEntityMap
33
34import Control.Monad (guard, mplus, mzero, unless, void, when)
35import Control.Monad.Trans (lift)
36import Data.Char (isAlphaNum, isSpace)
37import Data.List (intersperse)
38import qualified Data.Map as M
39import Data.Text (Text)
40import qualified Data.Text as T
41
42--
43-- Functions acting on the parser state
44--
45recordAnchorId :: PandocMonad m => Text -> OrgParser m ()
46recordAnchorId i = updateState $ \s ->
47  s{ orgStateAnchorIds = i : orgStateAnchorIds s }
48
49pushToInlineCharStack :: PandocMonad m => Char -> OrgParser m ()
50pushToInlineCharStack c = updateState $ \s ->
51  s{ orgStateEmphasisCharStack = c:orgStateEmphasisCharStack s }
52
53popInlineCharStack :: PandocMonad m => OrgParser m ()
54popInlineCharStack = updateState $ \s ->
55  s{ orgStateEmphasisCharStack = drop 1 . orgStateEmphasisCharStack $ s }
56
57surroundingEmphasisChar :: PandocMonad m => OrgParser m [Char]
58surroundingEmphasisChar =
59  take 1 . drop 1 . orgStateEmphasisCharStack <$> getState
60
61startEmphasisNewlinesCounting :: PandocMonad m => Int -> OrgParser m ()
62startEmphasisNewlinesCounting maxNewlines = updateState $ \s ->
63  s{ orgStateEmphasisNewlines = Just maxNewlines }
64
65decEmphasisNewlinesCount :: PandocMonad m => OrgParser m ()
66decEmphasisNewlinesCount = updateState $ \s ->
67  s{ orgStateEmphasisNewlines = (\n -> n - 1) <$> orgStateEmphasisNewlines s }
68
69newlinesCountWithinLimits :: PandocMonad m => OrgParser m Bool
70newlinesCountWithinLimits = do
71  st <- getState
72  return $ ((< 0) <$> orgStateEmphasisNewlines st) /= Just True
73
74resetEmphasisNewlines :: PandocMonad m => OrgParser m ()
75resetEmphasisNewlines = updateState $ \s ->
76  s{ orgStateEmphasisNewlines = Nothing }
77
78addToNotesTable :: PandocMonad m => OrgNoteRecord -> OrgParser m ()
79addToNotesTable note = do
80  oldnotes <- orgStateNotes' <$> getState
81  updateState $ \s -> s{ orgStateNotes' = note:oldnotes }
82
83-- | Parse a single Org-mode inline element
84inline :: PandocMonad m => OrgParser m (F Inlines)
85inline =
86  choice [ whitespace
87         , linebreak
88         , cite
89         , footnote
90         , linkOrImage
91         , anchor
92         , inlineCodeBlock
93         , str
94         , endline
95         , emphasizedText
96         , code
97         , math
98         , displayMath
99         , verbatim
100         , subscript
101         , superscript
102         , inlineLaTeX
103         , exportSnippet
104         , macro
105         , smart
106         , symbol
107         ] <* (guard =<< newlinesCountWithinLimits)
108  <?> "inline"
109
110-- | Read the rest of the input as inlines.
111inlines :: PandocMonad m => OrgParser m (F Inlines)
112inlines = trimInlinesF . mconcat <$> many1 inline
113
114-- treat these as potentially non-text when parsing inline:
115specialChars :: [Char]
116specialChars = "\"$'()*+-,./:;<=>@[\\]^_{|}~"
117
118
119whitespace :: PandocMonad m => OrgParser m (F Inlines)
120whitespace = pure B.space <$ skipMany1 spaceChar
121                          <* updateLastPreCharPos
122                          <* updateLastForbiddenCharPos
123             <?> "whitespace"
124
125linebreak :: PandocMonad m => OrgParser m (F Inlines)
126linebreak = try $ pure B.linebreak <$ string "\\\\" <* skipSpaces <* newline
127
128str :: PandocMonad m => OrgParser m (F Inlines)
129str = return . B.str <$> many1Char (noneOf $ specialChars ++ "\n\r ")
130      <* updateLastStrPos
131
132-- | An endline character that can be treated as a space, not a structural
133-- break.  This should reflect the values of the Emacs variable
134-- @org-element-pagaraph-separate@.
135endline :: PandocMonad m => OrgParser m (F Inlines)
136endline = try $ do
137  newline
138  notFollowedBy' endOfBlock
139  decEmphasisNewlinesCount
140  guard =<< newlinesCountWithinLimits
141  updateLastPreCharPos
142  useHardBreaks <- exportPreserveBreaks . orgStateExportSettings <$> getState
143  returnF (if useHardBreaks then B.linebreak else B.softbreak)
144
145
146--
147-- Citations
148--
149
150-- The state of citations is a bit confusing due to the lack of an official
151-- syntax and multiple syntaxes coexisting.  The pandocOrgCite syntax was the
152-- first to be implemented here and is almost identical to Markdown's citation
153-- syntax.  The org-ref package is in wide use to handle citations, but the
154-- syntax is a bit limiting and not quite as simple to write.  The
155-- semi-official Org-mode citation syntax is based on John MacFarlane's Pandoc
156-- sytax and Org-oriented enhancements contributed by Richard Lawrence and
157-- others.  It's dubbed Berkeley syntax due the place of activity of its main
158-- contributors.  All this should be consolidated once an official Org-mode
159-- citation syntax has emerged.
160
161cite :: PandocMonad m => OrgParser m (F Inlines)
162cite = try $ berkeleyCite <|> do
163  guardEnabled Ext_citations
164  (cs, raw) <- withRaw $ choice
165               [ pandocOrgCite
166               , orgRefCite
167               , berkeleyTextualCite
168               ]
169  return $ flip B.cite (B.text raw) <$> cs
170
171-- | A citation in Pandoc Org-mode style (@[prefix \@citekey suffix]@).
172pandocOrgCite :: PandocMonad m => OrgParser m (F [Citation])
173pandocOrgCite = try $
174  char '[' *> skipSpaces *> citeList <* skipSpaces <* char ']'
175
176orgRefCite :: PandocMonad m => OrgParser m (F [Citation])
177orgRefCite = try $ choice
178  [ normalOrgRefCite
179  , fmap (:[]) <$> linkLikeOrgRefCite
180  ]
181
182normalOrgRefCite :: PandocMonad m => OrgParser m (F [Citation])
183normalOrgRefCite = try $ do
184  mode <- orgRefCiteMode
185  firstCitation <- orgRefCiteList mode
186  moreCitations <- many (try $ char ',' *> orgRefCiteList mode)
187  return . sequence $ firstCitation : moreCitations
188 where
189  -- | A list of org-ref style citation keys, parsed as citation of the given
190  -- citation mode.
191  orgRefCiteList :: PandocMonad m => CitationMode -> OrgParser m (F Citation)
192  orgRefCiteList citeMode = try $ do
193    key <- orgRefCiteKey
194    returnF Citation
195     { citationId      = key
196     , citationPrefix  = mempty
197     , citationSuffix  = mempty
198     , citationMode    = citeMode
199     , citationNoteNum = 0
200     , citationHash    = 0
201     }
202
203-- | Read an Berkeley-style Org-mode citation.  Berkeley citation style was
204-- develop and adjusted to Org-mode style by John MacFarlane and Richard
205-- Lawrence, respectively, both philosophers at UC Berkeley.
206berkeleyCite :: PandocMonad m => OrgParser m (F Inlines)
207berkeleyCite = try $ do
208  bcl <- berkeleyCitationList
209  return $ do
210    parens <- berkeleyCiteParens <$> bcl
211    prefix <- berkeleyCiteCommonPrefix <$> bcl
212    suffix <- berkeleyCiteCommonSuffix <$> bcl
213    citationList <- berkeleyCiteCitations <$> bcl
214    return $
215      if parens
216      then toCite
217           . maybe id (alterFirst . prependPrefix) prefix
218           . maybe id (alterLast . appendSuffix) suffix
219           $ citationList
220      else maybe mempty (<> " ") prefix
221             <> toListOfCites (map toInTextMode citationList)
222             <> maybe mempty (", " <>) suffix
223 where
224   toCite :: [Citation] -> Inlines
225   toCite cs = B.cite cs mempty
226
227   toListOfCites :: [Citation] -> Inlines
228   toListOfCites = mconcat . intersperse ", " . map (\c -> B.cite [c] mempty)
229
230   toInTextMode :: Citation -> Citation
231   toInTextMode c = c { citationMode = AuthorInText }
232
233   alterFirst, alterLast :: (a -> a) -> [a] -> [a]
234   alterFirst _ []     = []
235   alterFirst f (c:cs) = f c : cs
236   alterLast  f = reverse . alterFirst f . reverse
237
238   prependPrefix, appendSuffix :: Inlines -> Citation -> Citation
239   prependPrefix pre c = c { citationPrefix = B.toList pre <> citationPrefix c }
240   appendSuffix  suf c = c { citationSuffix = citationSuffix c <> B.toList suf }
241
242data BerkeleyCitationList = BerkeleyCitationList
243  { berkeleyCiteParens       :: Bool
244  , berkeleyCiteCommonPrefix :: Maybe Inlines
245  , berkeleyCiteCommonSuffix :: Maybe Inlines
246  , berkeleyCiteCitations    :: [Citation]
247  }
248berkeleyCitationList :: PandocMonad m => OrgParser m (F BerkeleyCitationList)
249berkeleyCitationList = try $ do
250  char '['
251  parens <- choice [ False <$ berkeleyBareTag, True <$ berkeleyParensTag ]
252  char ':'
253  skipSpaces
254  commonPrefix <- optionMaybe (try $ citationListPart <* char ';')
255  citations    <- citeList
256  commonSuffix <- optionMaybe (try citationListPart)
257  char ']'
258  return (BerkeleyCitationList parens
259    <$> sequence commonPrefix
260    <*> sequence commonSuffix
261    <*> citations)
262 where
263   citationListPart :: PandocMonad m => OrgParser m (F Inlines)
264   citationListPart = fmap (trimInlinesF . mconcat) . try . many1 $ do
265     notFollowedBy' citeKey
266     notFollowedBy (oneOf ";]")
267     inline
268
269berkeleyBareTag :: PandocMonad m => OrgParser m ()
270berkeleyBareTag = try $ void berkeleyBareTag'
271
272berkeleyParensTag :: PandocMonad m => OrgParser m ()
273berkeleyParensTag = try . void $ enclosedByPair1 '(' ')' berkeleyBareTag'
274
275berkeleyBareTag' :: PandocMonad m => OrgParser m ()
276berkeleyBareTag' = try $ void (string "cite")
277
278berkeleyTextualCite :: PandocMonad m => OrgParser m (F [Citation])
279berkeleyTextualCite = try $ do
280  (suppressAuthor, key) <- citeKey
281  returnF . return $ Citation
282    { citationId      = key
283    , citationPrefix  = mempty
284    , citationSuffix  = mempty
285    , citationMode    = if suppressAuthor then SuppressAuthor else AuthorInText
286    , citationNoteNum = 0
287    , citationHash    = 0
288    }
289
290-- The following is what a Berkeley-style bracketed textual citation parser
291-- would look like.  However, as these citations are a subset of Pandoc's Org
292-- citation style, this isn't used.
293-- berkeleyBracketedTextualCite :: PandocMonad m => OrgParser m (F [Citation])
294-- berkeleyBracketedTextualCite = try . (fmap head) $
295--   enclosedByPair1 '[' ']' berkeleyTextualCite
296
297-- | Read a link-like org-ref style citation.  The citation includes pre and
298-- post text.  However, multiple citations are not possible due to limitations
299-- in the syntax.
300linkLikeOrgRefCite :: PandocMonad m => OrgParser m (F Citation)
301linkLikeOrgRefCite = try $ do
302  _    <- string "[["
303  mode <- orgRefCiteMode
304  key  <- orgRefCiteKey
305  _    <- string "]["
306  pre  <- trimInlinesF . mconcat <$> manyTill inline (try $ string "::")
307  spc  <- option False (True <$ spaceChar)
308  suf  <- trimInlinesF . mconcat <$> manyTill inline (try $ string "]]")
309  return $ do
310    pre' <- pre
311    suf' <- suf
312    return Citation
313      { citationId      = key
314      , citationPrefix  = B.toList pre'
315      , citationSuffix  = B.toList (if spc then B.space <> suf' else suf')
316      , citationMode    = mode
317      , citationNoteNum = 0
318      , citationHash    = 0
319      }
320
321-- | Read a citation key.  The characters allowed in citation keys are taken
322-- from the `org-ref-cite-re` variable in `org-ref.el`.
323orgRefCiteKey :: PandocMonad m => OrgParser m Text
324orgRefCiteKey =
325  let citeKeySpecialChars = "-_:\\./," :: String
326      isCiteKeySpecialChar c = c `elem` citeKeySpecialChars
327      isCiteKeyChar c = isAlphaNum c || isCiteKeySpecialChar c
328      endOfCitation = try $ do
329        many $ satisfy isCiteKeySpecialChar
330        satisfy $ not . isCiteKeyChar
331  in try $ satisfy isCiteKeyChar `many1TillChar` lookAhead endOfCitation
332
333
334-- | Supported citation types.  Only a small subset of org-ref types is
335-- supported for now.  TODO: rewrite this, use LaTeX reader as template.
336orgRefCiteMode :: PandocMonad m => OrgParser m CitationMode
337orgRefCiteMode =
338  choice $ map (\(s, mode) -> mode <$ try (string s <* char ':'))
339    [ ("cite", AuthorInText)
340    , ("citep", NormalCitation)
341    , ("citep*", NormalCitation)
342    , ("citet", AuthorInText)
343    , ("citet*", AuthorInText)
344    , ("citeyear", SuppressAuthor)
345    ]
346
347citeList :: PandocMonad m => OrgParser m (F [Citation])
348citeList = sequence <$> sepEndBy1 citation (try $ char ';' *> skipSpaces)
349
350citation :: PandocMonad m => OrgParser m (F Citation)
351citation = try $ do
352  pref <- prefix
353  (suppress_author, key) <- citeKey
354  suff <- suffix
355  return $ do
356    x <- pref
357    y <- suff
358    return Citation
359      { citationId      = key
360      , citationPrefix  = B.toList x
361      , citationSuffix  = B.toList y
362      , citationMode    = if suppress_author
363                          then SuppressAuthor
364                          else NormalCitation
365      , citationNoteNum = 0
366      , citationHash    = 0
367      }
368 where
369   prefix = trimInlinesF . mconcat <$>
370            manyTill inline (char ']' <|> (']' <$ lookAhead citeKey))
371   suffix = try $ do
372     hasSpace <- option False (notFollowedBy nonspaceChar >> return True)
373     skipSpaces
374     rest <- trimInlinesF . mconcat <$>
375             many (notFollowedBy (oneOf ";]") *> inline)
376     return $ if hasSpace
377              then (B.space <>) <$> rest
378              else rest
379
380footnote :: PandocMonad m => OrgParser m (F Inlines)
381footnote = try $ do
382  note <- inlineNote <|> referencedNote
383  withNote <- getExportSetting exportWithFootnotes
384  return $ if withNote then note else mempty
385
386inlineNote :: PandocMonad m => OrgParser m (F Inlines)
387inlineNote = try $ do
388  string "[fn:"
389  ref <- manyChar alphaNum
390  char ':'
391  note <- fmap B.para . trimInlinesF . mconcat <$> many1Till inline (char ']')
392  unless (T.null ref) $
393       addToNotesTable ("fn:" <> ref, note)
394  return $ B.note <$> note
395
396referencedNote :: PandocMonad m => OrgParser m (F Inlines)
397referencedNote = try $ do
398  ref <- noteMarker
399  return $ do
400    notes <- asksF orgStateNotes'
401    case lookup ref notes of
402      Nothing   -> return . B.str $ "[" <> ref <> "]"
403      Just contents  -> do
404        st <- askF
405        let contents' = runF contents st{ orgStateNotes' = [] }
406        return $ B.note contents'
407
408linkOrImage :: PandocMonad m => OrgParser m (F Inlines)
409linkOrImage = explicitOrImageLink
410              <|> selflinkOrImage
411              <|> angleLink
412              <|> plainLink
413              <?> "link or image"
414
415explicitOrImageLink :: PandocMonad m => OrgParser m (F Inlines)
416explicitOrImageLink = try $ do
417  char '['
418  srcF   <- applyCustomLinkFormat =<< possiblyEmptyLinkTarget
419  descr  <- enclosedRaw (char '[') (char ']')
420  titleF <- parseFromString (mconcat <$> many inline) descr
421  char ']'
422  return $ do
423    src <- srcF
424    title <- titleF
425    case cleanLinkText descr of
426      Just imgSrc | isImageFilename imgSrc ->
427        return . B.link src "" $ B.image imgSrc mempty mempty
428      _ ->
429        linkToInlinesF src title
430
431selflinkOrImage :: PandocMonad m => OrgParser m (F Inlines)
432selflinkOrImage = try $ do
433  target <- char '[' *> linkTarget <* char ']'
434  case cleanLinkText target of
435    Nothing        -> case T.uncons target of
436                        Just ('#', _) -> returnF $ B.link target "" (B.str target)
437                        _             -> return $ internalLink target (B.str target)
438    Just nonDocTgt -> if isImageFilename nonDocTgt
439                      then returnF $ B.image nonDocTgt "" ""
440                      else returnF $ B.link nonDocTgt "" (B.str target)
441
442plainLink :: PandocMonad m => OrgParser m (F Inlines)
443plainLink = try $ do
444  (orig, src) <- uri
445  returnF $ B.link src "" (B.str orig)
446
447angleLink :: PandocMonad m => OrgParser m (F Inlines)
448angleLink = try $ do
449  char '<'
450  link <- plainLink
451  char '>'
452  return link
453
454linkTarget :: PandocMonad m => OrgParser m Text
455linkTarget = T.pack <$> enclosedByPair1 '[' ']' (noneOf "\n\r[]")
456
457possiblyEmptyLinkTarget :: PandocMonad m => OrgParser m Text
458possiblyEmptyLinkTarget = try linkTarget <|> ("" <$ string "[]")
459
460applyCustomLinkFormat :: Text -> OrgParser m (F Text)
461applyCustomLinkFormat link = do
462  let (linkType, rest) = T.break (== ':') link
463  return $ do
464    formatter <- M.lookup linkType <$> asksF orgStateLinkFormatters
465    return $ maybe link ($ T.drop 1 rest) formatter
466
467-- | Take a link and return a function which produces new inlines when given
468-- description inlines.
469linkToInlinesF :: Text -> Inlines -> F Inlines
470linkToInlinesF linkStr =
471  case T.uncons linkStr of
472    Nothing       -> pure . B.link mempty ""       -- wiki link (empty by convention)
473    Just ('#', _) -> pure . B.link linkStr ""      -- document-local fraction
474    _             -> case cleanLinkText linkStr of
475      Just extTgt -> return . B.link extTgt ""
476      Nothing     -> internalLink linkStr  -- other internal link
477
478internalLink :: Text -> Inlines -> F Inlines
479internalLink link title = do
480  ids <- asksF orgStateAnchorIds
481  if link `elem` ids
482    then return $ B.link ("#" <> link) "" title
483    else let attr' = ("", ["spurious-link"] , [("target", link)])
484         in return $ B.spanWith attr' (B.emph title)
485
486-- | Parse an anchor like @<<anchor-id>>@ and return an empty span with
487-- @anchor-id@ set as id.  Legal anchors in org-mode are defined through
488-- @org-target-regexp@, which is fairly liberal.  Since no link is created if
489-- @anchor-id@ contains spaces, we are more restrictive in what is accepted as
490-- an anchor.
491anchor :: PandocMonad m => OrgParser m (F Inlines)
492anchor =  try $ do
493  anchorId <- parseAnchor
494  recordAnchorId anchorId
495  returnF $ B.spanWith (solidify anchorId, [], []) mempty
496 where
497       parseAnchor = string "<<"
498                     *> many1Char (noneOf "\t\n\r<>\"' ")
499                     <* string ">>"
500                     <* skipSpaces
501
502-- | Replace every char but [a-zA-Z0-9_.-:] with a hyphen '-'.  This mirrors
503-- the org function @org-export-solidify-link-text@.
504solidify :: Text -> Text
505solidify = T.map replaceSpecialChar
506 where replaceSpecialChar c
507           | isAlphaNum c    = c
508           | c `elem` ("_.-:" :: String) = c
509           | otherwise       = '-'
510
511-- | Parses an inline code block and marks it as an babel block.
512inlineCodeBlock :: PandocMonad m => OrgParser m (F Inlines)
513inlineCodeBlock = try $ do
514  string "src_"
515  lang <- many1Char orgArgWordChar
516  opts <- option [] $ enclosedByPair '[' ']' inlineBlockOption
517  inlineCode <- T.pack <$> enclosedByPair1 '{' '}' (noneOf "\n\r")
518  let attrClasses = [translateLang lang]
519  let attrKeyVal  = originalLang lang <> opts
520  let codeInlineBlck = B.codeWith ("", attrClasses, attrKeyVal) inlineCode
521  returnF $ if exportsCode opts then codeInlineBlck else mempty
522 where
523   inlineBlockOption :: PandocMonad m => OrgParser m (Text, Text)
524   inlineBlockOption = try $ do
525     argKey <- orgArgKey
526     paramValue <- option "yes" orgInlineParamValue
527     return (argKey, paramValue)
528
529   orgInlineParamValue :: PandocMonad m => OrgParser m Text
530   orgInlineParamValue = try $
531     skipSpaces
532       *> notFollowedBy (char ':')
533       *> many1Char (noneOf "\t\n\r ]")
534       <* skipSpaces
535
536
537emphasizedText :: PandocMonad m => OrgParser m (F Inlines)
538emphasizedText = do
539  state <- getState
540  guard . exportEmphasizedText . orgStateExportSettings $ state
541  try $ choice
542    [ emph
543    , strong
544    , strikeout
545    , underline
546    ]
547
548enclosedByPair :: PandocMonad m
549               => Char          -- ^ opening char
550               -> Char          -- ^ closing char
551               -> OrgParser m a   -- ^ parser
552               -> OrgParser m [a]
553enclosedByPair s e p = char s *> manyTill p (char e)
554
555enclosedByPair1 :: PandocMonad m
556               => Char          -- ^ opening char
557               -> Char          -- ^ closing char
558               -> OrgParser m a   -- ^ parser
559               -> OrgParser m [a]
560enclosedByPair1 s e p = char s *> many1Till p (char e)
561
562emph      :: PandocMonad m => OrgParser m (F Inlines)
563emph      = fmap B.emph         <$> emphasisBetween '/'
564
565strong    :: PandocMonad m => OrgParser m (F Inlines)
566strong    = fmap B.strong       <$> emphasisBetween '*'
567
568strikeout :: PandocMonad m => OrgParser m (F Inlines)
569strikeout = fmap B.strikeout    <$> emphasisBetween '+'
570
571underline :: PandocMonad m => OrgParser m (F Inlines)
572underline = fmap B.underline    <$> emphasisBetween '_'
573
574verbatim  :: PandocMonad m => OrgParser m (F Inlines)
575verbatim  = return . B.codeWith ("", ["verbatim"], []) <$> verbatimBetween '='
576
577code      :: PandocMonad m => OrgParser m (F Inlines)
578code      = return . B.code     <$> verbatimBetween '~'
579
580subscript   :: PandocMonad m => OrgParser m (F Inlines)
581subscript   = fmap B.subscript   <$> try (char '_' *> subOrSuperExpr)
582
583superscript :: PandocMonad m => OrgParser m (F Inlines)
584superscript = fmap B.superscript <$> try (char '^' *> subOrSuperExpr)
585
586math      :: PandocMonad m => OrgParser m (F Inlines)
587math      = return . B.math      <$> choice [ math1CharBetween '$'
588                                            , mathTextBetween '$'
589                                            , rawMathBetween "\\(" "\\)"
590                                            ]
591
592displayMath :: PandocMonad m => OrgParser m (F Inlines)
593displayMath = return . B.displayMath <$> choice [ rawMathBetween "\\[" "\\]"
594                                                , rawMathBetween "$$"  "$$"
595                                                ]
596
597updatePositions :: PandocMonad m
598                => Char
599                -> OrgParser m Char
600updatePositions c = do
601  st <- getState
602  let emphasisPreChars = orgStateEmphasisPreChars st
603  when (c `elem` emphasisPreChars) updateLastPreCharPos
604  when (c `elem` emphasisForbiddenBorderChars) updateLastForbiddenCharPos
605  return c
606
607symbol :: PandocMonad m => OrgParser m (F Inlines)
608symbol = return . B.str . T.singleton <$> (oneOf specialChars >>= updatePositions)
609
610emphasisBetween :: PandocMonad m
611                => Char
612                -> OrgParser m (F Inlines)
613emphasisBetween c = try $ do
614  startEmphasisNewlinesCounting emphasisAllowedNewlines
615  res <- enclosedInlines (emphasisStart c) (emphasisEnd c)
616  isTopLevelEmphasis <- null . orgStateEmphasisCharStack <$> getState
617  when isTopLevelEmphasis
618       resetEmphasisNewlines
619  return res
620
621verbatimBetween :: PandocMonad m
622                => Char
623                -> OrgParser m Text
624verbatimBetween c = try $
625  emphasisStart c *>
626  many1TillNOrLessNewlines 1 verbatimChar (emphasisEnd c)
627 where
628   verbatimChar = noneOf "\n\r" >>= updatePositions
629
630-- | Parses a raw string delimited by @c@ using Org's math rules
631mathTextBetween :: PandocMonad m
632                  => Char
633                  -> OrgParser m Text
634mathTextBetween c = try $ do
635  mathStart c
636  body <- many1TillNOrLessNewlines mathAllowedNewlines
637                                   (noneOf (c:"\n\r"))
638                                   (lookAhead $ mathEnd c)
639  final <- mathEnd c
640  return $ T.snoc body final
641
642-- | Parse a single character between @c@ using math rules
643math1CharBetween :: PandocMonad m
644                 => Char
645                -> OrgParser m Text
646math1CharBetween c = try $ do
647  char c
648  res <- noneOf $ c:mathForbiddenBorderChars
649  char c
650  eof <|> () <$ lookAhead (oneOf mathPostChars)
651  return $ T.singleton res
652
653rawMathBetween :: PandocMonad m
654               => Text
655               -> Text
656               -> OrgParser m Text
657rawMathBetween s e = try $ textStr s *> manyTillChar anyChar (try $ textStr e)
658
659-- | Parses the start (opening character) of emphasis
660emphasisStart :: PandocMonad m => Char -> OrgParser m Char
661emphasisStart c = try $ do
662  guard =<< afterEmphasisPreChar
663  guard =<< notAfterString
664  char c
665  lookAhead (noneOf emphasisForbiddenBorderChars)
666  pushToInlineCharStack c
667  -- nested inlines are allowed, so mark this position as one which might be
668  -- followed by another inline.
669  updateLastPreCharPos
670  return c
671
672-- | Parses the closing character of emphasis
673emphasisEnd :: PandocMonad m => Char -> OrgParser m Char
674emphasisEnd c = try $ do
675  guard =<< notAfterForbiddenBorderChar
676  char c
677  eof <|> () <$ lookAhead acceptablePostChars
678  updateLastStrPos
679  popInlineCharStack
680  return c
681 where
682  acceptablePostChars = do
683    emphasisPostChars <- orgStateEmphasisPostChars <$> getState
684    surroundingEmphasisChar >>= \x -> oneOf (x ++ emphasisPostChars)
685
686mathStart :: PandocMonad m => Char -> OrgParser m Char
687mathStart c = try $
688  char c <* notFollowedBy' (oneOf (c:mathForbiddenBorderChars))
689
690mathEnd :: PandocMonad m => Char -> OrgParser m Char
691mathEnd c = try $ do
692  res <- noneOf (c:mathForbiddenBorderChars)
693  char c
694  eof <|> () <$ lookAhead (oneOf mathPostChars)
695  return res
696
697
698enclosedInlines :: (PandocMonad m, Show b) => OrgParser m a
699                -> OrgParser m b
700                -> OrgParser m (F Inlines)
701enclosedInlines start end = try $
702  trimInlinesF . mconcat <$> enclosed start end inline
703
704enclosedRaw :: (PandocMonad m, Show b) => OrgParser m a
705            -> OrgParser m b
706            -> OrgParser m Text
707enclosedRaw start end = try $
708  start *> (onSingleLine <|> spanningTwoLines)
709 where onSingleLine = try $ many1TillChar (noneOf "\n\r") end
710       spanningTwoLines = try $
711         anyLine >>= \f -> mappend (f <> " ") <$> onSingleLine
712
713-- | Like many1Till, but parses at most @n+1@ lines.  @p@ must not consume
714--   newlines.
715many1TillNOrLessNewlines :: PandocMonad m => Int
716                         -> OrgParser m Char
717                         -> OrgParser m a
718                         -> OrgParser m Text
719many1TillNOrLessNewlines n p end = try $
720  nMoreLines (Just n) mempty >>= oneOrMore
721 where
722   nMoreLines Nothing  cs = return cs
723   nMoreLines (Just 0) cs = try $ (cs ++) <$> finalLine
724   nMoreLines k        cs = try $ (final k cs <|> rest k cs)
725                                  >>= uncurry nMoreLines
726   final _ cs = (\x -> (Nothing,      cs ++ x)) <$> try finalLine
727   rest  m cs = (\x -> (minus1 <$> m, cs ++ x ++ "\n")) <$> try (manyTill p newline)
728   finalLine = try $ manyTill p end
729   minus1 k = k - 1
730   oneOrMore cs = T.pack cs <$ guard (not $ null cs)
731
732-- Org allows customization of the way it reads emphasis.  We use the defaults
733-- here (see, e.g., the Emacs Lisp variable `org-emphasis-regexp-components`
734-- for details).
735
736-- | Chars not allowed at the (inner) border of emphasis
737emphasisForbiddenBorderChars :: [Char]
738emphasisForbiddenBorderChars = "\t\n\r "
739
740-- | The maximum number of newlines within
741emphasisAllowedNewlines :: Int
742emphasisAllowedNewlines = 1
743
744-- LaTeX-style math: see `org-latex-regexps` for details
745
746-- | Chars allowed after an inline ($...$) math statement
747mathPostChars :: [Char]
748mathPostChars = "\t\n \"'),-.:;?"
749
750-- | Chars not allowed at the (inner) border of math
751mathForbiddenBorderChars :: [Char]
752mathForbiddenBorderChars = "\t\n\r ,;.$"
753
754-- | Maximum number of newlines in an inline math statement
755mathAllowedNewlines :: Int
756mathAllowedNewlines = 2
757
758-- | Whether we are right behind a char allowed before emphasis
759afterEmphasisPreChar :: PandocMonad m => OrgParser m Bool
760afterEmphasisPreChar = do
761  pos <- getPosition
762  lastPrePos <- orgStateLastPreCharPos <$> getState
763  return $ maybe True (== pos) lastPrePos
764
765-- | Whether the parser is right after a forbidden border char
766notAfterForbiddenBorderChar :: PandocMonad m => OrgParser m Bool
767notAfterForbiddenBorderChar = do
768  pos <- getPosition
769  lastFBCPos <- orgStateLastForbiddenCharPos <$> getState
770  return $ lastFBCPos /= Just pos
771
772-- | Read a sub- or superscript expression
773subOrSuperExpr :: PandocMonad m => OrgParser m (F Inlines)
774subOrSuperExpr = try $
775  simpleSubOrSuperText <|>
776  (choice [ charsInBalanced '{' '}' (noneOf "\n\r")
777          , enclosing ('(', ')') <$> charsInBalanced '(' ')' (noneOf "\n\r")
778          ] >>= parseFromString (mconcat <$> many inline))
779 where enclosing (left, right) s = T.cons left $ T.snoc s right
780
781simpleSubOrSuperText :: PandocMonad m => OrgParser m (F Inlines)
782simpleSubOrSuperText = try $ do
783  state <- getState
784  guard . exportSubSuperscripts . orgStateExportSettings $ state
785  return . B.str <$>
786    choice [ textStr "*"
787           , mappend <$> option "" (T.singleton <$> oneOf "+-")
788                     <*> many1Char alphaNum
789           ]
790
791inlineLaTeX :: PandocMonad m => OrgParser m (F Inlines)
792inlineLaTeX = try $ do
793  cmd <- inlineLaTeXCommand
794  texOpt <- getExportSetting exportWithLatex
795  allowEntities <- getExportSetting exportWithEntities
796  ils <- parseAsInlineLaTeX cmd texOpt
797  maybe mzero returnF $
798     parseAsMathMLSym allowEntities cmd `mplus`
799     parseAsMath cmd texOpt `mplus`
800     ils
801 where
802   parseAsInlineLaTeX :: PandocMonad m
803                      => Text -> TeXExport -> OrgParser m (Maybe Inlines)
804   parseAsInlineLaTeX cs = \case
805     TeXExport -> maybeRight <$> runParserT inlineCommand state "" cs
806     TeXIgnore -> return (Just mempty)
807     TeXVerbatim -> return (Just $ B.str cs)
808
809   parseAsMathMLSym :: Bool -> Text -> Maybe Inlines
810   parseAsMathMLSym allowEntities cs = do
811     -- drop initial backslash and any trailing "{}"
812     let clean = T.dropWhileEnd (`elem` ("{}" :: String)) . T.drop 1
813     -- If entities are disabled, then return the string as text, but
814     -- only if this *is* a MathML entity.
815     case B.str <$> MathMLEntityMap.getUnicode (clean cs) of
816       Just _ | not allowEntities -> Just $ B.str cs
817       x -> x
818
819   state :: ParserState
820   state = def{ stateOptions = def{ readerExtensions =
821                    enableExtension Ext_raw_tex (readerExtensions def) } }
822
823   parseAsMath :: Text -> TeXExport -> Maybe Inlines
824   parseAsMath cs = \case
825     TeXExport -> maybeRight (readTeX cs) >>=
826                  fmap B.fromList . writePandoc DisplayInline
827     TeXIgnore -> Just mempty
828     TeXVerbatim -> Just $ B.str cs
829
830maybeRight :: Either a b -> Maybe b
831maybeRight = either (const Nothing) Just
832
833inlineLaTeXCommand :: PandocMonad m => OrgParser m Text
834inlineLaTeXCommand = try $ do
835  rest <- getInput
836  st <- getState
837  parsed <- (lift . lift) $ runParserT rawLaTeXInline st "source" rest
838  case parsed of
839    Right cs -> do
840      -- drop any trailing whitespace, those are not part of the command as
841      -- far as org mode is concerned.
842      let cmdNoSpc = T.dropWhileEnd isSpace cs
843      let len = T.length cmdNoSpc
844      count len anyChar
845      return cmdNoSpc
846    _ -> mzero
847
848exportSnippet :: PandocMonad m => OrgParser m (F Inlines)
849exportSnippet = try $ do
850  string "@@"
851  format <- many1TillChar (alphaNum <|> char '-') (char ':')
852  snippet <- manyTillChar anyChar (try $ string "@@")
853  returnF $ B.rawInline format snippet
854
855macro :: PandocMonad m => OrgParser m (F Inlines)
856macro = try $ do
857  recursionDepth <- orgStateMacroDepth <$> getState
858  guard $ recursionDepth < 15
859  string "{{{"
860  name <- manyChar alphaNum
861  args <- ([] <$ string "}}}")
862          <|> char '(' *> argument `sepBy` char ',' <* eoa
863  expander <- lookupMacro name <$> getState
864  case expander of
865    Nothing -> mzero
866    Just fn -> do
867      updateState $ \s -> s { orgStateMacroDepth = recursionDepth + 1 }
868      res <- parseFromString (mconcat <$> many inline) $ fn args
869      updateState $ \s -> s { orgStateMacroDepth = recursionDepth }
870      return res
871 where
872  argument = manyChar $ notFollowedBy eoa *> noneOf ","
873  eoa = string ")}}}"
874
875smart :: PandocMonad m => OrgParser m (F Inlines)
876smart = choice [doubleQuoted, singleQuoted, orgApostrophe, orgDash, orgEllipses]
877  where
878    orgDash = do
879      guardOrSmartEnabled =<< getExportSetting exportSpecialStrings
880      pure <$> dash <* updatePositions '-'
881    orgEllipses = do
882      guardOrSmartEnabled =<< getExportSetting exportSpecialStrings
883      pure <$> ellipses <* updatePositions '.'
884    orgApostrophe = do
885      guardEnabled Ext_smart
886      (char '\'' <|> char '\8217') <* updateLastPreCharPos
887                                   <* updateLastForbiddenCharPos
888      returnF (B.str "\x2019")
889
890guardOrSmartEnabled :: PandocMonad m => Bool -> OrgParser m ()
891guardOrSmartEnabled b = do
892  smartExtension <- extensionEnabled Ext_smart <$> getOption readerExtensions
893  guard (b || smartExtension)
894
895singleQuoted :: PandocMonad m => OrgParser m (F Inlines)
896singleQuoted = try $ do
897  guardOrSmartEnabled =<< getExportSetting exportSmartQuotes
898  singleQuoteStart
899  updatePositions '\''
900  withQuoteContext InSingleQuote $
901    fmap B.singleQuoted . trimInlinesF . mconcat <$>
902      many1Till inline (singleQuoteEnd <* updatePositions '\'')
903
904-- doubleQuoted will handle regular double-quoted sections, as well
905-- as dialogues with an open double-quote without a close double-quote
906-- in the same paragraph.
907doubleQuoted :: PandocMonad m => OrgParser m (F Inlines)
908doubleQuoted = try $ do
909  guardOrSmartEnabled =<< getExportSetting exportSmartQuotes
910  doubleQuoteStart
911  updatePositions '"'
912  contents <- mconcat <$> many (try $ notFollowedBy doubleQuoteEnd >> inline)
913  let doubleQuotedContent = withQuoteContext InDoubleQuote $ do
914        doubleQuoteEnd
915        updateLastForbiddenCharPos
916        return . fmap B.doubleQuoted . trimInlinesF $ contents
917  let leftQuoteAndContent = return $ pure (B.str "\8220") <> contents
918  doubleQuotedContent <|> leftQuoteAndContent
919