1{-# LANGUAGE Arrows            #-}
2{-# LANGUAGE CPP               #-}
3{-# LANGUAGE DeriveFoldable    #-}
4{-# LANGUAGE GeneralizedNewtypeDeriving #-}
5{-# LANGUAGE PatternGuards     #-}
6{-# LANGUAGE RecordWildCards   #-}
7{-# LANGUAGE TupleSections     #-}
8{-# LANGUAGE ViewPatterns      #-}
9{-# LANGUAGE OverloadedStrings #-}
10{- |
11   Module      : Text.Pandoc.Readers.Odt.ContentReader
12   Copyright   : Copyright (C) 2015 Martin Linnemann
13   License     : GNU GPL, version 2 or above
14
15   Maintainer  : Martin Linnemann <theCodingMarlin@googlemail.com>
16   Stability   : alpha
17   Portability : portable
18
19The core of the odt reader that converts odt features into Pandoc types.
20-}
21
22module Text.Pandoc.Readers.Odt.ContentReader
23( readerState
24, read_body
25) where
26
27import Control.Applicative hiding (liftA, liftA2, liftA3)
28import Control.Arrow
29import Control.Monad ((<=<))
30
31import qualified Data.ByteString.Lazy as B
32import Data.Foldable (fold)
33import Data.List (find)
34import qualified Data.Map as M
35import qualified Data.Text as T
36import Data.Maybe
37import Data.Monoid (Alt (..))
38
39import Text.TeXMath (readMathML, writeTeX)
40import qualified Text.Pandoc.XML.Light as XML
41
42import Text.Pandoc.Builder hiding (underline)
43import Text.Pandoc.MediaBag (MediaBag, insertMedia)
44import Text.Pandoc.Shared
45import Text.Pandoc.Extensions (extensionsFromList, Extension(..))
46import qualified Text.Pandoc.UTF8 as UTF8
47
48import Text.Pandoc.Readers.Odt.Base
49import Text.Pandoc.Readers.Odt.Namespaces
50import Text.Pandoc.Readers.Odt.StyleReader
51
52import Text.Pandoc.Readers.Odt.Arrows.State (foldS)
53import Text.Pandoc.Readers.Odt.Arrows.Utils
54import Text.Pandoc.Readers.Odt.Generic.Fallible
55import Text.Pandoc.Readers.Odt.Generic.Utils
56import Text.Pandoc.Readers.Odt.Generic.XMLConverter
57
58import qualified Data.Set as Set
59
60--------------------------------------------------------------------------------
61-- State
62--------------------------------------------------------------------------------
63
64type Anchor = T.Text
65type Media = [(FilePath, B.ByteString)]
66
67data ReaderState
68   = ReaderState { -- | A collection of styles read somewhere else.
69                   -- It is only queried here, not modified.
70                   styleSet         :: Styles
71                   -- | A stack of the styles of parent elements.
72                   -- Used to look up inherited style properties.
73                 , styleTrace       :: [Style]
74                   -- | Keeps track of the current depth in nested lists
75                 , currentListLevel :: ListLevel
76                   -- | Lists may provide their own style, but they don't have
77                   -- to. If they do not, the style of a parent list may be used
78                   -- or even a default list style from the paragraph style.
79                   -- This value keeps track of the closest list style there
80                   -- currently is.
81                 , currentListStyle :: Maybe ListStyle
82                   -- | A map from internal anchor names to "pretty" ones.
83                   -- The mapping is a purely cosmetic one.
84                 , bookmarkAnchors  :: M.Map Anchor Anchor
85                   -- | A map of files / binary data from the archive
86                 , envMedia         :: Media
87                   -- | Hold binary resources used in the document
88                 , odtMediaBag      :: MediaBag
89                 }
90  deriving ( Show )
91
92readerState :: Styles -> Media -> ReaderState
93readerState styles media = ReaderState styles [] 0 Nothing M.empty media mempty
94
95--
96pushStyle'  :: Style -> ReaderState -> ReaderState
97pushStyle' style state = state { styleTrace = style : styleTrace state }
98
99--
100popStyle'   :: ReaderState -> ReaderState
101popStyle' state = case styleTrace state of
102                   _:trace -> state  { styleTrace = trace  }
103                   _       -> state
104
105--
106modifyListLevel :: (ListLevel -> ListLevel) -> (ReaderState -> ReaderState)
107modifyListLevel f state = state { currentListLevel = f (currentListLevel state) }
108
109--
110shiftListLevel :: ListLevel -> (ReaderState -> ReaderState)
111shiftListLevel diff = modifyListLevel (+ diff)
112
113--
114swapCurrentListStyle :: Maybe ListStyle -> ReaderState
115                     -> (ReaderState, Maybe ListStyle)
116swapCurrentListStyle mListStyle state = ( state { currentListStyle = mListStyle }
117                                        ,  currentListStyle state
118                                        )
119
120--
121lookupPrettyAnchor :: Anchor -> ReaderState -> Maybe Anchor
122lookupPrettyAnchor anchor ReaderState{..} = M.lookup anchor bookmarkAnchors
123
124--
125putPrettyAnchor :: Anchor -> Anchor -> ReaderState -> ReaderState
126putPrettyAnchor ugly pretty state@ReaderState{..}
127  = state { bookmarkAnchors = M.insert ugly pretty bookmarkAnchors }
128
129--
130usedAnchors :: ReaderState -> [Anchor]
131usedAnchors ReaderState{..} = M.elems bookmarkAnchors
132
133getMediaBag :: ReaderState -> MediaBag
134getMediaBag ReaderState{..} = odtMediaBag
135
136getMediaEnv :: ReaderState -> Media
137getMediaEnv ReaderState{..} = envMedia
138
139insertMedia' :: (FilePath, B.ByteString) -> ReaderState ->  ReaderState
140insertMedia' (fp, bs) state@ReaderState{..}
141  = state { odtMediaBag = insertMedia fp Nothing bs odtMediaBag }
142
143--------------------------------------------------------------------------------
144-- Reader type and associated tools
145--------------------------------------------------------------------------------
146
147type OdtReader      a b = XMLReader     ReaderState a b
148
149type OdtReaderSafe  a b = XMLReaderSafe ReaderState a b
150
151-- | Extract something from the styles
152fromStyles :: (a -> Styles -> b) -> OdtReaderSafe a b
153fromStyles f =     keepingTheValue
154                     (getExtraState >>^ styleSet)
155               >>% f
156
157--
158getStyleByName :: OdtReader StyleName Style
159getStyleByName = fromStyles lookupStyle >>^ maybeToChoice
160
161--
162findStyleFamily :: OdtReader Style StyleFamily
163findStyleFamily = fromStyles getStyleFamily >>^ maybeToChoice
164
165--
166lookupListStyle :: OdtReader StyleName ListStyle
167lookupListStyle = fromStyles lookupListStyleByName >>^ maybeToChoice
168
169--
170switchCurrentListStyle :: OdtReaderSafe (Maybe ListStyle) (Maybe ListStyle)
171switchCurrentListStyle =     keepingTheValue getExtraState
172                         >>% swapCurrentListStyle
173                         >>> first setExtraState
174                         >>^ snd
175
176--
177pushStyle :: OdtReaderSafe Style Style
178pushStyle =     keepingTheValue (
179                  (     keepingTheValue getExtraState
180                    >>% pushStyle'
181                  )
182                  >>> setExtraState
183                )
184            >>^ fst
185
186--
187popStyle :: OdtReaderSafe x x
188popStyle =     keepingTheValue (
189                     getExtraState
190                 >>> arr popStyle'
191                 >>> setExtraState
192               )
193           >>^ fst
194
195--
196getCurrentListLevel :: OdtReaderSafe _x ListLevel
197getCurrentListLevel = getExtraState >>^ currentListLevel
198
199--
200updateMediaWithResource :: OdtReaderSafe (FilePath, B.ByteString) (FilePath, B.ByteString)
201updateMediaWithResource = keepingTheValue (
202                 (keepingTheValue getExtraState
203                  >>% insertMedia'
204                  )
205                 >>> setExtraState
206               )
207           >>^ fst
208
209lookupResource :: OdtReaderSafe FilePath (FilePath, B.ByteString)
210lookupResource = proc target -> do
211    state <- getExtraState -< ()
212    case lookup target (getMediaEnv state) of
213      Just bs -> returnV (target, bs) -<< ()
214      Nothing -> returnV ("", B.empty) -< ()
215
216type AnchorPrefix = T.Text
217
218-- | An adaptation of 'uniqueIdent' from "Text.Pandoc.Shared" that generates a
219-- unique identifier but without assuming that the id should be for a header.
220-- Second argument is a list of already used identifiers.
221uniqueIdentFrom :: AnchorPrefix -> [Anchor] -> Anchor
222uniqueIdentFrom baseIdent usedIdents =
223  let  numIdent n = baseIdent <> "-" <> T.pack (show n)
224  in  if baseIdent `elem` usedIdents
225        then maybe baseIdent numIdent
226             $ find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int])
227               -- if we have more than 60,000, allow repeats
228        else baseIdent
229
230-- | First argument: basis for a new "pretty" anchor if none exists yet
231-- Second argument: a key ("ugly" anchor)
232-- Returns: saved "pretty" anchor or created new one
233getPrettyAnchor :: OdtReaderSafe (AnchorPrefix, Anchor) Anchor
234getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do
235  state <- getExtraState -< ()
236  case lookupPrettyAnchor uglyAnchor state of
237    Just prettyAnchor -> returnA -< prettyAnchor
238    Nothing           -> do
239      let newPretty = uniqueIdentFrom baseIdent (usedAnchors state)
240      modifyExtraState (putPrettyAnchor uglyAnchor newPretty) -<< newPretty
241
242-- | Input: basis for a new header anchor
243-- Output: saved new anchor
244getHeaderAnchor :: OdtReaderSafe Inlines Anchor
245getHeaderAnchor = proc title -> do
246  state <- getExtraState -< ()
247  let exts = extensionsFromList [Ext_auto_identifiers]
248  let anchor = uniqueIdent exts (toList title)
249                (Set.fromList $ usedAnchors state)
250  modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor
251
252
253--------------------------------------------------------------------------------
254-- Working with styles
255--------------------------------------------------------------------------------
256
257--
258readStyleByName :: OdtReader _x (StyleName, Style)
259readStyleByName =
260  findAttr NsText "style-name" >>? keepingTheValue getStyleByName >>^ liftE
261  where
262    liftE :: (StyleName, Fallible Style) -> Fallible (StyleName, Style)
263    liftE (name, Right v) = Right (name, v)
264    liftE (_, Left v)     = Left v
265
266--
267isStyleToTrace :: OdtReader Style Bool
268isStyleToTrace = findStyleFamily >>?^ (==FaText)
269
270--
271withNewStyle :: OdtReaderSafe x Inlines -> OdtReaderSafe x Inlines
272withNewStyle a = proc x -> do
273  fStyle <- readStyleByName -< ()
274  case fStyle of
275    Right (styleName, _) | isCodeStyle styleName -> do
276      inlines <- a -< x
277      arr inlineCode -<< inlines
278    Right (_, style) -> do
279      mFamily    <- arr styleFamily -< style
280      fTextProps <- arr ( maybeToChoice
281                        . textProperties
282                        . styleProperties
283                        )           -< style
284      case fTextProps of
285        Right textProps -> do
286          state        <- getExtraState             -< ()
287          let triple = (state, textProps, mFamily)
288          modifier     <- arr modifierFromStyleDiff -< triple
289          fShouldTrace <- isStyleToTrace            -< style
290          case fShouldTrace of
291            Right shouldTrace ->
292              if shouldTrace
293                then do
294                  pushStyle      -< style
295                  inlines   <- a -< x
296                  popStyle       -< ()
297                  arr modifier   -<< inlines
298                else
299    -- In case anything goes wrong
300                      a -< x
301            Left _ -> a -< x
302        Left _     -> a -< x
303    Left _         -> a -< x
304  where
305    isCodeStyle :: StyleName -> Bool
306    isCodeStyle "Source_Text" = True
307    isCodeStyle _             = False
308
309    inlineCode :: Inlines -> Inlines
310    inlineCode = code . T.concat . map stringify . toList
311
312type PropertyTriple = (ReaderState, TextProperties, Maybe StyleFamily)
313type InlineModifier = Inlines -> Inlines
314
315-- | Given data about the local style changes, calculates how to modify
316-- an instance of 'Inlines'
317modifierFromStyleDiff :: PropertyTriple -> InlineModifier
318modifierFromStyleDiff propertyTriple  =
319  composition $
320  getVPosModifier propertyTriple
321  : map (first ($ propertyTriple) >>> ifThen_else ignore)
322        [ (hasEmphChanged           , emph      )
323        , (hasChanged isStrong      , strong    )
324        , (hasChanged strikethrough , strikeout )
325        ]
326  where
327    ifThen_else else' (if',then') = if if' then then' else else'
328
329    ignore = id :: InlineModifier
330
331    getVPosModifier :: PropertyTriple -> InlineModifier
332    getVPosModifier triple@(_,textProps,_) =
333        let getVPos = Just . verticalPosition
334        in  case lookupPreviousValueM getVPos triple of
335              Nothing      -> ignore
336              Just oldVPos -> getVPosModifier' (oldVPos, verticalPosition textProps)
337
338    getVPosModifier' (oldVPos , newVPos   ) | oldVPos == newVPos = ignore
339    getVPosModifier' ( _      , VPosSub   ) = subscript
340    getVPosModifier' ( _      , VPosSuper ) = superscript
341    getVPosModifier' ( _      ,  _        ) = ignore
342
343    hasEmphChanged :: PropertyTriple -> Bool
344    hasEmphChanged = swing any [ hasChanged  isEmphasised
345                               , hasChangedM pitch
346                               , hasChanged  underline
347                               ]
348
349    hasChanged property triple@(_, property -> newProperty, _) =
350        (/= Just newProperty) (lookupPreviousValue property triple)
351
352    hasChangedM property triple@(_, textProps,_) =
353      fromMaybe False $ (/=) <$> property textProps <*> lookupPreviousValueM property triple
354
355    lookupPreviousValue f = lookupPreviousStyleValue (fmap f . textProperties)
356
357    lookupPreviousValueM f = lookupPreviousStyleValue (f <=< textProperties)
358
359    lookupPreviousStyleValue f (ReaderState{..},_,mFamily)
360      =     findBy f (extendedStylePropertyChain styleTrace styleSet)
361        <|> (f . lookupDefaultStyle' styleSet =<< mFamily)
362
363
364type ParaModifier = Blocks -> Blocks
365
366_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_      :: Int
367_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ :: Int
368_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_      = 5
369_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ = 5
370
371-- | Returns either 'id' or 'blockQuote' depending on the current indentation
372getParaModifier :: Style -> ParaModifier
373getParaModifier Style{..} | Just props <- paraProperties styleProperties
374                          , isBlockQuote (indentation props)
375                                         (margin_left props)
376                          = blockQuote
377                          | otherwise
378                          = id
379  where
380  isBlockQuote mIndent mMargin
381    | LengthValueMM indent <- mIndent
382    ,  indent          > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_
383     = True
384    | LengthValueMM margin <- mMargin
385    ,           margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_
386     = True
387    | LengthValueMM indent <- mIndent
388    , LengthValueMM margin <- mMargin
389     = indent + margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_
390
391    | PercentValue  indent <- mIndent
392    ,  indent          > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_
393     = True
394    | PercentValue  margin <- mMargin
395    ,           margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_
396     = True
397    | PercentValue  indent <- mIndent
398    , PercentValue  margin <- mMargin
399     = indent + margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_
400
401    | otherwise
402     = False
403
404--
405constructPara :: OdtReaderSafe Blocks Blocks -> OdtReaderSafe Blocks Blocks
406constructPara reader = proc blocks -> do
407  fStyle <- readStyleByName -< blocks
408  case fStyle of
409    Left   _    -> reader -< blocks
410    Right (styleName, _) | isTableCaptionStyle styleName -> do
411      blocks' <- reader   -< blocks
412      arr tableCaptionP  -< blocks'
413    Right (_, style) -> do
414      let modifier = getParaModifier style
415      blocks' <- reader   -<  blocks
416      arr modifier        -<< blocks'
417  where
418    isTableCaptionStyle :: StyleName -> Bool
419    isTableCaptionStyle "Table" = True
420    isTableCaptionStyle _       = False
421    tableCaptionP b = divWith ("", ["caption"], []) b
422
423type ListConstructor = [Blocks] -> Blocks
424
425getListConstructor :: ListLevelStyle -> ListConstructor
426getListConstructor ListLevelStyle{..} =
427  case listLevelType of
428    LltBullet   -> bulletList
429    LltImage    -> bulletList
430    LltNumbered -> let listNumberStyle = toListNumberStyle listItemFormat
431                       listNumberDelim = toListNumberDelim listItemPrefix
432                                                           listItemSuffix
433                   in  orderedListWith (listItemStart, listNumberStyle, listNumberDelim)
434  where
435    toListNumberStyle  LinfNone      = DefaultStyle
436    toListNumberStyle  LinfNumber    = Decimal
437    toListNumberStyle  LinfRomanLC   = LowerRoman
438    toListNumberStyle  LinfRomanUC   = UpperRoman
439    toListNumberStyle  LinfAlphaLC   = LowerAlpha
440    toListNumberStyle  LinfAlphaUC   = UpperAlpha
441    toListNumberStyle (LinfString _) = Example
442
443    toListNumberDelim  Nothing   (Just ".") = Period
444    toListNumberDelim (Just "" ) (Just ".") = Period
445    toListNumberDelim  Nothing   (Just ")") = OneParen
446    toListNumberDelim (Just "" ) (Just ")") = OneParen
447    toListNumberDelim (Just "(") (Just ")") = TwoParens
448    toListNumberDelim     _          _      = DefaultDelim
449
450
451-- | Determines which style to use for a list, which level to use of that
452-- style, and which type of list to create as a result of this information.
453-- Then prepares the state for eventual child lists and constructs the list from
454-- the results.
455-- Two main cases are handled: The list may provide its own style or it may
456-- rely on a parent list's style. I the former case the current style in the
457-- state must be switched before and after the call to the child converter
458-- while in the latter the child converter can be called directly.
459-- If anything goes wrong, a default ordered-list-constructor is used.
460constructList :: OdtReaderSafe x [Blocks] -> OdtReaderSafe x Blocks
461constructList reader = proc x -> do
462  modifyExtraState (shiftListLevel 1)        -< ()
463  listLevel  <- getCurrentListLevel          -< ()
464  fStyleName <- findAttr NsText "style-name" -< ()
465  case fStyleName of
466    Right styleName -> do
467      fListStyle <- lookupListStyle -< styleName
468      case fListStyle of
469        Right listStyle -> do
470          fLLS <- arr (uncurry getListLevelStyle) -< (listLevel,listStyle)
471          case fLLS of
472            Just listLevelStyle -> do
473              oldListStyle <- switchCurrentListStyle           -<  Just listStyle
474              blocks       <- constructListWith listLevelStyle -<< x
475              switchCurrentListStyle                           -<  oldListStyle
476              returnA                                          -<  blocks
477            Nothing             -> constructOrderedList        -< x
478        Left _                  -> constructOrderedList        -< x
479    Left _ -> do
480      state      <- getExtraState        -< ()
481      mListStyle <- arr currentListStyle -< state
482      case mListStyle of
483        Just listStyle -> do
484          fLLS <- arr (uncurry getListLevelStyle) -< (listLevel,listStyle)
485          case fLLS of
486            Just listLevelStyle -> constructListWith listLevelStyle -<< x
487            Nothing             -> constructOrderedList             -<  x
488        Nothing                 -> constructOrderedList             -<  x
489  where
490    constructOrderedList =
491          reader
492      >>> modifyExtraState (shiftListLevel (-1))
493      >>^ orderedList
494    constructListWith listLevelStyle =
495          reader
496      >>> getListConstructor listLevelStyle
497      ^>> modifyExtraState (shiftListLevel (-1))
498
499--------------------------------------------------------------------------------
500-- Readers
501--------------------------------------------------------------------------------
502
503type ElementMatcher result = (Namespace, ElementName, OdtReader result result)
504
505type InlineMatcher = ElementMatcher Inlines
506
507type BlockMatcher  = ElementMatcher Blocks
508
509newtype FirstMatch a = FirstMatch (Alt Maybe a)
510  deriving (Foldable, Monoid, Semigroup)
511
512firstMatch :: a -> FirstMatch a
513firstMatch = FirstMatch . Alt . Just
514
515--
516matchingElement :: (Monoid e)
517                => Namespace -> ElementName
518                -> OdtReaderSafe  e e
519                -> ElementMatcher e
520matchingElement ns name reader = (ns, name, asResultAccumulator reader)
521  where
522   asResultAccumulator :: (ArrowChoice a, Monoid m) => a m m -> a m (Fallible m)
523   asResultAccumulator a = liftAsSuccess $ keepingTheValue a >>% mappend
524
525--
526matchChildContent'   :: (Monoid result)
527                     => [ElementMatcher result]
528                     ->  OdtReaderSafe _x result
529matchChildContent' ls = returnV mempty >>> matchContent' ls
530
531--
532matchChildContent    :: (Monoid result)
533                     => [ElementMatcher result]
534                     ->  OdtReaderSafe  (result, XML.Content) result
535                     ->  OdtReaderSafe _x result
536matchChildContent ls fallback = returnV mempty >>> matchContent ls fallback
537
538--------------------------------------------
539-- Matchers
540--------------------------------------------
541
542----------------------
543-- Basics
544----------------------
545
546--
547-- | Open Document allows several consecutive spaces if they are marked up
548read_plain_text :: OdtReaderSafe (Inlines, XML.Content) Inlines
549read_plain_text =  fst ^&&& read_plain_text' >>% recover
550  where
551    -- fallible version
552    read_plain_text' :: OdtReader (Inlines, XML.Content) Inlines
553    read_plain_text' =      (     second ( arr extractText )
554                              >>^ spreadChoice >>?! second text
555                            )
556                       >>?% mappend
557    --
558    extractText     :: XML.Content -> Fallible T.Text
559    extractText (XML.Text cData) = succeedWith (XML.cdData cData)
560    extractText         _        = failEmpty
561
562read_text_seq :: InlineMatcher
563read_text_seq  = matchingElement NsText "sequence"
564                 $ matchChildContent [] read_plain_text
565
566
567-- specifically. I honor that, although the current implementation of 'mappend'
568-- for 'Inlines' in "Text.Pandoc.Builder" will collapse them again.
569-- The rational is to be prepared for future modifications.
570read_spaces      :: InlineMatcher
571read_spaces       = matchingElement NsText "s" (
572                          readAttrWithDefault NsText "c" 1 -- how many spaces?
573                      >>^ fromList.(`replicate` Space)
574                    )
575--
576read_line_break  :: InlineMatcher
577read_line_break   = matchingElement NsText "line-break"
578                    $ returnV linebreak
579--
580read_tab         :: InlineMatcher
581read_tab          = matchingElement NsText "tab"
582                    $ returnV space
583--
584read_span        :: InlineMatcher
585read_span         = matchingElement NsText "span"
586                    $ withNewStyle
587                    $ matchChildContent [ read_span
588                                        , read_spaces
589                                        , read_line_break
590                                        , read_tab
591                                        , read_link
592                                        , read_note
593                                        , read_citation
594                                        , read_bookmark
595                                        , read_bookmark_start
596                                        , read_reference_start
597                                        , read_bookmark_ref
598                                        , read_reference_ref
599                                        ] read_plain_text
600
601--
602read_paragraph   :: BlockMatcher
603read_paragraph    = matchingElement NsText "p"
604                    $ constructPara
605                    $ liftA para
606                    $ withNewStyle
607                    $ matchChildContent [ read_span
608                                        , read_spaces
609                                        , read_line_break
610                                        , read_tab
611                                        , read_link
612                                        , read_note
613                                        , read_citation
614                                        , read_bookmark
615                                        , read_bookmark_start
616                                        , read_reference_start
617                                        , read_bookmark_ref
618                                        , read_reference_ref
619                                        , read_frame
620                                        , read_text_seq
621                                        ] read_plain_text
622
623
624----------------------
625-- Headers
626----------------------
627
628--
629read_header      :: BlockMatcher
630read_header       = matchingElement NsText "h"
631                    $  proc blocks -> do
632  level    <- ( readAttrWithDefault NsText "outline-level" 1
633              ) -< blocks
634  children <- ( matchChildContent [ read_span
635                                  , read_spaces
636                                  , read_line_break
637                                  , read_tab
638                                  , read_link
639                                  , read_note
640                                  , read_citation
641                                  , read_bookmark
642                                  , read_bookmark_start
643                                  , read_reference_start
644                                  , read_bookmark_ref
645                                  , read_reference_ref
646                                  , read_frame
647                                  ] read_plain_text
648              ) -< blocks
649  anchor   <- getHeaderAnchor -< children
650  let idAttr = (anchor, [], []) -- no classes, no key-value pairs
651  arr (uncurry3 headerWith) -< (idAttr, level, children)
652
653----------------------
654-- Lists
655----------------------
656
657--
658read_list        :: BlockMatcher
659read_list         = matchingElement NsText "list"
660--                  $ withIncreasedListLevel
661                    $ constructList
662--                  $ liftA bulletList
663                    $ matchChildContent' [ read_list_item
664                                         ]
665--
666read_list_item   :: ElementMatcher [Blocks]
667read_list_item    = matchingElement NsText "list-item"
668                    $ liftA (compactify.(:[]))
669                      ( matchChildContent' [ read_paragraph
670                                           , read_header
671                                           , read_list
672                                           ]
673                      )
674
675
676----------------------
677-- Links
678----------------------
679
680read_link        :: InlineMatcher
681read_link         = matchingElement NsText "a"
682                    $ liftA3 link
683                      ( findAttrTextWithDefault NsXLink  "href"  ""          )
684                      ( findAttrTextWithDefault NsOffice "title" ""          )
685                      ( matchChildContent [ read_span
686                                          , read_note
687                                          , read_citation
688                                          , read_bookmark
689                                          , read_bookmark_start
690                                          , read_reference_start
691                                          , read_bookmark_ref
692                                          , read_reference_ref
693                                          ] read_plain_text                  )
694
695
696-------------------------
697-- Footnotes
698-------------------------
699
700read_note        :: InlineMatcher
701read_note         = matchingElement NsText "note"
702                    $ liftA note
703                    $ matchChildContent' [ read_note_body ]
704
705read_note_body   :: BlockMatcher
706read_note_body    = matchingElement NsText "note-body"
707                    $ matchChildContent' [ read_paragraph ]
708
709-------------------------
710-- Citations
711-------------------------
712
713read_citation    :: InlineMatcher
714read_citation     = matchingElement NsText "bibliography-mark"
715                    $ liftA2 cite
716                      ( liftA2 makeCitation
717                        ( findAttrTextWithDefault NsText "identifier" "" )
718                        ( readAttrWithDefault NsText "number" 0          )
719                      )
720                      ( matchChildContent [] read_plain_text             )
721  where
722   makeCitation :: T.Text -> Int -> [Citation]
723   makeCitation citeId num = [Citation citeId [] [] NormalCitation num 0]
724
725
726----------------------
727-- Tables
728----------------------
729
730--
731read_table        :: BlockMatcher
732read_table         = matchingElement NsTable "table"
733                     $ liftA simpleTable'
734                     $ matchChildContent'  [ read_table_row
735                                           ]
736
737-- | A simple table without a caption or headers
738-- | Infers the number of headers from rows
739simpleTable' :: [[Blocks]] -> Blocks
740simpleTable' []         = simpleTable [] []
741simpleTable' (x : rest) = simpleTable (fmap (const defaults) x) (x : rest)
742  where defaults = fromList []
743
744--
745read_table_row    :: ElementMatcher [[Blocks]]
746read_table_row     = matchingElement NsTable "table-row"
747                     $ liftA (:[])
748                     $ matchChildContent'  [ read_table_cell
749                                           ]
750
751--
752read_table_cell   :: ElementMatcher [Blocks]
753read_table_cell    = matchingElement NsTable "table-cell"
754                     $ liftA (compactify.(:[]))
755                     $ matchChildContent' [ read_paragraph
756                                          ]
757
758----------------------
759-- Frames
760----------------------
761
762--
763read_frame :: InlineMatcher
764read_frame = matchingElement NsDraw "frame"
765             $ filterChildrenName' NsDraw (`elem` ["image", "object", "text-box"])
766           >>> foldS read_frame_child
767           >>> arr fold
768
769read_frame_child :: OdtReaderSafe XML.Element (FirstMatch Inlines)
770read_frame_child =
771  proc child -> case elName child of
772    "image"    -> read_frame_img      -< child
773    "object"   -> read_frame_mathml   -< child
774    "text-box" -> read_frame_text_box -< child
775    _          -> returnV mempty      -< ()
776
777read_frame_img :: OdtReaderSafe XML.Element (FirstMatch Inlines)
778read_frame_img =
779  proc img -> do
780    src <- executeIn (findAttr' NsXLink "href") -< img
781    case fold src of
782      ""   -> returnV mempty -< ()
783      src' -> do
784        let exts = extensionsFromList [Ext_auto_identifiers]
785        resource   <- lookupResource                          -< T.unpack src'
786        _          <- updateMediaWithResource                 -< resource
787        w          <- findAttrText' NsSVG "width"             -< ()
788        h          <- findAttrText' NsSVG "height"            -< ()
789        titleNodes <- matchChildContent' [ read_frame_title ] -< ()
790        alt        <- matchChildContent [] read_plain_text    -< ()
791        arr (firstMatch . uncurry4 imageWith)                 -<
792          (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt)
793
794read_frame_title :: InlineMatcher
795read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text)
796
797image_attributes :: Maybe T.Text -> Maybe T.Text -> Attr
798image_attributes x y =
799  ( "", [], dim "width" x ++ dim "height" y)
800  where
801    dim _ (Just "")   = []
802    dim name (Just v) = [(name, v)]
803    dim _ Nothing     = []
804
805read_frame_mathml :: OdtReaderSafe XML.Element (FirstMatch Inlines)
806read_frame_mathml =
807  proc obj -> do
808    src <- executeIn (findAttr' NsXLink "href") -< obj
809    case fold src of
810      ""   -> returnV mempty -< ()
811      src' -> do
812        let path = T.unpack $
813                    fromMaybe src' (T.stripPrefix "./" src') <> "/content.xml"
814        (_, mathml) <- lookupResource -< path
815        case readMathML (UTF8.toText $ B.toStrict mathml) of
816          Left _     -> returnV mempty -< ()
817          Right exps -> arr (firstMatch . displayMath . writeTeX) -< exps
818
819read_frame_text_box :: OdtReaderSafe XML.Element (FirstMatch Inlines)
820read_frame_text_box = proc box -> do
821    paragraphs <- executeIn (matchChildContent' [ read_paragraph ]) -< box
822    arr read_img_with_caption -< toList paragraphs
823
824read_img_with_caption :: [Block] -> FirstMatch Inlines
825read_img_with_caption (Para [Image attr alt (src,title)] : _) =
826  firstMatch $ singleton (Image attr alt (src, "fig:" <> title))   -- no text, default caption
827read_img_with_caption (Para (Image attr _ (src,title) : txt) : _) =
828  firstMatch $ singleton (Image attr txt (src, "fig:" <> title) )  -- override caption with the text that follows
829read_img_with_caption  ( Para (_ : xs) : ys) =
830  read_img_with_caption (Para xs : ys)
831read_img_with_caption _ =
832  mempty
833
834----------------------
835-- Internal links
836----------------------
837
838_ANCHOR_PREFIX_ :: T.Text
839_ANCHOR_PREFIX_ = "anchor"
840
841--
842readAnchorAttr :: OdtReader _x Anchor
843readAnchorAttr = findAttrText NsText "name"
844
845-- | Beware: may fail
846findAnchorName :: OdtReader AnchorPrefix Anchor
847findAnchorName = (      keepingTheValue readAnchorAttr
848                   >>^  spreadChoice
849                 ) >>?! getPrettyAnchor
850
851
852--
853maybeAddAnchorFrom :: OdtReader Inlines AnchorPrefix
854                   -> OdtReaderSafe Inlines Inlines
855maybeAddAnchorFrom anchorReader =
856  keepingTheValue (anchorReader >>? findAnchorName >>?^ toAnchorElem)
857  >>>
858  proc (inlines, fAnchorElem) -> do
859  case fAnchorElem of
860    Right anchorElem -> returnA -< anchorElem
861    Left _           -> returnA -< inlines
862  where
863    toAnchorElem :: Anchor -> Inlines
864    toAnchorElem anchorID = spanWith (anchorID, [], []) mempty
865                            -- no classes, no key-value pairs
866
867--
868read_bookmark     :: InlineMatcher
869read_bookmark      = matchingElement NsText "bookmark"
870                     $ maybeAddAnchorFrom (liftAsSuccess $ returnV _ANCHOR_PREFIX_)
871
872--
873read_bookmark_start :: InlineMatcher
874read_bookmark_start = matchingElement NsText "bookmark-start"
875                     $ maybeAddAnchorFrom (liftAsSuccess $ returnV _ANCHOR_PREFIX_)
876
877--
878read_reference_start :: InlineMatcher
879read_reference_start = matchingElement NsText "reference-mark-start"
880                     $ maybeAddAnchorFrom readAnchorAttr
881
882-- | Beware: may fail
883findAnchorRef :: OdtReader _x Anchor
884findAnchorRef = (      findAttrText NsText "ref-name"
885                  >>?^ (_ANCHOR_PREFIX_,)
886                ) >>?! getPrettyAnchor
887
888
889--
890maybeInAnchorRef :: OdtReaderSafe Inlines Inlines
891maybeInAnchorRef = proc inlines -> do
892  fRef <- findAnchorRef -< ()
893  case fRef of
894    Right anchor ->
895      arr (toAnchorRef anchor) -<< inlines
896    Left _ -> returnA -< inlines
897  where
898    toAnchorRef :: Anchor -> Inlines -> Inlines
899    toAnchorRef anchor = link ("#" <> anchor) "" -- no title
900
901--
902read_bookmark_ref :: InlineMatcher
903read_bookmark_ref = matchingElement NsText "bookmark-ref"
904                    $    maybeInAnchorRef
905                     <<< matchChildContent [] read_plain_text
906
907--
908read_reference_ref :: InlineMatcher
909read_reference_ref = matchingElement NsText "reference-ref"
910                    $    maybeInAnchorRef
911                     <<< matchChildContent [] read_plain_text
912
913
914----------------------
915-- Entry point
916----------------------
917
918read_text :: OdtReaderSafe _x Pandoc
919read_text = matchChildContent' [ read_header
920                               , read_paragraph
921                               , read_list
922                               , read_table
923                               ]
924            >>^ doc
925
926post_process :: Pandoc -> Pandoc
927post_process (Pandoc m blocks) =
928  Pandoc m (post_process' blocks)
929
930post_process' :: [Block] -> [Block]
931post_process' (Table attr _ specs th tb tf : Div ("", ["caption"], _) blks : xs)
932  = Table attr (Caption Nothing blks) specs th tb tf : post_process' xs
933post_process' bs = bs
934
935read_body :: OdtReader _x (Pandoc, MediaBag)
936read_body = executeInSub NsOffice "body"
937          $ executeInSub NsOffice "text"
938          $ liftAsSuccess
939          $ proc inlines -> do
940             txt   <- read_text     -< inlines
941             state <- getExtraState -< ()
942             returnA                -< (post_process txt, getMediaBag state)
943