1{-# LANGUAGE Arrows #-} 2{-# LANGUAGE CPP #-} 3{-# LANGUAGE DeriveFoldable #-} 4{-# LANGUAGE GeneralizedNewtypeDeriving #-} 5{-# LANGUAGE PatternGuards #-} 6{-# LANGUAGE RecordWildCards #-} 7{-# LANGUAGE TupleSections #-} 8{-# LANGUAGE ViewPatterns #-} 9{-# LANGUAGE OverloadedStrings #-} 10{- | 11 Module : Text.Pandoc.Readers.Odt.ContentReader 12 Copyright : Copyright (C) 2015 Martin Linnemann 13 License : GNU GPL, version 2 or above 14 15 Maintainer : Martin Linnemann <theCodingMarlin@googlemail.com> 16 Stability : alpha 17 Portability : portable 18 19The core of the odt reader that converts odt features into Pandoc types. 20-} 21 22module Text.Pandoc.Readers.Odt.ContentReader 23( readerState 24, read_body 25) where 26 27import Control.Applicative hiding (liftA, liftA2, liftA3) 28import Control.Arrow 29import Control.Monad ((<=<)) 30 31import qualified Data.ByteString.Lazy as B 32import Data.Foldable (fold) 33import Data.List (find) 34import qualified Data.Map as M 35import qualified Data.Text as T 36import Data.Maybe 37import Data.Monoid (Alt (..)) 38 39import Text.TeXMath (readMathML, writeTeX) 40import qualified Text.Pandoc.XML.Light as XML 41 42import Text.Pandoc.Builder hiding (underline) 43import Text.Pandoc.MediaBag (MediaBag, insertMedia) 44import Text.Pandoc.Shared 45import Text.Pandoc.Extensions (extensionsFromList, Extension(..)) 46import qualified Text.Pandoc.UTF8 as UTF8 47 48import Text.Pandoc.Readers.Odt.Base 49import Text.Pandoc.Readers.Odt.Namespaces 50import Text.Pandoc.Readers.Odt.StyleReader 51 52import Text.Pandoc.Readers.Odt.Arrows.State (foldS) 53import Text.Pandoc.Readers.Odt.Arrows.Utils 54import Text.Pandoc.Readers.Odt.Generic.Fallible 55import Text.Pandoc.Readers.Odt.Generic.Utils 56import Text.Pandoc.Readers.Odt.Generic.XMLConverter 57 58import qualified Data.Set as Set 59 60-------------------------------------------------------------------------------- 61-- State 62-------------------------------------------------------------------------------- 63 64type Anchor = T.Text 65type Media = [(FilePath, B.ByteString)] 66 67data ReaderState 68 = ReaderState { -- | A collection of styles read somewhere else. 69 -- It is only queried here, not modified. 70 styleSet :: Styles 71 -- | A stack of the styles of parent elements. 72 -- Used to look up inherited style properties. 73 , styleTrace :: [Style] 74 -- | Keeps track of the current depth in nested lists 75 , currentListLevel :: ListLevel 76 -- | Lists may provide their own style, but they don't have 77 -- to. If they do not, the style of a parent list may be used 78 -- or even a default list style from the paragraph style. 79 -- This value keeps track of the closest list style there 80 -- currently is. 81 , currentListStyle :: Maybe ListStyle 82 -- | A map from internal anchor names to "pretty" ones. 83 -- The mapping is a purely cosmetic one. 84 , bookmarkAnchors :: M.Map Anchor Anchor 85 -- | A map of files / binary data from the archive 86 , envMedia :: Media 87 -- | Hold binary resources used in the document 88 , odtMediaBag :: MediaBag 89 } 90 deriving ( Show ) 91 92readerState :: Styles -> Media -> ReaderState 93readerState styles media = ReaderState styles [] 0 Nothing M.empty media mempty 94 95-- 96pushStyle' :: Style -> ReaderState -> ReaderState 97pushStyle' style state = state { styleTrace = style : styleTrace state } 98 99-- 100popStyle' :: ReaderState -> ReaderState 101popStyle' state = case styleTrace state of 102 _:trace -> state { styleTrace = trace } 103 _ -> state 104 105-- 106modifyListLevel :: (ListLevel -> ListLevel) -> (ReaderState -> ReaderState) 107modifyListLevel f state = state { currentListLevel = f (currentListLevel state) } 108 109-- 110shiftListLevel :: ListLevel -> (ReaderState -> ReaderState) 111shiftListLevel diff = modifyListLevel (+ diff) 112 113-- 114swapCurrentListStyle :: Maybe ListStyle -> ReaderState 115 -> (ReaderState, Maybe ListStyle) 116swapCurrentListStyle mListStyle state = ( state { currentListStyle = mListStyle } 117 , currentListStyle state 118 ) 119 120-- 121lookupPrettyAnchor :: Anchor -> ReaderState -> Maybe Anchor 122lookupPrettyAnchor anchor ReaderState{..} = M.lookup anchor bookmarkAnchors 123 124-- 125putPrettyAnchor :: Anchor -> Anchor -> ReaderState -> ReaderState 126putPrettyAnchor ugly pretty state@ReaderState{..} 127 = state { bookmarkAnchors = M.insert ugly pretty bookmarkAnchors } 128 129-- 130usedAnchors :: ReaderState -> [Anchor] 131usedAnchors ReaderState{..} = M.elems bookmarkAnchors 132 133getMediaBag :: ReaderState -> MediaBag 134getMediaBag ReaderState{..} = odtMediaBag 135 136getMediaEnv :: ReaderState -> Media 137getMediaEnv ReaderState{..} = envMedia 138 139insertMedia' :: (FilePath, B.ByteString) -> ReaderState -> ReaderState 140insertMedia' (fp, bs) state@ReaderState{..} 141 = state { odtMediaBag = insertMedia fp Nothing bs odtMediaBag } 142 143-------------------------------------------------------------------------------- 144-- Reader type and associated tools 145-------------------------------------------------------------------------------- 146 147type OdtReader a b = XMLReader ReaderState a b 148 149type OdtReaderSafe a b = XMLReaderSafe ReaderState a b 150 151-- | Extract something from the styles 152fromStyles :: (a -> Styles -> b) -> OdtReaderSafe a b 153fromStyles f = keepingTheValue 154 (getExtraState >>^ styleSet) 155 >>% f 156 157-- 158getStyleByName :: OdtReader StyleName Style 159getStyleByName = fromStyles lookupStyle >>^ maybeToChoice 160 161-- 162findStyleFamily :: OdtReader Style StyleFamily 163findStyleFamily = fromStyles getStyleFamily >>^ maybeToChoice 164 165-- 166lookupListStyle :: OdtReader StyleName ListStyle 167lookupListStyle = fromStyles lookupListStyleByName >>^ maybeToChoice 168 169-- 170switchCurrentListStyle :: OdtReaderSafe (Maybe ListStyle) (Maybe ListStyle) 171switchCurrentListStyle = keepingTheValue getExtraState 172 >>% swapCurrentListStyle 173 >>> first setExtraState 174 >>^ snd 175 176-- 177pushStyle :: OdtReaderSafe Style Style 178pushStyle = keepingTheValue ( 179 ( keepingTheValue getExtraState 180 >>% pushStyle' 181 ) 182 >>> setExtraState 183 ) 184 >>^ fst 185 186-- 187popStyle :: OdtReaderSafe x x 188popStyle = keepingTheValue ( 189 getExtraState 190 >>> arr popStyle' 191 >>> setExtraState 192 ) 193 >>^ fst 194 195-- 196getCurrentListLevel :: OdtReaderSafe _x ListLevel 197getCurrentListLevel = getExtraState >>^ currentListLevel 198 199-- 200updateMediaWithResource :: OdtReaderSafe (FilePath, B.ByteString) (FilePath, B.ByteString) 201updateMediaWithResource = keepingTheValue ( 202 (keepingTheValue getExtraState 203 >>% insertMedia' 204 ) 205 >>> setExtraState 206 ) 207 >>^ fst 208 209lookupResource :: OdtReaderSafe FilePath (FilePath, B.ByteString) 210lookupResource = proc target -> do 211 state <- getExtraState -< () 212 case lookup target (getMediaEnv state) of 213 Just bs -> returnV (target, bs) -<< () 214 Nothing -> returnV ("", B.empty) -< () 215 216type AnchorPrefix = T.Text 217 218-- | An adaptation of 'uniqueIdent' from "Text.Pandoc.Shared" that generates a 219-- unique identifier but without assuming that the id should be for a header. 220-- Second argument is a list of already used identifiers. 221uniqueIdentFrom :: AnchorPrefix -> [Anchor] -> Anchor 222uniqueIdentFrom baseIdent usedIdents = 223 let numIdent n = baseIdent <> "-" <> T.pack (show n) 224 in if baseIdent `elem` usedIdents 225 then maybe baseIdent numIdent 226 $ find (\x -> numIdent x `notElem` usedIdents) ([1..60000] :: [Int]) 227 -- if we have more than 60,000, allow repeats 228 else baseIdent 229 230-- | First argument: basis for a new "pretty" anchor if none exists yet 231-- Second argument: a key ("ugly" anchor) 232-- Returns: saved "pretty" anchor or created new one 233getPrettyAnchor :: OdtReaderSafe (AnchorPrefix, Anchor) Anchor 234getPrettyAnchor = proc (baseIdent, uglyAnchor) -> do 235 state <- getExtraState -< () 236 case lookupPrettyAnchor uglyAnchor state of 237 Just prettyAnchor -> returnA -< prettyAnchor 238 Nothing -> do 239 let newPretty = uniqueIdentFrom baseIdent (usedAnchors state) 240 modifyExtraState (putPrettyAnchor uglyAnchor newPretty) -<< newPretty 241 242-- | Input: basis for a new header anchor 243-- Output: saved new anchor 244getHeaderAnchor :: OdtReaderSafe Inlines Anchor 245getHeaderAnchor = proc title -> do 246 state <- getExtraState -< () 247 let exts = extensionsFromList [Ext_auto_identifiers] 248 let anchor = uniqueIdent exts (toList title) 249 (Set.fromList $ usedAnchors state) 250 modifyExtraState (putPrettyAnchor anchor anchor) -<< anchor 251 252 253-------------------------------------------------------------------------------- 254-- Working with styles 255-------------------------------------------------------------------------------- 256 257-- 258readStyleByName :: OdtReader _x (StyleName, Style) 259readStyleByName = 260 findAttr NsText "style-name" >>? keepingTheValue getStyleByName >>^ liftE 261 where 262 liftE :: (StyleName, Fallible Style) -> Fallible (StyleName, Style) 263 liftE (name, Right v) = Right (name, v) 264 liftE (_, Left v) = Left v 265 266-- 267isStyleToTrace :: OdtReader Style Bool 268isStyleToTrace = findStyleFamily >>?^ (==FaText) 269 270-- 271withNewStyle :: OdtReaderSafe x Inlines -> OdtReaderSafe x Inlines 272withNewStyle a = proc x -> do 273 fStyle <- readStyleByName -< () 274 case fStyle of 275 Right (styleName, _) | isCodeStyle styleName -> do 276 inlines <- a -< x 277 arr inlineCode -<< inlines 278 Right (_, style) -> do 279 mFamily <- arr styleFamily -< style 280 fTextProps <- arr ( maybeToChoice 281 . textProperties 282 . styleProperties 283 ) -< style 284 case fTextProps of 285 Right textProps -> do 286 state <- getExtraState -< () 287 let triple = (state, textProps, mFamily) 288 modifier <- arr modifierFromStyleDiff -< triple 289 fShouldTrace <- isStyleToTrace -< style 290 case fShouldTrace of 291 Right shouldTrace -> 292 if shouldTrace 293 then do 294 pushStyle -< style 295 inlines <- a -< x 296 popStyle -< () 297 arr modifier -<< inlines 298 else 299 -- In case anything goes wrong 300 a -< x 301 Left _ -> a -< x 302 Left _ -> a -< x 303 Left _ -> a -< x 304 where 305 isCodeStyle :: StyleName -> Bool 306 isCodeStyle "Source_Text" = True 307 isCodeStyle _ = False 308 309 inlineCode :: Inlines -> Inlines 310 inlineCode = code . T.concat . map stringify . toList 311 312type PropertyTriple = (ReaderState, TextProperties, Maybe StyleFamily) 313type InlineModifier = Inlines -> Inlines 314 315-- | Given data about the local style changes, calculates how to modify 316-- an instance of 'Inlines' 317modifierFromStyleDiff :: PropertyTriple -> InlineModifier 318modifierFromStyleDiff propertyTriple = 319 composition $ 320 getVPosModifier propertyTriple 321 : map (first ($ propertyTriple) >>> ifThen_else ignore) 322 [ (hasEmphChanged , emph ) 323 , (hasChanged isStrong , strong ) 324 , (hasChanged strikethrough , strikeout ) 325 ] 326 where 327 ifThen_else else' (if',then') = if if' then then' else else' 328 329 ignore = id :: InlineModifier 330 331 getVPosModifier :: PropertyTriple -> InlineModifier 332 getVPosModifier triple@(_,textProps,_) = 333 let getVPos = Just . verticalPosition 334 in case lookupPreviousValueM getVPos triple of 335 Nothing -> ignore 336 Just oldVPos -> getVPosModifier' (oldVPos, verticalPosition textProps) 337 338 getVPosModifier' (oldVPos , newVPos ) | oldVPos == newVPos = ignore 339 getVPosModifier' ( _ , VPosSub ) = subscript 340 getVPosModifier' ( _ , VPosSuper ) = superscript 341 getVPosModifier' ( _ , _ ) = ignore 342 343 hasEmphChanged :: PropertyTriple -> Bool 344 hasEmphChanged = swing any [ hasChanged isEmphasised 345 , hasChangedM pitch 346 , hasChanged underline 347 ] 348 349 hasChanged property triple@(_, property -> newProperty, _) = 350 (/= Just newProperty) (lookupPreviousValue property triple) 351 352 hasChangedM property triple@(_, textProps,_) = 353 fromMaybe False $ (/=) <$> property textProps <*> lookupPreviousValueM property triple 354 355 lookupPreviousValue f = lookupPreviousStyleValue (fmap f . textProperties) 356 357 lookupPreviousValueM f = lookupPreviousStyleValue (f <=< textProperties) 358 359 lookupPreviousStyleValue f (ReaderState{..},_,mFamily) 360 = findBy f (extendedStylePropertyChain styleTrace styleSet) 361 <|> (f . lookupDefaultStyle' styleSet =<< mFamily) 362 363 364type ParaModifier = Blocks -> Blocks 365 366_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ :: Int 367_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ :: Int 368_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ = 5 369_MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ = 5 370 371-- | Returns either 'id' or 'blockQuote' depending on the current indentation 372getParaModifier :: Style -> ParaModifier 373getParaModifier Style{..} | Just props <- paraProperties styleProperties 374 , isBlockQuote (indentation props) 375 (margin_left props) 376 = blockQuote 377 | otherwise 378 = id 379 where 380 isBlockQuote mIndent mMargin 381 | LengthValueMM indent <- mIndent 382 , indent > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ 383 = True 384 | LengthValueMM margin <- mMargin 385 , margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ 386 = True 387 | LengthValueMM indent <- mIndent 388 , LengthValueMM margin <- mMargin 389 = indent + margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_MM_ 390 391 | PercentValue indent <- mIndent 392 , indent > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ 393 = True 394 | PercentValue margin <- mMargin 395 , margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ 396 = True 397 | PercentValue indent <- mIndent 398 , PercentValue margin <- mMargin 399 = indent + margin > _MINIMUM_INDENTATION_FOR_BLOCKQUOTES_IN_PERCENT_ 400 401 | otherwise 402 = False 403 404-- 405constructPara :: OdtReaderSafe Blocks Blocks -> OdtReaderSafe Blocks Blocks 406constructPara reader = proc blocks -> do 407 fStyle <- readStyleByName -< blocks 408 case fStyle of 409 Left _ -> reader -< blocks 410 Right (styleName, _) | isTableCaptionStyle styleName -> do 411 blocks' <- reader -< blocks 412 arr tableCaptionP -< blocks' 413 Right (_, style) -> do 414 let modifier = getParaModifier style 415 blocks' <- reader -< blocks 416 arr modifier -<< blocks' 417 where 418 isTableCaptionStyle :: StyleName -> Bool 419 isTableCaptionStyle "Table" = True 420 isTableCaptionStyle _ = False 421 tableCaptionP b = divWith ("", ["caption"], []) b 422 423type ListConstructor = [Blocks] -> Blocks 424 425getListConstructor :: ListLevelStyle -> ListConstructor 426getListConstructor ListLevelStyle{..} = 427 case listLevelType of 428 LltBullet -> bulletList 429 LltImage -> bulletList 430 LltNumbered -> let listNumberStyle = toListNumberStyle listItemFormat 431 listNumberDelim = toListNumberDelim listItemPrefix 432 listItemSuffix 433 in orderedListWith (listItemStart, listNumberStyle, listNumberDelim) 434 where 435 toListNumberStyle LinfNone = DefaultStyle 436 toListNumberStyle LinfNumber = Decimal 437 toListNumberStyle LinfRomanLC = LowerRoman 438 toListNumberStyle LinfRomanUC = UpperRoman 439 toListNumberStyle LinfAlphaLC = LowerAlpha 440 toListNumberStyle LinfAlphaUC = UpperAlpha 441 toListNumberStyle (LinfString _) = Example 442 443 toListNumberDelim Nothing (Just ".") = Period 444 toListNumberDelim (Just "" ) (Just ".") = Period 445 toListNumberDelim Nothing (Just ")") = OneParen 446 toListNumberDelim (Just "" ) (Just ")") = OneParen 447 toListNumberDelim (Just "(") (Just ")") = TwoParens 448 toListNumberDelim _ _ = DefaultDelim 449 450 451-- | Determines which style to use for a list, which level to use of that 452-- style, and which type of list to create as a result of this information. 453-- Then prepares the state for eventual child lists and constructs the list from 454-- the results. 455-- Two main cases are handled: The list may provide its own style or it may 456-- rely on a parent list's style. I the former case the current style in the 457-- state must be switched before and after the call to the child converter 458-- while in the latter the child converter can be called directly. 459-- If anything goes wrong, a default ordered-list-constructor is used. 460constructList :: OdtReaderSafe x [Blocks] -> OdtReaderSafe x Blocks 461constructList reader = proc x -> do 462 modifyExtraState (shiftListLevel 1) -< () 463 listLevel <- getCurrentListLevel -< () 464 fStyleName <- findAttr NsText "style-name" -< () 465 case fStyleName of 466 Right styleName -> do 467 fListStyle <- lookupListStyle -< styleName 468 case fListStyle of 469 Right listStyle -> do 470 fLLS <- arr (uncurry getListLevelStyle) -< (listLevel,listStyle) 471 case fLLS of 472 Just listLevelStyle -> do 473 oldListStyle <- switchCurrentListStyle -< Just listStyle 474 blocks <- constructListWith listLevelStyle -<< x 475 switchCurrentListStyle -< oldListStyle 476 returnA -< blocks 477 Nothing -> constructOrderedList -< x 478 Left _ -> constructOrderedList -< x 479 Left _ -> do 480 state <- getExtraState -< () 481 mListStyle <- arr currentListStyle -< state 482 case mListStyle of 483 Just listStyle -> do 484 fLLS <- arr (uncurry getListLevelStyle) -< (listLevel,listStyle) 485 case fLLS of 486 Just listLevelStyle -> constructListWith listLevelStyle -<< x 487 Nothing -> constructOrderedList -< x 488 Nothing -> constructOrderedList -< x 489 where 490 constructOrderedList = 491 reader 492 >>> modifyExtraState (shiftListLevel (-1)) 493 >>^ orderedList 494 constructListWith listLevelStyle = 495 reader 496 >>> getListConstructor listLevelStyle 497 ^>> modifyExtraState (shiftListLevel (-1)) 498 499-------------------------------------------------------------------------------- 500-- Readers 501-------------------------------------------------------------------------------- 502 503type ElementMatcher result = (Namespace, ElementName, OdtReader result result) 504 505type InlineMatcher = ElementMatcher Inlines 506 507type BlockMatcher = ElementMatcher Blocks 508 509newtype FirstMatch a = FirstMatch (Alt Maybe a) 510 deriving (Foldable, Monoid, Semigroup) 511 512firstMatch :: a -> FirstMatch a 513firstMatch = FirstMatch . Alt . Just 514 515-- 516matchingElement :: (Monoid e) 517 => Namespace -> ElementName 518 -> OdtReaderSafe e e 519 -> ElementMatcher e 520matchingElement ns name reader = (ns, name, asResultAccumulator reader) 521 where 522 asResultAccumulator :: (ArrowChoice a, Monoid m) => a m m -> a m (Fallible m) 523 asResultAccumulator a = liftAsSuccess $ keepingTheValue a >>% mappend 524 525-- 526matchChildContent' :: (Monoid result) 527 => [ElementMatcher result] 528 -> OdtReaderSafe _x result 529matchChildContent' ls = returnV mempty >>> matchContent' ls 530 531-- 532matchChildContent :: (Monoid result) 533 => [ElementMatcher result] 534 -> OdtReaderSafe (result, XML.Content) result 535 -> OdtReaderSafe _x result 536matchChildContent ls fallback = returnV mempty >>> matchContent ls fallback 537 538-------------------------------------------- 539-- Matchers 540-------------------------------------------- 541 542---------------------- 543-- Basics 544---------------------- 545 546-- 547-- | Open Document allows several consecutive spaces if they are marked up 548read_plain_text :: OdtReaderSafe (Inlines, XML.Content) Inlines 549read_plain_text = fst ^&&& read_plain_text' >>% recover 550 where 551 -- fallible version 552 read_plain_text' :: OdtReader (Inlines, XML.Content) Inlines 553 read_plain_text' = ( second ( arr extractText ) 554 >>^ spreadChoice >>?! second text 555 ) 556 >>?% mappend 557 -- 558 extractText :: XML.Content -> Fallible T.Text 559 extractText (XML.Text cData) = succeedWith (XML.cdData cData) 560 extractText _ = failEmpty 561 562read_text_seq :: InlineMatcher 563read_text_seq = matchingElement NsText "sequence" 564 $ matchChildContent [] read_plain_text 565 566 567-- specifically. I honor that, although the current implementation of 'mappend' 568-- for 'Inlines' in "Text.Pandoc.Builder" will collapse them again. 569-- The rational is to be prepared for future modifications. 570read_spaces :: InlineMatcher 571read_spaces = matchingElement NsText "s" ( 572 readAttrWithDefault NsText "c" 1 -- how many spaces? 573 >>^ fromList.(`replicate` Space) 574 ) 575-- 576read_line_break :: InlineMatcher 577read_line_break = matchingElement NsText "line-break" 578 $ returnV linebreak 579-- 580read_tab :: InlineMatcher 581read_tab = matchingElement NsText "tab" 582 $ returnV space 583-- 584read_span :: InlineMatcher 585read_span = matchingElement NsText "span" 586 $ withNewStyle 587 $ matchChildContent [ read_span 588 , read_spaces 589 , read_line_break 590 , read_tab 591 , read_link 592 , read_note 593 , read_citation 594 , read_bookmark 595 , read_bookmark_start 596 , read_reference_start 597 , read_bookmark_ref 598 , read_reference_ref 599 ] read_plain_text 600 601-- 602read_paragraph :: BlockMatcher 603read_paragraph = matchingElement NsText "p" 604 $ constructPara 605 $ liftA para 606 $ withNewStyle 607 $ matchChildContent [ read_span 608 , read_spaces 609 , read_line_break 610 , read_tab 611 , read_link 612 , read_note 613 , read_citation 614 , read_bookmark 615 , read_bookmark_start 616 , read_reference_start 617 , read_bookmark_ref 618 , read_reference_ref 619 , read_frame 620 , read_text_seq 621 ] read_plain_text 622 623 624---------------------- 625-- Headers 626---------------------- 627 628-- 629read_header :: BlockMatcher 630read_header = matchingElement NsText "h" 631 $ proc blocks -> do 632 level <- ( readAttrWithDefault NsText "outline-level" 1 633 ) -< blocks 634 children <- ( matchChildContent [ read_span 635 , read_spaces 636 , read_line_break 637 , read_tab 638 , read_link 639 , read_note 640 , read_citation 641 , read_bookmark 642 , read_bookmark_start 643 , read_reference_start 644 , read_bookmark_ref 645 , read_reference_ref 646 , read_frame 647 ] read_plain_text 648 ) -< blocks 649 anchor <- getHeaderAnchor -< children 650 let idAttr = (anchor, [], []) -- no classes, no key-value pairs 651 arr (uncurry3 headerWith) -< (idAttr, level, children) 652 653---------------------- 654-- Lists 655---------------------- 656 657-- 658read_list :: BlockMatcher 659read_list = matchingElement NsText "list" 660-- $ withIncreasedListLevel 661 $ constructList 662-- $ liftA bulletList 663 $ matchChildContent' [ read_list_item 664 ] 665-- 666read_list_item :: ElementMatcher [Blocks] 667read_list_item = matchingElement NsText "list-item" 668 $ liftA (compactify.(:[])) 669 ( matchChildContent' [ read_paragraph 670 , read_header 671 , read_list 672 ] 673 ) 674 675 676---------------------- 677-- Links 678---------------------- 679 680read_link :: InlineMatcher 681read_link = matchingElement NsText "a" 682 $ liftA3 link 683 ( findAttrTextWithDefault NsXLink "href" "" ) 684 ( findAttrTextWithDefault NsOffice "title" "" ) 685 ( matchChildContent [ read_span 686 , read_note 687 , read_citation 688 , read_bookmark 689 , read_bookmark_start 690 , read_reference_start 691 , read_bookmark_ref 692 , read_reference_ref 693 ] read_plain_text ) 694 695 696------------------------- 697-- Footnotes 698------------------------- 699 700read_note :: InlineMatcher 701read_note = matchingElement NsText "note" 702 $ liftA note 703 $ matchChildContent' [ read_note_body ] 704 705read_note_body :: BlockMatcher 706read_note_body = matchingElement NsText "note-body" 707 $ matchChildContent' [ read_paragraph ] 708 709------------------------- 710-- Citations 711------------------------- 712 713read_citation :: InlineMatcher 714read_citation = matchingElement NsText "bibliography-mark" 715 $ liftA2 cite 716 ( liftA2 makeCitation 717 ( findAttrTextWithDefault NsText "identifier" "" ) 718 ( readAttrWithDefault NsText "number" 0 ) 719 ) 720 ( matchChildContent [] read_plain_text ) 721 where 722 makeCitation :: T.Text -> Int -> [Citation] 723 makeCitation citeId num = [Citation citeId [] [] NormalCitation num 0] 724 725 726---------------------- 727-- Tables 728---------------------- 729 730-- 731read_table :: BlockMatcher 732read_table = matchingElement NsTable "table" 733 $ liftA simpleTable' 734 $ matchChildContent' [ read_table_row 735 ] 736 737-- | A simple table without a caption or headers 738-- | Infers the number of headers from rows 739simpleTable' :: [[Blocks]] -> Blocks 740simpleTable' [] = simpleTable [] [] 741simpleTable' (x : rest) = simpleTable (fmap (const defaults) x) (x : rest) 742 where defaults = fromList [] 743 744-- 745read_table_row :: ElementMatcher [[Blocks]] 746read_table_row = matchingElement NsTable "table-row" 747 $ liftA (:[]) 748 $ matchChildContent' [ read_table_cell 749 ] 750 751-- 752read_table_cell :: ElementMatcher [Blocks] 753read_table_cell = matchingElement NsTable "table-cell" 754 $ liftA (compactify.(:[])) 755 $ matchChildContent' [ read_paragraph 756 ] 757 758---------------------- 759-- Frames 760---------------------- 761 762-- 763read_frame :: InlineMatcher 764read_frame = matchingElement NsDraw "frame" 765 $ filterChildrenName' NsDraw (`elem` ["image", "object", "text-box"]) 766 >>> foldS read_frame_child 767 >>> arr fold 768 769read_frame_child :: OdtReaderSafe XML.Element (FirstMatch Inlines) 770read_frame_child = 771 proc child -> case elName child of 772 "image" -> read_frame_img -< child 773 "object" -> read_frame_mathml -< child 774 "text-box" -> read_frame_text_box -< child 775 _ -> returnV mempty -< () 776 777read_frame_img :: OdtReaderSafe XML.Element (FirstMatch Inlines) 778read_frame_img = 779 proc img -> do 780 src <- executeIn (findAttr' NsXLink "href") -< img 781 case fold src of 782 "" -> returnV mempty -< () 783 src' -> do 784 let exts = extensionsFromList [Ext_auto_identifiers] 785 resource <- lookupResource -< T.unpack src' 786 _ <- updateMediaWithResource -< resource 787 w <- findAttrText' NsSVG "width" -< () 788 h <- findAttrText' NsSVG "height" -< () 789 titleNodes <- matchChildContent' [ read_frame_title ] -< () 790 alt <- matchChildContent [] read_plain_text -< () 791 arr (firstMatch . uncurry4 imageWith) -< 792 (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt) 793 794read_frame_title :: InlineMatcher 795read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text) 796 797image_attributes :: Maybe T.Text -> Maybe T.Text -> Attr 798image_attributes x y = 799 ( "", [], dim "width" x ++ dim "height" y) 800 where 801 dim _ (Just "") = [] 802 dim name (Just v) = [(name, v)] 803 dim _ Nothing = [] 804 805read_frame_mathml :: OdtReaderSafe XML.Element (FirstMatch Inlines) 806read_frame_mathml = 807 proc obj -> do 808 src <- executeIn (findAttr' NsXLink "href") -< obj 809 case fold src of 810 "" -> returnV mempty -< () 811 src' -> do 812 let path = T.unpack $ 813 fromMaybe src' (T.stripPrefix "./" src') <> "/content.xml" 814 (_, mathml) <- lookupResource -< path 815 case readMathML (UTF8.toText $ B.toStrict mathml) of 816 Left _ -> returnV mempty -< () 817 Right exps -> arr (firstMatch . displayMath . writeTeX) -< exps 818 819read_frame_text_box :: OdtReaderSafe XML.Element (FirstMatch Inlines) 820read_frame_text_box = proc box -> do 821 paragraphs <- executeIn (matchChildContent' [ read_paragraph ]) -< box 822 arr read_img_with_caption -< toList paragraphs 823 824read_img_with_caption :: [Block] -> FirstMatch Inlines 825read_img_with_caption (Para [Image attr alt (src,title)] : _) = 826 firstMatch $ singleton (Image attr alt (src, "fig:" <> title)) -- no text, default caption 827read_img_with_caption (Para (Image attr _ (src,title) : txt) : _) = 828 firstMatch $ singleton (Image attr txt (src, "fig:" <> title) ) -- override caption with the text that follows 829read_img_with_caption ( Para (_ : xs) : ys) = 830 read_img_with_caption (Para xs : ys) 831read_img_with_caption _ = 832 mempty 833 834---------------------- 835-- Internal links 836---------------------- 837 838_ANCHOR_PREFIX_ :: T.Text 839_ANCHOR_PREFIX_ = "anchor" 840 841-- 842readAnchorAttr :: OdtReader _x Anchor 843readAnchorAttr = findAttrText NsText "name" 844 845-- | Beware: may fail 846findAnchorName :: OdtReader AnchorPrefix Anchor 847findAnchorName = ( keepingTheValue readAnchorAttr 848 >>^ spreadChoice 849 ) >>?! getPrettyAnchor 850 851 852-- 853maybeAddAnchorFrom :: OdtReader Inlines AnchorPrefix 854 -> OdtReaderSafe Inlines Inlines 855maybeAddAnchorFrom anchorReader = 856 keepingTheValue (anchorReader >>? findAnchorName >>?^ toAnchorElem) 857 >>> 858 proc (inlines, fAnchorElem) -> do 859 case fAnchorElem of 860 Right anchorElem -> returnA -< anchorElem 861 Left _ -> returnA -< inlines 862 where 863 toAnchorElem :: Anchor -> Inlines 864 toAnchorElem anchorID = spanWith (anchorID, [], []) mempty 865 -- no classes, no key-value pairs 866 867-- 868read_bookmark :: InlineMatcher 869read_bookmark = matchingElement NsText "bookmark" 870 $ maybeAddAnchorFrom (liftAsSuccess $ returnV _ANCHOR_PREFIX_) 871 872-- 873read_bookmark_start :: InlineMatcher 874read_bookmark_start = matchingElement NsText "bookmark-start" 875 $ maybeAddAnchorFrom (liftAsSuccess $ returnV _ANCHOR_PREFIX_) 876 877-- 878read_reference_start :: InlineMatcher 879read_reference_start = matchingElement NsText "reference-mark-start" 880 $ maybeAddAnchorFrom readAnchorAttr 881 882-- | Beware: may fail 883findAnchorRef :: OdtReader _x Anchor 884findAnchorRef = ( findAttrText NsText "ref-name" 885 >>?^ (_ANCHOR_PREFIX_,) 886 ) >>?! getPrettyAnchor 887 888 889-- 890maybeInAnchorRef :: OdtReaderSafe Inlines Inlines 891maybeInAnchorRef = proc inlines -> do 892 fRef <- findAnchorRef -< () 893 case fRef of 894 Right anchor -> 895 arr (toAnchorRef anchor) -<< inlines 896 Left _ -> returnA -< inlines 897 where 898 toAnchorRef :: Anchor -> Inlines -> Inlines 899 toAnchorRef anchor = link ("#" <> anchor) "" -- no title 900 901-- 902read_bookmark_ref :: InlineMatcher 903read_bookmark_ref = matchingElement NsText "bookmark-ref" 904 $ maybeInAnchorRef 905 <<< matchChildContent [] read_plain_text 906 907-- 908read_reference_ref :: InlineMatcher 909read_reference_ref = matchingElement NsText "reference-ref" 910 $ maybeInAnchorRef 911 <<< matchChildContent [] read_plain_text 912 913 914---------------------- 915-- Entry point 916---------------------- 917 918read_text :: OdtReaderSafe _x Pandoc 919read_text = matchChildContent' [ read_header 920 , read_paragraph 921 , read_list 922 , read_table 923 ] 924 >>^ doc 925 926post_process :: Pandoc -> Pandoc 927post_process (Pandoc m blocks) = 928 Pandoc m (post_process' blocks) 929 930post_process' :: [Block] -> [Block] 931post_process' (Table attr _ specs th tb tf : Div ("", ["caption"], _) blks : xs) 932 = Table attr (Caption Nothing blks) specs th tb tf : post_process' xs 933post_process' bs = bs 934 935read_body :: OdtReader _x (Pandoc, MediaBag) 936read_body = executeInSub NsOffice "body" 937 $ executeInSub NsOffice "text" 938 $ liftAsSuccess 939 $ proc inlines -> do 940 txt <- read_text -< inlines 941 state <- getExtraState -< () 942 returnA -< (post_process txt, getMediaBag state) 943