1{-# LANGUAGE OverloadedStrings #-} 2{- | 3 Module : Tests.Readers.Docx 4 Copyright : © 2017-2020 Jesse Rosenthal, John MacFarlane 5 License : GNU GPL, version 2 or above 6 7 Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu> 8 Stability : alpha 9 Portability : portable 10 11Tests for the word docx reader. 12-} 13module Tests.Readers.Docx (tests) where 14 15import Codec.Archive.Zip 16import qualified Data.ByteString as BS 17import qualified Data.ByteString.Lazy as B 18import qualified Data.Map as M 19import qualified Data.Text as T 20import Data.Maybe 21import System.IO.Unsafe 22import Test.Tasty 23import Test.Tasty.HUnit 24import Tests.Helpers 25import Text.Pandoc 26import qualified Text.Pandoc.Class as P 27import qualified Text.Pandoc.MediaBag as MB 28import Text.Pandoc.UTF8 as UTF8 29 30-- We define a wrapper around pandoc that doesn't normalize in the 31-- tests. Since we do our own normalization, we want to make sure 32-- we're doing it right. 33 34newtype NoNormPandoc = NoNormPandoc {unNoNorm :: Pandoc} 35 deriving Show 36 37noNorm :: Pandoc -> NoNormPandoc 38noNorm = NoNormPandoc 39 40defopts :: ReaderOptions 41defopts = def{ readerExtensions = getDefaultExtensions "docx" } 42 43instance ToString NoNormPandoc where 44 toString d = T.unpack $ purely (writeNative def{ writerTemplate = s }) $ toPandoc d 45 where s = case d of 46 NoNormPandoc (Pandoc (Meta m) _) 47 | M.null m -> Nothing 48 | otherwise -> Just mempty -- need this to get meta output 49 50instance ToPandoc NoNormPandoc where 51 toPandoc = unNoNorm 52 53compareOutput :: ReaderOptions 54 -> FilePath 55 -> FilePath 56 -> IO (NoNormPandoc, NoNormPandoc) 57compareOutput opts docxFile nativeFile = do 58 df <- B.readFile docxFile 59 nf <- UTF8.toText <$> BS.readFile nativeFile 60 p <- runIOorExplode $ readDocx opts df 61 df' <- runIOorExplode $ readNative def nf 62 return (noNorm p, noNorm df') 63 64testCompareWithOptsIO :: ReaderOptions -> String -> FilePath -> FilePath -> IO TestTree 65testCompareWithOptsIO opts name docxFile nativeFile = do 66 (dp, np) <- compareOutput opts docxFile nativeFile 67 return $ test id name (dp, np) 68 69testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree 70testCompareWithOpts opts name docxFile nativeFile = 71 unsafePerformIO $ testCompareWithOptsIO opts name docxFile nativeFile 72 73testCompare :: String -> FilePath -> FilePath -> TestTree 74testCompare = testCompareWithOpts defopts 75 76testForWarningsWithOptsIO :: ReaderOptions -> String -> FilePath -> [String] -> IO TestTree 77testForWarningsWithOptsIO opts name docxFile expected = do 78 df <- B.readFile docxFile 79 logs <- runIOorExplode $ setVerbosity ERROR >> readDocx opts df >> P.getLog 80 let warns = [m | DocxParserWarning m <- logs] 81 return $ test id name (T.unlines warns, unlines expected) 82 83testForWarningsWithOpts :: ReaderOptions -> String -> FilePath -> [String] -> TestTree 84testForWarningsWithOpts opts name docxFile expected = 85 unsafePerformIO $ testForWarningsWithOptsIO opts name docxFile expected 86 87-- testForWarnings :: String -> FilePath -> [String] -> TestTree 88-- testForWarnings = testForWarningsWithOpts defopts 89 90getMedia :: FilePath -> FilePath -> IO (Maybe B.ByteString) 91getMedia archivePath mediaPath = fmap fromEntry . findEntryByPath 92 ("word/" ++ mediaPath) . toArchive <$> B.readFile archivePath 93 94compareMediaPathIO :: FilePath -> MB.MediaBag -> FilePath -> IO Bool 95compareMediaPathIO mediaPath mediaBag docxPath = do 96 docxMedia <- getMedia docxPath mediaPath 97 let mbBS = case MB.lookupMedia mediaPath mediaBag of 98 Just item -> MB.mediaContents item 99 Nothing -> error ("couldn't find " ++ 100 mediaPath ++ 101 " in media bag") 102 docxBS = fromMaybe (error ("couldn't find " ++ 103 mediaPath ++ 104 " in media bag")) docxMedia 105 return $ mbBS == docxBS 106 107compareMediaBagIO :: FilePath -> IO Bool 108compareMediaBagIO docxFile = do 109 df <- B.readFile docxFile 110 mb <- runIOorExplode $ readDocx defopts df >> P.getMediaBag 111 bools <- mapM 112 (\(fp, _, _) -> compareMediaPathIO fp mb docxFile) 113 (MB.mediaDirectory mb) 114 return $ and bools 115 116testMediaBagIO :: String -> FilePath -> IO TestTree 117testMediaBagIO name docxFile = do 118 outcome <- compareMediaBagIO docxFile 119 return $ testCase name (assertBool 120 ("Media didn't match media bag in file " ++ docxFile) 121 outcome) 122 123testMediaBag :: String -> FilePath -> TestTree 124testMediaBag name docxFile = unsafePerformIO $ testMediaBagIO name docxFile 125 126tests :: [TestTree] 127tests = [ testGroup "document" 128 [ testCompare 129 "allow different document.xml file as defined in _rels/.rels" 130 "docx/alternate_document_path.docx" 131 "docx/alternate_document_path.native" 132 ] 133 , testGroup "inlines" 134 [ testCompare 135 "font formatting" 136 "docx/inline_formatting.docx" 137 "docx/inline_formatting.native" 138 , testCompare 139 "font formatting with character styles" 140 "docx/char_styles.docx" 141 "docx/char_styles.native" 142 , testCompare 143 "hyperlinks" 144 "docx/links.docx" 145 "docx/links.native" 146 , testCompare 147 "hyperlinks in <w:instrText> tag" 148 "docx/instrText_hyperlink.docx" 149 "docx/instrText_hyperlink.native" 150 , testCompare 151 "inline image" 152 "docx/image.docx" 153 "docx/image_no_embed.native" 154 , testCompare 155 "VML image" 156 "docx/image_vml.docx" 157 "docx/image_vml.native" 158 , testCompare 159 "VML image as object" 160 "docx/image_vml_as_object.docx" 161 "docx/image_vml_as_object.native" 162 , testCompare 163 "inline image in links" 164 "docx/inline_images.docx" 165 "docx/inline_images.native" 166 , testCompare 167 "handling unicode input" 168 "docx/unicode.docx" 169 "docx/unicode.native" 170 , testCompare 171 "literal tabs" 172 "docx/tabs.docx" 173 "docx/tabs.native" 174 , testCompare 175 "special punctuation" 176 "docx/special_punctuation.docx" 177 "docx/special_punctuation.native" 178 , testCompare 179 "normalizing inlines" 180 "docx/normalize.docx" 181 "docx/normalize.native" 182 , testCompare 183 "normalizing inlines deep inside blocks" 184 "docx/deep_normalize.docx" 185 "docx/deep_normalize.native" 186 , testCompare 187 "move trailing spaces outside of formatting" 188 "docx/trailing_spaces_in_formatting.docx" 189 "docx/trailing_spaces_in_formatting.native" 190 , testCompare 191 "remove trailing spaces from last inline" 192 "docx/trim_last_inline.docx" 193 "docx/trim_last_inline.native" 194 , testCompare 195 "inline code (with VerbatimChar style)" 196 "docx/inline_code.docx" 197 "docx/inline_code.native" 198 , testCompare 199 "inline code in subscript and superscript" 200 "docx/verbatim_subsuper.docx" 201 "docx/verbatim_subsuper.native" 202 , testCompare 203 "inlines inside of Structured Document Tags" 204 "docx/sdt_elements.docx" 205 "docx/sdt_elements.native" 206 , testCompare 207 "Structured Document Tags in footnotes" 208 "docx/sdt_in_footnote.docx" 209 "docx/sdt_in_footnote.native" 210 , testCompare 211 "nested Structured Document Tags" 212 "docx/nested_sdt.docx" 213 "docx/nested_sdt.native" 214 , testCompare 215 "nested Smart Tags" 216 "docx/nested_smart_tags.docx" 217 "docx/nested_smart_tags.native" 218 , testCompare 219 "remove anchor spans with nothing pointing to them" 220 "docx/unused_anchors.docx" 221 "docx/unused_anchors.native" 222 , testCompare 223 "collapse overlapping targets (anchor spans)" 224 "docx/overlapping_targets.docx" 225 "docx/overlapping_targets.native" 226 ] 227 , testGroup "blocks" 228 [ testCompare 229 "headers" 230 "docx/headers.docx" 231 "docx/headers.native" 232 , testCompare 233 "headers already having auto identifiers" 234 "docx/already_auto_ident.docx" 235 "docx/already_auto_ident.native" 236 , testCompare 237 "avoid zero-level headers" 238 "docx/0_level_headers.docx" 239 "docx/0_level_headers.native" 240 , testCompare 241 "nested anchor spans in header" 242 "docx/nested_anchors_in_header.docx" 243 "docx/nested_anchors_in_header.native" 244 , testCompare 245 "single numbered item not made into list" 246 "docx/numbered_header.docx" 247 "docx/numbered_header.native" 248 , testCompare 249 "enumerated headers not made into numbered list" 250 "docx/enumerated_headings.docx" 251 "docx/enumerated_headings.native" 252 , testCompare 253 "i18n blocks (headers and blockquotes)" 254 "docx/i18n_blocks.docx" 255 "docx/i18n_blocks.native" 256 , testCompare 257 "lists" 258 "docx/lists.docx" 259 "docx/lists.native" 260 , testCompare 261 "compact lists" 262 "docx/lists-compact.docx" 263 "docx/lists-compact.native" 264 , testCompare 265 "lists with level overrides" 266 "docx/lists_level_override.docx" 267 "docx/lists_level_override.native" 268 , testCompare 269 "lists continuing after interruption" 270 "docx/lists_continuing.docx" 271 "docx/lists_continuing.native" 272 , testCompare 273 "lists restarting after interruption" 274 "docx/lists_restarting.docx" 275 "docx/lists_restarting.native" 276 , testCompare 277 "sublists reset numbering to 1" 278 "docx/lists_sublist_reset.docx" 279 "docx/lists_sublist_reset.native" 280 , testCompare 281 "definition lists" 282 "docx/definition_list.docx" 283 "docx/definition_list.native" 284 , testCompare 285 "custom defined lists in styles" 286 "docx/german_styled_lists.docx" 287 "docx/german_styled_lists.native" 288 , testCompare 289 "user deletes bullet after list item (=> part of item par)" 290 "docx/dummy_item_after_list_item.docx" 291 "docx/dummy_item_after_list_item.native" 292 , testCompare 293 "user deletes bullet after par (=> new par)" 294 "docx/dummy_item_after_paragraph.docx" 295 "docx/dummy_item_after_paragraph.native" 296 , testCompare 297 "footnotes and endnotes" 298 "docx/notes.docx" 299 "docx/notes.native" 300 , testCompare 301 "links in footnotes and endnotes" 302 "docx/link_in_notes.docx" 303 "docx/link_in_notes.native" 304 , testCompare 305 "blockquotes (parsing indent as blockquote)" 306 "docx/block_quotes.docx" 307 "docx/block_quotes_parse_indent.native" 308 , testCompare 309 "hanging indents" 310 "docx/hanging_indent.docx" 311 "docx/hanging_indent.native" 312 , testCompare 313 "tables" 314 "docx/tables.docx" 315 "docx/tables.native" 316 , testCompare 317 "tables with lists in cells" 318 "docx/table_with_list_cell.docx" 319 "docx/table_with_list_cell.native" 320 , testCompare 321 "a table with a header which contains rowspans greater than 1" 322 "docx/table_header_rowspan.docx" 323 "docx/table_header_rowspan.native" 324 , testCompare 325 "tables with one row" 326 "docx/table_one_row.docx" 327 "docx/table_one_row.native" 328 , testCompare 329 "tables with just one row, which is a header" 330 "docx/table_one_header_row.docx" 331 "docx/table_one_header_row.native" 332 , testCompare 333 "tables with variable width" 334 "docx/table_variable_width.docx" 335 "docx/table_variable_width.native" 336 , testCompare 337 "tables with captions which contain a Table field" 338 "docx/table_captions_with_field.docx" 339 "docx/table_captions_with_field.native" 340 , testCompare 341 "tables with captions which don't contain a Table field" 342 "docx/table_captions_no_field.docx" 343 "docx/table_captions_no_field.native" 344 , testCompare 345 "code block" 346 "docx/codeblock.docx" 347 "docx/codeblock.native" 348 , testCompare 349 "combine adjacent code blocks" 350 "docx/adjacent_codeblocks.docx" 351 "docx/adjacent_codeblocks.native" 352 , testCompare 353 "dropcap paragraphs" 354 "docx/drop_cap.docx" 355 "docx/drop_cap.native" 356 ] 357 , testGroup "track changes" 358 [ testCompare 359 "insertion (default)" 360 "docx/track_changes_insertion.docx" 361 "docx/track_changes_insertion_accept.native" 362 , testCompareWithOpts def{readerTrackChanges=AcceptChanges} 363 "insert insertion (accept)" 364 "docx/track_changes_insertion.docx" 365 "docx/track_changes_insertion_accept.native" 366 , testCompareWithOpts def{readerTrackChanges=RejectChanges} 367 "remove insertion (reject)" 368 "docx/track_changes_insertion.docx" 369 "docx/track_changes_insertion_reject.native" 370 , testCompare 371 "deletion (default)" 372 "docx/track_changes_deletion.docx" 373 "docx/track_changes_deletion_accept.native" 374 , testCompareWithOpts def{readerTrackChanges=AcceptChanges} 375 "remove deletion (accept)" 376 "docx/track_changes_deletion.docx" 377 "docx/track_changes_deletion_accept.native" 378 , testCompareWithOpts def{readerTrackChanges=RejectChanges} 379 "insert deletion (reject)" 380 "docx/track_changes_deletion.docx" 381 "docx/track_changes_deletion_reject.native" 382 , testCompareWithOpts def{readerTrackChanges=AllChanges} 383 "keep insertion (all)" 384 "docx/track_changes_deletion.docx" 385 "docx/track_changes_deletion_all.native" 386 , testCompareWithOpts def{readerTrackChanges=AllChanges} 387 "keep deletion (all)" 388 "docx/track_changes_deletion.docx" 389 "docx/track_changes_deletion_all.native" 390 , testCompareWithOpts def{readerTrackChanges=AcceptChanges} 391 "move text (accept)" 392 "docx/track_changes_move.docx" 393 "docx/track_changes_move_accept.native" 394 , testCompareWithOpts def{readerTrackChanges=RejectChanges} 395 "move text (reject)" 396 "docx/track_changes_move.docx" 397 "docx/track_changes_move_reject.native" 398 , testCompareWithOpts def{readerTrackChanges=AllChanges} 399 "move text (all)" 400 "docx/track_changes_move.docx" 401 "docx/track_changes_move_all.native" 402 , testCompareWithOpts def{readerTrackChanges=AcceptChanges} 403 "comments (accept -- no comments)" 404 "docx/comments.docx" 405 "docx/comments_no_comments.native" 406 , testCompareWithOpts def{readerTrackChanges=RejectChanges} 407 "comments (reject -- comments)" 408 "docx/comments.docx" 409 "docx/comments_no_comments.native" 410 , testCompareWithOpts def{readerTrackChanges=AllChanges} 411 "comments (all comments)" 412 "docx/comments.docx" 413 "docx/comments.native" 414 , testCompareWithOpts def{readerTrackChanges=AcceptChanges} 415 "paragraph insertion/deletion (accept)" 416 "docx/paragraph_insertion_deletion.docx" 417 "docx/paragraph_insertion_deletion_accept.native" 418 , testCompareWithOpts def{readerTrackChanges=RejectChanges} 419 "paragraph insertion/deletion (reject)" 420 "docx/paragraph_insertion_deletion.docx" 421 "docx/paragraph_insertion_deletion_reject.native" 422 , testCompareWithOpts def{readerTrackChanges=AllChanges} 423 "paragraph insertion/deletion (all)" 424 "docx/paragraph_insertion_deletion.docx" 425 "docx/paragraph_insertion_deletion_all.native" 426 , testCompareWithOpts def{readerTrackChanges=AllChanges} 427 "paragraph insertion/deletion (all)" 428 "docx/track_changes_scrubbed_metadata.docx" 429 "docx/track_changes_scrubbed_metadata.native" 430 , testForWarningsWithOpts def{readerTrackChanges=AcceptChanges} 431 "comment warnings (accept -- no warnings)" 432 "docx/comments_warning.docx" 433 [] 434 , testForWarningsWithOpts def{readerTrackChanges=RejectChanges} 435 "comment warnings (reject -- no warnings)" 436 "docx/comments_warning.docx" 437 [] 438 , testForWarningsWithOpts def{readerTrackChanges=AllChanges} 439 "comment warnings (all)" 440 "docx/comments_warning.docx" 441 ["Docx comment 1 will not retain formatting"] 442 ] 443 , testGroup "media" 444 [ testMediaBag 445 "image extraction" 446 "docx/image.docx" 447 ] 448 , testGroup "custom styles" 449 [ testCompare 450 "custom styles (`+styles`) not enabled (default)" 451 "docx/custom-style-reference.docx" 452 "docx/custom-style-no-styles.native" 453 , testCompareWithOpts 454 def{readerExtensions=extensionsFromList [Ext_styles]} 455 "custom styles (`+styles`) enabled" 456 "docx/custom-style-reference.docx" 457 "docx/custom-style-with-styles.native" 458 , testCompareWithOpts 459 def{readerExtensions=extensionsFromList [Ext_styles]} 460 "custom styles (`+styles`): Compact style is removed from output" 461 "docx/compact-style-removal.docx" 462 "docx/compact-style-removal.native" 463 ] 464 , testGroup "metadata" 465 [ testCompareWithOpts def{readerStandalone=True} 466 "metadata fields" 467 "docx/metadata.docx" 468 "docx/metadata.native" 469 , testCompareWithOpts def{readerStandalone=True} 470 "stop recording metadata with normal text" 471 "docx/metadata_after_normal.docx" 472 "docx/metadata_after_normal.native" 473 ] 474 ] 475