1{-# LANGUAGE OverloadedStrings #-}
2{- |
3   Module      : Tests.Readers.Docx
4   Copyright   : © 2017-2020 Jesse Rosenthal, John MacFarlane
5   License     : GNU GPL, version 2 or above
6
7   Maintainer  : Jesse Rosenthal <jrosenthal@jhu.edu>
8   Stability   : alpha
9   Portability : portable
10
11Tests for the word docx reader.
12-}
13module Tests.Readers.Docx (tests) where
14
15import Codec.Archive.Zip
16import qualified Data.ByteString as BS
17import qualified Data.ByteString.Lazy as B
18import qualified Data.Map as M
19import qualified Data.Text as T
20import Data.Maybe
21import System.IO.Unsafe
22import Test.Tasty
23import Test.Tasty.HUnit
24import Tests.Helpers
25import Text.Pandoc
26import qualified Text.Pandoc.Class as P
27import qualified Text.Pandoc.MediaBag as MB
28import Text.Pandoc.UTF8 as UTF8
29
30-- We define a wrapper around pandoc that doesn't normalize in the
31-- tests. Since we do our own normalization, we want to make sure
32-- we're doing it right.
33
34newtype NoNormPandoc = NoNormPandoc {unNoNorm :: Pandoc}
35                 deriving Show
36
37noNorm :: Pandoc -> NoNormPandoc
38noNorm = NoNormPandoc
39
40defopts :: ReaderOptions
41defopts = def{ readerExtensions = getDefaultExtensions "docx" }
42
43instance ToString NoNormPandoc where
44  toString d = T.unpack $ purely (writeNative def{ writerTemplate = s }) $ toPandoc d
45   where s = case d of
46                  NoNormPandoc (Pandoc (Meta m) _)
47                    | M.null m  -> Nothing
48                    | otherwise -> Just mempty -- need this to get meta output
49
50instance ToPandoc NoNormPandoc where
51  toPandoc = unNoNorm
52
53compareOutput :: ReaderOptions
54                 -> FilePath
55                 -> FilePath
56                 -> IO (NoNormPandoc, NoNormPandoc)
57compareOutput opts docxFile nativeFile = do
58  df <- B.readFile docxFile
59  nf <- UTF8.toText <$> BS.readFile nativeFile
60  p <- runIOorExplode $ readDocx opts df
61  df' <- runIOorExplode $ readNative def nf
62  return (noNorm p, noNorm df')
63
64testCompareWithOptsIO :: ReaderOptions -> String -> FilePath -> FilePath -> IO TestTree
65testCompareWithOptsIO opts name docxFile nativeFile = do
66  (dp, np) <- compareOutput opts docxFile nativeFile
67  return $ test id name (dp, np)
68
69testCompareWithOpts :: ReaderOptions -> String -> FilePath -> FilePath -> TestTree
70testCompareWithOpts opts name docxFile nativeFile =
71  unsafePerformIO $ testCompareWithOptsIO opts name docxFile nativeFile
72
73testCompare :: String -> FilePath -> FilePath -> TestTree
74testCompare = testCompareWithOpts defopts
75
76testForWarningsWithOptsIO :: ReaderOptions -> String -> FilePath -> [String] -> IO TestTree
77testForWarningsWithOptsIO opts name docxFile expected = do
78  df <- B.readFile docxFile
79  logs <-  runIOorExplode $ setVerbosity ERROR >> readDocx opts df >> P.getLog
80  let warns = [m | DocxParserWarning m <- logs]
81  return $ test id name (T.unlines warns, unlines expected)
82
83testForWarningsWithOpts :: ReaderOptions -> String -> FilePath -> [String] -> TestTree
84testForWarningsWithOpts opts name docxFile expected =
85  unsafePerformIO $ testForWarningsWithOptsIO opts name docxFile expected
86
87-- testForWarnings :: String -> FilePath -> [String] -> TestTree
88-- testForWarnings = testForWarningsWithOpts defopts
89
90getMedia :: FilePath -> FilePath -> IO (Maybe B.ByteString)
91getMedia archivePath mediaPath = fmap fromEntry . findEntryByPath
92    ("word/" ++ mediaPath) . toArchive <$> B.readFile archivePath
93
94compareMediaPathIO :: FilePath -> MB.MediaBag -> FilePath -> IO Bool
95compareMediaPathIO mediaPath mediaBag docxPath = do
96  docxMedia <- getMedia docxPath mediaPath
97  let mbBS   = case MB.lookupMedia mediaPath mediaBag of
98                 Just item    -> MB.mediaContents item
99                 Nothing      -> error ("couldn't find " ++
100                                        mediaPath ++
101                                        " in media bag")
102      docxBS = fromMaybe (error ("couldn't find " ++
103                        mediaPath ++
104                        " in media bag")) docxMedia
105  return $ mbBS == docxBS
106
107compareMediaBagIO :: FilePath -> IO Bool
108compareMediaBagIO docxFile = do
109    df <- B.readFile docxFile
110    mb <- runIOorExplode $ readDocx defopts df >> P.getMediaBag
111    bools <- mapM
112             (\(fp, _, _) -> compareMediaPathIO fp mb docxFile)
113             (MB.mediaDirectory mb)
114    return $ and bools
115
116testMediaBagIO :: String -> FilePath -> IO TestTree
117testMediaBagIO name docxFile = do
118  outcome <- compareMediaBagIO docxFile
119  return $ testCase name (assertBool
120                          ("Media didn't match media bag in file " ++ docxFile)
121                          outcome)
122
123testMediaBag :: String -> FilePath -> TestTree
124testMediaBag name docxFile = unsafePerformIO $ testMediaBagIO name docxFile
125
126tests :: [TestTree]
127tests = [ testGroup "document"
128          [ testCompare
129            "allow different document.xml file as defined in _rels/.rels"
130            "docx/alternate_document_path.docx"
131            "docx/alternate_document_path.native"
132          ]
133        , testGroup "inlines"
134          [ testCompare
135            "font formatting"
136            "docx/inline_formatting.docx"
137            "docx/inline_formatting.native"
138          , testCompare
139            "font formatting with character styles"
140            "docx/char_styles.docx"
141            "docx/char_styles.native"
142          , testCompare
143            "hyperlinks"
144            "docx/links.docx"
145            "docx/links.native"
146          , testCompare
147            "hyperlinks in <w:instrText> tag"
148            "docx/instrText_hyperlink.docx"
149            "docx/instrText_hyperlink.native"
150          , testCompare
151            "inline image"
152            "docx/image.docx"
153            "docx/image_no_embed.native"
154          , testCompare
155            "VML image"
156            "docx/image_vml.docx"
157            "docx/image_vml.native"
158          , testCompare
159            "VML image as object"
160            "docx/image_vml_as_object.docx"
161            "docx/image_vml_as_object.native"
162          , testCompare
163            "inline image in links"
164            "docx/inline_images.docx"
165            "docx/inline_images.native"
166          , testCompare
167            "handling unicode input"
168            "docx/unicode.docx"
169            "docx/unicode.native"
170          , testCompare
171            "literal tabs"
172            "docx/tabs.docx"
173            "docx/tabs.native"
174          , testCompare
175            "special punctuation"
176            "docx/special_punctuation.docx"
177            "docx/special_punctuation.native"
178          , testCompare
179            "normalizing inlines"
180            "docx/normalize.docx"
181            "docx/normalize.native"
182          , testCompare
183            "normalizing inlines deep inside blocks"
184            "docx/deep_normalize.docx"
185            "docx/deep_normalize.native"
186          , testCompare
187            "move trailing spaces outside of formatting"
188            "docx/trailing_spaces_in_formatting.docx"
189            "docx/trailing_spaces_in_formatting.native"
190          , testCompare
191            "remove trailing spaces from last inline"
192            "docx/trim_last_inline.docx"
193            "docx/trim_last_inline.native"
194          , testCompare
195            "inline code (with VerbatimChar style)"
196            "docx/inline_code.docx"
197            "docx/inline_code.native"
198          , testCompare
199            "inline code in subscript and superscript"
200            "docx/verbatim_subsuper.docx"
201            "docx/verbatim_subsuper.native"
202          , testCompare
203            "inlines inside of Structured Document Tags"
204            "docx/sdt_elements.docx"
205            "docx/sdt_elements.native"
206          , testCompare
207            "Structured Document Tags in footnotes"
208            "docx/sdt_in_footnote.docx"
209            "docx/sdt_in_footnote.native"
210          , testCompare
211            "nested Structured Document Tags"
212            "docx/nested_sdt.docx"
213            "docx/nested_sdt.native"
214          , testCompare
215            "nested Smart Tags"
216            "docx/nested_smart_tags.docx"
217            "docx/nested_smart_tags.native"
218          , testCompare
219            "remove anchor spans with nothing pointing to them"
220            "docx/unused_anchors.docx"
221            "docx/unused_anchors.native"
222          , testCompare
223            "collapse overlapping targets (anchor spans)"
224            "docx/overlapping_targets.docx"
225            "docx/overlapping_targets.native"
226          ]
227        , testGroup "blocks"
228          [ testCompare
229            "headers"
230            "docx/headers.docx"
231            "docx/headers.native"
232          , testCompare
233            "headers already having auto identifiers"
234            "docx/already_auto_ident.docx"
235            "docx/already_auto_ident.native"
236          , testCompare
237            "avoid zero-level headers"
238            "docx/0_level_headers.docx"
239            "docx/0_level_headers.native"
240          , testCompare
241            "nested anchor spans in header"
242            "docx/nested_anchors_in_header.docx"
243            "docx/nested_anchors_in_header.native"
244          , testCompare
245            "single numbered item not made into list"
246            "docx/numbered_header.docx"
247            "docx/numbered_header.native"
248          , testCompare
249            "enumerated headers not made into numbered list"
250            "docx/enumerated_headings.docx"
251            "docx/enumerated_headings.native"
252          , testCompare
253            "i18n blocks (headers and blockquotes)"
254            "docx/i18n_blocks.docx"
255            "docx/i18n_blocks.native"
256          , testCompare
257            "lists"
258            "docx/lists.docx"
259            "docx/lists.native"
260          , testCompare
261            "compact lists"
262            "docx/lists-compact.docx"
263            "docx/lists-compact.native"
264          , testCompare
265            "lists with level overrides"
266            "docx/lists_level_override.docx"
267            "docx/lists_level_override.native"
268          , testCompare
269            "lists continuing after interruption"
270            "docx/lists_continuing.docx"
271            "docx/lists_continuing.native"
272          , testCompare
273            "lists restarting after interruption"
274            "docx/lists_restarting.docx"
275            "docx/lists_restarting.native"
276          , testCompare
277            "sublists reset numbering to 1"
278            "docx/lists_sublist_reset.docx"
279            "docx/lists_sublist_reset.native"
280          , testCompare
281            "definition lists"
282            "docx/definition_list.docx"
283            "docx/definition_list.native"
284          , testCompare
285            "custom defined lists in styles"
286            "docx/german_styled_lists.docx"
287            "docx/german_styled_lists.native"
288          , testCompare
289            "user deletes bullet after list item (=> part of item par)"
290            "docx/dummy_item_after_list_item.docx"
291            "docx/dummy_item_after_list_item.native"
292          , testCompare
293            "user deletes bullet after par (=> new par)"
294            "docx/dummy_item_after_paragraph.docx"
295            "docx/dummy_item_after_paragraph.native"
296          , testCompare
297            "footnotes and endnotes"
298            "docx/notes.docx"
299            "docx/notes.native"
300          , testCompare
301            "links in footnotes and endnotes"
302            "docx/link_in_notes.docx"
303            "docx/link_in_notes.native"
304          , testCompare
305            "blockquotes (parsing indent as blockquote)"
306            "docx/block_quotes.docx"
307            "docx/block_quotes_parse_indent.native"
308          , testCompare
309            "hanging indents"
310            "docx/hanging_indent.docx"
311            "docx/hanging_indent.native"
312          , testCompare
313            "tables"
314            "docx/tables.docx"
315            "docx/tables.native"
316          , testCompare
317            "tables with lists in cells"
318            "docx/table_with_list_cell.docx"
319            "docx/table_with_list_cell.native"
320          , testCompare
321            "a table with a header which contains rowspans greater than 1"
322            "docx/table_header_rowspan.docx"
323            "docx/table_header_rowspan.native"
324          , testCompare
325            "tables with one row"
326            "docx/table_one_row.docx"
327            "docx/table_one_row.native"
328          , testCompare
329            "tables with just one row, which is a header"
330            "docx/table_one_header_row.docx"
331            "docx/table_one_header_row.native"
332          , testCompare
333            "tables with variable width"
334            "docx/table_variable_width.docx"
335            "docx/table_variable_width.native"
336          , testCompare
337            "tables with captions which contain a Table field"
338            "docx/table_captions_with_field.docx"
339            "docx/table_captions_with_field.native"
340          , testCompare
341            "tables with captions which don't contain a Table field"
342            "docx/table_captions_no_field.docx"
343            "docx/table_captions_no_field.native"
344          , testCompare
345            "code block"
346            "docx/codeblock.docx"
347            "docx/codeblock.native"
348          , testCompare
349            "combine adjacent code blocks"
350            "docx/adjacent_codeblocks.docx"
351            "docx/adjacent_codeblocks.native"
352          , testCompare
353            "dropcap paragraphs"
354            "docx/drop_cap.docx"
355            "docx/drop_cap.native"
356          ]
357        , testGroup "track changes"
358          [ testCompare
359            "insertion (default)"
360            "docx/track_changes_insertion.docx"
361            "docx/track_changes_insertion_accept.native"
362          , testCompareWithOpts def{readerTrackChanges=AcceptChanges}
363            "insert insertion (accept)"
364            "docx/track_changes_insertion.docx"
365            "docx/track_changes_insertion_accept.native"
366          , testCompareWithOpts def{readerTrackChanges=RejectChanges}
367            "remove insertion (reject)"
368            "docx/track_changes_insertion.docx"
369            "docx/track_changes_insertion_reject.native"
370          , testCompare
371            "deletion (default)"
372            "docx/track_changes_deletion.docx"
373            "docx/track_changes_deletion_accept.native"
374          , testCompareWithOpts def{readerTrackChanges=AcceptChanges}
375            "remove deletion (accept)"
376            "docx/track_changes_deletion.docx"
377            "docx/track_changes_deletion_accept.native"
378          , testCompareWithOpts def{readerTrackChanges=RejectChanges}
379            "insert deletion (reject)"
380            "docx/track_changes_deletion.docx"
381            "docx/track_changes_deletion_reject.native"
382          , testCompareWithOpts def{readerTrackChanges=AllChanges}
383            "keep insertion (all)"
384            "docx/track_changes_deletion.docx"
385            "docx/track_changes_deletion_all.native"
386          , testCompareWithOpts def{readerTrackChanges=AllChanges}
387            "keep deletion (all)"
388            "docx/track_changes_deletion.docx"
389            "docx/track_changes_deletion_all.native"
390          , testCompareWithOpts def{readerTrackChanges=AcceptChanges}
391            "move text (accept)"
392            "docx/track_changes_move.docx"
393            "docx/track_changes_move_accept.native"
394          , testCompareWithOpts def{readerTrackChanges=RejectChanges}
395            "move text (reject)"
396            "docx/track_changes_move.docx"
397            "docx/track_changes_move_reject.native"
398          , testCompareWithOpts def{readerTrackChanges=AllChanges}
399            "move text (all)"
400            "docx/track_changes_move.docx"
401            "docx/track_changes_move_all.native"
402          , testCompareWithOpts def{readerTrackChanges=AcceptChanges}
403            "comments (accept -- no comments)"
404            "docx/comments.docx"
405            "docx/comments_no_comments.native"
406          , testCompareWithOpts def{readerTrackChanges=RejectChanges}
407            "comments (reject -- comments)"
408            "docx/comments.docx"
409            "docx/comments_no_comments.native"
410          , testCompareWithOpts def{readerTrackChanges=AllChanges}
411            "comments (all comments)"
412            "docx/comments.docx"
413            "docx/comments.native"
414          , testCompareWithOpts def{readerTrackChanges=AcceptChanges}
415            "paragraph insertion/deletion (accept)"
416            "docx/paragraph_insertion_deletion.docx"
417            "docx/paragraph_insertion_deletion_accept.native"
418          , testCompareWithOpts def{readerTrackChanges=RejectChanges}
419            "paragraph insertion/deletion (reject)"
420            "docx/paragraph_insertion_deletion.docx"
421            "docx/paragraph_insertion_deletion_reject.native"
422          , testCompareWithOpts def{readerTrackChanges=AllChanges}
423            "paragraph insertion/deletion (all)"
424            "docx/paragraph_insertion_deletion.docx"
425            "docx/paragraph_insertion_deletion_all.native"
426          , testCompareWithOpts def{readerTrackChanges=AllChanges}
427            "paragraph insertion/deletion (all)"
428            "docx/track_changes_scrubbed_metadata.docx"
429            "docx/track_changes_scrubbed_metadata.native"
430          , testForWarningsWithOpts def{readerTrackChanges=AcceptChanges}
431            "comment warnings (accept -- no warnings)"
432            "docx/comments_warning.docx"
433            []
434          , testForWarningsWithOpts def{readerTrackChanges=RejectChanges}
435            "comment warnings (reject -- no warnings)"
436            "docx/comments_warning.docx"
437            []
438          , testForWarningsWithOpts def{readerTrackChanges=AllChanges}
439            "comment warnings (all)"
440            "docx/comments_warning.docx"
441            ["Docx comment 1 will not retain formatting"]
442          ]
443        , testGroup "media"
444          [ testMediaBag
445            "image extraction"
446            "docx/image.docx"
447          ]
448        , testGroup "custom styles"
449          [ testCompare
450            "custom styles (`+styles`) not enabled (default)"
451            "docx/custom-style-reference.docx"
452            "docx/custom-style-no-styles.native"
453          , testCompareWithOpts
454            def{readerExtensions=extensionsFromList [Ext_styles]}
455            "custom styles (`+styles`) enabled"
456            "docx/custom-style-reference.docx"
457            "docx/custom-style-with-styles.native"
458          , testCompareWithOpts
459            def{readerExtensions=extensionsFromList [Ext_styles]}
460            "custom styles (`+styles`): Compact style is removed from output"
461            "docx/compact-style-removal.docx"
462            "docx/compact-style-removal.native"
463          ]
464        , testGroup "metadata"
465          [ testCompareWithOpts def{readerStandalone=True}
466            "metadata fields"
467            "docx/metadata.docx"
468            "docx/metadata.native"
469          , testCompareWithOpts def{readerStandalone=True}
470            "stop recording metadata with normal text"
471            "docx/metadata_after_normal.docx"
472            "docx/metadata_after_normal.native"
473          ]
474        ]
475