1{-# LANGUAGE GeneralizedNewtypeDeriving #-}
2{-# LANGUAGE MultiParamTypeClasses #-}
3-- | This module provides a set of data types to represent message text.
4-- The inline and block types in this module are designed to represent
5-- most of what is found in Markdown documents (particularly the
6-- Commonmark specification) in addition to other things we find in
7-- Mattermost messages, such as username or channel references.
8--
9-- To parse a Markdown document, use 'parseMarkdown'. To actually render
10-- text in this representation, see the module 'Draw.RichText'.
11module Matterhorn.Types.RichText
12  ( Blocks(..)
13  , unBlocks
14
15  , Block(..)
16  , sameBlockType
17  , CodeBlockInfo(..)
18  , Inline(..)
19  , Inlines(..)
20  , unInlines
21
22  , C.ListType(..)
23  , C.ListSpacing(..)
24  , C.EnumeratorType(..)
25  , C.DelimiterType(..)
26  , C.ColAlignment(..)
27
28  , TeamBaseURL(..)
29  , TeamURLName(..)
30
31  , URL(..)
32  , unURL
33
34  , parseMarkdown
35
36  , findUsernames
37  , blockGetURLs
38  , findVerbatimChunk
39  )
40where
41
42import           Prelude ()
43import           Matterhorn.Prelude
44
45import qualified Commonmark as C
46import qualified Commonmark.Extensions as C
47import qualified Commonmark.Inlines as C
48import qualified Commonmark.TokParsers as C
49import           Control.Monad.Identity
50import qualified Data.Foldable as F
51import           Data.List ( intersperse )
52import           Data.Monoid (First(..))
53import qualified Data.Set as S
54import qualified Data.Sequence as Seq
55import           Data.Sequence ( (<|), viewl, viewr, ViewL((:<)), ViewR((:>)) )
56import qualified Data.Text as T
57import qualified Text.Parsec as P
58
59import           Network.Mattermost.Types ( PostId(..), Id(..), ServerBaseURL(..) )
60
61import           Matterhorn.Constants ( userSigilChar, normalChannelSigilChar )
62
63-- | A team name found in a Mattermost post URL
64data TeamURLName = TeamURLName Text
65                 deriving (Eq, Show, Ord)
66
67-- | A server base URL with a team name.
68data TeamBaseURL = TeamBaseURL TeamURLName ServerBaseURL
69                 deriving (Eq, Show)
70
71-- | A sequence of rich text blocks.
72newtype Blocks = Blocks (Seq Block)
73            deriving (Semigroup, Monoid, Show)
74
75unBlocks :: Blocks -> Seq Block
76unBlocks (Blocks bs) = bs
77
78singleB :: Block -> Blocks
79singleB = Blocks . Seq.singleton
80
81-- | A block in a rich text document.
82--
83-- NOTE: update 'sameBlockType' when constructors are added to this
84-- type.
85data Block =
86    Para Inlines
87    -- ^ A paragraph.
88    | Header Int Inlines
89    -- ^ A section header with specified depth and contents.
90    | Blockquote Blocks
91    -- ^ A blockquote.
92    | List C.ListType C.ListSpacing (Seq Blocks)
93    -- ^ An itemized list.
94    | CodeBlock CodeBlockInfo Text
95    -- ^ A code block.
96    | HTMLBlock Text
97    -- ^ A fragment of raw HTML.
98    | HRule
99    -- ^ A horizontal rule.
100    | Table [C.ColAlignment] [Inlines] [[Inlines]]
101    -- ^ A table.
102    deriving (Show)
103
104-- | Returns whether two blocks have the same type.
105sameBlockType :: Block -> Block -> Bool
106sameBlockType (Para {})       (Para {})       = True
107sameBlockType (Header {})     (Header {})     = True
108sameBlockType (Blockquote {}) (Blockquote {}) = True
109sameBlockType (List {})       (List {})       = True
110sameBlockType (CodeBlock {})  (CodeBlock {})  = True
111sameBlockType (HTMLBlock {})  (HTMLBlock {})  = True
112sameBlockType _               _               = False
113
114-- | Information about a code block.
115data CodeBlockInfo =
116    CodeBlockInfo { codeBlockLanguage :: Maybe Text
117                  -- ^ The language of the source code in the code
118                  -- block, if any. This is encoded in Markdown as a
119                  -- sequence of non-whitespace characters following the
120                  -- fenced code block opening backticks.
121                  , codeBlockInfo :: Maybe Text
122                  -- ^ Any text that comes after the language token.
123                  -- This text is separated from the language token by
124                  -- whitespace.
125                  }
126                  deriving (Eq, Show, Ord)
127
128-- | A URL.
129newtype URL = URL Text
130            deriving (Eq, Show, Ord)
131
132unURL :: URL -> Text
133unURL (URL url) = url
134
135-- | The kinds of inline values that can appear in rich text blocks.
136data Inline =
137    EText Text
138    -- ^ Plain text that SHOULD be a contiguous sequence of
139    -- non-whitespace characters.
140    | EEmph Inlines
141    -- ^ Emphasized (usually italicized) content.
142    | EStrikethrough Inlines
143    -- ^ Strikethrough content.
144    | EStrong Inlines
145    -- ^ Boldface content.
146    | ECode Inlines
147    -- ^ A sequence of non-whitespace characters.
148    | ESpace
149    -- ^ A single space.
150    | ESoftBreak
151    -- ^ A soft line break.
152    | ELineBreak
153    -- ^ A hard line break.
154    | ERawHtml Text
155    -- ^ Raw HTML.
156    | EEditSentinel Bool
157    -- ^ A sentinel indicating that some text has been edited (used
158    -- to indicate that mattermost messages have been edited by their
159    -- authors). This has no parsable representation; it is only used
160    -- to annotate a message prior to rendering to add a visual editing
161    -- indicator. The boolean indicates whether the edit was "recent"
162    -- (True) or not (False).
163    | EUser Text
164    -- ^ A user reference. The text here includes only the username, not
165    -- the sigil.
166    | EChannel Text
167    -- ^ A channel reference. The text here includes only the channel
168    -- name, not the sigil.
169    | EHyperlink URL Inlines
170    -- ^ A hyperlink to the specified URL. Optionally provides an
171    -- element sequence indicating the URL's text label; if absent, the
172    -- label is understood to be the URL itself.
173    | EImage URL Inlines
174    -- ^ An image at the specified URL. Optionally provides an element
175    -- sequence indicating the image's "alt" text label; if absent, the
176    -- label is understood to be the URL itself.
177    | EEmoji Text
178    -- ^ An emoji reference. The text here includes only the text
179    -- portion, not the colons, e.g. "foo" instead of ":foo:".
180    | ENonBreaking Inlines
181    -- ^ A sequence of elements that must never be separated during line
182    -- wrapping.
183    | EPermalink TeamURLName PostId (Maybe Inlines)
184    -- ^ A permalink to the specified team (name) and post ID with an
185    -- optional label.
186    deriving (Show, Eq, Ord)
187
188-- | A sequence of inline values.
189newtype Inlines = Inlines (Seq Inline)
190                deriving (Monoid, Ord, Eq, Show)
191
192unInlines :: Inlines -> Seq Inline
193unInlines (Inlines is) = is
194
195singleI :: Inline -> Inlines
196singleI = Inlines . Seq.singleton
197
198instance Semigroup Inlines where
199    (Inlines l) <> (Inlines r) =
200        Inlines $ case (viewr l, viewl r) of
201            (lInit :> lLast, rHead :< rTail) ->
202                case (lLast, rHead) of
203                    (EText a, EText b) ->
204                        lInit <> ((EText $ a <> b) <| rTail)
205                    (ECode a, ECode b) ->
206                        lInit <> ((ECode $ a <> b) <| rTail)
207                    (EEmph a, EEmph b) ->
208                        lInit <> ((EEmph $ a <> b) <| rTail)
209                    (EStrikethrough a, EStrikethrough b) ->
210                        lInit <> ((EStrikethrough $ a <> b) <| rTail)
211                    (EStrong a, EStrong b) ->
212                        lInit <> ((EStrong $ a <> b) <| rTail)
213                    (_, _) ->
214                        l <> r
215            (_, _) -> l <> r
216
217-- A dummy instance just to satisfy commonmark; we don't use this.
218instance C.Rangeable Inlines where
219    ranged _ = id
220
221-- A dummy instance just to satisfy commonmark; we don't use this.
222instance C.HasAttributes Inlines where
223    addAttributes _ = id
224
225instance C.IsInline Inlines where
226    lineBreak = singleI ELineBreak
227    softBreak = singleI ESoftBreak
228    str t = Inlines $ Seq.fromList $
229            filter (/= (EText "")) $
230            intersperse ESpace $ EText <$> T.splitOn " " t
231    entity = singleI . EText
232    escapedChar = singleI . EText . T.singleton
233    emph = singleI . EEmph
234    strong = singleI . EStrong
235    link url _title desc = singleI $ EHyperlink (URL url) desc
236    image url _title desc = singleI $ EImage (URL url) desc
237    code t = singleI $ ECode $ C.str t
238    rawInline _ = singleI . ERawHtml
239
240instance C.HasStrikethrough Inlines where
241    strikethrough = singleI . EStrikethrough
242
243instance C.HasPipeTable Inlines Blocks where
244    pipeTable a h b = singleB $ Table a h b
245
246-- Syntax extension for parsing ~channel references.
247channelSpec :: (Monad m) => C.SyntaxSpec m Inlines Blocks
248channelSpec =
249    mempty { C.syntaxInlineParsers = [C.withAttributes parseChannel]
250           }
251
252parseChannel :: (Monad m) => C.InlineParser m Inlines
253parseChannel = P.try $ do
254    void $ C.symbol normalChannelSigilChar
255    let chunk = C.satisfyWord (const True) <|> C.symbol '_' <|> C.symbol '-'
256    cts <- P.many1 chunk
257    return $ singleI $ EChannel $ C.untokenize cts
258
259-- Syntax extension for parsing @username references.
260usernameSpec :: (Monad m) => C.SyntaxSpec m Inlines Blocks
261usernameSpec =
262    mempty { C.syntaxInlineParsers = [C.withAttributes parseUsername]
263           }
264
265parseUsername :: (Monad m) => C.InlineParser m Inlines
266parseUsername = P.try $ do
267    void $ C.symbol userSigilChar
268    let chunk = C.satisfyWord (const True) <|> C.symbol '_' <|> C.symbol '-'
269        [period] = C.tokenize "" "."
270    uts <- intersperse period <$> P.sepBy1 chunk (C.symbol '.')
271    return $ singleI $ EUser $ C.untokenize uts
272
273-- Syntax extension for parsing :emoji: references.
274--
275-- NOTE: the commonmark-extensions package also provides a syntax
276-- extension for exactly this. Why don't we use it? I'm glad you asked.
277-- We don't use it because that extension actually checks to see whether
278-- emoji are valid by looking in a database (provided by the 'emojis'
279-- package). While that's actually a great feature, it is problematic
280-- when that package's emoji database does not exactly match the one
281-- that the Mattermost server uses. As a result, Matterhorn may think
282-- that some valid emoji (according to the server) is invalid (according
283-- to the 'emojis' package). Instead of using that extension, we made
284-- our own that does *not* validate the emoji references at parse time.
285-- We just parse them and keep them around, and then validate them at
286-- *render* time. That way we can allow anything to parse, but change
287-- how we render valid and invalid emoji based on a copy of the server's
288-- emoji database that we bundle with Matterhorn.
289emojiSpec :: (Monad m) => C.SyntaxSpec m Inlines Blocks
290emojiSpec =
291    mempty { C.syntaxInlineParsers = [C.withAttributes parseEmoji]
292           }
293
294parseEmoji :: (Monad m) => C.InlineParser m Inlines
295parseEmoji = P.try $ do
296    void $ C.symbol ':'
297    ts <- P.many1 $ C.satisfyWord (const True)
298               <|> C.symbol '_'
299               <|> C.symbol '+'
300               <|> C.symbol '-'
301    void $ C.symbol ':'
302    let kw = C.untokenize ts
303    return $ singleI $ EEmoji kw
304
305-- A dummy instance just to satisfy commonmark; we don't use this.
306instance C.HasAttributes Blocks where
307    addAttributes _ = id
308
309-- A dummy instance just to satisfy commonmark; we don't use this.
310instance C.Rangeable Blocks where
311    ranged _ = id
312
313instance C.IsBlock Inlines Blocks where
314    paragraph = singleB . Para
315    plain = singleB . Para
316    thematicBreak = singleB HRule
317    blockQuote = singleB . Blockquote
318    codeBlock infoTxt content = singleB $ CodeBlock (parseCodeBlockInfo infoTxt) content
319    heading level i = singleB $ Header level i
320    rawBlock _format content = singleB $ CodeBlock (parseCodeBlockInfo "") content
321    list ty spacing bs = singleB $ List ty spacing $ Seq.fromList bs
322    referenceLinkDefinition _label (_dest, _title) = mempty
323
324parseCodeBlockInfo :: Text -> CodeBlockInfo
325parseCodeBlockInfo t = CodeBlockInfo lang info
326    where
327        ws = T.words t
328        (lang, info) = case ws of
329            [l, i] -> (Just l, Just i)
330            [l]    -> (Just l, Nothing)
331            _      -> (Nothing, Nothing)
332
333-- | Parse markdown input text to RichText.
334--
335-- Note that this always returns a block sequence even if the input
336-- cannot be parsed. It isn't yet clear just how permissive the
337-- commonmark parser is, but so far we have not encountered any issues.
338-- If the input document is so broken that commonmark cannot parse it,
339-- we return an empty block sequence.
340parseMarkdown :: Maybe TeamBaseURL
341              -- ^ If provided, perform post link detection whenever a
342              -- hyperlink is parsed by checking to see if the post link
343              -- is a post in this Mattermost team
344              -> T.Text
345              -- ^ The markdown input text to parse
346              -> Blocks
347parseMarkdown mBaseUrl t =
348    let customSyntax = mconcat $ markdownExtensions <> [C.defaultSyntaxSpec]
349        markdownExtensions =
350            [ C.autolinkSpec
351            , C.strikethroughSpec
352            , C.pipeTableSpec
353            , usernameSpec
354            , channelSpec
355            , emojiSpec
356            ]
357
358    in case runIdentity $ C.commonmarkWith customSyntax "-" t of
359        Left _ -> mempty
360        Right bs -> case mBaseUrl of
361            Nothing -> bs
362            Just baseUrl -> rewriteBlocksPermalinks baseUrl bs
363
364-- | If the specified URL matches the active server base URL and team
365-- and refers to a post, extract the team name and post ID values and
366-- return them.
367getPermalink :: TeamBaseURL -> Text -> Maybe (TeamURLName, PostId)
368getPermalink (TeamBaseURL tName (ServerBaseURL baseUrl)) url =
369    let newBaseUrl = if "/" `T.isSuffixOf` baseUrl
370                     then baseUrl
371                     else baseUrl <> "/"
372    in if not $ newBaseUrl `T.isPrefixOf` url
373       then Nothing
374       else let rest = T.drop (T.length newBaseUrl) url
375                (tName', rawPIdStr) = T.breakOn "/pl/" rest
376                pIdStr = T.drop 4 rawPIdStr
377            in if tName == TeamURLName tName' && not (T.null pIdStr)
378               then Just (tName, PI $ Id pIdStr)
379               else Nothing
380
381-- | Locate post hyperlinks in the block sequence and rewrite them as
382-- post permalinks.
383rewriteBlocksPermalinks :: TeamBaseURL -> Blocks -> Blocks
384rewriteBlocksPermalinks u (Blocks bs) = Blocks $ rewriteBlockPermalinks u <$> bs
385
386-- | Locate post hyperlinks in the block and rewrite them as post
387-- permalinks.
388rewriteBlockPermalinks :: TeamBaseURL -> Block -> Block
389rewriteBlockPermalinks u (Table a h b) = Table a (rewriteInlinePermalinks u <$> h)
390                                                 (fmap (fmap (rewriteInlinePermalinks u)) b)
391rewriteBlockPermalinks u (Para s) = Para $ rewriteInlinePermalinks u s
392rewriteBlockPermalinks u (Header i s) = Header i $ rewriteInlinePermalinks u s
393rewriteBlockPermalinks u (Blockquote bs) = Blockquote $ rewriteBlocksPermalinks u bs
394rewriteBlockPermalinks u (List ty spacing bss) = List ty spacing $ rewriteBlocksPermalinks u <$> bss
395rewriteBlockPermalinks _ b@(CodeBlock {}) = b
396rewriteBlockPermalinks _ b@(HTMLBlock {}) = b
397rewriteBlockPermalinks _ b@HRule = b
398
399-- | Locate post hyperlinks in the inline sequence and rewrite them as
400-- post permalinks.
401rewriteInlinePermalinks :: TeamBaseURL -> Inlines -> Inlines
402rewriteInlinePermalinks u (Inlines is) = Inlines $ rewriteInlinePermalink u <$> is
403
404-- | Locate post hyperlinks in the inline value and rewrite them as post
405-- permalinks.
406rewriteInlinePermalink :: TeamBaseURL -> Inline -> Inline
407rewriteInlinePermalink u i@(EHyperlink url label) =
408    case getPermalink u (unURL url) of
409        Nothing -> i
410        Just (tName, pId) ->
411            -- Get rid of permalink labels if they just match the URL,
412            -- because that's how Commonmark-extensions parses them. We
413            -- would rather only preserve the label if it differs from
414            -- the URL.
415            let newLabel = if label == Inlines (Seq.fromList [EText $ unURL url])
416                           then Nothing
417                           else Just label
418            in EPermalink tName pId newLabel
419rewriteInlinePermalink u (EEmph s) = EEmph $ rewriteInlinePermalinks u s
420rewriteInlinePermalink u (ECode s) = ECode $ rewriteInlinePermalinks u s
421rewriteInlinePermalink u (EStrikethrough s) = EStrikethrough $ rewriteInlinePermalinks u s
422rewriteInlinePermalink u (EStrong s) = EStrong $ rewriteInlinePermalinks u s
423rewriteInlinePermalink u (ENonBreaking s) = ENonBreaking $ rewriteInlinePermalinks u s
424rewriteInlinePermalink _ i@(EText {}) = i
425rewriteInlinePermalink _ i@ESpace = i
426rewriteInlinePermalink _ i@ESoftBreak = i
427rewriteInlinePermalink _ i@ELineBreak = i
428rewriteInlinePermalink _ i@(EEditSentinel {}) = i
429rewriteInlinePermalink _ i@(ERawHtml {}) = i
430rewriteInlinePermalink _ i@(EEmoji {}) = i
431rewriteInlinePermalink _ i@(EUser {}) = i
432rewriteInlinePermalink _ i@(EChannel {}) = i
433rewriteInlinePermalink _ i@(EImage {}) = i
434rewriteInlinePermalink _ i@(EPermalink {}) = i
435
436-- | Obtain all username references in a rich text document.
437findUsernames :: Blocks -> S.Set T.Text
438findUsernames (Blocks bs) = S.unions $ F.toList $ fmap blockFindUsernames bs
439
440blockFindUsernames :: Block -> S.Set T.Text
441blockFindUsernames (Para is) =
442    inlineFindUsernames $ F.toList $ unInlines is
443blockFindUsernames (Header _ is) =
444    inlineFindUsernames $ F.toList $ unInlines is
445blockFindUsernames (Blockquote bs) =
446    findUsernames bs
447blockFindUsernames (List _ _ bs) =
448    S.unions $ F.toList $ findUsernames <$> bs
449blockFindUsernames _ =
450    mempty
451
452inlineFindUsernames :: [Inline] -> S.Set T.Text
453inlineFindUsernames [] = mempty
454inlineFindUsernames (i : is) =
455    case i of
456        EUser u -> S.insert u $ inlineFindUsernames is
457        _ -> inlineFindUsernames is
458
459-- | Obtain all URLs (and optional labels) in a rich text block.
460blockGetURLs :: Block -> [(Either (TeamURLName, PostId) URL, Maybe Inlines)]
461blockGetURLs (Para is) =
462    catMaybes $ elementGetURL <$> (toList $ unInlines is)
463blockGetURLs (Header _ is) =
464    catMaybes $ elementGetURL <$> (toList $ unInlines is)
465blockGetURLs (Blockquote bs) =
466    mconcat $ blockGetURLs <$> toList (unBlocks bs)
467blockGetURLs (List _ _ bss) =
468    mconcat $ mconcat $
469    (fmap blockGetURLs . F.toList . unBlocks) <$> F.toList bss
470blockGetURLs _ =
471    mempty
472
473elementGetURL :: Inline -> Maybe (Either (TeamURLName, PostId) URL, Maybe Inlines)
474elementGetURL (EHyperlink url label) =
475    Just (Right url, Just label)
476elementGetURL (EImage url label) =
477    Just (Right url, Just label)
478elementGetURL (EPermalink tName pId label) =
479    Just (Left (tName, pId), label)
480elementGetURL _ =
481    Nothing
482
483-- | Find the first code block in a sequence of rich text blocks.
484findVerbatimChunk :: Blocks -> Maybe Text
485findVerbatimChunk (Blocks bs) = getFirst $ F.foldMap go bs
486  where go (CodeBlock _ t) = First (Just t)
487        go _               = First Nothing
488