1{-# LANGUAGE GeneralizedNewtypeDeriving #-} 2{-# LANGUAGE MultiParamTypeClasses #-} 3-- | This module provides a set of data types to represent message text. 4-- The inline and block types in this module are designed to represent 5-- most of what is found in Markdown documents (particularly the 6-- Commonmark specification) in addition to other things we find in 7-- Mattermost messages, such as username or channel references. 8-- 9-- To parse a Markdown document, use 'parseMarkdown'. To actually render 10-- text in this representation, see the module 'Draw.RichText'. 11module Matterhorn.Types.RichText 12 ( Blocks(..) 13 , unBlocks 14 15 , Block(..) 16 , sameBlockType 17 , CodeBlockInfo(..) 18 , Inline(..) 19 , Inlines(..) 20 , unInlines 21 22 , C.ListType(..) 23 , C.ListSpacing(..) 24 , C.EnumeratorType(..) 25 , C.DelimiterType(..) 26 , C.ColAlignment(..) 27 28 , TeamBaseURL(..) 29 , TeamURLName(..) 30 31 , URL(..) 32 , unURL 33 34 , parseMarkdown 35 36 , findUsernames 37 , blockGetURLs 38 , findVerbatimChunk 39 ) 40where 41 42import Prelude () 43import Matterhorn.Prelude 44 45import qualified Commonmark as C 46import qualified Commonmark.Extensions as C 47import qualified Commonmark.Inlines as C 48import qualified Commonmark.TokParsers as C 49import Control.Monad.Identity 50import qualified Data.Foldable as F 51import Data.List ( intersperse ) 52import Data.Monoid (First(..)) 53import qualified Data.Set as S 54import qualified Data.Sequence as Seq 55import Data.Sequence ( (<|), viewl, viewr, ViewL((:<)), ViewR((:>)) ) 56import qualified Data.Text as T 57import qualified Text.Parsec as P 58 59import Network.Mattermost.Types ( PostId(..), Id(..), ServerBaseURL(..) ) 60 61import Matterhorn.Constants ( userSigilChar, normalChannelSigilChar ) 62 63-- | A team name found in a Mattermost post URL 64data TeamURLName = TeamURLName Text 65 deriving (Eq, Show, Ord) 66 67-- | A server base URL with a team name. 68data TeamBaseURL = TeamBaseURL TeamURLName ServerBaseURL 69 deriving (Eq, Show) 70 71-- | A sequence of rich text blocks. 72newtype Blocks = Blocks (Seq Block) 73 deriving (Semigroup, Monoid, Show) 74 75unBlocks :: Blocks -> Seq Block 76unBlocks (Blocks bs) = bs 77 78singleB :: Block -> Blocks 79singleB = Blocks . Seq.singleton 80 81-- | A block in a rich text document. 82-- 83-- NOTE: update 'sameBlockType' when constructors are added to this 84-- type. 85data Block = 86 Para Inlines 87 -- ^ A paragraph. 88 | Header Int Inlines 89 -- ^ A section header with specified depth and contents. 90 | Blockquote Blocks 91 -- ^ A blockquote. 92 | List C.ListType C.ListSpacing (Seq Blocks) 93 -- ^ An itemized list. 94 | CodeBlock CodeBlockInfo Text 95 -- ^ A code block. 96 | HTMLBlock Text 97 -- ^ A fragment of raw HTML. 98 | HRule 99 -- ^ A horizontal rule. 100 | Table [C.ColAlignment] [Inlines] [[Inlines]] 101 -- ^ A table. 102 deriving (Show) 103 104-- | Returns whether two blocks have the same type. 105sameBlockType :: Block -> Block -> Bool 106sameBlockType (Para {}) (Para {}) = True 107sameBlockType (Header {}) (Header {}) = True 108sameBlockType (Blockquote {}) (Blockquote {}) = True 109sameBlockType (List {}) (List {}) = True 110sameBlockType (CodeBlock {}) (CodeBlock {}) = True 111sameBlockType (HTMLBlock {}) (HTMLBlock {}) = True 112sameBlockType _ _ = False 113 114-- | Information about a code block. 115data CodeBlockInfo = 116 CodeBlockInfo { codeBlockLanguage :: Maybe Text 117 -- ^ The language of the source code in the code 118 -- block, if any. This is encoded in Markdown as a 119 -- sequence of non-whitespace characters following the 120 -- fenced code block opening backticks. 121 , codeBlockInfo :: Maybe Text 122 -- ^ Any text that comes after the language token. 123 -- This text is separated from the language token by 124 -- whitespace. 125 } 126 deriving (Eq, Show, Ord) 127 128-- | A URL. 129newtype URL = URL Text 130 deriving (Eq, Show, Ord) 131 132unURL :: URL -> Text 133unURL (URL url) = url 134 135-- | The kinds of inline values that can appear in rich text blocks. 136data Inline = 137 EText Text 138 -- ^ Plain text that SHOULD be a contiguous sequence of 139 -- non-whitespace characters. 140 | EEmph Inlines 141 -- ^ Emphasized (usually italicized) content. 142 | EStrikethrough Inlines 143 -- ^ Strikethrough content. 144 | EStrong Inlines 145 -- ^ Boldface content. 146 | ECode Inlines 147 -- ^ A sequence of non-whitespace characters. 148 | ESpace 149 -- ^ A single space. 150 | ESoftBreak 151 -- ^ A soft line break. 152 | ELineBreak 153 -- ^ A hard line break. 154 | ERawHtml Text 155 -- ^ Raw HTML. 156 | EEditSentinel Bool 157 -- ^ A sentinel indicating that some text has been edited (used 158 -- to indicate that mattermost messages have been edited by their 159 -- authors). This has no parsable representation; it is only used 160 -- to annotate a message prior to rendering to add a visual editing 161 -- indicator. The boolean indicates whether the edit was "recent" 162 -- (True) or not (False). 163 | EUser Text 164 -- ^ A user reference. The text here includes only the username, not 165 -- the sigil. 166 | EChannel Text 167 -- ^ A channel reference. The text here includes only the channel 168 -- name, not the sigil. 169 | EHyperlink URL Inlines 170 -- ^ A hyperlink to the specified URL. Optionally provides an 171 -- element sequence indicating the URL's text label; if absent, the 172 -- label is understood to be the URL itself. 173 | EImage URL Inlines 174 -- ^ An image at the specified URL. Optionally provides an element 175 -- sequence indicating the image's "alt" text label; if absent, the 176 -- label is understood to be the URL itself. 177 | EEmoji Text 178 -- ^ An emoji reference. The text here includes only the text 179 -- portion, not the colons, e.g. "foo" instead of ":foo:". 180 | ENonBreaking Inlines 181 -- ^ A sequence of elements that must never be separated during line 182 -- wrapping. 183 | EPermalink TeamURLName PostId (Maybe Inlines) 184 -- ^ A permalink to the specified team (name) and post ID with an 185 -- optional label. 186 deriving (Show, Eq, Ord) 187 188-- | A sequence of inline values. 189newtype Inlines = Inlines (Seq Inline) 190 deriving (Monoid, Ord, Eq, Show) 191 192unInlines :: Inlines -> Seq Inline 193unInlines (Inlines is) = is 194 195singleI :: Inline -> Inlines 196singleI = Inlines . Seq.singleton 197 198instance Semigroup Inlines where 199 (Inlines l) <> (Inlines r) = 200 Inlines $ case (viewr l, viewl r) of 201 (lInit :> lLast, rHead :< rTail) -> 202 case (lLast, rHead) of 203 (EText a, EText b) -> 204 lInit <> ((EText $ a <> b) <| rTail) 205 (ECode a, ECode b) -> 206 lInit <> ((ECode $ a <> b) <| rTail) 207 (EEmph a, EEmph b) -> 208 lInit <> ((EEmph $ a <> b) <| rTail) 209 (EStrikethrough a, EStrikethrough b) -> 210 lInit <> ((EStrikethrough $ a <> b) <| rTail) 211 (EStrong a, EStrong b) -> 212 lInit <> ((EStrong $ a <> b) <| rTail) 213 (_, _) -> 214 l <> r 215 (_, _) -> l <> r 216 217-- A dummy instance just to satisfy commonmark; we don't use this. 218instance C.Rangeable Inlines where 219 ranged _ = id 220 221-- A dummy instance just to satisfy commonmark; we don't use this. 222instance C.HasAttributes Inlines where 223 addAttributes _ = id 224 225instance C.IsInline Inlines where 226 lineBreak = singleI ELineBreak 227 softBreak = singleI ESoftBreak 228 str t = Inlines $ Seq.fromList $ 229 filter (/= (EText "")) $ 230 intersperse ESpace $ EText <$> T.splitOn " " t 231 entity = singleI . EText 232 escapedChar = singleI . EText . T.singleton 233 emph = singleI . EEmph 234 strong = singleI . EStrong 235 link url _title desc = singleI $ EHyperlink (URL url) desc 236 image url _title desc = singleI $ EImage (URL url) desc 237 code t = singleI $ ECode $ C.str t 238 rawInline _ = singleI . ERawHtml 239 240instance C.HasStrikethrough Inlines where 241 strikethrough = singleI . EStrikethrough 242 243instance C.HasPipeTable Inlines Blocks where 244 pipeTable a h b = singleB $ Table a h b 245 246-- Syntax extension for parsing ~channel references. 247channelSpec :: (Monad m) => C.SyntaxSpec m Inlines Blocks 248channelSpec = 249 mempty { C.syntaxInlineParsers = [C.withAttributes parseChannel] 250 } 251 252parseChannel :: (Monad m) => C.InlineParser m Inlines 253parseChannel = P.try $ do 254 void $ C.symbol normalChannelSigilChar 255 let chunk = C.satisfyWord (const True) <|> C.symbol '_' <|> C.symbol '-' 256 cts <- P.many1 chunk 257 return $ singleI $ EChannel $ C.untokenize cts 258 259-- Syntax extension for parsing @username references. 260usernameSpec :: (Monad m) => C.SyntaxSpec m Inlines Blocks 261usernameSpec = 262 mempty { C.syntaxInlineParsers = [C.withAttributes parseUsername] 263 } 264 265parseUsername :: (Monad m) => C.InlineParser m Inlines 266parseUsername = P.try $ do 267 void $ C.symbol userSigilChar 268 let chunk = C.satisfyWord (const True) <|> C.symbol '_' <|> C.symbol '-' 269 [period] = C.tokenize "" "." 270 uts <- intersperse period <$> P.sepBy1 chunk (C.symbol '.') 271 return $ singleI $ EUser $ C.untokenize uts 272 273-- Syntax extension for parsing :emoji: references. 274-- 275-- NOTE: the commonmark-extensions package also provides a syntax 276-- extension for exactly this. Why don't we use it? I'm glad you asked. 277-- We don't use it because that extension actually checks to see whether 278-- emoji are valid by looking in a database (provided by the 'emojis' 279-- package). While that's actually a great feature, it is problematic 280-- when that package's emoji database does not exactly match the one 281-- that the Mattermost server uses. As a result, Matterhorn may think 282-- that some valid emoji (according to the server) is invalid (according 283-- to the 'emojis' package). Instead of using that extension, we made 284-- our own that does *not* validate the emoji references at parse time. 285-- We just parse them and keep them around, and then validate them at 286-- *render* time. That way we can allow anything to parse, but change 287-- how we render valid and invalid emoji based on a copy of the server's 288-- emoji database that we bundle with Matterhorn. 289emojiSpec :: (Monad m) => C.SyntaxSpec m Inlines Blocks 290emojiSpec = 291 mempty { C.syntaxInlineParsers = [C.withAttributes parseEmoji] 292 } 293 294parseEmoji :: (Monad m) => C.InlineParser m Inlines 295parseEmoji = P.try $ do 296 void $ C.symbol ':' 297 ts <- P.many1 $ C.satisfyWord (const True) 298 <|> C.symbol '_' 299 <|> C.symbol '+' 300 <|> C.symbol '-' 301 void $ C.symbol ':' 302 let kw = C.untokenize ts 303 return $ singleI $ EEmoji kw 304 305-- A dummy instance just to satisfy commonmark; we don't use this. 306instance C.HasAttributes Blocks where 307 addAttributes _ = id 308 309-- A dummy instance just to satisfy commonmark; we don't use this. 310instance C.Rangeable Blocks where 311 ranged _ = id 312 313instance C.IsBlock Inlines Blocks where 314 paragraph = singleB . Para 315 plain = singleB . Para 316 thematicBreak = singleB HRule 317 blockQuote = singleB . Blockquote 318 codeBlock infoTxt content = singleB $ CodeBlock (parseCodeBlockInfo infoTxt) content 319 heading level i = singleB $ Header level i 320 rawBlock _format content = singleB $ CodeBlock (parseCodeBlockInfo "") content 321 list ty spacing bs = singleB $ List ty spacing $ Seq.fromList bs 322 referenceLinkDefinition _label (_dest, _title) = mempty 323 324parseCodeBlockInfo :: Text -> CodeBlockInfo 325parseCodeBlockInfo t = CodeBlockInfo lang info 326 where 327 ws = T.words t 328 (lang, info) = case ws of 329 [l, i] -> (Just l, Just i) 330 [l] -> (Just l, Nothing) 331 _ -> (Nothing, Nothing) 332 333-- | Parse markdown input text to RichText. 334-- 335-- Note that this always returns a block sequence even if the input 336-- cannot be parsed. It isn't yet clear just how permissive the 337-- commonmark parser is, but so far we have not encountered any issues. 338-- If the input document is so broken that commonmark cannot parse it, 339-- we return an empty block sequence. 340parseMarkdown :: Maybe TeamBaseURL 341 -- ^ If provided, perform post link detection whenever a 342 -- hyperlink is parsed by checking to see if the post link 343 -- is a post in this Mattermost team 344 -> T.Text 345 -- ^ The markdown input text to parse 346 -> Blocks 347parseMarkdown mBaseUrl t = 348 let customSyntax = mconcat $ markdownExtensions <> [C.defaultSyntaxSpec] 349 markdownExtensions = 350 [ C.autolinkSpec 351 , C.strikethroughSpec 352 , C.pipeTableSpec 353 , usernameSpec 354 , channelSpec 355 , emojiSpec 356 ] 357 358 in case runIdentity $ C.commonmarkWith customSyntax "-" t of 359 Left _ -> mempty 360 Right bs -> case mBaseUrl of 361 Nothing -> bs 362 Just baseUrl -> rewriteBlocksPermalinks baseUrl bs 363 364-- | If the specified URL matches the active server base URL and team 365-- and refers to a post, extract the team name and post ID values and 366-- return them. 367getPermalink :: TeamBaseURL -> Text -> Maybe (TeamURLName, PostId) 368getPermalink (TeamBaseURL tName (ServerBaseURL baseUrl)) url = 369 let newBaseUrl = if "/" `T.isSuffixOf` baseUrl 370 then baseUrl 371 else baseUrl <> "/" 372 in if not $ newBaseUrl `T.isPrefixOf` url 373 then Nothing 374 else let rest = T.drop (T.length newBaseUrl) url 375 (tName', rawPIdStr) = T.breakOn "/pl/" rest 376 pIdStr = T.drop 4 rawPIdStr 377 in if tName == TeamURLName tName' && not (T.null pIdStr) 378 then Just (tName, PI $ Id pIdStr) 379 else Nothing 380 381-- | Locate post hyperlinks in the block sequence and rewrite them as 382-- post permalinks. 383rewriteBlocksPermalinks :: TeamBaseURL -> Blocks -> Blocks 384rewriteBlocksPermalinks u (Blocks bs) = Blocks $ rewriteBlockPermalinks u <$> bs 385 386-- | Locate post hyperlinks in the block and rewrite them as post 387-- permalinks. 388rewriteBlockPermalinks :: TeamBaseURL -> Block -> Block 389rewriteBlockPermalinks u (Table a h b) = Table a (rewriteInlinePermalinks u <$> h) 390 (fmap (fmap (rewriteInlinePermalinks u)) b) 391rewriteBlockPermalinks u (Para s) = Para $ rewriteInlinePermalinks u s 392rewriteBlockPermalinks u (Header i s) = Header i $ rewriteInlinePermalinks u s 393rewriteBlockPermalinks u (Blockquote bs) = Blockquote $ rewriteBlocksPermalinks u bs 394rewriteBlockPermalinks u (List ty spacing bss) = List ty spacing $ rewriteBlocksPermalinks u <$> bss 395rewriteBlockPermalinks _ b@(CodeBlock {}) = b 396rewriteBlockPermalinks _ b@(HTMLBlock {}) = b 397rewriteBlockPermalinks _ b@HRule = b 398 399-- | Locate post hyperlinks in the inline sequence and rewrite them as 400-- post permalinks. 401rewriteInlinePermalinks :: TeamBaseURL -> Inlines -> Inlines 402rewriteInlinePermalinks u (Inlines is) = Inlines $ rewriteInlinePermalink u <$> is 403 404-- | Locate post hyperlinks in the inline value and rewrite them as post 405-- permalinks. 406rewriteInlinePermalink :: TeamBaseURL -> Inline -> Inline 407rewriteInlinePermalink u i@(EHyperlink url label) = 408 case getPermalink u (unURL url) of 409 Nothing -> i 410 Just (tName, pId) -> 411 -- Get rid of permalink labels if they just match the URL, 412 -- because that's how Commonmark-extensions parses them. We 413 -- would rather only preserve the label if it differs from 414 -- the URL. 415 let newLabel = if label == Inlines (Seq.fromList [EText $ unURL url]) 416 then Nothing 417 else Just label 418 in EPermalink tName pId newLabel 419rewriteInlinePermalink u (EEmph s) = EEmph $ rewriteInlinePermalinks u s 420rewriteInlinePermalink u (ECode s) = ECode $ rewriteInlinePermalinks u s 421rewriteInlinePermalink u (EStrikethrough s) = EStrikethrough $ rewriteInlinePermalinks u s 422rewriteInlinePermalink u (EStrong s) = EStrong $ rewriteInlinePermalinks u s 423rewriteInlinePermalink u (ENonBreaking s) = ENonBreaking $ rewriteInlinePermalinks u s 424rewriteInlinePermalink _ i@(EText {}) = i 425rewriteInlinePermalink _ i@ESpace = i 426rewriteInlinePermalink _ i@ESoftBreak = i 427rewriteInlinePermalink _ i@ELineBreak = i 428rewriteInlinePermalink _ i@(EEditSentinel {}) = i 429rewriteInlinePermalink _ i@(ERawHtml {}) = i 430rewriteInlinePermalink _ i@(EEmoji {}) = i 431rewriteInlinePermalink _ i@(EUser {}) = i 432rewriteInlinePermalink _ i@(EChannel {}) = i 433rewriteInlinePermalink _ i@(EImage {}) = i 434rewriteInlinePermalink _ i@(EPermalink {}) = i 435 436-- | Obtain all username references in a rich text document. 437findUsernames :: Blocks -> S.Set T.Text 438findUsernames (Blocks bs) = S.unions $ F.toList $ fmap blockFindUsernames bs 439 440blockFindUsernames :: Block -> S.Set T.Text 441blockFindUsernames (Para is) = 442 inlineFindUsernames $ F.toList $ unInlines is 443blockFindUsernames (Header _ is) = 444 inlineFindUsernames $ F.toList $ unInlines is 445blockFindUsernames (Blockquote bs) = 446 findUsernames bs 447blockFindUsernames (List _ _ bs) = 448 S.unions $ F.toList $ findUsernames <$> bs 449blockFindUsernames _ = 450 mempty 451 452inlineFindUsernames :: [Inline] -> S.Set T.Text 453inlineFindUsernames [] = mempty 454inlineFindUsernames (i : is) = 455 case i of 456 EUser u -> S.insert u $ inlineFindUsernames is 457 _ -> inlineFindUsernames is 458 459-- | Obtain all URLs (and optional labels) in a rich text block. 460blockGetURLs :: Block -> [(Either (TeamURLName, PostId) URL, Maybe Inlines)] 461blockGetURLs (Para is) = 462 catMaybes $ elementGetURL <$> (toList $ unInlines is) 463blockGetURLs (Header _ is) = 464 catMaybes $ elementGetURL <$> (toList $ unInlines is) 465blockGetURLs (Blockquote bs) = 466 mconcat $ blockGetURLs <$> toList (unBlocks bs) 467blockGetURLs (List _ _ bss) = 468 mconcat $ mconcat $ 469 (fmap blockGetURLs . F.toList . unBlocks) <$> F.toList bss 470blockGetURLs _ = 471 mempty 472 473elementGetURL :: Inline -> Maybe (Either (TeamURLName, PostId) URL, Maybe Inlines) 474elementGetURL (EHyperlink url label) = 475 Just (Right url, Just label) 476elementGetURL (EImage url label) = 477 Just (Right url, Just label) 478elementGetURL (EPermalink tName pId label) = 479 Just (Left (tName, pId), label) 480elementGetURL _ = 481 Nothing 482 483-- | Find the first code block in a sequence of rich text blocks. 484findVerbatimChunk :: Blocks -> Maybe Text 485findVerbatimChunk (Blocks bs) = getFirst $ F.foldMap go bs 486 where go (CodeBlock _ t) = First (Just t) 487 go _ = First Nothing 488