1{-# OPTIONS_GHC -funbox-strict-fields #-} 2{-# LANGUAGE BangPatterns #-} 3{-# LANGUAGE LambdaCase #-} 4{-# LANGUAGE TupleSections #-} 5-- | 6-- Module : Data.Unicode.Internal.NormalizeStream 7-- Copyright : (c) 2016 Harendra Kumar 8-- (c) 2020 Andrew Lelechenko 9-- 10-- License : BSD-3-Clause 11-- Maintainer : harendra.kumar@gmail.com 12-- Stability : experimental 13-- 14-- Stream based normalization. 15-- 16module Data.Unicode.Internal.NormalizeStream 17 ( 18 D.DecomposeMode(..) 19 , stream 20 , unstream 21 , unstreamC 22 ) 23 where 24 25import Data.Char (chr, ord) 26import qualified Data.Text.Array as A 27import Data.Text.Internal (Text (..)) 28import qualified Data.Text.Internal.Encoding.Utf16 as U16 29import Data.Text.Internal.Fusion.Size (betweenSize, 30 upperBound) 31import Data.Text.Internal.Fusion.Types (Step (..), Stream (..)) 32import Data.Text.Internal.Private (runText) 33import Data.Text.Internal.Unsafe.Char (unsafeWrite) 34import Data.Text.Internal.Unsafe.Char (unsafeChr) 35import Data.Text.Internal.Unsafe.Shift (shiftR) 36import GHC.ST (ST (..)) 37import GHC.Types (SPEC(..)) 38 39import qualified Data.Unicode.Properties.CombiningClass as CC 40import qualified Data.Unicode.Properties.Compositions as C 41import qualified Data.Unicode.Properties.Decompose as D 42import qualified Data.Unicode.Properties.DecomposeHangul as H 43 44------------------------------------------------------------------------------- 45-- Reorder buffer to hold characters till the next starter boundary 46------------------------------------------------------------------------------- 47 48-- | A list of combining characters, ordered by 'CC.getCombiningClass'. 49-- Couple of top levels are unrolled and unpacked for efficiency. 50data ReBuf = Empty | One !Char | Many !Char !Char ![Char] 51 52{-# INLINE insertIntoReBuf #-} 53insertIntoReBuf :: Char -> ReBuf -> ReBuf 54insertIntoReBuf c Empty = One c 55insertIntoReBuf c (One c0) 56 | CC.getCombiningClass c < CC.getCombiningClass c0 57 = Many c c0 [] 58 | otherwise 59 = Many c0 c [] 60insertIntoReBuf c (Many c0 c1 cs) 61 | cc < CC.getCombiningClass c0 62 = Many c c0 (c1 : cs) 63 | cc < CC.getCombiningClass c1 64 = Many c0 c (c1 : cs) 65 | otherwise 66 = Many c0 c1 (cs' ++ (c : cs'')) 67 where 68 cc = CC.getCombiningClass c 69 (cs', cs'') = span ((<= cc) . CC.getCombiningClass) cs 70 71writeStr :: A.MArray s -> Int -> [Char] -> ST s Int 72writeStr marr di str = go di str 73 where 74 go i [] = return i 75 go i (c : cs) = do 76 n <- unsafeWrite marr i c 77 go (i + n) cs 78 79{-# INLINE writeReorderBuffer #-} 80writeReorderBuffer :: A.MArray s -> Int -> ReBuf -> ST s Int 81writeReorderBuffer _ di Empty = return di 82 83writeReorderBuffer marr di (One c) = do 84 n <- unsafeWrite marr di c 85 return (di + n) 86 87writeReorderBuffer marr di (Many c1 c2 str) = do 88 n1 <- unsafeWrite marr di c1 89 n2 <- unsafeWrite marr (di + n1) c2 90 writeStr marr (di + n1 + n2) str 91 92------------------------------------------------------------------------------- 93-- Decomposition of Hangul characters is done algorithmically 94------------------------------------------------------------------------------- 95 96-- {-# INLINE decomposeCharHangul #-} 97decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s Int 98decomposeCharHangul marr j c = 99 if t == chr H.jamoTFirst then do 100 n1 <- unsafeWrite marr j l 101 n2 <- unsafeWrite marr (j + n1) v 102 return (j + n1 + n2) 103 else do 104 n1 <- unsafeWrite marr j l 105 n2 <- unsafeWrite marr (j + n1) v 106 n3 <- unsafeWrite marr (j + n1 + n2) t 107 return (j + n1 + n2 + n3) 108 where 109 (l, v, t) = D.decomposeCharHangul c 110 111{-# INLINE decomposeChar #-} 112decomposeChar 113 :: D.DecomposeMode 114 -> A.MArray s -- destination array for decomposition 115 -> Int -- array index 116 -> ReBuf -- reorder buffer 117 -> Char -- char to be decomposed 118 -> ST s (Int, ReBuf) 119decomposeChar mode marr index reBuf ch 120 | D.isHangul ch = do 121 j <- writeReorderBuffer marr index reBuf 122 (, Empty) <$> decomposeCharHangul marr j ch 123 | D.isDecomposable mode ch = 124 decomposeAll marr index reBuf (D.decomposeChar mode ch) 125 | otherwise = 126 reorder marr index reBuf ch 127 128 where 129 130 {-# INLINE decomposeAll #-} 131 decomposeAll _ i rbuf [] = return (i, rbuf) 132 decomposeAll arr i rbuf (x : xs) 133 | D.isDecomposable mode x = do 134 (i', rbuf') <- decomposeAll arr i rbuf (D.decomposeChar mode x) 135 decomposeAll arr i' rbuf' xs 136 | otherwise = do 137 (i', rbuf') <- reorder arr i rbuf x 138 decomposeAll arr i' rbuf' xs 139 140 {-# INLINE reorder #-} 141 reorder arr i rbuf c 142 | CC.isCombining c = return (i, insertIntoReBuf c rbuf) 143 | otherwise = do 144 j <- writeReorderBuffer arr i rbuf 145 n <- unsafeWrite arr j c 146 return (j + n, Empty) 147 148-- | /O(n)/ Convert a 'Text' into a 'Stream Char'. 149stream :: Text -> Stream Char 150stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len) 151 where 152 !end = off+len 153 {-# INLINE next #-} 154 next !i 155 | i >= end = Done 156 -- shift generates only two branches instead of three in case of 157 -- range check, works quite a bit faster with llvm backend. 158 | (n `shiftR` 10) == 0x36 = Yield (U16.chr2 n n2) (i + 2) 159 | otherwise = Yield (unsafeChr n) (i + 1) 160 where 161 n = A.unsafeIndex arr i 162 n2 = A.unsafeIndex arr (i + 1) 163{-# INLINE [0] stream #-} 164 165-- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'. 166unstream :: D.DecomposeMode -> Stream Char -> Text 167unstream mode (Stream next0 s0 len) = runText $ \done -> do 168 -- Before encoding each char we perform a buffer realloc check assuming 169 -- worst case encoding size of two 16-bit units for the char. Just add an 170 -- extra space to the buffer so that we do not end up reallocating even when 171 -- all the chars are encoded as single unit. 172 let margin = 1 + maxDecomposeLen 173 mlen = (upperBound 4 len + margin) 174 arr0 <- A.new mlen 175 let outer !arr !maxi = encode 176 where 177 -- keep the common case loop as small as possible 178 encode !si !di rbuf = 179 -- simply check for the worst case 180 if maxi < di + margin 181 then realloc si di rbuf 182 else 183 case next0 si of 184 Done -> do 185 di' <- writeReorderBuffer arr di rbuf 186 done arr di' 187 Skip si' -> encode si' di rbuf 188 Yield c si' -> do 189 (di', rbuf') <- decomposeChar mode arr di rbuf c 190 encode si' di' rbuf' 191 -- n <- unsafeWrite arr di c 192 -- encode si' (di + n) rbuf 193 194 -- keep uncommon case separate from the common case code 195 {-# NOINLINE realloc #-} 196 realloc !si !di rbuf = do 197 let newlen = maxi * 2 198 arr' <- A.new newlen 199 A.copyM arr' 0 arr 0 di 200 outer arr' (newlen - 1) si di rbuf 201 202 outer arr0 (mlen - 1) s0 0 Empty 203{-# INLINE [0] unstream #-} 204 205-- we can generate this from UCD 206maxDecomposeLen :: Int 207maxDecomposeLen = 32 208 209------------------------------------------------------------------------------- 210-- Composition 211------------------------------------------------------------------------------- 212 213-- If we are composing we do not need to first decompose Hangul. We can just 214-- compose assuming there could be some partially composed syllables e.g. LV 215-- syllable followed by a jamo T. We need to compose this case as well. 216 217-- Hold an L to wait for V, hold an LV to wait for T. 218data JamoBuf 219 = Jamo !Char -- Jamo L, V or T 220 | Hangul !Char -- Hangul Syllable LV or LVT 221 | HangulLV !Char 222 223data RegBuf 224 = RegOne !Char 225 | RegMany !Char !Char ![Char] 226 227data ComposeState 228 = ComposeNone 229 | ComposeReg !RegBuf 230 | ComposeJamo !JamoBuf 231 232------------------------------------------------------------------------------- 233-- Composition of Jamo into Hangul syllables, done algorithmically 234------------------------------------------------------------------------------- 235 236{-# INLINE writeJamoBuf #-} 237writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int 238writeJamoBuf arr i jbuf = do 239 n <- unsafeWrite arr i (getCh jbuf) 240 return (i + n) 241 242 where 243 244 getCh (Jamo ch) = ch 245 getCh (Hangul ch) = ch 246 getCh (HangulLV ch) = ch 247 248{-# INLINE initHangul #-} 249initHangul :: Char -> Int -> ST s (Int, ComposeState) 250initHangul c i = return (i, ComposeJamo (Hangul c)) 251 252{-# INLINE initJamo #-} 253initJamo :: Char -> Int -> ST s (Int, ComposeState) 254initJamo c i = return (i, ComposeJamo (Jamo c)) 255 256{-# INLINE insertJamo #-} 257insertJamo 258 :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) 259insertJamo arr i jbuf ch 260 | ich <= H.jamoLLast = do 261 j <- writeJamoBuf arr i jbuf 262 return (j, ComposeJamo (Jamo ch)) 263 | ich < H.jamoVFirst = 264 flushAndWrite arr i jbuf ch 265 | ich <= H.jamoVLast = do 266 case jbuf of 267 Jamo c -> 268 case H.jamoLIndex c of 269 Just li -> 270 let vi = ich - H.jamoVFirst 271 lvi = li * H.jamoNCount + vi * H.jamoTCount 272 lv = chr (H.hangulFirst + lvi) 273 in return (i, ComposeJamo (HangulLV lv)) 274 Nothing -> writeTwo arr i c ch 275 Hangul c -> writeTwo arr i c ch 276 HangulLV c -> writeTwo arr i c ch 277 | ich <= H.jamoTFirst = do 278 flushAndWrite arr i jbuf ch 279 | otherwise = do 280 let ti = ich - H.jamoTFirst 281 case jbuf of 282 Jamo c -> writeTwo arr i c ch 283 Hangul c 284 | H.isHangulLV c -> do 285 writeLVT arr i c ti 286 | otherwise -> 287 writeTwo arr i c ch 288 HangulLV c -> 289 writeLVT arr i c ti 290 291 where 292 293 ich = ord ch 294 295 {-# INLINE flushAndWrite #-} 296 flushAndWrite marr ix jb c = do 297 j <- writeJamoBuf marr ix jb 298 n <- unsafeWrite marr j c 299 return (j + n, ComposeNone) 300 301 {-# INLINE writeLVT #-} 302 writeLVT marr ix lv ti = do 303 n <- unsafeWrite marr ix (chr ((ord lv) + ti)) 304 return (ix + n, ComposeNone) 305 306 {-# INLINE writeTwo #-} 307 writeTwo marr ix c1 c2 = do 308 n <- unsafeWrite marr ix c1 309 m <- unsafeWrite marr (ix + n) c2 310 return ((ix + n + m), ComposeNone) 311 312{-# INLINE insertHangul #-} 313insertHangul 314 :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState) 315insertHangul arr i jbuf ch = do 316 j <- writeJamoBuf arr i jbuf 317 return (j, ComposeJamo (Hangul ch)) 318 319{-# INLINE insertIntoRegBuf #-} 320insertIntoRegBuf :: Char -> RegBuf -> RegBuf 321insertIntoRegBuf c (RegOne c0) 322 | CC.getCombiningClass c < CC.getCombiningClass c0 323 = RegMany c c0 [] 324 | otherwise 325 = RegMany c0 c [] 326insertIntoRegBuf c (RegMany c0 c1 cs) 327 | cc < CC.getCombiningClass c0 328 = RegMany c c0 (c1 : cs) 329 | cc < CC.getCombiningClass c1 330 = RegMany c0 c (c1 : cs) 331 | otherwise 332 = RegMany c0 c1 (cs' ++ (c : cs'')) 333 where 334 cc = CC.getCombiningClass c 335 (cs', cs'') = span ((<= cc) . CC.getCombiningClass) cs 336 337{-# INLINE writeRegBuf #-} 338writeRegBuf :: A.MArray s -> Int -> RegBuf -> ST s Int 339writeRegBuf arr i = \case 340 RegOne c -> do 341 n <- unsafeWrite arr i c 342 return (i + n) 343 RegMany st c [] -> 344 case C.composePair st c of 345 Just x -> do 346 n <- unsafeWrite arr i x 347 return (i + n) 348 Nothing -> do 349 n <- unsafeWrite arr i st 350 m <- unsafeWrite arr (i + n) c 351 return (i + n + m) 352 RegMany st0 c0 cs0 -> go [] st0 (c0 : cs0) 353 354 where 355 356 -- arguments: uncombined chars, starter, unprocessed str 357 go uncs st [] = writeStr arr i (st : uncs) 358 go uncs st (c : cs) = case C.composePair st c of 359 Nothing -> go (uncs ++ (c : same)) st bigger 360 Just x -> go uncs x cs 361 where 362 cc = CC.getCombiningClass c 363 (same, bigger) = span ((== cc) . CC.getCombiningClass) cs 364 365{-# INLINE flushComposeState #-} 366flushComposeState :: A.MArray s -> Int -> ComposeState -> ST s Int 367flushComposeState arr i = \case 368 ComposeNone -> pure i 369 ComposeReg rbuf -> writeRegBuf arr i rbuf 370 ComposeJamo jbuf -> writeJamoBuf arr i jbuf 371 372{-# INLINE composeChar #-} 373composeChar 374 :: D.DecomposeMode 375 -> A.MArray s -- destination array for composition 376 -> Char -- input char 377 -> Int -- array index 378 -> ComposeState 379 -> ST s (Int, ComposeState) 380composeChar mode marr = go0 381 382 where 383 384 go0 ch !i !st = 385 case st of 386 ComposeReg rbuf 387 | ich < H.jamoLFirst -> 388 composeReg rbuf ch i st 389 | ich <= H.jamoLast -> do 390 j <- writeRegBuf marr i rbuf 391 initJamo ch j 392 | ich < H.hangulFirst -> 393 composeReg rbuf ch i st 394 | ich <= H.hangulLast -> do 395 j <- writeRegBuf marr i rbuf 396 initHangul ch j 397 | otherwise -> 398 composeReg rbuf ch i st 399 ComposeJamo jbuf 400 | ich < H.jamoLFirst -> do 401 jamoToReg marr i jbuf ch 402 | ich <= H.jamoLast -> do 403 insertJamo marr i jbuf ch 404 | ich < H.hangulFirst -> 405 jamoToReg marr i jbuf ch 406 | ich <= H.hangulLast -> do 407 insertHangul marr i jbuf ch 408 | otherwise -> 409 jamoToReg marr i jbuf ch 410 ComposeNone 411 | ich < H.jamoLFirst -> 412 initReg ch i 413 | ich <= H.jamoLast -> 414 initJamo ch i 415 | ich < H.hangulFirst -> 416 initReg ch i 417 | ich <= H.hangulLast -> 418 initHangul ch i 419 | otherwise -> 420 initReg ch i 421 where ich = ord ch 422 423 {-# INLINE jamoToReg #-} 424 jamoToReg arr i jbuf ch = do 425 j <- writeJamoBuf arr i jbuf 426 initReg ch j 427 428 {-# INLINE initReg #-} 429 initReg !ch !i 430 | D.isDecomposable mode ch = 431 go (D.decomposeChar mode ch) i ComposeNone 432 | otherwise = 433 pure (i, ComposeReg (RegOne ch)) 434 435 {-# INLINE composeReg #-} 436 composeReg rbuf !ch !i !st 437 | D.isDecomposable mode ch = 438 go (D.decomposeChar mode ch) i st 439 | CC.isCombining ch = do 440 pure (i, ComposeReg (insertIntoRegBuf ch rbuf)) 441 -- The first char in RegBuf may or may not be a starter. In 442 -- case it is not we rely on composeStarterPair failing. 443 | RegOne s <- rbuf 444 , C.isSecondStarter ch 445 , Just x <- C.composeStarterPair s ch = 446 pure (i, (ComposeReg (RegOne x))) 447 | otherwise = do 448 j <- writeRegBuf marr i rbuf 449 pure (j, ComposeReg (RegOne ch)) 450 451 go [] !i !st = pure (i, st) 452 go (ch : rest) i st = 453 case st of 454 ComposeReg rbuf 455 | H.isHangul ch -> do 456 j <- writeRegBuf marr i rbuf 457 (k, s) <- initHangul ch j 458 go rest k s 459 | H.isJamo ch -> do 460 j <- writeRegBuf marr i rbuf 461 (k, s) <- initJamo ch j 462 go rest k s 463 | D.isDecomposable mode ch -> 464 go (D.decomposeChar mode ch ++ rest) i st 465 | CC.isCombining ch -> do 466 go rest i (ComposeReg (insertIntoRegBuf ch rbuf)) 467 | RegOne s <- rbuf 468 , C.isSecondStarter ch 469 , Just x <- C.composeStarterPair s ch -> 470 go rest i (ComposeReg (RegOne x)) 471 | otherwise -> do 472 j <- writeRegBuf marr i rbuf 473 go rest j (ComposeReg (RegOne ch)) 474 ComposeJamo jbuf 475 | H.isJamo ch -> do 476 (j, s) <- insertJamo marr i jbuf ch 477 go rest j s 478 | H.isHangul ch -> do 479 (j, s) <- insertHangul marr i jbuf ch 480 go rest j s 481 | otherwise -> do 482 j <- writeJamoBuf marr i jbuf 483 case () of 484 _ 485 | D.isDecomposable mode ch -> 486 go (D.decomposeChar mode ch ++ rest) j 487 ComposeNone 488 | otherwise -> 489 go rest j (ComposeReg (RegOne ch)) 490 ComposeNone 491 | H.isHangul ch -> do 492 (j, s) <- initHangul ch i 493 go rest j s 494 | H.isJamo ch -> do 495 (j, s) <- initJamo ch i 496 go rest j s 497 | D.isDecomposable mode ch -> 498 go (D.decomposeChar mode ch ++ rest) i st 499 | otherwise -> 500 go rest i (ComposeReg (RegOne ch)) 501 502-- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'. 503unstreamC :: D.DecomposeMode -> Stream Char -> Text 504unstreamC mode (Stream next0 s0 len) = runText $ \done -> do 505 -- Before encoding each char we perform a buffer realloc check assuming 506 -- worst case encoding size of two 16-bit units for the char. Just add an 507 -- extra space to the buffer so that we do not end up reallocating even when 508 -- all the chars are encoded as single unit. 509 let margin = 1 + maxDecomposeLen 510 mlen = (upperBound 4 len + margin) 511 arr0 <- A.new mlen 512 let outer !arr !maxi = encode SPEC 513 where 514 -- keep the common case loop as small as possible 515 encode !_ !si !di st = 516 -- simply check for the worst case 517 if maxi < di + margin 518 then realloc si di st 519 else 520 case next0 si of 521 Done -> do 522 di' <- flushComposeState arr di st 523 done arr di' 524 Skip si' -> encode SPEC si' di st 525 Yield c si' -> do 526 (di', st') <- composeChar mode arr c di st 527 encode SPEC si' di' st' 528 529 -- keep uncommon case separate from the common case code 530 {-# NOINLINE realloc #-} 531 realloc !si !di st = do 532 let newlen = maxi * 2 533 arr' <- A.new newlen 534 A.copyM arr' 0 arr 0 di 535 outer arr' (newlen - 1) si di st 536 537 outer arr0 (mlen - 1) s0 0 ComposeNone 538{-# INLINE [0] unstreamC #-} 539