1{-# OPTIONS_GHC -funbox-strict-fields #-}
2{-# LANGUAGE BangPatterns #-}
3{-# LANGUAGE LambdaCase #-}
4{-# LANGUAGE TupleSections #-}
5-- |
6-- Module      : Data.Unicode.Internal.NormalizeStream
7-- Copyright   : (c) 2016 Harendra Kumar
8--               (c) 2020 Andrew Lelechenko
9--
10-- License     : BSD-3-Clause
11-- Maintainer  : harendra.kumar@gmail.com
12-- Stability   : experimental
13--
14-- Stream based normalization.
15--
16module Data.Unicode.Internal.NormalizeStream
17    (
18      D.DecomposeMode(..)
19    , stream
20    , unstream
21    , unstreamC
22    )
23    where
24
25import           Data.Char                              (chr, ord)
26import qualified Data.Text.Array                        as A
27import           Data.Text.Internal                     (Text (..))
28import qualified Data.Text.Internal.Encoding.Utf16      as U16
29import           Data.Text.Internal.Fusion.Size         (betweenSize,
30                                                         upperBound)
31import           Data.Text.Internal.Fusion.Types        (Step (..), Stream (..))
32import           Data.Text.Internal.Private             (runText)
33import           Data.Text.Internal.Unsafe.Char         (unsafeWrite)
34import           Data.Text.Internal.Unsafe.Char         (unsafeChr)
35import           Data.Text.Internal.Unsafe.Shift        (shiftR)
36import           GHC.ST                                 (ST (..))
37import           GHC.Types                              (SPEC(..))
38
39import qualified Data.Unicode.Properties.CombiningClass  as CC
40import qualified Data.Unicode.Properties.Compositions    as C
41import qualified Data.Unicode.Properties.Decompose       as D
42import qualified Data.Unicode.Properties.DecomposeHangul as H
43
44-------------------------------------------------------------------------------
45-- Reorder buffer to hold characters till the next starter boundary
46-------------------------------------------------------------------------------
47
48-- | A list of combining characters, ordered by 'CC.getCombiningClass'.
49-- Couple of top levels are unrolled and unpacked for efficiency.
50data ReBuf = Empty | One !Char | Many !Char !Char ![Char]
51
52{-# INLINE insertIntoReBuf #-}
53insertIntoReBuf :: Char -> ReBuf -> ReBuf
54insertIntoReBuf c Empty = One c
55insertIntoReBuf c (One c0)
56    | CC.getCombiningClass c < CC.getCombiningClass c0
57    = Many c c0 []
58    | otherwise
59    = Many c0 c []
60insertIntoReBuf c (Many c0 c1 cs)
61    | cc < CC.getCombiningClass c0
62    = Many c c0 (c1 : cs)
63    | cc < CC.getCombiningClass c1
64    = Many c0 c (c1 : cs)
65    | otherwise
66    = Many c0 c1 (cs' ++ (c : cs''))
67    where
68        cc = CC.getCombiningClass c
69        (cs', cs'') = span ((<= cc) . CC.getCombiningClass) cs
70
71writeStr :: A.MArray s -> Int -> [Char] -> ST s Int
72writeStr marr di str = go di str
73    where
74        go i [] = return i
75        go i (c : cs) = do
76            n <- unsafeWrite marr i c
77            go (i + n) cs
78
79{-# INLINE writeReorderBuffer #-}
80writeReorderBuffer :: A.MArray s -> Int -> ReBuf -> ST s Int
81writeReorderBuffer _ di Empty = return di
82
83writeReorderBuffer marr di (One c) = do
84    n <- unsafeWrite marr di c
85    return (di + n)
86
87writeReorderBuffer marr di (Many c1 c2 str) = do
88    n1 <- unsafeWrite marr di c1
89    n2 <- unsafeWrite marr (di + n1) c2
90    writeStr marr (di + n1 + n2) str
91
92-------------------------------------------------------------------------------
93-- Decomposition of Hangul characters is done algorithmically
94-------------------------------------------------------------------------------
95
96-- {-# INLINE decomposeCharHangul #-}
97decomposeCharHangul :: A.MArray s -> Int -> Char -> ST s Int
98decomposeCharHangul marr j c =
99    if t == chr H.jamoTFirst then do
100        n1 <- unsafeWrite marr j l
101        n2 <- unsafeWrite marr (j + n1) v
102        return (j + n1 + n2)
103    else do
104        n1 <- unsafeWrite marr j l
105        n2 <- unsafeWrite marr (j + n1) v
106        n3 <- unsafeWrite marr (j + n1 + n2) t
107        return (j + n1 + n2 + n3)
108    where
109        (l, v, t) = D.decomposeCharHangul c
110
111{-# INLINE decomposeChar #-}
112decomposeChar
113    :: D.DecomposeMode
114    -> A.MArray s       -- destination array for decomposition
115    -> Int              -- array index
116    -> ReBuf            -- reorder buffer
117    -> Char             -- char to be decomposed
118    -> ST s (Int, ReBuf)
119decomposeChar mode marr index reBuf ch
120    | D.isHangul ch = do
121        j <- writeReorderBuffer marr index reBuf
122        (, Empty) <$> decomposeCharHangul marr j ch
123    | D.isDecomposable mode ch =
124        decomposeAll marr index reBuf (D.decomposeChar mode ch)
125    | otherwise =
126        reorder marr index reBuf ch
127
128    where
129
130    {-# INLINE decomposeAll #-}
131    decomposeAll _ i rbuf [] = return (i, rbuf)
132    decomposeAll arr i rbuf (x : xs)
133        | D.isDecomposable mode x = do
134            (i', rbuf') <- decomposeAll arr i rbuf (D.decomposeChar mode x)
135            decomposeAll arr i' rbuf' xs
136        | otherwise  = do
137            (i', rbuf') <- reorder arr i rbuf x
138            decomposeAll arr i' rbuf' xs
139
140    {-# INLINE reorder #-}
141    reorder arr i rbuf c
142        | CC.isCombining c = return (i, insertIntoReBuf c rbuf)
143        | otherwise = do
144            j <- writeReorderBuffer arr i rbuf
145            n <- unsafeWrite arr j c
146            return (j + n, Empty)
147
148-- | /O(n)/ Convert a 'Text' into a 'Stream Char'.
149stream :: Text -> Stream Char
150stream (Text arr off len) = Stream next off (betweenSize (len `shiftR` 1) len)
151    where
152      !end = off+len
153      {-# INLINE next #-}
154      next !i
155          | i >= end                   = Done
156          -- shift generates only two branches instead of three in case of
157          -- range check, works quite a bit faster with llvm backend.
158          | (n `shiftR` 10) == 0x36    = Yield (U16.chr2 n n2) (i + 2)
159          | otherwise                  = Yield (unsafeChr n) (i + 1)
160          where
161            n  = A.unsafeIndex arr i
162            n2 = A.unsafeIndex arr (i + 1)
163{-# INLINE [0] stream #-}
164
165-- | /O(n)/ Convert a 'Stream Char' into a decompose-normalized 'Text'.
166unstream :: D.DecomposeMode -> Stream Char -> Text
167unstream mode (Stream next0 s0 len) = runText $ \done -> do
168  -- Before encoding each char we perform a buffer realloc check assuming
169  -- worst case encoding size of two 16-bit units for the char. Just add an
170  -- extra space to the buffer so that we do not end up reallocating even when
171  -- all the chars are encoded as single unit.
172  let margin = 1 + maxDecomposeLen
173      mlen = (upperBound 4 len + margin)
174  arr0 <- A.new mlen
175  let outer !arr !maxi = encode
176       where
177        -- keep the common case loop as small as possible
178        encode !si !di rbuf =
179            -- simply check for the worst case
180            if maxi < di + margin
181            then realloc si di rbuf
182            else
183                case next0 si of
184                    Done -> do
185                        di' <- writeReorderBuffer arr di rbuf
186                        done arr di'
187                    Skip si'    -> encode si' di rbuf
188                    Yield c si' -> do
189                                (di', rbuf') <- decomposeChar mode arr di rbuf c
190                                encode si' di' rbuf'
191                                -- n <- unsafeWrite arr di c
192                                -- encode si' (di + n) rbuf
193
194        -- keep uncommon case separate from the common case code
195        {-# NOINLINE realloc #-}
196        realloc !si !di rbuf = do
197            let newlen = maxi * 2
198            arr' <- A.new newlen
199            A.copyM arr' 0 arr 0 di
200            outer arr' (newlen - 1) si di rbuf
201
202  outer arr0 (mlen - 1) s0 0 Empty
203{-# INLINE [0] unstream #-}
204
205-- we can generate this from UCD
206maxDecomposeLen :: Int
207maxDecomposeLen = 32
208
209-------------------------------------------------------------------------------
210-- Composition
211-------------------------------------------------------------------------------
212
213-- If we are composing we do not need to first decompose Hangul. We can just
214-- compose assuming there could be some partially composed syllables e.g. LV
215-- syllable followed by a jamo T. We need to compose this case as well.
216
217-- Hold an L to wait for V, hold an LV to wait for T.
218data JamoBuf
219    = Jamo !Char -- Jamo L, V or T
220    | Hangul !Char -- Hangul Syllable LV or LVT
221    | HangulLV !Char
222
223data RegBuf
224    = RegOne !Char
225    | RegMany !Char !Char ![Char]
226
227data ComposeState
228    = ComposeNone
229    | ComposeReg !RegBuf
230    | ComposeJamo !JamoBuf
231
232-------------------------------------------------------------------------------
233-- Composition of Jamo into Hangul syllables, done algorithmically
234-------------------------------------------------------------------------------
235
236{-# INLINE writeJamoBuf #-}
237writeJamoBuf :: A.MArray s -> Int -> JamoBuf -> ST s Int
238writeJamoBuf arr i jbuf = do
239    n <- unsafeWrite arr i (getCh jbuf)
240    return (i + n)
241
242    where
243
244    getCh (Jamo ch) = ch
245    getCh (Hangul ch) = ch
246    getCh (HangulLV ch) = ch
247
248{-# INLINE initHangul #-}
249initHangul :: Char -> Int -> ST s (Int, ComposeState)
250initHangul c i = return (i, ComposeJamo (Hangul c))
251
252{-# INLINE initJamo #-}
253initJamo :: Char -> Int -> ST s (Int, ComposeState)
254initJamo c i = return (i, ComposeJamo (Jamo c))
255
256{-# INLINE insertJamo #-}
257insertJamo
258    :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState)
259insertJamo arr i jbuf ch
260    | ich <= H.jamoLLast = do
261        j <- writeJamoBuf arr i jbuf
262        return (j, ComposeJamo (Jamo ch))
263    | ich < H.jamoVFirst =
264        flushAndWrite arr i jbuf ch
265    | ich <= H.jamoVLast = do
266        case jbuf of
267            Jamo c ->
268                case H.jamoLIndex c of
269                    Just li ->
270                        let vi = ich - H.jamoVFirst
271                            lvi = li * H.jamoNCount + vi * H.jamoTCount
272                            lv = chr (H.hangulFirst + lvi)
273                         in return (i, ComposeJamo (HangulLV lv))
274                    Nothing -> writeTwo arr i c ch
275            Hangul c -> writeTwo arr i c ch
276            HangulLV c -> writeTwo arr i c ch
277    | ich <= H.jamoTFirst = do
278        flushAndWrite arr i jbuf ch
279    | otherwise = do
280        let ti = ich - H.jamoTFirst
281        case jbuf of
282            Jamo c -> writeTwo arr i c ch
283            Hangul c
284                | H.isHangulLV c -> do
285                    writeLVT arr i c ti
286                | otherwise ->
287                    writeTwo arr i c ch
288            HangulLV c ->
289                writeLVT arr i c ti
290
291    where
292
293    ich = ord ch
294
295    {-# INLINE flushAndWrite #-}
296    flushAndWrite marr ix jb c = do
297        j <- writeJamoBuf marr ix jb
298        n <- unsafeWrite marr j c
299        return (j + n, ComposeNone)
300
301    {-# INLINE writeLVT #-}
302    writeLVT marr ix lv ti = do
303        n <- unsafeWrite marr ix (chr ((ord lv) + ti))
304        return (ix + n, ComposeNone)
305
306    {-# INLINE writeTwo #-}
307    writeTwo marr ix c1 c2 = do
308        n <- unsafeWrite marr ix c1
309        m <- unsafeWrite marr (ix + n) c2
310        return ((ix + n + m), ComposeNone)
311
312{-# INLINE insertHangul #-}
313insertHangul
314    :: A.MArray s -> Int -> JamoBuf -> Char -> ST s (Int, ComposeState)
315insertHangul arr i jbuf ch = do
316    j <- writeJamoBuf arr i jbuf
317    return (j, ComposeJamo (Hangul ch))
318
319{-# INLINE insertIntoRegBuf #-}
320insertIntoRegBuf :: Char -> RegBuf -> RegBuf
321insertIntoRegBuf c (RegOne c0)
322    | CC.getCombiningClass c < CC.getCombiningClass c0
323    = RegMany c c0 []
324    | otherwise
325    = RegMany c0 c []
326insertIntoRegBuf c (RegMany c0 c1 cs)
327    | cc < CC.getCombiningClass c0
328    = RegMany c c0 (c1 : cs)
329    | cc < CC.getCombiningClass c1
330    = RegMany c0 c (c1 : cs)
331    | otherwise
332    = RegMany c0 c1 (cs' ++ (c : cs''))
333    where
334        cc = CC.getCombiningClass c
335        (cs', cs'') = span ((<= cc) . CC.getCombiningClass) cs
336
337{-# INLINE writeRegBuf #-}
338writeRegBuf :: A.MArray s -> Int -> RegBuf -> ST s Int
339writeRegBuf arr i = \case
340    RegOne c -> do
341        n <- unsafeWrite arr i c
342        return (i + n)
343    RegMany st c [] ->
344        case C.composePair st c of
345            Just x -> do
346                n <- unsafeWrite arr i x
347                return (i + n)
348            Nothing -> do
349                n <- unsafeWrite arr i st
350                m <- unsafeWrite arr (i + n) c
351                return (i + n + m)
352    RegMany st0 c0 cs0 -> go [] st0 (c0 : cs0)
353
354    where
355
356    -- arguments: uncombined chars, starter, unprocessed str
357    go uncs st [] = writeStr arr i (st : uncs)
358    go uncs st (c : cs) = case C.composePair st c of
359        Nothing -> go (uncs ++ (c : same)) st bigger
360        Just x  -> go uncs x cs
361        where
362            cc = CC.getCombiningClass c
363            (same, bigger) = span ((== cc) . CC.getCombiningClass) cs
364
365{-# INLINE flushComposeState #-}
366flushComposeState :: A.MArray s -> Int -> ComposeState -> ST s Int
367flushComposeState arr i = \case
368    ComposeNone -> pure i
369    ComposeReg rbuf -> writeRegBuf arr i rbuf
370    ComposeJamo jbuf -> writeJamoBuf arr i jbuf
371
372{-# INLINE composeChar #-}
373composeChar
374    :: D.DecomposeMode
375    -> A.MArray s       -- destination array for composition
376    -> Char             -- input char
377    -> Int              -- array index
378    -> ComposeState
379    -> ST s (Int, ComposeState)
380composeChar mode marr = go0
381
382    where
383
384    go0 ch !i !st =
385        case st of
386            ComposeReg rbuf
387                | ich < H.jamoLFirst ->
388                    composeReg rbuf ch i st
389                | ich <= H.jamoLast -> do
390                    j <- writeRegBuf marr i rbuf
391                    initJamo ch j
392                | ich < H.hangulFirst ->
393                    composeReg rbuf ch i st
394                | ich <= H.hangulLast -> do
395                    j <- writeRegBuf marr i rbuf
396                    initHangul ch j
397                | otherwise ->
398                    composeReg rbuf ch i st
399            ComposeJamo jbuf
400                | ich < H.jamoLFirst -> do
401                    jamoToReg marr i jbuf ch
402                | ich <= H.jamoLast -> do
403                    insertJamo marr i jbuf ch
404                | ich < H.hangulFirst ->
405                    jamoToReg marr i jbuf ch
406                | ich <= H.hangulLast -> do
407                    insertHangul marr i jbuf ch
408                | otherwise ->
409                    jamoToReg marr i jbuf ch
410            ComposeNone
411                | ich < H.jamoLFirst ->
412                    initReg ch i
413                | ich <= H.jamoLast ->
414                    initJamo ch i
415                | ich < H.hangulFirst ->
416                    initReg ch i
417                | ich <= H.hangulLast ->
418                    initHangul ch i
419                | otherwise ->
420                    initReg ch i
421        where ich = ord ch
422
423    {-# INLINE jamoToReg #-}
424    jamoToReg arr i jbuf ch = do
425        j <- writeJamoBuf arr i jbuf
426        initReg ch j
427
428    {-# INLINE initReg #-}
429    initReg !ch !i
430        | D.isDecomposable mode ch =
431            go (D.decomposeChar mode ch) i ComposeNone
432        | otherwise =
433            pure (i, ComposeReg (RegOne ch))
434
435    {-# INLINE composeReg #-}
436    composeReg rbuf !ch !i !st
437        | D.isDecomposable mode ch =
438            go (D.decomposeChar mode ch) i st
439        | CC.isCombining ch = do
440            pure (i, ComposeReg (insertIntoRegBuf ch rbuf))
441        -- The first char in RegBuf may or may not be a starter. In
442        -- case it is not we rely on composeStarterPair failing.
443        | RegOne s <- rbuf
444        , C.isSecondStarter ch
445        , Just x <- C.composeStarterPair s ch =
446            pure (i, (ComposeReg (RegOne x)))
447        | otherwise = do
448            j <- writeRegBuf marr i rbuf
449            pure (j, ComposeReg (RegOne ch))
450
451    go [] !i !st = pure (i, st)
452    go (ch : rest) i st =
453        case st of
454            ComposeReg rbuf
455                | H.isHangul ch -> do
456                    j <- writeRegBuf marr i rbuf
457                    (k, s) <- initHangul ch j
458                    go rest k s
459                | H.isJamo ch -> do
460                    j <- writeRegBuf marr i rbuf
461                    (k, s) <- initJamo ch j
462                    go rest k s
463                | D.isDecomposable mode ch ->
464                    go (D.decomposeChar mode ch ++ rest) i st
465                | CC.isCombining ch -> do
466                    go rest i (ComposeReg (insertIntoRegBuf ch rbuf))
467                | RegOne s <- rbuf
468                , C.isSecondStarter ch
469                , Just x <- C.composeStarterPair s ch ->
470                    go rest i (ComposeReg (RegOne x))
471                | otherwise -> do
472                    j <- writeRegBuf marr i rbuf
473                    go rest j (ComposeReg (RegOne ch))
474            ComposeJamo jbuf
475                | H.isJamo ch -> do
476                    (j, s) <- insertJamo marr i jbuf ch
477                    go rest j s
478                | H.isHangul ch -> do
479                    (j, s) <- insertHangul marr i jbuf ch
480                    go rest j s
481                | otherwise -> do
482                    j <- writeJamoBuf marr i jbuf
483                    case () of
484                        _
485                            | D.isDecomposable mode ch ->
486                                go (D.decomposeChar mode ch ++ rest) j
487                                   ComposeNone
488                            | otherwise ->
489                                go rest j (ComposeReg (RegOne ch))
490            ComposeNone
491                | H.isHangul ch -> do
492                    (j, s) <- initHangul ch i
493                    go rest j s
494                | H.isJamo ch -> do
495                    (j, s) <- initJamo ch i
496                    go rest j s
497                | D.isDecomposable mode ch ->
498                    go (D.decomposeChar mode ch ++ rest) i st
499                | otherwise ->
500                    go rest i (ComposeReg (RegOne ch))
501
502-- | /O(n)/ Convert a 'Stream Char' into a composed normalized 'Text'.
503unstreamC :: D.DecomposeMode -> Stream Char -> Text
504unstreamC mode (Stream next0 s0 len) = runText $ \done -> do
505  -- Before encoding each char we perform a buffer realloc check assuming
506  -- worst case encoding size of two 16-bit units for the char. Just add an
507  -- extra space to the buffer so that we do not end up reallocating even when
508  -- all the chars are encoded as single unit.
509  let margin = 1 + maxDecomposeLen
510      mlen = (upperBound 4 len + margin)
511  arr0 <- A.new mlen
512  let outer !arr !maxi = encode SPEC
513       where
514        -- keep the common case loop as small as possible
515        encode !_ !si !di st =
516            -- simply check for the worst case
517            if maxi < di + margin
518               then realloc si di st
519            else
520                case next0 si of
521                    Done -> do
522                        di' <- flushComposeState arr di st
523                        done arr di'
524                    Skip si'    -> encode SPEC si' di st
525                    Yield c si' -> do
526                        (di', st') <- composeChar mode arr c di st
527                        encode SPEC si' di' st'
528
529        -- keep uncommon case separate from the common case code
530        {-# NOINLINE realloc #-}
531        realloc !si !di st = do
532            let newlen = maxi * 2
533            arr' <- A.new newlen
534            A.copyM arr' 0 arr 0 di
535            outer arr' (newlen - 1) si di st
536
537  outer arr0 (mlen - 1) s0 0 ComposeNone
538{-# INLINE [0] unstreamC #-}
539