1{-# LANGUAGE CPP, MagicHash, BangPatterns #-}
2
3-- |
4-- Module      : Data.Text.Internal.Encoding.Utf8
5-- Copyright   : (c) 2008, 2009 Tom Harper,
6--               (c) 2009, 2010 Bryan O'Sullivan,
7--               (c) 2009 Duncan Coutts
8--
9-- License     : BSD-style
10-- Maintainer  : bos@serpentine.com
11-- Stability   : experimental
12-- Portability : GHC
13--
14-- /Warning/: this is an internal module, and does not have a stable
15-- API or name. Functions in this module may not check or enforce
16-- preconditions expected by public modules. Use at your own risk!
17--
18-- Basic UTF-8 validation and character manipulation.
19module Data.Text.Internal.Encoding.Utf8
20    (
21    -- Decomposition
22      ord2
23    , ord3
24    , ord4
25    -- Construction
26    , chr2
27    , chr3
28    , chr4
29    -- * Validation
30    , validate1
31    , validate2
32    , validate3
33    , validate4
34    ) where
35
36#if defined(TEST_SUITE)
37# undef ASSERTS
38#endif
39
40#if defined(ASSERTS)
41import Control.Exception (assert)
42#endif
43import Data.Bits ((.&.))
44import Data.Text.Internal.Unsafe.Char (ord)
45import Data.Text.Internal.Unsafe.Shift (shiftR)
46import GHC.Exts
47import GHC.Word (Word8(..))
48
49default(Int)
50
51between :: Word8                -- ^ byte to check
52        -> Word8                -- ^ lower bound
53        -> Word8                -- ^ upper bound
54        -> Bool
55between x y z = x >= y && x <= z
56{-# INLINE between #-}
57
58ord2 :: Char -> (Word8,Word8)
59ord2 c =
60#if defined(ASSERTS)
61    assert (n >= 0x80 && n <= 0x07ff)
62#endif
63    (x1,x2)
64    where
65      n  = ord c
66      x1 = fromIntegral $ (n `shiftR` 6) + 0xC0
67      x2 = fromIntegral $ (n .&. 0x3F)   + 0x80
68
69ord3 :: Char -> (Word8,Word8,Word8)
70ord3 c =
71#if defined(ASSERTS)
72    assert (n >= 0x0800 && n <= 0xffff)
73#endif
74    (x1,x2,x3)
75    where
76      n  = ord c
77      x1 = fromIntegral $ (n `shiftR` 12) + 0xE0
78      x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
79      x3 = fromIntegral $ (n .&. 0x3F) + 0x80
80
81ord4 :: Char -> (Word8,Word8,Word8,Word8)
82ord4 c =
83#if defined(ASSERTS)
84    assert (n >= 0x10000)
85#endif
86    (x1,x2,x3,x4)
87    where
88      n  = ord c
89      x1 = fromIntegral $ (n `shiftR` 18) + 0xF0
90      x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80
91      x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80
92      x4 = fromIntegral $ (n .&. 0x3F) + 0x80
93
94chr2 :: Word8 -> Word8 -> Char
95chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#))
96    where
97      !y1# = word2Int# x1#
98      !y2# = word2Int# x2#
99      !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6#
100      !z2# = y2# -# 0x80#
101{-# INLINE chr2 #-}
102
103chr3 :: Word8 -> Word8 -> Word8 -> Char
104chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#))
105    where
106      !y1# = word2Int# x1#
107      !y2# = word2Int# x2#
108      !y3# = word2Int# x3#
109      !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12#
110      !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6#
111      !z3# = y3# -# 0x80#
112{-# INLINE chr3 #-}
113
114chr4             :: Word8 -> Word8 -> Word8 -> Word8 -> Char
115chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) =
116    C# (chr# (z1# +# z2# +# z3# +# z4#))
117    where
118      !y1# = word2Int# x1#
119      !y2# = word2Int# x2#
120      !y3# = word2Int# x3#
121      !y4# = word2Int# x4#
122      !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18#
123      !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12#
124      !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6#
125      !z4# = y4# -# 0x80#
126{-# INLINE chr4 #-}
127
128validate1 :: Word8 -> Bool
129validate1 x1 = x1 <= 0x7F
130{-# INLINE validate1 #-}
131
132validate2 :: Word8 -> Word8 -> Bool
133validate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF
134{-# INLINE validate2 #-}
135
136validate3 :: Word8 -> Word8 -> Word8 -> Bool
137{-# INLINE validate3 #-}
138validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4
139  where
140    validate3_1 = (x1 == 0xE0) &&
141                  between x2 0xA0 0xBF &&
142                  between x3 0x80 0xBF
143    validate3_2 = between x1 0xE1 0xEC &&
144                  between x2 0x80 0xBF &&
145                  between x3 0x80 0xBF
146    validate3_3 = x1 == 0xED &&
147                  between x2 0x80 0x9F &&
148                  between x3 0x80 0xBF
149    validate3_4 = between x1 0xEE 0xEF &&
150                  between x2 0x80 0xBF &&
151                  between x3 0x80 0xBF
152
153validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
154{-# INLINE validate4 #-}
155validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3
156  where
157    validate4_1 = x1 == 0xF0 &&
158                  between x2 0x90 0xBF &&
159                  between x3 0x80 0xBF &&
160                  between x4 0x80 0xBF
161    validate4_2 = between x1 0xF1 0xF3 &&
162                  between x2 0x80 0xBF &&
163                  between x3 0x80 0xBF &&
164                  between x4 0x80 0xBF
165    validate4_3 = x1 == 0xF4 &&
166                  between x2 0x80 0x8F &&
167                  between x3 0x80 0xBF &&
168                  between x4 0x80 0xBF
169