1{-# LANGUAGE CPP, MagicHash, BangPatterns #-} 2 3-- | 4-- Module : Data.Text.Internal.Encoding.Utf8 5-- Copyright : (c) 2008, 2009 Tom Harper, 6-- (c) 2009, 2010 Bryan O'Sullivan, 7-- (c) 2009 Duncan Coutts 8-- 9-- License : BSD-style 10-- Maintainer : bos@serpentine.com 11-- Stability : experimental 12-- Portability : GHC 13-- 14-- /Warning/: this is an internal module, and does not have a stable 15-- API or name. Functions in this module may not check or enforce 16-- preconditions expected by public modules. Use at your own risk! 17-- 18-- Basic UTF-8 validation and character manipulation. 19module Data.Text.Internal.Encoding.Utf8 20 ( 21 -- Decomposition 22 ord2 23 , ord3 24 , ord4 25 -- Construction 26 , chr2 27 , chr3 28 , chr4 29 -- * Validation 30 , validate1 31 , validate2 32 , validate3 33 , validate4 34 ) where 35 36#if defined(TEST_SUITE) 37# undef ASSERTS 38#endif 39 40#if defined(ASSERTS) 41import Control.Exception (assert) 42#endif 43import Data.Bits ((.&.)) 44import Data.Text.Internal.Unsafe.Char (ord) 45import Data.Text.Internal.Unsafe.Shift (shiftR) 46import GHC.Exts 47import GHC.Word (Word8(..)) 48 49default(Int) 50 51between :: Word8 -- ^ byte to check 52 -> Word8 -- ^ lower bound 53 -> Word8 -- ^ upper bound 54 -> Bool 55between x y z = x >= y && x <= z 56{-# INLINE between #-} 57 58ord2 :: Char -> (Word8,Word8) 59ord2 c = 60#if defined(ASSERTS) 61 assert (n >= 0x80 && n <= 0x07ff) 62#endif 63 (x1,x2) 64 where 65 n = ord c 66 x1 = fromIntegral $ (n `shiftR` 6) + 0xC0 67 x2 = fromIntegral $ (n .&. 0x3F) + 0x80 68 69ord3 :: Char -> (Word8,Word8,Word8) 70ord3 c = 71#if defined(ASSERTS) 72 assert (n >= 0x0800 && n <= 0xffff) 73#endif 74 (x1,x2,x3) 75 where 76 n = ord c 77 x1 = fromIntegral $ (n `shiftR` 12) + 0xE0 78 x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 79 x3 = fromIntegral $ (n .&. 0x3F) + 0x80 80 81ord4 :: Char -> (Word8,Word8,Word8,Word8) 82ord4 c = 83#if defined(ASSERTS) 84 assert (n >= 0x10000) 85#endif 86 (x1,x2,x3,x4) 87 where 88 n = ord c 89 x1 = fromIntegral $ (n `shiftR` 18) + 0xF0 90 x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80 91 x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 92 x4 = fromIntegral $ (n .&. 0x3F) + 0x80 93 94chr2 :: Word8 -> Word8 -> Char 95chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#)) 96 where 97 !y1# = word2Int# x1# 98 !y2# = word2Int# x2# 99 !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6# 100 !z2# = y2# -# 0x80# 101{-# INLINE chr2 #-} 102 103chr3 :: Word8 -> Word8 -> Word8 -> Char 104chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#)) 105 where 106 !y1# = word2Int# x1# 107 !y2# = word2Int# x2# 108 !y3# = word2Int# x3# 109 !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12# 110 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6# 111 !z3# = y3# -# 0x80# 112{-# INLINE chr3 #-} 113 114chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char 115chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) = 116 C# (chr# (z1# +# z2# +# z3# +# z4#)) 117 where 118 !y1# = word2Int# x1# 119 !y2# = word2Int# x2# 120 !y3# = word2Int# x3# 121 !y4# = word2Int# x4# 122 !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18# 123 !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12# 124 !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6# 125 !z4# = y4# -# 0x80# 126{-# INLINE chr4 #-} 127 128validate1 :: Word8 -> Bool 129validate1 x1 = x1 <= 0x7F 130{-# INLINE validate1 #-} 131 132validate2 :: Word8 -> Word8 -> Bool 133validate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF 134{-# INLINE validate2 #-} 135 136validate3 :: Word8 -> Word8 -> Word8 -> Bool 137{-# INLINE validate3 #-} 138validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4 139 where 140 validate3_1 = (x1 == 0xE0) && 141 between x2 0xA0 0xBF && 142 between x3 0x80 0xBF 143 validate3_2 = between x1 0xE1 0xEC && 144 between x2 0x80 0xBF && 145 between x3 0x80 0xBF 146 validate3_3 = x1 == 0xED && 147 between x2 0x80 0x9F && 148 between x3 0x80 0xBF 149 validate3_4 = between x1 0xEE 0xEF && 150 between x2 0x80 0xBF && 151 between x3 0x80 0xBF 152 153validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool 154{-# INLINE validate4 #-} 155validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3 156 where 157 validate4_1 = x1 == 0xF0 && 158 between x2 0x90 0xBF && 159 between x3 0x80 0xBF && 160 between x4 0x80 0xBF 161 validate4_2 = between x1 0xF1 0xF3 && 162 between x2 0x80 0xBF && 163 between x3 0x80 0xBF && 164 between x4 0x80 0xBF 165 validate4_3 = x1 == 0xF4 && 166 between x2 0x80 0x8F && 167 between x3 0x80 0xBF && 168 between x4 0x80 0xBF 169