1module Distribution.Utils.String 2 ( -- * Encode to/from UTF8 3 decodeStringUtf8 4 , encodeStringUtf8 5 ) where 6 7import Data.Word 8import Data.Bits 9import Data.Char (chr,ord) 10 11-- | Decode 'String' from UTF8-encoded octets. 12-- 13-- Invalid data in the UTF8 stream (this includes code-points @U+D800@ 14-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@). 15-- 16-- See also 'encodeStringUtf8' 17decodeStringUtf8 :: [Word8] -> String 18decodeStringUtf8 = go 19 where 20 go :: [Word8] -> String 21 go [] = [] 22 go (c : cs) 23 | c <= 0x7F = chr (fromIntegral c) : go cs 24 | c <= 0xBF = replacementChar : go cs 25 | c <= 0xDF = twoBytes c cs 26 | c <= 0xEF = moreBytes 3 0x800 cs (fromIntegral $ c .&. 0xF) 27 | c <= 0xF7 = moreBytes 4 0x10000 cs (fromIntegral $ c .&. 0x7) 28 | c <= 0xFB = moreBytes 5 0x200000 cs (fromIntegral $ c .&. 0x3) 29 | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1) 30 | otherwise = replacementChar : go cs 31 32 twoBytes :: Word8 -> [Word8] -> String 33 twoBytes c0 (c1:cs') 34 | c1 .&. 0xC0 == 0x80 35 = let d = (fromIntegral (c0 .&. 0x1F) `shiftL` 6) 36 .|. fromIntegral (c1 .&. 0x3F) 37 in if d >= 0x80 38 then chr d : go cs' 39 else replacementChar : go cs' 40 twoBytes _ cs' = replacementChar : go cs' 41 42 moreBytes :: Int -> Int -> [Word8] -> Int -> [Char] 43 moreBytes 1 overlong cs' acc 44 | overlong <= acc, acc <= 0x10FFFF, acc < 0xD800 || 0xDFFF < acc 45 = chr acc : go cs' 46 47 | otherwise 48 = replacementChar : go cs' 49 50 moreBytes byteCount overlong (cn:cs') acc 51 | cn .&. 0xC0 == 0x80 52 = moreBytes (byteCount-1) overlong cs' 53 ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F) 54 55 moreBytes _ _ cs' _ 56 = replacementChar : go cs' 57 58 replacementChar = '\xfffd' 59 60 61-- | Encode 'String' to a list of UTF8-encoded octets 62-- 63-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded 64-- as the replacement character (i.e. @U+FFFD@). 65-- 66-- See also 'decodeUtf8' 67encodeStringUtf8 :: String -> [Word8] 68encodeStringUtf8 [] = [] 69encodeStringUtf8 (c:cs) 70 | c <= '\x07F' = w8 71 : encodeStringUtf8 cs 72 | c <= '\x7FF' = (0xC0 .|. w8ShiftR 6 ) 73 : (0x80 .|. (w8 .&. 0x3F)) 74 : encodeStringUtf8 cs 75 | c <= '\xD7FF'= (0xE0 .|. w8ShiftR 12 ) 76 : (0x80 .|. (w8ShiftR 6 .&. 0x3F)) 77 : (0x80 .|. (w8 .&. 0x3F)) 78 : encodeStringUtf8 cs 79 | c <= '\xDFFF'= 0xEF : 0xBF : 0xBD -- U+FFFD 80 : encodeStringUtf8 cs 81 | c <= '\xFFFF'= (0xE0 .|. w8ShiftR 12 ) 82 : (0x80 .|. (w8ShiftR 6 .&. 0x3F)) 83 : (0x80 .|. (w8 .&. 0x3F)) 84 : encodeStringUtf8 cs 85 | otherwise = (0xf0 .|. w8ShiftR 18 ) 86 : (0x80 .|. (w8ShiftR 12 .&. 0x3F)) 87 : (0x80 .|. (w8ShiftR 6 .&. 0x3F)) 88 : (0x80 .|. (w8 .&. 0x3F)) 89 : encodeStringUtf8 cs 90 where 91 w8 = fromIntegral (ord c) :: Word8 92 w8ShiftR :: Int -> Word8 93 w8ShiftR = fromIntegral . shiftR (ord c) 94