1module Distribution.Utils.String
2    ( -- * Encode to/from UTF8
3      decodeStringUtf8
4    , encodeStringUtf8
5    ) where
6
7import Data.Word
8import Data.Bits
9import Data.Char (chr,ord)
10
11-- | Decode 'String' from UTF8-encoded octets.
12--
13-- Invalid data in the UTF8 stream (this includes code-points @U+D800@
14-- through @U+DFFF@) will be decoded as the replacement character (@U+FFFD@).
15--
16-- See also 'encodeStringUtf8'
17decodeStringUtf8 :: [Word8] -> String
18decodeStringUtf8 = go
19  where
20    go :: [Word8] -> String
21    go []       = []
22    go (c : cs)
23      | c <= 0x7F = chr (fromIntegral c) : go cs
24      | c <= 0xBF = replacementChar : go cs
25      | c <= 0xDF = twoBytes c cs
26      | c <= 0xEF = moreBytes 3 0x800     cs (fromIntegral $ c .&. 0xF)
27      | c <= 0xF7 = moreBytes 4 0x10000   cs (fromIntegral $ c .&. 0x7)
28      | c <= 0xFB = moreBytes 5 0x200000  cs (fromIntegral $ c .&. 0x3)
29      | c <= 0xFD = moreBytes 6 0x4000000 cs (fromIntegral $ c .&. 0x1)
30      | otherwise   = replacementChar : go cs
31
32    twoBytes :: Word8 -> [Word8] -> String
33    twoBytes c0 (c1:cs')
34      | c1 .&. 0xC0 == 0x80
35      = let d = (fromIntegral (c0 .&. 0x1F) `shiftL` 6)
36             .|. fromIntegral (c1 .&. 0x3F)
37         in if d >= 0x80
38               then  chr d                : go cs'
39               else  replacementChar      : go cs'
40    twoBytes _ cs' = replacementChar      : go cs'
41
42    moreBytes :: Int -> Int -> [Word8] -> Int -> [Char]
43    moreBytes 1 overlong cs' acc
44      | overlong <= acc, acc <= 0x10FFFF, acc < 0xD800 || 0xDFFF < acc
45      = chr acc : go cs'
46
47      | otherwise
48      = replacementChar : go cs'
49
50    moreBytes byteCount overlong (cn:cs') acc
51      | cn .&. 0xC0 == 0x80
52      = moreBytes (byteCount-1) overlong cs'
53          ((acc `shiftL` 6) .|. fromIntegral cn .&. 0x3F)
54
55    moreBytes _ _ cs' _
56      = replacementChar : go cs'
57
58    replacementChar = '\xfffd'
59
60
61-- | Encode 'String' to a list of UTF8-encoded octets
62--
63-- Code-points in the @U+D800@-@U+DFFF@ range will be encoded
64-- as the replacement character (i.e. @U+FFFD@).
65--
66-- See also 'decodeUtf8'
67encodeStringUtf8 :: String -> [Word8]
68encodeStringUtf8 []        = []
69encodeStringUtf8 (c:cs)
70  | c <= '\x07F' = w8
71                 : encodeStringUtf8 cs
72  | c <= '\x7FF' = (0xC0 .|.  w8ShiftR  6          )
73                 : (0x80 .|. (w8          .&. 0x3F))
74                 : encodeStringUtf8 cs
75  | c <= '\xD7FF'= (0xE0 .|.  w8ShiftR 12          )
76                 : (0x80 .|. (w8ShiftR  6 .&. 0x3F))
77                 : (0x80 .|. (w8          .&. 0x3F))
78                 : encodeStringUtf8 cs
79  | c <= '\xDFFF'= 0xEF : 0xBF : 0xBD -- U+FFFD
80                 : encodeStringUtf8 cs
81  | c <= '\xFFFF'= (0xE0 .|.  w8ShiftR 12          )
82                 : (0x80 .|. (w8ShiftR  6 .&. 0x3F))
83                 : (0x80 .|. (w8          .&. 0x3F))
84                 : encodeStringUtf8 cs
85  | otherwise    = (0xf0 .|.  w8ShiftR 18          )
86                 : (0x80 .|. (w8ShiftR 12 .&. 0x3F))
87                 : (0x80 .|. (w8ShiftR  6 .&. 0x3F))
88                 : (0x80 .|. (w8          .&. 0x3F))
89                 : encodeStringUtf8 cs
90  where
91    w8 = fromIntegral (ord c) :: Word8
92    w8ShiftR :: Int -> Word8
93    w8ShiftR = fromIntegral . shiftR (ord c)
94