1{-# LANGUAGE CPP #-}
2#if __GLASGOW_HASKELL__ >= 701
3{-# LANGUAGE Trustworthy #-}
4#endif
5--
6-- |
7-- Module      :  Codec.Binary.UTF8.String
8-- Copyright   :  (c) Eric Mertens 2007
9-- License     :  BSD3-style (see LICENSE)
10--
11-- Maintainer:    emertens@galois.com
12-- Stability   :  experimental
13-- Portability :  portable
14--
15-- Support for encoding UTF8 Strings to and from @['Word8']@
16--
17
18module Codec.Binary.UTF8.String (
19      encode
20    , decode
21    , encodeString
22    , decodeString
23    , encodeChar
24
25    , isUTF8Encoded
26    , utf8Encode
27  ) where
28
29import Data.Word        (Word8,Word32)
30import Data.Bits        ((.|.),(.&.),shiftL,shiftR)
31import Data.Char        (chr,ord)
32
33default(Int)
34
35-- | Encode a string using 'encode' and store the result in a 'String'.
36encodeString :: String -> String
37encodeString xs = map (toEnum . fromEnum) (encode xs)
38
39-- | Decode a string using 'decode' using a 'String' as input.
40-- This is not safe but it is necessary if UTF-8 encoded text
41-- has been loaded into a 'String' prior to being decoded.
42decodeString :: String -> String
43decodeString xs = decode (map (toEnum . fromEnum) xs)
44
45replacement_character :: Char
46replacement_character = '\xfffd'
47
48-- | Encode a single Haskell 'Char' to a list of 'Word8' values, in UTF8 format.
49encodeChar :: Char -> [Word8]
50encodeChar = map fromIntegral . go . ord
51 where
52  go oc
53   | oc <= 0x7f       = [oc]
54
55   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
56                        , 0x80 + oc .&. 0x3f
57                        ]
58
59   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
60                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
61                        , 0x80 + oc .&. 0x3f
62                        ]
63   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
64                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
65                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
66                        , 0x80 + oc .&. 0x3f
67                        ]
68
69
70-- | Encode a Haskell 'String' to a list of 'Word8' values, in UTF8 format.
71encode :: String -> [Word8]
72encode = concatMap encodeChar
73
74--
75-- | Decode a UTF8 string packed into a list of 'Word8' values, directly to 'String'
76--
77decode :: [Word8] -> String
78decode [    ] = ""
79decode (c:cs)
80  | c < 0x80  = chr (fromEnum c) : decode cs
81  | c < 0xc0  = replacement_character : decode cs
82  | c < 0xe0  = multi1
83  | c < 0xf0  = multi_byte 2 0xf  0x800
84  | c < 0xf8  = multi_byte 3 0x7  0x10000
85  | c < 0xfc  = multi_byte 4 0x3  0x200000
86  | c < 0xfe  = multi_byte 5 0x1  0x4000000
87  | otherwise = replacement_character : decode cs
88  where
89    multi1 = case cs of
90      c1 : ds | c1 .&. 0xc0 == 0x80 ->
91        let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
92        in if d >= 0x000080 then toEnum d : decode ds
93                            else replacement_character : decode ds
94      _ -> replacement_character : decode cs
95
96    multi_byte :: Int -> Word8 -> Int -> [Char]
97    multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
98      where
99        aux 0 rs acc
100          | overlong <= acc && acc <= 0x10ffff &&
101            (acc < 0xd800 || 0xdfff < acc)     &&
102            (acc < 0xfffe || 0xffff < acc)      = chr acc : decode rs
103          | otherwise = replacement_character : decode rs
104
105        aux n (r:rs) acc
106          | r .&. 0xc0 == 0x80 = aux (n-1) rs
107                               $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
108
109        aux _ rs     _ = replacement_character : decode rs
110
111
112-- | @utf8Encode str@ is a convenience function; checks to see if
113-- @str@ isn't UTF-8 encoded before doing so. Sometimes useful, but
114-- you are better off keeping track of the encoding so as to avoid
115-- the cost of checking.
116utf8Encode :: String -> String
117utf8Encode str
118 | isUTF8Encoded str = str
119 | otherwise         = encodeString str
120
121
122-- | @isUTF8Encoded str@ tries to recognize input string as being in UTF-8 form.
123isUTF8Encoded :: String -> Bool
124isUTF8Encoded [] = True
125isUTF8Encoded (x:xs) =
126  case ox of
127    _ | ox < 0x80  -> isUTF8Encoded xs
128      | ox > 0xff  -> False
129      | ox < 0xc0  -> False
130      | ox < 0xe0  -> check1
131      | ox < 0xf0  -> check_byte 2 0xf 0
132      | ox < 0xf8  -> check_byte 3 0x7  0x10000
133      | ox < 0xfc  -> check_byte 4 0x3  0x200000
134      | ox < 0xfe  -> check_byte 5 0x1  0x4000000
135      | otherwise  -> False
136 where
137   ox = toW32 x
138
139   toW32 :: Char -> Word32
140   toW32 ch = fromIntegral (fromEnum ch)
141
142   check1 =
143    case xs of
144     [] -> False
145     c1 : ds
146      | oc .&. 0xc0 /= 0x80 || d < 0x000080 -> False
147      | otherwise -> isUTF8Encoded ds
148      where
149       oc = toW32 c1
150       d = ((ox .&. 0x1f) `shiftL` 6) .|.  (oc .&. 0x3f)
151
152   check_byte :: Int -> Word32 -> Word32 -> Bool
153   check_byte i mask overlong = aux i xs (ox .&. mask)
154      where
155        aux 0 rs acc
156         | overlong <= acc &&
157           acc <= 0x10ffff &&
158           (acc < 0xd800 || 0xdfff < acc) &&
159           (acc < 0xfffe || 0xffff < acc) = isUTF8Encoded rs
160         | otherwise = False
161
162        aux n (r:rs) acc
163         | toW32 r .&. 0xc0 == 0x80 =
164            aux (n-1) rs  (acc `shiftL` 6 .|. (toW32 r .&. 0x3f))
165
166        aux _ _  _ = False
167
168