1{-# LANGUAGE CPP #-} 2#if __GLASGOW_HASKELL__ >= 701 3{-# LANGUAGE Trustworthy #-} 4#endif 5-- 6-- | 7-- Module : Codec.Binary.UTF8.String 8-- Copyright : (c) Eric Mertens 2007 9-- License : BSD3-style (see LICENSE) 10-- 11-- Maintainer: emertens@galois.com 12-- Stability : experimental 13-- Portability : portable 14-- 15-- Support for encoding UTF8 Strings to and from @['Word8']@ 16-- 17 18module Codec.Binary.UTF8.String ( 19 encode 20 , decode 21 , encodeString 22 , decodeString 23 , encodeChar 24 25 , isUTF8Encoded 26 , utf8Encode 27 ) where 28 29import Data.Word (Word8,Word32) 30import Data.Bits ((.|.),(.&.),shiftL,shiftR) 31import Data.Char (chr,ord) 32 33default(Int) 34 35-- | Encode a string using 'encode' and store the result in a 'String'. 36encodeString :: String -> String 37encodeString xs = map (toEnum . fromEnum) (encode xs) 38 39-- | Decode a string using 'decode' using a 'String' as input. 40-- This is not safe but it is necessary if UTF-8 encoded text 41-- has been loaded into a 'String' prior to being decoded. 42decodeString :: String -> String 43decodeString xs = decode (map (toEnum . fromEnum) xs) 44 45replacement_character :: Char 46replacement_character = '\xfffd' 47 48-- | Encode a single Haskell 'Char' to a list of 'Word8' values, in UTF8 format. 49encodeChar :: Char -> [Word8] 50encodeChar = map fromIntegral . go . ord 51 where 52 go oc 53 | oc <= 0x7f = [oc] 54 55 | oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6) 56 , 0x80 + oc .&. 0x3f 57 ] 58 59 | oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12) 60 , 0x80 + ((oc `shiftR` 6) .&. 0x3f) 61 , 0x80 + oc .&. 0x3f 62 ] 63 | otherwise = [ 0xf0 + (oc `shiftR` 18) 64 , 0x80 + ((oc `shiftR` 12) .&. 0x3f) 65 , 0x80 + ((oc `shiftR` 6) .&. 0x3f) 66 , 0x80 + oc .&. 0x3f 67 ] 68 69 70-- | Encode a Haskell 'String' to a list of 'Word8' values, in UTF8 format. 71encode :: String -> [Word8] 72encode = concatMap encodeChar 73 74-- 75-- | Decode a UTF8 string packed into a list of 'Word8' values, directly to 'String' 76-- 77decode :: [Word8] -> String 78decode [ ] = "" 79decode (c:cs) 80 | c < 0x80 = chr (fromEnum c) : decode cs 81 | c < 0xc0 = replacement_character : decode cs 82 | c < 0xe0 = multi1 83 | c < 0xf0 = multi_byte 2 0xf 0x800 84 | c < 0xf8 = multi_byte 3 0x7 0x10000 85 | c < 0xfc = multi_byte 4 0x3 0x200000 86 | c < 0xfe = multi_byte 5 0x1 0x4000000 87 | otherwise = replacement_character : decode cs 88 where 89 multi1 = case cs of 90 c1 : ds | c1 .&. 0xc0 == 0x80 -> 91 let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|. fromEnum (c1 .&. 0x3f) 92 in if d >= 0x000080 then toEnum d : decode ds 93 else replacement_character : decode ds 94 _ -> replacement_character : decode cs 95 96 multi_byte :: Int -> Word8 -> Int -> [Char] 97 multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask)) 98 where 99 aux 0 rs acc 100 | overlong <= acc && acc <= 0x10ffff && 101 (acc < 0xd800 || 0xdfff < acc) && 102 (acc < 0xfffe || 0xffff < acc) = chr acc : decode rs 103 | otherwise = replacement_character : decode rs 104 105 aux n (r:rs) acc 106 | r .&. 0xc0 == 0x80 = aux (n-1) rs 107 $ shiftL acc 6 .|. fromEnum (r .&. 0x3f) 108 109 aux _ rs _ = replacement_character : decode rs 110 111 112-- | @utf8Encode str@ is a convenience function; checks to see if 113-- @str@ isn't UTF-8 encoded before doing so. Sometimes useful, but 114-- you are better off keeping track of the encoding so as to avoid 115-- the cost of checking. 116utf8Encode :: String -> String 117utf8Encode str 118 | isUTF8Encoded str = str 119 | otherwise = encodeString str 120 121 122-- | @isUTF8Encoded str@ tries to recognize input string as being in UTF-8 form. 123isUTF8Encoded :: String -> Bool 124isUTF8Encoded [] = True 125isUTF8Encoded (x:xs) = 126 case ox of 127 _ | ox < 0x80 -> isUTF8Encoded xs 128 | ox > 0xff -> False 129 | ox < 0xc0 -> False 130 | ox < 0xe0 -> check1 131 | ox < 0xf0 -> check_byte 2 0xf 0 132 | ox < 0xf8 -> check_byte 3 0x7 0x10000 133 | ox < 0xfc -> check_byte 4 0x3 0x200000 134 | ox < 0xfe -> check_byte 5 0x1 0x4000000 135 | otherwise -> False 136 where 137 ox = toW32 x 138 139 toW32 :: Char -> Word32 140 toW32 ch = fromIntegral (fromEnum ch) 141 142 check1 = 143 case xs of 144 [] -> False 145 c1 : ds 146 | oc .&. 0xc0 /= 0x80 || d < 0x000080 -> False 147 | otherwise -> isUTF8Encoded ds 148 where 149 oc = toW32 c1 150 d = ((ox .&. 0x1f) `shiftL` 6) .|. (oc .&. 0x3f) 151 152 check_byte :: Int -> Word32 -> Word32 -> Bool 153 check_byte i mask overlong = aux i xs (ox .&. mask) 154 where 155 aux 0 rs acc 156 | overlong <= acc && 157 acc <= 0x10ffff && 158 (acc < 0xd800 || 0xdfff < acc) && 159 (acc < 0xfffe || 0xffff < acc) = isUTF8Encoded rs 160 | otherwise = False 161 162 aux n (r:rs) acc 163 | toW32 r .&. 0xc0 == 0x80 = 164 aux (n-1) rs (acc `shiftL` 6 .|. (toW32 r .&. 0x3f)) 165 166 aux _ _ _ = False 167 168