1{-# LANGUAGE Trustworthy #-} 2{-# LANGUAGE NoImplicitPrelude #-} 3 4----------------------------------------------------------------------------- 5-- | 6-- Module : GHC.IO.Encoding.Failure 7-- Copyright : (c) The University of Glasgow, 2008-2011 8-- License : see libraries/base/LICENSE 9-- 10-- Maintainer : libraries@haskell.org 11-- Stability : internal 12-- Portability : non-portable 13-- 14-- Types for specifying how text encoding/decoding fails 15-- 16----------------------------------------------------------------------------- 17 18module GHC.IO.Encoding.Failure ( 19 CodingFailureMode(..), codingFailureModeSuffix, 20 isSurrogate, 21 recoverDecode, recoverEncode 22 ) where 23 24import GHC.IO 25import GHC.IO.Buffer 26import GHC.IO.Exception 27 28import GHC.Base 29import GHC.Char 30import GHC.Word 31import GHC.Show 32import GHC.Num 33import GHC.Real ( fromIntegral ) 34 35--import System.Posix.Internals 36 37-- | The 'CodingFailureMode' is used to construct 'System.IO.TextEncoding's, 38-- and specifies how they handle illegal sequences. 39data CodingFailureMode 40 = ErrorOnCodingFailure 41 -- ^ Throw an error when an illegal sequence is encountered 42 | IgnoreCodingFailure 43 -- ^ Attempt to ignore and recover if an illegal sequence is 44 -- encountered 45 | TransliterateCodingFailure 46 -- ^ Replace with the closest visual match upon an illegal 47 -- sequence 48 | RoundtripFailure 49 -- ^ Use the private-use escape mechanism to attempt to allow 50 -- illegal sequences to be roundtripped. 51 deriving ( Show -- ^ @since 4.4.0.0 52 ) 53 -- This will only work properly for those encodings which are 54 -- strict supersets of ASCII in the sense that valid ASCII data 55 -- is also valid in that encoding. This is not true for 56 -- e.g. UTF-16, because ASCII characters must be padded to two 57 -- bytes to retain their meaning. 58 59-- Note [Roundtripping] 60-- ~~~~~~~~~~~~~~~~~~~~ 61-- 62-- Roundtripping is based on the ideas of PEP383. 63-- 64-- We used to use the range of private-use characters from 0xEF80 to 65-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery 66-- to encode these characters. 67-- 68-- However, people didn't like this because it means we don't get 69-- guaranteed roundtripping for byte sequences that look like a UTF-8 70-- encoded codepoint 0xEFxx. 71-- 72-- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape 73-- undecodable bytes, even though that may confuse Unicode processing 74-- software written in Haskell. This guarantees roundtripping because 75-- unicode input that includes lone surrogate codepoints is invalid by 76-- definition. 77-- 78-- 79-- When we used private-use characters there was a technical problem when it 80-- came to encoding back to bytes using iconv. The iconv code will not fail when 81-- it tries to encode a private-use character (as it would if trying to encode 82-- a surrogate), which means that we wouldn't get a chance to replace it 83-- with the byte we originally escaped. 84-- 85-- To work around this, when filling the buffer to be encoded (in 86-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the 87-- private-use characters with lone surrogates again! Likewise, when 88-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we had 89-- to do the inverse process. 90-- 91-- The user of String would never see these lone surrogates, but it 92-- ensured that iconv will throw an error when encountering them. We 93-- used lone surrogates in the range 0xDC00 to 0xDCFF for this purpose. 94 95codingFailureModeSuffix :: CodingFailureMode -> String 96codingFailureModeSuffix ErrorOnCodingFailure = "" 97codingFailureModeSuffix IgnoreCodingFailure = "//IGNORE" 98codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT" 99codingFailureModeSuffix RoundtripFailure = "//ROUNDTRIP" 100 101-- | In transliterate mode, we use this character when decoding 102-- unknown bytes. 103-- 104-- This is the defined Unicode replacement character: 105-- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm> 106unrepresentableChar :: Char 107unrepresentableChar = '\xFFFD' 108 109-- It is extraordinarily important that this series of 110-- predicates/transformers gets inlined, because they tend to be used 111-- in inner loops related to text encoding. In particular, 112-- surrogatifyRoundtripCharacter must be inlined (see #5536) 113 114-- | Some characters are actually "surrogate" codepoints defined for 115-- use in UTF-16. We need to signal an invalid character if we detect 116-- them when encoding a sequence of 'Char's into 'Word8's because they 117-- won't give valid Unicode. 118-- 119-- We may also need to signal an invalid character if we detect them 120-- when encoding a sequence of 'Char's into 'Word8's because the 121-- 'RoundtripFailure' mode creates these to round-trip bytes through 122-- our internal UTF-16 encoding. 123{-# INLINE isSurrogate #-} 124isSurrogate :: Char -> Bool 125isSurrogate c = (0xD800 <= x && x <= 0xDBFF) 126 || (0xDC00 <= x && x <= 0xDFFF) 127 where x = ord c 128 129-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem) 130{-# INLINE escapeToRoundtripCharacterSurrogate #-} 131escapeToRoundtripCharacterSurrogate :: Word8 -> Char 132escapeToRoundtripCharacterSurrogate b 133 | b < 128 = chr (fromIntegral b) 134 -- Disallow 'smuggling' of ASCII bytes. For roundtripping to 135 -- work, this assumes encoding is ASCII-superset. 136 | otherwise = chr (0xDC00 + fromIntegral b) 137 138-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8) 139{-# INLINE unescapeRoundtripCharacterSurrogate #-} 140unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8 141unescapeRoundtripCharacterSurrogate c 142 | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte 143 | otherwise = Nothing 144 where x = ord c 145 146recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char 147 -> IO (Buffer Word8, Buffer Char) 148recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } 149 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do 150 --puts $ "recoverDecode " ++ show ir 151 case cfm of 152 ErrorOnCodingFailure -> ioe_decodingError 153 IgnoreCodingFailure -> return (input { bufL=ir+1 }, output) 154 TransliterateCodingFailure -> do 155 ow' <- writeCharBuf oraw ow unrepresentableChar 156 return (input { bufL=ir+1 }, output { bufR=ow' }) 157 RoundtripFailure -> do 158 b <- readWord8Buf iraw ir 159 ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b) 160 return (input { bufL=ir+1 }, output { bufR=ow' }) 161 162recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 163 -> IO (Buffer Char, Buffer Word8) 164recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } 165 output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow } = do 166 (c,ir') <- readCharBuf iraw ir 167 --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir' 168 case cfm of 169 IgnoreCodingFailure -> return (input { bufL=ir' }, output) 170 TransliterateCodingFailure -> do 171 if c == '?' 172 then return (input { bufL=ir' }, output) 173 else do 174 -- XXX: evil hack! To implement transliteration, we just 175 -- poke an ASCII ? into the input buffer and tell the caller 176 -- to try and decode again. This is *probably* safe given 177 -- current uses of TextEncoding. 178 -- 179 -- The "if" test above ensures we skip if the encoding fails 180 -- to deal with the ?, though this should never happen in 181 -- practice as all encodings are in fact capable of 182 -- reperesenting all ASCII characters. 183 _ir' <- writeCharBuf iraw ir '?' 184 return (input, output) 185 186 -- This implementation does not work because e.g. UTF-16 187 -- requires 2 bytes to encode a simple ASCII value 188 --writeWord8Buf oraw ow unrepresentableByte 189 --return (input { bufL=ir' }, output { bufR=ow+1 }) 190 RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do 191 writeWord8Buf oraw ow x 192 return (input { bufL=ir' }, output { bufR=ow+1 }) 193 _ -> ioe_encodingError 194 195ioe_decodingError :: IO a 196ioe_decodingError = ioException 197 (IOError Nothing InvalidArgument "recoverDecode" 198 "invalid byte sequence" Nothing Nothing) 199 200ioe_encodingError :: IO a 201ioe_encodingError = ioException 202 (IOError Nothing InvalidArgument "recoverEncode" 203 "invalid character" Nothing Nothing) 204 205