1{-# LANGUAGE Trustworthy #-}
2{-# LANGUAGE NoImplicitPrelude #-}
3
4-----------------------------------------------------------------------------
5-- |
6-- Module      :  GHC.IO.Encoding.Failure
7-- Copyright   :  (c) The University of Glasgow, 2008-2011
8-- License     :  see libraries/base/LICENSE
9--
10-- Maintainer  :  libraries@haskell.org
11-- Stability   :  internal
12-- Portability :  non-portable
13--
14-- Types for specifying how text encoding/decoding fails
15--
16-----------------------------------------------------------------------------
17
18module GHC.IO.Encoding.Failure (
19    CodingFailureMode(..), codingFailureModeSuffix,
20    isSurrogate,
21    recoverDecode, recoverEncode
22  ) where
23
24import GHC.IO
25import GHC.IO.Buffer
26import GHC.IO.Exception
27
28import GHC.Base
29import GHC.Char
30import GHC.Word
31import GHC.Show
32import GHC.Num
33import GHC.Real ( fromIntegral )
34
35--import System.Posix.Internals
36
37-- | The 'CodingFailureMode' is used to construct 'System.IO.TextEncoding's,
38-- and specifies how they handle illegal sequences.
39data CodingFailureMode
40  = ErrorOnCodingFailure
41       -- ^ Throw an error when an illegal sequence is encountered
42  | IgnoreCodingFailure
43       -- ^ Attempt to ignore and recover if an illegal sequence is
44       -- encountered
45  | TransliterateCodingFailure
46       -- ^ Replace with the closest visual match upon an illegal
47       -- sequence
48  | RoundtripFailure
49       -- ^ Use the private-use escape mechanism to attempt to allow
50       -- illegal sequences to be roundtripped.
51  deriving ( Show -- ^ @since 4.4.0.0
52           )
53       -- This will only work properly for those encodings which are
54       -- strict supersets of ASCII in the sense that valid ASCII data
55       -- is also valid in that encoding. This is not true for
56       -- e.g. UTF-16, because ASCII characters must be padded to two
57       -- bytes to retain their meaning.
58
59-- Note [Roundtripping]
60-- ~~~~~~~~~~~~~~~~~~~~
61--
62-- Roundtripping is based on the ideas of PEP383.
63--
64-- We used to use the range of private-use characters from 0xEF80 to
65-- 0xEFFF designated for "encoding hacks" by the ConScript Unicode Registery
66-- to encode these characters.
67--
68-- However, people didn't like this because it means we don't get
69-- guaranteed roundtripping for byte sequences that look like a UTF-8
70-- encoded codepoint 0xEFxx.
71--
72-- So now like PEP383 we use lone surrogate codepoints 0xDCxx to escape
73-- undecodable bytes, even though that may confuse Unicode processing
74-- software written in Haskell. This guarantees roundtripping because
75-- unicode input that includes lone surrogate codepoints is invalid by
76-- definition.
77--
78--
79-- When we used private-use characters there was a technical problem when it
80-- came to encoding back to bytes using iconv. The iconv code will not fail when
81-- it tries to encode a private-use character (as it would if trying to encode
82-- a surrogate), which means that we wouldn't get a chance to replace it
83-- with the byte we originally escaped.
84--
85-- To work around this, when filling the buffer to be encoded (in
86-- writeBlocks/withEncodedCString/newEncodedCString), we replaced the
87-- private-use characters with lone surrogates again! Likewise, when
88-- reading from a buffer (unpack/unpack_nl/peekEncodedCString) we had
89-- to do the inverse process.
90--
91-- The user of String would never see these lone surrogates, but it
92-- ensured that iconv will throw an error when encountering them.  We
93-- used lone surrogates in the range 0xDC00 to 0xDCFF for this purpose.
94
95codingFailureModeSuffix :: CodingFailureMode -> String
96codingFailureModeSuffix ErrorOnCodingFailure       = ""
97codingFailureModeSuffix IgnoreCodingFailure        = "//IGNORE"
98codingFailureModeSuffix TransliterateCodingFailure = "//TRANSLIT"
99codingFailureModeSuffix RoundtripFailure           = "//ROUNDTRIP"
100
101-- | In transliterate mode, we use this character when decoding
102-- unknown bytes.
103--
104-- This is the defined Unicode replacement character:
105-- <http://www.fileformat.info/info/unicode/char/0fffd/index.htm>
106unrepresentableChar :: Char
107unrepresentableChar = '\xFFFD'
108
109-- It is extraordinarily important that this series of
110-- predicates/transformers gets inlined, because they tend to be used
111-- in inner loops related to text encoding. In particular,
112-- surrogatifyRoundtripCharacter must be inlined (see #5536)
113
114-- | Some characters are actually "surrogate" codepoints defined for
115-- use in UTF-16. We need to signal an invalid character if we detect
116-- them when encoding a sequence of 'Char's into 'Word8's because they
117-- won't give valid Unicode.
118--
119-- We may also need to signal an invalid character if we detect them
120-- when encoding a sequence of 'Char's into 'Word8's because the
121-- 'RoundtripFailure' mode creates these to round-trip bytes through
122-- our internal UTF-16 encoding.
123{-# INLINE isSurrogate #-}
124isSurrogate :: Char -> Bool
125isSurrogate c = (0xD800 <= x && x <= 0xDBFF)
126             || (0xDC00 <= x && x <= 0xDFFF)
127  where x = ord c
128
129-- Bytes (in Buffer Word8) --> lone surrogates (in Buffer CharBufElem)
130{-# INLINE escapeToRoundtripCharacterSurrogate #-}
131escapeToRoundtripCharacterSurrogate :: Word8 -> Char
132escapeToRoundtripCharacterSurrogate b
133  | b < 128   = chr (fromIntegral b)
134      -- Disallow 'smuggling' of ASCII bytes. For roundtripping to
135      -- work, this assumes encoding is ASCII-superset.
136  | otherwise = chr (0xDC00 + fromIntegral b)
137
138-- Lone surrogates (in Buffer CharBufElem) --> bytes (in Buffer Word8)
139{-# INLINE unescapeRoundtripCharacterSurrogate #-}
140unescapeRoundtripCharacterSurrogate :: Char -> Maybe Word8
141unescapeRoundtripCharacterSurrogate c
142    | 0xDC80 <= x && x < 0xDD00 = Just (fromIntegral x) -- Discard high byte
143    | otherwise                 = Nothing
144  where x = ord c
145
146recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char
147              -> IO (Buffer Word8, Buffer Char)
148recoverDecode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
149                  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
150 --puts $ "recoverDecode " ++ show ir
151 case cfm of
152  ErrorOnCodingFailure       -> ioe_decodingError
153  IgnoreCodingFailure        -> return (input { bufL=ir+1 }, output)
154  TransliterateCodingFailure -> do
155      ow' <- writeCharBuf oraw ow unrepresentableChar
156      return (input { bufL=ir+1 }, output { bufR=ow' })
157  RoundtripFailure           -> do
158      b <- readWord8Buf iraw ir
159      ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b)
160      return (input { bufL=ir+1 }, output { bufR=ow' })
161
162recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8
163              -> IO (Buffer Char, Buffer Word8)
164recoverEncode cfm input@Buffer{  bufRaw=iraw, bufL=ir, bufR=_  }
165                  output@Buffer{ bufRaw=oraw, bufL=_,  bufR=ow } = do
166  (c,ir') <- readCharBuf iraw ir
167  --puts $ "recoverEncode " ++ show ir ++ " " ++ show ir'
168  case cfm of
169    IgnoreCodingFailure        -> return (input { bufL=ir' }, output)
170    TransliterateCodingFailure -> do
171        if c == '?'
172         then return (input { bufL=ir' }, output)
173         else do
174          -- XXX: evil hack! To implement transliteration, we just
175          -- poke an ASCII ? into the input buffer and tell the caller
176          -- to try and decode again. This is *probably* safe given
177          -- current uses of TextEncoding.
178          --
179          -- The "if" test above ensures we skip if the encoding fails
180          -- to deal with the ?, though this should never happen in
181          -- practice as all encodings are in fact capable of
182          -- reperesenting all ASCII characters.
183          _ir' <- writeCharBuf iraw ir '?'
184          return (input, output)
185
186        -- This implementation does not work because e.g. UTF-16
187        -- requires 2 bytes to encode a simple ASCII value
188        --writeWord8Buf oraw ow unrepresentableByte
189        --return (input { bufL=ir' }, output { bufR=ow+1 })
190    RoundtripFailure | Just x <- unescapeRoundtripCharacterSurrogate c -> do
191        writeWord8Buf oraw ow x
192        return (input { bufL=ir' }, output { bufR=ow+1 })
193    _                          -> ioe_encodingError
194
195ioe_decodingError :: IO a
196ioe_decodingError = ioException
197    (IOError Nothing InvalidArgument "recoverDecode"
198        "invalid byte sequence" Nothing Nothing)
199
200ioe_encodingError :: IO a
201ioe_encodingError = ioException
202    (IOError Nothing InvalidArgument "recoverEncode"
203        "invalid character" Nothing Nothing)
204
205