1{-# LANGUAGE Trustworthy #-} 2{-# LANGUAGE NoImplicitPrelude, ExistentialQuantification #-} 3{-# OPTIONS_GHC -funbox-strict-fields #-} 4 5----------------------------------------------------------------------------- 6-- | 7-- Module : GHC.IO.Encoding.Types 8-- Copyright : (c) The University of Glasgow, 2008-2009 9-- License : see libraries/base/LICENSE 10-- 11-- Maintainer : libraries@haskell.org 12-- Stability : internal 13-- Portability : non-portable 14-- 15-- Types for text encoding/decoding 16-- 17----------------------------------------------------------------------------- 18 19module GHC.IO.Encoding.Types ( 20 BufferCodec(..), 21 TextEncoding(..), 22 TextEncoder, TextDecoder, 23 CodeBuffer, EncodeBuffer, DecodeBuffer, 24 CodingProgress(..) 25 ) where 26 27import GHC.Base 28import GHC.Word 29import GHC.Show 30-- import GHC.IO 31import GHC.IO.Buffer 32 33-- ----------------------------------------------------------------------------- 34-- Text encoders/decoders 35 36data BufferCodec from to state = BufferCodec { 37 encode :: CodeBuffer from to, 38 -- ^ The @encode@ function translates elements of the buffer @from@ 39 -- to the buffer @to@. It should translate as many elements as possible 40 -- given the sizes of the buffers, including translating zero elements 41 -- if there is either not enough room in @to@, or @from@ does not 42 -- contain a complete multibyte sequence. 43 -- 44 -- If multiple CodingProgress returns are possible, OutputUnderflow must be 45 -- preferred to InvalidSequence. This allows GHC's IO library to assume that 46 -- if we observe InvalidSequence there is at least a single element available 47 -- in the output buffer. 48 -- 49 -- The fact that as many elements as possible are translated is used by the IO 50 -- library in order to report translation errors at the point they 51 -- actually occur, rather than when the buffer is translated. 52 53 recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to), 54 -- ^ The @recover@ function is used to continue decoding 55 -- in the presence of invalid or unrepresentable sequences. This includes 56 -- both those detected by @encode@ returning @InvalidSequence@ and those 57 -- that occur because the input byte sequence appears to be truncated. 58 -- 59 -- Progress will usually be made by skipping the first element of the @from@ 60 -- buffer. This function should only be called if you are certain that you 61 -- wish to do this skipping and if the @to@ buffer has at least one element 62 -- of free space. Because this function deals with decoding failure, it assumes 63 -- that the from buffer has at least one element. 64 -- 65 -- @recover@ may raise an exception rather than skipping anything. 66 -- 67 -- Currently, some implementations of @recover@ may mutate the input buffer. 68 -- In particular, this feature is used to implement transliteration. 69 -- 70 -- @since 4.4.0.0 71 72 close :: IO (), 73 -- ^ Resources associated with the encoding may now be released. 74 -- The @encode@ function may not be called again after calling 75 -- @close@. 76 77 getState :: IO state, 78 -- ^ Return the current state of the codec. 79 -- 80 -- Many codecs are not stateful, and in these case the state can be 81 -- represented as '()'. Other codecs maintain a state. For 82 -- example, UTF-16 recognises a BOM (byte-order-mark) character at 83 -- the beginning of the input, and remembers thereafter whether to 84 -- use big-endian or little-endian mode. In this case, the state 85 -- of the codec would include two pieces of information: whether we 86 -- are at the beginning of the stream (the BOM only occurs at the 87 -- beginning), and if not, whether to use the big or little-endian 88 -- encoding. 89 90 setState :: state -> IO () 91 -- restore the state of the codec using the state from a previous 92 -- call to 'getState'. 93 } 94 95type CodeBuffer from to = Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to) 96type DecodeBuffer = CodeBuffer Word8 Char 97type EncodeBuffer = CodeBuffer Char Word8 98 99type TextDecoder state = BufferCodec Word8 CharBufElem state 100type TextEncoder state = BufferCodec CharBufElem Word8 state 101 102-- | A 'TextEncoding' is a specification of a conversion scheme 103-- between sequences of bytes and sequences of Unicode characters. 104-- 105-- For example, UTF-8 is an encoding of Unicode characters into a sequence 106-- of bytes. The 'TextEncoding' for UTF-8 is 'System.IO.utf8'. 107data TextEncoding 108 = forall dstate estate . TextEncoding { 109 textEncodingName :: String, 110 -- ^ a string that can be passed to 'System.IO.mkTextEncoding' to 111 -- create an equivalent 'TextEncoding'. 112 mkTextDecoder :: IO (TextDecoder dstate), 113 -- ^ Creates a means of decoding bytes into characters: the result must not 114 -- be shared between several byte sequences or simultaneously across threads 115 mkTextEncoder :: IO (TextEncoder estate) 116 -- ^ Creates a means of encode characters into bytes: the result must not 117 -- be shared between several character sequences or simultaneously across threads 118 } 119 120-- | @since 4.3.0.0 121instance Show TextEncoding where 122 -- | Returns the value of 'textEncodingName' 123 show te = textEncodingName te 124 125-- | @since 4.4.0.0 126data CodingProgress = InputUnderflow -- ^ Stopped because the input contains insufficient available elements, 127 -- or all of the input sequence has been successfully translated. 128 | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements 129 | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output 130 -- to output at least one encoded ASCII character, but the input contains 131 -- an invalid or unrepresentable sequence 132 deriving ( Eq -- ^ @since 4.4.0.0 133 , Show -- ^ @since 4.4.0.0 134 ) 135 136