1{-# LANGUAGE Trustworthy #-}
2{-# LANGUAGE NoImplicitPrelude, ExistentialQuantification #-}
3{-# OPTIONS_GHC -funbox-strict-fields #-}
4
5-----------------------------------------------------------------------------
6-- |
7-- Module      :  GHC.IO.Encoding.Types
8-- Copyright   :  (c) The University of Glasgow, 2008-2009
9-- License     :  see libraries/base/LICENSE
10--
11-- Maintainer  :  libraries@haskell.org
12-- Stability   :  internal
13-- Portability :  non-portable
14--
15-- Types for text encoding/decoding
16--
17-----------------------------------------------------------------------------
18
19module GHC.IO.Encoding.Types (
20    BufferCodec(..),
21    TextEncoding(..),
22    TextEncoder, TextDecoder,
23    CodeBuffer, EncodeBuffer, DecodeBuffer,
24    CodingProgress(..)
25  ) where
26
27import GHC.Base
28import GHC.Word
29import GHC.Show
30-- import GHC.IO
31import GHC.IO.Buffer
32
33-- -----------------------------------------------------------------------------
34-- Text encoders/decoders
35
36data BufferCodec from to state = BufferCodec {
37  encode :: CodeBuffer from to,
38   -- ^ The @encode@ function translates elements of the buffer @from@
39   -- to the buffer @to@.  It should translate as many elements as possible
40   -- given the sizes of the buffers, including translating zero elements
41   -- if there is either not enough room in @to@, or @from@ does not
42   -- contain a complete multibyte sequence.
43   --
44   -- If multiple CodingProgress returns are possible, OutputUnderflow must be
45   -- preferred to InvalidSequence. This allows GHC's IO library to assume that
46   -- if we observe InvalidSequence there is at least a single element available
47   -- in the output buffer.
48   --
49   -- The fact that as many elements as possible are translated is used by the IO
50   -- library in order to report translation errors at the point they
51   -- actually occur, rather than when the buffer is translated.
52
53  recover :: Buffer from -> Buffer to -> IO (Buffer from, Buffer to),
54   -- ^ The @recover@ function is used to continue decoding
55   -- in the presence of invalid or unrepresentable sequences. This includes
56   -- both those detected by @encode@ returning @InvalidSequence@ and those
57   -- that occur because the input byte sequence appears to be truncated.
58   --
59   -- Progress will usually be made by skipping the first element of the @from@
60   -- buffer. This function should only be called if you are certain that you
61   -- wish to do this skipping and if the @to@ buffer has at least one element
62   -- of free space. Because this function deals with decoding failure, it assumes
63   -- that the from buffer has at least one element.
64   --
65   -- @recover@ may raise an exception rather than skipping anything.
66   --
67   -- Currently, some implementations of @recover@ may mutate the input buffer.
68   -- In particular, this feature is used to implement transliteration.
69   --
70   -- @since 4.4.0.0
71
72  close  :: IO (),
73   -- ^ Resources associated with the encoding may now be released.
74   -- The @encode@ function may not be called again after calling
75   -- @close@.
76
77  getState :: IO state,
78   -- ^ Return the current state of the codec.
79   --
80   -- Many codecs are not stateful, and in these case the state can be
81   -- represented as '()'.  Other codecs maintain a state.  For
82   -- example, UTF-16 recognises a BOM (byte-order-mark) character at
83   -- the beginning of the input, and remembers thereafter whether to
84   -- use big-endian or little-endian mode.  In this case, the state
85   -- of the codec would include two pieces of information: whether we
86   -- are at the beginning of the stream (the BOM only occurs at the
87   -- beginning), and if not, whether to use the big or little-endian
88   -- encoding.
89
90  setState :: state -> IO ()
91   -- restore the state of the codec using the state from a previous
92   -- call to 'getState'.
93 }
94
95type CodeBuffer from to = Buffer from -> Buffer to -> IO (CodingProgress, Buffer from, Buffer to)
96type DecodeBuffer = CodeBuffer Word8 Char
97type EncodeBuffer = CodeBuffer Char Word8
98
99type TextDecoder state = BufferCodec Word8 CharBufElem state
100type TextEncoder state = BufferCodec CharBufElem Word8 state
101
102-- | A 'TextEncoding' is a specification of a conversion scheme
103-- between sequences of bytes and sequences of Unicode characters.
104--
105-- For example, UTF-8 is an encoding of Unicode characters into a sequence
106-- of bytes.  The 'TextEncoding' for UTF-8 is 'System.IO.utf8'.
107data TextEncoding
108  = forall dstate estate . TextEncoding  {
109        textEncodingName :: String,
110                   -- ^ a string that can be passed to 'System.IO.mkTextEncoding' to
111                   -- create an equivalent 'TextEncoding'.
112        mkTextDecoder :: IO (TextDecoder dstate),
113                   -- ^ Creates a means of decoding bytes into characters: the result must not
114                   -- be shared between several byte sequences or simultaneously across threads
115        mkTextEncoder :: IO (TextEncoder estate)
116                   -- ^ Creates a means of encode characters into bytes: the result must not
117                   -- be shared between several character sequences or simultaneously across threads
118  }
119
120-- | @since 4.3.0.0
121instance Show TextEncoding where
122  -- | Returns the value of 'textEncodingName'
123  show te = textEncodingName te
124
125-- | @since 4.4.0.0
126data CodingProgress = InputUnderflow  -- ^ Stopped because the input contains insufficient available elements,
127                                      -- or all of the input sequence has been successfully translated.
128                    | OutputUnderflow -- ^ Stopped because the output contains insufficient free elements
129                    | InvalidSequence -- ^ Stopped because there are sufficient free elements in the output
130                                      -- to output at least one encoded ASCII character, but the input contains
131                                      -- an invalid or unrepresentable sequence
132                    deriving ( Eq   -- ^ @since 4.4.0.0
133                             , Show -- ^ @since 4.4.0.0
134                             )
135
136