1-- |
2-- Module      : Basement.String.Encoding.Encoding
3-- License     : BSD-style
4-- Maintainer  : Foundation
5-- Stability   : experimental
6-- Portability : portable
7--
8
9{-# LANGUAGE FlexibleContexts #-}
10
11module Basement.String.Encoding.Encoding
12    ( Encoding(..)
13    , convertFromTo
14    ) where
15
16import           Basement.Compat.Base
17import           Basement.Types.OffsetSize
18import           Basement.Monad
19import           Basement.PrimType
20import           Basement.MutableBuilder
21import           Basement.Numerical.Additive
22import           Basement.UArray (UArray)
23import           Basement.UArray.Mutable (MUArray)
24import qualified Basement.UArray as Vec
25
26class Encoding encoding where
27    -- | the unit element use for the encoding.
28    -- i.e. Word8 for ASCII7 or UTF8, Word16 for UTF16...
29    --
30    type Unit encoding
31
32    -- | define the type of error handling you want to use for the
33    -- next function.
34    --
35    -- > type Error UTF8 = Either UTF8_Invalid
36    --
37    type Error encoding
38
39    -- | consume an `Unit encoding` and return the Unicode point and the position
40    -- of the next possible `Unit encoding`
41    --
42    encodingNext :: encoding
43                      -- ^ only used for type deduction
44                -> (Offset (Unit encoding) -> Unit encoding)
45                      -- ^ method to access a given `Unit encoding`
46                      -- (see `unsafeIndexer`)
47                -> Offset (Unit encoding)
48                      -- ^ offset of the `Unit encoding` where starts the
49                      -- encoding of a given unicode
50                -> Either (Error encoding) (Char, Offset (Unit encoding)) -- ^ either successfully validated the `Unit encoding`
51                      -- and returned the next offset or fail with an
52                      -- `Error encoding`
53
54    -- Write a unicode point encoded into one or multiple `Unit encoding`
55    --
56    -- > build 64 $ sequence_ (write UTF8) "this is a simple list of char..."
57    --
58    encodingWrite :: (PrimMonad st, Monad st)
59                  => encoding
60                      -- ^ only used for type deduction
61                  -> Char
62                      -- ^ the unicode character to encode
63                  -> Builder (UArray (Unit encoding))
64                             (MUArray (Unit encoding))
65                             (Unit encoding) st err ()
66
67-- | helper to convert a given Array in a given encoding into an array
68-- with another encoding.
69--
70-- This is a helper to convert from one String encoding to another.
71-- This function is (quite) slow and needs some work.
72--
73-- ```
74-- let s16 = ... -- string in UTF16
75-- -- create s8, a UTF8 String
76-- let s8  = runST $ convertWith UTF16 UTF8 (toBytes s16)
77--
78-- print s8
79-- ```
80--
81convertFromTo :: ( PrimMonad st, Monad st
82                 , Encoding input, PrimType (Unit input)
83                 , Encoding output, PrimType (Unit output)
84                 )
85              => input
86                -- ^ Input's encoding type
87              -> output
88                -- ^ Output's encoding type
89              -> UArray (Unit input)
90                -- ^ the input raw array
91              -> st (Either (Offset (Unit input), Error input) (UArray (Unit output)))
92convertFromTo inputEncodingTy outputEncodingTy bytes
93    | Vec.null bytes = return . return $ mempty
94    | otherwise      = Vec.unsafeIndexer bytes $ \t -> Vec.builderBuild 64 (loop azero t)
95  where
96    lastUnit = Vec.length bytes
97
98    loop off getter
99      | off .==# lastUnit = return ()
100      | otherwise = case encodingNext inputEncodingTy getter off of
101          Left err -> mFail (off, err)
102          Right (c, noff) -> encodingWrite outputEncodingTy c >> loop noff getter
103