1{-# LANGUAGE BangPatterns               #-}
2{-# LANGUAGE CPP                        #-}
3{-# LANGUAGE DeriveDataTypeable         #-}
4{-# LANGUAGE GeneralizedNewtypeDeriving #-}
5{-# LANGUAGE MagicHash                  #-}
6{-# LANGUAGE RankNTypes                 #-}
7{-# LANGUAGE TypeFamilies               #-}
8{-# LANGUAGE UnboxedTuples              #-}
9{-# LANGUAGE UnliftedFFITypes           #-}
10{-# LANGUAGE Unsafe                     #-}
11{-# LANGUAGE ViewPatterns               #-}
12
13-- |
14-- Module      : Data.Text.Short.Internal
15-- Copyright   : © Herbert Valerio Riedel 2017
16-- License     : BSD3
17--
18-- Maintainer  : hvr@gnu.org
19-- Stability   : stable
20--
21-- Memory-efficient representation of Unicode text strings.
22--
23-- @since 0.1
24module Data.Text.Short.Internal
25    ( -- * The 'ShortText' type
26      ShortText(..)
27
28      -- * Basic operations
29    , null
30    , length
31    , isAscii
32    , splitAt
33    , splitAtEnd
34    , indexEndMaybe
35    , indexMaybe
36    , isPrefixOf
37    , stripPrefix
38    , isSuffixOf
39    , stripSuffix
40
41    , cons
42    , snoc
43    , uncons
44    , unsnoc
45
46    , findIndex
47    , find
48    , all
49
50    , span
51    , spanEnd
52    , split
53
54    , intersperse
55    , intercalate
56    , reverse
57    , replicate
58
59    , filter
60    , dropAround
61
62    , foldl
63    , foldl'
64    , foldr
65    , foldl1
66    , foldl1'
67    , foldr1
68
69      -- * Conversions
70      -- ** 'Char'
71    , singleton
72
73      -- ** 'String'
74    , Data.Text.Short.Internal.fromString
75    , toString
76
77      -- ** 'T.Text'
78    , fromText
79    , toText
80
81      -- ** 'BS.ByteString'
82    , fromShortByteString
83    , fromShortByteStringUnsafe
84    , toShortByteString
85
86    , fromByteString
87    , fromByteStringUnsafe
88    , toByteString
89
90    , toBuilder
91
92      -- * misc
93      -- ** For Haddock
94
95    , BS.ByteString
96    , T.Text
97    , module Prelude
98
99      -- ** Internals
100    , isValidUtf8
101    ) where
102
103import           Control.DeepSeq                (NFData)
104import           Control.Monad.ST               (stToIO)
105import           Data.Binary
106import           Data.Bits
107import qualified Data.ByteString                as BS
108import qualified Data.ByteString.Builder        as BB
109import           Data.ByteString.Short          (ShortByteString)
110import qualified Data.ByteString.Short          as BSS
111import qualified Data.ByteString.Short.Internal as BSSI
112import           Data.Char                      (ord)
113import           Data.Data                      (Data(..),constrIndex, Constr,
114                                                 mkConstr, DataType, mkDataType,
115                                                 Fixity(Prefix))
116import           Data.Hashable                  (Hashable)
117import           Data.Typeable                  (Typeable)
118import qualified Data.List                      as List
119import           Data.Maybe                     (fromMaybe, isNothing)
120import           Data.Semigroup
121import qualified Data.String                    as S
122import qualified Data.Text                      as T
123import qualified Data.Text.Encoding             as T
124import           Foreign.C
125import           GHC.Base                       (assert, unsafeChr)
126import qualified GHC.CString                    as GHC
127import           GHC.Exts                       (Addr#, ByteArray#, Int (I#),
128                                                 Int#, MutableByteArray#,
129                                                 Ptr (..), RealWorld, Word (W#))
130import qualified GHC.Exts
131import qualified GHC.Foreign                    as GHC
132import           GHC.IO.Encoding
133import           GHC.ST
134import           Prelude                        hiding (all, any, break, concat,
135                                                 drop, dropWhile, filter, foldl,
136                                                 foldl1, foldr, foldr1, head,
137                                                 init, last, length, null,
138                                                 replicate, reverse, span,
139                                                 splitAt, tail, take, takeWhile)
140import           System.IO.Unsafe
141import           Text.Printf                    (PrintfArg, formatArg,
142                                                 formatString)
143
144import qualified PrimOps
145
146-- | A compact representation of Unicode strings.
147--
148-- A 'ShortText' value is a sequence of Unicode scalar values, as defined in
149-- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >;
150-- This means that a 'ShortText' is a list of (scalar) Unicode code-points (i.e. code-points in the range @[U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]@).
151--
152-- This type relates to 'T.Text' as 'ShortByteString' relates to 'BS.ByteString' by providing a more compact type. Please consult the documentation of "Data.ByteString.Short" for more information.
153--
154-- Currently, a boxed unshared 'T.Text' has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 2 or 4 bytes per code-point (due to the internal UTF-16 representation). Each 'T.Text' value which can share its payload with another 'T.Text' requires only 4 words additionally. Unlike 'BS.ByteString', 'T.Text' use unpinned memory.
155--
156-- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
157-- It can be shown that for realistic data <http://utf8everywhere.org/#asian UTF-16 has a space overhead of 50% over UTF-8>.
158--
159-- __NOTE__: The `Typeable` instance isn't defined for GHC 7.8 (and older) prior to @text-short-0.1.3@
160--
161-- @since 0.1
162newtype ShortText = ShortText ShortByteString
163                  deriving (Hashable,Monoid,NFData,Data.Semigroup.Semigroup,Typeable)
164
165-- | It exposes a similar 'Data' instance abstraction as 'T.Text' (see
166-- discussion referenced there for more details), preserving the
167-- @[Char]@ data abstraction at the cost of inefficiency.
168--
169-- @since 0.1.3
170instance Data ShortText where
171  gfoldl f z txt = z fromString `f` (toString txt)
172  toConstr _ = packConstr
173  gunfold k z c = case constrIndex c of
174    1 -> k (z fromString)
175    _ -> error "gunfold"
176  dataTypeOf _ = shortTextDataType
177
178packConstr :: Constr
179packConstr = mkConstr shortTextDataType "fromString" [] Prefix
180
181shortTextDataType :: DataType
182shortTextDataType = mkDataType "Data.Text.Short" [packConstr]
183
184instance Eq ShortText where
185  {-# INLINE (==) #-}
186  (==) x y
187    | lx /= ly  = False
188    | lx ==  0  = True
189    | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of
190                    0# -> True
191                    _  -> False
192    where
193      !lx@(I# n#) = toLength x
194      !ly = toLength y
195
196instance Ord ShortText where
197  compare t1 t2
198    | n == 0  = compare n1 n2
199    | otherwise = case PrimOps.compareByteArrays# ba1# 0# ba2# 0# n# of
200        r# | I# r# < 0 -> LT
201           | I# r# > 0 -> GT
202           | n1 < n2   -> LT
203           | n1 > n2   -> GT
204           | otherwise -> EQ
205    where
206      ba1# = toByteArray# t1
207      ba2# = toByteArray# t2
208      !n1 = toLength t1
209      !n2 = toLength t2
210      !n@(I# n#) = n1 `min` n2
211
212instance Show ShortText where
213    showsPrec p (ShortText b) = showsPrec p (decodeStringShort' utf8 b)
214    show (ShortText b)        = show        (decodeStringShort' utf8 b)
215
216instance Read ShortText where
217    readsPrec p = map (\(x,s) -> (ShortText $ encodeStringShort utf8 x,s)) . readsPrec p
218
219-- | @since 0.1.2
220instance PrintfArg ShortText where
221  formatArg txt = formatString $ toString txt
222
223-- | The 'Binary' encoding matches the one for 'T.Text'
224#if MIN_VERSION_binary(0,8,1)
225instance Binary ShortText where
226    put = put . toShortByteString
227    get = do
228        sbs <- get
229        case fromShortByteString sbs of
230          Nothing -> fail "Binary.get(ShortText): Invalid UTF-8 stream"
231          Just st -> return st
232#else
233-- fallback via 'ByteString' instance
234instance Binary ShortText where
235    put = put . toByteString
236    get = do
237        bs <- get
238        case fromByteString bs of
239          Nothing -> fail "Binary.get(ShortText): Invalid UTF-8 stream"
240          Just st -> return st
241#endif
242
243-- | \(\mathcal{O}(1)\) Test whether a 'ShortText' is empty.
244--
245-- >>> null ""
246-- True
247--
248-- prop> null (singleton c) == False
249--
250-- prop> null t == (length t == 0)
251--
252-- @since 0.1
253null :: ShortText -> Bool
254null = BSS.null . toShortByteString
255
256-- | \(\mathcal{O}(n)\) Count the number of Unicode code-points in a 'ShortText'.
257--
258-- >>> length "abcd€"
259-- 5
260--
261-- >>> length ""
262-- 0
263--
264-- prop> length t >= 0
265--
266-- @since 0.1
267length :: ShortText -> Int
268length st = fromIntegral $ unsafeDupablePerformIO (c_text_short_length (toByteArray# st) (toCSize st))
269
270foreign import ccall unsafe "hs_text_short_length" c_text_short_length :: ByteArray# -> CSize -> IO CSize
271
272-- | \(\mathcal{O}(n)\) Test whether 'ShortText' contains only ASCII code-points (i.e. only U+0000 through U+007F).
273--
274-- This is a more efficient version of @'all' 'Data.Char.isAscii'@.
275--
276-- >>> isAscii ""
277-- True
278--
279-- >>> isAscii "abc\NUL"
280-- True
281--
282-- >>> isAscii "abcd€"
283-- False
284--
285-- prop> isAscii t == all (< '\x80') t
286--
287-- @since 0.1
288isAscii :: ShortText -> Bool
289isAscii st = (/= 0) $ unsafeDupablePerformIO (c_text_short_is_ascii (toByteArray# st) sz)
290  where
291    sz = toCSize st
292
293foreign import ccall unsafe "hs_text_short_is_ascii" c_text_short_is_ascii :: ByteArray# -> CSize -> IO CInt
294
295-- | \(\mathcal{O}(n)\) Test whether /all/ code points in 'ShortText' satisfy a predicate.
296--
297-- >>> all (const False) ""
298-- True
299--
300-- >>> all (> 'c') "abcdabcd"
301-- False
302--
303-- >>> all (/= 'c') "abdabd"
304-- True
305--
306-- @since 0.1.2
307all :: (Char -> Bool) -> ShortText -> Bool
308all p st = isNothing (findOfs (not . p) st (B 0))
309
310-- | \(\mathcal{O}(n)\) Return the left-most codepoint in 'ShortText' that satisfies the given predicate.
311--
312-- >>> find (> 'b') "abcdabcd"
313-- Just 'c'
314--
315-- >>> find (> 'b') "ababab"
316-- Nothing
317--
318-- @since 0.1.2
319find :: (Char -> Bool) -> ShortText -> Maybe Char
320find p st = go 0
321  where
322    go !ofs
323      | ofs >= sz  = Nothing
324      | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
325                     in c `seq` ofs' `seq`
326                        if p c
327                        then Just c
328                        else go ofs'
329
330    !sz = toB st
331
332-- | \(\mathcal{O}(n)\) Return the index of the left-most codepoint in 'ShortText' that satisfies the given predicate.
333--
334-- >>> findIndex (> 'b') "abcdabcdef"
335-- Just 2
336--
337-- >>> findIndex (> 'b') "ababab"
338-- Nothing
339--
340-- prop> (indexMaybe t =<< findIndex p t) == find p t
341--
342-- @since 0.1.2
343findIndex :: (Char -> Bool) -> ShortText -> Maybe Int
344findIndex p st = go 0 0
345  where
346    go !ofs !i
347      | ofs >= sz  = Nothing
348      | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
349                     in c `seq` ofs' `seq`
350                        if p c
351                        then Just i
352                        else go ofs' (i+1)
353
354    !sz = toB st
355
356
357-- | \(\mathcal{O}(n)\) Splits a string into components delimited by separators,
358-- where the predicate returns True for a separator element.  The
359-- resulting components do not contain the separators.  Two adjacent
360-- separators result in an empty component in the output.  eg.
361--
362-- >>> split (=='a') "aabbaca"
363-- ["","","bb","c",""]
364--
365-- >>> split (=='a') ""
366-- [""]
367--
368-- prop> intercalate (singleton c) (split (== c) t) = t
369--
370-- __NOTE__: 'split' never returns an empty list to match the semantics of its counterpart from "Data.Text".
371--
372-- @since 0.1.3
373split :: (Char -> Bool) -> ShortText -> [ShortText]
374split p st0 = go 0
375  where
376    go !ofs0 = case findOfs' p st0 ofs0 of
377      Just (ofs1,ofs2) -> slice st0 ofs0 (ofs1-ofs0) : go ofs2
378      Nothing
379        | ofs0 == 0 -> st0 : []
380        | otherwise -> slice st0 ofs0 (maxOfs-ofs0) : []
381
382    !maxOfs = toB st0
383
384-- internal helper
385{-# INLINE findOfs #-}
386findOfs :: (Char -> Bool) -> ShortText -> B -> Maybe B
387findOfs p st = go
388  where
389    go :: B -> Maybe B
390    go !ofs | ofs >= sz = Nothing
391    go !ofs | p c       = Just ofs
392            | otherwise = go ofs'
393      where
394        (c,ofs') = decodeCharAtOfs st ofs
395
396    !sz = toB st
397
398{-# INLINE findOfs' #-}
399findOfs' :: (Char -> Bool) -> ShortText -> B -> Maybe (B,B)
400findOfs' p st = go
401  where
402    go :: B -> Maybe (B,B)
403    go !ofs | ofs >= sz = Nothing
404    go !ofs | p c       = Just (ofs,ofs')
405            | otherwise = go ofs'
406      where
407        (c,ofs') = decodeCharAtOfs st ofs
408
409    !sz = toB st
410
411
412{-# INLINE findOfsRev #-}
413findOfsRev :: (Char -> Bool) -> ShortText -> B -> Maybe B
414findOfsRev p st = go
415  where
416    go (B 0) = Nothing
417    go !ofs
418      | p (cp2ch cp) = Just ofs
419      | otherwise    = go (ofs-cpLen cp)
420      where
421        !cp = readCodePointRev st ofs
422
423-- | \(\mathcal{O}(n)\) Split 'ShortText' into longest prefix satisfying the given predicate and the remaining suffix.
424--
425-- >>> span (< 'c') "abcdabcd"
426-- ("ab","cdabcd")
427--
428-- prop> fst (span p t) <> snd (span p t) == t
429--
430-- @since 0.1.2
431span :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
432span p st
433  | Just ofs <- findOfs (not . p) st (B 0) = splitAtOfs ofs st
434  | otherwise = (st,mempty)
435
436-- | \(\mathcal{O}(n)\) Split 'ShortText' into longest suffix satisfying the given predicate and the preceding prefix.
437--
438-- >>> spanEnd (> 'c') "abcdabcd"
439-- ("abcdabc","d")
440--
441-- prop> fst (spanEnd p t) <> snd (spanEnd p t) == t
442--
443-- @since 0.1.2
444spanEnd :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
445spanEnd p st
446  | Just ofs <- findOfsRev (not . p) st (toB st) = splitAtOfs ofs st
447  | otherwise = (mempty,st)
448
449----------------------------------------------------------------------------
450
451toCSize :: ShortText -> CSize
452toCSize = fromIntegral . BSS.length . toShortByteString
453
454toB :: ShortText -> B
455toB = fromIntegral . BSS.length . toShortByteString
456
457toLength :: ShortText -> Int
458toLength st = I# (toLength# st)
459
460toLength# :: ShortText -> Int#
461toLength# st = GHC.Exts.sizeofByteArray# (toByteArray# st)
462
463toByteArray# :: ShortText -> ByteArray#
464toByteArray# (ShortText (BSSI.SBS ba#)) = ba#
465
466-- | \(\mathcal{O}(0)\) Converts to UTF-8 encoded 'ShortByteString'
467--
468-- This operation has effectively no overhead, as it's currently merely a @newtype@-cast.
469--
470-- @since 0.1
471toShortByteString :: ShortText -> ShortByteString
472toShortByteString (ShortText b) = b
473
474-- | \(\mathcal{O}(n)\) Converts to UTF-8 encoded 'BS.ByteString'
475--
476-- @since 0.1
477toByteString :: ShortText -> BS.ByteString
478toByteString = BSS.fromShort . toShortByteString
479
480-- | Construct a 'BB.Builder' that encodes 'ShortText' as UTF-8.
481--
482-- @since 0.1
483toBuilder :: ShortText -> BB.Builder
484toBuilder = BB.shortByteString . toShortByteString
485
486-- | \(\mathcal{O}(n)\) Convert to 'String'
487--
488-- prop> (fromString . toString) t == t
489--
490-- __Note__: See documentation of 'fromString' for why @('toString' . 'fromString')@ is not an identity function.
491--
492-- @since 0.1
493toString :: ShortText -> String
494-- NOTE: impl below beats
495--   toString = decodeStringShort' utf8 . toShortByteString
496-- except for smallish strings
497toString st = go 0
498  where
499    go !ofs
500      | ofs >= sz  = []
501      | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
502                     in c `seq` ofs' `seq` (c : go ofs')
503
504    !sz = toB st
505
506----------------------------------------------------------------------------
507-- Folds
508
509-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with
510-- the binary operator and an initial in forward direction (i.e. from
511-- left to right).
512--
513-- >>> foldl (\_ _ -> True) False ""
514-- False
515--
516-- >>> foldl (\s c -> c : s) ['.'] "abcd"
517-- "dcba."
518--
519-- @since 0.1.2
520foldl :: (a -> Char -> a) -> a -> ShortText -> a
521foldl f z st = go 0 z
522  where
523    go !ofs acc
524      | ofs >= sz  = acc
525      | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
526                     in c `seq` ofs' `seq` go ofs' (f acc c)
527
528    !sz = toB st
529
530-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator.
531--
532-- >>> foldl1 max "abcdcba"
533-- 'd'
534--
535-- >>> foldl1 const "abcd"
536-- 'a'
537--
538-- >>> foldl1 (flip const) "abcd"
539-- 'd'
540--
541-- __Note__: Will throw an 'error' exception if index is out of bounds.
542--
543-- @since 0.1.2
544foldl1 :: (Char -> Char -> Char) -> ShortText -> Char
545foldl1 f st
546  | sz == 0    = error "foldl1: empty ShortText"
547  | otherwise  = go c0sz c0
548  where
549    go !ofs acc
550      | ofs >= sz  = acc
551      | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
552                     in c `seq` ofs' `seq` go ofs' (f acc c)
553    !sz = toB st
554    (c0,c0sz) = decodeCharAtOfs st (B 0)
555
556-- | \(\mathcal{O}(n)\) Strict version of 'foldl'.
557--
558-- @since 0.1.2
559foldl' :: (a -> Char -> a) -> a -> ShortText -> a
560foldl' f !z st = go 0 z
561  where
562    go !ofs !acc
563      | ofs >= sz  = acc
564      | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
565                     in c `seq` ofs' `seq` go ofs' (f acc c)
566
567    !sz = toB st
568
569-- | \(\mathcal{O}(n)\) Strict version of 'foldl1'.
570--
571-- @since 0.1.2
572foldl1' :: (Char -> Char -> Char) -> ShortText -> Char
573foldl1' f st
574  | sz == 0    = error "foldl1: empty ShortText"
575  | otherwise  = go c0sz c0
576  where
577    go !ofs !acc
578      | ofs >= sz  = acc
579      | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
580                     in c `seq` ofs' `seq` go ofs' (f acc c)
581    !sz = toB st
582    (c0,c0sz) = decodeCharAtOfs st (B 0)
583
584-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with
585-- the binary operator and an initial in reverse direction (i.e. from
586-- right to left).
587--
588-- >>> foldr (\_ _ -> True) False ""
589-- False
590--
591-- >>> foldr (:) ['.'] "abcd"
592-- "abcd."
593--
594-- @since 0.1.2
595foldr :: (Char -> a -> a) -> a -> ShortText -> a
596foldr f z st = go 0
597  where
598    go !ofs
599      | ofs >= sz  = z
600      | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
601                     in c `seq` ofs' `seq` f c (go ofs')
602
603    !sz = toB st
604
605-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator.
606--
607-- >>> foldr1 max "abcdcba"
608-- 'd'
609--
610-- >>> foldr1 const "abcd"
611-- 'a'
612--
613-- >>> foldr1 (flip const) "abcd"
614-- 'd'
615--
616-- __Note__: Will throw an 'error' exception if index is out of bounds.
617--
618-- @since 0.1.2
619foldr1 :: (Char -> Char -> Char) -> ShortText -> Char
620foldr1 f st
621  | sz == 0    = error "foldr1: empty ShortText"
622  | otherwise  = go 0
623  where
624    go !ofs = let (c,ofs') = decodeCharAtOfs st ofs
625              in c `seq` ofs' `seq`
626                 (if ofs' >= sz then c else f c (go ofs'))
627
628    !sz = toB st
629
630-- | \(\mathcal{O}(n)\) Convert to 'T.Text'
631--
632-- prop> (fromText . toText) t == t
633--
634-- prop> (toText . fromText) t == t
635--
636-- This is currently not \(\mathcal{O}(1)\) because currently 'T.Text' uses UTF-16 as its internal representation.
637-- In the event that 'T.Text' will change its internal representation to UTF-8 this operation will become \(\mathcal{O}(1)\).
638--
639-- @since 0.1
640toText :: ShortText -> T.Text
641toText = T.decodeUtf8 . toByteString
642
643----
644
645-- | \(\mathcal{O}(n)\) Construct/pack from 'String'
646--
647-- >>> fromString []
648-- ""
649--
650-- >>> fromString ['a','b','c']
651-- "abc"
652--
653-- >>> fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
654-- "\55295\65533\65533\57344"
655--
656-- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
657--
658-- @since 0.1
659fromString :: String -> ShortText
660fromString []  = mempty
661fromString [c] = singleton c
662fromString s = ShortText . encodeStringShort utf8 . map r $ s
663  where
664    r c | isSurr (ord c) = '\xFFFD'
665        | otherwise      = c
666
667-- | \(\mathcal{O}(n)\) Construct 'ShortText' from 'T.Text'
668--
669-- This is currently not \(\mathcal{O}(1)\) because currently 'T.Text' uses UTF-16 as its internal representation.
670-- In the event that 'T.Text' will change its internal representation to UTF-8 this operation will become \(\mathcal{O}(1)\).
671--
672-- @since 0.1
673fromText :: T.Text -> ShortText
674fromText = fromByteStringUnsafe . T.encodeUtf8
675
676-- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'ShortByteString'
677--
678-- This operation doesn't copy the input 'ShortByteString' but it
679-- cannot be \(\mathcal{O}(1)\) because we need to validate the UTF-8 encoding.
680--
681-- Returns 'Nothing' in case of invalid UTF-8 encoding.
682--
683-- >>> fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A
684-- Just "\NUL8\66330"
685--
686-- >>> fromShortByteString "\xC0\x80" -- invalid denormalised U+00
687-- Nothing
688--
689-- >>> fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point)
690-- Nothing
691--
692-- >>> fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF
693-- Just "\1114111"
694--
695-- >>> fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid)
696-- Nothing
697--
698-- prop> fromShortByteString (toShortByteString t) == Just t
699--
700-- @since 0.1
701fromShortByteString :: ShortByteString -> Maybe ShortText
702fromShortByteString sbs
703  | isValidUtf8 st  = Just st
704  | otherwise       = Nothing
705  where
706    st = ShortText sbs
707
708-- | \(\mathcal{O}(0)\) Construct 'ShortText' from UTF-8 encoded 'ShortByteString'
709--
710-- This operation has effectively no overhead, as it's currently merely a @newtype@-cast.
711--
712-- __WARNING__: Unlike the safe 'fromShortByteString' conversion, this
713-- conversion is /unsafe/ as it doesn't validate the well-formedness of the
714-- UTF-8 encoding.
715--
716-- @since 0.1.1
717fromShortByteStringUnsafe :: ShortByteString -> ShortText
718fromShortByteStringUnsafe = ShortText
719
720-- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString'
721--
722-- 'fromByteString' accepts (or rejects) the same input data as 'fromShortByteString'.
723--
724-- Returns 'Nothing' in case of invalid UTF-8 encoding.
725--
726-- @since 0.1
727fromByteString :: BS.ByteString -> Maybe ShortText
728fromByteString = fromShortByteString . BSS.toShort
729
730-- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString'
731--
732-- This operation is \(\mathcal{O}(n)\) because the 'BS.ByteString' needs to be
733-- copied into an unpinned 'ByteArray#'.
734--
735-- __WARNING__: Unlike the safe 'fromByteString' conversion, this
736-- conversion is /unsafe/ as it doesn't validate the well-formedness of the
737-- UTF-8 encoding.
738--
739-- @since 0.1.1
740fromByteStringUnsafe :: BS.ByteString -> ShortText
741fromByteStringUnsafe = ShortText . BSS.toShort
742
743----------------------------------------------------------------------------
744
745encodeString :: TextEncoding -> String -> BS.ByteString
746encodeString te str = unsafePerformIO $ GHC.withCStringLen te str BS.packCStringLen
747
748-- decodeString :: TextEncoding -> BS.ByteString -> Maybe String
749-- decodeString te bs = cvtEx $ unsafePerformIO $ try $ BS.useAsCStringLen bs (GHC.peekCStringLen te)
750--   where
751--     cvtEx :: Either IOException a -> Maybe a
752--     cvtEx = either (const Nothing) Just
753
754decodeString' :: TextEncoding -> BS.ByteString -> String
755decodeString' te bs = unsafePerformIO $ BS.useAsCStringLen bs (GHC.peekCStringLen te)
756
757decodeStringShort' :: TextEncoding -> ShortByteString -> String
758decodeStringShort' te = decodeString' te . BSS.fromShort
759
760encodeStringShort :: TextEncoding -> String -> BSS.ShortByteString
761encodeStringShort te = BSS.toShort . encodeString te
762
763-- isValidUtf8' :: ShortText -> Int
764-- isValidUtf8' st = fromIntegral $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st))
765
766isValidUtf8 :: ShortText -> Bool
767isValidUtf8 st = (==0) $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st))
768
769type CCodePoint = Word
770
771foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt
772
773foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO CCodePoint
774
775-- | \(\mathcal{O}(n)\) Lookup /i/-th code-point in 'ShortText'.
776--
777-- Returns 'Nothing' if out of bounds.
778--
779-- prop> indexMaybe (singleton c) 0 == Just c
780--
781-- prop> indexMaybe t 0 == fmap fst (uncons t)
782--
783-- prop> indexMaybe mempty i == Nothing
784--
785-- @since 0.1.2
786indexMaybe :: ShortText -> Int -> Maybe Char
787indexMaybe st i
788  | i < 0      = Nothing
789  | otherwise  = cp2chSafe cp
790  where
791    cp = CP $ unsafeDupablePerformIO (c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i))
792
793-- | \(\mathcal{O}(n)\) Lookup /i/-th code-point from the end of 'ShortText'.
794--
795-- Returns 'Nothing' if out of bounds.
796--
797-- prop> indexEndMaybe (singleton c) 0 == Just c
798--
799-- prop> indexEndMaybe t 0 == fmap snd (unsnoc t)
800--
801-- prop> indexEndMaybe mempty i == Nothing
802--
803-- @since 0.1.2
804indexEndMaybe :: ShortText -> Int -> Maybe Char
805indexEndMaybe st i
806  | i < 0      = Nothing
807  | otherwise  = cp2chSafe cp
808  where
809    cp = CP $ unsafeDupablePerformIO (c_text_short_index_rev (toByteArray# st) (toCSize st) (fromIntegral i))
810
811foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev :: ByteArray# -> CSize -> CSize -> IO CCodePoint
812
813
814-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
815--
816-- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties:
817--
818-- prop> length (fst (splitAt n t)) == min (length t) (max 0 n)
819--
820-- prop> fst (splitAt n t) <> snd (splitAt n t) == t
821--
822-- >>> splitAt 2 "abcdef"
823-- ("ab","cdef")
824--
825-- >>> splitAt 10 "abcdef"
826-- ("abcdef","")
827--
828-- >>> splitAt (-1) "abcdef"
829-- ("","abcdef")
830--
831-- @since 0.1.2
832splitAt :: Int -> ShortText -> (ShortText,ShortText)
833splitAt i st
834  | i <= 0    = (mempty,st)
835  | otherwise = splitAtOfs ofs st
836  where
837    ofs   = csizeToB $
838            unsafeDupablePerformIO (c_text_short_index_ofs (toByteArray# st) stsz (fromIntegral i))
839    stsz  = toCSize st
840
841-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
842--
843-- @'splitAtEnd' n t@ returns a pair of 'ShortText' with the following properties:
844--
845-- prop> length (snd (splitAtEnd n t)) == min (length t) (max 0 n)
846--
847-- prop> fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t
848--
849-- prop> splitAtEnd n t == splitAt (length t - n) t
850--
851-- >>> splitAtEnd 2 "abcdef"
852-- ("abcd","ef")
853--
854-- >>> splitAtEnd 10 "abcdef"
855-- ("","abcdef")
856--
857-- >>> splitAtEnd (-1) "abcdef"
858-- ("abcdef","")
859--
860-- @since 0.1.2
861splitAtEnd :: Int -> ShortText -> (ShortText,ShortText)
862splitAtEnd i st
863  | i <= 0      = (st,mempty)
864  | ofs >= stsz = (mempty,st)
865  | otherwise   = splitAtOfs ofs st
866  where
867    ofs   = csizeToB $
868            unsafeDupablePerformIO (c_text_short_index_ofs_rev (toByteArray# st) (toCSize st) (fromIntegral (i-1)))
869    stsz  = toB st
870
871{-# INLINE splitAtOfs #-}
872splitAtOfs :: B -> ShortText -> (ShortText,ShortText)
873splitAtOfs ofs st
874  | ofs  == 0    = (mempty,st)
875  | ofs  >= stsz = (st,mempty)
876  | otherwise    = (slice st 0 ofs, slice st ofs (stsz-ofs))
877  where
878    !stsz  = toB st
879
880foreign import ccall unsafe "hs_text_short_index_ofs" c_text_short_index_ofs :: ByteArray# -> CSize -> CSize -> IO CSize
881
882foreign import ccall unsafe "hs_text_short_index_ofs_rev" c_text_short_index_ofs_rev :: ByteArray# -> CSize -> CSize -> IO CSize
883
884
885-- | \(\mathcal{O}(n)\) Inverse operation to 'cons'
886--
887-- Returns 'Nothing' for empty input 'ShortText'.
888--
889-- prop> uncons (cons c t) == Just (c,t)
890--
891-- >>> uncons ""
892-- Nothing
893--
894-- >>> uncons "fmap"
895-- Just ('f',"map")
896--
897-- @since 0.1.2
898uncons :: ShortText -> Maybe (Char,ShortText)
899uncons st
900  | null st    = Nothing
901  | len2 == 0  = Just (c0, mempty)
902  | otherwise  = Just (c0, slice st ofs len2)
903  where
904    c0  = cp2ch cp0
905    cp0 = readCodePoint st 0
906    ofs = cpLen cp0
907    len2 = toB st - ofs
908
909-- | \(\mathcal{O}(n)\) Inverse operation to 'snoc'
910--
911-- Returns 'Nothing' for empty input 'ShortText'.
912--
913-- prop> unsnoc (snoc t c) == Just (t,c)
914--
915-- >>> unsnoc ""
916-- Nothing
917--
918-- >>> unsnoc "fmap"
919-- Just ("fma",'p')
920--
921-- @since 0.1.2
922unsnoc :: ShortText -> Maybe (ShortText,Char)
923unsnoc st
924  | null st    = Nothing
925  | len1 == 0  = Just (mempty, c0)
926  | otherwise  = Just (slice st 0 len1, c0)
927  where
928    c0  = cp2ch cp0
929    cp0 = readCodePointRev st stsz
930    stsz = toB st
931    len1 = stsz - cpLen cp0
932
933-- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a prefix of the second 'ShortText'
934--
935-- >>> isPrefixOf "ab" "abcdef"
936-- True
937--
938-- >>> isPrefixOf "ac" "abcdef"
939-- False
940--
941-- prop> isPrefixOf "" t == True
942--
943-- prop> isPrefixOf t t == True
944--
945-- @since 0.1.2
946isPrefixOf :: ShortText -> ShortText -> Bool
947isPrefixOf x y
948  | lx > ly = False
949  | lx == 0 = True
950  | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of
951                  0# -> True
952                  _  -> False
953  where
954    !lx@(I# n#) = toLength x
955    !ly = toLength y
956
957-- | \(\mathcal{O}(n)\) Strip prefix from second 'ShortText' argument.
958--
959-- Returns 'Nothing' if first argument is not a prefix of the second argument.
960--
961-- >>> stripPrefix "text-" "text-short"
962-- Just "short"
963--
964-- >>> stripPrefix "test-" "text-short"
965-- Nothing
966--
967-- @since 0.1.2
968stripPrefix :: ShortText -> ShortText -> Maybe ShortText
969stripPrefix pfx t
970  | isPrefixOf pfx t = Just $! snd (splitAtOfs (toB pfx) t)
971  | otherwise        = Nothing
972
973-- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a suffix of the second 'ShortText'
974--
975-- >>> isSuffixOf "ef" "abcdef"
976-- True
977--
978-- >>> isPrefixOf "df" "abcdef"
979-- False
980--
981-- prop> isSuffixOf "" t == True
982--
983-- prop> isSuffixOf t t == True
984--
985-- @since 0.1.2
986isSuffixOf :: ShortText -> ShortText -> Bool
987isSuffixOf x y
988  | lx > ly = False
989  | lx == 0 = True
990  | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) ofs2# n# of
991                  0# -> True
992                  _  -> False
993  where
994    !(I# ofs2#) = ly - lx
995    !lx@(I# n#) = toLength x
996    !ly = toLength y
997
998-- | \(\mathcal{O}(n)\) Strip suffix from second 'ShortText' argument.
999--
1000-- Returns 'Nothing' if first argument is not a suffix of the second argument.
1001--
1002-- >>> stripSuffix "-short" "text-short"
1003-- Just "text"
1004--
1005-- >>> stripSuffix "-utf8" "text-short"
1006-- Nothing
1007--
1008-- @since 0.1.2
1009stripSuffix :: ShortText -> ShortText -> Maybe ShortText
1010stripSuffix sfx t
1011  | isSuffixOf sfx t = Just $! fst (splitAtOfs pfxLen t)
1012  | otherwise        = Nothing
1013  where
1014    pfxLen = toB t - toB sfx
1015
1016----------------------------------------------------------------------------
1017
1018-- | \(\mathcal{O}(n)\) Insert character between characters of 'ShortText'.
1019--
1020-- >>> intersperse '*' "_"
1021-- "_"
1022--
1023-- >>> intersperse '*' "MASH"
1024-- "M*A*S*H"
1025--
1026-- @since 0.1.2
1027intersperse :: Char -> ShortText -> ShortText
1028intersperse c st
1029  | null st = mempty
1030  | sn == 1 = st
1031  | otherwise = create newsz $ \mba -> do
1032      let !cp0 = readCodePoint st 0
1033          !cp0sz = cpLen cp0
1034      writeCodePointN cp0sz mba 0 cp0
1035      go mba (sn - 1) cp0sz cp0sz
1036  where
1037    newsz = ssz + ((sn-1) `mulB` csz)
1038    ssz = toB st
1039    sn  = length st
1040    csz = cpLen cp
1041    cp  = ch2cp c
1042
1043    go :: MBA s -> Int -> B -> B -> ST s ()
1044    go _   0 !_  !_   = return ()
1045    go mba n ofs ofs2 = do
1046      let !cp1 = readCodePoint st ofs2
1047          !cp1sz = cpLen cp1
1048      writeCodePointN csz   mba ofs cp
1049      writeCodePointN cp1sz mba (ofs+csz) cp1
1050      go mba (n-1) (ofs+csz+cp1sz) (ofs2+cp1sz)
1051
1052-- | \(\mathcal{O}(n)\) Insert 'ShortText' inbetween list of 'ShortText's.
1053--
1054-- >>> intercalate ", " []
1055-- ""
1056--
1057-- >>> intercalate ", " ["foo"]
1058-- "foo"
1059--
1060-- >>> intercalate ", " ["foo","bar","doo"]
1061-- "foo, bar, doo"
1062--
1063-- prop> intercalate "" ts == concat ts
1064--
1065-- @since 0.1.2
1066intercalate :: ShortText -> [ShortText] -> ShortText
1067intercalate _ []  = mempty
1068intercalate _ [t] = t
1069intercalate sep ts
1070  | null sep   = mconcat ts
1071  | otherwise  = mconcat (List.intersperse sep ts)
1072
1073-- | \(\mathcal{O}(n*m)\) Replicate a 'ShortText'.
1074--
1075-- A repetition count smaller than 1 results in an empty string result.
1076--
1077-- >>> replicate 3 "jobs!"
1078-- "jobs!jobs!jobs!"
1079--
1080-- >>> replicate 10000 ""
1081-- ""
1082--
1083-- >>> replicate 0 "nothing"
1084-- ""
1085--
1086-- prop> length (replicate n t) == max 0 n * length t
1087--
1088-- @since 0.1.2
1089replicate :: Int -> ShortText -> ShortText
1090replicate n0 t
1091  | n0 < 1     = mempty
1092  | null t    = mempty
1093  | otherwise = create (n0 `mulB` sz) (go 0)
1094  where
1095    go :: Int -> MBA s -> ST s ()
1096    go j mba
1097      | j == n0    = return ()
1098      | otherwise  = do
1099          copyByteArray t 0 mba (j `mulB` sz) sz
1100          go (j+1) mba
1101
1102    sz = toB t
1103
1104-- | \(\mathcal{O}(n)\) Reverse characters in 'ShortText'.
1105--
1106-- >>> reverse "star live desserts"
1107-- "stressed evil rats"
1108--
1109-- prop> reverse (singleton c) == singleton c
1110--
1111-- prop> reverse (reverse t) == t
1112--
1113-- @since 0.1.2
1114reverse :: ShortText -> ShortText
1115reverse st
1116  | null st   = mempty
1117  | sn == 1   = st
1118  | otherwise = create sz $ go sn 0
1119  where
1120    sz = toB st
1121    sn = length st
1122
1123    go :: Int -> B -> MBA s -> ST s ()
1124    go 0 !_  _   = return ()
1125    go i ofs mba = do
1126      let !cp   = readCodePoint st ofs
1127          !cpsz = cpLen cp
1128          !ofs' = ofs+cpsz
1129      writeCodePointN cpsz mba (sz - ofs') cp
1130      go (i-1) ofs' mba
1131
1132
1133-- | \(\mathcal{O}(n)\) Remove characters from 'ShortText' which don't satisfy given predicate.
1134--
1135-- >>> filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!"
1136-- "Y dn't nd vwls t cnvy nfrmtn!"
1137--
1138-- prop> filter (const False) t == ""
1139--
1140-- prop> filter (const True) t == t
1141--
1142-- prop> length (filter p t) <= length t
1143--
1144-- prop> filter p t == pack [ c | c <- unpack t, p c ]
1145--
1146-- @since 0.1.2
1147filter :: (Char -> Bool) -> ShortText -> ShortText
1148filter p t
1149  = case (mofs1,mofs2) of
1150      (Nothing,   _)       -> t -- no non-accepted characters found
1151      (Just 0,    Nothing) -> mempty -- no accepted characters found
1152      (Just ofs1, Nothing) -> slice t 0 ofs1 -- only prefix accepted
1153      (Just ofs1, Just ofs2) -> createShrink (t0sz-(ofs2-ofs1)) $ \mba -> do
1154        -- copy accepted prefix
1155        copyByteArray t 0 mba 0 ofs1
1156        -- [ofs1 .. ofs2) are a non-accepted region
1157        -- filter rest after ofs2
1158        t1sz <- go mba ofs2 ofs1
1159        return t1sz
1160  where
1161    mofs1 = findOfs (not . p) t (B 0) -- first non-accepted Char
1162    mofs2 = findOfs p t (fromMaybe (B 0) mofs1) -- first accepted Char
1163
1164    t0sz = toB t
1165
1166    go :: MBA s -> B -> B -> ST s B
1167    go mba !t0ofs !t1ofs
1168      | t0ofs >= t0sz = return t1ofs
1169      | otherwise = let !cp = readCodePoint t t0ofs
1170                        !cpsz = cpLen cp
1171                    in if p (cp2ch cp)
1172                       then writeCodePointN cpsz mba t1ofs cp >>
1173                            go mba (t0ofs+cpsz) (t1ofs+cpsz)
1174                       else go mba (t0ofs+cpsz) t1ofs -- skip code-point
1175
1176-- | \(\mathcal{O}(n)\) Strip characters from the beginning end and of 'ShortText' which satisfy given predicate.
1177--
1178-- >>> dropAround (== ' ') "   white   space   "
1179-- "white   space"
1180--
1181-- >>> dropAround (> 'a') "bcdefghi"
1182-- ""
1183--
1184-- @since 0.1.2
1185dropAround :: (Char -> Bool) -> ShortText -> ShortText
1186dropAround p t0 = case (mofs1,mofs2) of
1187                    (Nothing,_) -> mempty
1188                    (Just ofs1,Just ofs2)
1189                      | ofs1 == 0, ofs2 == t0sz  -> t0
1190                      | ofs1 < ofs2  -> create (ofs2-ofs1) $ \mba -> do
1191                          copyByteArray t0 ofs1 mba (B 0) (ofs2-ofs1)
1192                    (_,_) -> error "dropAround: the impossible happened"
1193  where
1194    mofs1 = findOfs    (not . p) t0 (B 0)
1195    mofs2 = findOfsRev (not . p) t0 t0sz
1196    t0sz = toB t0
1197
1198----------------------------------------------------------------------------
1199
1200-- | Construct a new 'ShortText' from an existing one by slicing
1201--
1202-- NB: The 'CSize' arguments refer to byte-offsets
1203slice :: ShortText -> B -> B -> ShortText
1204slice st ofs len
1205  | ofs < 0    = error "invalid offset"
1206  | len < 0    = error "invalid length"
1207  | len' == 0  = mempty
1208  | otherwise  = create len' $ \mba -> copyByteArray st ofs' mba 0 len'
1209  where
1210    len0 = toB st
1211    len' = max 0 (min len (len0-ofs))
1212    ofs' = max 0 ofs
1213
1214----------------------------------------------------------------------------
1215-- low-level MutableByteArray# helpers
1216
1217-- | Byte offset (or size) in bytes
1218--
1219-- This currently wraps an 'Int' because this is what GHC's primops
1220-- currently use for byte offsets/sizes.
1221newtype B = B { unB :: Int }
1222          deriving (Ord,Eq,Num)
1223
1224{- TODO: introduce operators for 'B' to avoid 'Num' -}
1225
1226mulB :: Int -> B -> B
1227mulB n (B b) = B (n*b)
1228
1229csizeFromB :: B -> CSize
1230csizeFromB = fromIntegral . unB
1231
1232csizeToB :: CSize -> B
1233csizeToB = B . fromIntegral
1234
1235data MBA s = MBA# { unMBA# :: MutableByteArray# s }
1236
1237{-# INLINE create #-}
1238create :: B -> (forall s. MBA s -> ST s ()) -> ShortText
1239create n go = runST $ do
1240  mba <- newByteArray n
1241  go mba
1242  unsafeFreeze mba
1243
1244{-# INLINE createShrink #-}
1245createShrink :: B -> (forall s. MBA s -> ST s B) -> ShortText
1246createShrink n go = runST $ do
1247  mba <- newByteArray n
1248  n' <- go mba
1249  if n' < n
1250    then unsafeFreezeShrink mba n'
1251    else unsafeFreeze mba
1252
1253{-# INLINE unsafeFreeze #-}
1254unsafeFreeze :: MBA s -> ST s ShortText
1255unsafeFreeze (MBA# mba#)
1256  = ST $ \s -> case GHC.Exts.unsafeFreezeByteArray# mba# s of
1257                 (# s', ba# #) -> (# s', ShortText (BSSI.SBS ba#) #)
1258
1259{-# INLINE copyByteArray #-}
1260copyByteArray :: ShortText -> B -> MBA s -> B -> B -> ST s ()
1261copyByteArray (ShortText (BSSI.SBS src#)) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B (I# len#))
1262  = ST $ \s -> case GHC.Exts.copyByteArray# src# src_off# dst# dst_off# len# s of
1263                 s' -> (# s', () #)
1264
1265{-# INLINE newByteArray #-}
1266newByteArray :: B -> ST s (MBA s)
1267newByteArray (B (I# n#))
1268  = ST $ \s -> case GHC.Exts.newByteArray# n# s of
1269                 (# s', mba# #) -> (# s', MBA# mba# #)
1270
1271{-# INLINE writeWord8Array #-}
1272writeWord8Array :: MBA s -> B -> Word -> ST s ()
1273writeWord8Array (MBA# mba#) (B (I# i#)) (W# w#)
1274  = ST $ \s -> case GHC.Exts.writeWord8Array# mba# i# w# s of
1275                 s' -> (# s', () #)
1276{- not needed yet
1277{-# INLINE indexWord8Array #-}
1278indexWord8Array :: ShortText -> B -> Word
1279indexWord8Array (ShortText (BSSI.SBS ba#)) (B (I# i#)) = W# (GHC.Exts.indexWord8Array# ba# i#)
1280-}
1281
1282{-# INLINE copyAddrToByteArray #-}
1283copyAddrToByteArray :: Ptr a -> MBA RealWorld -> B -> B -> ST RealWorld ()
1284copyAddrToByteArray (Ptr src#) (MBA# dst#) (B (I# dst_off#)) (B (I# len#))
1285  = ST $ \s -> case GHC.Exts.copyAddrToByteArray# src# dst# dst_off# len# s of
1286                 s' -> (# s', () #)
1287
1288----------------------------------------------------------------------------
1289-- unsafeFreezeShrink
1290
1291#if __GLASGOW_HASKELL__ >= 710
1292-- for GHC versions which have the 'shrinkMutableByteArray#' primop
1293{-# INLINE unsafeFreezeShrink #-}
1294unsafeFreezeShrink :: MBA s -> B -> ST s ShortText
1295unsafeFreezeShrink mba n = do
1296  shrink mba n
1297  unsafeFreeze mba
1298
1299{-# INLINE shrink #-}
1300shrink :: MBA s -> B -> ST s ()
1301shrink (MBA# mba#) (B (I# i#))
1302  = ST $ \s -> case GHC.Exts.shrinkMutableByteArray# mba# i# s of
1303                 s' -> (# s', () #)
1304#else
1305-- legacy code for GHC versions which lack `shrinkMutableByteArray#` primop
1306{-# INLINE unsafeFreezeShrink #-}
1307unsafeFreezeShrink :: MBA s -> B -> ST s ShortText
1308unsafeFreezeShrink mba0 n = do
1309  mba' <- newByteArray n
1310  copyByteArray2 mba0 0 mba' 0 n
1311  unsafeFreeze mba'
1312
1313{-# INLINE copyByteArray2 #-}
1314copyByteArray2 :: MBA s -> B -> MBA s -> B -> B -> ST s ()
1315copyByteArray2 (MBA# src#) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B( I# len#))
1316  = ST $ \s -> case GHC.Exts.copyMutableByteArray# src# src_off# dst# dst_off# len# s of
1317                 s' -> (# s', () #)
1318#endif
1319
1320----------------------------------------------------------------------------
1321-- Helpers for encoding code points into UTF-8 code units
1322--
1323--   7 bits| <    0x80 | 0xxxxxxx
1324--  11 bits| <   0x800 | 110yyyyx  10xxxxxx
1325--  16 bits| < 0x10000 | 1110yyyy  10yxxxxx  10xxxxxx
1326--  21 bits|           | 11110yyy  10yyxxxx  10xxxxxx  10xxxxxx
1327
1328-- | Unicode Code-point
1329--
1330-- Keeping it as a 'Word' is more convenient for bit-ops and FFI
1331newtype CP = CP Word
1332
1333{-# INLINE ch2cp #-}
1334ch2cp :: Char -> CP
1335ch2cp (ord -> ci)
1336  | isSurr ci  = CP 0xFFFD
1337  | otherwise  = CP (fromIntegral ci)
1338
1339{-# INLINE isSurr #-}
1340isSurr :: (Num i, Bits i) => i -> Bool
1341isSurr ci = ci .&. 0xfff800 == 0xd800
1342
1343{-# INLINE cp2ch #-}
1344cp2ch :: CP -> Char
1345cp2ch (CP w) = (w < 0x110000) `assert` unsafeChr (fromIntegral w)
1346
1347-- used/needed by index-lookup functions to encode out of bounds
1348cp2chSafe :: CP -> Maybe Char
1349cp2chSafe cp
1350  | cpNull cp = Nothing
1351  | otherwise = Just $! cp2ch cp
1352  where
1353    cpNull :: CP -> Bool
1354    cpNull (CP w) = w >= 0x110000
1355
1356{-# INLINE cpLen #-}
1357cpLen :: CP -> B
1358cpLen (CP cp)
1359  | cp <    0x80  = B 1
1360  | cp <   0x800  = B 2
1361  | cp < 0x10000  = B 3
1362  | otherwise     = B 4
1363
1364-- convenience wrapper; unsafe like readCodePoint
1365{-# INLINE decodeCharAtOfs #-}
1366decodeCharAtOfs :: ShortText -> B -> (Char,B)
1367decodeCharAtOfs st ofs = (c,ofs')
1368  where
1369    c    = cp2ch cp
1370    ofs' = ofs + cpLen cp
1371    cp   = readCodePoint st ofs
1372{- pure version of decodeCharAtOfs, but unfortunately significantly slower
1373
1374decodeCharAtOfs st ofs
1375  | b0 < 0x80 = (cp2ch $ CP b0,ofs + B 1)
1376  | otherwise = case b0 `unsafeShiftR` 4 of
1377                  0xf -> (cp2ch $ CP go4, ofs + B 4)
1378                  0xe -> (cp2ch $ CP go3, ofs + B 3)
1379                  _   -> (cp2ch $ CP go2, ofs + B 2)
1380  where
1381    b0    = buf 0
1382    buf j = indexWord8Array st (ofs+j)
1383
1384    go2 =     ((b0    .&. 0x1f) `unsafeShiftL` 6)
1385          .|.  (buf 1 .&. 0x3f)
1386
1387    go3 =     ((b0    .&. 0x0f) `unsafeShiftL` (6+6))
1388          .|. ((buf 1 .&. 0x3f) `unsafeShiftL` 6)
1389          .|.  (buf 2 .&. 0x3f)
1390
1391    go4 =     ((b0    .&. 0x07) `unsafeShiftL` (6+6+6))
1392          .|. ((buf 1 .&. 0x3f) `unsafeShiftL` (6+6))
1393          .|. ((buf 2 .&. 0x3f) `unsafeShiftL` 6)
1394          .|.  (buf 3 .&. 0x3f)
1395-}
1396
1397
1398-- | \(\mathcal{O}(1)\) Construct 'ShortText' from single codepoint.
1399--
1400-- prop> singleton c == pack [c]
1401--
1402-- prop> length (singleton c) == 1
1403--
1404-- >>> singleton 'A'
1405-- "A"
1406--
1407-- >>> map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
1408-- ["\55295","\65533","\65533","\57344"]
1409--
1410-- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
1411--
1412-- @since 0.1.2
1413singleton :: Char -> ShortText
1414singleton = singleton' . ch2cp
1415
1416singleton' :: CP -> ShortText
1417singleton' cp@(CP cpw)
1418  | cpw <    0x80  = create 1 $ \mba -> writeCodePoint1 mba 0 cp
1419  | cpw <   0x800  = create 2 $ \mba -> writeCodePoint2 mba 0 cp
1420  | cpw < 0x10000  = create 3 $ \mba -> writeCodePoint3 mba 0 cp
1421  | otherwise      = create 4 $ \mba -> writeCodePoint4 mba 0 cp
1422
1423-- | \(\mathcal{O}(n)\) Prepend a character to a 'ShortText'.
1424--
1425-- prop> cons c t == singleton c <> t
1426--
1427-- @since 0.1.2
1428cons :: Char -> ShortText -> ShortText
1429cons (ch2cp -> cp@(CP cpw)) sfx
1430  | n == 0         = singleton' cp
1431  | cpw <    0x80  = create (n+1) $ \mba -> writeCodePoint1 mba 0 cp >> copySfx 1 mba
1432  | cpw <   0x800  = create (n+2) $ \mba -> writeCodePoint2 mba 0 cp >> copySfx 2 mba
1433  | cpw < 0x10000  = create (n+3) $ \mba -> writeCodePoint3 mba 0 cp >> copySfx 3 mba
1434  | otherwise      = create (n+4) $ \mba -> writeCodePoint4 mba 0 cp >> copySfx 4 mba
1435  where
1436    !n = toB sfx
1437
1438    copySfx :: B -> MBA s -> ST s ()
1439    copySfx ofs mba = copyByteArray sfx 0 mba ofs n
1440
1441-- | \(\mathcal{O}(n)\) Append a character to the ond of a 'ShortText'.
1442--
1443-- prop> snoc t c == t <> singleton c
1444--
1445-- @since 0.1.2
1446snoc :: ShortText -> Char -> ShortText
1447snoc pfx (ch2cp -> cp@(CP cpw))
1448  | n == 0         = singleton' cp
1449  | cpw <    0x80  = create (n+1) $ \mba -> copyPfx mba >> writeCodePoint1 mba n cp
1450  | cpw <   0x800  = create (n+2) $ \mba -> copyPfx mba >> writeCodePoint2 mba n cp
1451  | cpw < 0x10000  = create (n+3) $ \mba -> copyPfx mba >> writeCodePoint3 mba n cp
1452  | otherwise      = create (n+4) $ \mba -> copyPfx mba >> writeCodePoint4 mba n cp
1453  where
1454    !n = toB pfx
1455
1456    copyPfx :: MBA s -> ST s ()
1457    copyPfx mba = copyByteArray pfx 0 mba 0 n
1458
1459{-
1460writeCodePoint :: MBA s -> Int -> Word -> ST s ()
1461writeCodePoint mba ofs cp
1462  | cp <    0x80  = writeCodePoint1 mba ofs cp
1463  | cp <   0x800  = writeCodePoint2 mba ofs cp
1464  | cp < 0x10000  = writeCodePoint3 mba ofs cp
1465  | otherwise     = writeCodePoint4 mba ofs cp
1466-}
1467
1468writeCodePointN :: B -> MBA s -> B -> CP -> ST s ()
1469writeCodePointN 1 = writeCodePoint1
1470writeCodePointN 2 = writeCodePoint2
1471writeCodePointN 3 = writeCodePoint3
1472writeCodePointN 4 = writeCodePoint4
1473writeCodePointN _ = undefined
1474
1475writeCodePoint1 :: MBA s -> B -> CP -> ST s ()
1476writeCodePoint1 mba ofs (CP cp) =
1477  writeWord8Array mba ofs cp
1478
1479writeCodePoint2 :: MBA s -> B -> CP -> ST s ()
1480writeCodePoint2 mba ofs (CP cp) = do
1481  writeWord8Array mba  ofs    (0xc0 .|. (cp `unsafeShiftR` 6))
1482  writeWord8Array mba (ofs+1) (0x80 .|. (cp                     .&. 0x3f))
1483
1484writeCodePoint3 :: MBA s -> B -> CP -> ST s ()
1485writeCodePoint3 mba ofs (CP cp) = do
1486  writeWord8Array mba  ofs    (0xe0 .|.  (cp `unsafeShiftR` 12))
1487  writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 6)  .&. 0x3f))
1488  writeWord8Array mba (ofs+2) (0x80 .|.  (cp                    .&. 0x3f))
1489
1490writeCodePoint4 :: MBA s -> B -> CP -> ST s ()
1491writeCodePoint4 mba ofs (CP cp) = do
1492  writeWord8Array mba  ofs    (0xf0 .|.  (cp `unsafeShiftR` 18))
1493  writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 12) .&. 0x3f))
1494  writeWord8Array mba (ofs+2) (0x80 .|. ((cp `unsafeShiftR` 6)  .&. 0x3f))
1495  writeWord8Array mba (ofs+3) (0x80 .|. (cp                     .&. 0x3f))
1496
1497-- beware: UNSAFE!
1498readCodePoint :: ShortText -> B -> CP
1499readCodePoint st (csizeFromB -> ofs)
1500  = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp (toByteArray# st) ofs)
1501
1502foreign import ccall unsafe "hs_text_short_ofs_cp" c_text_short_ofs_cp :: ByteArray# -> CSize -> IO CCodePoint
1503
1504readCodePointRev :: ShortText -> B -> CP
1505readCodePointRev st (csizeFromB -> ofs)
1506  = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp_rev (toByteArray# st) ofs)
1507
1508foreign import ccall unsafe "hs_text_short_ofs_cp_rev" c_text_short_ofs_cp_rev :: ByteArray# -> CSize -> IO CCodePoint
1509
1510----------------------------------------------------------------------------
1511-- string & list literals
1512
1513-- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) character literals are replaced by U+FFFD.
1514--
1515-- @since 0.1.2
1516instance GHC.Exts.IsList ShortText where
1517    type (Item ShortText) = Char
1518    fromList = fromString
1519    toList   = toString
1520
1521-- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) in string literals are replaced by U+FFFD.
1522--
1523-- This matches the behaviour of 'S.IsString' instance for 'T.Text'.
1524instance S.IsString ShortText where
1525    fromString = fromStringLit
1526
1527-- i.e., don't inline before Phase 0
1528{-# INLINE [0] fromStringLit #-}
1529fromStringLit :: String -> ShortText
1530fromStringLit = fromString
1531
1532{-# RULES "ShortText empty literal" fromStringLit "" = mempty #-}
1533
1534-- TODO: this doesn't seem to fire
1535{-# RULES "ShortText singleton literal" forall c . fromStringLit [c] = singleton c #-}
1536
1537{-# RULES "ShortText literal ASCII" forall s . fromStringLit (GHC.unpackCString# s) = fromLitAsciiAddr# s #-}
1538
1539{-# RULES "ShortText literal UTF-8" forall s . fromStringLit (GHC.unpackCStringUtf8# s) = fromLitMUtf8Addr# s #-}
1540
1541{-# NOINLINE fromLitAsciiAddr# #-}
1542fromLitAsciiAddr# :: Addr# -> ShortText
1543fromLitAsciiAddr# (Ptr -> ptr) = unsafeDupablePerformIO $ do
1544  sz <- csizeToB `fmap` c_strlen ptr
1545
1546  case sz `compare` 0 of
1547    EQ -> return mempty -- should not happen if rules fire correctly
1548    GT -> stToIO $ do
1549      mba <- newByteArray sz
1550      copyAddrToByteArray ptr mba 0 sz
1551      unsafeFreeze mba
1552    LT -> return (error "fromLitAsciiAddr#")
1553          -- NOTE: should never happen unless strlen(3) overflows (NB: CSize
1554          -- is unsigned; the overflow would occur when converting to
1555          -- 'B')
1556
1557foreign import ccall unsafe "strlen" c_strlen :: CString -> IO CSize
1558
1559-- GHC uses an encoding resembling Modified UTF-8 for non-ASCII string-literals
1560{-# NOINLINE fromLitMUtf8Addr# #-}
1561fromLitMUtf8Addr# :: Addr# -> ShortText
1562fromLitMUtf8Addr# (Ptr -> ptr) = unsafeDupablePerformIO $ do
1563  sz <- B `fmap` c_text_short_mutf8_strlen ptr
1564
1565  case sz `compare` 0 of
1566    EQ -> return mempty -- should not happen if rules fire correctly
1567    GT -> stToIO $ do
1568      mba <- newByteArray sz
1569      copyAddrToByteArray ptr mba 0 sz
1570      unsafeFreeze mba
1571    LT -> do
1572      mba <- stToIO (newByteArray (abs sz))
1573      c_text_short_mutf8_trans ptr (unMBA# mba)
1574      stToIO (unsafeFreeze mba)
1575
1576foreign import ccall unsafe "hs_text_short_mutf8_strlen" c_text_short_mutf8_strlen :: CString -> IO Int
1577
1578foreign import ccall unsafe "hs_text_short_mutf8_trans" c_text_short_mutf8_trans :: CString -> MutableByteArray# RealWorld -> IO ()
1579
1580-- $setup
1581-- >>> :set -XOverloadedStrings
1582-- >>> import Data.Text.Short (pack, unpack, concat)
1583-- >>> import Text.Show.Functions ()
1584-- >>> import qualified Test.QuickCheck.Arbitrary as QC
1585-- >>> import Test.QuickCheck.Instances ()
1586-- >>> instance QC.Arbitrary ShortText where { arbitrary = fmap fromString QC.arbitrary }
1587