1{-# LANGUAGE BangPatterns #-} 2{-# LANGUAGE CPP #-} 3{-# LANGUAGE DeriveDataTypeable #-} 4{-# LANGUAGE GeneralizedNewtypeDeriving #-} 5{-# LANGUAGE MagicHash #-} 6{-# LANGUAGE RankNTypes #-} 7{-# LANGUAGE TypeFamilies #-} 8{-# LANGUAGE UnboxedTuples #-} 9{-# LANGUAGE UnliftedFFITypes #-} 10{-# LANGUAGE Unsafe #-} 11{-# LANGUAGE ViewPatterns #-} 12 13-- | 14-- Module : Data.Text.Short.Internal 15-- Copyright : © Herbert Valerio Riedel 2017 16-- License : BSD3 17-- 18-- Maintainer : hvr@gnu.org 19-- Stability : stable 20-- 21-- Memory-efficient representation of Unicode text strings. 22-- 23-- @since 0.1 24module Data.Text.Short.Internal 25 ( -- * The 'ShortText' type 26 ShortText(..) 27 28 -- * Basic operations 29 , null 30 , length 31 , isAscii 32 , splitAt 33 , splitAtEnd 34 , indexEndMaybe 35 , indexMaybe 36 , isPrefixOf 37 , stripPrefix 38 , isSuffixOf 39 , stripSuffix 40 41 , cons 42 , snoc 43 , uncons 44 , unsnoc 45 46 , findIndex 47 , find 48 , all 49 50 , span 51 , spanEnd 52 , split 53 54 , intersperse 55 , intercalate 56 , reverse 57 , replicate 58 59 , filter 60 , dropAround 61 62 , foldl 63 , foldl' 64 , foldr 65 , foldl1 66 , foldl1' 67 , foldr1 68 69 -- * Conversions 70 -- ** 'Char' 71 , singleton 72 73 -- ** 'String' 74 , Data.Text.Short.Internal.fromString 75 , toString 76 77 -- ** 'T.Text' 78 , fromText 79 , toText 80 81 -- ** 'BS.ByteString' 82 , fromShortByteString 83 , fromShortByteStringUnsafe 84 , toShortByteString 85 86 , fromByteString 87 , fromByteStringUnsafe 88 , toByteString 89 90 , toBuilder 91 92 -- * misc 93 -- ** For Haddock 94 95 , BS.ByteString 96 , T.Text 97 , module Prelude 98 99 -- ** Internals 100 , isValidUtf8 101 ) where 102 103import Control.DeepSeq (NFData) 104import Control.Monad.ST (stToIO) 105import Data.Binary 106import Data.Bits 107import qualified Data.ByteString as BS 108import qualified Data.ByteString.Builder as BB 109import Data.ByteString.Short (ShortByteString) 110import qualified Data.ByteString.Short as BSS 111import qualified Data.ByteString.Short.Internal as BSSI 112import Data.Char (ord) 113import Data.Data (Data(..),constrIndex, Constr, 114 mkConstr, DataType, mkDataType, 115 Fixity(Prefix)) 116import Data.Hashable (Hashable) 117import Data.Typeable (Typeable) 118import qualified Data.List as List 119import Data.Maybe (fromMaybe, isNothing) 120import Data.Semigroup 121import qualified Data.String as S 122import qualified Data.Text as T 123import qualified Data.Text.Encoding as T 124import Foreign.C 125import GHC.Base (assert, unsafeChr) 126import qualified GHC.CString as GHC 127import GHC.Exts (Addr#, ByteArray#, Int (I#), 128 Int#, MutableByteArray#, 129 Ptr (..), RealWorld, Word (W#)) 130import qualified GHC.Exts 131import qualified GHC.Foreign as GHC 132import GHC.IO.Encoding 133import GHC.ST 134import Prelude hiding (all, any, break, concat, 135 drop, dropWhile, filter, foldl, 136 foldl1, foldr, foldr1, head, 137 init, last, length, null, 138 replicate, reverse, span, 139 splitAt, tail, take, takeWhile) 140import System.IO.Unsafe 141import Text.Printf (PrintfArg, formatArg, 142 formatString) 143 144import qualified PrimOps 145 146-- | A compact representation of Unicode strings. 147-- 148-- A 'ShortText' value is a sequence of Unicode scalar values, as defined in 149-- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >; 150-- This means that a 'ShortText' is a list of (scalar) Unicode code-points (i.e. code-points in the range @[U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]@). 151-- 152-- This type relates to 'T.Text' as 'ShortByteString' relates to 'BS.ByteString' by providing a more compact type. Please consult the documentation of "Data.ByteString.Short" for more information. 153-- 154-- Currently, a boxed unshared 'T.Text' has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 2 or 4 bytes per code-point (due to the internal UTF-16 representation). Each 'T.Text' value which can share its payload with another 'T.Text' requires only 4 words additionally. Unlike 'BS.ByteString', 'T.Text' use unpinned memory. 155-- 156-- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation). 157-- It can be shown that for realistic data <http://utf8everywhere.org/#asian UTF-16 has a space overhead of 50% over UTF-8>. 158-- 159-- __NOTE__: The `Typeable` instance isn't defined for GHC 7.8 (and older) prior to @text-short-0.1.3@ 160-- 161-- @since 0.1 162newtype ShortText = ShortText ShortByteString 163 deriving (Hashable,Monoid,NFData,Data.Semigroup.Semigroup,Typeable) 164 165-- | It exposes a similar 'Data' instance abstraction as 'T.Text' (see 166-- discussion referenced there for more details), preserving the 167-- @[Char]@ data abstraction at the cost of inefficiency. 168-- 169-- @since 0.1.3 170instance Data ShortText where 171 gfoldl f z txt = z fromString `f` (toString txt) 172 toConstr _ = packConstr 173 gunfold k z c = case constrIndex c of 174 1 -> k (z fromString) 175 _ -> error "gunfold" 176 dataTypeOf _ = shortTextDataType 177 178packConstr :: Constr 179packConstr = mkConstr shortTextDataType "fromString" [] Prefix 180 181shortTextDataType :: DataType 182shortTextDataType = mkDataType "Data.Text.Short" [packConstr] 183 184instance Eq ShortText where 185 {-# INLINE (==) #-} 186 (==) x y 187 | lx /= ly = False 188 | lx == 0 = True 189 | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of 190 0# -> True 191 _ -> False 192 where 193 !lx@(I# n#) = toLength x 194 !ly = toLength y 195 196instance Ord ShortText where 197 compare t1 t2 198 | n == 0 = compare n1 n2 199 | otherwise = case PrimOps.compareByteArrays# ba1# 0# ba2# 0# n# of 200 r# | I# r# < 0 -> LT 201 | I# r# > 0 -> GT 202 | n1 < n2 -> LT 203 | n1 > n2 -> GT 204 | otherwise -> EQ 205 where 206 ba1# = toByteArray# t1 207 ba2# = toByteArray# t2 208 !n1 = toLength t1 209 !n2 = toLength t2 210 !n@(I# n#) = n1 `min` n2 211 212instance Show ShortText where 213 showsPrec p (ShortText b) = showsPrec p (decodeStringShort' utf8 b) 214 show (ShortText b) = show (decodeStringShort' utf8 b) 215 216instance Read ShortText where 217 readsPrec p = map (\(x,s) -> (ShortText $ encodeStringShort utf8 x,s)) . readsPrec p 218 219-- | @since 0.1.2 220instance PrintfArg ShortText where 221 formatArg txt = formatString $ toString txt 222 223-- | The 'Binary' encoding matches the one for 'T.Text' 224#if MIN_VERSION_binary(0,8,1) 225instance Binary ShortText where 226 put = put . toShortByteString 227 get = do 228 sbs <- get 229 case fromShortByteString sbs of 230 Nothing -> fail "Binary.get(ShortText): Invalid UTF-8 stream" 231 Just st -> return st 232#else 233-- fallback via 'ByteString' instance 234instance Binary ShortText where 235 put = put . toByteString 236 get = do 237 bs <- get 238 case fromByteString bs of 239 Nothing -> fail "Binary.get(ShortText): Invalid UTF-8 stream" 240 Just st -> return st 241#endif 242 243-- | \(\mathcal{O}(1)\) Test whether a 'ShortText' is empty. 244-- 245-- >>> null "" 246-- True 247-- 248-- prop> null (singleton c) == False 249-- 250-- prop> null t == (length t == 0) 251-- 252-- @since 0.1 253null :: ShortText -> Bool 254null = BSS.null . toShortByteString 255 256-- | \(\mathcal{O}(n)\) Count the number of Unicode code-points in a 'ShortText'. 257-- 258-- >>> length "abcd€" 259-- 5 260-- 261-- >>> length "" 262-- 0 263-- 264-- prop> length t >= 0 265-- 266-- @since 0.1 267length :: ShortText -> Int 268length st = fromIntegral $ unsafeDupablePerformIO (c_text_short_length (toByteArray# st) (toCSize st)) 269 270foreign import ccall unsafe "hs_text_short_length" c_text_short_length :: ByteArray# -> CSize -> IO CSize 271 272-- | \(\mathcal{O}(n)\) Test whether 'ShortText' contains only ASCII code-points (i.e. only U+0000 through U+007F). 273-- 274-- This is a more efficient version of @'all' 'Data.Char.isAscii'@. 275-- 276-- >>> isAscii "" 277-- True 278-- 279-- >>> isAscii "abc\NUL" 280-- True 281-- 282-- >>> isAscii "abcd€" 283-- False 284-- 285-- prop> isAscii t == all (< '\x80') t 286-- 287-- @since 0.1 288isAscii :: ShortText -> Bool 289isAscii st = (/= 0) $ unsafeDupablePerformIO (c_text_short_is_ascii (toByteArray# st) sz) 290 where 291 sz = toCSize st 292 293foreign import ccall unsafe "hs_text_short_is_ascii" c_text_short_is_ascii :: ByteArray# -> CSize -> IO CInt 294 295-- | \(\mathcal{O}(n)\) Test whether /all/ code points in 'ShortText' satisfy a predicate. 296-- 297-- >>> all (const False) "" 298-- True 299-- 300-- >>> all (> 'c') "abcdabcd" 301-- False 302-- 303-- >>> all (/= 'c') "abdabd" 304-- True 305-- 306-- @since 0.1.2 307all :: (Char -> Bool) -> ShortText -> Bool 308all p st = isNothing (findOfs (not . p) st (B 0)) 309 310-- | \(\mathcal{O}(n)\) Return the left-most codepoint in 'ShortText' that satisfies the given predicate. 311-- 312-- >>> find (> 'b') "abcdabcd" 313-- Just 'c' 314-- 315-- >>> find (> 'b') "ababab" 316-- Nothing 317-- 318-- @since 0.1.2 319find :: (Char -> Bool) -> ShortText -> Maybe Char 320find p st = go 0 321 where 322 go !ofs 323 | ofs >= sz = Nothing 324 | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 325 in c `seq` ofs' `seq` 326 if p c 327 then Just c 328 else go ofs' 329 330 !sz = toB st 331 332-- | \(\mathcal{O}(n)\) Return the index of the left-most codepoint in 'ShortText' that satisfies the given predicate. 333-- 334-- >>> findIndex (> 'b') "abcdabcdef" 335-- Just 2 336-- 337-- >>> findIndex (> 'b') "ababab" 338-- Nothing 339-- 340-- prop> (indexMaybe t =<< findIndex p t) == find p t 341-- 342-- @since 0.1.2 343findIndex :: (Char -> Bool) -> ShortText -> Maybe Int 344findIndex p st = go 0 0 345 where 346 go !ofs !i 347 | ofs >= sz = Nothing 348 | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 349 in c `seq` ofs' `seq` 350 if p c 351 then Just i 352 else go ofs' (i+1) 353 354 !sz = toB st 355 356 357-- | \(\mathcal{O}(n)\) Splits a string into components delimited by separators, 358-- where the predicate returns True for a separator element. The 359-- resulting components do not contain the separators. Two adjacent 360-- separators result in an empty component in the output. eg. 361-- 362-- >>> split (=='a') "aabbaca" 363-- ["","","bb","c",""] 364-- 365-- >>> split (=='a') "" 366-- [""] 367-- 368-- prop> intercalate (singleton c) (split (== c) t) = t 369-- 370-- __NOTE__: 'split' never returns an empty list to match the semantics of its counterpart from "Data.Text". 371-- 372-- @since 0.1.3 373split :: (Char -> Bool) -> ShortText -> [ShortText] 374split p st0 = go 0 375 where 376 go !ofs0 = case findOfs' p st0 ofs0 of 377 Just (ofs1,ofs2) -> slice st0 ofs0 (ofs1-ofs0) : go ofs2 378 Nothing 379 | ofs0 == 0 -> st0 : [] 380 | otherwise -> slice st0 ofs0 (maxOfs-ofs0) : [] 381 382 !maxOfs = toB st0 383 384-- internal helper 385{-# INLINE findOfs #-} 386findOfs :: (Char -> Bool) -> ShortText -> B -> Maybe B 387findOfs p st = go 388 where 389 go :: B -> Maybe B 390 go !ofs | ofs >= sz = Nothing 391 go !ofs | p c = Just ofs 392 | otherwise = go ofs' 393 where 394 (c,ofs') = decodeCharAtOfs st ofs 395 396 !sz = toB st 397 398{-# INLINE findOfs' #-} 399findOfs' :: (Char -> Bool) -> ShortText -> B -> Maybe (B,B) 400findOfs' p st = go 401 where 402 go :: B -> Maybe (B,B) 403 go !ofs | ofs >= sz = Nothing 404 go !ofs | p c = Just (ofs,ofs') 405 | otherwise = go ofs' 406 where 407 (c,ofs') = decodeCharAtOfs st ofs 408 409 !sz = toB st 410 411 412{-# INLINE findOfsRev #-} 413findOfsRev :: (Char -> Bool) -> ShortText -> B -> Maybe B 414findOfsRev p st = go 415 where 416 go (B 0) = Nothing 417 go !ofs 418 | p (cp2ch cp) = Just ofs 419 | otherwise = go (ofs-cpLen cp) 420 where 421 !cp = readCodePointRev st ofs 422 423-- | \(\mathcal{O}(n)\) Split 'ShortText' into longest prefix satisfying the given predicate and the remaining suffix. 424-- 425-- >>> span (< 'c') "abcdabcd" 426-- ("ab","cdabcd") 427-- 428-- prop> fst (span p t) <> snd (span p t) == t 429-- 430-- @since 0.1.2 431span :: (Char -> Bool) -> ShortText -> (ShortText,ShortText) 432span p st 433 | Just ofs <- findOfs (not . p) st (B 0) = splitAtOfs ofs st 434 | otherwise = (st,mempty) 435 436-- | \(\mathcal{O}(n)\) Split 'ShortText' into longest suffix satisfying the given predicate and the preceding prefix. 437-- 438-- >>> spanEnd (> 'c') "abcdabcd" 439-- ("abcdabc","d") 440-- 441-- prop> fst (spanEnd p t) <> snd (spanEnd p t) == t 442-- 443-- @since 0.1.2 444spanEnd :: (Char -> Bool) -> ShortText -> (ShortText,ShortText) 445spanEnd p st 446 | Just ofs <- findOfsRev (not . p) st (toB st) = splitAtOfs ofs st 447 | otherwise = (mempty,st) 448 449---------------------------------------------------------------------------- 450 451toCSize :: ShortText -> CSize 452toCSize = fromIntegral . BSS.length . toShortByteString 453 454toB :: ShortText -> B 455toB = fromIntegral . BSS.length . toShortByteString 456 457toLength :: ShortText -> Int 458toLength st = I# (toLength# st) 459 460toLength# :: ShortText -> Int# 461toLength# st = GHC.Exts.sizeofByteArray# (toByteArray# st) 462 463toByteArray# :: ShortText -> ByteArray# 464toByteArray# (ShortText (BSSI.SBS ba#)) = ba# 465 466-- | \(\mathcal{O}(0)\) Converts to UTF-8 encoded 'ShortByteString' 467-- 468-- This operation has effectively no overhead, as it's currently merely a @newtype@-cast. 469-- 470-- @since 0.1 471toShortByteString :: ShortText -> ShortByteString 472toShortByteString (ShortText b) = b 473 474-- | \(\mathcal{O}(n)\) Converts to UTF-8 encoded 'BS.ByteString' 475-- 476-- @since 0.1 477toByteString :: ShortText -> BS.ByteString 478toByteString = BSS.fromShort . toShortByteString 479 480-- | Construct a 'BB.Builder' that encodes 'ShortText' as UTF-8. 481-- 482-- @since 0.1 483toBuilder :: ShortText -> BB.Builder 484toBuilder = BB.shortByteString . toShortByteString 485 486-- | \(\mathcal{O}(n)\) Convert to 'String' 487-- 488-- prop> (fromString . toString) t == t 489-- 490-- __Note__: See documentation of 'fromString' for why @('toString' . 'fromString')@ is not an identity function. 491-- 492-- @since 0.1 493toString :: ShortText -> String 494-- NOTE: impl below beats 495-- toString = decodeStringShort' utf8 . toShortByteString 496-- except for smallish strings 497toString st = go 0 498 where 499 go !ofs 500 | ofs >= sz = [] 501 | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 502 in c `seq` ofs' `seq` (c : go ofs') 503 504 !sz = toB st 505 506---------------------------------------------------------------------------- 507-- Folds 508 509-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with 510-- the binary operator and an initial in forward direction (i.e. from 511-- left to right). 512-- 513-- >>> foldl (\_ _ -> True) False "" 514-- False 515-- 516-- >>> foldl (\s c -> c : s) ['.'] "abcd" 517-- "dcba." 518-- 519-- @since 0.1.2 520foldl :: (a -> Char -> a) -> a -> ShortText -> a 521foldl f z st = go 0 z 522 where 523 go !ofs acc 524 | ofs >= sz = acc 525 | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 526 in c `seq` ofs' `seq` go ofs' (f acc c) 527 528 !sz = toB st 529 530-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator. 531-- 532-- >>> foldl1 max "abcdcba" 533-- 'd' 534-- 535-- >>> foldl1 const "abcd" 536-- 'a' 537-- 538-- >>> foldl1 (flip const) "abcd" 539-- 'd' 540-- 541-- __Note__: Will throw an 'error' exception if index is out of bounds. 542-- 543-- @since 0.1.2 544foldl1 :: (Char -> Char -> Char) -> ShortText -> Char 545foldl1 f st 546 | sz == 0 = error "foldl1: empty ShortText" 547 | otherwise = go c0sz c0 548 where 549 go !ofs acc 550 | ofs >= sz = acc 551 | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 552 in c `seq` ofs' `seq` go ofs' (f acc c) 553 !sz = toB st 554 (c0,c0sz) = decodeCharAtOfs st (B 0) 555 556-- | \(\mathcal{O}(n)\) Strict version of 'foldl'. 557-- 558-- @since 0.1.2 559foldl' :: (a -> Char -> a) -> a -> ShortText -> a 560foldl' f !z st = go 0 z 561 where 562 go !ofs !acc 563 | ofs >= sz = acc 564 | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 565 in c `seq` ofs' `seq` go ofs' (f acc c) 566 567 !sz = toB st 568 569-- | \(\mathcal{O}(n)\) Strict version of 'foldl1'. 570-- 571-- @since 0.1.2 572foldl1' :: (Char -> Char -> Char) -> ShortText -> Char 573foldl1' f st 574 | sz == 0 = error "foldl1: empty ShortText" 575 | otherwise = go c0sz c0 576 where 577 go !ofs !acc 578 | ofs >= sz = acc 579 | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 580 in c `seq` ofs' `seq` go ofs' (f acc c) 581 !sz = toB st 582 (c0,c0sz) = decodeCharAtOfs st (B 0) 583 584-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with 585-- the binary operator and an initial in reverse direction (i.e. from 586-- right to left). 587-- 588-- >>> foldr (\_ _ -> True) False "" 589-- False 590-- 591-- >>> foldr (:) ['.'] "abcd" 592-- "abcd." 593-- 594-- @since 0.1.2 595foldr :: (Char -> a -> a) -> a -> ShortText -> a 596foldr f z st = go 0 597 where 598 go !ofs 599 | ofs >= sz = z 600 | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 601 in c `seq` ofs' `seq` f c (go ofs') 602 603 !sz = toB st 604 605-- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator. 606-- 607-- >>> foldr1 max "abcdcba" 608-- 'd' 609-- 610-- >>> foldr1 const "abcd" 611-- 'a' 612-- 613-- >>> foldr1 (flip const) "abcd" 614-- 'd' 615-- 616-- __Note__: Will throw an 'error' exception if index is out of bounds. 617-- 618-- @since 0.1.2 619foldr1 :: (Char -> Char -> Char) -> ShortText -> Char 620foldr1 f st 621 | sz == 0 = error "foldr1: empty ShortText" 622 | otherwise = go 0 623 where 624 go !ofs = let (c,ofs') = decodeCharAtOfs st ofs 625 in c `seq` ofs' `seq` 626 (if ofs' >= sz then c else f c (go ofs')) 627 628 !sz = toB st 629 630-- | \(\mathcal{O}(n)\) Convert to 'T.Text' 631-- 632-- prop> (fromText . toText) t == t 633-- 634-- prop> (toText . fromText) t == t 635-- 636-- This is currently not \(\mathcal{O}(1)\) because currently 'T.Text' uses UTF-16 as its internal representation. 637-- In the event that 'T.Text' will change its internal representation to UTF-8 this operation will become \(\mathcal{O}(1)\). 638-- 639-- @since 0.1 640toText :: ShortText -> T.Text 641toText = T.decodeUtf8 . toByteString 642 643---- 644 645-- | \(\mathcal{O}(n)\) Construct/pack from 'String' 646-- 647-- >>> fromString [] 648-- "" 649-- 650-- >>> fromString ['a','b','c'] 651-- "abc" 652-- 653-- >>> fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000 654-- "\55295\65533\65533\57344" 655-- 656-- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD. 657-- 658-- @since 0.1 659fromString :: String -> ShortText 660fromString [] = mempty 661fromString [c] = singleton c 662fromString s = ShortText . encodeStringShort utf8 . map r $ s 663 where 664 r c | isSurr (ord c) = '\xFFFD' 665 | otherwise = c 666 667-- | \(\mathcal{O}(n)\) Construct 'ShortText' from 'T.Text' 668-- 669-- This is currently not \(\mathcal{O}(1)\) because currently 'T.Text' uses UTF-16 as its internal representation. 670-- In the event that 'T.Text' will change its internal representation to UTF-8 this operation will become \(\mathcal{O}(1)\). 671-- 672-- @since 0.1 673fromText :: T.Text -> ShortText 674fromText = fromByteStringUnsafe . T.encodeUtf8 675 676-- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'ShortByteString' 677-- 678-- This operation doesn't copy the input 'ShortByteString' but it 679-- cannot be \(\mathcal{O}(1)\) because we need to validate the UTF-8 encoding. 680-- 681-- Returns 'Nothing' in case of invalid UTF-8 encoding. 682-- 683-- >>> fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A 684-- Just "\NUL8\66330" 685-- 686-- >>> fromShortByteString "\xC0\x80" -- invalid denormalised U+00 687-- Nothing 688-- 689-- >>> fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point) 690-- Nothing 691-- 692-- >>> fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF 693-- Just "\1114111" 694-- 695-- >>> fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid) 696-- Nothing 697-- 698-- prop> fromShortByteString (toShortByteString t) == Just t 699-- 700-- @since 0.1 701fromShortByteString :: ShortByteString -> Maybe ShortText 702fromShortByteString sbs 703 | isValidUtf8 st = Just st 704 | otherwise = Nothing 705 where 706 st = ShortText sbs 707 708-- | \(\mathcal{O}(0)\) Construct 'ShortText' from UTF-8 encoded 'ShortByteString' 709-- 710-- This operation has effectively no overhead, as it's currently merely a @newtype@-cast. 711-- 712-- __WARNING__: Unlike the safe 'fromShortByteString' conversion, this 713-- conversion is /unsafe/ as it doesn't validate the well-formedness of the 714-- UTF-8 encoding. 715-- 716-- @since 0.1.1 717fromShortByteStringUnsafe :: ShortByteString -> ShortText 718fromShortByteStringUnsafe = ShortText 719 720-- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString' 721-- 722-- 'fromByteString' accepts (or rejects) the same input data as 'fromShortByteString'. 723-- 724-- Returns 'Nothing' in case of invalid UTF-8 encoding. 725-- 726-- @since 0.1 727fromByteString :: BS.ByteString -> Maybe ShortText 728fromByteString = fromShortByteString . BSS.toShort 729 730-- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString' 731-- 732-- This operation is \(\mathcal{O}(n)\) because the 'BS.ByteString' needs to be 733-- copied into an unpinned 'ByteArray#'. 734-- 735-- __WARNING__: Unlike the safe 'fromByteString' conversion, this 736-- conversion is /unsafe/ as it doesn't validate the well-formedness of the 737-- UTF-8 encoding. 738-- 739-- @since 0.1.1 740fromByteStringUnsafe :: BS.ByteString -> ShortText 741fromByteStringUnsafe = ShortText . BSS.toShort 742 743---------------------------------------------------------------------------- 744 745encodeString :: TextEncoding -> String -> BS.ByteString 746encodeString te str = unsafePerformIO $ GHC.withCStringLen te str BS.packCStringLen 747 748-- decodeString :: TextEncoding -> BS.ByteString -> Maybe String 749-- decodeString te bs = cvtEx $ unsafePerformIO $ try $ BS.useAsCStringLen bs (GHC.peekCStringLen te) 750-- where 751-- cvtEx :: Either IOException a -> Maybe a 752-- cvtEx = either (const Nothing) Just 753 754decodeString' :: TextEncoding -> BS.ByteString -> String 755decodeString' te bs = unsafePerformIO $ BS.useAsCStringLen bs (GHC.peekCStringLen te) 756 757decodeStringShort' :: TextEncoding -> ShortByteString -> String 758decodeStringShort' te = decodeString' te . BSS.fromShort 759 760encodeStringShort :: TextEncoding -> String -> BSS.ShortByteString 761encodeStringShort te = BSS.toShort . encodeString te 762 763-- isValidUtf8' :: ShortText -> Int 764-- isValidUtf8' st = fromIntegral $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st)) 765 766isValidUtf8 :: ShortText -> Bool 767isValidUtf8 st = (==0) $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st)) 768 769type CCodePoint = Word 770 771foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt 772 773foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO CCodePoint 774 775-- | \(\mathcal{O}(n)\) Lookup /i/-th code-point in 'ShortText'. 776-- 777-- Returns 'Nothing' if out of bounds. 778-- 779-- prop> indexMaybe (singleton c) 0 == Just c 780-- 781-- prop> indexMaybe t 0 == fmap fst (uncons t) 782-- 783-- prop> indexMaybe mempty i == Nothing 784-- 785-- @since 0.1.2 786indexMaybe :: ShortText -> Int -> Maybe Char 787indexMaybe st i 788 | i < 0 = Nothing 789 | otherwise = cp2chSafe cp 790 where 791 cp = CP $ unsafeDupablePerformIO (c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i)) 792 793-- | \(\mathcal{O}(n)\) Lookup /i/-th code-point from the end of 'ShortText'. 794-- 795-- Returns 'Nothing' if out of bounds. 796-- 797-- prop> indexEndMaybe (singleton c) 0 == Just c 798-- 799-- prop> indexEndMaybe t 0 == fmap snd (unsnoc t) 800-- 801-- prop> indexEndMaybe mempty i == Nothing 802-- 803-- @since 0.1.2 804indexEndMaybe :: ShortText -> Int -> Maybe Char 805indexEndMaybe st i 806 | i < 0 = Nothing 807 | otherwise = cp2chSafe cp 808 where 809 cp = CP $ unsafeDupablePerformIO (c_text_short_index_rev (toByteArray# st) (toCSize st) (fromIntegral i)) 810 811foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev :: ByteArray# -> CSize -> CSize -> IO CCodePoint 812 813 814-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves. 815-- 816-- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties: 817-- 818-- prop> length (fst (splitAt n t)) == min (length t) (max 0 n) 819-- 820-- prop> fst (splitAt n t) <> snd (splitAt n t) == t 821-- 822-- >>> splitAt 2 "abcdef" 823-- ("ab","cdef") 824-- 825-- >>> splitAt 10 "abcdef" 826-- ("abcdef","") 827-- 828-- >>> splitAt (-1) "abcdef" 829-- ("","abcdef") 830-- 831-- @since 0.1.2 832splitAt :: Int -> ShortText -> (ShortText,ShortText) 833splitAt i st 834 | i <= 0 = (mempty,st) 835 | otherwise = splitAtOfs ofs st 836 where 837 ofs = csizeToB $ 838 unsafeDupablePerformIO (c_text_short_index_ofs (toByteArray# st) stsz (fromIntegral i)) 839 stsz = toCSize st 840 841-- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves. 842-- 843-- @'splitAtEnd' n t@ returns a pair of 'ShortText' with the following properties: 844-- 845-- prop> length (snd (splitAtEnd n t)) == min (length t) (max 0 n) 846-- 847-- prop> fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t 848-- 849-- prop> splitAtEnd n t == splitAt (length t - n) t 850-- 851-- >>> splitAtEnd 2 "abcdef" 852-- ("abcd","ef") 853-- 854-- >>> splitAtEnd 10 "abcdef" 855-- ("","abcdef") 856-- 857-- >>> splitAtEnd (-1) "abcdef" 858-- ("abcdef","") 859-- 860-- @since 0.1.2 861splitAtEnd :: Int -> ShortText -> (ShortText,ShortText) 862splitAtEnd i st 863 | i <= 0 = (st,mempty) 864 | ofs >= stsz = (mempty,st) 865 | otherwise = splitAtOfs ofs st 866 where 867 ofs = csizeToB $ 868 unsafeDupablePerformIO (c_text_short_index_ofs_rev (toByteArray# st) (toCSize st) (fromIntegral (i-1))) 869 stsz = toB st 870 871{-# INLINE splitAtOfs #-} 872splitAtOfs :: B -> ShortText -> (ShortText,ShortText) 873splitAtOfs ofs st 874 | ofs == 0 = (mempty,st) 875 | ofs >= stsz = (st,mempty) 876 | otherwise = (slice st 0 ofs, slice st ofs (stsz-ofs)) 877 where 878 !stsz = toB st 879 880foreign import ccall unsafe "hs_text_short_index_ofs" c_text_short_index_ofs :: ByteArray# -> CSize -> CSize -> IO CSize 881 882foreign import ccall unsafe "hs_text_short_index_ofs_rev" c_text_short_index_ofs_rev :: ByteArray# -> CSize -> CSize -> IO CSize 883 884 885-- | \(\mathcal{O}(n)\) Inverse operation to 'cons' 886-- 887-- Returns 'Nothing' for empty input 'ShortText'. 888-- 889-- prop> uncons (cons c t) == Just (c,t) 890-- 891-- >>> uncons "" 892-- Nothing 893-- 894-- >>> uncons "fmap" 895-- Just ('f',"map") 896-- 897-- @since 0.1.2 898uncons :: ShortText -> Maybe (Char,ShortText) 899uncons st 900 | null st = Nothing 901 | len2 == 0 = Just (c0, mempty) 902 | otherwise = Just (c0, slice st ofs len2) 903 where 904 c0 = cp2ch cp0 905 cp0 = readCodePoint st 0 906 ofs = cpLen cp0 907 len2 = toB st - ofs 908 909-- | \(\mathcal{O}(n)\) Inverse operation to 'snoc' 910-- 911-- Returns 'Nothing' for empty input 'ShortText'. 912-- 913-- prop> unsnoc (snoc t c) == Just (t,c) 914-- 915-- >>> unsnoc "" 916-- Nothing 917-- 918-- >>> unsnoc "fmap" 919-- Just ("fma",'p') 920-- 921-- @since 0.1.2 922unsnoc :: ShortText -> Maybe (ShortText,Char) 923unsnoc st 924 | null st = Nothing 925 | len1 == 0 = Just (mempty, c0) 926 | otherwise = Just (slice st 0 len1, c0) 927 where 928 c0 = cp2ch cp0 929 cp0 = readCodePointRev st stsz 930 stsz = toB st 931 len1 = stsz - cpLen cp0 932 933-- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a prefix of the second 'ShortText' 934-- 935-- >>> isPrefixOf "ab" "abcdef" 936-- True 937-- 938-- >>> isPrefixOf "ac" "abcdef" 939-- False 940-- 941-- prop> isPrefixOf "" t == True 942-- 943-- prop> isPrefixOf t t == True 944-- 945-- @since 0.1.2 946isPrefixOf :: ShortText -> ShortText -> Bool 947isPrefixOf x y 948 | lx > ly = False 949 | lx == 0 = True 950 | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of 951 0# -> True 952 _ -> False 953 where 954 !lx@(I# n#) = toLength x 955 !ly = toLength y 956 957-- | \(\mathcal{O}(n)\) Strip prefix from second 'ShortText' argument. 958-- 959-- Returns 'Nothing' if first argument is not a prefix of the second argument. 960-- 961-- >>> stripPrefix "text-" "text-short" 962-- Just "short" 963-- 964-- >>> stripPrefix "test-" "text-short" 965-- Nothing 966-- 967-- @since 0.1.2 968stripPrefix :: ShortText -> ShortText -> Maybe ShortText 969stripPrefix pfx t 970 | isPrefixOf pfx t = Just $! snd (splitAtOfs (toB pfx) t) 971 | otherwise = Nothing 972 973-- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a suffix of the second 'ShortText' 974-- 975-- >>> isSuffixOf "ef" "abcdef" 976-- True 977-- 978-- >>> isPrefixOf "df" "abcdef" 979-- False 980-- 981-- prop> isSuffixOf "" t == True 982-- 983-- prop> isSuffixOf t t == True 984-- 985-- @since 0.1.2 986isSuffixOf :: ShortText -> ShortText -> Bool 987isSuffixOf x y 988 | lx > ly = False 989 | lx == 0 = True 990 | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) ofs2# n# of 991 0# -> True 992 _ -> False 993 where 994 !(I# ofs2#) = ly - lx 995 !lx@(I# n#) = toLength x 996 !ly = toLength y 997 998-- | \(\mathcal{O}(n)\) Strip suffix from second 'ShortText' argument. 999-- 1000-- Returns 'Nothing' if first argument is not a suffix of the second argument. 1001-- 1002-- >>> stripSuffix "-short" "text-short" 1003-- Just "text" 1004-- 1005-- >>> stripSuffix "-utf8" "text-short" 1006-- Nothing 1007-- 1008-- @since 0.1.2 1009stripSuffix :: ShortText -> ShortText -> Maybe ShortText 1010stripSuffix sfx t 1011 | isSuffixOf sfx t = Just $! fst (splitAtOfs pfxLen t) 1012 | otherwise = Nothing 1013 where 1014 pfxLen = toB t - toB sfx 1015 1016---------------------------------------------------------------------------- 1017 1018-- | \(\mathcal{O}(n)\) Insert character between characters of 'ShortText'. 1019-- 1020-- >>> intersperse '*' "_" 1021-- "_" 1022-- 1023-- >>> intersperse '*' "MASH" 1024-- "M*A*S*H" 1025-- 1026-- @since 0.1.2 1027intersperse :: Char -> ShortText -> ShortText 1028intersperse c st 1029 | null st = mempty 1030 | sn == 1 = st 1031 | otherwise = create newsz $ \mba -> do 1032 let !cp0 = readCodePoint st 0 1033 !cp0sz = cpLen cp0 1034 writeCodePointN cp0sz mba 0 cp0 1035 go mba (sn - 1) cp0sz cp0sz 1036 where 1037 newsz = ssz + ((sn-1) `mulB` csz) 1038 ssz = toB st 1039 sn = length st 1040 csz = cpLen cp 1041 cp = ch2cp c 1042 1043 go :: MBA s -> Int -> B -> B -> ST s () 1044 go _ 0 !_ !_ = return () 1045 go mba n ofs ofs2 = do 1046 let !cp1 = readCodePoint st ofs2 1047 !cp1sz = cpLen cp1 1048 writeCodePointN csz mba ofs cp 1049 writeCodePointN cp1sz mba (ofs+csz) cp1 1050 go mba (n-1) (ofs+csz+cp1sz) (ofs2+cp1sz) 1051 1052-- | \(\mathcal{O}(n)\) Insert 'ShortText' inbetween list of 'ShortText's. 1053-- 1054-- >>> intercalate ", " [] 1055-- "" 1056-- 1057-- >>> intercalate ", " ["foo"] 1058-- "foo" 1059-- 1060-- >>> intercalate ", " ["foo","bar","doo"] 1061-- "foo, bar, doo" 1062-- 1063-- prop> intercalate "" ts == concat ts 1064-- 1065-- @since 0.1.2 1066intercalate :: ShortText -> [ShortText] -> ShortText 1067intercalate _ [] = mempty 1068intercalate _ [t] = t 1069intercalate sep ts 1070 | null sep = mconcat ts 1071 | otherwise = mconcat (List.intersperse sep ts) 1072 1073-- | \(\mathcal{O}(n*m)\) Replicate a 'ShortText'. 1074-- 1075-- A repetition count smaller than 1 results in an empty string result. 1076-- 1077-- >>> replicate 3 "jobs!" 1078-- "jobs!jobs!jobs!" 1079-- 1080-- >>> replicate 10000 "" 1081-- "" 1082-- 1083-- >>> replicate 0 "nothing" 1084-- "" 1085-- 1086-- prop> length (replicate n t) == max 0 n * length t 1087-- 1088-- @since 0.1.2 1089replicate :: Int -> ShortText -> ShortText 1090replicate n0 t 1091 | n0 < 1 = mempty 1092 | null t = mempty 1093 | otherwise = create (n0 `mulB` sz) (go 0) 1094 where 1095 go :: Int -> MBA s -> ST s () 1096 go j mba 1097 | j == n0 = return () 1098 | otherwise = do 1099 copyByteArray t 0 mba (j `mulB` sz) sz 1100 go (j+1) mba 1101 1102 sz = toB t 1103 1104-- | \(\mathcal{O}(n)\) Reverse characters in 'ShortText'. 1105-- 1106-- >>> reverse "star live desserts" 1107-- "stressed evil rats" 1108-- 1109-- prop> reverse (singleton c) == singleton c 1110-- 1111-- prop> reverse (reverse t) == t 1112-- 1113-- @since 0.1.2 1114reverse :: ShortText -> ShortText 1115reverse st 1116 | null st = mempty 1117 | sn == 1 = st 1118 | otherwise = create sz $ go sn 0 1119 where 1120 sz = toB st 1121 sn = length st 1122 1123 go :: Int -> B -> MBA s -> ST s () 1124 go 0 !_ _ = return () 1125 go i ofs mba = do 1126 let !cp = readCodePoint st ofs 1127 !cpsz = cpLen cp 1128 !ofs' = ofs+cpsz 1129 writeCodePointN cpsz mba (sz - ofs') cp 1130 go (i-1) ofs' mba 1131 1132 1133-- | \(\mathcal{O}(n)\) Remove characters from 'ShortText' which don't satisfy given predicate. 1134-- 1135-- >>> filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!" 1136-- "Y dn't nd vwls t cnvy nfrmtn!" 1137-- 1138-- prop> filter (const False) t == "" 1139-- 1140-- prop> filter (const True) t == t 1141-- 1142-- prop> length (filter p t) <= length t 1143-- 1144-- prop> filter p t == pack [ c | c <- unpack t, p c ] 1145-- 1146-- @since 0.1.2 1147filter :: (Char -> Bool) -> ShortText -> ShortText 1148filter p t 1149 = case (mofs1,mofs2) of 1150 (Nothing, _) -> t -- no non-accepted characters found 1151 (Just 0, Nothing) -> mempty -- no accepted characters found 1152 (Just ofs1, Nothing) -> slice t 0 ofs1 -- only prefix accepted 1153 (Just ofs1, Just ofs2) -> createShrink (t0sz-(ofs2-ofs1)) $ \mba -> do 1154 -- copy accepted prefix 1155 copyByteArray t 0 mba 0 ofs1 1156 -- [ofs1 .. ofs2) are a non-accepted region 1157 -- filter rest after ofs2 1158 t1sz <- go mba ofs2 ofs1 1159 return t1sz 1160 where 1161 mofs1 = findOfs (not . p) t (B 0) -- first non-accepted Char 1162 mofs2 = findOfs p t (fromMaybe (B 0) mofs1) -- first accepted Char 1163 1164 t0sz = toB t 1165 1166 go :: MBA s -> B -> B -> ST s B 1167 go mba !t0ofs !t1ofs 1168 | t0ofs >= t0sz = return t1ofs 1169 | otherwise = let !cp = readCodePoint t t0ofs 1170 !cpsz = cpLen cp 1171 in if p (cp2ch cp) 1172 then writeCodePointN cpsz mba t1ofs cp >> 1173 go mba (t0ofs+cpsz) (t1ofs+cpsz) 1174 else go mba (t0ofs+cpsz) t1ofs -- skip code-point 1175 1176-- | \(\mathcal{O}(n)\) Strip characters from the beginning end and of 'ShortText' which satisfy given predicate. 1177-- 1178-- >>> dropAround (== ' ') " white space " 1179-- "white space" 1180-- 1181-- >>> dropAround (> 'a') "bcdefghi" 1182-- "" 1183-- 1184-- @since 0.1.2 1185dropAround :: (Char -> Bool) -> ShortText -> ShortText 1186dropAround p t0 = case (mofs1,mofs2) of 1187 (Nothing,_) -> mempty 1188 (Just ofs1,Just ofs2) 1189 | ofs1 == 0, ofs2 == t0sz -> t0 1190 | ofs1 < ofs2 -> create (ofs2-ofs1) $ \mba -> do 1191 copyByteArray t0 ofs1 mba (B 0) (ofs2-ofs1) 1192 (_,_) -> error "dropAround: the impossible happened" 1193 where 1194 mofs1 = findOfs (not . p) t0 (B 0) 1195 mofs2 = findOfsRev (not . p) t0 t0sz 1196 t0sz = toB t0 1197 1198---------------------------------------------------------------------------- 1199 1200-- | Construct a new 'ShortText' from an existing one by slicing 1201-- 1202-- NB: The 'CSize' arguments refer to byte-offsets 1203slice :: ShortText -> B -> B -> ShortText 1204slice st ofs len 1205 | ofs < 0 = error "invalid offset" 1206 | len < 0 = error "invalid length" 1207 | len' == 0 = mempty 1208 | otherwise = create len' $ \mba -> copyByteArray st ofs' mba 0 len' 1209 where 1210 len0 = toB st 1211 len' = max 0 (min len (len0-ofs)) 1212 ofs' = max 0 ofs 1213 1214---------------------------------------------------------------------------- 1215-- low-level MutableByteArray# helpers 1216 1217-- | Byte offset (or size) in bytes 1218-- 1219-- This currently wraps an 'Int' because this is what GHC's primops 1220-- currently use for byte offsets/sizes. 1221newtype B = B { unB :: Int } 1222 deriving (Ord,Eq,Num) 1223 1224{- TODO: introduce operators for 'B' to avoid 'Num' -} 1225 1226mulB :: Int -> B -> B 1227mulB n (B b) = B (n*b) 1228 1229csizeFromB :: B -> CSize 1230csizeFromB = fromIntegral . unB 1231 1232csizeToB :: CSize -> B 1233csizeToB = B . fromIntegral 1234 1235data MBA s = MBA# { unMBA# :: MutableByteArray# s } 1236 1237{-# INLINE create #-} 1238create :: B -> (forall s. MBA s -> ST s ()) -> ShortText 1239create n go = runST $ do 1240 mba <- newByteArray n 1241 go mba 1242 unsafeFreeze mba 1243 1244{-# INLINE createShrink #-} 1245createShrink :: B -> (forall s. MBA s -> ST s B) -> ShortText 1246createShrink n go = runST $ do 1247 mba <- newByteArray n 1248 n' <- go mba 1249 if n' < n 1250 then unsafeFreezeShrink mba n' 1251 else unsafeFreeze mba 1252 1253{-# INLINE unsafeFreeze #-} 1254unsafeFreeze :: MBA s -> ST s ShortText 1255unsafeFreeze (MBA# mba#) 1256 = ST $ \s -> case GHC.Exts.unsafeFreezeByteArray# mba# s of 1257 (# s', ba# #) -> (# s', ShortText (BSSI.SBS ba#) #) 1258 1259{-# INLINE copyByteArray #-} 1260copyByteArray :: ShortText -> B -> MBA s -> B -> B -> ST s () 1261copyByteArray (ShortText (BSSI.SBS src#)) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B (I# len#)) 1262 = ST $ \s -> case GHC.Exts.copyByteArray# src# src_off# dst# dst_off# len# s of 1263 s' -> (# s', () #) 1264 1265{-# INLINE newByteArray #-} 1266newByteArray :: B -> ST s (MBA s) 1267newByteArray (B (I# n#)) 1268 = ST $ \s -> case GHC.Exts.newByteArray# n# s of 1269 (# s', mba# #) -> (# s', MBA# mba# #) 1270 1271{-# INLINE writeWord8Array #-} 1272writeWord8Array :: MBA s -> B -> Word -> ST s () 1273writeWord8Array (MBA# mba#) (B (I# i#)) (W# w#) 1274 = ST $ \s -> case GHC.Exts.writeWord8Array# mba# i# w# s of 1275 s' -> (# s', () #) 1276{- not needed yet 1277{-# INLINE indexWord8Array #-} 1278indexWord8Array :: ShortText -> B -> Word 1279indexWord8Array (ShortText (BSSI.SBS ba#)) (B (I# i#)) = W# (GHC.Exts.indexWord8Array# ba# i#) 1280-} 1281 1282{-# INLINE copyAddrToByteArray #-} 1283copyAddrToByteArray :: Ptr a -> MBA RealWorld -> B -> B -> ST RealWorld () 1284copyAddrToByteArray (Ptr src#) (MBA# dst#) (B (I# dst_off#)) (B (I# len#)) 1285 = ST $ \s -> case GHC.Exts.copyAddrToByteArray# src# dst# dst_off# len# s of 1286 s' -> (# s', () #) 1287 1288---------------------------------------------------------------------------- 1289-- unsafeFreezeShrink 1290 1291#if __GLASGOW_HASKELL__ >= 710 1292-- for GHC versions which have the 'shrinkMutableByteArray#' primop 1293{-# INLINE unsafeFreezeShrink #-} 1294unsafeFreezeShrink :: MBA s -> B -> ST s ShortText 1295unsafeFreezeShrink mba n = do 1296 shrink mba n 1297 unsafeFreeze mba 1298 1299{-# INLINE shrink #-} 1300shrink :: MBA s -> B -> ST s () 1301shrink (MBA# mba#) (B (I# i#)) 1302 = ST $ \s -> case GHC.Exts.shrinkMutableByteArray# mba# i# s of 1303 s' -> (# s', () #) 1304#else 1305-- legacy code for GHC versions which lack `shrinkMutableByteArray#` primop 1306{-# INLINE unsafeFreezeShrink #-} 1307unsafeFreezeShrink :: MBA s -> B -> ST s ShortText 1308unsafeFreezeShrink mba0 n = do 1309 mba' <- newByteArray n 1310 copyByteArray2 mba0 0 mba' 0 n 1311 unsafeFreeze mba' 1312 1313{-# INLINE copyByteArray2 #-} 1314copyByteArray2 :: MBA s -> B -> MBA s -> B -> B -> ST s () 1315copyByteArray2 (MBA# src#) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B( I# len#)) 1316 = ST $ \s -> case GHC.Exts.copyMutableByteArray# src# src_off# dst# dst_off# len# s of 1317 s' -> (# s', () #) 1318#endif 1319 1320---------------------------------------------------------------------------- 1321-- Helpers for encoding code points into UTF-8 code units 1322-- 1323-- 7 bits| < 0x80 | 0xxxxxxx 1324-- 11 bits| < 0x800 | 110yyyyx 10xxxxxx 1325-- 16 bits| < 0x10000 | 1110yyyy 10yxxxxx 10xxxxxx 1326-- 21 bits| | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 1327 1328-- | Unicode Code-point 1329-- 1330-- Keeping it as a 'Word' is more convenient for bit-ops and FFI 1331newtype CP = CP Word 1332 1333{-# INLINE ch2cp #-} 1334ch2cp :: Char -> CP 1335ch2cp (ord -> ci) 1336 | isSurr ci = CP 0xFFFD 1337 | otherwise = CP (fromIntegral ci) 1338 1339{-# INLINE isSurr #-} 1340isSurr :: (Num i, Bits i) => i -> Bool 1341isSurr ci = ci .&. 0xfff800 == 0xd800 1342 1343{-# INLINE cp2ch #-} 1344cp2ch :: CP -> Char 1345cp2ch (CP w) = (w < 0x110000) `assert` unsafeChr (fromIntegral w) 1346 1347-- used/needed by index-lookup functions to encode out of bounds 1348cp2chSafe :: CP -> Maybe Char 1349cp2chSafe cp 1350 | cpNull cp = Nothing 1351 | otherwise = Just $! cp2ch cp 1352 where 1353 cpNull :: CP -> Bool 1354 cpNull (CP w) = w >= 0x110000 1355 1356{-# INLINE cpLen #-} 1357cpLen :: CP -> B 1358cpLen (CP cp) 1359 | cp < 0x80 = B 1 1360 | cp < 0x800 = B 2 1361 | cp < 0x10000 = B 3 1362 | otherwise = B 4 1363 1364-- convenience wrapper; unsafe like readCodePoint 1365{-# INLINE decodeCharAtOfs #-} 1366decodeCharAtOfs :: ShortText -> B -> (Char,B) 1367decodeCharAtOfs st ofs = (c,ofs') 1368 where 1369 c = cp2ch cp 1370 ofs' = ofs + cpLen cp 1371 cp = readCodePoint st ofs 1372{- pure version of decodeCharAtOfs, but unfortunately significantly slower 1373 1374decodeCharAtOfs st ofs 1375 | b0 < 0x80 = (cp2ch $ CP b0,ofs + B 1) 1376 | otherwise = case b0 `unsafeShiftR` 4 of 1377 0xf -> (cp2ch $ CP go4, ofs + B 4) 1378 0xe -> (cp2ch $ CP go3, ofs + B 3) 1379 _ -> (cp2ch $ CP go2, ofs + B 2) 1380 where 1381 b0 = buf 0 1382 buf j = indexWord8Array st (ofs+j) 1383 1384 go2 = ((b0 .&. 0x1f) `unsafeShiftL` 6) 1385 .|. (buf 1 .&. 0x3f) 1386 1387 go3 = ((b0 .&. 0x0f) `unsafeShiftL` (6+6)) 1388 .|. ((buf 1 .&. 0x3f) `unsafeShiftL` 6) 1389 .|. (buf 2 .&. 0x3f) 1390 1391 go4 = ((b0 .&. 0x07) `unsafeShiftL` (6+6+6)) 1392 .|. ((buf 1 .&. 0x3f) `unsafeShiftL` (6+6)) 1393 .|. ((buf 2 .&. 0x3f) `unsafeShiftL` 6) 1394 .|. (buf 3 .&. 0x3f) 1395-} 1396 1397 1398-- | \(\mathcal{O}(1)\) Construct 'ShortText' from single codepoint. 1399-- 1400-- prop> singleton c == pack [c] 1401-- 1402-- prop> length (singleton c) == 1 1403-- 1404-- >>> singleton 'A' 1405-- "A" 1406-- 1407-- >>> map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000 1408-- ["\55295","\65533","\65533","\57344"] 1409-- 1410-- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD. 1411-- 1412-- @since 0.1.2 1413singleton :: Char -> ShortText 1414singleton = singleton' . ch2cp 1415 1416singleton' :: CP -> ShortText 1417singleton' cp@(CP cpw) 1418 | cpw < 0x80 = create 1 $ \mba -> writeCodePoint1 mba 0 cp 1419 | cpw < 0x800 = create 2 $ \mba -> writeCodePoint2 mba 0 cp 1420 | cpw < 0x10000 = create 3 $ \mba -> writeCodePoint3 mba 0 cp 1421 | otherwise = create 4 $ \mba -> writeCodePoint4 mba 0 cp 1422 1423-- | \(\mathcal{O}(n)\) Prepend a character to a 'ShortText'. 1424-- 1425-- prop> cons c t == singleton c <> t 1426-- 1427-- @since 0.1.2 1428cons :: Char -> ShortText -> ShortText 1429cons (ch2cp -> cp@(CP cpw)) sfx 1430 | n == 0 = singleton' cp 1431 | cpw < 0x80 = create (n+1) $ \mba -> writeCodePoint1 mba 0 cp >> copySfx 1 mba 1432 | cpw < 0x800 = create (n+2) $ \mba -> writeCodePoint2 mba 0 cp >> copySfx 2 mba 1433 | cpw < 0x10000 = create (n+3) $ \mba -> writeCodePoint3 mba 0 cp >> copySfx 3 mba 1434 | otherwise = create (n+4) $ \mba -> writeCodePoint4 mba 0 cp >> copySfx 4 mba 1435 where 1436 !n = toB sfx 1437 1438 copySfx :: B -> MBA s -> ST s () 1439 copySfx ofs mba = copyByteArray sfx 0 mba ofs n 1440 1441-- | \(\mathcal{O}(n)\) Append a character to the ond of a 'ShortText'. 1442-- 1443-- prop> snoc t c == t <> singleton c 1444-- 1445-- @since 0.1.2 1446snoc :: ShortText -> Char -> ShortText 1447snoc pfx (ch2cp -> cp@(CP cpw)) 1448 | n == 0 = singleton' cp 1449 | cpw < 0x80 = create (n+1) $ \mba -> copyPfx mba >> writeCodePoint1 mba n cp 1450 | cpw < 0x800 = create (n+2) $ \mba -> copyPfx mba >> writeCodePoint2 mba n cp 1451 | cpw < 0x10000 = create (n+3) $ \mba -> copyPfx mba >> writeCodePoint3 mba n cp 1452 | otherwise = create (n+4) $ \mba -> copyPfx mba >> writeCodePoint4 mba n cp 1453 where 1454 !n = toB pfx 1455 1456 copyPfx :: MBA s -> ST s () 1457 copyPfx mba = copyByteArray pfx 0 mba 0 n 1458 1459{- 1460writeCodePoint :: MBA s -> Int -> Word -> ST s () 1461writeCodePoint mba ofs cp 1462 | cp < 0x80 = writeCodePoint1 mba ofs cp 1463 | cp < 0x800 = writeCodePoint2 mba ofs cp 1464 | cp < 0x10000 = writeCodePoint3 mba ofs cp 1465 | otherwise = writeCodePoint4 mba ofs cp 1466-} 1467 1468writeCodePointN :: B -> MBA s -> B -> CP -> ST s () 1469writeCodePointN 1 = writeCodePoint1 1470writeCodePointN 2 = writeCodePoint2 1471writeCodePointN 3 = writeCodePoint3 1472writeCodePointN 4 = writeCodePoint4 1473writeCodePointN _ = undefined 1474 1475writeCodePoint1 :: MBA s -> B -> CP -> ST s () 1476writeCodePoint1 mba ofs (CP cp) = 1477 writeWord8Array mba ofs cp 1478 1479writeCodePoint2 :: MBA s -> B -> CP -> ST s () 1480writeCodePoint2 mba ofs (CP cp) = do 1481 writeWord8Array mba ofs (0xc0 .|. (cp `unsafeShiftR` 6)) 1482 writeWord8Array mba (ofs+1) (0x80 .|. (cp .&. 0x3f)) 1483 1484writeCodePoint3 :: MBA s -> B -> CP -> ST s () 1485writeCodePoint3 mba ofs (CP cp) = do 1486 writeWord8Array mba ofs (0xe0 .|. (cp `unsafeShiftR` 12)) 1487 writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 6) .&. 0x3f)) 1488 writeWord8Array mba (ofs+2) (0x80 .|. (cp .&. 0x3f)) 1489 1490writeCodePoint4 :: MBA s -> B -> CP -> ST s () 1491writeCodePoint4 mba ofs (CP cp) = do 1492 writeWord8Array mba ofs (0xf0 .|. (cp `unsafeShiftR` 18)) 1493 writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 12) .&. 0x3f)) 1494 writeWord8Array mba (ofs+2) (0x80 .|. ((cp `unsafeShiftR` 6) .&. 0x3f)) 1495 writeWord8Array mba (ofs+3) (0x80 .|. (cp .&. 0x3f)) 1496 1497-- beware: UNSAFE! 1498readCodePoint :: ShortText -> B -> CP 1499readCodePoint st (csizeFromB -> ofs) 1500 = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp (toByteArray# st) ofs) 1501 1502foreign import ccall unsafe "hs_text_short_ofs_cp" c_text_short_ofs_cp :: ByteArray# -> CSize -> IO CCodePoint 1503 1504readCodePointRev :: ShortText -> B -> CP 1505readCodePointRev st (csizeFromB -> ofs) 1506 = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp_rev (toByteArray# st) ofs) 1507 1508foreign import ccall unsafe "hs_text_short_ofs_cp_rev" c_text_short_ofs_cp_rev :: ByteArray# -> CSize -> IO CCodePoint 1509 1510---------------------------------------------------------------------------- 1511-- string & list literals 1512 1513-- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) character literals are replaced by U+FFFD. 1514-- 1515-- @since 0.1.2 1516instance GHC.Exts.IsList ShortText where 1517 type (Item ShortText) = Char 1518 fromList = fromString 1519 toList = toString 1520 1521-- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) in string literals are replaced by U+FFFD. 1522-- 1523-- This matches the behaviour of 'S.IsString' instance for 'T.Text'. 1524instance S.IsString ShortText where 1525 fromString = fromStringLit 1526 1527-- i.e., don't inline before Phase 0 1528{-# INLINE [0] fromStringLit #-} 1529fromStringLit :: String -> ShortText 1530fromStringLit = fromString 1531 1532{-# RULES "ShortText empty literal" fromStringLit "" = mempty #-} 1533 1534-- TODO: this doesn't seem to fire 1535{-# RULES "ShortText singleton literal" forall c . fromStringLit [c] = singleton c #-} 1536 1537{-# RULES "ShortText literal ASCII" forall s . fromStringLit (GHC.unpackCString# s) = fromLitAsciiAddr# s #-} 1538 1539{-# RULES "ShortText literal UTF-8" forall s . fromStringLit (GHC.unpackCStringUtf8# s) = fromLitMUtf8Addr# s #-} 1540 1541{-# NOINLINE fromLitAsciiAddr# #-} 1542fromLitAsciiAddr# :: Addr# -> ShortText 1543fromLitAsciiAddr# (Ptr -> ptr) = unsafeDupablePerformIO $ do 1544 sz <- csizeToB `fmap` c_strlen ptr 1545 1546 case sz `compare` 0 of 1547 EQ -> return mempty -- should not happen if rules fire correctly 1548 GT -> stToIO $ do 1549 mba <- newByteArray sz 1550 copyAddrToByteArray ptr mba 0 sz 1551 unsafeFreeze mba 1552 LT -> return (error "fromLitAsciiAddr#") 1553 -- NOTE: should never happen unless strlen(3) overflows (NB: CSize 1554 -- is unsigned; the overflow would occur when converting to 1555 -- 'B') 1556 1557foreign import ccall unsafe "strlen" c_strlen :: CString -> IO CSize 1558 1559-- GHC uses an encoding resembling Modified UTF-8 for non-ASCII string-literals 1560{-# NOINLINE fromLitMUtf8Addr# #-} 1561fromLitMUtf8Addr# :: Addr# -> ShortText 1562fromLitMUtf8Addr# (Ptr -> ptr) = unsafeDupablePerformIO $ do 1563 sz <- B `fmap` c_text_short_mutf8_strlen ptr 1564 1565 case sz `compare` 0 of 1566 EQ -> return mempty -- should not happen if rules fire correctly 1567 GT -> stToIO $ do 1568 mba <- newByteArray sz 1569 copyAddrToByteArray ptr mba 0 sz 1570 unsafeFreeze mba 1571 LT -> do 1572 mba <- stToIO (newByteArray (abs sz)) 1573 c_text_short_mutf8_trans ptr (unMBA# mba) 1574 stToIO (unsafeFreeze mba) 1575 1576foreign import ccall unsafe "hs_text_short_mutf8_strlen" c_text_short_mutf8_strlen :: CString -> IO Int 1577 1578foreign import ccall unsafe "hs_text_short_mutf8_trans" c_text_short_mutf8_trans :: CString -> MutableByteArray# RealWorld -> IO () 1579 1580-- $setup 1581-- >>> :set -XOverloadedStrings 1582-- >>> import Data.Text.Short (pack, unpack, concat) 1583-- >>> import Text.Show.Functions () 1584-- >>> import qualified Test.QuickCheck.Arbitrary as QC 1585-- >>> import Test.QuickCheck.Instances () 1586-- >>> instance QC.Arbitrary ShortText where { arbitrary = fmap fromString QC.arbitrary } 1587