1{-# LANGUAGE DeriveDataTypeable #-}
2
3-- ------------------------------------------------------------
4
5{- |
6   Module     : Data.Atom
7   Copyright  : Copyright (C) 2008 Uwe Schmidt
8   License    : MIT
9
10   Maintainer : Uwe Schmidt (uwe\@fh-wedel.de)
11   Stability  : experimental
12   Portability: non-portable
13
14   Unique Atoms generated from Strings and
15   managed as flyweights
16
17   Data.Atom can be used for caching and storage optimisation
18   of frequently used strings. An @Atom@ is constructed from a @String@.
19   For two equal strings the identical atom is returned.
20
21   This module can be used for optimizing memory usage when working with
22   strings or names. Many applications use data types like
23   @Map String SomeAttribute@ where a rather fixed set of keys is used.
24   Especially XML applications often work with a limited set of element and attribute names.
25   For these applications it becomes more memory efficient when working with types like
26   @Map Atom SomeAttribute@ and convert the keys into atoms before operating
27   on such a map.
28
29   Internally this module manages a map of atoms. The atoms are internally represented
30   by @ByteString@s. When creating a new atom from a string, the string is first converted
31   into an UTF8 @Word8@ sequence, which is packed into a @ByteString@. This @ByteString@ is looked
32   up in the table of atoms. If it is already there, the value in the map is used as atom, else
33   the new @ByteString@ is inserted into the map.
34
35   Of course the implementation of this name cache uses @unsavePerformIO@.
36   The global cache is managed by ue of an @IORef@ and atomicModifyIORef.
37
38   The following laws hold for atoms
39
40   >
41   > s  ==       t => newAtom s  ==       newAtom t
42   > s `compare` t => newAtom s `compare` newAtom t
43   > show . newAtom == id
44
45   Equality test for @Atom@s runs in /O(1)/, it is just a pointer comarison.
46   The @Ord@ comparisons have the same runtime like the @ByteString@ comparisons.
47   Internally there is an UTF8 comparison, but UTF8 encoding preserves the total order.
48
49   Warning: The internal cache never shrinks during execution. So using it in a
50   undisciplined way can lead to memory leaks.
51-}
52
53-----------------------------------------------------------------------------
54
55module Data.Atom (
56   -- * Atom objects
57   Atom,                -- instance (Eq, Ord, Read, Show)
58   newAtom,             -- :: String -> Atom
59   share                -- :: String -> String
60 ) where
61
62import           Control.DeepSeq
63
64import           Data.ByteString          (ByteString, pack, unpack)
65import           Data.ByteString.Internal (c2w, toForeignPtr, w2c)
66import           Data.IORef
67import qualified Data.Map                 as M
68import           Data.String.Unicode      (unicodeToUtf8)
69import           Data.String.UTF8Decoding (decodeUtf8)
70import           Data.Typeable
71
72import           System.IO.Unsafe         (unsafePerformIO)
73
74-- ------------------------------------------------------------
75
76type Atoms      = M.Map ByteString ByteString
77
78newtype Atom    = A { bs :: ByteString }
79                  deriving (Typeable)
80
81-- ------------------------------------------------------------
82
83-- | the internal cache for the strings
84
85theAtoms        :: IORef Atoms
86theAtoms        = unsafePerformIO (newIORef M.empty)
87{-# NOINLINE theAtoms #-}
88
89-- | insert a bytestring into the atom cache
90
91insertAtom      :: ByteString -> Atoms -> (Atoms, Atom)
92insertAtom s m  = maybe (M.insert s s m, A s)
93                        (\ s' -> (m, A s'))
94                  .
95                  M.lookup s $ m
96
97-- | creation of an @Atom@ from a @String@
98
99newAtom         :: String -> Atom
100newAtom         = unsafePerformIO . newAtom'
101{-# NOINLINE newAtom #-}
102
103-- | The internal operation running in the IO monad
104newAtom'        :: String -> IO Atom
105newAtom' s      = do
106                  -- putStrLn "insert atom into cache"
107                  res <- atomicModifyIORef theAtoms insert
108                  -- putStrLn "atom cache updated"
109                  return res
110  where
111    insert m    = let r = insertAtom (pack. map c2w . unicodeToUtf8 $ s) m
112                  in
113                   fst r `seq` r
114
115-- | Insert a @String@ into the atom cache and convert the atom back into a @String@.
116--
117-- locically @share == id@ holds, but internally equal strings share the same memory.
118
119share           :: String -> String
120share           = show . newAtom
121
122instance Eq Atom where
123    a1 == a2    = fp1 == fp2
124                  where
125                  (fp1, _, _) = toForeignPtr . bs $ a1
126                  (fp2, _, _) = toForeignPtr . bs $ a2
127
128instance Ord Atom where
129    compare a1 a2
130                | a1 == a2      = EQ
131                | otherwise     = compare (bs a1) (bs a2)
132
133instance Read Atom where
134    readsPrec p str = [ (newAtom x, y) | (x, y) <- readsPrec p str ]
135
136instance Show Atom where
137    show        = fst . decodeUtf8 . map w2c . unpack . bs
138    -- show     = show . toForeignPtr . bs                      -- for debug only
139
140instance NFData Atom where
141    rnf x = seq x ()
142
143-----------------------------------------------------------------------------
144