1{-# OPTIONS -funbox-strict-fields #-}
2-- | Common provides simple functions to the backend.  It defines most
3-- of the data types.  All modules should call error via the
4-- common_error function below.
5module Text.Regex.TDFA.Common where
6
7import Text.Regex.Base(RegexOptions(..))
8
9{- By Chris Kuklewicz, 2007-2009. BSD License, see the LICENSE file. -}
10import Data.Array.IArray(Array)
11import Data.IntSet.EnumSet2(EnumSet)
12import qualified Data.IntSet.EnumSet2 as Set(toList)
13import Data.IntMap.CharMap2(CharMap(..))
14import Data.IntMap (IntMap)
15import qualified Data.IntMap as IMap (findWithDefault,assocs,toList,null,size,toAscList)
16import Data.IntSet(IntSet)
17import qualified Data.IntMap.CharMap2 as Map (assocs,toAscList,null)
18import Data.Sequence as S(Seq)
19--import Debug.Trace
20
21import Text.Regex.TDFA.IntArrTrieSet(TrieSet)
22
23{-# INLINE look #-}
24look :: Int -> IntMap a -> a
25look key imap = IMap.findWithDefault (common_error "Text.Regex.DFA.Common" ("key "++show key++" not found in look")) key imap
26
27common_error :: String -> String -> a
28common_error moduleName message =
29  error ("Explict error in module "++moduleName++" : "++message)
30
31on :: (t1 -> t1 -> t2) -> (t -> t1) -> t -> t -> t2
32f `on` g = (\x y -> (g x) `f` (g y))
33
34-- | after 'sort' or 'sortBy' the use of 'nub'\/'nubBy' can be replaced by 'norep'\/'norepBy'
35norep :: (Eq a) => [a]->[a]
36norep [] = []
37norep x@[_] = x
38norep (a:bs@(c:cs)) | a==c = norep (a:cs)
39                    | otherwise = a:norep bs
40
41-- | after 'sort' or 'sortBy' the use of 'nub'\/'nubBy' can be replaced by 'norep'\/'norepBy'
42norepBy :: (a -> a -> Bool) -> [a] -> [a]
43norepBy _ [] = []
44norepBy _ x@[_] = x
45norepBy eqF (a:bs@(c:cs)) | a `eqF` c = norepBy eqF (a:cs)
46                          | otherwise = a:norepBy eqF bs
47
48mapFst :: (Functor f) => (t -> t2) -> f (t, t1) -> f (t2, t1)
49mapFst f = fmap (\ (a,b) -> (f a,b))
50
51mapSnd :: (Functor f) => (t1 -> t2) -> f (t, t1) -> f (t, t2)
52mapSnd f = fmap (\ (a,b) -> (a,f b))
53
54fst3 :: (a,b,c) -> a
55fst3 (x,_,_) = x
56
57snd3 :: (a,b,c) -> b
58snd3 (_,x,_) = x
59
60thd3 :: (a,b,c) -> c
61thd3 (_,_,x) = x
62
63flipOrder :: Ordering -> Ordering
64flipOrder GT = LT
65flipOrder LT = GT
66flipOrder EQ = EQ
67
68noWin :: WinTags -> Bool
69noWin = null
70
71-- | Used to track elements of the pattern that accept characters or
72-- are anchors
73newtype DoPa = DoPa {dopaIndex :: Int} deriving (Eq,Ord)
74
75instance Enum DoPa where
76  toEnum = DoPa
77  fromEnum = dopaIndex
78
79instance Show DoPa where
80  showsPrec p (DoPa {dopaIndex=i}) = ('#':) . showsPrec p i
81
82-- | Control whether the pattern is multiline or case-sensitive like Text.Regex and whether to
83-- capture the subgroups (\\1, \\2, etc).  Controls enabling extra anchor syntax.
84data CompOption = CompOption {
85    caseSensitive :: Bool    -- ^ True in blankCompOpt and defaultCompOpt
86  , multiline :: Bool {- ^ False in blankCompOpt, True in defaultCompOpt. Compile for
87                      newline-sensitive matching.  "By default, newline is a completely ordinary
88                      character with no special meaning in either REs or strings.  With this flag,
89                      inverted bracket expressions and . never match newline, a ^ anchor matches the
90                      null string after any newline in the string in addition to its normal
91                      function, and the $ anchor matches the null string before any newline in the
92                      string in addition to its normal function." -}
93  , rightAssoc :: Bool       -- ^ True (and therefore Right associative) in blankCompOpt and defaultCompOpt
94  , newSyntax :: Bool        -- ^ False in blankCompOpt, True in defaultCompOpt. Add the extended non-POSIX syntax described in "Text.Regex.TDFA" haddock documentation.
95  , lastStarGreedy ::  Bool  -- ^ False by default.  This is POSIX correct but it takes space and is slower.
96                            -- Setting this to true will improve performance, and should be done
97                            -- if you plan to set the captureGroups execoption to False.
98  } deriving (Read,Show)
99
100data ExecOption = ExecOption {
101    captureGroups :: Bool    -- ^ True by default.  Set to False to improve speed (and space).
102  } deriving (Read,Show)
103
104-- | Used by implementation to name certain Postions during
105-- matching. Identity of Position tag to set during a transition
106type Tag = Int
107-- | Internal use to indicate type of tag and preference for larger or smaller Positions
108data OP = Maximize | Minimize | Orbit | Ignore deriving (Eq,Show)
109-- | Internal NFA node identity number
110type Index = Int
111-- | Internal DFA identity is this Set of NFA Index
112type SetIndex = IntSet {- Index -}
113-- | Index into the text being searched
114type Position = Int
115
116-- | GroupIndex is for indexing submatches from capturing
117-- parenthesized groups (PGroup\/Group)
118type GroupIndex = Int
119-- | GroupInfo collects the parent and tag information for an instance
120-- of a group
121data GroupInfo = GroupInfo {
122    thisIndex, parentIndex :: GroupIndex
123  , startTag, stopTag, flagTag :: Tag
124  } deriving Show
125
126-- | The TDFA backend specific 'Regex' type, used by this module's RegexOptions and RegexMaker
127data Regex = Regex {
128    regex_dfa :: DFA                             -- ^ starting DFA state
129  , regex_init :: Index                          -- ^ index of starting state
130  , regex_b_index :: (Index,Index)               -- ^ indexes of smallest and largest states
131  , regex_b_tags :: (Tag,Tag)                    -- ^ indexes of smallest and largest tags
132  , regex_trie :: TrieSet DFA                    -- ^ All DFA states
133  , regex_tags :: Array Tag OP                   -- ^ information about each tag
134  , regex_groups :: Array GroupIndex [GroupInfo] -- ^ information about each group
135  , regex_isFrontAnchored :: Bool                -- ^ used for optimizing execution
136  , regex_compOptions :: CompOption
137  , regex_execOptions :: ExecOption
138  } -- no deriving at all, the DFA may be too big to ever traverse!
139
140
141instance RegexOptions Regex CompOption ExecOption where
142  blankCompOpt =  CompOption { caseSensitive = True
143                             , multiline = False
144                             , rightAssoc = True
145                             , newSyntax = False
146                             , lastStarGreedy = False
147                             }
148  blankExecOpt =  ExecOption { captureGroups = True }
149  defaultCompOpt = CompOption { caseSensitive = True
150                              , multiline = True
151                              , rightAssoc = True
152                              , newSyntax = True
153                              , lastStarGreedy = False
154                              }
155  defaultExecOpt =  ExecOption { captureGroups = True }
156  setExecOpts e r = r {regex_execOptions=e}
157  getExecOpts r = regex_execOptions r
158
159
160data WinEmpty = WinEmpty Instructions
161              | WinTest WhichTest (Maybe WinEmpty) (Maybe WinEmpty)
162  deriving Show
163
164-- | Internal NFA node type
165data QNFA = QNFA {q_id :: Index, q_qt :: QT}
166
167-- | Internal to QNFA type.
168data QT = Simple { qt_win :: WinTags -- ^ empty transitions to the virtual winning state
169                 , qt_trans :: CharMap QTrans -- ^ all ways to leave this QNFA to other or the same QNFA
170                 , qt_other :: QTrans -- ^ default ways to leave this QNFA to other or the same QNFA
171                 }
172        | Testing { qt_test :: WhichTest -- ^ The test to perform
173                  , qt_dopas :: EnumSet DoPa  -- ^ location(s) of the anchor(s) in the original regexp
174                  , qt_a, qt_b :: QT -- ^ use qt_a if test is True, else use qt_b
175                  }
176
177-- | Internal type to represent the tagged transition from one QNFA to
178-- another (or itself).  The key is the Index of the destination QNFA.
179type QTrans = IntMap {- Destination Index -} [TagCommand]
180
181-- | Known predicates, just Beginning of Line (^) and End of Line ($).
182-- Also support for GNU extensions is being added: \\\` beginning of
183-- buffer, \\\' end of buffer, \\\< and \\\> for begin and end of words, \\b
184-- and \\B for word boundary and not word boundary.
185data WhichTest = Test_BOL | Test_EOL -- '^' and '$' (affected by multiline option)
186               | Test_BOB | Test_EOB -- \` and \' begin and end buffer
187               | Test_BOW | Test_EOW -- \< and \> begin and end word
188               | Test_EdgeWord | Test_NotEdgeWord -- \b and \B word boundaries
189  deriving (Show,Eq,Ord,Enum)
190
191-- | The things that can be done with a Tag.  TagTask and
192-- ResetGroupStopTask are for tags with Maximize or Minimize OP
193-- values.  ResetOrbitTask and EnterOrbitTask and LeaveOrbitTask are
194-- for tags with Orbit OP value.
195data TagTask = TagTask | ResetGroupStopTask | SetGroupStopTask
196             | ResetOrbitTask | EnterOrbitTask | LeaveOrbitTask deriving (Show,Eq)
197
198-- | Ordered list of tags and their associated Task
199type TagTasks = [(Tag,TagTask)]
200-- | When attached to a QTrans the TagTask can be done before or after
201-- accepting the character.
202data TagUpdate = PreUpdate TagTask | PostUpdate TagTask deriving (Show,Eq)
203-- | Ordered list of tags and their associated update operation.
204type TagList = [(Tag,TagUpdate)]
205-- | A TagList and the location of the item in the original pattern
206-- that is being accepted.
207type TagCommand = (DoPa,TagList)
208-- | Ordered list of tags and their associated update operation to
209-- perform on an empty transition to the virtual winning state.
210type WinTags = TagList
211
212-- | Internal DFA node, identified by the Set of indices of the QNFA
213-- nodes it represents.
214data DFA = DFA { d_id :: SetIndex, d_dt :: DT } deriving(Show)
215data Transition = Transition { trans_many :: DFA    -- ^ where to go (maximal), including respawning
216                             , trans_single :: DFA  -- ^ where to go, not including respawning
217                             , trans_how :: DTrans    -- ^ how to go, including respawning
218                             }
219-- | Internal to the DFA node
220data DT = Simple' { dt_win :: IntMap {- Source Index -} Instructions -- ^ Actions to perform to win
221                  , dt_trans :: CharMap Transition -- ^ Transition to accept Char
222                  , dt_other :: Transition -- ^ default accepting transition
223                  }
224        | Testing' { dt_test :: WhichTest -- ^ The test to perform
225                   , dt_dopas :: EnumSet DoPa -- ^ location(s) of the anchor(s) in the original regexp
226                   , dt_a,dt_b :: DT      -- ^ use dt_a if test is True else use dt_b
227                   }
228
229-- | Internal type to repesent the commands for the tagged transition.
230-- The outer IntMap is for the destination Index and the inner IntMap
231-- is for the Source Index.  This is convenient since all runtime data
232-- going to the same destination must be compared to find the best.
233--
234-- A Destination IntMap entry may have an empty Source IntMap if and
235-- only if the destination is the starting index and the NFA\/DFA.
236-- This instructs the matching engine to spawn a new entry starting at
237-- the post-update position.
238type DTrans = IntMap {- Index of Destination -} (IntMap {- Index of Source -} (DoPa,Instructions))
239-- type DTrans = IntMap {- Index of Destination -} (IntMap {- Index of Source -} (DoPa,RunState ()))
240-- | Internal convenience type for the text display code
241type DTrans' = [(Index, [(Index, (DoPa, ([(Tag, (Position,Bool))],[String])))])]
242
243-- | Positions for which a * was re-started while looping.  Need to
244-- append locations at back but compare starting with front, so use
245-- Seq as a Queue.  The initial position is saved in basePos (and a
246-- Maximize Tag), the middle positions in the Seq, and the final
247-- position is NOT saved in the Orbits (only in a Maximize Tag).
248--
249-- The orderinal code is being written XXX TODO document it.
250data Orbits = Orbits
251  { inOrbit :: !Bool        -- True if enterOrbit, False if LeaveOrbit
252  , basePos :: Position
253  , ordinal :: (Maybe Int)
254  , getOrbits :: !(Seq Position)
255  } deriving (Show)
256
257-- | The 'newPos' and 'newFlags' lists in Instructions are sorted by, and unique in, the Tag values
258data Instructions = Instructions
259  { newPos :: ![(Tag,Action)] -- False is preUpdate, True is postUpdate (there are no Orbit tags here) -- 2009 : Change to enum from bool?
260  , newOrbits :: !(Maybe (Position -> OrbitTransformer))
261  }
262
263instance Show Instructions where
264  showsPrec p (Instructions pos _)
265    = showParen (p >= 11) $
266        showString "Instructions {" .
267        showString "newPos = " .
268        showsPrec 0 pos .
269        showString ", " .
270        showString "newOrbits = " .
271        showString "<function>" .
272        showString "}"
273
274data Action = SetPre | SetPost | SetVal Int deriving (Show,Eq)
275type OrbitTransformer = OrbitLog -> OrbitLog
276type OrbitLog = IntMap Orbits
277
278instance Show QNFA where
279  show (QNFA {q_id = i, q_qt = qt}) = "QNFA {q_id = "++show i
280                                  ++"\n     ,q_qt = "++ show qt
281                                  ++"\n}"
282
283instance Show QT where
284  show = showQT
285
286showQT :: QT -> String
287showQT (Simple win trans other) = "{qt_win=" ++ show win
288                             ++ "\n, qt_trans=" ++ show (foo trans)
289                             ++ "\n, qt_other=" ++ show (foo' other) ++ "}"
290  where foo :: CharMap QTrans -> [(Char,[(Index,[TagCommand])])]
291        foo = mapSnd foo' . Map.toAscList
292        foo' :: QTrans -> [(Index,[TagCommand])]
293        foo' = IMap.toList
294showQT (Testing test dopas a b) = "{Testing "++show test++" "++show (Set.toList dopas)
295                              ++"\n"++indent' a
296                              ++"\n"++indent' b++"}"
297    where indent' = init . unlines . map (spaces++) . lines . showQT
298          spaces = replicate 9 ' '
299
300instance Show DT where show = showDT
301
302indent :: [String] -> String
303indent = unlines . map (\x -> ' ':' ':x)
304
305showDT :: DT -> String
306showDT (Simple' w t o) =
307       "Simple' { dt_win = " ++ seeWin1
308  ++ "\n        , dt_trans = " ++ seeTrans1
309  ++ "\n        , dt_other = " ++ seeOther1 o
310  ++ "\n        }"
311 where
312  seeWin1 | IMap.null w = "No win"
313          | otherwise = indent . map show . IMap.assocs $ w
314
315  seeTrans1 :: String
316  seeTrans1 | Map.null t = "No (Char,Transition)"
317            | otherwise = ('\n':) . indent $
318     map (\(char,Transition {trans_many=dfa,trans_single=dfa2,trans_how=dtrans}) ->
319                           concat ["("
320                                  ,show char
321                                  ,", MANY "
322                                  ,show (d_id dfa)
323                                  ,", SINGLE "
324                                  ,show (d_id dfa2)
325                                  ,", \n"
326                                  ,seeDTrans dtrans
327                                  ,")"]) (Map.assocs t)
328
329  seeOther1 (Transition {trans_many=dfa,trans_single=dfa2,trans_how=dtrans}) =
330    concat ["(MANY "
331           ,show (d_id dfa)
332           ,", SINGLE "
333           ,show (d_id dfa2)
334           ,", \n"
335           ,seeDTrans dtrans
336           ,")"]
337
338showDT (Testing' wt d a b) = "Testing' { dt_test = " ++ show wt
339                          ++ "\n         , dt_dopas = " ++ show d
340                          ++ "\n         , dt_a = " ++ indent' a
341                          ++ "\n         , dt_b = " ++ indent' b
342                          ++ "\n         }"
343 where indent' = init . unlines . (\s -> case s of
344                                           [] -> []
345                                           (h:t) -> h : (map (spaces ++) t)) . lines . showDT
346       spaces = replicate 10 ' '
347
348
349seeDTrans :: DTrans -> String
350--seeDTrans x = concatMap (\(dest,y) -> unlines . map (\(source,ins) -> show (dest,source,ins) ) . IMap.assocs $ y) (IMap.assocs x)
351seeDTrans x | IMap.null x = "No DTrans"
352seeDTrans x = concatMap seeSource (IMap.assocs x)
353  where seeSource (dest,srcMap) | IMap.null srcMap = indent [show (dest,"SPAWN")]
354                                | otherwise = indent . map (\(source,ins) -> show (dest,source,ins) ) . IMap.assocs $ srcMap
355--        spawnIns = Instructions { newPos = [(0,SetPost)], newOrbits = Nothing }
356
357
358instance Eq QT where
359  t1@(Testing {}) == t2@(Testing {}) =
360    (qt_test t1) == (qt_test t2) && (qt_a t1) == (qt_a t2) && (qt_b t1) == (qt_b t2)
361  (Simple w1 (CharMap t1) o1) == (Simple w2 (CharMap t2) o2) =
362    w1 == w2 && eqTrans && eqQTrans o1 o2
363    where eqTrans :: Bool
364          eqTrans = (IMap.size t1 == IMap.size t2)
365                    && and (zipWith together (IMap.toAscList t1) (IMap.toAscList t2))
366            where together (c1,qtrans1) (c2,qtrans2) = (c1 == c2) && eqQTrans qtrans1 qtrans2
367          eqQTrans :: QTrans -> QTrans -> Bool
368          eqQTrans = (==)
369  _ == _ = False
370