1{-# OPTIONS -funbox-strict-fields #-} 2-- | Common provides simple functions to the backend. It defines most 3-- of the data types. All modules should call error via the 4-- common_error function below. 5module Text.Regex.TDFA.Common where 6 7import Text.Regex.Base(RegexOptions(..)) 8 9{- By Chris Kuklewicz, 2007-2009. BSD License, see the LICENSE file. -} 10import Data.Array.IArray(Array) 11import Data.IntSet.EnumSet2(EnumSet) 12import qualified Data.IntSet.EnumSet2 as Set(toList) 13import Data.IntMap.CharMap2(CharMap(..)) 14import Data.IntMap (IntMap) 15import qualified Data.IntMap as IMap (findWithDefault,assocs,toList,null,size,toAscList) 16import Data.IntSet(IntSet) 17import qualified Data.IntMap.CharMap2 as Map (assocs,toAscList,null) 18import Data.Sequence as S(Seq) 19--import Debug.Trace 20 21import Text.Regex.TDFA.IntArrTrieSet(TrieSet) 22 23{-# INLINE look #-} 24look :: Int -> IntMap a -> a 25look key imap = IMap.findWithDefault (common_error "Text.Regex.DFA.Common" ("key "++show key++" not found in look")) key imap 26 27common_error :: String -> String -> a 28common_error moduleName message = 29 error ("Explict error in module "++moduleName++" : "++message) 30 31on :: (t1 -> t1 -> t2) -> (t -> t1) -> t -> t -> t2 32f `on` g = (\x y -> (g x) `f` (g y)) 33 34-- | after 'sort' or 'sortBy' the use of 'nub'\/'nubBy' can be replaced by 'norep'\/'norepBy' 35norep :: (Eq a) => [a]->[a] 36norep [] = [] 37norep x@[_] = x 38norep (a:bs@(c:cs)) | a==c = norep (a:cs) 39 | otherwise = a:norep bs 40 41-- | after 'sort' or 'sortBy' the use of 'nub'\/'nubBy' can be replaced by 'norep'\/'norepBy' 42norepBy :: (a -> a -> Bool) -> [a] -> [a] 43norepBy _ [] = [] 44norepBy _ x@[_] = x 45norepBy eqF (a:bs@(c:cs)) | a `eqF` c = norepBy eqF (a:cs) 46 | otherwise = a:norepBy eqF bs 47 48mapFst :: (Functor f) => (t -> t2) -> f (t, t1) -> f (t2, t1) 49mapFst f = fmap (\ (a,b) -> (f a,b)) 50 51mapSnd :: (Functor f) => (t1 -> t2) -> f (t, t1) -> f (t, t2) 52mapSnd f = fmap (\ (a,b) -> (a,f b)) 53 54fst3 :: (a,b,c) -> a 55fst3 (x,_,_) = x 56 57snd3 :: (a,b,c) -> b 58snd3 (_,x,_) = x 59 60thd3 :: (a,b,c) -> c 61thd3 (_,_,x) = x 62 63flipOrder :: Ordering -> Ordering 64flipOrder GT = LT 65flipOrder LT = GT 66flipOrder EQ = EQ 67 68noWin :: WinTags -> Bool 69noWin = null 70 71-- | Used to track elements of the pattern that accept characters or 72-- are anchors 73newtype DoPa = DoPa {dopaIndex :: Int} deriving (Eq,Ord) 74 75instance Enum DoPa where 76 toEnum = DoPa 77 fromEnum = dopaIndex 78 79instance Show DoPa where 80 showsPrec p (DoPa {dopaIndex=i}) = ('#':) . showsPrec p i 81 82-- | Control whether the pattern is multiline or case-sensitive like Text.Regex and whether to 83-- capture the subgroups (\\1, \\2, etc). Controls enabling extra anchor syntax. 84data CompOption = CompOption { 85 caseSensitive :: Bool -- ^ True in blankCompOpt and defaultCompOpt 86 , multiline :: Bool {- ^ False in blankCompOpt, True in defaultCompOpt. Compile for 87 newline-sensitive matching. "By default, newline is a completely ordinary 88 character with no special meaning in either REs or strings. With this flag, 89 inverted bracket expressions and . never match newline, a ^ anchor matches the 90 null string after any newline in the string in addition to its normal 91 function, and the $ anchor matches the null string before any newline in the 92 string in addition to its normal function." -} 93 , rightAssoc :: Bool -- ^ True (and therefore Right associative) in blankCompOpt and defaultCompOpt 94 , newSyntax :: Bool -- ^ False in blankCompOpt, True in defaultCompOpt. Add the extended non-POSIX syntax described in "Text.Regex.TDFA" haddock documentation. 95 , lastStarGreedy :: Bool -- ^ False by default. This is POSIX correct but it takes space and is slower. 96 -- Setting this to true will improve performance, and should be done 97 -- if you plan to set the captureGroups execoption to False. 98 } deriving (Read,Show) 99 100data ExecOption = ExecOption { 101 captureGroups :: Bool -- ^ True by default. Set to False to improve speed (and space). 102 } deriving (Read,Show) 103 104-- | Used by implementation to name certain Postions during 105-- matching. Identity of Position tag to set during a transition 106type Tag = Int 107-- | Internal use to indicate type of tag and preference for larger or smaller Positions 108data OP = Maximize | Minimize | Orbit | Ignore deriving (Eq,Show) 109-- | Internal NFA node identity number 110type Index = Int 111-- | Internal DFA identity is this Set of NFA Index 112type SetIndex = IntSet {- Index -} 113-- | Index into the text being searched 114type Position = Int 115 116-- | GroupIndex is for indexing submatches from capturing 117-- parenthesized groups (PGroup\/Group) 118type GroupIndex = Int 119-- | GroupInfo collects the parent and tag information for an instance 120-- of a group 121data GroupInfo = GroupInfo { 122 thisIndex, parentIndex :: GroupIndex 123 , startTag, stopTag, flagTag :: Tag 124 } deriving Show 125 126-- | The TDFA backend specific 'Regex' type, used by this module's RegexOptions and RegexMaker 127data Regex = Regex { 128 regex_dfa :: DFA -- ^ starting DFA state 129 , regex_init :: Index -- ^ index of starting state 130 , regex_b_index :: (Index,Index) -- ^ indexes of smallest and largest states 131 , regex_b_tags :: (Tag,Tag) -- ^ indexes of smallest and largest tags 132 , regex_trie :: TrieSet DFA -- ^ All DFA states 133 , regex_tags :: Array Tag OP -- ^ information about each tag 134 , regex_groups :: Array GroupIndex [GroupInfo] -- ^ information about each group 135 , regex_isFrontAnchored :: Bool -- ^ used for optimizing execution 136 , regex_compOptions :: CompOption 137 , regex_execOptions :: ExecOption 138 } -- no deriving at all, the DFA may be too big to ever traverse! 139 140 141instance RegexOptions Regex CompOption ExecOption where 142 blankCompOpt = CompOption { caseSensitive = True 143 , multiline = False 144 , rightAssoc = True 145 , newSyntax = False 146 , lastStarGreedy = False 147 } 148 blankExecOpt = ExecOption { captureGroups = True } 149 defaultCompOpt = CompOption { caseSensitive = True 150 , multiline = True 151 , rightAssoc = True 152 , newSyntax = True 153 , lastStarGreedy = False 154 } 155 defaultExecOpt = ExecOption { captureGroups = True } 156 setExecOpts e r = r {regex_execOptions=e} 157 getExecOpts r = regex_execOptions r 158 159 160data WinEmpty = WinEmpty Instructions 161 | WinTest WhichTest (Maybe WinEmpty) (Maybe WinEmpty) 162 deriving Show 163 164-- | Internal NFA node type 165data QNFA = QNFA {q_id :: Index, q_qt :: QT} 166 167-- | Internal to QNFA type. 168data QT = Simple { qt_win :: WinTags -- ^ empty transitions to the virtual winning state 169 , qt_trans :: CharMap QTrans -- ^ all ways to leave this QNFA to other or the same QNFA 170 , qt_other :: QTrans -- ^ default ways to leave this QNFA to other or the same QNFA 171 } 172 | Testing { qt_test :: WhichTest -- ^ The test to perform 173 , qt_dopas :: EnumSet DoPa -- ^ location(s) of the anchor(s) in the original regexp 174 , qt_a, qt_b :: QT -- ^ use qt_a if test is True, else use qt_b 175 } 176 177-- | Internal type to represent the tagged transition from one QNFA to 178-- another (or itself). The key is the Index of the destination QNFA. 179type QTrans = IntMap {- Destination Index -} [TagCommand] 180 181-- | Known predicates, just Beginning of Line (^) and End of Line ($). 182-- Also support for GNU extensions is being added: \\\` beginning of 183-- buffer, \\\' end of buffer, \\\< and \\\> for begin and end of words, \\b 184-- and \\B for word boundary and not word boundary. 185data WhichTest = Test_BOL | Test_EOL -- '^' and '$' (affected by multiline option) 186 | Test_BOB | Test_EOB -- \` and \' begin and end buffer 187 | Test_BOW | Test_EOW -- \< and \> begin and end word 188 | Test_EdgeWord | Test_NotEdgeWord -- \b and \B word boundaries 189 deriving (Show,Eq,Ord,Enum) 190 191-- | The things that can be done with a Tag. TagTask and 192-- ResetGroupStopTask are for tags with Maximize or Minimize OP 193-- values. ResetOrbitTask and EnterOrbitTask and LeaveOrbitTask are 194-- for tags with Orbit OP value. 195data TagTask = TagTask | ResetGroupStopTask | SetGroupStopTask 196 | ResetOrbitTask | EnterOrbitTask | LeaveOrbitTask deriving (Show,Eq) 197 198-- | Ordered list of tags and their associated Task 199type TagTasks = [(Tag,TagTask)] 200-- | When attached to a QTrans the TagTask can be done before or after 201-- accepting the character. 202data TagUpdate = PreUpdate TagTask | PostUpdate TagTask deriving (Show,Eq) 203-- | Ordered list of tags and their associated update operation. 204type TagList = [(Tag,TagUpdate)] 205-- | A TagList and the location of the item in the original pattern 206-- that is being accepted. 207type TagCommand = (DoPa,TagList) 208-- | Ordered list of tags and their associated update operation to 209-- perform on an empty transition to the virtual winning state. 210type WinTags = TagList 211 212-- | Internal DFA node, identified by the Set of indices of the QNFA 213-- nodes it represents. 214data DFA = DFA { d_id :: SetIndex, d_dt :: DT } deriving(Show) 215data Transition = Transition { trans_many :: DFA -- ^ where to go (maximal), including respawning 216 , trans_single :: DFA -- ^ where to go, not including respawning 217 , trans_how :: DTrans -- ^ how to go, including respawning 218 } 219-- | Internal to the DFA node 220data DT = Simple' { dt_win :: IntMap {- Source Index -} Instructions -- ^ Actions to perform to win 221 , dt_trans :: CharMap Transition -- ^ Transition to accept Char 222 , dt_other :: Transition -- ^ default accepting transition 223 } 224 | Testing' { dt_test :: WhichTest -- ^ The test to perform 225 , dt_dopas :: EnumSet DoPa -- ^ location(s) of the anchor(s) in the original regexp 226 , dt_a,dt_b :: DT -- ^ use dt_a if test is True else use dt_b 227 } 228 229-- | Internal type to repesent the commands for the tagged transition. 230-- The outer IntMap is for the destination Index and the inner IntMap 231-- is for the Source Index. This is convenient since all runtime data 232-- going to the same destination must be compared to find the best. 233-- 234-- A Destination IntMap entry may have an empty Source IntMap if and 235-- only if the destination is the starting index and the NFA\/DFA. 236-- This instructs the matching engine to spawn a new entry starting at 237-- the post-update position. 238type DTrans = IntMap {- Index of Destination -} (IntMap {- Index of Source -} (DoPa,Instructions)) 239-- type DTrans = IntMap {- Index of Destination -} (IntMap {- Index of Source -} (DoPa,RunState ())) 240-- | Internal convenience type for the text display code 241type DTrans' = [(Index, [(Index, (DoPa, ([(Tag, (Position,Bool))],[String])))])] 242 243-- | Positions for which a * was re-started while looping. Need to 244-- append locations at back but compare starting with front, so use 245-- Seq as a Queue. The initial position is saved in basePos (and a 246-- Maximize Tag), the middle positions in the Seq, and the final 247-- position is NOT saved in the Orbits (only in a Maximize Tag). 248-- 249-- The orderinal code is being written XXX TODO document it. 250data Orbits = Orbits 251 { inOrbit :: !Bool -- True if enterOrbit, False if LeaveOrbit 252 , basePos :: Position 253 , ordinal :: (Maybe Int) 254 , getOrbits :: !(Seq Position) 255 } deriving (Show) 256 257-- | The 'newPos' and 'newFlags' lists in Instructions are sorted by, and unique in, the Tag values 258data Instructions = Instructions 259 { newPos :: ![(Tag,Action)] -- False is preUpdate, True is postUpdate (there are no Orbit tags here) -- 2009 : Change to enum from bool? 260 , newOrbits :: !(Maybe (Position -> OrbitTransformer)) 261 } 262 263instance Show Instructions where 264 showsPrec p (Instructions pos _) 265 = showParen (p >= 11) $ 266 showString "Instructions {" . 267 showString "newPos = " . 268 showsPrec 0 pos . 269 showString ", " . 270 showString "newOrbits = " . 271 showString "<function>" . 272 showString "}" 273 274data Action = SetPre | SetPost | SetVal Int deriving (Show,Eq) 275type OrbitTransformer = OrbitLog -> OrbitLog 276type OrbitLog = IntMap Orbits 277 278instance Show QNFA where 279 show (QNFA {q_id = i, q_qt = qt}) = "QNFA {q_id = "++show i 280 ++"\n ,q_qt = "++ show qt 281 ++"\n}" 282 283instance Show QT where 284 show = showQT 285 286showQT :: QT -> String 287showQT (Simple win trans other) = "{qt_win=" ++ show win 288 ++ "\n, qt_trans=" ++ show (foo trans) 289 ++ "\n, qt_other=" ++ show (foo' other) ++ "}" 290 where foo :: CharMap QTrans -> [(Char,[(Index,[TagCommand])])] 291 foo = mapSnd foo' . Map.toAscList 292 foo' :: QTrans -> [(Index,[TagCommand])] 293 foo' = IMap.toList 294showQT (Testing test dopas a b) = "{Testing "++show test++" "++show (Set.toList dopas) 295 ++"\n"++indent' a 296 ++"\n"++indent' b++"}" 297 where indent' = init . unlines . map (spaces++) . lines . showQT 298 spaces = replicate 9 ' ' 299 300instance Show DT where show = showDT 301 302indent :: [String] -> String 303indent = unlines . map (\x -> ' ':' ':x) 304 305showDT :: DT -> String 306showDT (Simple' w t o) = 307 "Simple' { dt_win = " ++ seeWin1 308 ++ "\n , dt_trans = " ++ seeTrans1 309 ++ "\n , dt_other = " ++ seeOther1 o 310 ++ "\n }" 311 where 312 seeWin1 | IMap.null w = "No win" 313 | otherwise = indent . map show . IMap.assocs $ w 314 315 seeTrans1 :: String 316 seeTrans1 | Map.null t = "No (Char,Transition)" 317 | otherwise = ('\n':) . indent $ 318 map (\(char,Transition {trans_many=dfa,trans_single=dfa2,trans_how=dtrans}) -> 319 concat ["(" 320 ,show char 321 ,", MANY " 322 ,show (d_id dfa) 323 ,", SINGLE " 324 ,show (d_id dfa2) 325 ,", \n" 326 ,seeDTrans dtrans 327 ,")"]) (Map.assocs t) 328 329 seeOther1 (Transition {trans_many=dfa,trans_single=dfa2,trans_how=dtrans}) = 330 concat ["(MANY " 331 ,show (d_id dfa) 332 ,", SINGLE " 333 ,show (d_id dfa2) 334 ,", \n" 335 ,seeDTrans dtrans 336 ,")"] 337 338showDT (Testing' wt d a b) = "Testing' { dt_test = " ++ show wt 339 ++ "\n , dt_dopas = " ++ show d 340 ++ "\n , dt_a = " ++ indent' a 341 ++ "\n , dt_b = " ++ indent' b 342 ++ "\n }" 343 where indent' = init . unlines . (\s -> case s of 344 [] -> [] 345 (h:t) -> h : (map (spaces ++) t)) . lines . showDT 346 spaces = replicate 10 ' ' 347 348 349seeDTrans :: DTrans -> String 350--seeDTrans x = concatMap (\(dest,y) -> unlines . map (\(source,ins) -> show (dest,source,ins) ) . IMap.assocs $ y) (IMap.assocs x) 351seeDTrans x | IMap.null x = "No DTrans" 352seeDTrans x = concatMap seeSource (IMap.assocs x) 353 where seeSource (dest,srcMap) | IMap.null srcMap = indent [show (dest,"SPAWN")] 354 | otherwise = indent . map (\(source,ins) -> show (dest,source,ins) ) . IMap.assocs $ srcMap 355-- spawnIns = Instructions { newPos = [(0,SetPost)], newOrbits = Nothing } 356 357 358instance Eq QT where 359 t1@(Testing {}) == t2@(Testing {}) = 360 (qt_test t1) == (qt_test t2) && (qt_a t1) == (qt_a t2) && (qt_b t1) == (qt_b t2) 361 (Simple w1 (CharMap t1) o1) == (Simple w2 (CharMap t2) o2) = 362 w1 == w2 && eqTrans && eqQTrans o1 o2 363 where eqTrans :: Bool 364 eqTrans = (IMap.size t1 == IMap.size t2) 365 && and (zipWith together (IMap.toAscList t1) (IMap.toAscList t2)) 366 where together (c1,qtrans1) (c2,qtrans2) = (c1 == c2) && eqQTrans qtrans1 qtrans2 367 eqQTrans :: QTrans -> QTrans -> Bool 368 eqQTrans = (==) 369 _ == _ = False 370