1-- XXX design uncertainty: should preResets be inserted into nullView? 2-- if not, why not? ADDED 3 4-- XXX design uncertainty: what does act -> actNullable -> 5-- actNullableTagless not use nullQ and same for inStar, etc? 6-- TODO : try rewriting whole qToNFA in terms of "act" 7-- (That will require re-organizing the continuation data a bit) 8 9-- | "Text.Regex.TDFA.TNFA" converts the CorePattern Q\/P data (and its 10-- Pattern leafs) to a QNFA tagged non-deterministic finite automata. 11-- 12-- This holds every possible way to follow one state by another, while 13-- in the DFA these will be reduced by picking a single best 14-- transition for each (soure,destination) pair. The transitions are 15-- heavily and often redundantly annotated with tasks to perform, and 16-- this redundancy is reduced when picking the best transition. So 17-- far, keeping all this information has helped fix bugs in both the 18-- design and implementation. 19-- 20-- The QNFA for a Pattern with a starTraned Q\/P form with N one 21-- character accepting leaves has at most N+1 nodes. These nodes 22-- repesent the future choices after accepting a leaf. The processing 23-- of Or nodes often reduces this number by sharing at the end of the 24-- different paths. Turning off capturing while compiling the pattern 25-- may (future extension) reduce this further for some patterns by 26-- processing Star with optimizations. This compact design also means 27-- that tags are assigned not just to be updated before taking a 28-- transition (PreUpdate) but also after the transition (PostUpdate). 29-- 30-- Uses recursive do notation. 31 32module Text.Regex.TDFA.TNFA(patternToNFA 33 ,QNFA(..),QT(..),QTrans,TagUpdate(..)) where 34 35{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} 36 37import Control.Monad(when) 38import Control.Monad.State(State,runState,execState,get,put,modify) 39import Data.Array.IArray(Array,array) 40import Data.Char(toLower,toUpper,isAlpha,ord) 41import Data.List(foldl') 42import Data.IntMap (IntMap) 43import qualified Data.IntMap as IMap(toAscList,null,unionWith,singleton,fromList,fromDistinctAscList) 44import Data.IntMap.CharMap2(CharMap(..)) 45import qualified Data.IntMap.CharMap2 as Map(null,singleton,map) 46import qualified Data.IntMap.EnumMap2 as EMap(null,keysSet,assocs) 47import Data.IntSet.EnumSet2(EnumSet) 48import qualified Data.IntSet.EnumSet2 as Set(singleton,toList,insert) 49import Data.Maybe(catMaybes,isNothing) 50import Data.Monoid as Mon(Monoid(..)) 51import qualified Data.Set as S(Set,insert,toAscList,empty) 52 53import Text.Regex.TDFA.Common(QT(..),QNFA(..),QTrans,TagTask(..),TagUpdate(..),DoPa(..) 54 ,CompOption(..) 55 ,Tag,TagTasks,TagList,Index,WinTags,GroupIndex,GroupInfo(..) 56 ,common_error,noWin,snd3,mapSnd) 57import Text.Regex.TDFA.CorePattern(Q(..),P(..),OP(..),WhichTest,cleanNullView,NullView 58 ,SetTestInfo(..),Wanted(..),TestInfo 59 ,mustAccept,cannotAccept,patternToQ) 60import Text.Regex.TDFA.Pattern(Pattern(..),PatternSet(..),unSEC,PatternSetCharacterClass(..)) 61--import Debug.Trace 62 63ecart :: String -> a -> a 64ecart _ = id 65 66err :: String -> a 67err t = common_error "Text.Regex.TDFA.TNFA" t 68 69debug :: (Show a) => a -> s -> s 70debug _ s = s 71 72qtwin,qtlose :: QT 73-- qtwin is the continuation after matching the whole pattern. It has 74-- no futher transitions and sets tag #1 to the current position. 75qtwin = Simple {qt_win=[(1,PreUpdate TagTask)],qt_trans=mempty,qt_other=mempty} 76-- qtlose is the continuation to nothing, used when ^ or $ tests fail. 77qtlose = Simple {qt_win=mempty,qt_trans=mempty,qt_other=mempty} 78 79patternToNFA :: CompOption 80 -> (Pattern,(GroupIndex, DoPa)) 81 -> ((Index,Array Index QNFA) 82 ,Array Tag OP 83 ,Array GroupIndex [GroupInfo]) 84patternToNFA compOpt pattern = 85 let (q,tags,groups) = patternToQ compOpt pattern 86 msg = unlines [ show q ] 87 in debug msg (qToNFA compOpt q,tags,groups) 88 89-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == 90-- Query function on Q 91 92nullable :: Q -> Bool 93nullable = not . null . nullQ 94 95notNullable :: Q -> Bool 96notNullable = null . nullQ 97 98-- This asks if the preferred (i.e. first) NullView has no tests. 99maybeOnlyEmpty :: Q -> Maybe WinTags 100maybeOnlyEmpty (Q {nullQ = ((SetTestInfo sti,tags):_)}) = if EMap.null sti then Just tags else Nothing 101maybeOnlyEmpty _ = Nothing 102 103usesQNFA :: Q -> Bool 104usesQNFA (Q {wants=WantsBoth}) = True 105usesQNFA (Q {wants=WantsQNFA}) = True 106usesQNFA _ = False 107 108-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == 109-- Functions related to QT 110 111-- dumb smart constructor used by qToQNFA 112-- Possible: Go through the qt and keep only the best tagged transition(s) to each state to make simple NFA? 113mkQNFA :: Index -> QT -> QNFA 114mkQNFA i qt = debug ("\n>QNFA id="++show i) $ 115 QNFA i (debug ("\ngetting QT for "++show i) qt) 116 117-- This uses the Eq QT instance above 118-- ZZZ 119mkTesting :: QT -> QT 120mkTesting t@(Testing {qt_a=a,qt_b=b}) = if a==b then a else t -- Move to nfsToDFA XXX 121mkTesting t = t 122 123nullQT :: QT -> Bool 124nullQT (Simple {qt_win=w,qt_trans=t,qt_other=o}) = noWin w && Map.null t && IMap.null o 125nullQT _ = False 126 127-- This reconstructs the set of tests checked in processing QT, adding 128-- them to the passed set. 129listTestInfo :: QT -> EnumSet WhichTest -> EnumSet WhichTest 130listTestInfo qt s = execState (helper qt) s 131 where helper (Simple {}) = return () 132 helper (Testing {qt_test = wt, qt_a = a, qt_b = b}) = do 133 modify (Set.insert wt) 134 helper a 135 helper b 136 137-- This is used to view "win" only through NullView, and is used in 138-- processing Or. 139applyNullViews :: NullView -> QT -> QT 140applyNullViews [] win = win 141applyNullViews nvs win = foldl' (dominate win) qtlose (reverse $ cleanNullView nvs) where 142 143-- This is used to prefer to view "win" through NullView. Losing is 144-- replaced by the plain win. This is employed by Star patterns to 145-- express that the first iteration is allowed to match null, but 146-- skipping the NullView occurs if the match fails. 147preferNullViews :: NullView -> QT -> QT 148preferNullViews [] win = win 149preferNullViews nvs win = foldl' (dominate win) win (reverse $ cleanNullView nvs) where 150 151{- 152dominate is common to applyNullViews and preferNullViews above. 153 154Even I no longer understand it without study. 155 156Oversimplified: The last argument has a new set of tests "sti" that 157must be satisfied to then apply the new "tags" and reach the "win" QT. 158Failing any of this set of tests leads to the "lose" QT. 159 160Closer: The "win" may already have some other set of tests leading to 161various branches, this set is cached in winTests. And the "lose" may 162already have some other set of tests leading to various branches. The 163combination of "win" and "lose" and "sti" must check the union of 164these tests, which is "allTests". 165 166Detail: The merging is done by useTest, where the tests in sti divert 167losing to a branch of "lose" and winning to a branch of "win". Tests 168not in sti are unchanged (but the losing DoPa index might be added). 169-} 170dominate :: QT -> QT -> (SetTestInfo,WinTags) -> QT 171dominate win lose x@(SetTestInfo sti,tags) = debug ("dominate "++show x) $ 172 let -- The winning states are reached through the SetTag 173 win' = prependTags' tags win 174 -- get the SetTestInfo 175 winTests = listTestInfo win $ mempty 176 allTests = (listTestInfo lose $ winTests) `mappend` (EMap.keysSet sti) 177 -- The first and second arguments of useTest are sorted 178 -- At all times the second argument of useTest is a subset of the first 179 useTest _ [] w _ = w -- no more dominating tests to fail to choose lose, so just choose win 180 useTest (aTest:tests) allD@((dTest,dopas):ds) w l = 181 let (wA,wB,wD) = branches w 182 (lA,lB,lD) = branches l 183 branches qt@(Testing {}) | aTest==qt_test qt = (qt_a qt,qt_b qt,qt_dopas qt) 184 branches qt = (qt,qt,mempty) 185 in if aTest == dTest 186 then Testing {qt_test = aTest 187 ,qt_dopas = (dopas `mappend` wD) `mappend` lD 188 ,qt_a = useTest tests ds wA lA 189 ,qt_b = lB} 190 else Testing {qt_test = aTest 191 ,qt_dopas = wD `mappend` lD 192 ,qt_a = useTest tests allD wA lA 193 ,qt_b = useTest tests allD wB lB} 194 useTest [] _ _ _ = err "This case in dominate.useText cannot happen: second argument would have to have been null and that is checked before this case" 195 in useTest (Set.toList allTests) (EMap.assocs sti) win' lose 196 197-- 'applyTest' is only used by addTest 198-- 2009: maybe need to keep track of whether a change is actually made 199-- (beyond DoPa tracking) to the QT. 200applyTest :: TestInfo -> QT -> QT 201applyTest (wt,dopa) qt | nullQT qt = qt 202 | otherwise = applyTest' qt where 203 applyTest' :: QT -> QT 204 applyTest' q@(Simple {}) = 205 mkTesting $ Testing {qt_test = wt 206 ,qt_dopas = Set.singleton dopa 207 ,qt_a = q 208 ,qt_b = qtlose} 209 applyTest' q@(Testing {qt_test=wt'}) = 210 case compare wt wt' of 211 LT -> Testing {qt_test = wt 212 ,qt_dopas = Set.singleton dopa 213 ,qt_a = q 214 ,qt_b = qtlose} 215 EQ -> q {qt_dopas = Set.insert dopa (qt_dopas q) 216 ,qt_b = qtlose} 217 GT -> q {qt_a = applyTest' (qt_a q) 218 ,qt_b = applyTest' (qt_b q)} 219 220-- Three ways to merge a pair of QT's varying how winning transitions 221-- are handled. 222-- 223-- mergeQT_2nd is used by the NonEmpty case and always discards the 224-- first argument's win and uses the second argment's win. 225-- 226-- mergeAltQT is used by the Or cases and is biased to the first 227-- argument's winning transition, if present. 228-- 229-- mergeQT is used by Star and mergeE and combines the winning 230-- transitions (concatenating the instructions). 231mergeQT_2nd,mergeAltQT,mergeQT :: QT -> QT -> QT 232mergeQT_2nd q1 q2 | nullQT q1 = q2 233 | otherwise = mergeQTWith (\_ w2 -> w2) q1 q2 234 235mergeAltQT q1 q2 | nullQT q1 = q2 -- prefer winning with w1 then with w2 236 | otherwise = mergeQTWith (\w1 w2 -> if noWin w1 then w2 else w1) q1 q2 237mergeQT q1 q2 | nullQT q1 = q2 -- union wins 238 | nullQT q2 = q1 -- union wins 239 | otherwise = mergeQTWith mappend q1 q2 -- no preference, win with combined SetTag XXX is the wrong thing! "(.?)*" 240 241-- This takes a function which implements a policy on mergining 242-- winning transitions and then merges all the transitions. It opens 243-- the CharMap newtype for more efficient operation, then rewraps it. 244mergeQTWith :: (WinTags -> WinTags -> WinTags) -> QT -> QT -> QT 245mergeQTWith mergeWins = merge where 246 merge :: QT -> QT -> QT 247 merge (Simple w1 t1 o1) (Simple w2 t2 o2) = 248 let w' = mergeWins w1 w2 249 t' = fuseQTrans t1 o1 t2 o2 250 o' = mergeQTrans o1 o2 251 in Simple w' t' o' 252 merge t1@(Testing _ _ a1 b1) s2@(Simple {}) = mkTesting $ 253 t1 {qt_a=(merge a1 s2), qt_b=(merge b1 s2)} 254 merge s1@(Simple {}) t2@(Testing _ _ a2 b2) = mkTesting $ 255 t2 {qt_a=(merge s1 a2), qt_b=(merge s1 b2)} 256 merge t1@(Testing wt1 ds1 a1 b1) t2@(Testing wt2 ds2 a2 b2) = mkTesting $ 257 case compare wt1 wt2 of 258 LT -> t1 {qt_a=(merge a1 t2), qt_b=(merge b1 t2)} 259 EQ -> Testing {qt_test = wt1 -- same as wt2 260 ,qt_dopas = mappend ds1 ds2 261 ,qt_a = merge a1 a2 262 ,qt_b = merge b1 b2} 263 GT -> t2 {qt_a=(merge t1 a2), qt_b=(merge t1 b2)} 264 265 fuseQTrans :: (CharMap QTrans) -> QTrans 266 -> (CharMap QTrans) -> QTrans 267 -> CharMap QTrans 268 fuseQTrans (CharMap t1) o1 (CharMap t2) o2 = CharMap (IMap.fromDistinctAscList (fuse l1 l2)) where 269 l1 = IMap.toAscList t1 270 l2 = IMap.toAscList t2 271 fuse [] y = mapSnd (mergeQTrans o1) y 272 fuse x [] = mapSnd (mergeQTrans o2) x 273 fuse x@((xc,xa):xs) y@((yc,ya):ys) = 274 case compare xc yc of 275 LT -> (xc,mergeQTrans xa o2) : fuse xs y 276 EQ -> (xc,mergeQTrans xa ya) : fuse xs ys 277 GT -> (yc,mergeQTrans o1 ya) : fuse x ys 278 279 mergeQTrans :: QTrans -> QTrans -> QTrans 280 mergeQTrans = IMap.unionWith mappend 281 282-- Note: There are no append* operations. There are only these 283-- prepend* operations because things are only prepended to the future 284-- continuation. And the ordering is significant. 285 286-- This is only used in inStar/nullable 287prependPreTag :: Maybe Tag -> QT -> QT 288prependPreTag Nothing qt = qt 289prependPreTag (Just tag) qt = prependTags' [(tag,PreUpdate TagTask)] qt 290 291prependGroupResets :: [Tag] -> QT -> QT 292prependGroupResets [] qt = qt 293prependGroupResets tags qt = prependTags' [(tag,PreUpdate ResetGroupStopTask)|tag<-tags] qt 294 295prependTags' :: TagList -> QT -> QT 296prependTags' [] qt = qt 297prependTags' tcs' qt@(Testing {}) = qt { qt_a = prependTags' tcs' (qt_a qt) 298 , qt_b = prependTags' tcs' (qt_b qt) } 299prependTags' tcs' (Simple {qt_win=w,qt_trans=t,qt_other=o}) = 300 Simple { qt_win = if noWin w then w else tcs' `mappend` w 301 , qt_trans = Map.map prependQTrans t 302 , qt_other = prependQTrans o } 303 where prependQTrans = fmap (map (\(d,tcs) -> (d,tcs' `mappend` tcs))) 304 305-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == 306-- define type S which is a State monad, this allows the creation of the uniq QNFA ids and storing the QNFA 307-- in an ascending order difference list for later placement in an array. 308 309-- Type of State monad used inside qToNFA 310type S = State (Index -- Next available QNFA index 311 ,[(Index,QNFA)]->[(Index,QNFA)]) -- DList of previous QNFAs 312 313-- Type of continuation of the NFA, not much more complicated 314type E = (TagTasks -- Things to do before the Either QNFA QT 315 -- with OneChar these become PostUpdate otherwise they become PreUpdate 316 ,Either QNFA QT) -- The future, packaged in the best way 317 318-- See documentation below before the 'act' function. This is for use inside a Star pattern. 319type ActCont = ( E -- The eLoop is the dangerous recursive reference to continuation 320 -- future that loops while accepting zero more characters 321 , Maybe E -- This holds the safe non-zero-character accepting continuation 322 , Maybe (TagTasks,QNFA)) -- optimized merger of the above, used only inside act, to avoid orphan QNFA id values 323 324-- newQNFA is the only operation that actually uses the monad get and put operations 325newQNFA :: String -> QT -> S QNFA 326newQNFA s qt = do 327 (thisI,oldQs) <- get 328 let futureI = succ thisI in seq futureI $ debug (">newQNFA< "++s++" : "++show thisI) $ do 329 let qnfa = mkQNFA thisI qt -- (strictQT qt) -- making strictQNFA kills test (1,11) ZZZ 330 put $! (futureI, oldQs . ((thisI,qnfa):)) 331 return qnfa 332 333-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == 334-- E related functions 335 336fromQNFA :: QNFA -> E 337fromQNFA qnfa = (mempty,Left qnfa) 338 339fromQT :: QT -> E 340fromQT qt = (mempty,Right qt) 341 342-- Promises the output will match (_,Left _), used by Or cases when any branch wants a QNFA continuation 343asQNFA :: String -> E -> S E 344asQNFA _ x@(_,Left _) = return x 345asQNFA s (tags,Right qt) = do qnfa <- newQNFA s qt -- YYY Policy choice: leave the tags 346 return (tags, Left qnfa) 347 348-- Convert continuation E into a QNFA, only done at "top level" by qToNFA to get unique start state 349getQNFA :: String -> E -> S QNFA 350getQNFA _ ([],Left qnfa) = return qnfa 351getQNFA s (tags,Left qnfa) = newQNFA s (prependTags' (promoteTasks PreUpdate tags) (q_qt qnfa)) 352getQNFA s (tags,Right qt) = newQNFA s (prependTags' (promoteTasks PreUpdate tags) qt) 353 354-- Extract the QT from the E 355getQT :: E -> QT 356getQT (tags,cont) = prependTags' (promoteTasks PreUpdate tags) (either q_qt id cont) 357 358-- 2009: This looks realllly dodgy, since it can convert a QNFA/Testing to a QT/Testing 359-- without actually achieving anything except adding a DoPa to the Testing. A diagnostic 360-- series of runs might be needed to decide if this ever creates orphan id numbers. 361-- Then applyTest might need to keep track of whether it actually changes anything. 362addTest :: TestInfo -> E -> E 363addTest ti (tags,cont) = (tags, Right . applyTest ti . either q_qt id $ cont) 364 365-- This is used only with PreUpdate and PostUpdate as the first argument. 366promoteTasks :: (TagTask->TagUpdate) -> TagTasks -> TagList 367promoteTasks promote tags = map (\(tag,task) -> (tag,promote task)) tags 368 369-- only used in addWinTags 370demoteTags :: TagList -> TagTasks 371demoteTags = map helper 372 where helper (tag,PreUpdate tt) = (tag,tt) 373 helper (tag,PostUpdate tt) = (tag,tt) 374 375-- This is polymorphic so addWinTags can be cute below 376{-# INLINE addWinTags #-} 377addWinTags :: WinTags -> (TagTasks,a) -> (TagTasks,a) 378addWinTags wtags (tags,cont) = (demoteTags wtags `mappend` tags 379 ,cont) 380 381{-# INLINE addTag' #-} 382-- This is polymorphic so addTagAC can be cute below 383addTag' :: Tag -> (TagTasks,a) -> (TagTasks,a) 384addTag' tag (tags,cont) = ((tag,TagTask):tags 385 ,cont) 386 387-- a Maybe version of addTag' above, specializing 'a' to Either QNFA QT 388addTag :: Maybe Tag -> E -> E 389addTag Nothing e = e 390addTag (Just tag) e = addTag' tag e 391 392{-# INLINE addGroupResets #-} 393-- This is polymorphic so addGroupResetsAC can be cute below 394addGroupResets :: (Show a) => [Tag] -> (TagTasks,a) -> (TagTasks,a) 395addGroupResets [] x = x 396addGroupResets tags (tags',cont) = (foldr (:) tags' . map (\tag -> (tag,ResetGroupStopTask)) $ tags 397 ,cont) 398 399addGroupSets :: (Show a) => [Tag] -> (TagTasks,a) -> (TagTasks,a) 400addGroupSets [] x = x 401addGroupSets tags (tags',cont) = (foldr (:) tags' . map (\tag -> (tag,SetGroupStopTask)) $ tags 402 ,cont) 403 404-- Consume an ActCont. Uses the mergeQT form to combine non-accepting 405-- and accepting view of the continuation. 406getE :: ActCont -> E 407getE (_,_,Just (tags,qnfa)) = (tags, Left qnfa) -- consume optimized mQNFA value returned by Star 408getE (eLoop,Just accepting,_) = fromQT (mergeQT (getQT eLoop) (getQT accepting)) 409getE (eLoop,Nothing,_) = eLoop 410 411-- 2009: See coment for addTest. Here is a case where the third component might be a (Just qnfa) and it 412-- is being lost even though the added test might be redundant. 413addTestAC :: TestInfo -> ActCont -> ActCont 414addTestAC ti (e,mE,_) = (addTest ti e 415 ,fmap (addTest ti) mE 416 ,Nothing) 417 418-- These are AC versions of the add functions on E 419 420addTagAC :: Maybe Tag -> ActCont -> ActCont 421addTagAC Nothing ac = ac 422addTagAC (Just tag) (e,mE,mQNFA) = (addTag' tag e 423 ,fmap (addTag' tag) mE 424 ,fmap (addTag' tag) mQNFA) 425 426addGroupResetsAC :: [Tag] -> ActCont -> ActCont 427addGroupResetsAC [] ac = ac 428addGroupResetsAC tags (e,mE,mQNFA) = (addGroupResets tags e 429 ,fmap (addGroupResets tags) mE 430 ,fmap (addGroupResets tags) mQNFA) 431 432addGroupSetsAC :: [Tag] -> ActCont -> ActCont 433addGroupSetsAC [] ac = ac 434addGroupSetsAC tags (e,mE,mQNFA) = (addGroupSets tags e 435 ,fmap (addGroupSets tags) mE 436 ,fmap (addGroupSets tags) mQNFA) 437 438addWinTagsAC :: WinTags -> ActCont -> ActCont 439addWinTagsAC wtags (e,mE,mQNFA) = (addWinTags wtags e 440 ,fmap (addWinTags wtags) mE 441 ,fmap (addWinTags wtags) mQNFA) 442-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == 443 444-- Initial preTag of 0th tag is implied. No other general pre-tags would be expected. 445-- The qtwin contains the preTag of the 1st tag and is only set when a match is completed. 446-- The fst Index is the index of the unique starting QNFA state. 447-- The snd (Array Index QNFA) is all the QNFA states. 448-- 449-- In the cases below, Empty is handled much like a Test with no TestInfo. 450qToNFA :: CompOption -> Q -> (Index,Array Index QNFA) 451qToNFA compOpt qTop = (q_id startingQNFA 452 ,array (0,pred lastIndex) (table [])) where 453 -- Result startingQNFA is the top level's index 454 -- State pair: fst 0 is the next state number (not yet used) going in, and lastIndex coming out (succ of last used) 455 -- snd id is the difference list of states going in, and the finished list coming out 456 (startingQNFA,(lastIndex,table)) = 457 runState (getTrans qTop (fromQT $ qtwin) >>= getQNFA "top level") startState 458 startState = (0,id) 459 460 getTrans,getTransTagless :: Q -> E -> S E 461 getTrans qIn@(Q {preReset=resets,postSet=sets,preTag=pre,postTag=post,unQ=pIn}) e = debug (">< getTrans "++show qIn++" <>") $ 462 case pIn of 463 -- The case below is the ultimate consumer of every single OneChar in the input and the only caller of 464 -- newTrans/acceptTrans which is the sole source of QT/Simple nodes. 465 OneChar pat -> newTrans "getTrans/OneChar" resets pre pat . addTag post . addGroupSets sets $ e 466 Empty -> return . addGroupResets resets . addTag pre . addTag post . addGroupSets sets $ e 467 Test ti -> return . addGroupResets resets . addTag pre . addTest ti . addTag post . addGroupSets sets $ e 468 _ -> return . addGroupResets resets . addTag pre =<< getTransTagless qIn (addTag post . addGroupSets sets $ e) 469 470 getTransTagless qIn e = debug (">< getTransTagless "++show qIn++" <>") $ 471 case unQ qIn of 472 Seq q1 q2 -> getTrans q1 =<< getTrans q2 e 473 Or [] -> return e 474 Or [q] -> getTrans q e 475 Or qs -> do 476 eqts <- if usesQNFA qIn 477 then do 478 eQNFA <- asQNFA "getTransTagless/Or/usesQNFA" e 479 sequence [ getTrans q eQNFA | q <- qs ] 480 else sequence [ getTrans q e | q <- qs ] 481 let qts = map getQT eqts 482 return (fromQT (foldr1 mergeAltQT qts)) 483 484 Star mOrbit resetTheseOrbits mayFirstBeNull q -> 485 -- mOrbit of Just implies varies q and childGroups q 486 let (e',clear) = -- debug ("\n>"++show e++"\n"++show q++"\n<") $ 487 if notNullable q then (e,True) -- subpattern cannot be null 488 else if null resetTheseOrbits && isNothing mOrbit 489 then case maybeOnlyEmpty q of 490 Just [] -> (e,True) -- True because null of subpattern is same as skipping subpattern 491 Just tagList -> (addWinTags tagList e,False) -- null of subpattern NOT same as skipping 492 _ -> (fromQT . preferNullViews (nullQ q) . getQT $ e,False) -- is NOT same as skipping 493 else (fromQT . resetOrbitsQT resetTheseOrbits . enterOrbitQT mOrbit -- resetOrbitsQT and enterOrbitQT commute 494 . preferNullViews (nullQ q) . getQT . leaveOrbit mOrbit $ e,False) -- perform resets when accepting 0 characters 495 in if cannotAccept q then return e' else mdo 496 mqt <- inStar q this 497 (this,ans) <- case mqt of 498 Nothing -> err ("Weird pattern in getTransTagless/Star: " ++ show (qTop,qIn)) 499 Just qt -> do 500 let qt' = resetOrbitsQT resetTheseOrbits . enterOrbitQT mOrbit $ qt -- resetOrbitsQT and enterOrbitQT commute 501 thisQT = mergeQT qt' . getQT . leaveOrbit mOrbit $ e -- capture of subpattern or leave via next pattern (avoid null of subpattern on way out) 502 ansE = fromQT . mergeQT qt' . getQT $ e' -- capture of subpattern or leave via null of subpattern 503 thisE <- if usesQNFA q 504 then return . fromQNFA =<< newQNFA "getTransTagless/Star" thisQT 505 else return . fromQT $ thisQT 506 return (thisE,ansE) 507 return (if mayFirstBeNull then (if clear then this -- optimization to possibly preserve QNFA 508 else ans) 509 else this) 510 511 {- NonEmpty is like actNullable (Or [Empty,q]) without the extra tag to prefer the first Empty branch -} 512 NonEmpty q -> ecart ("\n> getTransTagless/NonEmpty"++show qIn) $ do 513 -- Assertion to check than Pattern.starTrans did its job right: 514 when (cannotAccept q) (err $ "getTransTagless/NonEmpty : provided with a *cannotAccept* pattern: "++show (qTop,qIn)) 515 when (mustAccept q) (err $ "getTransTagless/NonEmpty : provided with a *mustAccept* pattern: "++show (qTop,qIn)) 516 let e' = case maybeOnlyEmpty qIn of 517 Just [] -> e 518 Just _wtags -> e -- addWinTags wtags e XXX was duplicating tags 519 Nothing -> err $ "getTransTagless/NonEmpty is supposed to have an emptyNull nullView : "++show qIn 520 mqt <- inStar q e 521 return $ case mqt of 522 Nothing -> err ("Weird pattern in getTransTagless/NonEmpty: " ++ show (qTop,qIn)) 523 Just qt -> fromQT . mergeQT_2nd qt . getQT $ e' -- ...and then this sets qt_win to exactly that of e' 524 _ -> err ("This case in Text.Regex.TNFA.TNFA.getTransTagless cannot happen" ++ show (qTop,qIn)) 525 526 inStar,inStarNullableTagless :: Q -> E -> S (Maybe QT) 527 inStar qIn@(Q {preReset=resets,postSet=sets,preTag=pre,postTag=post}) eLoop | notNullable qIn = 528 debug (">< inStar/1 "++show qIn++" <>") $ 529 return . Just . getQT =<< getTrans qIn eLoop 530 | otherwise = 531 debug (">< inStar/2 "++show qIn++" <>") $ 532 return . fmap (prependGroupResets resets . prependPreTag pre) =<< inStarNullableTagless qIn (addTag post . addGroupSets sets $ eLoop) 533 534 inStarNullableTagless qIn eLoop = debug (">< inStarNullableTagless "++show qIn++" <>") $ do 535 case unQ qIn of 536 Empty -> return Nothing -- with Or this discards () branch in "(^|foo|())*" 537 Or [] -> return Nothing 538 Or [q] -> inStar q eLoop 539 Or qs -> do 540 mqts <- if usesQNFA qIn 541 then do eQNFA <- asQNFA "inStarNullableTagless/Or/usesQNFA" eLoop 542 sequence [ inStar q eQNFA | q <- qs ] 543 else sequence [inStar q eLoop | q <- qs ] 544 let qts = catMaybes mqts 545 mqt = if null qts then Nothing else Just (foldr1 mergeAltQT qts) 546 return mqt 547 -- Calls to act are inlined by hand to actNullable. This returns only cases where q1 or q2 or both 548 -- accepted characters. The zero-character case is handled by the tag wrapping by inStar. 549 -- 2009: Does this look dodgy and repetitios of tags? Seq by policy has no preTag or postTag. 550 -- though it can have prependGroupResets, but those are not repeated in children so it is okay. 551 Seq q1 q2 -> do (_,meAcceptingOut,_) <- actNullable q1 =<< actNullable q2 (eLoop,Nothing,Nothing) 552 return (fmap getQT meAcceptingOut) 553 -- Calls to act are inlined by hand and are we losing the tags? 554 Star {} -> do (_,meAcceptingOut,_) <- actNullableTagless qIn (eLoop,Nothing,Nothing) 555 return (fmap getQT meAcceptingOut) 556 NonEmpty {} -> ecart ("\n> inStarNullableTagless/NonEmpty"++show qIn) $ 557 do (_,meAcceptingOut,_) <- actNullableTagless qIn (eLoop,Nothing,Nothing) 558 return (fmap getQT meAcceptingOut) 559 Test {} -> return Nothing -- with Or this discards ^ branch in "(^|foo|())*" 560 OneChar {} -> err ("OneChar cannot have nullable True") 561 562 {- act* functions 563 564 These have a very complicated state that they receive and return as 565 "the continuation". 566 567 (E, Maybe E,Maybe (SetTag,QNFA)) 568 569 The first E is the source of the danger that must be avoided. It 570 starts out a reference to the QNFA/QT state that will be created by 571 the most recent parent Star node. Thus it is a recursive reference 572 from the MonadFix machinery. In particular, this value cannot be 573 returned to the parent Star to be included in itself or we get a "let 574 x = y; y=x" style infinite loop. 575 576 As act* progresses the first E is actually modified to be the parent 577 QNFA/QT as "seen" when all the elements to the right have accepted 0 578 characters. Thus it acquires tags and tests+tags (the NullView data 579 is used for this purpose). 580 581 The second item in the 3-tuple is a Maybe E. This will be used as the 582 source of the QT for this contents of the Star QNFA/QT. It will be 583 merged with the Star's own continuation data. It starts out Nothing 584 and stays that way as long as there are no accepting transitions in 585 the Star's pattern. This is value (via getQT) returned by inStar. 586 587 The third item is a special optimization I added to remove a source 588 of orphaned QNFAs. A Star within Act will often have to create a 589 QNFA node. This cannot go into the second Maybe E item as Just 590 (SetTag,Left QNFA) because this QNFA can have pulled values from the 591 recursive parent Star's QNFA/QT in the first E value. Thus pulling 592 with getQT from the QNFA and using that as the Maybe E would likely 593 cause an infinite loop. This extra QNFA is stored in the thd3 594 location for use by getE. To improve it further it can accumulate 595 Tag information after being formed. 596 597 When a non nullable Q is handled by act it checks to see if the 598 third value is there, in which case it uses that QNFA as the total 599 continuation (subsumed in getE). Otherwise it merges the first E 600 with any (Just E) in the second value to form the continuation. 601 602 -} 603 604 act :: Q -> ActCont -> S (Maybe E) 605 act qIn c | nullable qIn = fmap snd3 $ actNullable qIn c 606 | otherwise = debug (">< act "++show qIn++" <>") $ do 607 mqt <- return . Just =<< getTrans qIn ( getE $ c ) 608 return mqt -- or "return (fromQT qtlose,mqt,Nothing)" 609 610 actNullable,actNullableTagless :: Q -> ActCont -> S ActCont 611 actNullable qIn@(Q {preReset=resets,postSet=sets,preTag=pre,postTag=post,unQ=pIn}) ac = 612 debug (">< actNullable "++show qIn++" <>") $ do 613 case pIn of 614 Empty -> return . addGroupResetsAC resets . addTagAC pre . addTagAC post . addGroupSetsAC sets $ ac 615 Test ti -> return . addGroupResetsAC resets . addTagAC pre . addTestAC ti . addTagAC post . addGroupSetsAC sets $ ac 616 OneChar {} -> err ("OneChar cannot have nullable True ") 617 _ -> return . addGroupResetsAC resets . addTagAC pre =<< actNullableTagless qIn ( addTagAC post . addGroupSetsAC sets $ ac ) 618 619 actNullableTagless qIn ac@(eLoop,mAccepting,mQNFA) = debug (">< actNullableTagless "++show (qIn)++" <>") $ do 620 case unQ qIn of 621 Seq q1 q2 -> actNullable q1 =<< actNullable q2 ac -- We know q1 and q2 are nullable 622 623 Or [] -> return ac 624 Or [q] -> actNullableTagless q ac 625 Or qs -> do 626 cqts <- do 627 if all nullable qs 628 then sequence [fmap snd3 $ actNullable q ac | q <- qs] 629 else do 630 e' <- asQNFA "qToNFA/actNullableTagless/Or" . getE $ ac 631 let act' :: Q -> S (Maybe E) 632 act' q = return . Just =<< getTrans q e' 633 sequence [ if nullable q then fmap snd3 $ actNullable q ac else act' q | q <- qs ] 634 let qts = map getQT (catMaybes cqts) 635 eLoop' = case maybeOnlyEmpty qIn of 636 Just wtags -> addWinTags wtags eLoop -- nullable without tests; avoid getQT 637 Nothing -> fromQT $ applyNullViews (nullQ qIn) (getQT eLoop) -- suspect this of duplicating some tags with nullQ qIn 638 mAccepting' = if null qts 639 then fmap (fromQT . applyNullViews (nullQ qIn) . getQT) mAccepting -- suspect this of duplicating some tags with nullQ qIn 640 else Just (fromQT $ foldr1 mergeAltQT qts) 641 mQNFA' = if null qts 642 then case maybeOnlyEmpty qIn of 643 Just wtags -> fmap (addWinTags wtags) mQNFA 644 Nothing -> Nothing 645 else Nothing 646 return (eLoop',mAccepting',mQNFA') 647 648 Star mOrbit resetTheseOrbits mayFirstBeNull q -> do 649 let (ac0@(_,mAccepting0,_),clear) = 650 if notNullable q 651 then (ac,True) 652 else if null resetTheseOrbits && isNothing mOrbit 653 then case maybeOnlyEmpty q of 654 Just [] -> (ac,True) 655 Just wtags -> (addWinTagsAC wtags ac,False) 656 _ -> let nQ = fromQT . preferNullViews (nullQ q) . getQT 657 in ((nQ eLoop,fmap nQ mAccepting,Nothing),False) 658 else let nQ = fromQT . resetOrbitsQT resetTheseOrbits . enterOrbitQT mOrbit 659 . preferNullViews (nullQ q) . getQT . leaveOrbit mOrbit 660 in ((nQ eLoop,fmap nQ mAccepting,Nothing),False) 661 if cannotAccept q then return ac0 else mdo 662 mChildAccepting <- act q (this,Nothing,Nothing) 663 (thisAC@(this,_,_),ansAC) <- 664 case mChildAccepting of 665 Nothing -> err $ "Weird pattern in getTransTagless/Star: " ++ show (qTop,qIn) 666 Just childAccepting -> do 667 let childQT = resetOrbitsQT resetTheseOrbits . enterOrbitQT mOrbit . getQT $ childAccepting 668 thisQT = mergeQT childQT . getQT . leaveOrbit mOrbit . getE $ ac 669 thisAccepting = 670 case mAccepting of 671 Just futureAccepting -> Just . fromQT . mergeQT childQT . getQT $ futureAccepting 672 Nothing -> Just . fromQT $ childQT 673 thisAll <- if usesQNFA q 674 then do thisQNFA <- newQNFA "actNullableTagless/Star" thisQT 675 return (fromQNFA thisQNFA, thisAccepting, Just (mempty,thisQNFA)) 676 else return (fromQT thisQT, thisAccepting, Nothing) 677 let skipQT = mergeQT childQT . getQT . getE $ ac0 -- for first iteration the continuation uses NullView 678 skipAccepting = 679 case mAccepting0 of 680 Just futureAccepting0 -> Just . fromQT . mergeQT childQT . getQT $ futureAccepting0 681 Nothing -> Just . fromQT $ childQT 682 ansAll = (fromQT skipQT, skipAccepting, Nothing) 683 return (thisAll,ansAll) 684 return (if mayFirstBeNull then (if clear then thisAC else ansAC) 685 else thisAC) 686 NonEmpty q -> ecart ("\n> actNullableTagless/NonEmpty"++show qIn) $ do 687 -- We *know* that q is nullable from Pattern and CorePattern checks, but assert here anyway 688 when (mustAccept q) (err $ "actNullableTagless/NonEmpty : provided with a *mustAccept* pattern: "++show (qTop,qIn)) 689 when (cannotAccept q) (err $ "actNullableTagless/NonEmpty : provided with a *cannotAccept* pattern: "++show (qTop,qIn)) 690 691 {- This is like actNullable (Or [Empty,q]) without the extra tag to prefer the first Empty branch -} 692 let (clearE,_,_) = case maybeOnlyEmpty qIn of 693 Just [] -> ac 694 Just _wtags -> ac -- addWinTagsAC wtags ac XXX was duplicating tags 695 Nothing -> err $ "actNullableTagless/NonEmpty is supposed to have an emptyNull nullView : "++show (qTop,qIn) 696 (_,mChildAccepting,_) <- actNullable q ac 697 case mChildAccepting of 698 Nothing -> err $ "Weird pattern in actNullableTagless/NonEmpty: " ++ show (qTop,qIn) 699 -- cannotAccept q checked for and excluded the above condition (and starTrans!) 700 Just childAccepting -> do 701 let childQT = getQT childAccepting 702 thisAccepting = case mAccepting of 703 Nothing -> Just . fromQT $ childQT 704 Just futureAcceptingE -> Just . fromQT . mergeQT childQT . getQT $ futureAcceptingE 705 -- I _think_ there is no need for mergeQT_2nd in the above. 706 return (clearE,thisAccepting,Nothing) 707 _ -> err $ "This case in Text.Regex.TNFA.TNFA.actNullableTagless cannot happen: "++show (qTop,qIn) 708 709 -- This is applied directly to any qt immediately before passing to mergeQT 710 resetOrbitsQT :: [Tag] -> QT -> QT 711 resetOrbitsQT | lastStarGreedy compOpt = const id 712 | otherwise = (\tags -> prependTags' [(tag,PreUpdate ResetOrbitTask)|tag<-tags]) 713 714 enterOrbitQT :: Maybe Tag -> QT -> QT 715 enterOrbitQT | lastStarGreedy compOpt = const id 716 | otherwise = maybe id (\tag->prependTags' [(tag,PreUpdate EnterOrbitTask)]) 717 718 leaveOrbit :: Maybe Tag -> E -> E 719 leaveOrbit | lastStarGreedy compOpt = const id 720 | otherwise = maybe id (\tag->(\(tags,cont)->((tag,LeaveOrbitTask):tags,cont))) 721 722 -- 'newTrans' is the only place where PostUpdate is used and is only called from getTrans/OneChar 723 -- and is the only caller of 'acceptTrans' to make QT/Simple nodes. 724 newTrans :: String -- debugging string for when a newQNFA is allocated 725 -> [Tag] -- which tags get ResetGroupStopTask in this transition (PreUpdate) 726 -> Maybe Tag -- maybe one TagTask to update before incrementing the offset (PreUpdate) 727 -> Pattern -- the one character accepting Pattern of this transition 728 -> E -- the continuation state, reified to a QNFA, of after this Pattern 729 -- The fst part of the E is consumed here as a TagTask (PostUpdate) 730 -> S E -- the continuation state, as a QT, of before this Pattern 731 newTrans s resets mPre pat (tags,cont) = do 732 i <- case cont of 733 Left qnfa -> return (q_id qnfa) -- strictQNFA ZZZ no help 734 Right qt -> do qnfa <- newQNFA s qt -- strictQT ZZZ no help 735 return (q_id qnfa) 736 let post = promoteTasks PostUpdate tags 737 pre = promoteTasks PreUpdate ([(tag,ResetGroupStopTask) | tag<-resets] ++ maybe [] (\tag -> [(tag,TagTask)]) mPre) 738 return . fromQT $ acceptTrans pre pat post i -- fromQT $ strictQT no help 739 740 -- 'acceptTrans' is the sole creator of QT/Simple and is only called by getTrans/OneChar/newTrans 741 acceptTrans :: TagList -> Pattern -> TagList -> Index -> QT 742 acceptTrans pre pIn post i = 743 let target = IMap.singleton i [(getDoPa pIn,pre++post)] 744 in case pIn of 745 PChar _ char -> 746 let trans = toMap target [char] 747 in Simple { qt_win = mempty, qt_trans = trans, qt_other = mempty } 748 PEscape _ char -> 749 let trans = toMap target [char] 750 in Simple { qt_win = mempty, qt_trans = trans, qt_other = mempty } 751 PDot _ -> Simple { qt_win = mempty, qt_trans = dotTrans, qt_other = target } 752 PAny _ ps -> 753 let trans = toMap target . S.toAscList . decodePatternSet $ ps 754 in Simple { qt_win = mempty, qt_trans = trans, qt_other = mempty } 755 PAnyNot _ ps -> 756 let trans = toMap mempty . S.toAscList . addNewline . decodePatternSet $ ps 757 in Simple { qt_win = mempty, qt_trans = trans, qt_other = target } 758 _ -> err ("Cannot acceptTrans pattern "++show (qTop,pIn)) 759 where -- Take a common destination and a sorted list of unique chraceters 760 -- and create a map from those characters to the common destination 761 toMap :: IntMap [(DoPa,[(Tag, TagUpdate)])] -> [Char] 762 -> CharMap (IntMap [(DoPa,[(Tag, TagUpdate)])]) 763 toMap dest | caseSensitive compOpt = CharMap . IMap.fromDistinctAscList . map (\c -> (ord c,dest)) 764 | otherwise = CharMap . IMap.fromList . ($ []) 765 . foldr (\c dl -> if isAlpha c 766 then (dl.((ord (toUpper c),dest):) 767 .((ord (toLower c),dest):) 768 ) 769 else (dl.((ord c,dest):)) 770 ) id 771 addNewline | multiline compOpt = S.insert '\n' 772 | otherwise = id 773 dotTrans | multiline compOpt = Map.singleton '\n' mempty 774 | otherwise = Mon.mempty 775 776{- 777 778prepend architecture becomes 779prependTags :: TagTask -> [Tag] -> QT -> QT 780which always uses PreUpdate and the same task for all the tags 781 782qt_win seems to only allow PreUpdate so why keep the same type? 783 784 785ADD ORPHAN ID check and make this a fatal error while testing 786 787-} 788 789-- | decodePatternSet cannot handle collating element and treats 790-- equivalence classes as just their definition and nothing more. 791decodePatternSet :: PatternSet -> S.Set Char 792decodePatternSet (PatternSet msc mscc _ msec) = 793 let baseMSC = maybe S.empty id msc 794 withMSCC = foldl (flip S.insert) baseMSC (maybe [] (concatMap decodeCharacterClass . S.toAscList) mscc) 795 withMSEC = foldl (flip S.insert) withMSCC (maybe [] (concatMap unSEC . S.toAscList) msec) 796 in withMSEC 797 798-- | This returns the disctince ascending list of characters 799-- represented by [: :] values in legalCharacterClasses; unrecognized 800-- class names return an empty string 801decodeCharacterClass :: PatternSetCharacterClass -> String 802decodeCharacterClass (PatternSetCharacterClass s) = 803 case s of 804 "alnum" -> ['0'..'9']++['a'..'z']++['A'..'Z'] 805 "digit" -> ['0'..'9'] 806 "punct" -> ['\33'..'\47']++['\58'..'\64']++['\91'..'\95']++"\96"++['\123'..'\126'] 807 "alpha" -> ['a'..'z']++['A'..'Z'] 808 "graph" -> ['\41'..'\126'] 809 "space" -> "\t\n\v\f\r " 810 "blank" -> "\t " 811 "lower" -> ['a'..'z'] 812 "upper" -> ['A'..'Z'] 813 "cntrl" -> ['\0'..'\31']++"\127" -- with NUL 814 "print" -> ['\32'..'\126'] 815 "xdigit" -> ['0'..'9']++['a'..'f']++['A'..'F'] 816 "word" -> ['0'..'9']++['a'..'z']++['A'..'Z']++"_" 817 _ -> [] 818 819{- 820-- | This is the list of recognized [: :] character classes, others 821-- are decoded as empty. 822legalCharacterClasses :: [String] 823legalCharacterClasses = ["alnum","digit","punct","alpha","graph" 824 ,"space","blank","lower","upper","cntrl","print","xdigit","word"] 825 826-} 827