1-- XXX design uncertainty:  should preResets be inserted into nullView?
2-- if not, why not? ADDED
3
4-- XXX design uncertainty: what does act -> actNullable ->
5-- actNullableTagless not use nullQ and same for inStar, etc?
6-- TODO : try rewriting whole qToNFA in terms of "act"
7-- (That will require re-organizing the continuation data a bit)
8
9-- | "Text.Regex.TDFA.TNFA" converts the CorePattern Q\/P data (and its
10-- Pattern leafs) to a QNFA tagged non-deterministic finite automata.
11--
12-- This holds every possible way to follow one state by another, while
13-- in the DFA these will be reduced by picking a single best
14-- transition for each (soure,destination) pair.  The transitions are
15-- heavily and often redundantly annotated with tasks to perform, and
16-- this redundancy is reduced when picking the best transition.  So
17-- far, keeping all this information has helped fix bugs in both the
18-- design and implementation.
19--
20-- The QNFA for a Pattern with a starTraned Q\/P form with N one
21-- character accepting leaves has at most N+1 nodes.  These nodes
22-- repesent the future choices after accepting a leaf.  The processing
23-- of Or nodes often reduces this number by sharing at the end of the
24-- different paths.  Turning off capturing while compiling the pattern
25-- may (future extension) reduce this further for some patterns by
26-- processing Star with optimizations.  This compact design also means
27-- that tags are assigned not just to be updated before taking a
28-- transition (PreUpdate) but also after the transition (PostUpdate).
29--
30-- Uses recursive do notation.
31
32module Text.Regex.TDFA.TNFA(patternToNFA
33                            ,QNFA(..),QT(..),QTrans,TagUpdate(..)) where
34
35{- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -}
36
37import Control.Monad(when)
38import Control.Monad.State(State,runState,execState,get,put,modify)
39import Data.Array.IArray(Array,array)
40import Data.Char(toLower,toUpper,isAlpha,ord)
41import Data.List(foldl')
42import Data.IntMap (IntMap)
43import qualified Data.IntMap as IMap(toAscList,null,unionWith,singleton,fromList,fromDistinctAscList)
44import Data.IntMap.CharMap2(CharMap(..))
45import qualified Data.IntMap.CharMap2 as Map(null,singleton,map)
46import qualified Data.IntMap.EnumMap2 as EMap(null,keysSet,assocs)
47import Data.IntSet.EnumSet2(EnumSet)
48import qualified Data.IntSet.EnumSet2 as Set(singleton,toList,insert)
49import Data.Maybe(catMaybes,isNothing)
50import Data.Monoid as Mon(Monoid(..))
51import qualified Data.Set as S(Set,insert,toAscList,empty)
52
53import Text.Regex.TDFA.Common(QT(..),QNFA(..),QTrans,TagTask(..),TagUpdate(..),DoPa(..)
54                             ,CompOption(..)
55                             ,Tag,TagTasks,TagList,Index,WinTags,GroupIndex,GroupInfo(..)
56                             ,common_error,noWin,snd3,mapSnd)
57import Text.Regex.TDFA.CorePattern(Q(..),P(..),OP(..),WhichTest,cleanNullView,NullView
58                                  ,SetTestInfo(..),Wanted(..),TestInfo
59                                  ,mustAccept,cannotAccept,patternToQ)
60import Text.Regex.TDFA.Pattern(Pattern(..),PatternSet(..),unSEC,PatternSetCharacterClass(..))
61--import Debug.Trace
62
63ecart :: String -> a -> a
64ecart _ = id
65
66err :: String -> a
67err t = common_error "Text.Regex.TDFA.TNFA" t
68
69debug :: (Show a) => a -> s -> s
70debug _ s = s
71
72qtwin,qtlose :: QT
73-- qtwin is the continuation after matching the whole pattern.  It has
74-- no futher transitions and sets tag #1 to the current position.
75qtwin = Simple {qt_win=[(1,PreUpdate TagTask)],qt_trans=mempty,qt_other=mempty}
76-- qtlose is the continuation to nothing, used when ^ or $ tests fail.
77qtlose = Simple {qt_win=mempty,qt_trans=mempty,qt_other=mempty}
78
79patternToNFA :: CompOption
80             -> (Pattern,(GroupIndex, DoPa))
81             -> ((Index,Array Index QNFA)
82                ,Array Tag OP
83                ,Array GroupIndex [GroupInfo])
84patternToNFA compOpt pattern =
85  let (q,tags,groups) = patternToQ compOpt pattern
86      msg = unlines [ show q ]
87  in debug msg (qToNFA compOpt q,tags,groups)
88
89-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
90-- Query function on Q
91
92nullable :: Q -> Bool
93nullable = not . null . nullQ
94
95notNullable :: Q -> Bool
96notNullable = null . nullQ
97
98-- This asks if the preferred (i.e. first) NullView has no tests.
99maybeOnlyEmpty :: Q -> Maybe WinTags
100maybeOnlyEmpty (Q {nullQ = ((SetTestInfo sti,tags):_)}) = if EMap.null sti then Just tags else Nothing
101maybeOnlyEmpty _ = Nothing
102
103usesQNFA :: Q -> Bool
104usesQNFA (Q {wants=WantsBoth}) = True
105usesQNFA (Q {wants=WantsQNFA}) = True
106usesQNFA _ = False
107
108-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
109-- Functions related to QT
110
111-- dumb smart constructor used by qToQNFA
112-- Possible: Go through the qt and keep only the best tagged transition(s) to each state to make simple NFA?
113mkQNFA :: Index -> QT -> QNFA
114mkQNFA i qt = debug ("\n>QNFA id="++show i) $
115  QNFA i (debug ("\ngetting QT for "++show i) qt)
116
117-- This uses the Eq QT instance above
118-- ZZZ
119mkTesting :: QT -> QT
120mkTesting t@(Testing {qt_a=a,qt_b=b}) = if a==b then a else t -- Move to nfsToDFA XXX
121mkTesting t = t
122
123nullQT :: QT -> Bool
124nullQT (Simple {qt_win=w,qt_trans=t,qt_other=o}) = noWin w && Map.null t && IMap.null o
125nullQT _ = False
126
127-- This reconstructs the set of tests checked in processing QT, adding
128-- them to the passed set.
129listTestInfo :: QT -> EnumSet WhichTest -> EnumSet WhichTest
130listTestInfo qt s = execState (helper qt) s
131  where helper (Simple {}) = return ()
132        helper (Testing {qt_test = wt, qt_a = a, qt_b = b}) = do
133          modify (Set.insert wt)
134          helper a
135          helper b
136
137-- This is used to view "win" only through NullView, and is used in
138-- processing Or.
139applyNullViews :: NullView -> QT -> QT
140applyNullViews [] win = win
141applyNullViews nvs win = foldl' (dominate win) qtlose (reverse $ cleanNullView nvs) where
142
143-- This is used to prefer to view "win" through NullView.  Losing is
144-- replaced by the plain win.  This is employed by Star patterns to
145-- express that the first iteration is allowed to match null, but
146-- skipping the NullView occurs if the match fails.
147preferNullViews :: NullView -> QT -> QT
148preferNullViews [] win = win
149preferNullViews nvs win = foldl' (dominate win) win (reverse $ cleanNullView nvs) where
150
151{-
152dominate is common to applyNullViews and preferNullViews above.
153
154Even I no longer understand it without study.
155
156Oversimplified: The last argument has a new set of tests "sti" that
157must be satisfied to then apply the new "tags" and reach the "win" QT.
158Failing any of this set of tests leads to the "lose" QT.
159
160Closer: The "win" may already have some other set of tests leading to
161various branches, this set is cached in winTests.  And the "lose" may
162already have some other set of tests leading to various branches.  The
163combination of "win" and "lose" and "sti" must check the union of
164these tests, which is "allTests".
165
166Detail: The merging is done by useTest, where the tests in sti divert
167losing to a branch of "lose" and winning to a branch of "win".  Tests
168not in sti are unchanged (but the losing DoPa index might be added).
169-}
170dominate :: QT -> QT -> (SetTestInfo,WinTags) -> QT
171dominate win lose x@(SetTestInfo sti,tags) = debug ("dominate "++show x) $
172  let -- The winning states are reached through the SetTag
173      win' = prependTags' tags win
174      -- get the SetTestInfo
175      winTests = listTestInfo win $ mempty
176      allTests = (listTestInfo lose $ winTests) `mappend` (EMap.keysSet sti)
177      -- The first and second arguments of useTest are sorted
178      -- At all times the second argument of useTest is a subset of the first
179      useTest _ [] w _ = w -- no more dominating tests to fail to choose lose, so just choose win
180      useTest (aTest:tests) allD@((dTest,dopas):ds) w l =
181        let (wA,wB,wD) = branches w
182            (lA,lB,lD) = branches l
183            branches qt@(Testing {}) | aTest==qt_test qt = (qt_a qt,qt_b qt,qt_dopas qt)
184            branches qt = (qt,qt,mempty)
185        in if aTest == dTest
186             then Testing {qt_test = aTest
187                          ,qt_dopas = (dopas `mappend` wD) `mappend` lD
188                          ,qt_a = useTest tests ds wA lA
189                          ,qt_b = lB}
190             else Testing {qt_test = aTest
191                          ,qt_dopas = wD `mappend` lD
192                          ,qt_a = useTest tests allD wA lA
193                          ,qt_b = useTest tests allD wB lB}
194      useTest [] _ _  _ = err "This case in dominate.useText cannot happen: second argument would have to have been null and that is checked before this case"
195  in useTest (Set.toList allTests) (EMap.assocs sti) win' lose
196
197-- 'applyTest' is only used by addTest
198-- 2009: maybe need to keep track of whether a change is actually made
199-- (beyond DoPa tracking) to the QT.
200applyTest :: TestInfo -> QT -> QT
201applyTest (wt,dopa) qt | nullQT qt = qt
202                       | otherwise = applyTest' qt where
203  applyTest' :: QT -> QT
204  applyTest' q@(Simple {}) =
205    mkTesting $ Testing {qt_test = wt
206                        ,qt_dopas = Set.singleton dopa
207                        ,qt_a = q
208                        ,qt_b = qtlose}
209  applyTest' q@(Testing {qt_test=wt'}) =
210    case compare wt wt' of
211      LT -> Testing {qt_test = wt
212                    ,qt_dopas = Set.singleton dopa
213                    ,qt_a = q
214                    ,qt_b = qtlose}
215      EQ -> q {qt_dopas = Set.insert dopa (qt_dopas q)
216              ,qt_b = qtlose}
217      GT -> q {qt_a = applyTest' (qt_a q)
218              ,qt_b = applyTest' (qt_b q)}
219
220-- Three ways to merge a pair of QT's varying how winning transitions
221-- are handled.
222--
223-- mergeQT_2nd is used by the NonEmpty case and always discards the
224-- first argument's win and uses the second argment's win.
225--
226-- mergeAltQT is used by the Or cases and is biased to the first
227-- argument's winning transition, if present.
228--
229-- mergeQT is used by Star and mergeE and combines the winning
230-- transitions (concatenating the instructions).
231mergeQT_2nd,mergeAltQT,mergeQT :: QT -> QT -> QT
232mergeQT_2nd q1 q2 | nullQT q1 = q2
233                  | otherwise = mergeQTWith (\_ w2 -> w2) q1 q2
234
235mergeAltQT q1 q2 | nullQT q1 = q2  -- prefer winning with w1 then with w2
236                 | otherwise = mergeQTWith (\w1 w2 -> if noWin w1 then w2 else w1) q1 q2
237mergeQT q1 q2 | nullQT q1 = q2  -- union wins
238              | nullQT q2 = q1  -- union wins
239              | otherwise = mergeQTWith mappend q1 q2 -- no preference, win with combined SetTag XXX is the wrong thing! "(.?)*"
240
241-- This takes a function which implements a policy on mergining
242-- winning transitions and then merges all the transitions.  It opens
243-- the CharMap newtype for more efficient operation, then rewraps it.
244mergeQTWith :: (WinTags -> WinTags -> WinTags) -> QT -> QT -> QT
245mergeQTWith mergeWins = merge where
246  merge :: QT -> QT -> QT
247  merge (Simple w1 t1 o1) (Simple w2 t2 o2) =
248    let w' = mergeWins w1 w2
249        t' = fuseQTrans t1 o1 t2 o2
250        o' = mergeQTrans o1 o2
251    in Simple w' t' o'
252  merge t1@(Testing _ _ a1 b1) s2@(Simple {}) = mkTesting $
253    t1 {qt_a=(merge a1 s2), qt_b=(merge b1 s2)}
254  merge s1@(Simple {}) t2@(Testing _ _ a2 b2) = mkTesting $
255    t2 {qt_a=(merge s1 a2), qt_b=(merge s1 b2)}
256  merge t1@(Testing wt1 ds1 a1 b1) t2@(Testing wt2 ds2 a2 b2) = mkTesting $
257    case compare wt1 wt2 of
258      LT -> t1 {qt_a=(merge a1 t2), qt_b=(merge b1 t2)}
259      EQ -> Testing {qt_test = wt1 -- same as wt2
260                    ,qt_dopas = mappend ds1 ds2
261                    ,qt_a = merge a1 a2
262                    ,qt_b = merge b1 b2}
263      GT -> t2 {qt_a=(merge t1 a2), qt_b=(merge t1 b2)}
264
265  fuseQTrans :: (CharMap QTrans) -> QTrans
266             -> (CharMap QTrans) -> QTrans
267             -> CharMap QTrans
268  fuseQTrans (CharMap t1) o1 (CharMap t2) o2 = CharMap (IMap.fromDistinctAscList (fuse l1 l2)) where
269    l1 = IMap.toAscList t1
270    l2 = IMap.toAscList t2
271    fuse [] y  = mapSnd (mergeQTrans o1) y
272    fuse x  [] = mapSnd (mergeQTrans o2) x
273    fuse x@((xc,xa):xs) y@((yc,ya):ys) =
274      case compare xc yc of
275        LT -> (xc,mergeQTrans xa o2) : fuse xs y
276        EQ -> (xc,mergeQTrans xa ya) : fuse xs ys
277        GT -> (yc,mergeQTrans o1 ya) : fuse x  ys
278
279  mergeQTrans :: QTrans -> QTrans -> QTrans
280  mergeQTrans = IMap.unionWith mappend
281
282-- Note: There are no append* operations. There are only these
283-- prepend* operations because things are only prepended to the future
284-- continuation.  And the ordering is significant.
285
286-- This is only used in inStar/nullable
287prependPreTag :: Maybe Tag -> QT -> QT
288prependPreTag Nothing qt = qt
289prependPreTag (Just tag) qt = prependTags' [(tag,PreUpdate TagTask)] qt
290
291prependGroupResets :: [Tag] -> QT -> QT
292prependGroupResets [] qt = qt
293prependGroupResets tags qt = prependTags' [(tag,PreUpdate ResetGroupStopTask)|tag<-tags] qt
294
295prependTags' :: TagList -> QT -> QT
296prependTags' []  qt = qt
297prependTags' tcs' qt@(Testing {}) = qt { qt_a = prependTags' tcs' (qt_a qt)
298                                       , qt_b = prependTags' tcs' (qt_b qt) }
299prependTags' tcs' (Simple {qt_win=w,qt_trans=t,qt_other=o}) =
300  Simple { qt_win = if noWin w then w else tcs' `mappend` w
301         , qt_trans = Map.map prependQTrans t
302         , qt_other = prependQTrans o }
303  where prependQTrans = fmap (map (\(d,tcs) -> (d,tcs' `mappend` tcs)))
304
305-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
306-- define type S which is a State monad, this allows the creation of the uniq QNFA ids and storing the QNFA
307-- in an ascending order difference list for later placement in an array.
308
309-- Type of State monad used inside qToNFA
310type S = State (Index                             -- Next available QNFA index
311               ,[(Index,QNFA)]->[(Index,QNFA)])    -- DList of previous QNFAs
312
313-- Type of continuation of the NFA, not much more complicated
314type E = (TagTasks            -- Things to do before the Either QNFA QT
315                              -- with OneChar these become PostUpdate otherwise they become PreUpdate
316         ,Either QNFA QT)     -- The future, packaged in the best way
317
318-- See documentation below before the 'act' function.  This is for use inside a Star pattern.
319type ActCont = ( E                      -- The eLoop is the dangerous recursive reference to continuation
320                                        -- future that loops while accepting zero more characters
321               , Maybe E                -- This holds the safe non-zero-character accepting continuation
322               , Maybe (TagTasks,QNFA)) -- optimized merger of the above, used only inside act, to avoid orphan QNFA id values
323
324-- newQNFA is the only operation that actually uses the monad get and put operations
325newQNFA :: String -> QT -> S QNFA
326newQNFA s qt = do
327  (thisI,oldQs) <- get
328  let futureI = succ thisI in seq futureI $ debug (">newQNFA< "++s++" : "++show thisI) $ do
329  let qnfa = mkQNFA thisI qt -- (strictQT qt) -- making strictQNFA kills test (1,11) ZZZ
330  put $! (futureI, oldQs . ((thisI,qnfa):))
331  return qnfa
332
333-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
334-- E related functions
335
336fromQNFA :: QNFA -> E
337fromQNFA qnfa = (mempty,Left qnfa)
338
339fromQT :: QT -> E
340fromQT qt = (mempty,Right qt)
341
342-- Promises the output will match (_,Left _), used by Or cases when any branch wants a QNFA continuation
343asQNFA :: String -> E -> S E
344asQNFA _ x@(_,Left _) = return x
345asQNFA s (tags,Right qt) = do qnfa <- newQNFA s qt      -- YYY Policy choice: leave the tags
346                              return (tags, Left qnfa)
347
348-- Convert continuation E into a QNFA, only done at "top level" by qToNFA to get unique start state
349getQNFA :: String -> E -> S QNFA
350getQNFA _ ([],Left qnfa) = return qnfa
351getQNFA s (tags,Left qnfa) = newQNFA s (prependTags' (promoteTasks PreUpdate tags) (q_qt qnfa))
352getQNFA s (tags,Right qt) = newQNFA s (prependTags' (promoteTasks PreUpdate tags) qt)
353
354-- Extract the QT from the E
355getQT :: E -> QT
356getQT (tags,cont) = prependTags' (promoteTasks PreUpdate tags) (either q_qt id cont)
357
358-- 2009: This looks realllly dodgy, since it can convert a QNFA/Testing to a QT/Testing
359-- without actually achieving anything except adding a DoPa to the Testing.  A diagnostic
360-- series of runs might be needed to decide if this ever creates orphan id numbers.
361-- Then applyTest might need to keep track of whether it actually changes anything.
362addTest :: TestInfo -> E -> E
363addTest ti (tags,cont) = (tags, Right . applyTest ti . either q_qt id $ cont)
364
365-- This is used only with PreUpdate and PostUpdate as the first argument.
366promoteTasks :: (TagTask->TagUpdate) -> TagTasks -> TagList
367promoteTasks promote tags = map (\(tag,task) -> (tag,promote task)) tags
368
369-- only used in addWinTags
370demoteTags :: TagList -> TagTasks
371demoteTags = map helper
372  where helper (tag,PreUpdate tt) = (tag,tt)
373        helper (tag,PostUpdate tt) = (tag,tt)
374
375-- This is polymorphic so addWinTags can be cute below
376{-# INLINE addWinTags #-}
377addWinTags :: WinTags -> (TagTasks,a) -> (TagTasks,a)
378addWinTags wtags (tags,cont) = (demoteTags wtags `mappend` tags
379                               ,cont)
380
381{-# INLINE addTag' #-}
382-- This is polymorphic so addTagAC can be cute below
383addTag' :: Tag -> (TagTasks,a) -> (TagTasks,a)
384addTag' tag (tags,cont) = ((tag,TagTask):tags
385                          ,cont)
386
387-- a Maybe version of addTag' above, specializing 'a' to Either QNFA QT
388addTag :: Maybe Tag -> E -> E
389addTag Nothing e = e
390addTag (Just tag) e = addTag' tag e
391
392{-# INLINE addGroupResets #-}
393-- This is polymorphic so addGroupResetsAC can be cute below
394addGroupResets :: (Show a) => [Tag] -> (TagTasks,a) -> (TagTasks,a)
395addGroupResets [] x = x
396addGroupResets tags (tags',cont) = (foldr (:) tags' . map (\tag -> (tag,ResetGroupStopTask)) $ tags
397                                   ,cont)
398
399addGroupSets :: (Show a) => [Tag] -> (TagTasks,a) -> (TagTasks,a)
400addGroupSets [] x = x
401addGroupSets tags (tags',cont) = (foldr (:) tags' . map (\tag -> (tag,SetGroupStopTask)) $ tags
402                                 ,cont)
403
404-- Consume an ActCont.  Uses the mergeQT form to combine non-accepting
405-- and accepting view of the continuation.
406getE :: ActCont -> E
407getE (_,_,Just (tags,qnfa)) = (tags, Left qnfa)  -- consume optimized mQNFA value returned by Star
408getE (eLoop,Just accepting,_) = fromQT (mergeQT (getQT eLoop) (getQT accepting))
409getE (eLoop,Nothing,_) = eLoop
410
411-- 2009: See coment for addTest.  Here is a case where the third component might be a (Just qnfa) and it
412-- is being lost even though the added test might be redundant.
413addTestAC :: TestInfo -> ActCont -> ActCont
414addTestAC ti (e,mE,_) = (addTest ti e
415                        ,fmap (addTest ti) mE
416                        ,Nothing)
417
418-- These are AC versions of the add functions on E
419
420addTagAC :: Maybe Tag -> ActCont -> ActCont
421addTagAC Nothing ac = ac
422addTagAC (Just tag) (e,mE,mQNFA) = (addTag' tag e
423                                   ,fmap (addTag' tag) mE
424                                   ,fmap (addTag' tag) mQNFA)
425
426addGroupResetsAC :: [Tag] -> ActCont -> ActCont
427addGroupResetsAC [] ac = ac
428addGroupResetsAC tags (e,mE,mQNFA) = (addGroupResets tags e
429                                     ,fmap (addGroupResets tags) mE
430                                     ,fmap (addGroupResets tags) mQNFA)
431
432addGroupSetsAC :: [Tag] -> ActCont -> ActCont
433addGroupSetsAC [] ac = ac
434addGroupSetsAC tags (e,mE,mQNFA) = (addGroupSets tags e
435                                   ,fmap (addGroupSets tags) mE
436                                   ,fmap (addGroupSets tags) mQNFA)
437
438addWinTagsAC :: WinTags -> ActCont -> ActCont
439addWinTagsAC wtags (e,mE,mQNFA) = (addWinTags wtags e
440                                  ,fmap (addWinTags wtags) mE
441                                  ,fmap (addWinTags wtags) mQNFA)
442-- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- ==
443
444-- Initial preTag of 0th tag is implied. No other general pre-tags would be expected.
445-- The qtwin contains the preTag of the 1st tag and is only set when a match is completed.
446-- The fst Index is the index of the unique starting QNFA state.
447-- The snd (Array Index QNFA) is all the QNFA states.
448--
449-- In the cases below, Empty is handled much like a Test with no TestInfo.
450qToNFA :: CompOption -> Q -> (Index,Array Index QNFA)
451qToNFA compOpt qTop = (q_id startingQNFA
452                      ,array (0,pred lastIndex) (table [])) where
453  -- Result startingQNFA is the top level's index
454  -- State pair: fst 0 is the next state number (not yet used) going in, and lastIndex coming out (succ of last used)
455  --             snd id is the difference list of states going in, and the finished list coming out
456  (startingQNFA,(lastIndex,table)) =
457    runState (getTrans qTop (fromQT $ qtwin) >>= getQNFA "top level") startState
458  startState = (0,id)
459
460  getTrans,getTransTagless :: Q -> E -> S E
461  getTrans qIn@(Q {preReset=resets,postSet=sets,preTag=pre,postTag=post,unQ=pIn}) e = debug (">< getTrans "++show qIn++" <>") $
462    case pIn of
463      -- The case below is the ultimate consumer of every single OneChar in the input and the only caller of
464      -- newTrans/acceptTrans which is the sole source of QT/Simple nodes.
465      OneChar pat -> newTrans "getTrans/OneChar" resets pre pat . addTag post . addGroupSets sets $ e
466      Empty -> return . addGroupResets resets . addTag pre . addTag post . addGroupSets sets $ e
467      Test ti -> return . addGroupResets resets . addTag pre . addTest ti . addTag post . addGroupSets sets $ e
468      _ -> return . addGroupResets resets . addTag pre =<< getTransTagless qIn (addTag post . addGroupSets sets $ e)
469
470  getTransTagless qIn e = debug (">< getTransTagless "++show qIn++" <>") $
471    case unQ qIn of
472      Seq q1 q2 -> getTrans q1 =<< getTrans q2 e
473      Or [] -> return e
474      Or [q] -> getTrans q e
475      Or qs -> do
476        eqts <- if usesQNFA qIn
477                  then do
478                    eQNFA <- asQNFA "getTransTagless/Or/usesQNFA" e
479                    sequence [ getTrans q eQNFA | q <- qs ]
480                  else sequence [ getTrans q e | q <- qs ]
481        let qts = map getQT eqts
482        return (fromQT (foldr1 mergeAltQT qts))
483
484      Star mOrbit resetTheseOrbits mayFirstBeNull q ->
485        -- mOrbit of Just implies varies q and childGroups q
486        let (e',clear) = -- debug ("\n>"++show e++"\n"++show q++"\n<") $
487              if notNullable q then (e,True)  -- subpattern cannot be null
488                else if null resetTheseOrbits && isNothing mOrbit
489                       then case maybeOnlyEmpty q of
490                              Just [] -> (e,True)    -- True because null of subpattern is same as skipping subpattern
491                              Just tagList -> (addWinTags tagList e,False) -- null of subpattern NOT same as skipping
492                              _ -> (fromQT . preferNullViews (nullQ q) . getQT $ e,False)  -- is NOT same as skipping
493                       else (fromQT . resetOrbitsQT resetTheseOrbits . enterOrbitQT mOrbit -- resetOrbitsQT and enterOrbitQT commute
494                             . preferNullViews (nullQ q) . getQT . leaveOrbit mOrbit $ e,False)  -- perform resets when accepting 0 characters
495        in if cannotAccept q then return e' else mdo
496        mqt <- inStar q this
497        (this,ans) <- case mqt of
498                        Nothing -> err ("Weird pattern in getTransTagless/Star: " ++ show (qTop,qIn))
499                        Just qt -> do
500                          let qt' = resetOrbitsQT resetTheseOrbits . enterOrbitQT mOrbit $ qt -- resetOrbitsQT and enterOrbitQT commute
501                              thisQT = mergeQT qt' . getQT . leaveOrbit mOrbit $ e -- capture of subpattern or leave via next pattern (avoid null of subpattern on way out)
502                              ansE = fromQT . mergeQT qt' . getQT $ e' -- capture of subpattern or leave via null of subpattern
503                          thisE <- if usesQNFA q
504                                  then return . fromQNFA =<< newQNFA "getTransTagless/Star" thisQT
505                                  else return . fromQT $ thisQT
506                          return (thisE,ansE)
507        return (if mayFirstBeNull then (if clear then this  -- optimization to possibly preserve QNFA
508                                                 else ans)
509                  else this)
510
511      {- NonEmpty is like actNullable (Or [Empty,q]) without the extra tag to prefer the first Empty branch -}
512      NonEmpty q -> ecart ("\n> getTransTagless/NonEmpty"++show qIn)  $ do
513        -- Assertion to check than Pattern.starTrans did its job right:
514        when (cannotAccept q) (err $ "getTransTagless/NonEmpty : provided with a *cannotAccept* pattern: "++show (qTop,qIn))
515        when (mustAccept q) (err $ "getTransTagless/NonEmpty : provided with a *mustAccept* pattern: "++show (qTop,qIn))
516        let e' = case maybeOnlyEmpty qIn of
517                   Just [] -> e
518                   Just _wtags -> e -- addWinTags wtags e  XXX was duplicating tags
519                   Nothing -> err $ "getTransTagless/NonEmpty is supposed to have an emptyNull nullView : "++show qIn
520        mqt <- inStar q e
521        return $ case mqt of
522                   Nothing -> err ("Weird pattern in getTransTagless/NonEmpty: " ++ show (qTop,qIn))
523                   Just qt -> fromQT . mergeQT_2nd qt . getQT $ e' -- ...and then this sets qt_win to exactly that of e'
524      _ -> err ("This case in Text.Regex.TNFA.TNFA.getTransTagless cannot happen" ++ show (qTop,qIn))
525
526  inStar,inStarNullableTagless :: Q -> E -> S (Maybe QT)
527  inStar qIn@(Q {preReset=resets,postSet=sets,preTag=pre,postTag=post}) eLoop | notNullable qIn =
528    debug (">< inStar/1 "++show qIn++" <>") $
529    return . Just . getQT =<< getTrans qIn eLoop
530                                                                 | otherwise =
531    debug (">< inStar/2 "++show qIn++" <>") $
532    return . fmap (prependGroupResets resets . prependPreTag pre) =<< inStarNullableTagless qIn (addTag post . addGroupSets sets $ eLoop)
533
534  inStarNullableTagless qIn eLoop = debug (">< inStarNullableTagless "++show qIn++" <>") $ do
535    case unQ qIn of
536      Empty -> return Nothing -- with Or this discards () branch in "(^|foo|())*"
537      Or [] -> return Nothing
538      Or [q] -> inStar q eLoop
539      Or qs -> do
540        mqts <- if usesQNFA qIn
541                  then do eQNFA <- asQNFA "inStarNullableTagless/Or/usesQNFA" eLoop
542                          sequence [ inStar q eQNFA | q <- qs ]
543                  else sequence [inStar q eLoop | q <- qs ]
544        let qts = catMaybes mqts
545            mqt = if null qts then Nothing else Just (foldr1 mergeAltQT qts)
546        return mqt
547      -- Calls to act are inlined by hand to actNullable.  This returns only cases where q1 or q2 or both
548      -- accepted characters.  The zero-character case is handled by the tag wrapping by inStar.
549      -- 2009: Does this look dodgy and repetitios of tags?  Seq by policy has no preTag or postTag.
550      -- though it can have prependGroupResets, but those are not repeated in children so it is okay.
551      Seq q1 q2 -> do (_,meAcceptingOut,_) <- actNullable q1 =<< actNullable q2 (eLoop,Nothing,Nothing)
552                      return (fmap getQT meAcceptingOut)
553      -- Calls to act are inlined by hand and are we losing the tags?
554      Star {} -> do (_,meAcceptingOut,_) <- actNullableTagless qIn (eLoop,Nothing,Nothing)
555                    return (fmap getQT meAcceptingOut)
556      NonEmpty {} -> ecart ("\n> inStarNullableTagless/NonEmpty"++show qIn) $
557                     do (_,meAcceptingOut,_) <- actNullableTagless qIn (eLoop,Nothing,Nothing)
558                        return (fmap getQT meAcceptingOut)
559      Test {} -> return Nothing -- with Or this discards ^ branch in "(^|foo|())*"
560      OneChar {} -> err ("OneChar cannot have nullable True")
561
562  {- act* functions
563
564  These have a very complicated state that they receive and return as
565  "the continuation".
566
567   (E, Maybe E,Maybe (SetTag,QNFA))
568
569  The first E is the source of the danger that must be avoided.  It
570  starts out a reference to the QNFA/QT state that will be created by
571  the most recent parent Star node.  Thus it is a recursive reference
572  from the MonadFix machinery.  In particular, this value cannot be
573  returned to the parent Star to be included in itself or we get a "let
574  x = y; y=x" style infinite loop.
575
576  As act* progresses the first E is actually modified to be the parent
577  QNFA/QT as "seen" when all the elements to the right have accepted 0
578  characters.  Thus it acquires tags and tests+tags (the NullView data
579  is used for this purpose).
580
581  The second item in the 3-tuple is a Maybe E.  This will be used as the
582  source of the QT for this contents of the Star QNFA/QT.  It will be
583  merged with the Star's own continuation data.  It starts out Nothing
584  and stays that way as long as there are no accepting transitions in
585  the Star's pattern.  This is value (via getQT) returned by inStar.
586
587  The third item is a special optimization I added to remove a source
588  of orphaned QNFAs.  A Star within Act will often have to create a
589  QNFA node.  This cannot go into the second Maybe E item as Just
590  (SetTag,Left QNFA) because this QNFA can have pulled values from the
591  recursive parent Star's QNFA/QT in the first E value.  Thus pulling
592  with getQT from the QNFA and using that as the Maybe E would likely
593  cause an infinite loop.  This extra QNFA is stored in the thd3
594  location for use by getE. To improve it further it can accumulate
595  Tag information after being formed.
596
597  When a non nullable Q is handled by act it checks to see if the
598  third value is there, in which case it uses that QNFA as the total
599  continuation (subsumed in getE).  Otherwise it merges the first E
600  with any (Just E) in the second value to form the continuation.
601
602  -}
603
604  act :: Q -> ActCont -> S (Maybe E)
605  act qIn c | nullable qIn = fmap snd3 $ actNullable qIn c
606            | otherwise = debug (">< act "++show qIn++" <>") $ do
607    mqt <- return . Just =<< getTrans qIn ( getE $ c )
608    return mqt  -- or "return (fromQT qtlose,mqt,Nothing)"
609
610  actNullable,actNullableTagless :: Q -> ActCont -> S ActCont
611  actNullable qIn@(Q {preReset=resets,postSet=sets,preTag=pre,postTag=post,unQ=pIn}) ac =
612    debug (">< actNullable "++show qIn++" <>") $ do
613    case pIn of
614      Empty -> return . addGroupResetsAC resets . addTagAC pre . addTagAC post . addGroupSetsAC sets $ ac
615      Test ti -> return . addGroupResetsAC resets . addTagAC pre . addTestAC ti . addTagAC post . addGroupSetsAC sets $ ac
616      OneChar {} -> err ("OneChar cannot have nullable True ")
617      _ -> return . addGroupResetsAC resets . addTagAC pre =<< actNullableTagless qIn ( addTagAC post . addGroupSetsAC sets $ ac )
618
619  actNullableTagless qIn ac@(eLoop,mAccepting,mQNFA) = debug (">< actNullableTagless "++show (qIn)++" <>") $ do
620    case unQ qIn of
621      Seq q1 q2 -> actNullable q1 =<< actNullable q2 ac   -- We know q1 and q2 are nullable
622
623      Or [] -> return ac
624      Or [q] -> actNullableTagless q ac
625      Or qs -> do
626        cqts <- do
627          if all nullable qs
628            then sequence [fmap snd3 $ actNullable q ac | q <- qs]
629            else do
630              e' <- asQNFA "qToNFA/actNullableTagless/Or" . getE $ ac
631              let act' :: Q -> S (Maybe E)
632                  act' q = return . Just =<< getTrans q e'
633              sequence [ if nullable q then fmap snd3 $ actNullable q ac else act' q | q <- qs ]
634        let qts = map getQT (catMaybes cqts)
635            eLoop' = case maybeOnlyEmpty qIn of
636                       Just wtags -> addWinTags wtags eLoop -- nullable without tests; avoid getQT
637                       Nothing -> fromQT $ applyNullViews (nullQ qIn) (getQT eLoop) -- suspect this of duplicating some tags with nullQ qIn
638            mAccepting' = if null qts
639                            then fmap (fromQT . applyNullViews (nullQ qIn) . getQT) mAccepting -- suspect this of duplicating some tags with nullQ qIn
640                            else Just (fromQT $ foldr1 mergeAltQT qts)
641            mQNFA' = if null qts
642                       then case maybeOnlyEmpty qIn of
643                              Just wtags -> fmap (addWinTags wtags) mQNFA
644                              Nothing -> Nothing
645                       else Nothing
646        return (eLoop',mAccepting',mQNFA')
647
648      Star mOrbit resetTheseOrbits mayFirstBeNull q -> do
649        let (ac0@(_,mAccepting0,_),clear) =
650              if notNullable q
651                then (ac,True)
652                else if null resetTheseOrbits && isNothing mOrbit
653                       then case maybeOnlyEmpty q of
654                              Just [] -> (ac,True)
655                              Just wtags -> (addWinTagsAC wtags ac,False)
656                              _ -> let nQ = fromQT . preferNullViews (nullQ q) . getQT
657                                   in ((nQ eLoop,fmap nQ mAccepting,Nothing),False)
658                       else let nQ = fromQT . resetOrbitsQT resetTheseOrbits . enterOrbitQT mOrbit
659                                     . preferNullViews (nullQ q) . getQT . leaveOrbit mOrbit
660                            in ((nQ eLoop,fmap nQ mAccepting,Nothing),False)
661        if cannotAccept q then return ac0 else mdo
662          mChildAccepting <- act q (this,Nothing,Nothing)
663          (thisAC@(this,_,_),ansAC) <-
664            case mChildAccepting of
665              Nothing -> err $ "Weird pattern in getTransTagless/Star: " ++ show (qTop,qIn)
666              Just childAccepting -> do
667                let childQT = resetOrbitsQT resetTheseOrbits . enterOrbitQT mOrbit . getQT $ childAccepting
668                    thisQT = mergeQT childQT . getQT . leaveOrbit mOrbit . getE $ ac
669                    thisAccepting =
670                      case mAccepting of
671                        Just futureAccepting -> Just . fromQT . mergeQT childQT . getQT $ futureAccepting
672                        Nothing -> Just . fromQT $ childQT
673                thisAll <- if usesQNFA q
674                             then do thisQNFA <- newQNFA "actNullableTagless/Star" thisQT
675                                     return (fromQNFA thisQNFA, thisAccepting, Just (mempty,thisQNFA))
676                             else return (fromQT thisQT, thisAccepting, Nothing)
677                let skipQT = mergeQT childQT . getQT . getE $ ac0  -- for first iteration the continuation uses NullView
678                    skipAccepting =
679                      case mAccepting0 of
680                        Just futureAccepting0 -> Just . fromQT . mergeQT childQT . getQT $ futureAccepting0
681                        Nothing -> Just . fromQT $ childQT
682                    ansAll = (fromQT skipQT, skipAccepting, Nothing)
683                return (thisAll,ansAll)
684          return (if mayFirstBeNull then (if clear then thisAC else ansAC)
685                    else thisAC)
686      NonEmpty q -> ecart ("\n> actNullableTagless/NonEmpty"++show qIn) $ do
687        -- We *know* that q is nullable from Pattern and CorePattern checks, but assert here anyway
688        when (mustAccept q) (err $ "actNullableTagless/NonEmpty : provided with a *mustAccept* pattern: "++show (qTop,qIn))
689        when (cannotAccept q) (err $ "actNullableTagless/NonEmpty : provided with a *cannotAccept* pattern: "++show (qTop,qIn))
690
691        {- This is like actNullable (Or [Empty,q]) without the extra tag to prefer the first Empty branch -}
692        let (clearE,_,_) = case maybeOnlyEmpty qIn of
693                             Just [] -> ac
694                             Just _wtags -> ac -- addWinTagsAC wtags ac XXX was duplicating tags
695                             Nothing -> err $ "actNullableTagless/NonEmpty is supposed to have an emptyNull nullView : "++show (qTop,qIn)
696        (_,mChildAccepting,_) <- actNullable q ac
697        case mChildAccepting of
698          Nothing -> err  $ "Weird pattern in actNullableTagless/NonEmpty: " ++ show (qTop,qIn)
699            -- cannotAccept q checked for and excluded the above condition (and starTrans!)
700          Just childAccepting -> do
701            let childQT = getQT childAccepting
702                thisAccepting = case mAccepting of
703                                  Nothing -> Just . fromQT $ childQT
704                                  Just futureAcceptingE -> Just . fromQT . mergeQT childQT . getQT $ futureAcceptingE
705                                  -- I _think_ there is no need for mergeQT_2nd in the above.
706            return (clearE,thisAccepting,Nothing)
707      _ -> err $ "This case in Text.Regex.TNFA.TNFA.actNullableTagless cannot happen: "++show (qTop,qIn)
708
709  -- This is applied directly to any qt immediately before passing to mergeQT
710  resetOrbitsQT :: [Tag] -> QT -> QT
711  resetOrbitsQT | lastStarGreedy compOpt = const id
712                | otherwise = (\tags -> prependTags' [(tag,PreUpdate ResetOrbitTask)|tag<-tags])
713
714  enterOrbitQT :: Maybe Tag -> QT -> QT
715  enterOrbitQT | lastStarGreedy compOpt = const id
716               | otherwise = maybe id (\tag->prependTags' [(tag,PreUpdate EnterOrbitTask)])
717
718  leaveOrbit :: Maybe Tag -> E -> E
719  leaveOrbit | lastStarGreedy compOpt = const id
720             | otherwise = maybe id (\tag->(\(tags,cont)->((tag,LeaveOrbitTask):tags,cont)))
721
722  -- 'newTrans' is the only place where PostUpdate is used and is only called from getTrans/OneChar
723  --  and is the only caller of 'acceptTrans' to make QT/Simple nodes.
724  newTrans :: String    -- debugging string for when a newQNFA is allocated
725           -> [Tag]     -- which tags get ResetGroupStopTask in this transition (PreUpdate)
726           -> Maybe Tag -- maybe one TagTask to update before incrementing the offset (PreUpdate)
727           -> Pattern   -- the one character accepting Pattern of this transition
728           -> E         -- the continuation state, reified to a QNFA, of after this Pattern
729                       -- The fst part of the E is consumed here as a TagTask (PostUpdate)
730           -> S E       -- the continuation state, as a QT, of before this Pattern
731  newTrans s resets mPre pat (tags,cont) = do
732    i <- case cont of
733           Left qnfa -> return (q_id qnfa)     -- strictQNFA ZZZ no help
734           Right qt -> do qnfa <- newQNFA s qt -- strictQT ZZZ no help
735                          return (q_id qnfa)
736    let post = promoteTasks PostUpdate tags
737        pre  = promoteTasks PreUpdate ([(tag,ResetGroupStopTask) | tag<-resets] ++ maybe [] (\tag -> [(tag,TagTask)]) mPre)
738    return . fromQT $ acceptTrans pre pat post i -- fromQT $ strictQT no help
739
740  -- 'acceptTrans' is the sole creator of QT/Simple and is only called by getTrans/OneChar/newTrans
741  acceptTrans :: TagList -> Pattern -> TagList -> Index -> QT
742  acceptTrans pre pIn post i =
743    let target = IMap.singleton i [(getDoPa pIn,pre++post)]
744    in case pIn of
745         PChar _ char ->
746           let trans = toMap target [char]
747           in Simple { qt_win = mempty, qt_trans = trans, qt_other = mempty }
748         PEscape _ char ->
749           let trans = toMap target [char]
750           in Simple { qt_win = mempty, qt_trans = trans, qt_other = mempty }
751         PDot _ -> Simple { qt_win = mempty, qt_trans = dotTrans, qt_other = target }
752         PAny _ ps ->
753           let trans = toMap target . S.toAscList . decodePatternSet $ ps
754           in Simple { qt_win = mempty, qt_trans = trans, qt_other = mempty }
755         PAnyNot _ ps ->
756           let trans = toMap mempty . S.toAscList . addNewline . decodePatternSet $ ps
757           in Simple { qt_win = mempty, qt_trans = trans, qt_other = target }
758         _ -> err ("Cannot acceptTrans pattern "++show (qTop,pIn))
759    where  -- Take a common destination and a sorted list of unique chraceters
760           -- and create a map from those characters to the common destination
761      toMap :: IntMap [(DoPa,[(Tag, TagUpdate)])] -> [Char]
762            -> CharMap (IntMap [(DoPa,[(Tag, TagUpdate)])])
763      toMap dest | caseSensitive compOpt = CharMap . IMap.fromDistinctAscList . map (\c -> (ord c,dest))
764                 | otherwise = CharMap . IMap.fromList . ($ [])
765                               . foldr (\c dl -> if isAlpha c
766                                                   then (dl.((ord (toUpper c),dest):)
767                                                           .((ord (toLower c),dest):)
768                                                        )
769                                                   else (dl.((ord c,dest):))
770                                       ) id
771      addNewline | multiline compOpt = S.insert '\n'
772                 | otherwise = id
773      dotTrans | multiline compOpt = Map.singleton '\n' mempty
774               | otherwise = Mon.mempty
775
776{-
777
778prepend architecture becomes
779prependTags :: TagTask -> [Tag] -> QT -> QT
780which always uses PreUpdate and the same task for all the tags
781
782qt_win seems to only allow PreUpdate so why keep the same type?
783
784
785ADD ORPHAN ID check and make this a fatal error while testing
786
787-}
788
789-- | decodePatternSet cannot handle collating element and treats
790-- equivalence classes as just their definition and nothing more.
791decodePatternSet :: PatternSet -> S.Set Char
792decodePatternSet (PatternSet msc mscc _ msec) =
793  let baseMSC = maybe S.empty id msc
794      withMSCC = foldl (flip S.insert) baseMSC  (maybe [] (concatMap decodeCharacterClass . S.toAscList) mscc)
795      withMSEC = foldl (flip S.insert) withMSCC (maybe [] (concatMap unSEC . S.toAscList) msec)
796  in withMSEC
797
798-- | This returns the disctince ascending list of characters
799-- represented by [: :] values in legalCharacterClasses; unrecognized
800-- class names return an empty string
801decodeCharacterClass :: PatternSetCharacterClass -> String
802decodeCharacterClass (PatternSetCharacterClass s) =
803  case s of
804    "alnum" -> ['0'..'9']++['a'..'z']++['A'..'Z']
805    "digit" -> ['0'..'9']
806    "punct" -> ['\33'..'\47']++['\58'..'\64']++['\91'..'\95']++"\96"++['\123'..'\126']
807    "alpha" -> ['a'..'z']++['A'..'Z']
808    "graph" -> ['\41'..'\126']
809    "space" -> "\t\n\v\f\r "
810    "blank" -> "\t "
811    "lower" -> ['a'..'z']
812    "upper" -> ['A'..'Z']
813    "cntrl" -> ['\0'..'\31']++"\127" -- with NUL
814    "print" -> ['\32'..'\126']
815    "xdigit" -> ['0'..'9']++['a'..'f']++['A'..'F']
816    "word" -> ['0'..'9']++['a'..'z']++['A'..'Z']++"_"
817    _ -> []
818
819{-
820-- | This is the list of recognized [: :] character classes, others
821-- are decoded as empty.
822legalCharacterClasses :: [String]
823legalCharacterClasses = ["alnum","digit","punct","alpha","graph"
824  ,"space","blank","lower","upper","cntrl","print","xdigit","word"]
825
826-}
827