1-----------------------------------------------------------------------------
2-- |
3-- Module      :  Distribution.Simple.PreProcess.Unlit
4-- Copyright   :  ...
5--
6-- Maintainer  :  cabal-devel@haskell.org
7-- Portability :  portable
8--
9-- Remove the \"literal\" markups from a Haskell source file, including
10-- \"@>@\", \"@\\begin{code}@\", \"@\\end{code}@\", and \"@#@\"
11
12-- This version is interesting because instead of striping comment lines, it
13-- turns them into "-- " style comments. This allows using haddock markup
14-- in literate scripts without having to use "> --" prefix.
15
16module Distribution.Simple.PreProcess.Unlit (unlit,plain) where
17
18import Prelude ()
19import Distribution.Compat.Prelude
20import Distribution.Utils.Generic (safeTail, safeLast, safeInit)
21
22import Data.List (mapAccumL)
23
24data Classified = BirdTrack String | Blank String | Ordinary String
25                | Line !Int String | CPP String
26                | BeginCode | EndCode
27                -- output only:
28                | Error String | Comment String
29
30-- | No unliteration.
31plain :: String -> String -> String
32plain _ hs = hs
33
34classify :: String -> Classified
35classify ('>':s) = BirdTrack s
36classify ('#':s) = case tokens s of
37                     (line:file@('"':_:_):_) | all isDigit line
38                                            && safeLast file == Just '"'
39                                -- this shouldn't fail as we tested for 'all isDigit'
40                                -> Line (fromMaybe (error $ "panic! read @Int " ++ show line) $ readMaybe line) (safeTail (safeInit file)) -- TODO:eradicateNoParse
41                     _          -> CPP s
42  where tokens = unfoldr $ \str -> case lex str of
43                                   (t@(_:_), str'):_ -> Just (t, str')
44                                   _                 -> Nothing
45classify ('\\':s)
46  | "begin{code}" `isPrefixOf` s = BeginCode
47  | "end{code}"   `isPrefixOf` s = EndCode
48classify s | all isSpace s       = Blank s
49classify s                       = Ordinary s
50
51-- So the weird exception for comment indenting is to make things work with
52-- haddock, see classifyAndCheckForBirdTracks below.
53unclassify :: Bool -> Classified -> String
54unclassify _     (BirdTrack s) = ' ':s
55unclassify _     (Blank s)     = s
56unclassify _     (Ordinary s)  = s
57unclassify _     (Line n file) = "# " ++ show n ++ " " ++ show file
58unclassify _     (CPP s)       = '#':s
59unclassify True  (Comment "")  = "  --"
60unclassify True  (Comment s)   = "  -- " ++ s
61unclassify False (Comment "")  = "--"
62unclassify False (Comment s)   = "-- " ++ s
63unclassify _     _             = internalError
64
65-- | 'unlit' takes a filename (for error reports), and transforms the
66--   given string, to eliminate the literate comments from the program text.
67unlit :: FilePath -> String -> Either String String
68unlit file input =
69  let (usesBirdTracks, classified) = classifyAndCheckForBirdTracks
70                                   . inlines
71                                   $ input
72   in either (Left . unlines . map (unclassify usesBirdTracks))
73              Right
74    . checkErrors
75    . reclassify
76    $ classified
77
78  where
79    -- So haddock requires comments and code to align, since it treats comments
80    -- as following the layout rule. This is a pain for us since bird track
81    -- style literate code typically gets indented by two since ">" is replaced
82    -- by " " and people usually use one additional space of indent ie
83    -- "> then the code". On the other hand we cannot just go and indent all
84    -- the comments by two since that does not work for latex style literate
85    -- code. So the hacky solution we use here is that if we see any bird track
86    -- style code then we'll indent all comments by two, otherwise by none.
87    -- Of course this will not work for mixed latex/bird track .lhs files but
88    -- nobody does that, it's silly and specifically recommended against in the
89    -- H98 unlit spec.
90    --
91    classifyAndCheckForBirdTracks =
92      flip mapAccumL False $ \seenBirdTrack line ->
93        let classification = classify line
94         in (seenBirdTrack || isBirdTrack classification, classification)
95
96    isBirdTrack (BirdTrack _) = True
97    isBirdTrack _             = False
98
99    checkErrors ls = case [ e | Error e <- ls ] of
100      []          -> Left  ls
101      (message:_) -> Right (f ++ ":" ++ show n ++ ": " ++ message)
102        where (f, n) = errorPos file 1 ls
103    errorPos f n []              = (f, n)
104    errorPos f n (Error _:_)     = (f, n)
105    errorPos _ _ (Line n' f':ls) = errorPos f' n' ls
106    errorPos f n (_         :ls) = errorPos f  (n+1) ls
107
108-- Here we model a state machine, with each state represented by
109-- a local function. We only have four states (well, five,
110-- if you count the error state), but the rules
111-- to transition between then are not so simple.
112-- Would it be simpler to have more states?
113--
114-- Each state represents the type of line that was last read
115-- i.e. are we in a comment section, or a latex-code section,
116-- or a bird-code section, etc?
117reclassify :: [Classified] -> [Classified]
118reclassify = blank -- begin in blank state
119  where
120    latex []               = []
121    latex (EndCode    :ls) = Blank "" : comment ls
122    latex (BeginCode  :_ ) = [Error "\\begin{code} in code section"]
123    latex (BirdTrack l:ls) = Ordinary ('>':l) : latex ls
124    latex (          l:ls) = l : latex ls
125
126    blank []               = []
127    blank (EndCode    :_ ) = [Error "\\end{code} without \\begin{code}"]
128    blank (BeginCode  :ls) = Blank ""    : latex ls
129    blank (BirdTrack l:ls) = BirdTrack l : bird ls
130    blank (Ordinary  l:ls) = Comment   l : comment ls
131    blank (          l:ls) =           l : blank ls
132
133    bird []              = []
134    bird (EndCode   :_ ) = [Error "\\end{code} without \\begin{code}"]
135    bird (BeginCode :ls) = Blank "" : latex ls
136    bird (Blank l   :ls) = Blank l  : blank ls
137    bird (Ordinary _:_ ) = [Error "program line before comment line"]
138    bird (         l:ls) = l : bird ls
139
140    comment []               = []
141    comment (EndCode    :_ ) = [Error "\\end{code} without \\begin{code}"]
142    comment (BeginCode  :ls) = Blank "" : latex ls
143    comment (CPP l      :ls) = CPP l : comment ls
144    comment (BirdTrack _:_ ) = [Error "comment line before program line"]
145    -- a blank line and another ordinary line following a comment
146    -- will be treated as continuing the comment. Otherwise it's
147    -- then end of the comment, with a blank line.
148    comment (Blank     l:ls@(Ordinary  _:_)) = Comment l : comment ls
149    comment (Blank     l:ls) = Blank l   : blank ls
150    comment (Line n f   :ls) = Line n f  : comment ls
151    comment (Ordinary  l:ls) = Comment l : comment ls
152    comment (Comment   _: _) = internalError
153    comment (Error     _: _) = internalError
154
155-- Re-implementation of 'lines', for better efficiency (but decreased laziness).
156-- Also, importantly, accepts non-standard DOS and Mac line ending characters.
157inlines :: String -> [String]
158inlines xs = lines' xs id
159  where
160  lines' []             acc = [acc []]
161  lines' ('\^M':'\n':s) acc = acc [] : lines' s id    -- DOS
162  lines' ('\^M':s)      acc = acc [] : lines' s id    -- MacOS
163  lines' ('\n':s)       acc = acc [] : lines' s id    -- Unix
164  lines' (c:s)          acc = lines' s (acc . (c:))
165
166internalError :: a
167internalError = error "unlit: internal error"
168