1-- -----------------------------------------------------------------------------
2--
3-- CharSet.hs, part of Alex
4--
5-- (c) Chris Dornan 1995-2000, Simon Marlow 2003
6--
7-- An abstract CharSet type for Alex.  To begin with we'll use Alex's
8-- original definition of sets as functions, then later will
9-- transition to something that will work better with Unicode.
10--
11-- ----------------------------------------------------------------------------}
12
13module CharSet (
14  setSingleton,
15
16  Encoding(..),
17
18  Byte,
19  ByteSet,
20  byteSetSingleton,
21  byteRanges,
22  byteSetRange,
23
24  CharSet, -- abstract
25  emptyCharSet,
26  charSetSingleton,
27  charSet,
28  charSetMinus,
29  charSetComplement,
30  charSetRange,
31  charSetUnion,
32  charSetQuote,
33  setUnions,
34  byteSetToArray,
35  byteSetElems,
36  byteSetElem
37  ) where
38
39import Data.Array
40import Data.Ranged
41import Data.Word
42import Data.Maybe (catMaybes)
43import Data.Char (chr,ord)
44import UTF8
45
46type Byte = Word8
47-- Implementation as functions
48type CharSet = RSet Char
49type ByteSet = RSet Byte
50-- type Utf8Set = RSet [Byte]
51type Utf8Range = Span [Byte]
52
53data Encoding = Latin1 | UTF8
54              deriving (Eq, Show)
55
56emptyCharSet :: CharSet
57emptyCharSet = rSetEmpty
58
59byteSetElem :: ByteSet -> Byte -> Bool
60byteSetElem = rSetHas
61
62charSetSingleton :: Char -> CharSet
63charSetSingleton = rSingleton
64
65setSingleton :: DiscreteOrdered a => a -> RSet a
66setSingleton = rSingleton
67
68charSet :: [Char] -> CharSet
69charSet = setUnions . fmap charSetSingleton
70
71charSetMinus :: CharSet -> CharSet -> CharSet
72charSetMinus = rSetDifference
73
74charSetUnion :: CharSet -> CharSet -> CharSet
75charSetUnion = rSetUnion
76
77setUnions :: DiscreteOrdered a => [RSet a] -> RSet a
78setUnions = foldr rSetUnion rSetEmpty
79
80charSetComplement :: CharSet -> CharSet
81charSetComplement = rSetNegation
82
83charSetRange :: Char -> Char -> CharSet
84charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
85
86byteSetToArray :: ByteSet -> Array Byte Bool
87byteSetToArray set = array (fst (head ass), fst (last ass)) ass
88  where ass = [(c,rSetHas set c) | c <- [0..0xff]]
89
90byteSetElems :: ByteSet -> [Byte]
91byteSetElems set = [c | c <- [0 .. 0xff], rSetHas set c]
92
93charToRanges :: Encoding -> CharSet -> [Utf8Range]
94charToRanges Latin1 =
95    map (fmap ((: []).fromIntegral.ord)) -- Span [Byte]
96  . catMaybes
97  . fmap (charRangeToCharSpan False)
98  . rSetRanges
99charToRanges UTF8 =
100    concat                  -- Span [Byte]
101  . fmap toUtfRange         -- [Span [Byte]]
102  . fmap (fmap UTF8.encode) -- Span [Byte]
103  . catMaybes
104  . fmap (charRangeToCharSpan True)
105  . rSetRanges
106
107-- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length
108toUtfRange :: Span [Byte] -> [Span [Byte]]
109toUtfRange (Span x y) = fix x y
110
111fix :: [Byte] -> [Byte] -> [Span [Byte]]
112fix x y
113    | length x == length y = [Span x y]
114    | length x == 1 = Span x [0x7F] : fix [0xC2,0x80] y
115    | length x == 2 = Span x [0xDF,0xBF] : fix [0xE0,0x80,0x80] y
116    | length x == 3 = Span x [0xEF,0xBF,0xBF] : fix [0xF0,0x80,0x80,0x80] y
117    | otherwise = error "fix: incorrect input given"
118
119
120byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte])
121byteRangeToBytePair (Span x y) = (x,y)
122
123data Span a = Span a a -- lower bound inclusive, higher bound exclusive
124                       -- (SDM: upper bound inclusive, surely?)
125instance Functor Span where
126    fmap f (Span x y) = Span (f x) (f y)
127
128charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char)
129charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing
130charRangeToCharSpan _ (Range (BoundaryAbove c) _) | c == maxBound = Nothing
131charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing
132charRangeToCharSpan _ (Range _ (BoundaryBelow c)) | c == minBound = Nothing
133charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y))
134    where l b = case b of
135            BoundaryBelowAll -> '\0'
136            BoundaryBelow a  -> a
137            BoundaryAbove a  -> succ a
138            BoundaryAboveAll -> error "panic: charRangeToCharSpan"
139          h b = case b of
140            BoundaryBelowAll -> error "panic: charRangeToCharSpan"
141            BoundaryBelow a  -> pred a
142            BoundaryAbove a  -> a
143            BoundaryAboveAll | uni -> chr 0x10ffff
144                             | otherwise -> chr 0xff
145
146byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])]
147byteRanges enc =  fmap byteRangeToBytePair . charToRanges enc
148
149byteSetRange :: Byte -> Byte -> ByteSet
150byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)]
151
152byteSetSingleton :: Byte -> ByteSet
153byteSetSingleton = rSingleton
154
155-- TODO: More efficient generated code!
156charSetQuote :: CharSet -> String
157charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")"
158    where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h
159          quoteL (BoundaryAbove a) = "c > " ++ show a
160          quoteL (BoundaryBelow a) = "c >= " ++ show a
161          quoteL (BoundaryAboveAll) = "False"
162          quoteL (BoundaryBelowAll) = "True"
163          quoteH (BoundaryAbove a) = "c <= " ++ show a
164          quoteH (BoundaryBelow a) = "c < " ++ show a
165          quoteH (BoundaryAboveAll) = "True"
166          quoteH (BoundaryBelowAll) = "False"
167
168