1-- ----------------------------------------------------------------------------- 2-- 3-- CharSet.hs, part of Alex 4-- 5-- (c) Chris Dornan 1995-2000, Simon Marlow 2003 6-- 7-- An abstract CharSet type for Alex. To begin with we'll use Alex's 8-- original definition of sets as functions, then later will 9-- transition to something that will work better with Unicode. 10-- 11-- ----------------------------------------------------------------------------} 12 13module CharSet ( 14 setSingleton, 15 16 Encoding(..), 17 18 Byte, 19 ByteSet, 20 byteSetSingleton, 21 byteRanges, 22 byteSetRange, 23 24 CharSet, -- abstract 25 emptyCharSet, 26 charSetSingleton, 27 charSet, 28 charSetMinus, 29 charSetComplement, 30 charSetRange, 31 charSetUnion, 32 charSetQuote, 33 setUnions, 34 byteSetToArray, 35 byteSetElems, 36 byteSetElem 37 ) where 38 39import Data.Array 40import Data.Ranged 41import Data.Word 42import Data.Maybe (catMaybes) 43import Data.Char (chr,ord) 44import UTF8 45 46type Byte = Word8 47-- Implementation as functions 48type CharSet = RSet Char 49type ByteSet = RSet Byte 50-- type Utf8Set = RSet [Byte] 51type Utf8Range = Span [Byte] 52 53data Encoding = Latin1 | UTF8 54 deriving (Eq, Show) 55 56emptyCharSet :: CharSet 57emptyCharSet = rSetEmpty 58 59byteSetElem :: ByteSet -> Byte -> Bool 60byteSetElem = rSetHas 61 62charSetSingleton :: Char -> CharSet 63charSetSingleton = rSingleton 64 65setSingleton :: DiscreteOrdered a => a -> RSet a 66setSingleton = rSingleton 67 68charSet :: [Char] -> CharSet 69charSet = setUnions . fmap charSetSingleton 70 71charSetMinus :: CharSet -> CharSet -> CharSet 72charSetMinus = rSetDifference 73 74charSetUnion :: CharSet -> CharSet -> CharSet 75charSetUnion = rSetUnion 76 77setUnions :: DiscreteOrdered a => [RSet a] -> RSet a 78setUnions = foldr rSetUnion rSetEmpty 79 80charSetComplement :: CharSet -> CharSet 81charSetComplement = rSetNegation 82 83charSetRange :: Char -> Char -> CharSet 84charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)] 85 86byteSetToArray :: ByteSet -> Array Byte Bool 87byteSetToArray set = array (fst (head ass), fst (last ass)) ass 88 where ass = [(c,rSetHas set c) | c <- [0..0xff]] 89 90byteSetElems :: ByteSet -> [Byte] 91byteSetElems set = [c | c <- [0 .. 0xff], rSetHas set c] 92 93charToRanges :: Encoding -> CharSet -> [Utf8Range] 94charToRanges Latin1 = 95 map (fmap ((: []).fromIntegral.ord)) -- Span [Byte] 96 . catMaybes 97 . fmap (charRangeToCharSpan False) 98 . rSetRanges 99charToRanges UTF8 = 100 concat -- Span [Byte] 101 . fmap toUtfRange -- [Span [Byte]] 102 . fmap (fmap UTF8.encode) -- Span [Byte] 103 . catMaybes 104 . fmap (charRangeToCharSpan True) 105 . rSetRanges 106 107-- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length 108toUtfRange :: Span [Byte] -> [Span [Byte]] 109toUtfRange (Span x y) = fix x y 110 111fix :: [Byte] -> [Byte] -> [Span [Byte]] 112fix x y 113 | length x == length y = [Span x y] 114 | length x == 1 = Span x [0x7F] : fix [0xC2,0x80] y 115 | length x == 2 = Span x [0xDF,0xBF] : fix [0xE0,0x80,0x80] y 116 | length x == 3 = Span x [0xEF,0xBF,0xBF] : fix [0xF0,0x80,0x80,0x80] y 117 | otherwise = error "fix: incorrect input given" 118 119 120byteRangeToBytePair :: Span [Byte] -> ([Byte],[Byte]) 121byteRangeToBytePair (Span x y) = (x,y) 122 123data Span a = Span a a -- lower bound inclusive, higher bound exclusive 124 -- (SDM: upper bound inclusive, surely?) 125instance Functor Span where 126 fmap f (Span x y) = Span (f x) (f y) 127 128charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char) 129charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing 130charRangeToCharSpan _ (Range (BoundaryAbove c) _) | c == maxBound = Nothing 131charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing 132charRangeToCharSpan _ (Range _ (BoundaryBelow c)) | c == minBound = Nothing 133charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y)) 134 where l b = case b of 135 BoundaryBelowAll -> '\0' 136 BoundaryBelow a -> a 137 BoundaryAbove a -> succ a 138 BoundaryAboveAll -> error "panic: charRangeToCharSpan" 139 h b = case b of 140 BoundaryBelowAll -> error "panic: charRangeToCharSpan" 141 BoundaryBelow a -> pred a 142 BoundaryAbove a -> a 143 BoundaryAboveAll | uni -> chr 0x10ffff 144 | otherwise -> chr 0xff 145 146byteRanges :: Encoding -> CharSet -> [([Byte],[Byte])] 147byteRanges enc = fmap byteRangeToBytePair . charToRanges enc 148 149byteSetRange :: Byte -> Byte -> ByteSet 150byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)] 151 152byteSetSingleton :: Byte -> ByteSet 153byteSetSingleton = rSingleton 154 155-- TODO: More efficient generated code! 156charSetQuote :: CharSet -> String 157charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")" 158 where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h 159 quoteL (BoundaryAbove a) = "c > " ++ show a 160 quoteL (BoundaryBelow a) = "c >= " ++ show a 161 quoteL (BoundaryAboveAll) = "False" 162 quoteL (BoundaryBelowAll) = "True" 163 quoteH (BoundaryAbove a) = "c <= " ++ show a 164 quoteH (BoundaryBelow a) = "c < " ++ show a 165 quoteH (BoundaryAboveAll) = "True" 166 quoteH (BoundaryBelowAll) = "False" 167 168