1{-# LANGUAGE DeriveGeneric #-} 2{-# LANGUAGE OverloadedStrings #-} 3{-# LANGUAGE TemplateHaskell #-} 4{-# LANGUAGE TupleSections #-} 5 6-- | Data types for unicode character data and functions for 7-- extracting it from @UnicodeData.txt@. 8module Text.Collate.UnicodeData 9 ( UChar(..), 10 GeneralCategory(..), 11 BidiClass(..), 12 DecompositionType(..), 13 parseUnicodeData, 14 toCanonicalCombiningClassMap, 15 toCanonicalDecompositionMap, 16 genCanonicalCombiningClassMap, 17 genCanonicalDecompositionMap, 18 readCodePoints, 19 ) 20where 21 22import qualified Data.ByteString as B 23import qualified Data.IntMap as M 24import Data.Text (Text) 25import qualified Data.Text as T 26import qualified Data.Text.Encoding as TE 27import qualified Data.Text.Read as TR 28import GHC.Generics (Generic) 29import Instances.TH.Lift () 30import Language.Haskell.TH 31import Language.Haskell.TH.Syntax (qAddDependentFile) 32 33-- | Path to @UnicodeData.txt@. 34unicodeDataPath :: FilePath 35unicodeDataPath = "data/UnicodeData.txt" 36 37readUtf8Text :: FilePath -> IO Text 38readUtf8Text fp = TE.decodeUtf8 <$> B.readFile fp 39 40-- | Generate map of code points to canonical combining class, 41-- from @UnicodeData.txt@. 42genCanonicalCombiningClassMap :: Q Exp 43genCanonicalCombiningClassMap = do 44 qAddDependentFile unicodeDataPath 45 cccmap <- 46 toCanonicalCombiningClassMap . parseUnicodeData 47 <$> runIO (readUtf8Text unicodeDataPath) 48 [|cccmap|] 49 50-- | Generate map of code points to canonical decompositions, 51-- from @UnicodeData.txt@. 52genCanonicalDecompositionMap :: Q Exp 53genCanonicalDecompositionMap = do 54 qAddDependentFile unicodeDataPath 55 dmap <- 56 toCanonicalDecompositionMap . parseUnicodeData 57 <$> runIO (readUtf8Text unicodeDataPath) 58 [|dmap|] 59 60-- | Parse @UnicodeData.txt@ into a map of 'UChar' records. 61parseUnicodeData :: Text -> M.IntMap UChar 62parseUnicodeData = foldr parseLine mempty . T.lines 63 64-- | Convert unicode data to a map from code points to canonical combining 65-- classes. 66toCanonicalCombiningClassMap :: M.IntMap UChar -> M.IntMap Int 67toCanonicalCombiningClassMap = 68 fmap uCanonicalCombiningClass . M.filter ((> 0) . uCanonicalCombiningClass) 69 70-- | Convert unicode data to a map from code points to canonical decompositions. 71toCanonicalDecompositionMap :: M.IntMap UChar -> M.IntMap [Int] 72toCanonicalDecompositionMap = 73 fmap uDecompositionMapping 74 . M.filter 75 ( \x -> 76 uDecompositionType x == Canonical 77 && not (null (uDecompositionMapping x)) 78 ) 79 80data GeneralCategory 81 = Lu 82 | Ll 83 | Lt 84 | Lm 85 | Lo 86 | Mn 87 | Mc 88 | Me 89 | Nd 90 | Nl 91 | No 92 | Pc 93 | Pd 94 | Ps 95 | Pe 96 | Pi 97 | Pf 98 | Po 99 | Sm 100 | Sc 101 | Sk 102 | So 103 | Zs 104 | Zl 105 | Zp 106 | Cc 107 | Cf 108 | Cs 109 | Co 110 | Cn 111 deriving (Show, Read, Eq, Ord, Enum, Generic) 112 113data BidiClass 114 = L 115 | LRE 116 | LRO 117 | R 118 | AL 119 | RLE 120 | RLO 121 | PDF 122 | EN 123 | ES 124 | ET 125 | AN 126 | CS 127 | NSM 128 | BN 129 | B 130 | S 131 | WS 132 | ON 133 | LRI 134 | RLI 135 | FSI 136 | PDI 137 deriving (Show, Read, Eq, Ord, Enum, Generic) 138 139data DecompositionType 140 = Font 141 | NoBreak 142 | Initial 143 | Medial 144 | Final 145 | Isolated 146 | Circle 147 | Super 148 | Sub 149 | Vertical 150 | Wide 151 | Narrow 152 | Small 153 | Square 154 | Fraction 155 | Compat 156 | Canonical 157 deriving (Show, Read, Eq, Ord, Enum, Generic) 158 159-- | A 'UChar' encodes the data in one line of @UnicodeData.txt@. 160data UChar = UChar 161 { uCodePoint :: Int, 162 uName :: Text, 163 uGeneralCategory :: GeneralCategory, 164 uCanonicalCombiningClass :: Int, 165 uBidiClass :: BidiClass, 166 uDecompositionType :: DecompositionType, 167 uDecompositionMapping :: [Int], 168 uNumericTypeAndValue :: (Maybe Int, Maybe Int, Maybe Int), 169 uBidiMirrored :: Bool, 170 uUnicode1Name :: Text, 171 uISOComment :: Text, 172 uSimpleUppercaseMapping :: Int, 173 uSimpleLowercaseMapping :: Int, 174 uSimpleTitlecaseMappping :: Int 175 } 176 deriving (Show, Eq, Ord, Generic) 177 178readCodePoint :: Text -> Int 179readCodePoint t = 180 case TR.hexadecimal t of 181 Left e -> error e -- ok to error at compile-time 182 Right (codepoint, _) -> codepoint 183 184-- | Read a sequence of space-separated hex numbers. 185readCodePoints :: Text -> ([Int], Text) 186readCodePoints t = 187 case TR.hexadecimal t of 188 Left _ -> ([], t) 189 Right (codepoint, rest) -> 190 let (cps, t') = readCodePoints (T.dropWhile (== ' ') rest) 191 in (codepoint : cps, t') 192 193parseDecomp :: Text -> (DecompositionType, [Int]) 194parseDecomp bs = 195 case T.uncons bs of 196 Just ('<', rest) -> (ty, xs) 197 where 198 xs = fst $ readCodePoints cps 199 (x, y) = T.break (== '>') rest 200 cps = T.dropWhile (\c -> c == '>' || c == ' ') y 201 ty = case x of 202 "font" -> Font 203 "noBreak" -> NoBreak 204 "initial" -> Initial 205 "medial" -> Medial 206 "final" -> Final 207 "isolate" -> Isolated 208 "circle" -> Circle 209 "super" -> Super 210 "sub" -> Sub 211 "vertical" -> Vertical 212 "wide" -> Wide 213 "narrow" -> Narrow 214 "small" -> Small 215 "square" -> Square 216 "fraction" -> Fraction 217 "compat" -> Compat 218 _ -> Compat 219 _ -> (Canonical,) . fst $ readCodePoints bs 220 221parseLine :: Text -> M.IntMap UChar -> M.IntMap UChar 222parseLine t = 223 case T.splitOn ";" t of 224 [f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14] -> 225 M.insert codepoint uchar 226 where 227 codepoint = readCodePoint f0 228 (decompType, decompMapping) = parseDecomp f5 229 readNumericValue x = 230 case TR.decimal x of 231 Left _ -> Nothing 232 Right (v, _) -> Just v 233 uchar = 234 UChar 235 { uCodePoint = codepoint, 236 uName = f1, 237 uGeneralCategory = read (T.unpack f2), 238 uCanonicalCombiningClass = either (const 0) fst (TR.decimal f3), 239 uBidiClass = read (T.unpack f4), 240 uDecompositionType = decompType, 241 uDecompositionMapping = decompMapping, 242 uNumericTypeAndValue = 243 ( readNumericValue f6, 244 readNumericValue f7, 245 readNumericValue f8 246 ), 247 uBidiMirrored = f9 == "Y", 248 uUnicode1Name = f10, 249 uISOComment = f11, 250 uSimpleUppercaseMapping = readCodePoint f12, 251 uSimpleLowercaseMapping = readCodePoint f13, 252 uSimpleTitlecaseMappping = readCodePoint f14 253 } 254 _ -> error $ "Wrong number of fields in record:\n" ++ show t 255