1{-# LANGUAGE DeriveGeneric #-}
2{-# LANGUAGE OverloadedStrings #-}
3{-# LANGUAGE TemplateHaskell #-}
4{-# LANGUAGE TupleSections #-}
5
6-- | Data types for unicode character data and functions for
7-- extracting it from @UnicodeData.txt@.
8module Text.Collate.UnicodeData
9  ( UChar(..),
10    GeneralCategory(..),
11    BidiClass(..),
12    DecompositionType(..),
13    parseUnicodeData,
14    toCanonicalCombiningClassMap,
15    toCanonicalDecompositionMap,
16    genCanonicalCombiningClassMap,
17    genCanonicalDecompositionMap,
18    readCodePoints,
19  )
20where
21
22import qualified Data.ByteString as B
23import qualified Data.IntMap as M
24import Data.Text (Text)
25import qualified Data.Text as T
26import qualified Data.Text.Encoding as TE
27import qualified Data.Text.Read as TR
28import GHC.Generics (Generic)
29import Instances.TH.Lift ()
30import Language.Haskell.TH
31import Language.Haskell.TH.Syntax (qAddDependentFile)
32
33-- | Path to @UnicodeData.txt@.
34unicodeDataPath :: FilePath
35unicodeDataPath = "data/UnicodeData.txt"
36
37readUtf8Text :: FilePath -> IO Text
38readUtf8Text fp = TE.decodeUtf8 <$> B.readFile fp
39
40-- | Generate map of code points to canonical combining class,
41-- from @UnicodeData.txt@.
42genCanonicalCombiningClassMap :: Q Exp
43genCanonicalCombiningClassMap = do
44  qAddDependentFile unicodeDataPath
45  cccmap <-
46    toCanonicalCombiningClassMap . parseUnicodeData
47      <$> runIO (readUtf8Text unicodeDataPath)
48  [|cccmap|]
49
50-- | Generate map of code points to canonical decompositions,
51-- from @UnicodeData.txt@.
52genCanonicalDecompositionMap :: Q Exp
53genCanonicalDecompositionMap = do
54  qAddDependentFile unicodeDataPath
55  dmap <-
56    toCanonicalDecompositionMap . parseUnicodeData
57      <$> runIO (readUtf8Text unicodeDataPath)
58  [|dmap|]
59
60-- | Parse @UnicodeData.txt@ into a map of 'UChar' records.
61parseUnicodeData :: Text -> M.IntMap UChar
62parseUnicodeData = foldr parseLine mempty . T.lines
63
64-- | Convert unicode data to a map from code points to canonical combining
65-- classes.
66toCanonicalCombiningClassMap :: M.IntMap UChar -> M.IntMap Int
67toCanonicalCombiningClassMap =
68  fmap uCanonicalCombiningClass . M.filter ((> 0) . uCanonicalCombiningClass)
69
70-- | Convert unicode data to a map from code points to canonical decompositions.
71toCanonicalDecompositionMap :: M.IntMap UChar -> M.IntMap [Int]
72toCanonicalDecompositionMap =
73  fmap uDecompositionMapping
74    . M.filter
75      ( \x ->
76          uDecompositionType x == Canonical
77            && not (null (uDecompositionMapping x))
78      )
79
80data GeneralCategory
81  = Lu
82  | Ll
83  | Lt
84  | Lm
85  | Lo
86  | Mn
87  | Mc
88  | Me
89  | Nd
90  | Nl
91  | No
92  | Pc
93  | Pd
94  | Ps
95  | Pe
96  | Pi
97  | Pf
98  | Po
99  | Sm
100  | Sc
101  | Sk
102  | So
103  | Zs
104  | Zl
105  | Zp
106  | Cc
107  | Cf
108  | Cs
109  | Co
110  | Cn
111  deriving (Show, Read, Eq, Ord, Enum, Generic)
112
113data BidiClass
114  = L
115  | LRE
116  | LRO
117  | R
118  | AL
119  | RLE
120  | RLO
121  | PDF
122  | EN
123  | ES
124  | ET
125  | AN
126  | CS
127  | NSM
128  | BN
129  | B
130  | S
131  | WS
132  | ON
133  | LRI
134  | RLI
135  | FSI
136  | PDI
137  deriving (Show, Read, Eq, Ord, Enum, Generic)
138
139data DecompositionType
140  = Font
141  | NoBreak
142  | Initial
143  | Medial
144  | Final
145  | Isolated
146  | Circle
147  | Super
148  | Sub
149  | Vertical
150  | Wide
151  | Narrow
152  | Small
153  | Square
154  | Fraction
155  | Compat
156  | Canonical
157  deriving (Show, Read, Eq, Ord, Enum, Generic)
158
159-- | A 'UChar' encodes the data in one line of @UnicodeData.txt@.
160data UChar = UChar
161  { uCodePoint :: Int,
162    uName :: Text,
163    uGeneralCategory :: GeneralCategory,
164    uCanonicalCombiningClass :: Int,
165    uBidiClass :: BidiClass,
166    uDecompositionType :: DecompositionType,
167    uDecompositionMapping :: [Int],
168    uNumericTypeAndValue :: (Maybe Int, Maybe Int, Maybe Int),
169    uBidiMirrored :: Bool,
170    uUnicode1Name :: Text,
171    uISOComment :: Text,
172    uSimpleUppercaseMapping :: Int,
173    uSimpleLowercaseMapping :: Int,
174    uSimpleTitlecaseMappping :: Int
175  }
176  deriving (Show, Eq, Ord, Generic)
177
178readCodePoint :: Text -> Int
179readCodePoint t =
180  case TR.hexadecimal t of
181    Left e -> error e -- ok to error at compile-time
182    Right (codepoint, _) -> codepoint
183
184-- | Read a sequence of space-separated hex numbers.
185readCodePoints :: Text -> ([Int], Text)
186readCodePoints t =
187  case TR.hexadecimal t of
188    Left _ -> ([], t)
189    Right (codepoint, rest) ->
190      let (cps, t') = readCodePoints (T.dropWhile (== ' ') rest)
191       in (codepoint : cps, t')
192
193parseDecomp :: Text -> (DecompositionType, [Int])
194parseDecomp bs =
195  case T.uncons bs of
196    Just ('<', rest) -> (ty, xs)
197      where
198        xs = fst $ readCodePoints cps
199        (x, y) = T.break (== '>') rest
200        cps = T.dropWhile (\c -> c == '>' || c == ' ') y
201        ty = case x of
202          "font" -> Font
203          "noBreak" -> NoBreak
204          "initial" -> Initial
205          "medial" -> Medial
206          "final" -> Final
207          "isolate" -> Isolated
208          "circle" -> Circle
209          "super" -> Super
210          "sub" -> Sub
211          "vertical" -> Vertical
212          "wide" -> Wide
213          "narrow" -> Narrow
214          "small" -> Small
215          "square" -> Square
216          "fraction" -> Fraction
217          "compat" -> Compat
218          _ -> Compat
219    _ -> (Canonical,) . fst $ readCodePoints bs
220
221parseLine :: Text -> M.IntMap UChar -> M.IntMap UChar
222parseLine t =
223  case T.splitOn ";" t of
224    [f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14] ->
225      M.insert codepoint uchar
226      where
227        codepoint = readCodePoint f0
228        (decompType, decompMapping) = parseDecomp f5
229        readNumericValue x =
230          case TR.decimal x of
231            Left _ -> Nothing
232            Right (v, _) -> Just v
233        uchar =
234          UChar
235            { uCodePoint = codepoint,
236              uName = f1,
237              uGeneralCategory = read (T.unpack f2),
238              uCanonicalCombiningClass = either (const 0) fst (TR.decimal f3),
239              uBidiClass = read (T.unpack f4),
240              uDecompositionType = decompType,
241              uDecompositionMapping = decompMapping,
242              uNumericTypeAndValue =
243                ( readNumericValue f6,
244                  readNumericValue f7,
245                  readNumericValue f8
246                ),
247              uBidiMirrored = f9 == "Y",
248              uUnicode1Name = f10,
249              uISOComment = f11,
250              uSimpleUppercaseMapping = readCodePoint f12,
251              uSimpleLowercaseMapping = readCodePoint f13,
252              uSimpleTitlecaseMappping = readCodePoint f14
253            }
254    _ -> error $ "Wrong number of fields in record:\n" ++ show t
255