1{-# LANGUAGE CPP #-} 2#if __GLASGOW_HASKELL__ >= 702 3{-# LANGUAGE Trustworthy #-} 4#endif 5#if EMBED 6{-# LANGUAGE TemplateHaskell #-} 7#endif 8----------------------------------------------------------------------------- 9-- | 10-- Module : Text.Hyphenation.Language 11-- Copyright : (C) 2012-2019 Edward Kmett, 12-- (C) 2007 Ned Batchelder 13-- License : BSD-style (see the languageAffix LICENSE) 14-- 15-- Maintainer : Edward Kmett <ekmett@gmail.com> 16-- Stability : provisional 17-- Portability : portable 18-- 19---------------------------------------------------------------------------- 20module Text.Hyphenation.Language 21 ( 22 -- * Pattern file support 23 Language(..) 24 , languageHyphenator 25 -- * Provided language hyphenators 26 , afrikaans, armenian, assamese, basque, bengali, bulgarian, catalan, chinese 27 , coptic, croatian, czech, danish, dutch, english_US, english_GB, esperanto 28 , estonian, ethiopic, {- farsi, -} finnish, french, friulan, galician, georgian, german_1901, german_1996 29 , german_Swiss, greek_Ancient, greek_Mono, greek_Poly, gujarati, hindi, hungarian 30 , icelandic, indonesian, interlingua, irish, italian, kannada, kurmanji, latin, latin_Classic 31 , latvian, lithuanian, malayalam, marathi, mongolian, norwegian_Bokmal 32 , norwegian_Nynorsk, occitan, oriya, panjabi, piedmontese, polish, portuguese, romanian, romansh 33 , russian, sanskrit, serbian_Cyrillic, serbocroatian_Cyrillic 34 , serbocroatian_Latin, slovak, slovenian, spanish, swedish, tamil 35 , telugu, thai, turkish, turkmen, ukrainian, uppersorbian, welsh 36 , loadHyphenator 37 , languageAffix 38 ) where 39 40import Codec.Compression.GZip 41#if __GLASGOW_HASKELL__ < 710 42import Data.Functor ((<$>)) 43#endif 44import qualified Data.IntMap as IM 45import qualified Data.Text as T 46import qualified Data.Text.Encoding as T 47import Text.Hyphenation.ByteStringLazyCompat as Lazy 48import Text.Hyphenation.Hyphenator 49import Text.Hyphenation.Pattern 50import Text.Hyphenation.Exception 51import System.IO.Unsafe 52 53#if !EMBED 54import Paths_hyphenation 55#else 56import Data.FileEmbed 57import qualified Data.ByteString.Char8 as Strict 58 59hyphenatorFiles :: [(FilePath, Strict.ByteString)] 60hyphenatorFiles = $(embedDir "data") 61#endif 62 63-- $setup 64-- >>> import Text.Hyphenation.Hyphenator 65 66chrLine :: String -> [(Int, Char)] 67chrLine (x:xs) = fmap (\y -> (fromEnum y, x)) xs 68chrLine [] = [] 69 70-- | Read a built-in language file from the data directory where cabal installed this package. 71-- 72-- (e.g. @hyphenateLanguage \"en-us\"@ opens @\"\/Users\/ekmett\/.cabal\/share\/hyphenation-0.2\/ghc-7.4.1\/hyph-en-us.hyp.txt\"@ 73-- among others when run on the author's local machine) 74loadHyphenator :: Language -> IO Hyphenator 75#if !EMBED 76loadHyphenator language = do 77 let affix = languageAffix language 78 hyp <- unzipUtf8 <$> (getDataFileName ("hyph-" ++ affix ++ ".hyp.txt.gz") >>= Lazy.readFile) 79 pat <- unzipUtf8 <$> (getDataFileName ("hyph-" ++ affix ++ ".pat.txt.gz") >>= Lazy.readFile) 80 chr <- unzipUtf8 <$> (getDataFileName ("hyph-" ++ affix ++ ".chr.txt.gz") >>= Lazy.readFile) 81 let chrMap = IM.fromList (Prelude.lines chr >>= chrLine) 82 tryLookup x = IM.findWithDefault x (fromEnum x) chrMap 83 (defaultLeftMin, defaultRightMin) = languageMins language 84 return $ Hyphenator tryLookup (parsePatterns pat) (parseExceptions hyp) defaultLeftMin defaultRightMin 85#else 86loadHyphenator language = return $ Hyphenator tryLookup (parsePatterns pat) (parseExceptions hyp) defaultLeftMin defaultRightMin 87 where affix = languageAffix language 88 Just hyp = unzipUtf8 . Lazy.fromStrict <$> lookup ("hyph-" ++ affix ++ ".hyp.txt.gz") hyphenatorFiles 89 Just pat = unzipUtf8 . Lazy.fromStrict <$> lookup ("hyph-" ++ affix ++ ".pat.txt.gz") hyphenatorFiles 90 Just chr = unzipUtf8 . Lazy.fromStrict <$> lookup ("hyph-" ++ affix ++ ".chr.txt.gz") hyphenatorFiles 91 chrMap = IM.fromList (Prelude.lines chr >>= chrLine) 92 (defaultLeftMin, defaultRightMin) = languageMins language 93 tryLookup x = IM.findWithDefault x (fromEnum x) chrMap 94#endif 95 96unzipUtf8 :: ByteString -> String 97unzipUtf8 = 98 T.unpack . T.decodeUtf8With (\ _ -> fmap (toEnum . fromEnum)) 99 . Lazy.toStrict . decompress 100 101-- | A strongly typed set of available languages you can use for hyphenation. 102data Language 103 = Afrikaans 104 | Armenian 105 | Assamese 106 | Basque 107 | Bengali 108 | Bulgarian 109 | Catalan 110 | Chinese 111 | Coptic 112 | Croatian 113 | Czech 114 | Danish 115 | Dutch 116 | English_US | English_GB 117 | Esperanto 118 | Estonian 119 | Ethiopic 120 -- | Farsi 121 | Finnish 122 | French 123 | Friulan 124 | Galician 125 | Georgian 126 | German_1901 | German_1996 | German_Swiss 127 | Greek_Ancient 128 | Greek_Mono 129 | Greek_Poly 130 | Gujarati 131 | Hindi 132 | Hungarian 133 | Icelandic 134 | Indonesian 135 | Interlingua 136 | Irish 137 | Italian 138 | Kannada 139 | Kurmanji 140 | Latin 141 | Latin_Classic 142 | Latvian 143 | Lithuanian 144 | Malayalam 145 | Marathi 146 | Mongolian 147 | Norwegian_Bokmal | Norwegian_Nynorsk 148 | Occitan 149 | Oriya 150 | Panjabi 151 | Piedmontese 152 | Polish 153 | Portuguese 154 | Romanian 155 | Romansh 156 | Russian 157 | Sanskrit 158 | Serbian_Cyrillic 159 | Serbocroatian_Cyrillic | Serbocroatian_Latin 160 | Slovak 161 | Slovenian 162 | Spanish 163 | Swedish 164 | Tamil 165 | Telugu 166 | Thai 167 | Turkish 168 | Turkmen 169 | Ukrainian 170 | Uppersorbian 171 | Welsh 172 deriving (Eq,Ord,Show,Bounded,Enum) 173 174 175-- | the infix portion of the data file names used for this language 176languageAffix :: Language -> String 177languageAffix s = case s of 178 Afrikaans -> "af" 179 Armenian -> "hy" 180 Assamese -> "as" 181 Basque -> "eu" 182 Bengali -> "bn" 183 Bulgarian -> "bg" 184 Catalan -> "ca" 185 Chinese -> "zh-latn-pinyin" 186 Coptic -> "cop" 187 Croatian -> "hr" 188 Czech -> "cs" 189 Danish -> "da" 190 Dutch -> "nl" 191 English_US -> "en-us" 192 English_GB -> "en-gb" 193 Esperanto -> "eo" 194 Estonian -> "et" 195 Ethiopic -> "mul-ethi" 196 -- Farsi -> "fa" 197 Finnish -> "fi" 198 French -> "fr" 199 Friulan -> "fur" 200 Galician -> "gl" 201 Georgian -> "ka" 202 German_1901 -> "de-1901" 203 German_1996 -> "de-1996" 204 German_Swiss -> "de-ch-1901" 205 Greek_Ancient -> "grc" 206 Greek_Mono -> "el-monoton" 207 Greek_Poly -> "el-polyton" 208 Gujarati -> "gu" 209 Hindi -> "hi" 210 Hungarian -> "hu" 211 Icelandic -> "is" 212 Indonesian -> "id" 213 Interlingua -> "ia" 214 Irish -> "ga" 215 Italian -> "it" 216 Kannada -> "kn" 217 Kurmanji -> "kmr" 218 Latin -> "la" 219 Latin_Classic -> "la-x-classic" 220 Latvian -> "lv" 221 Lithuanian -> "lt" 222 Malayalam -> "ml" 223 Marathi -> "mr" 224 Mongolian -> "mn-cyrl" 225 Norwegian_Bokmal -> "nb" 226 Norwegian_Nynorsk -> "nn" 227 Occitan -> "oc" 228 Oriya -> "or" 229 Panjabi -> "pa" 230 Piedmontese -> "pms" 231 Polish -> "pl" 232 Portuguese -> "pt" 233 Romanian -> "ro" 234 Romansh -> "rm" 235 Russian -> "ru" 236 Sanskrit -> "sa" 237 Serbian_Cyrillic -> "sr-cyrl" 238 Serbocroatian_Cyrillic -> "sh-cyrl" 239 Serbocroatian_Latin -> "sh-latn" 240 Slovak -> "sk" 241 Slovenian -> "sl" 242 Spanish -> "es" 243 Swedish -> "sv" 244 Tamil -> "ta" 245 Telugu -> "te" 246 Thai -> "th" 247 Turkish -> "tr" 248 Turkmen -> "tk" 249 Ukrainian -> "uk" 250 Uppersorbian -> "hsb" 251 Welsh -> "cy" 252 253 254-- | The number of characters from the beginning and end of a word not to hyphenate in this language. 255languageMins :: Language -> (Int, Int) 256languageMins s = case s of 257 Afrikaans -> (1, 2) 258 Armenian -> (1, 2) 259 Assamese -> (1, 1) 260 Basque -> (2, 2) 261 Bengali -> (1, 1) 262 Bulgarian -> (2, 2) 263 Catalan -> (2, 2) 264 Chinese -> (1, 1) 265 Coptic -> (1, 1) 266 Croatian -> (2, 2) 267 Czech -> (2, 3) 268 Danish -> (2, 2) 269 Dutch -> (2, 2) 270 English_GB -> (2, 3) 271 English_US -> (2, 3) 272 Esperanto -> (2, 2) 273 Estonian -> (2, 3) 274 Ethiopic -> (1, 1) 275 -- Farsi -> (,) 276 Finnish -> (2, 2) 277 French -> (2, 3) 278 Friulan -> (2, 2) 279 Galician -> (2, 2) 280 Georgian -> (1, 2) 281 German_1901 -> (2, 2) 282 German_1996 -> (2, 2) 283 German_Swiss -> (2, 2) 284 Greek_Ancient -> (1, 1) 285 Greek_Mono -> (1, 1) 286 Greek_Poly -> (1, 1) 287 Gujarati -> (1, 1) 288 Hindi -> (1, 1) 289 Hungarian -> (2, 2) 290 Icelandic -> (2, 2) 291 Indonesian -> (2, 2) 292 Interlingua -> (2, 2) 293 Irish -> (2, 3) 294 Italian -> (2, 2) 295 Kannada -> (1, 1) 296 Kurmanji -> (2, 2) 297 Latin -> (2, 2) 298 Latin_Classic -> (2, 2) 299 Latvian -> (2, 2) 300 Lithuanian -> (2, 2) 301 Malayalam -> (1, 1) 302 Marathi -> (1, 1) 303 Mongolian -> (2, 2) 304 Norwegian_Bokmal -> (2, 2) 305 Norwegian_Nynorsk -> (2, 2) 306 Occitan -> (2, 2) 307 Oriya -> (1, 1) 308 Panjabi -> (1, 1) 309 Piedmontese -> (2, 2) 310 Polish -> (2, 2) 311 Portuguese -> (2, 3) 312 Romanian -> (2, 2) 313 Romansh -> (2, 2) 314 Russian -> (2, 2) 315 Sanskrit -> (1, 3) 316 Serbian_Cyrillic -> (2, 2) 317 Serbocroatian_Cyrillic -> (2, 2) 318 Serbocroatian_Latin -> (2, 2) 319 Slovak -> (2, 3) 320 Slovenian -> (2, 2) 321 Spanish -> (2, 2) 322 Swedish -> (2, 2) 323 Tamil -> (1, 1) 324 Telugu -> (1, 1) 325 Thai -> (2, 3) 326 Turkish -> (2, 2) 327 Turkmen -> (2, 2) 328 Ukrainian -> (2, 2) 329 Uppersorbian -> (2, 2) 330 Welsh -> (2, 3) 331 332 333-- | 334-- >>> hyphenate english_US "supercalifragilisticexpialadocious" 335-- ["su","per","cal","ifrag","ilis","tic","ex","pi","al","ado","cious"] 336-- 337-- favors US hyphenation 338english_US :: Hyphenator 339 340-- | 341-- >>> hyphenate english_GB "supercalifragilisticexpialadocious" 342-- ["su","per","cal","i","fra","gil","istic","ex","pi","alado","cious"] 343-- 344-- favors UK hyphenation 345english_GB :: Hyphenator 346 347-- | 348-- >>> hyphenate french "anticonstitutionnellement" 349-- ["an","ti","cons","ti","tu","tion","nel","le","ment"] 350french :: Hyphenator 351 352-- | 353-- >>> hyphenate icelandic "va\240lahei\240avegavinnuverkf\230rageymslusk\250r" 354-- ["va\240la","hei\240a","vega","vinnu","verk","f\230ra","geymslu","sk\250r"] 355icelandic :: Hyphenator 356 357-- | Hyphenators for a wide array of languages. 358afrikaans, armenian, assamese, basque, bengali, bulgarian, catalan, chinese, 359 coptic, croatian, czech, danish, dutch, esperanto, 360 estonian, ethiopic, {- farsi, -} finnish, friulan, galician, georgian, german_1901, german_1996, 361 german_Swiss, greek_Ancient, greek_Mono, greek_Poly, gujarati, hindi, hungarian, 362 indonesian, interlingua, irish, italian, kannada, kurmanji, latin, latin_Classic, 363 latvian, lithuanian, malayalam, marathi, mongolian, norwegian_Bokmal, 364 norwegian_Nynorsk, occitan, oriya, panjabi, piedmontese, polish, portuguese, romanian, 365 romansh, russian, sanskrit, serbian_Cyrillic, serbocroatian_Cyrillic, 366 serbocroatian_Latin, slovak, slovenian, spanish, swedish, tamil, 367 telugu, thai, turkish, turkmen, ukrainian, uppersorbian, welsh :: Hyphenator 368 369afrikaans = unsafePerformIO (loadHyphenator Afrikaans) 370armenian = unsafePerformIO (loadHyphenator Armenian) 371assamese = unsafePerformIO (loadHyphenator Assamese) 372basque = unsafePerformIO (loadHyphenator Basque) 373bengali = unsafePerformIO (loadHyphenator Bengali) 374bulgarian = unsafePerformIO (loadHyphenator Bulgarian) 375catalan = unsafePerformIO (loadHyphenator Catalan) 376chinese = unsafePerformIO (loadHyphenator Chinese) 377coptic = unsafePerformIO (loadHyphenator Coptic) 378croatian = unsafePerformIO (loadHyphenator Croatian) 379czech = unsafePerformIO (loadHyphenator Czech) 380danish = unsafePerformIO (loadHyphenator Danish) 381dutch = unsafePerformIO (loadHyphenator Dutch) 382english_US = unsafePerformIO (loadHyphenator English_US) 383english_GB = unsafePerformIO (loadHyphenator English_GB) 384esperanto = unsafePerformIO (loadHyphenator Esperanto) 385estonian = unsafePerformIO (loadHyphenator Estonian) 386ethiopic = unsafePerformIO (loadHyphenator Ethiopic) 387-- farsi = unsafePerformIO (loadHyphenator Farsi) 388finnish = unsafePerformIO (loadHyphenator Finnish) 389french = unsafePerformIO (loadHyphenator French) 390friulan = unsafePerformIO (loadHyphenator Friulan) 391galician = unsafePerformIO (loadHyphenator Galician) 392georgian = unsafePerformIO (loadHyphenator Georgian) 393german_1901 = unsafePerformIO (loadHyphenator German_1901) 394german_1996 = unsafePerformIO (loadHyphenator German_1996) 395german_Swiss = unsafePerformIO (loadHyphenator German_Swiss) 396greek_Ancient = unsafePerformIO (loadHyphenator Greek_Ancient) 397greek_Mono = unsafePerformIO (loadHyphenator Greek_Mono) 398greek_Poly = unsafePerformIO (loadHyphenator Greek_Poly) 399gujarati = unsafePerformIO (loadHyphenator Gujarati) 400hindi = unsafePerformIO (loadHyphenator Hindi) 401hungarian = unsafePerformIO (loadHyphenator Hungarian) 402icelandic = unsafePerformIO (loadHyphenator Icelandic) 403indonesian = unsafePerformIO (loadHyphenator Indonesian) 404interlingua = unsafePerformIO (loadHyphenator Interlingua) 405irish = unsafePerformIO (loadHyphenator Irish) 406italian = unsafePerformIO (loadHyphenator Italian) 407kannada = unsafePerformIO (loadHyphenator Kannada) 408kurmanji = unsafePerformIO (loadHyphenator Kurmanji) 409latin = unsafePerformIO (loadHyphenator Latin) 410latin_Classic = unsafePerformIO (loadHyphenator Latin_Classic) 411latvian = unsafePerformIO (loadHyphenator Latvian) 412lithuanian = unsafePerformIO (loadHyphenator Lithuanian) 413malayalam = unsafePerformIO (loadHyphenator Malayalam) 414marathi = unsafePerformIO (loadHyphenator Marathi) 415mongolian = unsafePerformIO (loadHyphenator Mongolian) 416norwegian_Bokmal = unsafePerformIO (loadHyphenator Norwegian_Bokmal) 417norwegian_Nynorsk = unsafePerformIO (loadHyphenator Norwegian_Nynorsk) 418occitan = unsafePerformIO (loadHyphenator Occitan) 419oriya = unsafePerformIO (loadHyphenator Oriya) 420panjabi = unsafePerformIO (loadHyphenator Panjabi) 421piedmontese = unsafePerformIO (loadHyphenator Piedmontese) 422polish = unsafePerformIO (loadHyphenator Polish) 423portuguese = unsafePerformIO (loadHyphenator Portuguese) 424romanian = unsafePerformIO (loadHyphenator Romanian) 425romansh = unsafePerformIO (loadHyphenator Romansh) 426russian = unsafePerformIO (loadHyphenator Russian) 427sanskrit = unsafePerformIO (loadHyphenator Sanskrit) 428serbian_Cyrillic = unsafePerformIO (loadHyphenator Serbian_Cyrillic) 429serbocroatian_Cyrillic = unsafePerformIO (loadHyphenator Serbocroatian_Cyrillic) 430serbocroatian_Latin = unsafePerformIO (loadHyphenator Serbocroatian_Latin) 431slovak = unsafePerformIO (loadHyphenator Slovak) 432slovenian = unsafePerformIO (loadHyphenator Slovenian) 433spanish = unsafePerformIO (loadHyphenator Spanish) 434swedish = unsafePerformIO (loadHyphenator Swedish) 435tamil = unsafePerformIO (loadHyphenator Tamil) 436telugu = unsafePerformIO (loadHyphenator Telugu) 437thai = unsafePerformIO (loadHyphenator Thai) 438turkish = unsafePerformIO (loadHyphenator Turkish) 439turkmen = unsafePerformIO (loadHyphenator Turkmen) 440ukrainian = unsafePerformIO (loadHyphenator Ukrainian) 441uppersorbian = unsafePerformIO (loadHyphenator Uppersorbian) 442welsh = unsafePerformIO (loadHyphenator Welsh) 443 444-- | Load (and cache) the hyphenator for a given language. 445languageHyphenator :: Language -> Hyphenator 446languageHyphenator s = case s of 447 Afrikaans -> afrikaans 448 Armenian -> armenian 449 Assamese -> assamese 450 Basque -> basque 451 Bengali -> bengali 452 Bulgarian -> bulgarian 453 Catalan -> catalan 454 Chinese -> chinese 455 Coptic -> coptic 456 Croatian -> croatian 457 Czech -> czech 458 Danish -> danish 459 Dutch -> dutch 460 English_US -> english_US 461 English_GB -> english_GB 462 Esperanto -> esperanto 463 Estonian -> estonian 464 Ethiopic -> ethiopic 465 -- Farsi -> farsi 466 Finnish -> finnish 467 French -> french 468 Friulan -> friulan 469 Galician -> galician 470 Georgian -> georgian 471 German_1901 -> german_1901 472 German_1996 -> german_1996 473 German_Swiss -> german_Swiss 474 Greek_Ancient -> greek_Ancient 475 Greek_Mono -> greek_Mono 476 Greek_Poly -> greek_Poly 477 Gujarati -> gujarati 478 Hindi -> hindi 479 Hungarian -> hungarian 480 Icelandic -> icelandic 481 Indonesian -> indonesian 482 Interlingua -> interlingua 483 Irish -> irish 484 Italian -> italian 485 Kannada -> kannada 486 Kurmanji -> kurmanji 487 Latin -> latin 488 Latin_Classic -> latin_Classic 489 Latvian -> latvian 490 Lithuanian -> lithuanian 491 Malayalam -> malayalam 492 Marathi -> marathi 493 Mongolian -> mongolian 494 Norwegian_Bokmal -> norwegian_Bokmal 495 Norwegian_Nynorsk -> norwegian_Nynorsk 496 Occitan -> occitan 497 Oriya -> oriya 498 Panjabi -> panjabi 499 Piedmontese -> piedmontese 500 Polish -> polish 501 Portuguese -> portuguese 502 Romanian -> romanian 503 Romansh -> romansh 504 Russian -> russian 505 Sanskrit -> sanskrit 506 Serbian_Cyrillic -> serbian_Cyrillic 507 Serbocroatian_Cyrillic -> serbocroatian_Cyrillic 508 Serbocroatian_Latin -> serbocroatian_Latin 509 Slovak -> slovak 510 Slovenian -> slovenian 511 Spanish -> spanish 512 Swedish -> swedish 513 Tamil -> tamil 514 Telugu -> telugu 515 Thai -> thai 516 Turkish -> turkish 517 Turkmen -> turkmen 518 Ukrainian -> ukrainian 519 Uppersorbian -> uppersorbian 520 Welsh -> welsh 521