1{-# LANGUAGE CPP #-}
2#if __GLASGOW_HASKELL__ >= 702
3{-# LANGUAGE Trustworthy #-}
4#endif
5#if EMBED
6{-# LANGUAGE TemplateHaskell #-}
7#endif
8-----------------------------------------------------------------------------
9-- |
10-- Module      :  Text.Hyphenation.Language
11-- Copyright   :  (C) 2012-2019 Edward Kmett,
12--                (C) 2007 Ned Batchelder
13-- License     :  BSD-style (see the languageAffix LICENSE)
14--
15-- Maintainer  :  Edward Kmett <ekmett@gmail.com>
16-- Stability   :  provisional
17-- Portability :  portable
18--
19----------------------------------------------------------------------------
20module Text.Hyphenation.Language
21  (
22  -- * Pattern file support
23    Language(..)
24  , languageHyphenator
25  -- * Provided language hyphenators
26  , afrikaans, armenian, assamese, basque, bengali, bulgarian, catalan, chinese
27  , coptic, croatian, czech, danish, dutch, english_US, english_GB, esperanto
28  , estonian, ethiopic, {- farsi, -} finnish, french, friulan, galician, georgian, german_1901, german_1996
29  , german_Swiss, greek_Ancient, greek_Mono, greek_Poly, gujarati, hindi, hungarian
30  , icelandic, indonesian, interlingua, irish, italian, kannada, kurmanji, latin, latin_Classic
31  , latvian, lithuanian, malayalam, marathi, mongolian, norwegian_Bokmal
32  , norwegian_Nynorsk, occitan, oriya, panjabi, piedmontese, polish, portuguese, romanian, romansh
33  , russian, sanskrit, serbian_Cyrillic, serbocroatian_Cyrillic
34  , serbocroatian_Latin, slovak, slovenian, spanish, swedish, tamil
35  , telugu, thai, turkish, turkmen, ukrainian, uppersorbian, welsh
36  , loadHyphenator
37  , languageAffix
38  ) where
39
40import Codec.Compression.GZip
41#if __GLASGOW_HASKELL__ < 710
42import Data.Functor ((<$>))
43#endif
44import qualified Data.IntMap as IM
45import qualified Data.Text as T
46import qualified Data.Text.Encoding as T
47import Text.Hyphenation.ByteStringLazyCompat as Lazy
48import Text.Hyphenation.Hyphenator
49import Text.Hyphenation.Pattern
50import Text.Hyphenation.Exception
51import System.IO.Unsafe
52
53#if !EMBED
54import Paths_hyphenation
55#else
56import Data.FileEmbed
57import qualified Data.ByteString.Char8 as Strict
58
59hyphenatorFiles :: [(FilePath, Strict.ByteString)]
60hyphenatorFiles = $(embedDir "data")
61#endif
62
63-- $setup
64-- >>> import Text.Hyphenation.Hyphenator
65
66chrLine :: String -> [(Int, Char)]
67chrLine (x:xs) = fmap (\y -> (fromEnum y, x)) xs
68chrLine [] = []
69
70-- | Read a built-in language file from the data directory where cabal installed this package.
71--
72-- (e.g. @hyphenateLanguage \"en-us\"@ opens @\"\/Users\/ekmett\/.cabal\/share\/hyphenation-0.2\/ghc-7.4.1\/hyph-en-us.hyp.txt\"@
73-- among others when run on the author's local machine)
74loadHyphenator :: Language -> IO Hyphenator
75#if !EMBED
76loadHyphenator language = do
77  let affix = languageAffix language
78  hyp <- unzipUtf8 <$> (getDataFileName ("hyph-" ++ affix ++ ".hyp.txt.gz") >>= Lazy.readFile)
79  pat <- unzipUtf8 <$> (getDataFileName ("hyph-" ++ affix ++ ".pat.txt.gz") >>= Lazy.readFile)
80  chr <- unzipUtf8 <$> (getDataFileName ("hyph-" ++ affix ++ ".chr.txt.gz") >>= Lazy.readFile)
81  let chrMap = IM.fromList (Prelude.lines chr >>= chrLine)
82      tryLookup x = IM.findWithDefault x (fromEnum x) chrMap
83      (defaultLeftMin, defaultRightMin) = languageMins language
84  return $ Hyphenator tryLookup (parsePatterns pat) (parseExceptions hyp) defaultLeftMin defaultRightMin
85#else
86loadHyphenator language = return $ Hyphenator tryLookup (parsePatterns pat) (parseExceptions hyp) defaultLeftMin defaultRightMin
87  where affix = languageAffix language
88        Just hyp = unzipUtf8 . Lazy.fromStrict <$> lookup ("hyph-" ++ affix ++ ".hyp.txt.gz") hyphenatorFiles
89        Just pat = unzipUtf8 . Lazy.fromStrict <$> lookup ("hyph-" ++ affix ++ ".pat.txt.gz") hyphenatorFiles
90        Just chr = unzipUtf8 . Lazy.fromStrict <$> lookup ("hyph-" ++ affix ++ ".chr.txt.gz") hyphenatorFiles
91        chrMap = IM.fromList (Prelude.lines chr >>= chrLine)
92        (defaultLeftMin, defaultRightMin) = languageMins language
93        tryLookup x = IM.findWithDefault x (fromEnum x) chrMap
94#endif
95
96unzipUtf8 :: ByteString -> String
97unzipUtf8 =
98  T.unpack . T.decodeUtf8With (\ _ -> fmap (toEnum . fromEnum))
99  . Lazy.toStrict . decompress
100
101-- | A strongly typed set of available languages you can use for hyphenation.
102data Language
103  = Afrikaans
104  | Armenian
105  | Assamese
106  | Basque
107  | Bengali
108  | Bulgarian
109  | Catalan
110  | Chinese
111  | Coptic
112  | Croatian
113  | Czech
114  | Danish
115  | Dutch
116  | English_US | English_GB
117  | Esperanto
118  | Estonian
119  | Ethiopic
120  -- | Farsi
121  | Finnish
122  | French
123  | Friulan
124  | Galician
125  | Georgian
126  | German_1901 | German_1996 | German_Swiss
127  | Greek_Ancient
128  | Greek_Mono
129  | Greek_Poly
130  | Gujarati
131  | Hindi
132  | Hungarian
133  | Icelandic
134  | Indonesian
135  | Interlingua
136  | Irish
137  | Italian
138  | Kannada
139  | Kurmanji
140  | Latin
141  | Latin_Classic
142  | Latvian
143  | Lithuanian
144  | Malayalam
145  | Marathi
146  | Mongolian
147  | Norwegian_Bokmal | Norwegian_Nynorsk
148  | Occitan
149  | Oriya
150  | Panjabi
151  | Piedmontese
152  | Polish
153  | Portuguese
154  | Romanian
155  | Romansh
156  | Russian
157  | Sanskrit
158  | Serbian_Cyrillic
159  | Serbocroatian_Cyrillic | Serbocroatian_Latin
160  | Slovak
161  | Slovenian
162  | Spanish
163  | Swedish
164  | Tamil
165  | Telugu
166  | Thai
167  | Turkish
168  | Turkmen
169  | Ukrainian
170  | Uppersorbian
171  | Welsh
172  deriving (Eq,Ord,Show,Bounded,Enum)
173
174
175-- | the infix portion of the data file names used for this language
176languageAffix :: Language -> String
177languageAffix s = case s of
178  Afrikaans -> "af"
179  Armenian -> "hy"
180  Assamese -> "as"
181  Basque -> "eu"
182  Bengali -> "bn"
183  Bulgarian -> "bg"
184  Catalan -> "ca"
185  Chinese -> "zh-latn-pinyin"
186  Coptic -> "cop"
187  Croatian -> "hr"
188  Czech -> "cs"
189  Danish -> "da"
190  Dutch -> "nl"
191  English_US -> "en-us"
192  English_GB -> "en-gb"
193  Esperanto -> "eo"
194  Estonian -> "et"
195  Ethiopic -> "mul-ethi"
196  -- Farsi -> "fa"
197  Finnish -> "fi"
198  French -> "fr"
199  Friulan -> "fur"
200  Galician -> "gl"
201  Georgian -> "ka"
202  German_1901  -> "de-1901"
203  German_1996  -> "de-1996"
204  German_Swiss -> "de-ch-1901"
205  Greek_Ancient -> "grc"
206  Greek_Mono -> "el-monoton"
207  Greek_Poly -> "el-polyton"
208  Gujarati -> "gu"
209  Hindi -> "hi"
210  Hungarian -> "hu"
211  Icelandic -> "is"
212  Indonesian -> "id"
213  Interlingua -> "ia"
214  Irish -> "ga"
215  Italian -> "it"
216  Kannada -> "kn"
217  Kurmanji -> "kmr"
218  Latin -> "la"
219  Latin_Classic -> "la-x-classic"
220  Latvian -> "lv"
221  Lithuanian -> "lt"
222  Malayalam -> "ml"
223  Marathi -> "mr"
224  Mongolian -> "mn-cyrl"
225  Norwegian_Bokmal  -> "nb"
226  Norwegian_Nynorsk -> "nn"
227  Occitan -> "oc"
228  Oriya -> "or"
229  Panjabi -> "pa"
230  Piedmontese -> "pms"
231  Polish -> "pl"
232  Portuguese -> "pt"
233  Romanian -> "ro"
234  Romansh -> "rm"
235  Russian -> "ru"
236  Sanskrit -> "sa"
237  Serbian_Cyrillic -> "sr-cyrl"
238  Serbocroatian_Cyrillic -> "sh-cyrl"
239  Serbocroatian_Latin -> "sh-latn"
240  Slovak -> "sk"
241  Slovenian -> "sl"
242  Spanish -> "es"
243  Swedish -> "sv"
244  Tamil -> "ta"
245  Telugu -> "te"
246  Thai -> "th"
247  Turkish -> "tr"
248  Turkmen -> "tk"
249  Ukrainian -> "uk"
250  Uppersorbian -> "hsb"
251  Welsh -> "cy"
252
253
254-- | The number of characters from the beginning and end of a word not to hyphenate in this language.
255languageMins :: Language -> (Int, Int)
256languageMins s = case s of
257  Afrikaans -> (1, 2)
258  Armenian -> (1, 2)
259  Assamese -> (1, 1)
260  Basque -> (2, 2)
261  Bengali -> (1, 1)
262  Bulgarian -> (2, 2)
263  Catalan -> (2, 2)
264  Chinese -> (1, 1)
265  Coptic -> (1, 1)
266  Croatian -> (2, 2)
267  Czech -> (2, 3)
268  Danish -> (2, 2)
269  Dutch -> (2, 2)
270  English_GB -> (2, 3)
271  English_US -> (2, 3)
272  Esperanto -> (2, 2)
273  Estonian -> (2, 3)
274  Ethiopic -> (1, 1)
275  -- Farsi -> (,)
276  Finnish -> (2, 2)
277  French -> (2, 3)
278  Friulan -> (2, 2)
279  Galician -> (2, 2)
280  Georgian -> (1, 2)
281  German_1901 -> (2, 2)
282  German_1996 -> (2, 2)
283  German_Swiss -> (2, 2)
284  Greek_Ancient -> (1, 1)
285  Greek_Mono -> (1, 1)
286  Greek_Poly -> (1, 1)
287  Gujarati -> (1, 1)
288  Hindi -> (1, 1)
289  Hungarian -> (2, 2)
290  Icelandic -> (2, 2)
291  Indonesian -> (2, 2)
292  Interlingua -> (2, 2)
293  Irish -> (2, 3)
294  Italian -> (2, 2)
295  Kannada -> (1, 1)
296  Kurmanji -> (2, 2)
297  Latin -> (2, 2)
298  Latin_Classic -> (2, 2)
299  Latvian -> (2, 2)
300  Lithuanian -> (2, 2)
301  Malayalam -> (1, 1)
302  Marathi -> (1, 1)
303  Mongolian -> (2, 2)
304  Norwegian_Bokmal -> (2, 2)
305  Norwegian_Nynorsk -> (2, 2)
306  Occitan -> (2, 2)
307  Oriya -> (1, 1)
308  Panjabi -> (1, 1)
309  Piedmontese -> (2, 2)
310  Polish -> (2, 2)
311  Portuguese -> (2, 3)
312  Romanian -> (2, 2)
313  Romansh -> (2, 2)
314  Russian -> (2, 2)
315  Sanskrit -> (1, 3)
316  Serbian_Cyrillic -> (2, 2)
317  Serbocroatian_Cyrillic -> (2, 2)
318  Serbocroatian_Latin -> (2, 2)
319  Slovak -> (2, 3)
320  Slovenian -> (2, 2)
321  Spanish -> (2, 2)
322  Swedish -> (2, 2)
323  Tamil -> (1, 1)
324  Telugu -> (1, 1)
325  Thai -> (2, 3)
326  Turkish -> (2, 2)
327  Turkmen -> (2, 2)
328  Ukrainian -> (2, 2)
329  Uppersorbian -> (2, 2)
330  Welsh -> (2, 3)
331
332
333-- |
334-- >>> hyphenate english_US "supercalifragilisticexpialadocious"
335-- ["su","per","cal","ifrag","ilis","tic","ex","pi","al","ado","cious"]
336--
337-- favors US hyphenation
338english_US :: Hyphenator
339
340-- |
341-- >>> hyphenate english_GB "supercalifragilisticexpialadocious"
342-- ["su","per","cal","i","fra","gil","istic","ex","pi","alado","cious"]
343--
344-- favors UK hyphenation
345english_GB :: Hyphenator
346
347-- |
348-- >>> hyphenate french "anticonstitutionnellement"
349-- ["an","ti","cons","ti","tu","tion","nel","le","ment"]
350french :: Hyphenator
351
352-- |
353-- >>> hyphenate icelandic "va\240lahei\240avegavinnuverkf\230rageymslusk\250r"
354-- ["va\240la","hei\240a","vega","vinnu","verk","f\230ra","geymslu","sk\250r"]
355icelandic :: Hyphenator
356
357-- | Hyphenators for a wide array of languages.
358afrikaans, armenian, assamese, basque, bengali, bulgarian, catalan, chinese,
359 coptic, croatian, czech, danish, dutch, esperanto,
360 estonian, ethiopic, {- farsi, -} finnish, friulan, galician, georgian, german_1901, german_1996,
361 german_Swiss, greek_Ancient, greek_Mono, greek_Poly, gujarati, hindi, hungarian,
362 indonesian, interlingua, irish, italian, kannada, kurmanji, latin, latin_Classic,
363 latvian, lithuanian, malayalam, marathi, mongolian, norwegian_Bokmal,
364 norwegian_Nynorsk, occitan, oriya, panjabi, piedmontese, polish, portuguese, romanian,
365 romansh, russian, sanskrit, serbian_Cyrillic, serbocroatian_Cyrillic,
366 serbocroatian_Latin, slovak, slovenian, spanish, swedish, tamil,
367 telugu, thai, turkish, turkmen, ukrainian, uppersorbian, welsh :: Hyphenator
368
369afrikaans = unsafePerformIO (loadHyphenator Afrikaans)
370armenian = unsafePerformIO (loadHyphenator Armenian)
371assamese = unsafePerformIO (loadHyphenator Assamese)
372basque = unsafePerformIO (loadHyphenator Basque)
373bengali = unsafePerformIO (loadHyphenator Bengali)
374bulgarian = unsafePerformIO (loadHyphenator Bulgarian)
375catalan = unsafePerformIO (loadHyphenator Catalan)
376chinese = unsafePerformIO (loadHyphenator Chinese)
377coptic = unsafePerformIO (loadHyphenator Coptic)
378croatian = unsafePerformIO (loadHyphenator Croatian)
379czech = unsafePerformIO (loadHyphenator Czech)
380danish = unsafePerformIO (loadHyphenator Danish)
381dutch = unsafePerformIO (loadHyphenator Dutch)
382english_US = unsafePerformIO (loadHyphenator English_US)
383english_GB = unsafePerformIO (loadHyphenator English_GB)
384esperanto = unsafePerformIO (loadHyphenator Esperanto)
385estonian = unsafePerformIO (loadHyphenator Estonian)
386ethiopic = unsafePerformIO (loadHyphenator Ethiopic)
387-- farsi = unsafePerformIO (loadHyphenator Farsi)
388finnish = unsafePerformIO (loadHyphenator Finnish)
389french = unsafePerformIO (loadHyphenator French)
390friulan = unsafePerformIO (loadHyphenator Friulan)
391galician = unsafePerformIO (loadHyphenator Galician)
392georgian = unsafePerformIO (loadHyphenator Georgian)
393german_1901 = unsafePerformIO (loadHyphenator German_1901)
394german_1996 = unsafePerformIO (loadHyphenator German_1996)
395german_Swiss = unsafePerformIO (loadHyphenator German_Swiss)
396greek_Ancient = unsafePerformIO (loadHyphenator Greek_Ancient)
397greek_Mono = unsafePerformIO (loadHyphenator Greek_Mono)
398greek_Poly = unsafePerformIO (loadHyphenator Greek_Poly)
399gujarati = unsafePerformIO (loadHyphenator Gujarati)
400hindi = unsafePerformIO (loadHyphenator Hindi)
401hungarian = unsafePerformIO (loadHyphenator Hungarian)
402icelandic = unsafePerformIO (loadHyphenator Icelandic)
403indonesian = unsafePerformIO (loadHyphenator Indonesian)
404interlingua = unsafePerformIO (loadHyphenator Interlingua)
405irish = unsafePerformIO (loadHyphenator Irish)
406italian = unsafePerformIO (loadHyphenator Italian)
407kannada = unsafePerformIO (loadHyphenator Kannada)
408kurmanji = unsafePerformIO (loadHyphenator Kurmanji)
409latin = unsafePerformIO (loadHyphenator Latin)
410latin_Classic = unsafePerformIO (loadHyphenator Latin_Classic)
411latvian = unsafePerformIO (loadHyphenator Latvian)
412lithuanian = unsafePerformIO (loadHyphenator Lithuanian)
413malayalam = unsafePerformIO (loadHyphenator Malayalam)
414marathi = unsafePerformIO (loadHyphenator Marathi)
415mongolian = unsafePerformIO (loadHyphenator Mongolian)
416norwegian_Bokmal = unsafePerformIO (loadHyphenator Norwegian_Bokmal)
417norwegian_Nynorsk = unsafePerformIO (loadHyphenator Norwegian_Nynorsk)
418occitan = unsafePerformIO (loadHyphenator Occitan)
419oriya = unsafePerformIO (loadHyphenator Oriya)
420panjabi = unsafePerformIO (loadHyphenator Panjabi)
421piedmontese = unsafePerformIO (loadHyphenator Piedmontese)
422polish = unsafePerformIO (loadHyphenator Polish)
423portuguese = unsafePerformIO (loadHyphenator Portuguese)
424romanian = unsafePerformIO (loadHyphenator Romanian)
425romansh = unsafePerformIO (loadHyphenator Romansh)
426russian = unsafePerformIO (loadHyphenator Russian)
427sanskrit = unsafePerformIO (loadHyphenator Sanskrit)
428serbian_Cyrillic = unsafePerformIO (loadHyphenator Serbian_Cyrillic)
429serbocroatian_Cyrillic = unsafePerformIO (loadHyphenator Serbocroatian_Cyrillic)
430serbocroatian_Latin = unsafePerformIO (loadHyphenator Serbocroatian_Latin)
431slovak = unsafePerformIO (loadHyphenator Slovak)
432slovenian = unsafePerformIO (loadHyphenator Slovenian)
433spanish = unsafePerformIO (loadHyphenator Spanish)
434swedish = unsafePerformIO (loadHyphenator Swedish)
435tamil = unsafePerformIO (loadHyphenator Tamil)
436telugu = unsafePerformIO (loadHyphenator Telugu)
437thai = unsafePerformIO (loadHyphenator Thai)
438turkish = unsafePerformIO (loadHyphenator Turkish)
439turkmen = unsafePerformIO (loadHyphenator Turkmen)
440ukrainian = unsafePerformIO (loadHyphenator Ukrainian)
441uppersorbian = unsafePerformIO (loadHyphenator Uppersorbian)
442welsh = unsafePerformIO (loadHyphenator Welsh)
443
444-- | Load (and cache) the hyphenator for a given language.
445languageHyphenator :: Language -> Hyphenator
446languageHyphenator s = case s of
447  Afrikaans -> afrikaans
448  Armenian -> armenian
449  Assamese -> assamese
450  Basque -> basque
451  Bengali -> bengali
452  Bulgarian -> bulgarian
453  Catalan -> catalan
454  Chinese -> chinese
455  Coptic -> coptic
456  Croatian -> croatian
457  Czech -> czech
458  Danish -> danish
459  Dutch -> dutch
460  English_US -> english_US
461  English_GB -> english_GB
462  Esperanto -> esperanto
463  Estonian -> estonian
464  Ethiopic -> ethiopic
465  -- Farsi -> farsi
466  Finnish -> finnish
467  French -> french
468  Friulan -> friulan
469  Galician -> galician
470  Georgian -> georgian
471  German_1901  -> german_1901
472  German_1996  -> german_1996
473  German_Swiss -> german_Swiss
474  Greek_Ancient -> greek_Ancient
475  Greek_Mono -> greek_Mono
476  Greek_Poly -> greek_Poly
477  Gujarati -> gujarati
478  Hindi -> hindi
479  Hungarian -> hungarian
480  Icelandic -> icelandic
481  Indonesian -> indonesian
482  Interlingua -> interlingua
483  Irish -> irish
484  Italian -> italian
485  Kannada -> kannada
486  Kurmanji -> kurmanji
487  Latin -> latin
488  Latin_Classic -> latin_Classic
489  Latvian -> latvian
490  Lithuanian -> lithuanian
491  Malayalam -> malayalam
492  Marathi -> marathi
493  Mongolian -> mongolian
494  Norwegian_Bokmal  -> norwegian_Bokmal
495  Norwegian_Nynorsk -> norwegian_Nynorsk
496  Occitan -> occitan
497  Oriya -> oriya
498  Panjabi -> panjabi
499  Piedmontese -> piedmontese
500  Polish -> polish
501  Portuguese -> portuguese
502  Romanian -> romanian
503  Romansh -> romansh
504  Russian -> russian
505  Sanskrit -> sanskrit
506  Serbian_Cyrillic -> serbian_Cyrillic
507  Serbocroatian_Cyrillic -> serbocroatian_Cyrillic
508  Serbocroatian_Latin -> serbocroatian_Latin
509  Slovak -> slovak
510  Slovenian -> slovenian
511  Spanish -> spanish
512  Swedish -> swedish
513  Tamil -> tamil
514  Telugu -> telugu
515  Thai -> thai
516  Turkish -> turkish
517  Turkmen -> turkmen
518  Ukrainian -> ukrainian
519  Uppersorbian -> uppersorbian
520  Welsh -> welsh
521