1 // IMPORTANT : when making changes in language detection logic and per-language
2 // rules here, be sure to also bump FORMATTING_VERSION_ID in src/lvtinydom.cpp
3 
4 #include "../include/textlang.h"
5 #include "../include/hyphman.h"
6 #include "../include/lvtinydom.h"
7 #include "../include/fb2def.h"
8 #include "../include/crlog.h"
9 
10 // Uncomment to see which lang_tags are seen and lang_cfg created
11 // #define DEBUG_LANG_USAGE
12 
// Helper macros turning a parenthesised sequence of prefixes (no comma
// between the items!) into a chain of startsWith() tests on the variable
// named `lang_tag` that must be in scope where the macro is used:
//     LANG_STARTS_WITH(("fr") ("es"))
// expands to:
//     lang_tag.startsWith("fr") || lang_tag.startsWith("es") || false
// (technique from https://stackoverflow.com/questions/19680962/translate-sequence-in-macro-parameters-to-separate-macros )

// Chain terminators: the alternating macro name left dangling at the end
// of the sequence gets "_END" pasted onto it, and becomes the final "false".
#define LANG_STARTS_WITH_EACH_1_END false
#define LANG_STARTS_WITH_EACH_2_END false

// Each of these two consumes one "(prefix)" item and leaves the name of the
// other one in front of the next item (a macro can not re-invoke itself).
#define LANG_STARTS_WITH_EACH_1(...) lang_tag.startsWith(__VA_ARGS__) || LANG_STARTS_WITH_EACH_2
#define LANG_STARTS_WITH_EACH_2(...) lang_tag.startsWith(__VA_ARGS__) || LANG_STARTS_WITH_EACH_1

// Two levels, so the sequence is fully expanded before "_END" gets pasted.
#define PRIMITIVE_SEQ_ITERATE(...) __VA_ARGS__ ## _END
#define SEQ_ITERATE(...) PRIMITIVE_SEQ_ITERATE(__VA_ARGS__)

// Public entry point
#define LANG_STARTS_WITH(seq) SEQ_ITERATE(LANG_STARTS_WITH_EACH_1 seq)
23 
// Known hyphenation dictionaries: lang tag, dictionary filename prefix,
// dictionary filename, and left/right hyphenation minimums.
// (hyph_filename_prefix added because CoolReader may still have both
// current "Italian.pattern" and old "Italian_hyphen_(Alan).pdb".)
// (Romanian and Ukrainian have the prefix truncated because previous
// pattern files, still in CoolReader, had these truncated names.)
//
// Ordering matters: this table is scanned from the top, and lookups by
// dictionary id use the first entry whose hyph_filename_prefix is a prefix
// of the id. So, when one prefix is itself the start of another one, the
// longest must come first ("English_GB" before "English_US"/"en",
// "Latin_liturgical" before "Latin", "Romansh" before the truncated "Roman"...)
static struct {
    const char * lang_tag;              // lang tag this dictionary is used for
    const char * hyph_filename_prefix;  // start of the dictionary filename/id
    const char * hyph_filename;         // current pattern filename
    int left_hyphen_min;                // hyphenation minimum on the left side
    int right_hyphen_min;               // hyphenation minimum on the right side
} _hyph_dict_table[] = {
    { "hy",    "Armenian",      "Armenian.pattern",      1, 2 },
    { "eu",    "Basque",        "Basque.pattern",        2, 2 },
    { "bg",    "Bulgarian",     "Bulgarian.pattern",     2, 2 },
    { "ca",    "Catalan",       "Catalan.pattern",       2, 2 },
    { "cs",    "Czech",         "Czech.pattern",         2, 3 },
    { "da",    "Danish",        "Danish.pattern",        2, 2 },
    { "nl",    "Dutch",         "Dutch.pattern",         2, 2 },
    { "en-GB", "English_GB",    "English_GB.pattern",    2, 3 },
    { "en",    "English_US",    "English_US.pattern",    2, 3 },
    { "eo",    "Esperanto",     "Esperanto.pattern",     2, 2 },
    { "et",    "Estonian",      "Estonian.pattern",      2, 3 },
    { "fi",    "Finnish",       "Finnish.pattern",       2, 2 },
    { "fr",    "French",        "French.pattern",        2, 1 }, // see French.pattern file for why right_hyphen_min=1
    { "fur",   "Friulian",      "Friulian.pattern",      2, 2 },
    { "gl",    "Galician",      "Galician.pattern",      2, 2 },
    { "ka",    "Georgian",      "Georgian.pattern",      1, 2 },
    { "de",    "German",        "German.pattern",        2, 2 },
    { "el",    "Greek",         "Greek.pattern",         1, 1 },
    { "hr",    "Croatian",      "Croatian.pattern",      2, 2 },
    { "hu",    "Hungarian",     "Hungarian.pattern",     2, 2 },
    { "is",    "Icelandic",     "Icelandic.pattern",     2, 2 },
    { "ga",    "Irish",         "Irish.pattern",         2, 3 },
    { "it",    "Italian",       "Italian.pattern",       2, 2 },
    { "la-lit","Latin_liturgical","Latin_liturgical.pattern",2, 2 },
    { "la",    "Latin",         "Latin.pattern",         2, 2 },
    { "lv",    "Latvian",       "Latvian.pattern",       2, 2 },
    { "lt",    "Lithuanian",    "Lithuanian.pattern",    2, 2 },
    { "mk",    "Macedonian",    "Macedonian.pattern",    2, 2 },
    { "no",    "Norwegian",     "Norwegian.pattern",     2, 2 },
    { "oc",    "Occitan",       "Occitan.pattern",       2, 2 },
    { "pms",   "Piedmontese",   "Piedmontese.pattern",   2, 2 },
    { "pl",    "Polish",        "Polish.pattern",        2, 2 },
    { "pt-BR", "Portuguese_BR", "Portuguese_BR.pattern", 2, 3 },
    { "pt",    "Portuguese",    "Portuguese.pattern",    2, 3 },
    // "Romansh" must stay before "Roman": the truncated Romanian prefix
    // is also the start of "Romansh.pattern".
    { "rm",    "Romansh",       "Romansh.pattern",       2, 2 },
    { "ro",    "Roman",         "Romanian.pattern",      2, 2 }, // truncated prefix (see above)
    { "ru-GB", "Russian_EnGB",  "Russian_EnGB.pattern",  2, 2 },
    { "ru-US", "Russian_EnUS",  "Russian_EnUS.pattern",  2, 2 },
    { "ru",    "Russian",       "Russian.pattern",       2, 2 },
    { "sr",    "Serbian",       "Serbian.pattern",       2, 2 },
    { "sk",    "Slovak",        "Slovak.pattern",        2, 3 },
    { "sl",    "Slovenian",     "Slovenian.pattern",     2, 2 },
    { "es",    "Spanish",       "Spanish.pattern",       2, 2 },
    { "sv",    "Swedish",       "Swedish.pattern",       2, 2 },
    { "tr",    "Turkish",       "Turkish.pattern",       2, 2 },
    { "uk",    "Ukrain",        "Ukrainian.pattern",     2, 2 }, // truncated prefix (see above)
    { "cy",    "Welsh",         "Welsh.pattern",         2, 3 },
    { "zu",    "Zulu",          "Zulu.pattern",          2, 1 }, // defaulting to 2,1, left hyphenmin might need tweaking
    // No-lang hyph methods, for legacy HyphMan methods: other lang properties will be from English
    { "en#@none",        "@none",        "@none",        2, 2 },
    { "en#@softhyphens", "@softhyphens", "@softhyphens", 2, 2 },
    { "en#@algorithm",   "@algorithm",   "@algorithm",   2, 2 },
    // End of table marker
    { NULL, NULL, NULL, 0, 0 }
};
89 
90 // Init global TextLangMan members
91 lString32 TextLangMan::_main_lang = TEXTLANG_DEFAULT_MAIN_LANG_32;
92 bool TextLangMan::_embedded_langs_enabled = TEXTLANG_DEFAULT_EMBEDDED_LANGS_ENABLED;
93 LVPtrVector<TextLangCfg> TextLangMan::_lang_cfg_list;
94 
95 bool TextLangMan::_hyphenation_enabled = TEXTLANG_DEFAULT_HYPHENATION_ENABLED;
96 bool TextLangMan::_hyphenation_soft_hyphens_only = TEXTLANG_DEFAULT_HYPH_SOFT_HYPHENS_ONLY;
97 bool TextLangMan::_hyphenation_force_algorithmic = TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC;
98 bool TextLangMan::_overridden_hyph_method =   !TEXTLANG_DEFAULT_HYPHENATION_ENABLED
99                                             || TEXTLANG_DEFAULT_HYPH_SOFT_HYPHENS_ONLY
100                                             || TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC ;
101 // These will be set when we can
102 HyphMethod * TextLangMan::_no_hyph_method = NULL;
103 HyphMethod * TextLangMan::_algo_hyph_method = NULL;
104 HyphMethod * TextLangMan::_soft_hyphens_method = NULL;
105 
TextLangMan()106 TextLangMan::TextLangMan() {
107 }
108 
~TextLangMan()109 TextLangMan::~TextLangMan() {
110 }
111 
getHash()112 lUInt32 TextLangMan::getHash() {
113     lUInt32 hash = _main_lang.getHash();
114     hash = hash << 4;
115     hash = hash + (_embedded_langs_enabled << 3);
116     hash = hash + (_hyphenation_soft_hyphens_only << 2);
117     hash = hash + (_hyphenation_force_algorithmic << 1);
118     hash = hash + _hyphenation_enabled;
119     // printf("TextLangMan::getHash %x\n", hash);
120     return hash;
121 }
122 
123 // No need to explicitely call this in frontend code.
124 // Calling HyphMan::uninit() will have this one called.
uninit()125 void TextLangMan::uninit() {
126     _lang_cfg_list.clear();
127 }
128 
129 // For HyphMan legacy methods
setMainLangFromHyphDict(lString32 id)130 void TextLangMan::setMainLangFromHyphDict( lString32 id ) {
131     // When setting up TextlangMan thru HyphMan legacy methods,
132     // disable embedded langs, for a consistent hyphenation.
133     TextLangMan::setEmbeddedLangsEnabled( false );
134     // Update flags if asked for @none, @softhyphens or @algorithm
135     TextLangMan::setHyphenationEnabled( id != HYPH_DICT_ID_NONE );
136     TextLangMan::setHyphenationSoftHyphensOnly( id == HYPH_DICT_ID_SOFTHYPHENS );
137     TextLangMan::setHyphenationForceAlgorithmic( id == HYPH_DICT_ID_ALGORITHM );
138 
139     for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) {
140         if ( id.startsWith( _hyph_dict_table[i].hyph_filename_prefix ) ) {
141             TextLangMan::setMainLang( lString32(_hyph_dict_table[i].lang_tag) );
142             #ifdef DEBUG_LANG_USAGE
143             printf("TextLangMan::setMainLangFromHyphDict %s => %s\n",
144                 UnicodeToLocal(id).c_str(), UnicodeToLocal(TextLangMan::getMainLang()).c_str());
145             #endif
146             return;
147         }
148     }
149     CRLog::warn("lang not found for hyphenation dict: %s\n", UnicodeToLocal(id).c_str());
150 }
151 
152 // Used only by TextLangCfg
getHyphMethodForLang(lString32 lang_tag)153 HyphMethod * TextLangMan::getHyphMethodForLang( lString32 lang_tag ) {
154     // Look for full lang_tag
155 #if 1
156     // CoolReader use dynamically loaded hyphenation dictionaries (at startup)
157     HyphDictionaryList* dictList = HyphMan::getDictList();
158     HyphDictionary* dict;
159     lString32 dict_lang_tag;
160     lang_tag.lowercase();
161     int left_hyphen_min = 2;
162     int right_hyphen_min = 3;
163     for (int i = 0; i < dictList->length(); i++) {
164         dict = dictList->get(i);
165         if (dict) {
166             if (dict->getType() == HDT_DICT_ALAN || dict->getType() == HDT_DICT_TEX)
167                 dict_lang_tag = TextLangMan::getLangTag(dict->getTitle());      // for dictionary's files
168             else
169                 dict_lang_tag = TextLangMan::getLangTag(dict->getId());         // for default dictionaries
170             dict_lang_tag.lowercase();
171             if (lang_tag == dict_lang_tag) {
172                 for (int j=0; _hyph_dict_table[j].lang_tag!=NULL; j++) {
173                     if ( lang_tag == lString32(_hyph_dict_table[j].lang_tag).lowercase() ) {
174                         left_hyphen_min = _hyph_dict_table[j].left_hyphen_min;
175                         right_hyphen_min = _hyph_dict_table[j].right_hyphen_min;
176                         break;
177                     }
178                 }
179                 return HyphMan::getHyphMethodForDictionary( dict->getId(), left_hyphen_min, right_hyphen_min );
180             }
181         }
182     }
183     // Look for lang_tag initial subpart
184     int m_pos = lang_tag.pos("-");
185     if ( m_pos > 0 ) {
186         lString32 lang_tag2 = lang_tag.substr(0, m_pos);
187         lang_tag2.lowercase();
188         for (int i = 0; i < dictList->length(); i++) {
189             dict = dictList->get(i);
190             if (dict) {
191                 if (dict->getType() == HDT_DICT_ALAN || dict->getType() == HDT_DICT_TEX)
192                     dict_lang_tag = TextLangMan::getLangTag(dict->getTitle());
193                 else
194                     dict_lang_tag = TextLangMan::getLangTag(dict->getId());     // for default dictionaries
195                 dict_lang_tag.lowercase();
196                 if (lang_tag2 == dict_lang_tag)
197                     for (int j=0; _hyph_dict_table[j].lang_tag!=NULL; j++) {
198                         if ( lang_tag == lString32(_hyph_dict_table[j].lang_tag).lowercase() ) {
199                             left_hyphen_min = _hyph_dict_table[j].left_hyphen_min;
200                             right_hyphen_min = _hyph_dict_table[j].right_hyphen_min;
201                             break;
202                         }
203                     }
204                     return HyphMan::getHyphMethodForDictionary( dict->getId(), left_hyphen_min, right_hyphen_min );
205             }
206         }
207     }
208 #else
209     // koreader use hardcoded hyphenation dictionary table
210     for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) {
211         if ( lang_tag == lString32(_hyph_dict_table[i].lang_tag).lowercase() ) {
212             return HyphMan::getHyphMethodForDictionary( lString32(_hyph_dict_table[i].hyph_filename),
213                         _hyph_dict_table[i].left_hyphen_min, _hyph_dict_table[i].right_hyphen_min);
214         }
215     }
216     // Look for lang_tag initial subpart
217     int m_pos = lang_tag.pos("-");
218     if ( m_pos > 0 ) {
219         lString32 lang_tag2 = lang_tag.substr(0, m_pos);
220         for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) {
221             if ( lang_tag2 == lString32(_hyph_dict_table[i].lang_tag).lowercase() ) {
222                 return HyphMan::getHyphMethodForDictionary( lString32(_hyph_dict_table[i].hyph_filename),
223                             _hyph_dict_table[i].left_hyphen_min, _hyph_dict_table[i].right_hyphen_min);
224             }
225         }
226     }
227 #endif
228     // Fallback to English_US, as other languages are more likely to get mixed
229     // with english text (it feels better than using @algorithm)
230     return HyphMan::getHyphMethodForDictionary(TEXTLANG_FALLBACK_HYPH_DICT_ID);
231 }
232 
233 // Return the (single and cached) TextLangCfg for the provided lang_tag
getTextLangCfg(lString32 lang_tag)234 TextLangCfg * TextLangMan::getTextLangCfg( lString32 lang_tag ) {
235     if ( !_embedded_langs_enabled ) {
236         // Drop provided lang_tag: always return main lang TextLangCfg
237         lang_tag = _main_lang;
238     }
239     // Not sure if we can lowercase lang_tag and avoid duplicate (Harfbuzz might
240     // need the proper lang tag with some parts starting with some uppercase letter)
241     for ( int i=0; i<_lang_cfg_list.length(); i++ ) {
242         if ( _lang_cfg_list[i]->_lang_tag == lang_tag ) {
243             // printf("TextLangCfg %s reused\n", UnicodeToLocal(lang_tag).c_str());
244             // There should rarely be more than 3 lang in a document, so move
245             // any requested far down in the list at top to shorten next loops.
246             if ( i > 2 ) {
247                 _lang_cfg_list.move(0, i);
248                 return _lang_cfg_list[0];
249             }
250             return _lang_cfg_list[i];
251         }
252     }
253     // Not found in cache: create it
254     TextLangCfg * lang_cfg = new TextLangCfg( lang_tag );
255     _lang_cfg_list.add( lang_cfg ); // and cache it
256     return lang_cfg;
257 }
258 
getTextLangCfg()259 TextLangCfg * TextLangMan::getTextLangCfg() {
260     // No lang_tag specified: return main lang one
261     return TextLangMan::getTextLangCfg( _main_lang );
262 }
263 
getTextLangCfg(ldomNode * node)264 TextLangCfg * TextLangMan::getTextLangCfg( ldomNode * node ) {
265     if ( !_embedded_langs_enabled || !node ) {
266         // No need to look at nodes: return main lang one
267         return TextLangMan::getTextLangCfg( _main_lang );
268     }
269     if ( node->isText() )
270         node = node->getParentNode();
271     // We are usually called from renderFinalBlock() with a node that
272     // we know has a lang= attribute.
273     // But we may be called in other contexts (e.g. writeNodeEx) with
274     // any node: so, look at this node parents for that lang= attribute.
275     for ( ; !node->isRoot(); node = node->getParentNode() ) {
276         if ( node->hasAttribute( attr_lang ) ) {
277             lString32 lang_tag = node->getAttributeValue( attr_lang );
278             if ( !lang_tag.empty() )
279                 return TextLangMan::getTextLangCfg( lang_tag );
280         }
281     }
282     // No parent with lang= attribute: return main lang one
283     return TextLangMan::getTextLangCfg( _main_lang );
284 }
285 
getLangNodeIndex(ldomNode * node)286 int TextLangMan::getLangNodeIndex( ldomNode * node ) {
287     if ( !_embedded_langs_enabled || !node ) {
288         // No need to look up if !_embedded_langs_enabled
289         return 0;
290     }
291     if ( node->isText() )
292         node = node->getParentNode();
293     for ( ; !node->isRoot(); node = node->getParentNode() ) {
294         if ( node->hasAttribute( attr_lang ) ) {
295             if ( !node->getAttributeValue( attr_lang ).empty() ) {
296                 return node->getDataIndex();
297             }
298         }
299     }
300     return 0;
301 }
302 
303 // For HyphMan::hyphenate()
getMainLangHyphMethod()304 HyphMethod * TextLangMan::getMainLangHyphMethod() {
305     return getTextLangCfg()->getHyphMethod();
306 }
307 
// Return the lang_tag of the first _hyph_dict_table entry whose
// hyph_filename_prefix is exactly the provided title, or an empty
// string when there is no such entry.
lString32 TextLangMan::getLangTag(const lString32& title)
{
    int idx = 0;
    while (_hyph_dict_table[idx].lang_tag != NULL) {
        if (title == lString32(_hyph_dict_table[idx].hyph_filename_prefix))
            return lString32(_hyph_dict_table[idx].lang_tag);
        idx++;
    }
    // No entry with this exact prefix
    return lString32();
}
317 
resetCounters()318 void TextLangMan::resetCounters() {
319     for ( int i=0; i<_lang_cfg_list.length(); i++ ) {
320         _lang_cfg_list[i]->resetCounters();
321     }
322 }
323 
324 // TextLangCfg object: per language holder of language specificities
325 
326 // For CSS "content: open-quote / close-quote"
327 typedef struct quotes_spec {
328     const char * lang_tag;
329     const lChar32 *  open_quote_level_1;
330     const lChar32 * close_quote_level_1;
331     const lChar32 *  open_quote_level_2;
332     const lChar32 * close_quote_level_2;
333 } quotes_spec;
334 
335 // List built 20200601 from https://html.spec.whatwg.org/multipage/rendering.html#quotes
336 // 2nd part of lang_tag lowercased for easier comparison, and if multiple
337 // lang_tag with the same starting chars, put the longest first.
338 // Small issue: 3-letters lang tag not specified here might match
339 // a 2-letter lang tag specified here ("ito" will get those from "it").
340 static quotes_spec _quotes_spec_table[] = {
341     { "af",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
342     { "agq",      U"\x201e", U"\x201d", U"\x201a", U"\x2019" }, /* „ ” ‚ ’ */
343     { "ak",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
344     { "am",       U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
345     { "ar",       U"\x201d", U"\x201c", U"\x2019", U"\x2018" }, /* ” “ ’ ‘ */
346     { "asa",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
347     { "ast",      U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
348     { "az-cyrl",  U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
349     { "az",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
350     { "bas",      U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
351     { "bem",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
352     { "bez",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
353     { "be",       U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
354     { "bg",       U"\x201e", U"\x201c", U"\x2018", U"\x2019" }, /* „ “ ‘ ’ */
355     { "bm",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
356     { "bn",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
357     { "brx",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
358     { "br",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
359     { "bs-cyrl",  U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
360     { "bs",       U"\x201e", U"\x201d", U"\x2018", U"\x2019" }, /* „ ” ‘ ’ */
361     { "ca",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
362     { "cgg",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
363     { "chr",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
364     { "cs",       U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
365     { "cy",       U"\x2018", U"\x2019", U"\x201c", U"\x201d" }, /* ‘ ’ “ ” */
366     { "dav",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
367     { "da",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
368     { "de",       U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
369     { "dje",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
370     { "dsb",      U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
371     { "dua",      U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
372     { "dyo",      U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
373     { "dz",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
374     { "ebu",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
375     { "ee",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
376     { "el",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
377     { "en",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
378     { "eo",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
379     { "es",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
380     { "et",       U"\x201e", U"\x201c", U"\x00ab", U"\x00bb" }, /* „ “ « » */
381     { "eu",       U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
382     { "ewo",      U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
383     { "fa",       U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
384     { "ff",       U"\x201e", U"\x201d", U"\x201a", U"\x2019" }, /* „ ” ‚ ’ */
385     { "fil",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
386     { "fi",       U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
387     { "fo",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
388     { "fr-ch",    U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
389     // { "fr",    U"\x00ab", U"\x00bb", U"\x00ab", U"\x00bb" }, /* « » « » */  /* Same pair for both level, bit sad... */
390     { "fr",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */  /* Better to have "fr" just as "it" */
391     { "fur",      U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */  /* Defaulting to "it", needs verification */
392     { "ga",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
393     { "gd",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
394     { "gl",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
395     { "gsw",      U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
396     { "guz",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
397     { "gu",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
398     { "ha",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
399     { "he",       U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
400     { "hi",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
401     { "hr",       U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
402     { "hsb",      U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
403     { "hu",       U"\x201e", U"\x201d", U"\x00bb", U"\x00ab" }, /* „ ” » « */
404     { "hy",       U"\x00ab", U"\x00bb", U"\x00ab", U"\x00bb" }, /* « » « » */
405     { "id",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
406     { "ig",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
407     { "is",       U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
408     { "it",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
409     { "ja",       U"\x300c", U"\x300d", U"\x300e", U"\x300f" }, /* 「 」 『 』 */
410     { "jgo",      U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
411     { "jmc",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
412     { "kab",      U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
413     { "kam",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
414     { "ka",       U"\x201e", U"\x201c", U"\x2018", U"\x2019" }, /* „ “ “ ” */
415     { "kde",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
416     { "kea",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
417     { "khq",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
418     { "ki",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
419     { "kkj",      U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
420     { "kk",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
421     { "kln",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
422     { "km",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
423     { "kn",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
424     { "ko",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
425     { "ksb",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
426     { "ksf",      U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
427     { "ky",       U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
428     { "la-lit",   U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */  /* "la" just as "it" */
429     { "la",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */  /* "la" just as "it" */
430     { "lag",      U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
431     { "lb",       U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
432     { "lg",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
433     { "ln",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
434     { "lo",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
435     { "lrc",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
436     { "lt",       U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
437     { "luo",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
438     { "luy",      U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
439     { "lu",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
440     { "lv",       U"\x201c", U"\x201d", U"\x201e", U"\x201d" }, /* “ ” „ ” */
441     { "mas",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
442     { "mer",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
443     { "mfe",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
444     { "mgo",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
445     { "mg",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
446     { "mk",       U"\x201e", U"\x201c", U"\x2019", U"\x2018" }, /* „ “ ’ ‘ */
447     { "ml",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
448     { "mn",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
449     { "mr",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
450     { "ms",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
451     { "mt",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
452     { "mua",      U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
453     { "my",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
454     { "mzn",      U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
455     { "naq",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
456     { "nb",       U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
457     { "nd",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
458     { "ne",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
459     { "nl",       U"\x2018", U"\x2019", U"\x201c", U"\x201d" }, /* ‘ ’ “ ” */
460     { "nmg",      U"\x201e", U"\x201d", U"\x00ab", U"\x00bb" }, /* „ ” « » */
461     { "nnh",      U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
462     { "nn",       U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
463     { "nus",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
464     { "nyn",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
465     { "oc",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
466     { "pa",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
467     { "pl",       U"\x201e", U"\x201d", U"\x00ab", U"\x00bb" }, /* „ ” « » */
468     { "pms",      U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */  /* Defaulting to "it", needs verification */
469     { "pt-br",    U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
470     { "pt-pt",    U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
471     { "pt",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
472     { "rm",       U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
473     { "rn",       U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
474     { "rof",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
475     { "ro",       U"\x201e", U"\x201d", U"\x00ab", U"\x00bb" }, /* „ ” « » */
476     { "ru",       U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
477     { "rwk",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
478     { "rw",       U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
479     { "sah",      U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
480     { "saq",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
481     { "sbp",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
482     { "seh",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
483     { "ses",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
484     { "sg",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
485     { "shi-latn", U"\x00ab", U"\x00bb", U"\x201e", U"\x201d" }, /* « » „ ” */
486     { "shi",      U"\x00ab", U"\x00bb", U"\x201e", U"\x201d" }, /* « » „ ” */
487     { "si",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
488     { "sk",       U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
489     { "sl",       U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
490     { "sn",       U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
491     { "so",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
492     { "sq",       U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
493     { "sr-latn",  U"\x201e", U"\x201c", U"\x2018", U"\x2018" }, /* „ “ ‘ ‘ */
494     { "sr",       U"\x201e", U"\x201d", U"\x2019", U"\x2019" }, /* „ ” ’ ’ */
495     { "sv",       U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
496     { "sw",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
497     { "ta",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
498     { "teo",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
499     { "te",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
500     { "th",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
501     { "ti-er",    U"\x2018", U"\x2019", U"\x201c", U"\x201d" }, /* ‘ ’ “ ” */
502     { "tk",       U"\x201c", U"\x201d", U"\x201c", U"\x201d" }, /* “ ” “ ” */
503     { "to",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
504     { "tr",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
505     { "twq",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
506     { "tzm",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
507     { "uk",       U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
508     { "ur",       U"\x201d", U"\x201c", U"\x2019", U"\x2018" }, /* ” “ ’ ‘ */
509     { "uz-cyrl",  U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
510     { "uz",       U"\x201c", U"\x201d", U"\x2019", U"\x2018" }, /* “ ” ’ ‘ */
511     { "vai-latn", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
512     { "vai",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
513     { "vi",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
514     { "vun",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
515     { "xog",      U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
516     { "yav",      U"\x00ab", U"\x00bb", U"\x00ab", U"\x00bb" }, /* « » « » */
517     { "yo",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
518     { "yue-hans", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
519     { "yue",      U"\x300c", U"\x300d", U"\x300e", U"\x300f" }, /* 「 」 『 』 */
520     { "zgh",      U"\x00ab", U"\x00bb", U"\x201e", U"\x201d" }, /* « » „ ” */
521     { "zh-hant",  U"\x300c", U"\x300d", U"\x300e", U"\x300f" }, /* 「 」 『 』 */
522     { "zh",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
523     { "zu",       U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
524     { NULL, NULL, NULL, NULL, NULL }
525 };
// Fallback quotes_spec, used by the TextLangCfg constructor when no entry
// of _quotes_spec_table matches the language tag.
// Defaults to the quotes used for English: “ ” ‘ ’
static quotes_spec _quotes_spec_default = { "", U"\x201c", U"\x201d", U"\x2018", U"\x2019" };
528 
529 #if USE_LIBUNIBREAK==1
530 #if KO_LIBUNIBREAK_PATCH==1
lb_char_sub_func_english(struct LineBreakContext * lbpCtx,const lChar32 * text,int pos,int next_usable)531 lChar32 lb_char_sub_func_english(struct LineBreakContext *lbpCtx, const lChar32 * text, int pos, int next_usable) {
532     // https://github.com/koreader/crengine/issues/364
533     // Normally, line breaks are allowed at both sides of an em-dash.
534     // When an em-dash is at the "end of a word" (or beginning), we want to avoid separating it from its word,
535     // this is detected by looking for letters/numbers at both sides of the dash, if on any side a space
536     // is closer than any letter/number, treat it as a non-breakable dash.
537     // The current implementation does not allow examining the following characters beyond the current node,
538     // so the detection is not perfect and we replace the dash with "opening" or "closing" characters
539     // (or "ambiguous), to play safer (note that "}" allows a break after, while ")" doesn't).
540     //
541     // The intent is the following:
542     //   blah—blah                     ->  —  (break before or after)
543     //   blah “—blah , <p>—blah        ->  {  (do not break after)
544     //   blah—” Blah , blah—”</p>      ->  }  (do not break before)
545     //   blah — blah , blah —<em>blah  ->  "  (break only at spaces)
546     switch ( text[pos] ) {
547         case 0x2014:  // em dash
548         case 0x2E3A:  // two-em dash
549         case 0x2E3B:  // three-em dash
550             {
551                 // The variable "replacement" will be the output char,
552                 // we start by setting it to the actual input char.
553                 // It will be '{' if no-break on right,
554                 //            '}' if no-break on left,
555                 //            '"' if no-break on both.
556                 lChar32 replacement = text[pos];
557                 int new_pos;
558                 enum LineBreakClass new_lbc;
559                 // 1. Detect no-break on right (scan left of dash)
560                 //
561                 // already at the beginning of text
562                 if ( pos == 0 ) {
563                     replacement = '{';
564                 }
565                 else {
566                     // inspect preceding characters
567                     new_pos = pos;
568                     while ( new_pos > 0) {
569                         new_pos--;
570                         new_lbc = lb_get_char_class(lbpCtx, text[new_pos]);
571                         if ( new_lbc == LBP_AL || new_lbc == LBP_NU ) {
572                             // found word / number
573                             break;
574                         }
575                         else if ( new_lbc == LBP_SP || new_pos == 0 ) {
576                             // found space or beginning
577                             replacement = '{';
578                             break;
579                         }
580                     }
581                 }
582                 // 2. Detect no-break on left (scan right of dash)
583                 //    If already no-break on right, replacement will be '"'
584                 //
585                 // already at the end of text
586                 if ( next_usable == 0 ) {
587                     replacement = ( replacement == '{' ) ? '"' : '}';
588                 }
589                 else {
590                     // inspect following characters
591                     new_pos = pos;
592                     while ( new_pos < pos+next_usable ) {
593                         new_pos++;
594                         new_lbc = lb_get_char_class(lbpCtx, text[new_pos]);
595                         if ( new_lbc == LBP_AL || new_lbc == LBP_NU ) {
596                             // found word / number
597                             break;
598                         }
599                         else if ( new_lbc == LBP_SP || new_pos == pos+next_usable ) {
600                             // found space or end (of the current text node, there could be letters beyond)
601                             replacement = ( replacement == '{' ) ? '"' : '}';
602                             break;
603                         }
604                     }
605                 }
606                 return replacement;
607             }
608             break;
609         default:
610             break;
611     }
612     return text[pos];
613 }
614 #endif      // KO_LIBUNIBREAK_PATCH==1
615 
lChar32 lb_char_sub_func_polish(struct LineBreakContext *lbpCtx, const lChar32 * text, int pos, int next_usable) {
    // https://github.com/koreader/koreader/issues/5645#issuecomment-559193057
    // Character substitution for Polish: the letters A I O U W Z (upper or
    // lower case) are prepositions that should not be left at the end of
    // a line.
    // When one of them directly follows a space, we make libunibreak see it
    // as an opening paren: being LBC_OP, it prevents a line break after it,
    // even when followed by a space.
    // Any other character, or one not preceded by a space, is returned as-is.
    const lChar32 ch = text[pos];
    if ( pos < 1 || text[pos-1] != ' ' ) {
        return ch;
    }
    switch ( ch ) {
        // Meaning in English:
        case 'A': case 'a': // and
        case 'I': case 'i': // and
        case 'O': case 'o': // about
        case 'U': case 'u': // at
        case 'W': case 'w': // in
        case 'Z': case 'z': // with
            return '(';
        default:
            return ch;
    }
}
645 
lChar32 lb_char_sub_func_czech_slovak(struct LineBreakContext *lbpCtx, const lChar32 * text, int pos, int next_usable) {
    // Character substitution for Czech and Slovak: same principle as for
    // Polish, with the letters AIiVvOoUuSsZzKk.
    // When one of them directly follows a space, it is substituted with an
    // opening paren, so libunibreak does not allow a line break after it.
    // Any other character, or one not preceded by a space, is returned as-is.
    // https://tex.stackexchange.com/questions/27780/one-letter-word-at-the-end-of-line
    // https://github.com/michal-h21/luavlna
    const lChar32 ch = text[pos];
    if ( pos < 1 || text[pos-1] != ' ' ) {
        return ch;
    }
    switch ( ch ) {
        case 'A':           // (uppercase only)
        case 'I': case 'i':
        case 'K': case 'k':
        case 'O': case 'o':
        case 'S': case 's':
        case 'U': case 'u':
        case 'V': case 'v':
        case 'Z': case 'z':
            return '(';
        default:
            return ch;
    }
}
675 #endif
676 
677 // Instantiate a new TextLangCfg with properties adequate to the provided lang_tag
TextLangCfg::TextLangCfg( lString32 lang_tag ) {
    // Overview of what is set up here, in order:
    //  - the shared TextLangMan hyphenation methods (first call only)
    //  - _lang_tag (as provided) and _hyph_method (from the lowercased tag)
    //  - _hb_language (when USE_HARFBUZZ==1)
    //  - _lb_props, _lb_char_sub_func and _duplicate_real_hyphen_on_next_line
    //    (when USE_LIBUNIBREAK==1)
    //  - the level 1 and level 2 opening/closing quote strings
    //  - the quote nesting counter, via resetCounters()
    if ( TextLangMan::_no_hyph_method == NULL ) {
        // We need to init static TextLangMan::_no_hyph_method and friends after
        // HyphMan is set up. Do that here, even if unrelated, as TextLangCfg
        // creation is called less often than the other methods around here.
        TextLangMan::_no_hyph_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_NONE);
        TextLangMan::_soft_hyphens_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_SOFTHYPHENS);
        TextLangMan::_algo_hyph_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_ALGORITHM);
    }

    // Keep as our id the provided and non-lowercase'd lang_tag (with possibly bogus #@algorithm)
    _lang_tag = lang_tag;
    // Harfbuzz may know more than us about exotic/complex lang tags,
    // so let it deal with the provided one as-is.
    lString32 hb_lang_tag = lang_tag;
    // Lowercase it for our tests
    lang_tag.lowercase(); // (used by LANG_STARTS_WITH() macros)

    // Get hyph method/dictionary from _hyph_dict_table
    // (done before the "#..." cleanup below, so the full lowercased tag is passed)
    _hyph_method = TextLangMan::getHyphMethodForLang(lang_tag);

    // Cleanup if we got "en#@something" from legacy HyphMan methods
    int h_pos = lang_tag.pos("#");
    if ( h_pos > 0 ) {
        lang_tag = lang_tag.substr(0, h_pos);
        hb_lang_tag = hb_lang_tag.substr(0, h_pos); // Also clean the one for HB
    }
    #ifdef DEBUG_LANG_USAGE
    printf("TextLangCfg %s created (%s %s)\n", UnicodeToLocal(_lang_tag).c_str(),
                    UnicodeToLocal(lang_tag).c_str(), UnicodeToLocal(_hyph_method->getId()).c_str());
    #endif

    // https://drafts.csswg.org/css-text-3/#script-tagging
    // We might need to check for the script subpart (optional 2nd
    // subpart) Latn, Hant, Hrkt... and make some non latin language
    // with a Latn script behave more like latin languages...

    // Note that Harfbuzz seems to do the same right thing with
    // both "zh-TW" and "zh-Hant".

    // See for more clever/complex handling of lang tags:
    // https://android.googlesource.com/platform/frameworks/minikin/+/refs/heads/master/libs/minikin/Locale.cpp

    // We thought about adding a 2nd fallback font per-language, but it feels
    // a bit wrong to limit this feature to documents with lang tags.
    // Better to implement a generic font fallback chain independent of language.

    // https://unicode.org/reports/tr14/#Hyphen : in Polish and Portuguese,
    // a real hyphen at end of line must be duplicated at start of next line.
    // NOTE(review): this flag is only ever set to true inside the
    // USE_LIBUNIBREAK==1 section below - confirm this is intended for
    // builds without libunibreak.
    _duplicate_real_hyphen_on_next_line = false;

#if USE_HARFBUZZ==1
    _hb_language = hb_language_from_string(UnicodeToLocal(hb_lang_tag).c_str(), -1);
#endif

#if USE_LIBUNIBREAK==1
    // libunibreak per-language LineBreakProperties extensions
    //
    // Rules extracted from libunibreak/src/linebreakdef.c, so we can adapt
    // them and build LineBreakProperties adequately for more languages.
    // See https://en.wikipedia.org/wiki/Quotation_mark
    // These are mostly needed only for languages that may add a space between
    // the quote and its content - otherwise, the quote will be part of the
    // word it sticks to, and break will be allowed on the other side which
    // probably is a space.
    // When a language allows the use of unpaired quotes (same quote on both
    // sides), it seems best to not specify anything.
    bool has_left_single_quotation_mark_opening = false;   // U+2018 ‘
    bool has_left_single_quotation_mark_closing = false;
    bool has_right_single_quotation_mark_opening = false;  // U+2019 ’
    bool has_right_single_quotation_mark_closing = false;
    bool has_right_single_quotation_mark_glue = false;
    bool has_left_double_quotation_mark_opening = false;   // U+201C “
    bool has_left_double_quotation_mark_closing = false;
    bool has_right_double_quotation_mark_opening = false;  // U+201D ”
    bool has_right_double_quotation_mark_closing = false;
    bool has_left_single_angle_quotation_mark_opening = false;   // U+2039 ‹
    bool has_left_single_angle_quotation_mark_closing = false;
    bool has_right_single_angle_quotation_mark_opening = false;  // U+203A ›
    bool has_right_single_angle_quotation_mark_closing = false;
    bool has_left_double_angle_quotation_mark_opening = false;   // U+00AB «
    bool has_left_double_angle_quotation_mark_closing = false;
    bool has_right_double_angle_quotation_mark_opening = false;  // U+00BB »
    bool has_right_double_angle_quotation_mark_closing = false;
    // Additional rule for treating em-dashes as e.g. "horizontal bar"
    // This is appropriate for languages that typically have a space at a
    // breakable side of the dash
    bool has_em_dash_alphabetic = false; // U+2014 —, U+2E3A ⸺, U+2E3B ⸻

    // Note: these macros use 'lang_tag'.
    if ( LANG_STARTS_WITH(("en")) ) { // English
        has_left_single_quotation_mark_opening = true; // no right..closing in linebreakdef.c
        has_left_double_quotation_mark_opening = true;
        has_right_double_quotation_mark_closing = true;
    }
    else if ( LANG_STARTS_WITH(("fr") ("es")) ) { // French, Spanish
        has_left_single_quotation_mark_opening = true; // no right..closing in linebreakdef.c
        has_left_double_quotation_mark_opening = true;
        has_right_double_quotation_mark_closing = true;
        has_left_single_angle_quotation_mark_opening = true;
        has_right_single_angle_quotation_mark_closing = true;
        has_left_double_angle_quotation_mark_opening = true;
        has_right_double_angle_quotation_mark_closing = true;
        has_em_dash_alphabetic = true;
    }
    else if ( LANG_STARTS_WITH(("de")) ) { // German
        has_left_single_quotation_mark_closing = true;
        has_right_single_quotation_mark_glue = true;
        has_left_double_quotation_mark_closing = true;
        /* Next ones commented out, as non-inverted usage of these
         * quotation marks can be found in pure "de" text - and
         * generally, these quotations marks are stuck to their
         * quoted first or last word and have only a space on the
         * other side, and so should be fine with just being "QU"
         * for libunibreak.
         * See https://github.com/koreader/koreader/issues/6717
        has_left_single_angle_quotation_mark_closing = true;
        has_right_single_angle_quotation_mark_opening = true;
        has_left_double_angle_quotation_mark_closing = true;
        has_right_double_angle_quotation_mark_opening = true;
        */
    }
    else if ( LANG_STARTS_WITH(("ru")) ) { // Russian
        has_left_double_quotation_mark_closing = true;
        has_left_double_angle_quotation_mark_opening = true;
        has_right_double_angle_quotation_mark_closing = true;
    }
    else if ( LANG_STARTS_WITH(("zh")) ) { // Chinese
        has_left_single_quotation_mark_opening = true;
        has_right_single_quotation_mark_closing = true;
        has_left_double_quotation_mark_opening = true;
        has_right_double_quotation_mark_closing = true;
    }
    // Add languages rules here, or reuse previous one with other languages if needed.

    // Set up _lb_props.
    // Important: the unicode indices must be in strict ascending order (or libunibreak
    // might abort checking them all)
    // (The statements below are therefore sorted by codepoint: keep them so
    // when adding new ones.)
    int n = 0;
    if ( has_left_double_angle_quotation_mark_opening )  _lb_props[n++] = { 0x00AB, 0x00AB, LBP_OP };
    if ( has_left_double_angle_quotation_mark_closing )  _lb_props[n++] = { 0x00AB, 0x00AB, LBP_CL };
    // Soft-Hyphens are handled by Hyphman hyphenate(), have them handled as Zero-Width-Joiner by
    // libunibreak so they don't allow any break and don't prevent hyphenate() to handle them correctly.
    // (This entry is added unconditionally, for all languages.)
    _lb_props[n++] = { 0x00AD, 0x00AD, LBP_ZWJ };
    if ( has_right_double_angle_quotation_mark_opening ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_OP };
    if ( has_right_double_angle_quotation_mark_closing ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_CL };
    if ( has_em_dash_alphabetic )                        _lb_props[n++] = { 0x2014, 0x2014, LBP_AL };
    if ( has_left_single_quotation_mark_opening )        _lb_props[n++] = { 0x2018, 0x2018, LBP_OP };
    if ( has_left_single_quotation_mark_closing )        _lb_props[n++] = { 0x2018, 0x2018, LBP_CL };
    if ( has_right_single_quotation_mark_opening )       _lb_props[n++] = { 0x2019, 0x2019, LBP_OP };
    if ( has_right_single_quotation_mark_closing )       _lb_props[n++] = { 0x2019, 0x2019, LBP_CL };
    if ( has_right_single_quotation_mark_glue )          _lb_props[n++] = { 0x2019, 0x2019, LBP_GL };
    if ( has_left_double_quotation_mark_opening )        _lb_props[n++] = { 0x201C, 0x201C, LBP_OP };
    if ( has_left_double_quotation_mark_closing )        _lb_props[n++] = { 0x201C, 0x201C, LBP_CL };
    if ( has_right_double_quotation_mark_opening )       _lb_props[n++] = { 0x201D, 0x201D, LBP_OP };
    if ( has_right_double_quotation_mark_closing )       _lb_props[n++] = { 0x201D, 0x201D, LBP_CL };
    if ( has_left_single_angle_quotation_mark_opening )  _lb_props[n++] = { 0x2039, 0x2039, LBP_OP };
    if ( has_left_single_angle_quotation_mark_closing )  _lb_props[n++] = { 0x2039, 0x2039, LBP_CL };
    if ( has_right_single_angle_quotation_mark_opening ) _lb_props[n++] = { 0x203A, 0x203A, LBP_OP };
    if ( has_right_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x203A, 0x203A, LBP_CL };
    if ( has_em_dash_alphabetic )                        _lb_props[n++] = { 0x2E3A, 0x2E3B, LBP_AL };
    // End of list
    _lb_props[n++] = { 0, 0, LBP_Undefined };
        // When adding properties, be sure combinations for all languages
        // do fit in _lb_props[MAX_NB_LB_PROPS_ITEMS] (MAX_NB_LB_PROPS_ITEMS
        // is defined in textlang.h, currently at 20).
    // Done with libunibreak per-language LineBreakProperties extensions

    // Other line breaking and text layout tweaks
    // (At most one character substitution function is selected per language;
    // when KO_LIBUNIBREAK_PATCH==1, the English check is the first link of
    // the if/else chain below.)
    _lb_char_sub_func = NULL;
#if KO_LIBUNIBREAK_PATCH==1
    if ( LANG_STARTS_WITH(("en")) ) { // English
        _lb_char_sub_func = &lb_char_sub_func_english;
    } else
#endif
    if ( LANG_STARTS_WITH(("pl")) ) { // Polish
        _lb_char_sub_func = &lb_char_sub_func_polish;
        _duplicate_real_hyphen_on_next_line = true;
    }
    else if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak
        _lb_char_sub_func = &lb_char_sub_func_czech_slovak;
    }
    else if ( LANG_STARTS_WITH(("pt") ("sr")) ) { // Portuguese, Serbian
        _duplicate_real_hyphen_on_next_line = true;
    }
#endif

    // Language default opening and closing quotes, for CSS
    //   "q::before { content: open-quote }" and
    //   "q::after  { content: close-quote }"
    // The first table entry whose lang_tag is a prefix of our lowercased
    // lang_tag wins: more specific tags (e.g. "sr-latn") must be listed
    // before shorter ones ("sr") in _quotes_spec_table.
    quotes_spec * quotes = &_quotes_spec_default;
    for (int i=0; _quotes_spec_table[i].lang_tag!=NULL; i++) {
        if ( lang_tag.startsWith( _quotes_spec_table[i].lang_tag ) ) {
            quotes = &_quotes_spec_table[i];
            break;
        }
    }
    // Avoid a wrap after/before an opening/close quote.
    const lChar32 * quote_joiner = U"\x2060";
        // (Zero width, equivalent to deprecated ZERO WIDTH NO-BREAK SPACE)
        // We might want with some languages to use a non-breaking thin space instead.

    // The joiner is appended to opening quotes, and prepended to closing quotes
    _open_quote1  << quotes->open_quote_level_1    << quote_joiner;
    _close_quote1 << quote_joiner   << quotes->close_quote_level_1;
    _open_quote2  << quotes->open_quote_level_2    << quote_joiner;
    _close_quote2 << quote_joiner   << quotes->close_quote_level_2;

    resetCounters();
}
887 
~TextLangCfg()888 TextLangCfg::~TextLangCfg() {
889 }
890 
// Reset the per-language counters: currently only the quote nesting level
// updated by getOpeningQuote() and getClosingQuote().
// (Also called at the end of the constructor.)
void TextLangCfg::resetCounters() {
    _quote_nesting_level = 0;
}
894 
getOpeningQuote(bool update_level)895 lString32 & TextLangCfg::getOpeningQuote( bool update_level ) {
896     if ( !update_level )
897         return _open_quote1;
898     _quote_nesting_level++;
899     return (_quote_nesting_level % 2) ? _open_quote1 : _open_quote2;
900 }
901 
getClosingQuote(bool update_level)902 lString32 & TextLangCfg::getClosingQuote( bool update_level ) {
903     if ( !update_level )
904         return _close_quote1;
905     _quote_nesting_level--;
906     return ((_quote_nesting_level+1) % 2) ? _close_quote1 : _close_quote2;
907 }
908 
getHyphenHangingPercent()909 int TextLangCfg::getHyphenHangingPercent() {
910     return 70; // 70%
911 }
912 
getHangingPercent(bool right_hanging,bool & check_font,const lChar32 * text,int pos,int next_usable)913 int TextLangCfg::getHangingPercent( bool right_hanging, bool & check_font, const lChar32 * text, int pos, int next_usable ) {
914     // We get provided with the BiDi re-ordered m_text (so, visually
915     // ordered) and the index of char: if needed, we can look at
916     // previous or next chars for context to decide how much to hang
917     // (i.e. consecutive punctuations).
918 
919     // If we ever need to tweak this per language, try to avoid checks
920     // for the lang_tag in here:
921     // - either set bool members to enable or disable some checks and tweaks
922     // - or make this hanging_percent_func_generic, and add dedicated
923     //   functions per language, hanging_percent_func_french, that
924     //   could fallback to calling hanging_percent_func_generic after
925     //   some checks - and have TextLangCfg::getHangingPercent() call
926     //   the dedicated function pointer stored as a member.
927 
928     // We might want to prevent any hanging with Chinese and Japanese
929     // as the text might be mostly full-width glyphs, and this might
930     // break the grid. This is less risky if the main font is a CJK
931     // font, but if it is not, punctuation might be picked from the
932     // main non-CJK font and won't be full-width.
933     // Or we could round any value to 0 or 100%  (and/or tweak any
934     // glyph in lvtextfm.cpp so it looks like it is full-width).
935 
936     lChar32 ch = text[pos];
937     int ratio = 0;
938 
939     // In French, there's usually a space before and after guillemets,
940     // or before a quotation mark. Having them hanging, and then a
941     // space, looks like there's a hole in the margin.
942     // So, for some chars, we'll avoid hanging or reduce the hanging
943     // ratio if the next/prev char is a space char.
944     // This might not happen in other languages, so let's do that
945     // prevention generically. If needed, make that dependant on
946     // a boolean member, set to true if LANG_STARTS_WITH(("fr")).
947     bool space_alongside = false;
948     if ( right_hanging ) {
949         if ( pos > 0 ) {
950             lChar32 prev_ch = text[pos-1];
951             if ( prev_ch == 0x0020 || prev_ch == 0x00A0 || (prev_ch >= 0x2000 && prev_ch <= 0x200A ) ) {
952                 // Normal space, no-break space, and other unicode spaces (except zero-width ones)
953                 space_alongside = true;
954             }
955         }
956     }
957     else {
958         if ( next_usable > 0 ) {
959             lChar32 next_ch = text[pos+1];
960             if ( next_ch == 0x0020 || next_ch == 0x00A0 || (next_ch >= 0x2000 && next_ch <= 0x200A ) ) {
961                 // Normal space, no-break space, and other unicode spaces (except zero-width ones)
962                 space_alongside = true;
963             }
964         }
965     }
966 
967     // For the common punctuations, parens and quotes, we check and
968     // return the same value whether asked for left or right hanging.
969     // Normally, libunibreak has prevented them from happening on
970     // one of the sides - but with RTL text, they may happen on
971     // the other side. Also, some BiDi mirrorable chars "([])" might
972     // be mirrored in the provided *text when not-using HarfBuzz, but
973     // won't be mirrored when using HarfBuzz - so let's handle
974     // all of them no matter the hanging side asked for.
975     // Also, because in some languages, quotation marks and guillemets
976     // are used reverted, we include left and right ones in both sets.
977 
978     // Most values taken from the "protusion" section in:
979     // https://source.contextgarden.net/tex/context/base/mkiv/font-imp-quality.lua
980     // https://www.w3.org/Mail/flatten/index?subject=Amending+hanging-punctuation+for+Western+typography&list=www-style
981     // and the microtypography thesis: http://www.pragma-ade.nl/pdftex/thesis.pdf
982     // (screenshot at https://github.com/koreader/koreader/issues/6235#issuecomment-639307634)
983 
984     switch (ch) {
985         case 0x0027: // ' single quote
986         case 0x002C: // , comma
987         case 0x002D: // - minus
988         case 0x002E: // . period
989         case 0x0060: // ` back quote
990         // case 0x00AD: // soft hyphen (we don't draw them, so don't handle them)
991         case 0x060C: // ، arabic comma
992         case 0x06D4: // ۔ arabic full stop
993         case 0x2010: // ‐ hyphen
994         case 0x2018: // ‘ left single quotation mark
995         case 0x2019: // ’ right single quotation mark
996         case 0x201A: // ‚ single low-9 quotation mark
997         case 0x201B: // ‛ single high-reversed-9 quotation mark
998             ratio = 70;
999             break;
1000         case 0x2039: // ‹ left single guillemet
1001         case 0x203A: // › right single guillemet
1002             // These are wider than the previous ones, and hanging by 70% with a space
1003             // alongside can give a feeling of bad justification. So, hang less.
1004             ratio = space_alongside ? 20 : 70;
1005             break;
1006         case 0x0022: // " double quote
1007         case 0x003A: // : colon
1008         case 0x003B: // ; semicolon
1009         case 0x061B: // ؛ arabic semicolon
1010         case 0x201C: // “ left double quotation mark
1011         case 0x201D: // ” right double quotation mark
1012         case 0x201E: // „ double low-9 quotation mark
1013         case 0x201F: // ‟ double high-reversed-9 quotation mark
1014             ratio = 50;
1015             break;
1016         case 0x00AB: // « left guillemet
1017         case 0x00BB: // » right guillemet
1018             // These are wider than the previous ones, and hanging by 50% with a space
1019             // alongside can give a feeling of bad justification. So, hang less.
1020             ratio = space_alongside ? 20 : 50;
1021             break;
1022         case 0x2013: // – endash
1023             // Should have enough body inside (with only 30% hanging)
1024             ratio = 30;
1025             break;
1026         case 0x0021: // !
1027         case 0x003F: // ?
1028         case 0x00A1: // ¡
1029         case 0x00BF: // ¿
1030         case 0x061F: // ؟
1031         case 0x2014: // — emdash
1032         case 0x2026: // … ellipsis
1033             // These will have enough body inside (with only 20% hanging),
1034             // so they shouldn't hurt when space_alongside.
1035             ratio = 20;
1036             break;
1037         case 0x0028: // (
1038         case 0x0029: // )
1039         case 0x005B: // [
1040         case 0x005D: // ]
1041         case 0x007B: // {
1042         case 0x007D: // }
1043             ratio  = 5;
1044             break;
1045         default:
1046             break;
1047     }
1048     if ( ratio ) {
1049         check_font = false;
1050         return ratio;
1051     }
1052     // Other are non punctuation but slight adjustment for some letters,
1053     // that might be ignored if the font already include some negative
1054     // left side bearing.
1055     // The hanging ratio is small, so no need to correct if space_alongside.
1056     check_font = true;
1057     if ( right_hanging ) {
1058         switch (ch) {
1059             case 'A':
1060             case 'F':
1061             case 'K':
1062             case 'L':
1063             case 'T':
1064             case 'V':
1065             case 'W':
1066             case 'X':
1067             case 'Y':
1068             case 'k':
1069             case 'r':
1070             case 't':
1071             case 'v':
1072             case 'w':
1073             case 'x':
1074             case 'y':
1075                 ratio  = 5;
1076                 break;
1077             default:
1078                 break;
1079         }
1080     }
1081     else { // left hanging
1082         switch (ch) {
1083             case 'A':
1084             case 'J':
1085             case 'T':
1086             case 'V':
1087             case 'W':
1088             case 'X':
1089             case 'Y':
1090             case 'v':
1091             case 'w':
1092             case 'x':
1093             case 'y':
1094                 ratio  = 5;
1095                 break;
1096             default:
1097                 break;
1098         }
1099     }
1100     return ratio;
1101 }
1102