1 // IMPORTANT : when making changes in language detection logic and per-language
2 // rules here, be sure to also bump FORMATTING_VERSION_ID in src/lvtinydom.cpp
3
4 #include "../include/textlang.h"
5 #include "../include/hyphman.h"
6 #include "../include/lvtinydom.h"
7 #include "../include/fb2def.h"
8 #include "../include/crlog.h"
9
10 // Uncomment to see which lang_tags are seen and lang_cfg created
11 // #define DEBUG_LANG_USAGE
12
// Helper macros turning a sequence of parenthesized prefixes into a chain of
// startsWith() tests on a variable named `lang_tag` in the calling scope.
// For example (note: no comma between the items!):
//     LANG_STARTS_WITH(("fr") ("es"))
// expands to:
//     lang_tag.startsWith("fr") || lang_tag.startsWith("es") || false
// (technique from https://stackoverflow.com/questions/19680962/translate-sequence-in-macro-parameters-to-separate-macros )

// What the trailing macro name becomes once "_END" has been pasted onto it:
// the neutral element that closes the || chain.
#define LANG_STARTS_WITH_EACH_1_END false
#define LANG_STARTS_WITH_EACH_2_END false
// Each of these consumes one "(prefix)" item and leaves the name of the other
// one to consume the next item (a macro can not re-invoke itself, hence the pair).
#define LANG_STARTS_WITH_EACH_1(...) lang_tag.startsWith(__VA_ARGS__) || LANG_STARTS_WITH_EACH_2
#define LANG_STARTS_WITH_EACH_2(...) lang_tag.startsWith(__VA_ARGS__) || LANG_STARTS_WITH_EACH_1
// Paste "_END" onto the last token left by the expansion above. The extra
// SEQ_ITERATE level makes sure the sequence is fully expanded before pasting.
#define PRIMITIVE_SEQ_ITERATE(...) __VA_ARGS__ ## _END
#define SEQ_ITERATE(...) PRIMITIVE_SEQ_ITERATE(__VA_ARGS__)
// Entry point
#define LANG_STARTS_WITH(seq) SEQ_ITERATE(LANG_STARTS_WITH_EACH_1 seq)
23
// Table associating a lang tag with its hyphenation dictionary and the
// minimal number of characters to keep on each side of a hyphenation point.
// (hyph_filename_prefix added because CoolReader may still have both
// current "Italian.pattern" and old "Italian_hyphen_(Alan).pdb".)
// (Romanian and Ukrainian have the prefix truncated because previous
// pattern files, still in CoolReader, had these truncated names.)
// Ordering matters: TextLangMan::setMainLangFromHyphDict() picks the first
// entry whose hyph_filename_prefix is a prefix of the dictionary id, so an
// entry whose prefix starts with another entry's prefix must come first
// (e.g. "Latin_liturgical" before "Latin", "Romansh" before "Roman").
static struct {
    const char * lang_tag;
    const char * hyph_filename_prefix;
    const char * hyph_filename;
    int left_hyphen_min;
    int right_hyphen_min;
} _hyph_dict_table[] = {
    { "hy",    "Armenian",         "Armenian.pattern",         1, 2 },
    { "eu",    "Basque",           "Basque.pattern",           2, 2 },
    { "bg",    "Bulgarian",        "Bulgarian.pattern",        2, 2 },
    { "ca",    "Catalan",          "Catalan.pattern",          2, 2 },
    { "cs",    "Czech",            "Czech.pattern",            2, 3 },
    { "da",    "Danish",           "Danish.pattern",           2, 2 },
    { "nl",    "Dutch",            "Dutch.pattern",            2, 2 },
    { "en-GB", "English_GB",       "English_GB.pattern",       2, 3 },
    { "en",    "English_US",       "English_US.pattern",       2, 3 },
    { "eo",    "Esperanto",        "Esperanto.pattern",        2, 2 },
    { "et",    "Estonian",         "Estonian.pattern",         2, 3 },
    { "fi",    "Finnish",          "Finnish.pattern",          2, 2 },
    { "fr",    "French",           "French.pattern",           2, 1 }, // see French.pattern file for why right_hyphen_min=1
    { "fur",   "Friulian",         "Friulian.pattern",         2, 2 },
    { "gl",    "Galician",         "Galician.pattern",         2, 2 },
    { "ka",    "Georgian",         "Georgian.pattern",         1, 2 },
    { "de",    "German",           "German.pattern",           2, 2 },
    { "el",    "Greek",            "Greek.pattern",            1, 1 },
    { "hr",    "Croatian",         "Croatian.pattern",         2, 2 },
    { "hu",    "Hungarian",        "Hungarian.pattern",        2, 2 },
    { "is",    "Icelandic",        "Icelandic.pattern",        2, 2 },
    { "ga",    "Irish",            "Irish.pattern",            2, 3 },
    { "it",    "Italian",          "Italian.pattern",          2, 2 },
    { "la-lit","Latin_liturgical", "Latin_liturgical.pattern", 2, 2 },
    { "la",    "Latin",            "Latin.pattern",            2, 2 },
    { "lv",    "Latvian",          "Latvian.pattern",          2, 2 },
    { "lt",    "Lithuanian",       "Lithuanian.pattern",       2, 2 },
    { "mk",    "Macedonian",       "Macedonian.pattern",       2, 2 },
    { "no",    "Norwegian",        "Norwegian.pattern",        2, 2 },
    { "oc",    "Occitan",          "Occitan.pattern",          2, 2 },
    { "pms",   "Piedmontese",      "Piedmontese.pattern",      2, 2 },
    { "pl",    "Polish",           "Polish.pattern",           2, 2 },
    { "pt-BR", "Portuguese_BR",    "Portuguese_BR.pattern",    2, 3 },
    { "pt",    "Portuguese",       "Portuguese.pattern",       2, 3 },
    // "Romansh" must precede the truncated "Roman" prefix of Romanian,
    // otherwise "Romansh.pattern" would be resolved as Romanian.
    { "rm",    "Romansh",          "Romansh.pattern",          2, 2 },
    { "ro",    "Roman",            "Romanian.pattern",         2, 2 }, // truncated prefix (see above)
    { "ru-GB", "Russian_EnGB",     "Russian_EnGB.pattern",     2, 2 },
    { "ru-US", "Russian_EnUS",     "Russian_EnUS.pattern",     2, 2 },
    { "ru",    "Russian",          "Russian.pattern",          2, 2 },
    { "sr",    "Serbian",          "Serbian.pattern",          2, 2 },
    { "sk",    "Slovak",           "Slovak.pattern",           2, 3 },
    { "sl",    "Slovenian",        "Slovenian.pattern",        2, 2 },
    { "es",    "Spanish",          "Spanish.pattern",          2, 2 },
    { "sv",    "Swedish",          "Swedish.pattern",          2, 2 },
    { "tr",    "Turkish",          "Turkish.pattern",          2, 2 },
    { "uk",    "Ukrain",           "Ukrainian.pattern",        2, 2 }, // truncated prefix (see above)
    { "cy",    "Welsh",            "Welsh.pattern",            2, 3 },
    { "zu",    "Zulu",             "Zulu.pattern",             2, 1 }, // defaulting to 2,1, left hyphenmin might need tweaking
    // No-lang hyph methods, for legacy HyphMan methods: other lang properties will be from English
    { "en#@none",        "@none",        "@none",        2, 2 },
    { "en#@softhyphens", "@softhyphens", "@softhyphens", 2, 2 },
    { "en#@algorithm",   "@algorithm",   "@algorithm",   2, 2 },
    // End-of-table marker
    { NULL, NULL, NULL, 0, 0 }
};
89
90 // Init global TextLangMan members
91 lString32 TextLangMan::_main_lang = TEXTLANG_DEFAULT_MAIN_LANG_32;
92 bool TextLangMan::_embedded_langs_enabled = TEXTLANG_DEFAULT_EMBEDDED_LANGS_ENABLED;
93 LVPtrVector<TextLangCfg> TextLangMan::_lang_cfg_list;
94
95 bool TextLangMan::_hyphenation_enabled = TEXTLANG_DEFAULT_HYPHENATION_ENABLED;
96 bool TextLangMan::_hyphenation_soft_hyphens_only = TEXTLANG_DEFAULT_HYPH_SOFT_HYPHENS_ONLY;
97 bool TextLangMan::_hyphenation_force_algorithmic = TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC;
98 bool TextLangMan::_overridden_hyph_method = !TEXTLANG_DEFAULT_HYPHENATION_ENABLED
99 || TEXTLANG_DEFAULT_HYPH_SOFT_HYPHENS_ONLY
100 || TEXTLANG_DEFAULT_HYPH_FORCE_ALGORITHMIC ;
101 // These will be set when we can
102 HyphMethod * TextLangMan::_no_hyph_method = NULL;
103 HyphMethod * TextLangMan::_algo_hyph_method = NULL;
104 HyphMethod * TextLangMan::_soft_hyphens_method = NULL;
105
TextLangMan()106 TextLangMan::TextLangMan() {
107 }
108
~TextLangMan()109 TextLangMan::~TextLangMan() {
110 }
111
getHash()112 lUInt32 TextLangMan::getHash() {
113 lUInt32 hash = _main_lang.getHash();
114 hash = hash << 4;
115 hash = hash + (_embedded_langs_enabled << 3);
116 hash = hash + (_hyphenation_soft_hyphens_only << 2);
117 hash = hash + (_hyphenation_force_algorithmic << 1);
118 hash = hash + _hyphenation_enabled;
119 // printf("TextLangMan::getHash %x\n", hash);
120 return hash;
121 }
122
123 // No need to explicitely call this in frontend code.
124 // Calling HyphMan::uninit() will have this one called.
uninit()125 void TextLangMan::uninit() {
126 _lang_cfg_list.clear();
127 }
128
129 // For HyphMan legacy methods
setMainLangFromHyphDict(lString32 id)130 void TextLangMan::setMainLangFromHyphDict( lString32 id ) {
131 // When setting up TextlangMan thru HyphMan legacy methods,
132 // disable embedded langs, for a consistent hyphenation.
133 TextLangMan::setEmbeddedLangsEnabled( false );
134 // Update flags if asked for @none, @softhyphens or @algorithm
135 TextLangMan::setHyphenationEnabled( id != HYPH_DICT_ID_NONE );
136 TextLangMan::setHyphenationSoftHyphensOnly( id == HYPH_DICT_ID_SOFTHYPHENS );
137 TextLangMan::setHyphenationForceAlgorithmic( id == HYPH_DICT_ID_ALGORITHM );
138
139 for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) {
140 if ( id.startsWith( _hyph_dict_table[i].hyph_filename_prefix ) ) {
141 TextLangMan::setMainLang( lString32(_hyph_dict_table[i].lang_tag) );
142 #ifdef DEBUG_LANG_USAGE
143 printf("TextLangMan::setMainLangFromHyphDict %s => %s\n",
144 UnicodeToLocal(id).c_str(), UnicodeToLocal(TextLangMan::getMainLang()).c_str());
145 #endif
146 return;
147 }
148 }
149 CRLog::warn("lang not found for hyphenation dict: %s\n", UnicodeToLocal(id).c_str());
150 }
151
152 // Used only by TextLangCfg
getHyphMethodForLang(lString32 lang_tag)153 HyphMethod * TextLangMan::getHyphMethodForLang( lString32 lang_tag ) {
154 // Look for full lang_tag
155 #if 1
156 // CoolReader use dynamically loaded hyphenation dictionaries (at startup)
157 HyphDictionaryList* dictList = HyphMan::getDictList();
158 HyphDictionary* dict;
159 lString32 dict_lang_tag;
160 lang_tag.lowercase();
161 int left_hyphen_min = 2;
162 int right_hyphen_min = 3;
163 for (int i = 0; i < dictList->length(); i++) {
164 dict = dictList->get(i);
165 if (dict) {
166 if (dict->getType() == HDT_DICT_ALAN || dict->getType() == HDT_DICT_TEX)
167 dict_lang_tag = TextLangMan::getLangTag(dict->getTitle()); // for dictionary's files
168 else
169 dict_lang_tag = TextLangMan::getLangTag(dict->getId()); // for default dictionaries
170 dict_lang_tag.lowercase();
171 if (lang_tag == dict_lang_tag) {
172 for (int j=0; _hyph_dict_table[j].lang_tag!=NULL; j++) {
173 if ( lang_tag == lString32(_hyph_dict_table[j].lang_tag).lowercase() ) {
174 left_hyphen_min = _hyph_dict_table[j].left_hyphen_min;
175 right_hyphen_min = _hyph_dict_table[j].right_hyphen_min;
176 break;
177 }
178 }
179 return HyphMan::getHyphMethodForDictionary( dict->getId(), left_hyphen_min, right_hyphen_min );
180 }
181 }
182 }
183 // Look for lang_tag initial subpart
184 int m_pos = lang_tag.pos("-");
185 if ( m_pos > 0 ) {
186 lString32 lang_tag2 = lang_tag.substr(0, m_pos);
187 lang_tag2.lowercase();
188 for (int i = 0; i < dictList->length(); i++) {
189 dict = dictList->get(i);
190 if (dict) {
191 if (dict->getType() == HDT_DICT_ALAN || dict->getType() == HDT_DICT_TEX)
192 dict_lang_tag = TextLangMan::getLangTag(dict->getTitle());
193 else
194 dict_lang_tag = TextLangMan::getLangTag(dict->getId()); // for default dictionaries
195 dict_lang_tag.lowercase();
196 if (lang_tag2 == dict_lang_tag)
197 for (int j=0; _hyph_dict_table[j].lang_tag!=NULL; j++) {
198 if ( lang_tag == lString32(_hyph_dict_table[j].lang_tag).lowercase() ) {
199 left_hyphen_min = _hyph_dict_table[j].left_hyphen_min;
200 right_hyphen_min = _hyph_dict_table[j].right_hyphen_min;
201 break;
202 }
203 }
204 return HyphMan::getHyphMethodForDictionary( dict->getId(), left_hyphen_min, right_hyphen_min );
205 }
206 }
207 }
208 #else
209 // koreader use hardcoded hyphenation dictionary table
210 for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) {
211 if ( lang_tag == lString32(_hyph_dict_table[i].lang_tag).lowercase() ) {
212 return HyphMan::getHyphMethodForDictionary( lString32(_hyph_dict_table[i].hyph_filename),
213 _hyph_dict_table[i].left_hyphen_min, _hyph_dict_table[i].right_hyphen_min);
214 }
215 }
216 // Look for lang_tag initial subpart
217 int m_pos = lang_tag.pos("-");
218 if ( m_pos > 0 ) {
219 lString32 lang_tag2 = lang_tag.substr(0, m_pos);
220 for (int i=0; _hyph_dict_table[i].lang_tag!=NULL; i++) {
221 if ( lang_tag2 == lString32(_hyph_dict_table[i].lang_tag).lowercase() ) {
222 return HyphMan::getHyphMethodForDictionary( lString32(_hyph_dict_table[i].hyph_filename),
223 _hyph_dict_table[i].left_hyphen_min, _hyph_dict_table[i].right_hyphen_min);
224 }
225 }
226 }
227 #endif
228 // Fallback to English_US, as other languages are more likely to get mixed
229 // with english text (it feels better than using @algorithm)
230 return HyphMan::getHyphMethodForDictionary(TEXTLANG_FALLBACK_HYPH_DICT_ID);
231 }
232
233 // Return the (single and cached) TextLangCfg for the provided lang_tag
getTextLangCfg(lString32 lang_tag)234 TextLangCfg * TextLangMan::getTextLangCfg( lString32 lang_tag ) {
235 if ( !_embedded_langs_enabled ) {
236 // Drop provided lang_tag: always return main lang TextLangCfg
237 lang_tag = _main_lang;
238 }
239 // Not sure if we can lowercase lang_tag and avoid duplicate (Harfbuzz might
240 // need the proper lang tag with some parts starting with some uppercase letter)
241 for ( int i=0; i<_lang_cfg_list.length(); i++ ) {
242 if ( _lang_cfg_list[i]->_lang_tag == lang_tag ) {
243 // printf("TextLangCfg %s reused\n", UnicodeToLocal(lang_tag).c_str());
244 // There should rarely be more than 3 lang in a document, so move
245 // any requested far down in the list at top to shorten next loops.
246 if ( i > 2 ) {
247 _lang_cfg_list.move(0, i);
248 return _lang_cfg_list[0];
249 }
250 return _lang_cfg_list[i];
251 }
252 }
253 // Not found in cache: create it
254 TextLangCfg * lang_cfg = new TextLangCfg( lang_tag );
255 _lang_cfg_list.add( lang_cfg ); // and cache it
256 return lang_cfg;
257 }
258
getTextLangCfg()259 TextLangCfg * TextLangMan::getTextLangCfg() {
260 // No lang_tag specified: return main lang one
261 return TextLangMan::getTextLangCfg( _main_lang );
262 }
263
getTextLangCfg(ldomNode * node)264 TextLangCfg * TextLangMan::getTextLangCfg( ldomNode * node ) {
265 if ( !_embedded_langs_enabled || !node ) {
266 // No need to look at nodes: return main lang one
267 return TextLangMan::getTextLangCfg( _main_lang );
268 }
269 if ( node->isText() )
270 node = node->getParentNode();
271 // We are usually called from renderFinalBlock() with a node that
272 // we know has a lang= attribute.
273 // But we may be called in other contexts (e.g. writeNodeEx) with
274 // any node: so, look at this node parents for that lang= attribute.
275 for ( ; !node->isRoot(); node = node->getParentNode() ) {
276 if ( node->hasAttribute( attr_lang ) ) {
277 lString32 lang_tag = node->getAttributeValue( attr_lang );
278 if ( !lang_tag.empty() )
279 return TextLangMan::getTextLangCfg( lang_tag );
280 }
281 }
282 // No parent with lang= attribute: return main lang one
283 return TextLangMan::getTextLangCfg( _main_lang );
284 }
285
getLangNodeIndex(ldomNode * node)286 int TextLangMan::getLangNodeIndex( ldomNode * node ) {
287 if ( !_embedded_langs_enabled || !node ) {
288 // No need to look up if !_embedded_langs_enabled
289 return 0;
290 }
291 if ( node->isText() )
292 node = node->getParentNode();
293 for ( ; !node->isRoot(); node = node->getParentNode() ) {
294 if ( node->hasAttribute( attr_lang ) ) {
295 if ( !node->getAttributeValue( attr_lang ).empty() ) {
296 return node->getDataIndex();
297 }
298 }
299 }
300 return 0;
301 }
302
303 // For HyphMan::hyphenate()
getMainLangHyphMethod()304 HyphMethod * TextLangMan::getMainLangHyphMethod() {
305 return getTextLangCfg()->getHyphMethod();
306 }
307
// Return the lang_tag of the _hyph_dict_table entry whose hyph_filename_prefix
// is exactly title, or an empty string when there is no such entry.
lString32 TextLangMan::getLangTag(const lString32& title)
{
    int idx = 0;
    while (_hyph_dict_table[idx].lang_tag != NULL) {
        const lString32 prefix(_hyph_dict_table[idx].hyph_filename_prefix);
        if (title == prefix)
            return lString32(_hyph_dict_table[idx].lang_tag);
        idx++;
    }
    // Reached the end-of-table marker without a match
    return lString32();
}
317
resetCounters()318 void TextLangMan::resetCounters() {
319 for ( int i=0; i<_lang_cfg_list.length(); i++ ) {
320 _lang_cfg_list[i]->resetCounters();
321 }
322 }
323
324 // TextLangCfg object: per language holder of language specificities
325
326 // For CSS "content: open-quote / close-quote"
327 typedef struct quotes_spec {
328 const char * lang_tag;
329 const lChar32 * open_quote_level_1;
330 const lChar32 * close_quote_level_1;
331 const lChar32 * open_quote_level_2;
332 const lChar32 * close_quote_level_2;
333 } quotes_spec;
334
335 // List built 20200601 from https://html.spec.whatwg.org/multipage/rendering.html#quotes
336 // 2nd part of lang_tag lowercased for easier comparison, and if multiple
337 // lang_tag with the same starting chars, put the longest first.
338 // Small issue: 3-letters lang tag not specified here might match
339 // a 2-letter lang tag specified here ("ito" will get those from "it").
340 static quotes_spec _quotes_spec_table[] = {
341 { "af", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
342 { "agq", U"\x201e", U"\x201d", U"\x201a", U"\x2019" }, /* „ ” ‚ ’ */
343 { "ak", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
344 { "am", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
345 { "ar", U"\x201d", U"\x201c", U"\x2019", U"\x2018" }, /* ” “ ’ ‘ */
346 { "asa", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
347 { "ast", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
348 { "az-cyrl", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
349 { "az", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
350 { "bas", U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
351 { "bem", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
352 { "bez", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
353 { "be", U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
354 { "bg", U"\x201e", U"\x201c", U"\x2018", U"\x2019" }, /* „ “ ‘ ’ */
355 { "bm", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
356 { "bn", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
357 { "brx", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
358 { "br", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
359 { "bs-cyrl", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
360 { "bs", U"\x201e", U"\x201d", U"\x2018", U"\x2019" }, /* „ ” ‘ ’ */
361 { "ca", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
362 { "cgg", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
363 { "chr", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
364 { "cs", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
365 { "cy", U"\x2018", U"\x2019", U"\x201c", U"\x201d" }, /* ‘ ’ “ ” */
366 { "dav", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
367 { "da", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
368 { "de", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
369 { "dje", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
370 { "dsb", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
371 { "dua", U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
372 { "dyo", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
373 { "dz", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
374 { "ebu", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
375 { "ee", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
376 { "el", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
377 { "en", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
378 { "eo", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
379 { "es", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
380 { "et", U"\x201e", U"\x201c", U"\x00ab", U"\x00bb" }, /* „ “ « » */
381 { "eu", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
382 { "ewo", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
383 { "fa", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
384 { "ff", U"\x201e", U"\x201d", U"\x201a", U"\x2019" }, /* „ ” ‚ ’ */
385 { "fil", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
386 { "fi", U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
387 { "fo", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
388 { "fr-ch", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
389 // { "fr", U"\x00ab", U"\x00bb", U"\x00ab", U"\x00bb" }, /* « » « » */ /* Same pair for both level, bit sad... */
390 { "fr", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */ /* Better to have "fr" just as "it" */
391 { "fur", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */ /* Defaulting to "it", needs verification */
392 { "ga", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
393 { "gd", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
394 { "gl", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
395 { "gsw", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
396 { "guz", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
397 { "gu", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
398 { "ha", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
399 { "he", U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
400 { "hi", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
401 { "hr", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
402 { "hsb", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
403 { "hu", U"\x201e", U"\x201d", U"\x00bb", U"\x00ab" }, /* „ ” » « */
404 { "hy", U"\x00ab", U"\x00bb", U"\x00ab", U"\x00bb" }, /* « » « » */
405 { "id", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
406 { "ig", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
407 { "is", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
408 { "it", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
409 { "ja", U"\x300c", U"\x300d", U"\x300e", U"\x300f" }, /* 「 」 『 』 */
410 { "jgo", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
411 { "jmc", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
412 { "kab", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
413 { "kam", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
414 { "ka", U"\x201e", U"\x201c", U"\x2018", U"\x2019" }, /* „ “ “ ” */
415 { "kde", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
416 { "kea", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
417 { "khq", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
418 { "ki", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
419 { "kkj", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
420 { "kk", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
421 { "kln", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
422 { "km", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
423 { "kn", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
424 { "ko", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
425 { "ksb", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
426 { "ksf", U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
427 { "ky", U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
428 { "la-lit", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */ /* "la" just as "it" */
429 { "la", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */ /* "la" just as "it" */
430 { "lag", U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
431 { "lb", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
432 { "lg", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
433 { "ln", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
434 { "lo", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
435 { "lrc", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
436 { "lt", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
437 { "luo", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
438 { "luy", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
439 { "lu", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
440 { "lv", U"\x201c", U"\x201d", U"\x201e", U"\x201d" }, /* “ ” „ ” */
441 { "mas", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
442 { "mer", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
443 { "mfe", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
444 { "mgo", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
445 { "mg", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
446 { "mk", U"\x201e", U"\x201c", U"\x2019", U"\x2018" }, /* „ “ ’ ‘ */
447 { "ml", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
448 { "mn", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
449 { "mr", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
450 { "ms", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
451 { "mt", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
452 { "mua", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
453 { "my", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
454 { "mzn", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
455 { "naq", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
456 { "nb", U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
457 { "nd", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
458 { "ne", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
459 { "nl", U"\x2018", U"\x2019", U"\x201c", U"\x201d" }, /* ‘ ’ “ ” */
460 { "nmg", U"\x201e", U"\x201d", U"\x00ab", U"\x00bb" }, /* „ ” « » */
461 { "nnh", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
462 { "nn", U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
463 { "nus", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
464 { "nyn", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
465 { "oc", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
466 { "pa", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
467 { "pl", U"\x201e", U"\x201d", U"\x00ab", U"\x00bb" }, /* „ ” « » */
468 { "pms", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */ /* Defaulting to "it", needs verification */
469 { "pt-br", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
470 { "pt-pt", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
471 { "pt", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
472 { "rm", U"\x00ab", U"\x00bb", U"\x2039", U"\x203a" }, /* « » ‹ › */
473 { "rn", U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
474 { "rof", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
475 { "ro", U"\x201e", U"\x201d", U"\x00ab", U"\x00bb" }, /* „ ” « » */
476 { "ru", U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
477 { "rwk", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
478 { "rw", U"\x00ab", U"\x00bb", U"\x2018", U"\x2019" }, /* « » ‘ ’ */
479 { "sah", U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
480 { "saq", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
481 { "sbp", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
482 { "seh", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
483 { "ses", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
484 { "sg", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
485 { "shi-latn", U"\x00ab", U"\x00bb", U"\x201e", U"\x201d" }, /* « » „ ” */
486 { "shi", U"\x00ab", U"\x00bb", U"\x201e", U"\x201d" }, /* « » „ ” */
487 { "si", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
488 { "sk", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
489 { "sl", U"\x201e", U"\x201c", U"\x201a", U"\x2018" }, /* „ “ ‚ ‘ */
490 { "sn", U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
491 { "so", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
492 { "sq", U"\x00ab", U"\x00bb", U"\x201c", U"\x201d" }, /* « » “ ” */
493 { "sr-latn", U"\x201e", U"\x201c", U"\x2018", U"\x2018" }, /* „ “ ‘ ‘ */
494 { "sr", U"\x201e", U"\x201d", U"\x2019", U"\x2019" }, /* „ ” ’ ’ */
495 { "sv", U"\x201d", U"\x201d", U"\x2019", U"\x2019" }, /* ” ” ’ ’ */
496 { "sw", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
497 { "ta", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
498 { "teo", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
499 { "te", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
500 { "th", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
501 { "ti-er", U"\x2018", U"\x2019", U"\x201c", U"\x201d" }, /* ‘ ’ “ ” */
502 { "tk", U"\x201c", U"\x201d", U"\x201c", U"\x201d" }, /* “ ” “ ” */
503 { "to", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
504 { "tr", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
505 { "twq", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
506 { "tzm", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
507 { "uk", U"\x00ab", U"\x00bb", U"\x201e", U"\x201c" }, /* « » „ “ */
508 { "ur", U"\x201d", U"\x201c", U"\x2019", U"\x2018" }, /* ” “ ’ ‘ */
509 { "uz-cyrl", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
510 { "uz", U"\x201c", U"\x201d", U"\x2019", U"\x2018" }, /* “ ” ’ ‘ */
511 { "vai-latn", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
512 { "vai", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
513 { "vi", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
514 { "vun", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
515 { "xog", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
516 { "yav", U"\x00ab", U"\x00bb", U"\x00ab", U"\x00bb" }, /* « » « » */
517 { "yo", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
518 { "yue-hans", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
519 { "yue", U"\x300c", U"\x300d", U"\x300e", U"\x300f" }, /* 「 」 『 』 */
520 { "zgh", U"\x00ab", U"\x00bb", U"\x201e", U"\x201d" }, /* « » „ ” */
521 { "zh-hant", U"\x300c", U"\x300d", U"\x300e", U"\x300f" }, /* 「 」 『 』 */
522 { "zh", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
523 { "zu", U"\x201c", U"\x201d", U"\x2018", U"\x2019" }, /* “ ” ‘ ’ */
524 { NULL, NULL, NULL, NULL, NULL }
525 };
526 // Default to quotes for English
527 static quotes_spec _quotes_spec_default = { "", U"\x201c", U"\x201d", U"\x2018", U"\x2019" };
528
529 #if USE_LIBUNIBREAK==1
530 #if KO_LIBUNIBREAK_PATCH==1
lChar32 lb_char_sub_func_english(struct LineBreakContext *lbpCtx, const lChar32 * text, int pos, int next_usable) {
    // https://github.com/koreader/crengine/issues/364
    // Normally, line breaks are allowed at both sides of an em-dash.
    // When an em-dash is at the "end of a word" (or beginning), we want to avoid
    // separating it from its word. This is detected by looking for letters/numbers
    // at both sides of the dash: if on any side a space is closer than any
    // letter/number, the dash is made non-breakable on the other side.
    // The current implementation does not allow examining the following characters
    // beyond the current node, so the detection is not perfect and we replace the
    // dash with "opening" or "closing" characters (or "ambiguous"), to play
    // safer (note that "}" allows a break after, while ")" doesn't).
    //
    // The intent is the following:
    //   blah—blah                         ->  —  (break before or after)
    //   blah “—blah , <p>—blah            ->  {  (do not break after)
    //   blah—” Blah , blah—”</p>          ->  }  (do not break before)
    //   blah — blah , blah —<em>blah      ->  "  (break only at spaces)
    const lChar32 dash = text[pos];
    const bool is_long_dash = ( dash == 0x2014 )    // em dash
                           || ( dash == 0x2E3A )    // two-em dash
                           || ( dash == 0x2E3B );   // three-em dash
    if ( !is_long_dash )
        return dash; // any other char is left untouched

    // 1. Detect no-break on right, by scanning what is on the left of the dash:
    //    we are at the beginning of the text, or we meet a space (or reach the
    //    beginning of the text) before meeting any letter/number.
    bool no_break_on_right = ( pos == 0 );
    for ( int i = pos - 1; i >= 0; i-- ) {
        const enum LineBreakClass lbc = lb_get_char_class(lbpCtx, text[i]);
        if ( lbc == LBP_AL || lbc == LBP_NU )
            break; // found word / number
        if ( lbc == LBP_SP || i == 0 ) {
            no_break_on_right = true; // found space or beginning
            break;
        }
    }

    // 2. Detect no-break on left, by scanning what is on the right of the dash:
    //    no char is usable after it, or we meet a space (or reach the last usable
    //    char of the current text node, there could be letters beyond) before
    //    meeting any letter/number.
    bool no_break_on_left = ( next_usable == 0 );
    const int last_usable_pos = pos + next_usable;
    for ( int i = pos + 1; i <= last_usable_pos; i++ ) {
        const enum LineBreakClass lbc = lb_get_char_class(lbpCtx, text[i]);
        if ( lbc == LBP_AL || lbc == LBP_NU )
            break; // found word / number
        if ( lbc == LBP_SP || i == last_usable_pos ) {
            no_break_on_left = true; // found space or end
            break;
        }
    }

    // Pick the substitute: '"' if no-break on both sides, '{' if no-break
    // on right only, '}' if no-break on left only, else the dash itself.
    if ( no_break_on_right && no_break_on_left )
        return '"';
    if ( no_break_on_right )
        return '{';
    if ( no_break_on_left )
        return '}';
    return dash;
}
614 #endif // KO_LIBUNIBREAK_PATCH==1
615
lChar32 lb_char_sub_func_polish(struct LineBreakContext *lbpCtx, const lChar32 * text, int pos, int next_usable) {
    // https://github.com/koreader/koreader/issues/5645#issuecomment-559193057
    // The letters a i o u w z (and A I O U W Z) are prepositions that should
    // not be left at the end of a line. Meaning in english:
    //   a: and, i: and, o: about, u: at, w: in, z: with
    // Make them behave (for libunibreak) just like an opening paren (which,
    // being of class OP, will prevent a line break after it, even if
    // followed by a space).
    // Only a letter that directly follows a space is substituted.
    if ( pos < 1 || text[pos-1] != ' ' )
        return text[pos];
    static const char one_letter_words[] = "AIOUWZaiouwz";
    for ( const char * p = one_letter_words; *p; ++p ) {
        if ( text[pos] == (lChar32)*p )
            return '(';
    }
    return text[pos];
}
645
// Per-char substitution hook for Czech and Slovak text (for libunibreak).
// https://tex.stackexchange.com/questions/27780/one-letter-word-at-the-end-of-line
// https://github.com/michal-h21/luavlna
//
// Same idea as for Polish, with the letter set AIiVvOoUuSsZzKk (note that
// lowercase 'a' is not part of it): when one of these letters directly
// follows a plain space, it is reported as an opening paren so that no
// line break is allowed after it.
//
// text / pos: the char being classified is text[pos].
// lbpCtx and next_usable are not used here.
// Returns '(' for a substituted letter, text[pos] otherwise.
lChar32 lb_char_sub_func_czech_slovak(struct LineBreakContext *lbpCtx, const lChar32 * text, int pos, int next_usable) {
    const lChar32 ch = text[pos];
    const bool after_space = pos >= 1 && text[pos-1] == ' ';
    if ( after_space ) {
        // Letters that can be one-letter words
        static const char single_letters[] = "AIKOSUVZikosuvz";
        for ( const char * p = single_letters; *p != '\0'; p++ ) {
            if ( ch == (lChar32)(unsigned char)*p )
                return '(';
        }
    }
    return ch;
}
675 #endif
676
677 // Instantiate a new TextLangCfg with properties adequate to the provided lang_tag
TextLangCfg(lString32 lang_tag)678 TextLangCfg::TextLangCfg( lString32 lang_tag ) {
679 if ( TextLangMan::_no_hyph_method == NULL ) {
680 // We need to init static TextLangMan::_no_hyph_method and friends after
681 // HyphMan is set up. Do that here, even if unrelated, as TextLangCfg
682 // creation is called less often that every other methods around here.
683 TextLangMan::_no_hyph_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_NONE);
684 TextLangMan::_soft_hyphens_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_SOFTHYPHENS);
685 TextLangMan::_algo_hyph_method = HyphMan::getHyphMethodForDictionary(HYPH_DICT_ID_ALGORITHM);
686 }
687
688 // Keep as our id the provided and non-lowercase'd lang_tag (with possibly bogus #@algorithm)
689 _lang_tag = lang_tag;
690 // Harfbuzz may know more than us about exotic/complex lang tags,
691 // so let it deal the the provided one as-is.
692 lString32 hb_lang_tag = lang_tag;
693 // Lowercase it for our tests
694 lang_tag.lowercase(); // (used by LANG_STARTS_WITH() macros)
695
696 // Get hyph method/dictionary from _hyph_dict_table
697 _hyph_method = TextLangMan::getHyphMethodForLang(lang_tag);
698
699 // Cleanup if we got "en#@something" from legacy HyphMan methods
700 int h_pos = lang_tag.pos("#");
701 if ( h_pos > 0 ) {
702 lang_tag = lang_tag.substr(0, h_pos);
703 hb_lang_tag = hb_lang_tag.substr(0, h_pos); // Also clean the one for HB
704 }
705 #ifdef DEBUG_LANG_USAGE
706 printf("TextLangCfg %s created (%s %s)\n", UnicodeToLocal(_lang_tag).c_str(),
707 UnicodeToLocal(lang_tag).c_str(), UnicodeToLocal(_hyph_method->getId()).c_str());
708 #endif
709
710 // https://drafts.csswg.org/css-text-3/#script-tagging
711 // We might need to check for the script subpart (optional 2nd
712 // subpart) Lant, Hant, Hrkt... and make some non latin language
713 // with a Lant script behave more like latin languages...
714
715 // Note that Harfbuzz seems to do the right same thing with
716 // either "zh-TW" and "zh-Hant".
717
718 // See for more clever/complex handling of lang tags:
719 // https://android.googlesource.com/platform/frameworks/minikin/+/refs/heads/master/libs/minikin/Locale.cpp
720
721 // We thought about adding a 2nd fallback font per-language, but it feels
722 // a bit wrong to limit this feature to documents with lang tags.
723 // Better to implement a generic font fallback chain independant of language.
724
725 // https://unicode.org/reports/tr14/#Hyphen : in Polish and Portuguese,
726 // a real hyphen at end of line must be duplicated at start of next line.
727 _duplicate_real_hyphen_on_next_line = false;
728
729 #if USE_HARFBUZZ==1
730 _hb_language = hb_language_from_string(UnicodeToLocal(hb_lang_tag).c_str(), -1);
731 #endif
732
733 #if USE_LIBUNIBREAK==1
734 // libunibreak per-language LineBreakProperties extensions
735 //
736 // Rules extracted from libunibreak/src/linebreakdef.c, so we can adapt
737 // them and build LineBreakProperties adequately for more languages.
738 // See https://en.wikipedia.org/wiki/Quotation_mark
739 // These are mostly need only for languages that may add a space between
740 // the quote and its content - otherwise, the quote will be part of the
741 // word it sticks to, and break will be allowed on the other side which
742 // probably is a space.
743 // When a language allows the use of unpaired quotes (same quote on both
744 // sides), it seems best to not specify anything.
745 bool has_left_single_quotation_mark_opening = false; // U+2018 ‘
746 bool has_left_single_quotation_mark_closing = false;
747 bool has_right_single_quotation_mark_opening = false; // U+2019 ’
748 bool has_right_single_quotation_mark_closing = false;
749 bool has_right_single_quotation_mark_glue = false;
750 bool has_left_double_quotation_mark_opening = false; // U+201C “
751 bool has_left_double_quotation_mark_closing = false;
752 bool has_right_double_quotation_mark_opening = false; // U+201D ”
753 bool has_right_double_quotation_mark_closing = false;
754 bool has_left_single_angle_quotation_mark_opening = false; // U+2039 ‹
755 bool has_left_single_angle_quotation_mark_closing = false;
756 bool has_right_single_angle_quotation_mark_opening = false; // U+203A ›
757 bool has_right_single_angle_quotation_mark_closing = false;
758 bool has_left_double_angle_quotation_mark_opening = false; // U+00AB «
759 bool has_left_double_angle_quotation_mark_closing = false;
760 bool has_right_double_angle_quotation_mark_opening = false; // U+00BB »
761 bool has_right_double_angle_quotation_mark_closing = false;
762 // Additional rule for treating em-dashes as e.g. "horizontal bar"
763 // This is appropriate for languages that typically have a space at a
764 // breakable side of the dash
765 bool has_em_dash_alphabetic = false; // U+2014 —, U+2E3A ⸺, U+2E3B ⸻
766
767 // Note: these macros use 'lang_tag'.
768 if ( LANG_STARTS_WITH(("en")) ) { // English
769 has_left_single_quotation_mark_opening = true; // no right..closing in linebreakdef.c
770 has_left_double_quotation_mark_opening = true;
771 has_right_double_quotation_mark_closing = true;
772 }
773 else if ( LANG_STARTS_WITH(("fr") ("es")) ) { // French, Spanish
774 has_left_single_quotation_mark_opening = true; // no right..closing in linebreakdef.c
775 has_left_double_quotation_mark_opening = true;
776 has_right_double_quotation_mark_closing = true;
777 has_left_single_angle_quotation_mark_opening = true;
778 has_right_single_angle_quotation_mark_closing = true;
779 has_left_double_angle_quotation_mark_opening = true;
780 has_right_double_angle_quotation_mark_closing = true;
781 has_em_dash_alphabetic = true;
782 }
783 else if ( LANG_STARTS_WITH(("de")) ) { // German
784 has_left_single_quotation_mark_closing = true;
785 has_right_single_quotation_mark_glue = true;
786 has_left_double_quotation_mark_closing = true;
787 /* Next ones commented out, as non-inverted usage of these
788 * quotation marks can be found in pure "de" text - and
789 * generally, these quotations marks are stuck to their
790 * quoted first or last word and have only a space on the
791 * other side, and so should be fine with just being "QU"
792 * for libunibreak.
793 * See https://github.com/koreader/koreader/issues/6717
794 has_left_single_angle_quotation_mark_closing = true;
795 has_right_single_angle_quotation_mark_opening = true;
796 has_left_double_angle_quotation_mark_closing = true;
797 has_right_double_angle_quotation_mark_opening = true;
798 */
799 }
800 else if ( LANG_STARTS_WITH(("ru")) ) { // Russian
801 has_left_double_quotation_mark_closing = true;
802 has_left_double_angle_quotation_mark_opening = true;
803 has_right_double_angle_quotation_mark_closing = true;
804 }
805 else if ( LANG_STARTS_WITH(("zh")) ) { // Chinese
806 has_left_single_quotation_mark_opening = true;
807 has_right_single_quotation_mark_closing = true;
808 has_left_double_quotation_mark_opening = true;
809 has_right_double_quotation_mark_closing = true;
810 }
811 // Add languages rules here, or reuse previous one with other languages if needed.
812
813 // Set up _lb_props.
814 // Important: the unicode indices must be in strict ascending order (or libunibreak
815 // might abort checking them all)
816 int n = 0;
817 if ( has_left_double_angle_quotation_mark_opening ) _lb_props[n++] = { 0x00AB, 0x00AB, LBP_OP };
818 if ( has_left_double_angle_quotation_mark_closing ) _lb_props[n++] = { 0x00AB, 0x00AB, LBP_CL };
819 // Soft-Hyphens are handled by Hyphman hyphenate(), have them handled as Zero-Width-Joiner by
820 // libunibreak so they don't allow any break and don't prevent hyphenate() to handle them correctly.
821 _lb_props[n++] = { 0x00AD, 0x00AD, LBP_ZWJ };
822 if ( has_right_double_angle_quotation_mark_opening ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_OP };
823 if ( has_right_double_angle_quotation_mark_closing ) _lb_props[n++] = { 0x00BB, 0x00BB, LBP_CL };
824 if ( has_em_dash_alphabetic ) _lb_props[n++] = { 0x2014, 0x2014, LBP_AL };
825 if ( has_left_single_quotation_mark_opening ) _lb_props[n++] = { 0x2018, 0x2018, LBP_OP };
826 if ( has_left_single_quotation_mark_closing ) _lb_props[n++] = { 0x2018, 0x2018, LBP_CL };
827 if ( has_right_single_quotation_mark_opening ) _lb_props[n++] = { 0x2019, 0x2019, LBP_OP };
828 if ( has_right_single_quotation_mark_closing ) _lb_props[n++] = { 0x2019, 0x2019, LBP_CL };
829 if ( has_right_single_quotation_mark_glue ) _lb_props[n++] = { 0x2019, 0x2019, LBP_GL };
830 if ( has_left_double_quotation_mark_opening ) _lb_props[n++] = { 0x201C, 0x201C, LBP_OP };
831 if ( has_left_double_quotation_mark_closing ) _lb_props[n++] = { 0x201C, 0x201C, LBP_CL };
832 if ( has_right_double_quotation_mark_opening ) _lb_props[n++] = { 0x201D, 0x201D, LBP_OP };
833 if ( has_right_double_quotation_mark_closing ) _lb_props[n++] = { 0x201D, 0x201D, LBP_CL };
834 if ( has_left_single_angle_quotation_mark_opening ) _lb_props[n++] = { 0x2039, 0x2039, LBP_OP };
835 if ( has_left_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x2039, 0x2039, LBP_CL };
836 if ( has_right_single_angle_quotation_mark_opening ) _lb_props[n++] = { 0x203A, 0x203A, LBP_OP };
837 if ( has_right_single_angle_quotation_mark_closing ) _lb_props[n++] = { 0x203A, 0x203A, LBP_CL };
838 if ( has_em_dash_alphabetic ) _lb_props[n++] = { 0x2E3A, 0x2E3B, LBP_AL };
839 // End of list
840 _lb_props[n++] = { 0, 0, LBP_Undefined };
841 // When adding properties, be sure combinations for all languages
842 // do fit in _lb_props[MAX_NB_LB_PROPS_ITEMS] (MAX_NB_LB_PROPS_ITEMS
843 // is defined in textlang.h, currently at 20).
844 // Done with libunibreak per-language LineBreakProperties extensions
845
846 // Other line breaking and text layout tweaks
847 _lb_char_sub_func = NULL;
848 #if KO_LIBUNIBREAK_PATCH==1
849 if ( LANG_STARTS_WITH(("en")) ) { // English
850 _lb_char_sub_func = &lb_char_sub_func_english;
851 } else
852 #endif
853 if ( LANG_STARTS_WITH(("pl")) ) { // Polish
854 _lb_char_sub_func = &lb_char_sub_func_polish;
855 _duplicate_real_hyphen_on_next_line = true;
856 }
857 else if ( LANG_STARTS_WITH(("cs") ("sk")) ) { // Czech, Slovak
858 _lb_char_sub_func = &lb_char_sub_func_czech_slovak;
859 }
860 else if ( LANG_STARTS_WITH(("pt") ("sr")) ) { // Portuguese, Serbian
861 _duplicate_real_hyphen_on_next_line = true;
862 }
863 #endif
864
865 // Language default opening and closing quotes, for CSS
866 // "q::before { content: open-quote }" and
867 // "q::after { content: close-quote }"
868 quotes_spec * quotes = &_quotes_spec_default;
869 for (int i=0; _quotes_spec_table[i].lang_tag!=NULL; i++) {
870 if ( lang_tag.startsWith( _quotes_spec_table[i].lang_tag ) ) {
871 quotes = &_quotes_spec_table[i];
872 break;
873 }
874 }
875 // Avoid a wrap after/before an opening/close quote.
876 const lChar32 * quote_joiner = U"\x2060";
877 // (Zero width, equivalent to deprecated ZERO WIDTH NO-BREAK SPACE)
878 // We might want with some languages to use a non-breaking thin space instead.
879
880 _open_quote1 << quotes->open_quote_level_1 << quote_joiner;
881 _close_quote1 << quote_joiner << quotes->close_quote_level_1;
882 _open_quote2 << quotes->open_quote_level_2 << quote_joiner;
883 _close_quote2 << quote_joiner << quotes->close_quote_level_2;
884
885 resetCounters();
886 }
887
~TextLangCfg()888 TextLangCfg::~TextLangCfg() {
889 }
890
resetCounters()891 void TextLangCfg::resetCounters() {
892 _quote_nesting_level = 0;
893 }
894
getOpeningQuote(bool update_level)895 lString32 & TextLangCfg::getOpeningQuote( bool update_level ) {
896 if ( !update_level )
897 return _open_quote1;
898 _quote_nesting_level++;
899 return (_quote_nesting_level % 2) ? _open_quote1 : _open_quote2;
900 }
901
getClosingQuote(bool update_level)902 lString32 & TextLangCfg::getClosingQuote( bool update_level ) {
903 if ( !update_level )
904 return _close_quote1;
905 _quote_nesting_level--;
906 return ((_quote_nesting_level+1) % 2) ? _close_quote1 : _close_quote2;
907 }
908
getHyphenHangingPercent()909 int TextLangCfg::getHyphenHangingPercent() {
910 return 70; // 70%
911 }
912
getHangingPercent(bool right_hanging,bool & check_font,const lChar32 * text,int pos,int next_usable)913 int TextLangCfg::getHangingPercent( bool right_hanging, bool & check_font, const lChar32 * text, int pos, int next_usable ) {
914 // We get provided with the BiDi re-ordered m_text (so, visually
915 // ordered) and the index of char: if needed, we can look at
916 // previous or next chars for context to decide how much to hang
917 // (i.e. consecutive punctuations).
918
919 // If we ever need to tweak this per language, try to avoid checks
920 // for the lang_tag in here:
921 // - either set bool members to enable or disable some checks and tweaks
922 // - or make this hanging_percent_func_generic, and add dedicated
923 // functions per language, hanging_percent_func_french, that
924 // could fallback to calling hanging_percent_func_generic after
925 // some checks - and have TextLangCfg::getHangingPercent() call
926 // the dedicated function pointer stored as a member.
927
928 // We might want to prevent any hanging with Chinese and Japanese
929 // as the text might be mostly full-width glyphs, and this might
930 // break the grid. This is less risky if the main font is a CJK
931 // font, but if it is not, punctuation might be picked from the
932 // main non-CJK font and won't be full-width.
933 // Or we could round any value to 0 or 100% (and/or tweak any
934 // glyph in lvtextfm.cpp so it looks like it is full-width).
935
936 lChar32 ch = text[pos];
937 int ratio = 0;
938
939 // In French, there's usually a space before and after guillemets,
940 // or before a quotation mark. Having them hanging, and then a
941 // space, looks like there's a hole in the margin.
942 // So, for some chars, we'll avoid hanging or reduce the hanging
943 // ratio if the next/prev char is a space char.
944 // This might not happen in other languages, so let's do that
945 // prevention generically. If needed, make that dependant on
946 // a boolean member, set to true if LANG_STARTS_WITH(("fr")).
947 bool space_alongside = false;
948 if ( right_hanging ) {
949 if ( pos > 0 ) {
950 lChar32 prev_ch = text[pos-1];
951 if ( prev_ch == 0x0020 || prev_ch == 0x00A0 || (prev_ch >= 0x2000 && prev_ch <= 0x200A ) ) {
952 // Normal space, no-break space, and other unicode spaces (except zero-width ones)
953 space_alongside = true;
954 }
955 }
956 }
957 else {
958 if ( next_usable > 0 ) {
959 lChar32 next_ch = text[pos+1];
960 if ( next_ch == 0x0020 || next_ch == 0x00A0 || (next_ch >= 0x2000 && next_ch <= 0x200A ) ) {
961 // Normal space, no-break space, and other unicode spaces (except zero-width ones)
962 space_alongside = true;
963 }
964 }
965 }
966
967 // For the common punctuations, parens and quotes, we check and
968 // return the same value whether asked for left or right hanging.
969 // Normally, libunibreak has prevented them from happening on
970 // one of the sides - but with RTL text, they may happen on
971 // the other side. Also, some BiDi mirrorable chars "([])" might
972 // be mirrored in the provided *text when not-using HarfBuzz, but
973 // won't be mirrored when using HarfBuzz - so let's handle
974 // all of them no matter the hanging side asked for.
975 // Also, because in some languages, quotation marks and guillemets
976 // are used reverted, we include left and right ones in both sets.
977
978 // Most values taken from the "protusion" section in:
979 // https://source.contextgarden.net/tex/context/base/mkiv/font-imp-quality.lua
980 // https://www.w3.org/Mail/flatten/index?subject=Amending+hanging-punctuation+for+Western+typography&list=www-style
981 // and the microtypography thesis: http://www.pragma-ade.nl/pdftex/thesis.pdf
982 // (screenshot at https://github.com/koreader/koreader/issues/6235#issuecomment-639307634)
983
984 switch (ch) {
985 case 0x0027: // ' single quote
986 case 0x002C: // , comma
987 case 0x002D: // - minus
988 case 0x002E: // . period
989 case 0x0060: // ` back quote
990 // case 0x00AD: // soft hyphen (we don't draw them, so don't handle them)
991 case 0x060C: // ، arabic comma
992 case 0x06D4: // ۔ arabic full stop
993 case 0x2010: // ‐ hyphen
994 case 0x2018: // ‘ left single quotation mark
995 case 0x2019: // ’ right single quotation mark
996 case 0x201A: // ‚ single low-9 quotation mark
997 case 0x201B: // ‛ single high-reversed-9 quotation mark
998 ratio = 70;
999 break;
1000 case 0x2039: // ‹ left single guillemet
1001 case 0x203A: // › right single guillemet
1002 // These are wider than the previous ones, and hanging by 70% with a space
1003 // alongside can give a feeling of bad justification. So, hang less.
1004 ratio = space_alongside ? 20 : 70;
1005 break;
1006 case 0x0022: // " double quote
1007 case 0x003A: // : colon
1008 case 0x003B: // ; semicolon
1009 case 0x061B: // ؛ arabic semicolon
1010 case 0x201C: // “ left double quotation mark
1011 case 0x201D: // ” right double quotation mark
1012 case 0x201E: // „ double low-9 quotation mark
1013 case 0x201F: // ‟ double high-reversed-9 quotation mark
1014 ratio = 50;
1015 break;
1016 case 0x00AB: // « left guillemet
1017 case 0x00BB: // » right guillemet
1018 // These are wider than the previous ones, and hanging by 50% with a space
1019 // alongside can give a feeling of bad justification. So, hang less.
1020 ratio = space_alongside ? 20 : 50;
1021 break;
1022 case 0x2013: // – endash
1023 // Should have enough body inside (with only 30% hanging)
1024 ratio = 30;
1025 break;
1026 case 0x0021: // !
1027 case 0x003F: // ?
1028 case 0x00A1: // ¡
1029 case 0x00BF: // ¿
1030 case 0x061F: // ؟
1031 case 0x2014: // — emdash
1032 case 0x2026: // … ellipsis
1033 // These will have enough body inside (with only 20% hanging),
1034 // so they shouldn't hurt when space_alongside.
1035 ratio = 20;
1036 break;
1037 case 0x0028: // (
1038 case 0x0029: // )
1039 case 0x005B: // [
1040 case 0x005D: // ]
1041 case 0x007B: // {
1042 case 0x007D: // }
1043 ratio = 5;
1044 break;
1045 default:
1046 break;
1047 }
1048 if ( ratio ) {
1049 check_font = false;
1050 return ratio;
1051 }
1052 // Other are non punctuation but slight adjustment for some letters,
1053 // that might be ignored if the font already include some negative
1054 // left side bearing.
1055 // The hanging ratio is small, so no need to correct if space_alongside.
1056 check_font = true;
1057 if ( right_hanging ) {
1058 switch (ch) {
1059 case 'A':
1060 case 'F':
1061 case 'K':
1062 case 'L':
1063 case 'T':
1064 case 'V':
1065 case 'W':
1066 case 'X':
1067 case 'Y':
1068 case 'k':
1069 case 'r':
1070 case 't':
1071 case 'v':
1072 case 'w':
1073 case 'x':
1074 case 'y':
1075 ratio = 5;
1076 break;
1077 default:
1078 break;
1079 }
1080 }
1081 else { // left hanging
1082 switch (ch) {
1083 case 'A':
1084 case 'J':
1085 case 'T':
1086 case 'V':
1087 case 'W':
1088 case 'X':
1089 case 'Y':
1090 case 'v':
1091 case 'w':
1092 case 'x':
1093 case 'y':
1094 ratio = 5;
1095 break;
1096 default:
1097 break;
1098 }
1099 }
1100 return ratio;
1101 }
1102