1 /**
2 * Yudit Unicode Editor Source File
3 *
4 * GNU Copyright (C) 1997-2006 Gaspar Sinai <gaspar@yudit.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License, version 2,
8 * dated June 1991. See file COPYYING for details.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19 #include "stoolkit/SCluster.h"
20 #include "stoolkit/SProperties.h"
21 #include "stoolkit/SUniMap.h"
22
23
24 static unsigned int
25 getRISCluster (const SV_UCS4& unicode, unsigned int index,
26 SV_UCS4* ret, int* finished);
27
28 static unsigned int
29 getRovasCluster (const SV_UCS4& unicode, unsigned int index,
30 SV_UCS4* ret, int* finished, bool isPUA);
31
32 static unsigned int
33 getJamoCluster (const SV_UCS4& unicode, unsigned int index,
34 SV_UCS4* ret, int* finished);
35
36 static SS_UCS4 precomposJamos(SV_UCS4* jamo);
37
38 static unsigned int
39 getSouthIndicCluster (unsigned int scriptcode,
40 const SV_UCS4& unicode,
41 unsigned int index, SV_UCS4* ret, int* finished);
42
43 static unsigned int
44 getIndicCluster (unsigned int scriptcode,
45 const SV_UCS4& unicode,
46 unsigned int index, SV_UCS4* ret, int* finished);
47
48 static SUniMap* clusters;
49 static SUniMap* indic;
50 static SProperties* ligatureUnics;
51 static SProperties* ligatureClust;
52
53 static SProperties* ligatureCache;
54 static SS_UCS4 counters[SD_SCRIPT_MAX];
55
56 static SS_UCS4 nextLigature (unsigned int script,
57 const SS_UCS4* unicode, unsigned int length);
58
59 static void initLigatures();
60
61 SString yuditClusterError;
62
63 /**
64 * Try to form a cluster - an abstract glyphs that can
65 * be broken apart once made. It can be rendered by
66 * a font that contains glyphs and ligatureUnics by subdividing
67 * the cluster. The cluster is in memory order -
68 * vowels are place on the appropriate side.
69 *
70 * Clusters will replace the current SGlyph architecture.
71 * All new things should be added here.
72 *
73 * 2002-04-03 - added surrogate clusters.
74 *
75 * @param ucs4 is the input vector.
76 * @param i is the index in this vector - next character.
77 * @param finished is set to 0 if more data is needed
78 * this parameter can be null.
79 * @return the new index in ucs4.
80 */
81 unsigned int
getCluster(const SV_UCS4 & ucs4,unsigned int index,SV_UCS4 * retchar,int * finished)82 getCluster (const SV_UCS4& ucs4,
83 unsigned int index, SV_UCS4* retchar, int *finished)
84 {
85 if (finished) *finished = -1;
86
87 /* pack surrogates into a cluster
88 - no combining marks on surrogates for the time being. */
89 if (ucs4[index] >= 0xd800 && ucs4[index] <= 0xdbff)
90 {
91 if (ucs4.size() < index+2)
92 {
93 if (finished) *finished = 0;
94 retchar->append (ucs4[index]);
95 return index + 1;
96 }
97 if (ucs4[index+1] >= 0xdc00 && ucs4[index+1] <= 0xdfff)
98 {
99 retchar->append (((ucs4[index] & 0x3ff)<< 10)
100 + (ucs4[index+1]&0x3ff) + 0x10000);
101 if (finished) *finished = 1;
102 return index+2;
103 }
104 return index;
105 }
106 /* start the game */
107 initLigatures();
108
109 /* Should be able to start with ZWJ */
110 int scriptcode = (
111 (ucs4[index] == 0x200D || ucs4[index] == 0x25CC) && index+1 < ucs4.size())
112 ? getUnicodeScript (ucs4[index+1]) : getUnicodeScript (ucs4[index]);
113
114 if (scriptcode < 0) return index;
115 unsigned int ret = index;
116 yuditClusterError.clear();
117 switch (scriptcode)
118 {
119 case SD_DEVANAGARI:
120 case SD_BENGALI:
121 case SD_GURMUKHI:
122 case SD_GUJARATI:
123 case SD_ORIYA:
124 case SD_KANNADA:
125 case SD_MALAYALAM:
126 case SD_SINHALA:
127 case SD_TELUGU:
128 if (!indic->isOK()) break;;
129 ret = getIndicCluster (
130 (unsigned int)scriptcode, ucs4, index, retchar, finished);
131 break;
132 case SD_HANGUL_JAMO:
133 ret = getJamoCluster (ucs4, index, retchar, finished);
134 break;
135 case SD_TIBETAN:
136 case SD_THAI:
137 case SD_LAO:
138 ret = getSouthIndicCluster ((unsigned int)scriptcode,
139 ucs4, index, retchar, finished);
140 //if (ret>0) fprintf (stderr, "TIBET Tibetan: %d\n", ret-index);
141 break;
142 case SD_TAMIL:
143 case SD_YUDIT:
144 if (!clusters->isOK()) break;
145 ret = clusters->lift (ucs4, index, true, retchar);
146 break;
147 case SD_ROVASIRAS:
148 ret = getRovasCluster (ucs4, index, retchar, finished, false);
149 break;
150 case SD_PUA_ROVAS:
151 ret = getRovasCluster (ucs4, index, retchar, finished, true);
152 break;
153 case SD_REGIONAL_INDICATOR_SYMBOL:
154 ret = getRISCluster (ucs4, index, retchar, finished);
155 break;
156 }
157 if (finished==0 && yuditClusterError.size())
158 {
159 // If you want to debug things uncomment this.
160 //fprintf (stderr, "SCluster.cpp:%*.*s\n", SSARGS(yuditClusterError));
161 }
162 return ret;
163 }
164
165 /**
166 * -1 non rovas.
167 * 1 rovas basic
168 * 2 rovas liga
169 * 3 rovas yudit cluster
170 * 0 ZWJ
171 */
getRovasType(SS_UCS4 chr)172 int getRovasType (SS_UCS4 chr)
173 {
174 if (chr == 0x200d) return 0;
175 if (chr >= 0x10c80 && chr <= 0x10cff)
176 {
177 return 1;
178 }
179 if (getLigatureScriptCode(chr) == SD_ROVASIRAS) return 3;
180 return -1;
181 }
182
183 /**
184 * -1 non rovas.
185 * 1 rovas basic
186 * 2 rovas liga
187 * 3 rovas yudit cluster
188 * 0 ZWJ
189 */
getPUARovasType(SS_UCS4 chr)190 int getPUARovasType (SS_UCS4 chr)
191 {
192 if (chr == 0x200d) return 0;
193 if (chr >= 0xee00 && chr <= 0xee29)
194 {
195 return 1;
196 }
197 if (chr >= 0xee30 && chr <= 0xee8b)
198 {
199 return 2;
200 }
201 if (getLigatureScriptCode(chr) == SD_PUA_ROVAS) return 3;
202 return -1;
203 }
204
205 static unsigned int
getRovasCluster(const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished,bool isPUA)206 getRovasCluster (const SV_UCS4& unicode, unsigned int index,
207 SV_UCS4* ret, int* finished, bool isPUA) {
208
209 unsigned int usize = unicode.size();
210 if (index>=usize) return index;
211
212 /* set it to finished - this routine would not be called to other scirpts */
213 if (finished) *finished = 1;
214 /* Some platforms have unsigned char */
215 int prevchartype = 0;
216
217 SS_UCS4 nextLig = 0;
218 unsigned int i;
219 int ligatureType = isPUA ? SD_PUA_ROVAS : SD_ROVASIRAS;
220 for (i=index;i<usize; i++)
221 {
222 SS_UCS4 next = unicode[i];
223 int chartype = isPUA ? getPUARovasType (next) : getRovasType (next);
224 switch (chartype)
225 {
226 case 0:
227 if (prevchartype < 1)
228 {
229 if (i > index+1)
230 {
231 nextLig = nextLigature (ligatureType,
232 &unicode.array()[index], i-index);
233 if (nextLig) ret->append (nextLig);
234 return i;
235 }
236 ret->clear();
237 return index;
238 }
239 ret->append (next);
240 break;
241 case 1:
242 case 2:
243 if (prevchartype != 0)
244 {
245 if (i > index+1)
246 {
247 nextLig = nextLigature (ligatureType,
248 &unicode.array()[index], i-index);
249 if (nextLig) ret->append (nextLig);
250 return i;
251 }
252 ret->clear();
253 return index;
254 }
255 ret->append (next);
256 break;
257 case -1:
258 if (i > index+1)
259 {
260 nextLig = nextLigature (ligatureType,
261 &unicode.array()[index], i-index);
262 if (nextLig) ret->append (nextLig);
263 return i;
264 }
265 ret->clear();
266 return index;
267 }
268 prevchartype = chartype;
269 }
270 /* Not yet finished. Return unfinished cluster */
271 if (finished) *finished = 0;
272 if (ret->size() > 1)
273 {
274 nextLig = nextLigature (ligatureType,
275 &unicode.array()[index], i-index);
276 if (nextLig) ret->append (nextLig);
277 return i;
278 }
279 ret->clear();
280 return index;
281 }
282
283 static unsigned int
getRISCluster(const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished)284 getRISCluster (const SV_UCS4& unicode, unsigned int index,
285 SV_UCS4* ret, int* finished) {
286
287 unsigned int usize = unicode.size();
288 if (index>=usize) return index;
289
290 /* set it to finished - this routine would not be called to other scirpts */
291 if (finished) *finished = 1;
292
293 SS_UCS4 nextLig = 0;
294 unsigned int i;
295 int ligatureType = SD_REGIONAL_INDICATOR_SYMBOL;
296 for (i=index;i<index+2 && i<usize; i++)
297 {
298 SS_UCS4 next = unicode[i];
299 if (getUnicodeScript(next) != SD_REGIONAL_INDICATOR_SYMBOL) {
300 if (finished) *finished = 0;
301 break;
302 }
303 ret->append (next - 0x1f1e6 + (int) 'A');
304 }
305 if (ret->size() > 1) {
306 nextLig = nextLigature (ligatureType,
307 &unicode.array()[index], i-index);
308 if (nextLig) ret->append (nextLig);
309 return i;
310 }
311 if (finished) *finished = 1;
312 ret->clear();
313 return index;
314 }
315
316 /**
317 * Create a JAMO Cluster as of Unicode 3.0 Chapter 3.11.
318 * 1. L.X V.X T.X X.L X.V X.T
319 * 2. T.L
320 * 3. V.L
321 * 4. T.V
322 * In short: Cluster=L*V*T*
323 * Asterisk means: one or more.
324 * @param finished is set to 1 if exact match happens
325 * 0 is not yet finished
326 * -1 if illegal sequence start.
327 */
328 static unsigned int
getJamoCluster(const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished)329 getJamoCluster (const SV_UCS4& unicode, unsigned int index,
330 SV_UCS4* ret, int* finished)
331 {
332
333 unsigned int usize = unicode.size();
334 if (index>=usize) return index;
335
336
337 /* set it to finished - this routine would not be called to other scirpts */
338 if (finished) *finished = -1;
339 /* Some platforms have unsigned char */
340 int prevchartype = getJamoClass (unicode[index]);
341
342 SS_UCS4 nextLig = 0;
343 unsigned int i;
344 for (i=index;i<usize; i++)
345 {
346 SS_UCS4 next = unicode[i];
347 int chartype = getJamoClass (next);
348 switch (chartype)
349 {
350 case SD_JAMO_L:
351 if (prevchartype != SD_JAMO_L)
352 {
353 nextLig = precomposJamos (ret);
354 if (nextLig==0)
355 {
356 nextLig = nextLigature (SD_HANGUL_JAMO,
357 &unicode.array()[index], i-index);
358 }
359 if (nextLig) ret->append (nextLig);
360 return i;
361 }
362 ret->append (next);
363 break;
364 case SD_JAMO_V:
365 if (prevchartype != SD_JAMO_L && prevchartype != SD_JAMO_V)
366 {
367 nextLig = precomposJamos (ret);
368 if (nextLig ==0)
369 {
370 nextLig = nextLigature (SD_HANGUL_JAMO,
371 &unicode.array()[index], i-index);
372 }
373 if (nextLig) ret->append (nextLig);
374 return i;
375 }
376 ret->append (next);
377 break;
378 case SD_JAMO_T:
379 /* Do we really have TT sequence ? According to Unicode yes. Hmm.. */
380 if (prevchartype != SD_JAMO_V && prevchartype != SD_JAMO_T)
381 {
382 nextLig = precomposJamos (ret);
383 if (nextLig==0)
384 {
385 nextLig = nextLigature (SD_HANGUL_JAMO,
386 &unicode.array()[index], i-index);
387 }
388 if (nextLig) ret->append (nextLig);
389 return i;
390 }
391 ret->append (next);
392 break;
393 case SD_JAMO_X:
394 default:
395 /* Tone marks can follow the cluster */
396 // They are suported as composing anyway...
397 #if 0
398 if (next == 0x302e || next == 0x302f)
399 {
400 ret->append (next);
401 i++;
402 }
403 #endif
404 nextLig = precomposJamos (ret);
405 if (nextLig==0)
406 {
407 nextLig = nextLigature (SD_HANGUL_JAMO,
408 &unicode.array()[index], i-index);
409 }
410 if (nextLig) ret->append (nextLig);
411 return i;
412 break;
413 }
414 prevchartype = chartype;
415 }
416 /* Not yet finished. Return unfinished cluster */
417 if (finished) *finished = 0;
418 if (ret->size()>=1)
419 {
420 nextLig = precomposJamos (ret);
421 if (nextLig==0)
422 {
423 nextLig = nextLigature (SD_HANGUL_JAMO,
424 &unicode.array()[index], i-index);
425 }
426 if (nextLig) ret->append (nextLig);
427 return i;
428 }
429 ret->clear();
430 return index;
431 }
432
433 /**
434 * Precompose JAMOs that are present in unicode tables
435 * @param jamo is the vector that holds input jamos and
436 * output precompositions.
437 * @return the precomposed JAMOS or 0
438 */
439 static SS_UCS4
precomposJamos(SV_UCS4 * jamo)440 precomposJamos(SV_UCS4* jamo)
441 {
442 if (jamo->size()==0) return 0;
443 if (jamo->size()==1) return 0;
444 SS_UCS4 last = (*jamo)[jamo->size()-1];
445 if (last==0x302e || last==0x302f)
446 {
447 if (jamo->size()<=2) return 0;
448 if (jamo->size()>4) return 0;
449 jamo->truncate (jamo->size()-1);
450 }
451 else if (jamo->size()>3)
452 {
453 return 0;
454 }
455
456 SS_UCS4 l = (*jamo)[0];
457 SS_UCS4 v = (*jamo)[1];
458 SS_UCS4 t = (jamo->size() >= 3) ? (*jamo)[2] : 0x11a7;
459 /* tone marks will be rendered first */
460 if (last==0x302e || last==0x302f)
461 {
462 jamo->insert (0, last);
463 }
464 if (l>=0x1100 && l<=0x1112
465 && v>=0x1161 && v<=0x1175
466 && t>=0x11a7 && t<=0x11c2)
467 {
468 jamo->clear();
469 SS_UCS4 vle = 21*28* (l-0x1100) + 28 * (v-0x1161) + (t-0x11a7) + 0xac00;
470 jamo->append (vle);
471 /* create a unique key */
472 if (last==0x302e)
473 {
474 vle = vle & 0x3fff;
475 }
476 else if (last==0x302f)
477 {
478 vle = vle & 0x7fff;
479 }
480 vle += 0x80000000 + (0x10000 * SD_HANGUL_PREC);
481 return vle;
482 }
483 return 0;
484 }
485
486 /**
487 * Get cluster for South Indian Thai-like scripts
488 * The cluster is rendered and treated together. It has
489 * a unicode and a separated memory representation.
490 * Memory representation is only used for fallback rendering.
491 * A cluster is
492 *
493 * a) Consonant + Top/Bottom/Right Sign [+ ...]
494 * b) Consonant + Nukta
495 * c) Consonant + Nukta + Top/Bottom/Right Sign [+ ...]
496 * d) Indep-Vowel + Top/Bottom Sign [+ ...]
497 *
498 * @param finished is set to 1 if exact match happens
499 * 0 is not yet finished
500 * -1 if illegal sequence start.
501 * It also sets yuditClusterError to an appropriate string.
502 */
503 static unsigned int
getSouthIndicCluster(unsigned int scriptcode,const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished)504 getSouthIndicCluster (unsigned int scriptcode,
505 const SV_UCS4& unicode, unsigned int index, SV_UCS4* ret, int* finished)
506 {
507 unsigned int usize = unicode.size();
508 unsigned int i;
509 if (finished) *finished = 1;
510 /* Some platforms have unsigned char */
511
512 char prevchartype = (char)0x7f; /* big enough */
513 SS_UCS4 nextLig = 0;
514 for (i=index;i<usize; i++)
515 {
516 SS_UCS4 next = unicode[i];
517 char chartype = (char) indic->encode (next);
518 unsigned int sc = getUnicodeScript (next);
519 if (sc!=scriptcode && next != 0x25cc && next != 0x200d && next != 0x200c)
520 {
521 if (ret->size()==0)
522 {
523 /* can not start with it */
524 if (finished) *finished=-1;
525 }
526 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
527 if (nextLig) ret->append (nextLig);
528 return i;
529 }
530 switch (chartype)
531 {
532 case SD_INDIC_INDEP_VOWEL:
533 ret->append (next);
534 if (i+1 < usize)
535 {
536 SS_UCS4 n = unicode[i+1];
537 char ct = (char) indic->encode (n);
538 if (ct != SD_INDIC_TOP_VOWEL && ct != SD_INDIC_BOTTOM_VOWEL)
539 {
540 if (ret->size()==1) return index;
541 nextLig = nextLigature (scriptcode,
542 &unicode.array()[index], i-index+1);
543 if (nextLig) ret->append (nextLig);
544 return i+1;
545 }
546 }
547 break;
548 case SD_INDIC_CONSONANT_BASE:
549 case SD_INDIC_CONSONANT_POST_BASE:
550 case SD_INDIC_CONSONANT_BELOW_BASE:
551 ret->append (next);
552 if (i+1 < usize)
553 {
554 SS_UCS4 n = unicode[i+1];
555 char ct = (char) indic->encode (n);
556 if (ct != SD_INDIC_NUKTA
557 && ct != SD_INDIC_RIGHT_VOWEL
558 && ct != SD_INDIC_TOP_VOWEL
559 && ct != SD_INDIC_BOTTOM_VOWEL)
560 {
561 if (ret->size()==1) return index;
562 nextLig = nextLigature (scriptcode,
563 &unicode.array()[index], i-index+1);
564 if (nextLig) ret->append (nextLig);
565 return i+1;
566 }
567 }
568 break;
569 case SD_INDIC_NUKTA:
570 if (ret->size()==0)
571 {
572 /* can not start with it */
573 if (finished) *finished=-1;
574 yuditClusterError = "Cluster should not start with a subjoined consonant.";
575 return index;
576 }
577 if (prevchartype != SD_INDIC_CONSONANT_BASE
578 && prevchartype != SD_INDIC_CONSONANT_POST_BASE
579 && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE)
580 {
581 yuditClusterError = "Subjoined consonant should be preceded by a full consonant.";
582 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
583 if (nextLig) ret->append (nextLig);
584 return i;
585 }
586 ret->append (next);
587 if (i+1 < usize)
588 {
589 SS_UCS4 n = unicode[i+1];
590 char ct = (char) indic->encode (n);
591 if (ct != SD_INDIC_RIGHT_VOWEL
592 && ct != SD_INDIC_TOP_VOWEL
593 && ct != SD_INDIC_BOTTOM_VOWEL)
594 {
595 if (ret->size()==1) return index;
596 nextLig = nextLigature (scriptcode,
597 &unicode.array()[index], i-index+1);
598 if (nextLig) ret->append (nextLig);
599 return i+1;
600 }
601 }
602 break;
603 case SD_INDIC_LEFT_VOWEL:
604 case SD_INDIC_RIGHT_VOWEL:
605 case SD_INDIC_TOP_VOWEL:
606 case SD_INDIC_BOTTOM_VOWEL:
607 if (ret->size()==0)
608 {
609 /* can not start with it */
610 if (finished) *finished=-1;
611 yuditClusterError = "Cluster should not start with a dependent wovel.";
612 return index;
613 }
614 if (prevchartype != SD_INDIC_INDEP_VOWEL
615 && prevchartype != SD_INDIC_CONSONANT_BASE
616 && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
617 && prevchartype != SD_INDIC_CONSONANT_POST_BASE
618 && prevchartype != SD_INDIC_NUKTA
619 && prevchartype != SD_INDIC_RIGHT_VOWEL
620 && prevchartype != SD_INDIC_TOP_VOWEL
621 && prevchartype != SD_INDIC_BOTTOM_VOWEL)
622 {
623 yuditClusterError = "Dependent sign should be preceded by another character.";
624 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
625 if (nextLig) ret->append (nextLig);
626 return i;
627 }
628 ret->append (next);
629 if (i+1 < usize)
630 {
631 SS_UCS4 n = unicode[i+1];
632 char ct = (char) indic->encode (n);
633 if (ct != SD_INDIC_RIGHT_VOWEL
634 && ct != SD_INDIC_TOP_VOWEL
635 && ct != SD_INDIC_BOTTOM_VOWEL)
636 {
637 if (ret->size()==1) return index;
638 nextLig = nextLigature (scriptcode,
639 &unicode.array()[index], i-index+1);
640 if (nextLig) ret->append (nextLig);
641 return i+1;
642 }
643 }
644 break;
645 case SD_INDIC_SIGN:
646 if (ret->size()==0)
647 {
648 /* can start with it */
649 // if (finished) *finished=-1;
650 return index;
651 }
652 ret->append (next);
653 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
654 if (nextLig) ret->append (nextLig);
655 return i+1;
656 default:
657 if (ret->size()==0)
658 {
659 if (finished) *finished=1;
660 return index;
661 }
662 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
663 if (nextLig) ret->append (nextLig);
664 return i;
665 }
666 prevchartype = chartype;
667 }
668 // fprintf (stderr, "TIBET index=%d\n", index);
669 if (finished) *finished = 0;
670 if (ret->size()>1)
671 {
672 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
673 if (nextLig) ret->append (nextLig);
674 return i;
675 }
676 ret->clear();
677 return index;
678 }
679
680 /**
681 * Get cluster for North Indian Devanagari-like scripts
682 * The cluster is rendered and treated together. It has
683 * a unicode and a seperated memory representation.
684 * Memory representation is only used for fallback rendering.
685 * A cluster is
686 * a) Consonant
687 * b) Consonant + Halant
688 * c) Consonant + Halant + ZWJ
689 * d) Consonant + Nukta + Halant
690 * e) Consonant + Nukta + Halant + ZWJ
691 * f) Independent Vowel
692 * g) Independent Vowel + Vowel
693 * h) [b|c|d|e]*
694 * i) [b|c|d|e]* a
695 * j) [b|c|d|e]* Vowel
696 * k) [a-i] ending with Modifier
697 * l) [a-i] ending with ZWNJ
698 * For bengali
699 * Consonant + ZWJ
700 * Halant + Consonant
701 * are also possible.
702 * @param scriptcode is one of the scripts (Hard-Coded)
703 * @return index if nothing was lifted off vector, return
704 * the number of unicode characters + index otherwise.
705 * append the output cluster to ret, last element is ligature
706 * code - if any.
707 * @param finished is set to 1 if exact match happens
708 * 0 is not yet finished
709 * -1 if illegal sequence start.
710 * It also sets yuditClusterError to an appripriate string.
711 */
712 static unsigned int
getIndicCluster(unsigned int scriptcode,const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished)713 getIndicCluster (unsigned int scriptcode,
714 const SV_UCS4& unicode, unsigned int index, SV_UCS4* ret, int* finished)
715 {
716 unsigned int usize = unicode.size();
717 unsigned int i;
718 if (finished) *finished = 1;
719 /* Some platforms have unsigned char */
720
721 char prevchartype = (char)0x7f; /* big enough */
722 SS_UCS4 nextLig = 0;
723 for (i=index;i<usize; i++)
724 {
725 SS_UCS4 next = unicode[i];
726 char chartype = (char) indic->encode (next);
727 //fprintf (stderr, "getIndicCluster=%u %d\n", next, chartype);
728 unsigned int sc = getUnicodeScript (next);
729 if (sc!=scriptcode && chartype != SD_INDIC_ZWNJ && chartype != SD_INDIC_ZWJ
730 && next != 0x25cc)
731 {
732 if (ret->size()==0)
733 {
734 /* can not start with it */
735 if (finished) *finished=-1;
736 }
737 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
738 if (nextLig) ret->append (nextLig);
739 return i;
740 }
741 switch (chartype)
742 {
743 case SD_INDIC_INDEP_VOWEL:
744 ret->append (next);
745 if (i+1 < usize)
746 {
747 SS_UCS4 n = unicode[i+1];
748 char ct = (char) indic->encode (n);
749 if (ct != SD_INDIC_BOTTOM_VOWEL
750 && ct != SD_INDIC_TOP_VOWEL
751 && ct != SD_INDIC_LEFT_VOWEL
752 && ct != SD_INDIC_LEFT_RIGHT_VOWEL
753 && ct != SD_INDIC_RIGHT_VOWEL
754 && ct != SD_INDIC_MODIFIER
755 && ct != SD_INDIC_HALANT)
756 {
757 if (ret->size()==1) return index;
758 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
759 if (nextLig) ret->append (nextLig);
760 return i+1;
761 }
762 }
763 break;
764 case SD_INDIC_LEFT_VOWEL:
765 if (ret->size()==0)
766 {
767 /* can not start with it */
768 if (finished) *finished=-1;
769 yuditClusterError = "Cluster should not start with dependent vowel.";
770 return index;
771 }
772 if (prevchartype != SD_INDIC_CONSONANT_BASE
773 && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
774 && prevchartype != SD_INDIC_CONSONANT_POST_BASE
775 && prevchartype != SD_INDIC_CONSONANT_DEAD
776 && prevchartype != SD_INDIC_HALANT
777 && prevchartype != SD_INDIC_NUKTA
778 && prevchartype != SD_INDIC_INDEP_VOWEL)
779 {
780 yuditClusterError = "Dependent vowel should be preceded by consonant, nukta, halant or independent vowel.";
781 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
782 if (nextLig) ret->append (nextLig);
783 return i;
784 }
785 if (scriptcode == SD_MALAYALAM)
786 ret->insert (ret->size()-1, next);
787 else ret->insert (0, next);
788 if (i+1 < usize)
789 {
790 SS_UCS4 n = unicode[i+1];
791 char ct = (char) indic->encode (n);
792 if (ct != SD_INDIC_MODIFIER)
793 {
794 nextLig = nextLigature (scriptcode, &unicode.array()[index],i-index+1);
795 if (nextLig) ret->append (nextLig);
796 return i+1;
797 }
798 }
799 break;
800 case SD_INDIC_LEFT_RIGHT_VOWEL:
801 if (ret->size()==0)
802 {
803 /* can not start with it */
804 if (finished) *finished=-1;
805 yuditClusterError = "Cluster should not start with dependent vowel.";
806 return index;
807 }
808 if (prevchartype != SD_INDIC_CONSONANT_BASE
809 && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
810 && prevchartype != SD_INDIC_CONSONANT_POST_BASE
811 && prevchartype != SD_INDIC_CONSONANT_DEAD
812 && prevchartype != SD_INDIC_HALANT
813 && prevchartype != SD_INDIC_NUKTA
814 && prevchartype != SD_INDIC_INDEP_VOWEL)
815 {
816 yuditClusterError = "Dependent vowel should be preceded by consonant, nukta, halant or independent vowel.";
817 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
818 if (nextLig) ret->append (nextLig);
819 return i;
820 }
821 /* this will be the fallback rendering */
822 {
823 SS_UCS4 l = getLRVowelLeft (next);
824 SS_UCS4 r = getLRVowelRight (next);
825 if (l && r)
826 {
827 if (scriptcode == SD_MALAYALAM)
828 ret->insert (ret->size()-1, l);
829 else ret->insert (0, l);
830 ret->append (r);
831 }
832 else
833 {
834 ret->append (next);
835 }
836 }
837 if (i+1 < usize)
838 {
839 SS_UCS4 n = unicode[i+1];
840 char ct = (char) indic->encode (n);
841 if (ct != SD_INDIC_MODIFIER)
842 {
843 nextLig = nextLigature (scriptcode, &unicode.array()[index],i-index+1);
844 if (nextLig) ret->append (nextLig);
845 return i+1;
846 }
847 }
848 break;
849 case SD_INDIC_MODIFIER:
850 if (ret->size()==0)
851 {
852 /* can not start with it */
853 yuditClusterError = "Cluster should not start with a modifier.";
854 if (finished) *finished=-1;
855 return index;
856 }
857 if ( prevchartype != SD_INDIC_INDEP_VOWEL
858 && prevchartype != SD_INDIC_TOP_VOWEL
859 && prevchartype != SD_INDIC_BOTTOM_VOWEL
860 && prevchartype != SD_INDIC_LEFT_VOWEL
861 && prevchartype != SD_INDIC_LEFT_RIGHT_VOWEL
862 && prevchartype != SD_INDIC_RIGHT_VOWEL
863 && prevchartype != SD_INDIC_CONSONANT_BASE
864 && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
865 && prevchartype != SD_INDIC_CONSONANT_POST_BASE
866 && prevchartype != SD_INDIC_CONSONANT_DEAD
867 && prevchartype != SD_INDIC_NUKTA)
868 {
869 nextLig = nextLigature (scriptcode,
870 &unicode.array()[index], i-index);
871 if (nextLig) ret->append (nextLig);
872 return i;
873 }
874 ret->append (next);
875 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
876 if (nextLig) ret->append (nextLig);
877 return i +1;
878
879 case SD_INDIC_SIGN:
880 if (ret->size()==0)
881 {
882 /* can start with it */
883 // if (finished) *finished=-1;
884 return index;
885 }
886 ret->append (next);
887 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
888 if (nextLig) ret->append (nextLig);
889 return i+1;
890
891 case SD_INDIC_RIGHT_VOWEL:
892 case SD_INDIC_TOP_VOWEL:
893 case SD_INDIC_BOTTOM_VOWEL:
894 if (ret->size()==0)
895 {
896 /* can not start with it */
897 yuditClusterError = "Cluster should not start with dependent vowel.";
898 if (finished) *finished=-1;
899 return index;
900 }
901 if (prevchartype != SD_INDIC_CONSONANT_BASE
902 && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
903 && prevchartype != SD_INDIC_CONSONANT_POST_BASE
904 && prevchartype != SD_INDIC_HALANT
905 && prevchartype != SD_INDIC_NUKTA
906 && prevchartype != SD_INDIC_CONSONANT_DEAD
907 && prevchartype != SD_INDIC_INDEP_VOWEL)
908 {
909 yuditClusterError = "Dependent vowel should be preceded by consonant, nukta, halant or independent vowel.";
910 nextLig = nextLigature (scriptcode,
911 &unicode.array()[index], i-index);
912 if (nextLig) ret->append (nextLig);
913 return i;
914 }
915 ret->append (next);
916 if (i+1 < usize)
917 {
918 SS_UCS4 n = unicode[i+1];
919 char ct = (char) indic->encode (n);
920 if (ct != SD_INDIC_MODIFIER)
921 {
922 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
923 if (nextLig) ret->append (nextLig);
924 return i +1;
925 }
926 }
927 break;
928 case SD_INDIC_CONSONANT_BASE:
929 case SD_INDIC_CONSONANT_BELOW_BASE:
930 case SD_INDIC_CONSONANT_POST_BASE:
931 if (ret->size() > 0 && prevchartype != SD_INDIC_HALANT
932 && prevchartype != SD_INDIC_ZWJ
933 && prevchartype != SD_INDIC_CONSONANT_DEAD)
934 {
935 yuditClusterError = "Consonant should be preceded by halant or nukta or ZWJ";
936 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
937 if (nextLig) ret->append (nextLig);
938 return i;
939 }
940 ret->append (next);
941 if (i+1 < usize)
942 {
943 SS_UCS4 n = unicode[i+1];
944 char ct = (char) indic->encode (n);
945 if (ct != SD_INDIC_HALANT
946 && ct != SD_INDIC_NUKTA
947 && ct != SD_INDIC_ZWNJ
948 && ct != SD_INDIC_ZWJ
949 && ct != SD_INDIC_MODIFIER
950 && ct != SD_INDIC_BOTTOM_VOWEL
951 && ct != SD_INDIC_TOP_VOWEL
952 && ct != SD_INDIC_LEFT_VOWEL
953 && ct != SD_INDIC_LEFT_RIGHT_VOWEL
954 && ct != SD_INDIC_CONSONANT_DEAD
955 && ct != SD_INDIC_RIGHT_VOWEL)
956 {
957 if (ret->size()==1) return index;
958 nextLig = nextLigature (scriptcode,&unicode.array()[index],i-index+1);
959 if (nextLig) ret->append (nextLig);
960 return i+1;
961 }
962 }
963 break;
964 case SD_INDIC_ZWNJ:
965 if (ret->size()==0)
966 {
967 /* can not start with it */
968 yuditClusterError = "Cluster can not start with a ZWNJ.";
969 if (finished) *finished=-1;
970 return index;
971 }
972 #if 0
973 if (prevchartype != SD_INDIC_HALANT)
974 {
975 yuditClusterError = "ZWNJ should be preceded by a halant.";
976 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
977 if (nextLig) ret->append (nextLig);
978 return i;
979 }
980 #endif
981 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
982 if (nextLig) ret->append (nextLig);
983 return i+1;
984 case SD_INDIC_NUKTA:
985 if (ret->size()==0)
986 {
987 /* can not start with it */
988 yuditClusterError = "Cluster can not start with a nukta.";
989 if (finished) *finished=-1;
990 return index;
991 }
992 if (prevchartype != SD_INDIC_CONSONANT_BASE
993 && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
994 && prevchartype != SD_INDIC_CONSONANT_DEAD
995 && prevchartype != SD_INDIC_CONSONANT_POST_BASE)
996 {
997 yuditClusterError = "Nukta should be preceded by a consonant.";
998 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
999 if (nextLig) ret->append (nextLig);
1000 return i;
1001 }
1002 ret->append (next);
1003 if (i+1 < usize)
1004 {
1005 SS_UCS4 n = unicode[i+1];
1006 char ct = (char) indic->encode (n);
1007 if (ct != SD_INDIC_HALANT
1008 && ct != SD_INDIC_MODIFIER
1009 && ct != SD_INDIC_BOTTOM_VOWEL
1010 && ct != SD_INDIC_TOP_VOWEL
1011 && ct != SD_INDIC_LEFT_VOWEL
1012 && ct != SD_INDIC_LEFT_RIGHT_VOWEL
1013 && ct != SD_INDIC_RIGHT_VOWEL)
1014 {
1015 if (ret->size()==1) return index;
1016 nextLig = nextLigature (scriptcode,&unicode.array()[index],i-index+1);
1017 if (nextLig) ret->append (nextLig);
1018 return i+1;
1019 }
1020 }
1021 break;
1022 case SD_INDIC_ZWJ:
1023 // Bengali can start with ZWJ - it needs a little work.
1024 #if 0
1025 if (ret->size()==0)
1026 {
1027 /* can not start with it */
1028 yuditClusterError = "Cluster can not start with a ZWJ.";
1029 if (finished) *finished=-1;
1030 return index;
1031 }
1032 if (prevchartype != SD_INDIC_HALANT)
1033 {
1034 yuditClusterError = "ZWJ should be preceded by a halant.";
1035 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
1036 if (nextLig) ret->append (nextLig);
1037 return i;
1038 }
1039 #endif
1040 ret->append (next);
1041 break;
1042 case SD_INDIC_HALANT:
1043 // Bengali can start with a halant - Yaphala
1044 if (next != 0x09cd && ret->size()==0)
1045 {
1046 /* can not start with it */
1047 yuditClusterError = "Cluster can not start with a halant.";
1048 if (finished) *finished=-1;
1049 return index;
1050 }
1051 if (next != 0x09cd
1052 && prevchartype != SD_INDIC_INDEP_VOWEL
1053 && prevchartype != SD_INDIC_CONSONANT_BASE
1054 && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
1055 && prevchartype != SD_INDIC_CONSONANT_POST_BASE
1056 && prevchartype != SD_INDIC_NUKTA)
1057 {
1058 yuditClusterError = "Halant should be preceded by an independent vowel, a consonant or nukta.";
1059 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
1060 if (nextLig) ret->append (nextLig);
1061 return i;
1062 }
1063 ret->append (next);
1064 break;
1065 case SD_INDIC_CONSONANT_DEAD:
1066 // Finish the cluster - I dont know any better solution.
1067 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
1068 if (nextLig) ret->append (nextLig);
1069 return i+1;
1070 default:
1071 if (ret->size()==0)
1072 {
1073 if (finished) *finished=1;
1074 return index;
1075 }
1076 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
1077 if (nextLig) ret->append (nextLig);
1078 return i;
1079 break;
1080 }
1081 prevchartype = chartype;
1082 }
1083 if (finished) *finished = 0;
1084 if (ret->size()>1)
1085 {
1086 nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
1087 if (nextLig) ret->append (nextLig);
1088 return i;
1089 }
1090 ret->clear();
1091 return index;
1092 }
1093
1094
1095 /**
1096 * Generate a next ligature number if it still does not exist
1097 */
nextLigature(unsigned int script,const SS_UCS4 * unicode,unsigned int length)1098 static SS_UCS4 nextLigature (unsigned int script,
1099 const SS_UCS4* unicode, unsigned int length)
1100 {
1101 initLigatures ();
1102 if (length<2) return 0;
1103
1104 SString key = SString((char*)unicode, sizeof (SS_UCS4) * length);
1105 const SString* cac = ligatureCache->get (key);
1106 SS_UCS4 liga;
1107 if (cac && cac->size()==sizeof (SS_UCS4))
1108 {
1109 liga = *(SS_UCS4*) (cac->array());
1110 return liga;
1111 }
1112 liga = counters[script];
1113
1114 /* check overflow */
1115 if ((liga & 0xffff) == 0xffff) return 0;
1116 liga++;
1117 counters[script] = liga;
1118 /* FIXME: check overflow */
1119 SString vle = SString((char*)&liga, sizeof (SS_UCS4));
1120 ligatureCache->put (key, vle);
1121 //fprintf (stderr, "New Ligature[%d]=%X\n", script, liga);
1122 return liga;
1123 }
1124
1125 int
getUnicodeScript(SS_UCS4 comp)1126 getUnicodeScript (SS_UCS4 comp)
1127 {
1128 /* TONE LETTERS */
1129 switch (comp)
1130 {
1131 case 0x304B: return SD_YUDIT;
1132 case 0x304D: return SD_YUDIT;
1133 case 0x304F: return SD_YUDIT;
1134 case 0x3051: return SD_YUDIT;
1135 case 0x3053: return SD_YUDIT;
1136 case 0x30AB: return SD_YUDIT;
1137 case 0x30AD: return SD_YUDIT;
1138 case 0x30AF: return SD_YUDIT;
1139 case 0x30B1: return SD_YUDIT;
1140 case 0x30B3: return SD_YUDIT;
1141 case 0x30BB: return SD_YUDIT;
1142 case 0x30C4: return SD_YUDIT;
1143 case 0x30C8: return SD_YUDIT;
1144 case 0x31F7: return SD_YUDIT;
1145 case 0x00E6: return SD_YUDIT;
1146 case 0x0254: return SD_YUDIT;
1147 case 0x028C: return SD_YUDIT;
1148 case 0x0259: return SD_YUDIT;
1149 case 0x025A: return SD_YUDIT;
1150 default: break;
1151 }
1152 if (comp >= 0x02E5 && comp <= 0x02E9) return SD_YUDIT;
1153 if (getJamoClass (comp)>0) return SD_HANGUL_JAMO;
1154
1155 if (comp >= 0x1f1e6 && comp <= 0x1f1ff) {
1156 return SD_REGIONAL_INDICATOR_SYMBOL;
1157 }
1158
1159 if (comp >= 0x1000)
1160 {
1161 if (getRovasType (comp) == 1)
1162 {
1163 return SD_ROVASIRAS;
1164 }
1165 if (getPUARovasType (comp) == 1)
1166 {
1167 return SD_PUA_ROVAS;
1168 }
1169 return -1;
1170 }
1171
1172 if (comp < 0x0900 ) return -1;
1173 if (comp < 0x0980) return SD_DEVANAGARI;
1174 if (comp < 0x0A00) return SD_BENGALI;
1175 if (comp < 0x0A80) return SD_GURMUKHI;
1176 if (comp < 0x0B00) return SD_GUJARATI;
1177 if (comp < 0x0B80) return SD_ORIYA;
1178 if (comp < 0x0C00) return SD_TAMIL;
1179 if (comp < 0x0C80) return SD_TELUGU;
1180 if (comp < 0x0D00) return SD_KANNADA;
1181 if (comp < 0x0D80) return SD_MALAYALAM;
1182 if (comp < 0x0E00) return SD_SINHALA;
1183 if (comp < 0x0E80) return SD_THAI;
1184 if (comp < 0x0F00) return SD_LAO;
1185 if (comp < 0x0FFF) return SD_TIBETAN;
1186 return -1;
1187 }
1188 /**
1189 * return true if this is covered
1190 */
1191 bool
isCoveredScipt(SS_UCS4 comp,int sc)1192 isCoveredScipt (SS_UCS4 comp, int sc)
1193 {
1194 switch (sc)
1195 {
1196 case SD_YUDIT: return false;
1197 case SD_REGIONAL_INDICATOR_SYMBOL: return (comp>=0x1f1e6 && comp<=0x1f1ff);
1198 case SD_DEVANAGARI: return (comp>=0x0900 && comp<0x0980);
1199 case SD_BENGALI: return (comp>=0x0980 && comp<0x0a00);
1200 case SD_BENGALI_BEGIN: return (comp>=0x0980 && comp<0x0a00);
1201 case SD_GURMUKHI: return (comp>=0x0a00 && comp<0x0a80);
1202 case SD_GUJARATI: return (comp>=0x0a80 && comp<0x0b00);
1203 case SD_ORIYA: return (comp>=0x0b00 && comp<0x0b80);
1204 case SD_TAMIL: return (comp>=0x0b80 && comp<0x0c00);
1205 case SD_TELUGU: return (comp>=0x0c00 && comp<0x0c80);
1206 case SD_KANNADA: return (comp>=0x0c80 && comp<0x0d00);
1207 case SD_MALAYALAM: return (comp>=0x0d00 && comp<0x0d80);
1208 case SD_SINHALA: return (comp>=0x0d80 && comp<0x0e00);
1209 case SD_THAI: return (comp>=0x0e00 && comp<0x0e80);
1210 case SD_LAO: return (comp>=0x0e80 && comp<0x0f00);
1211 case SD_TIBETAN: return (comp>=0x0f00 && comp<0x0fff);
1212 case SD_HANGUL_JAMO: return (getJamoClass(comp) != 0);
1213 case SD_HANGUL_PREC: return (getJamoClass(comp) != 0);
1214 }
1215 return false;
1216 }
1217
1218
1219 /**
1220 * Add combining ligature. A combining ligature is a ligature
1221 * with combining marks. The ligature can be a unicode or
1222 8 Yudit ligature.
1223 * @param unicode is the unicode representation of the while thing
1224 * @param ul is the unicode repr. length
1225 * @param ligAndMarks contains one ligature + all the marks to it.
1226 * @param cl is the length of ligAndMarks.
1227 */
1228 SS_UCS4
addCombiningLigature(const SS_UCS4 * unicode,unsigned int ul,const SS_UCS4 * ligAndMarks,unsigned int cl)1229 addCombiningLigature (const SS_UCS4* unicode, unsigned int ul,
1230 const SS_UCS4* ligAndMarks, unsigned int cl)
1231 {
1232 SS_UCS4 nl = nextLigature (SD_COMBINING_LIGATURE, unicode, ul);
1233 const SString* found = ligatureUnics->get (
1234 SString((char*) &nl, sizeof (SS_UCS4)));
1235 if (found == 0)
1236 {
1237 putLigatureUnicode (nl, unicode, ul);
1238 putLigatureCluster (nl, ligAndMarks, cl);
1239 }
1240 return nl;
1241 }
1242
1243 /**
1244 * Put ligature away to remember
1245 */
1246 void
putLigatureUnicode(SS_UCS4 ligature,const SS_UCS4 * buffer,unsigned int bufsize)1247 putLigatureUnicode (SS_UCS4 ligature, const SS_UCS4* buffer, unsigned int bufsize)
1248 {
1249 if (ligature <= 0x80000000 || ligature >= 0xA0000000) return;
1250 initLigatures();
1251 SString key ((char*)& ligature, sizeof (SS_UCS4));
1252 const SString* ret = ligatureUnics->get (key);
1253 if (ret) return; /* already there */
1254 ligatureUnics->put (key, SString((char*)buffer, bufsize * sizeof (SS_UCS4)));
1255 }
1256
1257 /**
1258 * Put ligature away to remember
1259 */
1260 void
putLigatureCluster(SS_UCS4 ligature,const SS_UCS4 * buffer,unsigned int bufsize)1261 putLigatureCluster (SS_UCS4 ligature, const SS_UCS4* buffer, unsigned int bufsize)
1262 {
1263 if (ligature <= 0x80000000 || ligature >= 0xA0000000) return;
1264 initLigatures ();
1265 SString key ((char*)& ligature, sizeof (SS_UCS4));
1266 const SString* ret = ligatureClust->get (key);
1267 if (ret) return; /* already there */
1268 ligatureClust->put (key, SString((char*)buffer, bufsize * sizeof (SS_UCS4)));
1269 }
1270
1271 unsigned int
getLigatureUnicode(SS_UCS4 lig,SS_UCS4 * buffer)1272 getLigatureUnicode (SS_UCS4 lig, SS_UCS4* buffer)
1273 {
1274 SS_UCS4 ligature = lig;
1275 int sc = getLigatureScriptCode(ligature);
1276 //
1277 // SD_BENGALI_BEGIN is an artificial shape-code.
1278 //
1279 if (sc == SD_BENGALI_BEGIN)
1280 {
1281 unsigned int en = (SD_BENGALI << 16) | 0x80000000;
1282 ligature = (ligature & 0xffff) | en;
1283 }
1284 if (ligatureUnics == 0) return 0;
1285 const SString* ret = ligatureUnics->get (
1286 SString((char*) &ligature, sizeof (SS_UCS4)));
1287 if (ret==0) return 0;
1288 if (buffer==0) return ret->size()/sizeof (SS_UCS4);
1289 memcpy (buffer, ret->array(), ret->size());
1290 return ret->size()/sizeof (SS_UCS4);
1291 }
1292
1293 unsigned int
getLigatureCluster(SS_UCS4 lig,SS_UCS4 * buffer)1294 getLigatureCluster (SS_UCS4 lig, SS_UCS4* buffer)
1295 {
1296 SS_UCS4 ligature = lig;
1297 int sc = getLigatureScriptCode(ligature);
1298 //
1299 // SD_BENGALI_BEGIN is an artificial shape-code.
1300 //
1301 if (sc == SD_BENGALI_BEGIN)
1302 {
1303 unsigned int en = (SD_BENGALI << 16) | 0x80000000;
1304 ligature = (ligature & 0xffff) | en;
1305 }
1306 if (ligatureClust == 0) return 0;
1307 const SString* ret = ligatureClust->get (
1308 SString((char*) &ligature, sizeof (SS_UCS4)));
1309 if (ret==0) return 0;
1310 if (buffer==0) return ret->size()/sizeof (SS_UCS4);
1311 memcpy (buffer, ret->array(), ret->size());
1312 return ret->size()/sizeof (SS_UCS4);
1313 }
1314
1315 static void
initLigatures()1316 initLigatures()
1317 {
1318 if (ligatureUnics == 0)
1319 {
1320 clusters = new SUniMap("cluster");
1321 CHECK_NEW (clusters);
1322 indic = new SUniMap("indic");
1323 CHECK_NEW (indic);
1324
1325 ligatureUnics = new SProperties();
1326 CHECK_NEW (ligatureUnics);
1327 ligatureClust = new SProperties();
1328 CHECK_NEW (ligatureClust);
1329 ligatureCache = new SProperties();
1330 CHECK_NEW (ligatureCache);
1331 for (unsigned int i=0; i<SD_SCRIPT_MAX; i++)
1332 {
1333 counters[i] = 0x80000000 + (0x10000 * i);
1334 }
1335 }
1336 }
1337
1338 int
getLigatureScriptCode(SS_UCS4 comp)1339 getLigatureScriptCode (SS_UCS4 comp)
1340 {
1341 if (comp < 0x80000000) return -1;
1342 SS_UCS4 en = comp & 0x7fff0000;
1343 en = en >> 16;
1344 return (int) en;
1345 }
1346
1347 /* get script name or null */
1348 const char*
getLigatureScript(SS_UCS4 comp)1349 getLigatureScript (SS_UCS4 comp)
1350 {
1351 if (comp <= 0x80000000 || comp >= 0xA0000000) return 0;
1352 SS_UCS4 en = comp & 0x7fff0000;
1353 en = en >> 16;
1354 /* I modified this to return Script name as in MS Opentype spec.*/
1355 switch (en)
1356 {
1357 case SD_YUDIT: return "yudit";
1358 case SD_DEVANAGARI: return "deva";
1359 case SD_BENGALI: return "beng";
1360 case SD_BENGALI_BEGIN: return "beng";
1361 case SD_GURMUKHI: return "guru";
1362 case SD_GUJARATI: return "gujr";
1363 case SD_ORIYA: return "orya";
1364 case SD_TAMIL: return "taml";
1365 case SD_TELUGU: return "telu";
1366 case SD_KANNADA: return "knda";
1367 case SD_MALAYALAM: return "mlym";
1368 case SD_SINHALA: return "sinh";
1369 case SD_HANGUL_JAMO: return "jamo";
1370 case SD_HANGUL_PREC: return "hang";
1371 case SD_THAI: return "thai";
1372 case SD_LAO: return "lao ";
1373 case SD_TIBETAN: return "tibt";
1374 case SD_ROVASIRAS: return "rovs";
1375 case SD_PUA_ROVAS: return "prvs";
1376 case SD_REGIONAL_INDICATOR_SYMBOL: return "flag";
1377 }
1378 return 0;
1379 }
1380
1381 bool
isLigature(SS_UCS4 _comp)1382 isLigature (SS_UCS4 _comp)
1383 {
1384 /* Yudit ligatures below 0x80008000 are considered hacked glyphs only */
1385 return (_comp >= 0x80008000 && _comp > 0x80000000 && _comp < 0xA0000000);
1386 }
1387
1388 SS_UCS4
getHalant(int index)1389 getHalant (int index)
1390 {
1391 switch (index)
1392 {
1393 case SD_DEVANAGARI:
1394 return 0x094D;
1395 case SD_BENGALI:
1396 return 0x09CD;
1397 case SD_BENGALI_BEGIN:
1398 return 0x09CD;
1399 case SD_GURMUKHI:
1400 return 0x0A4D;
1401 case SD_GUJARATI:
1402 return 0x0ACD;
1403 case SD_ORIYA:
1404 return 0x0B4D;
1405 case SD_TELUGU:
1406 return 0x0C4D;
1407 case SD_KANNADA:
1408 return 0x0CCD;
1409 case SD_MALAYALAM:
1410 return 0x0D4D;
1411 case SD_SINHALA:
1412 return 0x0DCD;
1413 default:
1414 return 0;
1415 }
1416 return 0;
1417 }
1418
getCharType(SS_UCS4 unchar)1419 int getCharType (SS_UCS4 unchar)
1420 {
1421 initLigatures();
1422 char echartype = (char) indic->encode (unchar);
1423 return (int) echartype;
1424 }
1425
1426 /**
1427 * get left part of LR vowel
1428 */
1429 SS_UCS4
getLRVowelLeft(SS_UCS4 u)1430 getLRVowelLeft (SS_UCS4 u)
1431 {
1432 switch (u)
1433 {
1434 case 0x09CB:
1435 case 0x09CC:
1436 return 0x09c7;
1437 case 0x0b4b:
1438 case 0x0b4c:
1439 return 0x0b47;
1440 case 0x0d4b:
1441 return 0x0d47;
1442 case 0x0d4a:
1443 case 0x0d4c:
1444 return 0x0d46;
1445 default:
1446 break;
1447 }
1448 return 0;
1449 }
1450 /**
1451 * get right part of LR vowel
1452 */
1453 SS_UCS4
getLRVowelRight(SS_UCS4 u)1454 getLRVowelRight (SS_UCS4 u)
1455 {
1456 switch (u)
1457 {
1458 case 0x09CB:
1459 return 0x09be;
1460 case 0x09CC:
1461 return 0x09d7;
1462 case 0x0b4b:
1463 return 0x0b3e;
1464 case 0x0b4c:
1465 return 0x0b57;
1466 case 0x0d4a:
1467 case 0x0d4b:
1468 return 0x0d3e;
1469 case 0x0d4c:
1470 return 0x0d57;
1471 default:
1472 break;
1473 }
1474 return 0;
1475 }
1476
1477 /**
1478 * Decompose yudit ligature into unicode characters
1479 */
1480 void
expandYuditLigatures(SV_UCS4 * decd)1481 expandYuditLigatures (SV_UCS4* decd)
1482 {
1483 if (decd->size()!=1 || (*decd)[0] < 0x80000000) return;
1484 SS_UCS4 ucs4 = (*decd)[0];
1485 decd->remove (0);
1486 /* Yudit ligatures*/
1487 switch (ucs4)
1488 {
1489 case 0x80000010: /* JIS X 0213: 02B65 */
1490 decd->append (0x02E9);
1491 decd->append (0x02E5);
1492 break;
1493 case 0x80000011: /* JIS X 0213: 02B66 */
1494 decd->append (0x02E5);
1495 decd->append (0x02E9);
1496 break;
1497 // Generated by ./jiscompose.pl at 2002-04-15
1498 // Add this to stoolkit/SCluster.cpp expandYuditLigatures
1499 case 0x80000040: /* JIS X 0213: 0x2477 */
1500 decd->append (0x304B);
1501 decd->append (0x309A);
1502 break;
1503 case 0x80000041: /* JIS X 0213: 0x2478 */
1504 decd->append (0x304D);
1505 decd->append (0x309A);
1506 break;
1507 case 0x80000042: /* JIS X 0213: 0x2479 */
1508 decd->append (0x304F);
1509 decd->append (0x309A);
1510 break;
1511 case 0x80000043: /* JIS X 0213: 0x247A */
1512 decd->append (0x3051);
1513 decd->append (0x309A);
1514 break;
1515 case 0x80000044: /* JIS X 0213: 0x247B */
1516 decd->append (0x3053);
1517 decd->append (0x309A);
1518 break;
1519 case 0x80000045: /* JIS X 0213: 0x2577 */
1520 decd->append (0x30AB);
1521 decd->append (0x309A);
1522 break;
1523 case 0x80000046: /* JIS X 0213: 0x2578 */
1524 decd->append (0x30AD);
1525 decd->append (0x309A);
1526 break;
1527 case 0x80000047: /* JIS X 0213: 0x2579 */
1528 decd->append (0x30AF);
1529 decd->append (0x309A);
1530 break;
1531 case 0x80000048: /* JIS X 0213: 0x257A */
1532 decd->append (0x30B1);
1533 decd->append (0x309A);
1534 break;
1535 case 0x80000049: /* JIS X 0213: 0x257B */
1536 decd->append (0x30B3);
1537 decd->append (0x309A);
1538 break;
1539 case 0x8000004A: /* JIS X 0213: 0x257C */
1540 decd->append (0x30BB);
1541 decd->append (0x309A);
1542 break;
1543 case 0x8000004B: /* JIS X 0213: 0x257D */
1544 decd->append (0x30C4);
1545 decd->append (0x309A);
1546 break;
1547 case 0x8000004C: /* JIS X 0213: 0x257E */
1548 decd->append (0x30C8);
1549 decd->append (0x309A);
1550 break;
1551 case 0x8000004D: /* JIS X 0213: 0x2678 */
1552 decd->append (0x31F7);
1553 decd->append (0x309A);
1554 break;
1555 case 0x8000004E: /* JIS X 0213: 0x2B44 */
1556 decd->append (0x00E6);
1557 decd->append (0x0300);
1558 break;
1559 case 0x8000004F: /* JIS X 0213: 0x2B48 */
1560 decd->append (0x0254);
1561 decd->append (0x0300);
1562 break;
1563 case 0x80000050: /* JIS X 0213: 0x2B49 */
1564 decd->append (0x0254);
1565 decd->append (0x0301);
1566 break;
1567 case 0x80000051: /* JIS X 0213: 0x2B4A */
1568 decd->append (0x028C);
1569 decd->append (0x0300);
1570 break;
1571 case 0x80000052: /* JIS X 0213: 0x2B4B */
1572 decd->append (0x028C);
1573 decd->append (0x0301);
1574 break;
1575 case 0x80000053: /* JIS X 0213: 0x2B4C */
1576 decd->append (0x0259);
1577 decd->append (0x0300);
1578 break;
1579 case 0x80000054: /* JIS X 0213: 0x2B4D */
1580 decd->append (0x0259);
1581 decd->append (0x0301);
1582 break;
1583 case 0x80000055: /* JIS X 0213: 0x2B4E */
1584 decd->append (0x025A);
1585 decd->append (0x0300);
1586 break;
1587 case 0x80000056: /* JIS X 0213: 0x2B4F */
1588 decd->append (0x025A);
1589 decd->append (0x0301);
1590 break;
1591 // END OF ./jiscompose.pl
1592 default:
1593 break;
1594 }
1595 if (decd->size()==0) decd->append(0xfffd);
1596 return;
1597 }
1598
1599 /**
1600 * Get the Jamo class
1601 * @param ucs is the unicode character
1602 * @return one of
1603 * <ul>
1604 * <li> SD_JAMO_X </li>
1605 * <li> SD_JAMO_L </li>
1606 * <li> SD_JAMO_V </li>
1607 * <li> SD_JAMO_T </li>
1608 * </ul>
1609 */
1610 int
getJamoClass(SS_UCS4 uc)1611 getJamoClass (SS_UCS4 uc)
1612 {
1613 if (uc >= 0x1100 && uc <= 0x115f) return SD_JAMO_L;
1614 if (uc >= 0x1160 && uc <= 0x11a2) return SD_JAMO_V;
1615 if (uc >= 0x11a8 && uc <= 0x11f9) return SD_JAMO_T;
1616 return SD_JAMO_X;
1617 }
1618
/**
 * Get the name of the OTF font shaping feature of a shape code.
 * @param icode is the shape code, an index into the name table.
 * @return the feature name, or "unknown" if icode is past the
 *  table. Note that both code 1 and code 7 give "init".
 */
const char*
getShapeCode (unsigned int icode)
{
  static const char* shapes[] = {
    "isol", "init", "medi", "fina",
    "med2", "fin2", "fin3", "init",
  };
  const unsigned int count = sizeof (shapes) / sizeof (shapes[0]);
  return (icode < count) ? shapes[icode] : "unknown";
}
1636