1 /**
2  *  Yudit Unicode Editor Source File
3  *
4  *  GNU Copyright (C) 1997-2006  Gaspar Sinai <gaspar@yudit.org>
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License, version 2,
8  *  dated June 1991. See file COPYING for details.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program; if not, write to the Free Software
17  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18  */
19 #include "stoolkit/SCluster.h"
20 #include "stoolkit/SProperties.h"
21 #include "stoolkit/SUniMap.h"
22 
23 
24 static unsigned int
25 getRISCluster (const SV_UCS4& unicode, unsigned int index,
26   SV_UCS4* ret, int* finished);
27 
28 static unsigned int
29 getRovasCluster (const SV_UCS4& unicode, unsigned int index,
30   SV_UCS4* ret, int* finished, bool isPUA);
31 
32 static unsigned int
33 getJamoCluster (const SV_UCS4& unicode, unsigned int index,
34   SV_UCS4* ret, int* finished);
35 
36 static SS_UCS4 precomposJamos(SV_UCS4* jamo);
37 
38 static unsigned int
39 getSouthIndicCluster (unsigned int scriptcode,
40   const SV_UCS4& unicode,
41   unsigned int index, SV_UCS4* ret, int* finished);
42 
43 static unsigned int
44 getIndicCluster (unsigned int scriptcode,
45   const SV_UCS4& unicode,
46   unsigned int index, SV_UCS4* ret, int* finished);
47 
48 static SUniMap* clusters;
49 static SUniMap* indic;
50 static SProperties*  ligatureUnics;
51 static SProperties*  ligatureClust;
52 
53 static SProperties*  ligatureCache;
54 static SS_UCS4  counters[SD_SCRIPT_MAX];
55 
56 static SS_UCS4 nextLigature (unsigned int script,
57    const SS_UCS4* unicode, unsigned int length);
58 
59 static void initLigatures();
60 
61 SString yuditClusterError;
62 
63 /**
64  * Try to form a cluster - an abstract glyphs that can
65  * be broken apart once made. It can be rendered by
66  * a font that contains glyphs and ligatureUnics by subdividing
67  * the cluster. The cluster is in memory order -
68  * vowels are place on the appropriate side.
69  *
70  * Clusters will replace the current SGlyph architecture.
71  * All new things should be added here.
72  *
73  * 2002-04-03 - added surrogate clusters.
74  *
75  * @param ucs4 is the input vector.
76  * @param i is the index in this vector - next character.
77  * @param finished is set to 0 if more data is needed
78  *  this parameter can be null.
79  * @return the new index in ucs4.
80  */
81 unsigned int
getCluster(const SV_UCS4 & ucs4,unsigned int index,SV_UCS4 * retchar,int * finished)82 getCluster (const SV_UCS4& ucs4,
83    unsigned int index, SV_UCS4* retchar, int *finished)
84 {
85   if (finished) *finished = -1;
86 
87   /* pack surrogates into a cluster
88      - no combining marks on surrogates for the time being. */
89   if (ucs4[index] >= 0xd800 && ucs4[index] <= 0xdbff)
90   {
91     if (ucs4.size() < index+2)
92     {
93        if (finished) *finished = 0;
94        retchar->append (ucs4[index]);
95        return index + 1;
96     }
97     if (ucs4[index+1] >= 0xdc00 && ucs4[index+1] <= 0xdfff)
98     {
99        retchar->append (((ucs4[index] & 0x3ff)<< 10)
100            + (ucs4[index+1]&0x3ff) + 0x10000);
101        if (finished) *finished = 1;
102        return index+2;
103     }
104     return index;
105   }
106   /* start the game */
107   initLigatures();
108 
109   /* Should be able to start with ZWJ */
110   int scriptcode = (
111       (ucs4[index] == 0x200D || ucs4[index] == 0x25CC) && index+1 < ucs4.size())
112       ?  getUnicodeScript (ucs4[index+1]) : getUnicodeScript (ucs4[index]);
113 
114   if (scriptcode < 0) return index;
115   unsigned int ret = index;
116   yuditClusterError.clear();
117   switch (scriptcode)
118   {
119   case SD_DEVANAGARI:
120   case SD_BENGALI:
121   case SD_GURMUKHI:
122   case SD_GUJARATI:
123   case SD_ORIYA:
124   case SD_KANNADA:
125   case SD_MALAYALAM:
126   case SD_SINHALA:
127   case SD_TELUGU:
128     if (!indic->isOK()) break;;
129     ret = getIndicCluster (
130          (unsigned int)scriptcode, ucs4, index, retchar, finished);
131     break;
132   case SD_HANGUL_JAMO:
133     ret = getJamoCluster (ucs4, index, retchar, finished);
134     break;
135   case SD_TIBETAN:
136   case SD_THAI:
137   case SD_LAO:
138     ret = getSouthIndicCluster ((unsigned int)scriptcode,
139           ucs4, index, retchar, finished);
140     //if (ret>0) fprintf (stderr, "TIBET Tibetan: %d\n", ret-index);
141     break;
142   case SD_TAMIL:
143   case SD_YUDIT:
144     if (!clusters->isOK()) break;
145     ret = clusters->lift (ucs4, index, true, retchar);
146     break;
147   case SD_ROVASIRAS:
148     ret = getRovasCluster (ucs4, index, retchar, finished, false);
149     break;
150   case SD_PUA_ROVAS:
151     ret = getRovasCluster (ucs4, index, retchar, finished, true);
152     break;
153   case SD_REGIONAL_INDICATOR_SYMBOL:
154     ret = getRISCluster (ucs4, index, retchar, finished);
155     break;
156   }
157   if (finished==0 && yuditClusterError.size())
158   {
159     // If you want to debug things uncomment this.
160     //fprintf (stderr, "SCluster.cpp:%*.*s\n", SSARGS(yuditClusterError));
161   }
162   return ret;
163 }
164 
165 /**
166  * -1 non rovas.
167  * 1 rovas basic
168  * 2 rovas liga
169  * 3 rovas yudit cluster
170  * 0 ZWJ
171  */
getRovasType(SS_UCS4 chr)172 int getRovasType (SS_UCS4 chr)
173 {
174     if (chr == 0x200d) return 0;
175     if (chr >= 0x10c80 && chr <= 0x10cff)
176     {
177         return 1;
178     }
179     if (getLigatureScriptCode(chr) == SD_ROVASIRAS) return 3;
180     return -1;
181 }
182 
183 /**
184  * -1 non rovas.
185  * 1 rovas basic
186  * 2 rovas liga
187  * 3 rovas yudit cluster
188  * 0 ZWJ
189  */
getPUARovasType(SS_UCS4 chr)190 int getPUARovasType (SS_UCS4 chr)
191 {
192     if (chr == 0x200d) return 0;
193     if (chr >= 0xee00 && chr <= 0xee29)
194     {
195         return 1;
196     }
197     if (chr >= 0xee30 && chr <= 0xee8b)
198     {
199         return 2;
200     }
201     if (getLigatureScriptCode(chr) == SD_PUA_ROVAS) return 3;
202     return -1;
203 }
204 
205 static unsigned int
getRovasCluster(const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished,bool isPUA)206 getRovasCluster (const SV_UCS4& unicode, unsigned int index,
207   SV_UCS4* ret, int* finished, bool isPUA) {
208 
209   unsigned int usize = unicode.size();
210   if (index>=usize) return index;
211 
212   /* set it to finished - this routine would not be called to other scirpts  */
213   if (finished) *finished = 1;
214   /* Some platforms have unsigned char */
215   int prevchartype = 0;
216 
217   SS_UCS4 nextLig = 0;
218   unsigned int i;
219   int ligatureType = isPUA ? SD_PUA_ROVAS : SD_ROVASIRAS;
220   for (i=index;i<usize; i++)
221   {
222     SS_UCS4 next = unicode[i];
223     int chartype = isPUA ? getPUARovasType (next) : getRovasType (next);
224     switch (chartype)
225     {
226     case 0:
227       if (prevchartype < 1)
228       {
229         if (i > index+1)
230         {
231           nextLig = nextLigature (ligatureType,
232               &unicode.array()[index], i-index);
233           if (nextLig) ret->append (nextLig);
234           return i;
235         }
236         ret->clear();
237         return index;
238       }
239       ret->append (next);
240       break;
241     case 1:
242     case 2:
243       if (prevchartype != 0)
244       {
245         if (i > index+1)
246         {
247           nextLig = nextLigature (ligatureType,
248                 &unicode.array()[index], i-index);
249           if (nextLig) ret->append (nextLig);
250           return i;
251         }
252         ret->clear();
253         return index;
254       }
255       ret->append (next);
256       break;
257     case -1:
258       if (i > index+1)
259       {
260         nextLig = nextLigature (ligatureType,
261             &unicode.array()[index], i-index);
262         if (nextLig) ret->append (nextLig);
263         return i;
264       }
265       ret->clear();
266       return index;
267     }
268     prevchartype = chartype;
269   }
270   /* Not yet finished. Return unfinished cluster */
271   if (finished) *finished = 0;
272   if (ret->size() > 1)
273   {
274      nextLig = nextLigature (ligatureType,
275           &unicode.array()[index], i-index);
276      if (nextLig) ret->append (nextLig);
277      return i;
278   }
279   ret->clear();
280   return index;
281 }
282 
283 static unsigned int
getRISCluster(const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished)284 getRISCluster (const SV_UCS4& unicode, unsigned int index,
285   SV_UCS4* ret, int* finished) {
286 
287   unsigned int usize = unicode.size();
288   if (index>=usize) return index;
289 
290   /* set it to finished - this routine would not be called to other scirpts  */
291   if (finished) *finished = 1;
292 
293   SS_UCS4 nextLig = 0;
294   unsigned int i;
295   int ligatureType = SD_REGIONAL_INDICATOR_SYMBOL;
296   for (i=index;i<index+2 && i<usize; i++)
297   {
298     SS_UCS4 next = unicode[i];
299     if (getUnicodeScript(next) != SD_REGIONAL_INDICATOR_SYMBOL) {
300         if (finished) *finished = 0;
301         break;
302     }
303     ret->append (next - 0x1f1e6 + (int) 'A');
304   }
305   if (ret->size() > 1) {
306      nextLig = nextLigature (ligatureType,
307           &unicode.array()[index], i-index);
308      if (nextLig) ret->append (nextLig);
309      return i;
310   }
311   if (finished) *finished = 1;
312   ret->clear();
313   return index;
314 }
315 
316 /**
317  * Create a JAMO Cluster as of Unicode 3.0 Chapter 3.11.
318  * 1. L.X V.X T.X X.L X.V X.T
319  * 2. T.L
320  * 3. V.L
321  * 4. T.V
322  * In short:  Cluster=L*V*T*
323  * Asterisk means: one or more.
324  * @param finished is set to 1 if exact match happens
325  *                           0 is not yet finished
326  *                          -1 if illegal sequence start.
327  */
328 static unsigned int
getJamoCluster(const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished)329 getJamoCluster (const SV_UCS4& unicode, unsigned int index,
330   SV_UCS4* ret, int* finished)
331 {
332 
333   unsigned int usize = unicode.size();
334   if (index>=usize) return index;
335 
336 
337   /* set it to finished - this routine would not be called to other scirpts  */
338   if (finished) *finished = -1;
339   /* Some platforms have unsigned char */
340   int prevchartype = getJamoClass (unicode[index]);
341 
342   SS_UCS4 nextLig = 0;
343   unsigned int i;
344   for (i=index;i<usize; i++)
345   {
346     SS_UCS4 next = unicode[i];
347     int chartype = getJamoClass (next);
348     switch (chartype)
349     {
350     case SD_JAMO_L:
351       if (prevchartype != SD_JAMO_L)
352       {
353         nextLig = precomposJamos (ret);
354         if (nextLig==0)
355         {
356           nextLig = nextLigature (SD_HANGUL_JAMO,
357             &unicode.array()[index], i-index);
358         }
359         if (nextLig) ret->append (nextLig);
360         return i;
361       }
362       ret->append (next);
363       break;
364     case SD_JAMO_V:
365       if (prevchartype != SD_JAMO_L && prevchartype != SD_JAMO_V)
366       {
367         nextLig = precomposJamos (ret);
368         if (nextLig ==0)
369         {
370           nextLig = nextLigature (SD_HANGUL_JAMO,
371               &unicode.array()[index], i-index);
372         }
373         if (nextLig) ret->append (nextLig);
374         return i;
375       }
376       ret->append (next);
377       break;
378     case SD_JAMO_T:
379       /* Do we really have TT sequence ? According to Unicode yes. Hmm.. */
380       if (prevchartype != SD_JAMO_V && prevchartype != SD_JAMO_T)
381       {
382         nextLig = precomposJamos (ret);
383         if (nextLig==0)
384         {
385            nextLig = nextLigature (SD_HANGUL_JAMO,
386             &unicode.array()[index], i-index);
387         }
388         if (nextLig) ret->append (nextLig);
389         return i;
390       }
391       ret->append (next);
392       break;
393     case SD_JAMO_X:
394     default:
395       /* Tone marks can follow the cluster */
396 // They are suported as composing anyway...
397 #if 0
398       if (next == 0x302e || next == 0x302f)
399       {
400         ret->append (next);
401         i++;
402       }
403 #endif
404       nextLig = precomposJamos (ret);
405       if (nextLig==0)
406       {
407         nextLig = nextLigature (SD_HANGUL_JAMO,
408           &unicode.array()[index], i-index);
409       }
410       if (nextLig) ret->append (nextLig);
411       return i;
412       break;
413     }
414     prevchartype = chartype;
415   }
416   /* Not yet finished. Return unfinished cluster */
417   if (finished) *finished = 0;
418   if (ret->size()>=1)
419   {
420      nextLig = precomposJamos (ret);
421      if (nextLig==0)
422      {
423        nextLig = nextLigature (SD_HANGUL_JAMO,
424           &unicode.array()[index], i-index);
425      }
426      if (nextLig) ret->append (nextLig);
427      return i;
428   }
429   ret->clear();
430   return index;
431 }
432 
433 /**
434  * Precompose JAMOs that are present in unicode tables
435  * @param jamo is the vector that holds input jamos and
436  *  output precompositions.
437  * @return the precomposed JAMOS or 0
438  */
439 static SS_UCS4
precomposJamos(SV_UCS4 * jamo)440 precomposJamos(SV_UCS4* jamo)
441 {
442   if (jamo->size()==0) return 0;
443   if (jamo->size()==1) return 0;
444   SS_UCS4 last = (*jamo)[jamo->size()-1];
445   if (last==0x302e || last==0x302f)
446   {
447     if (jamo->size()<=2) return 0;
448     if (jamo->size()>4) return 0;
449     jamo->truncate (jamo->size()-1);
450   }
451   else if (jamo->size()>3)
452   {
453     return 0;
454   }
455 
456   SS_UCS4 l = (*jamo)[0];
457   SS_UCS4 v = (*jamo)[1];
458   SS_UCS4 t = (jamo->size() >= 3) ? (*jamo)[2] : 0x11a7;
459   /* tone marks will be rendered first */
460   if (last==0x302e || last==0x302f)
461   {
462     jamo->insert (0, last);
463   }
464   if (l>=0x1100 && l<=0x1112
465    && v>=0x1161 && v<=0x1175
466    && t>=0x11a7 && t<=0x11c2)
467   {
468     jamo->clear();
469     SS_UCS4 vle = 21*28* (l-0x1100) + 28 * (v-0x1161) + (t-0x11a7) + 0xac00;
470     jamo->append (vle);
471     /* create a unique key */
472     if (last==0x302e)
473     {
474       vle = vle & 0x3fff;
475     }
476     else if (last==0x302f)
477     {
478       vle = vle & 0x7fff;
479     }
480     vle +=  0x80000000 + (0x10000 * SD_HANGUL_PREC);
481     return vle;
482   }
483   return 0;
484 }
485 
486 /**
487  * Get cluster for South Indian Thai-like scripts
488  * The cluster is rendered and treated together. It has
489  * a unicode and a separated memory representation.
490  * Memory representation is only used for fallback rendering.
491  * A cluster is
492  *
493  * a) Consonant + Top/Bottom/Right Sign [+ ...]
494  * b) Consonant + Nukta
495  * c) Consonant + Nukta + Top/Bottom/Right Sign [+ ...]
496  * d) Indep-Vowel + Top/Bottom Sign [+ ...]
497  *
498  * @param finished is set to 1 if exact match happens
499  *                           0 is not yet finished
500  *                          -1 if illegal sequence start.
501  * It also sets yuditClusterError to an appropriate string.
502  */
503 static unsigned int
getSouthIndicCluster(unsigned int scriptcode,const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished)504 getSouthIndicCluster (unsigned int scriptcode,
505   const SV_UCS4& unicode, unsigned int index, SV_UCS4* ret, int* finished)
506 {
507   unsigned int usize = unicode.size();
508   unsigned int i;
509   if (finished) *finished = 1;
510   /* Some platforms have unsigned char */
511 
512   char prevchartype = (char)0x7f; /* big enough */
513   SS_UCS4 nextLig = 0;
514   for (i=index;i<usize; i++)
515   {
516     SS_UCS4 next = unicode[i];
517     char chartype = (char) indic->encode (next);
518     unsigned int sc = getUnicodeScript (next);
519     if (sc!=scriptcode && next != 0x25cc && next != 0x200d && next != 0x200c)
520     {
521       if (ret->size()==0)
522       {
523         /* can not start with it */
524         if (finished) *finished=-1;
525       }
526       nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
527       if (nextLig) ret->append (nextLig);
528       return i;
529     }
530     switch (chartype)
531     {
532     case SD_INDIC_INDEP_VOWEL:
533       ret->append (next);
534       if (i+1 < usize)
535       {
536         SS_UCS4 n = unicode[i+1];
537         char ct = (char) indic->encode (n);
538         if (ct != SD_INDIC_TOP_VOWEL && ct != SD_INDIC_BOTTOM_VOWEL)
539         {
540            if (ret->size()==1) return index;
541            nextLig = nextLigature (scriptcode,
542                 &unicode.array()[index], i-index+1);
543            if (nextLig) ret->append (nextLig);
544            return i+1;
545         }
546       }
547       break;
548     case SD_INDIC_CONSONANT_BASE:
549     case SD_INDIC_CONSONANT_POST_BASE:
550     case SD_INDIC_CONSONANT_BELOW_BASE:
551       ret->append (next);
552       if (i+1 < usize)
553       {
554         SS_UCS4 n = unicode[i+1];
555         char ct = (char) indic->encode (n);
556         if (ct != SD_INDIC_NUKTA
557              && ct != SD_INDIC_RIGHT_VOWEL
558              && ct != SD_INDIC_TOP_VOWEL
559              && ct != SD_INDIC_BOTTOM_VOWEL)
560         {
561            if (ret->size()==1) return index;
562            nextLig = nextLigature (scriptcode,
563                 &unicode.array()[index], i-index+1);
564            if (nextLig) ret->append (nextLig);
565            return i+1;
566         }
567       }
568       break;
569     case SD_INDIC_NUKTA:
570       if (ret->size()==0)
571       {
572         /* can not start with it */
573         if (finished) *finished=-1;
574         yuditClusterError = "Cluster should not start with a subjoined consonant.";
575         return index;
576       }
577       if (prevchartype != SD_INDIC_CONSONANT_BASE
578          && prevchartype != SD_INDIC_CONSONANT_POST_BASE
579          && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE)
580       {
581         yuditClusterError = "Subjoined consonant should be preceded by a full consonant.";
582         nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
583         if (nextLig) ret->append (nextLig);
584         return i;
585       }
586       ret->append (next);
587       if (i+1 < usize)
588       {
589         SS_UCS4 n = unicode[i+1];
590         char ct = (char) indic->encode (n);
591         if (ct != SD_INDIC_RIGHT_VOWEL
592              && ct != SD_INDIC_TOP_VOWEL
593              && ct != SD_INDIC_BOTTOM_VOWEL)
594         {
595            if (ret->size()==1) return index;
596            nextLig = nextLigature (scriptcode,
597                 &unicode.array()[index], i-index+1);
598            if (nextLig) ret->append (nextLig);
599            return i+1;
600         }
601       }
602       break;
603     case SD_INDIC_LEFT_VOWEL:
604     case SD_INDIC_RIGHT_VOWEL:
605     case SD_INDIC_TOP_VOWEL:
606     case SD_INDIC_BOTTOM_VOWEL:
607       if (ret->size()==0)
608       {
609         /* can not start with it */
610         if (finished) *finished=-1;
611         yuditClusterError = "Cluster should not start with a dependent wovel.";
612         return index;
613       }
614       if (prevchartype != SD_INDIC_INDEP_VOWEL
615 	   && prevchartype != SD_INDIC_CONSONANT_BASE
616 	   && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
617 	   && prevchartype != SD_INDIC_CONSONANT_POST_BASE
618 	   && prevchartype != SD_INDIC_NUKTA
619 	   && prevchartype != SD_INDIC_RIGHT_VOWEL
620 	   && prevchartype != SD_INDIC_TOP_VOWEL
621 	   && prevchartype != SD_INDIC_BOTTOM_VOWEL)
622       {
623         yuditClusterError = "Dependent sign should be preceded by another character.";
624         nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
625         if (nextLig) ret->append (nextLig);
626         return i;
627       }
628       ret->append (next);
629       if (i+1 < usize)
630       {
631         SS_UCS4 n = unicode[i+1];
632         char ct = (char) indic->encode (n);
633         if (ct != SD_INDIC_RIGHT_VOWEL
634 	     && ct != SD_INDIC_TOP_VOWEL
635 	     && ct != SD_INDIC_BOTTOM_VOWEL)
636         {
637            if (ret->size()==1) return index;
638            nextLig = nextLigature (scriptcode,
639                 &unicode.array()[index], i-index+1);
640            if (nextLig) ret->append (nextLig);
641            return i+1;
642         }
643       }
644       break;
645     case SD_INDIC_SIGN:
646       if (ret->size()==0)
647       {
648         /* can start with it */
649         // if (finished) *finished=-1;
650         return index;
651       }
652       ret->append (next);
653       nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
654       if (nextLig) ret->append (nextLig);
655       return i+1;
656     default:
657       if (ret->size()==0)
658       {
659         if (finished) *finished=1;
660         return index;
661       }
662       nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
663       if (nextLig) ret->append (nextLig);
664       return i;
665     }
666     prevchartype = chartype;
667   }
668 // fprintf (stderr, "TIBET index=%d\n", index);
669   if (finished) *finished = 0;
670   if (ret->size()>1)
671   {
672      nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
673      if (nextLig) ret->append (nextLig);
674      return i;
675   }
676   ret->clear();
677   return index;
678 }
679 
680 /**
681  * Get cluster for North Indian Devanagari-like scripts
682  * The cluster is rendered and treated together. It has
683  * a unicode and a seperated memory representation.
684  * Memory representation is only used for fallback rendering.
685  * A cluster is
686  * a) Consonant
687  * b) Consonant + Halant
688  * c) Consonant + Halant + ZWJ
689  * d) Consonant + Nukta + Halant
690  * e) Consonant + Nukta + Halant + ZWJ
691  * f) Independent Vowel
692  * g) Independent Vowel + Vowel
693  * h) [b|c|d|e]*
694  * i) [b|c|d|e]* a
695  * j) [b|c|d|e]* Vowel
696  * k) [a-i] ending with Modifier
697  * l) [a-i] ending with ZWNJ
698  * For bengali
699  * Consonant + ZWJ
700  * Halant + Consonant
701  * are also possible.
702  * @param scriptcode is one of the scripts (Hard-Coded)
703  * @return index if nothing was lifted off vector, return
704  * the number of unicode characters + index otherwise.
705  * append the output cluster to ret, last element is ligature
706  * code - if any.
707  * @param finished is set to 1 if exact match happens
708  *                           0 is not yet finished
709  *                          -1 if illegal sequence start.
710  * It also sets yuditClusterError to an appripriate string.
711  */
712 static unsigned int
getIndicCluster(unsigned int scriptcode,const SV_UCS4 & unicode,unsigned int index,SV_UCS4 * ret,int * finished)713 getIndicCluster (unsigned int scriptcode,
714   const SV_UCS4& unicode, unsigned int index, SV_UCS4* ret, int* finished)
715 {
716   unsigned int usize = unicode.size();
717   unsigned int i;
718   if (finished) *finished = 1;
719   /* Some platforms have unsigned char */
720 
721   char prevchartype = (char)0x7f; /* big enough */
722   SS_UCS4 nextLig = 0;
723   for (i=index;i<usize; i++)
724   {
725    SS_UCS4 next = unicode[i];
726    char chartype = (char) indic->encode (next);
727 //fprintf (stderr, "getIndicCluster=%u %d\n", next, chartype);
728    unsigned int sc = getUnicodeScript (next);
729    if (sc!=scriptcode && chartype != SD_INDIC_ZWNJ && chartype != SD_INDIC_ZWJ
730         && next != 0x25cc)
731    {
732      if (ret->size()==0)
733      {
734        /* can not start with it */
735        if (finished) *finished=-1;
736      }
737      nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
738      if (nextLig) ret->append (nextLig);
739      return i;
740    }
741    switch (chartype)
742    {
743    case SD_INDIC_INDEP_VOWEL:
744      ret->append (next);
745      if (i+1 < usize)
746      {
747        SS_UCS4 n = unicode[i+1];
748        char ct = (char) indic->encode (n);
749        if (ct != SD_INDIC_BOTTOM_VOWEL
750             && ct != SD_INDIC_TOP_VOWEL
751             && ct != SD_INDIC_LEFT_VOWEL
752             && ct != SD_INDIC_LEFT_RIGHT_VOWEL
753             && ct != SD_INDIC_RIGHT_VOWEL
754             && ct != SD_INDIC_MODIFIER
755             && ct != SD_INDIC_HALANT)
756        {
757           if (ret->size()==1) return index;
758           nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
759           if (nextLig) ret->append (nextLig);
760           return i+1;
761        }
762      }
763      break;
764    case SD_INDIC_LEFT_VOWEL:
765      if (ret->size()==0)
766      {
767        /* can not start with it */
768        if (finished) *finished=-1;
769        yuditClusterError = "Cluster should not start with dependent vowel.";
770        return index;
771      }
772      if (prevchartype != SD_INDIC_CONSONANT_BASE
773            && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
774            && prevchartype != SD_INDIC_CONSONANT_POST_BASE
775            && prevchartype != SD_INDIC_CONSONANT_DEAD
776            && prevchartype != SD_INDIC_HALANT
777            && prevchartype != SD_INDIC_NUKTA
778            && prevchartype != SD_INDIC_INDEP_VOWEL)
779      {
780        yuditClusterError = "Dependent vowel should be preceded by consonant, nukta, halant or independent vowel.";
781        nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
782        if (nextLig) ret->append (nextLig);
783        return i;
784      }
785      if (scriptcode == SD_MALAYALAM)
786 	ret->insert (ret->size()-1, next);
787      else ret->insert (0, next);
788      if (i+1 < usize)
789      {
790        SS_UCS4 n = unicode[i+1];
791        char ct = (char) indic->encode (n);
792        if (ct != SD_INDIC_MODIFIER)
793        {
794          nextLig = nextLigature (scriptcode, &unicode.array()[index],i-index+1);
795          if (nextLig) ret->append (nextLig);
796          return i+1;
797        }
798      }
799      break;
800    case SD_INDIC_LEFT_RIGHT_VOWEL:
801      if (ret->size()==0)
802      {
803        /* can not start with it */
804        if (finished) *finished=-1;
805        yuditClusterError = "Cluster should not start with dependent vowel.";
806        return index;
807      }
808      if (prevchartype != SD_INDIC_CONSONANT_BASE
809            && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
810            && prevchartype != SD_INDIC_CONSONANT_POST_BASE
811            && prevchartype != SD_INDIC_CONSONANT_DEAD
812            && prevchartype != SD_INDIC_HALANT
813            && prevchartype != SD_INDIC_NUKTA
814            && prevchartype != SD_INDIC_INDEP_VOWEL)
815      {
816        yuditClusterError = "Dependent vowel should be preceded by consonant, nukta, halant or independent vowel.";
817        nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
818        if (nextLig) ret->append (nextLig);
819        return i;
820      }
821      /* this will be the fallback rendering */
822      {
823        SS_UCS4 l = getLRVowelLeft (next);
824        SS_UCS4 r = getLRVowelRight (next);
825        if (l && r)
826        {
827          if (scriptcode == SD_MALAYALAM)
828 	    ret->insert (ret->size()-1, l);
829 	 else ret->insert (0, l);
830          ret->append (r);
831        }
832        else
833        {
834          ret->append (next);
835        }
836      }
837      if (i+1 < usize)
838      {
839        SS_UCS4 n = unicode[i+1];
840        char ct = (char) indic->encode (n);
841        if (ct != SD_INDIC_MODIFIER)
842        {
843          nextLig = nextLigature (scriptcode, &unicode.array()[index],i-index+1);
844          if (nextLig) ret->append (nextLig);
845          return i+1;
846        }
847      }
848      break;
849    case SD_INDIC_MODIFIER:
850      if (ret->size()==0)
851      {
852        /* can not start with it */
853        yuditClusterError = "Cluster should not start with a modifier.";
854        if (finished) *finished=-1;
855        return index;
856      }
857      if (     prevchartype != SD_INDIC_INDEP_VOWEL
858            && prevchartype != SD_INDIC_TOP_VOWEL
859            && prevchartype != SD_INDIC_BOTTOM_VOWEL
860            && prevchartype != SD_INDIC_LEFT_VOWEL
861            && prevchartype != SD_INDIC_LEFT_RIGHT_VOWEL
862            && prevchartype != SD_INDIC_RIGHT_VOWEL
863            && prevchartype != SD_INDIC_CONSONANT_BASE
864            && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
865            && prevchartype != SD_INDIC_CONSONANT_POST_BASE
866            && prevchartype != SD_INDIC_CONSONANT_DEAD
867 	   && prevchartype != SD_INDIC_NUKTA)
868      {
869        nextLig = nextLigature (scriptcode,
870                &unicode.array()[index], i-index);
871        if (nextLig) ret->append (nextLig);
872        return i;
873      }
874      ret->append (next);
875      nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
876      if (nextLig) ret->append (nextLig);
877      return i +1;
878 
879    case SD_INDIC_SIGN:
880      if (ret->size()==0)
881      {
882        /* can start with it */
883        // if (finished) *finished=-1;
884        return index;
885      }
886      ret->append (next);
887      nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
888      if (nextLig) ret->append (nextLig);
889      return i+1;
890 
891    case SD_INDIC_RIGHT_VOWEL:
892    case SD_INDIC_TOP_VOWEL:
893    case SD_INDIC_BOTTOM_VOWEL:
894      if (ret->size()==0)
895      {
896        /* can not start with it */
897        yuditClusterError = "Cluster should not start with dependent vowel.";
898        if (finished) *finished=-1;
899        return index;
900      }
901      if (prevchartype != SD_INDIC_CONSONANT_BASE
902            && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
903            && prevchartype != SD_INDIC_CONSONANT_POST_BASE
904            && prevchartype != SD_INDIC_HALANT
905            && prevchartype != SD_INDIC_NUKTA
906            && prevchartype != SD_INDIC_CONSONANT_DEAD
907 	   && prevchartype != SD_INDIC_INDEP_VOWEL)
908      {
909         yuditClusterError = "Dependent vowel should be preceded by consonant, nukta, halant or independent vowel.";
910        nextLig = nextLigature (scriptcode,
911                &unicode.array()[index], i-index);
912        if (nextLig) ret->append (nextLig);
913        return i;
914      }
915      ret->append (next);
916      if (i+1 < usize)
917      {
918        SS_UCS4 n = unicode[i+1];
919        char ct = (char) indic->encode (n);
920        if (ct != SD_INDIC_MODIFIER)
921        {
922          nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
923          if (nextLig) ret->append (nextLig);
924          return i +1;
925        }
926      }
927      break;
928    case SD_INDIC_CONSONANT_BASE:
929    case SD_INDIC_CONSONANT_BELOW_BASE:
930    case SD_INDIC_CONSONANT_POST_BASE:
931      if (ret->size() > 0 && prevchartype != SD_INDIC_HALANT
932            && prevchartype != SD_INDIC_ZWJ
933            && prevchartype != SD_INDIC_CONSONANT_DEAD)
934      {
935        yuditClusterError = "Consonant should be preceded by halant or nukta or ZWJ";
936        nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
937        if (nextLig) ret->append (nextLig);
938        return i;
939      }
940      ret->append (next);
941      if (i+1 < usize)
942      {
943        SS_UCS4 n = unicode[i+1];
944        char ct = (char) indic->encode (n);
945        if (ct != SD_INDIC_HALANT
946             && ct != SD_INDIC_NUKTA
947             && ct != SD_INDIC_ZWNJ
948             && ct != SD_INDIC_ZWJ
949             && ct != SD_INDIC_MODIFIER
950             && ct != SD_INDIC_BOTTOM_VOWEL
951             && ct != SD_INDIC_TOP_VOWEL
952             && ct != SD_INDIC_LEFT_VOWEL
953             && ct != SD_INDIC_LEFT_RIGHT_VOWEL
954             && ct != SD_INDIC_CONSONANT_DEAD
955             && ct != SD_INDIC_RIGHT_VOWEL)
956        {
957           if (ret->size()==1) return index;
958           nextLig = nextLigature (scriptcode,&unicode.array()[index],i-index+1);
959           if (nextLig) ret->append (nextLig);
960           return i+1;
961        }
962      }
963      break;
964    case SD_INDIC_ZWNJ:
965      if (ret->size()==0)
966      {
967        /* can not start with it */
968        yuditClusterError = "Cluster can not start with a ZWNJ.";
969        if (finished) *finished=-1;
970        return index;
971      }
972 #if 0
973      if (prevchartype != SD_INDIC_HALANT)
974      {
975        yuditClusterError = "ZWNJ should be preceded by a halant.";
976        nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
977        if (nextLig) ret->append (nextLig);
978        return i;
979      }
980 #endif
981      nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
982      if (nextLig) ret->append (nextLig);
983      return i+1;
984    case SD_INDIC_NUKTA:
985      if (ret->size()==0)
986      {
987        /* can not start with it */
988        yuditClusterError = "Cluster can not start with a nukta.";
989        if (finished) *finished=-1;
990        return index;
991      }
992      if (prevchartype != SD_INDIC_CONSONANT_BASE
993         && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
994         && prevchartype != SD_INDIC_CONSONANT_DEAD
995         && prevchartype != SD_INDIC_CONSONANT_POST_BASE)
996      {
997        yuditClusterError = "Nukta should be preceded by a consonant.";
998        nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
999        if (nextLig) ret->append (nextLig);
1000        return i;
1001      }
1002      ret->append (next);
1003      if (i+1 < usize)
1004      {
1005        SS_UCS4 n = unicode[i+1];
1006        char ct = (char) indic->encode (n);
1007        if (ct != SD_INDIC_HALANT
1008             && ct != SD_INDIC_MODIFIER
1009             && ct != SD_INDIC_BOTTOM_VOWEL
1010             && ct != SD_INDIC_TOP_VOWEL
1011             && ct != SD_INDIC_LEFT_VOWEL
1012             && ct != SD_INDIC_LEFT_RIGHT_VOWEL
1013             && ct != SD_INDIC_RIGHT_VOWEL)
1014        {
1015           if (ret->size()==1) return index;
1016           nextLig = nextLigature (scriptcode,&unicode.array()[index],i-index+1);
1017           if (nextLig) ret->append (nextLig);
1018           return i+1;
1019        }
1020      }
1021      break;
1022    case SD_INDIC_ZWJ:
1023      // Bengali can start with ZWJ - it needs a little work.
1024 #if 0
1025      if (ret->size()==0)
1026      {
1027        /* can not start with it */
1028        yuditClusterError = "Cluster can not start with a ZWJ.";
1029        if (finished) *finished=-1;
1030        return index;
1031      }
1032      if (prevchartype != SD_INDIC_HALANT)
1033      {
1034        yuditClusterError = "ZWJ should be preceded by a halant.";
1035        nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
1036        if (nextLig) ret->append (nextLig);
1037        return i;
1038      }
1039 #endif
1040      ret->append (next);
1041      break;
1042    case SD_INDIC_HALANT:
1043      // Bengali can start with a halant - Yaphala
1044      if (next != 0x09cd && ret->size()==0)
1045      {
1046        /* can not start with it */
1047        yuditClusterError = "Cluster can not start with a halant.";
1048        if (finished) *finished=-1;
1049        return index;
1050      }
1051      if (next != 0x09cd
1052           && prevchartype != SD_INDIC_INDEP_VOWEL
1053           && prevchartype != SD_INDIC_CONSONANT_BASE
1054           && prevchartype != SD_INDIC_CONSONANT_BELOW_BASE
1055           && prevchartype != SD_INDIC_CONSONANT_POST_BASE
1056           && prevchartype != SD_INDIC_NUKTA)
1057      {
1058        yuditClusterError = "Halant should be preceded by an independent vowel, a consonant or nukta.";
1059        nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
1060        if (nextLig) ret->append (nextLig);
1061        return i;
1062      }
1063      ret->append (next);
1064      break;
1065    case SD_INDIC_CONSONANT_DEAD:
1066      // Finish the cluster - I dont know any better solution.
1067      nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index+1);
1068      if (nextLig) ret->append (nextLig);
1069      return i+1;
1070    default:
1071      if (ret->size()==0)
1072      {
1073        if (finished) *finished=1;
1074        return index;
1075      }
1076      nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
1077      if (nextLig) ret->append (nextLig);
1078      return i;
1079     break;
1080    }
1081    prevchartype = chartype;
1082   }
1083   if (finished) *finished = 0;
1084   if (ret->size()>1)
1085   {
1086      nextLig = nextLigature (scriptcode, &unicode.array()[index], i-index);
1087      if (nextLig) ret->append (nextLig);
1088      return i;
1089   }
1090   ret->clear();
1091   return index;
1092 }
1093 
1094 
1095 /**
1096  * Generate a next ligature number if it still does not exist
1097  */
nextLigature(unsigned int script,const SS_UCS4 * unicode,unsigned int length)1098 static SS_UCS4 nextLigature (unsigned int script,
1099    const SS_UCS4* unicode, unsigned int length)
1100 {
1101   initLigatures ();
1102   if (length<2) return 0;
1103 
1104   SString key = SString((char*)unicode, sizeof (SS_UCS4) * length);
1105   const SString* cac = ligatureCache->get (key);
1106   SS_UCS4 liga;
1107   if (cac && cac->size()==sizeof (SS_UCS4))
1108   {
1109     liga = *(SS_UCS4*) (cac->array());
1110     return liga;
1111   }
1112   liga = counters[script];
1113 
1114   /* check overflow */
1115   if ((liga & 0xffff) == 0xffff) return 0;
1116   liga++;
1117   counters[script] = liga;
1118   /* FIXME: check overflow */
1119   SString vle = SString((char*)&liga, sizeof (SS_UCS4));
1120   ligatureCache->put (key, vle);
1121   //fprintf (stderr, "New Ligature[%d]=%X\n", script, liga);
1122   return liga;
1123 }
1124 
1125 int
getUnicodeScript(SS_UCS4 comp)1126 getUnicodeScript (SS_UCS4 comp)
1127 {
1128   /* TONE LETTERS */
1129   switch (comp)
1130   {
1131   case 0x304B: return SD_YUDIT;
1132   case 0x304D: return SD_YUDIT;
1133   case 0x304F: return SD_YUDIT;
1134   case 0x3051: return SD_YUDIT;
1135   case 0x3053: return SD_YUDIT;
1136   case 0x30AB: return SD_YUDIT;
1137   case 0x30AD: return SD_YUDIT;
1138   case 0x30AF: return SD_YUDIT;
1139   case 0x30B1: return SD_YUDIT;
1140   case 0x30B3: return SD_YUDIT;
1141   case 0x30BB: return SD_YUDIT;
1142   case 0x30C4: return SD_YUDIT;
1143   case 0x30C8: return SD_YUDIT;
1144   case 0x31F7: return SD_YUDIT;
1145   case 0x00E6: return SD_YUDIT;
1146   case 0x0254: return SD_YUDIT;
1147   case 0x028C: return SD_YUDIT;
1148   case 0x0259: return SD_YUDIT;
1149   case 0x025A: return SD_YUDIT;
1150   default: break;
1151   }
1152   if (comp >= 0x02E5 && comp <= 0x02E9) return SD_YUDIT;
1153   if (getJamoClass (comp)>0) return SD_HANGUL_JAMO;
1154 
1155   if (comp >= 0x1f1e6 && comp <= 0x1f1ff) {
1156     return SD_REGIONAL_INDICATOR_SYMBOL;
1157   }
1158 
1159   if (comp >= 0x1000)
1160   {
1161     if (getRovasType (comp) == 1)
1162     {
1163         return SD_ROVASIRAS;
1164     }
1165     if (getPUARovasType (comp) == 1)
1166     {
1167         return SD_PUA_ROVAS;
1168     }
1169     return -1;
1170   }
1171 
1172   if (comp < 0x0900 ) return -1;
1173   if (comp < 0x0980) return SD_DEVANAGARI;
1174   if (comp < 0x0A00) return SD_BENGALI;
1175   if (comp < 0x0A80) return SD_GURMUKHI;
1176   if (comp < 0x0B00) return SD_GUJARATI;
1177   if (comp < 0x0B80) return SD_ORIYA;
1178   if (comp < 0x0C00) return SD_TAMIL;
1179   if (comp < 0x0C80) return SD_TELUGU;
1180   if (comp < 0x0D00) return SD_KANNADA;
1181   if (comp < 0x0D80) return SD_MALAYALAM;
1182   if (comp < 0x0E00) return SD_SINHALA;
1183   if (comp < 0x0E80) return SD_THAI;
1184   if (comp < 0x0F00) return SD_LAO;
1185   if (comp < 0x0FFF) return SD_TIBETAN;
1186   return -1;
1187 }
1188 /**
1189  * return true if this is covered
1190  */
1191 bool
isCoveredScipt(SS_UCS4 comp,int sc)1192 isCoveredScipt (SS_UCS4 comp, int sc)
1193 {
1194   switch (sc)
1195   {
1196   case SD_YUDIT: return false;
1197   case SD_REGIONAL_INDICATOR_SYMBOL: return (comp>=0x1f1e6 && comp<=0x1f1ff);
1198   case SD_DEVANAGARI: return (comp>=0x0900 && comp<0x0980);
1199   case SD_BENGALI: return (comp>=0x0980 && comp<0x0a00);
1200   case SD_BENGALI_BEGIN: return (comp>=0x0980 && comp<0x0a00);
1201   case SD_GURMUKHI: return (comp>=0x0a00 && comp<0x0a80);
1202   case SD_GUJARATI: return (comp>=0x0a80 && comp<0x0b00);
1203   case SD_ORIYA: return (comp>=0x0b00 && comp<0x0b80);
1204   case SD_TAMIL: return (comp>=0x0b80 && comp<0x0c00);
1205   case SD_TELUGU: return (comp>=0x0c00 && comp<0x0c80);
1206   case SD_KANNADA: return (comp>=0x0c80 && comp<0x0d00);
1207   case SD_MALAYALAM: return (comp>=0x0d00 && comp<0x0d80);
1208   case SD_SINHALA: return (comp>=0x0d80 && comp<0x0e00);
1209   case SD_THAI: return (comp>=0x0e00 && comp<0x0e80);
1210   case SD_LAO: return (comp>=0x0e80 && comp<0x0f00);
1211   case SD_TIBETAN: return (comp>=0x0f00 && comp<0x0fff);
1212   case SD_HANGUL_JAMO: return (getJamoClass(comp) != 0);
1213   case SD_HANGUL_PREC: return (getJamoClass(comp) != 0);
1214   }
1215   return false;
1216 }
1217 
1218 
1219 /**
1220  * Add combining ligature. A combining ligature is a ligature
1221  * with combining marks. The ligature can be a unicode or
1222  8 Yudit ligature.
1223  * @param unicode is the unicode representation of the while thing
1224  * @param ul is the unicode repr. length
1225  * @param ligAndMarks contains one ligature + all the marks to it.
1226  * @param cl is the length of ligAndMarks.
1227  */
1228 SS_UCS4
addCombiningLigature(const SS_UCS4 * unicode,unsigned int ul,const SS_UCS4 * ligAndMarks,unsigned int cl)1229 addCombiningLigature (const SS_UCS4* unicode, unsigned int ul,
1230   const SS_UCS4* ligAndMarks, unsigned int cl)
1231 {
1232   SS_UCS4 nl = nextLigature (SD_COMBINING_LIGATURE, unicode, ul);
1233   const SString* found = ligatureUnics->get (
1234       SString((char*) &nl, sizeof (SS_UCS4)));
1235   if (found == 0)
1236   {
1237      putLigatureUnicode (nl, unicode, ul);
1238      putLigatureCluster (nl, ligAndMarks, cl);
1239   }
1240   return nl;
1241 }
1242 
1243 /**
1244  * Put ligature away to remember
1245  */
1246 void
putLigatureUnicode(SS_UCS4 ligature,const SS_UCS4 * buffer,unsigned int bufsize)1247 putLigatureUnicode (SS_UCS4 ligature, const SS_UCS4* buffer, unsigned int bufsize)
1248 {
1249   if (ligature <=  0x80000000 || ligature >= 0xA0000000) return;
1250   initLigatures();
1251   SString key ((char*)& ligature, sizeof (SS_UCS4));
1252   const SString* ret = ligatureUnics->get (key);
1253   if (ret) return; /* already there */
1254   ligatureUnics->put (key, SString((char*)buffer, bufsize * sizeof (SS_UCS4)));
1255 }
1256 
1257 /**
1258  * Put ligature away to remember
1259  */
1260 void
putLigatureCluster(SS_UCS4 ligature,const SS_UCS4 * buffer,unsigned int bufsize)1261 putLigatureCluster (SS_UCS4 ligature, const SS_UCS4* buffer, unsigned int bufsize)
1262 {
1263   if (ligature <=  0x80000000 || ligature >= 0xA0000000) return;
1264   initLigatures ();
1265   SString key ((char*)& ligature, sizeof (SS_UCS4));
1266   const SString* ret = ligatureClust->get (key);
1267   if (ret) return; /* already there */
1268   ligatureClust->put (key, SString((char*)buffer, bufsize * sizeof (SS_UCS4)));
1269 }
1270 
1271 unsigned int
getLigatureUnicode(SS_UCS4 lig,SS_UCS4 * buffer)1272 getLigatureUnicode (SS_UCS4 lig, SS_UCS4* buffer)
1273 {
1274   SS_UCS4 ligature = lig;
1275   int sc = getLigatureScriptCode(ligature);
1276   //
1277   // SD_BENGALI_BEGIN is an artificial shape-code.
1278   //
1279   if (sc == SD_BENGALI_BEGIN)
1280   {
1281      unsigned int en = (SD_BENGALI << 16) |  0x80000000;
1282      ligature =  (ligature & 0xffff) | en;
1283   }
1284   if (ligatureUnics == 0) return 0;
1285   const SString* ret = ligatureUnics->get (
1286       SString((char*) &ligature, sizeof (SS_UCS4)));
1287   if (ret==0) return 0;
1288   if (buffer==0) return ret->size()/sizeof (SS_UCS4);
1289   memcpy (buffer, ret->array(), ret->size());
1290   return ret->size()/sizeof (SS_UCS4);
1291 }
1292 
1293 unsigned int
getLigatureCluster(SS_UCS4 lig,SS_UCS4 * buffer)1294 getLigatureCluster (SS_UCS4 lig, SS_UCS4* buffer)
1295 {
1296   SS_UCS4 ligature = lig;
1297   int sc = getLigatureScriptCode(ligature);
1298   //
1299   // SD_BENGALI_BEGIN is an artificial shape-code.
1300   //
1301   if (sc == SD_BENGALI_BEGIN)
1302   {
1303      unsigned int en = (SD_BENGALI << 16) |  0x80000000;
1304      ligature =  (ligature & 0xffff) | en;
1305   }
1306   if (ligatureClust == 0) return 0;
1307   const SString* ret = ligatureClust->get (
1308       SString((char*) &ligature, sizeof (SS_UCS4)));
1309   if (ret==0) return 0;
1310   if (buffer==0) return ret->size()/sizeof (SS_UCS4);
1311   memcpy (buffer, ret->array(), ret->size());
1312   return ret->size()/sizeof (SS_UCS4);
1313 }
1314 
1315 static void
initLigatures()1316 initLigatures()
1317 {
1318   if (ligatureUnics == 0)
1319   {
1320     clusters = new SUniMap("cluster");
1321     CHECK_NEW (clusters);
1322     indic = new SUniMap("indic");
1323     CHECK_NEW (indic);
1324 
1325     ligatureUnics = new SProperties();
1326     CHECK_NEW (ligatureUnics);
1327     ligatureClust = new SProperties();
1328     CHECK_NEW (ligatureClust);
1329     ligatureCache = new SProperties();
1330     CHECK_NEW (ligatureCache);
1331     for (unsigned int i=0; i<SD_SCRIPT_MAX; i++)
1332     {
1333       counters[i] = 0x80000000 + (0x10000 * i);
1334     }
1335   }
1336 }
1337 
1338 int
getLigatureScriptCode(SS_UCS4 comp)1339 getLigatureScriptCode (SS_UCS4 comp)
1340 {
1341   if (comp < 0x80000000) return -1;
1342   SS_UCS4 en = comp & 0x7fff0000;
1343   en = en >> 16;
1344   return (int) en;
1345 }
1346 
1347 /* get script name or null */
1348 const char*
getLigatureScript(SS_UCS4 comp)1349 getLigatureScript (SS_UCS4 comp)
1350 {
1351   if (comp <= 0x80000000 || comp >= 0xA0000000) return 0;
1352   SS_UCS4 en = comp & 0x7fff0000;
1353   en = en >> 16;
1354   /* I modified this to return Script name as in MS Opentype spec.*/
1355   switch (en)
1356   {
1357   case SD_YUDIT: return "yudit";
1358   case SD_DEVANAGARI: return "deva";
1359   case SD_BENGALI: return "beng";
1360   case SD_BENGALI_BEGIN: return "beng";
1361   case SD_GURMUKHI: return "guru";
1362   case SD_GUJARATI: return "gujr";
1363   case SD_ORIYA: return "orya";
1364   case SD_TAMIL: return "taml";
1365   case SD_TELUGU: return "telu";
1366   case SD_KANNADA: return "knda";
1367   case SD_MALAYALAM: return "mlym";
1368   case SD_SINHALA: return "sinh";
1369   case SD_HANGUL_JAMO: return "jamo";
1370   case SD_HANGUL_PREC: return "hang";
1371   case SD_THAI: return "thai";
1372   case SD_LAO: return "lao ";
1373   case SD_TIBETAN: return "tibt";
1374   case SD_ROVASIRAS: return "rovs";
1375   case SD_PUA_ROVAS: return "prvs";
1376   case SD_REGIONAL_INDICATOR_SYMBOL: return "flag";
1377   }
1378   return 0;
1379 }
1380 
1381 bool
isLigature(SS_UCS4 _comp)1382 isLigature (SS_UCS4 _comp)
1383 {
1384   /* Yudit ligatures below 0x80008000 are considered hacked glyphs only */
1385   return (_comp >= 0x80008000 && _comp > 0x80000000 && _comp < 0xA0000000);
1386 }
1387 
1388 SS_UCS4
getHalant(int index)1389 getHalant (int index)
1390 {
1391   switch (index)
1392   {
1393   case SD_DEVANAGARI:
1394     return 0x094D;
1395   case SD_BENGALI:
1396     return 0x09CD;
1397   case SD_BENGALI_BEGIN:
1398     return 0x09CD;
1399   case SD_GURMUKHI:
1400     return 0x0A4D;
1401   case SD_GUJARATI:
1402     return 0x0ACD;
1403   case SD_ORIYA:
1404     return 0x0B4D;
1405   case SD_TELUGU:
1406     return 0x0C4D;
1407   case SD_KANNADA:
1408     return 0x0CCD;
1409   case SD_MALAYALAM:
1410     return 0x0D4D;
1411   case SD_SINHALA:
1412     return 0x0DCD;
1413   default:
1414     return 0;
1415   }
1416   return 0;
1417 }
1418 
getCharType(SS_UCS4 unchar)1419 int getCharType (SS_UCS4 unchar)
1420 {
1421   initLigatures();
1422   char echartype = (char) indic->encode (unchar);
1423   return (int) echartype;
1424 }
1425 
1426 /**
1427  * get left part of LR vowel
1428  */
1429 SS_UCS4
getLRVowelLeft(SS_UCS4 u)1430 getLRVowelLeft (SS_UCS4 u)
1431 {
1432   switch (u)
1433   {
1434   case 0x09CB:
1435   case 0x09CC:
1436     return 0x09c7;
1437   case 0x0b4b:
1438   case 0x0b4c:
1439     return 0x0b47;
1440   case 0x0d4b:
1441     return 0x0d47;
1442   case 0x0d4a:
1443   case 0x0d4c:
1444     return 0x0d46;
1445   default:
1446     break;
1447   }
1448   return 0;
1449 }
1450 /**
1451  * get right part of LR vowel
1452  */
1453 SS_UCS4
getLRVowelRight(SS_UCS4 u)1454 getLRVowelRight (SS_UCS4 u)
1455 {
1456   switch (u)
1457   {
1458   case 0x09CB:
1459     return 0x09be;
1460   case 0x09CC:
1461     return 0x09d7;
1462   case 0x0b4b:
1463     return 0x0b3e;
1464   case 0x0b4c:
1465     return 0x0b57;
1466   case 0x0d4a:
1467   case 0x0d4b:
1468     return 0x0d3e;
1469   case 0x0d4c:
1470     return 0x0d57;
1471   default:
1472     break;
1473   }
1474   return 0;
1475 }
1476 
1477 /**
1478  * Decompose yudit ligature into unicode characters
1479  */
1480 void
expandYuditLigatures(SV_UCS4 * decd)1481 expandYuditLigatures (SV_UCS4* decd)
1482 {
1483   if (decd->size()!=1 || (*decd)[0] < 0x80000000) return;
1484   SS_UCS4 ucs4 = (*decd)[0];
1485   decd->remove (0);
1486   /* Yudit ligatures*/
1487   switch (ucs4)
1488   {
1489   case 0x80000010: /* JIS X 0213: 02B65 */
1490     decd->append (0x02E9);
1491     decd->append (0x02E5);
1492     break;
1493   case 0x80000011: /* JIS X 0213: 02B66 */
1494     decd->append (0x02E5);
1495     decd->append (0x02E9);
1496     break;
1497 // Generated by ./jiscompose.pl at 2002-04-15
1498 // Add this to stoolkit/SCluster.cpp expandYuditLigatures
1499   case 0x80000040: /* JIS X 0213: 0x2477 */
1500     decd->append (0x304B);
1501     decd->append (0x309A);
1502     break;
1503   case 0x80000041: /* JIS X 0213: 0x2478 */
1504     decd->append (0x304D);
1505     decd->append (0x309A);
1506     break;
1507   case 0x80000042: /* JIS X 0213: 0x2479 */
1508     decd->append (0x304F);
1509     decd->append (0x309A);
1510     break;
1511   case 0x80000043: /* JIS X 0213: 0x247A */
1512     decd->append (0x3051);
1513     decd->append (0x309A);
1514     break;
1515   case 0x80000044: /* JIS X 0213: 0x247B */
1516     decd->append (0x3053);
1517     decd->append (0x309A);
1518     break;
1519   case 0x80000045: /* JIS X 0213: 0x2577 */
1520     decd->append (0x30AB);
1521     decd->append (0x309A);
1522     break;
1523   case 0x80000046: /* JIS X 0213: 0x2578 */
1524     decd->append (0x30AD);
1525     decd->append (0x309A);
1526     break;
1527   case 0x80000047: /* JIS X 0213: 0x2579 */
1528     decd->append (0x30AF);
1529     decd->append (0x309A);
1530     break;
1531   case 0x80000048: /* JIS X 0213: 0x257A */
1532     decd->append (0x30B1);
1533     decd->append (0x309A);
1534     break;
1535   case 0x80000049: /* JIS X 0213: 0x257B */
1536     decd->append (0x30B3);
1537     decd->append (0x309A);
1538     break;
1539   case 0x8000004A: /* JIS X 0213: 0x257C */
1540     decd->append (0x30BB);
1541     decd->append (0x309A);
1542     break;
1543   case 0x8000004B: /* JIS X 0213: 0x257D */
1544     decd->append (0x30C4);
1545     decd->append (0x309A);
1546     break;
1547   case 0x8000004C: /* JIS X 0213: 0x257E */
1548     decd->append (0x30C8);
1549     decd->append (0x309A);
1550     break;
1551   case 0x8000004D: /* JIS X 0213: 0x2678 */
1552     decd->append (0x31F7);
1553     decd->append (0x309A);
1554     break;
1555   case 0x8000004E: /* JIS X 0213: 0x2B44 */
1556     decd->append (0x00E6);
1557     decd->append (0x0300);
1558     break;
1559   case 0x8000004F: /* JIS X 0213: 0x2B48 */
1560     decd->append (0x0254);
1561     decd->append (0x0300);
1562     break;
1563   case 0x80000050: /* JIS X 0213: 0x2B49 */
1564     decd->append (0x0254);
1565     decd->append (0x0301);
1566     break;
1567   case 0x80000051: /* JIS X 0213: 0x2B4A */
1568     decd->append (0x028C);
1569     decd->append (0x0300);
1570     break;
1571   case 0x80000052: /* JIS X 0213: 0x2B4B */
1572     decd->append (0x028C);
1573     decd->append (0x0301);
1574     break;
1575   case 0x80000053: /* JIS X 0213: 0x2B4C */
1576     decd->append (0x0259);
1577     decd->append (0x0300);
1578     break;
1579   case 0x80000054: /* JIS X 0213: 0x2B4D */
1580     decd->append (0x0259);
1581     decd->append (0x0301);
1582     break;
1583   case 0x80000055: /* JIS X 0213: 0x2B4E */
1584     decd->append (0x025A);
1585     decd->append (0x0300);
1586     break;
1587   case 0x80000056: /* JIS X 0213: 0x2B4F */
1588     decd->append (0x025A);
1589     decd->append (0x0301);
1590     break;
1591 // END OF ./jiscompose.pl
1592   default:
1593     break;
1594   }
1595   if (decd->size()==0) decd->append(0xfffd);
1596   return;
1597 }
1598 
1599 /**
1600  * Get the Jamo class
1601  * @param ucs is the unicode character
1602  * @return one of
1603  * <ul>
1604  *  <li> SD_JAMO_X </li>
1605  *  <li> SD_JAMO_L </li>
1606  *  <li> SD_JAMO_V </li>
1607  *  <li> SD_JAMO_T </li>
1608  * </ul>
1609  */
1610 int
getJamoClass(SS_UCS4 uc)1611 getJamoClass (SS_UCS4 uc)
1612 {
1613   if (uc >= 0x1100 && uc <= 0x115f) return SD_JAMO_L;
1614   if (uc >= 0x1160 && uc <= 0x11a2) return SD_JAMO_V;
1615   if (uc >= 0x11a8 && uc <= 0x11f9) return SD_JAMO_T;
1616   return SD_JAMO_X;
1617 }
1618 
/**
 * Get the name of the OTF font shaping feature.
 * @param icode is the index of the shape.
 * @return the four letter feature name, or "unknown" if icode is
 *  past the end of the table.
 */
const char*
getShapeCode (unsigned int icode)
{
  static const char* const featureNames[] = {
    "isol", "init", "medi", "fina",
    "med2", "fin2", "fin3", "init",
  };
  const unsigned int nameCount =
      (unsigned int) (sizeof (featureNames) / sizeof (featureNames[0]));
  return (icode < nameCount) ? featureNames[icode] : "unknown";
}
1636