1 /* CharGlyphMap.java -- Manages the 'cmap' table of TrueType fonts
2    Copyright (C) 2006 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 
39 package gnu.java.awt.font.opentype;
40 
41 import java.nio.ByteBuffer;
42 import java.nio.CharBuffer;
43 import java.nio.ShortBuffer;
44 import java.nio.IntBuffer;
45 
46 
47 /**
48  * A mapping from Unicode codepoints to glyphs. This mapping
49  * does not perform any re-ordering or decomposition, so it
50  * is not everything that is needed to support Unicode.
51  *
52  * <p>This class manages the <code>cmap</code> table of
53  * OpenType and TrueType fonts.
54  *
 * @see <a href="http://partners.adobe.com/asn/tech/type/opentype/cmap.jsp">
 *      the <code>cmap</code> part of Adobe&#x2019;s OpenType Specification</a>
57  *
58  * @see <a href="http://developer.apple.com/fonts/TTRefMan/RM06/Chap6cmap.html">
59  *      the <code>cmap</code> section of Apple&#x2019;s TrueType Reference
60  *      Manual</a>
61  *
62  * @author Sascha Brawer (brawer@dandelis.ch)
63  */
64 public abstract class CharGlyphMap
65 {
66   private static final int PLATFORM_UNICODE = 0;
67   private static final int PLATFORM_MACINTOSH = 1;
68   private static final int PLATFORM_MICROSOFT = 3;
69 
70 
  /**
   * Determines the glyph index for a given Unicode codepoint.  Users
   * should be aware that the character-to-glyph mapping is not
   * everything that is needed for full Unicode support.  For example,
   * the <code>cmap</code> table is not able to synthesize accented
   * glyphs from the canonical decomposition sequence, even if the
   * font would contain a glyph for the composed form.
   *
   * @param ucs4 the Unicode codepoint in UCS-4 encoding. Surrogates
   * (U+D800 to U+DFFF) cannot be passed, they must be mapped to
   * UCS-4 first.
   *
   * @return the glyph index, or 0 if the font does not contain
   * a glyph for this codepoint.
   */
  public abstract int getGlyph(int ucs4);
87 
88 
89   /**
90    * Reads a CharGlyphMap from an OpenType or TrueType <code>cmap</code>
91    * table. The current implementation works as follows:
92    *
93    * <p><ol><li>If the font has a type 4 cmap for the Unicode platform
94    * (encoding 0, 1, 2, 3 or 4), or a type 4 cmap for the Microsoft
95    * platform (encodings 1 or 10), that table is used to map Unicode
96    * codepoints to glyphs.  Most recent fonts, both for Macintosh and
97    * Windows, should provide such a table.</li>
98    *
99    * <li>Otherwise, if the font has any type 0 cmap for the Macintosh
100    * platform, a Unicode-to-glyph mapping is synthesized from certain
101    * type 0 cmaps. The current implementation collects mappings from
102    * Roman, Icelandic, Turkish, Croatian, Romanian, Eastern European,
103    * Cyrillic, Greek, Hebrew, Arabic and Farsi cmaps.</li>.</ol>
104    *
105    * @param buf a buffer whose position is right at the start
106    * of the entire <code>cmap</code> table, and whose limit
107    * is at its end.
108    *
109    * @return a concrete subclass of <code>CharGlyphMap</code>
110    * that performs the mapping.
111    *
112    * @see <a href=
113    * "http://partners.adobe.com/asn/tech/type/opentype/cmap.jsp"
114    * >the <code>cmap</code> part of Adobe&#x2019; OpenType Specification</a>
115    *
116    * @see <a href=
117    * "http://developer.apple.com/fonts/TTRefMan/RM06/Chap6cmap.html"
118    * >the <code>cmap</code> section of Apple&#x2019;s TrueType Reference
119    * Manual</a>
120    */
forTable(ByteBuffer buf)121   public static CharGlyphMap forTable(ByteBuffer buf)
122   {
123     boolean hasType0 = false;
124     int start4 = -1, platform4 = 0, encoding4 = 0;
125     int start12 = -1, platform12 = 0, encoding12 = 0;
126     int version;
127     int numTables;
128     int tableStart = buf.position();
129     int limit = buf.limit();
130     int format, platform, language, encoding, length, offset;
131 
132     version = buf.getChar();
133     if (version != 0)
134       return null;
135 
136     numTables = buf.getChar();
137     for (int i = 0; i < numTables; i++)
138     {
139       buf.limit(limit).position(tableStart + 4 + i * 8);
140       platform = buf.getChar();
141       encoding = buf.getChar();
142       offset = tableStart + buf.getInt();
143 
144       buf.position(offset);
145       format = buf.getChar();
146 
147       switch (format)
148       {
149       case 0:
150         hasType0 = true;
151         break;
152 
153       case 4:
154         length = buf.getChar();
155         language = buf.getChar();
156         if ((start4 == -1)
157             && Type4.isSupported(platform, language, encoding))
158         {
159           start4 = offset;
160           platform4 = platform;
161           encoding4 = encoding;
162         }
163         break;
164 
165       case 12:
166         if ((start12 == -1) && Type12.isSupported(platform, encoding))
167         {
168           start12 = offset;
169           platform12 = platform;
170           encoding12 = encoding;
171         }
172         break;
173       }
174     }
175 
176 
177     if (start12 >= 0)
178     {
179       try
180       {
181         buf.limit(limit).position(start12);
182         return new Type12(buf, platform12, encoding12);
183       }
184       catch (Exception ex)
185       {
186         ex.printStackTrace();
187       }
188     }
189 
190     if (start4 >= 0)
191     {
192       try
193       {
194         buf.limit(limit).position(start4);
195         return Type4.readTable(buf, platform4, encoding4);
196       }
197       catch (Exception ex)
198       {
199       }
200     }
201 
202     if (hasType0)
203     {
204       try
205       {
206         buf.limit(limit).position(tableStart);
207         return new Type0(buf);
208       }
209       catch (Exception ex)
210       {
211       }
212     }
213 
214     return new Dummy();
215   }
216 
217 
218   /**
219    * A dummy mapping that maps anything to the undefined glyph.
220    * Used if no other cmap is understood in a font.
221    *
222    * @author Sascha Brawer (brawer@dandelis.ch)
223    */
224   private static final class Dummy
225     extends CharGlyphMap
226   {
getGlyph(int ucs4)227     public int getGlyph(int ucs4)
228     {
229       return 0;
230     }
231   }
232 
233 
234   /**
235    * A mapping from Unicode code points to glyph IDs through CMAP Type
236    * 0 tables. These tables have serious limitations: Only the first
237    * 256 glyphs can be addressed, and the source of the mapping is not
238    * Unicode, but an encoding used on the Macintosh.
239    *
240    * <p>However, some fonts have only a Type 0 cmap. In this case, we
241    * process all the Type 0 tables we understand, and establish
242    * a reversed glyph-to-Unicode mapping. When a glyph is requested
243    * for a given Unicode character, we perform a linear search on the
244    * reversed table to find the glyph which maps to the requested
245    * character. While not blazingly fast, this gives a reasonable
246    * fallback for old fonts.
247    *
248    * @author Sascha Brawer (brawer@dandelis.ch)
249    */
250   private static final class Type0
251     extends CharGlyphMap
252   {
253     /**
254      * An array whose <code>i</code>-th element indicates the
255      * Unicode code point of glyph <code>i</code> in the font.
256      */
257     private char[] glyphToUCS2 = new char[256];
258 
259 
260     /**
261      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
263      * MacOS Arabic encoding.
264      *
265      * @see <a href=
266      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ARABIC.TXT"
267      * >the Unicode mapping table for the MacOS Arabic encoding</a>
268      */
269     private static final String UPPER_ARABIC
270       = "\u007e\u0000\u00c4\u00a0\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1"
271       + "\u00e0\u00e2\u00e4\u06ba\u00ab\u00e7\u00e9\u00e8\u00ea\u00eb"
272       + "\u00ed\u2026\u00ee\u00ef\u00f1\u00f3\u00bb\u00f4\u00f6\u00f7"
273       + "\u00fa\u00f9\u00fb\u00fc\u0020\u0021\"\u0023\u0024\u066a"
274       + "\u0026\u0027\u0028\u0029\u002a\u002b\u060c\u002d\u002e\u002f"
275       + "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669"
276       + "\u003a\u061b\u003c\u003d\u003e\u061f\u274a\u0621\u0622\u0623"
277       + "\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d"
278       + "\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637"
279       + "\u0638\u0639\u063a\u005b\\\u005d\u005e\u005f\u0640\u0641"
280       + "\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u064b"
281       + "\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u067e\u0679\u0686"
282       + "\u06d5\u06a4\u06af\u0688\u0691\u007b\u007c\u007d\u0698\u06d2";
283 
284 
285     /**
286      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
288      * MacOS East European Roman encoding.
289      *
290      * @see <a href=
291      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/CENTEURO.TXT"
292      * >the Unicode mapping table for the MacOS Central European
293      * encoding</a>
294      */
295     private static final String UPPER_EAST_EUROPEAN_ROMAN
296       = "\u007e\u0000\u00c4\u0100\u0101\u00c9\u0104\u00d6\u00dc\u00e1"
297       + "\u0105\u010c\u00e4\u010d\u0106\u0107\u00e9\u0179\u017a\u010e"
298       + "\u00ed\u010f\u0112\u0113\u0116\u00f3\u0117\u00f4\u00f6\u00f5"
299       + "\u00fa\u011a\u011b\u00fc\u2020\u00b0\u0118\u00a3\u00a7\u2022"
300       + "\u00b6\u00df\u00ae\u00a9\u2122\u0119\u00a8\u2260\u0123\u012e"
301       + "\u012f\u012a\u2264\u2265\u012b\u0136\u2202\u2211\u0142\u013b"
302       + "\u013c\u013d\u013e\u0139\u013a\u0145\u0146\u0143\u00ac\u221a"
303       + "\u0144\u0147\u2206\u00ab\u00bb\u2026\u00a0\u0148\u0150\u00d5"
304       + "\u0151\u014c\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca"
305       + "\u014d\u0154\u0155\u0158\u2039\u203a\u0159\u0156\u0157\u0160"
306       + "\u201a\u201e\u0161\u015a\u015b\u00c1\u0164\u0165\u00cd\u017d"
307       + "\u017e\u016a\u00d3\u00d4\u016b\u016e\u00da\u016f\u0170\u0171"
308       + "\u0172\u0173\u00dd\u00fd\u0137\u017b\u0141\u017c\u0122\u02c7";
309 
310 
311     /**
312      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
314      * MacOS Roman encoding for the Croatian language.
315      *
316      * @see <a href=
317      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/CROATIAN.TXT"
318      * >the Unicode mapping table for the MacOS Croatian encoding</a>
319      */
320     private static final String UPPER_CROATIAN
321       = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1"
322       + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb"
323       + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5"
324       + "\u00fa\u00f9\u00fb\u00fc\u2020\u00b0\u00a2\u00a3\u00a7\u2022"
325       + "\u00b6\u00df\u00ae\u0160\u2122\u00b4\u00a8\u2260\u017d\u00d8"
326       + "\u221e\u00b1\u2264\u2265\u2206\u00b5\u2202\u2211\u220f\u0161"
327       + "\u222b\u00aa\u00ba\u03a9\u017e\u00f8\u00bf\u00a1\u00ac\u221a"
328       + "\u0192\u2248\u0106\u00ab\u010c\u2026\u00a0\u00c0\u00c3\u00d5"
329       + "\u0152\u0153\u0110\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca"
330       + "\uf8ff\u00a9\u2044\u20ac\u2039\u203a\u00c6\u00bb\u2013\u00b7"
331       + "\u201a\u201e\u2030\u00c2\u0107\u00c1\u010d\u00c8\u00cd\u00ce"
332       + "\u00cf\u00cc\u00d3\u00d4\u0111\u00d2\u00da\u00db\u00d9\u0131"
333       + "\u02c6\u02dc\u00af\u03c0\u00cb\u02da\u00b8\u00ca\u00e6\u02c7";
334 
335 
336     /**
337      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
339      * MacOS Cyrillic encoding.
340      *
341      * @see <a href=
342      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/CYRILLIC.TXT"
343      * >the Unicode mapping table for the MacOS Cyrillic encoding</a>
344      */
345     private static final String UPPER_CYRILLIC
346       = "\u007e\u0000\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417"
347       + "\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421"
348       + "\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042a\u042b"
349       + "\u042c\u042d\u042e\u042f\u2020\u00b0\u0490\u00a3\u00a7\u2022"
350       + "\u00b6\u0406\u00ae\u00a9\u2122\u0402\u0452\u2260\u0403\u0453"
351       + "\u221e\u00b1\u2264\u2265\u0456\u00b5\u0491\u0408\u0404\u0454"
352       + "\u0407\u0457\u0409\u0459\u040a\u045a\u0458\u0405\u00ac\u221a"
353       + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u040b\u045b\u040c"
354       + "\u045c\u0455\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u201e"
355       + "\u040e\u045e\u040f\u045f\u2116\u0401\u0451\u044f\u0430\u0431"
356       + "\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b"
357       + "\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445"
358       + "\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u20ac";
359 
360 
361     /**
362      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
364      * MacOS Arabic encoding with the Farsi language.
365      *
366      * @see <a href=
367      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/FARSI.TXT"
368      * >the Unicode mapping table for the MacOS Farsi encoding</a>
369      */
370     private static final String UPPER_FARSI
371       = "\u007e\u0000\u00c4\u00a0\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1"
372       + "\u00e0\u00e2\u00e4\u06ba\u00ab\u00e7\u00e9\u00e8\u00ea\u00eb"
373       + "\u00ed\u2026\u00ee\u00ef\u00f1\u00f3\u00bb\u00f4\u00f6\u00f7"
374       + "\u00fa\u00f9\u00fb\u00fc\u0020\u0021\"\u0023\u0024\u066a"
375       + "\u0026\u0027\u0028\u0029\u002a\u002b\u060c\u002d\u002e\u002f"
376       + "\u06f0\u06f1\u06f2\u06f3\u06f4\u06f5\u06f6\u06f7\u06f8\u06f9"
377       + "\u003a\u061b\u003c\u003d\u003e\u061f\u274a\u0621\u0622\u0623"
378       + "\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d"
379       + "\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637"
380       + "\u0638\u0639\u063a\u005b\\\u005d\u005e\u005f\u0640\u0641"
381       + "\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u064b"
382       + "\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u067e\u0679\u0686"
383       + "\u06d5\u06a4\u06af\u0688\u0691\u007b\u007c\u007d\u0698\u06d2";
384 
385 
386     /**
387      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
389      * MacOS Greek encoding.
390      *
391      * @see <a
392      * href="http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/GREEK.TXT"
393      * >the Unicode mapping table for the MacOS Greek encoding</a>
394      */
395     private static final String UPPER_GREEK
396       = "\u007e\u0000\u00c4\u00b9\u00b2\u00c9\u00b3\u00d6\u00dc\u0385"
397       + "\u00e0\u00e2\u00e4\u0384\u00a8\u00e7\u00e9\u00e8\u00ea\u00eb"
398       + "\u00a3\u2122\u00ee\u00ef\u2022\u00bd\u2030\u00f4\u00f6\u00a6"
399       + "\u20ac\u00f9\u00fb\u00fc\u2020\u0393\u0394\u0398\u039b\u039e"
400       + "\u03a0\u00df\u00ae\u00a9\u03a3\u03aa\u00a7\u2260\u00b0\u00b7"
401       + "\u0391\u00b1\u2264\u2265\u00a5\u0392\u0395\u0396\u0397\u0399"
402       + "\u039a\u039c\u03a6\u03ab\u03a8\u03a9\u03ac\u039d\u00ac\u039f"
403       + "\u03a1\u2248\u03a4\u00ab\u00bb\u2026\u00a0\u03a5\u03a7\u0386"
404       + "\u0388\u0153\u2013\u2015\u201c\u201d\u2018\u2019\u00f7\u0389"
405       + "\u038a\u038c\u038e\u03ad\u03ae\u03af\u03cc\u038f\u03cd\u03b1"
406       + "\u03b2\u03c8\u03b4\u03b5\u03c6\u03b3\u03b7\u03b9\u03be\u03ba"
407       + "\u03bb\u03bc\u03bd\u03bf\u03c0\u03ce\u03c1\u03c3\u03c4\u03b8"
408       + "\u03c9\u03c2\u03c7\u03c5\u03b6\u03ca\u03cb\u0390\u03b0\u00ad";
409 
410 
411     /**
412      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
414      * MacOS Hebrew encoding.
415      *
416      * <p>The codepoint 0x81 (HEBREW LIGATURE YIDDISH YOD YOD PATAH)
417      * has no composed Unicode equivalent, but is expressed as the
418      * sequence U+05F2 U+05B7 in Unicode. A similar situation exists
419      * with the codepoint 0xC0 (HEBREW LIGATURE LAMED HOLAM), which
420      * MacOS converts to U+F86A U+05DC U+05B9. To correctly deal
421      * with these sequences, we probably should synthesize a ligature
422      * table if a Hebrew font only provides a Type 0 CMAP.
423      *
424      * @see <a href=
425      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/HEBREW.TXT"
426      * >the Unicode mapping table for the MacOS Hebrew encoding</a>
427      */
428     private static final String UPPER_HEBREW
429       = "\u007e\u0000\u00c4\u0000\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1"
430       + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb"
431       + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5"
432       + "\u00fa\u00f9\u00fb\u00fc\u0020\u0021\"\u0023\u0024\u0025"
433       + "\u20aa\u0027\u0029\u0028\u002a\u002b\u002c\u002d\u002e\u002f"
434       + "\u0030\u0031\u0032\u0033\u0034\u0035\u0036\u0037\u0038\u0039"
435       + "\u003a\u003b\u003c\u003d\u003e\u003f\u0000\u201e\uf89b\uf89c"
436       + "\uf89d\uf89e\u05bc\ufb4b\ufb35\u2026\u00a0\u05b8\u05b7\u05b5"
437       + "\u05b6\u05b4\u2013\u2014\u201c\u201d\u2018\u2019\ufb2a\ufb2b"
438       + "\u05bf\u05b0\u05b2\u05b1\u05bb\u05b9\u0000\u05b3\u05d0\u05d1"
439       + "\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05da\u05db"
440       + "\u05dc\u05dd\u05de\u05df\u05e0\u05e1\u05e2\u05e3\u05e4\u05e5"
441       + "\u05e6\u05e7\u05e8\u05e9\u05ea\u007d\u005d\u007b\u005b\u007c";
442 
443 
444     /**
445      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
447      * MacOS Roman encoding with the Icelandic language.
448      *
449      * @see <a href=
450      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ICELAND.TXT"
451      * >the Unicode mapping table for the MacOS Icelandic encoding</a>
452      */
453     private static final String UPPER_ICELANDIC
454       = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1"
455       + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb"
456       + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5"
457       + "\u00fa\u00f9\u00fb\u00fc\u00dd\u00b0\u00a2\u00a3\u00a7\u2022"
458       + "\u00b6\u00df\u00ae\u00a9\u2122\u00b4\u00a8\u2260\u00c6\u00d8"
459       + "\u221e\u00b1\u2264\u2265\u00a5\u00b5\u2202\u2211\u220f\u03c0"
460       + "\u222b\u00aa\u00ba\u03a9\u00e6\u00f8\u00bf\u00a1\u00ac\u221a"
461       + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u00c0\u00c3\u00d5"
462       + "\u0152\u0153\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca"
463       + "\u00ff\u0178\u2044\u20ac\u00d0\u00f0\u00de\u00fe\u00fd\u00b7"
464       + "\u201a\u201e\u2030\u00c2\u00ca\u00c1\u00cb\u00c8\u00cd\u00ce"
465       + "\u00cf\u00cc\u00d3\u00d4\uf8ff\u00d2\u00da\u00db\u00d9\u0131"
466       + "\u02c6\u02dc\u00af\u02d8\u02d9\u02da\u00b8\u02dd\u02db\u02c7";
467 
468 
469     /**
470      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
472      * MacOS Roman encoding for most languages. Exceptions include
473      * Croatian, Icelandic, Romanian, and Turkish.
474      *
475      * @see <a
476      * href="http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT"
477      * >the Unicode mapping table for the MacOS Roman encoding</a>
478      */
479     private static final String UPPER_ROMAN
480       = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1"
481       + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb"
482       + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5"
483       + "\u00fa\u00f9\u00fb\u00fc\u2020\u00b0\u00a2\u00a3\u00a7\u2022"
484       + "\u00b6\u00df\u00ae\u00a9\u2122\u00b4\u00a8\u2260\u00c6\u00d8"
485       + "\u221e\u00b1\u2264\u2265\u00a5\u00b5\u2202\u2211\u220f\u03c0"
486       + "\u222b\u00aa\u00ba\u03a9\u00e6\u00f8\u00bf\u00a1\u00ac\u221a"
487       + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u00c0\u00c3\u00d5"
488       + "\u0152\u0153\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca"
489       + "\u00ff\u0178\u2044\u20ac\u2039\u203a\ufb01\ufb02\u2021\u00b7"
490       + "\u201a\u201e\u2030\u00c2\u00ca\u00c1\u00cb\u00c8\u00cd\u00ce"
491       + "\u00cf\u00cc\u00d3\u00d4\uf8ff\u00d2\u00da\u00db\u00d9\u0131"
492       + "\u02c6\u02dc\u00af\u02d8\u02d9\u02da\u00b8\u02dd\u02db\u02c7";
493 
494 
495     /**
496      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
498      * MacOS Roman encoding with the Romanian language.
499      *
500      * @see <a href=
501      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMANIAN.TXT"
502      * >the Unicode mapping table for the MacOS Romanian encoding</a>
503      */
504     private static final String UPPER_ROMANIAN
505       = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1"
506       + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb"
507       + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5"
508       + "\u00fa\u00f9\u00fb\u00fc\u2020\u00b0\u00a2\u00a3\u00a7\u2022"
509       + "\u00b6\u00df\u00ae\u00a9\u2122\u00b4\u00a8\u2260\u0102\u0218"
510       + "\u221e\u00b1\u2264\u2265\u00a5\u00b5\u2202\u2211\u220f\u03c0"
511       + "\u222b\u00aa\u00ba\u03a9\u0103\u0219\u00bf\u00a1\u00ac\u221a"
512       + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u00c0\u00c3\u00d5"
513       + "\u0152\u0153\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca"
514       + "\u00ff\u0178\u2044\u20ac\u2039\u203a\u021a\u021b\u2021\u00b7"
515       + "\u201a\u201e\u2030\u00c2\u00ca\u00c1\u00cb\u00c8\u00cd\u00ce"
516       + "\u00cf\u00cc\u00d3\u00d4\uf8ff\u00d2\u00da\u00db\u00d9\u0131"
517       + "\u02c6\u02dc\u00af\u02d8\u02d9\u02da\u00b8\u02dd\u02db\u02c7";
518 
519 
520     /**
521      * A String whose <code>charAt(i)</code> is the Unicode character
     * that corresponds to the codepoint <code>i + 126</code> in the
523      * MacOS Roman encoding with the Turkish language.
524      *
525      * @see <a href=
526      * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/TURKISH.TXT"
527      * >the Unicode mapping table for the MacOS Turkish encoding</a>
528      */
529     private static final String UPPER_TURKISH
530       = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1"
531       + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb"
532       + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5"
533       + "\u00fa\u00f9\u00fb\u00fc\u2020\u00b0\u00a2\u00a3\u00a7\u2022"
534       + "\u00b6\u00df\u00ae\u00a9\u2122\u00b4\u00a8\u2260\u00c6\u00d8"
535       + "\u221e\u00b1\u2264\u2265\u00a5\u00b5\u2202\u2211\u220f\u03c0"
536       + "\u222b\u00aa\u00ba\u03a9\u00e6\u00f8\u00bf\u00a1\u00ac\u221a"
537       + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u00c0\u00c3\u00d5"
538       + "\u0152\u0153\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca"
539       + "\u00ff\u0178\u011e\u011f\u0130\u0131\u015e\u015f\u2021\u00b7"
540       + "\u201a\u201e\u2030\u00c2\u00ca\u00c1\u00cb\u00c8\u00cd\u00ce"
541       + "\u00cf\u00cc\u00d3\u00d4\uf8ff\u00d2\u00da\u00db\u00d9\uf8a0"
542       + "\u02c6\u02dc\u00af\u02d8\u02d9\u02da\u00b8\u02dd\u02db\u02c7";
543 
544 
545     /**
546      * Constructs a CharGlyphMap.Type0 from all type 0 cmaps provided
547      * by the font. The implementation is able to fuse multiple type
548      * 0 cmaps, such as the MacRoman, Turkish, Icelandic and Croatian
549      * encoding, into a single map from Unicode characters to glyph
550      * indices.
551      *
552      * @param buf a ByteBuffer whose position is right at the
553      * beginning of the entire cmap table of the font (<i>not</i>
554      * at some subtable).
555      */
Type0(ByteBuffer buf)556     public Type0(ByteBuffer buf)
557     {
558       int numTables;
559       int tableStart = buf.position();
560       int limit = buf.limit();
561 
562       /* The CMAP version must be 0. */
563       if (buf.getChar() != 0)
564         throw new IllegalStateException();
565 
566       numTables = buf.getChar();
567       for (int i = 0; i < numTables; i++)
568       {
569         buf.limit(limit).position(tableStart + 4 + i * 8);
570         int platform = buf.getChar();
571         int encoding = buf.getChar();
572         int offset = tableStart + buf.getInt();
573 
574         buf.position(offset);
575         int format = buf.getChar();
576         int length = buf.getChar();
577         buf.limit(offset + length);
578         int language = buf.getChar();
579 
580         if (format == 0)
581           readSingleTable(buf, platform, language, encoding);
582       }
583     }
584 
585 
586     /**
587      * Processes a CMAP Type 0 table whose platform, encoding and
588      * language are already known.
589      *
590      * @param buf the buffer to read the table from, positioned
591      *        right after the language tag.
592      */
readSingleTable(ByteBuffer buf, int platform, int language, int encoding)593     private void readSingleTable(ByteBuffer buf,
594                                  int platform, int language,
595                                  int encoding)
596     {
597       String upper = getUpper129(platform, encoding, language);
598       if (upper == null)
599         return;
600 
601       /* Skip the MacOS codepoints [0 .. 31] because they do not
602        * correspond to any Unicode codepoint.
603        */
604       buf.position(buf.position() + 32);
605 
606       /* Irrespective of script and language, the MacOS codepoints
607        * [32 .. 126] correspond to the same Unicode codepoint.
608        */
609       for (int i = 32; i < 126; i++)
610         glyphToUCS2[buf.get() & 0xff] = (char) i;
611 
612       for (int i = 127; i < 256; i++)
613         glyphToUCS2[buf.get() & 0xff] = upper.charAt(i - 127);
614 
615       /* Glyph 0 is always the undefined character, which has
616        * no codepoint in Unicode.
617        */
618       glyphToUCS2[0] = 0;
619     }
620 
621 
622     /**
623      * Determines the glyph index for a given Unicode codepoint.
624      *
625      * @param ucs4 the Unicode codepoint in UCS-4 encoding.
626      *
627      * @return the glyph index, or 0 if the font does not contain
628      * a glyph for this codepoint.
629      */
getGlyph(int ucs4)630     public int getGlyph(int ucs4)
631     {
632       /* This linear search is not exactly super fast. However,
633        * only really ancient fonts have only a type 0 cmap,
634        * so it should not hurt in very many cases. If it shows
635        * to be a performance problem, one could do a binary search
636        * on a 256-entry table sorted by Unicode codepoint. The
637        * matching index of that table could then be used to look
638        * up the glyph ID at that position.
639        */
640       for (int i = 0; i < 256; i++)
641         if (glyphToUCS2[i] == ucs4)
642           return i;
643       return 0;
644     }
645 
646 
647     /**
648      * Returns a String whose <code>charAt(i)</code> is the Unicode
649      * character that corresponds to the codepoint <code>i +
650      * 127</code> in the encoding specified by the platform, script
651      * and language tag of a Type 0 CMAP.
652      *
653      * @param language the language tag in the cmap subtable.  For the
654      * Macintosh platform, this is 0 to indicate language-neutral
655      * encoding, or the MacOS language code <i>plus one.</i> The
656      * Apple documentation does not mention that one needs to be
657      * added, but the Adobe OpenType specification does.
658      *
659      * @return a String for mapping the top 129 characters to
660      * UCS-2. If <code>platform</code> is not <code>1</code>
661      * (indicating Macintosh), or if the combination of
662      * <code>script</code> and <code>language</code> is not
663      * recognized, <code>null</code> will be returned.
664      */
getUpper129(int platform, int script, int language)665     private static String getUpper129(int platform, int script, int language)
666     {
667       if (platform != PLATFORM_MACINTOSH)
668         return null;
669 
670       switch (script)
671       {
672       case 0: /* smRoman */
673         if (language == /* langIcelandic+1 */ 16)
674           return UPPER_ICELANDIC;
675         else if (language == /* langTurkish+1 */ 18)
676           return UPPER_TURKISH;
677         else if (language == /* langCroatian+1 */ 19)
678           return UPPER_CROATIAN;
679         else if (language == /* langRomanian+1 */ 38)
680           return UPPER_ROMANIAN;
681         else if (language == /* language-neutral */ 0)
682           return UPPER_ROMAN;
683         else
684           return null;
685 
686       case 4: /* smArabic */
687         if (language == /* langFarsi+1 */ 32)
688           return UPPER_FARSI;
689         else
690           return UPPER_ARABIC;
691 
692       case 5: /* smHebrew */
693         return UPPER_HEBREW;
694 
695       case 6: /* smGreek */
696         return UPPER_GREEK;
697 
698       case 7: /* smCyrillic */
699         return UPPER_CYRILLIC;
700 
701       case 29: /* smSlavic == smEastEurRoman */
702         return UPPER_EAST_EUROPEAN_ROMAN;
703       }
704 
705       return null;
706     }
707   }
708 
709 
710   /**
711    * A mapping from Unicode code points to glyph IDs through CMAP Type
712    * 4 tables. These tables are able to map two-byte encoded text
713    * to glyph IDs, such as Unicode Basic Multilingual Plane which
714    * contains U+0000 .. U+FFFE without surrogates.
715    *
716    * @author Sascha Brawer (brawer@dandelis.ch)
717    */
718   private static final class Type4
719     extends CharGlyphMap
720   {
721     /**
722      * Determines whether this implementation supports a combination
723      * of platform, language and encoding is supported for a type 4
724      * <code>cmap</code> table.
725      *
726      * <p>Currently, we support the following combinations:
727      *
728      * <ul><li>the Unicode platform in encodings 0, 1, 2, 3 and
729      * 4;</li>
730      *
731      * <li>the Microsoft platform in encodings 1 (Basic Multilingual
732      * Plane) and 10 (full Unicode).</li></ul>
733      *
734      * <p>Most recent Macintosh fonts provide a type 4
735      * <code>cmap</code> for Unicode. Microsoft recommends providing a
736      * type 4 <code>cmap</code> for encoding 1 of the Microsoft
737      * platform. The implementation of GNU Classpath supports both
738      * variants.
739      *
740      * <p>Not supported are ShiftJIS, Big5, Wansung, Johab, and other
741      * non-Unicode encodings. Text can easily be converted to Unicode
742      * using the java.nio.charset package.
743      */
isSupported(int platform, int language, int encoding)744     static boolean isSupported(int platform, int language, int encoding)
745     {
746       switch (platform)
747       {
748       case PLATFORM_UNICODE:
749         return (encoding >= 0) && (encoding <= 4);
750 
751       case PLATFORM_MICROSOFT:
752         return (encoding == /* Basic Multilingual Plane */ 1)
753           || (encoding == /* Full Unicode */ 10);
754       }
755 
756       return false;
757     }
758 
759 
760     /**
761      * Processes a CMAP Type 4 table whose platform, encoding and
762      * language are already known. We understand the Unicode platform
763      * with encodings 0, 1, 2, 3 and 4, and the Microsoft platform
764      * with encodings 1 (Unicode BMP) and 10 (UCS-4).
765      *
766      * @param buf the buffer to read the table from, positioned at
767      * its beginning.
768      *
769      * @return a Type4 table, or <code>null</code> if the combination
770      * of platform and encoding is not understood.
771      */
readTable(ByteBuffer buf, int platform, int encoding)772     static Type4 readTable(ByteBuffer buf,
773                            int platform, int encoding)
774     {
775       int tableStart = buf.position();
776       char format = buf.getChar();
777       int length = buf.getChar();
778       int language = buf.getChar();
779 
780       if ((format != 4) || !isSupported(platform, language, encoding))
781         throw new IllegalArgumentException();
782 
783       buf.limit(tableStart + length);
784 
785       int segCountX2 = buf.getChar();
786       int segCount = segCountX2 / 2;
787       int searchRange = buf.getChar();
788       int entrySelector = buf.getChar();
789       int rangeShift = buf.getChar();
790 
791       CharBuffer endCode, startCode, idRangeOffset_glyphID;
792       ShortBuffer idDelta;
793 
794       int pos = buf.position();
795       endCode = buf.asCharBuffer();
796       pos += segCountX2 + /* reservedPad */ 2;
797 
798       buf.position(pos);
799       startCode = buf.asCharBuffer();
800       pos += segCountX2;
801 
802       buf.position(pos);
803       idDelta = buf.asShortBuffer();
804       pos += segCountX2;
805 
806       buf.position(pos);
807       idRangeOffset_glyphID = buf.asCharBuffer();
808 
809       endCode.limit(segCount);
810       startCode.limit(segCount);
811       idDelta.limit(segCount);
812       idRangeOffset_glyphID.limit((buf.limit() - pos) / 2);
813 
814       return new Type4(segCount,
815                        endCode, startCode, idDelta,
816                        idRangeOffset_glyphID);
817     }
818 
819 
820     private CharBuffer lastChar;
821     private CharBuffer firstChar;
822     private ShortBuffer idDelta;
823     private CharBuffer rangeID;
824     private int numSegments;
825 
Type4(int numSegments, CharBuffer lastChar, CharBuffer firstChar, ShortBuffer idDelta, CharBuffer rangeID)826     private Type4(int numSegments,
827                   CharBuffer lastChar, CharBuffer firstChar,
828                   ShortBuffer idDelta, CharBuffer rangeID)
829     {
830       this.numSegments = numSegments;
831       this.lastChar = lastChar;
832       this.firstChar = firstChar;
833       this.idDelta = idDelta;
834       this.rangeID = rangeID;
835     }
836 
837 
838     /**
839      * Determines the glyph index for a given Unicode codepoint.
840      *
841      * @param ucs4 the Unicode codepoint in UCS-4 encoding.
842      *
843      * @return the glyph index, or 0 if the font does not contain
844      * a glyph for this codepoint.
845      */
getGlyph(int ucs4)846     public int getGlyph(int ucs4)
847     {
848       char c, segStart;
849       int segment, idRangeOffset;
850 
851       if (ucs4 > 0xffff)
852         return 0;
853 
854       c = (char) ucs4;
855       segment = find(c);
856       segStart = firstChar.get(segment);
857       if ((c < segStart) || (c > lastChar.get(segment)))
858         return 0;
859 
860       /*
861        *      System.out.println("seg " + segment
862        *                 + ", range=" + (int) rangeID[segment]
863        *                 + ", delta=" + delta[segment]);
864        */
865 
866       idRangeOffset = rangeID.get(segment);
867       if (idRangeOffset == 0)
868         return (int) (char) (((int) c) + idDelta.get(segment));
869       int result = rangeID.get((idRangeOffset >> 1)
870                                + (c - segStart) + segment);
871       if (result == 0)
872         return 0;
873       return (int) (char) (result + idDelta.get(segment));
874     }
875 
876 
find(char c)877     private int find(char c)
878     {
879       int min, max, mid;
880 
881       min = 0;
882       max = numSegments - 1;
883       mid = max >> 1;
884 
885       while (min < max)
886       {
887         // System.out.println("(" + min + "," + max + ") " + mid);
888         char val = lastChar.get(mid);
889         if (val == c)
890           break;
891         else if (val < c)
892           min = mid + 1;
893         else if (val > c)
894           max = mid;
895         mid = (min + max) >> 1;
896       }
897 
898       return mid;
899     }
900   }
901 
902 
903   /**
904    * A mapping from Unicode code points to glyph IDs through CMAP Type
905    * 12 tables. These tables are able to map four-byte encoded text
906    * to glyph IDs, such as Unicode UCS-4.
907    *
908    * @author Sascha Brawer (brawer@dandelis.ch)
909    */
910   private static final class Type12
911     extends CharGlyphMap
912   {
913     int numGroups;
914     IntBuffer data;
915 
916 
917     /**
918      * Determines whether this implementation supports a combination
919      * of platform and encoding for a type 12 <code>cmap</code> table.
920      *
921      * <p>Currently, we support the following combinations:
922      *
923      * <ul><li>the Unicode platform in encodings 0, 1, 2, 3 and
924      * 4;</li>
925      *
926      * <li>the Microsoft platform in encodings 1 (Basic Multilingual
927      * Plane) and 10 (full Unicode).</li></ul>
928      */
isSupported(int platform, int encoding)929     static boolean isSupported(int platform, int encoding)
930     {
931       switch (platform)
932       {
933       case PLATFORM_UNICODE:
934         return (encoding >= 0) && (encoding <= 4);
935 
936       case PLATFORM_MICROSOFT:
937         return (encoding == /* Basic Multilingual Plane */ 1)
938           || (encoding == /* Full Unicode */ 10);
939       }
940 
941       return false;
942     }
943 
944 
945     /**
946      * Constructs a <code>cmap</code> type 12 table whose platform and
947      * encoding are already known. We understand the Unicode platform
948      * with encodings 0, 1, 2, 3 and 4, and the Microsoft platform
949      * with encodings 1 (Unicode BMP) and 10 (UCS-4).
950      *
951      * @param buf the buffer to read the table from, positioned at
952      * its beginning.
953      */
Type12(ByteBuffer buf, int platform, int encoding)954     Type12(ByteBuffer buf, int platform, int encoding)
955     {
956       int tableStart = buf.position();
957       int format = buf.getChar();
958       if ((format != 12) || !isSupported(platform, encoding))
959         throw new IllegalStateException();
960 
961       buf.getChar(); // skip reserved field
962       buf.limit(tableStart + buf.getInt());
963       int language = buf.getInt();
964       numGroups = buf.getInt();
965       data = buf.asIntBuffer();
966     }
967 
968 
969     /**
970      * Determines the glyph index for a given Unicode codepoint.  Users
971      * should be aware that the character-to-glyph mapping not not
972      * everything that is needed for full Unicode support.  For example,
973      * the <code>cmap</code> table is not able to synthesize accented
974      * glyphs from the canonical decomposition sequence, even if the
975      * font would contain a glyph for the composed form.
976      *
977      * @param ucs4 the Unicode codepoint in UCS-4 encoding. Surrogates
978      * (U+D800 to U+DFFF) cannot be passed, they must be mapped to
979      * UCS-4 first.
980      *
981      * @return the glyph index, or 0 if the font does not contain
982      * a glyph for this codepoint.
983      */
getGlyph(int ucs4)984     public int getGlyph(int ucs4)
985     {
986       int min, max, mid, startCharCode, endCharCode;
987 
988       min = 0;
989       max = numGroups - 1;
990       mid = max >> 1;
991       do
992       {
993         startCharCode = data.get(3 * mid);
994         endCharCode = data.get(3 * mid + 1);
995 
996 
997         /*
998         System.out.println("group " + mid + " (U+"
999         + Integer.toHexString(startCharCode)
1000         + " .. U+" + Integer.toHexString(endCharCode)
1001         + "): glyph " + (int) data.get(mid*3+2));
1002         */
1003 
1004         if ((startCharCode <= ucs4)  && (ucs4 <= endCharCode))
1005           return ucs4
1006             - startCharCode
1007             + /* startGlyphID */ data.get(mid * 3 + 2);
1008 
1009         if (endCharCode < ucs4)
1010           min = mid + 1;
1011         else
1012           max = mid;
1013         mid = (min + max) >> 1;
1014       }
1015       while (min < max);
1016 
1017       startCharCode = data.get(3 * mid);
1018       endCharCode = data.get(3 * mid + 1);
1019       if ((startCharCode <= ucs4)  && (ucs4 <= endCharCode))
1020         return ucs4
1021           - startCharCode
1022           + /* startGlyphID */ data.get(mid * 3 + 2);
1023 
1024       return 0;
1025     }
1026   }
1027 }
1028