1 /* CharGlyphMap.java -- Manages the 'cmap' table of TrueType fonts 2 Copyright (C) 2006 Free Software Foundation, Inc. 3 4 This file is part of GNU Classpath. 5 6 GNU Classpath is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 GNU Classpath is distributed in the hope that it will be useful, but 12 WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with GNU Classpath; see the file COPYING. If not, write to the 18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 02110-1301 USA. 20 21 Linking this library statically or dynamically with other modules is 22 making a combined work based on this library. Thus, the terms and 23 conditions of the GNU General Public License cover the whole 24 combination. 25 26 As a special exception, the copyright holders of this library give you 27 permission to link this library with independent modules to produce an 28 executable, regardless of the license terms of these independent 29 modules, and to copy and distribute the resulting executable under 30 terms of your choice, provided that you also meet, for each linked 31 independent module, the terms and conditions of the license of that 32 module. An independent module is a module which is not derived from 33 or based on this library. If you modify this library, you may extend 34 this exception to your version of the library, but you are not 35 obligated to do so. If you do not wish to do so, delete this 36 exception statement from your version. */ 37 38 39 package gnu.java.awt.font.opentype; 40 41 import java.nio.ByteBuffer; 42 import java.nio.CharBuffer; 43 import java.nio.ShortBuffer; 44 import java.nio.IntBuffer; 45 46 47 /** 48 * A mapping from Unicode codepoints to glyphs. This mapping 49 * does not perform any re-ordering or decomposition, so it 50 * is not everything that is needed to support Unicode. 51 * 52 * <p>This class manages the <code>cmap</code> table of 53 * OpenType and TrueType fonts. 54 * 55 * @see <a href="http://partners.adobe.com/asn/tech/type/opentype/cmap.jsp"> 56 * the <code>cmap</code> part of Adobe’ OpenType Specification</a> 57 * 58 * @see <a href="http://developer.apple.com/fonts/TTRefMan/RM06/Chap6cmap.html"> 59 * the <code>cmap</code> section of Apple’s TrueType Reference 60 * Manual</a> 61 * 62 * @author Sascha Brawer (brawer@dandelis.ch) 63 */ 64 public abstract class CharGlyphMap 65 { 66 private static final int PLATFORM_UNICODE = 0; 67 private static final int PLATFORM_MACINTOSH = 1; 68 private static final int PLATFORM_MICROSOFT = 3; 69 70 71 /** 72 * Determines the glyph index for a given Unicode codepoint. Users 73 * should be aware that the character-to-glyph mapping not not 74 * everything that is needed for full Unicode support. For example, 75 * the <code>cmap</code> table is not able to synthesize accented 76 * glyphs from the canonical decomposition sequence, even if the 77 * font would contain a glyph for the composed form. 78 * 79 * @param ucs4 the Unicode codepoint in UCS-4 encoding. Surrogates 80 * (U+D800 to U+DFFF) cannot be passed, they must be mapped to 81 * UCS-4 first. 82 * 83 * @return the glyph index, or 0 if the font does not contain 84 * a glyph for this codepoint. 85 */ getGlyph(int ucs4)86 public abstract int getGlyph(int ucs4); 87 88 89 /** 90 * Reads a CharGlyphMap from an OpenType or TrueType <code>cmap</code> 91 * table. The current implementation works as follows: 92 * 93 * <p><ol><li>If the font has a type 4 cmap for the Unicode platform 94 * (encoding 0, 1, 2, 3 or 4), or a type 4 cmap for the Microsoft 95 * platform (encodings 1 or 10), that table is used to map Unicode 96 * codepoints to glyphs. Most recent fonts, both for Macintosh and 97 * Windows, should provide such a table.</li> 98 * 99 * <li>Otherwise, if the font has any type 0 cmap for the Macintosh 100 * platform, a Unicode-to-glyph mapping is synthesized from certain 101 * type 0 cmaps. The current implementation collects mappings from 102 * Roman, Icelandic, Turkish, Croatian, Romanian, Eastern European, 103 * Cyrillic, Greek, Hebrew, Arabic and Farsi cmaps.</li>.</ol> 104 * 105 * @param buf a buffer whose position is right at the start 106 * of the entire <code>cmap</code> table, and whose limit 107 * is at its end. 108 * 109 * @return a concrete subclass of <code>CharGlyphMap</code> 110 * that performs the mapping. 111 * 112 * @see <a href= 113 * "http://partners.adobe.com/asn/tech/type/opentype/cmap.jsp" 114 * >the <code>cmap</code> part of Adobe’ OpenType Specification</a> 115 * 116 * @see <a href= 117 * "http://developer.apple.com/fonts/TTRefMan/RM06/Chap6cmap.html" 118 * >the <code>cmap</code> section of Apple’s TrueType Reference 119 * Manual</a> 120 */ forTable(ByteBuffer buf)121 public static CharGlyphMap forTable(ByteBuffer buf) 122 { 123 boolean hasType0 = false; 124 int start4 = -1, platform4 = 0, encoding4 = 0; 125 int start12 = -1, platform12 = 0, encoding12 = 0; 126 int version; 127 int numTables; 128 int tableStart = buf.position(); 129 int limit = buf.limit(); 130 int format, platform, language, encoding, length, offset; 131 132 version = buf.getChar(); 133 if (version != 0) 134 return null; 135 136 numTables = buf.getChar(); 137 for (int i = 0; i < numTables; i++) 138 { 139 buf.limit(limit).position(tableStart + 4 + i * 8); 140 platform = buf.getChar(); 141 encoding = buf.getChar(); 142 offset = tableStart + buf.getInt(); 143 144 buf.position(offset); 145 format = buf.getChar(); 146 147 switch (format) 148 { 149 case 0: 150 hasType0 = true; 151 break; 152 153 case 4: 154 length = buf.getChar(); 155 language = buf.getChar(); 156 if ((start4 == -1) 157 && Type4.isSupported(platform, language, encoding)) 158 { 159 start4 = offset; 160 platform4 = platform; 161 encoding4 = encoding; 162 } 163 break; 164 165 case 12: 166 if ((start12 == -1) && Type12.isSupported(platform, encoding)) 167 { 168 start12 = offset; 169 platform12 = platform; 170 encoding12 = encoding; 171 } 172 break; 173 } 174 } 175 176 177 if (start12 >= 0) 178 { 179 try 180 { 181 buf.limit(limit).position(start12); 182 return new Type12(buf, platform12, encoding12); 183 } 184 catch (Exception ex) 185 { 186 ex.printStackTrace(); 187 } 188 } 189 190 if (start4 >= 0) 191 { 192 try 193 { 194 buf.limit(limit).position(start4); 195 return Type4.readTable(buf, platform4, encoding4); 196 } 197 catch (Exception ex) 198 { 199 } 200 } 201 202 if (hasType0) 203 { 204 try 205 { 206 buf.limit(limit).position(tableStart); 207 return new Type0(buf); 208 } 209 catch (Exception ex) 210 { 211 } 212 } 213 214 return new Dummy(); 215 } 216 217 218 /** 219 * A dummy mapping that maps anything to the undefined glyph. 220 * Used if no other cmap is understood in a font. 221 * 222 * @author Sascha Brawer (brawer@dandelis.ch) 223 */ 224 private static final class Dummy 225 extends CharGlyphMap 226 { getGlyph(int ucs4)227 public int getGlyph(int ucs4) 228 { 229 return 0; 230 } 231 } 232 233 234 /** 235 * A mapping from Unicode code points to glyph IDs through CMAP Type 236 * 0 tables. These tables have serious limitations: Only the first 237 * 256 glyphs can be addressed, and the source of the mapping is not 238 * Unicode, but an encoding used on the Macintosh. 239 * 240 * <p>However, some fonts have only a Type 0 cmap. In this case, we 241 * process all the Type 0 tables we understand, and establish 242 * a reversed glyph-to-Unicode mapping. When a glyph is requested 243 * for a given Unicode character, we perform a linear search on the 244 * reversed table to find the glyph which maps to the requested 245 * character. While not blazingly fast, this gives a reasonable 246 * fallback for old fonts. 247 * 248 * @author Sascha Brawer (brawer@dandelis.ch) 249 */ 250 private static final class Type0 251 extends CharGlyphMap 252 { 253 /** 254 * An array whose <code>i</code>-th element indicates the 255 * Unicode code point of glyph <code>i</code> in the font. 256 */ 257 private char[] glyphToUCS2 = new char[256]; 258 259 260 /** 261 * A String whose <code>charAt(i)</code> is the Unicode character 262 * that corresponds to the codepoint <code>i + 127</code> in the 263 * MacOS Arabic encoding. 264 * 265 * @see <a href= 266 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ARABIC.TXT" 267 * >the Unicode mapping table for the MacOS Arabic encoding</a> 268 */ 269 private static final String UPPER_ARABIC 270 = "\u007e\u0000\u00c4\u00a0\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1" 271 + "\u00e0\u00e2\u00e4\u06ba\u00ab\u00e7\u00e9\u00e8\u00ea\u00eb" 272 + "\u00ed\u2026\u00ee\u00ef\u00f1\u00f3\u00bb\u00f4\u00f6\u00f7" 273 + "\u00fa\u00f9\u00fb\u00fc\u0020\u0021\"\u0023\u0024\u066a" 274 + "\u0026\u0027\u0028\u0029\u002a\u002b\u060c\u002d\u002e\u002f" 275 + "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669" 276 + "\u003a\u061b\u003c\u003d\u003e\u061f\u274a\u0621\u0622\u0623" 277 + "\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d" 278 + "\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637" 279 + "\u0638\u0639\u063a\u005b\\\u005d\u005e\u005f\u0640\u0641" 280 + "\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u064b" 281 + "\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u067e\u0679\u0686" 282 + "\u06d5\u06a4\u06af\u0688\u0691\u007b\u007c\u007d\u0698\u06d2"; 283 284 285 /** 286 * A String whose <code>charAt(i)</code> is the Unicode character 287 * that corresponds to the codepoint <code>i + 127</code> in the 288 * MacOS East European Roman encoding. 289 * 290 * @see <a href= 291 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/CENTEURO.TXT" 292 * >the Unicode mapping table for the MacOS Central European 293 * encoding</a> 294 */ 295 private static final String UPPER_EAST_EUROPEAN_ROMAN 296 = "\u007e\u0000\u00c4\u0100\u0101\u00c9\u0104\u00d6\u00dc\u00e1" 297 + "\u0105\u010c\u00e4\u010d\u0106\u0107\u00e9\u0179\u017a\u010e" 298 + "\u00ed\u010f\u0112\u0113\u0116\u00f3\u0117\u00f4\u00f6\u00f5" 299 + "\u00fa\u011a\u011b\u00fc\u2020\u00b0\u0118\u00a3\u00a7\u2022" 300 + "\u00b6\u00df\u00ae\u00a9\u2122\u0119\u00a8\u2260\u0123\u012e" 301 + "\u012f\u012a\u2264\u2265\u012b\u0136\u2202\u2211\u0142\u013b" 302 + "\u013c\u013d\u013e\u0139\u013a\u0145\u0146\u0143\u00ac\u221a" 303 + "\u0144\u0147\u2206\u00ab\u00bb\u2026\u00a0\u0148\u0150\u00d5" 304 + "\u0151\u014c\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca" 305 + "\u014d\u0154\u0155\u0158\u2039\u203a\u0159\u0156\u0157\u0160" 306 + "\u201a\u201e\u0161\u015a\u015b\u00c1\u0164\u0165\u00cd\u017d" 307 + "\u017e\u016a\u00d3\u00d4\u016b\u016e\u00da\u016f\u0170\u0171" 308 + "\u0172\u0173\u00dd\u00fd\u0137\u017b\u0141\u017c\u0122\u02c7"; 309 310 311 /** 312 * A String whose <code>charAt(i)</code> is the Unicode character 313 * that corresponds to the codepoint <code>i + 127</code> in the 314 * MacOS Roman encoding for the Croatian language. 315 * 316 * @see <a href= 317 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/CROATIAN.TXT" 318 * >the Unicode mapping table for the MacOS Croatian encoding</a> 319 */ 320 private static final String UPPER_CROATIAN 321 = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1" 322 + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb" 323 + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5" 324 + "\u00fa\u00f9\u00fb\u00fc\u2020\u00b0\u00a2\u00a3\u00a7\u2022" 325 + "\u00b6\u00df\u00ae\u0160\u2122\u00b4\u00a8\u2260\u017d\u00d8" 326 + "\u221e\u00b1\u2264\u2265\u2206\u00b5\u2202\u2211\u220f\u0161" 327 + "\u222b\u00aa\u00ba\u03a9\u017e\u00f8\u00bf\u00a1\u00ac\u221a" 328 + "\u0192\u2248\u0106\u00ab\u010c\u2026\u00a0\u00c0\u00c3\u00d5" 329 + "\u0152\u0153\u0110\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca" 330 + "\uf8ff\u00a9\u2044\u20ac\u2039\u203a\u00c6\u00bb\u2013\u00b7" 331 + "\u201a\u201e\u2030\u00c2\u0107\u00c1\u010d\u00c8\u00cd\u00ce" 332 + "\u00cf\u00cc\u00d3\u00d4\u0111\u00d2\u00da\u00db\u00d9\u0131" 333 + "\u02c6\u02dc\u00af\u03c0\u00cb\u02da\u00b8\u00ca\u00e6\u02c7"; 334 335 336 /** 337 * A String whose <code>charAt(i)</code> is the Unicode character 338 * that corresponds to the codepoint <code>i + 127</code> in the 339 * MacOS Cyrillic encoding. 340 * 341 * @see <a href= 342 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/CYRILLIC.TXT" 343 * >the Unicode mapping table for the MacOS Cyrillic encoding</a> 344 */ 345 private static final String UPPER_CYRILLIC 346 = "\u007e\u0000\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417" 347 + "\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421" 348 + "\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042a\u042b" 349 + "\u042c\u042d\u042e\u042f\u2020\u00b0\u0490\u00a3\u00a7\u2022" 350 + "\u00b6\u0406\u00ae\u00a9\u2122\u0402\u0452\u2260\u0403\u0453" 351 + "\u221e\u00b1\u2264\u2265\u0456\u00b5\u0491\u0408\u0404\u0454" 352 + "\u0407\u0457\u0409\u0459\u040a\u045a\u0458\u0405\u00ac\u221a" 353 + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u040b\u045b\u040c" 354 + "\u045c\u0455\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u201e" 355 + "\u040e\u045e\u040f\u045f\u2116\u0401\u0451\u044f\u0430\u0431" 356 + "\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b" 357 + "\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445" 358 + "\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u20ac"; 359 360 361 /** 362 * A String whose <code>charAt(i)</code> is the Unicode character 363 * that corresponds to the codepoint <code>i + 127</code> in the 364 * MacOS Arabic encoding with the Farsi language. 365 * 366 * @see <a href= 367 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/FARSI.TXT" 368 * >the Unicode mapping table for the MacOS Farsi encoding</a> 369 */ 370 private static final String UPPER_FARSI 371 = "\u007e\u0000\u00c4\u00a0\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1" 372 + "\u00e0\u00e2\u00e4\u06ba\u00ab\u00e7\u00e9\u00e8\u00ea\u00eb" 373 + "\u00ed\u2026\u00ee\u00ef\u00f1\u00f3\u00bb\u00f4\u00f6\u00f7" 374 + "\u00fa\u00f9\u00fb\u00fc\u0020\u0021\"\u0023\u0024\u066a" 375 + "\u0026\u0027\u0028\u0029\u002a\u002b\u060c\u002d\u002e\u002f" 376 + "\u06f0\u06f1\u06f2\u06f3\u06f4\u06f5\u06f6\u06f7\u06f8\u06f9" 377 + "\u003a\u061b\u003c\u003d\u003e\u061f\u274a\u0621\u0622\u0623" 378 + "\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d" 379 + "\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637" 380 + "\u0638\u0639\u063a\u005b\\\u005d\u005e\u005f\u0640\u0641" 381 + "\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u064b" 382 + "\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u067e\u0679\u0686" 383 + "\u06d5\u06a4\u06af\u0688\u0691\u007b\u007c\u007d\u0698\u06d2"; 384 385 386 /** 387 * A String whose <code>charAt(i)</code> is the Unicode character 388 * that corresponds to the codepoint <code>i + 127</code> in the 389 * MacOS Greek encoding. 390 * 391 * @see <a 392 * href="http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/GREEK.TXT" 393 * >the Unicode mapping table for the MacOS Greek encoding</a> 394 */ 395 private static final String UPPER_GREEK 396 = "\u007e\u0000\u00c4\u00b9\u00b2\u00c9\u00b3\u00d6\u00dc\u0385" 397 + "\u00e0\u00e2\u00e4\u0384\u00a8\u00e7\u00e9\u00e8\u00ea\u00eb" 398 + "\u00a3\u2122\u00ee\u00ef\u2022\u00bd\u2030\u00f4\u00f6\u00a6" 399 + "\u20ac\u00f9\u00fb\u00fc\u2020\u0393\u0394\u0398\u039b\u039e" 400 + "\u03a0\u00df\u00ae\u00a9\u03a3\u03aa\u00a7\u2260\u00b0\u00b7" 401 + "\u0391\u00b1\u2264\u2265\u00a5\u0392\u0395\u0396\u0397\u0399" 402 + "\u039a\u039c\u03a6\u03ab\u03a8\u03a9\u03ac\u039d\u00ac\u039f" 403 + "\u03a1\u2248\u03a4\u00ab\u00bb\u2026\u00a0\u03a5\u03a7\u0386" 404 + "\u0388\u0153\u2013\u2015\u201c\u201d\u2018\u2019\u00f7\u0389" 405 + "\u038a\u038c\u038e\u03ad\u03ae\u03af\u03cc\u038f\u03cd\u03b1" 406 + "\u03b2\u03c8\u03b4\u03b5\u03c6\u03b3\u03b7\u03b9\u03be\u03ba" 407 + "\u03bb\u03bc\u03bd\u03bf\u03c0\u03ce\u03c1\u03c3\u03c4\u03b8" 408 + "\u03c9\u03c2\u03c7\u03c5\u03b6\u03ca\u03cb\u0390\u03b0\u00ad"; 409 410 411 /** 412 * A String whose <code>charAt(i)</code> is the Unicode character 413 * that corresponds to the codepoint <code>i + 127</code> in the 414 * MacOS Hebrew encoding. 415 * 416 * <p>The codepoint 0x81 (HEBREW LIGATURE YIDDISH YOD YOD PATAH) 417 * has no composed Unicode equivalent, but is expressed as the 418 * sequence U+05F2 U+05B7 in Unicode. A similar situation exists 419 * with the codepoint 0xC0 (HEBREW LIGATURE LAMED HOLAM), which 420 * MacOS converts to U+F86A U+05DC U+05B9. To correctly deal 421 * with these sequences, we probably should synthesize a ligature 422 * table if a Hebrew font only provides a Type 0 CMAP. 423 * 424 * @see <a href= 425 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/HEBREW.TXT" 426 * >the Unicode mapping table for the MacOS Hebrew encoding</a> 427 */ 428 private static final String UPPER_HEBREW 429 = "\u007e\u0000\u00c4\u0000\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1" 430 + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb" 431 + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5" 432 + "\u00fa\u00f9\u00fb\u00fc\u0020\u0021\"\u0023\u0024\u0025" 433 + "\u20aa\u0027\u0029\u0028\u002a\u002b\u002c\u002d\u002e\u002f" 434 + "\u0030\u0031\u0032\u0033\u0034\u0035\u0036\u0037\u0038\u0039" 435 + "\u003a\u003b\u003c\u003d\u003e\u003f\u0000\u201e\uf89b\uf89c" 436 + "\uf89d\uf89e\u05bc\ufb4b\ufb35\u2026\u00a0\u05b8\u05b7\u05b5" 437 + "\u05b6\u05b4\u2013\u2014\u201c\u201d\u2018\u2019\ufb2a\ufb2b" 438 + "\u05bf\u05b0\u05b2\u05b1\u05bb\u05b9\u0000\u05b3\u05d0\u05d1" 439 + "\u05d2\u05d3\u05d4\u05d5\u05d6\u05d7\u05d8\u05d9\u05da\u05db" 440 + "\u05dc\u05dd\u05de\u05df\u05e0\u05e1\u05e2\u05e3\u05e4\u05e5" 441 + "\u05e6\u05e7\u05e8\u05e9\u05ea\u007d\u005d\u007b\u005b\u007c"; 442 443 444 /** 445 * A String whose <code>charAt(i)</code> is the Unicode character 446 * that corresponds to the codepoint <code>i + 127</code> in the 447 * MacOS Roman encoding with the Icelandic language. 448 * 449 * @see <a href= 450 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ICELAND.TXT" 451 * >the Unicode mapping table for the MacOS Icelandic encoding</a> 452 */ 453 private static final String UPPER_ICELANDIC 454 = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1" 455 + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb" 456 + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5" 457 + "\u00fa\u00f9\u00fb\u00fc\u00dd\u00b0\u00a2\u00a3\u00a7\u2022" 458 + "\u00b6\u00df\u00ae\u00a9\u2122\u00b4\u00a8\u2260\u00c6\u00d8" 459 + "\u221e\u00b1\u2264\u2265\u00a5\u00b5\u2202\u2211\u220f\u03c0" 460 + "\u222b\u00aa\u00ba\u03a9\u00e6\u00f8\u00bf\u00a1\u00ac\u221a" 461 + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u00c0\u00c3\u00d5" 462 + "\u0152\u0153\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca" 463 + "\u00ff\u0178\u2044\u20ac\u00d0\u00f0\u00de\u00fe\u00fd\u00b7" 464 + "\u201a\u201e\u2030\u00c2\u00ca\u00c1\u00cb\u00c8\u00cd\u00ce" 465 + "\u00cf\u00cc\u00d3\u00d4\uf8ff\u00d2\u00da\u00db\u00d9\u0131" 466 + "\u02c6\u02dc\u00af\u02d8\u02d9\u02da\u00b8\u02dd\u02db\u02c7"; 467 468 469 /** 470 * A String whose <code>charAt(i)</code> is the Unicode character 471 * that corresponds to the codepoint <code>i + 127</code> in the 472 * MacOS Roman encoding for most languages. Exceptions include 473 * Croatian, Icelandic, Romanian, and Turkish. 474 * 475 * @see <a 476 * href="http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT" 477 * >the Unicode mapping table for the MacOS Roman encoding</a> 478 */ 479 private static final String UPPER_ROMAN 480 = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1" 481 + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb" 482 + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5" 483 + "\u00fa\u00f9\u00fb\u00fc\u2020\u00b0\u00a2\u00a3\u00a7\u2022" 484 + "\u00b6\u00df\u00ae\u00a9\u2122\u00b4\u00a8\u2260\u00c6\u00d8" 485 + "\u221e\u00b1\u2264\u2265\u00a5\u00b5\u2202\u2211\u220f\u03c0" 486 + "\u222b\u00aa\u00ba\u03a9\u00e6\u00f8\u00bf\u00a1\u00ac\u221a" 487 + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u00c0\u00c3\u00d5" 488 + "\u0152\u0153\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca" 489 + "\u00ff\u0178\u2044\u20ac\u2039\u203a\ufb01\ufb02\u2021\u00b7" 490 + "\u201a\u201e\u2030\u00c2\u00ca\u00c1\u00cb\u00c8\u00cd\u00ce" 491 + "\u00cf\u00cc\u00d3\u00d4\uf8ff\u00d2\u00da\u00db\u00d9\u0131" 492 + "\u02c6\u02dc\u00af\u02d8\u02d9\u02da\u00b8\u02dd\u02db\u02c7"; 493 494 495 /** 496 * A String whose <code>charAt(i)</code> is the Unicode character 497 * that corresponds to the codepoint <code>i + 127</code> in the 498 * MacOS Roman encoding with the Romanian language. 499 * 500 * @see <a href= 501 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMANIAN.TXT" 502 * >the Unicode mapping table for the MacOS Romanian encoding</a> 503 */ 504 private static final String UPPER_ROMANIAN 505 = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1" 506 + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb" 507 + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5" 508 + "\u00fa\u00f9\u00fb\u00fc\u2020\u00b0\u00a2\u00a3\u00a7\u2022" 509 + "\u00b6\u00df\u00ae\u00a9\u2122\u00b4\u00a8\u2260\u0102\u0218" 510 + "\u221e\u00b1\u2264\u2265\u00a5\u00b5\u2202\u2211\u220f\u03c0" 511 + "\u222b\u00aa\u00ba\u03a9\u0103\u0219\u00bf\u00a1\u00ac\u221a" 512 + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u00c0\u00c3\u00d5" 513 + "\u0152\u0153\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca" 514 + "\u00ff\u0178\u2044\u20ac\u2039\u203a\u021a\u021b\u2021\u00b7" 515 + "\u201a\u201e\u2030\u00c2\u00ca\u00c1\u00cb\u00c8\u00cd\u00ce" 516 + "\u00cf\u00cc\u00d3\u00d4\uf8ff\u00d2\u00da\u00db\u00d9\u0131" 517 + "\u02c6\u02dc\u00af\u02d8\u02d9\u02da\u00b8\u02dd\u02db\u02c7"; 518 519 520 /** 521 * A String whose <code>charAt(i)</code> is the Unicode character 522 * that corresponds to the codepoint <code>i + 127</code> in the 523 * MacOS Roman encoding with the Turkish language. 524 * 525 * @see <a href= 526 * "http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/TURKISH.TXT" 527 * >the Unicode mapping table for the MacOS Turkish encoding</a> 528 */ 529 private static final String UPPER_TURKISH 530 = "\u007e\u0000\u00c4\u00c5\u00c7\u00c9\u00d1\u00d6\u00dc\u00e1" 531 + "\u00e0\u00e2\u00e4\u00e3\u00e5\u00e7\u00e9\u00e8\u00ea\u00eb" 532 + "\u00ed\u00ec\u00ee\u00ef\u00f1\u00f3\u00f2\u00f4\u00f6\u00f5" 533 + "\u00fa\u00f9\u00fb\u00fc\u2020\u00b0\u00a2\u00a3\u00a7\u2022" 534 + "\u00b6\u00df\u00ae\u00a9\u2122\u00b4\u00a8\u2260\u00c6\u00d8" 535 + "\u221e\u00b1\u2264\u2265\u00a5\u00b5\u2202\u2211\u220f\u03c0" 536 + "\u222b\u00aa\u00ba\u03a9\u00e6\u00f8\u00bf\u00a1\u00ac\u221a" 537 + "\u0192\u2248\u2206\u00ab\u00bb\u2026\u00a0\u00c0\u00c3\u00d5" 538 + "\u0152\u0153\u2013\u2014\u201c\u201d\u2018\u2019\u00f7\u25ca" 539 + "\u00ff\u0178\u011e\u011f\u0130\u0131\u015e\u015f\u2021\u00b7" 540 + "\u201a\u201e\u2030\u00c2\u00ca\u00c1\u00cb\u00c8\u00cd\u00ce" 541 + "\u00cf\u00cc\u00d3\u00d4\uf8ff\u00d2\u00da\u00db\u00d9\uf8a0" 542 + "\u02c6\u02dc\u00af\u02d8\u02d9\u02da\u00b8\u02dd\u02db\u02c7"; 543 544 545 /** 546 * Constructs a CharGlyphMap.Type0 from all type 0 cmaps provided 547 * by the font. The implementation is able to fuse multiple type 548 * 0 cmaps, such as the MacRoman, Turkish, Icelandic and Croatian 549 * encoding, into a single map from Unicode characters to glyph 550 * indices. 551 * 552 * @param buf a ByteBuffer whose position is right at the 553 * beginning of the entire cmap table of the font (<i>not</i> 554 * at some subtable). 555 */ Type0(ByteBuffer buf)556 public Type0(ByteBuffer buf) 557 { 558 int numTables; 559 int tableStart = buf.position(); 560 int limit = buf.limit(); 561 562 /* The CMAP version must be 0. */ 563 if (buf.getChar() != 0) 564 throw new IllegalStateException(); 565 566 numTables = buf.getChar(); 567 for (int i = 0; i < numTables; i++) 568 { 569 buf.limit(limit).position(tableStart + 4 + i * 8); 570 int platform = buf.getChar(); 571 int encoding = buf.getChar(); 572 int offset = tableStart + buf.getInt(); 573 574 buf.position(offset); 575 int format = buf.getChar(); 576 int length = buf.getChar(); 577 buf.limit(offset + length); 578 int language = buf.getChar(); 579 580 if (format == 0) 581 readSingleTable(buf, platform, language, encoding); 582 } 583 } 584 585 586 /** 587 * Processes a CMAP Type 0 table whose platform, encoding and 588 * language are already known. 589 * 590 * @param buf the buffer to read the table from, positioned 591 * right after the language tag. 592 */ readSingleTable(ByteBuffer buf, int platform, int language, int encoding)593 private void readSingleTable(ByteBuffer buf, 594 int platform, int language, 595 int encoding) 596 { 597 String upper = getUpper129(platform, encoding, language); 598 if (upper == null) 599 return; 600 601 /* Skip the MacOS codepoints [0 .. 31] because they do not 602 * correspond to any Unicode codepoint. 603 */ 604 buf.position(buf.position() + 32); 605 606 /* Irrespective of script and language, the MacOS codepoints 607 * [32 .. 126] correspond to the same Unicode codepoint. 608 */ 609 for (int i = 32; i < 126; i++) 610 glyphToUCS2[buf.get() & 0xff] = (char) i; 611 612 for (int i = 127; i < 256; i++) 613 glyphToUCS2[buf.get() & 0xff] = upper.charAt(i - 127); 614 615 /* Glyph 0 is always the undefined character, which has 616 * no codepoint in Unicode. 617 */ 618 glyphToUCS2[0] = 0; 619 } 620 621 622 /** 623 * Determines the glyph index for a given Unicode codepoint. 624 * 625 * @param ucs4 the Unicode codepoint in UCS-4 encoding. 626 * 627 * @return the glyph index, or 0 if the font does not contain 628 * a glyph for this codepoint. 629 */ getGlyph(int ucs4)630 public int getGlyph(int ucs4) 631 { 632 /* This linear search is not exactly super fast. However, 633 * only really ancient fonts have only a type 0 cmap, 634 * so it should not hurt in very many cases. If it shows 635 * to be a performance problem, one could do a binary search 636 * on a 256-entry table sorted by Unicode codepoint. The 637 * matching index of that table could then be used to look 638 * up the glyph ID at that position. 639 */ 640 for (int i = 0; i < 256; i++) 641 if (glyphToUCS2[i] == ucs4) 642 return i; 643 return 0; 644 } 645 646 647 /** 648 * Returns a String whose <code>charAt(i)</code> is the Unicode 649 * character that corresponds to the codepoint <code>i + 650 * 127</code> in the encoding specified by the platform, script 651 * and language tag of a Type 0 CMAP. 652 * 653 * @param language the language tag in the cmap subtable. For the 654 * Macintosh platform, this is 0 to indicate language-neutral 655 * encoding, or the MacOS language code <i>plus one.</i> The 656 * Apple documentation does not mention that one needs to be 657 * added, but the Adobe OpenType specification does. 658 * 659 * @return a String for mapping the top 129 characters to 660 * UCS-2. If <code>platform</code> is not <code>1</code> 661 * (indicating Macintosh), or if the combination of 662 * <code>script</code> and <code>language</code> is not 663 * recognized, <code>null</code> will be returned. 664 */ getUpper129(int platform, int script, int language)665 private static String getUpper129(int platform, int script, int language) 666 { 667 if (platform != PLATFORM_MACINTOSH) 668 return null; 669 670 switch (script) 671 { 672 case 0: /* smRoman */ 673 if (language == /* langIcelandic+1 */ 16) 674 return UPPER_ICELANDIC; 675 else if (language == /* langTurkish+1 */ 18) 676 return UPPER_TURKISH; 677 else if (language == /* langCroatian+1 */ 19) 678 return UPPER_CROATIAN; 679 else if (language == /* langRomanian+1 */ 38) 680 return UPPER_ROMANIAN; 681 else if (language == /* language-neutral */ 0) 682 return UPPER_ROMAN; 683 else 684 return null; 685 686 case 4: /* smArabic */ 687 if (language == /* langFarsi+1 */ 32) 688 return UPPER_FARSI; 689 else 690 return UPPER_ARABIC; 691 692 case 5: /* smHebrew */ 693 return UPPER_HEBREW; 694 695 case 6: /* smGreek */ 696 return UPPER_GREEK; 697 698 case 7: /* smCyrillic */ 699 return UPPER_CYRILLIC; 700 701 case 29: /* smSlavic == smEastEurRoman */ 702 return UPPER_EAST_EUROPEAN_ROMAN; 703 } 704 705 return null; 706 } 707 } 708 709 710 /** 711 * A mapping from Unicode code points to glyph IDs through CMAP Type 712 * 4 tables. These tables are able to map two-byte encoded text 713 * to glyph IDs, such as Unicode Basic Multilingual Plane which 714 * contains U+0000 .. U+FFFE without surrogates. 715 * 716 * @author Sascha Brawer (brawer@dandelis.ch) 717 */ 718 private static final class Type4 719 extends CharGlyphMap 720 { 721 /** 722 * Determines whether this implementation supports a combination 723 * of platform, language and encoding is supported for a type 4 724 * <code>cmap</code> table. 725 * 726 * <p>Currently, we support the following combinations: 727 * 728 * <ul><li>the Unicode platform in encodings 0, 1, 2, 3 and 729 * 4;</li> 730 * 731 * <li>the Microsoft platform in encodings 1 (Basic Multilingual 732 * Plane) and 10 (full Unicode).</li></ul> 733 * 734 * <p>Most recent Macintosh fonts provide a type 4 735 * <code>cmap</code> for Unicode. Microsoft recommends providing a 736 * type 4 <code>cmap</code> for encoding 1 of the Microsoft 737 * platform. The implementation of GNU Classpath supports both 738 * variants. 739 * 740 * <p>Not supported are ShiftJIS, Big5, Wansung, Johab, and other 741 * non-Unicode encodings. Text can easily be converted to Unicode 742 * using the java.nio.charset package. 743 */ isSupported(int platform, int language, int encoding)744 static boolean isSupported(int platform, int language, int encoding) 745 { 746 switch (platform) 747 { 748 case PLATFORM_UNICODE: 749 return (encoding >= 0) && (encoding <= 4); 750 751 case PLATFORM_MICROSOFT: 752 return (encoding == /* Basic Multilingual Plane */ 1) 753 || (encoding == /* Full Unicode */ 10); 754 } 755 756 return false; 757 } 758 759 760 /** 761 * Processes a CMAP Type 4 table whose platform, encoding and 762 * language are already known. We understand the Unicode platform 763 * with encodings 0, 1, 2, 3 and 4, and the Microsoft platform 764 * with encodings 1 (Unicode BMP) and 10 (UCS-4). 765 * 766 * @param buf the buffer to read the table from, positioned at 767 * its beginning. 768 * 769 * @return a Type4 table, or <code>null</code> if the combination 770 * of platform and encoding is not understood. 771 */ readTable(ByteBuffer buf, int platform, int encoding)772 static Type4 readTable(ByteBuffer buf, 773 int platform, int encoding) 774 { 775 int tableStart = buf.position(); 776 char format = buf.getChar(); 777 int length = buf.getChar(); 778 int language = buf.getChar(); 779 780 if ((format != 4) || !isSupported(platform, language, encoding)) 781 throw new IllegalArgumentException(); 782 783 buf.limit(tableStart + length); 784 785 int segCountX2 = buf.getChar(); 786 int segCount = segCountX2 / 2; 787 int searchRange = buf.getChar(); 788 int entrySelector = buf.getChar(); 789 int rangeShift = buf.getChar(); 790 791 CharBuffer endCode, startCode, idRangeOffset_glyphID; 792 ShortBuffer idDelta; 793 794 int pos = buf.position(); 795 endCode = buf.asCharBuffer(); 796 pos += segCountX2 + /* reservedPad */ 2; 797 798 buf.position(pos); 799 startCode = buf.asCharBuffer(); 800 pos += segCountX2; 801 802 buf.position(pos); 803 idDelta = buf.asShortBuffer(); 804 pos += segCountX2; 805 806 buf.position(pos); 807 idRangeOffset_glyphID = buf.asCharBuffer(); 808 809 endCode.limit(segCount); 810 startCode.limit(segCount); 811 idDelta.limit(segCount); 812 idRangeOffset_glyphID.limit((buf.limit() - pos) / 2); 813 814 return new Type4(segCount, 815 endCode, startCode, idDelta, 816 idRangeOffset_glyphID); 817 } 818 819 820 private CharBuffer lastChar; 821 private CharBuffer firstChar; 822 private ShortBuffer idDelta; 823 private CharBuffer rangeID; 824 private int numSegments; 825 Type4(int numSegments, CharBuffer lastChar, CharBuffer firstChar, ShortBuffer idDelta, CharBuffer rangeID)826 private Type4(int numSegments, 827 CharBuffer lastChar, CharBuffer firstChar, 828 ShortBuffer idDelta, CharBuffer rangeID) 829 { 830 this.numSegments = numSegments; 831 this.lastChar = lastChar; 832 this.firstChar = firstChar; 833 this.idDelta = idDelta; 834 this.rangeID = rangeID; 835 } 836 837 838 /** 839 * Determines the glyph index for a given Unicode codepoint. 840 * 841 * @param ucs4 the Unicode codepoint in UCS-4 encoding. 842 * 843 * @return the glyph index, or 0 if the font does not contain 844 * a glyph for this codepoint. 845 */ getGlyph(int ucs4)846 public int getGlyph(int ucs4) 847 { 848 char c, segStart; 849 int segment, idRangeOffset; 850 851 if (ucs4 > 0xffff) 852 return 0; 853 854 c = (char) ucs4; 855 segment = find(c); 856 segStart = firstChar.get(segment); 857 if ((c < segStart) || (c > lastChar.get(segment))) 858 return 0; 859 860 /* 861 * System.out.println("seg " + segment 862 * + ", range=" + (int) rangeID[segment] 863 * + ", delta=" + delta[segment]); 864 */ 865 866 idRangeOffset = rangeID.get(segment); 867 if (idRangeOffset == 0) 868 return (int) (char) (((int) c) + idDelta.get(segment)); 869 int result = rangeID.get((idRangeOffset >> 1) 870 + (c - segStart) + segment); 871 if (result == 0) 872 return 0; 873 return (int) (char) (result + idDelta.get(segment)); 874 } 875 876 find(char c)877 private int find(char c) 878 { 879 int min, max, mid; 880 881 min = 0; 882 max = numSegments - 1; 883 mid = max >> 1; 884 885 while (min < max) 886 { 887 // System.out.println("(" + min + "," + max + ") " + mid); 888 char val = lastChar.get(mid); 889 if (val == c) 890 break; 891 else if (val < c) 892 min = mid + 1; 893 else if (val > c) 894 max = mid; 895 mid = (min + max) >> 1; 896 } 897 898 return mid; 899 } 900 } 901 902 903 /** 904 * A mapping from Unicode code points to glyph IDs through CMAP Type 905 * 12 tables. These tables are able to map four-byte encoded text 906 * to glyph IDs, such as Unicode UCS-4. 907 * 908 * @author Sascha Brawer (brawer@dandelis.ch) 909 */ 910 private static final class Type12 911 extends CharGlyphMap 912 { 913 int numGroups; 914 IntBuffer data; 915 916 917 /** 918 * Determines whether this implementation supports a combination 919 * of platform and encoding for a type 12 <code>cmap</code> table. 920 * 921 * <p>Currently, we support the following combinations: 922 * 923 * <ul><li>the Unicode platform in encodings 0, 1, 2, 3 and 924 * 4;</li> 925 * 926 * <li>the Microsoft platform in encodings 1 (Basic Multilingual 927 * Plane) and 10 (full Unicode).</li></ul> 928 */ isSupported(int platform, int encoding)929 static boolean isSupported(int platform, int encoding) 930 { 931 switch (platform) 932 { 933 case PLATFORM_UNICODE: 934 return (encoding >= 0) && (encoding <= 4); 935 936 case PLATFORM_MICROSOFT: 937 return (encoding == /* Basic Multilingual Plane */ 1) 938 || (encoding == /* Full Unicode */ 10); 939 } 940 941 return false; 942 } 943 944 945 /** 946 * Constructs a <code>cmap</code> type 12 table whose platform and 947 * encoding are already known. We understand the Unicode platform 948 * with encodings 0, 1, 2, 3 and 4, and the Microsoft platform 949 * with encodings 1 (Unicode BMP) and 10 (UCS-4). 950 * 951 * @param buf the buffer to read the table from, positioned at 952 * its beginning. 953 */ Type12(ByteBuffer buf, int platform, int encoding)954 Type12(ByteBuffer buf, int platform, int encoding) 955 { 956 int tableStart = buf.position(); 957 int format = buf.getChar(); 958 if ((format != 12) || !isSupported(platform, encoding)) 959 throw new IllegalStateException(); 960 961 buf.getChar(); // skip reserved field 962 buf.limit(tableStart + buf.getInt()); 963 int language = buf.getInt(); 964 numGroups = buf.getInt(); 965 data = buf.asIntBuffer(); 966 } 967 968 969 /** 970 * Determines the glyph index for a given Unicode codepoint. Users 971 * should be aware that the character-to-glyph mapping not not 972 * everything that is needed for full Unicode support. For example, 973 * the <code>cmap</code> table is not able to synthesize accented 974 * glyphs from the canonical decomposition sequence, even if the 975 * font would contain a glyph for the composed form. 976 * 977 * @param ucs4 the Unicode codepoint in UCS-4 encoding. Surrogates 978 * (U+D800 to U+DFFF) cannot be passed, they must be mapped to 979 * UCS-4 first. 980 * 981 * @return the glyph index, or 0 if the font does not contain 982 * a glyph for this codepoint. 983 */ getGlyph(int ucs4)984 public int getGlyph(int ucs4) 985 { 986 int min, max, mid, startCharCode, endCharCode; 987 988 min = 0; 989 max = numGroups - 1; 990 mid = max >> 1; 991 do 992 { 993 startCharCode = data.get(3 * mid); 994 endCharCode = data.get(3 * mid + 1); 995 996 997 /* 998 System.out.println("group " + mid + " (U+" 999 + Integer.toHexString(startCharCode) 1000 + " .. U+" + Integer.toHexString(endCharCode) 1001 + "): glyph " + (int) data.get(mid*3+2)); 1002 */ 1003 1004 if ((startCharCode <= ucs4) && (ucs4 <= endCharCode)) 1005 return ucs4 1006 - startCharCode 1007 + /* startGlyphID */ data.get(mid * 3 + 2); 1008 1009 if (endCharCode < ucs4) 1010 min = mid + 1; 1011 else 1012 max = mid; 1013 mid = (min + max) >> 1; 1014 } 1015 while (min < max); 1016 1017 startCharCode = data.get(3 * mid); 1018 endCharCode = data.get(3 * mid + 1); 1019 if ((startCharCode <= ucs4) && (ucs4 <= endCharCode)) 1020 return ucs4 1021 - startCharCode 1022 + /* startGlyphID */ data.get(mid * 3 + 2); 1023 1024 return 0; 1025 } 1026 } 1027 } 1028