1 /* Copyright (C) 1997, 2000 artofcode LLC.  All rights reserved.
2 
3   This program is free software; you can redistribute it and/or modify it
4   under the terms of the GNU General Public License as published by the
5   Free Software Foundation; either version 2 of the License, or (at your
6   option) any later version.
7 
8   This program is distributed in the hope that it will be useful, but
9   WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   General Public License for more details.
12 
13   You should have received a copy of the GNU General Public License along
14   with this program; if not, write to the Free Software Foundation, Inc.,
15   59 Temple Place, Suite 330, Boston, MA, 02111-1307.
16 
17 */
18 
19 /*$Id: gsfcmap.c,v 1.14.2.2.2.1 2003/01/17 00:49:02 giles Exp $ */
20 /* CMap character decoding */
21 #include "memory_.h"
22 #include "errors.h"
23 #include "gx.h"
24 #include "gserrors.h"
25 #include "gsstruct.h"
26 #include "gsutil.h"		/* for gs_next_ids */
27 #include "gxfcmap.h"
28 
29 /* GC descriptors */
30 public_st_cmap();
31 /* Because lookup ranges can be elements of arrays, */
32 /* their enum_ptrs procedure must never return 0 prematurely. */
33 private
34 ENUM_PTRS_WITH(code_lookup_range_enum_ptrs,
35                gx_code_lookup_range_t *pclr) return 0;
36 case 0:
37     if (pclr->value_type == CODE_VALUE_GLYPH) {
38         const byte *pv = pclr->values.data;
39         int k;
40 
41         for (k = 0; k < pclr->num_keys; ++k) {
42             gs_glyph glyph = 0;
43             int i;
44 
45             for (i = 0; i < pclr->value_size; ++i)
46                 glyph = (glyph << 8) + *pv++;
47             pclr->cmap->mark_glyph(glyph, pclr->cmap->mark_glyph_data);
48         }
49     }
50     return ENUM_OBJ(pclr->cmap);
51 case 1: return ENUM_STRING(&pclr->keys);
52 case 2: return ENUM_STRING(&pclr->values);
53 ENUM_PTRS_END
54 private
55 RELOC_PTRS_WITH(code_lookup_range_reloc_ptrs, gx_code_lookup_range_t *pclr)
56     RELOC_VAR(pclr->cmap);
57     RELOC_STRING_VAR(pclr->keys);
58     RELOC_STRING_VAR(pclr->values);
59 RELOC_PTRS_END
60 public_st_code_lookup_range();
61 public_st_code_lookup_range_element();
62 
63 /* ---------------- Procedures ---------------- */
64 
65 /*
66  * Initialize a just-allocated CMap, to ensure that all pointers are clean
67  * for the GC.
68  */
69 void
gs_cmap_init(gs_cmap_t * pcmap)70 gs_cmap_init(gs_cmap_t *pcmap)
71 {
72     memset(pcmap, 0, sizeof(*pcmap));
73     pcmap->id = gs_next_ids(1);
74     uid_set_invalid(&pcmap->uid);
75 }
76 
77 /*
78  * Create an Identity CMap.
79  */
80 int
gs_cmap_create_identity(gs_cmap_t ** ppcmap,int num_bytes,int wmode,gs_memory_t * mem)81 gs_cmap_create_identity(gs_cmap_t **ppcmap, int num_bytes, int wmode,
82 			gs_memory_t *mem)
83 {
84     gs_cmap_t *pcmap =
85 	gs_alloc_struct(mem, gs_cmap_t, &st_cmap,
86 			"gs_cmap_create_identity(CMap)");
87     gx_code_space_range_t *range = (gx_code_space_range_t *)
88 	gs_alloc_bytes(mem, sizeof(gx_code_space_range_t),
89 		       "gs_cmap_create_identity(code space range)");
90     gx_code_lookup_range_t *lookup =
91 	gs_alloc_struct_array(mem, 1, gx_code_lookup_range_t,
92 			      &st_code_lookup_range,
93 			      "gs_cmap_create_identity(lookup range)");
94     /* We allocate CIDSystemInfo dynamically only for the sake of the GC. */
95     gs_cid_system_info_t *pcidsi =
96 	gs_alloc_struct(mem, gs_cid_system_info_t, &st_cid_system_info,
97 			"gs_cmap_create_identity(CIDSystemInfo)");
98     static const byte key_data[8] = {
99 	0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff
100     };
101     static const gs_cid_system_info_t identity_cidsi = {
102 	{ (const byte *)"Adobe", 5 },
103 	{ (const byte *)"Identity", 8 },
104 	0
105     };
106 
107     if (pcmap == 0 || range == 0 || lookup == 0 || pcidsi == 0)
108 	return_error(gs_error_VMerror);
109     if (num_bytes != 2)		/* for now */
110 	return_error(gs_error_rangecheck);
111     gs_cmap_init(pcmap);
112     pcmap->CMapType = 1;
113     pcmap->CMapName.data = (const byte *)
114 	(wmode ? "Identity-V" : "Identity-H");
115     pcmap->CMapName.size = 10;
116     *pcidsi = identity_cidsi;
117     pcmap->CIDSystemInfo = pcidsi;
118     pcmap->num_fonts = 1;
119     pcmap->CMapVersion = 1.0;
120     /* no uid, UIDOffset */
121     pcmap->WMode = wmode;
122     memset(range->first, 0, num_bytes);
123     memset(range->last, 0xff, num_bytes);
124     range->size = num_bytes;
125     pcmap->code_space.ranges = range;
126     pcmap->code_space.num_ranges = 1;
127     memset(lookup, 0, sizeof(*lookup));
128     lookup->cmap = pcmap;
129     lookup->key_size = num_bytes;
130     lookup->num_keys = 1;
131     lookup->key_is_range = true;
132     /*
133      * It's OK to break const here, because the strings are never
134      * freed, and the GC can handle strings outside the heap.
135      */
136     lookup->keys.data = (byte*) (key_data + 4 - num_bytes);
137     lookup->keys.size = num_bytes * 2;
138     lookup->value_type = CODE_VALUE_CID;
139     lookup->value_size = num_bytes;
140     /* ditto */
141     lookup->values.data = (byte*) key_data;
142     lookup->values.size = num_bytes;
143     pcmap->def.lookup = lookup;
144     pcmap->def.num_lookup = 1;
145     /* no notdef */
146     /* no mark_glyph, mark_glyph_data, glyph_name, glyph_name_data */
147     *ppcmap = pcmap;
148     return 0;
149 }
150 
151 /*
152  * multi-dimensional range comparator
153  */
154 
155 private void
print_msg_str_in_range(const byte * str,const byte * key_lo,const byte * key_hi,int key_size)156 print_msg_str_in_range(const byte *str,
157                        const byte *key_lo, const byte *key_hi,
158                        int key_size)
159 {
160     debug_print_string_hex(str, key_size);
161     dlprintf(" in ");
162     debug_print_string_hex(key_lo, key_size);
163     dlprintf(" - ");
164     debug_print_string_hex(key_hi, key_size);
165     dlprintf("\n");
166 }
167 
168 private int
gs_cmap_get_shortest_chr(const gx_code_map_t * pcmap,uint * pfidx)169 gs_cmap_get_shortest_chr(const gx_code_map_t * pcmap, uint *pfidx)
170 {
171     int i;
172     int len_shortest = MAX_CMAP_CODE_SIZE;
173     uint fidx_shortest = 0; /* font index for this fallback */
174 
175     for (i = pcmap->num_lookup - 1; i >= 0; --i) {
176         const gx_code_lookup_range_t *pclr = &pcmap->lookup[i];
177         if ((pclr->key_prefix_size + pclr->key_size) <= len_shortest) {
178            len_shortest = (pclr->key_prefix_size + pclr->key_size);
179            fidx_shortest = pclr->font_index;
180         }
181     }
182 
183     *pfidx = fidx_shortest;
184     return len_shortest;
185 }
186 
187 /*
188  * multi-dimensional relative position calculator
189  *
190  * Returns offset of the given CID, considering CID range
191  * as array of CIDs (the last index changes fastest).
192  */
193 private int
gs_multidim_CID_offset(const byte * key_str,const byte * key_lo,const byte * key_hi,int key_size)194 gs_multidim_CID_offset(const byte *key_str,
195                         const byte *key_lo, const byte *key_hi,
196 			int key_size)
197 {
198 
199     int i;	/* index for current dimension */
200     int CID_offset = 0;
201 
202     if (gs_debug_c('J')) {
203         dlprintf("[J]gmCo()         calc CID_offset for 0x");
204         print_msg_str_in_range(key_str, key_lo, key_hi, key_size);
205     }
206 
207     for (i = 0; i < key_size; i++)
208         CID_offset = CID_offset * (key_hi[i] - key_lo[i] + 1) +
209             key_str[i] - key_lo[i];
210 
211     if_debug1('J', "[J]gmCo()         CID_offset = %d\n", CID_offset);
212     return CID_offset;
213 }
214 
215 /* Get a big-endian integer. */
216 private uint
bytes2int(const byte * p,int n)217 bytes2int(const byte *p, int n)
218 {
219     uint v = 0;
220     int i;
221 
222     for (i = 0; i < n; ++i)
223         v = (v << 8) + p[i];
224     return v;
225 }
226 
227 /*
228  * Decode a character from a string using a code map, updating the index.
229  * Return 0 for a CID or name, N > 0 for a character code where N is the
230  * number of bytes in the code, or an error.  Shift the decoded bytes into
231  * *pchr.  For undefined characters, set *pglyph = gs_no_glyph and return 0.
232  */
233 private int
code_map_decode_next(const gx_code_map_t * pcmap,const gs_const_string * pstr,uint * pindex,uint * pfidx,gs_char * pchr,gs_glyph * pglyph)234 code_map_decode_next(const gx_code_map_t * pcmap, const gs_const_string * pstr,
235                      uint * pindex, uint * pfidx,
236                      gs_char * pchr, gs_glyph * pglyph)
237 {
238     const byte *str = pstr->data + *pindex;
239     uint ssize = pstr->size - *pindex;
240     /*
241      * The keys are not sorted due to 'usecmap'.  Possible optimization :
242      * merge and sort keys in 'zbuildcmap', then use binary search here.
243      * This would be valuable for UniJIS-UTF8-H, which contains about 7000
244      * keys.
245      */
246     int i;
247 
248     for (i = pcmap->num_lookup - 1; i >= 0; --i) { /* reverse scan order due to 'usecmap' */
249         const gx_code_lookup_range_t *pclr = &pcmap->lookup[i];
250         int pre_size = pclr->key_prefix_size, key_size = pclr->key_size,
251             chr_size = pre_size + key_size;
252 
253         if (ssize < chr_size)
254             continue;
255         if (memcmp(str, pclr->key_prefix, pre_size))
256             continue;
257         /* Search the lookup range. We could use binary search. */
258         {
259             const byte *key = pclr->keys.data;
260             int step = key_size;
261             int k;
262             const byte *pvalue;
263 
264             if (pclr->key_is_range) {
265                 step <<= 1;
266                 for (k = 0; k < pclr->num_keys; ++k, key += step)
267                     if (memcmp(str + pre_size, key, key_size) >= 0 &&
268                         memcmp(str + pre_size, key + key_size, key_size) <= 0)
269                         break;
270             } else {
271                 for (k = 0; k < pclr->num_keys; ++k, key += step)
272                     if (!memcmp(str + pre_size, key, key_size))
273                         break;
274             }
275             if (k == pclr->num_keys)
276                 continue;
277             /* We have a match.  Return the result. */
278             *pchr = (*pchr << (chr_size * 8)) + bytes2int(str, chr_size);
279             *pindex += chr_size;
280             *pfidx = pclr->font_index;
281             pvalue = pclr->values.data + k * pclr->value_size;
282             switch (pclr->value_type) {
283             case CODE_VALUE_CID:
284                 *pglyph = gs_min_cid_glyph +
285                     bytes2int(pvalue, pclr->value_size) +
286                     bytes2int(str + pre_size, key_size) -
287                     bytes2int(key, key_size);
288                 return 0;
289             case CODE_VALUE_GLYPH:
290                 *pglyph = bytes2int(pvalue, pclr->value_size);
291                 return 0;
292             case CODE_VALUE_CHARS:
293                 *pglyph =
294                     bytes2int(pvalue, pclr->value_size) +
295                     bytes2int(str + pre_size, key_size) -
296                     bytes2int(key, key_size);
297                 return pclr->value_size;
298             default:            /* shouldn't happen */
299                 return_error(gs_error_rangecheck);
300             }
301         }
302     }
303     /* No mapping. */
304     *pglyph = gs_no_glyph;
305     return 0;
306 }
307 
308 private int
code_map_decode_next_multidim_regime(const gx_code_map_t * pcmap,const gs_const_string * pstr,uint * pindex,uint * pfidx,gs_char * pchr,gs_glyph * pglyph)309 code_map_decode_next_multidim_regime(const gx_code_map_t * pcmap,
310                      const gs_const_string * pstr,
311                      uint * pindex, uint * pfidx,
312                      gs_char * pchr, gs_glyph * pglyph)
313 {
314     const byte *str = pstr->data + *pindex;
315     uint ssize = pstr->size - *pindex;
316     /*
317      * The keys are not sorted due to 'usecmap'.  Possible optimization :
318      * merge and sort keys in 'zbuildcmap', then use binary search here.
319      * This would be valuable for UniJIS-UTF8-H, which contains about 7000
320      * keys.
321      */
322     int i;
323 
324     /*
325      * In the fallback of CMap decoding procedure, there is "partial matching".
326      * For detail, refer PostScript Ref. Manual v3 at the end of Fonts chapter.
327      */
328 
329     /* "pm" stands for partial match (not pointer), temporal use. */
330     int pm_maxlen = 0;		/* partial match: max length */
331     int pm_index = *pindex;	/* partial match: ptr index (in str) */
332     uint pm_fidx = *pfidx;	/* partial match: ptr font index */
333     gs_char pm_chr = *pchr;	/* partial match: ptr character */
334 
335     *pchr = '\0';
336 
337     if (gs_debug_c('J')) {
338         dlprintf("[J]CMDNmr() is called: str=(");
339         debug_print_string_hex(str, ssize);
340         dlprintf3(") @ 0x%lx ssize=%d, %d ranges to check\n",
341                        str, ssize, pcmap->num_lookup);
342     }
343 
344     for (i = pcmap->num_lookup - 1; i >= 0; --i) {
345 	/* main loop - scan the map passed via pcmap */
346 	/* reverse scan order due to 'usecmap' */
347 
348         const gx_code_lookup_range_t *pclr = &pcmap->lookup[i];
349         int pre_size = pclr->key_prefix_size, key_size = pclr->key_size,
350             chr_size = pre_size + key_size;
351 
352         int j = 0;
353 	/* length of the given byte stream is shorter than
354          * chr-length of current range, no need for further check,
355          * skip to the next range.
356          */
357         if (ssize < chr_size)
358             continue;
359 
360         if (0 < pre_size) {
361             const byte * prefix = pclr->key_prefix;
362             /* check partial match in prefix */
363             for (j = 0; j < pre_size; j++)
364                if (prefix[j] != str[j])
365                    break;
366 
367             if (0 == j)			/* no match, skip to next i */
368                 continue;
369             else if (j < pre_size) {	/* not exact, partial match */
370                 if (gs_debug_c('J')) {
371                     dlprintf("[J]CMDNmr() partial match with prefix:");
372                     print_msg_str_in_range(str, prefix,
373                                                 prefix, pre_size);
374                 }
375 
376                 if (pm_maxlen < j) {
377                     pm_maxlen = chr_size;
378                     pm_chr = bytes2int(str, chr_size);
379                     pm_index = (*pindex) + chr_size;
380                     pm_fidx = pclr->font_index;
381                 }
382                 continue ; /* no need to check key, skip to next i */
383             }
384 
385             if (gs_debug_c('J')) {
386                 dlprintf("[J]CMDNmr()   full match with prefix:");
387                 print_msg_str_in_range(str, prefix, prefix, pre_size);
388             }
389 
390         } /* if (0 < pre_size) */
391 
392         /* full match in prefix. check key */
393         {
394             const byte *key = pclr->keys.data;
395             int step = key_size;
396             int k, l, m;
397             const byte *pvalue = NULL;
398 
399 	    /* when range is "range", 2 keys for lo-end and hi-end
400 	     * are stacked. So twice the step. current "key" points
401 	     * lo-end of current range, and the pointer for hi-end
402 	     * is calculated by (key + step - key_size).
403 	     */
404 
405             if (pclr->key_is_range)
406 		step <<=1; 	/* step = step * 2; */
407 
408             for (k = 0; k < pclr->num_keys; ++k, key += step) {
409 
410 		if_debug0('j', "[j]CMDNmr()     check key:");
411 		if (gs_debug_c('j'))
412 		    print_msg_str_in_range(str + pre_size,
413 		        key, key + step - key_size, key_size) ;
414 
415 		for (l = 0; l < key_size; l++) {
416 		    byte c = str[l + pre_size];
417 		    if (c < key[l] || c > key[step - key_size + l])
418 			break;
419 		}
420 
421 		if (pm_maxlen < pre_size + l) {
422                     pm_maxlen = chr_size;
423                     pm_chr = bytes2int(str, chr_size);
424                     pm_index = (*pindex) + chr_size;
425                     pm_fidx = pclr->font_index;
426                 }
427                 if (l == key_size)
428                         break;
429 	    }
430 
431             /* all keys are tried, but found no match. */
432             /* go to next prefix. */
433             if (k == pclr->num_keys)
434                 continue;
435 
436             /* We have a match.  Return the result. */
437             *pchr = bytes2int(str, chr_size);
438             *pindex += chr_size;
439             *pfidx = pclr->font_index;
440             pvalue = pclr->values.data + k * pclr->value_size;
441 
442             if (gs_debug_c('J')) {
443                 dlprintf("[J]CMDNmr()     full matched pvalue=(");
444                 debug_print_string_hex(pvalue, pclr->value_size);
445                 dlprintf(")\n");
446             }
447 
448             switch (pclr->value_type) {
449             case CODE_VALUE_CID:
450                 *pglyph = gs_min_cid_glyph +
451                     bytes2int(pvalue, pclr->value_size) +
452                     gs_multidim_CID_offset(str + pre_size,
453                         key, key + step - key_size, key_size);
454                 return 0;
455             case CODE_VALUE_NOTDEF:
456                 *pglyph = gs_min_cid_glyph +
457                     bytes2int(pvalue, pclr->value_size);
458                 return 0;
459             case CODE_VALUE_GLYPH:
460                 *pglyph = bytes2int(pvalue, pclr->value_size);
461                 return 0;
462             case CODE_VALUE_CHARS:
463                 *pglyph =
464                     bytes2int(pvalue, pclr->value_size) +
465                     bytes2int(str + pre_size, key_size) -
466                     bytes2int(key, key_size);
467                 return pclr->value_size;
468             default:            /* shouldn't happen */
469                 return_error(gs_error_rangecheck);
470             }
471         }
472     }
473     /* No mapping. */
474     *pchr = pm_chr;
475     *pindex = pm_index;
476     *pfidx = pm_fidx;
477     *pglyph = gs_no_glyph;
478     if (gs_debug_c('J')) {
479         dlprintf("[J]CMDNmr()     no full match, use partial match for (");
480         debug_print_string_hex(str, pm_maxlen);
481         dlprintf(")\n");
482     }
483     return 0;
484 }
485 
486 /*
487  * Decode a character from a string using a CMap.
488  * Return like code_map_decode_next.
489  * At present, the range specification by (begin|end)codespacerange
490  * is not used in this function. Therefore, this function accepts
491  * some invalid CMap which def & undef maps exceed the codespacerange.
492  * It should be checked in this function, or some procedure in gs_cmap.ps.
493  */
494 int
gs_cmap_decode_next(const gs_cmap_t * pcmap,const gs_const_string * pstr,uint * pindex,uint * pfidx,gs_char * pchr,gs_glyph * pglyph)495 gs_cmap_decode_next(const gs_cmap_t * pcmap, const gs_const_string * pstr,
496                     uint * pindex, uint * pfidx,
497                     gs_char * pchr, gs_glyph * pglyph)
498 {
499     uint save_index = *pindex;
500     int code;
501 
502     uint pm_index;
503     uint pm_fidx;
504     gs_char pm_chr;
505 
506     /* For first, check defined map */
507     if_debug0('J', "[J]GCDN() check def CMap\n");
508     code =
509         code_map_decode_next_multidim_regime(&pcmap->def, pstr, pindex, pfidx, pchr, pglyph);
510 
511     /* This is defined character */
512     if (code != 0 || *pglyph != gs_no_glyph)
513         return code;
514 
515     /* In here, this is NOT defined character */
516     /* save partially matched results */
517     pm_index = *pindex;
518     pm_fidx = *pfidx;
519     pm_chr = *pchr;
520 
521     /* check notdef map. */
522     if_debug0('J', "[J]GCDN() check notdef CMap\n");
523     *pindex = save_index;
524     code =
525 	code_map_decode_next_multidim_regime(&pcmap->notdef, pstr, pindex, pfidx, pchr, pglyph);
526 
527     /* This is defined "notdef" character. */
528     if (code != 0 || *pglyph != gs_no_glyph)
529         return code;
530 
531     /*
532      * This is undefined in def & undef maps,
533      * use partially matched result with default notdef (CID = 0).
534      */
535     if (save_index < pm_index) {
536 
537 	/* there was some partially matched */
538 
539         *pglyph = gs_min_cid_glyph;	/* CID = 0 */
540         *pindex = pm_index;
541         *pfidx = pm_fidx;
542         *pchr = '\0';
543          return 0; /* should return some error for partial matched .notdef? */
544     }
545     else {
546 	/* no match */
547 
548 	/* Even partial match is failed.
549          * Getting the shortest length from defined characters,
550          * and take the leading bytes (with same length of the shortest
551          * defined chr) as an unidentified character: CID = 0.
552 	 * Also this procedure is specified in PS Ref. Manual v3,
553          * at the end of Fonts chapter.
554          */
555 
556 	const byte *str = pstr->data + save_index;
557 	uint ssize = pstr->size - save_index;
558 	int chr_size_shortest =
559 		gs_cmap_get_shortest_chr(&pcmap->def, pfidx);
560 
561 	if (chr_size_shortest <= ssize) {
562             *pglyph = gs_min_cid_glyph;	/* CID = 0, this is CMap fallback */
563             *pindex = save_index + chr_size_shortest;
564 	    *pchr = '\0';
565             if (gs_debug_c('J')) {
566                 dlprintf1("[J]GCDN() no partial match, skip %d byte (",
567                                                chr_size_shortest);
568                 debug_print_string_hex(str, chr_size_shortest);
569                 dlprintf(")\n");
570             }
571             return 0; /* should return some error for fallback .notdef? */
572 	}
573 	else {
574             /* Undecodable string is shorter than the shortest character,
575              * there's no way except to return error.
576              */
577 	    *pglyph = gs_no_glyph;
578 	    return -1;
579             if (gs_debug_c('J')) {
580                 dlprintf2("[J]GCDN() left data in buffer (%d) is shorter than shortest defined character (%d)\n",
581                   ssize, chr_size_shortest);
582             }
583             *pglyph = gs_no_glyph;
584             return_error(e_rangecheck);
585 	}
586     }
587 }
588