/* Copyright (C) 1995-1998, Digital Equipment Corporation. */
/* All rights reserved. */
/* See the file pstotext.txt for a full description. */
/* Last modified on Fri Jan 09 21:19:00 AEST 2004 by rjl */
/*      modified on Fri Jan 09 08:21:00 AEST 2004 by rjl */
/*      modified on Wed Oct 28 08:42:15 PST 1998 by mcjones */
/*      modified on Sun Jul 28 00:00:00 UTC 1996 by rjl */

/* Modifications by rjl
 *   Fixed compiler warnings
 */

/* This module is based on OCR_PS.m3, a module of the Virtual Paper
   project at the DEC Systems Research Center:
   http://www.research.digital.com/SRC/virtualpaper/ */
16
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "ptotdll.h"
21
/* Fallback for environments whose headers do not define NULL. */
#ifndef NULL
#define NULL 0
#endif

/* Minimal boolean type used throughout this module. */
#define BOOLEAN int
#define FALSE 0
#define TRUE 1

/* Smaller / larger of two values.  These are function-like macros:
   an argument may be evaluated twice, so never pass an expression
   with side effects. */
#define MIN(a,b) ((a)<=(b)?(a):(b))
#define MAX(a,b) ((b)<=(a)?(a):(b))
32
/* Character encoding.  Each element of the QE directive produced by
   ocr.ps is either an index in the StandardGlyph array, or is
   "NonstandardGlyph" (indicating the corresponding entry in the font's
   encoding specifies some nonstandard glyph). */

typedef unsigned GlyphIndex;
#define NonstandardGlyph 9999

#define UnknownChar '#' /* substitute for nonstandard glyph */
42
/* The first 256 entries in StandardGlyphs correspond to ISOLatin1;
   the next 28 entries correspond to characters not in ISOLatin1, but
   defined in the standard /Times-Roman font. */

#define LastISOLatin1 255

/* Glyph indexes FIRSTSpecialGlyphs..LASTSpecialGlyphs select entry
   "glyph - FIRSTSpecialGlyphs" of SpecialGlyphs, which holds the text
   emitted in place of that glyph. */
#define FIRSTSpecialGlyphs (LastISOLatin1+1)
#define LASTSpecialGlyphs (LastISOLatin1+28)
static const char *SpecialGlyphs[] = {
  "''",      /* quotedblright */
  "S\237",   /* Scaron */
  "+",       /* dagger */
  "<",       /* guilsinglleft */
  "Z\237",   /* Zcaron */
  "#",       /* daggerdbl */
  "L/",      /* Lslash */
  "...",     /* ellipsis */
  ">",       /* guilsinglright */
  "oe",      /* oe */
  "fi",      /* fi */
  ".",       /* bullet */
  "o/oo",    /* perthousand */
  "''",      /* quotedblbase */
  "--",      /* endash */
  "---",     /* emdash */
  "^TM",     /* trademark */
  "f",       /* florin */
  "l/",      /* lslash */
  "s\237",   /* scaron */
  "Y\250",   /* Ydieresis */
  "fl",      /* fl */
  "/",       /* fraction */
  "``",      /* quotedblleft */
  "'",       /* quotesinglbase */
  "'",       /* quotesingle */
  "z\237",   /* zcaron */
  "OE"       /* OE */
};
81
/* The next 256 entries correspond to the self-named glyphs used in
   Type 3 fonts from dvips: "\000", ..., "\377": */

#define FirstDvips (LASTSpecialGlyphs+1)
#define LastDvips (FirstDvips+256-1)

/* The next 512 entries correspond to glyph names used in Microsoft
   TrueType fonts: "G00", ..., "Gff" and "G00", ..., "GFF", which
   in both cases correspond to ISOLatin1 with some extensions. */

#define FirstTT1 (LastDvips+1)
#define LastTT1 (FirstTT1+256-1)
#define FirstTT2 (LastTT1+1)
#define LastTT2 (FirstTT2+256-1)
#define FirstOldDvips (LastTT2+1)
#define LastOldDvips (FirstOldDvips+128-1) /* note only 128 */
98
/* Within the TrueType range, glyph indexes FIRSTTTSpecialGlyphs..
   LASTTTSpecialGlyphs (offsets 130..159 from FirstTT1) select entry
   "glyph - FIRSTTTSpecialGlyphs" of TTSpecialGlyphs. */
#define FIRSTTTSpecialGlyphs (FirstTT1+130)
#define LASTTTSpecialGlyphs (FirstTT1+159)
static const char *TTSpecialGlyphs[] = {
  "'",       /* quotesinglbase */
  "f",       /* florin */
  "''",      /* quotedblbase */
  "...",     /* ellipsis */
  "+",       /* dagger */
  "#",       /* daggerdbl */
  "\223",    /* circumflex */
  "o/oo",    /* perthousand */
  "S\237",   /* Scaron */
  "<",       /* guilsinglleft */
  "OE",      /* OE */
  "#",       /* <undefined> */
  "#",       /* <undefined> */
  "#",       /* <undefined> */
  "#",       /* <undefined> */
  "`",       /* ISOLatin1: quoteleft */
  "'",       /* ISOLatin1: quoteright */
  "``",      /* quotedblleft */
  "''",      /* quotedblright */
  ".",       /* bullet */
  "--",      /* endash */
  "---",     /* emdash */
  "~",       /* ISOLatin1: tilde */
  "^TM",     /* trademark */
  "s\237",   /* scaron */
  ">",       /* guilsinglright */
  "oe",      /* oe */
  "#",       /* <undefined> */
  "#",       /* <undefined> */
  "Y\250"    /* Ydieresis */
};
133
/* Text for the first 128 dvips glyphs under the old text layout.
   Entry "glyph - FIRSTDvipsGlyphs" is used for glyph indexes
   FIRSTDvipsGlyphs..LASTDvipsGlyphs; the octal comments give the
   position of the first entry on the following row. */
#define FIRSTDvipsGlyphs FirstDvips
#define LASTDvipsGlyphs (FirstDvips+127)
static const char *DvipsGlyphs[] = {
  /* 00x */
  "\\Gamma", "\\Delta", "\\Theta", "\\Lambda",
  "\\Xi", "\\Pi", "\\Sigma", "\\Upsilon",
  /* 01x */
  "\\Phi", "\\Psi", "\\Omega", "ff", "fi", "fl", "ffi", "ffl",
  /* 02x */
  "i",       /* \imath */
  "j",       /* \jmath */
  "`",
  "'",
  "\237",    /* caron */
  "\226",    /* breve */
  "\257",    /* macron */
  "\232",    /* ring */
  /* 03x */
  "\270",    /* cedilla */
  "\337",    /* germandbls */
  "ae",
  "oe",
  "\370",    /* oslash */
  "AE",
  "OE",
  "\330",    /* Oslash */
  /* 04x */
  "/" /* bar for Polish suppressed-L ??? */, "!", "''", "#",
  "$", "%", "&", "'",
  /* 05x */
  "(", ")", "*", "+",
  ",", "\255" /* hyphen */, ".", "/",
  /* 06x */
  "0", "1", "2", "3", "4", "5", "6", "7",
  /* 07x */
  "8", "9", ":", ";",
  "!" /* exclamdown */, "=", "?" /* questiondown */, "?",
  /* 010x */
  "@", "A", "B", "C", "D", "E", "F", "G",
  /* 011x */
  "H", "I", "J", "K", "L", "M", "N", "O",
  /* 012x */
  "P", "Q", "R", "S", "T", "U", "V", "W",
  /* 013x */
  "X", "Y", "Z", "[",
  "``", "]", "\223" /* circumflex */, "\227" /* dotaccent */,
  /* 014x */
  "`", "a", "b", "c", "d", "e", "f", "g",
  /* 015x */
  "h", "i", "j", "k", "l", "m", "n", "o",
  /* 016x */
  "p", "q", "r", "s", "t", "u", "v", "w",
  /* 017x */
  "x", "y", "z",
  "--",      /* en dash */
  "---",     /* em dash */
  "\235",    /* hungarumlaut */
  "~",
  "\250"     /* dieresis */
};
194
/* Text for dvips glyphs 0..0277 when the Cork encoding is assumed
   (see "dvipsIsCork").  Entry "glyph - FIRSTCorkSpecialGlyphs" is used
   for glyph indexes FIRSTCorkSpecialGlyphs..LASTCorkSpecialGlyphs; the
   octal comments give the position of the entry that follows. */
#define FIRSTCorkSpecialGlyphs FirstDvips
#define LASTCorkSpecialGlyphs (FirstDvips+0277)
static const char *CorkSpecialGlyphs[] = {
  /* 000 - accents for lowercase letters */
  "`",
  "'",
  "^",
  "~",
  "\230",    /* umlaut/dieresis */
  "\235",    /* hungarumlaut */
  "\232",    /* ring */
  "\237",    /* hacek/caron */
  "\226",    /* breve */
  "\257",    /* macron */
  "\227",    /* dot above/dotaccent */
  "\270",    /* cedilla */
  "\236",    /* ogonek */
  /* 015 - miscellaneous */
  "'",       /* single base quote/quotesinglbase */
  "<",       /* single opening guillemet/guilsinglleft */
  ">",       /* single closing guillemet/guilsinglright */
  "``",      /* english opening quotes/quotedblleft */
  "''",      /* english closing quotes/quotedblright */
  ",,",      /* base quotes/quotedblbase */
  "<<",      /* opening guillemets/guillemotleft */
  ">>",      /* closing guillemets/guillemotright */
  "--",      /* en dash/endash */
  "---",     /* em dash/emdash */
  "",        /* compound word mark (invisible)/ */
  "o",       /* perthousandzero (used in conjunction with %) */
  "\220",    /* dotless i/dotlessi */
  "j",       /* dotless j */
  "ff",      /* ligature ff */
  "fi",      /* ligature fi */
  "fl",      /* ligature fl */
  "ffi",     /* ligature ffi */
  "ffl",     /* ligature ffl */
  "_",       /* visible space */
  /* 041 - ASCII */
  "!", "\"", "#", "$", "%", "&", "'",
  "(", ")", "*", "+", ",", "-", ".", "/",
  "0", "1", "2", "3", "4", "5", "6", "7",
  "8", "9", ":", ";", "<", "=", ">", "?",
  "@", "A", "B", "C", "D", "E", "F", "G",
  "H", "I", "J", "K", "L", "M", "N", "O",
  "P", "Q", "R", "S", "T", "U", "V", "W",
  "X", "Y", "Z", "[", "\\","]", "^", "_",
  "`", "a", "b", "c", "d", "e", "f", "g",
  "h", "i", "j", "k", "l", "m", "n", "o",
  "p", "q", "r", "s", "t", "u", "v", "w",
  "x", "y", "z", "{", "|", "}", "~", "\255", /* hyphenchar (hanging) */
  /* 200 - letters for eastern European languages from latin-2 */
  "A\226",   /* Abreve */
  "A\236",   /* Aogonek */
  "C\264",   /* Cacute */
  "C\237",   /* Chacek */
  "D\237",   /* Dhacek */
  "E\237",   /* Ehacek */
  "E\236",   /* Eogonek */
  "G\226",   /* Gbreve */
  "L\264",   /* Lacute */
  "L\237",   /* Lhacek */
  "L/",      /* Lslash/Lstroke */
  "N\264",   /* Nacute */
  "N\237",   /* Nhacek */
  "\\NG",    /* Eng */
  "O\235",   /* Ohungarumlaut */
  "R\264",   /* Racute */
  "R\237",   /* Rhacek */
  "S\264",   /* Sacute */
  "S\237",   /* Shacek */
  "S\270",   /* Scedilla */
  "T\237",   /* Thacek */
  "T\270",   /* Tcedilla */
  "U\235",   /* Uhungarumlaut */
  "U\232",   /* Uring */
  "Y\250",   /* Ydieresis */
  "Z\264",   /* Zacute */
  "Z\237",   /* Zhacek */
  "Z\227",   /* Zdot */
  "IJ",      /* IJ */
  "I\227",   /* Idot */
  "\\dj",    /* dbar */
  "\247",    /* section */
  "a\226",   /* abreve */
  "a\236",   /* aogonek */
  "c\222",   /* cacute */
  "c\237",   /* chacek */
  "d\237",   /* dhacek */
  "e\237",   /* ehacek */
  "e\236",   /* eogonek */
  "g\226",   /* gbreve */
  "l\222",   /* lacute */
  "l\237",   /* lhacek */
  "l/",      /* lslash */
  "n\222",   /* nacute */
  "n\237",   /* nhacek */
  "\\ng",    /* eng */
  "o\235",   /* ohungarumlaut */
  "r\222",   /* racute */
  "r\237",   /* rhacek */
  "s\222",   /* sacute */
  "s\237",   /* shacek */
  "s\270",   /* scedilla */
  "t\237",   /* thacek */
  "t\270",   /* tcedilla */
  "u\235",   /* uhungarumlaut */
  "u\232",   /* uring */
  "y\230",   /* ydieresis */
  "z\222",   /* zacute */
  "z\237",   /* zhacek */
  "z\227",   /* zdot */
  "ij",      /* ij */
  "\241",    /* exclamdown */
  "\277",    /* questiondown */
  "\243"     /* sterling */
  /* 0300-0377 is same as ISO 8859/1 except:
     0337 is Ess-zed and 0377 is ess-zed/germandbls */
};
314
/* There are gaps in the set of printable ISOLatin1 characters.  The
   set below is kept in the Modula-3 notation of the original module
   (8_NNN presumably denotes the octal value NNN):

   CONST ISOLatin1Gaps = SET OF [0..255] {
     8_0..8_37, 8_177..8_217, 8_231, 8_234};
*/
319
320 typedef struct {
321 double blx, bly, toprx, topry; /* font matrix in character coordinates */
322 struct {double x, y;} chr[256]; /* widths in character coordinates */
323 } MetricsRec;
324 typedef MetricsRec *Metrics;
325 typedef Metrics MetricsTable[];
326
327 typedef GlyphIndex EncodingVector[256];
328 typedef EncodingVector *Encoding;
329 typedef Encoding EncodingTable[];
330
331 typedef struct {
332 double x, y; /* (1000,0) in font's character coordinate system */
333 double xp, yp; /* (0,1000) in font's character coordinate system */
334 int e; /* index in "encoding" */
335 int m; /* index in "metrics" */
336 double bx, by, tx, ty; /* height of font bbox in reporting coordinates */
337 } FontRec;
338 typedef FontRec *Font;
339 typedef Font FontTable[];
340
341
342 /* Instance "T". */
343 #define state_normal 0
344 #define state_metrics 1
345 #define state_encoding 2
346 typedef struct {
347 double itransform[6]; /* transform from device to default coordinates */
348 int metricsSize;
349 MetricsTable *metrics;
350 int encodingSize;
351 EncodingTable *encoding;
352 BOOLEAN dvipsIsCork; /* assume Cork rather than "OT1" for dvips output */
353 int fontSize;
354 FontTable *font;
355
356 /* Data for current word prefix: */
357 char buf[1000];
358 int lbuf; /* elements 0 through "lbuf-1" of "buf" are in use */
359 int f; /* font number */
360 double x0, y0, x1, y1; /* initial and final currentpoint */
361
362 BOOLEAN nonEmptyPage;
363 long blx, bly, toprx, topry; /* bounding box of last word output */
364 char word[1000]; /* last word output */
365 int state;
366 /* state-specific components: */
367 /* state_encoding: */ int encoding_e, encoding_n, encoding_i;
368 /* state_metrics: */ int metrics_m, metrics_i;
369 } T;
370
371 static int ReadChar(char **instr);
372 static void UnreadChar(char **instr);
373 static int ReadInt(char **instr);
374 static long ReadLong(char **instr);
375 static int ParseInverseTransform(T *t, char *instr);
376 static int ParseEncoding(T *t, char *instr);
377 static int ParseEncodingMore(T *t, char *instr);
378 static void ReadPair(double *x, double *y, char **instr);
379 static int ParseFont(T *t, char *instr);
380 static int ParseMetrics(T *t, char *instr);
381 static int ParseMetricsMore(T *t, char *instr);
382 static void Itransform(T *t, double *x1, double *y1, double x0, double y0);
383 static void Output(T *t, const char **pre, const char **word,
384 int *llx, int *lly, int *urx, int *ury);
385 static BOOLEAN SameDirection(double x0, double y0, double x1, double y1);
386 static int ParseString(
387 T *t, char *instr, const char **pre, const char **word, const char **post,
388 int *llx, int *lly, int *urx, int *ury);
389
pstotextInit(void ** instance)390 int DLLEXPORT pstotextInit(void **instance) {
391 T *t;
392 int i;
393
394 t = (T *)malloc(sizeof(T));
395 if (t == NULL) return PSTOTEXT_INIT_MALLOC;
396
397 t->state = state_normal;
398
399 /* Initialize t->itransform to the identity transform. */
400 t->itransform[0] = 1.0;
401 t->itransform[1] = 0.0;
402 t->itransform[2] = 0.0;
403 t->itransform[3] = 1.0;
404 t->itransform[4] = 0.0;
405 t->itransform[5] = 0.0;
406
407 t->metricsSize = t->encodingSize = t->fontSize = 100;
408
409 t->metrics = (MetricsTable *)malloc(t->metricsSize * sizeof(Metrics));
410 if (t->metrics == NULL) {
411 free(t);
412 return PSTOTEXT_INIT_MALLOC;
413 }
414 for(i=0; i<t->metricsSize; i++)(*t->metrics)[i] = NULL;
415
416 t->encoding = (EncodingTable *)malloc(t->encodingSize * sizeof(Encoding));
417 if (t->encoding == NULL) {
418 free(t);
419 return PSTOTEXT_INIT_MALLOC;
420 }
421 for(i=0;i<t->encodingSize;i++)(*t->encoding)[i] = NULL;
422
423 t->dvipsIsCork = FALSE;
424
425 t->font = (FontTable *)malloc(t->fontSize * sizeof(Font));
426 if (t->font == NULL) {
427 free(t);
428 return PSTOTEXT_INIT_MALLOC;
429 }
430 for(i=0;i<t->fontSize;i++)(*t->font)[i] = NULL;
431
432 t->lbuf = 0;
433 t->nonEmptyPage = FALSE;
434 t->blx = t->bly = t->toprx = t->topry = 0;
435
436 *instance = t;
437
438 return 0;
439 }
440
pstotextSetCork(void * instance,int value)441 int DLLEXPORT pstotextSetCork(void *instance, int value) {
442 T *t = (T *)instance;
443 t->dvipsIsCork = value;
444 return 0;
445 }
446
pstotextExit(void * instance)447 int DLLEXPORT pstotextExit(void *instance) {
448 T *t = (T *)instance;
449 free(t->metrics);
450 free(t->encoding);
451 free(t->font);
452 free(t);
453 return 0;
454 }
455
/* Return the character at "*instr" as a value in 0..255 and advance
   "*instr" past it.  The pointer is advanced even when the character
   is '\0'. */
static int ReadChar(char **instr) {
  int c = (unsigned char)**instr;
  (*instr)++;
  return c;
}
461
/* Undo the most recent ReadChar by moving "*instr" back one character. */
static void UnreadChar(char **instr) {
  (*instr)--;
}
465
/* Read a decimal integer from "*instr": skip leading spaces, accept an
   optional '-', then consume digits.  "*instr" is left at the first
   character that is not part of the number.  Returns 0 if no digit is
   present.  A magnitude too large for "int" saturates at INT_MAX
   instead of overflowing (signed overflow is undefined behaviour). */
static int ReadInt(char **instr) {
  int value = 0;
  int negative = 0;
  int c;

  do {
    c = ReadChar(instr);
  } while (c == ' ');

  if (c == '-') {
    negative = 1;
    c = ReadChar(instr);
  }

  while ('0' <= c && c <= '9') {
    int digit = c - '0';
    /* value*10 + digit fits exactly when value <= (INT_MAX-digit)/10. */
    if (value > (INT_MAX - digit) / 10) value = INT_MAX;
    else value = value * 10 + digit;
    c = ReadChar(instr);
  }

  /* The character that ended the number belongs to the caller. */
  UnreadChar(instr);
  return negative ? -value : value;
}
476
/* Read a decimal integer from "*instr" as a "long": skip leading
   spaces, accept an optional '-', then consume digits.  "*instr" is
   left at the first character that is not part of the number.  Returns
   0 if no digit is present.  A magnitude too large for "long"
   saturates at LONG_MAX instead of overflowing (signed overflow is
   undefined behaviour). */
static long ReadLong(char **instr) {
  long value = 0;
  int negative = 0;
  int c;

  do {
    c = ReadChar(instr);
  } while (c == ' ');

  if (c == '-') {
    negative = 1;
    c = ReadChar(instr);
  }

  while ('0' <= c && c <= '9') {
    int digit = c - '0';
    /* value*10 + digit fits exactly when value <= (LONG_MAX-digit)/10. */
    if (value > (LONG_MAX - digit) / 10) value = LONG_MAX;
    else value = value * 10 + digit;
    c = ReadChar(instr);
  }

  /* The character that ended the number belongs to the caller. */
  UnreadChar(instr);
  return negative ? -value : value;
}
487
/* Parse QI directive: read six integers from "instr" and store each,
   divided by 100, as the corresponding element of "t->itransform".
   Always returns 0. */
static int ParseInverseTransform(T *t, char *instr) {
  int i;
  for (i = 0; i < 6; i++)
    t->itransform[i] = ReadLong(&instr) / 100.0;
  return 0;
}
493
/* Parse first line of QE directive: encoding number "e" followed by
   the number "n" of glyph indexes that the continuation lines supply.

   On success the encoding table has a vector for "e", the instance is
   switched to "state_encoding" and 0 is returned.  Returns
   PSTOTEXT_FILTER_BADENCODINGNUMBER if "e" is negative or too large to
   index a table, PSTOTEXT_FILTER_TOOMANYGLYPHINDEXES if "n" exceeds
   1024, and PSTOTEXT_INIT_MALLOC (the only allocation-failure code
   this file uses) if memory runs out.  On any error the instance is
   left unchanged. */
static int ParseEncoding(T *t, char *instr) {
  int e = ReadInt(&instr);
  int n = ReadInt(&instr);
  int i;

  if (e<0) return PSTOTEXT_FILTER_BADENCODINGNUMBER;
  if (n>/*256*/1024) return PSTOTEXT_FILTER_TOOMANYGLYPHINDEXES;

  /* Grow "t->encoding" if necessary. */
  if (t->encodingSize <= e) {
    int newSize;
    EncodingTable *grown;

    /* Reject numbers whose doubled size would overflow. */
    if (e > INT_MAX / 2) return PSTOTEXT_FILTER_BADENCODINGNUMBER;
    newSize = (e > 0) ? 2 * e : 1;
    if ((size_t)newSize > (size_t)-1 / sizeof(Encoding))
      return PSTOTEXT_FILTER_BADENCODINGNUMBER;

    /* Keep the old table until realloc is known to have succeeded. */
    grown = (EncodingTable *)realloc(t->encoding,
                                     (size_t)newSize * sizeof(Encoding));
    if (grown == NULL) return PSTOTEXT_INIT_MALLOC;
    t->encoding = grown;
    for (i = t->encodingSize; i < newSize; i++) (*t->encoding)[i] = NULL;
    t->encodingSize = newSize;
  }

  /* If this is the first encoding numbered "e", allocate array. */
  if ((*t->encoding)[e] == NULL) {
    Encoding fresh = (Encoding)malloc(sizeof(EncodingVector));
    if (fresh == NULL) return PSTOTEXT_INIT_MALLOC;
    /* Start fully nonstandard so a truncated directive never leaves
       indeterminate entries behind. */
    for (i = 0; i < 256; i++) (*fresh)[i] = NonstandardGlyph;
    (*t->encoding)[e] = fresh;
  }

  t->state = state_encoding;
  t->encoding_e = e; t->encoding_n = n; t->encoding_i = 0;

  return 0;
}
522
/* Parse subsequent line of QE directive: the next 16 entries of the
   encoding vector selected by the first line.  Entries at positions
   "encoding_n" and beyond are not read but set to NonstandardGlyph.

   After the line that completes all 256 entries the instance returns
   to "state_normal", and a vector lacking any lower-case letter has
   all its nonstandard entries replaced by the ISOLatin1 identity
   mapping.

   Returns 0, or PSTOTEXT_FILTER_BADENCODINGNUMBER (after returning to
   "state_normal") if no vector exists for the current encoding. */
static int ParseEncodingMore(T *t, char *instr) {
  Encoding enc = (*t->encoding)[t->encoding_e];
  int first = t->encoding_i;
  int i;
  BOOLEAN tooSparse;

  if (enc == NULL) {
    t->state = state_normal;
    return PSTOTEXT_FILTER_BADENCODINGNUMBER;
  }

  for (i = first; i < first + 16; i++)
    (*enc)[i] = (i < t->encoding_n) ? ReadInt(&instr) : NonstandardGlyph;

  t->encoding_i = first + 16;
  if (t->encoding_i >= 256) {
    /* End of directive. */
    t->state = state_normal;

    /* Some applications build the encoding vector incrementally.  If
       this one doesn't have at least the lower-case letters, we augment
       it with ISOLatin1.  Every letter must be examined: one missing
       letter is enough to call the vector too sparse. */
    tooSparse = FALSE;
    for (i = 'a'; i <= 'z' && !tooSparse; i++)
      if ((*enc)[i] == NonstandardGlyph) tooSparse = TRUE;

    if (tooSparse)
      for (i = 0; i < 256; i++)
        if ((*enc)[i] == NonstandardGlyph) (*enc)[i] = i;
  }

  return 0;
}
551
/* Factors applied to the widest character width to guess the top and
   bottom of a font bounding box that is missing or implausible. */
#define GuessAscend 0.9
#define GuessDescend -0.3

/* Read two integers from "*instr" and store them, each divided by
   100, in "*x" and "*y".  "*instr" is advanced past both numbers. */
static void ReadPair(double /*out*/ *x, /*out*/ double *y, char **instr) {
  *x = ReadLong(instr) / 100.0;
  *y = ReadLong(instr) / 100.0;
}
559
/* Parse QF directive: font number, the images of (1000,0) and (0,1000)
   in the font's character coordinate system, an encoding number and a
   metrics number.

   The whole directive is validated before the font table is touched,
   so an invalid directive neither creates nor modifies a font record.
   Returns 0 on success, PSTOTEXT_FILTER_BADFONTNUMBER for a negative
   or unrepresentably large font number,
   PSTOTEXT_FILTER_BADENCODINGNUMBER / PSTOTEXT_FILTER_BADMETRICNUMBER
   if the referenced encoding / metrics number is out of range or has
   not been defined, and PSTOTEXT_INIT_MALLOC (the only
   allocation-failure code this file uses) if memory runs out. */
static int ParseFont(T *t, char *instr) {
  int n = ReadInt(&instr), i;
  int e, m;
  double x, y, xp, yp;
  Metrics mt;
  Font f;
  double xmax, bly, topry;

  if (n<0) return PSTOTEXT_FILTER_BADFONTNUMBER;

  ReadPair(&x, &y, &instr);
  ReadPair(&xp, &yp, &instr);

  /* Both numbers come straight from the input: range-check them before
     they are used as table indexes. */
  e = ReadInt(&instr);
  if (e < 0 || t->encodingSize <= e || (*t->encoding)[e] == NULL)
    return PSTOTEXT_FILTER_BADENCODINGNUMBER;
  m = ReadInt(&instr);
  if (m < 0 || t->metricsSize <= m || (*t->metrics)[m] == NULL)
    return PSTOTEXT_FILTER_BADMETRICNUMBER;
  mt = (*t->metrics)[m];

  /* Grow "t->font" if necessary. */
  if (t->fontSize <= n) {
    int newSize;
    FontTable *grown;

    /* Reject numbers whose doubled size would overflow. */
    if (n > INT_MAX / 2) return PSTOTEXT_FILTER_BADFONTNUMBER;
    newSize = (n > 0) ? 2 * n : 1;
    if ((size_t)newSize > (size_t)-1 / sizeof(Font))
      return PSTOTEXT_FILTER_BADFONTNUMBER;

    /* Keep the old table until realloc is known to have succeeded. */
    grown = (FontTable *)realloc(t->font, (size_t)newSize * sizeof(Font));
    if (grown == NULL) return PSTOTEXT_INIT_MALLOC;
    t->font = grown;
    for (i = t->fontSize; i < newSize; i++) (*t->font)[i] = NULL;
    t->fontSize = newSize;
  }

  /* If this is the first font numbered "n", allocate "FontRec". */
  if ((*t->font)[n] == NULL) {
    f = (Font)malloc(sizeof(FontRec));
    if (f == NULL) return PSTOTEXT_INIT_MALLOC;
    (*t->font)[n] = f;
  }

  f = (*t->font)[n];
  f->x = x; f->y = y;
  f->xp = xp; f->yp = yp;
  f->e = e;
  f->m = m;

  /* Transform height of font bounding box to reporting coordinates: */
  f->bx = f->xp * mt->bly / 1000.0;
  f->by = f->yp * mt->bly / 1000.0;
  f->tx = f->xp * mt->topry / 1000.0;
  f->ty = f->yp * mt->topry / 1000.0;

  /* In some fonts produced by dvips, the FontBBox is incorrectly
     defined as [0 0 1 1].  We check for this, and apply the same
     heuristic used for an undefined FontBBox in "ParseMetrics". */
  if (f->by-f->ty < 1.1) {
    xmax = 0.0;
    for (i = 0; i<256; i++)
      if (mt->chr[i].x > xmax) xmax = mt->chr[i].x;
    bly = GuessDescend * xmax; topry = GuessAscend * xmax;
    f->bx = f->xp * bly / 1000.0;
    f->by = f->yp * bly / 1000.0;
    f->tx = f->xp * topry / 1000.0;
    f->ty = f->yp * topry / 1000.0;
  }

  return 0;
}
614
/* Parse first line of QM directive: metrics number "m" followed by the
   two corner pairs stored in "blx,bly" and "toprx,topry".

   On success the metrics table has a record for "m", the instance is
   switched to "state_metrics" and 0 is returned.  Returns
   PSTOTEXT_FILTER_BADMETRICNUMBER if "m" is negative or too large to
   index a table, and PSTOTEXT_INIT_MALLOC (the only allocation-failure
   code this file uses) if memory runs out.  On any error the instance
   is left unchanged. */
static int ParseMetrics(T *t, char *instr) {
  int m = ReadInt(&instr), i;
  Metrics mt;

  if (m<0) return PSTOTEXT_FILTER_BADMETRICNUMBER;

  /* Grow "t->metrics" if necessary. */
  if (t->metricsSize <= m) {
    int newSize;
    MetricsTable *grown;

    /* Reject numbers whose doubled size would overflow. */
    if (m > INT_MAX / 2) return PSTOTEXT_FILTER_BADMETRICNUMBER;
    newSize = (m > 0) ? 2 * m : 1;
    if ((size_t)newSize > (size_t)-1 / sizeof(Metrics))
      return PSTOTEXT_FILTER_BADMETRICNUMBER;

    /* Keep the old table until realloc is known to have succeeded. */
    grown = (MetricsTable *)realloc(t->metrics,
                                    (size_t)newSize * sizeof(Metrics));
    if (grown == NULL) return PSTOTEXT_INIT_MALLOC;
    t->metrics = grown;
    for (i = t->metricsSize; i < newSize; i++) (*t->metrics)[i] = NULL;
    t->metricsSize = newSize;
  }

  /* If this is the first metrics numbered "m", allocate "MetricsRec". */
  if ((*t->metrics)[m] == NULL) {
    mt = (Metrics)malloc(sizeof(MetricsRec));
    if (mt == NULL) return PSTOTEXT_INIT_MALLOC;
    /* Start with zero widths so a truncated directive never leaves
       indeterminate values behind. */
    for (i = 0; i < 256; i++) mt->chr[i].x = mt->chr[i].y = 0.0;
    (*t->metrics)[m] = mt;
  }

  mt = (*t->metrics)[m];

  ReadPair(&mt->blx, &mt->bly, &instr);
  ReadPair(&mt->toprx, &mt->topry, &instr);

  t->state = state_metrics; t->metrics_m = m; t->metrics_i = 0;

  return 0;
}
646
/* Parse subsequent line of QM directive: the width pairs of the next 8
   characters of the metrics record selected by the first line.

   After the line that completes all 256 characters the instance
   returns to "state_normal"; if all four bounding box values are then
   zero, they are guessed from the widest character.

   Returns 0, or PSTOTEXT_FILTER_BADMETRICNUMBER (after returning to
   "state_normal") if no record exists for the current metrics. */
static int ParseMetricsMore(T *t, char *instr) {
  Metrics mt = (*t->metrics)[t->metrics_m];
  int first = t->metrics_i;
  int i;

  if (mt == NULL) {
    t->state = state_normal;
    return PSTOTEXT_FILTER_BADMETRICNUMBER;
  }

  for (i = first; i < first + 8; i++)
    ReadPair(&mt->chr[i].x, &mt->chr[i].y, &instr);

  t->metrics_i = first + 8;
  if (t->metrics_i >= 256) {
    /* End of directive. */
    t->state = state_normal;

    /* If "FontBBox" was not specified, take a guess. */
    if (mt->blx == 0.0 && mt->bly == 0.0 && mt->toprx == 0.0 && mt->topry == 0.0) {
      for (i = 0; i < 256; i++)
        if (mt->chr[i].x > mt->toprx) mt->toprx = mt->chr[i].x;
      mt->bly = GuessDescend * mt->toprx;
      mt->topry = GuessAscend * mt->toprx;
    }
  }

  return 0;
}
672
/* Set (*x1, *y1) to (t->itransform) * (x0, y0): elements 0..3 form the
   linear part and elements 4 and 5 are the translation. */
static void Itransform(T *t, double *x1, double *y1, double x0, double y0) {
  const double *m = t->itransform;
  *x1 = m[0]*x0 + m[2]*y0 + m[4];
  *y1 = m[1]*x0 + m[3]*y0 + m[5];
}
678
/* Output the next word: the pending contents of "t->buf".

   If the word's bounding box is not degenerate, "*pre" is set to the
   separator to print before the word ("" for the first word on a page,
   " " when it is judged to be on the same line as the previous word,
   "\n" otherwise), "*word" is set to the NUL-terminated text (stored
   in "t->word"), and "*llx,*lly,*urx,*ury" receive the bounding box
   after applying "t->itransform".  If the box is degenerate none of
   the output parameters is written and the word is discarded.

   In both cases the pending buffer is emptied. */
static void Output(T *t, const char **pre, const char **word,
                   int *llx, int *lly, int *urx, int *ury) {
  double x0, y0, x1, y1, x2, y2, x3, y3;
  double lo, hi;
  long blx, bly, toprx, topry, mid;
  Font f;

  f = (*t->font)[t->f];

  /* Compute the corners of the parallelogram with width "(t->x0,t->y0)"
     to "(t->x1,t->y1)" and height "(f.bx,f.by)" to "(f.tx,f.ty)".  Then
     compute the bottom left corner and the top right corner of the
     bounding box (rectangle with sides parallel to the coordinate
     system) of this rectangle. */
  x0 = t->x0 + f->bx; y0 = t->y0 + f->by;
  x1 = t->x1 + f->bx; y1 = t->y1 + f->by;
  x2 = t->x0 + f->tx; y2 = t->y0 + f->ty;
  x3 = t->x1 + f->tx; y3 = t->y1 + f->ty;

  lo = MIN(x0, x1); lo = MIN(lo, x2); lo = MIN(lo, x3);
  hi = MAX(x0, x1); hi = MAX(hi, x2); hi = MAX(hi, x3);
  blx = (long)ceil(lo);
  toprx = (long)floor(hi);

  lo = MIN(y0, y1); lo = MIN(lo, y2); lo = MIN(lo, y3);
  hi = MAX(y0, y1); hi = MAX(hi, y2); hi = MAX(hi, y3);
  bly = (long)ceil(hi); /* *** should this be floor? PMcJ 981002 */
  topry = (long)floor(lo); /* *** should this be ceil? PMcJ 981002 */

  if (blx!=toprx && bly!=topry) {

    /* Output word separator if this isn't first word on page. */
    if (t->nonEmptyPage) {
      mid = (topry+bly) / 2;
      if (blx<toprx && topry<bly
          && t->blx <= blx
          && t->topry <= mid
          && mid <= t->bly) *pre = " "; /* same line */
      else *pre = "\n"; /* different line */
    }
    else *pre = "";

    /* Output elements "0" through "t->lbuf-1" of "t->buf".  The clamp
       guarantees that the terminator is stored inside "t->buf". */
    if (t->lbuf > (int)sizeof(t->buf) - 1) t->lbuf = (int)sizeof(t->buf) - 1;
    t->buf[t->lbuf] = '\0';
    memcpy(t->word, t->buf, (size_t)t->lbuf + 1);
    *word = t->word;

    t->nonEmptyPage = TRUE;
    t->blx = blx; t->bly = bly; t->toprx = toprx; t->topry = topry;

    /* transform device units to default PostScript units */
    Itransform( t, &x1, &y1, (double)blx, (double)bly);
    blx = (long)floor(x1); bly = (long)floor(y1);
    Itransform( t, &x1, &y1, (double)toprx, (double)topry);
    toprx = (long)ceil(x1); topry = (long)ceil(y1);

    /* Report the corners in increasing order whatever the transform
       did to their orientation. */
    if (blx < toprx) {
      *llx = blx;
      *urx = toprx;
    }
    else {
      *llx = toprx;
      *urx = blx;
    }
    if (bly < topry) {
      *lly = bly;
      *ury = topry;
    }
    else {
      *lly = topry;
      *ury = bly;
    }

  } /*if (blx!=toprx && bly!=topry) { */

  t->lbuf = 0;
}
751
SameDirection(double x0,double y0,double x1,double y1)752 static BOOLEAN SameDirection(double x0, double y0, double x1, double y1) {
753 return (y0 == 0.0 && y1 == 0.0 && x0*x1 > 0.0)
754 || (x0 == 0.0 && x1 == 0.0 && y0*y1 > 0.0)
755 || (x0 * y1 == x1 * y0);
756 }
757
#define spaceTol 0.3 /* fraction of average character width to signal word break */

/* Make the "len" characters at "text" the pending word of "t", drawn
   in font number "font" from currentpoint (x0,y0) to (x1,y1).  "len"
   must be smaller than the size of "t->buf". */
static void StartWord(T *t, const char *text, int len, int font,
                      double x0, double y0, double x1, double y1) {
  memcpy(t->buf, text, (size_t)len);
  t->lbuf = len;
  t->f = font;
  t->x0 = x0; t->y0 = y0; t->x1 = x1; t->y1 = y1;
}

/* Parse QS directive: font number, initial currentpoint, string length,
   one space, the string itself, and the final currentpoint.

   Every character is mapped through the font's encoding to text, and
   the text is either appended to the pending word or, if it starts a
   new word, the pending word is output through "pre", "word" and the
   bounding box parameters (see "Output") with "*post" set to "", and
   the new text becomes the pending word.

   Text that would not fit in the word buffer is dropped from the end
   of the string; the remaining input characters are still consumed so
   that the final currentpoint is read from the right place.

   Returns 0 on success, PSTOTEXT_FILTER_BADFONTNUMBER if the font
   number is out of range or undefined,
   PSTOTEXT_FILTER_BADENCODINGNUMBER if the font's encoding is out of
   range or undefined, PSTOTEXT_FILTER_BADQS if the length is not
   followed by a space, and PSTOTEXT_FILTER_BADGLYPHINDEX for a glyph
   index outside every known range.

   NOTE(review): the string may contain '\0' (TeX uses it), so the
   declared length cannot be checked against the end of "instr" here;
   the caller must supply at least that many characters. */
static int ParseString(T *t, char *instr,
                       const char **pre, const char **word, const char **post,
                       int *llx, int *lly, int *urx, int *ury) {
  char buf[sizeof(t->buf)];
  const int cap = (int)sizeof(buf) - 1; /* leave room for Output's '\0' */
  BOOLEAN truncated = FALSE;
  int n, ch, i, j, in, l;
  Font f;
  Encoding enc;
  GlyphIndex glyph;
  double x0, y0, x1, y1, xsp, ysp, dx, dy, maxx, maxy;

  n = ReadInt(&instr); /* index in "t->font" */
  if (n < 0 || t->fontSize <= n) return PSTOTEXT_FILTER_BADFONTNUMBER;
  f = (*t->font)[n];
  if (f == NULL) return PSTOTEXT_FILTER_BADFONTNUMBER;
  if (f->e < 0 || t->encodingSize <= f->e)
    return PSTOTEXT_FILTER_BADENCODINGNUMBER;
  enc = (*t->encoding)[f->e];
  if (enc==NULL) return PSTOTEXT_FILTER_BADENCODINGNUMBER;
  ReadPair(&x0, &y0, &instr); /* initial currentpoint */
  j = ReadInt(&instr); /* length of string */
  ch = ReadChar(&instr);
  if (ch != ' ')
    return PSTOTEXT_FILTER_BADQS;

  l = 0;
  for (i = 0; i < j; i++) {
    const char *str;   /* text for this character */
    char one[2];       /* scratch for single-character text */
    int lstr;

    in = ReadChar(&instr);
    /* if (in=='\0') return PSTOTEXT_FILTER_BADQS; */ /* TeX uses '\0' */
    glyph = (*enc)[in];

    /* If "glyph==0", then "in" mapped to the glyph ".notdef".  This
       is usually a mistake, but we check for several known cases: */
    if (glyph == 0) {

      /* If any element of the current encoding is in the range used
         by Microsoft TrueType, assume this character is, too. */
      int k; BOOLEAN tt = FALSE;
      for (k = 0; !tt && k < (int)(sizeof(*enc)/sizeof((*enc)[0])); k++) {
        if (FirstTT1 <= (*enc)[k] && (*enc)[k] <= LastTT2) tt = TRUE;
      }
      if (tt) glyph = FirstTT1 + (int)in;
      /* There are too many other exceptions to actually trap this:
         else if (in == '\r') ;   // Adobe Illustrator does this...
         else if (in == '\t') ;   // MacDraw Pro does this...
         else if (in == '\032') ; // MS Word on Mac does this...
         else return PSTOTEXT_FILTER_BADGLYPHINDEX;
      */
    }

    one[1] = '\0';
    if (glyph == 0)
      continue; /* still ".notdef": contributes no text */
    else if (glyph <= LastISOLatin1) {
      one[0] = (char)glyph;
      /* *** if (glyph IN ISOLatin1Gaps) one[0] = UnknownChar; */
      str = one;
    }
    else if (glyph <= LASTSpecialGlyphs)
      str = SpecialGlyphs[glyph-FIRSTSpecialGlyphs];
    else if (glyph <= LastDvips) {
      if (t->dvipsIsCork) {
        if (glyph <= LASTCorkSpecialGlyphs)
          str = CorkSpecialGlyphs[glyph-FIRSTCorkSpecialGlyphs];
        else if (glyph == FIRSTCorkSpecialGlyphs+0337)
          str = "SS";
        else if (glyph == FIRSTCorkSpecialGlyphs+0377)
          str = "\337";
        else {
          one[0] = (char)(glyph-FIRSTCorkSpecialGlyphs);
          str = one;
        }
      }
      else if (glyph <= LASTDvipsGlyphs)
        /* Assume old text layout (OT1?). */
        str = DvipsGlyphs[glyph-FIRSTDvipsGlyphs];
      else {
        one[0] = UnknownChar;
        str = one;
      }
    }
    else if (glyph <= LastTT2) {
      /* Fold the second TrueType range onto the first. */
      if (FirstTT2 <= glyph) glyph -= FirstTT2-FirstTT1;
      if (glyph < FirstTT1+32) {
        one[0] = UnknownChar;
        str = one;
      }
      else if (glyph < FIRSTTTSpecialGlyphs ||
               LASTTTSpecialGlyphs < glyph) {
        one[0] = (char)(glyph - FirstTT1);
        str = one;
      }
      else
        str = TTSpecialGlyphs[glyph-FIRSTTTSpecialGlyphs];
    }
    else if (glyph <= LastOldDvips)
      str = DvipsGlyphs[glyph-FirstOldDvips];
    else if (glyph == NonstandardGlyph) { /* not in StandardGlyphs */
      one[0] = UnknownChar;
      str = one;
    }
    else return PSTOTEXT_FILTER_BADGLYPHINDEX;

    /* Append the text unless the buffer is full; once anything has
       been dropped, drop the rest too so that no gap appears. */
    if (!truncated) {
      lstr = (int)strlen(str);
      if (lstr > cap - l) truncated = TRUE;
      else {
        memcpy(&buf[l], str, (size_t)lstr);
        l += lstr;
      }
    }

    /* We no longer substitute minus for hyphen. */
    /* if (buf[l-1] == '\255') buf[l-1] = '-'; */
  }

  ReadPair(&x1, &y1, &instr); /* final currentpoint */
  if (l != 0) { /* "l==0" e.g., when Adobe Illustrator outputs "\r" */
    if (t->lbuf == 0) StartWord(t, buf, l, n, x0, y0, x1, y1);
    else {
      /* If the distance between this string and the previous one is
         less than "spaceTol" times the minimum of the average
         character widths in the two strings, and the two strings
         are in the same direction, then append this string to the
         previous one.  Otherwise, output the previous string and
         then save the current one.

         Sometimes this string overlaps the previous string, e.g.,
         when TeX is overprinting an accent over another character.
         So we make a special case for this (but only handle the
         left-to-right orientation). */

      /* Set "(xsp,ysp)" to the reporting space coordinates of the
         minimum of the average width of the characters in this
         string and the previous one. */

      xsp = MIN((t->x1-t->x0) / t->lbuf, (x1-x0) / l);
      ysp = MIN((t->y1-t->y0) / t->lbuf, (y1-y0) / l);

      dx = x0 - t->x1;
      dy = y0 - t->y1;
      maxx = spaceTol * xsp;
      maxy = spaceTol * ysp;
      if ((dx*dx + dy*dy < maxx*maxx + maxy*maxy)
          || ((t->y1 == y0 && t->x0 <= t->x1 && t->x0 <= x0 && x0 <= t->x1)
              && SameDirection(t->x1-t->x0, t->y1-t->y0, x1-x0, y1-y0))) {
        if (t->lbuf+l >= (int)sizeof(t->buf)) {
          /* No room to append: flush the pending word first. */
          Output(t, pre, word, llx, lly, urx, ury);
          *post = "";
          StartWord(t, buf, l, n, x0, y0, x1, y1);
        }
        else {
          memcpy(&t->buf[t->lbuf], buf, (size_t)l);
          t->lbuf += l;
          t->x1 = x1; t->y1 = y1;
          /* *** Merge font bounding boxes? */
        }
      }
      else {
        Output(t, pre, word, llx, lly, urx, ury);
        *post = "";
        StartWord(t, buf, l, n, x0, y0, x1, y1);
      }
    }
  }

  return 0;
}
935
pstotextFilter(void * instance,char * instr,const char ** pre,const char ** word,const char ** post,int * llx,int * lly,int * urx,int * ury)936 int DLLEXPORT pstotextFilter(void *instance, char *instr,
937 const char **pre, const char **word, const char **post,
938 int *llx, int *lly, int *urx, int *ury) {
939 T *t = (T *)instance;
940 int c;
941 *word = NULL;
942 switch (t->state) {
943 case state_normal:
944 do {c = ReadChar(&instr); if (c=='\0') return 0;} while (c!='Q');
945 c = ReadChar(&instr);
946 switch (c) {
947 case 'I': return ParseInverseTransform(t, instr);
948 case 'M': return ParseMetrics(t, instr);
949 case 'E': return ParseEncoding(t, instr);
950 case 'F': return ParseFont(t, instr);
951 case 'S': return ParseString(
952 t, instr, pre, word, post, llx, lly, urx, ury);
953 case 'C':
954 case 'P': /* copypage, showpage */
955 /* If any QS directives have been encountered on this page,
956 t->buf will be nonempty now. */
957 if (t->lbuf > 0) {
958 Output(t, pre, word, llx, lly, urx, ury);
959 *post = "\n\f\n";
960 }
961 else {
962 *pre = "";
963 *word = "";
964 *llx = 0; *lly = 0; *urx = 0; *ury = 0;
965 *post = "\f\n";
966 }
967 t->nonEmptyPage = FALSE;
968 t->blx = t->bly = t->toprx = t->topry = 0;
969 break;
970 case 'Z': /* erasepage */ /* skip */ break;
971 case '\0': return 0;
972 /* default: skip */
973 }
974 break;
975 case state_metrics: return ParseMetricsMore(t, instr);
976 case state_encoding: return ParseEncodingMore(t, instr);
977 }
978 return 0;
979 }
980
981