1 /* Copyright (C) 1995-1998, Digital Equipment Corporation.    */
2 /* All rights reserved.                                       */
3 /* See the file pstotext.txt for a full description.          */
4 /* Last modified on Fri Jan 09 21:19:00 AEST 2004 by rjl   */
5 /*      modified on Fri Jan 09 08:21:00 AEST 2004 by rjl       */
6 /*      modified on Wed Oct 28 08:42:15 PST 1998 by mcjones   */
7 /*      modified on Sun Jul 28 00:00:00 UTC 1996 by rjl       */
8 
9 /* Modifications by rjl
10  *   Fixed compiler warnings
11  */
12 
13 /* This module is based on OCR_PS.m3, a module of the Virtual Paper
14    project at the DEC Systems Research Center:
15    http://www.research.digital.com/SRC/virtualpaper/ */
16 
#include <limits.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "ptotdll.h"
21 
22 #ifndef NULL
23 #define NULL 0
24 #endif
25 
26 #define BOOLEAN int
27 #define FALSE 0
28 #define TRUE 1
29 
30 #define MIN(a,b) ((a)<=(b)?(a):(b))
31 #define MAX(a,b) ((b)<=(a)?(a):(b))
32 
33 /* Character encoding.  Each element of the QE directive produced by
34    ocr.ps is either an index in the StandardGlyph array, or is
35    "NonstandardGlyph" (indicating the corresponding entry in the font's
36    encoding specifies some nonstandard glyph). */
37 
38 typedef unsigned GlyphIndex;
39 #define NonstandardGlyph 9999
40 
41 #define UnknownChar '#' /* substitute for nonstandard glyph */
42 
/* Glyph indexes 0..255 of StandardGlyphs are the ISOLatin1 code points.
   They are followed by 28 glyphs that the standard /Times-Roman font
   defines although ISOLatin1 does not; SpecialGlyphs holds, in glyph
   index order, the replacement text used for each of them. */

#define LastISOLatin1 255

#define FIRSTSpecialGlyphs (LastISOLatin1+1)
#define LASTSpecialGlyphs (LastISOLatin1+28)
static const char *SpecialGlyphs[] = {
  /* 256 quotedblright  */ "''",
  /* 257 Scaron         */ "S\237",
  /* 258 dagger         */ "+",
  /* 259 guilsinglleft  */ "<",
  /* 260 Zcaron         */ "Z\237",
  /* 261 daggerdbl      */ "#",
  /* 262 Lslash         */ "L/",
  /* 263 ellipsis       */ "...",
  /* 264 guilsinglright */ ">",
  /* 265 oe             */ "oe",
  /* 266 fi             */ "fi",
  /* 267 bullet         */ ".",
  /* 268 perthousand    */ "o/oo",
  /* 269 quotedblbase   */ "''",
  /* 270 endash         */ "--",
  /* 271 emdash         */ "---",
  /* 272 trademark      */ "^TM",
  /* 273 florin         */ "f",
  /* 274 lslash         */ "l/",
  /* 275 scaron         */ "s\237",
  /* 276 Ydieresis      */ "Y\250",
  /* 277 fl             */ "fl",
  /* 278 fraction       */ "/",
  /* 279 quotedblleft   */ "``",
  /* 280 quotesinglbase */ "'",
  /* 281 quotesingle    */ "'",
  /* 282 zcaron         */ "z\237",
  /* 283 OE             */ "OE"
};
81 
/* Glyph index ranges that follow the special glyphs:

     FirstDvips..LastDvips         256 self-named glyphs "\000", ..., "\377"
                                   used in Type 3 fonts from dvips.
     FirstTT1..LastTT1             256 glyph names "G00", ..., "Gff" used in
                                   Microsoft TrueType fonts.
     FirstTT2..LastTT2             256 glyph names "G00", ..., "GFF" used in
                                   Microsoft TrueType fonts.
     FirstOldDvips..LastOldDvips   note only 128 entries.

   Both TrueType ranges correspond to ISOLatin1 with some extensions. */

#define FirstDvips (LASTSpecialGlyphs+1)
#define LastDvips  (FirstDvips+256-1)

#define FirstTT1 (LastDvips+1)
#define LastTT1 (FirstTT1+256-1)
#define FirstTT2 (LastTT1+1)
#define LastTT2 (FirstTT2+256-1)
#define FirstOldDvips (LastTT2+1)
#define LastOldDvips (FirstOldDvips+128-1) /* note only 128 */

/* Replacement text for the TrueType glyphs at offsets 130..159 from
   FirstTT1, listed in offset order. */
#define FIRSTTTSpecialGlyphs (FirstTT1+130)
#define LASTTTSpecialGlyphs (FirstTT1+159)
static const char *TTSpecialGlyphs[] = {
  /* +130 quotesinglbase        */ "'",
  /* +131 florin                */ "f",
  /* +132 quotedblbase          */ "''",
  /* +133 ellipsis              */ "...",
  /* +134 dagger                */ "+",
  /* +135 daggerdbl             */ "#",
  /* +136 circumflex            */ "\223",
  /* +137 perthousand           */ "o/oo",
  /* +138 Scaron                */ "S\237",
  /* +139 guilsinglleft         */ "<",
  /* +140 OE                    */ "OE",
  /* +141 <undefined>           */ "#",
  /* +142 <undefined>           */ "#",
  /* +143 <undefined>           */ "#",
  /* +144 <undefined>           */ "#",
  /* +145 ISOLatin1: quoteleft  */ "`",
  /* +146 ISOLatin1: quoteright */ "'",
  /* +147 quotedblleft          */ "``",
  /* +148 quotedblright         */ "''",
  /* +149 bullet                */ ".",
  /* +150 endash                */ "--",
  /* +151 emdash                */ "---",
  /* +152 ISOLatin1: tilde      */ "~",
  /* +153 trademark             */ "^TM",
  /* +154 scaron                */ "s\237",
  /* +155 guilsinglright        */ ">",
  /* +156 oe                    */ "oe",
  /* +157 <undefined>           */ "#",
  /* +158 <undefined>           */ "#",
  /* +159 Ydieresis             */ "Y\250"
};
133 
/* Replacement text for the first 128 dvips glyphs, one row of eight
   per octal "decade".  The same table is indexed from FirstOldDvips
   for the old-dvips glyph range. */
#define FIRSTDvipsGlyphs FirstDvips
#define LASTDvipsGlyphs (FirstDvips+127)
static const char *DvipsGlyphs[] = {
  /* 000-007: upper-case Greek */
  "\\Gamma", "\\Delta", "\\Theta", "\\Lambda",
  "\\Xi", "\\Pi", "\\Sigma", "\\Upsilon",

  /* 010-017: upper-case Greek, then the f-ligatures */
  "\\Phi", "\\Psi", "\\Omega", "ff", "fi", "fl", "ffi", "ffl",

  /* 020-027: \imath, \jmath, grave, acute, caron, breve, macron, ring */
  "i", "j", "`", "'", "\237", "\226", "\257", "\232",

  /* 030-037: cedilla, germandbls, ae, oe, oslash, AE, OE, Oslash */
  "\270", "\337", "ae", "oe", "\370", "AE", "OE", "\330",

  /* 040-047: the first entry is the bar for Polish suppressed-L ??? */
  "/", "!", "''", "#", "$", "%", "&", "'",

  /* 050-057: entry 055 is the hyphen */
  "(", ")", "*", "+", ",", "\255", ".", "/",

  /* 060-067 */
  "0", "1", "2", "3", "4", "5", "6", "7",

  /* 070-077: entry 074 is exclamdown, entry 076 is questiondown */
  "8", "9", ":", ";", "!", "=", "?", "?",

  /* 0100-0107 */
  "@", "A", "B", "C", "D", "E", "F", "G",

  /* 0110-0117 */
  "H", "I", "J", "K", "L", "M", "N", "O",

  /* 0120-0127 */
  "P", "Q", "R", "S", "T", "U", "V", "W",

  /* 0130-0137: the last two entries are circumflex and dotaccent */
  "X", "Y", "Z", "[", "``", "]", "\223", "\227",

  /* 0140-0147 */
  "`", "a", "b", "c", "d", "e", "f", "g",

  /* 0150-0157 */
  "h", "i", "j", "k", "l", "m", "n", "o",

  /* 0160-0167 */
  "p", "q", "r", "s", "t", "u", "v", "w",

  /* 0170-0177: x y z, en dash, em dash, hungarumlaut, tilde, dieresis */
  "x", "y", "z", "--", "---", "\235", "~", "\250"
};
194 
/* Replacement text for dvips glyphs 000..0277 when the Cork layout is
   assumed (see "dvipsIsCork").  Entries are listed in glyph order; the
   octal code of each entry, or of the first entry of a row, is given
   in the comment.  Codes 0300-0377 are not in the table: they are the
   same as ISO 8859/1 except that 0337 is Ess-zed and 0377 is
   ess-zed/germandbls. */
#define FIRSTCorkSpecialGlyphs FirstDvips
#define LASTCorkSpecialGlyphs (FirstDvips+0277)
static const char *CorkSpecialGlyphs[] = {
  /* ---- 000-014: accents for lowercase letters ---- */
  /* 000 grave                */ "`",
  /* 001 acute                */ "'",
  /* 002 circumflex           */ "^",
  /* 003 tilde                */ "~",
  /* 004 umlaut/dieresis      */ "\230",
  /* 005 hungarumlaut         */ "\235",
  /* 006 ring                 */ "\232",
  /* 007 hacek/caron          */ "\237",
  /* 010 breve                */ "\226",
  /* 011 macron               */ "\257",
  /* 012 dot above/dotaccent  */ "\227",
  /* 013 cedilla              */ "\270",
  /* 014 ogonek               */ "\236",

  /* ---- 015-040: miscellaneous ---- */
  /* 015 single base quote/quotesinglbase       */ "'",
  /* 016 single opening guillemet/guilsinglleft */ "<",
  /* 017 single closing guillemet/guilsinglright */ ">",
  /* 020 english opening quotes/quotedblleft    */ "``",
  /* 021 english closing quotes/quotedblright   */ "''",
  /* 022 base quotes/quotedblbase               */ ",,",
  /* 023 opening guillemets/guillemotleft       */ "<<",
  /* 024 closing guillemets/guillemotright      */ ">>",
  /* 025 en dash/endash                         */ "--",
  /* 026 em dash/emdash                         */ "---",
  /* 027 compound word mark (invisible)         */ "",
  /* 030 perthousandzero (used in conjunction with %) */ "o",
  /* 031 dotless i/dotlessi                     */ "\220",
  /* 032 dotless j                              */ "j",
  /* 033 ligature ff                            */ "ff",
  /* 034 ligature fi                            */ "fi",
  /* 035 ligature fl                            */ "fl",
  /* 036 ligature ffi                           */ "ffi",
  /* 037 ligature ffl                           */ "ffl",
  /* 040 visible space                          */ "_",

  /* ---- 041-0177: ASCII; 0177 is the hyphenchar (hanging) ---- */
  /* 041  */ "!", "\"", "#", "$", "%", "&", "'",
  /* 050  */ "(", ")", "*", "+", ",", "-", ".", "/",
  /* 060  */ "0", "1", "2", "3", "4", "5", "6", "7",
  /* 070  */ "8", "9", ":", ";", "<", "=", ">", "?",
  /* 0100 */ "@", "A", "B", "C", "D", "E", "F", "G",
  /* 0110 */ "H", "I", "J", "K", "L", "M", "N", "O",
  /* 0120 */ "P", "Q", "R", "S", "T", "U", "V", "W",
  /* 0130 */ "X", "Y", "Z", "[", "\\","]", "^", "_",
  /* 0140 */ "`", "a", "b", "c", "d", "e", "f", "g",
  /* 0150 */ "h", "i", "j", "k", "l", "m", "n", "o",
  /* 0160 */ "p", "q", "r", "s", "t", "u", "v", "w",
  /* 0170 */ "x", "y", "z", "{", "|", "}", "~", "\255",

  /* ---- 0200-0277: letters for eastern European languages from
          latin-2 ---- */
  /* 0200 Abreve         */ "A\226",
  /* 0201 Aogonek        */ "A\236",
  /* 0202 Cacute         */ "C\264",
  /* 0203 Chacek         */ "C\237",
  /* 0204 Dhacek         */ "D\237",
  /* 0205 Ehacek         */ "E\237",
  /* 0206 Eogonek        */ "E\236",
  /* 0207 Gbreve         */ "G\226",
  /* 0210 Lacute         */ "L\264",
  /* 0211 Lhacek         */ "L\237",
  /* 0212 Lslash/Lstroke */ "L/",
  /* 0213 Nacute         */ "N\264",
  /* 0214 Nhacek         */ "N\237",
  /* 0215 Eng            */ "\\NG",
  /* 0216 Ohungarumlaut  */ "O\235",
  /* 0217 Racute         */ "R\264",
  /* 0220 Rhacek         */ "R\237",
  /* 0221 Sacute         */ "S\264",
  /* 0222 Shacek         */ "S\237",
  /* 0223 Scedilla       */ "S\270",
  /* 0224 Thacek         */ "T\237",
  /* 0225 Tcedilla       */ "T\270",
  /* 0226 Uhungarumlaut  */ "U\235",
  /* 0227 Uring          */ "U\232",
  /* 0230 Ydieresis      */ "Y\250",
  /* 0231 Zacute         */ "Z\264",
  /* 0232 Zhacek         */ "Z\237",
  /* 0233 Zdot           */ "Z\227",
  /* 0234 IJ             */ "IJ",
  /* 0235 Idot           */ "I\227",
  /* 0236 dbar           */ "\\dj",
  /* 0237 section        */ "\247",
  /* 0240 abreve         */ "a\226",
  /* 0241 aogonek        */ "a\236",
  /* 0242 cacute         */ "c\222",
  /* 0243 chacek         */ "c\237",
  /* 0244 dhacek         */ "d\237",
  /* 0245 ehacek         */ "e\237",
  /* 0246 eogonek        */ "e\236",
  /* 0247 gbreve         */ "g\226",
  /* 0250 lacute         */ "l\222",
  /* 0251 lhacek         */ "l\237",
  /* 0252 lslash         */ "l/",
  /* 0253 nacute         */ "n\222",
  /* 0254 nhacek         */ "n\237",
  /* 0255 eng            */ "\\ng",
  /* 0256 ohungarumlaut  */ "o\235",
  /* 0257 racute         */ "r\222",
  /* 0260 rhacek         */ "r\237",
  /* 0261 sacute         */ "s\222",
  /* 0262 shacek         */ "s\237",
  /* 0263 scedilla       */ "s\270",
  /* 0264 thacek         */ "t\237",
  /* 0265 tcedilla       */ "t\270",
  /* 0266 uhungarumlaut  */ "u\235",
  /* 0267 uring          */ "u\232",
  /* 0270 ydieresis      */ "y\230",
  /* 0271 zacute         */ "z\222",
  /* 0272 zhacek         */ "z\237",
  /* 0273 zdot           */ "z\227",
  /* 0274 ij             */ "ij",
  /* 0275 exclamdown     */ "\241",
  /* 0276 questiondown   */ "\277",
  /* 0277 sterling       */ "\243"
};
314 
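/* There are gaps in the set of printable ISOLatin1 characters.  The
   Modula-3 module this file is based on listed them as the set below
   (a "8_" prefix introduces an octal literal).  The C code does not
   define or use this set.

   CONST ISOLatin1Gaps = SET OF [0..255] {
     8_0..8_37, 8_177..8_217, 8_231, 8_234};
*/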
319 
320 typedef struct {
321   double blx, bly, toprx, topry; /* font matrix in character coordinates */
322   struct {double x, y;} chr[256]; /* widths in character coordinates */
323 } MetricsRec;
324 typedef MetricsRec *Metrics;
325 typedef Metrics MetricsTable[];
326 
327 typedef GlyphIndex EncodingVector[256];
328 typedef EncodingVector *Encoding;
329 typedef Encoding EncodingTable[];
330 
331 typedef struct {
332   double x, y; /* (1000,0) in font's character coordinate system */
333   double xp, yp; /* (0,1000) in font's character coordinate system */
334   int e; /* index in "encoding" */
335   int m; /* index in "metrics" */
336   double bx, by, tx, ty; /* height of font bbox in reporting coordinates */
337 } FontRec;
338 typedef FontRec *Font;
339 typedef Font FontTable[];
340 
341 
342 /* Instance "T". */
343 #define state_normal 0
344 #define state_metrics 1
345 #define state_encoding 2
346 typedef struct {
347   double itransform[6]; /* transform from device to default coordinates */
348   int metricsSize;
349   MetricsTable *metrics;
350   int encodingSize;
351   EncodingTable *encoding;
352   BOOLEAN dvipsIsCork; /* assume Cork rather than "OT1" for dvips output */
353   int fontSize;
354   FontTable *font;
355 
356   /* Data for current word prefix: */
357   char buf[1000];
358   int lbuf; /* elements 0 through "lbuf-1" of "buf" are in use */
359   int f; /* font number */
360   double x0, y0, x1, y1; /* initial and final currentpoint */
361 
362   BOOLEAN nonEmptyPage;
363   long blx, bly, toprx, topry; /* bounding box of last word output */
364   char word[1000]; /* last word output */
365   int state;
366   /* state-specific components: */
367   /* state_encoding: */ int encoding_e, encoding_n, encoding_i;
368   /* state_metrics: */ int metrics_m, metrics_i;
369 } T;
370 
371 static int ReadChar(char **instr);
372 static void UnreadChar(char **instr);
373 static int ReadInt(char **instr);
374 static long ReadLong(char **instr);
375 static int ParseInverseTransform(T *t, char *instr);
376 static int ParseEncoding(T *t, char *instr);
377 static int ParseEncodingMore(T *t, char *instr);
378 static void ReadPair(double *x, double *y, char **instr);
379 static int ParseFont(T *t, char *instr);
380 static int ParseMetrics(T *t, char *instr);
381 static int ParseMetricsMore(T *t, char *instr);
382 static void Itransform(T *t, double *x1, double *y1, double x0, double y0);
383 static void Output(T *t, const char **pre, const char **word,
384   int *llx, int *lly, int *urx, int *ury);
385 static BOOLEAN SameDirection(double x0, double y0, double x1, double y1);
386 static int ParseString(
387   T *t, char *instr, const char **pre, const char **word, const char **post,
388   int *llx, int *lly, int *urx, int *ury);
389 
pstotextInit(void ** instance)390 int DLLEXPORT pstotextInit(void **instance) {
391   T *t;
392   int i;
393 
394   t = (T *)malloc(sizeof(T));
395   if (t == NULL) return PSTOTEXT_INIT_MALLOC;
396 
397   t->state = state_normal;
398 
399   /* Initialize t->itransform to the identity transform. */
400   t->itransform[0] = 1.0;
401   t->itransform[1] = 0.0;
402   t->itransform[2] = 0.0;
403   t->itransform[3] = 1.0;
404   t->itransform[4] = 0.0;
405   t->itransform[5] = 0.0;
406 
407   t->metricsSize = t->encodingSize = t->fontSize = 100;
408 
409   t->metrics = (MetricsTable *)malloc(t->metricsSize * sizeof(Metrics));
410   if (t->metrics == NULL) {
411     free(t);
412     return PSTOTEXT_INIT_MALLOC;
413   }
414   for(i=0; i<t->metricsSize; i++)(*t->metrics)[i] = NULL;
415 
416   t->encoding = (EncodingTable *)malloc(t->encodingSize * sizeof(Encoding));
417   if (t->encoding == NULL) {
418     free(t);
419     return PSTOTEXT_INIT_MALLOC;
420   }
421   for(i=0;i<t->encodingSize;i++)(*t->encoding)[i] = NULL;
422 
423   t->dvipsIsCork = FALSE;
424 
425   t->font = (FontTable *)malloc(t->fontSize * sizeof(Font));
426   if (t->font == NULL) {
427     free(t);
428     return PSTOTEXT_INIT_MALLOC;
429   }
430   for(i=0;i<t->fontSize;i++)(*t->font)[i] = NULL;
431 
432   t->lbuf = 0;
433   t->nonEmptyPage = FALSE;
434   t->blx = t->bly = t->toprx = t->topry = 0;
435 
436   *instance = t;
437 
438   return 0;
439 }
440 
pstotextSetCork(void * instance,int value)441 int DLLEXPORT pstotextSetCork(void *instance, int value) {
442   T *t = (T *)instance;
443   t->dvipsIsCork = value;
444   return 0;
445 }
446 
pstotextExit(void * instance)447 int DLLEXPORT pstotextExit(void *instance) {
448   T *t = (T *)instance;
449   free(t->metrics);
450   free(t->encoding);
451   free(t->font);
452   free(t);
453   return 0;
454 }
455 
/* Return the character at "*instr" as a value in 0..255 and advance
   "*instr" past it. */
static int ReadChar(char **instr) {
  unsigned char current = (unsigned char)**instr;
  ++*instr;
  return current;
}
461 
/* Step "*instr" back by one character, undoing one ReadChar. */
static void UnreadChar(char **instr) {
  *instr -= 1;
}
465 
/* Read a decimal integer from "*instr": skip leading spaces, accept an
   optional '-', then consume digits.  "*instr" is left at the first
   character that is not part of the number.  Returns 0 when no digits
   are present. */
static int ReadInt(char **instr) {
  int magnitude = 0;
  int negative = 0;
  int ch;

  do {
    ch = ReadChar(instr);
  } while (ch == ' ');

  if (ch == '-') {
    negative = 1;
    ch = ReadChar(instr);
  }

  for (; ch >= '0' && ch <= '9'; ch = ReadChar(instr))
    magnitude = magnitude*10 + (ch - '0');

  /* The character that ended the number belongs to the caller. */
  UnreadChar(instr);
  return negative ? -magnitude : magnitude;
}
476 
/* Same as ReadInt, but the value is accumulated and returned as a
   "long": skip leading spaces, accept an optional '-', consume digits
   and leave "*instr" at the first character after the number. */
static long ReadLong(char **instr) {
  long magnitude = 0;
  int negative = 0;
  int ch;

  do {
    ch = ReadChar(instr);
  } while (ch == ' ');

  if (ch == '-') {
    negative = 1;
    ch = ReadChar(instr);
  }

  for (; ch >= '0' && ch <= '9'; ch = ReadChar(instr))
    magnitude = magnitude*10 + (ch - '0');

  /* The character that ended the number belongs to the caller. */
  UnreadChar(instr);
  return negative ? -magnitude : magnitude;
}
487 
/* Parse QI directive: six integers, each one hundred times an element
   of the inverse transform, stored in order into "t->itransform".
   Always returns 0. */
static int ParseInverseTransform(T *t, char *instr) {
  double *element = t->itransform;
  double *const end = element + 6;

  while (element != end) {
    *element = ReadLong(&instr) / 100.0;
    element++;
  }
  return 0;
}
493 
/* Parse first line of QE directive: the encoding number "e" and the
   number "n" of glyph indexes that the following lines supply.

   Grows "t->encoding" if needed, allocates the encoding vector on first
   use and switches the instance to state_encoding so that the next
   lines go to ParseEncodingMore.

   Returns 0 on success, PSTOTEXT_FILTER_TOOMANYGLYPHINDEXES if "n" is
   too large, and PSTOTEXT_FILTER_BADENCODINGNUMBER if "e" is negative,
   too large to represent, or memory for it cannot be obtained.  On
   failure the instance is left unchanged. */
static int ParseEncoding(T *t, char *instr) {
  int e = ReadInt(&instr);
  int n = ReadInt(&instr);
  int i;
  Encoding enc;

  if (e<0) return PSTOTEXT_FILTER_BADENCODINGNUMBER;
  if (n>/*256*/1024) return PSTOTEXT_FILTER_TOOMANYGLYPHINDEXES;

  /* Grow "t->encoding" if necessary. */
  if (t->encodingSize<=e) {
    int newSize;
    EncodingTable *grown;

    /* "2*e" must not overflow. */
    if (e > INT_MAX/2) return PSTOTEXT_FILTER_BADENCODINGNUMBER;
    newSize = 2*e;

    /* Keep the old table until realloc is known to have succeeded. */
    grown = (EncodingTable *)realloc(
      (char *)t->encoding,
      (size_t)newSize * sizeof(Encoding)
    );
    if (grown == NULL) return PSTOTEXT_FILTER_BADENCODINGNUMBER;
    for (i = t->encodingSize; i<newSize; i++) (*grown)[i] = NULL;
    t->encoding = grown;
    t->encodingSize = newSize;
  }

  /* If this is the first encoding numbered "e", allocate array.  Every
     entry starts out as NonstandardGlyph so that the vector is never
     read uninitialized if the directive is cut short. */
  if ((*t->encoding)[e] == NULL) {
    enc = (EncodingVector *)malloc(sizeof(EncodingVector));
    if (enc == NULL) return PSTOTEXT_FILTER_BADENCODINGNUMBER;
    for (i = 0; i<256; i++) (*enc)[i] = NonstandardGlyph;
    (*t->encoding)[e] = enc;
  }

  t->state = state_encoding;
  t->encoding_e = e; t->encoding_n = n; t->encoding_i = 0;

  return 0;
}
522 
/* Parse subsequent line of QE directive.

   Each line supplies the glyph indexes of the next 16 character codes;
   codes at or beyond "t->encoding_n" are set to NonstandardGlyph
   without reading anything.  After the line that completes code 255
   the instance returns to state_normal.

   Returns 0, or PSTOTEXT_FILTER_BADENCODINGNUMBER if the encoding
   vector being filled does not exist. */
static int ParseEncodingMore(T *t, char *instr) {
  Encoding enc = (*t->encoding)[t->encoding_e];
  int i, tooSparse;

  /* Defensive: there is nothing to fill in without a vector. */
  if (enc == NULL) {
    t->state = state_normal;
    return PSTOTEXT_FILTER_BADENCODINGNUMBER;
  }

  for (i = t->encoding_i; i<t->encoding_i+16 ; i++)
    (*enc)[i] = (i<t->encoding_n) ? ReadInt(&instr) : NonstandardGlyph;

  t->encoding_i += 16;
  if (t->encoding_i < 256) /* skip */ ;
  else {
    /* End of directive. */
    t->state = state_normal;

    /* Some applications build the encoding vector incrementally.  If
       this one doesn't have at least the lower-case letters, we augment
       it with ISOLatin1.  The flag must accumulate over the whole
       range: a missing letter anywhere in 'a'..'z' counts, not only a
       missing 'z'. */
    tooSparse = FALSE;
    for (i = 'a'; i<='z' && !tooSparse; i++)
      if ((*enc)[i] == NonstandardGlyph) tooSparse = TRUE;
    if (tooSparse)
      for (i = 0; i<256; i++)
        if ((*enc)[i] == NonstandardGlyph) (*enc)[i] = i;
  }

  return 0;

}
551 
552 #define GuessAscend 0.9
553 #define GuessDescend -0.3
554 
/* Read two integers, each one hundred times a coordinate, and store
   the coordinates in "*x" and "*y" (in that order). */
static void ReadPair(double /*out*/ *x, /*out*/ double *y, char **instr) {
  long hundredths;

  hundredths = ReadLong(instr);
  *x = hundredths / 100.0;

  hundredths = ReadLong(instr);
  *y = hundredths / 100.0;
}
559 
/* Parse QF directive: font number, the images of (1000,0) and (0,1000)
   of the font's character coordinate system, the encoding number and
   the metrics number.

   The whole directive is parsed and validated before the instance is
   touched, so a rejected directive leaves any earlier definition of
   the font intact and never creates a half-filled font record.

   Returns 0 on success, PSTOTEXT_FILTER_BADFONTNUMBER if the font
   number is negative, too large to represent, or memory for it cannot
   be obtained, PSTOTEXT_FILTER_BADENCODINGNUMBER if the encoding has
   not been defined, and PSTOTEXT_FILTER_BADMETRICNUMBER if the metrics
   have not been defined. */
static int ParseFont(T *t, char *instr) {
  int n = ReadInt(&instr), i;
  int e, m;
  Metrics mt;
  Font f;
  double x, y, xp, yp;
  double xmax, bly, topry;

  if (n<0) return PSTOTEXT_FILTER_BADFONTNUMBER;

  ReadPair(&x, &y, &instr);
  ReadPair(&xp, &yp, &instr);
  e = ReadInt(&instr);
  m = ReadInt(&instr);

  /* Range-check both indexes before they are used to index a table. */
  if (e<0 || t->encodingSize<=e || (*t->encoding)[e] == NULL)
    return PSTOTEXT_FILTER_BADENCODINGNUMBER;
  if (m<0 || t->metricsSize<=m) return PSTOTEXT_FILTER_BADMETRICNUMBER;
  mt = (*t->metrics)[m];
  if (mt == NULL) return PSTOTEXT_FILTER_BADMETRICNUMBER;

  /* Grow "t->font" if necessary. */
  if (t->fontSize<=n) {
    int newSize;
    FontTable *grown;

    /* "2*n" must not overflow. */
    if (n > INT_MAX/2) return PSTOTEXT_FILTER_BADFONTNUMBER;
    newSize = 2*n;

    /* Keep the old table until realloc is known to have succeeded. */
    grown = (FontTable *)realloc(
      (char *)t->font,
      (size_t)newSize * sizeof(Font)
    );
    if (grown == NULL) return PSTOTEXT_FILTER_BADFONTNUMBER;
    for (i = t->fontSize; i<newSize; i++) (*grown)[i] = NULL;
    t->font = grown;
    t->fontSize = newSize;
  }

  /* If this is the first font numbered "n", allocate "FontRec". */
  f = (*t->font)[n];
  if (f == NULL) {
    f = (Font)malloc(sizeof(FontRec));
    if (f == NULL) return PSTOTEXT_FILTER_BADFONTNUMBER;
    (*t->font)[n] = f;
  }

  f->x = x; f->y = y;
  f->xp = xp; f->yp = yp;
  f->e = e;
  f->m = m;

  /* Transform height of font bounding box to reporting coordinates: */
  f->bx = f->xp * mt->bly / 1000.0;
  f->by = f->yp * mt->bly / 1000.0;
  f->tx = f->xp * mt->topry / 1000.0;
  f->ty = f->yp * mt->topry / 1000.0;

  /* In some fonts produced by dvips, the FontBBox is incorrectly
     defined as [0 0 1 1].  We check for this, and apply the same
     heuristic used for an undefined FontBBox in "ParseMetrics".  */
  if (f->by-f->ty < 1.1) {
    /* Only the search for the widest character belongs to the loop;
       the guess itself is computed once, afterwards. */
    xmax = 0.0;
    for (i = 0; i<256; i++) {
      if (mt->chr[i].x > xmax) xmax = mt->chr[i].x;
    }
    bly = GuessDescend * xmax; topry = GuessAscend * xmax;
    f->bx = f->xp * bly / 1000.0;
    f->by = f->yp * bly / 1000.0;
    f->tx = f->xp * topry / 1000.0;
    f->ty = f->yp * topry / 1000.0;
  }

  return 0;
}
614 
/* Parse first line of QM directive: the metrics number "m" followed by
   the two corners of the font bounding box.

   Grows "t->metrics" if needed, allocates the "MetricsRec" on first use
   and switches the instance to state_metrics so that the next lines go
   to ParseMetricsMore.

   Returns 0 on success and PSTOTEXT_FILTER_BADMETRICNUMBER if "m" is
   negative, too large to represent, or memory for it cannot be
   obtained.  On failure the instance is left unchanged. */
static int ParseMetrics(T *t, char *instr) {
  int m = ReadInt(&instr), i;
  Metrics mt;

  if (m<0) return PSTOTEXT_FILTER_BADMETRICNUMBER;

  /* Grow "t->metrics" if necessary. */
  if (t->metricsSize<=m) {
    int newSize;
    MetricsTable *grown;

    /* "2*m" must not overflow. */
    if (m > INT_MAX/2) return PSTOTEXT_FILTER_BADMETRICNUMBER;
    newSize = 2*m;

    /* Keep the old table until realloc is known to have succeeded. */
    grown = (MetricsTable *)realloc(
      (char *)t->metrics,
      (size_t)newSize * sizeof(Metrics)
    );
    if (grown == NULL) return PSTOTEXT_FILTER_BADMETRICNUMBER;
    for (i = t->metricsSize; i<newSize; i++) (*grown)[i] = NULL;
    t->metrics = grown;
    t->metricsSize = newSize;
  }

  /* If this is the first metrics numbered "m", allocate "MetricsRec".
     The widths start out as zero so that they are never read
     uninitialized if the directive is cut short. */
  mt = (*t->metrics)[m];
  if (mt == NULL) {
    mt = (Metrics)malloc(sizeof(MetricsRec));
    if (mt == NULL) return PSTOTEXT_FILTER_BADMETRICNUMBER;
    for (i = 0; i<256; i++) mt->chr[i].x = mt->chr[i].y = 0.0;
    (*t->metrics)[m] = mt;
  }

  ReadPair(&mt->blx, &mt->bly, &instr);
  ReadPair(&mt->toprx, &mt->topry, &instr);

  t->state = state_metrics; t->metrics_m = m; t->metrics_i = 0;

  return 0;
}
646 
/* Parse subsequent line of QM directive.

   Each line supplies the width vectors of the next 8 character codes.
   After the line that completes code 255 the instance returns to
   state_normal, and a font bounding box that is all zeros is replaced
   by a guess derived from the widest character.  Always returns 0. */
static int ParseMetricsMore(T *t, char *instr) {
  Metrics mt = (*t->metrics)[t->metrics_m];
  const int first = t->metrics_i;
  int k;

  for (k = first; k < first+8; k++)
    ReadPair(&mt->chr[k].x, &mt->chr[k].y, &instr);

  t->metrics_i = first + 8;
  if (t->metrics_i < 256) return 0; /* more lines to come */

  /* End of directive. */
  t->state = state_normal;

  /* A "FontBBox" was specified: nothing left to do. */
  if (mt->blx != 0.0 || mt->bly != 0.0 || mt->toprx != 0.0 || mt->topry != 0.0)
    return 0;

  /* Otherwise take a guess.  "mt->toprx" is 0.0 here, so this leaves
     the largest positive x width in it. */
  for (k = 0; k < 256; k++) {
    if (mt->chr[k].x > mt->toprx) mt->toprx = mt->chr[k].x;
  }
  mt->bly = GuessDescend * mt->toprx;
  mt->topry = GuessAscend * mt->toprx;

  return 0;
}
672 
/* Apply the six-element transform "t->itransform" to the point
   (x0, y0) and store the result in (*x1, *y1). */
static void Itransform(T *t, double *x1, double *y1, double x0, double y0) {
  const double *m = t->itransform;

  *x1 = m[0]*x0 + m[2]*y0 + m[4];
  *y1 = m[1]*x0 + m[3]*y0 + m[5];
}
678 
/* Output the next word, i.e. the word prefix held in "t->buf".

   If the word's bounding box is not degenerate, "*pre" is set to the
   separator to emit before the word ("" for the first word on a page,
   " " on the same line, "\n" otherwise), "*word" to a NUL-terminated
   copy of the word held in the instance, and "*llx", "*lly", "*urx",
   "*ury" to its bounding box in default PostScript units.  If the box
   is degenerate none of the output parameters is written and the word
   is dropped.  In both cases the word prefix is emptied. */
static void Output(T *t, const char **pre, const char **word,
    int *llx, int *lly, int *urx, int *ury) {
  double x0, y0, x1, y1, x2, y2, x3, y3;
  long blx, bly, toprx, topry, mid;
  int len;
  Font f;

  f = (*t->font)[t->f];

  /* Compute the corners of the parallelogram with width "(t->x0,t->y0)"
     to "(t->x1,t->y1)" and height "(f.bx,f.by)" to "(f.tx,f.ty)". Then
     compute the bottom left corner and the top right corner of the
     bounding box (rectangle with sides parallel to the coordinate
     system) of this rectangle. */
  x0 = t->x0 + f->bx; y0 = t->y0 + f->by;
  x1 = t->x1 + f->bx; y1 = t->y1 + f->by;
  x2 = t->x0 + f->tx; y2 = t->y0 + f->ty;
  x3 = t->x1 + f->tx; y3 = t->y1 + f->ty;

  blx = (long)ceil(MIN(MIN(MIN(x0, x1), x2), x3));
  bly = (long)ceil(MAX(MAX(MAX(y0, y1), y2), y3)); /* *** should this be floor? PMcJ 981002 */
  toprx = (long)floor(MAX(MAX(MAX(x0, x1), x2), x3));
  topry = (long)floor(MIN(MIN(MIN(y0, y1), y2), y3)); /* *** should this be ceil? PMcJ 981002 */

  if (blx!=toprx && bly!=topry) {

    /* Output word separator if this isn't first word on page. */
    if (t->nonEmptyPage) {
      mid = (topry+bly) / 2;
      if (blx<toprx && topry<bly
          && t->blx <= blx
          && t->topry <= mid
          && mid <= t->bly) *pre = " "; /* same line */
      else *pre = "\n"; /* different line */
    }
    else *pre = "";

    /* Output elements "0" through "t->lbuf-1" of "t->buf".  The length
       is clamped so that the terminator always fits: with
       "t->lbuf == sizeof(t->buf)" an unclamped terminator would be
       written one element past the end of the array. */
    len = t->lbuf;
    if (len < 0) len = 0;
    if (len > (int)sizeof(t->word) - 1) len = (int)sizeof(t->word) - 1;
    memcpy(t->word, t->buf, (size_t)len);
    t->word[len] = '\0';
    *word = t->word;

    t->nonEmptyPage = TRUE;
    t->blx = blx; t->bly = bly; t->toprx = toprx; t->topry = topry;

    /* transform device units to default PostScript units */
    Itransform( t, &x1, &y1, (double)blx, (double)bly);
    blx = (long)floor(x1); bly = (long)floor(y1);
    Itransform( t, &x1, &y1, (double)toprx, (double)topry);
    toprx = (long)ceil(x1); topry = (long)ceil(y1);

    /* Report the corners in increasing order, whatever the orientation
       of the transform. */
    if (blx < toprx) {
      *llx = blx;
      *urx = toprx;
    }
    else {
      *llx = toprx;
      *urx = blx;
    }
    if (bly < topry) {
      *lly = bly;
      *ury = topry;
    }
    else {
      *lly = topry;
      *ury = bly;
    }

  } /*if (blx!=toprx && bly!=topry) { */

  t->lbuf = 0;
}
751 
SameDirection(double x0,double y0,double x1,double y1)752 static BOOLEAN SameDirection(double x0, double y0, double x1, double y1) {
753   return (y0 == 0.0 && y1 == 0.0 && x0*x1 > 0.0)
754       || (x0 == 0.0 && x1 == 0.0 && y0*y1 > 0.0)
755       || (x0 * y1 == x1 * y0);
756 }
757 
ParseString(T * t,char * instr,const char ** pre,const char ** word,const char ** post,int * llx,int * lly,int * urx,int * ury)758 static int ParseString(T *t, char *instr,
759   const char **pre, const char **word, const char **post,
760   int *llx, int *lly, int *urx, int *ury) {
761   /* Parse QS directive. */
762 #define spaceTol 0.3 /* fraction of average character width to signal word break */
763   char buf[1000];
764   int n, ch, i, j, in, l;
765   Font f;
766   Encoding enc;
767   GlyphIndex glyph;
768   double x0, y0, x1, y1, xsp, ysp, dx, dy, maxx, maxy;
769 
770 #define SetBuf() \
771   { \
772   strncpy(t->buf, buf, l); \
773   t->lbuf = l; \
774   t->f = n; \
775   t->x0 = x0; t->y0 = y0; t->x1 = x1; t->y1 = y1; \
776   }
777 
778   n = ReadInt(&instr); /* index in "t->font" */
779   f = (*t->font)[n];
780   if (f == NULL) return PSTOTEXT_FILTER_BADFONTNUMBER;
781   enc = (*t->encoding)[f->e];
782   if (enc==NULL) return PSTOTEXT_FILTER_BADENCODINGNUMBER;
783   ReadPair(&x0, &y0, &instr); /* initial currentpoint */
784   j = ReadInt(&instr); /* length of string */
785   ch = ReadChar(&instr);
786   if (ch != ' ')
787     return PSTOTEXT_FILTER_BADQS;
788 
789   l = 0;
790   for (i = 0; i<=j-1; i++) {
791     in = ReadChar(&instr);
792     /* if (in=='\0') return PSTOTEXT_FILTER_BADQS; */ /* TeX uses '\0' */
793     glyph = (*enc)[in];
794 
795     /* If "glyph==0", then "in" mapped to the glyph ".notdef".  This
796        is usually a mistake, but we check for several known cases: */
797     if (glyph == 0) {
798 
799       /* If any element of the current encoding is in the range used
800          by Microsoft TrueType, assume this character is, too. */
801       int k; BOOLEAN tt = FALSE;
802       for(k = 0; !tt && k < sizeof(*enc)/sizeof((*enc)[0]); k++) {
803         if (FirstTT1 <= (*enc)[k] && (*enc)[k] <= LastTT2) tt = TRUE;
804       }
805       if (tt) glyph = FirstTT1 + (int)in;
806       /* There are too many other exceptions to actually trap this:
807         else if (in == '\r') ; // Adobe Illustrator does this...
808         else if (in == '\t') ; // MacDraw Pro does this...
809         else if (in == '\032') ; // MS Word on Mac does this...
810         else return PSTOTEXT_FILTER_BADGLYPHINDEX;
811       */
812     }
813     if (glyph == 0)
814       /* skip */;
815     else if (glyph <= LastISOLatin1) {
816       buf[l] = (char)glyph;
817       /* *** if (glyph IN ISOLatin1Gaps) buf[l] = UnknownChar; */
818       l++;
819     }
820     else if (glyph <= LASTSpecialGlyphs) {
821       const char *str = SpecialGlyphs[glyph-FIRSTSpecialGlyphs];
822       int lstr = strlen(str);
823       strncpy(&buf[l], str, lstr);
824       l += lstr;
825     }
826     else if (glyph <= LastDvips) {
827       const char *str; int lstr; char tempstr[2];
828       if (t->dvipsIsCork) {
829         if (glyph <= LASTCorkSpecialGlyphs)
830           str = CorkSpecialGlyphs[glyph-FIRSTCorkSpecialGlyphs];
831         else if (glyph == FIRSTCorkSpecialGlyphs+0337)
832           str = "SS";
833         else if (glyph == FIRSTCorkSpecialGlyphs+0377)
834           str = "\337";
835         else {
836           tempstr[0] = (char)(glyph-FIRSTCorkSpecialGlyphs); tempstr[1] = '\0';
837           str = &tempstr[0];
838         }
839       }
840       else if (glyph <= LASTDvipsGlyphs)
841         /* Assume old text layout (OT1?). */
842         str = DvipsGlyphs[glyph-FIRSTDvipsGlyphs];
843       else {
844         tempstr[0] = UnknownChar; tempstr[1] = '\0';
845         str = &tempstr[0];
846       }
847       lstr = strlen(str);
848       strncpy(&buf[l], str, lstr);
849       l += lstr;
850     }
851     else if (glyph <= LastTT2) {
852       if (FirstTT2 <= glyph) glyph -= FirstTT2-FirstTT1;
853       if (glyph < FirstTT1+32) {
854         buf[l] = UnknownChar; l++;
855       }
856       else if (glyph < FIRSTTTSpecialGlyphs ||
857             LASTTTSpecialGlyphs < glyph) {
858         buf[l] = (char)(glyph - FirstTT1); l++;
859       }
860       else {
861         const char *str = TTSpecialGlyphs[glyph-FIRSTTTSpecialGlyphs];
862         int lstr = strlen(str);
863         strncpy(&buf[l], str, lstr);
864         l += lstr;
865       }
866     }
867     else if (glyph <= LastOldDvips) {
868       const char *str = DvipsGlyphs[glyph-FirstOldDvips];
869       int lstr = strlen(str);
870       strncpy(&buf[l], str, lstr);
871       l += lstr;
872     }
873     else if (glyph == NonstandardGlyph) { /* not in StandardGlyphs */
874       buf[l] = UnknownChar;
875       l++;
876     }
877     else return PSTOTEXT_FILTER_BADGLYPHINDEX;
878 
879     /* We no longer substitute minus for hyphen. */
880     /* if (buf[l-1] == '\255') buf[l-1] = '-'; */
881   }
882 
883   ReadPair(&x1, &y1, &instr); /* final currentpoint */
884   if (l != 0) { /* "l==0" e.g., when Adobe Illustrator outputs "\r" */
885     if (t->lbuf == 0) {SetBuf();}
886     else {
887       /* If the distance between this string and the previous one is
888          less than "spaceTol" times the minimum of the average
889          character widths in the two strings, and the two strings
890          are in the same direction, then append this string to the
891          previous one.  Otherwise, output the previous string and
892          then save the current one.
893 
894          Sometimes this string overlaps the previous string, e.g.,
895          when TeX is overprinting an accent over another character.
896          So we make a special case for this (but only handle the
897          left-to-right orientation). */
898 
899       /* Set "(xsp,ysp)" to the reporting space coordinates of the
900          minimum of the average width of the characters in this
901          string and the previous one. */
902 
903       xsp = MIN((t->x1-t->x0) / t->lbuf, (x1-x0) / l);
904       ysp = MIN((t->y1-t->y0) / t->lbuf, (y1-y0) / l);
905 
906       dx = x0 - t->x1;
907       dy = y0 - t->y1;
908       maxx = spaceTol * xsp;
909       maxy = spaceTol * ysp;
910       if ((dx*dx + dy*dy < maxx*maxx + maxy*maxy)
911           || ((t->y1 == y0 && t->x0 <= t->x1 && t->x0 <= x0 && x0 <= t->x1)
912          && SameDirection(t->x1-t->x0, t->y1-t->y0, x1-x0, y1-y0))) {
913         if (t->lbuf+l >= sizeof(t->buf)) {
914           Output(t, pre, word, llx, lly, urx, ury);
915           *post = "";
916           SetBuf();
917         }
918         else {
919           strncpy(&t->buf[t->lbuf], buf, l);
920           t->lbuf += l;
921           t->x1 = x1; t->y1 = y1;
922           /* *** Merge font bounding boxes? */
923         }
924       }
925       else {
926         Output(t, pre, word, llx, lly, urx, ury);
927         *post = "";
928         SetBuf();
929       }
930     }
931   }
932 
933   return 0;
934 }
935 
pstotextFilter(void * instance,char * instr,const char ** pre,const char ** word,const char ** post,int * llx,int * lly,int * urx,int * ury)936 int DLLEXPORT pstotextFilter(void *instance, char *instr,
937   const char **pre, const char **word, const char **post,
938   int *llx, int *lly, int *urx, int *ury) {
939   T *t = (T *)instance;
940   int c;
941   *word = NULL;
942   switch (t->state) {
943     case state_normal:
944       do {c = ReadChar(&instr); if (c=='\0') return 0;} while (c!='Q');
945       c = ReadChar(&instr);
946       switch (c) {
947         case 'I': return ParseInverseTransform(t, instr);
948         case 'M': return ParseMetrics(t, instr);
949         case 'E': return ParseEncoding(t, instr);
950         case 'F': return ParseFont(t, instr);
951         case 'S': return ParseString(
952                            t, instr, pre, word, post, llx, lly, urx, ury);
953         case 'C':
954         case 'P': /* copypage, showpage */
955                   /* If any QS directives have been encountered on this page,
956                      t->buf will be nonempty now. */
957                   if (t->lbuf > 0) {
958                     Output(t, pre, word, llx, lly, urx, ury);
959                     *post = "\n\f\n";
960                   }
961                   else {
962                     *pre = "";
963                     *word = "";
964                     *llx = 0; *lly = 0; *urx = 0; *ury = 0;
965                     *post = "\f\n";
966                   }
967                   t->nonEmptyPage = FALSE;
968                   t->blx = t->bly = t->toprx = t->topry = 0;
969                   break;
970         case 'Z': /* erasepage */ /* skip */ break;
971         case '\0': return 0;
972         /* default: skip */
973       }
974       break;
975     case state_metrics: return ParseMetricsMore(t, instr);
976     case state_encoding: return ParseEncodingMore(t, instr);
977   }
978   return 0;
979 }
980 
981